Diffstat (limited to 'contrib/llvm/lib')
1291 files changed, 151409 insertions, 61805 deletions
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp index 44d137d..35f2e97 100644 --- a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp @@ -25,9 +25,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" @@ -40,44 +47,72 @@ #include "llvm/Pass.h" using namespace llvm; -// Register the AliasAnalysis interface, providing a nice name to refer to. -INITIALIZE_ANALYSIS_GROUP(AliasAnalysis, "Alias Analysis", NoAA) -char AliasAnalysis::ID = 0; +/// Allow disabling BasicAA from the AA results. This is particularly useful +/// when testing to isolate a single AA implementation. +static cl::opt<bool> DisableBasicAA("disable-basicaa", cl::Hidden, + cl::init(false)); + +AAResults::AAResults(AAResults &&Arg) : AAs(std::move(Arg.AAs)) { + for (auto &AA : AAs) + AA->setAAResults(this); +} + +AAResults &AAResults::operator=(AAResults &&Arg) { + AAs = std::move(Arg.AAs); + for (auto &AA : AAs) + AA->setAAResults(this); + return *this; +} + +AAResults::~AAResults() { +// FIXME: It would be nice to at least clear out the pointers back to this +// aggregation here, but we end up with non-nesting lifetimes in the legacy +// pass manager that prevent this from working. In the legacy pass manager +// we'll end up with dangling references here in some cases.
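The move constructor and move assignment above exist to re-seat each aggregated result's back-pointer after its owner moves. A minimal stand-alone sketch of that idiom follows; the Aggregation and ResultConcept names are illustrative, not LLVM's actual types.

#include <memory>
#include <utility>
#include <vector>

struct Aggregation;

struct ResultConcept {
  virtual ~ResultConcept() = default;
  // Each aggregated result remembers which aggregation currently owns it.
  virtual void setAggregation(Aggregation *A) = 0;
};

struct Aggregation {
  std::vector<std::unique_ptr<ResultConcept>> Results;

  Aggregation() = default;
  Aggregation(Aggregation &&Arg) : Results(std::move(Arg.Results)) {
    // The stolen results still point back at Arg; re-seat them so future
    // queries chain into this object, mirroring AAResults above.
    for (auto &R : Results)
      R->setAggregation(this);
  }
  Aggregation &operator=(Aggregation &&Arg) {
    Results = std::move(Arg.Results);
    for (auto &R : Results)
      R->setAggregation(this);
    return *this;
  }
};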
+#if 0 + for (auto &AA : AAs) + AA->setAAResults(nullptr); +#endif +} //===----------------------------------------------------------------------===// // Default chaining methods //===----------------------------------------------------------------------===// -AliasResult AliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->alias(LocA, LocB); +AliasResult AAResults::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + for (const auto &AA : AAs) { + auto Result = AA->alias(LocA, LocB); + if (Result != MayAlias) + return Result; + } + return MayAlias; } -bool AliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->pointsToConstantMemory(Loc, OrLocal); -} +bool AAResults::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { + for (const auto &AA : AAs) + if (AA->pointsToConstantMemory(Loc, OrLocal)) + return true; -AliasAnalysis::ModRefResult -AliasAnalysis::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->getArgModRefInfo(CS, ArgIdx); + return false; } -void AliasAnalysis::deleteValue(Value *V) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - AA->deleteValue(V); -} +ModRefInfo AAResults::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { + ModRefInfo Result = MRI_ModRef; + + for (const auto &AA : AAs) { + Result = ModRefInfo(Result & AA->getArgModRefInfo(CS, ArgIdx)); -void AliasAnalysis::addEscapingUse(Use &U) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - AA->addEscapingUse(U); + // Early-exit the moment we reach the bottom of the lattice. + if (Result == MRI_NoModRef) + return Result; + } + + return Result; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(Instruction *I, ImmutableCallSite Call) { +ModRefInfo AAResults::getModRefInfo(Instruction *I, ImmutableCallSite Call) { // We may have two calls if (auto CS = ImmutableCallSite(I)) { // Check if the two calls modify the same memory @@ -88,289 +123,215 @@ AliasAnalysis::getModRefInfo(Instruction *I, ImmutableCallSite Call) { // is that if the call references what this instruction // defines, it must be clobbered by this location. 
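The new AAResults::alias above delegates to each registered implementation in turn and keeps the first answer that improves on the conservative MayAlias default. A self-contained sketch of that delegation loop, with illustrative types standing in for LLVM's:

#include <vector>

enum AliasResult { NoAlias, MayAlias, PartialAlias, MustAlias };

struct AAImpl {
  virtual ~AAImpl() = default;
  virtual AliasResult alias(const void *A, const void *B) = 0;
};

AliasResult queryAll(const std::vector<AAImpl *> &AAs, const void *A,
                     const void *B) {
  for (AAImpl *AA : AAs) {
    AliasResult R = AA->alias(A, B);
    if (R != MayAlias) // anything other than "don't know" is definitive
      return R;
  }
  return MayAlias; // every analysis declined to improve on the default
}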
const MemoryLocation DefLoc = MemoryLocation::get(I); - if (getModRefInfo(Call, DefLoc) != AliasAnalysis::NoModRef) - return AliasAnalysis::ModRef; - } - return AliasAnalysis::NoModRef; -} - -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - - ModRefBehavior MRB = getModRefBehavior(CS); - if (MRB == DoesNotAccessMemory) - return NoModRef; - - ModRefResult Mask = ModRef; - if (onlyReadsMemory(MRB)) - Mask = Ref; - - if (onlyAccessesArgPointees(MRB)) { - bool doesAlias = false; - ModRefResult AllArgsMask = NoModRef; - if (doesAccessArgPointees(MRB)) { - for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - AI != AE; ++AI) { - const Value *Arg = *AI; - if (!Arg->getType()->isPointerTy()) - continue; - unsigned ArgIdx = std::distance(CS.arg_begin(), AI); - MemoryLocation ArgLoc = - MemoryLocation::getForArgument(CS, ArgIdx, *TLI); - if (!isNoAlias(ArgLoc, Loc)) { - ModRefResult ArgMask = getArgModRefInfo(CS, ArgIdx); - doesAlias = true; - AllArgsMask = ModRefResult(AllArgsMask | ArgMask); - } - } - } - if (!doesAlias) - return NoModRef; - Mask = ModRefResult(Mask & AllArgsMask); + if (getModRefInfo(Call, DefLoc) != MRI_NoModRef) + return MRI_ModRef; } + return MRI_NoModRef; +} - // If Loc is a constant memory location, the call definitely could not - // modify the memory location. - if ((Mask & Mod) && pointsToConstantMemory(Loc)) - Mask = ModRefResult(Mask & ~Mod); - - // If this is the end of the chain, don't forward. - if (!AA) return Mask; - - // Otherwise, fall back to the next AA in the chain. But we can merge - // in any mask we've managed to compute. - return ModRefResult(AA->getModRefInfo(CS, Loc) & Mask); -} - -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - - // If CS1 or CS2 are readnone, they don't interact. - ModRefBehavior CS1B = getModRefBehavior(CS1); - if (CS1B == DoesNotAccessMemory) return NoModRef; - - ModRefBehavior CS2B = getModRefBehavior(CS2); - if (CS2B == DoesNotAccessMemory) return NoModRef; - - // If they both only read from memory, there is no dependence. - if (onlyReadsMemory(CS1B) && onlyReadsMemory(CS2B)) - return NoModRef; - - AliasAnalysis::ModRefResult Mask = ModRef; - - // If CS1 only reads memory, the only dependence on CS2 can be - // from CS1 reading memory written by CS2. - if (onlyReadsMemory(CS1B)) - Mask = ModRefResult(Mask & Ref); - - // If CS2 only access memory through arguments, accumulate the mod/ref - // information from CS1's references to the memory referenced by - // CS2's arguments. - if (onlyAccessesArgPointees(CS2B)) { - AliasAnalysis::ModRefResult R = NoModRef; - if (doesAccessArgPointees(CS2B)) { - for (ImmutableCallSite::arg_iterator - I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) { - const Value *Arg = *I; - if (!Arg->getType()->isPointerTy()) - continue; - unsigned CS2ArgIdx = std::distance(CS2.arg_begin(), I); - auto CS2ArgLoc = MemoryLocation::getForArgument(CS2, CS2ArgIdx, *TLI); - - // ArgMask indicates what CS2 might do to CS2ArgLoc, and the dependence of - // CS1 on that location is the inverse. 
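The comment above describes an inversion: what CS2 might do to an argument's pointee determines the inverse dependence of CS1 on that location, which the deleted lines that follow implement. A stand-alone rendering of that inversion over the two-bit mod/ref lattice (enumerator values are illustrative):

enum ModRef { NoModRef = 0, Ref = 1, Mod = 2, ModRefBoth = 3 };

ModRef invertForDependence(ModRef ArgMask) {
  if (ArgMask == Mod)
    return ModRefBoth; // a write by CS2 can affect both reads and writes in CS1
  if (ArgMask == Ref)
    return Mod;        // a read by CS2 only matters if CS1 writes the location
  return ArgMask;      // NoModRef and ModRefBoth are fixed points
}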
- ModRefResult ArgMask = getArgModRefInfo(CS2, CS2ArgIdx); - if (ArgMask == Mod) - ArgMask = ModRef; - else if (ArgMask == Ref) - ArgMask = Mod; - - R = ModRefResult((R | (getModRefInfo(CS1, CS2ArgLoc) & ArgMask)) & Mask); - if (R == Mask) - break; - } - } - return R; - } +ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_ModRef; - // If CS1 only accesses memory through arguments, check if CS2 references - // any of the memory referenced by CS1's arguments. If not, return NoModRef. - if (onlyAccessesArgPointees(CS1B)) { - AliasAnalysis::ModRefResult R = NoModRef; - if (doesAccessArgPointees(CS1B)) { - for (ImmutableCallSite::arg_iterator - I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) { - const Value *Arg = *I; - if (!Arg->getType()->isPointerTy()) - continue; - unsigned CS1ArgIdx = std::distance(CS1.arg_begin(), I); - auto CS1ArgLoc = MemoryLocation::getForArgument(CS1, CS1ArgIdx, *TLI); - - // ArgMask indicates what CS1 might do to CS1ArgLoc; if CS1 might Mod - // CS1ArgLoc, then we care about either a Mod or a Ref by CS2. If CS1 - // might Ref, then we care only about a Mod by CS2. - ModRefResult ArgMask = getArgModRefInfo(CS1, CS1ArgIdx); - ModRefResult ArgR = getModRefInfo(CS2, CS1ArgLoc); - if (((ArgMask & Mod) != NoModRef && (ArgR & ModRef) != NoModRef) || - ((ArgMask & Ref) != NoModRef && (ArgR & Mod) != NoModRef)) - R = ModRefResult((R | ArgMask) & Mask); - - if (R == Mask) - break; - } - } - return R; - } + for (const auto &AA : AAs) { + Result = ModRefInfo(Result & AA->getModRefInfo(CS, Loc)); - // If this is the end of the chain, don't forward. - if (!AA) return Mask; + // Early-exit the moment we reach the bottom of the lattice. + if (Result == MRI_NoModRef) + return Result; + } - // Otherwise, fall back to the next AA in the chain. But we can merge - // in any mask we've managed to compute. - return ModRefResult(AA->getModRefInfo(CS1, CS2) & Mask); + return Result; } -AliasAnalysis::ModRefBehavior -AliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); +ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + ModRefInfo Result = MRI_ModRef; + + for (const auto &AA : AAs) { + Result = ModRefInfo(Result & AA->getModRefInfo(CS1, CS2)); + + // Early-exit the moment we reach the bottom of the lattice. + if (Result == MRI_NoModRef) + return Result; + } + + return Result; +} - ModRefBehavior Min = UnknownModRefBehavior; +FunctionModRefBehavior AAResults::getModRefBehavior(ImmutableCallSite CS) { + FunctionModRefBehavior Result = FMRB_UnknownModRefBehavior; - // Call back into the alias analysis with the other form of getModRefBehavior - // to see if it can give a better response. - if (const Function *F = CS.getCalledFunction()) - Min = getModRefBehavior(F); + for (const auto &AA : AAs) { + Result = FunctionModRefBehavior(Result & AA->getModRefBehavior(CS)); - // If this is the end of the chain, don't forward. - if (!AA) return Min; + // Early-exit the moment we reach the bottom of the lattice. + if (Result == FMRB_DoesNotAccessMemory) + return Result; + } - // Otherwise, fall back to the next AA in the chain. But we can merge - // in any result we've managed to compute. 
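Each aggregated getModRefInfo overload above computes a meet over the mod/ref lattice: start at the conservative top (MRI_ModRef), intersect in each analysis's answer, and stop once the bottom (MRI_NoModRef) is reached, since no later analysis can change it. A minimal sketch of that loop, with std::function standing in for the per-analysis query:

#include <functional>
#include <vector>

enum ModRefInfo { MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3 };

ModRefInfo meetOverAnalyses(
    const std::vector<std::function<ModRefInfo()>> &Queries) {
  ModRefInfo Result = MRI_ModRef; // conservative top of the lattice
  for (const auto &Q : Queries) {
    Result = ModRefInfo(Result & Q()); // bitwise AND walks down the lattice
    if (Result == MRI_NoModRef)        // bottom: early-exit, as above
      break;
  }
  return Result;
}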
- return ModRefBehavior(AA->getModRefBehavior(CS) & Min); + return Result; } -AliasAnalysis::ModRefBehavior -AliasAnalysis::getModRefBehavior(const Function *F) { - assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!"); - return AA->getModRefBehavior(F); +FunctionModRefBehavior AAResults::getModRefBehavior(const Function *F) { + FunctionModRefBehavior Result = FMRB_UnknownModRefBehavior; + + for (const auto &AA : AAs) { + Result = FunctionModRefBehavior(Result & AA->getModRefBehavior(F)); + + // Early-exit the moment we reach the bottom of the lattice. + if (Result == FMRB_DoesNotAccessMemory) + return Result; + } + + return Result; } //===----------------------------------------------------------------------===// -// AliasAnalysis non-virtual helper method implementation +// Helper method implementation //===----------------------------------------------------------------------===// -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const LoadInst *L, const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const LoadInst *L, + const MemoryLocation &Loc) { // Be conservative in the face of volatile/atomic. if (!L->isUnordered()) - return ModRef; + return MRI_ModRef; // If the load address doesn't alias the given address, it doesn't read // or write the specified memory. if (Loc.Ptr && !alias(MemoryLocation::get(L), Loc)) - return NoModRef; + return MRI_NoModRef; // Otherwise, a load just reads. - return Ref; + return MRI_Ref; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const StoreInst *S, const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const StoreInst *S, + const MemoryLocation &Loc) { // Be conservative in the face of volatile/atomic. if (!S->isUnordered()) - return ModRef; + return MRI_ModRef; if (Loc.Ptr) { // If the store address cannot alias the pointer in question, then the // specified memory cannot be modified by the store. if (!alias(MemoryLocation::get(S), Loc)) - return NoModRef; + return MRI_NoModRef; // If the pointer is a pointer to constant memory, then it could not have // been modified by this store. if (pointsToConstantMemory(Loc)) - return NoModRef; - + return MRI_NoModRef; } // Otherwise, a store just writes. - return Mod; + return MRI_Mod; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const VAArgInst *V, const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const VAArgInst *V, + const MemoryLocation &Loc) { if (Loc.Ptr) { // If the va_arg address cannot alias the pointer in question, then the // specified memory cannot be accessed by the va_arg. if (!alias(MemoryLocation::get(V), Loc)) - return NoModRef; + return MRI_NoModRef; // If the pointer is a pointer to constant memory, then it could not have // been modified by this va_arg. if (pointsToConstantMemory(Loc)) - return NoModRef; + return MRI_NoModRef; } // Otherwise, a va_arg reads and writes. - return ModRef; + return MRI_ModRef; +} + +ModRefInfo AAResults::getModRefInfo(const CatchPadInst *CatchPad, + const MemoryLocation &Loc) { + if (Loc.Ptr) { + // If the pointer is a pointer to constant memory, + // then it could not have been modified by this catchpad. + if (pointsToConstantMemory(Loc)) + return MRI_NoModRef; + } + + // Otherwise, a catchpad reads and writes. 
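The per-instruction helpers above (load, store, va_arg, catchpad) share one shape: stay conservative for ordered or volatile accesses, report MRI_NoModRef when the addresses provably cannot alias, and otherwise return the access kind. A hedged sketch of the load case, with a callback standing in for the real alias query:

#include <functional>

enum ModRefInfo { MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3 };

struct Loc { const void *Ptr = nullptr; };

ModRefInfo loadModRef(bool IsUnordered, Loc LoadLoc, Loc QueryLoc,
                      const std::function<bool(Loc, Loc)> &MayAlias) {
  if (!IsUnordered)
    return MRI_ModRef; // volatile/atomic: be conservative
  if (QueryLoc.Ptr && !MayAlias(LoadLoc, QueryLoc))
    return MRI_NoModRef; // disjoint memory: the load cannot touch QueryLoc
  return MRI_Ref; // otherwise a load only reads
}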
+ return MRI_ModRef; +} + +ModRefInfo AAResults::getModRefInfo(const CatchReturnInst *CatchRet, + const MemoryLocation &Loc) { + if (Loc.Ptr) { + // If the pointer is a pointer to constant memory, + // then it could not have been modified by this catchret. + if (pointsToConstantMemory(Loc)) + return MRI_NoModRef; + } + + // Otherwise, a catchret reads and writes. + return MRI_ModRef; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const AtomicCmpXchgInst *CX, - const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX, + const MemoryLocation &Loc) { // Acquire/Release cmpxchg has properties that matter for arbitrary addresses. if (CX->getSuccessOrdering() > Monotonic) - return ModRef; + return MRI_ModRef; // If the cmpxchg address does not alias the location, it does not access it. if (Loc.Ptr && !alias(MemoryLocation::get(CX), Loc)) - return NoModRef; + return MRI_NoModRef; - return ModRef; + return MRI_ModRef; } -AliasAnalysis::ModRefResult -AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, - const MemoryLocation &Loc) { +ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, + const MemoryLocation &Loc) { // Acquire/Release atomicrmw has properties that matter for arbitrary addresses. if (RMW->getOrdering() > Monotonic) - return ModRef; + return MRI_ModRef; // If the atomicrmw address does not alias the location, it does not access it. if (Loc.Ptr && !alias(MemoryLocation::get(RMW), Loc)) - return NoModRef; + return MRI_NoModRef; - return ModRef; + return MRI_ModRef; } -// FIXME: this is really just shoring-up a deficiency in alias analysis. -// BasicAA isn't willing to spend linear time determining whether an alloca -// was captured before or after this particular call, while we are. However, -// with a smarter AA in place, this test is just wasting compile time. -AliasAnalysis::ModRefResult AliasAnalysis::callCapturesBefore( - const Instruction *I, const MemoryLocation &MemLoc, DominatorTree *DT) { +/// \brief Return information about whether a particular call site modifies +/// or reads the specified memory location \p MemLoc before instruction \p I +/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up +/// instruction-ordering queries inside the BasicBlock containing \p I. +/// FIXME: this is really just shoring-up a deficiency in alias analysis. +/// BasicAA isn't willing to spend linear time determining whether an alloca +/// was captured before or after this particular call, while we are. However, +/// with a smarter AA in place, this test is just wasting compile time.
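The cmpxchg and atomicrmw cases above apply an ordering guard before the address check: anything stronger than monotonic has synchronization effects on other addresses, so the query must stay conservative regardless of aliasing. A tiny illustration (the enumerator order here is illustrative, mirroring the comparison used above):

enum AtomicOrdering { Unordered, Monotonic, Acquire, Release,
                      AcquireRelease, SequentiallyConsistent };

bool mustStayConservative(AtomicOrdering O) {
  // Acquire/release and stronger order *other* memory too, so no address
  // comparison can prove MRI_NoModRef for them.
  return O > Monotonic;
}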
+ModRefInfo AAResults::callCapturesBefore(const Instruction *I, + const MemoryLocation &MemLoc, + DominatorTree *DT, + OrderedBasicBlock *OBB) { if (!DT) - return AliasAnalysis::ModRef; + return MRI_ModRef; - const Value *Object = GetUnderlyingObject(MemLoc.Ptr, *DL); + const Value *Object = + GetUnderlyingObject(MemLoc.Ptr, I->getModule()->getDataLayout()); if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object) || isa<Constant>(Object)) - return AliasAnalysis::ModRef; + return MRI_ModRef; ImmutableCallSite CS(I); if (!CS.getInstruction() || CS.getInstruction() == Object) - return AliasAnalysis::ModRef; + return MRI_ModRef; if (llvm::PointerMayBeCapturedBefore(Object, /* ReturnCaptures */ true, /* StoreCaptures */ true, I, DT, - /* include Object */ true)) - return AliasAnalysis::ModRef; + /* include Object */ true, + /* OrderedBasicBlock */ OBB)) + return MRI_ModRef; unsigned ArgNo = 0; - AliasAnalysis::ModRefResult R = AliasAnalysis::NoModRef; + ModRefInfo R = MRI_NoModRef; for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); CI != CE; ++CI, ++ArgNo) { // Only look at the no-capture or byval pointer arguments. If this @@ -389,50 +350,20 @@ AliasAnalysis::ModRefResult AliasAnalysis::callCapturesBefore( if (CS.doesNotAccessMemory(ArgNo)) continue; if (CS.onlyReadsMemory(ArgNo)) { - R = AliasAnalysis::Ref; + R = MRI_Ref; continue; } - return AliasAnalysis::ModRef; + return MRI_ModRef; } return R; } -// AliasAnalysis destructor: DO NOT move this to the header file for -// AliasAnalysis or else clients of the AliasAnalysis class may not depend on -// the AliasAnalysis.o file in the current .a file, causing alias analysis -// support to not be included in the tool correctly! -// -AliasAnalysis::~AliasAnalysis() {} - -/// InitializeAliasAnalysis - Subclasses must call this method to initialize the -/// AliasAnalysis interface before any other methods are called. -/// -void AliasAnalysis::InitializeAliasAnalysis(Pass *P, const DataLayout *NewDL) { - DL = NewDL; - auto *TLIP = P->getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &P->getAnalysis<AliasAnalysis>(); -} - -// getAnalysisUsage - All alias analysis implementations should invoke this -// directly (using AliasAnalysis::getAnalysisUsage(AU)). -void AliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); // All AA's chain -} - -/// getTypeStoreSize - Return the DataLayout store size for the given type, -/// if known, or a conservative value otherwise. -/// -uint64_t AliasAnalysis::getTypeStoreSize(Type *Ty) { - return DL ? DL->getTypeStoreSize(Ty) : MemoryLocation::UnknownSize; -} - /// canBasicBlockModify - Return true if it is possible for execution of the /// specified basic block to modify the location Loc. /// -bool AliasAnalysis::canBasicBlockModify(const BasicBlock &BB, - const MemoryLocation &Loc) { - return canInstructionRangeModRef(BB.front(), BB.back(), Loc, Mod); +bool AAResults::canBasicBlockModify(const BasicBlock &BB, + const MemoryLocation &Loc) { + return canInstructionRangeModRef(BB.front(), BB.back(), Loc, MRI_Mod); } /// canInstructionRangeModRef - Return true if it is possible for the @@ -440,28 +371,178 @@ bool AliasAnalysis::canBasicBlockModify(const BasicBlock &BB, /// mode) the location Loc. The instructions to consider are all /// of the instructions in the range of [I1,I2] INCLUSIVE. /// I1 and I2 must be in the same basic block. 
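canInstructionRangeModRef, documented above and defined just below, is a linear scan that asks getModRefInfo for each instruction in the inclusive range and tests the answer against the queried mode. The same scan over a toy per-instruction result list:

#include <vector>

enum ModRefInfo { MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3 };

bool rangeModRef(const std::vector<ModRefInfo> &PerInstruction,
                 ModRefInfo Mode) {
  for (ModRefInfo MRI : PerInstruction) // every instruction in [I1, I2]
    if (MRI & Mode)                     // any overlap with the queried mode
      return true;
  return false;
}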
-bool AliasAnalysis::canInstructionRangeModRef(const Instruction &I1, - const Instruction &I2, - const MemoryLocation &Loc, - const ModRefResult Mode) { +bool AAResults::canInstructionRangeModRef(const Instruction &I1, + const Instruction &I2, + const MemoryLocation &Loc, + const ModRefInfo Mode) { assert(I1.getParent() == I2.getParent() && "Instructions not in same basic block!"); - BasicBlock::const_iterator I = &I1; - BasicBlock::const_iterator E = &I2; + BasicBlock::const_iterator I = I1.getIterator(); + BasicBlock::const_iterator E = I2.getIterator(); ++E; // Convert from inclusive to exclusive range. for (; I != E; ++I) // Check every instruction in range - if (getModRefInfo(I, Loc) & Mode) + if (getModRefInfo(&*I, Loc) & Mode) return true; return false; } +// Provide a definition for the root virtual destructor. +AAResults::Concept::~Concept() {} + +namespace { +/// A wrapper pass for external alias analyses. This just squirrels away the +/// callback used to run any analyses and register their results. +struct ExternalAAWrapperPass : ImmutablePass { + typedef std::function<void(Pass &, Function &, AAResults &)> CallbackT; + + CallbackT CB; + + static char ID; + + ExternalAAWrapperPass() : ImmutablePass(ID) { + initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry()); + } + explicit ExternalAAWrapperPass(CallbackT CB) + : ImmutablePass(ID), CB(std::move(CB)) { + initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; +} + +char ExternalAAWrapperPass::ID = 0; +INITIALIZE_PASS(ExternalAAWrapperPass, "external-aa", "External Alias Analysis", + false, true) + +ImmutablePass * +llvm::createExternalAAWrapperPass(ExternalAAWrapperPass::CallbackT Callback) { + return new ExternalAAWrapperPass(std::move(Callback)); +} + +AAResultsWrapperPass::AAResultsWrapperPass() : FunctionPass(ID) { + initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +char AAResultsWrapperPass::ID = 0; + +INITIALIZE_PASS_BEGIN(AAResultsWrapperPass, "aa", + "Function Alias Analysis Results", false, true) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(CFLAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ExternalAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScopedNoAliasAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TypeBasedAAWrapperPass) +INITIALIZE_PASS_END(AAResultsWrapperPass, "aa", + "Function Alias Analysis Results", false, true) + +FunctionPass *llvm::createAAResultsWrapperPass() { + return new AAResultsWrapperPass(); +} + +/// Run the wrapper pass to rebuild an aggregation over known AA passes. +/// +/// This is the legacy pass manager's interface to the new-style AA results +/// aggregation object. Because this is somewhat shoe-horned into the legacy +/// pass manager, we hard code all the specific alias analyses available into +/// it. While the particular set enabled is configured via commandline flags, +/// adding a new alias analysis to LLVM will require adding support for it to +/// this list. +bool AAResultsWrapperPass::runOnFunction(Function &F) { + // NB! This *must* be reset before adding new AA results to the new + // AAResults object because in the legacy pass manager, each instance + // of these will refer to the *same* immutable analyses, registering and + // unregistering themselves with them. 
We need to carefully tear down the + // previous object first, in this case replacing it with an empty one, before + // registering new results. + AAR.reset(new AAResults()); + + // BasicAA is always available for function analyses. Also, we add it first + // so that it can trump TBAA results when it proves MustAlias. + // FIXME: TBAA should have an explicit mode to support this and then we + // should reconsider the ordering here. + if (!DisableBasicAA) + AAR->addAAResult(getAnalysis<BasicAAWrapperPass>().getResult()); + + // Populate the results with the currently available AAs. + if (auto *WrapperPass = getAnalysisIfAvailable<ScopedNoAliasAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<TypeBasedAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = + getAnalysisIfAvailable<objcarc::ObjCARCAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<GlobalsAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<SCEVAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = getAnalysisIfAvailable<CFLAAWrapperPass>()) + AAR->addAAResult(WrapperPass->getResult()); + + // If available, run an external AA providing callback over the results as + // well. + if (auto *WrapperPass = getAnalysisIfAvailable<ExternalAAWrapperPass>()) + if (WrapperPass->CB) + WrapperPass->CB(*this, F, *AAR); + + // Analyses don't mutate the IR, so return false. + return false; +} + +void AAResultsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<BasicAAWrapperPass>(); + + // We also need to mark all the alias analysis passes we will potentially + // probe in runOnFunction as used here to ensure the legacy pass manager + // preserves them. This hard coding of lists of alias analyses is specific to + // the legacy pass manager. + AU.addUsedIfAvailable<ScopedNoAliasAAWrapperPass>(); + AU.addUsedIfAvailable<TypeBasedAAWrapperPass>(); + AU.addUsedIfAvailable<objcarc::ObjCARCAAWrapperPass>(); + AU.addUsedIfAvailable<GlobalsAAWrapperPass>(); + AU.addUsedIfAvailable<SCEVAAWrapperPass>(); + AU.addUsedIfAvailable<CFLAAWrapperPass>(); +} + +AAResults llvm::createLegacyPMAAResults(Pass &P, Function &F, + BasicAAResult &BAR) { + AAResults AAR; + + // Add in our explicitly constructed BasicAA results. + if (!DisableBasicAA) + AAR.addAAResult(BAR); + + // Populate the results with the other currently available AAs. + if (auto *WrapperPass = + P.getAnalysisIfAvailable<ScopedNoAliasAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<TypeBasedAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = + P.getAnalysisIfAvailable<objcarc::ObjCARCAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<GlobalsAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<SCEVAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + if (auto *WrapperPass = P.getAnalysisIfAvailable<CFLAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + + return AAR; +} + /// isNoAliasCall - Return true if this pointer is returned by a noalias /// function. 
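The ExternalAAWrapperPass added above gives out-of-tree analyses a hook into the aggregation. A hedged usage sketch, assuming a legacy PassManager and leaving the body of the callback hypothetical:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/LegacyPassManager.h"

void registerExternalAA(llvm::legacy::PassManager &PM) {
  PM.add(llvm::createExternalAAWrapperPass(
      [](llvm::Pass &P, llvm::Function &F, llvm::AAResults &AAR) {
        // Hypothetical: construct a custom result object elsewhere and fold
        // it into the aggregation here via AAR.addAAResult(...).
        (void)P; (void)F; (void)AAR;
      }));
}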
bool llvm::isNoAliasCall(const Value *V) { - if (isa<CallInst>(V) || isa<InvokeInst>(V)) - return ImmutableCallSite(cast<Instruction>(V)) - .paramHasAttr(0, Attribute::NoAlias); + if (auto CS = ImmutableCallSite(V)) + return CS.paramHasAttr(0, Attribute::NoAlias); return false; } diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp deleted file mode 100644 index 9b6a5a4..0000000 --- a/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp +++ /dev/null @@ -1,173 +0,0 @@ -//===- AliasAnalysisCounter.cpp - Alias Analysis Query Counter ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a pass which can be used to count how many alias queries -// are being made and how the alias analysis implementation being used responds. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -static cl::opt<bool> -PrintAll("count-aa-print-all-queries", cl::ReallyHidden, cl::init(true)); -static cl::opt<bool> -PrintAllFailures("count-aa-print-all-failed-queries", cl::ReallyHidden); - -namespace { - class AliasAnalysisCounter : public ModulePass, public AliasAnalysis { - unsigned No, May, Partial, Must; - unsigned NoMR, JustRef, JustMod, MR; - Module *M; - public: - static char ID; // Class identification, replacement for typeinfo - AliasAnalysisCounter() : ModulePass(ID) { - initializeAliasAnalysisCounterPass(*PassRegistry::getPassRegistry()); - No = May = Partial = Must = 0; - NoMR = JustRef = JustMod = MR = 0; - } - - void printLine(const char *Desc, unsigned Val, unsigned Sum) { - errs() << " " << Val << " " << Desc << " responses (" - << Val*100/Sum << "%)\n"; - } - ~AliasAnalysisCounter() override { - unsigned AASum = No+May+Partial+Must; - unsigned MRSum = NoMR+JustRef+JustMod+MR; - if (AASum + MRSum) { // Print a report if any counted queries occurred... 
- errs() << "\n===== Alias Analysis Counter Report =====\n" - << " Analysis counted:\n" - << " " << AASum << " Total Alias Queries Performed\n"; - if (AASum) { - printLine("no alias", No, AASum); - printLine("may alias", May, AASum); - printLine("partial alias", Partial, AASum); - printLine("must alias", Must, AASum); - errs() << " Alias Analysis Counter Summary: " << No*100/AASum << "%/" - << May*100/AASum << "%/" - << Partial*100/AASum << "%/" - << Must*100/AASum<<"%\n\n"; - } - - errs() << " " << MRSum << " Total Mod/Ref Queries Performed\n"; - if (MRSum) { - printLine("no mod/ref", NoMR, MRSum); - printLine("ref", JustRef, MRSum); - printLine("mod", JustMod, MRSum); - printLine("mod/ref", MR, MRSum); - errs() << " Mod/Ref Analysis Counter Summary: " <<NoMR*100/MRSum - << "%/" << JustRef*100/MRSum << "%/" << JustMod*100/MRSum - << "%/" << MR*100/MRSum <<"%\n\n"; - } - } - } - - bool runOnModule(Module &M) override { - this->M = &M; - InitializeAliasAnalysis(this, &M.getDataLayout()); - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); - AU.setPreservesAll(); - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - // FIXME: We could count these too... - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override { - return getAnalysis<AliasAnalysis>().pointsToConstantMemory(Loc, OrLocal); - } - - // Forwarding functions: just delegate to a real AA implementation, counting - // the number of responses... 
- AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return AliasAnalysis::getModRefInfo(CS1,CS2); - } - }; -} - -char AliasAnalysisCounter::ID = 0; -INITIALIZE_AG_PASS(AliasAnalysisCounter, AliasAnalysis, "count-aa", - "Count Alias Analysis Query Responses", false, true, false) - -ModulePass *llvm::createAliasAnalysisCounterPass() { - return new AliasAnalysisCounter(); -} - -AliasResult AliasAnalysisCounter::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - AliasResult R = getAnalysis<AliasAnalysis>().alias(LocA, LocB); - - const char *AliasString = nullptr; - switch (R) { - case NoAlias: No++; AliasString = "No alias"; break; - case MayAlias: May++; AliasString = "May alias"; break; - case PartialAlias: Partial++; AliasString = "Partial alias"; break; - case MustAlias: Must++; AliasString = "Must alias"; break; - } - - if (PrintAll || (PrintAllFailures && R == MayAlias)) { - errs() << AliasString << ":\t"; - errs() << "[" << LocA.Size << "B] "; - LocA.Ptr->printAsOperand(errs(), true, M); - errs() << ", "; - errs() << "[" << LocB.Size << "B] "; - LocB.Ptr->printAsOperand(errs(), true, M); - errs() << "\n"; - } - - return R; -} - -AliasAnalysis::ModRefResult -AliasAnalysisCounter::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { - ModRefResult R = getAnalysis<AliasAnalysis>().getModRefInfo(CS, Loc); - - const char *MRString = nullptr; - switch (R) { - case NoModRef: NoMR++; MRString = "NoModRef"; break; - case Ref: JustRef++; MRString = "JustRef"; break; - case Mod: JustMod++; MRString = "JustMod"; break; - case ModRef: MR++; MRString = "ModRef"; break; - } - - if (PrintAll || (PrintAllFailures && R == ModRef)) { - errs() << MRString << ": Ptr: "; - errs() << "[" << Loc.Size << "B] "; - Loc.Ptr->printAsOperand(errs(), true, M); - errs() << "\t<->" << *CS.getInstruction() << '\n'; - } - return R; -} diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp index 5d1b001..12917b6 100644 --- a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp +++ b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp @@ -21,8 +21,10 @@ #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" @@ -57,7 +59,7 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.setPreservesAll(); } @@ -81,7 +83,7 @@ namespace { char AAEval::ID = 0; INITIALIZE_PASS_BEGIN(AAEval, "aa-eval", "Exhaustive Alias Analysis Precision Evaluator", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(AAEval, "aa-eval", "Exhaustive Alias Analysis Precision Evaluator", false, true) @@ -139,16 +141,17 @@ static inline bool isInterestingPointer(Value *V) { } bool AAEval::runOnFunction(Function &F) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + const DataLayout &DL = F.getParent()->getDataLayout(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); SetVector<Value *> 
Pointers; - SetVector<CallSite> CallSites; + SmallSetVector<CallSite, 16> CallSites; SetVector<Value *> Loads; SetVector<Value *> Stores; - for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) - if (I->getType()->isPointerTy()) // Add all pointer arguments. - Pointers.insert(I); + for (auto &I : F.args()) + if (I.getType()->isPointerTy()) // Add all pointer arguments. + Pointers.insert(&I); for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) { if (I->getType()->isPointerTy()) // Add all pointer instructions. @@ -164,10 +167,9 @@ bool AAEval::runOnFunction(Function &F) { if (!isa<Function>(Callee) && isInterestingPointer(Callee)) Pointers.insert(Callee); // Consider formals. - for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - AI != AE; ++AI) - if (isInterestingPointer(*AI)) - Pointers.insert(*AI); + for (Use &DataOp : CS.data_ops()) + if (isInterestingPointer(DataOp)) + Pointers.insert(DataOp); CallSites.insert(CS); } else { // Consider all operands. @@ -188,12 +190,12 @@ bool AAEval::runOnFunction(Function &F) { I1 != E; ++I1) { uint64_t I1Size = MemoryLocation::UnknownSize; Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType(); - if (I1ElTy->isSized()) I1Size = AA.getTypeStoreSize(I1ElTy); + if (I1ElTy->isSized()) I1Size = DL.getTypeStoreSize(I1ElTy); for (SetVector<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) { uint64_t I2Size = MemoryLocation::UnknownSize; Type *I2ElTy =cast<PointerType>((*I2)->getType())->getElementType(); - if (I2ElTy->isSized()) I2Size = AA.getTypeStoreSize(I2ElTy); + if (I2ElTy->isSized()) I2Size = DL.getTypeStoreSize(I2ElTy); switch (AA.alias(*I1, I1Size, *I2, I2Size)) { case NoAlias: @@ -281,30 +283,29 @@ bool AAEval::runOnFunction(Function &F) { } // Mod/ref alias analysis: compare all pairs of calls and values - for (SetVector<CallSite>::iterator C = CallSites.begin(), - Ce = CallSites.end(); C != Ce; ++C) { + for (auto C = CallSites.begin(), Ce = CallSites.end(); C != Ce; ++C) { Instruction *I = C->getInstruction(); for (SetVector<Value *>::iterator V = Pointers.begin(), Ve = Pointers.end(); V != Ve; ++V) { uint64_t Size = MemoryLocation::UnknownSize; Type *ElTy = cast<PointerType>((*V)->getType())->getElementType(); - if (ElTy->isSized()) Size = AA.getTypeStoreSize(ElTy); + if (ElTy->isSized()) Size = DL.getTypeStoreSize(ElTy); switch (AA.getModRefInfo(*C, *V, Size)) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: PrintModRefResults("NoModRef", PrintNoModRef, I, *V, F.getParent()); ++NoModRefCount; break; - case AliasAnalysis::Mod: + case MRI_Mod: PrintModRefResults("Just Mod", PrintMod, I, *V, F.getParent()); ++ModCount; break; - case AliasAnalysis::Ref: + case MRI_Ref: PrintModRefResults("Just Ref", PrintRef, I, *V, F.getParent()); ++RefCount; break; - case AliasAnalysis::ModRef: + case MRI_ModRef: PrintModRefResults("Both ModRef", PrintModRef, I, *V, F.getParent()); ++ModRefCount; break; @@ -313,25 +314,24 @@ bool AAEval::runOnFunction(Function &F) { } // Mod/ref alias analysis: compare all pairs of calls - for (SetVector<CallSite>::iterator C = CallSites.begin(), - Ce = CallSites.end(); C != Ce; ++C) { - for (SetVector<CallSite>::iterator D = CallSites.begin(); D != Ce; ++D) { + for (auto C = CallSites.begin(), Ce = CallSites.end(); C != Ce; ++C) { + for (auto D = CallSites.begin(); D != Ce; ++D) { if (D == C) continue; switch (AA.getModRefInfo(*C, *D)) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: PrintModRefResults("NoModRef", PrintNoModRef, *C, *D, 
F.getParent()); ++NoModRefCount; break; - case AliasAnalysis::Mod: + case MRI_Mod: PrintModRefResults("Just Mod", PrintMod, *C, *D, F.getParent()); ++ModCount; break; - case AliasAnalysis::Ref: + case MRI_Ref: PrintModRefResults("Just Ref", PrintRef, *C, *D, F.getParent()); ++RefCount; break; - case AliasAnalysis::ModRef: + case MRI_ModRef: PrintModRefResults("Both ModRef", PrintModRef, *C, *D, F.getParent()); ++ModRefCount; break; diff --git a/contrib/llvm/lib/Analysis/AliasDebugger.cpp b/contrib/llvm/lib/Analysis/AliasDebugger.cpp deleted file mode 100644 index e5107b3..0000000 --- a/contrib/llvm/lib/Analysis/AliasDebugger.cpp +++ /dev/null @@ -1,136 +0,0 @@ -//===- AliasDebugger.cpp - Simple Alias Analysis Use Checker --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This simple pass checks alias analysis users to ensure that if they -// create a new value, they do not query AA without informing it of the value. -// It acts as a shim over any other AA pass you want. -// -// Yes keeping track of every value in the program is expensive, but this is -// a debugging pass. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include <set> -using namespace llvm; - -namespace { - - class AliasDebugger : public ModulePass, public AliasAnalysis { - - //What we do is simple. Keep track of every value the AA could - //know about, and verify that queries are one of those. - //A query to a value that didn't exist when the AA was created - //means someone forgot to update the AA when creating new values - - std::set<const Value*> Vals; - - public: - static char ID; // Class identification, replacement for typeinfo - AliasDebugger() : ModulePass(ID) { - initializeAliasDebuggerPass(*PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override { - InitializeAliasAnalysis(this, &M.getDataLayout()); // set up super class - - for(Module::global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { - Vals.insert(&*I); - for (User::const_op_iterator OI = I->op_begin(), - OE = I->op_end(); OI != OE; ++OI) - Vals.insert(*OI); - } - - for(Module::iterator I = M.begin(), - E = M.end(); I != E; ++I){ - Vals.insert(&*I); - if(!I->isDeclaration()) { - for (Function::arg_iterator AI = I->arg_begin(), AE = I->arg_end(); - AI != AE; ++AI) - Vals.insert(&*AI); - for (Function::const_iterator FI = I->begin(), FE = I->end(); - FI != FE; ++FI) - for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - Vals.insert(&*BI); - for (User::const_op_iterator OI = BI->op_begin(), - OE = BI->op_end(); OI != OE; ++OI) - Vals.insert(*OI); - } - } - - } - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - AU.setPreservesAll(); // Does not transform code - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. 
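getAdjustedAnalysisPointer, documented above in the deleted AliasDebugger, exists because a pass may implement an analysis interface through multiple inheritance, where the two base subobjects live at different addresses and reusing the wrong base pointer is incorrect. A stand-alone illustration with made-up types:

#include <cassert>

struct PassBase { virtual ~PassBase() = default; int PassData = 0; };
struct Interface { virtual ~Interface() = default; int IfaceData = 0; };
struct Combined : PassBase, Interface {};

int main() {
  Combined C;
  // static_cast adjusts the pointer to the Interface subobject; the two base
  // addresses differ, which is why a blind pointer reuse would be wrong.
  Interface *I = static_cast<Interface *>(&C);
  assert(static_cast<void *>(I) !=
         static_cast<void *>(static_cast<PassBase *>(&C)));
  return 0;
}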
- void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - //------------------------------------------------ - // Implement the AliasAnalysis API - // - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - assert(Vals.find(LocA.Ptr) != Vals.end() && - "Never seen value in AA before"); - assert(Vals.find(LocB.Ptr) != Vals.end() && - "Never seen value in AA before"); - return AliasAnalysis::alias(LocA, LocB); - } - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override { - assert(Vals.find(Loc.Ptr) != Vals.end() && "Never seen value in AA before"); - return AliasAnalysis::getModRefInfo(CS, Loc); - } - - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return AliasAnalysis::getModRefInfo(CS1,CS2); - } - - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override { - assert(Vals.find(Loc.Ptr) != Vals.end() && "Never seen value in AA before"); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); - } - - void deleteValue(Value *V) override { - assert(Vals.find(V) != Vals.end() && "Never seen value in AA before"); - AliasAnalysis::deleteValue(V); - } - - }; -} - -char AliasDebugger::ID = 0; -INITIALIZE_AG_PASS(AliasDebugger, AliasAnalysis, "debug-aa", - "AA use debugger", false, true, false) - -Pass *llvm::createAliasDebugger() { return new AliasDebugger(); } - diff --git a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp index 54d0f43..3094049 100644 --- a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp +++ b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" @@ -167,8 +168,7 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size, if (!UnknownInsts.empty()) { for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) if (AA.getModRefInfo(UnknownInsts[i], - MemoryLocation(Ptr, Size, AAInfo)) != - AliasAnalysis::NoModRef) + MemoryLocation(Ptr, Size, AAInfo)) != MRI_NoModRef) return true; } @@ -182,16 +182,14 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst, for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) { ImmutableCallSite C1(getUnknownInst(i)), C2(Inst); - if (!C1 || !C2 || - AA.getModRefInfo(C1, C2) != AliasAnalysis::NoModRef || - AA.getModRefInfo(C2, C1) != AliasAnalysis::NoModRef) + if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef || + AA.getModRefInfo(C2, C1) != MRI_NoModRef) return true; } for (iterator I = begin(), E = end(); I != E; ++I) - if (AA.getModRefInfo( - Inst, MemoryLocation(I.getPointer(), I.getSize(), I.getAAInfo())) != - AliasAnalysis::NoModRef) + if (AA.getModRefInfo(Inst, MemoryLocation(I.getPointer(), I.getSize(), + I.getAAInfo())) != MRI_NoModRef) return true; return false; @@ -223,7 +221,7 @@ AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr, if (Cur->Forward || !Cur->aliasesPointer(Ptr, Size, AAInfo, AA)) continue; if (!FoundSet) { // If this is the first alias set ptr can go into. - FoundSet = Cur; // Remember it. + FoundSet = &*Cur; // Remember it. } else { // Otherwise, we must merge the sets. FoundSet->mergeSetIn(*Cur, *this); // Merge in contents. 
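findAliasSetForPointer above is a find-or-merge: the first set that aliases the pointer is remembered, and every later aliasing set is merged into it, leaving a Forward marker behind. The same control flow over a toy set type, with integer membership standing in for real alias queries:

#include <vector>

struct ToySet {
  bool Forward = false; // true once this set has been merged away
  std::vector<int> Members;
  bool aliases(int X) const {
    for (int M : Members)
      if (M == X) // toy stand-in for AliasSet::aliasesPointer
        return true;
    return false;
  }
};

ToySet *findOrMerge(std::vector<ToySet> &Sets, int X) {
  ToySet *Found = nullptr;
  for (ToySet &Cur : Sets) {
    if (Cur.Forward || !Cur.aliases(X))
      continue;
    if (!Found) {
      Found = &Cur; // first matching set: remember it
    } else {
      // Merge any further matching set into the first and forward it.
      Found->Members.insert(Found->Members.end(), Cur.Members.begin(),
                            Cur.Members.end());
      Cur.Members.clear();
      Cur.Forward = true;
    }
  }
  return Found;
}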
} @@ -257,7 +255,7 @@ AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) { if (Cur->Forward || !Cur->aliasesUnknownInst(Inst, AA)) continue; if (!FoundSet) // If this is the first alias set ptr can go into. - FoundSet = Cur; // Remember it. + FoundSet = &*Cur; // Remember it. else if (!Cur->Forward) // Otherwise, we must merge the sets. FoundSet->mergeSetIn(*Cur, *this); // Merge in contents. } @@ -309,8 +307,9 @@ bool AliasSetTracker::add(LoadInst *LI) { AliasSet::AccessLattice Access = AliasSet::RefAccess; bool NewPtr; + const DataLayout &DL = LI->getModule()->getDataLayout(); AliasSet &AS = addPointer(LI->getOperand(0), - AA.getTypeStoreSize(LI->getType()), + DL.getTypeStoreSize(LI->getType()), AAInfo, Access, NewPtr); if (LI->isVolatile()) AS.setVolatile(); return NewPtr; @@ -324,9 +323,10 @@ bool AliasSetTracker::add(StoreInst *SI) { AliasSet::AccessLattice Access = AliasSet::ModAccess; bool NewPtr; + const DataLayout &DL = SI->getModule()->getDataLayout(); Value *Val = SI->getOperand(0); AliasSet &AS = addPointer(SI->getOperand(1), - AA.getTypeStoreSize(Val->getType()), + DL.getTypeStoreSize(Val->getType()), AAInfo, Access, NewPtr); if (SI->isVolatile()) AS.setVolatile(); return NewPtr; @@ -372,8 +372,8 @@ bool AliasSetTracker::add(Instruction *I) { } void AliasSetTracker::add(BasicBlock &BB) { - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) - add(I); + for (auto &I : BB) + add(&I); } void AliasSetTracker::add(const AliasSetTracker &AST) { @@ -443,7 +443,8 @@ AliasSetTracker::remove(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo) { } bool AliasSetTracker::remove(LoadInst *LI) { - uint64_t Size = AA.getTypeStoreSize(LI->getType()); + const DataLayout &DL = LI->getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(LI->getType()); AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); @@ -455,7 +456,8 @@ bool AliasSetTracker::remove(LoadInst *LI) { } bool AliasSetTracker::remove(StoreInst *SI) { - uint64_t Size = AA.getTypeStoreSize(SI->getOperand(0)->getType()); + const DataLayout &DL = SI->getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(SI->getOperand(0)->getType()); AAMDNodes AAInfo; SI->getAAMetadata(AAInfo); @@ -505,9 +507,6 @@ bool AliasSetTracker::remove(Instruction *I) { // dangling pointers to deleted instructions. // void AliasSetTracker::deleteValue(Value *PtrVal) { - // Notify the alias analysis implementation that this value is gone. - AA.deleteValue(PtrVal); - // If this is a call instruction, remove the callsite from the appropriate // AliasSet (if present). 
if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) { @@ -650,11 +649,12 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } bool runOnFunction(Function &F) override { - Tracker = new AliasSetTracker(getAnalysis<AliasAnalysis>()); + auto &AAWP = getAnalysis<AAResultsWrapperPass>(); + Tracker = new AliasSetTracker(AAWP.getAAResults()); for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) Tracker->add(&*I); @@ -668,6 +668,6 @@ namespace { char AliasSetPrinter::ID = 0; INITIALIZE_PASS_BEGIN(AliasSetPrinter, "print-alias-sets", "Alias Set Printer", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(AliasSetPrinter, "print-alias-sets", "Alias Set Printer", false, true) diff --git a/contrib/llvm/lib/Analysis/Analysis.cpp b/contrib/llvm/lib/Analysis/Analysis.cpp index 842ff0a..9c1ac00 100644 --- a/contrib/llvm/lib/Analysis/Analysis.cpp +++ b/contrib/llvm/lib/Analysis/Analysis.cpp @@ -20,23 +20,23 @@ using namespace llvm; /// initializeAnalysis - Initialize all passes linked into the Analysis library. void llvm::initializeAnalysis(PassRegistry &Registry) { - initializeAliasAnalysisAnalysisGroup(Registry); - initializeAliasAnalysisCounterPass(Registry); initializeAAEvalPass(Registry); - initializeAliasDebuggerPass(Registry); initializeAliasSetPrinterPass(Registry); - initializeNoAAPass(Registry); - initializeBasicAliasAnalysisPass(Registry); - initializeBlockFrequencyInfoPass(Registry); - initializeBranchProbabilityInfoPass(Registry); + initializeBasicAAWrapperPassPass(Registry); + initializeBlockFrequencyInfoWrapperPassPass(Registry); + initializeBranchProbabilityInfoWrapperPassPass(Registry); + initializeCallGraphWrapperPassPass(Registry); + initializeCallGraphPrinterPass(Registry); + initializeCallGraphViewerPass(Registry); initializeCostModelAnalysisPass(Registry); initializeCFGViewerPass(Registry); initializeCFGPrinterPass(Registry); initializeCFGOnlyViewerPass(Registry); initializeCFGOnlyPrinterPass(Registry); - initializeCFLAliasAnalysisPass(Registry); + initializeCFLAAWrapperPassPass(Registry); initializeDependenceAnalysisPass(Registry); initializeDelinearizationPass(Registry); + initializeDemandedBitsPass(Registry); initializeDivergenceAnalysisPass(Registry); initializeDominanceFrontierPass(Registry); initializeDomViewerPass(Registry); @@ -47,34 +47,40 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializePostDomPrinterPass(Registry); initializePostDomOnlyViewerPass(Registry); initializePostDomOnlyPrinterPass(Registry); + initializeAAResultsWrapperPassPass(Registry); + initializeGlobalsAAWrapperPassPass(Registry); initializeIVUsersPass(Registry); initializeInstCountPass(Registry); initializeIntervalPartitionPass(Registry); initializeLazyValueInfoPass(Registry); - initializeLibCallAliasAnalysisPass(Registry); initializeLintPass(Registry); initializeLoopInfoWrapperPassPass(Registry); initializeMemDepPrinterPass(Registry); initializeMemDerefPrinterPass(Registry); initializeMemoryDependenceAnalysisPass(Registry); initializeModuleDebugInfoPrinterPass(Registry); + initializeObjCARCAAWrapperPassPass(Registry); initializePostDominatorTreePass(Registry); initializeRegionInfoPassPass(Registry); initializeRegionViewerPass(Registry); initializeRegionPrinterPass(Registry); initializeRegionOnlyViewerPass(Registry); initializeRegionOnlyPrinterPass(Registry); - 
initializeScalarEvolutionPass(Registry); - initializeScalarEvolutionAliasAnalysisPass(Registry); + initializeSCEVAAWrapperPassPass(Registry); + initializeScalarEvolutionWrapperPassPass(Registry); initializeTargetTransformInfoWrapperPassPass(Registry); - initializeTypeBasedAliasAnalysisPass(Registry); - initializeScopedNoAliasAAPass(Registry); + initializeTypeBasedAAWrapperPassPass(Registry); + initializeScopedNoAliasAAWrapperPassPass(Registry); } void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { initializeAnalysis(*unwrap(R)); } +void LLVMInitializeIPA(LLVMPassRegistryRef R) { + initializeAnalysis(*unwrap(R)); +} + LLVMBool LLVMVerifyModule(LLVMModuleRef M, LLVMVerifierFailureAction Action, char **OutMessages) { raw_ostream *DebugOS = Action != LLVMReturnStatusAction ? &errs() : nullptr; diff --git a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 3586354..c3d2803 100644 --- a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -13,24 +13,21 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/Passes.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" @@ -42,6 +39,18 @@ #include <algorithm> using namespace llvm; +/// Enable analysis of recursive PHI nodes. +static cl::opt<bool> EnableRecPhiAnalysis("basicaa-recphi", cl::Hidden, + cl::init(false)); + +/// SearchLimitReached / SearchTimes shows how often the limit of +/// to decompose GEPs is reached. It will affect the precision +/// of basic alias analysis. +#define DEBUG_TYPE "basicaa" +STATISTIC(SearchLimitReached, "Number of times the limit to " + "decompose GEPs is reached"); +STATISTIC(SearchTimes, "Number of times a GEP is decomposed"); + /// Cutoff after which to stop analysing a set of phi nodes potentially involved /// in a cycle. Because we are analysing 'through' phi nodes we need to be /// careful with value equivalence. We use reachability to make sure a value @@ -57,8 +66,8 @@ static const unsigned MaxLookupSearchDepth = 6; // Useful predicates //===----------------------------------------------------------------------===// -/// isNonEscapingLocalObject - Return true if the pointer is to a function-local -/// object that never escapes from the function. +/// Returns true if the pointer is to a function-local object that never +/// escapes from the function. static bool isNonEscapingLocalObject(const Value *V) { // If this is a local allocation, check to see if it escapes. 
if (isa<AllocaInst>(V) || isNoAliasCall(V)) @@ -82,8 +91,8 @@ static bool isNonEscapingLocalObject(const Value *V) { return false; } -/// isEscapeSource - Return true if the pointer is one which would have -/// been considered an escape by isNonEscapingLocalObject. +/// Returns true if the pointer is one which would have been considered an +/// escape by isNonEscapingLocalObject. static bool isEscapeSource(const Value *V) { if (isa<CallInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V)) return true; @@ -97,8 +106,7 @@ static bool isEscapeSource(const Value *V) { return false; } -/// getObjectSize - Return the size of the object specified by V, or -/// UnknownSize if unknown. +/// Returns the size of the object specified by V, or UnknownSize if unknown. static uint64_t getObjectSize(const Value *V, const DataLayout &DL, const TargetLibraryInfo &TLI, bool RoundToAlign = false) { @@ -108,8 +116,8 @@ static uint64_t getObjectSize(const Value *V, const DataLayout &DL, return MemoryLocation::UnknownSize; } -/// isObjectSmallerThan - Return true if we can prove that the object specified -/// by V is smaller than Size. +/// Returns true if we can prove that the object specified by V is smaller than +/// Size. static bool isObjectSmallerThan(const Value *V, uint64_t Size, const DataLayout &DL, const TargetLibraryInfo &TLI) { @@ -144,15 +152,14 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size, // This function needs to use the aligned object size because we allow // reads a bit past the end given sufficient alignment. - uint64_t ObjectSize = getObjectSize(V, DL, TLI, /*RoundToAlign*/true); + uint64_t ObjectSize = getObjectSize(V, DL, TLI, /*RoundToAlign*/ true); return ObjectSize != MemoryLocation::UnknownSize && ObjectSize < Size; } -/// isObjectSize - Return true if we can prove that the object specified -/// by V has size Size. -static bool isObjectSize(const Value *V, uint64_t Size, - const DataLayout &DL, const TargetLibraryInfo &TLI) { +/// Returns true if we can prove that the object specified by V has size Size. +static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL, + const TargetLibraryInfo &TLI) { uint64_t ObjectSize = getObjectSize(V, DL, TLI); return ObjectSize != MemoryLocation::UnknownSize && ObjectSize == Size; } @@ -161,42 +168,20 @@ static bool isObjectSize(const Value *V, uint64_t Size, // GetElementPtr Instruction Decomposition and Analysis //===----------------------------------------------------------------------===// -namespace { - enum ExtensionKind { - EK_NotExtended, - EK_SignExt, - EK_ZeroExt - }; - - struct VariableGEPIndex { - const Value *V; - ExtensionKind Extension; - int64_t Scale; - - bool operator==(const VariableGEPIndex &Other) const { - return V == Other.V && Extension == Other.Extension && - Scale == Other.Scale; - } - - bool operator!=(const VariableGEPIndex &Other) const { - return !operator==(Other); - } - }; -} - - -/// GetLinearExpression - Analyze the specified value as a linear expression: -/// "A*V + B", where A and B are constant integers. Return the scale and offset -/// values as APInts and return V as a Value*, and return whether we looked -/// through any sign or zero extends. The incoming Value is known to have -/// IntegerType and it may already be sign or zero extended. +/// Analyzes the specified value as a linear expression: "A*V + B", where A and +/// B are constant integers. 
+/// +/// Returns the scale and offset values as APInts, returns V as a Value*, and +/// reports whether we looked through any sign or zero extends. The incoming +/// Value is known to have IntegerType and it may already be sign or zero +/// extended. /// /// Note that this looks through extends, so the high bits may not be /// represented in the result. -static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, - ExtensionKind &Extension, - const DataLayout &DL, unsigned Depth, - AssumptionCache *AC, DominatorTree *DT) { +/*static*/ const Value *BasicAAResult::GetLinearExpression( + const Value *V, APInt &Scale, APInt &Offset, unsigned &ZExtBits, + unsigned &SExtBits, const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, DominatorTree *DT, bool &NSW, bool &NUW) { assert(V->getType()->isIntegerTy() && "Not an integer value"); // Limit our recursion depth. @@ -206,54 +191,125 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, return V; } - if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { + if (const ConstantInt *Const = dyn_cast<ConstantInt>(V)) { + // If it's a constant, just convert it to an offset and remove the variable. + // If we've been called recursively the Offset bit width will be greater + // than the constant's (the Offset is always as wide as the outermost call), + // so we'll zext here and process any extension in the isa<SExtInst> & + // isa<ZExtInst> cases below. + Offset += Const->getValue().zextOrSelf(Offset.getBitWidth()); + assert(Scale == 0 && "Constant values don't have a scale"); + return V; + } + + if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) { if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) { + + // If we've been called recursively then Offset and Scale will be wider + // than the BOp operands. We'll always zext it here as we'll process sign + // extensions below (see the isa<SExtInst> / isa<ZExtInst> cases). + APInt RHS = RHSC->getValue().zextOrSelf(Offset.getBitWidth()); + switch (BOp->getOpcode()) { - default: break; + default: + // We don't understand this instruction, so we can't decompose it any + // further. + Scale = 1; + Offset = 0; + return V; case Instruction::Or: // X|C == X+C if all the bits in C are unset in X. Otherwise we can't // analyze it. if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), DL, 0, AC, - BOp, DT)) - break; - // FALL THROUGH. + BOp, DT)) { + Scale = 1; + Offset = 0; + return V; + } + // FALL THROUGH.
case Instruction::Add: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, - DL, Depth + 1, AC, DT); - Offset += RHSC->getValue(); - return V; + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset += RHS; + break; + case Instruction::Sub: + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset -= RHS; + break; case Instruction::Mul: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, - DL, Depth + 1, AC, DT); - Offset *= RHSC->getValue(); - Scale *= RHSC->getValue(); - return V; + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset *= RHS; + Scale *= RHS; + break; case Instruction::Shl: - V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, - DL, Depth + 1, AC, DT); - Offset <<= RHSC->getValue().getLimitedValue(); - Scale <<= RHSC->getValue().getLimitedValue(); + V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits, + SExtBits, DL, Depth + 1, AC, DT, NSW, NUW); + Offset <<= RHS.getLimitedValue(); + Scale <<= RHS.getLimitedValue(); + // The semantics of nsw and nuw for left shifts don't match those of + // multiplications, so we won't propagate them. + NSW = NUW = false; return V; } + + if (isa<OverflowingBinaryOperator>(BOp)) { + NUW &= BOp->hasNoUnsignedWrap(); + NSW &= BOp->hasNoSignedWrap(); + } + return V; } } // Since GEP indices are sign extended anyway, we don't care about the high // bits of a sign or zero extended value - just scales and offsets. The // extensions have to be consistent though. - if ((isa<SExtInst>(V) && Extension != EK_ZeroExt) || - (isa<ZExtInst>(V) && Extension != EK_SignExt)) { + if (isa<SExtInst>(V) || isa<ZExtInst>(V)) { Value *CastOp = cast<CastInst>(V)->getOperand(0); - unsigned OldWidth = Scale.getBitWidth(); + unsigned NewWidth = V->getType()->getPrimitiveSizeInBits(); unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits(); - Scale = Scale.trunc(SmallWidth); - Offset = Offset.trunc(SmallWidth); - Extension = isa<SExtInst>(V) ? EK_SignExt : EK_ZeroExt; - - Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension, DL, - Depth + 1, AC, DT); - Scale = Scale.zext(OldWidth); - Offset = Offset.zext(OldWidth); + unsigned OldZExtBits = ZExtBits, OldSExtBits = SExtBits; + const Value *Result = + GetLinearExpression(CastOp, Scale, Offset, ZExtBits, SExtBits, DL, + Depth + 1, AC, DT, NSW, NUW); + + // zext(zext(%x)) == zext(%x), and similarly for sext; we'll handle this + // by just incrementing the number of bits we've extended by. + unsigned ExtendedBy = NewWidth - SmallWidth; + + if (isa<SExtInst>(V) && ZExtBits == 0) { + // sext(sext(%x, a), b) == sext(%x, a + b) + + if (NSW) { + // We haven't sign-wrapped, so it's valid to decompose sext(%x + c) + // into sext(%x) + sext(c).
We'll sext the Offset ourselves: + unsigned OldWidth = Offset.getBitWidth(); + Offset = Offset.trunc(SmallWidth).sext(NewWidth).zextOrSelf(OldWidth); + } else { + // We may have signed-wrapped, so don't decompose sext(%x + c) into + // sext(%x) + sext(c). + Scale = 1; + Offset = 0; + Result = CastOp; + ZExtBits = OldZExtBits; + SExtBits = OldSExtBits; + } + SExtBits += ExtendedBy; + } else { + // sext(zext(%x, a), b) = zext(zext(%x, a), b) = zext(%x, a + b) + + if (!NUW) { + // We may have unsigned-wrapped, so don't decompose zext(%x + c) into + // zext(%x) + zext(c). + Scale = 1; + Offset = 0; + Result = CastOp; + ZExtBits = OldZExtBits; + SExtBits = OldSExtBits; + } + ZExtBits += ExtendedBy; + } return Result; } @@ -263,29 +319,27 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset, return V; } -/// DecomposeGEPExpression - If V is a symbolic pointer expression, decompose it -/// into a base pointer with a constant offset and a number of scaled symbolic -/// offsets. +/// If V is a symbolic pointer expression, decompose it into a base pointer +/// with a constant offset and a number of scaled symbolic offsets. /// -/// The scaled symbolic offsets (represented by pairs of a Value* and a scale in -/// the VarIndices vector) are Value*'s that are known to be scaled by the -/// specified amount, but which may have other unrepresented high bits. As such, -/// the gep cannot necessarily be reconstructed from its decomposed form. +/// The scaled symbolic offsets (represented by pairs of a Value* and a scale +/// in the VarIndices vector) are Value*'s that are known to be scaled by the +/// specified amount, but which may have other unrepresented high bits. As +/// such, the gep cannot necessarily be reconstructed from its decomposed form. /// /// When DataLayout is around, this function is capable of analyzing everything /// that GetUnderlyingObject can look through. To be able to do that /// GetUnderlyingObject and DecomposeGEPExpression must use the same search -/// depth (MaxLookupSearchDepth). -/// When DataLayout not is around, it just looks through pointer casts. -/// -static const Value * -DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, - SmallVectorImpl<VariableGEPIndex> &VarIndices, - bool &MaxLookupReached, const DataLayout &DL, - AssumptionCache *AC, DominatorTree *DT) { +/// depth (MaxLookupSearchDepth). When DataLayout is not around, it just looks +/// through pointer casts. +/*static*/ const Value *BasicAAResult::DecomposeGEPExpression( + const Value *V, int64_t &BaseOffs, + SmallVectorImpl<VariableGEPIndex> &VarIndices, bool &MaxLookupReached, + const DataLayout &DL, AssumptionCache *AC, DominatorTree *DT) { // Limit recursion depth to limit compile time in crazy cases. unsigned MaxLookup = MaxLookupSearchDepth; MaxLookupReached = false; + SearchTimes++; BaseOffs = 0; do { @@ -318,7 +372,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, // updated when GetUnderlyingObject is updated). TLI should be // provided also. if (const Value *Simplified = - SimplifyInstruction(const_cast<Instruction *>(I), DL)) { + SimplifyInstruction(const_cast<Instruction *>(I), DL)) { V = Simplified; continue; } @@ -333,43 +387,47 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, unsigned AS = GEPOp->getPointerAddressSpace(); // Walk the indices of the GEP, accumulating them into BaseOffs/VarIndices.
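// A hypothetical illustration (names assumed, not from this patch): for "%p = getelementptr [10 x i32], [10 x i32]* %base, i64 1, i64 %i" the constant index contributes 40 bytes (one whole [10 x i32]) to BaseOffs, while the variable index is recorded as a VariableGEPIndex with V == %i and Scale == 4 (the i32 element size).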
gep_type_iterator GTI = gep_type_begin(GEPOp); - for (User::const_op_iterator I = GEPOp->op_begin()+1, - E = GEPOp->op_end(); I != E; ++I) { - Value *Index = *I; + for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end(); + I != E; ++I) { + const Value *Index = *I; // Compute the (potentially symbolic) offset in bytes for this index. if (StructType *STy = dyn_cast<StructType>(*GTI++)) { // For a struct, add the member offset. unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue(); - if (FieldNo == 0) continue; + if (FieldNo == 0) + continue; BaseOffs += DL.getStructLayout(STy)->getElementOffset(FieldNo); continue; } // For an array/pointer, add the element offset, explicitly scaled. - if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) { - if (CIdx->isZero()) continue; + if (const ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) { + if (CIdx->isZero()) + continue; BaseOffs += DL.getTypeAllocSize(*GTI) * CIdx->getSExtValue(); continue; } uint64_t Scale = DL.getTypeAllocSize(*GTI); - ExtensionKind Extension = EK_NotExtended; + unsigned ZExtBits = 0, SExtBits = 0; // If the integer type is smaller than the pointer size, it is implicitly // sign extended to pointer size. unsigned Width = Index->getType()->getIntegerBitWidth(); - if (DL.getPointerSizeInBits(AS) > Width) - Extension = EK_SignExt; + unsigned PointerSize = DL.getPointerSizeInBits(AS); + if (PointerSize > Width) + SExtBits += PointerSize - Width; // Use GetLinearExpression to decompose the index into a C1*V+C2 form. APInt IndexScale(Width, 0), IndexOffset(Width, 0); - Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension, DL, - 0, AC, DT); + bool NSW = true, NUW = true; + Index = GetLinearExpression(Index, IndexScale, IndexOffset, ZExtBits, + SExtBits, DL, 0, AC, DT, NSW, NUW); // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale. // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale. - BaseOffs += IndexOffset.getSExtValue()*Scale; + BaseOffs += IndexOffset.getSExtValue() * Scale; Scale *= IndexScale.getSExtValue(); // If we already had an occurrence of this index variable, merge this @@ -377,23 +435,23 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, // A[x][x] -> x*16 + x*4 -> x*20 // This also ensures that 'x' only appears in the index list once. for (unsigned i = 0, e = VarIndices.size(); i != e; ++i) { - if (VarIndices[i].V == Index && - VarIndices[i].Extension == Extension) { + if (VarIndices[i].V == Index && VarIndices[i].ZExtBits == ZExtBits && + VarIndices[i].SExtBits == SExtBits) { Scale += VarIndices[i].Scale; - VarIndices.erase(VarIndices.begin()+i); + VarIndices.erase(VarIndices.begin() + i); break; } } // Make sure that we have a scale that makes sense for this target's // pointer size. - if (unsigned ShiftBits = 64 - DL.getPointerSizeInBits(AS)) { + if (unsigned ShiftBits = 64 - PointerSize) { Scale <<= ShiftBits; Scale = (int64_t)Scale >> ShiftBits; } if (Scale) { - VariableGEPIndex Entry = {Index, Extension, + VariableGEPIndex Entry = {Index, ZExtBits, SExtBits, static_cast<int64_t>(Scale)}; VarIndices.push_back(Entry); } @@ -405,196 +463,25 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, // If the chain of expressions is too deep, just return early. 
MaxLookupReached = true; + SearchLimitReached++; return V; } -//===----------------------------------------------------------------------===// -// BasicAliasAnalysis Pass -//===----------------------------------------------------------------------===// - -#ifndef NDEBUG -static const Function *getParent(const Value *V) { - if (const Instruction *inst = dyn_cast<Instruction>(V)) - return inst->getParent()->getParent(); - - if (const Argument *arg = dyn_cast<Argument>(V)) - return arg->getParent(); - - return nullptr; -} - -static bool notDifferentParent(const Value *O1, const Value *O2) { - - const Function *F1 = getParent(O1); - const Function *F2 = getParent(O2); - - return !F1 || !F2 || F1 == F2; -} -#endif - -namespace { - /// BasicAliasAnalysis - This is the primary alias analysis implementation. - struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis { - static char ID; // Class identification, replacement for typeinfo - BasicAliasAnalysis() : ImmutablePass(ID) { - initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - bool doInitialization(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - } - - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - assert(AliasCache.empty() && "AliasCache must be cleared after use!"); - assert(notDifferentParent(LocA.Ptr, LocB.Ptr) && - "BasicAliasAnalysis doesn't support interprocedural queries."); - AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags, - LocB.Ptr, LocB.Size, LocB.AATags); - // AliasCache rarely has more than 1 or 2 elements, always use - // shrink_and_clear so it quickly returns to the inline capacity of the - // SmallDenseMap if it ever grows larger. - // FIXME: This should really be shrink_to_inline_capacity_and_clear(). - AliasCache.shrink_and_clear(); - VisitedPhiBBs.clear(); - return Alias; - } - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; - - /// pointsToConstantMemory - Chase pointers until we find a (constant - /// global) or not. - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override; - - /// Get the location associated with a pointer argument of a callsite. - ModRefResult getArgModRefInfo(ImmutableCallSite CS, - unsigned ArgIdx) override; - - /// getModRefBehavior - Return the behavior when calling the given - /// call site. - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - - /// getModRefBehavior - Return the behavior when calling the given function. - /// For use when the call site is not known. - ModRefBehavior getModRefBehavior(const Function *F) override; - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - private: - // AliasCache - Track alias queries to guard against recursion. 
- typedef std::pair<MemoryLocation, MemoryLocation> LocPair; - typedef SmallDenseMap<LocPair, AliasResult, 8> AliasCacheTy; - AliasCacheTy AliasCache; - - /// \brief Track phi nodes we have visited. When interpret "Value" pointer - /// equality as value equality we need to make sure that the "Value" is not - /// part of a cycle. Otherwise, two uses could come from different - /// "iterations" of a cycle and see different values for the same "Value" - /// pointer. - /// The following example shows the problem: - /// %p = phi(%alloca1, %addr2) - /// %l = load %ptr - /// %addr1 = gep, %alloca2, 0, %l - /// %addr2 = gep %alloca2, 0, (%l + 1) - /// alias(%p, %addr1) -> MayAlias ! - /// store %l, ... - SmallPtrSet<const BasicBlock*, 8> VisitedPhiBBs; - - // Visited - Track instructions visited by pointsToConstantMemory. - SmallPtrSet<const Value*, 16> Visited; - - /// \brief Check whether two Values can be considered equivalent. - /// - /// In addition to pointer equivalence of \p V1 and \p V2 this checks - /// whether they can not be part of a cycle in the value graph by looking at - /// all visited phi nodes an making sure that the phis cannot reach the - /// value. We have to do this because we are looking through phi nodes (That - /// is we say noalias(V, phi(VA, VB)) if noalias(V, VA) and noalias(V, VB). - bool isValueEqualInPotentialCycles(const Value *V1, const Value *V2); - - /// \brief Dest and Src are the variable indices from two decomposed - /// GetElementPtr instructions GEP1 and GEP2 which have common base - /// pointers. Subtract the GEP2 indices from GEP1 to find the symbolic - /// difference between the two pointers. - void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest, - const SmallVectorImpl<VariableGEPIndex> &Src); - - // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP - // instruction against another. - AliasResult aliasGEP(const GEPOperator *V1, uint64_t V1Size, - const AAMDNodes &V1AAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo, - const Value *UnderlyingV1, const Value *UnderlyingV2); - - // aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI - // instruction against another. - AliasResult aliasPHI(const PHINode *PN, uint64_t PNSize, - const AAMDNodes &PNAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo); - - /// aliasSelect - Disambiguate a Select instruction against another value. - AliasResult aliasSelect(const SelectInst *SI, uint64_t SISize, - const AAMDNodes &SIAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo); - - AliasResult aliasCheck(const Value *V1, uint64_t V1Size, - AAMDNodes V1AATag, - const Value *V2, uint64_t V2Size, - AAMDNodes V2AATag); - }; -} // End of anonymous namespace - -// Register this pass... -char BasicAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS_BEGIN(BasicAliasAnalysis, AliasAnalysis, "basicaa", - "Basic Alias Analysis (stateless AA impl)", - false, true, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_PASS_END(BasicAliasAnalysis, AliasAnalysis, "basicaa", - "Basic Alias Analysis (stateless AA impl)", - false, true, false) - - -ImmutablePass *llvm::createBasicAliasAnalysisPass() { - return new BasicAliasAnalysis(); -} - -/// pointsToConstantMemory - Returns whether the given pointer value -/// points to memory that is local to the function, with global constants being -/// considered local to all functions. 
-bool BasicAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { +/// Returns whether the given pointer value points to memory that is local to +/// the function, with global constants being considered local to all +/// functions. +bool BasicAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { assert(Visited.empty() && "Visited must be cleared after use!"); unsigned MaxLookup = 8; SmallVector<const Value *, 16> Worklist; Worklist.push_back(Loc.Ptr); do { - const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), *DL); + const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), DL); if (!Visited.insert(V).second) { Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } // An alloca instruction defines local memory. @@ -608,7 +495,7 @@ bool BasicAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, // others. GV may even be a declaration, not a definition. if (!GV->isConstant()) { Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } continue; } @@ -626,7 +513,7 @@ bool BasicAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, // Don't bother inspecting phi nodes with many operands. if (PN->getNumIncomingValues() > MaxLookup) { Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } for (Value *IncValue : PN->incoming_values()) Worklist.push_back(IncValue); continue; } // Otherwise be conservative. Visited.clear(); - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } while (!Worklist.empty() && --MaxLookup); @@ -656,66 +543,56 @@ static bool isMemsetPattern16(const Function *MS, isa<IntegerType>(MemsetType->getParamType(2))) return true; } - return false; } -/// getModRefBehavior - Return the behavior when calling the given call site. -AliasAnalysis::ModRefBehavior -BasicAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { +/// Returns the behavior when calling the given call site. +FunctionModRefBehavior BasicAAResult::getModRefBehavior(ImmutableCallSite CS) { if (CS.doesNotAccessMemory()) // Can't do better than this. - return DoesNotAccessMemory; + return FMRB_DoesNotAccessMemory; - ModRefBehavior Min = UnknownModRefBehavior; + FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If the callsite knows it only reads memory, don't return worse // than that. if (CS.onlyReadsMemory()) - Min = OnlyReadsMemory; + Min = FMRB_OnlyReadsMemory; if (CS.onlyAccessesArgMemory()) - Min = ModRefBehavior(Min & OnlyAccessesArgumentPointees); + Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees); - // The AliasAnalysis base class has some smarts, lets use them. - return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min); + // The AAResultBase base class has some smarts, let's use them. + return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min); } -/// getModRefBehavior - Return the behavior when calling the given function. -/// For use when the call site is not known. -AliasAnalysis::ModRefBehavior -BasicAliasAnalysis::getModRefBehavior(const Function *F) { +/// Returns the behavior when calling the given function. For use when the call +/// site is not known.
+FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) { // If the function declares it doesn't access memory, we can't do better. if (F->doesNotAccessMemory()) - return DoesNotAccessMemory; - - // For intrinsics, we can check the table. - if (Intrinsic::ID iid = F->getIntrinsicID()) { -#define GET_INTRINSIC_MODREF_BEHAVIOR -#include "llvm/IR/Intrinsics.gen" -#undef GET_INTRINSIC_MODREF_BEHAVIOR - } + return FMRB_DoesNotAccessMemory; - ModRefBehavior Min = UnknownModRefBehavior; + FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If the function declares it only reads memory, go with that. if (F->onlyReadsMemory()) - Min = OnlyReadsMemory; + Min = FMRB_OnlyReadsMemory; if (F->onlyAccessesArgMemory()) - Min = ModRefBehavior(Min & OnlyAccessesArgumentPointees); - - const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - if (isMemsetPattern16(F, TLI)) - Min = OnlyAccessesArgumentPointees; + Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees); // Otherwise be conservative. - return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min); + return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min); } -AliasAnalysis::ModRefResult -BasicAliasAnalysis::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { +/// Returns true if this is a writeonly (i.e., Mod-only) parameter. Currently, +/// we don't have a writeonly attribute, so this only knows about builtin +/// intrinsics and target library functions. We could consider adding a +/// writeonly attribute in the future and moving all of these facts to either +/// Intrinsics.td or InferFunctionAttr.cpp. +static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx, + const TargetLibraryInfo &TLI) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) switch (II->getIntrinsicID()) { default: @@ -723,50 +600,106 @@ BasicAliasAnalysis::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { case Intrinsic::memset: case Intrinsic::memcpy: case Intrinsic::memmove: - assert((ArgIdx == 0 || ArgIdx == 1) && - "Invalid argument index for memory intrinsic"); - return ArgIdx ? Ref : Mod; + // We don't currently have a writeonly attribute. All other properties + // of these intrinsics are nicely described via attributes in + // Intrinsics.td and handled generically. + if (ArgIdx == 0) + return true; } // We can bound the aliasing properties of memset_pattern16 just as we can // for memcpy/memset. This is particularly important because the // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16 - // whenever possible. - if (CS.getCalledFunction() && - isMemsetPattern16(CS.getCalledFunction(), *TLI)) { - assert((ArgIdx == 0 || ArgIdx == 1) && - "Invalid argument index for memset_pattern16"); - return ArgIdx ? Ref : Mod; - } - // FIXME: Handle memset_pattern4 and memset_pattern8 also. + // whenever possible. Note that all but the missing writeonly attribute are + // handled via InferFunctionAttr. + if (CS.getCalledFunction() && isMemsetPattern16(CS.getCalledFunction(), TLI)) + if (ArgIdx == 0) + return true; + + // TODO: memset_pattern4, memset_pattern8 + // TODO: _chk variants + // TODO: strcmp, strcpy + + return false; +} + +ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS, + unsigned ArgIdx) { + + // Emulate the missing writeonly attribute by checking for known builtin + // intrinsics and target library functions.
+ if (isWriteOnlyParam(CS, ArgIdx, TLI)) + return MRI_Mod; - return AliasAnalysis::getArgModRefInfo(CS, ArgIdx); + if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadOnly)) + return MRI_Ref; + + if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadNone)) + return MRI_NoModRef; + + return AAResultBase::getArgModRefInfo(CS, ArgIdx); } static bool isAssumeIntrinsic(ImmutableCallSite CS) { const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction()); - if (II && II->getIntrinsicID() == Intrinsic::assume) - return true; + return II && II->getIntrinsicID() == Intrinsic::assume; +} - return false; +#ifndef NDEBUG +static const Function *getParent(const Value *V) { + if (const Instruction *inst = dyn_cast<Instruction>(V)) + return inst->getParent()->getParent(); + + if (const Argument *arg = dyn_cast<Argument>(V)) + return arg->getParent(); + + return nullptr; } -bool BasicAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; +static bool notDifferentParent(const Value *O1, const Value *O2) { + + const Function *F1 = getParent(O1); + const Function *F2 = getParent(O2); + + return !F1 || !F2 || F1 == F2; +} +#endif + +AliasResult BasicAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + assert(notDifferentParent(LocA.Ptr, LocB.Ptr) && + "BasicAliasAnalysis doesn't support interprocedural queries."); + + // If we have a directly cached entry for these locations, we have recursed + // through this once, so just return the cached results. Notably, when this + // happens, we don't clear the cache. + auto CacheIt = AliasCache.find(LocPair(LocA, LocB)); + if (CacheIt != AliasCache.end()) + return CacheIt->second; + + AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags, LocB.Ptr, + LocB.Size, LocB.AATags); + // AliasCache rarely has more than 1 or 2 elements, always use + // shrink_and_clear so it quickly returns to the inline capacity of the + // SmallDenseMap if it ever grows larger. + // FIXME: This should really be shrink_to_inline_capacity_and_clear(). + AliasCache.shrink_and_clear(); + VisitedPhiBBs.clear(); + return Alias; } -/// getModRefInfo - Check to see if the specified callsite can clobber the -/// specified memory object. Since we only look at local properties of this -/// function, we really can't say much about this query. We do, however, use -/// simple "address taken" analysis on local objects. -AliasAnalysis::ModRefResult -BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { +/// Checks to see if the specified callsite can clobber the specified memory +/// object. +/// +/// Since we only look at local properties of this function, we really can't +/// say much about this query. We do, however, use simple "address taken" +/// analysis on local objects. +ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { assert(notDifferentParent(CS.getInstruction(), Loc.Ptr) && "AliasAnalysis query involving multiple functions!"); - const Value *Object = GetUnderlyingObject(Loc.Ptr, *DL); + const Value *Object = GetUnderlyingObject(Loc.Ptr, DL); // If this is a tail call and Loc.Ptr points to a stack location, we know that // the tail call cannot access or modify the local stack. 
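The ladder above is order-sensitive: the emulated writeonly facts are consulted before the ReadOnly/ReadNone parameter attributes, and MRI_ModRef stays the conservative fallback. What follows is a minimal standalone sketch of that decision order; the types and names are hypothetical stand-ins, not the LLVM API.

  #include <cassert>

  enum ToyModRef { TMR_NoModRef, TMR_Ref, TMR_Mod, TMR_ModRef };

  struct ToyArg {
    bool KnownWriteOnly; // e.g. the dest operand of memcpy or memset_pattern16
    bool ReadOnlyAttr;   // stands in for Attribute::ReadOnly on the parameter
    bool ReadNoneAttr;   // stands in for Attribute::ReadNone on the parameter
  };

  // Builtin knowledge first, then attributes, then the conservative answer.
  ToyModRef classifyArg(const ToyArg &A) {
    if (A.KnownWriteOnly)
      return TMR_Mod;
    if (A.ReadOnlyAttr)
      return TMR_Ref;
    if (A.ReadNoneAttr)
      return TMR_NoModRef;
    return TMR_ModRef;
  }

  int main() {
    assert(classifyArg({true, false, false}) == TMR_Mod); // memcpy dest
    assert(classifyArg({false, true, false}) == TMR_Ref); // memcpy src
    assert(classifyArg({false, false, false}) == TMR_ModRef);
    return 0;
  }

Under these assumptions the dest argument of a memcpy-like call comes back Mod-only and its source Ref-only, which is exactly what the writeonly emulation stands in for.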
@@ -776,7 +709,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, if (isa<AllocaInst>(Object)) if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) if (CI->isTailCall()) - return NoModRef; + return MRI_NoModRef; // If the pointer is to a locally allocated object that does not escape, // then the call can not mod/ref the pointer unless the call takes the pointer @@ -798,41 +731,42 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, // is impossible to alias the pointer we're checking. If not, we have to // assume that the call could touch the pointer, even though it doesn't // escape. - if (!isNoAlias(MemoryLocation(*CI), MemoryLocation(Object))) { + AliasResult AR = + getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object)); + if (AR) { PassedAsArg = true; break; } } if (!PassedAsArg) - return NoModRef; + return MRI_NoModRef; } // While the assume intrinsic is marked as arbitrarily writing so that // proper control dependencies will be maintained, it never aliases any // particular memory location. if (isAssumeIntrinsic(CS)) - return NoModRef; + return MRI_NoModRef; - // The AliasAnalysis base class has some smarts, lets use them. - return AliasAnalysis::getModRefInfo(CS, Loc); + // The AAResultBase base class has some smarts, let's use them. + return AAResultBase::getModRefInfo(CS, Loc); } -AliasAnalysis::ModRefResult -BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { +ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { // While the assume intrinsic is marked as arbitrarily writing so that // proper control dependencies will be maintained, it never aliases any // particular memory location. if (isAssumeIntrinsic(CS1) || isAssumeIntrinsic(CS2)) - return NoModRef; + return MRI_NoModRef; - // The AliasAnalysis base class has some smarts, lets use them. - return AliasAnalysis::getModRefInfo(CS1, CS2); + // The AAResultBase base class has some smarts, let's use them. + return AAResultBase::getModRefInfo(CS1, CS2); } -/// \brief Provide ad-hoc rules to disambiguate accesses through two GEP -/// operators, both having the exact same pointer operand. +/// Provide ad-hoc rules to disambiguate accesses through two GEP operators, +/// both having the exact same pointer operand. static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, uint64_t V1Size, const GEPOperator *GEP2, @@ -860,10 +794,9 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, ConstantInt *C2 = dyn_cast<ConstantInt>(GEP2->getOperand(GEP2->getNumOperands() - 1)); - // If the last (struct) indices aren't constants, we can't say anything. - // If they're identical, the other indices might be also be dynamically - // equal, so the GEPs can alias. - if (!C1 || !C2 || C1 == C2) + // If the last (struct) indices are constants and are equal, the other indices + // might also be dynamically equal, so the GEPs can alias.
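+ // A hypothetical instance: "%a = getelementptr %T, %T* %p, i64 %i, i32 1" and + // "%b = getelementptr %T, %T* %p, i64 %j, i32 1" end in the same constant + // index, so the two addresses coincide whenever %i == %j and MayAlias is the + // only safe answer.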
+ if (C1 && C2 && C1 == C2) return MayAlias; // Find the last-indexed type of the GEP, i.e., the type you'd get if @@ -886,12 +819,49 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, IntermediateIndices.push_back(GEP1->getOperand(i + 1)); } - StructType *LastIndexedStruct = - dyn_cast<StructType>(GetElementPtrInst::getIndexedType( - GEP1->getSourceElementType(), IntermediateIndices)); + auto *Ty = GetElementPtrInst::getIndexedType( + GEP1->getSourceElementType(), IntermediateIndices); + StructType *LastIndexedStruct = dyn_cast<StructType>(Ty); + + if (isa<SequentialType>(Ty)) { + // We know that: + // - both GEPs begin indexing from the exact same pointer; + // - the last indices in both GEPs are constants, indexing into a sequential + // type (array or pointer); + // - both GEPs only index through arrays prior to that. + // + // Because array indices greater than the number of elements are valid in + // GEPs, unless we know the intermediate indices are identical between + // GEP1 and GEP2 we cannot guarantee that the last indexed arrays don't + // partially overlap. We also need to check that the loaded size matches + // the element size, otherwise we could still have overlap. + const uint64_t ElementSize = + DL.getTypeStoreSize(cast<SequentialType>(Ty)->getElementType()); + if (V1Size != ElementSize || V2Size != ElementSize) + return MayAlias; + + for (unsigned i = 0, e = GEP1->getNumIndices() - 1; i != e; ++i) + if (GEP1->getOperand(i + 1) != GEP2->getOperand(i + 1)) + return MayAlias; - if (!LastIndexedStruct) + // Now we know that the array/pointer that GEP1 indexes into and that + // that GEP2 indexes into must either precisely overlap or be disjoint. + // Because they cannot partially overlap and because fields in an array + // cannot overlap, if we can prove the final indices are different between + // GEP1 and GEP2, we can conclude GEP1 and GEP2 don't alias. + + // If the last indices are constants, we've already checked they don't + // equal each other so we can exit early. + if (C1 && C2) + return NoAlias; + if (isKnownNonEqual(GEP1->getOperand(GEP1->getNumOperands() - 1), + GEP2->getOperand(GEP2->getNumOperands() - 1), + DL)) + return NoAlias; return MayAlias; + } else if (!LastIndexedStruct || !C1 || !C2) { + return MayAlias; + } // We know that: // - both GEPs begin indexing from the exact same pointer; @@ -925,39 +895,21 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, return MayAlias; } -/// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction -/// against another pointer. We know that V1 is a GEP, but we don't know -/// anything about V2. UnderlyingV1 is GetUnderlyingObject(GEP1, DL), -/// UnderlyingV2 is the same for V2. +/// Provides a bunch of ad-hoc rules to disambiguate a GEP instruction against +/// another pointer. /// -AliasResult BasicAliasAnalysis::aliasGEP( - const GEPOperator *GEP1, uint64_t V1Size, const AAMDNodes &V1AAInfo, - const Value *V2, uint64_t V2Size, const AAMDNodes &V2AAInfo, - const Value *UnderlyingV1, const Value *UnderlyingV2) { +/// We know that V1 is a GEP, but we don't know anything about V2. +/// UnderlyingV1 is GetUnderlyingObject(GEP1, DL), UnderlyingV2 is the same for +/// V2. 
+AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, + const AAMDNodes &V1AAInfo, const Value *V2, + uint64_t V2Size, const AAMDNodes &V2AAInfo, + const Value *UnderlyingV1, + const Value *UnderlyingV2) { int64_t GEP1BaseOffset; bool GEP1MaxLookupReached; SmallVector<VariableGEPIndex, 4> GEP1VariableIndices; - // We have to get two AssumptionCaches here because GEP1 and V2 may be from - // different functions. - // FIXME: This really doesn't make any sense. We get a dominator tree below - // that can only refer to a single function. But this function (aliasGEP) is - // a method on an immutable pass that can be called when there *isn't* - // a single function. The old pass management layer makes this "work", but - // this isn't really a clean solution. - AssumptionCacheTracker &ACT = getAnalysis<AssumptionCacheTracker>(); - AssumptionCache *AC1 = nullptr, *AC2 = nullptr; - if (auto *GEP1I = dyn_cast<Instruction>(GEP1)) - AC1 = &ACT.getAssumptionCache( - const_cast<Function &>(*GEP1I->getParent()->getParent())); - if (auto *I2 = dyn_cast<Instruction>(V2)) - AC2 = &ACT.getAssumptionCache( - const_cast<Function &>(*I2->getParent()->getParent())); - - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - // If we have two gep instructions with must-alias or not-alias'ing base // pointers, figure out if the indexes to the GEP tell us anything about the // derived pointer. @@ -971,9 +923,8 @@ AliasResult BasicAliasAnalysis::aliasGEP( // identical. if ((BaseAlias == MayAlias) && V1Size == V2Size) { // Do the base pointers alias assuming type and size. - AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size, - V1AAInfo, UnderlyingV2, - V2Size, V2AAInfo); + AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size, V1AAInfo, + UnderlyingV2, V2Size, V2AAInfo); if (PreciseBaseAlias == NoAlias) { // See if the computed offset from the common pointer tells us about the // relation of the resulting pointer. @@ -982,15 +933,15 @@ AliasResult BasicAliasAnalysis::aliasGEP( SmallVector<VariableGEPIndex, 4> GEP2VariableIndices; const Value *GEP2BasePtr = DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, - GEP2MaxLookupReached, *DL, AC2, DT); + GEP2MaxLookupReached, DL, &AC, DT); const Value *GEP1BasePtr = DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, - GEP1MaxLookupReached, *DL, AC1, DT); + GEP1MaxLookupReached, DL, &AC, DT); // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. + // FIXME: They always have a DataLayout so this should become an + // assert. if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) { - assert(!DL && - "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } // If the max search depth is reached the result is undefined @@ -1007,35 +958,35 @@ AliasResult BasicAliasAnalysis::aliasGEP( // If we get a No or May, then return it immediately, no amount of analysis // will improve this situation. - if (BaseAlias != MustAlias) return BaseAlias; + if (BaseAlias != MustAlias) + return BaseAlias; // Otherwise, we have a MustAlias. Since the base pointers alias each other // exactly, see if the computed offset from the common pointer tells us // about the relation of the resulting pointer. 
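// An illustrative case with assumed sizes: once the bases MustAlias, a 4-byte access at offset 8 and an 8-byte access at offset 0 touch the disjoint ranges [8, 12) and [0, 8), so constant offsets alone can already prove NoAlias.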
const Value *GEP1BasePtr = DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, - GEP1MaxLookupReached, *DL, AC1, DT); + GEP1MaxLookupReached, DL, &AC, DT); int64_t GEP2BaseOffset; bool GEP2MaxLookupReached; SmallVector<VariableGEPIndex, 4> GEP2VariableIndices; const Value *GEP2BasePtr = DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, - GEP2MaxLookupReached, *DL, AC2, DT); + GEP2MaxLookupReached, DL, &AC, DT); // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. + // FIXME: They always have a DataLayout so this should become an assert. if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) { - assert(!DL && - "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } // If we know the two GEPs are based off of the exact same pointer (and not // just the same underlying object), see if that tells us anything about // the resulting pointers. - if (DL && GEP1->getPointerOperand() == GEP2->getPointerOperand()) { - AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, *DL); + if (GEP1->getPointerOperand() == GEP2->getPointerOperand()) { + AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL); // If we couldn't find anything interesting, don't abandon just yet. if (R != MayAlias) return R; @@ -1072,13 +1023,12 @@ AliasResult BasicAliasAnalysis::aliasGEP( const Value *GEP1BasePtr = DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, - GEP1MaxLookupReached, *DL, AC1, DT); + GEP1MaxLookupReached, DL, &AC, DT); // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. + // FIXME: They always have a DataLayout so this should become an assert. if (GEP1BasePtr != UnderlyingV1) { - assert(!DL && - "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } // If the max search depth is reached the result is undefined @@ -1124,12 +1074,42 @@ AliasResult BasicAliasAnalysis::aliasGEP( } } - // Try to distinguish something like &A[i][1] against &A[42][0]. - // Grab the least significant bit set in any of the scales. if (!GEP1VariableIndices.empty()) { uint64_t Modulo = 0; - for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) - Modulo |= (uint64_t) GEP1VariableIndices[i].Scale; + bool AllPositive = true; + for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) { + + // Try to distinguish something like &A[i][1] against &A[42][0]. + // Grab the least significant bit set in any of the scales. We + // don't need std::abs here (even if the scale's negative) as we'll + // be ^'ing Modulo with itself later. + Modulo |= (uint64_t)GEP1VariableIndices[i].Scale; + + if (AllPositive) { + // If the Value could change between cycles, then any reasoning about + // the Value this cycle may not hold in the next cycle. We'll just + // give up if we can't determine conditions that hold for every cycle: + const Value *V = GEP1VariableIndices[i].V; + + bool SignKnownZero, SignKnownOne; + ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL, + 0, &AC, nullptr, DT); + + // Zero-extension widens the variable, and so forces the sign + // bit to zero. 
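+ // E.g. (hypothetical): for "%z = zext i32 %x to i64" the top 32 bits of %z + // are zero, so its i64 sign bit is known zero and %z is non-negative.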
+ bool IsZExt = GEP1VariableIndices[i].ZExtBits > 0 || isa<ZExtInst>(V); + SignKnownZero |= IsZExt; + SignKnownOne &= !IsZExt; + + // If the variable begins with a zero then we know it's + // positive, regardless of whether the value is signed or + // unsigned. + int64_t Scale = GEP1VariableIndices[i].Scale; + AllPositive = + (SignKnownZero && Scale >= 0) || (SignKnownOne && Scale < 0); + } + } + Modulo = Modulo ^ (Modulo & (Modulo - 1)); // We can compute the difference between the two addresses @@ -1140,6 +1120,16 @@ AliasResult BasicAliasAnalysis::aliasGEP( V2Size != MemoryLocation::UnknownSize && ModOffset >= V2Size && V1Size <= Modulo - ModOffset) return NoAlias; + + // If we know all the variables are positive, then GEP1 >= GEP1BasePtr. + // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers + // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr. + if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t)GEP1BaseOffset) + return NoAlias; + + if (constantOffsetHeuristic(GEP1VariableIndices, V1Size, V2Size, + GEP1BaseOffset, &AC, DT)) + return NoAlias; } // Statically, we can see that the base objects are the same, but the @@ -1164,46 +1154,44 @@ static AliasResult MergeAliasResults(AliasResult A, AliasResult B) { return MayAlias; } -/// aliasSelect - Provide a bunch of ad-hoc rules to disambiguate a Select -/// instruction against another. -AliasResult BasicAliasAnalysis::aliasSelect(const SelectInst *SI, - uint64_t SISize, - const AAMDNodes &SIAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo) { +/// Provides a bunch of ad-hoc rules to disambiguate a Select instruction +/// against another. +AliasResult BasicAAResult::aliasSelect(const SelectInst *SI, uint64_t SISize, + const AAMDNodes &SIAAInfo, + const Value *V2, uint64_t V2Size, + const AAMDNodes &V2AAInfo) { // If the values are Selects with the same condition, we can do a more precise // check: just check for aliases between the values on corresponding arms. if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2)) if (SI->getCondition() == SI2->getCondition()) { - AliasResult Alias = - aliasCheck(SI->getTrueValue(), SISize, SIAAInfo, - SI2->getTrueValue(), V2Size, V2AAInfo); + AliasResult Alias = aliasCheck(SI->getTrueValue(), SISize, SIAAInfo, + SI2->getTrueValue(), V2Size, V2AAInfo); if (Alias == MayAlias) return MayAlias; AliasResult ThisAlias = - aliasCheck(SI->getFalseValue(), SISize, SIAAInfo, - SI2->getFalseValue(), V2Size, V2AAInfo); + aliasCheck(SI->getFalseValue(), SISize, SIAAInfo, + SI2->getFalseValue(), V2Size, V2AAInfo); return MergeAliasResults(ThisAlias, Alias); } // If both arms of the Select node NoAlias or MustAlias V2, then returns // NoAlias / MustAlias. Otherwise, returns MayAlias. AliasResult Alias = - aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(), SISize, SIAAInfo); + aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(), SISize, SIAAInfo); if (Alias == MayAlias) return MayAlias; AliasResult ThisAlias = - aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(), SISize, SIAAInfo); + aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(), SISize, SIAAInfo); return MergeAliasResults(ThisAlias, Alias); } -// aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI instruction -// against another. 
-AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, - const AAMDNodes &PNAAInfo, - const Value *V2, uint64_t V2Size, - const AAMDNodes &V2AAInfo) { +/// Provide a bunch of ad-hoc rules to disambiguate a PHI instruction against +/// another. +AliasResult BasicAAResult::aliasPHI(const PHINode *PN, uint64_t PNSize, + const AAMDNodes &PNAAInfo, const Value *V2, + uint64_t V2Size, + const AAMDNodes &V2AAInfo) { // Track phi nodes we have visited. We use this information when we determine // value equivalence. VisitedPhiBBs.insert(PN->getParent()); @@ -1232,9 +1220,9 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { AliasResult ThisAlias = - aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo, - PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)), - V2Size, V2AAInfo); + aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo, + PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)), + V2Size, V2AAInfo); Alias = MergeAliasResults(ThisAlias, Alias); if (Alias == MayAlias) break; @@ -1247,8 +1235,9 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, return Alias; } - SmallPtrSet<Value*, 4> UniqueSrc; - SmallVector<Value*, 4> V1Srcs; + SmallPtrSet<Value *, 4> UniqueSrc; + SmallVector<Value *, 4> V1Srcs; + bool isRecursive = false; for (Value *PV1 : PN->incoming_values()) { if (isa<PHINode>(PV1)) // If any of the source itself is a PHI, return MayAlias conservatively @@ -1256,12 +1245,33 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, // sides are PHI nodes. In which case, this is O(m x n) time where 'm' // and 'n' are the number of PHI sources. return MayAlias; + + if (EnableRecPhiAnalysis) + if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) { + // Check whether the incoming value is a GEP that advances the pointer + // result of this PHI node (e.g. in a loop). If this is the case, we + // would recurse and always get a MayAlias. Handle this case specially + // below. + if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && + isa<ConstantInt>(PV1GEP->idx_begin())) { + isRecursive = true; + continue; + } + } + if (UniqueSrc.insert(PV1).second) V1Srcs.push_back(PV1); } - AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, - V1Srcs[0], PNSize, PNAAInfo); + // If this PHI node is recursive, set the size of the accessed memory to + // unknown to represent all the possible values the GEP could advance the + // pointer to. + if (isRecursive) + PNSize = MemoryLocation::UnknownSize; + + AliasResult Alias = + aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0], PNSize, PNAAInfo); + // Early exit if the check of the first PHI source against V2 is MayAlias. // Other results are not possible. if (Alias == MayAlias) @@ -1272,8 +1282,8 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) { Value *V = V1Srcs[i]; - AliasResult ThisAlias = aliasCheck(V2, V2Size, V2AAInfo, - V, PNSize, PNAAInfo); + AliasResult ThisAlias = + aliasCheck(V2, V2Size, V2AAInfo, V, PNSize, PNAAInfo); Alias = MergeAliasResults(ThisAlias, Alias); if (Alias == MayAlias) break; @@ -1282,13 +1292,11 @@ AliasResult BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize, return Alias; } -// aliasCheck - Provide a bunch of ad-hoc rules to disambiguate in common cases, -// such as array references. 
-// -AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, - AAMDNodes V1AAInfo, const Value *V2, - uint64_t V2Size, - AAMDNodes V2AAInfo) { +/// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as +/// array references. +AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size, + AAMDNodes V1AAInfo, const Value *V2, + uint64_t V2Size, AAMDNodes V2AAInfo) { // If either of the memory references is empty, it doesn't matter what the // pointer values are. if (V1Size == 0 || V2Size == 0) @@ -1313,11 +1321,11 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, return MustAlias; if (!V1->getType()->isPointerTy() || !V2->getType()->isPointerTy()) - return NoAlias; // Scalars cannot alias each other + return NoAlias; // Scalars cannot alias each other // Figure out what objects these things are pointing to if we can. - const Value *O1 = GetUnderlyingObject(V1, *DL, MaxLookupSearchDepth); - const Value *O2 = GetUnderlyingObject(V2, *DL, MaxLookupSearchDepth); + const Value *O1 = GetUnderlyingObject(V1, DL, MaxLookupSearchDepth); + const Value *O2 = GetUnderlyingObject(V2, DL, MaxLookupSearchDepth); // Null values in the default address space don't point to any object, so they // don't alias any other pointer. @@ -1366,12 +1374,11 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, // If the size of one access is larger than the entire object on the other // side, then we know such behavior is undefined and can assume no alias. - if (DL) - if ((V1Size != MemoryLocation::UnknownSize && - isObjectSmallerThan(O2, V1Size, *DL, *TLI)) || - (V2Size != MemoryLocation::UnknownSize && - isObjectSmallerThan(O1, V2Size, *DL, *TLI))) - return NoAlias; + if ((V1Size != MemoryLocation::UnknownSize && + isObjectSmallerThan(O2, V1Size, DL, TLI)) || + (V2Size != MemoryLocation::UnknownSize && + isObjectSmallerThan(O1, V2Size, DL, TLI))) + return NoAlias; // Check the cache before climbing up use-def chains. This also terminates // otherwise infinitely recursive queries. 
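The hunk below shows the other half of that scheme: the query seeds AliasCache with a conservative MayAlias entry before descending, so a cyclic re-query observes the seed instead of recursing forever. A self-contained sketch of this memoize-with-conservative-seed pattern, with hypothetical toy types standing in for the LLVM containers:

  #include <algorithm>
  #include <cassert>
  #include <map>
  #include <string>
  #include <utility>

  enum ToyAlias { TA_MayAlias, TA_NoAlias };

  using ToyKey = std::pair<std::string, std::string>;
  std::map<ToyKey, ToyAlias> ToyCache;

  ToyAlias toyAliasCheck(std::string A, std::string B) {
    if (A > B)
      std::swap(A, B); // canonicalize the pair, as aliasCheck swaps Locs
    // Seed the cache with the conservative answer before descending; a
    // re-entrant query on the same pair returns the seed instead of looping.
    auto Ins = ToyCache.insert({{A, B}, TA_MayAlias});
    if (!Ins.second)
      return Ins.first->second;
    ToyAlias R = TA_MayAlias; // the real code climbs use-def chains here
    return ToyCache[{A, B}] = R;
  }

  int main() {
    assert(toyAliasCheck("p1", "p2") == TA_MayAlias);
    return 0;
  }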
@@ -1380,7 +1387,7 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, if (V1 > V2) std::swap(Locs.first, Locs.second); std::pair<AliasCacheTy::iterator, bool> Pair = - AliasCache.insert(std::make_pair(Locs, MayAlias)); + AliasCache.insert(std::make_pair(Locs, MayAlias)); if (!Pair.second) return Pair.first->second; @@ -1393,8 +1400,10 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, std::swap(V1AAInfo, V2AAInfo); } if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) { - AliasResult Result = aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2); - if (Result != MayAlias) return AliasCache[Locs] = Result; + AliasResult Result = + aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2); + if (Result != MayAlias) + return AliasCache[Locs] = Result; } if (isa<PHINode>(V2) && !isa<PHINode>(V1)) { @@ -1403,9 +1412,9 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, std::swap(V1AAInfo, V2AAInfo); } if (const PHINode *PN = dyn_cast<PHINode>(V1)) { - AliasResult Result = aliasPHI(PN, V1Size, V1AAInfo, - V2, V2Size, V2AAInfo); - if (Result != MayAlias) return AliasCache[Locs] = Result; + AliasResult Result = aliasPHI(PN, V1Size, V1AAInfo, V2, V2Size, V2AAInfo); + if (Result != MayAlias) + return AliasCache[Locs] = Result; } if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) { @@ -1414,29 +1423,38 @@ AliasResult BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size, std::swap(V1AAInfo, V2AAInfo); } if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) { - AliasResult Result = aliasSelect(S1, V1Size, V1AAInfo, - V2, V2Size, V2AAInfo); - if (Result != MayAlias) return AliasCache[Locs] = Result; + AliasResult Result = + aliasSelect(S1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo); + if (Result != MayAlias) + return AliasCache[Locs] = Result; } // If both pointers are pointing into the same object and one of the // accesses is accessing the entire object, then the accesses must // overlap in some way. - if (DL && O1 == O2) + if (O1 == O2) if ((V1Size != MemoryLocation::UnknownSize && - isObjectSize(O1, V1Size, *DL, *TLI)) || + isObjectSize(O1, V1Size, DL, TLI)) || (V2Size != MemoryLocation::UnknownSize && - isObjectSize(O2, V2Size, *DL, *TLI))) + isObjectSize(O2, V2Size, DL, TLI))) return AliasCache[Locs] = PartialAlias; - AliasResult Result = - AliasAnalysis::alias(MemoryLocation(V1, V1Size, V1AAInfo), - MemoryLocation(V2, V2Size, V2AAInfo)); + // Recurse back into the best AA results we have, potentially with refined + // memory locations. We have already ensured that BasicAA has a MayAlias + // cache result for these, so any recursion back into BasicAA won't loop. + AliasResult Result = getBestAAResults().alias(Locs.first, Locs.second); return AliasCache[Locs] = Result; } -bool BasicAliasAnalysis::isValueEqualInPotentialCycles(const Value *V, - const Value *V2) { +/// Check whether two Values can be considered equivalent. +/// +/// In addition to pointer equivalence of \p V1 and \p V2 this checks whether +/// they cannot be part of a cycle in the value graph by looking at all +/// visited phi nodes and making sure that the phis cannot reach the value. We +/// have to do this because we are looking through phi nodes (that is, we say +/// noalias(V, phi(VA, VB)) if noalias(V, VA) and noalias(V, VB)).
+bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V, + const Value *V2) { if (V != V2) return false; @@ -1450,28 +1468,21 @@ bool BasicAliasAnalysis::isValueEqualInPotentialCycles(const Value *V, if (VisitedPhiBBs.size() > MaxNumPhiBBsValueReachabilityCheck) return false; - // Use dominance or loop info if available. - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); - LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - // Make sure that the visited phis cannot reach the Value. This ensures that // the Values cannot come from different iterations of a potential cycle the // phi nodes could be involved in. for (auto *P : VisitedPhiBBs) - if (isPotentiallyReachable(P->begin(), Inst, DT, LI)) + if (isPotentiallyReachable(&P->front(), Inst, DT, LI)) return false; return true; } -/// GetIndexDifference - Dest and Src are the variable indices from two -/// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base -/// pointers. Subtract the GEP2 indices from GEP1 to find the symbolic -/// difference between the two pointers. -void BasicAliasAnalysis::GetIndexDifference( +/// Computes the symbolic difference between two decomposed GEPs. +/// +/// Dest and Src are the variable indices from two decomposed GetElementPtr +/// instructions GEP1 and GEP2 which have common base pointers. +void BasicAAResult::GetIndexDifference( SmallVectorImpl<VariableGEPIndex> &Dest, const SmallVectorImpl<VariableGEPIndex> &Src) { if (Src.empty()) @@ -1479,14 +1490,14 @@ void BasicAliasAnalysis::GetIndexDifference( for (unsigned i = 0, e = Src.size(); i != e; ++i) { const Value *V = Src[i].V; - ExtensionKind Extension = Src[i].Extension; + unsigned ZExtBits = Src[i].ZExtBits, SExtBits = Src[i].SExtBits; int64_t Scale = Src[i].Scale; // Find V in Dest. This is N^2, but pointer indices almost never have more // than a few variable indexes. for (unsigned j = 0, e = Dest.size(); j != e; ++j) { if (!isValueEqualInPotentialCycles(Dest[j].V, V) || - Dest[j].Extension != Extension) + Dest[j].ZExtBits != ZExtBits || Dest[j].SExtBits != SExtBits) continue; // If we found it, subtract off Scale V's from the entry in Dest. If it @@ -1501,8 +1512,120 @@ void BasicAliasAnalysis::GetIndexDifference( // If we didn't consume this entry, add it to the end of the Dest list. if (Scale) { - VariableGEPIndex Entry = { V, Extension, -Scale }; + VariableGEPIndex Entry = {V, ZExtBits, SExtBits, -Scale}; Dest.push_back(Entry); } } } + +bool BasicAAResult::constantOffsetHeuristic( + const SmallVectorImpl<VariableGEPIndex> &VarIndices, uint64_t V1Size, + uint64_t V2Size, int64_t BaseOffset, AssumptionCache *AC, + DominatorTree *DT) { + if (VarIndices.size() != 2 || V1Size == MemoryLocation::UnknownSize || + V2Size == MemoryLocation::UnknownSize) + return false; + + const VariableGEPIndex &Var0 = VarIndices[0], &Var1 = VarIndices[1]; + + if (Var0.ZExtBits != Var1.ZExtBits || Var0.SExtBits != Var1.SExtBits || + Var0.Scale != -Var1.Scale) + return false; + + unsigned Width = Var1.V->getType()->getIntegerBitWidth(); + + // We'll strip off the Extensions of Var0 and Var1 and do another round + // of GetLinearExpression decomposition. For example, if Var0 + // is zext(%x + 1) we should get V1 == %x and V1Offset == 1.
+ + APInt V0Scale(Width, 0), V0Offset(Width, 0), V1Scale(Width, 0), + V1Offset(Width, 0); + bool NSW = true, NUW = true; + unsigned V0ZExtBits = 0, V0SExtBits = 0, V1ZExtBits = 0, V1SExtBits = 0; + const Value *V0 = GetLinearExpression(Var0.V, V0Scale, V0Offset, V0ZExtBits, + V0SExtBits, DL, 0, AC, DT, NSW, NUW); + NSW = true, NUW = true; + const Value *V1 = GetLinearExpression(Var1.V, V1Scale, V1Offset, V1ZExtBits, + V1SExtBits, DL, 0, AC, DT, NSW, NUW); + + if (V0Scale != V1Scale || V0ZExtBits != V1ZExtBits || + V0SExtBits != V1SExtBits || !isValueEqualInPotentialCycles(V0, V1)) + return false; + + // We have a hit - Var0 and Var1 only differ by a constant offset! + + // If we've been sext'ed then zext'd the maximum difference between Var0 and + // Var1 is possible to calculate, but we're just interested in the absolute + // minimum difference between the two. The minimum distance may occur due to + // wrapping; consider "add i3 %i, 5": if %i == 7 then 7 + 5 mod 8 == 4, and so + // the minimum distance between %i and %i + 5 is 3. + APInt MinDiff = V0Offset - V1Offset, Wrapped = -MinDiff; + MinDiff = APIntOps::umin(MinDiff, Wrapped); + uint64_t MinDiffBytes = MinDiff.getZExtValue() * std::abs(Var0.Scale); + + // We can't definitely say whether GEP1 is before or after V2 due to wrapping + // arithmetic (i.e. for some values of GEP1 and V2 GEP1 < V2, and for other + // values GEP1 > V2). We'll therefore only declare NoAlias if both V1Size and + // V2Size can fit in the MinDiffBytes gap. + return V1Size + std::abs(BaseOffset) <= MinDiffBytes && + V2Size + std::abs(BaseOffset) <= MinDiffBytes; +} + +//===----------------------------------------------------------------------===// +// BasicAliasAnalysis Pass +//===----------------------------------------------------------------------===// + +char BasicAA::PassID; + +BasicAAResult BasicAA::run(Function &F, AnalysisManager<Function> *AM) { + return BasicAAResult(F.getParent()->getDataLayout(), + AM->getResult<TargetLibraryAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F), + AM->getCachedResult<DominatorTreeAnalysis>(F), + AM->getCachedResult<LoopAnalysis>(F)); +} + +BasicAAWrapperPass::BasicAAWrapperPass() : FunctionPass(ID) { + initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +char BasicAAWrapperPass::ID = 0; +void BasicAAWrapperPass::anchor() {} + +INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basicaa", + "Basic Alias Analysis (stateless AA impl)", true, true) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(BasicAAWrapperPass, "basicaa", + "Basic Alias Analysis (stateless AA impl)", true, true) + +FunctionPass *llvm::createBasicAAWrapperPass() { + return new BasicAAWrapperPass(); +} + +bool BasicAAWrapperPass::runOnFunction(Function &F) { + auto &ACT = getAnalysis<AssumptionCacheTracker>(); + auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>(); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); + + Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), TLIWP.getTLI(), + ACT.getAssumptionCache(F), + DTWP ? &DTWP->getDomTree() : nullptr, + LIWP ? 
&LIWP->getLoopInfo() : nullptr)); + + return false; +} + +void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); +} + +BasicAAResult llvm::createLegacyPMBasicAAResult(Pass &P, Function &F) { + return BasicAAResult( + F.getParent()->getDataLayout(), + P.getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + P.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); +} diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp index 3d819eb..90b7a33 100644 --- a/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp +++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp @@ -55,7 +55,7 @@ struct GraphTraits<BlockFrequencyInfo *> { typedef Function::const_iterator nodes_iterator; static inline const NodeType *getEntryNode(const BlockFrequencyInfo *G) { - return G->getFunction()->begin(); + return &G->getFunction()->front(); } static ChildIteratorType child_begin(const NodeType *N) { return succ_begin(N); @@ -105,51 +105,36 @@ struct DOTGraphTraits<BlockFrequencyInfo*> : public DefaultDOTGraphTraits { } // end namespace llvm #endif -INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", - "Block Frequency Analysis", true, true) -INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", - "Block Frequency Analysis", true, true) - -char BlockFrequencyInfo::ID = 0; - +BlockFrequencyInfo::BlockFrequencyInfo() {} -BlockFrequencyInfo::BlockFrequencyInfo() : FunctionPass(ID) { - initializeBlockFrequencyInfoPass(*PassRegistry::getPassRegistry()); -} - -BlockFrequencyInfo::~BlockFrequencyInfo() {} - -void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<BranchProbabilityInfo>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); +BlockFrequencyInfo::BlockFrequencyInfo(const Function &F, + const BranchProbabilityInfo &BPI, + const LoopInfo &LI) { + calculate(F, BPI, LI); } -bool BlockFrequencyInfo::runOnFunction(Function &F) { - BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); - LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +void BlockFrequencyInfo::calculate(const Function &F, + const BranchProbabilityInfo &BPI, + const LoopInfo &LI) { if (!BFI) BFI.reset(new ImplType); - BFI->doFunction(&F, &BPI, &LI); + BFI->calculate(F, BPI, LI); #ifndef NDEBUG if (ViewBlockFreqPropagationDAG != GVDT_None) view(); #endif - return false; -} - -void BlockFrequencyInfo::releaseMemory() { BFI.reset(); } - -void BlockFrequencyInfo::print(raw_ostream &O, const Module *) const { - if (BFI) BFI->print(O); } BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const { return BFI ? BFI->getBlockFreq(BB) : 0; } +void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, + uint64_t Freq) { + assert(BFI && "Expected analysis to be available"); + BFI->setBlockFreq(BB, Freq); +} + /// Pop up a ghostview window with the current block frequency propagation /// rendered using dot. void BlockFrequencyInfo::view() const { @@ -180,3 +165,49 @@ BlockFrequencyInfo::printBlockFreq(raw_ostream &OS, uint64_t BlockFrequencyInfo::getEntryFreq() const { return BFI ? 
BFI->getEntryFreq() : 0; } + +void BlockFrequencyInfo::releaseMemory() { BFI.reset(); } + +void BlockFrequencyInfo::print(raw_ostream &OS) const { + if (BFI) + BFI->print(OS); +} + + +INITIALIZE_PASS_BEGIN(BlockFrequencyInfoWrapperPass, "block-freq", + "Block Frequency Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(BlockFrequencyInfoWrapperPass, "block-freq", + "Block Frequency Analysis", true, true) + +char BlockFrequencyInfoWrapperPass::ID = 0; + + +BlockFrequencyInfoWrapperPass::BlockFrequencyInfoWrapperPass() + : FunctionPass(ID) { + initializeBlockFrequencyInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +BlockFrequencyInfoWrapperPass::~BlockFrequencyInfoWrapperPass() {} + +void BlockFrequencyInfoWrapperPass::print(raw_ostream &OS, + const Module *) const { + BFI.print(OS); +} + +void BlockFrequencyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BranchProbabilityInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); +} + +void BlockFrequencyInfoWrapperPass::releaseMemory() { BFI.releaseMemory(); } + +bool BlockFrequencyInfoWrapperPass::runOnFunction(Function &F) { + BranchProbabilityInfo &BPI = + getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + BFI.calculate(F, BPI, LI); + return false; +} diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp index 6ceda06..48e23af 100644 --- a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -530,6 +530,13 @@ BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { return Freqs[Node.Index].Scaled; } +void BlockFrequencyInfoImplBase::setBlockFreq(const BlockNode &Node, + uint64_t Freq) { + assert(Node.isValid() && "Expected valid node"); + assert(Node.Index < Freqs.size() && "Expected legal index"); + Freqs[Node.Index].Integer = Freq; +} + std::string BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { return std::string(); @@ -743,7 +750,10 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; DEBUG(dbgs() << " - Add back edge mass for node " << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n"); - Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + if (BackedgeMass.getMass() > 0) + Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + else + DEBUG(dbgs() << " Nothing added. 
Back edge mass is zero\n"); } DitheringDistributer D(Dist, LoopMass); diff --git a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp index 430b412..cf0cc8d 100644 --- a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -27,13 +27,13 @@ using namespace llvm; #define DEBUG_TYPE "branch-prob" -INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob", +INITIALIZE_PASS_BEGIN(BranchProbabilityInfoWrapperPass, "branch-prob", "Branch Probability Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(BranchProbabilityInfo, "branch-prob", +INITIALIZE_PASS_END(BranchProbabilityInfoWrapperPass, "branch-prob", "Branch Probability Analysis", false, true) -char BranchProbabilityInfo::ID = 0; +char BranchProbabilityInfoWrapperPass::ID = 0; // Weights are for internal use only. They are used by heuristics to help to // estimate edges' probability. Example: @@ -108,13 +108,6 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1; /// instruction. This is essentially never taken. static const uint32_t IH_NONTAKEN_WEIGHT = 1; -// Standard weight value. Used when none of the heuristics set weight for -// the edge. -static const uint32_t NORMAL_WEIGHT = 16; - -// Minimum weight of an edge. Please note, that weight is NEVER 0. -static const uint32_t MIN_WEIGHT = 1; - /// \brief Calculate edge weights for successors lead to unreachable. /// /// Predict that a successor which leads necessarily to an @@ -147,22 +140,34 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) { if (TI->getNumSuccessors() == 1 || UnreachableEdges.empty()) return false; - uint32_t UnreachableWeight = - std::max(UR_TAKEN_WEIGHT / (unsigned)UnreachableEdges.size(), MIN_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = UnreachableEdges.begin(), - E = UnreachableEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, UnreachableWeight); + // If the terminator is an InvokeInst, check only the normal destination block + // as the unwind edge of InvokeInst is also very unlikely taken. + if (auto *II = dyn_cast<InvokeInst>(TI)) + if (PostDominatedByUnreachable.count(II->getNormalDest())) { + PostDominatedByUnreachable.insert(BB); + // Return false here so that edge weights for InvokeInst could be decided + // in calcInvokeHeuristics(). 
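// (Returning true here instead would mark the block as fully handled: the
// heuristic chain in calculate() would never reach calcInvokeHeuristics(),
// and the invoke's normal/unwind edges would be left at the default 1/N
// split, since this early-exit path sets no probabilities itself.)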
+ return false; + } - if (ReachableEdges.empty()) + if (ReachableEdges.empty()) { + BranchProbability Prob(1, UnreachableEdges.size()); + for (unsigned SuccIdx : UnreachableEdges) + setEdgeProbability(BB, SuccIdx, Prob); return true; - uint32_t ReachableWeight = - std::max(UR_NONTAKEN_WEIGHT / (unsigned)ReachableEdges.size(), - NORMAL_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = ReachableEdges.begin(), - E = ReachableEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, ReachableWeight); + } + + BranchProbability UnreachableProb(UR_TAKEN_WEIGHT, + (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) * + UnreachableEdges.size()); + BranchProbability ReachableProb(UR_NONTAKEN_WEIGHT, + (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) * + ReachableEdges.size()); + + for (unsigned SuccIdx : UnreachableEdges) + setEdgeProbability(BB, SuccIdx, UnreachableProb); + for (unsigned SuccIdx : ReachableEdges) + setEdgeProbability(BB, SuccIdx, ReachableProb); return true; } @@ -213,10 +218,18 @@ bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) { WeightSum = 0; for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - uint32_t W = Weights[i] / ScalingFactor; - WeightSum += W; - setEdgeWeight(BB, i, W); + Weights[i] /= ScalingFactor; + WeightSum += Weights[i]; } + + if (WeightSum == 0) { + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + setEdgeProbability(BB, i, {1, e}); + } else { + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + setEdgeProbability(BB, i, {Weights[i], static_cast<uint32_t>(WeightSum)}); + } + assert(WeightSum <= UINT32_MAX && "Expected weights to scale down to 32 bits"); @@ -265,21 +278,24 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(BasicBlock *BB) { if (TI->getNumSuccessors() == 1 || ColdEdges.empty()) return false; - uint32_t ColdWeight = - std::max(CC_TAKEN_WEIGHT / (unsigned) ColdEdges.size(), MIN_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = ColdEdges.begin(), - E = ColdEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, ColdWeight); - - if (NormalEdges.empty()) + if (NormalEdges.empty()) { + BranchProbability Prob(1, ColdEdges.size()); + for (unsigned SuccIdx : ColdEdges) + setEdgeProbability(BB, SuccIdx, Prob); return true; - uint32_t NormalWeight = std::max( - CC_NONTAKEN_WEIGHT / (unsigned) NormalEdges.size(), NORMAL_WEIGHT); - for (SmallVectorImpl<unsigned>::iterator I = NormalEdges.begin(), - E = NormalEdges.end(); - I != E; ++I) - setEdgeWeight(BB, *I, NormalWeight); + } + + BranchProbability ColdProb(CC_TAKEN_WEIGHT, + (CC_TAKEN_WEIGHT + CC_NONTAKEN_WEIGHT) * + ColdEdges.size()); + BranchProbability NormalProb(CC_NONTAKEN_WEIGHT, + (CC_TAKEN_WEIGHT + CC_NONTAKEN_WEIGHT) * + NormalEdges.size()); + + for (unsigned SuccIdx : ColdEdges) + setEdgeProbability(BB, SuccIdx, ColdProb); + for (unsigned SuccIdx : NormalEdges) + setEdgeProbability(BB, SuccIdx, NormalProb); return true; } @@ -312,15 +328,18 @@ bool BranchProbabilityInfo::calcPointerHeuristics(BasicBlock *BB) { if (!isProb) std::swap(TakenIdx, NonTakenIdx); - setEdgeWeight(BB, TakenIdx, PH_TAKEN_WEIGHT); - setEdgeWeight(BB, NonTakenIdx, PH_NONTAKEN_WEIGHT); + BranchProbability TakenProb(PH_TAKEN_WEIGHT, + PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, TakenIdx, TakenProb); + setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl()); return true; } // Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges // as taken, exiting edges as not-taken. 
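A side note on the representation these hunks migrate to: edge weights become BranchProbability values built over a common denominator, so a block's out-edge probabilities sum to one by construction. A minimal standalone sanity check of that property (weights chosen arbitrarily; only llvm/Support/BranchProbability.h is assumed):

#include "llvm/Support/BranchProbability.h"
#include <cassert>

int main() {
  using llvm::BranchProbability;
  // One "taken" edge and one "not taken" edge, weighted 124:4 as an example.
  BranchProbability Taken(124, 128);
  BranchProbability NotTaken(4, 128);
  // Unlike raw uint32_t weights, the normalized pair sums to exactly one.
  assert(Taken + NotTaken == BranchProbability::getOne());
  assert(Taken.getCompl() == NotTaken);
  return 0;
}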
-bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) { - Loop *L = LI->getLoopFor(BB); +bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB, + const LoopInfo &LI) { + Loop *L = LI.getLoopFor(BB); if (!L) return false; @@ -340,37 +359,35 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) { if (BackEdges.empty() && ExitingEdges.empty()) return false; - if (uint32_t numBackEdges = BackEdges.size()) { - uint32_t backWeight = LBH_TAKEN_WEIGHT / numBackEdges; - if (backWeight < NORMAL_WEIGHT) - backWeight = NORMAL_WEIGHT; + // Collect the sum of probabilities of back-edges/in-edges/exiting-edges, and + // normalize them so that they sum up to one. + SmallVector<BranchProbability, 4> Probs(3, BranchProbability::getZero()); + unsigned Denom = (BackEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) + + (InEdges.empty() ? 0 : LBH_TAKEN_WEIGHT) + + (ExitingEdges.empty() ? 0 : LBH_NONTAKEN_WEIGHT); + if (!BackEdges.empty()) + Probs[0] = BranchProbability(LBH_TAKEN_WEIGHT, Denom); + if (!InEdges.empty()) + Probs[1] = BranchProbability(LBH_TAKEN_WEIGHT, Denom); + if (!ExitingEdges.empty()) + Probs[2] = BranchProbability(LBH_NONTAKEN_WEIGHT, Denom); - for (SmallVectorImpl<unsigned>::iterator EI = BackEdges.begin(), - EE = BackEdges.end(); EI != EE; ++EI) { - setEdgeWeight(BB, *EI, backWeight); - } + if (uint32_t numBackEdges = BackEdges.size()) { + auto Prob = Probs[0] / numBackEdges; + for (unsigned SuccIdx : BackEdges) + setEdgeProbability(BB, SuccIdx, Prob); } if (uint32_t numInEdges = InEdges.size()) { - uint32_t inWeight = LBH_TAKEN_WEIGHT / numInEdges; - if (inWeight < NORMAL_WEIGHT) - inWeight = NORMAL_WEIGHT; - - for (SmallVectorImpl<unsigned>::iterator EI = InEdges.begin(), - EE = InEdges.end(); EI != EE; ++EI) { - setEdgeWeight(BB, *EI, inWeight); - } + auto Prob = Probs[1] / numInEdges; + for (unsigned SuccIdx : InEdges) + setEdgeProbability(BB, SuccIdx, Prob); } if (uint32_t numExitingEdges = ExitingEdges.size()) { - uint32_t exitWeight = LBH_NONTAKEN_WEIGHT / numExitingEdges; - if (exitWeight < MIN_WEIGHT) - exitWeight = MIN_WEIGHT; - - for (SmallVectorImpl<unsigned>::iterator EI = ExitingEdges.begin(), - EE = ExitingEdges.end(); EI != EE; ++EI) { - setEdgeWeight(BB, *EI, exitWeight); - } + auto Prob = Probs[2] / numExitingEdges; + for (unsigned SuccIdx : ExitingEdges) + setEdgeProbability(BB, SuccIdx, Prob); } return true; @@ -452,9 +469,10 @@ bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) { if (!isProb) std::swap(TakenIdx, NonTakenIdx); - setEdgeWeight(BB, TakenIdx, ZH_TAKEN_WEIGHT); - setEdgeWeight(BB, NonTakenIdx, ZH_NONTAKEN_WEIGHT); - + BranchProbability TakenProb(ZH_TAKEN_WEIGHT, + ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, TakenIdx, TakenProb); + setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl()); return true; } @@ -488,9 +506,10 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(BasicBlock *BB) { if (!isProb) std::swap(TakenIdx, NonTakenIdx); - setEdgeWeight(BB, TakenIdx, FPH_TAKEN_WEIGHT); - setEdgeWeight(BB, NonTakenIdx, FPH_NONTAKEN_WEIGHT); - + BranchProbability TakenProb(FPH_TAKEN_WEIGHT, + FPH_TAKEN_WEIGHT + FPH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, TakenIdx, TakenProb); + setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl()); return true; } @@ -499,82 +518,30 @@ bool BranchProbabilityInfo::calcInvokeHeuristics(BasicBlock *BB) { if (!II) return false; - setEdgeWeight(BB, 0/*Index for Normal*/, IH_TAKEN_WEIGHT); - setEdgeWeight(BB, 1/*Index for Unwind*/, IH_NONTAKEN_WEIGHT); + 
BranchProbability TakenProb(IH_TAKEN_WEIGHT, + IH_TAKEN_WEIGHT + IH_NONTAKEN_WEIGHT); + setEdgeProbability(BB, 0 /*Index for Normal*/, TakenProb); + setEdgeProbability(BB, 1 /*Index for Unwind*/, TakenProb.getCompl()); return true; } -void BranchProbabilityInfo::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); -} - -bool BranchProbabilityInfo::runOnFunction(Function &F) { - DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName() - << " ----\n\n"); - LastF = &F; // Store the last function we ran on for printing. - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - assert(PostDominatedByUnreachable.empty()); - assert(PostDominatedByColdCall.empty()); - - // Walk the basic blocks in post-order so that we can build up state about - // the successors of a block iteratively. - for (auto BB : post_order(&F.getEntryBlock())) { - DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n"); - if (calcUnreachableHeuristics(BB)) - continue; - if (calcMetadataWeights(BB)) - continue; - if (calcColdCallHeuristics(BB)) - continue; - if (calcLoopBranchHeuristics(BB)) - continue; - if (calcPointerHeuristics(BB)) - continue; - if (calcZeroHeuristics(BB)) - continue; - if (calcFloatingPointHeuristics(BB)) - continue; - calcInvokeHeuristics(BB); - } - - PostDominatedByUnreachable.clear(); - PostDominatedByColdCall.clear(); - return false; -} - void BranchProbabilityInfo::releaseMemory() { - Weights.clear(); + Probs.clear(); } -void BranchProbabilityInfo::print(raw_ostream &OS, const Module *) const { +void BranchProbabilityInfo::print(raw_ostream &OS) const { OS << "---- Branch Probabilities ----\n"; // We print the probabilities from the last function the analysis ran over, // or the function it is currently running over. 
assert(LastF && "Cannot print prior to running over a function"); - for (Function::const_iterator BI = LastF->begin(), BE = LastF->end(); - BI != BE; ++BI) { - for (succ_const_iterator SI = succ_begin(BI), SE = succ_end(BI); - SI != SE; ++SI) { - printEdgeProbability(OS << " ", BI, *SI); + for (const auto &BI : *LastF) { + for (succ_const_iterator SI = succ_begin(&BI), SE = succ_end(&BI); SI != SE; + ++SI) { + printEdgeProbability(OS << " ", &BI, *SI); } } } -uint32_t BranchProbabilityInfo::getSumForBlock(const BasicBlock *BB) const { - uint32_t Sum = 0; - - for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { - uint32_t Weight = getEdgeWeight(BB, I.getSuccessorIndex()); - uint32_t PrevSum = Sum; - - Sum += Weight; - assert(Sum >= PrevSum); (void) PrevSum; - } - - return Sum; -} - bool BranchProbabilityInfo:: isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% @@ -583,97 +550,74 @@ isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const { } BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const { - uint32_t Sum = 0; - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); BasicBlock *MaxSucc = nullptr; for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { BasicBlock *Succ = *I; - uint32_t Weight = getEdgeWeight(BB, Succ); - uint32_t PrevSum = Sum; - - Sum += Weight; - assert(Sum > PrevSum); (void) PrevSum; - - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(BB, Succ); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = Succ; } } // Hot probability is at least 4/5 = 80% - if (BranchProbability(MaxWeight, Sum) > BranchProbability(4, 5)) + if (MaxProb > BranchProbability(4, 5)) return MaxSucc; return nullptr; } -/// Get the raw edge weight for the edge. If can't find it, return -/// DEFAULT_WEIGHT value. Here an edge is specified using PredBlock and an index -/// to the successors. -uint32_t BranchProbabilityInfo:: -getEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors) const { - DenseMap<Edge, uint32_t>::const_iterator I = - Weights.find(std::make_pair(Src, IndexInSuccessors)); +/// Get the raw edge probability for the edge. If can't find it, return a +/// default probability 1/N where N is the number of successors. Here an edge is +/// specified using PredBlock and an +/// index to the successors. +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + unsigned IndexInSuccessors) const { + auto I = Probs.find(std::make_pair(Src, IndexInSuccessors)); - if (I != Weights.end()) + if (I != Probs.end()) return I->second; - return DEFAULT_WEIGHT; + return {1, + static_cast<uint32_t>(std::distance(succ_begin(Src), succ_end(Src)))}; } -uint32_t BranchProbabilityInfo::getEdgeWeight(const BasicBlock *Src, - succ_const_iterator Dst) const { - return getEdgeWeight(Src, Dst.getSuccessorIndex()); +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + succ_const_iterator Dst) const { + return getEdgeProbability(Src, Dst.getSuccessorIndex()); } -/// Get the raw edge weight calculated for the block pair. This returns the sum -/// of all raw edge weights from Src to Dst. -uint32_t BranchProbabilityInfo:: -getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const { - uint32_t Weight = 0; - bool FoundWeight = false; - DenseMap<Edge, uint32_t>::const_iterator MapI; +/// Get the raw edge probability calculated for the block pair. 
This returns the +/// sum of all raw edge probabilities from Src to Dst. +BranchProbability +BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src, + const BasicBlock *Dst) const { + auto Prob = BranchProbability::getZero(); + bool FoundProb = false; for (succ_const_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I) if (*I == Dst) { - MapI = Weights.find(std::make_pair(Src, I.getSuccessorIndex())); - if (MapI != Weights.end()) { - FoundWeight = true; - Weight += MapI->second; + auto MapI = Probs.find(std::make_pair(Src, I.getSuccessorIndex())); + if (MapI != Probs.end()) { + FoundProb = true; + Prob += MapI->second; } } - return (!FoundWeight) ? DEFAULT_WEIGHT : Weight; + uint32_t succ_num = std::distance(succ_begin(Src), succ_end(Src)); + return FoundProb ? Prob : BranchProbability(1, succ_num); } -/// Set the edge weight for a given edge specified by PredBlock and an index -/// to the successors. -void BranchProbabilityInfo:: -setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors, - uint32_t Weight) { - Weights[std::make_pair(Src, IndexInSuccessors)] = Weight; - DEBUG(dbgs() << "set edge " << Src->getName() << " -> " - << IndexInSuccessors << " successor weight to " - << Weight << "\n"); -} - -/// Get an edge's probability, relative to other out-edges from Src. -BranchProbability BranchProbabilityInfo:: -getEdgeProbability(const BasicBlock *Src, unsigned IndexInSuccessors) const { - uint32_t N = getEdgeWeight(Src, IndexInSuccessors); - uint32_t D = getSumForBlock(Src); - - return BranchProbability(N, D); -} - -/// Get the probability of going from Src to Dst. It returns the sum of all -/// probabilities for edges from Src to Dst. -BranchProbability BranchProbabilityInfo:: -getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const { - - uint32_t N = getEdgeWeight(Src, Dst); - uint32_t D = getSumForBlock(Src); - - return BranchProbability(N, D); +/// Set the edge probability for a given edge specified by PredBlock and an +/// index to the successors. +void BranchProbabilityInfo::setEdgeProbability(const BasicBlock *Src, + unsigned IndexInSuccessors, + BranchProbability Prob) { + Probs[std::make_pair(Src, IndexInSuccessors)] = Prob; + DEBUG(dbgs() << "set edge " << Src->getName() << " -> " << IndexInSuccessors + << " successor probability to " << Prob << "\n"); } raw_ostream & @@ -688,3 +632,54 @@ BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, return OS; } + +void BranchProbabilityInfo::calculate(Function &F, const LoopInfo& LI) { + DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName() + << " ----\n\n"); + LastF = &F; // Store the last function we ran on for printing. + assert(PostDominatedByUnreachable.empty()); + assert(PostDominatedByColdCall.empty()); + + // Walk the basic blocks in post-order so that we can build up state about + // the successors of a block iteratively. 
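// (Post-order is what makes the single pass sound: every successor of BB has
// already been visited by the time BB is processed, so per-successor facts
// such as PostDominatedByUnreachable and PostDominatedByColdCall are
// populated before the heuristics in the loop below consult them.)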
+ for (auto BB : post_order(&F.getEntryBlock())) { + DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n"); + if (calcUnreachableHeuristics(BB)) + continue; + if (calcMetadataWeights(BB)) + continue; + if (calcColdCallHeuristics(BB)) + continue; + if (calcLoopBranchHeuristics(BB, LI)) + continue; + if (calcPointerHeuristics(BB)) + continue; + if (calcZeroHeuristics(BB)) + continue; + if (calcFloatingPointHeuristics(BB)) + continue; + calcInvokeHeuristics(BB); + } + + PostDominatedByUnreachable.clear(); + PostDominatedByColdCall.clear(); +} + +void BranchProbabilityInfoWrapperPass::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); +} + +bool BranchProbabilityInfoWrapperPass::runOnFunction(Function &F) { + const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + BPI.calculate(F, LI); + return false; +} + +void BranchProbabilityInfoWrapperPass::releaseMemory() { BPI.releaseMemory(); } + +void BranchProbabilityInfoWrapperPass::print(raw_ostream &OS, + const Module *) const { + BPI.print(OS); +} diff --git a/contrib/llvm/lib/Analysis/CFG.cpp b/contrib/llvm/lib/Analysis/CFG.cpp index e15109b..0dfd57d 100644 --- a/contrib/llvm/lib/Analysis/CFG.cpp +++ b/contrib/llvm/lib/Analysis/CFG.cpp @@ -69,8 +69,9 @@ void llvm::FindFunctionBackedges(const Function &F, /// and return its position in the terminator instruction's list of /// successors. It is an error to call this with a block that is not a /// successor. -unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) { - TerminatorInst *Term = BB->getTerminator(); +unsigned llvm::GetSuccessorNumber(const BasicBlock *BB, + const BasicBlock *Succ) { + const TerminatorInst *Term = BB->getTerminator(); #ifndef NDEBUG unsigned e = Term->getNumSuccessors(); #endif @@ -203,7 +204,8 @@ bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B, return true; // Linear scan, start at 'A', see whether we hit 'B' or the end first. - for (BasicBlock::const_iterator I = A, E = BB->end(); I != E; ++I) { + for (BasicBlock::const_iterator I = A->getIterator(), E = BB->end(); I != E; + ++I) { if (&*I == B) return true; } diff --git a/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp index fe1c088..4843ed6 100644 --- a/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/CFLAliasAnalysis.cpp @@ -27,18 +27,17 @@ // time. 
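One detail worth keeping in mind while reading the cache plumbing in this file: an entry holding an empty Optional means "sets currently being built", which is how recursion during interprocedural analysis is detected. A reduced model of the pattern, with hypothetical names and std::optional standing in for llvm::Optional:

#include <cassert>
#include <map>
#include <optional>

struct Info { int Data = 0; };
static std::map<int, std::optional<Info>> Cache;

// Returns the cache slot for Key, building it on first use. A re-entrant
// call for the same Key during the build observes the empty optional and
// can treat it as "in progress" instead of recursing forever.
const std::optional<Info> &ensure(int Key) {
  auto It = Cache.find(Key);
  if (It == Cache.end()) {
    Cache[Key];          // reserve the slot: marked as being built
    Info Built;          // ...build here, possibly re-entering ensure()...
    Cache[Key] = Built;  // publish the finished result
    It = Cache.find(Key);
  }
  return It->second;
}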
//===----------------------------------------------------------------------===// +#include "llvm/Analysis/CFLAliasAnalysis.h" #include "StratifiedSets.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" @@ -47,7 +46,6 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> -#include <forward_list> #include <memory> #include <tuple> @@ -55,6 +53,19 @@ using namespace llvm; #define DEBUG_TYPE "cfl-aa" +CFLAAResult::CFLAAResult(const TargetLibraryInfo &TLI) : AAResultBase(TLI) {} +CFLAAResult::CFLAAResult(CFLAAResult &&Arg) : AAResultBase(std::move(Arg)) {} + +// \brief Information we have about a function and would like to keep around +struct CFLAAResult::FunctionInfo { + StratifiedSets<Value *> Sets; + // Lots of functions have < 4 returns. Adjust as necessary. + SmallVector<Value *, 4> ReturnedValues; + + FunctionInfo(StratifiedSets<Value *> &&S, SmallVector<Value *, 4> &&RV) + : Sets(std::move(S)), ReturnedValues(std::move(RV)) {} +}; + // Try to go from a Value* to a Function*. Never returns nullptr. static Optional<Function *> parentFunctionOfValue(Value *); @@ -141,129 +152,13 @@ struct Edge { : From(From), To(To), Weight(W), AdditionalAttrs(A) {} }; -// \brief Information we have about a function and would like to keep around -struct FunctionInfo { - StratifiedSets<Value *> Sets; - // Lots of functions have < 4 returns. Adjust as necessary. - SmallVector<Value *, 4> ReturnedValues; - - FunctionInfo(StratifiedSets<Value *> &&S, SmallVector<Value *, 4> &&RV) - : Sets(std::move(S)), ReturnedValues(std::move(RV)) {} -}; - -struct CFLAliasAnalysis; - -struct FunctionHandle : public CallbackVH { - FunctionHandle(Function *Fn, CFLAliasAnalysis *CFLAA) - : CallbackVH(Fn), CFLAA(CFLAA) { - assert(Fn != nullptr); - assert(CFLAA != nullptr); - } - - ~FunctionHandle() override {} - - void deleted() override { removeSelfFromCache(); } - void allUsesReplacedWith(Value *) override { removeSelfFromCache(); } - -private: - CFLAliasAnalysis *CFLAA; - - void removeSelfFromCache(); -}; - -struct CFLAliasAnalysis : public ImmutablePass, public AliasAnalysis { -private: - /// \brief Cached mapping of Functions to their StratifiedSets. - /// If a function's sets are currently being built, it is marked - /// in the cache as an Optional without a value. This way, if we - /// have any kind of recursion, it is discernable from a function - /// that simply has empty sets. - DenseMap<Function *, Optional<FunctionInfo>> Cache; - std::forward_list<FunctionHandle> Handles; - -public: - static char ID; - - CFLAliasAnalysis() : ImmutablePass(ID) { - initializeCFLAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - ~CFLAliasAnalysis() override {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - } - - void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &AliasAnalysis::ID) - return (AliasAnalysis *)this; - return this; - } - - /// \brief Inserts the given Function into the cache. 
- void scan(Function *Fn); - - void evict(Function *Fn) { Cache.erase(Fn); } - - /// \brief Ensures that the given function is available in the cache. - /// Returns the appropriate entry from the cache. - const Optional<FunctionInfo> &ensureCached(Function *Fn) { - auto Iter = Cache.find(Fn); - if (Iter == Cache.end()) { - scan(Fn); - Iter = Cache.find(Fn); - assert(Iter != Cache.end()); - assert(Iter->second.hasValue()); - } - return Iter->second; - } - - AliasResult query(const MemoryLocation &LocA, const MemoryLocation &LocB); - - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - if (LocA.Ptr == LocB.Ptr) { - if (LocA.Size == LocB.Size) { - return MustAlias; - } else { - return PartialAlias; - } - } - - // Comparisons between global variables and other constants should be - // handled by BasicAA. - // TODO: ConstantExpr handling -- CFLAA may report NoAlias when comparing - // a GlobalValue and ConstantExpr, but every query needs to have at least - // one Value tied to a Function, and neither GlobalValues nor ConstantExprs - // are. - if (isa<Constant>(LocA.Ptr) && isa<Constant>(LocB.Ptr)) { - return AliasAnalysis::alias(LocA, LocB); - } - - AliasResult QueryResult = query(LocA, LocB); - if (QueryResult == MayAlias) - return AliasAnalysis::alias(LocA, LocB); - - return QueryResult; - } - - bool doInitialization(Module &M) override; -}; - -void FunctionHandle::removeSelfFromCache() { - assert(CFLAA != nullptr); - auto *Val = getValPtr(); - CFLAA->evict(cast<Function>(Val)); - setValPtr(nullptr); -} - // \brief Gets the edges our graph should have, based on an Instruction* class GetEdgesVisitor : public InstVisitor<GetEdgesVisitor, void> { - CFLAliasAnalysis &AA; + CFLAAResult &AA; SmallVectorImpl<Edge> &Output; public: - GetEdgesVisitor(CFLAliasAnalysis &AA, SmallVectorImpl<Edge> &Output) + GetEdgesVisitor(CFLAAResult &AA, SmallVectorImpl<Edge> &Output) : AA(AA), Output(Output) {} void visitInstruction(Instruction &) { @@ -480,6 +375,8 @@ public: } template <typename InstT> void visitCallLikeInst(InstT &Inst) { + // TODO: Add support for noalias args/all the other fun function attributes + // that we can tack on. SmallVector<Function *, 4> Targets; if (getPossibleTargets(&Inst, Targets)) { if (tryInterproceduralAnalysis(Targets, &Inst, Inst.arg_operands())) @@ -488,8 +385,16 @@ public: Output.clear(); } + // Because the function is opaque, we need to note that anything + // could have happened to the arguments, and that the result could alias + // just about anything, too. + // The goal of the loop is in part to unify many Values into one set, so we + // don't care if the function is void there. for (Value *V : Inst.arg_operands()) Output.push_back(Edge(&Inst, V, EdgeType::Assign, AttrAll)); + if (Inst.getNumArgOperands() == 0 && + Inst.getType() != Type::getVoidTy(Inst.getContext())) + Output.push_back(Edge(&Inst, &Inst, EdgeType::Assign, AttrAll)); } void visitCallInst(CallInst &Inst) { visitCallLikeInst(Inst); } @@ -624,7 +529,7 @@ public: // ----- Various Edge iterators for the graph ----- // // \brief Iterator for edges. Because this graph is bidirected, we don't - // allow modificaiton of the edges using this iterator. Additionally, the + // allow modification of the edges using this iterator. Additionally, the // iterator becomes invalid if you add edges to or from the node you're // getting the edges of. 
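// (Presumably because the adjacency lists live in contiguous storage:
// appending an edge can reallocate the underlying vector, which strands any
// iterator previously taken from it; the usual vector-iterator caveat
// applies.)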
struct EdgeIterator : public std::iterator<std::forward_iterator_tag, @@ -727,16 +632,6 @@ typedef WeightedBidirectionalGraph<std::pair<EdgeType, StratifiedAttrs>> GraphT; typedef DenseMap<Value *, GraphT::Node> NodeMapT; } -// -- Setting up/registering CFLAA pass -- // -char CFLAliasAnalysis::ID = 0; - -INITIALIZE_AG_PASS(CFLAliasAnalysis, AliasAnalysis, "cfl-aa", - "CFL-Based AA implementation", false, true, false) - -ImmutablePass *llvm::createCFLAliasAnalysisPass() { - return new CFLAliasAnalysis(); -} - //===----------------------------------------------------------------------===// // Function declarations that require types defined in the namespace above //===----------------------------------------------------------------------===// @@ -751,12 +646,10 @@ static Optional<StratifiedAttr> valueToAttrIndex(Value *Val); static EdgeType flipWeight(EdgeType); // Gets edges of the given Instruction*, writing them to the SmallVector*. -static void argsToEdges(CFLAliasAnalysis &, Instruction *, - SmallVectorImpl<Edge> &); +static void argsToEdges(CFLAAResult &, Instruction *, SmallVectorImpl<Edge> &); // Gets edges of the given ConstantExpr*, writing them to the SmallVector*. -static void argsToEdges(CFLAliasAnalysis &, ConstantExpr *, - SmallVectorImpl<Edge> &); +static void argsToEdges(CFLAAResult &, ConstantExpr *, SmallVectorImpl<Edge> &); // Gets the "Level" that one should travel in StratifiedSets // given an EdgeType. @@ -764,13 +657,13 @@ static Level directionOfEdgeType(EdgeType); // Builds the graph needed for constructing the StratifiedSets for the // given function -static void buildGraphFrom(CFLAliasAnalysis &, Function *, +static void buildGraphFrom(CFLAAResult &, Function *, SmallVectorImpl<Value *> &, NodeMapT &, GraphT &); // Gets the edges of a ConstantExpr as if it was an Instruction. This // function also acts on any nested ConstantExprs, adding the edges // of those to the given SmallVector as well. -static void constexprToEdges(CFLAliasAnalysis &, ConstantExpr &, +static void constexprToEdges(CFLAAResult &, ConstantExpr &, SmallVectorImpl<Edge> &); // Given an Instruction, this will add it to the graph, along with any @@ -779,16 +672,13 @@ static void constexprToEdges(CFLAliasAnalysis &, ConstantExpr &, // %0 = load i16* getelementptr ([1 x i16]* @a, 0, 0), align 2 // addInstructionToGraph would add both the `load` and `getelementptr` // instructions to the graph appropriately. -static void addInstructionToGraph(CFLAliasAnalysis &, Instruction &, +static void addInstructionToGraph(CFLAAResult &, Instruction &, SmallVectorImpl<Value *> &, NodeMapT &, GraphT &); // Notes whether it would be pointless to add the given Value to our sets. static bool canSkipAddingToSets(Value *Val); -// Builds the graph + StratifiedSets for a function. -static FunctionInfo buildSetsFrom(CFLAliasAnalysis &, Function *); - static Optional<Function *> parentFunctionOfValue(Value *Val) { if (auto *Inst = dyn_cast<Instruction>(Val)) { auto *Bb = Inst->getParent(); @@ -825,7 +715,7 @@ static bool hasUsefulEdges(Instruction *Inst) { } static bool hasUsefulEdges(ConstantExpr *CE) { - // ConstantExpr doens't have terminators, invokes, or fences, so only needs + // ConstantExpr doesn't have terminators, invokes, or fences, so only needs // to check for compares. 
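// For example, icmp eq (i8* @a, i8* @b) only produces an i1, so it adds
// nothing to the points-to graph, whereas getelementptr and bitcast
// constant expressions do propagate their pointer operand.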
return CE->getOpcode() != Instruction::ICmp && CE->getOpcode() != Instruction::FCmp; @@ -862,7 +752,7 @@ static EdgeType flipWeight(EdgeType Initial) { llvm_unreachable("Incomplete coverage of EdgeType enum"); } -static void argsToEdges(CFLAliasAnalysis &Analysis, Instruction *Inst, +static void argsToEdges(CFLAAResult &Analysis, Instruction *Inst, SmallVectorImpl<Edge> &Output) { assert(hasUsefulEdges(Inst) && "Expected instructions to have 'useful' edges"); @@ -870,7 +760,7 @@ static void argsToEdges(CFLAliasAnalysis &Analysis, Instruction *Inst, v.visit(Inst); } -static void argsToEdges(CFLAliasAnalysis &Analysis, ConstantExpr *CE, +static void argsToEdges(CFLAAResult &Analysis, ConstantExpr *CE, SmallVectorImpl<Edge> &Output) { assert(hasUsefulEdges(CE) && "Expected constant expr to have 'useful' edges"); GetEdgesVisitor v(Analysis, Output); @@ -889,7 +779,7 @@ static Level directionOfEdgeType(EdgeType Weight) { llvm_unreachable("Incomplete switch coverage"); } -static void constexprToEdges(CFLAliasAnalysis &Analysis, +static void constexprToEdges(CFLAAResult &Analysis, ConstantExpr &CExprToCollapse, SmallVectorImpl<Edge> &Results) { SmallVector<ConstantExpr *, 4> Worklist; @@ -919,7 +809,7 @@ static void constexprToEdges(CFLAliasAnalysis &Analysis, } } -static void addInstructionToGraph(CFLAliasAnalysis &Analysis, Instruction &Inst, +static void addInstructionToGraph(CFLAAResult &Analysis, Instruction &Inst, SmallVectorImpl<Value *> &ReturnedValues, NodeMapT &Map, GraphT &Graph) { const auto findOrInsertNode = [&Map, &Graph](Value *Val) { @@ -982,7 +872,7 @@ static void addInstructionToGraph(CFLAliasAnalysis &Analysis, Instruction &Inst, // buy us much that we don't already have. I'd like to add interprocedural // analysis prior to this however, in case that somehow requires the graph // produced by this for efficient execution -static void buildGraphFrom(CFLAliasAnalysis &Analysis, Function *Fn, +static void buildGraphFrom(CFLAAResult &Analysis, Function *Fn, SmallVectorImpl<Value *> &ReturnedValues, NodeMapT &Map, GraphT &Graph) { for (auto &Bb : Fn->getBasicBlockList()) @@ -1012,12 +902,13 @@ static bool canSkipAddingToSets(Value *Val) { return false; } -static FunctionInfo buildSetsFrom(CFLAliasAnalysis &Analysis, Function *Fn) { +// Builds the graph + StratifiedSets for a function. +CFLAAResult::FunctionInfo CFLAAResult::buildSetsFrom(Function *Fn) { NodeMapT Map; GraphT Graph; SmallVector<Value *, 4> ReturnedValues; - buildGraphFrom(Analysis, Fn, ReturnedValues, Map, Graph); + buildGraphFrom(*this, Fn, ReturnedValues, Map, Graph); DenseMap<GraphT::Node, Value *> NodeValueMap; NodeValueMap.resize(Map.size()); @@ -1098,19 +989,35 @@ static FunctionInfo buildSetsFrom(CFLAliasAnalysis &Analysis, Function *Fn) { return FunctionInfo(Builder.build(), std::move(ReturnedValues)); } -void CFLAliasAnalysis::scan(Function *Fn) { +void CFLAAResult::scan(Function *Fn) { auto InsertPair = Cache.insert(std::make_pair(Fn, Optional<FunctionInfo>())); (void)InsertPair; assert(InsertPair.second && "Trying to scan a function that has already been cached"); - FunctionInfo Info(buildSetsFrom(*this, Fn)); + FunctionInfo Info(buildSetsFrom(Fn)); Cache[Fn] = std::move(Info); Handles.push_front(FunctionHandle(Fn, this)); } -AliasResult CFLAliasAnalysis::query(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +void CFLAAResult::evict(Function *Fn) { Cache.erase(Fn); } + +/// \brief Ensures that the given function is available in the cache. +/// Returns the appropriate entry from the cache. 
+const Optional<CFLAAResult::FunctionInfo> & +CFLAAResult::ensureCached(Function *Fn) { + auto Iter = Cache.find(Fn); + if (Iter == Cache.end()) { + scan(Fn); + Iter = Cache.find(Fn); + assert(Iter != Cache.end()); + assert(Iter->second.hasValue()); + } + return Iter->second; +} + +AliasResult CFLAAResult::query(const MemoryLocation &LocA, + const MemoryLocation &LocB) { auto *ValA = const_cast<Value *>(LocA.Ptr); auto *ValB = const_cast<Value *>(LocB.Ptr); @@ -1176,7 +1083,37 @@ AliasResult CFLAliasAnalysis::query(const MemoryLocation &LocA, return NoAlias; } -bool CFLAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; +CFLAAResult CFLAA::run(Function &F, AnalysisManager<Function> *AM) { + return CFLAAResult(AM->getResult<TargetLibraryAnalysis>(F)); +} + +char CFLAA::PassID; + +char CFLAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(CFLAAWrapperPass, "cfl-aa", "CFL-Based Alias Analysis", + false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(CFLAAWrapperPass, "cfl-aa", "CFL-Based Alias Analysis", + false, true) + +ImmutablePass *llvm::createCFLAAWrapperPass() { return new CFLAAWrapperPass(); } + +CFLAAWrapperPass::CFLAAWrapperPass() : ImmutablePass(ID) { + initializeCFLAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool CFLAAWrapperPass::doInitialization(Module &M) { + Result.reset( + new CFLAAResult(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; +} + +bool CFLAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void CFLAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } diff --git a/contrib/llvm/lib/Analysis/IPA/CallGraph.cpp b/contrib/llvm/lib/Analysis/CallGraph.cpp index e2799d9..7cec962 100644 --- a/contrib/llvm/lib/Analysis/IPA/CallGraph.cpp +++ b/contrib/llvm/lib/Analysis/CallGraph.cpp @@ -22,7 +22,7 @@ using namespace llvm; CallGraph::CallGraph(Module &M) : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)), - CallsExternalNode(new CallGraphNode(nullptr)) { + CallsExternalNode(llvm::make_unique<CallGraphNode>(nullptr)) { // Add every function to the call graph. for (Function &F : M) addToCallGraph(&F); @@ -32,10 +32,19 @@ CallGraph::CallGraph(Module &M) Root = ExternalCallingNode; } +CallGraph::CallGraph(CallGraph &&Arg) + : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root), + ExternalCallingNode(Arg.ExternalCallingNode), + CallsExternalNode(std::move(Arg.CallsExternalNode)) { + Arg.FunctionMap.clear(); + Arg.Root = nullptr; + Arg.ExternalCallingNode = nullptr; +} + CallGraph::~CallGraph() { // CallsExternalNode is not in the function map, delete it explicitly. - CallsExternalNode->allReferencesDropped(); - delete CallsExternalNode; + if (CallsExternalNode) + CallsExternalNode->allReferencesDropped(); // Reset all node's use counts to zero before deleting them to prevent an // assertion from firing. @@ -43,8 +52,6 @@ CallGraph::~CallGraph() { for (auto &I : FunctionMap) I.second->allReferencesDropped(); #endif - for (auto &I : FunctionMap) - delete I.second; } void CallGraph::addToCallGraph(Function *F) { @@ -70,7 +77,7 @@ void CallGraph::addToCallGraph(Function *F) { // If this function is not defined in this translation unit, it could call // anything. 
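// (Both conservative facts funnel into the same sink: a declaration-only
// function gets an edge to CallsExternalNode here, and every indirect call
// site gets one further down, so unknown callees are over-approximated by a
// single node in the graph.)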
if (F->isDeclaration() && !F->isIntrinsic()) - Node->addCalledFunction(CallSite(), CallsExternalNode); + Node->addCalledFunction(CallSite(), CallsExternalNode.get()); // Look for calls by this function. for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB) @@ -83,7 +90,7 @@ void CallGraph::addToCallGraph(Function *F) { // Indirect calls of intrinsics are not allowed so no need to check. // We can be more precise here by using TargetArg returned by // Intrinsic::isLeaf. - Node->addCalledFunction(CS, CallsExternalNode); + Node->addCalledFunction(CS, CallsExternalNode.get()); else if (!Callee->isIntrinsic()) Node->addCalledFunction(CS, getOrInsertFunction(Callee)); } @@ -105,7 +112,7 @@ void CallGraph::print(raw_ostream &OS) const { Nodes.reserve(FunctionMap.size()); for (auto I = begin(), E = end(); I != E; ++I) - Nodes.push_back(I->second); + Nodes.push_back(I->second.get()); std::sort(Nodes.begin(), Nodes.end(), [](CallGraphNode *LHS, CallGraphNode *RHS) { @@ -120,9 +127,8 @@ void CallGraph::print(raw_ostream &OS) const { CN->print(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void CallGraph::dump() const { print(dbgs()); } -#endif // removeFunctionFromModule - Unlink the function from this module, returning // it. Because this removes the function from the module, the call graph node @@ -134,7 +140,6 @@ Function *CallGraph::removeFunctionFromModule(CallGraphNode *CGN) { assert(CGN->empty() && "Cannot remove function from call " "graph if it references other functions!"); Function *F = CGN->getFunction(); // Get the function for the call graph node - delete CGN; // Delete the call graph node for this func FunctionMap.erase(F); // Remove the call graph node from the map M.getFunctionList().remove(F); @@ -152,7 +157,7 @@ void CallGraph::spliceFunction(const Function *From, const Function *To) { "Pointing CallGraphNode at a function that already exists"); FunctionMapTy::iterator I = FunctionMap.find(From); I->second->F = const_cast<Function*>(To); - FunctionMap[To] = I->second; + FunctionMap[To] = std::move(I->second); FunctionMap.erase(I); } @@ -160,12 +165,13 @@ void CallGraph::spliceFunction(const Function *From, const Function *To) { // it will insert a new CallGraphNode for the specified function if one does // not already exist. CallGraphNode *CallGraph::getOrInsertFunction(const Function *F) { - CallGraphNode *&CGN = FunctionMap[F]; + auto &CGN = FunctionMap[F]; if (CGN) - return CGN; + return CGN.get(); assert((!F || F->getParent() == &M) && "Function not in current module!"); - return CGN = new CallGraphNode(const_cast<Function*>(F)); + CGN = llvm::make_unique<CallGraphNode>(const_cast<Function *>(F)); + return CGN.get(); } //===----------------------------------------------------------------------===// @@ -190,9 +196,8 @@ void CallGraphNode::print(raw_ostream &OS) const { OS << '\n'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void CallGraphNode::dump() const { print(dbgs()); } -#endif /// removeCallEdgeFor - This method removes the edge in the node for the /// specified call site. 
Note that this method takes linear time, so it @@ -297,6 +302,5 @@ void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const { G->print(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void CallGraphWrapperPass::dump() const { print(dbgs(), nullptr); } -#endif diff --git a/contrib/llvm/lib/Analysis/IPA/CallGraphSCCPass.cpp b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp index 07b389a..6dd1d0a 100644 --- a/contrib/llvm/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp @@ -612,9 +612,10 @@ namespace { bool runOnSCC(CallGraphSCC &SCC) override { Out << Banner; for (CallGraphNode *CGN : SCC) { - if (CGN->getFunction()) - CGN->getFunction()->print(Out); - else + if (CGN->getFunction()) { + if (isFunctionInPrintList(CGN->getFunction()->getName())) + CGN->getFunction()->print(Out); + } else Out << "\nPrinting <null> Function\n"; } return false; diff --git a/contrib/llvm/lib/Analysis/IPA/CallPrinter.cpp b/contrib/llvm/lib/Analysis/CallPrinter.cpp index 68dcd3c..68dcd3c 100644 --- a/contrib/llvm/lib/Analysis/IPA/CallPrinter.cpp +++ b/contrib/llvm/lib/Analysis/CallPrinter.cpp diff --git a/contrib/llvm/lib/Analysis/CaptureTracking.cpp b/contrib/llvm/lib/Analysis/CaptureTracking.cpp index 52ef807..1add2fa 100644 --- a/contrib/llvm/lib/Analysis/CaptureTracking.cpp +++ b/contrib/llvm/lib/Analysis/CaptureTracking.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -52,63 +53,6 @@ namespace { bool Captured; }; - struct NumberedInstCache { - SmallDenseMap<const Instruction *, unsigned, 32> NumberedInsts; - BasicBlock::const_iterator LastInstFound; - unsigned LastInstPos; - const BasicBlock *BB; - - NumberedInstCache(const BasicBlock *BasicB) : LastInstPos(0), BB(BasicB) { - LastInstFound = BB->end(); - } - - /// \brief Find the first instruction 'A' or 'B' in 'BB'. Number out - /// instruction while walking 'BB'. - const Instruction *find(const Instruction *A, const Instruction *B) { - const Instruction *Inst = nullptr; - assert(!(LastInstFound == BB->end() && LastInstPos != 0) && - "Instruction supposed to be in NumberedInsts"); - - // Start the search with the instruction found in the last lookup round. - auto II = BB->begin(); - auto IE = BB->end(); - if (LastInstFound != IE) - II = std::next(LastInstFound); - - // Number all instructions up to the point where we find 'A' or 'B'. - for (++LastInstPos; II != IE; ++II, ++LastInstPos) { - Inst = cast<Instruction>(II); - NumberedInsts[Inst] = LastInstPos; - if (Inst == A || Inst == B) - break; - } - - assert(II != IE && "Instruction not found?"); - LastInstFound = II; - return Inst; - } - - /// \brief Find out whether 'A' dominates 'B', meaning whether 'A' - /// comes before 'B' in 'BB'. This is a simplification that considers - /// cached instruction positions and ignores other basic blocks, being - /// only relevant to compare relative instructions positions inside 'BB'. 
- bool dominates(const Instruction *A, const Instruction *B) { - assert(A->getParent() == B->getParent() && - "Instructions must be in the same basic block!"); - - unsigned NA = NumberedInsts.lookup(A); - unsigned NB = NumberedInsts.lookup(B); - if (NA && NB) - return NA < NB; - if (NA) - return true; - if (NB) - return false; - - return A == find(A, B); - } - }; - /// Only find pointer captures which happen before the given instruction. Uses /// the dominator tree to determine whether one instruction is before another. /// Only supports the case where the Value is defined in the same basic block @@ -116,8 +60,8 @@ namespace { struct CapturesBefore : public CaptureTracker { CapturesBefore(bool ReturnCaptures, const Instruction *I, DominatorTree *DT, - bool IncludeI) - : LocalInstCache(I->getParent()), BeforeHere(I), DT(DT), + bool IncludeI, OrderedBasicBlock *IC) + : OrderedBB(IC), BeforeHere(I), DT(DT), ReturnCaptures(ReturnCaptures), IncludeI(IncludeI), Captured(false) {} void tooManyUses() override { Captured = true; } @@ -131,18 +75,18 @@ namespace { // Compute the case where both instructions are inside the same basic // block. Since instructions in the same BB as BeforeHere are numbered in - // 'LocalInstCache', avoid using 'dominates' and 'isPotentiallyReachable' + // 'OrderedBB', avoid using 'dominates' and 'isPotentiallyReachable' // which are very expensive for large basic blocks. if (BB == BeforeHere->getParent()) { // 'I' dominates 'BeforeHere' => not safe to prune. // - // The value defined by an invoke dominates an instruction only if it - // dominates every instruction in UseBB. A PHI is dominated only if - // the instruction dominates every possible use in the UseBB. Since + // The value defined by an invoke dominates an instruction only + // if it dominates every instruction in UseBB. A PHI is dominated only + // if the instruction dominates every possible use in the UseBB. Since // UseBB == BB, avoid pruning. if (isa<InvokeInst>(BeforeHere) || isa<PHINode>(I) || I == BeforeHere) return false; - if (!LocalInstCache.dominates(BeforeHere, I)) + if (!OrderedBB->dominates(BeforeHere, I)) return false; // 'BeforeHere' comes before 'I', it's safe to prune if we also @@ -157,10 +101,7 @@ namespace { SmallVector<BasicBlock*, 32> Worklist; Worklist.append(succ_begin(BB), succ_end(BB)); - if (!isPotentiallyReachableFromMany(Worklist, BB, DT)) - return true; - - return false; + return !isPotentiallyReachableFromMany(Worklist, BB, DT); } // If the value is defined in the same basic block as use and BeforeHere, @@ -196,7 +137,7 @@ namespace { return true; } - NumberedInstCache LocalInstCache; + OrderedBasicBlock *OrderedBB; const Instruction *BeforeHere; DominatorTree *DT; @@ -238,21 +179,29 @@ bool llvm::PointerMayBeCaptured(const Value *V, /// returning the value (or part of it) from the function counts as capturing /// it or not. The boolean StoreCaptures specifies whether storing the value /// (or part of it) into memory anywhere automatically counts as capturing it -/// or not. +/// or not. An ordered basic block \p OBB can be used in order to speed up +/// queries about relative order among instructions in the same basic block.
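The OrderedBasicBlock utility that replaces the deleted NumberedInstCache keeps the same lazy-numbering idea: number instructions only as far as queries require, then answer order queries from the map. A reduced standalone sketch of that idea (hypothetical types, not the LLVM class):

#include <cassert>
#include <unordered_map>
#include <vector>

// "Instructions" are modelled as ints; a basic block is their sequence.
struct LazyOrder {
  const std::vector<int> &BB;
  std::unordered_map<int, unsigned> Pos; // instruction -> position
  unsigned Next = 0;

  // After the first query touching A and B, comparing them again is an O(1)
  // lookup rather than a fresh linear scan of the block.
  bool comesBefore(int A, int B) {
    while (!Pos.count(A) || !Pos.count(B)) {
      assert(Next < BB.size() && "A and B must both be in this block");
      Pos.emplace(BB[Next], Next);
      ++Next;
    }
    return Pos.at(A) < Pos.at(B);
  }
};

With LazyOrder O{Block}, repeated O.comesBefore queries amortize to a single walk of the block, which is the effect the CapturesBefore changes above are after.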
bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures, bool StoreCaptures, const Instruction *I, - DominatorTree *DT, bool IncludeI) { + DominatorTree *DT, bool IncludeI, + OrderedBasicBlock *OBB) { assert(!isa<GlobalValue>(V) && "It doesn't make sense to ask whether a global is captured."); + bool UseNewOBB = OBB == nullptr; if (!DT) return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures); + if (UseNewOBB) + OBB = new OrderedBasicBlock(I->getParent()); // TODO: See comment in PointerMayBeCaptured regarding what could be done // with StoreCaptures. - CapturesBefore CB(ReturnCaptures, I, DT, IncludeI); + CapturesBefore CB(ReturnCaptures, I, DT, IncludeI, OBB); PointerMayBeCaptured(V, &CB); + + if (UseNewOBB) + delete OBB; return CB.Captured; } @@ -300,8 +249,9 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) { // that loading a value from a pointer does not cause the pointer to be // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). - CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (CallSite::arg_iterator A = B; A != E; ++A) + CallSite::data_operand_iterator B = + CS.data_operands_begin(), E = CS.data_operands_end(); + for (CallSite::data_operand_iterator A = B; A != E; ++A) if (A->get() == V && !CS.doesNotCapture(A - B)) // The parameter is not marked 'nocapture' - captured. if (Tracker->captured(U)) diff --git a/contrib/llvm/lib/Analysis/CodeMetrics.cpp b/contrib/llvm/lib/Analysis/CodeMetrics.cpp index 46a2c43..4090b4c 100644 --- a/contrib/llvm/lib/Analysis/CodeMetrics.cpp +++ b/contrib/llvm/lib/Analysis/CodeMetrics.cpp @@ -45,14 +45,8 @@ static void completeEphemeralValues(SmallVector<const Value *, 16> &WorkSet, continue; // If all uses of this value are ephemeral, then so is this value. - bool FoundNEUse = false; - for (const User *I : V->users()) - if (!EphValues.count(I)) { - FoundNEUse = true; - break; - } - - if (FoundNEUse) + if (!std::all_of(V->user_begin(), V->user_end(), + [&](const User *U) { return EphValues.count(U); })) continue; EphValues.insert(V); @@ -116,7 +110,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); II != E; ++II) { // Skip ephemeral values. - if (EphValues.count(II)) + if (EphValues.count(&*II)) continue; // Special handling for calls. @@ -155,6 +149,9 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy()) ++NumVectorInsts; + if (II->getType()->isTokenTy() && II->isUsedOutsideOfBlock(BB)) + notDuplicatable = true; + if (const CallInst *CI = dyn_cast<CallInst>(II)) if (CI->cannotDuplicate()) notDuplicatable = true; diff --git a/contrib/llvm/lib/Analysis/ConstantFolding.cpp b/contrib/llvm/lib/Analysis/ConstantFolding.cpp index 02a5aef..ccb5663 100644 --- a/contrib/llvm/lib/Analysis/ConstantFolding.cpp +++ b/contrib/llvm/lib/Analysis/ConstantFolding.cpp @@ -248,8 +248,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, // Look through ptr->int and ptr->ptr casts. 
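// (Note that the hunk below stops looking through AddrSpaceCast: unlike a
// bitcast, an address space cast is not guaranteed to preserve the
// pointer's bit pattern, so folding a constant offset from a global
// through it would be unsound.)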
if (CE->getOpcode() == Instruction::PtrToInt || - CE->getOpcode() == Instruction::BitCast || - CE->getOpcode() == Instruction::AddrSpaceCast) + CE->getOpcode() == Instruction::BitCast) return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, DL); // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5) @@ -532,6 +531,10 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, if (GV->isConstant() && GV->hasDefinitiveInitializer()) return GV->getInitializer(); + if (auto *GA = dyn_cast<GlobalAlias>(C)) + if (GA->getAliasee() && !GA->mayBeOverridden()) + return ConstantFoldLoadFromConstPtr(GA->getAliasee(), DL); + // If the loaded value isn't a constant expr, we can't handle it. ConstantExpr *CE = dyn_cast<ConstantExpr>(C); if (!CE) @@ -1236,6 +1239,9 @@ bool llvm::canConstantFoldCallTo(const Function *F) { case Intrinsic::sqrt: case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: case Intrinsic::pow: case Intrinsic::powi: case Intrinsic::bswap: @@ -1276,24 +1282,30 @@ bool llvm::canConstantFoldCallTo(const Function *F) { // return true for a name like "cos\0blah" which strcmp would return equal to // "cos", but has length 8. switch (Name[0]) { - default: return false; + default: + return false; case 'a': - return Name == "acos" || Name == "asin" || Name == "atan" || Name =="atan2"; + return Name == "acos" || Name == "asin" || Name == "atan" || + Name == "atan2" || Name == "acosf" || Name == "asinf" || + Name == "atanf" || Name == "atan2f"; case 'c': - return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh"; + return Name == "ceil" || Name == "cos" || Name == "cosh" || + Name == "ceilf" || Name == "cosf" || Name == "coshf"; case 'e': - return Name == "exp" || Name == "exp2"; + return Name == "exp" || Name == "exp2" || Name == "expf" || Name == "exp2f"; case 'f': - return Name == "fabs" || Name == "fmod" || Name == "floor"; + return Name == "fabs" || Name == "floor" || Name == "fmod" || + Name == "fabsf" || Name == "floorf" || Name == "fmodf"; case 'l': - return Name == "log" || Name == "log10"; + return Name == "log" || Name == "log10" || Name == "logf" || + Name == "log10f"; case 'p': - return Name == "pow"; + return Name == "pow" || Name == "powf"; case 's': return Name == "sin" || Name == "sinh" || Name == "sqrt" || - Name == "sinf" || Name == "sqrtf"; + Name == "sinf" || Name == "sinhf" || Name == "sqrtf"; case 't': - return Name == "tan" || Name == "tanh"; + return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf"; } } @@ -1422,6 +1434,36 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, return ConstantFP::get(Ty->getContext(), V); } + if (IntrinsicID == Intrinsic::floor) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmTowardNegative); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::ceil) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmTowardPositive); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::trunc) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmTowardZero); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::rint) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmNearestTiesToEven); + return ConstantFP::get(Ty->getContext(), V); + } + + if (IntrinsicID == Intrinsic::nearbyint) { + APFloat V = Op->getValueAPF(); + V.roundToIntegral(APFloat::rmNearestTiesToEven); + 
return ConstantFP::get(Ty->getContext(), V); + } + /// We only fold functions with finite arguments. Folding NaN and inf is /// likely to be aborted with an exception anyway, and some host libms /// have known errors raising exceptions. @@ -1448,10 +1490,6 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, return ConstantFoldFP(exp, V, Ty); case Intrinsic::exp2: return ConstantFoldFP(exp2, V, Ty); - case Intrinsic::floor: - return ConstantFoldFP(floor, V, Ty); - case Intrinsic::ceil: - return ConstantFoldFP(ceil, V, Ty); case Intrinsic::sin: return ConstantFoldFP(sin, V, Ty); case Intrinsic::cos: @@ -1463,43 +1501,51 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, switch (Name[0]) { case 'a': - if (Name == "acos" && TLI->has(LibFunc::acos)) + if ((Name == "acos" && TLI->has(LibFunc::acos)) || + (Name == "acosf" && TLI->has(LibFunc::acosf))) return ConstantFoldFP(acos, V, Ty); - else if (Name == "asin" && TLI->has(LibFunc::asin)) + else if ((Name == "asin" && TLI->has(LibFunc::asin)) || + (Name == "asinf" && TLI->has(LibFunc::asinf))) return ConstantFoldFP(asin, V, Ty); - else if (Name == "atan" && TLI->has(LibFunc::atan)) + else if ((Name == "atan" && TLI->has(LibFunc::atan)) || + (Name == "atanf" && TLI->has(LibFunc::atanf))) return ConstantFoldFP(atan, V, Ty); break; case 'c': - if (Name == "ceil" && TLI->has(LibFunc::ceil)) + if ((Name == "ceil" && TLI->has(LibFunc::ceil)) || + (Name == "ceilf" && TLI->has(LibFunc::ceilf))) return ConstantFoldFP(ceil, V, Ty); - else if (Name == "cos" && TLI->has(LibFunc::cos)) + else if ((Name == "cos" && TLI->has(LibFunc::cos)) || + (Name == "cosf" && TLI->has(LibFunc::cosf))) return ConstantFoldFP(cos, V, Ty); - else if (Name == "cosh" && TLI->has(LibFunc::cosh)) + else if ((Name == "cosh" && TLI->has(LibFunc::cosh)) || + (Name == "coshf" && TLI->has(LibFunc::coshf))) return ConstantFoldFP(cosh, V, Ty); - else if (Name == "cosf" && TLI->has(LibFunc::cosf)) - return ConstantFoldFP(cos, V, Ty); break; case 'e': - if (Name == "exp" && TLI->has(LibFunc::exp)) + if ((Name == "exp" && TLI->has(LibFunc::exp)) || + (Name == "expf" && TLI->has(LibFunc::expf))) return ConstantFoldFP(exp, V, Ty); - - if (Name == "exp2" && TLI->has(LibFunc::exp2)) { + if ((Name == "exp2" && TLI->has(LibFunc::exp2)) || + (Name == "exp2f" && TLI->has(LibFunc::exp2f))) // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a // C99 library. 
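                 // (For illustration: exp2(3.0) is folded via pow(2.0, 3.0),
                 // which also yields 8.0 exactly, so the folded result does
                 // not depend on the host libm providing C99 exp2.)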
return ConstantFoldBinaryFP(pow, 2.0, V, Ty); - } break; case 'f': - if (Name == "fabs" && TLI->has(LibFunc::fabs)) + if ((Name == "fabs" && TLI->has(LibFunc::fabs)) || + (Name == "fabsf" && TLI->has(LibFunc::fabsf))) return ConstantFoldFP(fabs, V, Ty); - else if (Name == "floor" && TLI->has(LibFunc::floor)) + else if ((Name == "floor" && TLI->has(LibFunc::floor)) || + (Name == "floorf" && TLI->has(LibFunc::floorf))) return ConstantFoldFP(floor, V, Ty); break; case 'l': - if (Name == "log" && V > 0 && TLI->has(LibFunc::log)) + if ((Name == "log" && V > 0 && TLI->has(LibFunc::log)) || + (Name == "logf" && V > 0 && TLI->has(LibFunc::logf))) return ConstantFoldFP(log, V, Ty); - else if (Name == "log10" && V > 0 && TLI->has(LibFunc::log10)) + else if ((Name == "log10" && V > 0 && TLI->has(LibFunc::log10)) || + (Name == "log10f" && V > 0 && TLI->has(LibFunc::log10f))) return ConstantFoldFP(log10, V, Ty); else if (IntrinsicID == Intrinsic::sqrt && (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())) { @@ -1516,21 +1562,22 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, } break; case 's': - if (Name == "sin" && TLI->has(LibFunc::sin)) + if ((Name == "sin" && TLI->has(LibFunc::sin)) || + (Name == "sinf" && TLI->has(LibFunc::sinf))) return ConstantFoldFP(sin, V, Ty); - else if (Name == "sinh" && TLI->has(LibFunc::sinh)) + else if ((Name == "sinh" && TLI->has(LibFunc::sinh)) || + (Name == "sinhf" && TLI->has(LibFunc::sinhf))) return ConstantFoldFP(sinh, V, Ty); - else if (Name == "sqrt" && V >= 0 && TLI->has(LibFunc::sqrt)) - return ConstantFoldFP(sqrt, V, Ty); - else if (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc::sqrtf)) + else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc::sqrt)) || + (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc::sqrtf))) return ConstantFoldFP(sqrt, V, Ty); - else if (Name == "sinf" && TLI->has(LibFunc::sinf)) - return ConstantFoldFP(sin, V, Ty); break; case 't': - if (Name == "tan" && TLI->has(LibFunc::tan)) + if ((Name == "tan" && TLI->has(LibFunc::tan)) || + (Name == "tanf" && TLI->has(LibFunc::tanf))) return ConstantFoldFP(tan, V, Ty); - else if (Name == "tanh" && TLI->has(LibFunc::tanh)) + else if ((Name == "tanh" && TLI->has(LibFunc::tanh)) || + (Name == "tanhf" && TLI->has(LibFunc::tanhf))) return ConstantFoldFP(tanh, V, Ty); break; default: @@ -1633,11 +1680,14 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, if (!TLI) return nullptr; - if (Name == "pow" && TLI->has(LibFunc::pow)) + if ((Name == "pow" && TLI->has(LibFunc::pow)) || + (Name == "powf" && TLI->has(LibFunc::powf))) return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); - if (Name == "fmod" && TLI->has(LibFunc::fmod)) + if ((Name == "fmod" && TLI->has(LibFunc::fmod)) || + (Name == "fmodf" && TLI->has(LibFunc::fmodf))) return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty); - if (Name == "atan2" && TLI->has(LibFunc::atan2)) + if ((Name == "atan2" && TLI->has(LibFunc::atan2)) || + (Name == "atan2f" && TLI->has(LibFunc::atan2f))) return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty); } else if (ConstantInt *Op2C = dyn_cast<ConstantInt>(Operands[1])) { if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy()) diff --git a/contrib/llvm/lib/Analysis/CostModel.cpp b/contrib/llvm/lib/Analysis/CostModel.cpp index b529c1a..0383cbf 100644 --- a/contrib/llvm/lib/Analysis/CostModel.cpp +++ b/contrib/llvm/lib/Analysis/CostModel.cpp @@ -152,10 +152,7 @@ static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft, Mask[i] = val; SmallVector<int, 16> 
ActualMask = SI->getShuffleMask(); - if (Mask != ActualMask) - return false; - - return true; + return Mask == ActualMask; } static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp, @@ -383,10 +380,8 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { return -1; switch (I->getOpcode()) { - case Instruction::GetElementPtr:{ - Type *ValTy = I->getOperand(0)->getType()->getPointerElementType(); - return TTI->getAddressComputationCost(ValTy); - } + case Instruction::GetElementPtr: + return TTI->getUserCost(I); case Instruction::Ret: case Instruction::PHI: @@ -505,12 +500,12 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { } case Instruction::Call: if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - SmallVector<Type*, 4> Tys; + SmallVector<Value *, 4> Args; for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J) - Tys.push_back(II->getArgOperand(J)->getType()); + Args.push_back(II->getArgOperand(J)); return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), - Tys); + Args); } return -1; default: @@ -525,7 +520,7 @@ void CostModelAnalysis::print(raw_ostream &OS, const Module*) const { for (Function::iterator B = F->begin(), BE = F->end(); B != BE; ++B) { for (BasicBlock::iterator it = B->begin(), e = B->end(); it != e; ++it) { - Instruction *Inst = it; + Instruction *Inst = &*it; unsigned Cost = getInstructionCost(Inst); if (Cost != (unsigned)-1) OS << "Cost Model: Found an estimated cost of " << Cost; diff --git a/contrib/llvm/lib/Analysis/Delinearization.cpp b/contrib/llvm/lib/Analysis/Delinearization.cpp index 9d15786..baee8b3 100644 --- a/contrib/llvm/lib/Analysis/Delinearization.cpp +++ b/contrib/llvm/lib/Analysis/Delinearization.cpp @@ -60,12 +60,12 @@ public: void Delinearization::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } bool Delinearization::runOnFunction(Function &F) { this->F = &F; - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); return false; } @@ -102,20 +102,14 @@ void Delinearization::print(raw_ostream &O, const Module *) const { if (!BasePointer) break; AccessFn = SE->getMinusSCEV(AccessFn, BasePointer); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(AccessFn); - - // Do not try to delinearize memory accesses that are not AddRecs. - if (!AR) - break; - O << "\n"; O << "Inst:" << *Inst << "\n"; O << "In Loop with Header: " << L->getHeader()->getName() << "\n"; - O << "AddRec: " << *AR << "\n"; + O << "AccessFunction: " << *AccessFn << "\n"; SmallVector<const SCEV *, 3> Subscripts, Sizes; - SE->delinearize(AR, Subscripts, Sizes, SE->getElementSize(Inst)); + SE->delinearize(AccessFn, Subscripts, Sizes, SE->getElementSize(Inst)); if (Subscripts.size() == 0 || Sizes.size() == 0 || Subscripts.size() != Sizes.size()) { O << "failed to delinearize\n"; diff --git a/contrib/llvm/lib/Analysis/DemandedBits.cpp b/contrib/llvm/lib/Analysis/DemandedBits.cpp new file mode 100644 index 0000000..912c5ce --- /dev/null +++ b/contrib/llvm/lib/Analysis/DemandedBits.cpp @@ -0,0 +1,392 @@ +//===---- DemandedBits.cpp - Determine demanded bits ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass implements a demanded bits analysis. A demanded bit is one that +// contributes to a result; bits that are not demanded can be either zero or +// one without affecting control or data flow. For example in this sequence: +// +// %1 = add i32 %x, %y +// %2 = trunc i32 %1 to i16 +// +// Only the lowest 16 bits of %1 are demanded; the rest are removed by the +// trunc. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "demanded-bits" + +char DemandedBits::ID = 0; +INITIALIZE_PASS_BEGIN(DemandedBits, "demanded-bits", "Demanded bits analysis", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(DemandedBits, "demanded-bits", "Demanded bits analysis", + false, false) + +DemandedBits::DemandedBits() : FunctionPass(ID), F(nullptr), Analyzed(false) { + initializeDemandedBitsPass(*PassRegistry::getPassRegistry()); +} + +void DemandedBits::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.setPreservesAll(); +} + +static bool isAlwaysLive(Instruction *I) { + return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + I->isEHPad() || I->mayHaveSideEffects(); +} + +void DemandedBits::determineLiveOperandBits( + const Instruction *UserI, const Instruction *I, unsigned OperandNo, + const APInt &AOut, APInt &AB, APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2) { + unsigned BitWidth = AB.getBitWidth(); + + // We're called once per operand, but for some instructions, we need to + // compute known bits of both operands in order to determine the live bits of + // either (when both operands are instructions themselves). We don't, + // however, want to do this twice, so we cache the result in APInts that live + // in the caller. For the two-relevant-operands case, both operand values are + // provided here. 
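  // (For illustration, an assumed example rather than source text: for
  // %r = and i32 %x, %y, a demanded bit of %r keeps the corresponding bit
  // of %x alive only if that bit of %y is not known zero -- when it is
  // known zero, %y alone fixes the result bit. The And/Or cases below
  // consult KnownZero2/KnownOne2 for exactly this reason.)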
+ auto ComputeKnownBits = + [&](unsigned BitWidth, const Value *V1, const Value *V2) { + const DataLayout &DL = I->getModule()->getDataLayout(); + KnownZero = APInt(BitWidth, 0); + KnownOne = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, + AC, UserI, DT); + + if (V2) { + KnownZero2 = APInt(BitWidth, 0); + KnownOne2 = APInt(BitWidth, 0); + computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, + 0, AC, UserI, DT); + } + }; + + switch (UserI->getOpcode()) { + default: break; + case Instruction::Call: + case Instruction::Invoke: + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::bswap: + // The alive bits of the input are the swapped alive bits of + // the output. + AB = AOut.byteSwap(); + break; + case Intrinsic::ctlz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the left of, and including, the leftmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getHighBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countLeadingZeros()+1)); + } + break; + case Intrinsic::cttz: + if (OperandNo == 0) { + // We need some output bits, so we need all bits of the + // input to the right of, and including, the rightmost bit + // known to be one. + ComputeKnownBits(BitWidth, I, nullptr); + AB = APInt::getLowBitsSet(BitWidth, + std::min(BitWidth, KnownOne.countTrailingZeros()+1)); + } + break; + } + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // Find the highest live output bit. We don't need any more input + // bits than that (adds, and thus subtracts, ripple only to the + // left). + AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); + break; + case Instruction::Shl: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.lshr(ShiftAmt); + + // If the shift is nuw/nsw, then the high bits are not dead + // (because we've promised that they *must* be zero). + const ShlOperator *S = cast<ShlOperator>(UserI); + if (S->hasNoSignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); + else if (S->hasNoUnsignedWrap()) + AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::LShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). + if (cast<LShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::AShr: + if (OperandNo == 0) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(UserI->getOperand(1))) { + uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); + AB = AOut.shl(ShiftAmt); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) + .getBoolValue()) + AB.setBit(BitWidth-1); + + // If the shift is exact, then the low bits are not dead + // (they must be zero). 
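          // (Worked example, not from the source: for an i32 'ashr exact
          // %x, 8' with AOut = 0x0000FFFF, AB starts as AOut.shl(8) =
          // 0x00FFFF00; none of the demanded output bits lie in the top
          // eight bits, so the sign bit is not forced live, and the exact
          // flag handled below adds the low eight bits, giving
          // AB = 0x00FFFFFF.)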
+ if (cast<AShrOperator>(UserI)->isExact()) + AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::And: + AB = AOut; + + // For bits that are known zero, the corresponding bits in the + // other operand are dead (unless they're both zero, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownZero2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownZero & ~KnownZero2); + } + break; + case Instruction::Or: + AB = AOut; + + // For bits that are known one, the corresponding bits in the + // other operand are dead (unless they're both one, in which + // case they can't both be dead, so just mark the LHS bits as + // dead). + if (OperandNo == 0) { + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + AB &= ~KnownOne2; + } else { + if (!isa<Instruction>(UserI->getOperand(0))) + ComputeKnownBits(BitWidth, UserI->getOperand(0), I); + AB &= ~(KnownOne & ~KnownOne2); + } + break; + case Instruction::Xor: + case Instruction::PHI: + AB = AOut; + break; + case Instruction::Trunc: + AB = AOut.zext(BitWidth); + break; + case Instruction::ZExt: + AB = AOut.trunc(BitWidth); + break; + case Instruction::SExt: + AB = AOut.trunc(BitWidth); + // Because the high input bit is replicated into the + // high-order bits of the result, if we need any of those + // bits, then we must keep the highest input bit. + if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), + AOut.getBitWidth() - BitWidth)) + .getBoolValue()) + AB.setBit(BitWidth-1); + break; + case Instruction::Select: + if (OperandNo != 0) + AB = AOut; + break; + case Instruction::ICmp: + // Count the number of leading zeroes in each operand. + ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); + auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(), + KnownZero2.countLeadingOnes()); + AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes); + break; + } +} + +bool DemandedBits::runOnFunction(Function& Fn) { + F = &Fn; + Analyzed = false; + return false; +} + +void DemandedBits::performAnalysis() { + if (Analyzed) + // Analysis already completed for this function. + return; + Analyzed = true; + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + Visited.clear(); + AliveBits.clear(); + + SmallVector<Instruction*, 128> Worklist; + + // Collect the set of "root" instructions that are known live. + for (Instruction &I : instructions(*F)) { + if (!isAlwaysLive(&I)) + continue; + + DEBUG(dbgs() << "DemandedBits: Root: " << I << "\n"); + // For integer-valued instructions, set up an initial empty set of alive + // bits and add the instruction to the work list. For other instructions + // add their operands to the work list (for integer values operands, mark + // all bits as live). + if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { + if (!AliveBits.count(&I)) { + AliveBits[&I] = APInt(IT->getBitWidth(), 0); + Worklist.push_back(&I); + } + + continue; + } + + // Non-integer-typed instructions... + for (Use &OI : I.operands()) { + if (Instruction *J = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) + AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); + Worklist.push_back(J); + } + } + // To save memory, we don't add I to the Visited set here. 
Instead, we + check isAlwaysLive on every instruction when searching for dead + // instructions later (we need to check isAlwaysLive for the + // integer-typed instructions anyway). + } + + // Propagate liveness backwards to operands. + while (!Worklist.empty()) { + Instruction *UserI = Worklist.pop_back_val(); + + DEBUG(dbgs() << "DemandedBits: Visiting: " << *UserI); + APInt AOut; + if (UserI->getType()->isIntegerTy()) { + AOut = AliveBits[UserI]; + DEBUG(dbgs() << " Alive Out: " << AOut); + } + DEBUG(dbgs() << "\n"); + + if (!UserI->getType()->isIntegerTy()) + Visited.insert(UserI); + + APInt KnownZero, KnownOne, KnownZero2, KnownOne2; + // Compute the set of alive bits for each operand. These are anded into the + // existing set, if any, and if that changes the set of alive bits, the + // operand is added to the work-list. + for (Use &OI : UserI->operands()) { + if (Instruction *I = dyn_cast<Instruction>(OI)) { + if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { + unsigned BitWidth = IT->getBitWidth(); + APInt AB = APInt::getAllOnesValue(BitWidth); + if (UserI->getType()->isIntegerTy() && !AOut && + !isAlwaysLive(UserI)) { + // If all bits of the output are dead, then all bits of the input + // are also dead. + AB = APInt(BitWidth, 0); + } else { + // Bits of each operand that are used to compute alive bits of the + // output are alive, all others are dead. + determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, + KnownZero, KnownOne, + KnownZero2, KnownOne2); + } + + // If we've added to the set of alive bits (or the operand has not + // been previously visited), then re-queue the operand to be visited + // again. + APInt ABPrev(BitWidth, 0); + auto ABI = AliveBits.find(I); + if (ABI != AliveBits.end()) + ABPrev = ABI->second; + + APInt ABNew = AB | ABPrev; + if (ABNew != ABPrev || ABI == AliveBits.end()) { + AliveBits[I] = std::move(ABNew); + Worklist.push_back(I); + } + } else if (!Visited.count(I)) { + Worklist.push_back(I); + } + } + } + } +} + +APInt DemandedBits::getDemandedBits(Instruction *I) { + performAnalysis(); + + const DataLayout &DL = I->getParent()->getModule()->getDataLayout(); + if (AliveBits.count(I)) + return AliveBits[I]; + return APInt::getAllOnesValue(DL.getTypeSizeInBits(I->getType())); +} + +bool DemandedBits::isInstructionDead(Instruction *I) { + performAnalysis(); + + return !Visited.count(I) && AliveBits.find(I) == AliveBits.end() && + !isAlwaysLive(I); +} + +void DemandedBits::print(raw_ostream &OS, const Module *M) const { + // This is gross. But the alternative is making all the state mutable + // just because of this one debugging method.
+ const_cast<DemandedBits*>(this)->performAnalysis(); + for (auto &KV : AliveBits) { + OS << "DemandedBits: 0x" << utohexstr(KV.second.getLimitedValue()) << " for " + << *KV.first << "\n"; + } +} + +FunctionPass *llvm::createDemandedBitsPass() { + return new DemandedBits(); +} diff --git a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp index 4826ac4..4040ad3 100644 --- a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -117,8 +117,8 @@ Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore, INITIALIZE_PASS_BEGIN(DependenceAnalysis, "da", "Dependence Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(DependenceAnalysis, "da", "Dependence Analysis", true, true) @@ -132,8 +132,8 @@ FunctionPass *llvm::createDependenceAnalysisPass() { bool DependenceAnalysis::runOnFunction(Function &F) { this->F = &F; - AA = &getAnalysis<AliasAnalysis>(); - SE = &getAnalysis<ScalarEvolution>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); return false; } @@ -145,8 +145,8 @@ void DependenceAnalysis::releaseMemory() { void DependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequiredTransitive<AliasAnalysis>(); - AU.addRequiredTransitive<ScalarEvolution>(); + AU.addRequiredTransitive<AAResultsWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); AU.addRequiredTransitive<LoopInfoWrapperPass>(); } @@ -233,7 +233,8 @@ FullDependence::FullDependence(Instruction *Source, Instruction *Destination, : Dependence(Source, Destination), Levels(CommonLevels), LoopIndependent(PossiblyLoopIndependent) { Consistent = true; - DV = CommonLevels ? new DVEntry[CommonLevels] : nullptr; + if (CommonLevels) + DV = make_unique<DVEntry[]>(CommonLevels); } // The rest are simple getters that hide the implementation. 
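// A minimal sketch (illustrative names, not from this tree) of the ownership
// pattern the FullDependence constructor above switches to: the per-level DV
// array held in a unique_ptr<T[]> rather than a raw new[] pointer, so moves
// transfer the array and no manual delete[] is needed; the tree's
// make_unique<DVEntry[]>(N) plays the role C++14 gives std::make_unique.

#include <memory>

struct DVEntry { unsigned char Direction; }; // stand-in for the real entry

struct Deps {
  std::unique_ptr<DVEntry[]> DV; // null when there are no common levels
  explicit Deps(unsigned Levels) {
    if (Levels)
      DV = std::make_unique<DVEntry[]>(Levels); // value-initialized entries
  }
  // Deps is movable by default, so returning it by value hands off the
  // array without the old "copy, then null out the raw pointer" dance.
};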
@@ -371,7 +372,7 @@ void DependenceAnalysis::Constraint::setLine(const SCEV *AA, void DependenceAnalysis::Constraint::setDistance(const SCEV *D, const Loop *CurLoop) { Kind = Distance; - A = SE->getConstant(D->getType(), 1); + A = SE->getOne(D->getType()); B = SE->getNegativeSCEV(A); C = SE->getNegativeSCEV(D); AssociatedLoop = CurLoop; @@ -500,10 +501,10 @@ bool DependenceAnalysis::intersectConstraints(Constraint *X, if (!C1B2_C2B1 || !C1A2_C2A1 || !A1B2_A2B1 || !A2B1_A1B2) return false; - APInt Xtop = C1B2_C2B1->getValue()->getValue(); - APInt Xbot = A1B2_A2B1->getValue()->getValue(); - APInt Ytop = C1A2_C2A1->getValue()->getValue(); - APInt Ybot = A2B1_A1B2->getValue()->getValue(); + APInt Xtop = C1B2_C2B1->getAPInt(); + APInt Xbot = A1B2_A2B1->getAPInt(); + APInt Ytop = C1A2_C2A1->getAPInt(); + APInt Ybot = A2B1_A1B2->getAPInt(); DEBUG(dbgs() << "\t\tXtop = " << Xtop << "\n"); DEBUG(dbgs() << "\t\tXbot = " << Xbot << "\n"); DEBUG(dbgs() << "\t\tYtop = " << Ytop << "\n"); @@ -527,7 +528,7 @@ bool DependenceAnalysis::intersectConstraints(Constraint *X, } if (const SCEVConstant *CUB = collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) { - APInt UpperBound = CUB->getValue()->getValue(); + APInt UpperBound = CUB->getAPInt(); DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n"); if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) { X->setEmpty(); @@ -630,8 +631,8 @@ static AliasResult underlyingObjectsAlias(AliasAnalysis *AA, const Value *B) { const Value *AObj = GetUnderlyingObject(A, DL); const Value *BObj = GetUnderlyingObject(B, DL); - return AA->alias(AObj, AA->getTypeStoreSize(AObj->getType()), - BObj, AA->getTypeStoreSize(BObj->getType())); + return AA->alias(AObj, DL.getTypeStoreSize(AObj->getType()), + BObj, DL.getTypeStoreSize(BObj->getType())); } @@ -1114,8 +1115,8 @@ bool DependenceAnalysis::strongSIVtest(const SCEV *Coeff, // Can we compute distance? 
if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) { - APInt ConstDelta = cast<SCEVConstant>(Delta)->getValue()->getValue(); - APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getValue()->getValue(); + APInt ConstDelta = cast<SCEVConstant>(Delta)->getAPInt(); + APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getAPInt(); APInt Distance = ConstDelta; // these need to be initialized APInt Remainder = ConstDelta; APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder); @@ -1256,11 +1257,9 @@ bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff, assert(SE->isKnownPositive(ConstCoeff) && "ConstCoeff should be positive"); // compute SplitIter for use by DependenceAnalysis::getSplitIteration() - SplitIter = - SE->getUDivExpr(SE->getSMaxExpr(SE->getConstant(Delta->getType(), 0), - Delta), - SE->getMulExpr(SE->getConstant(Delta->getType(), 2), - ConstCoeff)); + SplitIter = SE->getUDivExpr( + SE->getSMaxExpr(SE->getZero(Delta->getType()), Delta), + SE->getMulExpr(SE->getConstant(Delta->getType(), 2), ConstCoeff)); DEBUG(dbgs() << "\t Split iter = " << *SplitIter << "\n"); const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta); @@ -1302,14 +1301,14 @@ bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff, return true; } Result.DV[Level].Splitable = false; - Result.DV[Level].Distance = SE->getConstant(Delta->getType(), 0); + Result.DV[Level].Distance = SE->getZero(Delta->getType()); return false; } } // check that Coeff divides Delta - APInt APDelta = ConstDelta->getValue()->getValue(); - APInt APCoeff = ConstCoeff->getValue()->getValue(); + APInt APDelta = ConstDelta->getAPInt(); + APInt APCoeff = ConstCoeff->getAPInt(); APInt Distance = APDelta; // these need to be initialized APInt Remainder = APDelta; APInt::sdivrem(APDelta, APCoeff, Distance, Remainder); @@ -1463,10 +1462,10 @@ bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff, // find gcd APInt G, X, Y; - APInt AM = ConstSrcCoeff->getValue()->getValue(); - APInt BM = ConstDstCoeff->getValue()->getValue(); + APInt AM = ConstSrcCoeff->getAPInt(); + APInt BM = ConstDstCoeff->getAPInt(); unsigned Bits = AM.getBitWidth(); - if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) { + if (findGCD(Bits, AM, BM, ConstDelta->getAPInt(), G, X, Y)) { // gcd doesn't divide Delta, no dependence ++ExactSIVindependence; ++ExactSIVsuccesses; @@ -1481,7 +1480,7 @@ bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff, // UM is perhaps unavailable, let's check if (const SCEVConstant *CUB = collectConstantUpperBound(CurLoop, Delta->getType())) { - UM = CUB->getValue()->getValue(); + UM = CUB->getAPInt(); DEBUG(dbgs() << "\t UM = " << UM << "\n"); UMvalid = true; } @@ -1609,8 +1608,8 @@ bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff, static bool isRemainderZero(const SCEVConstant *Dividend, const SCEVConstant *Divisor) { - APInt ConstDividend = Dividend->getValue()->getValue(); - APInt ConstDivisor = Divisor->getValue()->getValue(); + APInt ConstDividend = Dividend->getAPInt(); + APInt ConstDivisor = Divisor->getAPInt(); return ConstDividend.srem(ConstDivisor) == 0; } @@ -1665,8 +1664,8 @@ bool DependenceAnalysis::weakZeroSrcSIVtest(const SCEV *DstCoeff, Level--; Result.Consistent = false; const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst); - NewConstraint.setLine(SE->getConstant(Delta->getType(), 0), - DstCoeff, Delta, CurLoop); + NewConstraint.setLine(SE->getZero(Delta->getType()), DstCoeff, Delta, + CurLoop); DEBUG(dbgs() << "\t Delta = " << *Delta << "\n"); if
(isKnownPredicate(CmpInst::ICMP_EQ, SrcConst, DstConst)) { if (Level < CommonLevels) { @@ -1775,8 +1774,8 @@ bool DependenceAnalysis::weakZeroDstSIVtest(const SCEV *SrcCoeff, Level--; Result.Consistent = false; const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst); - NewConstraint.setLine(SrcCoeff, SE->getConstant(Delta->getType(), 0), - Delta, CurLoop); + NewConstraint.setLine(SrcCoeff, SE->getZero(Delta->getType()), Delta, + CurLoop); DEBUG(dbgs() << "\t Delta = " << *Delta << "\n"); if (isKnownPredicate(CmpInst::ICMP_EQ, DstConst, SrcConst)) { if (Level < CommonLevels) { @@ -1867,10 +1866,10 @@ bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff, // find gcd APInt G, X, Y; - APInt AM = ConstSrcCoeff->getValue()->getValue(); - APInt BM = ConstDstCoeff->getValue()->getValue(); + APInt AM = ConstSrcCoeff->getAPInt(); + APInt BM = ConstDstCoeff->getAPInt(); unsigned Bits = AM.getBitWidth(); - if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) { + if (findGCD(Bits, AM, BM, ConstDelta->getAPInt(), G, X, Y)) { // gcd doesn't divide Delta, no dependence ++ExactRDIVindependence; return true; @@ -1884,7 +1883,7 @@ bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff, // SrcUM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(SrcLoop, Delta->getType())) { - SrcUM = UpperBound->getValue()->getValue(); + SrcUM = UpperBound->getAPInt(); DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n"); SrcUMvalid = true; } @@ -1894,7 +1893,7 @@ bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff, // UM is perhaps unavailable, let's check if (const SCEVConstant *UpperBound = collectConstantUpperBound(DstLoop, Delta->getType())) { - DstUM = UpperBound->getValue()->getValue(); + DstUM = UpperBound->getAPInt(); DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n"); DstUMvalid = true; } @@ -2307,7 +2306,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); if (!Constant) return false; - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); Coefficients = AddRec->getStart(); } @@ -2328,7 +2327,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); if (!Constant) return false; - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); Coefficients = AddRec->getStart(); } @@ -2352,7 +2351,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, const SCEVConstant *ConstOp = getConstantPart(Product); if (!ConstOp) return false; - APInt ConstOpValue = ConstOp->getValue()->getValue(); + APInt ConstOpValue = ConstOp->getAPInt(); ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD, ConstOpValue.abs()); } @@ -2362,7 +2361,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, } if (!Constant) return false; - APInt ConstDelta = cast<SCEVConstant>(Constant)->getValue()->getValue(); + APInt ConstDelta = cast<SCEVConstant>(Constant)->getAPInt(); DEBUG(dbgs() << " ConstDelta = " << ConstDelta << "\n"); if (ConstDelta == 0) return false; @@ -2410,7 +2409,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); else Constant = cast<SCEVConstant>(Coeff); - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = 
APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); } Inner = AddRec->getStart(); @@ -2428,7 +2427,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, Constant = getConstantPart(Product); else Constant = cast<SCEVConstant>(Coeff); - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); } Inner = AddRec->getStart(); @@ -2445,7 +2444,7 @@ bool DependenceAnalysis::gcdMIVtest(const SCEV *Src, // or constant, in which case we give up on this direction. continue; } - APInt ConstCoeff = Constant->getValue()->getValue(); + APInt ConstCoeff = Constant->getAPInt(); RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs()); DEBUG(dbgs() << "\tRunningGCD = " << RunningGCD << "\n"); if (RunningGCD != 0) { @@ -2728,10 +2727,10 @@ void DependenceAnalysis::findBoundsALL(CoefficientInfo *A, // If the difference is 0, we won't need to know the number of iterations. if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart)) Bound[K].Lower[Dependence::DVEntry::ALL] = - SE->getConstant(A[K].Coeff->getType(), 0); + SE->getZero(A[K].Coeff->getType()); if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].PosPart, B[K].NegPart)) Bound[K].Upper[Dependence::DVEntry::ALL] = - SE->getConstant(A[K].Coeff->getType(), 0); + SE->getZero(A[K].Coeff->getType()); } } @@ -2800,9 +2799,8 @@ void DependenceAnalysis::findBoundsLT(CoefficientInfo *A, Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity. Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity. if (Bound[K].Iterations) { - const SCEV *Iter_1 = - SE->getMinusSCEV(Bound[K].Iterations, - SE->getConstant(Bound[K].Iterations->getType(), 1)); + const SCEV *Iter_1 = SE->getMinusSCEV( + Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff)); Bound[K].Lower[Dependence::DVEntry::LT] = @@ -2847,9 +2845,8 @@ void DependenceAnalysis::findBoundsGT(CoefficientInfo *A, Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity. Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity. 
if (Bound[K].Iterations) { - const SCEV *Iter_1 = - SE->getMinusSCEV(Bound[K].Iterations, - SE->getConstant(Bound[K].Iterations->getType(), 1)); + const SCEV *Iter_1 = SE->getMinusSCEV( + Bound[K].Iterations, SE->getOne(Bound[K].Iterations->getType())); const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart)); Bound[K].Lower[Dependence::DVEntry::GT] = @@ -2874,13 +2871,13 @@ void DependenceAnalysis::findBoundsGT(CoefficientInfo *A, // X^+ = max(X, 0) const SCEV *DependenceAnalysis::getPositivePart(const SCEV *X) const { - return SE->getSMaxExpr(X, SE->getConstant(X->getType(), 0)); + return SE->getSMaxExpr(X, SE->getZero(X->getType())); } // X^- = min(X, 0) const SCEV *DependenceAnalysis::getNegativePart(const SCEV *X) const { - return SE->getSMinExpr(X, SE->getConstant(X->getType(), 0)); + return SE->getSMinExpr(X, SE->getZero(X->getType())); } @@ -2891,7 +2888,7 @@ DependenceAnalysis::CoefficientInfo * DependenceAnalysis::collectCoeffInfo(const SCEV *Subscript, bool SrcFlag, const SCEV *&Constant) const { - const SCEV *Zero = SE->getConstant(Subscript->getType(), 0); + const SCEV *Zero = SE->getZero(Subscript->getType()); CoefficientInfo *CI = new CoefficientInfo[MaxLevels + 1]; for (unsigned K = 1; K <= MaxLevels; ++K) { CI[K].Coeff = Zero; @@ -2975,7 +2972,7 @@ const SCEV *DependenceAnalysis::findCoefficient(const SCEV *Expr, const Loop *TargetLoop) const { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr); if (!AddRec) - return SE->getConstant(Expr->getType(), 0); + return SE->getZero(Expr->getType()); if (AddRec->getLoop() == TargetLoop) return AddRec->getStepRecurrence(*SE); return findCoefficient(AddRec->getStart(), TargetLoop); @@ -3110,8 +3107,8 @@ bool DependenceAnalysis::propagateLine(const SCEV *&Src, const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); if (!Bconst || !Cconst) return false; - APInt Beta = Bconst->getValue()->getValue(); - APInt Charlie = Cconst->getValue()->getValue(); + APInt Beta = Bconst->getAPInt(); + APInt Charlie = Cconst->getAPInt(); APInt CdivB = Charlie.sdiv(Beta); assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B"); const SCEV *AP_K = findCoefficient(Dst, CurLoop); @@ -3125,8 +3122,8 @@ bool DependenceAnalysis::propagateLine(const SCEV *&Src, const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); if (!Aconst || !Cconst) return false; - APInt Alpha = Aconst->getValue()->getValue(); - APInt Charlie = Cconst->getValue()->getValue(); + APInt Alpha = Aconst->getAPInt(); + APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A"); const SCEV *A_K = findCoefficient(Src, CurLoop); @@ -3139,8 +3136,8 @@ bool DependenceAnalysis::propagateLine(const SCEV *&Src, const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A); const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C); if (!Aconst || !Cconst) return false; - APInt Alpha = Aconst->getValue()->getValue(); - APInt Charlie = Cconst->getValue()->getValue(); + APInt Alpha = Aconst->getAPInt(); + APInt Charlie = Cconst->getAPInt(); APInt CdivA = Charlie.sdiv(Alpha); assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A"); const SCEV *A_K = findCoefficient(Src, CurLoop); @@ -3244,20 +3241,36 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level, /// source and destination array references are recurrences on a nested loop, /// this 
function flattens the nested recurrences into separate recurrences /// for each loop level. -bool DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, - const SCEV *DstSCEV, - SmallVectorImpl<Subscript> &Pair, - const SCEV *ElementSize) { +bool DependenceAnalysis::tryDelinearize(Instruction *Src, + Instruction *Dst, + SmallVectorImpl<Subscript> &Pair) +{ + Value *SrcPtr = getPointerOperand(Src); + Value *DstPtr = getPointerOperand(Dst); + + Loop *SrcLoop = LI->getLoopFor(Src->getParent()); + Loop *DstLoop = LI->getLoopFor(Dst->getParent()); + + // The code below mimics the code in Delinearization.cpp + const SCEV *SrcAccessFn = + SE->getSCEVAtScope(SrcPtr, SrcLoop); + const SCEV *DstAccessFn = + SE->getSCEVAtScope(DstPtr, DstLoop); + const SCEVUnknown *SrcBase = - dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcSCEV)); + dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcAccessFn)); const SCEVUnknown *DstBase = - dyn_cast<SCEVUnknown>(SE->getPointerBase(DstSCEV)); + dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn)); if (!SrcBase || !DstBase || SrcBase != DstBase) return false; - SrcSCEV = SE->getMinusSCEV(SrcSCEV, SrcBase); - DstSCEV = SE->getMinusSCEV(DstSCEV, DstBase); + const SCEV *ElementSize = SE->getElementSize(Src); + if (ElementSize != SE->getElementSize(Dst)) + return false; + + const SCEV *SrcSCEV = SE->getMinusSCEV(SrcAccessFn, SrcBase); + const SCEV *DstSCEV = SE->getMinusSCEV(DstAccessFn, DstBase); const SCEVAddRecExpr *SrcAR = dyn_cast<SCEVAddRecExpr>(SrcSCEV); const SCEVAddRecExpr *DstAR = dyn_cast<SCEVAddRecExpr>(DstSCEV); @@ -3330,7 +3343,6 @@ static void dumpSmallBitVector(SmallBitVector &BV) { } #endif - // depends - // Returns NULL if there is no dependence. // Otherwise, return a Dependence with as many details as possible. @@ -3425,10 +3437,11 @@ DependenceAnalysis::depends(Instruction *Src, Instruction *Dst, Pair[0].Dst = DstSCEV; } - if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { - DEBUG(dbgs() << " delinearized GEP\n"); - Pairs = Pair.size(); + if (Delinearize && CommonLevels > 1) { + if (tryDelinearize(Src, Dst, Pair)) { + DEBUG(dbgs() << " delinearized GEP\n"); + Pairs = Pair.size(); + } } for (unsigned P = 0; P < Pairs; ++P) { @@ -3746,9 +3759,7 @@ DependenceAnalysis::depends(Instruction *Src, Instruction *Dst, return nullptr; } - auto Final = make_unique<FullDependence>(Result); - Result.DV = nullptr; - return std::move(Final); + return make_unique<FullDependence>(std::move(Result)); } @@ -3852,10 +3863,11 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence &Dep, Pair[0].Dst = DstSCEV; } - if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { - DEBUG(dbgs() << " delinearized GEP\n"); - Pairs = Pair.size(); + if (Delinearize && CommonLevels > 1) { + if (tryDelinearize(Src, Dst, Pair)) { + DEBUG(dbgs() << " delinearized GEP\n"); + Pairs = Pair.size(); + } } for (unsigned P = 0; P < Pairs; ++P) { diff --git a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp index e5ee295..5ae6d74 100644 --- a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp @@ -1,4 +1,4 @@ -//===- DivergenceAnalysis.cpp ------ Divergence Analysis ------------------===// +//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// // // The LLVM Compiler Infrastructure // @@ -7,8 +7,8 @@ //
//===----------------------------------------------------------------------===// // -// This file defines divergence analysis which determines whether a branch in a -// GPU program is divergent. It can help branch optimizations such as jump +// This file implements divergence analysis which determines whether a branch +// in a GPU program is divergent. It can help branch optimizations such as jump // threading and loop unswitching to make better decisions. // // GPU programs typically use the SIMD execution model, where multiple threads @@ -61,75 +61,31 @@ // 2. memory as black box. It conservatively considers values loaded from // generic or local address as divergent. This can be improved by leveraging // pointer analysis. +// //===----------------------------------------------------------------------===// -#include <vector> -#include "llvm/IR/Dominators.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" -#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include <vector> using namespace llvm; -#define DEBUG_TYPE "divergence" - -namespace { -class DivergenceAnalysis : public FunctionPass { -public: - static char ID; - - DivergenceAnalysis() : FunctionPass(ID) { - initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTree>(); - AU.setPreservesAll(); - } - - bool runOnFunction(Function &F) override; - - // Print all divergent branches in the function. - void print(raw_ostream &OS, const Module *) const override; - - // Returns true if V is divergent. - bool isDivergent(const Value *V) const { return DivergentValues.count(V); } - // Returns true if V is uniform/non-divergent. - bool isUniform(const Value *V) const { return !isDivergent(V); } - -private: - // Stores all divergent values. - DenseSet<const Value *> DivergentValues; -}; -} // End of anonymous namespace - -// Register this pass. -char DivergenceAnalysis::ID = 0; -INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", - false, true) - namespace { class DivergencePropagator { public: - DivergencePropagator(Function &F, TargetTransformInfo &TTI, - DominatorTree &DT, PostDominatorTree &PDT, - DenseSet<const Value *> &DV) + DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, + PostDominatorTree &PDT, DenseSet<const Value *> &DV) : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {} void populateWithSourcesOfDivergence(); void propagate(); @@ -140,7 +96,7 @@ private: // A helper function that explores sync dependents of TI. void exploreSyncDependency(TerminatorInst *TI); // Computes the influence region from Start to End. This region includes all - // basic blocks on any path from Start to End. + // basic blocks on any simple path from Start to End.
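  // (For illustration, an assumed example: if a divergent branch at Start
  // leads to a diamond Start -> {T, F} -> End, with End the immediate
  // post-dominator of Start, the influence region is {T, F}; values defined
  // in T or F but used in or after End, and the PHIs in End, become
  // sync-dependent on the branch.)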
void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End, DenseSet<BasicBlock *> &InfluenceRegion); // Finds all users of I that are outside the influence region, and add these @@ -153,13 +109,13 @@ private: DominatorTree &DT; PostDominatorTree &PDT; std::vector<Value *> Worklist; // Stack for DFS. - DenseSet<const Value *> &DV; // Stores all divergent values. + DenseSet<const Value *> &DV; // Stores all divergent values. }; void DivergencePropagator::populateWithSourcesOfDivergence() { Worklist.clear(); DV.clear(); - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { if (TTI.isSourceOfDivergence(&I)) { Worklist.push_back(&I); DV.insert(&I); @@ -191,8 +147,8 @@ void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) { for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) { // A PHINode is uniform if it returns the same value no matter which path is // taken. - if (!cast<PHINode>(I)->hasConstantValue() && DV.insert(I).second) - Worklist.push_back(I); + if (!cast<PHINode>(I)->hasConstantValue() && DV.insert(&*I).second) + Worklist.push_back(&*I); } // Propagation rule 2: if a value defined in a loop is used outside, the user @@ -242,21 +198,33 @@ void DivergencePropagator::findUsersOutsideInfluenceRegion( } } +// A helper function for computeInfluenceRegion that adds successors of "ThisBB" +// to the influence region. +static void +addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End, + DenseSet<BasicBlock *> &InfluenceRegion, + std::vector<BasicBlock *> &InfluenceStack) { + for (BasicBlock *Succ : successors(ThisBB)) { + if (Succ != End && InfluenceRegion.insert(Succ).second) + InfluenceStack.push_back(Succ); + } +} + void DivergencePropagator::computeInfluenceRegion( BasicBlock *Start, BasicBlock *End, DenseSet<BasicBlock *> &InfluenceRegion) { assert(PDT.properlyDominates(End, Start) && "End does not properly dominate Start"); + + // The influence region starts from the end of "Start" to the beginning of + // "End". Therefore, "Start" should not be in the region unless "Start" is in + // a loop that doesn't contain "End". std::vector<BasicBlock *> InfluenceStack; - InfluenceStack.push_back(Start); - InfluenceRegion.insert(Start); + addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack); while (!InfluenceStack.empty()) { BasicBlock *BB = InfluenceStack.back(); InfluenceStack.pop_back(); - for (BasicBlock *Succ : successors(BB)) { - if (End != Succ && InfluenceRegion.insert(Succ).second) - InfluenceStack.push_back(Succ); - } + addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack); } } @@ -286,10 +254,25 @@ void DivergencePropagator::propagate() { } /// end namespace anonymous +// Register this pass. 
+char DivergenceAnalysis::ID = 0; +INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", + false, true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) +INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", + false, true) + FunctionPass *llvm::createDivergenceAnalysisPass() { return new DivergenceAnalysis(); } +void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTree>(); + AU.setPreservesAll(); +} + bool DivergenceAnalysis::runOnFunction(Function &F) { auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); if (TTIWP == nullptr) @@ -329,8 +312,8 @@ void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { if (DivergentValues.count(&Arg)) OS << "DIVERGENT: " << Arg << "\n"; } - // Iterate instructions using inst_range to ensure a deterministic order. - for (auto &I : inst_range(F)) { + // Iterate instructions using instructions() to ensure a deterministic order. + for (auto &I : instructions(F)) { if (DivergentValues.count(&I)) OS << "DIVERGENT:" << I << "\n"; } diff --git a/contrib/llvm/lib/Analysis/EHPersonalities.cpp b/contrib/llvm/lib/Analysis/EHPersonalities.cpp new file mode 100644 index 0000000..01be8b3 --- /dev/null +++ b/contrib/llvm/lib/Analysis/EHPersonalities.cpp @@ -0,0 +1,106 @@ +//===- EHPersonalities.cpp - Compute EH-related information ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +/// See if the given exception handling personality function is one that we +/// understand. If so, return a description of it; otherwise return Unknown. +EHPersonality llvm::classifyEHPersonality(const Value *Pers) { + const Function *F = + Pers ? dyn_cast<Function>(Pers->stripPointerCasts()) : nullptr; + if (!F) + return EHPersonality::Unknown; + return StringSwitch<EHPersonality>(F->getName()) + .Case("__gnat_eh_personality", EHPersonality::GNU_Ada) + .Case("__gxx_personality_v0", EHPersonality::GNU_CXX) + .Case("__gcc_personality_v0", EHPersonality::GNU_C) + .Case("__objc_personality_v0", EHPersonality::GNU_ObjC) + .Case("_except_handler3", EHPersonality::MSVC_X86SEH) + .Case("_except_handler4", EHPersonality::MSVC_X86SEH) + .Case("__C_specific_handler", EHPersonality::MSVC_Win64SEH) + .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX) + .Case("ProcessCLRException", EHPersonality::CoreCLR) + .Default(EHPersonality::Unknown); +} + +bool llvm::canSimplifyInvokeNoUnwind(const Function *F) { + EHPersonality Personality = classifyEHPersonality(F->getPersonalityFn()); + // We can't simplify any invokes to nounwind functions if the personality + // function wants to catch asynch exceptions. The nounwind attribute only + // implies that the function does not throw synchronous exceptions. 
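  // (Illustrative example: classifyEHPersonality maps "__C_specific_handler"
  // to MSVC_Win64SEH, which isAsynchronousEHPersonality treats as
  // asynchronous, so such invokes are left alone; under
  // "__gxx_personality_v0" (GNU_CXX, synchronous) they may be simplified.)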
+ return !isAsynchronousEHPersonality(Personality); +} + +DenseMap<BasicBlock *, ColorVector> llvm::colorEHFunclets(Function &F) { + SmallVector<std::pair<BasicBlock *, BasicBlock *>, 16> Worklist; + BasicBlock *EntryBlock = &F.getEntryBlock(); + DenseMap<BasicBlock *, ColorVector> BlockColors; + + // Build up the color map, which maps each block to its set of 'colors'. + // For any block B the "colors" of B are the set of funclets F (possibly + // including a root "funclet" representing the main function) such that + // F will need to directly contain B or a copy of B (where the term "directly + // contain" is used to distinguish from being "transitively contained" in + // a nested funclet). + // + // Note: Despite not being a funclet in the truest sense, a catchswitch is + // considered to belong to its own funclet for the purposes of coloring. + + DEBUG_WITH_TYPE("winehprepare-coloring", dbgs() << "\nColoring funclets for " + << F.getName() << "\n"); + + Worklist.push_back({EntryBlock, EntryBlock}); + + while (!Worklist.empty()) { + BasicBlock *Visiting; + BasicBlock *Color; + std::tie(Visiting, Color) = Worklist.pop_back_val(); + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << "Visiting " << Visiting->getName() << ", " + << Color->getName() << "\n"); + Instruction *VisitingHead = Visiting->getFirstNonPHI(); + if (VisitingHead->isEHPad()) { + // Mark this funclet head as a member of itself. + Color = Visiting; + } + // Note that this is a member of the given color. + ColorVector &Colors = BlockColors[Visiting]; + if (std::find(Colors.begin(), Colors.end(), Color) == Colors.end()) + Colors.push_back(Color); + else + continue; + + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << " Assigned color \'" << Color->getName() + << "\' to block \'" << Visiting->getName() + << "\'.\n"); + + BasicBlock *SuccColor = Color; + TerminatorInst *Terminator = Visiting->getTerminator(); + if (auto *CatchRet = dyn_cast<CatchReturnInst>(Terminator)) { + Value *ParentPad = CatchRet->getParentPad(); + if (isa<ConstantTokenNone>(ParentPad)) + SuccColor = EntryBlock; + else + SuccColor = cast<Instruction>(ParentPad)->getParent(); + } + + for (BasicBlock *Succ : successors(Visiting)) + Worklist.push_back({Succ, SuccColor}); + } + return BlockColors; +} diff --git a/contrib/llvm/lib/Analysis/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp new file mode 100644 index 0000000..1babb82 --- /dev/null +++ b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp @@ -0,0 +1,972 @@ +//===- GlobalsModRef.cpp - Simple Mod/Ref Analysis for Globals ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This simple pass provides alias and mod/ref information for global values +// that do not have their address taken, and keeps track of whether functions +// read or write memory (are "pure"). For this simple (but very common) case, +// we can provide pretty accurate and useful information. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "globalsmodref-aa"
+
+STATISTIC(NumNonAddrTakenGlobalVars,
+          "Number of global vars without address taken");
+STATISTIC(NumNonAddrTakenFunctions, "Number of functions without address taken");
+STATISTIC(NumNoMemFunctions, "Number of functions that do not access memory");
+STATISTIC(NumReadMemFunctions, "Number of functions that only read memory");
+STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects");
+
+// An option to enable unsafe alias results from the GlobalsModRef analysis.
+// When enabled, GlobalsModRef will provide no-alias results which in extremely
+// rare cases may not be conservatively correct. In particular, in the face of
+// transforms which cause asymmetry between how effective GetUnderlyingObject
+// is for two pointers, it may produce incorrect results.
+//
+// These unsafe results have been returned by GMR for many years without
+// causing significant issues in the wild and so we provide a mechanism to
+// re-enable them for users of LLVM that have a particular performance
+// sensitivity and no known issues. The option also makes it easy to evaluate
+// the performance impact of these results.
+static cl::opt<bool> EnableUnsafeGlobalsModRefAliasResults(
+    "enable-unsafe-globalsmodref-alias-results", cl::init(false), cl::Hidden);
+
+/// The mod/ref information collected for a particular function.
+///
+/// We collect information about mod/ref behavior of a function here, both in
+/// general and as pertains to specific globals. We only have this detailed
+/// information when we know *something* useful about the behavior. If we
+/// saturate to fully general mod/ref, we remove the info for the function.
+class GlobalsAAResult::FunctionInfo {
+  typedef SmallDenseMap<const GlobalValue *, ModRefInfo, 16> GlobalInfoMapType;
+
+  /// Build a wrapper struct that has 8-byte alignment. All heap allocations
+  /// should provide this much alignment at least, but this makes it clear we
+  /// specifically rely on this amount of alignment.
+  struct LLVM_ALIGNAS(8) AlignedMap {
+    AlignedMap() {}
+    AlignedMap(const AlignedMap &Arg) : Map(Arg.Map) {}
+    GlobalInfoMapType Map;
+  };
+
+  /// Pointer traits for our aligned map.
+  struct AlignedMapPointerTraits {
+    static inline void *getAsVoidPointer(AlignedMap *P) { return P; }
+    static inline AlignedMap *getFromVoidPointer(void *P) {
+      return (AlignedMap *)P;
+    }
+    enum { NumLowBitsAvailable = 3 };
+    static_assert(AlignOf<AlignedMap>::Alignment >= (1 << NumLowBitsAvailable),
+                  "AlignedMap insufficiently aligned to have enough low bits.");
+  };
+
+  /// The bit that flags that this function may read any global. This is
+  /// chosen to mix together with ModRefInfo bits.
+  enum { MayReadAnyGlobal = 4 };
+
+  /// Checks to document the invariants of the bit packing here.
+  static_assert((MayReadAnyGlobal & MRI_ModRef) == 0,
+                "ModRef and the MayReadAnyGlobal flag bits overlap.");
+  static_assert(((MayReadAnyGlobal | MRI_ModRef) >>
+                 AlignedMapPointerTraits::NumLowBitsAvailable) == 0,
+                "Insufficient low bits to store our flag and ModRef info.");
+
+public:
+  FunctionInfo() : Info() {}
+  ~FunctionInfo() {
+    delete Info.getPointer();
+  }
+  // Spell out the copy and move constructors and assignment operators to get
+  // deep copy semantics and correct move semantics in the face of the
+  // pointer-int pair.
+  FunctionInfo(const FunctionInfo &Arg)
+      : Info(nullptr, Arg.Info.getInt()) {
+    if (const auto *ArgPtr = Arg.Info.getPointer())
+      Info.setPointer(new AlignedMap(*ArgPtr));
+  }
+  FunctionInfo(FunctionInfo &&Arg)
+      : Info(Arg.Info.getPointer(), Arg.Info.getInt()) {
+    Arg.Info.setPointerAndInt(nullptr, 0);
+  }
+  FunctionInfo &operator=(const FunctionInfo &RHS) {
+    delete Info.getPointer();
+    Info.setPointerAndInt(nullptr, RHS.Info.getInt());
+    if (const auto *RHSPtr = RHS.Info.getPointer())
+      Info.setPointer(new AlignedMap(*RHSPtr));
+    return *this;
+  }
+  FunctionInfo &operator=(FunctionInfo &&RHS) {
+    delete Info.getPointer();
+    Info.setPointerAndInt(RHS.Info.getPointer(), RHS.Info.getInt());
+    RHS.Info.setPointerAndInt(nullptr, 0);
+    return *this;
+  }
+
+  /// Returns the \c ModRefInfo info for this function.
+  ModRefInfo getModRefInfo() const {
+    return ModRefInfo(Info.getInt() & MRI_ModRef);
+  }
+
+  /// Adds new \c ModRefInfo for this function to its state.
+  void addModRefInfo(ModRefInfo NewMRI) {
+    Info.setInt(Info.getInt() | NewMRI);
+  }
+
+  /// Returns whether this function may read any global variable, and we don't
+  /// know which global.
+  bool mayReadAnyGlobal() const { return Info.getInt() & MayReadAnyGlobal; }
+
+  /// Sets this function as potentially reading from any global.
+  void setMayReadAnyGlobal() { Info.setInt(Info.getInt() | MayReadAnyGlobal); }
+
+  /// Returns the \c ModRefInfo info for this function w.r.t. a particular
+  /// global, which may be more precise than the general information above.
+  ModRefInfo getModRefInfoForGlobal(const GlobalValue &GV) const {
+    ModRefInfo GlobalMRI = mayReadAnyGlobal() ? MRI_Ref : MRI_NoModRef;
+    if (AlignedMap *P = Info.getPointer()) {
+      auto I = P->Map.find(&GV);
+      if (I != P->Map.end())
+        GlobalMRI = ModRefInfo(GlobalMRI | I->second);
+    }
+    return GlobalMRI;
+  }
+
+  /// Add mod/ref info from another function into ours, saturating towards
+  /// MRI_ModRef.
+  void addFunctionInfo(const FunctionInfo &FI) {
+    addModRefInfo(FI.getModRefInfo());
+
+    if (FI.mayReadAnyGlobal())
+      setMayReadAnyGlobal();
+
+    if (AlignedMap *P = FI.Info.getPointer())
+      for (const auto &G : P->Map)
+        addModRefInfoForGlobal(*G.first, G.second);
+  }
+
+  void addModRefInfoForGlobal(const GlobalValue &GV, ModRefInfo NewMRI) {
+    AlignedMap *P = Info.getPointer();
+    if (!P) {
+      P = new AlignedMap();
+      Info.setPointer(P);
+    }
+    auto &GlobalMRI = P->Map[&GV];
+    GlobalMRI = ModRefInfo(GlobalMRI | NewMRI);
+  }
+
+  /// Clear a global's ModRef info. Should be used when a global is being
+  /// deleted.
+  void eraseModRefInfoForGlobal(const GlobalValue &GV) {
+    if (AlignedMap *P = Info.getPointer())
+      P->Map.erase(&GV);
+  }
+
+private:
+  /// All of the information is encoded into a single pointer, with a three bit
+  /// integer in the low three bits. The high bit provides a flag for when this
+  /// function may read any global. The low two bits are the ModRefInfo. And
+  /// the pointer, when non-null, points to a map from GlobalValue to
+  /// ModRefInfo specific to that GlobalValue.
+  PointerIntPair<AlignedMap *, 3, unsigned, AlignedMapPointerTraits> Info;
+};
+
+void GlobalsAAResult::DeletionCallbackHandle::deleted() {
+  Value *V = getValPtr();
+  if (auto *F = dyn_cast<Function>(V))
+    GAR->FunctionInfos.erase(F);
+
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    if (GAR->NonAddressTakenGlobals.erase(GV)) {
+      // This global might be an indirect global. If so, remove it and
+      // remove any AllocRelatedValues for it.
+      if (GAR->IndirectGlobals.erase(GV)) {
+        // Remove any entries in AllocsForIndirectGlobals for this global.
+        for (auto I = GAR->AllocsForIndirectGlobals.begin(),
+                  E = GAR->AllocsForIndirectGlobals.end();
+             I != E; ++I)
+          if (I->second == GV)
+            GAR->AllocsForIndirectGlobals.erase(I);
+      }
+
+      // Scan the function info we have collected and remove this global
+      // from all of them.
+      for (auto &FIPair : GAR->FunctionInfos)
+        FIPair.second.eraseModRefInfoForGlobal(*GV);
+    }
+  }
+
+  // If this is an allocation related to an indirect global, remove it.
+  GAR->AllocsForIndirectGlobals.erase(V);
+
+  // And clear out the handle.
+  setValPtr(nullptr);
+  GAR->Handles.erase(I);
+  // This object is now destroyed!
+}
+
+FunctionModRefBehavior GlobalsAAResult::getModRefBehavior(const Function *F) {
+  FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior;
+
+  if (FunctionInfo *FI = getFunctionInfo(F)) {
+    if (FI->getModRefInfo() == MRI_NoModRef)
+      Min = FMRB_DoesNotAccessMemory;
+    else if ((FI->getModRefInfo() & MRI_Mod) == 0)
+      Min = FMRB_OnlyReadsMemory;
+  }
+
+  return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min);
+}
+
+FunctionModRefBehavior
+GlobalsAAResult::getModRefBehavior(ImmutableCallSite CS) {
+  FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior;
+
+  if (const Function *F = CS.getCalledFunction())
+    if (FunctionInfo *FI = getFunctionInfo(F)) {
+      if (FI->getModRefInfo() == MRI_NoModRef)
+        Min = FMRB_DoesNotAccessMemory;
+      else if ((FI->getModRefInfo() & MRI_Mod) == 0)
+        Min = FMRB_OnlyReadsMemory;
+    }
+
+  return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min);
+}
+
+/// Returns the function info for the function, or null if we don't have
+/// anything useful to say about it.
+GlobalsAAResult::FunctionInfo *
+GlobalsAAResult::getFunctionInfo(const Function *F) {
+  auto I = FunctionInfos.find(F);
+  if (I != FunctionInfos.end())
+    return &I->second;
+  return nullptr;
+}
+
+/// AnalyzeGlobals - Scan through the users of all of the internal
+/// GlobalValues in the program. If none of them have their "address taken"
+/// (really, their address passed to something nontrivial), record this fact,
+/// and record the functions that they are used directly in.
+void GlobalsAAResult::AnalyzeGlobals(Module &M) {
+  SmallPtrSet<Function *, 64> TrackedFunctions;
+  for (Function &F : M)
+    if (F.hasLocalLinkage())
+      if (!AnalyzeUsesOfPointer(&F)) {
+        // Remember that we are tracking this global.
+        NonAddressTakenGlobals.insert(&F);
+        TrackedFunctions.insert(&F);
+        Handles.emplace_front(*this, &F);
+        Handles.front().I = Handles.begin();
+        ++NumNonAddrTakenFunctions;
+      }
+
+  SmallPtrSet<Function *, 64> Readers, Writers;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.hasLocalLinkage()) {
+      if (!AnalyzeUsesOfPointer(&GV, &Readers,
+                                GV.isConstant() ? nullptr : &Writers)) {
+        // Remember that we are tracking this global, and the mod/ref fns
+        NonAddressTakenGlobals.insert(&GV);
+        Handles.emplace_front(*this, &GV);
+        Handles.front().I = Handles.begin();
+
+        for (Function *Reader : Readers) {
+          if (TrackedFunctions.insert(Reader).second) {
+            Handles.emplace_front(*this, Reader);
+            Handles.front().I = Handles.begin();
+          }
+          FunctionInfos[Reader].addModRefInfoForGlobal(GV, MRI_Ref);
+        }
+
+        if (!GV.isConstant()) // No need to keep track of writers to constants
+          for (Function *Writer : Writers) {
+            if (TrackedFunctions.insert(Writer).second) {
+              Handles.emplace_front(*this, Writer);
+              Handles.front().I = Handles.begin();
+            }
+            FunctionInfos[Writer].addModRefInfoForGlobal(GV, MRI_Mod);
+          }
+        ++NumNonAddrTakenGlobalVars;
+
+        // If this global holds a pointer type, see if it is an indirect global.
+        if (GV.getType()->getElementType()->isPointerTy() &&
+            AnalyzeIndirectGlobalMemory(&GV))
+          ++NumIndirectGlobalVars;
+      }
+      Readers.clear();
+      Writers.clear();
+    }
+}
+
+/// AnalyzeUsesOfPointer - Look at all of the users of the specified pointer.
+/// If this is used by anything complex (i.e., the address escapes), return
+/// true. Also, while we are at it, keep track of those functions that read and
+/// write to the value.
+///
+/// If OkayStoreDest is non-null, stores into this global are allowed.
+bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V,
+                                           SmallPtrSetImpl<Function *> *Readers,
+                                           SmallPtrSetImpl<Function *> *Writers,
+                                           GlobalValue *OkayStoreDest) {
+  if (!V->getType()->isPointerTy())
+    return true;
+
+  for (Use &U : V->uses()) {
+    User *I = U.getUser();
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      if (Readers)
+        Readers->insert(LI->getParent()->getParent());
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (V == SI->getOperand(1)) {
+        if (Writers)
+          Writers->insert(SI->getParent()->getParent());
+      } else if (SI->getOperand(1) != OkayStoreDest) {
+        return true; // Storing the pointer
+      }
+    } else if (Operator::getOpcode(I) == Instruction::GetElementPtr) {
+      if (AnalyzeUsesOfPointer(I, Readers, Writers))
+        return true;
+    } else if (Operator::getOpcode(I) == Instruction::BitCast) {
+      if (AnalyzeUsesOfPointer(I, Readers, Writers, OkayStoreDest))
+        return true;
+    } else if (auto CS = CallSite(I)) {
+      // Make sure that this is just the function being called, not a value
+      // being passed into the function.
+      if (CS.isDataOperand(&U)) {
+        // Detect calls to free.
+        if (CS.isArgOperand(&U) && isFreeCall(I, &TLI)) {
+          if (Writers)
+            Writers->insert(CS->getParent()->getParent());
+        } else {
+          return true; // Argument of an unknown call.
+        }
+      }
+    } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
+      if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+        return true; // Allow comparison against null.
+    } else {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// AnalyzeIndirectGlobalMemory - We found a non-address-taken global variable
+/// which holds a pointer type. See if the global always points to non-aliased
+/// heap memory: that is, all initializers of the globals are allocations, and
+/// those allocations have no use other than initialization of the global.
+/// Further, all loads out of GV must directly use the memory, not store the
+/// pointer somewhere. If this is true, we consider the memory pointed to by
+/// GV to be owned by GV and can disambiguate other pointers from it.
+bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) {
+  // Keep track of values related to the allocation of the memory, e.g. the
+  // value produced by the malloc call and any casts.
+  std::vector<Value *> AllocRelatedValues;
+
+  // If the initializer is a valid pointer, bail.
+  if (Constant *C = GV->getInitializer())
+    if (!C->isNullValue())
+      return false;
+
+  // Walk the user list of the global. If we find anything other than a direct
+  // load or store, bail out.
+  for (User *U : GV->users()) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // The pointer loaded from the global can only be used in simple ways:
+      // we allow addressing of it and loading from and storing to it. We do
+      // *not* allow storing the loaded pointer somewhere else or passing it
+      // to a function.
+      if (AnalyzeUsesOfPointer(LI))
+        return false; // Loaded pointer escapes.
+      // TODO: Could try some IP mod/ref of the loaded pointer.
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // Storing the global itself.
+      if (SI->getOperand(0) == GV)
+        return false;
+
+      // If storing the null pointer, ignore it.
+      if (isa<ConstantPointerNull>(SI->getOperand(0)))
+        continue;
+
+      // Check the value being stored.
+      Value *Ptr = GetUnderlyingObject(SI->getOperand(0),
+                                       GV->getParent()->getDataLayout());
+
+      if (!isAllocLikeFn(Ptr, &TLI))
+        return false; // Too hard to analyze.
+
+      // Analyze all uses of the allocation. If any of them are used in a
+      // non-simple way (e.g. stored to another global) bail out.
+      if (AnalyzeUsesOfPointer(Ptr, /*Readers*/ nullptr, /*Writers*/ nullptr,
+                               GV))
+        return false; // Loaded pointer escapes.
+
+      // Remember that this allocation is related to the indirect global.
+      AllocRelatedValues.push_back(Ptr);
+    } else {
+      // Something complex, bail out.
+      return false;
+    }
+  }
+
+  // Okay, this is an indirect global. Remember all of the allocations for
+  // this global in AllocsForIndirectGlobals.
+  while (!AllocRelatedValues.empty()) {
+    AllocsForIndirectGlobals[AllocRelatedValues.back()] = GV;
+    Handles.emplace_front(*this, AllocRelatedValues.back());
+    Handles.front().I = Handles.begin();
+    AllocRelatedValues.pop_back();
+  }
+  IndirectGlobals.insert(GV);
+  Handles.emplace_front(*this, GV);
+  Handles.front().I = Handles.begin();
+  return true;
+}
+
+void GlobalsAAResult::CollectSCCMembership(CallGraph &CG) {
+  // We do a bottom-up SCC traversal of the call graph. In other words, we
+  // visit all callees before callers (leaf-first).
+  unsigned SCCID = 0;
+  for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+    const std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+
+    for (auto *CGN : SCC)
+      if (Function *F = CGN->getFunction())
+        FunctionToSCCMap[F] = SCCID;
+    ++SCCID;
+  }
+}
+
+/// AnalyzeCallGraph - At this point, we know the functions where globals are
+/// immediately stored to and read from. Propagate this information up the call
+/// graph to all callers and compute the mod/ref info for all memory for each
+/// function.
+void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
+  // We do a bottom-up SCC traversal of the call graph. In other words, we
+  // visit all callees before callers (leaf-first).
+  for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+    const std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+
+    if (!SCC[0]->getFunction() || SCC[0]->getFunction()->mayBeOverridden()) {
+      // Calls externally or is weak - can't say anything useful. Remove any
+      // existing function records (may have been created when scanning
+      // globals).
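
A hedged sketch of why overridable definitions poison the whole SCC may be useful here; the names (G, Hook, UseHook) are invented, and the weak attribute is shown in GCC/Clang syntax as one way such a symbol arises. A strong definition in another translation unit can replace the weak body at link time and touch globals this body never mentions, hence the erasure loop that follows:

static int G;

__attribute__((weak)) void Hook() {} // mayBeOverridden() is true for Hook

int UseHook() {
  G = 1;
  Hook();   // an overriding Hook that writes G would make any mod/ref
  return G; // info derived from the empty weak body above unsound
}
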
+      for (auto *Node : SCC)
+        FunctionInfos.erase(Node->getFunction());
+      continue;
+    }
+
+    FunctionInfo &FI = FunctionInfos[SCC[0]->getFunction()];
+    bool KnowNothing = false;
+
+    // Collect the mod/ref properties due to called functions. We only compute
+    // one mod-ref set.
+    for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) {
+      Function *F = SCC[i]->getFunction();
+      if (!F) {
+        KnowNothing = true;
+        break;
+      }
+
+      if (F->isDeclaration()) {
+        // Try to get mod/ref behaviour from function attributes.
+        if (F->doesNotAccessMemory()) {
+          // Can't do better than that!
+        } else if (F->onlyReadsMemory()) {
+          FI.addModRefInfo(MRI_Ref);
+          if (!F->isIntrinsic())
+            // This function might call back into the module and read a global -
+            // consider every global as possibly being read by this function.
+            FI.setMayReadAnyGlobal();
+        } else {
+          FI.addModRefInfo(MRI_ModRef);
+          // Can't say anything useful unless it's an intrinsic - they don't
+          // read or write global variables of the kind considered here.
+          KnowNothing = !F->isIntrinsic();
+        }
+        continue;
+      }
+
+      for (CallGraphNode::iterator CI = SCC[i]->begin(), E = SCC[i]->end();
+           CI != E && !KnowNothing; ++CI)
+        if (Function *Callee = CI->second->getFunction()) {
+          if (FunctionInfo *CalleeFI = getFunctionInfo(Callee)) {
+            // Propagate function effect up.
+            FI.addFunctionInfo(*CalleeFI);
+          } else {
+            // Can't say anything about it. However, if it is inside our SCC,
+            // then nothing needs to be done.
+            CallGraphNode *CalleeNode = CG[Callee];
+            if (std::find(SCC.begin(), SCC.end(), CalleeNode) == SCC.end())
+              KnowNothing = true;
+          }
+        } else {
+          KnowNothing = true;
+        }
+    }
+
+    // If we can't say anything useful about this SCC, remove all SCC functions
+    // from the FunctionInfos map.
+    if (KnowNothing) {
+      for (auto *Node : SCC)
+        FunctionInfos.erase(Node->getFunction());
+      continue;
+    }
+
+    // Scan the function bodies for explicit loads or stores.
+    for (auto *Node : SCC) {
+      if (FI.getModRefInfo() == MRI_ModRef)
+        break; // The mod/ref lattice saturates here.
+      for (Instruction &I : instructions(Node->getFunction())) {
+        if (FI.getModRefInfo() == MRI_ModRef)
+          break; // The mod/ref lattice saturates here.
+
+        // We handle calls specially because the graph-relevant aspects are
+        // handled above.
+        if (auto CS = CallSite(&I)) {
+          if (isAllocationFn(&I, &TLI) || isFreeCall(&I, &TLI)) {
+            // FIXME: It is completely unclear why this is necessary and not
+            // handled by the above graph code.
+            FI.addModRefInfo(MRI_ModRef);
+          } else if (Function *Callee = CS.getCalledFunction()) {
+            // The callgraph doesn't include intrinsic calls.
+            if (Callee->isIntrinsic()) {
+              FunctionModRefBehavior Behaviour =
+                  AAResultBase::getModRefBehavior(Callee);
+              FI.addModRefInfo(ModRefInfo(Behaviour & MRI_ModRef));
+            }
+          }
+          continue;
+        }
+
+        // For all non-call instructions we use the primary predicates for
+        // whether they read or write memory.
+        if (I.mayReadFromMemory())
+          FI.addModRefInfo(MRI_Ref);
+        if (I.mayWriteToMemory())
+          FI.addModRefInfo(MRI_Mod);
+      }
+    }
+
+    if ((FI.getModRefInfo() & MRI_Mod) == 0)
+      ++NumReadMemFunctions;
+    if (FI.getModRefInfo() == MRI_NoModRef)
+      ++NumNoMemFunctions;
+
+    // Finally, now that we know the full effect on this SCC, clone the
+    // information to each function in the SCC.
+    // FI is a reference into FunctionInfos, so copy it now so that it doesn't
+    // get invalidated if DenseMap decides to re-hash.
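
An illustrative-only sketch of the invalidation hazard that this copy avoids (the function Hazard and its contents are invented, not part of the patch): inserting into a DenseMap may trigger a re-hash that moves every entry, leaving previously obtained references dangling.

#include "llvm/ADT/DenseMap.h"

void Hazard() {
  llvm::DenseMap<int, int> M;
  int &Ref = M[0];          // reference into the table
  for (int K = 1; K < 100; ++K)
    M[K] = K;               // growth may re-hash and move every entry
  (void)Ref;                // Ref may now dangle; do not dereference it
}

Copying FI into a local value before assigning into the same map, as the code below does with CachedFI, sidesteps exactly this hazard.
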
+    FunctionInfo CachedFI = FI;
+    for (unsigned i = 1, e = SCC.size(); i != e; ++i)
+      FunctionInfos[SCC[i]->getFunction()] = CachedFI;
+  }
+}
+
+// GV is a non-escaping global. V is a pointer address that has been loaded from.
+// If we can prove that V must escape, we can conclude that a load from V cannot
+// alias GV.
+static bool isNonEscapingGlobalNoAliasWithLoad(const GlobalValue *GV,
+                                               const Value *V,
+                                               int &Depth,
+                                               const DataLayout &DL) {
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Inputs;
+  Visited.insert(V);
+  Inputs.push_back(V);
+  do {
+    const Value *Input = Inputs.pop_back_val();
+
+    if (isa<GlobalValue>(Input) || isa<Argument>(Input) || isa<CallInst>(Input) ||
+        isa<InvokeInst>(Input))
+      // Arguments to functions or returns from functions are inherently
+      // escaping, so we can immediately classify those as not aliasing any
+      // non-addr-taken globals.
+      //
+      // (Transitive) loads from a global are also safe - if this aliased
+      // another global, its address would escape, so no alias.
+      continue;
+
+    // Recurse through a limited number of selects, loads and PHIs. This is an
+    // arbitrary depth of 4, lower numbers could be used to fix compile time
+    // issues if needed, but this is generally expected to only be important
+    // for small depths.
+    if (++Depth > 4)
+      return false;
+
+    if (auto *LI = dyn_cast<LoadInst>(Input)) {
+      Inputs.push_back(GetUnderlyingObject(LI->getPointerOperand(), DL));
+      continue;
+    }
+    if (auto *SI = dyn_cast<SelectInst>(Input)) {
+      const Value *LHS = GetUnderlyingObject(SI->getTrueValue(), DL);
+      const Value *RHS = GetUnderlyingObject(SI->getFalseValue(), DL);
+      if (Visited.insert(LHS).second)
+        Inputs.push_back(LHS);
+      if (Visited.insert(RHS).second)
+        Inputs.push_back(RHS);
+      continue;
+    }
+    if (auto *PN = dyn_cast<PHINode>(Input)) {
+      for (const Value *Op : PN->incoming_values()) {
+        Op = GetUnderlyingObject(Op, DL);
+        if (Visited.insert(Op).second)
+          Inputs.push_back(Op);
+      }
+      continue;
+    }
+
+    return false;
+  } while (!Inputs.empty());
+
+  // All inputs were known to be no-alias.
+  return true;
+}
+
+// There are particular cases where we can conclude no-alias between
+// a non-addr-taken global and some other underlying object. Specifically,
+// a non-addr-taken global is known to not be escaped from any function. It is
+// also incorrect for a transformation to introduce an escape of a global in
+// a way that is observable when it was not there previously. One function
+// being transformed to introduce an escape which could possibly be observed
+// (via loading from a global or the return value for example) within another
+// function is never safe. If the observation is made through non-atomic
+// operations on different threads, it is a data-race and UB. If the
+// observation is well defined, by being observed the transformation would have
+// changed program behavior by introducing the observed escape, making it an
+// invalid transform.
+//
+// This property does require that transformations which *temporarily* escape
+// a global that was not previously escaped, prior to restoring it, cannot rely
+// on the results of GMR::alias. This seems a reasonable restriction, although
+// currently there is no way to enforce it. There is also no realistic
+// optimization pass that would make this mistake. The closest example is
+// a transformation pass which does reg2mem of SSA values but stores them into
+// global variables temporarily before restoring the global variable's value.
+// This could be useful to expose "benign" races for example. However, it seems
+// reasonable to require that a pass which introduces escapes of global
+// variables in this way to either not trust AA results while the escape is
+// active, or to be forced to operate as a module pass that cannot co-exist
+// with an alias analysis such as GMR.
+bool GlobalsAAResult::isNonEscapingGlobalNoAlias(const GlobalValue *GV,
+                                                 const Value *V) {
+  // In order to know that the underlying object cannot alias the
+  // non-addr-taken global, we must know that it would have to be an escape.
+  // Thus if the underlying object is a function argument, a load from
+  // a global, or the return of a function, it cannot alias. We can also
+  // recurse through PHI nodes and select nodes provided all of their inputs
+  // resolve to one of these known-escaping roots.
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Inputs;
+  Visited.insert(V);
+  Inputs.push_back(V);
+  int Depth = 0;
+  do {
+    const Value *Input = Inputs.pop_back_val();
+
+    if (auto *InputGV = dyn_cast<GlobalValue>(Input)) {
+      // If one input is the very global we're querying against, then we can't
+      // conclude no-alias.
+      if (InputGV == GV)
+        return false;
+
+      // Distinct GlobalVariables never alias, unless overridden or zero-sized.
+      // FIXME: The condition can be refined, but be conservative for now.
+      auto *GVar = dyn_cast<GlobalVariable>(GV);
+      auto *InputGVar = dyn_cast<GlobalVariable>(InputGV);
+      if (GVar && InputGVar &&
+          !GVar->isDeclaration() && !InputGVar->isDeclaration() &&
+          !GVar->mayBeOverridden() && !InputGVar->mayBeOverridden()) {
+        Type *GVType = GVar->getInitializer()->getType();
+        Type *InputGVType = InputGVar->getInitializer()->getType();
+        if (GVType->isSized() && InputGVType->isSized() &&
+            (DL.getTypeAllocSize(GVType) > 0) &&
+            (DL.getTypeAllocSize(InputGVType) > 0))
+          continue;
+      }
+
+      // Conservatively return false, even though we could be smarter
+      // (e.g. look through GlobalAliases).
+      return false;
+    }
+
+    if (isa<Argument>(Input) || isa<CallInst>(Input) ||
+        isa<InvokeInst>(Input)) {
+      // Arguments to functions or returns from functions are inherently
+      // escaping, so we can immediately classify those as not aliasing any
+      // non-addr-taken globals.
+      continue;
+    }
+
+    // Recurse through a limited number of selects, loads and PHIs. This is an
+    // arbitrary depth of 4, lower numbers could be used to fix compile time
+    // issues if needed, but this is generally expected to only be important
+    // for small depths.
+    if (++Depth > 4)
+      return false;
+
+    if (auto *LI = dyn_cast<LoadInst>(Input)) {
+      // A pointer loaded from a global would have been captured, and we know
+      // that the global is non-escaping, so no alias.
+      const Value *Ptr = GetUnderlyingObject(LI->getPointerOperand(), DL);
+      if (isNonEscapingGlobalNoAliasWithLoad(GV, Ptr, Depth, DL))
+        // The load does not alias with GV.
+        continue;
+      // Otherwise, a load could come from anywhere, so bail.
+      return false;
+    }
+    if (auto *SI = dyn_cast<SelectInst>(Input)) {
+      const Value *LHS = GetUnderlyingObject(SI->getTrueValue(), DL);
+      const Value *RHS = GetUnderlyingObject(SI->getFalseValue(), DL);
+      if (Visited.insert(LHS).second)
+        Inputs.push_back(LHS);
+      if (Visited.insert(RHS).second)
+        Inputs.push_back(RHS);
+      continue;
+    }
+    if (auto *PN = dyn_cast<PHINode>(Input)) {
+      for (const Value *Op : PN->incoming_values()) {
+        Op = GetUnderlyingObject(Op, DL);
+        if (Visited.insert(Op).second)
+          Inputs.push_back(Op);
+      }
+      continue;
+    }
+
+    // FIXME: It would be good to handle other obvious no-alias cases here, but
+    // it isn't clear how to do so reasonably without building a small version
+    // of BasicAA into this code. We could recurse into AAResultBase::alias
+    // here but that seems likely to go poorly as we're inside the
+    // implementation of such a query. Until then, just conservatively return
+    // false.
+    return false;
+  } while (!Inputs.empty());
+
+  // If all the inputs to V were definitively no-alias, then V is no-alias.
+  return true;
+}
+
+/// alias - If one of the pointers is to a global that we are tracking, and the
+/// other is some random pointer, we know there cannot be an alias, because the
+/// address of the global isn't taken.
+AliasResult GlobalsAAResult::alias(const MemoryLocation &LocA,
+                                   const MemoryLocation &LocB) {
+  // Get the base object these pointers point to.
+  const Value *UV1 = GetUnderlyingObject(LocA.Ptr, DL);
+  const Value *UV2 = GetUnderlyingObject(LocB.Ptr, DL);
+
+  // If either of the underlying values is a global, they may be non-addr-taken
+  // globals, which we can answer queries about.
+  const GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1);
+  const GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2);
+  if (GV1 || GV2) {
+    // If the global's address is taken, pretend we don't know it's a pointer to
+    // the global.
+    if (GV1 && !NonAddressTakenGlobals.count(GV1))
+      GV1 = nullptr;
+    if (GV2 && !NonAddressTakenGlobals.count(GV2))
+      GV2 = nullptr;
+
+    // If the two pointers are derived from two different non-addr-taken
+    // globals we know these can't alias.
+    if (GV1 && GV2 && GV1 != GV2)
+      return NoAlias;
+
+    // If one is and the other isn't, it isn't strictly safe but we can fake
+    // this result if necessary for performance. This does not appear to be
+    // a common problem in practice.
+    if (EnableUnsafeGlobalsModRefAliasResults)
+      if ((GV1 || GV2) && GV1 != GV2)
+        return NoAlias;
+
+    // Check for a special case where a non-escaping global can be used to
+    // conclude no-alias.
+    if ((GV1 || GV2) && GV1 != GV2) {
+      const GlobalValue *GV = GV1 ? GV1 : GV2;
+      const Value *UV = GV1 ? UV2 : UV1;
+      if (isNonEscapingGlobalNoAlias(GV, UV))
+        return NoAlias;
+    }
+
+    // Otherwise if they are both derived from the same addr-taken global, we
+    // can't know the two accesses don't overlap.
+  }
+
+  // These pointers may be based on the memory owned by an indirect global. If
+  // so, we may be able to handle this. First check to see if the base pointer
+  // is a direct load from an indirect global.
+  GV1 = GV2 = nullptr;
+  if (const LoadInst *LI = dyn_cast<LoadInst>(UV1))
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+      if (IndirectGlobals.count(GV))
+        GV1 = GV;
+  if (const LoadInst *LI = dyn_cast<LoadInst>(UV2))
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+      if (IndirectGlobals.count(GV))
+        GV2 = GV;
+
+  // These pointers may also be from an allocation for the indirect global. If
+  // so, also handle them.
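
An illustrative-only example of the "indirect global" pattern being handled (Buf, Init, and Mix are invented names): an internal global pointer whose only stores are fresh allocations, so the memory it points at is considered owned by the global.

#include <cstdlib>

static int *Buf; // internal; only ever stores fresh allocations

void Init() { Buf = static_cast<int *>(std::malloc(16 * sizeof(int))); }

int Mix(int *Other) {
  Buf[0] = 1;    // derived from Buf's owned allocation
  *Other = 2;    // a pointer unrelated to that allocation cannot alias it
  return Buf[0];
}

The lookups below map both direct loads of such a global and its recorded allocations back to the owning global before disambiguating.
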
+ if (!GV1) + GV1 = AllocsForIndirectGlobals.lookup(UV1); + if (!GV2) + GV2 = AllocsForIndirectGlobals.lookup(UV2); + + // Now that we know whether the two pointers are related to indirect globals, + // use this to disambiguate the pointers. If the pointers are based on + // different indirect globals they cannot alias. + if (GV1 && GV2 && GV1 != GV2) + return NoAlias; + + // If one is based on an indirect global and the other isn't, it isn't + // strictly safe but we can fake this result if necessary for performance. + // This does not appear to be a common problem in practice. + if (EnableUnsafeGlobalsModRefAliasResults) + if ((GV1 || GV2) && GV1 != GV2) + return NoAlias; + + return AAResultBase::alias(LocA, LocB); +} + +ModRefInfo GlobalsAAResult::getModRefInfoForArgument(ImmutableCallSite CS, + const GlobalValue *GV) { + if (CS.doesNotAccessMemory()) + return MRI_NoModRef; + ModRefInfo ConservativeResult = CS.onlyReadsMemory() ? MRI_Ref : MRI_ModRef; + + // Iterate through all the arguments to the called function. If any argument + // is based on GV, return the conservative result. + for (auto &A : CS.args()) { + SmallVector<Value*, 4> Objects; + GetUnderlyingObjects(A, Objects, DL); + + // All objects must be identified. + if (!std::all_of(Objects.begin(), Objects.end(), isIdentifiedObject)) + return ConservativeResult; + + if (std::find(Objects.begin(), Objects.end(), GV) != Objects.end()) + return ConservativeResult; + } + + // We identified all objects in the argument list, and none of them were GV. + return MRI_NoModRef; +} + +ModRefInfo GlobalsAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { + unsigned Known = MRI_ModRef; + + // If we are asking for mod/ref info of a direct call with a pointer to a + // global we are tracking, return information if we have it. + if (const GlobalValue *GV = + dyn_cast<GlobalValue>(GetUnderlyingObject(Loc.Ptr, DL))) + if (GV->hasLocalLinkage()) + if (const Function *F = CS.getCalledFunction()) + if (NonAddressTakenGlobals.count(GV)) + if (const FunctionInfo *FI = getFunctionInfo(F)) + Known = FI->getModRefInfoForGlobal(*GV) | + getModRefInfoForArgument(CS, GV); + + if (Known == MRI_NoModRef) + return MRI_NoModRef; // No need to query other mod/ref analyses + return ModRefInfo(Known & AAResultBase::getModRefInfo(CS, Loc)); +} + +GlobalsAAResult::GlobalsAAResult(const DataLayout &DL, + const TargetLibraryInfo &TLI) + : AAResultBase(TLI), DL(DL) {} + +GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg) + : AAResultBase(std::move(Arg)), DL(Arg.DL), + NonAddressTakenGlobals(std::move(Arg.NonAddressTakenGlobals)), + IndirectGlobals(std::move(Arg.IndirectGlobals)), + AllocsForIndirectGlobals(std::move(Arg.AllocsForIndirectGlobals)), + FunctionInfos(std::move(Arg.FunctionInfos)), + Handles(std::move(Arg.Handles)) { + // Update the parent for each DeletionCallbackHandle. + for (auto &H : Handles) { + assert(H.GAR == &Arg); + H.GAR = this; + } +} + +/*static*/ GlobalsAAResult +GlobalsAAResult::analyzeModule(Module &M, const TargetLibraryInfo &TLI, + CallGraph &CG) { + GlobalsAAResult Result(M.getDataLayout(), TLI); + + // Discover which functions aren't recursive, to feed into AnalyzeGlobals. + Result.CollectSCCMembership(CG); + + // Find non-addr taken globals. + Result.AnalyzeGlobals(M); + + // Propagate on CG. 
+ Result.AnalyzeCallGraph(CG, M); + + return Result; +} + +GlobalsAAResult GlobalsAA::run(Module &M, AnalysisManager<Module> *AM) { + return GlobalsAAResult::analyzeModule(M, + AM->getResult<TargetLibraryAnalysis>(M), + AM->getResult<CallGraphAnalysis>(M)); +} + +char GlobalsAA::PassID; + +char GlobalsAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(GlobalsAAWrapperPass, "globals-aa", + "Globals Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(GlobalsAAWrapperPass, "globals-aa", + "Globals Alias Analysis", false, true) + +ModulePass *llvm::createGlobalsAAWrapperPass() { + return new GlobalsAAWrapperPass(); +} + +GlobalsAAWrapperPass::GlobalsAAWrapperPass() : ModulePass(ID) { + initializeGlobalsAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool GlobalsAAWrapperPass::runOnModule(Module &M) { + Result.reset(new GlobalsAAResult(GlobalsAAResult::analyzeModule( + M, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<CallGraphWrapperPass>().getCallGraph()))); + return false; +} + +bool GlobalsAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void GlobalsAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<CallGraphWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); +} diff --git a/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp deleted file mode 100644 index 28fb49c..0000000 --- a/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp +++ /dev/null @@ -1,609 +0,0 @@ -//===- GlobalsModRef.cpp - Simple Mod/Ref Analysis for Globals ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This simple pass provides alias and mod/ref information for global values -// that do not have their address taken, and keeps track of whether functions -// read or write memory (are "pure"). For this simple (but very common) case, -// we can provide pretty accurate and useful information. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include <set> -using namespace llvm; - -#define DEBUG_TYPE "globalsmodref-aa" - -STATISTIC(NumNonAddrTakenGlobalVars, - "Number of global vars without address taken"); -STATISTIC(NumNonAddrTakenFunctions,"Number of functions without address taken"); -STATISTIC(NumNoMemFunctions, "Number of functions that do not access memory"); -STATISTIC(NumReadMemFunctions, "Number of functions that only read memory"); -STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects"); - -namespace { -/// FunctionRecord - One instance of this structure is stored for every -/// function in the program. 
Later, the entries for these functions are -/// removed if the function is found to call an external function (in which -/// case we know nothing about it. -struct FunctionRecord { - /// GlobalInfo - Maintain mod/ref info for all of the globals without - /// addresses taken that are read or written (transitively) by this - /// function. - std::map<const GlobalValue *, unsigned> GlobalInfo; - - /// MayReadAnyGlobal - May read global variables, but it is not known which. - bool MayReadAnyGlobal; - - unsigned getInfoForGlobal(const GlobalValue *GV) const { - unsigned Effect = MayReadAnyGlobal ? AliasAnalysis::Ref : 0; - std::map<const GlobalValue *, unsigned>::const_iterator I = - GlobalInfo.find(GV); - if (I != GlobalInfo.end()) - Effect |= I->second; - return Effect; - } - - /// FunctionEffect - Capture whether or not this function reads or writes to - /// ANY memory. If not, we can do a lot of aggressive analysis on it. - unsigned FunctionEffect; - - FunctionRecord() : MayReadAnyGlobal(false), FunctionEffect(0) {} -}; - -/// GlobalsModRef - The actual analysis pass. -class GlobalsModRef : public ModulePass, public AliasAnalysis { - /// NonAddressTakenGlobals - The globals that do not have their addresses - /// taken. - std::set<const GlobalValue *> NonAddressTakenGlobals; - - /// IndirectGlobals - The memory pointed to by this global is known to be - /// 'owned' by the global. - std::set<const GlobalValue *> IndirectGlobals; - - /// AllocsForIndirectGlobals - If an instruction allocates memory for an - /// indirect global, this map indicates which one. - std::map<const Value *, const GlobalValue *> AllocsForIndirectGlobals; - - /// FunctionInfo - For each function, keep track of what globals are - /// modified or read. - std::map<const Function *, FunctionRecord> FunctionInfo; - -public: - static char ID; - GlobalsModRef() : ModulePass(ID) { - initializeGlobalsModRefPass(*PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override { - InitializeAliasAnalysis(this, &M.getDataLayout()); - - // Find non-addr taken globals. - AnalyzeGlobals(M); - - // Propagate on CG. - AnalyzeCallGraph(getAnalysis<CallGraphWrapperPass>().getCallGraph(), M); - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AliasAnalysis::getAnalysisUsage(AU); - AU.addRequired<CallGraphWrapperPass>(); - AU.setPreservesAll(); // Does not transform code - } - - //------------------------------------------------ - // Implement the AliasAnalysis API - // - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return AliasAnalysis::getModRefInfo(CS1, CS2); - } - - /// getModRefBehavior - Return the behavior of the specified function if - /// called from the specified call site. The call site may be null in which - /// case the most generic behavior of this function should be returned. - ModRefBehavior getModRefBehavior(const Function *F) override { - ModRefBehavior Min = UnknownModRefBehavior; - - if (FunctionRecord *FR = getFunctionInfo(F)) { - if (FR->FunctionEffect == 0) - Min = DoesNotAccessMemory; - else if ((FR->FunctionEffect & Mod) == 0) - Min = OnlyReadsMemory; - } - - return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min); - } - - /// getModRefBehavior - Return the behavior of the specified function if - /// called from the specified call site. 
The call site may be null in which - /// case the most generic behavior of this function should be returned. - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override { - ModRefBehavior Min = UnknownModRefBehavior; - - if (const Function *F = CS.getCalledFunction()) - if (FunctionRecord *FR = getFunctionInfo(F)) { - if (FR->FunctionEffect == 0) - Min = DoesNotAccessMemory; - else if ((FR->FunctionEffect & Mod) == 0) - Min = OnlyReadsMemory; - } - - return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min); - } - - void deleteValue(Value *V) override; - void addEscapingUse(Use &U) override; - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis *)this; - return this; - } - -private: - /// getFunctionInfo - Return the function info for the function, or null if - /// we don't have anything useful to say about it. - FunctionRecord *getFunctionInfo(const Function *F) { - std::map<const Function *, FunctionRecord>::iterator I = - FunctionInfo.find(F); - if (I != FunctionInfo.end()) - return &I->second; - return nullptr; - } - - void AnalyzeGlobals(Module &M); - void AnalyzeCallGraph(CallGraph &CG, Module &M); - bool AnalyzeUsesOfPointer(Value *V, std::vector<Function *> &Readers, - std::vector<Function *> &Writers, - GlobalValue *OkayStoreDest = nullptr); - bool AnalyzeIndirectGlobalMemory(GlobalValue *GV); -}; -} - -char GlobalsModRef::ID = 0; -INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis, "globalsmodref-aa", - "Simple mod/ref analysis for globals", false, true, - false) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis, "globalsmodref-aa", - "Simple mod/ref analysis for globals", false, true, - false) - -Pass *llvm::createGlobalsModRefPass() { return new GlobalsModRef(); } - -/// AnalyzeGlobals - Scan through the users of all of the internal -/// GlobalValue's in the program. If none of them have their "address taken" -/// (really, their address passed to something nontrivial), record this fact, -/// and record the functions that they are used directly in. -void GlobalsModRef::AnalyzeGlobals(Module &M) { - std::vector<Function *> Readers, Writers; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->hasLocalLinkage()) { - if (!AnalyzeUsesOfPointer(I, Readers, Writers)) { - // Remember that we are tracking this global. - NonAddressTakenGlobals.insert(I); - ++NumNonAddrTakenFunctions; - } - Readers.clear(); - Writers.clear(); - } - - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; - ++I) - if (I->hasLocalLinkage()) { - if (!AnalyzeUsesOfPointer(I, Readers, Writers)) { - // Remember that we are tracking this global, and the mod/ref fns - NonAddressTakenGlobals.insert(I); - - for (unsigned i = 0, e = Readers.size(); i != e; ++i) - FunctionInfo[Readers[i]].GlobalInfo[I] |= Ref; - - if (!I->isConstant()) // No need to keep track of writers to constants - for (unsigned i = 0, e = Writers.size(); i != e; ++i) - FunctionInfo[Writers[i]].GlobalInfo[I] |= Mod; - ++NumNonAddrTakenGlobalVars; - - // If this global holds a pointer type, see if it is an indirect global. 
- if (I->getType()->getElementType()->isPointerTy() && - AnalyzeIndirectGlobalMemory(I)) - ++NumIndirectGlobalVars; - } - Readers.clear(); - Writers.clear(); - } -} - -/// AnalyzeUsesOfPointer - Look at all of the users of the specified pointer. -/// If this is used by anything complex (i.e., the address escapes), return -/// true. Also, while we are at it, keep track of those functions that read and -/// write to the value. -/// -/// If OkayStoreDest is non-null, stores into this global are allowed. -bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V, - std::vector<Function *> &Readers, - std::vector<Function *> &Writers, - GlobalValue *OkayStoreDest) { - if (!V->getType()->isPointerTy()) - return true; - - for (Use &U : V->uses()) { - User *I = U.getUser(); - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - Readers.push_back(LI->getParent()->getParent()); - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (V == SI->getOperand(1)) { - Writers.push_back(SI->getParent()->getParent()); - } else if (SI->getOperand(1) != OkayStoreDest) { - return true; // Storing the pointer - } - } else if (Operator::getOpcode(I) == Instruction::GetElementPtr) { - if (AnalyzeUsesOfPointer(I, Readers, Writers)) - return true; - } else if (Operator::getOpcode(I) == Instruction::BitCast) { - if (AnalyzeUsesOfPointer(I, Readers, Writers, OkayStoreDest)) - return true; - } else if (auto CS = CallSite(I)) { - // Make sure that this is just the function being called, not that it is - // passing into the function. - if (!CS.isCallee(&U)) { - // Detect calls to free. - if (isFreeCall(I, TLI)) - Writers.push_back(CS->getParent()->getParent()); - else - return true; // Argument of an unknown call. - } - } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { - if (!isa<ConstantPointerNull>(ICI->getOperand(1))) - return true; // Allow comparison against null. - } else { - return true; - } - } - - return false; -} - -/// AnalyzeIndirectGlobalMemory - We found an non-address-taken global variable -/// which holds a pointer type. See if the global always points to non-aliased -/// heap memory: that is, all initializers of the globals are allocations, and -/// those allocations have no use other than initialization of the global. -/// Further, all loads out of GV must directly use the memory, not store the -/// pointer somewhere. If this is true, we consider the memory pointed to by -/// GV to be owned by GV and can disambiguate other pointers from it. -bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) { - // Keep track of values related to the allocation of the memory, f.e. the - // value produced by the malloc call and any casts. - std::vector<Value *> AllocRelatedValues; - - // Walk the user list of the global. If we find anything other than a direct - // load or store, bail out. - for (User *U : GV->users()) { - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - // The pointer loaded from the global can only be used in simple ways: - // we allow addressing of it and loading storing to it. We do *not* allow - // storing the loaded pointer somewhere else or passing to a function. - std::vector<Function *> ReadersWriters; - if (AnalyzeUsesOfPointer(LI, ReadersWriters, ReadersWriters)) - return false; // Loaded pointer escapes. - // TODO: Could try some IP mod/ref of the loaded pointer. - } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - // Storing the global itself. - if (SI->getOperand(0) == GV) - return false; - - // If storing the null pointer, ignore it. 
- if (isa<ConstantPointerNull>(SI->getOperand(0))) - continue; - - // Check the value being stored. - Value *Ptr = GetUnderlyingObject(SI->getOperand(0), - GV->getParent()->getDataLayout()); - - if (!isAllocLikeFn(Ptr, TLI)) - return false; // Too hard to analyze. - - // Analyze all uses of the allocation. If any of them are used in a - // non-simple way (e.g. stored to another global) bail out. - std::vector<Function *> ReadersWriters; - if (AnalyzeUsesOfPointer(Ptr, ReadersWriters, ReadersWriters, GV)) - return false; // Loaded pointer escapes. - - // Remember that this allocation is related to the indirect global. - AllocRelatedValues.push_back(Ptr); - } else { - // Something complex, bail out. - return false; - } - } - - // Okay, this is an indirect global. Remember all of the allocations for - // this global in AllocsForIndirectGlobals. - while (!AllocRelatedValues.empty()) { - AllocsForIndirectGlobals[AllocRelatedValues.back()] = GV; - AllocRelatedValues.pop_back(); - } - IndirectGlobals.insert(GV); - return true; -} - -/// AnalyzeCallGraph - At this point, we know the functions where globals are -/// immediately stored to and read from. Propagate this information up the call -/// graph to all callers and compute the mod/ref info for all memory for each -/// function. -void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { - // We do a bottom-up SCC traversal of the call graph. In other words, we - // visit all callees before callers (leaf-first). - for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) { - const std::vector<CallGraphNode *> &SCC = *I; - assert(!SCC.empty() && "SCC with no functions?"); - - if (!SCC[0]->getFunction()) { - // Calls externally - can't say anything useful. Remove any existing - // function records (may have been created when scanning globals). - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - FunctionInfo.erase(SCC[i]->getFunction()); - continue; - } - - FunctionRecord &FR = FunctionInfo[SCC[0]->getFunction()]; - - bool KnowNothing = false; - unsigned FunctionEffect = 0; - - // Collect the mod/ref properties due to called functions. We only compute - // one mod-ref set. - for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) { - Function *F = SCC[i]->getFunction(); - if (!F) { - KnowNothing = true; - break; - } - - if (F->isDeclaration()) { - // Try to get mod/ref behaviour from function attributes. - if (F->doesNotAccessMemory()) { - // Can't do better than that! - } else if (F->onlyReadsMemory()) { - FunctionEffect |= Ref; - if (!F->isIntrinsic()) - // This function might call back into the module and read a global - - // consider every global as possibly being read by this function. - FR.MayReadAnyGlobal = true; - } else { - FunctionEffect |= ModRef; - // Can't say anything useful unless it's an intrinsic - they don't - // read or write global variables of the kind considered here. - KnowNothing = !F->isIntrinsic(); - } - continue; - } - - for (CallGraphNode::iterator CI = SCC[i]->begin(), E = SCC[i]->end(); - CI != E && !KnowNothing; ++CI) - if (Function *Callee = CI->second->getFunction()) { - if (FunctionRecord *CalleeFR = getFunctionInfo(Callee)) { - // Propagate function effect up. - FunctionEffect |= CalleeFR->FunctionEffect; - - // Incorporate callee's effects on globals into our info. - for (const auto &G : CalleeFR->GlobalInfo) - FR.GlobalInfo[G.first] |= G.second; - FR.MayReadAnyGlobal |= CalleeFR->MayReadAnyGlobal; - } else { - // Can't say anything about it. 
However, if it is inside our SCC, - // then nothing needs to be done. - CallGraphNode *CalleeNode = CG[Callee]; - if (std::find(SCC.begin(), SCC.end(), CalleeNode) == SCC.end()) - KnowNothing = true; - } - } else { - KnowNothing = true; - } - } - - // If we can't say anything useful about this SCC, remove all SCC functions - // from the FunctionInfo map. - if (KnowNothing) { - for (unsigned i = 0, e = SCC.size(); i != e; ++i) - FunctionInfo.erase(SCC[i]->getFunction()); - continue; - } - - // Scan the function bodies for explicit loads or stores. - for (auto *Node : SCC) { - if (FunctionEffect == ModRef) - break; // The mod/ref lattice saturates here. - for (Instruction &I : inst_range(Node->getFunction())) { - if (FunctionEffect == ModRef) - break; // The mod/ref lattice saturates here. - - // We handle calls specially because the graph-relevant aspects are - // handled above. - if (auto CS = CallSite(&I)) { - if (isAllocationFn(&I, TLI) || isFreeCall(&I, TLI)) { - // FIXME: It is completely unclear why this is necessary and not - // handled by the above graph code. - FunctionEffect |= ModRef; - } else if (Function *Callee = CS.getCalledFunction()) { - // The callgraph doesn't include intrinsic calls. - if (Callee->isIntrinsic()) { - ModRefBehavior Behaviour = - AliasAnalysis::getModRefBehavior(Callee); - FunctionEffect |= (Behaviour & ModRef); - } - } - continue; - } - - // All non-call instructions we use the primary predicates for whether - // thay read or write memory. - if (I.mayReadFromMemory()) - FunctionEffect |= Ref; - if (I.mayWriteToMemory()) - FunctionEffect |= Mod; - } - } - - if ((FunctionEffect & Mod) == 0) - ++NumReadMemFunctions; - if (FunctionEffect == 0) - ++NumNoMemFunctions; - FR.FunctionEffect = FunctionEffect; - - // Finally, now that we know the full effect on this SCC, clone the - // information to each function in the SCC. - for (unsigned i = 1, e = SCC.size(); i != e; ++i) - FunctionInfo[SCC[i]->getFunction()] = FR; - } -} - -/// alias - If one of the pointers is to a global that we are tracking, and the -/// other is some random pointer, we know there cannot be an alias, because the -/// address of the global isn't taken. -AliasResult GlobalsModRef::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - // Get the base object these pointers point to. - const Value *UV1 = GetUnderlyingObject(LocA.Ptr, *DL); - const Value *UV2 = GetUnderlyingObject(LocB.Ptr, *DL); - - // If either of the underlying values is a global, they may be non-addr-taken - // globals, which we can answer queries about. - const GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1); - const GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2); - if (GV1 || GV2) { - // If the global's address is taken, pretend we don't know it's a pointer to - // the global. - if (GV1 && !NonAddressTakenGlobals.count(GV1)) - GV1 = nullptr; - if (GV2 && !NonAddressTakenGlobals.count(GV2)) - GV2 = nullptr; - - // If the two pointers are derived from two different non-addr-taken - // globals, or if one is and the other isn't, we know these can't alias. - if ((GV1 || GV2) && GV1 != GV2) - return NoAlias; - - // Otherwise if they are both derived from the same addr-taken global, we - // can't know the two accesses don't overlap. - } - - // These pointers may be based on the memory owned by an indirect global. If - // so, we may be able to handle this. First check to see if the base pointer - // is a direct load from an indirect global. 
- GV1 = GV2 = nullptr; - if (const LoadInst *LI = dyn_cast<LoadInst>(UV1)) - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0))) - if (IndirectGlobals.count(GV)) - GV1 = GV; - if (const LoadInst *LI = dyn_cast<LoadInst>(UV2)) - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0))) - if (IndirectGlobals.count(GV)) - GV2 = GV; - - // These pointers may also be from an allocation for the indirect global. If - // so, also handle them. - if (AllocsForIndirectGlobals.count(UV1)) - GV1 = AllocsForIndirectGlobals[UV1]; - if (AllocsForIndirectGlobals.count(UV2)) - GV2 = AllocsForIndirectGlobals[UV2]; - - // Now that we know whether the two pointers are related to indirect globals, - // use this to disambiguate the pointers. If either pointer is based on an - // indirect global and if they are not both based on the same indirect global, - // they cannot alias. - if ((GV1 || GV2) && GV1 != GV2) - return NoAlias; - - return AliasAnalysis::alias(LocA, LocB); -} - -AliasAnalysis::ModRefResult -GlobalsModRef::getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc) { - unsigned Known = ModRef; - - // If we are asking for mod/ref info of a direct call with a pointer to a - // global we are tracking, return information if we have it. - const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); - if (const GlobalValue *GV = - dyn_cast<GlobalValue>(GetUnderlyingObject(Loc.Ptr, DL))) - if (GV->hasLocalLinkage()) - if (const Function *F = CS.getCalledFunction()) - if (NonAddressTakenGlobals.count(GV)) - if (const FunctionRecord *FR = getFunctionInfo(F)) - Known = FR->getInfoForGlobal(GV); - - if (Known == NoModRef) - return NoModRef; // No need to query other mod/ref analyses - return ModRefResult(Known & AliasAnalysis::getModRefInfo(CS, Loc)); -} - -//===----------------------------------------------------------------------===// -// Methods to update the analysis as a result of the client transformation. -// -void GlobalsModRef::deleteValue(Value *V) { - if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) { - if (NonAddressTakenGlobals.erase(GV)) { - // This global might be an indirect global. If so, remove it and remove - // any AllocRelatedValues for it. - if (IndirectGlobals.erase(GV)) { - // Remove any entries in AllocsForIndirectGlobals for this global. - for (std::map<const Value *, const GlobalValue *>::iterator - I = AllocsForIndirectGlobals.begin(), - E = AllocsForIndirectGlobals.end(); - I != E;) { - if (I->second == GV) { - AllocsForIndirectGlobals.erase(I++); - } else { - ++I; - } - } - } - } - } - - // Otherwise, if this is an allocation related to an indirect global, remove - // it. - AllocsForIndirectGlobals.erase(V); - - AliasAnalysis::deleteValue(V); -} - -void GlobalsModRef::addEscapingUse(Use &U) { - // For the purposes of this analysis, it is conservatively correct to treat - // a newly escaping value equivalently to a deleted one. We could perhaps - // be more precise by processing the new use and attempting to update our - // saved analysis results to accommodate it. 
- deleteValue(U); - - AliasAnalysis::addEscapingUse(U); -} diff --git a/contrib/llvm/lib/Analysis/IPA/IPA.cpp b/contrib/llvm/lib/Analysis/IPA/IPA.cpp deleted file mode 100644 index 806bfb8..0000000 --- a/contrib/llvm/lib/Analysis/IPA/IPA.cpp +++ /dev/null @@ -1,30 +0,0 @@ -//===-- IPA.cpp -----------------------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the common initialization routines for the IPA library. -// -//===----------------------------------------------------------------------===// - -#include "llvm/InitializePasses.h" -#include "llvm-c/Initialization.h" -#include "llvm/PassRegistry.h" - -using namespace llvm; - -/// initializeIPA - Initialize all passes linked into the IPA library. -void llvm::initializeIPA(PassRegistry &Registry) { - initializeCallGraphWrapperPassPass(Registry); - initializeCallGraphPrinterPass(Registry); - initializeCallGraphViewerPass(Registry); - initializeGlobalsModRefPass(Registry); -} - -void LLVMInitializeIPA(LLVMPassRegistryRef R) { - initializeIPA(*unwrap(R)); -} diff --git a/contrib/llvm/lib/Analysis/IVUsers.cpp b/contrib/llvm/lib/Analysis/IVUsers.cpp index 926787d..e0c5d8f 100644 --- a/contrib/llvm/lib/Analysis/IVUsers.cpp +++ b/contrib/llvm/lib/Analysis/IVUsers.cpp @@ -39,7 +39,7 @@ INITIALIZE_PASS_BEGIN(IVUsers, "iv-users", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(IVUsers, "iv-users", "Induction Variable Users", false, true) @@ -255,7 +255,7 @@ void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.setPreservesAll(); } @@ -266,7 +266,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) { *L->getHeader()->getParent()); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Collect ephemeral values so that AddUsersIfInteresting skips them. EphValues.clear(); @@ -276,7 +276,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) { // them by stride. Start by finding all of the PHI nodes in the header for // this loop. If they are induction variables, inspect their uses. for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) - (void)AddUsersIfInteresting(I); + (void)AddUsersIfInteresting(&*I); return false; } diff --git a/contrib/llvm/lib/Analysis/IPA/InlineCost.cpp b/contrib/llvm/lib/Analysis/InlineCost.cpp index c0d2e37..a86a703 100644 --- a/contrib/llvm/lib/Analysis/IPA/InlineCost.cpp +++ b/contrib/llvm/lib/Analysis/InlineCost.cpp @@ -115,11 +115,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { /// inlining has the given attribute set either at the call site or the /// function declaration. Primarily used to inspect call site specific /// attributes since these can be more precise than the ones on the callee - /// itself. 
+ /// itself. bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); /// Return true if the given value is known non null within the callee if - /// inlined through this particular callsite. + /// inlined through this particular callsite. bool isKnownNonNullInCallee(Value *V); // Custom analysis routines. @@ -156,6 +156,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { bool visitSwitchInst(SwitchInst &SI); bool visitIndirectBrInst(IndirectBrInst &IBI); bool visitResumeInst(ResumeInst &RI); + bool visitCleanupReturnInst(CleanupReturnInst &RI); + bool visitCatchReturnInst(CatchReturnInst &RI); bool visitUnreachableInst(UnreachableInst &I); public: @@ -832,8 +834,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the - // bonus we want to apply, but don't go below zero. - Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost()); + // threshold to get the bonus we want to apply, but don't go below zero. + Cost -= std::max(0, CA.getThreshold() - CA.getCost()); } return Base::visitCallSite(CS); @@ -903,6 +905,18 @@ bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { return false; } +bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { + // FIXME: It's not clear that a single instruction is an accurate model for + // the inline cost of a cleanupret instruction. + return false; +} + +bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { + // FIXME: It's not clear that a single instruction is an accurate model for + // the inline cost of a catchret instruction. + return false; +} + bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { // FIXME: It might be reasonably to discount the cost of instructions leading // to unreachable as they have the lowest possible impact on both runtime and @@ -946,20 +960,21 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB, continue; // Skip ephemeral values. - if (EphValues.count(I)) + if (EphValues.count(&*I)) continue; ++NumInstructions; if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy()) ++NumVectorInstructions; - // If the instruction is floating point, and the target says this operation is - // expensive or the function has the "use-soft-float" attribute, this may - // eventually become a library call. Treat the cost as such. + // If the instruction is floating point, and the target says this operation + // is expensive or the function has the "use-soft-float" attribute, this may + // eventually become a library call. Treat the cost as such. if (I->getType()->isFloatingPointTy()) { bool hasSoftFloatAttr = false; - // If the function has the "use-soft-float" attribute, mark it as expensive. + // If the function has the "use-soft-float" attribute, mark it as + // expensive. if (F.hasFnAttribute("use-soft-float")) { Attribute Attr = F.getFnAttribute("use-soft-float"); StringRef Val = Attr.getValueAsString(); @@ -977,7 +992,7 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB, // all of the per-instruction logic. The visit tree returns true if we // consumed the instruction in any way, and false if the instruction's base // cost should count against inlining. 
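In visitCallSite above, the bonus for a profitably-inlinable indirect call is now derived from the analyzed callee's own threshold instead of a fixed constant. A numeric sketch of the arithmetic (all values invented for illustration):

#include <algorithm>
#include <cstdio>

// Credit the caller with the callee's unused budget, never going negative.
int applyIndirectCallBonus(int Cost, int CalleeThreshold, int CalleeCost) {
  return Cost - std::max(0, CalleeThreshold - CalleeCost);
}

int main() {
  std::printf("%d\n", applyIndirectCallBonus(100, 300, 50));  // 100 - 250 = -150
  std::printf("%d\n", applyIndirectCallBonus(100, 300, 400)); // no credit: 100
}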
- if (Base::visit(I)) + if (Base::visit(&*I)) ++NumInstructionsSimplified; else Cost += InlineConstants::InstrCost; @@ -1157,15 +1172,15 @@ bool CallAnalyzer::analyzeCall(CallSite CS) { FAI != FAE; ++FAI, ++CAI) { assert(CAI != CS.arg_end()); if (Constant *C = dyn_cast<Constant>(CAI)) - SimplifiedValues[FAI] = C; + SimplifiedValues[&*FAI] = C; Value *PtrArg = *CAI; if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) { - ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue()); + ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); // We can SROA any pointer arguments derived from alloca instructions. if (isa<AllocaInst>(PtrArg)) { - SROAArgValues[FAI] = PtrArg; + SROAArgValues[&*FAI] = PtrArg; SROAArgCosts[PtrArg] = 0; } } @@ -1281,7 +1296,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) { else if (NumVectorInstructions <= NumInstructions / 2) Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus); - return Cost < Threshold; + return Cost <= std::max(0, Threshold); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1304,36 +1319,6 @@ void CallAnalyzer::dump() { } #endif -INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", - true, true) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", - true, true) - -char InlineCostAnalysis::ID = 0; - -InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {} - -InlineCostAnalysis::~InlineCostAnalysis() {} - -void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - CallGraphSCCPass::getAnalysisUsage(AU); -} - -bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) { - TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); - ACT = &getAnalysis<AssumptionCacheTracker>(); - return false; -} - -InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) { - return getInlineCost(CS, CS.getCalledFunction(), Threshold); -} - /// \brief Test that two functions either have or have not the given attribute /// at the same time. template<typename AttrKind> @@ -1346,14 +1331,19 @@ static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) { static bool functionsHaveCompatibleAttributes(Function *Caller, Function *Callee, TargetTransformInfo &TTI) { - return TTI.hasCompatibleFunctionAttributes(Caller, Callee) && - attributeMatches(Caller, Callee, Attribute::SanitizeAddress) && - attributeMatches(Caller, Callee, Attribute::SanitizeMemory) && - attributeMatches(Caller, Callee, Attribute::SanitizeThread); + return TTI.areInlineCompatible(Caller, Callee) && + AttributeFuncs::areInlineCompatible(*Caller, *Callee); +} + +InlineCost llvm::getInlineCost(CallSite CS, int Threshold, + TargetTransformInfo &CalleeTTI, + AssumptionCacheTracker *ACT) { + return getInlineCost(CS, CS.getCalledFunction(), Threshold, CalleeTTI, ACT); } -InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, - int Threshold) { +InlineCost llvm::getInlineCost(CallSite CS, Function *Callee, int Threshold, + TargetTransformInfo &CalleeTTI, + AssumptionCacheTracker *ACT) { // Cannot inline indirect calls. 
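Two recurring details in the hunks above: iterator-valued expressions are rewritten as &*I, &*FAI and so on (dereference, then address-of), which yields a plain pointer once the implicit iterator-to-pointer conversion is no longer available (my reading of the diff, which does not state a rationale); and the final verdict becomes Cost <= std::max(0, Threshold). As I read the new comparison, the boundary is now inclusive and a threshold driven negative by penalties is clamped, so free calls still inline:

#include <algorithm>
#include <cstdio>

bool shouldInline(int Cost, int Threshold) {
  return Cost <= std::max(0, Threshold); // clamp, then inclusive compare
}

int main() {
  std::printf("%d\n", shouldInline(0, -5)); // 1: zero-cost call inlines anyway
  std::printf("%d\n", shouldInline(5, 5));  // 1: boundary is now inclusive
  std::printf("%d\n", shouldInline(6, 5));  // 0
}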
if (!Callee) return llvm::InlineCost::getNever(); @@ -1368,8 +1358,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, // Never inline functions with conflicting attributes (unless callee has // always-inline attribute). - if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, - TTIWP->getTTI(*Callee))) + if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI)) return llvm::InlineCost::getNever(); // Don't inline this call if the caller has the optnone attribute. @@ -1386,7 +1375,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(TTIWP->getTTI(*Callee), ACT, *Callee, Threshold, CS); + CallAnalyzer CA(CalleeTTI, ACT, *Callee, Threshold, CS); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); @@ -1400,7 +1389,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); } -bool InlineCostAnalysis::isInlineViable(Function &F) { +bool llvm::isInlineViable(Function &F) { bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Disallow inlining of functions which contain indirect branches or @@ -1408,9 +1397,8 @@ bool InlineCostAnalysis::isInlineViable(Function &F) { if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken()) return false; - for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; - ++II) { - CallSite CS(II); + for (auto &II : *BI) { + CallSite CS(&II); if (!CS) continue; diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp index a7f8f5c..6dfe625 100644 --- a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp @@ -70,7 +70,7 @@ static Value *SimplifyOrInst(Value *, Value *, const Query &, unsigned); static Value *SimplifyXorInst(Value *, Value *, const Query &, unsigned); static Value *SimplifyTruncInst(Value *, Type *, const Query &, unsigned); -/// getFalse - For a boolean type, or a vector of boolean type, return false, or +/// For a boolean type, or a vector of boolean type, return false, or /// a vector with every element false, as appropriate for the type. static Constant *getFalse(Type *Ty) { assert(Ty->getScalarType()->isIntegerTy(1) && @@ -78,7 +78,7 @@ static Constant *getFalse(Type *Ty) { return Constant::getNullValue(Ty); } -/// getTrue - For a boolean type, or a vector of boolean type, return true, or +/// For a boolean type, or a vector of boolean type, return true, or /// a vector with every element true, as appropriate for the type. static Constant *getTrue(Type *Ty) { assert(Ty->getScalarType()->isIntegerTy(1) && @@ -100,7 +100,7 @@ static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS, CRHS == LHS; } -/// ValueDominatesPHI - Does the given value dominate the specified phi node? +/// Does the given value dominate the specified phi node? 
static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { Instruction *I = dyn_cast<Instruction>(V); if (!I) @@ -122,7 +122,7 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { return DT->dominates(I, P); } - // Otherwise, if the instruction is in the entry block, and is not an invoke, + // Otherwise, if the instruction is in the entry block and is not an invoke, // then it obviously dominates all phi nodes. if (I->getParent() == &I->getParent()->getParent()->getEntryBlock() && !isa<InvokeInst>(I)) @@ -131,8 +131,8 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { return false; } -/// ExpandBinOp - Simplify "A op (B op' C)" by distributing op over op', turning -/// it into "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is +/// Simplify "A op (B op' C)" by distributing op over op', turning it into +/// "(A op B) op' (A op C)". Here "op" is given by Opcode and "op'" is /// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS. /// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)". /// Returns the simplified value, or null if no simplification was performed. @@ -193,8 +193,8 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, return nullptr; } -/// SimplifyAssociativeBinOp - Generic simplifications for associative binary -/// operations. Returns the simpler value, or null if none was found. +/// Generic simplifications for associative binary operations. +/// Returns the simpler value, or null if none was found. static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { Instruction::BinaryOps Opcode = (Instruction::BinaryOps)Opc; @@ -290,10 +290,10 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, return nullptr; } -/// ThreadBinOpOverSelect - In the case of a binary operation with a select -/// instruction as an operand, try to simplify the binop by seeing whether -/// evaluating it on both branches of the select results in the same value. -/// Returns the common value if so, otherwise returns null. +/// In the case of a binary operation with a select instruction as an operand, +/// try to simplify the binop by seeing whether evaluating it on both branches +/// of the select results in the same value. Returns the common value if so, +/// otherwise returns null. static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -362,10 +362,9 @@ static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, return nullptr; } -/// ThreadCmpOverSelect - In the case of a comparison with a select instruction, -/// try to simplify the comparison by seeing whether both branches of the select -/// result in the same value. Returns the common value if so, otherwise returns -/// null. +/// In the case of a comparison with a select instruction, try to simplify the +/// comparison by seeing whether both branches of the select result in the same +/// value. Returns the common value if so, otherwise returns null. 
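The ExpandBinOp comment rewritten above describes plain distribution. With op = mul and op' = add the two forms are equal for all integers, which is what licenses the rewrite; the real routine keeps the expanded form only when one of the new sub-expressions simplifies further, and returns null otherwise:

#include <cstdio>

int original(int A, int B, int C)    { return A * (B + C); }
int distributed(int A, int B, int C) { return (A * B) + (A * C); }

int main() {
  // "A op (B op' C)" == "(A op B) op' (A op C)" for mul over add.
  std::printf("%d %d\n", original(3, 4, 5), distributed(3, 4, 5)); // 27 27
}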
static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { @@ -444,10 +443,10 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, return nullptr; } -/// ThreadBinOpOverPHI - In the case of a binary operation with an operand that -/// is a PHI instruction, try to simplify the binop by seeing whether evaluating -/// it on the incoming phi values yields the same result for every value. If so -/// returns the common value, otherwise returns null. +/// In the case of a binary operation with an operand that is a PHI instruction, +/// try to simplify the binop by seeing whether evaluating it on the incoming +/// phi values yields the same result for every value. If so returns the common +/// value, otherwise returns null. static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -486,10 +485,10 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, return CommonValue; } -/// ThreadCmpOverPHI - In the case of a comparison with a PHI instruction, try -/// try to simplify the comparison by seeing whether comparing with all of the -/// incoming phi values yields the same result every time. If so returns the -/// common result, otherwise returns null. +/// In the case of a comparison with a PHI instruction, try to simplify the +/// comparison by seeing whether comparing with all of the incoming phi values +/// yields the same result every time. If so returns the common result, +/// otherwise returns null. static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. @@ -524,8 +523,8 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, return CommonValue; } -/// SimplifyAddInst - Given operands for an Add, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Add, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -656,8 +655,8 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, return ConstantExpr::getSub(LHSOffset, RHSOffset); } -/// SimplifySubInst - Given operands for a Sub, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a Sub, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) @@ -889,8 +888,8 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, return nullptr; } -/// SimplifyMulInst - Given operands for a Mul, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a Mul, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -989,8 +988,8 @@ Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyDiv - Given operands for an SDiv or UDiv, see if we can -/// fold the result. If not, this returns null. 
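ThreadBinOpOverSelect/PHI and ThreadCmpOverSelect/PHI, whose doc comments are cleaned up above, all share one idea: evaluate the operation against each incoming (or selected) value and simplify only when every path agrees. A C++17 sketch with stand-in types:

#include <functional>
#include <optional>
#include <vector>

std::optional<int> threadOverPhi(const std::vector<int> &Incoming,
                                 const std::function<int(int)> &Op) {
  std::optional<int> Common;
  for (int V : Incoming) {
    int R = Op(V); // evaluate the binop/compare on this incoming value
    if (Common && *Common != R)
      return std::nullopt; // paths disagree: no simplification
    Common = R;
  }
  return Common; // every path produced the same result
}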
+/// Given operands for an SDiv or UDiv, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { @@ -1075,8 +1074,8 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, return nullptr; } -/// SimplifySDivInst - Given operands for an SDiv, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an SDiv, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse)) @@ -1093,8 +1092,8 @@ Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyUDivInst - Given operands for a UDiv, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a UDiv, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse)) @@ -1154,8 +1153,8 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, RecursionLimit); } -/// SimplifyRem - Given operands for an SRem or URem, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an SRem or URem, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { @@ -1215,8 +1214,8 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, return nullptr; } -/// SimplifySRemInst - Given operands for an SRem, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an SRem, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse)) @@ -1233,8 +1232,8 @@ Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyURemInst - Given operands for a URem, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a URem, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse)) @@ -1279,7 +1278,7 @@ Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, RecursionLimit); } -/// isUndefShift - Returns true if a shift by \c Amount always yields undef. +/// Returns true if a shift by \c Amount always yields undef. static bool isUndefShift(Value *Amount) { Constant *C = dyn_cast<Constant>(Amount); if (!C) @@ -1306,8 +1305,8 @@ static bool isUndefShift(Value *Amount) { return false; } -/// SimplifyShift - Given operands for an Shl, LShr or AShr, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Shl, LShr or AShr, see if we can fold the result. +/// If not, this returns null. 
static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { @@ -1375,8 +1374,8 @@ static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1, return nullptr; } -/// SimplifyShlInst - Given operands for an Shl, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Shl, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse)) @@ -1402,8 +1401,8 @@ Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, RecursionLimit); } -/// SimplifyLShrInst - Given operands for an LShr, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an LShr, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, @@ -1427,8 +1426,8 @@ Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, RecursionLimit); } -/// SimplifyAShrInst - Given operands for an AShr, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an AShr, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const Query &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, @@ -1502,8 +1501,8 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, return nullptr; } -// Simplify (and (icmp ...) (icmp ...)) to true when we can tell that the range -// of possible values cannot be satisfied. +/// Simplify (and (icmp ...) (icmp ...)) to true when we can tell that the range +/// of possible values cannot be satisfied. static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; ConstantInt *CI1, *CI2; @@ -1554,8 +1553,8 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -/// SimplifyAndInst - Given operands for an And, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an And, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -1661,8 +1660,8 @@ Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -// Simplify (or (icmp ...) (icmp ...)) to true when we can tell that the union -// contains all possible values. +/// Simplify (or (icmp ...) (icmp ...)) to true when we can tell that the union +/// contains all possible values. static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { ICmpInst::Predicate Pred0, Pred1; ConstantInt *CI1, *CI2; @@ -1713,8 +1712,8 @@ static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -/// SimplifyOrInst - Given operands for an Or, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an Or, see if we can fold the result. +/// If not, this returns null. 
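SimplifyAndOfICmps, re-documented above, folds a conjunction of comparisons once the ranges they imply for the common operand cannot both hold. An interval sketch (half-open ranges, invented values):

#include <algorithm>
#include <cstdio>

struct Range { int Lo, Hi; }; // half-open [Lo, Hi)

bool conjunctionSatisfiable(Range A, Range B) {
  return std::max(A.Lo, B.Lo) < std::min(A.Hi, B.Hi); // do the ranges intersect?
}

int main() {
  // (x <u 4) && (x >=u 10): [0,4) and [10,20) are disjoint, provably false.
  std::printf("%d\n", conjunctionSatisfiable({0, 4}, {10, 20})); // 0
}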
static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -1849,8 +1848,8 @@ Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout &DL, RecursionLimit); } -/// SimplifyXorInst - Given operands for a Xor, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a Xor, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { @@ -1910,9 +1909,9 @@ static Type *GetCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } -/// ExtractEquivalentCondition - Rummage around inside V looking for something -/// equivalent to the comparison "LHS Pred RHS". Return such a value if found, -/// otherwise return null. Helper function for analyzing max/min idioms. +/// Rummage around inside V looking for something equivalent to the comparison +/// "LHS Pred RHS". Return such a value if found, otherwise return null. +/// Helper function for analyzing max/min idioms. static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast<SelectInst>(V); @@ -2090,8 +2089,7 @@ static Constant *computePointerICmp(const DataLayout &DL, // Is the set of underlying objects all noalias calls? auto IsNAC = [](SmallVectorImpl<Value *> &Objects) { - return std::all_of(Objects.begin(), Objects.end(), - [](Value *V){ return isNoAliasCall(V); }); + return std::all_of(Objects.begin(), Objects.end(), isNoAliasCall); }; // Is the set of underlying objects all things which must be disjoint from @@ -2101,21 +2099,17 @@ static Constant *computePointerICmp(const DataLayout &DL, // that might be resolve lazily to symbols in another dynamically-loaded // library (and, thus, could be malloc'ed by the implementation). auto IsAllocDisjoint = [](SmallVectorImpl<Value *> &Objects) { - return std::all_of(Objects.begin(), Objects.end(), - [](Value *V){ - if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) - return AI->getParent() && AI->getParent()->getParent() && - AI->isStaticAlloca(); - if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - return (GV->hasLocalLinkage() || - GV->hasHiddenVisibility() || - GV->hasProtectedVisibility() || - GV->hasUnnamedAddr()) && - !GV->isThreadLocal(); - if (const Argument *A = dyn_cast<Argument>(V)) - return A->hasByValAttr(); - return false; - }); + return std::all_of(Objects.begin(), Objects.end(), [](Value *V) { + if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) + return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) + return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || + GV->hasProtectedVisibility() || GV->hasUnnamedAddr()) && + !GV->isThreadLocal(); + if (const Argument *A = dyn_cast<Argument>(V)) + return A->hasByValAttr(); + return false; + }); }; if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) || @@ -2128,8 +2122,8 @@ static Constant *computePointerICmp(const DataLayout &DL, return nullptr; } -/// SimplifyICmpInst - Given operands for an ICmpInst, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an ICmpInst, see if we can fold the result. +/// If not, this returns null. 
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; @@ -2176,6 +2170,19 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // X >=u 1 -> X if (match(RHS, m_One())) return LHS; + if (isImpliedCondition(RHS, LHS, Q.DL)) + return getTrue(ITy); + break; + case ICmpInst::ICMP_SGE: + /// For signed comparison, the values for an i1 are 0 and -1 + /// respectively. This maps into a truth table of: + /// LHS | RHS | LHS >=s RHS | LHS implies RHS + /// 0 | 0 | 1 (0 >= 0) | 1 + /// 0 | 1 | 1 (0 >= -1) | 1 + /// 1 | 0 | 0 (-1 >= 0) | 0 + /// 1 | 1 | 1 (-1 >= -1) | 1 + if (isImpliedCondition(LHS, RHS, Q.DL)) + return getTrue(ITy); break; case ICmpInst::ICMP_SLT: // X <s 0 -> X @@ -2187,6 +2194,10 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (match(RHS, m_One())) return LHS; break; + case ICmpInst::ICMP_ULE: + if (isImpliedCondition(LHS, RHS, Q.DL)) + return getTrue(ITy); + break; } } @@ -2360,9 +2371,19 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } else if (match(LHS, m_And(m_Value(), m_ConstantInt(CI2)))) { // 'and x, CI2' produces [0, CI2]. Upper = CI2->getValue() + 1; + } else if (match(LHS, m_NUWAdd(m_Value(), m_ConstantInt(CI2)))) { + // 'add nuw x, CI2' produces [CI2, UINT_MAX]. + Lower = CI2->getValue(); } - if (Lower != Upper) { - ConstantRange LHS_CR = ConstantRange(Lower, Upper); + + ConstantRange LHS_CR = Lower != Upper ? ConstantRange(Lower, Upper) + : ConstantRange(Width, true); + + if (auto *I = dyn_cast<Instruction>(LHS)) + if (auto *Ranges = I->getMetadata(LLVMContext::MD_range)) + LHS_CR = LHS_CR.intersectWith(getConstantRangeFromMetadata(*Ranges)); + + if (!LHS_CR.isFullSet()) { if (RHS_CR.contains(LHS_CR)) return ConstantInt::getTrue(RHS->getContext()); if (RHS_CR.inverse().contains(LHS_CR)) @@ -2370,6 +2391,30 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } } + // If both operands have range metadata, use the metadata + // to simplify the comparison. + if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) { + auto RHS_Instr = dyn_cast<Instruction>(RHS); + auto LHS_Instr = dyn_cast<Instruction>(LHS); + + if (RHS_Instr->getMetadata(LLVMContext::MD_range) && + LHS_Instr->getMetadata(LLVMContext::MD_range)) { + auto RHS_CR = getConstantRangeFromMetadata( + *RHS_Instr->getMetadata(LLVMContext::MD_range)); + auto LHS_CR = getConstantRangeFromMetadata( + *LHS_Instr->getMetadata(LLVMContext::MD_range)); + + auto Satisfied_CR = ConstantRange::makeSatisfyingICmpRegion(Pred, RHS_CR); + if (Satisfied_CR.contains(LHS_CR)) + return ConstantInt::getTrue(RHS->getContext()); + + auto InversedSatisfied_CR = ConstantRange::makeSatisfyingICmpRegion( + CmpInst::getInversePredicate(Pred), RHS_CR); + if (InversedSatisfied_CR.contains(LHS_CR)) + return ConstantInt::getFalse(RHS->getContext()); + } + } + // Compare of cast, for example (zext X) != 0 -> X != 0 if (isa<CastInst>(LHS) && (isa<Constant>(RHS) || isa<CastInst>(RHS))) { Instruction *LI = cast<CastInst>(LHS); @@ -2529,6 +2574,14 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } } + // icmp eq|ne X, Y -> false|true if X != Y + if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && + isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT)) { + LLVMContext &Ctx = LHS->getType()->getContext(); + return Pred == ICmpInst::ICMP_NE ? 
+ ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx); + } + // Special logic for binary operators. BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS); BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS); @@ -3039,13 +3092,13 @@ Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, - Instruction *CxtI) { + const Instruction *CxtI) { return ::SimplifyICmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an FCmpInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const Query &Q, unsigned MaxRecurse) { @@ -3169,8 +3222,7 @@ Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is -/// replaced with RepOp. +/// See if V simplifies when its operand Op is replaced with RepOp. static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const Query &Q, unsigned MaxRecurse) { @@ -3253,8 +3305,8 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, return nullptr; } -/// SimplifySelectInst - Given operands for a SelectInst, see if we can fold -/// the result. If not, this returns null. +/// Given operands for a SelectInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, Value *FalseVal, const Query &Q, unsigned MaxRecurse) { @@ -3391,8 +3443,8 @@ Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can -/// fold the result. If not, this returns null. +/// Given operands for an GetElementPtrInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, const Query &Q, unsigned) { // The type of the GEP pointer operand. @@ -3484,8 +3536,8 @@ Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout &DL, Ops, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we -/// can fold the result. If not, this returns null. +/// Given operands for an InsertValueInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const Query &Q, unsigned) { @@ -3521,8 +3573,8 @@ Value *llvm::SimplifyInsertValueInst( RecursionLimit); } -/// SimplifyExtractValueInst - Given operands for an ExtractValueInst, see if we -/// can fold the result. If not, this returns null. +/// Given operands for an ExtractValueInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, const Query &, unsigned) { if (auto *CAgg = dyn_cast<Constant>(Agg)) @@ -3556,8 +3608,8 @@ Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs, RecursionLimit); } -/// SimplifyExtractElementInst - Given operands for an ExtractElementInst, see if we -/// can fold the result. If not, this returns null. 
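The range-metadata block added to SimplifyICmpInst above asks whether the region satisfying the predicate contains the entire LHS range (fold to true) or excludes it (fold to false). For icmp ult over non-wrapping ranges that degenerates to an endpoint test; a sketch with invented ranges:

#include <cstdio>

struct Range { long Lo, Hi; }; // half-open [Lo, Hi), non-wrapping

// 'x <u y' holds for every x in A and every y in B exactly when A ends no
// later than B begins (a simplification of the real satisfying-region logic).
bool ultAlwaysTrue(Range A, Range B) { return A.Hi <= B.Lo; }

int main() {
  std::printf("%d\n", ultAlwaysTrue({1, 5}, {10, 20}));  // 1: folds to true
  std::printf("%d\n", ultAlwaysTrue({1, 15}, {10, 20})); // 0: stays a compare
}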
+/// Given operands for an ExtractElementInst, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &, unsigned) { if (auto *CVec = dyn_cast<Constant>(Vec)) { @@ -3588,7 +3640,7 @@ Value *llvm::SimplifyExtractElementInst( RecursionLimit); } -/// SimplifyPHINode - See if we can fold the given phi. If not, returns null. +/// See if we can fold the given phi. If not, returns null. static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. @@ -3638,8 +3690,8 @@ Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout &DL, //=== Helper functions for higher up the class hierarchy. -/// SimplifyBinOp - Given operands for a BinaryOperator, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a BinaryOperator, see if we can fold the result. +/// If not, this returns null. static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { switch (Opcode) { @@ -3705,8 +3757,8 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, } } -/// SimplifyFPBinOp - Given operands for a BinaryOperator, see if we can -/// fold the result. If not, this returns null. +/// Given operands for a BinaryOperator, see if we can fold the result. +/// If not, this returns null. /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, @@ -3741,8 +3793,7 @@ Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, RecursionLimit); } -/// SimplifyCmpInst - Given operands for a CmpInst, see if we can -/// fold the result. +/// Given operands for a CmpInst, see if we can fold the result. static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) @@ -3880,8 +3931,8 @@ Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); } -/// SimplifyInstruction - See if we can compute a simplified version of this -/// instruction. If not, this returns null. +/// See if we can compute a simplified version of this instruction. +/// If not, this returns null. Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC) { @@ -4024,6 +4075,17 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL, break; } + // In general, it is possible for computeKnownBits to determine all bits in a + // value even when the operands are not all constants. + if (!Result && I->getType()->isIntegerTy()) { + unsigned BitWidth = I->getType()->getScalarSizeInBits(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT); + if ((KnownZero | KnownOne).isAllOnesValue()) + Result = ConstantInt::get(I->getContext(), KnownOne); + } + /// If called on unreachable code, the above logic may report that the /// instruction simplified to itself. Make life easier for users by /// detecting that case here, returning a safe value instead. 
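The computeKnownBits fallback added to SimplifyInstruction above folds an instruction whenever bit-tracking pins down every result bit, even though no operand is a constant; the folded value is exactly KnownOne. A bit-level sketch on a uint8_t stand-in:

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t KnownZero = 0xF0; // bits proven to be 0
  uint8_t KnownOne  = 0x0F; // bits proven to be 1
  if (uint8_t(KnownZero | KnownOne) == 0xFF)    // every bit is determined
    std::printf("folds to 0x%02X\n", KnownOne); // prints: folds to 0x0F
  return 0;
}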
diff --git a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp index c8d0410..0f0f31e 100644 --- a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp +++ b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp @@ -198,7 +198,8 @@ void LazyCallGraph::SCC::insertOutgoingEdge(Node &CallerN, Node &CalleeN) { assert(CalleeC.isDescendantOf(*this) && "Callee must be a descendant of the Caller."); - // The only change required is to add this SCC to the parent set of the callee. + // The only change required is to add this SCC to the parent set of the + // callee. CalleeC.ParentSCCs.insert(this); } @@ -454,8 +455,7 @@ void LazyCallGraph::SCC::internalDFS( } SmallVector<LazyCallGraph::SCC *, 1> -LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, - Node &CalleeN) { +LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, Node &CalleeN) { // First remove it from the node. CallerN.removeEdgeInternal(CalleeN.getFunction()); @@ -522,7 +522,7 @@ LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, // the leaf SCC list. if (!IsLeafSCC && !ResultSCCs.empty()) G->LeafSCCs.erase(std::remove(G->LeafSCCs.begin(), G->LeafSCCs.end(), this), - G->LeafSCCs.end()); + G->LeafSCCs.end()); // Return the new list of SCCs. return ResultSCCs; diff --git a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp index a6ae7f2..0d1d34e 100644 --- a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" @@ -64,10 +65,10 @@ class LVILatticeVal { enum LatticeValueTy { /// This Value has no known value yet. undefined, - + /// This Value has a specific constant value. constant, - + /// This Value is known to not have the specified value. notconstant, @@ -77,13 +78,13 @@ class LVILatticeVal { /// This value is not known to be constant, and we know that it has a value. overdefined }; - + /// Val: This stores the current lattice value along with the Constant* for /// the constant if this is a 'constant' or 'notconstant' value. LatticeValueTy Tag; Constant *Val; ConstantRange Range; - + public: LVILatticeVal() : Tag(undefined), Val(nullptr), Range(1, true) {} @@ -104,29 +105,34 @@ public: Res.markConstantRange(CR); return Res; } + static LVILatticeVal getOverdefined() { + LVILatticeVal Res; + Res.markOverdefined(); + return Res; + } bool isUndefined() const { return Tag == undefined; } bool isConstant() const { return Tag == constant; } bool isNotConstant() const { return Tag == notconstant; } bool isConstantRange() const { return Tag == constantrange; } bool isOverdefined() const { return Tag == overdefined; } - + Constant *getConstant() const { assert(isConstant() && "Cannot get the constant of a non-constant!"); return Val; } - + Constant *getNotConstant() const { assert(isNotConstant() && "Cannot get the constant of a non-notconstant!"); return Val; } - + ConstantRange getConstantRange() const { assert(isConstantRange() && "Cannot get the constant-range of a non-constant-range!"); return Range; } - + /// Return true if this is a change in status. bool markOverdefined() { if (isOverdefined()) @@ -150,7 +156,7 @@ public: Val = V; return true; } - + /// Return true if this is a change in status. 
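The five LVILatticeVal states listed above admit only a few transitions: undefined may be raised to any informative state, any state may collapse to overdefined, and nothing moves sideways. A compact model of what the mark* methods permit (my reading of the code, not LLVM's types):

enum class LatticeTy { Undefined, Constant, NotConstant, ConstantRange, Overdefined };

bool isLegalTransition(LatticeTy From, LatticeTy To) {
  if (From == To) return true;                   // re-marking the same state
  if (From == LatticeTy::Undefined) return true; // recording the first real fact
  return To == LatticeTy::Overdefined;           // giving up is always legal
}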
bool markNotConstant(Constant *V) { assert(V && "Marking constant with NULL"); @@ -168,27 +174,27 @@ public: Val = V; return true; } - + /// Return true if this is a change in status. bool markConstantRange(const ConstantRange NewR) { if (isConstantRange()) { if (NewR.isEmptySet()) return markOverdefined(); - + bool changed = Range != NewR; Range = NewR; return changed; } - + assert(isUndefined()); if (NewR.isEmptySet()) return markOverdefined(); - + Tag = constantrange; Range = NewR; return true; } - + /// Merge the specified lattice value into this one, updating this /// one and returning true if anything changed. bool mergeIn(const LVILatticeVal &RHS, const DataLayout &DL) { @@ -267,7 +273,7 @@ public: return markConstantRange(NewR); } }; - + } // end anonymous namespace. namespace llvm { @@ -295,9 +301,9 @@ raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) { namespace { /// A callback value handle updates the cache when values are erased. class LazyValueInfoCache; - struct LVIValueHandle : public CallbackVH { + struct LVIValueHandle final : public CallbackVH { LazyValueInfoCache *Parent; - + LVIValueHandle(Value *V, LazyValueInfoCache *P) : CallbackVH(V), Parent(P) { } @@ -308,24 +314,27 @@ namespace { }; } -namespace { +namespace { /// This is the cache kept by LazyValueInfo which /// maintains information about queries across the clients' queries. class LazyValueInfoCache { /// This is all of the cached block information for exactly one Value*. /// The entries are sorted by the BasicBlock* of the /// entries, allowing us to do a lookup with a binary search. - typedef std::map<AssertingVH<BasicBlock>, LVILatticeVal> ValueCacheEntryTy; + /// Over-defined lattice values are recorded in OverDefinedCache to reduce + /// memory overhead. + typedef SmallDenseMap<AssertingVH<BasicBlock>, LVILatticeVal, 4> + ValueCacheEntryTy; /// This is all of the cached information for all values, /// mapped from Value* to key information. std::map<LVIValueHandle, ValueCacheEntryTy> ValueCache; - + /// This tracks, on a per-block basis, the set of values that are - /// over-defined at the end of that block. This is required - /// for cache updating. - typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy; - DenseSet<OverDefinedPairTy> OverDefinedCache; + /// over-defined at the end of that block. + typedef DenseMap<AssertingVH<BasicBlock>, SmallPtrSet<Value *, 4>> + OverDefinedCacheTy; + OverDefinedCacheTy OverDefinedCache; /// Keep track of all blocks that we have ever seen, so we /// don't spend time removing unused blocks from our caches. @@ -357,9 +366,13 @@ namespace { void insertResult(Value *Val, BasicBlock *BB, const LVILatticeVal &Result) { SeenBlocks.insert(BB); - lookup(Val)[BB] = Result; + + // Insert over-defined values into their own cache to reduce memory + // overhead. 
if (Result.isOverdefined()) - OverDefinedCache.insert(std::make_pair(BB, Val)); + OverDefinedCache[BB].insert(Val); + else + lookup(Val)[BB] = Result; } LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB); @@ -382,11 +395,39 @@ namespace { Instruction *BBI); void solve(); - + ValueCacheEntryTy &lookup(Value *V) { return ValueCache[LVIValueHandle(V, this)]; } + bool isOverdefined(Value *V, BasicBlock *BB) const { + auto ODI = OverDefinedCache.find(BB); + + if (ODI == OverDefinedCache.end()) + return false; + + return ODI->second.count(V); + } + + bool hasCachedValueInfo(Value *V, BasicBlock *BB) { + if (isOverdefined(V, BB)) + return true; + + LVIValueHandle ValHandle(V, this); + auto I = ValueCache.find(ValHandle); + if (I == ValueCache.end()) + return false; + + return I->second.count(BB); + } + + LVILatticeVal getCachedValueInfo(Value *V, BasicBlock *BB) { + if (isOverdefined(V, BB)) + return LVILatticeVal::getOverdefined(); + + return lookup(V)[BB]; + } + public: /// This is the query interface to determine the lattice /// value for the specified Value* at the end of the specified block. @@ -402,15 +443,15 @@ namespace { /// value for the specified Value* that is true on the specified edge. LVILatticeVal getValueOnEdge(Value *V, BasicBlock *FromBB,BasicBlock *ToBB, Instruction *CxtI = nullptr); - + /// This is the update interface to inform the cache that an edge from /// PredBB to OldSucc has been threaded to be from PredBB to NewSucc. void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc); - + /// This is part of the update interface to inform the cache /// that a block has been deleted. void eraseBlock(BasicBlock *BB); - + /// clear - Empty the cache. void clear() { SeenBlocks.clear(); @@ -425,15 +466,17 @@ namespace { } // end anonymous namespace void LVIValueHandle::deleted() { - typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy; - - SmallVector<OverDefinedPairTy, 4> ToErase; - for (const OverDefinedPairTy &P : Parent->OverDefinedCache) - if (P.second == getValPtr()) - ToErase.push_back(P); - for (const OverDefinedPairTy &P : ToErase) - Parent->OverDefinedCache.erase(P); - + SmallVector<AssertingVH<BasicBlock>, 4> ToErase; + for (auto &I : Parent->OverDefinedCache) { + SmallPtrSetImpl<Value *> &ValueSet = I.second; + if (ValueSet.count(getValPtr())) + ValueSet.erase(getValPtr()); + if (ValueSet.empty()) + ToErase.push_back(I.first); + } + for (auto &BB : ToErase) + Parent->OverDefinedCache.erase(BB); + // This erasure deallocates *this, so it MUST happen after we're done // using any and all members of *this. Parent->ValueCache.erase(*this); @@ -446,15 +489,11 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) { return; SeenBlocks.erase(I); - SmallVector<OverDefinedPairTy, 4> ToErase; - for (const OverDefinedPairTy& P : OverDefinedCache) - if (P.first == BB) - ToErase.push_back(P); - for (const OverDefinedPairTy &P : ToErase) - OverDefinedCache.erase(P); + auto ODI = OverDefinedCache.find(BB); + if (ODI != OverDefinedCache.end()) + OverDefinedCache.erase(ODI); - for (std::map<LVIValueHandle, ValueCacheEntryTy>::iterator - I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I) + for (auto I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I) I->second.erase(BB); } @@ -466,7 +505,8 @@ void LazyValueInfoCache::solve() { if (solveBlockValue(e.second, e.first)) { // The work item was completely processed. 
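The cache restructuring above moves over-defined results out of the per-value map and into a per-block set: "over-defined" carries no payload, so a set of values is enough, and the lookup helpers consult it first. A structural sketch with standard containers (illustrative key types):

#include <map>
#include <set>
#include <string>

struct LVICacheModel {
  // value -> (block -> lattice value), kept only for informative results
  std::map<std::string, std::map<std::string, int>> ValueCache;
  // block -> values known to be over-defined at the end of that block
  std::map<std::string, std::set<std::string>> OverDefined;

  bool hasInfo(const std::string &V, const std::string &BB) const {
    auto OD = OverDefined.find(BB);
    if (OD != OverDefined.end() && OD->second.count(V))
      return true; // cached, but only as "over-defined"
    auto I = ValueCache.find(V);
    return I != ValueCache.end() && I->second.count(BB) != 0;
  }
};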
assert(BlockValueStack.top() == e && "Nothing should have been pushed!"); - assert(lookup(e.second).count(e.first) && "Result should be in cache!"); + assert(hasCachedValueInfo(e.second, e.first) && + "Result should be in cache!"); BlockValueStack.pop(); BlockValueSet.erase(e); @@ -482,11 +522,7 @@ bool LazyValueInfoCache::hasBlockValue(Value *Val, BasicBlock *BB) { if (isa<Constant>(Val)) return true; - LVIValueHandle ValHandle(Val, this); - std::map<LVIValueHandle, ValueCacheEntryTy>::iterator I = - ValueCache.find(ValHandle); - if (I == ValueCache.end()) return false; - return I->second.count(BB); + return hasCachedValueInfo(Val, BB); } LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) { @@ -495,17 +531,36 @@ LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) { return LVILatticeVal::get(VC); SeenBlocks.insert(BB); - return lookup(Val)[BB]; + return getCachedValueInfo(Val, BB); +} + +static LVILatticeVal getFromRangeMetadata(Instruction *BBI) { + switch (BBI->getOpcode()) { + default: break; + case Instruction::Load: + case Instruction::Call: + case Instruction::Invoke: + if (MDNode *Ranges = BBI->getMetadata(LLVMContext::MD_range)) + if (isa<IntegerType>(BBI->getType())) { + ConstantRange Result = getConstantRangeFromMetadata(*Ranges); + return LVILatticeVal::getRange(Result); + } + break; + }; + // Nothing known - Note that we do not want overdefined here. We may know + // something else about the value and not having range metadata shouldn't + // cause us to throw away those facts. + return LVILatticeVal(); } bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { if (isa<Constant>(Val)) return true; - if (lookup(Val).count(BB)) { + if (hasCachedValueInfo(Val, BB)) { // If we have a cached value, use that. DEBUG(dbgs() << " reuse BB '" << BB->getName() - << "' val=" << lookup(Val)[BB] << '\n'); + << "' val=" << getCachedValueInfo(Val, BB) << '\n'); // Since we're reusing a cached value, we don't need to update the // OverDefinedCache. The cache will have been properly updated whenever the @@ -516,7 +571,7 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { // Hold off inserting this value into the Cache in case we have to return // false and come back later. LVILatticeVal Res; - + Instruction *BBI = dyn_cast<Instruction>(Val); if (!BBI || BBI->getParent() != BB) { if (!solveBlockValueNonLocal(Res, Val, BB)) @@ -532,12 +587,18 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { return true; } - if (AllocaInst *AI = dyn_cast<AllocaInst>(BBI)) { - Res = LVILatticeVal::getNot(ConstantPointerNull::get(AI->getType())); + // If this value is a nonnull pointer, record it's range and bailout. + PointerType *PT = dyn_cast<PointerType>(BBI->getType()); + if (PT && isKnownNonNull(BBI)) { + Res = LVILatticeVal::getNot(ConstantPointerNull::get(PT)); insertResult(Val, BB, Res); return true; } + // If this is an instruction which supports range metadata, return the + // implied range. TODO: This should be an intersection, not a union. + Res.mergeIn(getFromRangeMetadata(BBI), DL); + // We can only analyze the definitions of certain classes of instructions // (integral binops and casts at the moment), so bail if this isn't one. 
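getFromRangeMetadata above deliberately answers "undefined" rather than "over-defined" when no metadata applies, so facts discovered later can still be merged in; only integer-typed loads, calls and invokes are eligible for !range metadata. A sketch of the dispatch (stand-in types; the returned bounds are invented):

#include <optional>

enum class Opcode { Load, Call, Invoke, Add, Store };
struct Range { long Lo, Hi; };

std::optional<Range> fromRangeMetadata(Opcode Op, bool HasRangeMD, bool IsInt) {
  switch (Op) {
  case Opcode::Load:
  case Opcode::Call:
  case Opcode::Invoke:
    if (HasRangeMD && IsInt)
      return Range{0, 256}; // stand-in for the decoded !range bounds
    break;
  default:
    break;
  }
  return std::nullopt; // nothing known; deliberately not "over-defined"
}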
LVILatticeVal Result; @@ -661,7 +722,7 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV, PointerType *PTy = cast<PointerType>(Val->getType()); Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy)); } - + BBLV = Result; return true; } @@ -674,7 +735,7 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV, BBLV = Result; return true; } - + bool LazyValueInfoCache::solveBlockValuePHINode(LVILatticeVal &BBLV, PHINode *PN, BasicBlock *BB) { LVILatticeVal Result; // Start Undefined. @@ -700,7 +761,7 @@ bool LazyValueInfoCache::solveBlockValuePHINode(LVILatticeVal &BBLV, if (Result.isOverdefined()) { DEBUG(dbgs() << " compute BB '" << BB->getName() << "' - overdefined because of pred.\n"); - + BBLV = Result; return true; } @@ -765,7 +826,7 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, BBLV.markOverdefined(); return true; } - + ConstantRange LHSRange = LHSVal.getConstantRange(); ConstantRange RHSRange(1); IntegerType *ResultTy = cast<IntegerType>(BBI->getType()); @@ -819,7 +880,7 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, case Instruction::Or: Result.markConstantRange(LHSRange.binaryOr(RHSRange)); break; - + // Unhandled instructions are overdefined. default: DEBUG(dbgs() << " compute BB '" << BB->getName() @@ -827,7 +888,7 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV, Result.markOverdefined(); break; } - + BBLV = Result; return true; } @@ -877,7 +938,7 @@ bool getValueFromFromCondition(Value *Val, ICmpInst *ICI, /// Val is not constrained on the edge. static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, BasicBlock *BBTo, LVILatticeVal &Result) { - // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we + // TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we // know that v != 0. if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) { // If this is a conditional branch and only one successor goes to BBTo, then @@ -887,7 +948,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, bool isTrueDest = BI->getSuccessor(0) == BBTo; assert(BI->getSuccessor(!isTrueDest) == BBTo && "BBTo isn't a successor of BBFrom"); - + // If V is the condition of the branch itself, then we know exactly what // it is. if (BI->getCondition() == Val) { @@ -895,7 +956,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, Type::getInt1Ty(Val->getContext()), isTrueDest)); return true; } - + // If the condition of the branch is an equality comparison, we may be // able to infer the value. 
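solveBlockValueConstantRange above maps each supported opcode onto the corresponding ConstantRange operation over the operand ranges. The shape of the computation for add, ignoring the wraparound cases that the real ConstantRange arithmetic handles:

#include <cstdio>

struct Range { long Lo, Hi; }; // inclusive bounds, non-wrapping for brevity

Range addRanges(Range A, Range B) { return {A.Lo + B.Lo, A.Hi + B.Hi}; }

int main() {
  Range X{1, 4}, Y{10, 19};
  Range R = addRanges(X, Y);
  std::printf("[%ld, %ld]\n", R.Lo, R.Hi); // [11, 23]
}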
if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) @@ -997,7 +1058,7 @@ LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB, Instruction *CxtI) { DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '" << BB->getName() << "'\n"); - + assert(BlockValueStack.empty() && BlockValueSet.empty()); pushBlockValue(std::make_pair(BB, V)); @@ -1014,6 +1075,8 @@ LVILatticeVal LazyValueInfoCache::getValueAt(Value *V, Instruction *CxtI) { << CxtI->getName() << "'\n"); LVILatticeVal Result; + if (auto *I = dyn_cast<Instruction>(V)) + Result = getFromRangeMetadata(I); mergeAssumeBlockValueConstantRange(V, Result, CxtI); DEBUG(dbgs() << " Result = " << Result << "\n"); @@ -1025,7 +1088,7 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '" << FromBB->getName() << "' to '" << ToBB->getName() << "'\n"); - + LVILatticeVal Result; if (!getEdgeValue(V, FromBB, ToBB, Result, CxtI)) { solve(); @@ -1040,24 +1103,24 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc) { - // When an edge in the graph has been threaded, values that we could not - // determine a value for before (i.e. were marked overdefined) may be possible - // to solve now. We do NOT try to proactively update these values. Instead, - // we clear their entries from the cache, and allow lazy updating to recompute - // them when needed. - + // When an edge in the graph has been threaded, values that we could not + // determine a value for before (i.e. were marked overdefined) may be + // possible to solve now. We do NOT try to proactively update these values. + // Instead, we clear their entries from the cache, and allow lazy updating to + // recompute them when needed. + // The updating process is fairly simple: we need to drop cached info // for all values that were marked overdefined in OldSucc, and for those same // values in any successor of OldSucc (except NewSucc) in which they were // also marked overdefined. std::vector<BasicBlock*> worklist; worklist.push_back(OldSucc); - - DenseSet<Value*> ClearSet; - for (OverDefinedPairTy &P : OverDefinedCache) - if (P.first == OldSucc) - ClearSet.insert(P.second); - + + auto I = OverDefinedCache.find(OldSucc); + if (I == OverDefinedCache.end()) + return; // Nothing to process here. + SmallVector<Value *, 4> ValsToClear(I->second.begin(), I->second.end()); + // Use a worklist to perform a depth-first search of OldSucc's successors. // NOTE: We do not need a visited list since any blocks we have already // visited will have had their overdefined markers cleared already, and we @@ -1065,32 +1128,31 @@ void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, while (!worklist.empty()) { BasicBlock *ToUpdate = worklist.back(); worklist.pop_back(); - + // Skip blocks only accessible through NewSucc. if (ToUpdate == NewSucc) continue; - + bool changed = false; - for (Value *V : ClearSet) { + for (Value *V : ValsToClear) { // If a value was marked overdefined in OldSucc, and is here too... - DenseSet<OverDefinedPairTy>::iterator OI = - OverDefinedCache.find(std::make_pair(ToUpdate, V)); - if (OI == OverDefinedCache.end()) continue; - - // Remove it from the caches. 
- ValueCacheEntryTy &Entry = ValueCache[LVIValueHandle(V, this)]; - ValueCacheEntryTy::iterator CI = Entry.find(ToUpdate); - - assert(CI != Entry.end() && "Couldn't find entry to update?"); - Entry.erase(CI); - OverDefinedCache.erase(OI); - - // If we removed anything, then we potentially need to update + auto OI = OverDefinedCache.find(ToUpdate); + if (OI == OverDefinedCache.end()) + continue; + SmallPtrSetImpl<Value *> &ValueSet = OI->second; + if (!ValueSet.count(V)) + continue; + + ValueSet.erase(V); + if (ValueSet.empty()) + OverDefinedCache.erase(OI); + + // If we removed anything, then we potentially need to update // blocks successors too. changed = true; } if (!changed) continue; - + worklist.insert(worklist.end(), succ_begin(ToUpdate), succ_end(ToUpdate)); } } @@ -1158,7 +1220,7 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB, } /// Determine whether the specified value is known to be a -/// constant on the specified edge. Return null if not. +/// constant on the specified edge. Return null if not. Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB, Instruction *CxtI) { @@ -1190,26 +1252,26 @@ static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, return ResCI->isZero() ? LazyValueInfo::False : LazyValueInfo::True; return LazyValueInfo::Unknown; } - + if (Result.isConstantRange()) { ConstantInt *CI = dyn_cast<ConstantInt>(C); if (!CI) return LazyValueInfo::Unknown; - + ConstantRange CR = Result.getConstantRange(); if (Pred == ICmpInst::ICMP_EQ) { if (!CR.contains(CI->getValue())) return LazyValueInfo::False; - + if (CR.isSingleElement() && CR.contains(CI->getValue())) return LazyValueInfo::True; } else if (Pred == ICmpInst::ICMP_NE) { if (!CR.contains(CI->getValue())) return LazyValueInfo::True; - + if (CR.isSingleElement() && CR.contains(CI->getValue())) return LazyValueInfo::False; } - + // Handle more complex predicates. ConstantRange TrueValues = ICmpInst::makeConstantRange((ICmpInst::Predicate)Pred, CI->getValue()); @@ -1219,7 +1281,7 @@ static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, return LazyValueInfo::False; return LazyValueInfo::Unknown; } - + if (Result.isNotConstant()) { // If this is an equality comparison, we can try to fold it knowing that // "V != C1". @@ -1240,7 +1302,7 @@ static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C, } return LazyValueInfo::Unknown; } - + return LazyValueInfo::Unknown; } @@ -1266,20 +1328,69 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C, if (Ret != Unknown) return Ret; - // TODO: Move this logic inside getValueAt so that it can be cached rather - // than re-queried on each call. This would also allow us to merge the - // underlying lattice values to get more information + // Note: The following bit of code is somewhat distinct from the rest of LVI; + // LVI as a whole tries to compute a lattice value which is conservatively + // correct at a given location. In this case, we have a predicate which we + // weren't able to prove about the merged result, and we're pushing that + // predicate back along each incoming edge to see if we can prove it + // separately for each input. As a motivating example, consider: + // bb1: + // %v1 = ... ; constantrange<1, 5> + // br label %merge + // bb2: + // %v2 = ... 
; constantrange<10, 20> + // br label %merge + // merge: + // %phi = phi [%v1, %v2] ; constantrange<1,20> + // %pred = icmp eq i32 %phi, 8 + // We can't tell from the lattice value for '%phi' that '%pred' is false + // along each path, but by checking the predicate over each input separately, + // we can. + // We limit the search to one step backwards from the current BB and value. + // We could consider extending this to search further backwards through the + // CFG and/or value graph, but there are non-obvious compile time vs quality + // tradeoffs. if (CxtI) { - // For a comparison where the V is outside this block, it's possible - // that we've branched on it before. Look to see if the value is known - // on all incoming edges. BasicBlock *BB = CxtI->getParent(); + + // Function entry or an unreachable block. Bail to avoid confusing + // analysis below. pred_iterator PI = pred_begin(BB), PE = pred_end(BB); - if (PI != PE && - (!isa<Instruction>(V) || - cast<Instruction>(V)->getParent() != BB)) { + if (PI == PE) + return Unknown; + + // If V is a PHI node in the same block as the context, we need to ask + // questions about the predicate as applied to the incoming value along + // each edge. This is useful for eliminating cases where the predicate is + // known along all incoming edges. + if (auto *PHI = dyn_cast<PHINode>(V)) + if (PHI->getParent() == BB) { + Tristate Baseline = Unknown; + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i < e; i++) { + Value *Incoming = PHI->getIncomingValue(i); + BasicBlock *PredBB = PHI->getIncomingBlock(i); + // Note that PredBB may be BB itself. + Tristate Result = getPredicateOnEdge(Pred, Incoming, C, PredBB, BB, + CxtI); + + // Keep going as long as we've seen a consistent known result for + // all inputs. + Baseline = (i == 0) ? Result /* First iteration */ + : (Baseline == Result ? Baseline : Unknown); /* All others */ + if (Baseline == Unknown) + break; + } + if (Baseline != Unknown) + return Baseline; + } + + // For a comparison where the V is outside this block, it's possible + // that we've branched on it before. Look to see if the value is known + // on all incoming edges. + if (!isa<Instruction>(V) || + cast<Instruction>(V)->getParent() != BB) { // For predecessor edge, determine if the comparison is true or false - // on that edge. If they're all true or all false, we can conclude + // on that edge. If they're all true or all false, we can conclude // the value of the comparison in this block. Tristate Baseline = getPredicateOnEdge(Pred, V, C, *PI, BB, CxtI); if (Baseline != Unknown) { diff --git a/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp deleted file mode 100644 index 991a0e3..0000000 --- a/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp +++ /dev/null @@ -1,141 +0,0 @@ -//===- LibCallAliasAnalysis.cpp - Implement AliasAnalysis for libcalls ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the LibCallAliasAnalysis class. 
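The per-edge predicate push-back added above boils down to a small consensus fold: take the first edge's answer as the baseline and collapse to Unknown on the first disagreement. A standalone sketch of just that fold (a stand-in Tristate, not the LazyValueInfo API):

  #include <initializer_list>
  #include <iostream>

  enum Tristate { False = 0, True = 1, Unknown = -1 };

  Tristate foldEdgeResults(std::initializer_list<Tristate> PerEdge) {
    bool First = true;
    Tristate Baseline = Unknown;
    for (Tristate R : PerEdge) {
      // Keep the answer only while every edge agrees with the first one.
      Baseline = First ? R : (Baseline == R ? Baseline : Unknown);
      First = false;
      if (Baseline == Unknown)
        break; // No point scanning further once the answers disagree.
    }
    return Baseline;
  }

  int main() {
    // Both incoming ranges exclude 8, so "phi == 8" folds to False.
    std::cout << foldEdgeResults({False, False}) << '\n'; // 0 (False)
    std::cout << foldEdgeResults({False, True}) << '\n';  // -1 (Unknown)
  }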
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/LibCallAliasAnalysis.h" -#include "llvm/Analysis/LibCallSemantics.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" -using namespace llvm; - -// Register this pass... -char LibCallAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(LibCallAliasAnalysis, AliasAnalysis, "libcall-aa", - "LibCall Alias Analysis", false, true, false) - -FunctionPass *llvm::createLibCallAliasAnalysisPass(LibCallInfo *LCI) { - return new LibCallAliasAnalysis(LCI); -} - -LibCallAliasAnalysis::~LibCallAliasAnalysis() { - delete LCI; -} - -void LibCallAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AliasAnalysis::getAnalysisUsage(AU); - AU.setPreservesAll(); // Does not transform code -} - -bool LibCallAliasAnalysis::runOnFunction(Function &F) { - // set up super class - InitializeAliasAnalysis(this, &F.getParent()->getDataLayout()); - return false; -} - -/// AnalyzeLibCallDetails - Given a call to a function with the specified -/// LibCallFunctionInfo, see if we can improve the mod/ref footprint of the call -/// vs the specified pointer/size. -AliasAnalysis::ModRefResult -LibCallAliasAnalysis::AnalyzeLibCallDetails(const LibCallFunctionInfo *FI, - ImmutableCallSite CS, - const MemoryLocation &Loc) { - // If we have a function, check to see what kind of mod/ref effects it - // has. Start by including any info globally known about the function. - AliasAnalysis::ModRefResult MRInfo = FI->UniversalBehavior; - if (MRInfo == NoModRef) return MRInfo; - - // If that didn't tell us that the function is 'readnone', check to see - // if we have detailed info and if 'P' is any of the locations we know - // about. - const LibCallFunctionInfo::LocationMRInfo *Details = FI->LocationDetails; - if (Details == nullptr) - return MRInfo; - - // If the details array is of the 'DoesNot' kind, we only know something if - // the pointer is a match for one of the locations in 'Details'. If we find a - // match, we can prove some interactions cannot happen. - // - if (FI->DetailsType == LibCallFunctionInfo::DoesNot) { - // Find out if the pointer refers to a known location. - for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) { - const LibCallLocationInfo &LocInfo = - LCI->getLocationInfo(Details[i].LocationID); - LibCallLocationInfo::LocResult Res = LocInfo.isLocation(CS, Loc); - if (Res != LibCallLocationInfo::Yes) continue; - - // If we find a match against a location that we 'do not' interact with, - // learn this info into MRInfo. - return ModRefResult(MRInfo & ~Details[i].MRInfo); - } - return MRInfo; - } - - // If the details are of the 'DoesOnly' sort, we know something if the pointer - // is a match for one of the locations in 'Details'. Also, if we can prove - // that the pointers is *not* one of the locations in 'Details', we know that - // the call is NoModRef. - assert(FI->DetailsType == LibCallFunctionInfo::DoesOnly); - - // Find out if the pointer refers to a known location. - bool NoneMatch = true; - for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) { - const LibCallLocationInfo &LocInfo = - LCI->getLocationInfo(Details[i].LocationID); - LibCallLocationInfo::LocResult Res = LocInfo.isLocation(CS, Loc); - if (Res == LibCallLocationInfo::No) continue; - - // If we don't know if this pointer points to the location, then we have to - // assume it might alias in some case. 
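For context on the file being deleted here: its mod/ref merging walks a two-bit lattice, starting at ModRef (top) and clearing bits as each "DoesNot" location rule proves the call cannot interact that way, with NoModRef (bottom) allowing an early exit. A minimal sketch with stand-in enumerators, not the deleted class itself:

  #include <iostream>

  enum ModRefBits { NoModRef = 0, Ref = 1, Mod = 2, ModRef = Ref | Mod };

  // A matched "DoesNot" rule removes the behaviors it rules out.
  int applyDoesNotRule(int Current, int RuleBits) { return Current & ~RuleBits; }

  int main() {
    int MR = ModRef;
    MR = applyDoesNotRule(MR, Mod);        // proved: no write to this location
    std::cout << MR << '\n';               // 1, only Ref remains
    MR = applyDoesNotRule(MR, Ref);        // proved: no read either
    std::cout << (MR == NoModRef) << '\n'; // 1, lattice bottom reached
  }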
- if (Res == LibCallLocationInfo::Unknown) { - NoneMatch = false; - continue; - } - - // If we know that this pointer definitely is pointing into the location, - // merge in this information. - return ModRefResult(MRInfo & Details[i].MRInfo); - } - - // If we found that the pointer is guaranteed to not match any of the - // locations in our 'DoesOnly' rule, then we know that the pointer must point - // to some other location. Since the libcall doesn't mod/ref any other - // locations, return NoModRef. - if (NoneMatch) - return NoModRef; - - // Otherwise, return any other info gained so far. - return MRInfo; -} - -// getModRefInfo - Check to see if the specified callsite can clobber the -// specified memory object. -// -AliasAnalysis::ModRefResult -LibCallAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { - ModRefResult MRInfo = ModRef; - - // If this is a direct call to a function that LCI knows about, get the - // information about the runtime function. - if (LCI) { - if (const Function *F = CS.getCalledFunction()) { - if (const LibCallFunctionInfo *FI = LCI->getFunctionInfo(F)) { - MRInfo = ModRefResult(MRInfo & AnalyzeLibCallDetails(FI, CS, Loc)); - if (MRInfo == NoModRef) return NoModRef; - } - } - } - - // The AliasAnalysis base class has some smarts, lets use them. - return (ModRefResult)(MRInfo | AliasAnalysis::getModRefInfo(CS, Loc)); -} diff --git a/contrib/llvm/lib/Analysis/LibCallSemantics.cpp b/contrib/llvm/lib/Analysis/LibCallSemantics.cpp deleted file mode 100644 index 003c81e..0000000 --- a/contrib/llvm/lib/Analysis/LibCallSemantics.cpp +++ /dev/null @@ -1,89 +0,0 @@ -//===- LibCallSemantics.cpp - Describe library semantics ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements interfaces that can be used to describe language -// specific runtime library interfaces (e.g. libc, libm, etc) to LLVM -// optimizers. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/LibCallSemantics.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/IR/Function.h" -using namespace llvm; - -/// This impl pointer in ~LibCallInfo is actually a StringMap. This -/// helper does the cast. -static StringMap<const LibCallFunctionInfo*> *getMap(void *Ptr) { - return static_cast<StringMap<const LibCallFunctionInfo*> *>(Ptr); -} - -LibCallInfo::~LibCallInfo() { - delete getMap(Impl); -} - -const LibCallLocationInfo &LibCallInfo::getLocationInfo(unsigned LocID) const { - // Get location info on the first call. - if (NumLocations == 0) - NumLocations = getLocationInfo(Locations); - - assert(LocID < NumLocations && "Invalid location ID!"); - return Locations[LocID]; -} - - -/// Return the LibCallFunctionInfo object corresponding to -/// the specified function if we have it. If not, return null. -const LibCallFunctionInfo * -LibCallInfo::getFunctionInfo(const Function *F) const { - StringMap<const LibCallFunctionInfo*> *Map = getMap(Impl); - - /// If this is the first time we are querying for this info, lazily construct - /// the StringMap to index it. 
- if (!Map) { - Impl = Map = new StringMap<const LibCallFunctionInfo*>(); - - const LibCallFunctionInfo *Array = getFunctionInfoArray(); - if (!Array) return nullptr; - - // We now have the array of entries. Populate the StringMap. - for (unsigned i = 0; Array[i].Name; ++i) - (*Map)[Array[i].Name] = Array+i; - } - - // Look up this function in the string map. - return Map->lookup(F->getName()); -} - -/// See if the given exception handling personality function is one that we -/// understand. If so, return a description of it; otherwise return Unknown. -EHPersonality llvm::classifyEHPersonality(const Value *Pers) { - const Function *F = dyn_cast<Function>(Pers->stripPointerCasts()); - if (!F) - return EHPersonality::Unknown; - return StringSwitch<EHPersonality>(F->getName()) - .Case("__gnat_eh_personality", EHPersonality::GNU_Ada) - .Case("__gxx_personality_v0", EHPersonality::GNU_CXX) - .Case("__gcc_personality_v0", EHPersonality::GNU_C) - .Case("__objc_personality_v0", EHPersonality::GNU_ObjC) - .Case("_except_handler3", EHPersonality::MSVC_X86SEH) - .Case("_except_handler4", EHPersonality::MSVC_X86SEH) - .Case("__C_specific_handler", EHPersonality::MSVC_Win64SEH) - .Case("__CxxFrameHandler3", EHPersonality::MSVC_CXX) - .Default(EHPersonality::Unknown); -} - -bool llvm::canSimplifyInvokeNoUnwind(const Function *F) { - EHPersonality Personality = classifyEHPersonality(F->getPersonalityFn()); - // We can't simplify any invokes to nounwind functions if the personality - // function wants to catch asynch exceptions. The nounwind attribute only - // implies that the function does not throw synchronous exceptions. - return !isAsynchronousEHPersonality(Personality); -} diff --git a/contrib/llvm/lib/Analysis/Lint.cpp b/contrib/llvm/lib/Analysis/Lint.cpp index 0b9308a..2dfb09c 100644 --- a/contrib/llvm/lib/Analysis/Lint.cpp +++ b/contrib/llvm/lib/Analysis/Lint.cpp @@ -49,6 +49,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LegacyPassManager.h" @@ -98,12 +99,13 @@ namespace { void visitInsertElementInst(InsertElementInst &I); void visitUnreachableInst(UnreachableInst &I); - Value *findValue(Value *V, const DataLayout &DL, bool OffsetOk) const; - Value *findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, + Value *findValue(Value *V, bool OffsetOk) const; + Value *findValueImpl(Value *V, bool OffsetOk, SmallPtrSetImpl<Value *> &Visited) const; public: Module *Mod; + const DataLayout *DL; AliasAnalysis *AA; AssumptionCache *AC; DominatorTree *DT; @@ -121,7 +123,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); @@ -165,7 +167,7 @@ INITIALIZE_PASS_BEGIN(Lint, "lint", "Statically lint-checks LLVM IR", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(Lint, "lint", "Statically lint-checks LLVM IR", false, true) @@ -178,7 +180,8 @@ INITIALIZE_PASS_END(Lint, "lint", "Statically lint-checks LLVM IR", // bool Lint::runOnFunction(Function &F) { Mod = F.getParent(); - 
AA = &getAnalysis<AliasAnalysis>(); + DL = &F.getParent()->getDataLayout(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -200,12 +203,11 @@ void Lint::visitFunction(Function &F) { void Lint::visitCallSite(CallSite CS) { Instruction &I = *CS.getInstruction(); Value *Callee = CS.getCalledValue(); - const DataLayout &DL = CS->getModule()->getDataLayout(); visitMemoryReference(I, Callee, MemoryLocation::UnknownSize, 0, nullptr, MemRef::Callee); - if (Function *F = dyn_cast<Function>(findValue(Callee, DL, + if (Function *F = dyn_cast<Function>(findValue(Callee, /*OffsetOk=*/false))) { Assert(CS.getCallingConv() == F->getCallingConv(), "Undefined behavior: Caller and callee calling convention differ", @@ -232,7 +234,7 @@ void Lint::visitCallSite(CallSite CS) { for (; AI != AE; ++AI) { Value *Actual = *AI; if (PI != PE) { - Argument *Formal = PI++; + Argument *Formal = &*PI++; Assert(Formal->getType() == Actual->getType(), "Undefined behavior: Call argument type mismatches " "callee parameter type", @@ -253,8 +255,8 @@ void Lint::visitCallSite(CallSite CS) { if (Formal->hasStructRetAttr() && Actual->getType()->isPointerTy()) { Type *Ty = cast<PointerType>(Formal->getType())->getElementType(); - visitMemoryReference(I, Actual, AA->getTypeStoreSize(Ty), - DL.getABITypeAlignment(Ty), Ty, + visitMemoryReference(I, Actual, DL->getTypeStoreSize(Ty), + DL->getABITypeAlignment(Ty), Ty, MemRef::Read | MemRef::Write); } } @@ -264,7 +266,7 @@ void Lint::visitCallSite(CallSite CS) { if (CS.isCall() && cast<CallInst>(CS.getInstruction())->isTailCall()) for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) { - Value *Obj = findValue(*AI, DL, /*OffsetOk=*/true); + Value *Obj = findValue(*AI, /*OffsetOk=*/true); Assert(!isa<AllocaInst>(Obj), "Undefined behavior: Call with \"tail\" keyword references " "alloca", @@ -291,7 +293,7 @@ void Lint::visitCallSite(CallSite CS) { // overlap is not distinguished from the case where nothing is known. 
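Note the guard that follows: the memcpy length only participates in the overlap check when it is a compile-time constant that fits in 32 bits; anything else leaves the size at zero and the check learns nothing. A small sketch of that guard (hypothetical helper, not the Lint API):

  #include <cstdint>
  #include <iostream>
  #include <optional>

  // Returns a usable access size, or 0 when the length is unknown or too wide.
  std::uint64_t checkableSize(std::optional<std::uint64_t> ConstLen) {
    if (ConstLen && *ConstLen <= UINT32_MAX)
      return *ConstLen;
    return 0;
  }

  int main() {
    std::cout << checkableSize(4096) << '\n';         // 4096
    std::cout << checkableSize(std::nullopt) << '\n'; // 0 (not a constant)
    std::cout << checkableSize(1ull << 40) << '\n';   // 0 (over 32 bits)
  }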
uint64_t Size = 0; if (const ConstantInt *Len = - dyn_cast<ConstantInt>(findValue(MCI->getLength(), DL, + dyn_cast<ConstantInt>(findValue(MCI->getLength(), /*OffsetOk=*/false))) if (Len->getValue().isIntN(32)) Size = Len->getValue().getZExtValue(); @@ -343,13 +345,6 @@ void Lint::visitCallSite(CallSite CS) { visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0, nullptr, MemRef::Read | MemRef::Write); break; - - case Intrinsic::eh_begincatch: - visitEHBeginCatch(II); - break; - case Intrinsic::eh_endcatch: - visitEHEndCatch(II); - break; } } @@ -367,8 +362,7 @@ void Lint::visitReturnInst(ReturnInst &I) { "Unusual: Return statement in function with noreturn attribute", &I); if (Value *V = I.getReturnValue()) { - Value *Obj = - findValue(V, F->getParent()->getDataLayout(), /*OffsetOk=*/true); + Value *Obj = findValue(V, /*OffsetOk=*/true); Assert(!isa<AllocaInst>(Obj), "Unusual: Returning alloca value", &I); } } @@ -383,8 +377,7 @@ void Lint::visitMemoryReference(Instruction &I, if (Size == 0) return; - Value *UnderlyingObject = - findValue(Ptr, I.getModule()->getDataLayout(), /*OffsetOk=*/true); + Value *UnderlyingObject = findValue(Ptr, /*OffsetOk=*/true); Assert(!isa<ConstantPointerNull>(UnderlyingObject), "Undefined behavior: Null pointer dereference", &I); Assert(!isa<UndefValue>(UnderlyingObject), @@ -423,9 +416,8 @@ void Lint::visitMemoryReference(Instruction &I, // Check for buffer overflows and misalignment. // Only handles memory references that read/write something simple like an // alloca instruction or a global variable. - auto &DL = I.getModule()->getDataLayout(); int64_t Offset = 0; - if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, DL)) { + if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, *DL)) { // OK, so the access is to a constant offset from Ptr. Check that Ptr is // something we can handle and if so extract the size of this base object // along with its alignment. @@ -435,20 +427,20 @@ void Lint::visitMemoryReference(Instruction &I, if (AllocaInst *AI = dyn_cast<AllocaInst>(Base)) { Type *ATy = AI->getAllocatedType(); if (!AI->isArrayAllocation() && ATy->isSized()) - BaseSize = DL.getTypeAllocSize(ATy); + BaseSize = DL->getTypeAllocSize(ATy); BaseAlign = AI->getAlignment(); if (BaseAlign == 0 && ATy->isSized()) - BaseAlign = DL.getABITypeAlignment(ATy); + BaseAlign = DL->getABITypeAlignment(ATy); } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) { // If the global may be defined differently in another compilation unit // then don't warn about funky memory accesses. if (GV->hasDefinitiveInitializer()) { Type *GTy = GV->getType()->getElementType(); if (GTy->isSized()) - BaseSize = DL.getTypeAllocSize(GTy); + BaseSize = DL->getTypeAllocSize(GTy); BaseAlign = GV->getAlignment(); if (BaseAlign == 0 && GTy->isSized()) - BaseAlign = DL.getABITypeAlignment(GTy); + BaseAlign = DL->getABITypeAlignment(GTy); } } @@ -462,7 +454,7 @@ void Lint::visitMemoryReference(Instruction &I, // Accesses that say that the memory is more aligned than it is are not // defined. 
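The rule stated just above, and asserted in the code that follows, comes from arithmetic: an access at Base+Offset can be aligned at most to the largest power of two dividing both the base alignment and the offset, which is what a MinAlign-style helper computes. A standalone sketch, assuming power-of-two base alignments:

  #include <cstdint>
  #include <iostream>

  std::uint64_t minAlign(std::uint64_t BaseAlign, std::uint64_t Offset) {
    std::uint64_t Bits = BaseAlign | Offset;
    return Bits & (~Bits + 1); // lowest set bit: largest common power of two
  }

  int main() {
    std::cout << minAlign(16, 4) << '\n'; // 4: 16-aligned base plus offset 4
    std::cout << minAlign(16, 0) << '\n'; // 16: offset 0 keeps full alignment
    // Claiming align 8 for the first access would be flagged: 8 > 4.
  }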
if (Align == 0 && Ty && Ty->isSized()) - Align = DL.getABITypeAlignment(Ty); + Align = DL->getABITypeAlignment(Ty); Assert(!BaseAlign || Align <= MinAlign(BaseAlign, Offset), "Undefined behavior: Memory reference address is misaligned", &I); } @@ -470,13 +462,13 @@ void Lint::visitMemoryReference(Instruction &I, void Lint::visitLoadInst(LoadInst &I) { visitMemoryReference(I, I.getPointerOperand(), - AA->getTypeStoreSize(I.getType()), I.getAlignment(), + DL->getTypeStoreSize(I.getType()), I.getAlignment(), I.getType(), MemRef::Read); } void Lint::visitStoreInst(StoreInst &I) { visitMemoryReference(I, I.getPointerOperand(), - AA->getTypeStoreSize(I.getOperand(0)->getType()), + DL->getTypeStoreSize(I.getOperand(0)->getType()), I.getAlignment(), I.getOperand(0)->getType(), MemRef::Write); } @@ -492,208 +484,26 @@ void Lint::visitSub(BinaryOperator &I) { } void Lint::visitLShr(BinaryOperator &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>( - findValue(I.getOperand(1), I.getModule()->getDataLayout(), - /*OffsetOk=*/false))) + if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue(I.getOperand(1), + /*OffsetOk=*/false))) Assert(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()), "Undefined result: Shift count out of range", &I); } void Lint::visitAShr(BinaryOperator &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue( - I.getOperand(1), I.getModule()->getDataLayout(), /*OffsetOk=*/false))) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false))) Assert(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()), "Undefined result: Shift count out of range", &I); } void Lint::visitShl(BinaryOperator &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue( - I.getOperand(1), I.getModule()->getDataLayout(), /*OffsetOk=*/false))) + if (ConstantInt *CI = + dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false))) Assert(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()), "Undefined result: Shift count out of range", &I); } -static bool -allPredsCameFromLandingPad(BasicBlock *BB, - SmallSet<BasicBlock *, 4> &VisitedBlocks) { - VisitedBlocks.insert(BB); - if (BB->isLandingPad()) - return true; - // If we find a block with no predecessors, the search failed. - if (pred_empty(BB)) - return false; - for (BasicBlock *Pred : predecessors(BB)) { - if (VisitedBlocks.count(Pred)) - continue; - if (!allPredsCameFromLandingPad(Pred, VisitedBlocks)) - return false; - } - return true; -} - -static bool -allSuccessorsReachEndCatch(BasicBlock *BB, BasicBlock::iterator InstBegin, - IntrinsicInst **SecondBeginCatch, - SmallSet<BasicBlock *, 4> &VisitedBlocks) { - VisitedBlocks.insert(BB); - for (BasicBlock::iterator I = InstBegin, E = BB->end(); I != E; ++I) { - IntrinsicInst *IC = dyn_cast<IntrinsicInst>(I); - if (IC && IC->getIntrinsicID() == Intrinsic::eh_endcatch) - return true; - // If we find another begincatch while looking for an endcatch, - // that's also an error. - if (IC && IC->getIntrinsicID() == Intrinsic::eh_begincatch) { - *SecondBeginCatch = IC; - return false; - } - } - - // If we reach a block with no successors while searching, the - // search has failed. - if (succ_empty(BB)) - return false; - // Otherwise, search all of the successors. 
- for (BasicBlock *Succ : successors(BB)) { - if (VisitedBlocks.count(Succ)) - continue; - if (!allSuccessorsReachEndCatch(Succ, Succ->begin(), SecondBeginCatch, - VisitedBlocks)) - return false; - } - return true; -} - -void Lint::visitEHBeginCatch(IntrinsicInst *II) { - // The checks in this function make a potentially dubious assumption about - // the CFG, namely that any block involved in a catch is only used for the - // catch. This will very likely be true of IR generated by a front end, - // but it may cease to be true, for example, if the IR is run through a - // pass which combines similar blocks. - // - // In general, if we encounter a block the isn't dominated by the catch - // block while we are searching the catch block's successors for a call - // to end catch intrinsic, then it is possible that it will be legal for - // a path through this block to never reach a call to llvm.eh.endcatch. - // An analogous statement could be made about our search for a landing - // pad among the catch block's predecessors. - // - // What is actually required is that no path is possible at runtime that - // reaches a call to llvm.eh.begincatch without having previously visited - // a landingpad instruction and that no path is possible at runtime that - // calls llvm.eh.begincatch and does not subsequently call llvm.eh.endcatch - // (mentally adjusting for the fact that in reality these calls will be - // removed before code generation). - // - // Because this is a lint check, we take a pessimistic approach and warn if - // the control flow is potentially incorrect. - - SmallSet<BasicBlock *, 4> VisitedBlocks; - BasicBlock *CatchBB = II->getParent(); - - // The begin catch must occur in a landing pad block or all paths - // to it must have come from a landing pad. - Assert(allPredsCameFromLandingPad(CatchBB, VisitedBlocks), - "llvm.eh.begincatch may be reachable without passing a landingpad", - II); - - // Reset the visited block list. - VisitedBlocks.clear(); - - IntrinsicInst *SecondBeginCatch = nullptr; - - // This has to be called before it is asserted. Otherwise, the first assert - // below can never be hit. - bool EndCatchFound = allSuccessorsReachEndCatch( - CatchBB, std::next(static_cast<BasicBlock::iterator>(II)), - &SecondBeginCatch, VisitedBlocks); - Assert( - SecondBeginCatch == nullptr, - "llvm.eh.begincatch may be called a second time before llvm.eh.endcatch", - II, SecondBeginCatch); - Assert(EndCatchFound, - "Some paths from llvm.eh.begincatch may not reach llvm.eh.endcatch", - II); -} - -static bool allPredCameFromBeginCatch( - BasicBlock *BB, BasicBlock::reverse_iterator InstRbegin, - IntrinsicInst **SecondEndCatch, SmallSet<BasicBlock *, 4> &VisitedBlocks) { - VisitedBlocks.insert(BB); - // Look for a begincatch in this block. - for (BasicBlock::reverse_iterator RI = InstRbegin, RE = BB->rend(); RI != RE; - ++RI) { - IntrinsicInst *IC = dyn_cast<IntrinsicInst>(&*RI); - if (IC && IC->getIntrinsicID() == Intrinsic::eh_begincatch) - return true; - // If we find another end catch before we find a begin catch, that's - // an error. - if (IC && IC->getIntrinsicID() == Intrinsic::eh_endcatch) { - *SecondEndCatch = IC; - return false; - } - // If we encounter a landingpad instruction, the search failed. - if (isa<LandingPadInst>(*RI)) - return false; - } - // If while searching we find a block with no predeccesors, - // the search failed. - if (pred_empty(BB)) - return false; - // Search any predecessors we haven't seen before. 
- for (BasicBlock *Pred : predecessors(BB)) { - if (VisitedBlocks.count(Pred)) - continue; - if (!allPredCameFromBeginCatch(Pred, Pred->rbegin(), SecondEndCatch, - VisitedBlocks)) - return false; - } - return true; -} - -void Lint::visitEHEndCatch(IntrinsicInst *II) { - // The check in this function makes a potentially dubious assumption about - // the CFG, namely that any block involved in a catch is only used for the - // catch. This will very likely be true of IR generated by a front end, - // but it may cease to be true, for example, if the IR is run through a - // pass which combines similar blocks. - // - // In general, if we encounter a block the isn't post-dominated by the - // end catch block while we are searching the end catch block's predecessors - // for a call to the begin catch intrinsic, then it is possible that it will - // be legal for a path to reach the end catch block without ever having - // called llvm.eh.begincatch. - // - // What is actually required is that no path is possible at runtime that - // reaches a call to llvm.eh.endcatch without having previously visited - // a call to llvm.eh.begincatch (mentally adjusting for the fact that in - // reality these calls will be removed before code generation). - // - // Because this is a lint check, we take a pessimistic approach and warn if - // the control flow is potentially incorrect. - - BasicBlock *EndCatchBB = II->getParent(); - - // Alls paths to the end catch call must pass through a begin catch call. - - // If llvm.eh.begincatch wasn't called in the current block, we'll use this - // lambda to recursively look for it in predecessors. - SmallSet<BasicBlock *, 4> VisitedBlocks; - IntrinsicInst *SecondEndCatch = nullptr; - - // This has to be called before it is asserted. Otherwise, the first assert - // below can never be hit. - bool BeginCatchFound = - allPredCameFromBeginCatch(EndCatchBB, BasicBlock::reverse_iterator(II), - &SecondEndCatch, VisitedBlocks); - Assert( - SecondEndCatch == nullptr, - "llvm.eh.endcatch may be called a second time after llvm.eh.begincatch", - II, SecondEndCatch); - Assert(BeginCatchFound, - "llvm.eh.endcatch may be reachable without passing llvm.eh.begincatch", - II); -} - static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC) { // Assume undef could be zero. @@ -777,25 +587,23 @@ void Lint::visitIndirectBrInst(IndirectBrInst &I) { } void Lint::visitExtractElementInst(ExtractElementInst &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>( - findValue(I.getIndexOperand(), I.getModule()->getDataLayout(), - /*OffsetOk=*/false))) + if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue(I.getIndexOperand(), + /*OffsetOk=*/false))) Assert(CI->getValue().ult(I.getVectorOperandType()->getNumElements()), "Undefined result: extractelement index out of range", &I); } void Lint::visitInsertElementInst(InsertElementInst &I) { - if (ConstantInt *CI = dyn_cast<ConstantInt>( - findValue(I.getOperand(2), I.getModule()->getDataLayout(), - /*OffsetOk=*/false))) + if (ConstantInt *CI = dyn_cast<ConstantInt>(findValue(I.getOperand(2), + /*OffsetOk=*/false))) Assert(CI->getValue().ult(I.getType()->getNumElements()), "Undefined result: insertelement index out of range", &I); } void Lint::visitUnreachableInst(UnreachableInst &I) { // This isn't undefined behavior, it's merely suspicious. 
- Assert(&I == I.getParent()->begin() || - std::prev(BasicBlock::iterator(&I))->mayHaveSideEffects(), + Assert(&I == &I.getParent()->front() || + std::prev(I.getIterator())->mayHaveSideEffects(), "Unusual: unreachable immediately preceded by instruction without " "side effects", &I); @@ -808,13 +616,13 @@ void Lint::visitUnreachableInst(UnreachableInst &I) { /// Most analysis passes don't require this logic, because instcombine /// will simplify most of these kinds of things away. But it's a goal of /// this Lint pass to be useful even on non-optimized IR. -Value *Lint::findValue(Value *V, const DataLayout &DL, bool OffsetOk) const { +Value *Lint::findValue(Value *V, bool OffsetOk) const { SmallPtrSet<Value *, 4> Visited; - return findValueImpl(V, DL, OffsetOk, Visited); + return findValueImpl(V, OffsetOk, Visited); } /// findValueImpl - Implementation helper for findValue. -Value *Lint::findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, +Value *Lint::findValueImpl(Value *V, bool OffsetOk, SmallPtrSetImpl<Value *> &Visited) const { // Detect self-referential values. if (!Visited.insert(V).second) @@ -825,17 +633,18 @@ Value *Lint::findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, // TODO: Look through eliminable cast pairs. // TODO: Look through calls with unique return values. // TODO: Look through vector insert/extract/shuffle. - V = OffsetOk ? GetUnderlyingObject(V, DL) : V->stripPointerCasts(); + V = OffsetOk ? GetUnderlyingObject(V, *DL) : V->stripPointerCasts(); if (LoadInst *L = dyn_cast<LoadInst>(V)) { - BasicBlock::iterator BBI = L; + BasicBlock::iterator BBI = L->getIterator(); BasicBlock *BB = L->getParent(); SmallPtrSet<BasicBlock *, 4> VisitedBlocks; for (;;) { if (!VisitedBlocks.insert(BB).second) break; - if (Value *U = FindAvailableLoadedValue(L->getPointerOperand(), - BB, BBI, 6, AA)) - return findValueImpl(U, DL, OffsetOk, Visited); + if (Value *U = + FindAvailableLoadedValue(L->getPointerOperand(), + BB, BBI, DefMaxInstsToScan, AA)) + return findValueImpl(U, OffsetOk, Visited); if (BBI != BB->begin()) break; BB = BB->getUniquePredecessor(); if (!BB) break; @@ -844,38 +653,38 @@ Value *Lint::findValueImpl(Value *V, const DataLayout &DL, bool OffsetOk, } else if (PHINode *PN = dyn_cast<PHINode>(V)) { if (Value *W = PN->hasConstantValue()) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } else if (CastInst *CI = dyn_cast<CastInst>(V)) { - if (CI->isNoopCast(DL)) - return findValueImpl(CI->getOperand(0), DL, OffsetOk, Visited); + if (CI->isNoopCast(*DL)) + return findValueImpl(CI->getOperand(0), OffsetOk, Visited); } else if (ExtractValueInst *Ex = dyn_cast<ExtractValueInst>(V)) { if (Value *W = FindInsertedValue(Ex->getAggregateOperand(), Ex->getIndices())) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { // Same as above, but for ConstantExpr instead of Instruction. 
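Seen end to end, findValueImpl is a guarded graph walk: keep following "this simplifies to that" links through loads, phis, casts, and constant folds, with the Visited set cutting off self-referential values. A toy model of the traversal shape (a string-keyed map standing in for LLVM Values):

  #include <iostream>
  #include <string>
  #include <unordered_map>
  #include <unordered_set>

  using SimplifyMap = std::unordered_map<std::string, std::string>;

  std::string findValue(const std::string &V, const SimplifyMap &Simpler,
                        std::unordered_set<std::string> &Visited) {
    if (!Visited.insert(V).second)
      return V; // cycle: a self-referential value, stop here
    auto It = Simpler.find(V);
    return It == Simpler.end() ? V : findValue(It->second, Simpler, Visited);
  }

  int main() {
    SimplifyMap Simpler = {
        {"%cast", "%load"}, {"%load", "%stored"}, {"%loop", "%loop"}};
    std::unordered_set<std::string> Visited;
    std::cout << findValue("%cast", Simpler, Visited) << '\n'; // %stored
    Visited.clear();
    std::cout << findValue("%loop", Simpler, Visited) << '\n'; // %loop
  }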
if (Instruction::isCast(CE->getOpcode())) { if (CastInst::isNoopCast(Instruction::CastOps(CE->getOpcode()), CE->getOperand(0)->getType(), CE->getType(), - DL.getIntPtrType(V->getType()))) - return findValueImpl(CE->getOperand(0), DL, OffsetOk, Visited); + DL->getIntPtrType(V->getType()))) + return findValueImpl(CE->getOperand(0), OffsetOk, Visited); } else if (CE->getOpcode() == Instruction::ExtractValue) { ArrayRef<unsigned> Indices = CE->getIndices(); if (Value *W = FindInsertedValue(CE->getOperand(0), Indices)) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } } // As a last resort, try SimplifyInstruction or constant folding. if (Instruction *Inst = dyn_cast<Instruction>(V)) { - if (Value *W = SimplifyInstruction(Inst, DL, TLI, DT, AC)) - return findValueImpl(W, DL, OffsetOk, Visited); + if (Value *W = SimplifyInstruction(Inst, *DL, TLI, DT, AC)) + return findValueImpl(W, OffsetOk, Visited); } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { - if (Value *W = ConstantFoldConstantExpression(CE, DL, TLI)) + if (Value *W = ConstantFoldConstantExpression(CE, *DL, TLI)) if (W != V) - return findValueImpl(W, DL, OffsetOk, Visited); + return findValueImpl(W, OffsetOk, Visited); } return V; diff --git a/contrib/llvm/lib/Analysis/Loads.cpp b/contrib/llvm/lib/Analysis/Loads.cpp index 624c5a1..4b2fa3c 100644 --- a/contrib/llvm/lib/Analysis/Loads.cpp +++ b/contrib/llvm/lib/Analysis/Loads.cpp @@ -118,7 +118,8 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, // from/to. If so, the previous load or store would have already trapped, // so there is no harm doing an extra load (also, CSE will later eliminate // the load entirely). - BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin(); + BasicBlock::iterator BBI = ScanFrom->getIterator(), + E = ScanFrom->getParent()->begin(); // We can at least always strip pointer casts even though we can't use the // base here. @@ -161,6 +162,18 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, return false; } +/// DefMaxInstsToScan - the default number of maximum instructions +/// to scan in the block, used by FindAvailableLoadedValue(). +/// FindAvailableLoadedValue() was introduced in r60148, to improve jump +/// threading in part by eliminating partially redundant loads. +/// At that point, the value of MaxInstsToScan was already set to '6' +/// without documented explanation. +cl::opt<unsigned> +llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, + cl::desc("Use this to specify the default maximum number of instructions " + "to scan backward from a given instruction, when searching for " + "available loaded value")); + /// \brief Scan the ScanBB block backwards to see if we have the value at the /// memory address *Ptr locally available within a small number of instructions. /// @@ -199,7 +212,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, while (ScanFrom != ScanBB->begin()) { // We must ignore debug info directives when counting (otherwise they // would affect codegen). - Instruction *Inst = --ScanFrom; + Instruction *Inst = &*--ScanFrom; if (isa<DbgInfoIntrinsic>(Inst)) continue; @@ -246,9 +259,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, // If we have alias analysis and it says the store won't modify the loaded // value, ignore the store. 
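The DefMaxInstsToScan option introduced above bounds the backward scan in FindAvailableLoadedValue: walk the block in reverse, skip debug intrinsics so they never influence codegen, charge every other instruction against the budget, and bail on a potential clobber. A self-contained sketch with stand-in types, not the real instruction classes:

  #include <iostream>
  #include <string>
  #include <vector>

  struct Inst {
    std::string Name;
    bool IsDebug, MayWriteLoc, DefinesValue;
  };

  const Inst *findAvailableValue(const std::vector<Inst> &Block,
                                 unsigned Budget) {
    for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
      if (It->IsDebug)
        continue;       // debug info must not affect the scan
      if (Budget-- == 0)
        return nullptr; // scan limit reached, give up
      if (It->DefinesValue)
        return &*It;    // found a store/load of the same location
      if (It->MayWriteLoc)
        return nullptr; // a possible clobber makes the old value stale
    }
    return nullptr;
  }

  int main() {
    std::vector<Inst> BB = {{"store %p", false, true, true},
                            {"llvm.dbg.value", true, false, false},
                            {"add", false, false, false}};
    const Inst *Avail = findAvailableValue(BB, 6);
    std::cout << (Avail ? Avail->Name : "none") << '\n'; // store %p
  }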
- if (AA && - (AA->getModRefInfo(SI, StrippedPtr, AccessSize) & - AliasAnalysis::Mod) == 0) + if (AA && (AA->getModRefInfo(SI, StrippedPtr, AccessSize) & MRI_Mod) == 0) continue; // Otherwise the store that may or may not alias the pointer, bail out. @@ -261,8 +272,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, // If alias analysis claims that it really won't modify the load, // ignore it. if (AA && - (AA->getModRefInfo(Inst, StrippedPtr, AccessSize) & - AliasAnalysis::Mod) == 0) + (AA->getModRefInfo(Inst, StrippedPtr, AccessSize) & MRI_Mod) == 0) continue; // May modify the pointer, bail out. diff --git a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp index becbae4..8bcdcb8 100644 --- a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -58,12 +58,12 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold( /// Maximum SIMD width. const unsigned VectorizerParams::MaxVectorWidth = 64; -/// \brief We collect interesting dependences up to this threshold. -static cl::opt<unsigned> MaxInterestingDependence( - "max-interesting-dependences", cl::Hidden, - cl::desc("Maximum number of interesting dependences collected by " - "loop-access analysis (default = 100)"), - cl::init(100)); +/// \brief We collect dependences up to this threshold. +static cl::opt<unsigned> + MaxDependences("max-dependences", cl::Hidden, + cl::desc("Maximum number of dependences collected by " + "loop-access analysis (default = 100)"), + cl::init(100)); bool VectorizerParams::isInterleaveForced() { return ::VectorizationInterleave.getNumOccurrences() > 0; @@ -87,11 +87,10 @@ Value *llvm::stripIntegerCast(Value *V) { return V; } -const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE, +const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr) { - - const SCEV *OrigSCEV = SE->getSCEV(Ptr); + const SCEV *OrigSCEV = PSE.getSCEV(Ptr); // If there is an entry in the map return the SCEV of the pointer with the // symbolic stride replaced by one. @@ -108,36 +107,82 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE, ValueToValueMap RewriteMap; RewriteMap[StrideVal] = One; - const SCEV *ByOne = - SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); - DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne + ScalarEvolution *SE = PSE.getSE(); + const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal)); + const auto *CT = + static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType())); + + PSE.addPredicate(*SE->getEqualPredicate(U, CT)); + auto *Expr = PSE.getSCEV(Ptr); + + DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *Expr << "\n"); - return ByOne; + return Expr; } // Otherwise, just return the SCEV of the original pointer. - return SE->getSCEV(Ptr); + return OrigSCEV; } void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, unsigned ASId, - const ValueToValueMap &Strides) { + const ValueToValueMap &Strides, + PredicatedScalarEvolution &PSE) { // Get the stride replaced scev. 
- const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); + ScalarEvolution *SE = PSE.getSE(); const SCEV *Ex = SE->getBackedgeTakenCount(Lp); + + const SCEV *ScStart = AR->getStart(); const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - Pointers.emplace_back(Ptr, AR->getStart(), ScEnd, WritePtr, DepSetId, ASId, - Sc); + const SCEV *Step = AR->getStepRecurrence(*SE); + + // For expressions with negative step, the upper bound is ScStart and the + // lower bound is ScEnd. + if (const SCEVConstant *CStep = dyn_cast<const SCEVConstant>(Step)) { + if (CStep->getValue()->isNegative()) + std::swap(ScStart, ScEnd); + } else { + // Fallback case: the step is not constant, but we can still + // get the upper and lower bounds of the interval by using min/max + // expressions. + ScStart = SE->getUMinExpr(ScStart, ScEnd); + ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd); + } + + Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc); +} + +SmallVector<RuntimePointerChecking::PointerCheck, 4> +RuntimePointerChecking::generateChecks() const { + SmallVector<PointerCheck, 4> Checks; + + for (unsigned I = 0; I < CheckingGroups.size(); ++I) { + for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) { + const RuntimePointerChecking::CheckingPtrGroup &CGI = CheckingGroups[I]; + const RuntimePointerChecking::CheckingPtrGroup &CGJ = CheckingGroups[J]; + + if (needsChecking(CGI, CGJ)) + Checks.push_back(std::make_pair(&CGI, &CGJ)); + } + } + return Checks; +} + +void RuntimePointerChecking::generateChecks( + MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) { + assert(Checks.empty() && "Checks is not empty"); + groupChecks(DepCands, UseDependencies); + Checks = generateChecks(); } -bool RuntimePointerChecking::needsChecking( - const CheckingPtrGroup &M, const CheckingPtrGroup &N, - const SmallVectorImpl<int> *PtrPartition) const { +bool RuntimePointerChecking::needsChecking(const CheckingPtrGroup &M, + const CheckingPtrGroup &N) const { for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I) for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J) - if (needsChecking(M.Members[I], N.Members[J], PtrPartition)) + if (needsChecking(M.Members[I], N.Members[J])) return true; return false; } @@ -204,8 +249,31 @@ void RuntimePointerChecking::groupChecks( CheckingGroups.clear(); + // If we need to check two pointers to the same underlying object + // with a non-constant difference, we shouldn't perform any pointer + // grouping with those pointers. This is because we can easily get + // into cases where the resulting check would return false, even when + // the accesses are safe. + // + // The following example shows this: + // for (i = 0; i < 1000; ++i) + // a[5000 + i * m] = a[i] + a[i + 9000] + // + // Here grouping gives a check of (5000, 5000 + 1000 * m) against + // (0, 10000) which is always false. However, if m is 1, there is no + // dependence. Not grouping the checks for a[i] and a[i + 9000] allows + // us to perform an accurate check in this case. + // + // The above case requires that we have an UnknownDependence between + // accesses to the same underlying object. This cannot happen unless + // ShouldRetryWithRuntimeCheck is set, and therefore UseDependencies + // is also false. In this case we will use the fallback path and create + // separate checking groups for all pointers.
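Stepping back to the interval computation at the top of this hunk: it can be shown with plain integers, since for a negative constant step the last iteration produces the lowest address, so the start and end bounds swap roles. A sketch under that simplification (the real code stays symbolic via SCEV and falls back to umin/umax when the step is unknown):

  #include <cstdint>
  #include <iostream>
  #include <utility>

  // Half-open [Low, High) footprint of a strided access over TripCount steps.
  std::pair<std::int64_t, std::int64_t>
  accessBounds(std::int64_t Start, std::int64_t Step, std::int64_t TripCount) {
    std::int64_t End = Start + Step * TripCount; // address after the last step
    if (Step < 0)
      return {End, Start}; // negative stride: the end is the low bound
    return {Start, End};
  }

  int main() {
    auto Up = accessBounds(0, 4, 100);      // [0, 400)
    auto Down = accessBounds(400, -4, 100); // also [0, 400)
    std::cout << Up.first << ' ' << Up.second << '\n';
    std::cout << Down.first << ' ' << Down.second << '\n';
  }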
+ // If we don't have the dependency partitions, construct a new - // checking pointer group for each pointer. + // checking pointer group for each pointer. This is also required + // for correctness, because in this case we can have checking between + // pointers to the same underlying object. if (!UseDependencies) { for (unsigned I = 0; I < Pointers.size(); ++I) CheckingGroups.push_back(CheckingPtrGroup(I, *this)); @@ -222,7 +290,7 @@ void RuntimePointerChecking::groupChecks( // don't process them twice. SmallSet<unsigned, 2> Seen; - // Go through all equivalence classes, get the the "pointer check groups" + // Go through all equivalence classes, get the "pointer check groups" // and add them to the overall solution. We use the order in which accesses // appear in 'Pointers' to enforce determinism. for (unsigned I = 0; I < Pointers.size(); ++I) { @@ -280,8 +348,14 @@ void RuntimePointerChecking::groupChecks( } } -bool RuntimePointerChecking::needsChecking( - unsigned I, unsigned J, const SmallVectorImpl<int> *PtrPartition) const { +bool RuntimePointerChecking::arePointersInSamePartition( + const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1, + unsigned PtrIdx2) { + return (PtrToPartition[PtrIdx1] != -1 && + PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]); +} + +bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const { const PointerInfo &PointerI = Pointers[I]; const PointerInfo &PointerJ = Pointers[J]; @@ -297,85 +371,45 @@ bool RuntimePointerChecking::needsChecking( if (PointerI.AliasSetId != PointerJ.AliasSetId) return false; - // If PtrPartition is set omit checks between pointers of the same partition. - // Partition number -1 means that the pointer is used in multiple partitions. - // In this case we can't omit the check. 
- if (PtrPartition && (*PtrPartition)[I] != -1 && - (*PtrPartition)[I] == (*PtrPartition)[J]) - return false; - return true; } -void RuntimePointerChecking::print( - raw_ostream &OS, unsigned Depth, - const SmallVectorImpl<int> *PtrPartition) const { - - OS.indent(Depth) << "Run-time memory checks:\n"; - +void RuntimePointerChecking::printChecks( + raw_ostream &OS, const SmallVectorImpl<PointerCheck> &Checks, + unsigned Depth) const { unsigned N = 0; - for (unsigned I = 0; I < CheckingGroups.size(); ++I) - for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) - if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition)) { - OS.indent(Depth) << "Check " << N++ << ":\n"; - OS.indent(Depth + 2) << "Comparing group " << I << ":\n"; - - for (unsigned K = 0; K < CheckingGroups[I].Members.size(); ++K) { - OS.indent(Depth + 2) - << *Pointers[CheckingGroups[I].Members[K]].PointerValue << "\n"; - if (PtrPartition) - OS << " (Partition: " - << (*PtrPartition)[CheckingGroups[I].Members[K]] << ")" - << "\n"; - } + for (const auto &Check : Checks) { + const auto &First = Check.first->Members, &Second = Check.second->Members; - OS.indent(Depth + 2) << "Against group " << J << ":\n"; + OS.indent(Depth) << "Check " << N++ << ":\n"; - for (unsigned K = 0; K < CheckingGroups[J].Members.size(); ++K) { - OS.indent(Depth + 2) - << *Pointers[CheckingGroups[J].Members[K]].PointerValue << "\n"; - if (PtrPartition) - OS << " (Partition: " - << (*PtrPartition)[CheckingGroups[J].Members[K]] << ")" - << "\n"; - } - } + OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n"; + for (unsigned K = 0; K < First.size(); ++K) + OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n"; - OS.indent(Depth) << "Grouped accesses:\n"; - for (unsigned I = 0; I < CheckingGroups.size(); ++I) { - OS.indent(Depth + 2) << "Group " << I << ":\n"; - OS.indent(Depth + 4) << "(Low: " << *CheckingGroups[I].Low - << " High: " << *CheckingGroups[I].High << ")\n"; - for (unsigned J = 0; J < CheckingGroups[I].Members.size(); ++J) { - OS.indent(Depth + 6) << "Member: " - << *Pointers[CheckingGroups[I].Members[J]].Expr - << "\n"; - } + OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n"; + for (unsigned K = 0; K < Second.size(); ++K) + OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n"; } } -unsigned RuntimePointerChecking::getNumberOfChecks( - const SmallVectorImpl<int> *PtrPartition) const { - - unsigned NumPartitions = CheckingGroups.size(); - unsigned CheckCount = 0; +void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const { - for (unsigned I = 0; I < NumPartitions; ++I) - for (unsigned J = I + 1; J < NumPartitions; ++J) - if (needsChecking(CheckingGroups[I], CheckingGroups[J], PtrPartition)) - CheckCount++; - return CheckCount; -} + OS.indent(Depth) << "Run-time memory checks:\n"; + printChecks(OS, Checks, Depth); -bool RuntimePointerChecking::needsAnyChecking( - const SmallVectorImpl<int> *PtrPartition) const { - unsigned NumPointers = Pointers.size(); + OS.indent(Depth) << "Grouped accesses:\n"; + for (unsigned I = 0; I < CheckingGroups.size(); ++I) { + const auto &CG = CheckingGroups[I]; - for (unsigned I = 0; I < NumPointers; ++I) - for (unsigned J = I + 1; J < NumPointers; ++J) - if (needsChecking(I, J, PtrPartition)) - return true; - return false; + OS.indent(Depth + 2) << "Group " << &CG << ":\n"; + OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High + << ")\n"; + for (unsigned J = 0; J < CG.Members.size(); ++J) { + OS.indent(Depth + 
6) << "Member: " << *Pointers[CG.Members[J]].Expr + << "\n"; + } + } } namespace { @@ -390,9 +424,10 @@ public: typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA) - : DL(Dl), AST(*AA), LI(LI), DepCands(DA), - IsRTCheckAnalysisNeeded(false) {} + MemoryDepChecker::DepCandidates &DA, + PredicatedScalarEvolution &PSE) + : DL(Dl), AST(*AA), LI(LI), DepCands(DA), IsRTCheckAnalysisNeeded(false), + PSE(PSE) {} /// \brief Register a load and whether it is only read from. void addLoad(MemoryLocation &Loc, bool IsReadOnly) { @@ -435,7 +470,7 @@ public: /// We decided that no dependence analysis would be used. Reset the state. void resetDepChecks(MemoryDepChecker &DepChecker) { CheckDeps.clear(); - DepChecker.clearInterestingDependences(); + DepChecker.clearDependences(); } MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } @@ -477,14 +512,18 @@ private: /// (i.e. ShouldRetryWithRuntimeCheck), isDependencyCheckNeeded is cleared /// while this remains set if we have potentially dependent accesses. bool IsRTCheckAnalysisNeeded; + + /// The SCEV predicate containing all the SCEV-related assumptions. + PredicatedScalarEvolution &PSE; }; } // end anonymous namespace /// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, - const ValueToValueMap &Strides, Value *Ptr) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); +static bool hasComputableBounds(PredicatedScalarEvolution &PSE, + const ValueToValueMap &Strides, Value *Ptr, + Loop *L) { + const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (!AR) return false; @@ -527,11 +566,11 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, else ++NumReadPtrChecks; - if (hasComputableBounds(SE, StridesMap, Ptr) && + if (hasComputableBounds(PSE, StridesMap, Ptr, TheLoop) && // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. (!ShouldCheckStride || - isStridedPtr(SE, Ptr, TheLoop, StridesMap) == 1)) { + isStridedPtr(PSE, Ptr, TheLoop, StridesMap) == 1)) { // The id of the dependence set. unsigned DepId; @@ -545,7 +584,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, // Each access has its own dependence set. DepId = RunningDepId++; - RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap); + RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE); DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n'); } else { @@ -599,9 +638,9 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, } if (NeedRTCheck && CanDoRT) - RtCheck.groupChecks(DepCands, IsDepCheckNeeded); + RtCheck.generateChecks(DepCands, IsDepCheckNeeded); - DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks(nullptr) + DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() << " pointer comparisons.\n"); RtCheck.Need = NeedRTCheck; @@ -706,6 +745,11 @@ void AccessAnalysis::processMemAccesses() { GetUnderlyingObjects(Ptr, TempObjects, DL, LI); DEBUG(dbgs() << "Underlying objects for pointer " << *Ptr << "\n"); for (Value *UnderlyingObj : TempObjects) { + // nullptr never alias, don't join sets for pointer that have "null" + // in their UnderlyingObjects list. 
+ if (isa<ConstantPointerNull>(UnderlyingObj)) + continue; + UnderlyingObjToAccessMap::iterator Prev = ObjToLastAccess.find(UnderlyingObj); if (Prev != ObjToLastAccess.end()) @@ -775,20 +819,20 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, } /// \brief Check whether the access through \p Ptr has a constant stride. -int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap) { - const Type *Ty = Ptr->getType(); +int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, + const Loop *Lp, const ValueToValueMap &StridesMap) { + Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to aggregate types. - const PointerType *PtrTy = cast<PointerType>(Ty); + auto *PtrTy = cast<PointerType>(Ty); if (PtrTy->getElementType()->isAggregateType()) { DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type" << *Ptr << "\n"); return 0; } - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); + const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); if (!AR) { @@ -801,6 +845,7 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -811,16 +856,16 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, // to access the pointer value "0" which is undefined behavior in address // space 0, therefore we can also vectorize this case. bool IsInBoundsGEP = isInBoundsGep(Ptr); - bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, SE, Lp); + bool IsNoWrapAddRec = isNoWrapAddRec(Ptr, AR, PSE.getSE(), Lp); bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " - << *Ptr << " SCEV: " << *PtrScev << "\n"); + << *Ptr << " SCEV: " << *PtrScev << "\n"); return 0; } // Check the step is constant. - const SCEV *Step = AR->getStepRecurrence(*SE); + const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); // Calculate the pointer stride and check if it is constant. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); @@ -832,7 +877,7 @@ int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, auto &DL = Lp->getHeader()->getModule()->getDataLayout(); int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); - const APInt &APStepVal = C->getValue()->getValue(); + const APInt &APStepVal = C->getAPInt(); // Huge step value - give up. 
if (APStepVal.getBitWidth() > 64) @@ -872,15 +917,15 @@ bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { llvm_unreachable("unexpected DepType!"); } -bool MemoryDepChecker::Dependence::isInterestingDependence(DepType Type) { +bool MemoryDepChecker::Dependence::isBackward() const { switch (Type) { case NoDep: case Forward: + case ForwardButPreventsForwarding: + case Unknown: return false; case BackwardVectorizable: - case Unknown: - case ForwardButPreventsForwarding: case Backward: case BackwardVectorizableButPreventsForwarding: return true; @@ -889,17 +934,21 @@ bool MemoryDepChecker::Dependence::isInterestingDependence(DepType Type) { } bool MemoryDepChecker::Dependence::isPossiblyBackward() const { + return isBackward() || Type == Unknown; +} + +bool MemoryDepChecker::Dependence::isForward() const { switch (Type) { - case NoDep: case Forward: case ForwardButPreventsForwarding: - return false; + return true; + case NoDep: case Unknown: case BackwardVectorizable: case Backward: case BackwardVectorizableButPreventsForwarding: - return true; + return false; } llvm_unreachable("unexpected DepType!"); } @@ -999,11 +1048,11 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, BPtr->getType()->getPointerAddressSpace()) return Dependence::Unknown; - const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); - const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); + const SCEV *AScev = replaceSymbolicStrideSCEV(PSE, Strides, APtr); + const SCEV *BScev = replaceSymbolicStrideSCEV(PSE, Strides, BPtr); - int StrideAPtr = isStridedPtr(SE, APtr, InnermostLoop, Strides); - int StrideBPtr = isStridedPtr(SE, BPtr, InnermostLoop, Strides); + int StrideAPtr = isStridedPtr(PSE, APtr, InnermostLoop, Strides); + int StrideBPtr = isStridedPtr(PSE, BPtr, InnermostLoop, Strides); const SCEV *Src = AScev; const SCEV *Sink = BScev; @@ -1020,12 +1069,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, std::swap(StrideAPtr, StrideBPtr); } - const SCEV *Dist = SE->getMinusSCEV(Sink, Src); + const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src); DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink - << "(Induction step: " << StrideAPtr << ")\n"); + << "(Induction step: " << StrideAPtr << ")\n"); DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to " - << *InstMap[BIdx] << ": " << *Dist << "\n"); + << *InstMap[BIdx] << ": " << *Dist << "\n"); // Need accesses with constant stride. We don't want to vectorize // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in @@ -1048,7 +1097,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, unsigned TypeByteSize = DL.getTypeAllocSize(ATy); // Negative distances are not plausible dependencies. - const APInt &Val = C->getValue()->getValue(); + const APInt &Val = C->getAPInt(); if (Val.isNegative()) { bool IsTrueDataDependence = (AIsWrite && !BIsWrite); if (IsTrueDataDependence && @@ -1064,7 +1113,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // Could be improved to assert type sizes are the same (i32 == float, etc). 
if (Val == 0) { if (ATy == BTy) - return Dependence::NoDep; + return Dependence::Forward; DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n"); return Dependence::Unknown; } @@ -1203,22 +1252,21 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, isDependent(*A.first, A.second, *B.first, B.second, Strides); SafeForVectorization &= Dependence::isSafeForVectorization(Type); - // Gather dependences unless we accumulated MaxInterestingDependence + // Gather dependences unless we accumulated MaxDependences // dependences. In that case return as soon as we find the first // unsafe dependence. This puts a limit on this quadratic // algorithm. - if (RecordInterestingDependences) { - if (Dependence::isInterestingDependence(Type)) - InterestingDependences.push_back( - Dependence(A.second, B.second, Type)); - - if (InterestingDependences.size() >= MaxInterestingDependence) { - RecordInterestingDependences = false; - InterestingDependences.clear(); + if (RecordDependences) { + if (Type != Dependence::NoDep) + Dependences.push_back(Dependence(A.second, B.second, Type)); + + if (Dependences.size() >= MaxDependences) { + RecordDependences = false; + Dependences.clear(); DEBUG(dbgs() << "Too many dependences, stopped recording\n"); } } - if (!RecordInterestingDependences && !SafeForVectorization) + if (!RecordDependences && !SafeForVectorization) return false; } ++OI; @@ -1227,8 +1275,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets, } } - DEBUG(dbgs() << "Total Interesting Dependences: " - << InterestingDependences.size() << "\n"); + DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n"); return SafeForVectorization; } @@ -1298,10 +1345,10 @@ bool LoopAccessInfo::canAnalyzeLoop() { } // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(LoopAccessReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(LoopAccessReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n"); return false; } @@ -1370,7 +1417,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast<StoreInst>(it); if (!St) { - emitAnalysis(LoopAccessReport(it) << + emitAnalysis(LoopAccessReport(&*it) << "instruction cannot be vectorized"); CanVecMem = false; return; @@ -1402,7 +1449,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { MemoryDepChecker::DepCandidates DependentAccesses; AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(), - AA, LI, DependentAccesses); + AA, LI, DependentAccesses, PSE); // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -1453,7 +1500,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { // read a few words, modify, and write a few words, and some of the // words may be written to the same address. 
bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr).second || !isStridedPtr(SE, Ptr, TheLoop, Strides)) { + if (Seen.insert(Ptr).second || !isStridedPtr(PSE, Ptr, TheLoop, Strides)) { ++NumReads; IsReadOnlyPtr = true; } @@ -1483,7 +1530,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. bool CanDoRTIfNeeded = - Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides); + Accesses.canCheckPtrAtRT(PtrRtChecking, PSE.getSE(), TheLoop, Strides); if (!CanDoRTIfNeeded) { emitAnalysis(LoopAccessReport() << "cannot identify array bounds"); DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " @@ -1510,6 +1557,7 @@ void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { PtrRtChecking.reset(); PtrRtChecking.Need = true; + auto *SE = PSE.getSE(); CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(PtrRtChecking, SE, TheLoop, Strides, true); @@ -1552,7 +1600,7 @@ void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) { } bool LoopAccessInfo::isUniform(Value *V) const { - return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + return (PSE.getSE()->isLoopInvariant(PSE.getSE()->getSCEV(V), TheLoop)); } // FIXME: this function is currently a duplicate of the one in @@ -1566,86 +1614,115 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V, return nullptr; } -std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeCheck( - Instruction *Loc, const SmallVectorImpl<int> *PtrPartition) const { - if (!PtrRtChecking.Need) - return std::make_pair(nullptr, nullptr); +namespace { +/// \brief IR Values for the lower and upper bounds of a pointer evolution. We +/// need to use value-handles because SCEV expansion can invalidate previously +/// expanded values. Thus expansion of a pointer can invalidate the bounds for +/// a previous one. +struct PointerBounds { + TrackingVH<Value> Start; + TrackingVH<Value> End; +}; +} // end anonymous namespace - SmallVector<TrackingVH<Value>, 2> Starts; - SmallVector<TrackingVH<Value>, 2> Ends; +/// \brief Expand code for the lower and upper bound of the pointer group \p CG +/// in \p TheLoop. \return the values for the bounds. +static PointerBounds +expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop, + Instruction *Loc, SCEVExpander &Exp, ScalarEvolution *SE, + const RuntimePointerChecking &PtrRtChecking) { + Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, TheLoop)) { + DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr + << "\n"); + return {Ptr, Ptr}; + } else { + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + LLVMContext &Ctx = Loc->getContext(); + + // Use this type for pointer arithmetic. + Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); + Value *Start = nullptr, *End = nullptr; + + DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); + Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc); + End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc); + DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High << "\n"); + return {Start, End}; + } +} - LLVMContext &Ctx = Loc->getContext(); - SCEVExpander Exp(*SE, DL, "induction"); - Instruction *FirstInst = nullptr; +/// \brief Turns a collection of checks into a collection of expanded upper and +/// lower bounds for both pointers in the check. 
+static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks, + Loop *L, Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp, + const RuntimePointerChecking &PtrRtChecking) { + SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds; + + // Here we're relying on the SCEV Expander's cache to only emit code for the + // same bounds once. + std::transform( + PointerChecks.begin(), PointerChecks.end(), + std::back_inserter(ChecksWithBounds), + [&](const RuntimePointerChecking::PointerCheck &Check) { + PointerBounds + First = expandBounds(Check.first, L, Loc, Exp, SE, PtrRtChecking), + Second = expandBounds(Check.second, L, Loc, Exp, SE, PtrRtChecking); + return std::make_pair(First, Second); + }); + + return ChecksWithBounds; +} - for (unsigned i = 0; i < PtrRtChecking.CheckingGroups.size(); ++i) { - const RuntimePointerChecking::CheckingPtrGroup &CG = - PtrRtChecking.CheckingGroups[i]; - Value *Ptr = PtrRtChecking.Pointers[CG.Members[0]].PointerValue; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, TheLoop)) { - DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" << *Ptr - << "\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - - // Use this type for pointer arithmetic. - Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); - Value *Start = nullptr, *End = nullptr; - - DEBUG(dbgs() << "LAA: Adding RT check for range:\n"); - Start = Exp.expandCodeFor(CG.Low, PtrArithTy, Loc); - End = Exp.expandCodeFor(CG.High, PtrArithTy, Loc); - DEBUG(dbgs() << "Start: " << *CG.Low << " End: " << *CG.High << "\n"); - Starts.push_back(Start); - Ends.push_back(End); - } - } +std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks( + Instruction *Loc, + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks) + const { + auto *SE = PSE.getSE(); + SCEVExpander Exp(*SE, DL, "induction"); + auto ExpandedChecks = + expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, PtrRtChecking); + LLVMContext &Ctx = Loc->getContext(); + Instruction *FirstInst = nullptr; IRBuilder<> ChkBuilder(Loc); // Our instructions might fold to a constant. 
Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < PtrRtChecking.CheckingGroups.size(); ++i) { - for (unsigned j = i + 1; j < PtrRtChecking.CheckingGroups.size(); ++j) { - const RuntimePointerChecking::CheckingPtrGroup &CGI = - PtrRtChecking.CheckingGroups[i]; - const RuntimePointerChecking::CheckingPtrGroup &CGJ = - PtrRtChecking.CheckingGroups[j]; - - if (!PtrRtChecking.needsChecking(CGI, CGJ, PtrPartition)) - continue; - - unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); - unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); - - assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && - (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && - "Trying to bounds check pointers with different address spaces"); - Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); - Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); - - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); - - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); - FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); - FirstInst = getFirstInst(FirstInst, Cmp1, Loc); - Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); + for (const auto &Check : ExpandedChecks) { + const PointerBounds &A = Check.first, &B = Check.second; + // Check if two pointers (A and B) conflict where conflict is computed as: + // start(A) <= end(B) && start(B) <= end(A) + unsigned AS0 = A.Start->getType()->getPointerAddressSpace(); + unsigned AS1 = B.Start->getType()->getPointerAddressSpace(); + + assert((AS0 == B.End->getType()->getPointerAddressSpace()) && + (AS1 == A.End->getType()->getPointerAddressSpace()) && + "Trying to bounds check pointers with different address spaces"); + + Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); + Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); + + Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc"); + + Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + FirstInst = getFirstInst(FirstInst, Cmp0, Loc); + Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + FirstInst = getFirstInst(FirstInst, Cmp1, Loc); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); + FirstInst = getFirstInst(FirstInst, IsConflict, Loc); + if (MemoryRuntimeCheck) { + IsConflict = + ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx"); FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - if (MemoryRuntimeCheck) { - IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, - "conflict.rdx"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - } - MemoryRuntimeCheck = IsConflict; } + MemoryRuntimeCheck = IsConflict; } if (!MemoryRuntimeCheck) @@ -1661,12 +1738,20 @@ std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeCheck( return std::make_pair(FirstInst, Check); } +std::pair<Instruction *, Instruction *> +LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const { + if (!PtrRtChecking.Need) + return std::make_pair(nullptr, nullptr); + + return addRuntimeChecks(Loc, 
PtrRtChecking.getChecks()); +} + LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides) - : PtrRtChecking(SE), DepChecker(SE, L), TheLoop(L), SE(SE), DL(DL), + : PSE(*SE), PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL), TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false), StoreToLoopInvariantAddress(false) { @@ -1685,14 +1770,14 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { if (Report) OS.indent(Depth) << "Report: " << Report->str() << "\n"; - if (auto *InterestingDependences = DepChecker.getInterestingDependences()) { - OS.indent(Depth) << "Interesting Dependences:\n"; - for (auto &Dep : *InterestingDependences) { + if (auto *Dependences = DepChecker.getDependences()) { + OS.indent(Depth) << "Dependences:\n"; + for (auto &Dep : *Dependences) { Dep.print(OS, Depth + 2, DepChecker.getMemoryInstructions()); OS << "\n"; } } else - OS.indent(Depth) << "Too many interesting dependences, not recorded\n"; + OS.indent(Depth) << "Too many dependences, not recorded\n"; // List the pair of accesses need run-time checks to prove independence. PtrRtChecking.print(OS, Depth); @@ -1701,6 +1786,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { OS.indent(Depth) << "Store to invariant address was " << (StoreToLoopInvariantAddress ? "" : "not ") << "found in loop.\n"; + + OS.indent(Depth) << "SCEV assumptions:\n"; + PSE.getUnionPredicate().print(OS, Depth); } const LoopAccessInfo & @@ -1714,8 +1802,8 @@ LoopAccessAnalysis::getInfo(Loop *L, const ValueToValueMap &Strides) { if (!LAI) { const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI, - Strides); + LAI = + llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, LI, Strides); #ifndef NDEBUG LAI->NumSymbolicStrides = Strides.size(); #endif @@ -1737,10 +1825,10 @@ void LoopAccessAnalysis::print(raw_ostream &OS, const Module *M) const { } bool LoopAccessAnalysis::runOnFunction(Function &F) { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? 
&TLIP->getTLI() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -1748,8 +1836,8 @@ bool LoopAccessAnalysis::runOnFunction(Function &F) { } void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); @@ -1761,8 +1849,8 @@ static const char laa_name[] = "Loop Access Analysis"; #define LAA_NAME "loop-accesses" INITIALIZE_PASS_BEGIN(LoopAccessAnalysis, LAA_NAME, laa_name, false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopAccessAnalysis, LAA_NAME, laa_name, false, true) diff --git a/contrib/llvm/lib/Analysis/LoopInfo.cpp b/contrib/llvm/lib/Analysis/LoopInfo.cpp index 6b6faf8..0c725fc 100644 --- a/contrib/llvm/lib/Analysis/LoopInfo.cpp +++ b/contrib/llvm/lib/Analysis/LoopInfo.cpp @@ -102,8 +102,8 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed, return false; if (I->mayReadFromMemory()) return false; - // The landingpad instruction is immobile. - if (isa<LandingPadInst>(I)) + // EH block instructions are immobile. + if (I->isEHPad()) return false; // Determine the insertion point, unless one was given. if (!InsertPt) { @@ -120,6 +120,13 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed, // Hoist. I->moveBefore(InsertPt); + + // There is possibility of hoisting this instruction above some arbitrary + // condition. Any metadata defined on it can be control dependent on this + // condition. Conservatively strip it here so that we don't give any wrong + // information to the optimizer. + I->dropUnknownNonDebugMetadata(); + Changed = true; return true; } @@ -172,7 +179,13 @@ PHINode *Loop::getCanonicalInductionVariable() const { bool Loop::isLCSSAForm(DominatorTree &DT) const { for (block_iterator BI = block_begin(), E = block_end(); BI != E; ++BI) { BasicBlock *BB = *BI; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I) { + // Tokens can't be used in PHI nodes and live-out tokens prevent loop + // optimizations, so for the purposes of considered LCSSA form, we + // can ignore them. + if (I->getType()->isTokenTy()) + continue; + for (Use &U : I->uses()) { Instruction *UI = cast<Instruction>(U.getUser()); BasicBlock *UserBB = UI->getParent(); @@ -188,11 +201,21 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const { DT.isReachableFromEntry(UserBB)) return false; } + } } return true; } +bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT) const { + if (!isLCSSAForm(DT)) + return false; + + return std::all_of(begin(), end(), [&](const Loop *L) { + return L->isRecursivelyLCSSAForm(DT); + }); +} + /// isLoopSimplifyForm - Return true if the Loop is in the form that /// the LoopSimplify form transforms loops to, which is sometimes called /// normal form. 
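The Loop::isRecursivelyLCSSAForm hunk above is a compact example of checking a property over a whole loop nest: verify the loop itself, then require the same check of every subloop via std::all_of. Below is a minimal standalone sketch of that shape only; Node and isInForm() are hypothetical stand-ins for llvm::Loop and Loop::isLCSSAForm(DominatorTree &), not LLVM code.

#include <algorithm>
#include <vector>

struct Node {
  bool FormOK;                  // stand-in for isLCSSAForm(DT) on this loop
  std::vector<Node *> Subloops; // stand-in for Loop::begin()/Loop::end()

  bool isInForm() const { return FormOK; }

  // Mirrors the pattern of isRecursivelyLCSSAForm: fail fast on this node,
  // then require the property of every subloop, depth-first.
  bool isRecursivelyInForm() const {
    if (!isInForm())
      return false;
    return std::all_of(Subloops.begin(), Subloops.end(),
                       [](const Node *N) { return N->isRecursivelyInForm(); });
  }
};

int main() {
  Node Leaf{true, {}};
  Node Root{true, {&Leaf}};
  return Root.isRecursivelyInForm() ? 0 : 1;
}

The early return keeps the common failure case cheap, and the recursion only descends into subtrees whose ancestors have already passed.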
@@ -211,15 +234,23 @@ bool Loop::isSafeToClone() const { if (isa<IndirectBrInst>((*I)->getTerminator())) return false; - if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) + if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) { if (II->cannotDuplicate()) return false; + // Return false if any loop blocks contain invokes to EH-pads other than + // landingpads; we don't know how to split those edges yet. + auto *FirstNonPHI = II->getUnwindDest()->getFirstNonPHI(); + if (FirstNonPHI->isEHPad() && !isa<LandingPadInst>(FirstNonPHI)) + return false; + } for (BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); BI != BE; ++BI) { if (const CallInst *CI = dyn_cast<CallInst>(BI)) { if (CI->cannotDuplicate()) return false; } + if (BI->getType()->isTokenTy() && BI->isUsedOutsideOfBlock(*I)) + return false; } } return true; @@ -602,14 +633,14 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { return NearLoop; } -/// updateUnloop - The last backedge has been removed from a loop--now the -/// "unloop". Find a new parent for the blocks contained within unloop and -/// update the loop tree. We don't necessarily have valid dominators at this -/// point, but LoopInfo is still valid except for the removal of this loop. -/// -/// Note that Unloop may now be an empty loop. Calling Loop::getHeader without -/// checking first is illegal. -void LoopInfo::updateUnloop(Loop *Unloop) { +LoopInfo::LoopInfo(const DominatorTreeBase<BasicBlock> &DomTree) { + analyze(DomTree); +} + +void LoopInfo::markAsRemoved(Loop *Unloop) { + assert(!Unloop->isInvalid() && "Loop has already been removed"); + Unloop->invalidate(); + RemovedLoops.push_back(Unloop); // First handle the special case of no parent loop to simplify the algorithm. if (!Unloop->getParentLoop()) { @@ -675,7 +706,7 @@ LoopInfo LoopAnalysis::run(Function &F, AnalysisManager<Function> *AM) { // objects. I don't want to add that kind of complexity until the scope of // the problem is better understood. 
LoopInfo LI; - LI.Analyze(AM->getResult<DominatorTreeAnalysis>(F)); + LI.analyze(AM->getResult<DominatorTreeAnalysis>(F)); return LI; } @@ -685,6 +716,20 @@ PreservedAnalyses LoopPrinterPass::run(Function &F, return PreservedAnalyses::all(); } +PrintLoopPass::PrintLoopPass() : OS(dbgs()) {} +PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner) + : OS(OS), Banner(Banner) {} + +PreservedAnalyses PrintLoopPass::run(Loop &L) { + OS << Banner; + for (auto *Block : L.blocks()) + if (Block) + Block->print(OS); + else + OS << "Printing <null> block"; + return PreservedAnalyses::all(); +} + //===----------------------------------------------------------------------===// // LoopInfo implementation // @@ -698,7 +743,7 @@ INITIALIZE_PASS_END(LoopInfoWrapperPass, "loops", "Natural Loop Information", bool LoopInfoWrapperPass::runOnFunction(Function &) { releaseMemory(); - LI.Analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree()); + LI.analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree()); return false; } diff --git a/contrib/llvm/lib/Analysis/LoopPass.cpp b/contrib/llvm/lib/Analysis/LoopPass.cpp index e9fcf02..8163231 100644 --- a/contrib/llvm/lib/Analysis/LoopPass.cpp +++ b/contrib/llvm/lib/Analysis/LoopPass.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" @@ -27,35 +28,30 @@ namespace { /// PrintLoopPass - Print a Function corresponding to a Loop. /// -class PrintLoopPass : public LoopPass { -private: - std::string Banner; - raw_ostream &Out; // raw_ostream to print on. +class PrintLoopPassWrapper : public LoopPass { + PrintLoopPass P; public: static char ID; - PrintLoopPass(const std::string &B, raw_ostream &o) - : LoopPass(ID), Banner(B), Out(o) {} + PrintLoopPassWrapper() : LoopPass(ID) {} + PrintLoopPassWrapper(raw_ostream &OS, const std::string &Banner) + : LoopPass(ID), P(OS, Banner) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } bool runOnLoop(Loop *L, LPPassManager &) override { - Out << Banner; - for (Loop::block_iterator b = L->block_begin(), be = L->block_end(); - b != be; - ++b) { - if (*b) - (*b)->print(Out); - else - Out << "Printing <null> block"; - } + auto BBI = find_if(L->blocks().begin(), L->blocks().end(), + [](BasicBlock *BB) { return BB; }); + if (BBI != L->blocks().end() && + isFunctionInPrintList((*BBI)->getParent()->getName())) + P.run(*L); return false; } }; -char PrintLoopPass::ID = 0; +char PrintLoopPassWrapper::ID = 0; } //===----------------------------------------------------------------------===// @@ -66,81 +62,34 @@ char LPPassManager::ID = 0; LPPassManager::LPPassManager() : FunctionPass(ID), PMDataManager() { - skipThisLoop = false; - redoThisLoop = false; LI = nullptr; CurrentLoop = nullptr; } -/// Delete loop from the loop queue and loop hierarchy (LoopInfo). -void LPPassManager::deleteLoopFromQueue(Loop *L) { - - LI->updateUnloop(L); - - // Notify passes that the loop is being deleted. - deleteSimpleAnalysisLoop(L); - - // If L is current loop then skip rest of the passes and let - // runOnFunction remove L from LQ. Otherwise, remove L from LQ now - // and continue applying other passes on CurrentLoop. 
- if (CurrentLoop == L) - skipThisLoop = true; - - delete L; - - if (skipThisLoop) - return; - - for (std::deque<Loop *>::iterator I = LQ.begin(), - E = LQ.end(); I != E; ++I) { - if (*I == L) { - LQ.erase(I); - break; - } - } -} - // Inset loop into loop nest (LoopInfo) and loop queue (LQ). -void LPPassManager::insertLoop(Loop *L, Loop *ParentLoop) { - - assert (CurrentLoop != L && "Cannot insert CurrentLoop"); +Loop &LPPassManager::addLoop(Loop *ParentLoop) { + // Create a new loop. LI will take ownership. + Loop *L = new Loop(); - // Insert into loop nest - if (ParentLoop) - ParentLoop->addChildLoop(L); - else + // Insert into the loop nest and the loop queue. + if (!ParentLoop) { + // This is the top level loop. LI->addTopLevelLoop(L); - - insertLoopIntoQueue(L); -} - -void LPPassManager::insertLoopIntoQueue(Loop *L) { - // Insert L into loop queue - if (L == CurrentLoop) - redoLoop(L); - else if (!L->getParentLoop()) - // This is top level loop. LQ.push_front(L); - else { - // Insert L after the parent loop. - for (std::deque<Loop *>::iterator I = LQ.begin(), - E = LQ.end(); I != E; ++I) { - if (*I == L->getParentLoop()) { - // deque does not support insert after. - ++I; - LQ.insert(I, 1, L); - break; - } - } + return *L; } -} -// Reoptimize this loop. LPPassManager will re-insert this loop into the -// queue. This allows LoopPass to change loop nest for the loop. This -// utility may send LPPassManager into infinite loops so use caution. -void LPPassManager::redoLoop(Loop *L) { - assert (CurrentLoop == L && "Can redo only CurrentLoop"); - redoThisLoop = true; + ParentLoop->addChildLoop(L); + // Insert L into the loop queue after the parent loop. + for (auto I = LQ.begin(), E = LQ.end(); I != E; ++I) { + if (*I == L->getParentLoop()) { + // deque does not support insert after. + ++I; + LQ.insert(I, 1, L); + break; + } + } + return *L; } /// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for @@ -229,10 +178,8 @@ bool LPPassManager::runOnFunction(Function &F) { // Walk Loops while (!LQ.empty()) { - - CurrentLoop = LQ.back(); - skipThisLoop = false; - redoThisLoop = false; + bool LoopWasDeleted = false; + CurrentLoop = LQ.back(); // Run all passes on the current Loop. for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { @@ -250,14 +197,18 @@ bool LPPassManager::runOnFunction(Function &F) { Changed |= P->runOnLoop(CurrentLoop, *this); } + LoopWasDeleted = CurrentLoop->isInvalid(); if (Changed) dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG, - skipThisLoop ? "<deleted>" : - CurrentLoop->getHeader()->getName()); + LoopWasDeleted ? "<deleted>" + : CurrentLoop->getHeader()->getName()); dumpPreservedSet(P); - if (!skipThisLoop) { + if (LoopWasDeleted) { + // Notify passes that the loop is being deleted. + deleteSimpleAnalysisLoop(CurrentLoop); + } else { // Manually check that this loop is still healthy. This is done // instead of relying on LoopInfo::verifyLoop since LoopInfo // is a function pass and it's really expensive to verify every @@ -276,12 +227,11 @@ bool LPPassManager::runOnFunction(Function &F) { removeNotPreservedAnalysis(P); recordAvailableAnalysis(P); - removeDeadPasses(P, - skipThisLoop ? "<deleted>" : - CurrentLoop->getHeader()->getName(), + removeDeadPasses(P, LoopWasDeleted ? "<deleted>" + : CurrentLoop->getHeader()->getName(), ON_LOOP_MSG); - if (skipThisLoop) + if (LoopWasDeleted) // Do not run other passes on this loop. 
break; } @@ -289,17 +239,15 @@ bool LPPassManager::runOnFunction(Function &F) { // If the loop was deleted, release all the loop passes. This frees up // some memory, and avoids trouble with the pass manager trying to call // verifyAnalysis on them. - if (skipThisLoop) + if (LoopWasDeleted) { for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { Pass *P = getContainedPass(Index); freePass(P, "<deleted>", ON_LOOP_MSG); } + } // Pop the loop from queue after running all passes. LQ.pop_back(); - - if (redoThisLoop) - LQ.push_back(CurrentLoop); } // Finalization @@ -327,7 +275,7 @@ void LPPassManager::dumpPassStructure(unsigned Offset) { Pass *LoopPass::createPrinterPass(raw_ostream &O, const std::string &Banner) const { - return new PrintLoopPass(Banner, O); + return new PrintLoopPassWrapper(O, Banner); } // Check if this pass is suitable for the current LPPassManager, if diff --git a/contrib/llvm/lib/Analysis/MemDepPrinter.cpp b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp index da3b829..078cefe 100644 --- a/contrib/llvm/lib/Analysis/MemDepPrinter.cpp +++ b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp @@ -49,7 +49,7 @@ namespace { void print(raw_ostream &OS, const Module * = nullptr) const override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<AliasAnalysis>(); + AU.addRequiredTransitive<AAResultsWrapperPass>(); AU.addRequiredTransitive<MemoryDependenceAnalysis>(); AU.setPreservesAll(); } @@ -96,7 +96,7 @@ bool MemDepPrinter::runOnFunction(Function &F) { // All this code uses non-const interfaces because MemDep is not // const-friendly, though nothing is actually modified. - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { Instruction *Inst = &I; if (!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory()) @@ -135,7 +135,7 @@ bool MemDepPrinter::runOnFunction(Function &F) { } void MemDepPrinter::print(raw_ostream &OS, const Module *M) const { - for (const auto &I : inst_range(*F)) { + for (const auto &I : instructions(*F)) { const Instruction *Inst = &I; DepSetMap::const_iterator DI = Deps.find(Inst); diff --git a/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp b/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp index fa292a2..36f1424 100644 --- a/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp +++ b/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp @@ -22,7 +22,8 @@ using namespace llvm; namespace { struct MemDerefPrinter : public FunctionPass { - SmallVector<Value *, 4> Vec; + SmallVector<Value *, 4> Deref; + SmallPtrSet<Value *, 4> DerefAndAligned; static char ID; // Pass identification, replacement for typeid MemDerefPrinter() : FunctionPass(ID) { @@ -34,7 +35,8 @@ namespace { bool runOnFunction(Function &F) override; void print(raw_ostream &OS, const Module * = nullptr) const override; void releaseMemory() override { - Vec.clear(); + Deref.clear(); + DerefAndAligned.clear(); } }; } @@ -51,11 +53,13 @@ FunctionPass *llvm::createMemDerefPrinter() { bool MemDerefPrinter::runOnFunction(Function &F) { const DataLayout &DL = F.getParent()->getDataLayout(); - for (auto &I: inst_range(F)) { + for (auto &I: instructions(F)) { if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { Value *PO = LI->getPointerOperand(); if (isDereferenceablePointer(PO, DL)) - Vec.push_back(PO); + Deref.push_back(PO); + if (isDereferenceableAndAlignedPointer(PO, LI->getAlignment(), DL)) + DerefAndAligned.insert(PO); } } return false; @@ -63,8 +67,12 @@ bool MemDerefPrinter::runOnFunction(Function &F) { void MemDerefPrinter::print(raw_ostream &OS, const Module *M) 
const { OS << "The following are dereferenceable:\n"; - for (auto &V: Vec) { + for (Value *V: Deref) { V->print(OS); + if (DerefAndAligned.count(V)) + OS << "\t(aligned)"; + else + OS << "\t(unaligned)"; OS << "\n\n"; } } diff --git a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp index 8ddac8f..9e896ae 100644 --- a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -31,7 +31,7 @@ using namespace llvm; #define DEBUG_TYPE "memory-builtins" -enum AllocType { +enum AllocType : uint8_t { OpNewLike = 1<<0, // allocates; never returns null MallocLike = 1<<1 | OpNewLike, // allocates; may return null CallocLike = 1<<2, // allocates + bzero @@ -62,6 +62,14 @@ static const AllocFnsTy AllocationFnData[] = { {LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) {LibFunc::Znam, OpNewLike, 1, 0, -1}, // new[](unsigned long) {LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow) + {LibFunc::msvc_new_int, OpNewLike, 1, 0, -1}, // new(unsigned int) + {LibFunc::msvc_new_int_nothrow, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow) + {LibFunc::msvc_new_longlong, OpNewLike, 1, 0, -1}, // new(unsigned long long) + {LibFunc::msvc_new_longlong_nothrow, MallocLike, 2, 0, -1}, // new(unsigned long long, nothrow) + {LibFunc::msvc_new_array_int, OpNewLike, 1, 0, -1}, // new[](unsigned int) + {LibFunc::msvc_new_array_int_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow) + {LibFunc::msvc_new_array_longlong, OpNewLike, 1, 0, -1}, // new[](unsigned long long) + {LibFunc::msvc_new_array_longlong_nothrow, MallocLike, 2, 0, -1}, // new[](unsigned long long, nothrow) {LibFunc::calloc, CallocLike, 2, 0, 1}, {LibFunc::realloc, ReallocLike, 2, 1, -1}, {LibFunc::reallocf, ReallocLike, 2, 1, -1}, @@ -107,18 +115,13 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy, if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn)) return nullptr; - unsigned i = 0; - bool found = false; - for ( ; i < array_lengthof(AllocationFnData); ++i) { - if (AllocationFnData[i].Func == TLIFn) { - found = true; - break; - } - } - if (!found) + const AllocFnsTy *FnData = + std::find_if(std::begin(AllocationFnData), std::end(AllocationFnData), + [TLIFn](const AllocFnsTy &Fn) { return Fn.Func == TLIFn; }); + + if (FnData == std::end(AllocationFnData)) return nullptr; - const AllocFnsTy *FnData = &AllocationFnData[i]; if ((FnData->AllocTy & AllocTy) != FnData->AllocTy) return nullptr; @@ -184,20 +187,6 @@ bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI, return getAllocationData(V, AllocLike, TLI, LookThroughBitCast); } -/// \brief Tests if a value is a call or invoke to a library function that -/// reallocates memory (such as realloc). -bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast); -} - -/// \brief Tests if a value is a call or invoke to a library function that -/// allocates memory and never returns null (such as operator new). -bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI, - bool LookThroughBitCast) { - return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast); -} - /// extractMallocCall - Returns the corresponding CallInst if the instruction /// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we /// ignore InvokeInst here. 
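The getAllocationData rewrite above swaps a manual index-and-found-flag scan for std::find_if over the static descriptor table, with std::end() as the not-found sentinel. A minimal standalone sketch of the same idiom follows, under the assumption of hypothetical LibFn/FnDesc/Table types standing in for LibFunc, AllocFnsTy and AllocationFnData.

#include <algorithm>
#include <cstdio>
#include <iterator>

enum class LibFn { Malloc, Calloc, Realloc };

struct FnDesc {
  LibFn Func;         // which library function this entry describes
  unsigned NumParams; // arity expected for a matching call
};

static const FnDesc Table[] = {
    {LibFn::Malloc, 1}, {LibFn::Calloc, 2}, {LibFn::Realloc, 2}};

// Returns the table entry for F, or nullptr if F is not in the table.
static const FnDesc *lookup(LibFn F) {
  const FnDesc *It =
      std::find_if(std::begin(Table), std::end(Table),
                   [F](const FnDesc &D) { return D.Func == F; });
  return It == std::end(Table) ? nullptr : It;
}

int main() {
  if (const FnDesc *D = lookup(LibFn::Calloc))
    std::printf("calloc-like, %u params\n", D->NumParams);
}

The iterator form drops the mutable index and the bool flag; a single comparison against std::end(Table) replaces both.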
@@ -313,14 +302,26 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) { unsigned ExpectedNumParams; if (TLIFn == LibFunc::free || TLIFn == LibFunc::ZdlPv || // operator delete(void*) - TLIFn == LibFunc::ZdaPv) // operator delete[](void*) + TLIFn == LibFunc::ZdaPv || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_ptr32 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_ptr64 || // operator delete(void*) + TLIFn == LibFunc::msvc_delete_array_ptr32 || // operator delete[](void*) + TLIFn == LibFunc::msvc_delete_array_ptr64) // operator delete[](void*) ExpectedNumParams = 1; else if (TLIFn == LibFunc::ZdlPvj || // delete(void*, uint) TLIFn == LibFunc::ZdlPvm || // delete(void*, ulong) TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow) TLIFn == LibFunc::ZdaPvj || // delete[](void*, uint) TLIFn == LibFunc::ZdaPvm || // delete[](void*, ulong) - TLIFn == LibFunc::ZdaPvRKSt9nothrow_t) // delete[](void*, nothrow) + TLIFn == LibFunc::ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr32_int || // delete(void*, uint) + TLIFn == LibFunc::msvc_delete_ptr64_longlong || // delete(void*, ulonglong) + TLIFn == LibFunc::msvc_delete_ptr32_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_ptr64_nothrow || // delete(void*, nothrow) + TLIFn == LibFunc::msvc_delete_array_ptr32_int || // delete[](void*, uint) + TLIFn == LibFunc::msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong) + TLIFn == LibFunc::msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow) + TLIFn == LibFunc::msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow) ExpectedNumParams = 2; else return nullptr; @@ -621,7 +622,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) { // always generate code immediately before the instruction being // processed, so that the generated code dominates the same BBs - Instruction *PrevInsertPoint = Builder.GetInsertPoint(); + BuilderTy::InsertPointGuard Guard(Builder); if (Instruction *I = dyn_cast<Instruction>(V)) Builder.SetInsertPoint(I); @@ -650,9 +651,6 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) { Result = unknown(); } - if (PrevInsertPoint) - Builder.SetInsertPoint(PrevInsertPoint); - // Don't reuse CacheIt since it may be invalid at this point. 
CacheMap[V] = Result; return Result; @@ -742,7 +740,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) { // compute offset/size for each PHI incoming pointer for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) { - Builder.SetInsertPoint(PHI.getIncomingBlock(i)->getFirstInsertionPt()); + Builder.SetInsertPoint(&*PHI.getIncomingBlock(i)->getFirstInsertionPt()); SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i)); if (!bothKnown(EdgeData)) { diff --git a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 782a67b..6918360 100644 --- a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -22,7 +22,9 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -49,7 +51,11 @@ STATISTIC(NumCacheCompleteNonLocalPtr, "Number of block queries that were completely cached"); // Limit for the number of instructions to scan in a block. -static const unsigned int BlockScanLimit = 100; + +static cl::opt<unsigned> BlockScanLimit( + "memdep-block-scan-limit", cl::Hidden, cl::init(100), + cl::desc("The number of instructions to scan in a block in memory " + "dependency analysis (default = 100)")); // Limit on the number of memdep results to process. static const unsigned int NumResultsLimit = 100; @@ -60,7 +66,8 @@ char MemoryDependenceAnalysis::ID = 0; INITIALIZE_PASS_BEGIN(MemoryDependenceAnalysis, "memdep", "Memory Dependence Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(MemoryDependenceAnalysis, "memdep", "Memory Dependence Analysis", false, true) @@ -87,15 +94,17 @@ void MemoryDependenceAnalysis::releaseMemory() { void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired<AssumptionCacheTracker>(); - AU.addRequiredTransitive<AliasAnalysis>(); + AU.addRequiredTransitive<AAResultsWrapperPass>(); + AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>(); } bool MemoryDependenceAnalysis::runOnFunction(Function &F) { - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); return false; } @@ -118,43 +127,43 @@ static void RemoveFromReverseMap(DenseMap<Instruction*, /// location, fill in Loc with the details, otherwise set Loc.Ptr to null. /// Return a ModRefInfo value describing the general behavior of the /// instruction. 
-static AliasAnalysis::ModRefResult -GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { +static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc, + const TargetLibraryInfo &TLI) { if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { if (LI->isUnordered()) { Loc = MemoryLocation::get(LI); - return AliasAnalysis::Ref; + return MRI_Ref; } if (LI->getOrdering() == Monotonic) { Loc = MemoryLocation::get(LI); - return AliasAnalysis::ModRef; + return MRI_ModRef; } Loc = MemoryLocation(); - return AliasAnalysis::ModRef; + return MRI_ModRef; } if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (SI->isUnordered()) { Loc = MemoryLocation::get(SI); - return AliasAnalysis::Mod; + return MRI_Mod; } if (SI->getOrdering() == Monotonic) { Loc = MemoryLocation::get(SI); - return AliasAnalysis::ModRef; + return MRI_ModRef; } Loc = MemoryLocation(); - return AliasAnalysis::ModRef; + return MRI_ModRef; } if (const VAArgInst *V = dyn_cast<VAArgInst>(Inst)) { Loc = MemoryLocation::get(V); - return AliasAnalysis::ModRef; + return MRI_ModRef; } - if (const CallInst *CI = isFreeCall(Inst, AA->getTargetLibraryInfo())) { + if (const CallInst *CI = isFreeCall(Inst, &TLI)) { // calls to free() deallocate the entire structure Loc = MemoryLocation(CI->getArgOperand(0)); - return AliasAnalysis::Mod; + return MRI_Mod; } if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { @@ -170,7 +179,7 @@ GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(), AAInfo); // These intrinsics don't really modify the memory, but returning Mod // will allow them to be handled conservatively. - return AliasAnalysis::Mod; + return MRI_Mod; case Intrinsic::invariant_end: II->getAAMetadata(AAInfo); Loc = MemoryLocation( @@ -178,7 +187,7 @@ GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { cast<ConstantInt>(II->getArgOperand(1))->getZExtValue(), AAInfo); // These intrinsics don't really modify the memory, but returning Mod // will allow them to be handled conservatively. - return AliasAnalysis::Mod; + return MRI_Mod; default: break; } @@ -186,10 +195,10 @@ GetLocation(const Instruction *Inst, MemoryLocation &Loc, AliasAnalysis *AA) { // Otherwise, just do the coarse-grained thing that always works. if (Inst->mayWriteToMemory()) - return AliasAnalysis::ModRef; + return MRI_ModRef; if (Inst->mayReadFromMemory()) - return AliasAnalysis::Ref; - return AliasAnalysis::NoModRef; + return MRI_Ref; + return MRI_NoModRef; } /// getCallSiteDependencyFrom - Private helper for finding the local @@ -207,14 +216,14 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, if (!Limit) return MemDepResult::getUnknown(); - Instruction *Inst = --ScanIt; + Instruction *Inst = &*--ScanIt; // If this inst is a memory op, get the pointer it accessed MemoryLocation Loc; - AliasAnalysis::ModRefResult MR = GetLocation(Inst, Loc, AA); + ModRefInfo MR = GetLocation(Inst, Loc, *TLI); if (Loc.Ptr) { // A simple instruction. - if (AA->getModRefInfo(CS, Loc) != AliasAnalysis::NoModRef) + if (AA->getModRefInfo(CS, Loc) != MRI_NoModRef) return MemDepResult::getClobber(Inst); continue; } @@ -224,10 +233,10 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, if (isa<DbgInfoIntrinsic>(Inst)) continue; // If these two calls do not interfere, look past it. 
switch (AA->getModRefInfo(CS, InstCS)) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: // If the two calls are the same, return InstCS as a Def, so that // CS can be found redundant and eliminated. - if (isReadOnlyCall && !(MR & AliasAnalysis::Mod) && + if (isReadOnlyCall && !(MR & MRI_Mod) && CS.getInstruction()->isIdenticalToWhenDefined(Inst)) return MemDepResult::getDef(Inst); @@ -241,7 +250,7 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall, // If we could not obtain a pointer for the instruction and the instruction // touches memory then assume that this is a dependency. - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) return MemDepResult::getClobber(Inst); } @@ -371,6 +380,75 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst) { + if (QueryInst != nullptr) { + if (auto *LI = dyn_cast<LoadInst>(QueryInst)) { + MemDepResult invariantGroupDependency = + getInvariantGroupPointerDependency(LI, BB); + + if (invariantGroupDependency.isDef()) + return invariantGroupDependency; + } + } + return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst); +} + +MemDepResult +MemoryDependenceAnalysis::getInvariantGroupPointerDependency(LoadInst *LI, + BasicBlock *BB) { + Value *LoadOperand = LI->getPointerOperand(); + // It's is not safe to walk the use list of global value, because function + // passes aren't allowed to look outside their functions. + if (isa<GlobalValue>(LoadOperand)) + return MemDepResult::getUnknown(); + + auto *InvariantGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group); + if (!InvariantGroupMD) + return MemDepResult::getUnknown(); + + MemDepResult Result = MemDepResult::getUnknown(); + llvm::SmallSet<Value *, 14> Seen; + // Queue to process all pointers that are equivalent to load operand. + llvm::SmallVector<Value *, 8> LoadOperandsQueue; + LoadOperandsQueue.push_back(LoadOperand); + while (!LoadOperandsQueue.empty()) { + Value *Ptr = LoadOperandsQueue.pop_back_val(); + if (isa<GlobalValue>(Ptr)) + continue; + + if (auto *BCI = dyn_cast<BitCastInst>(Ptr)) { + if (!Seen.count(BCI->getOperand(0))) { + LoadOperandsQueue.push_back(BCI->getOperand(0)); + Seen.insert(BCI->getOperand(0)); + } + } + + for (Use &Us : Ptr->uses()) { + auto *U = dyn_cast<Instruction>(Us.getUser()); + if (!U || U == LI || !DT->dominates(U, LI)) + continue; + + if (auto *BCI = dyn_cast<BitCastInst>(U)) { + if (!Seen.count(BCI)) { + LoadOperandsQueue.push_back(BCI); + Seen.insert(BCI); + } + continue; + } + // If we hit load/store with the same invariant.group metadata (and the + // same pointer operand) we can assume that value pointed by pointer + // operand didn't change. + if ((isa<LoadInst>(U) || isa<StoreInst>(U)) && U->getParent() == BB && + U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD) + return MemDepResult::getDef(U); + } + } + return Result; +} + +MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom( + const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, + BasicBlock *BB, Instruction *QueryInst) { + const Value *MemLocBase = nullptr; int64_t MemLocOffset = 0; unsigned Limit = BlockScanLimit; @@ -399,7 +477,7 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( // being 42. 
A key property of this program however is that if either // 1 or 4 were missing, there would be a race between the store of 42 // either the store of 0 or the load (making the whole progam racy). - // The paper mentionned above shows that the same property is respected + // The paper mentioned above shows that the same property is respected // by every program that can detect any optimisation of that kind: either // it is racy (undefined) or there is a release followed by an acquire // between the pair of accesses under consideration. @@ -416,9 +494,15 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( const DataLayout &DL = BB->getModule()->getDataLayout(); + // Create a numbered basic block to lazily compute and cache instruction + // positions inside a BB. This is used to provide fast queries for relative + // position between two instructions in a BB and can be used by + // AliasAnalysis::callCapturesBefore. + OrderedBasicBlock OBB(BB); + // Walk backwards through the basic block, looking for dependencies. while (ScanIt != BB->begin()) { - Instruction *Inst = --ScanIt; + Instruction *Inst = &*--ScanIt; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) // Debug intrinsics don't (and can't) cause dependencies. @@ -567,7 +651,7 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( // If alias analysis can tell that this store is guaranteed to not modify // the query pointer, ignore it. Use getModRefInfo to handle cases where // the query pointer points to constant memory etc. - if (AA->getModRefInfo(SI, MemLoc) == AliasAnalysis::NoModRef) + if (AA->getModRefInfo(SI, MemLoc) == MRI_NoModRef) continue; // Ok, this store might clobber the query pointer. Check to see if it is @@ -594,7 +678,6 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( // a subsequent bitcast of the malloc call result. There can be stores to // the malloced memory between the malloc call and its bitcast uses, and we // need to continue scanning until the malloc call. - const TargetLibraryInfo *TLI = AA->getTargetLibraryInfo(); if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, TLI)) { const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, DL); @@ -602,13 +685,13 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( return MemDepResult::getDef(Inst); if (isInvariantLoad) continue; - // Be conservative if the accessed pointer may alias the allocation. - if (AA->alias(Inst, AccessPtr) != NoAlias) - return MemDepResult::getClobber(Inst); - // If the allocation is not aliased and does not read memory (like - // strdup), it is safe to ignore. - if (isa<AllocaInst>(Inst) || - isMallocLikeFn(Inst, TLI) || isCallocLikeFn(Inst, TLI)) + // Be conservative if the accessed pointer may alias the allocation - + // fallback to the generic handling below. + if ((AA->alias(Inst, AccessPtr) == NoAlias) && + // If the allocation is not aliased and does not read memory (like + // strdup), it is safe to ignore. + (isa<AllocaInst>(Inst) || isMallocLikeFn(Inst, TLI) || + isCallocLikeFn(Inst, TLI))) continue; } @@ -616,17 +699,17 @@ MemDepResult MemoryDependenceAnalysis::getPointerDependencyFrom( continue; // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer. - AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc); + ModRefInfo MR = AA->getModRefInfo(Inst, MemLoc); // If necessary, perform additional analysis. 
- if (MR == AliasAnalysis::ModRef) - MR = AA->callCapturesBefore(Inst, MemLoc, DT); + if (MR == MRI_ModRef) + MR = AA->callCapturesBefore(Inst, MemLoc, DT, &OBB); switch (MR) { - case AliasAnalysis::NoModRef: + case MRI_NoModRef: // If the call has no effect on the queried pointer, just ignore it. continue; - case AliasAnalysis::Mod: + case MRI_Mod: return MemDepResult::getClobber(Inst); - case AliasAnalysis::Ref: + case MRI_Ref: // If the call is known to never store to the pointer, and if this is a // load query, we can safely ignore it (scan past it). if (isLoad) @@ -677,20 +760,20 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { LocalCache = MemDepResult::getNonFuncLocal(); } else { MemoryLocation MemLoc; - AliasAnalysis::ModRefResult MR = GetLocation(QueryInst, MemLoc, AA); + ModRefInfo MR = GetLocation(QueryInst, MemLoc, *TLI); if (MemLoc.Ptr) { // If we can do a pointer scan, make it happen. - bool isLoad = !(MR & AliasAnalysis::Mod); + bool isLoad = !(MR & MRI_Mod); if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst)) isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_start; - LocalCache = getPointerDependencyFrom(MemLoc, isLoad, ScanPos, - QueryParent, QueryInst); + LocalCache = getPointerDependencyFrom( + MemLoc, isLoad, ScanPos->getIterator(), QueryParent, QueryInst); } else if (isa<CallInst>(QueryInst) || isa<InvokeInst>(QueryInst)) { CallSite QueryCS(QueryInst); bool isReadOnly = AA->onlyReadsMemory(QueryCS); - LocalCache = getCallSiteDependencyFrom(QueryCS, isReadOnly, ScanPos, - QueryParent); + LocalCache = getCallSiteDependencyFrom( + QueryCS, isReadOnly, ScanPos->getIterator(), QueryParent); } else // Non-memory instruction. LocalCache = MemDepResult::getUnknown(); @@ -709,10 +792,8 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) { static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache, int Count = -1) { if (Count == -1) Count = Cache.size(); - if (Count == 0) return; - - for (unsigned i = 1; i != unsigned(Count); ++i) - assert(!(Cache[i] < Cache[i-1]) && "Cache isn't sorted!"); + assert(std::is_sorted(Cache.begin(), Cache.begin() + Count) && + "Cache isn't sorted!"); } #endif @@ -813,7 +894,7 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) { BasicBlock::iterator ScanPos = DirtyBB->end(); if (ExistingResult) { if (Instruction *Inst = ExistingResult->getResult().getInst()) { - ScanPos = Inst; + ScanPos = Inst->getIterator(); // We're removing QueryInst's use of Inst. RemoveFromReverseMap(ReverseNonLocalDeps, Inst, QueryCS.getInstruction()); @@ -952,11 +1033,11 @@ MemDepResult MemoryDependenceAnalysis::GetNonLocalInfoForBlock( assert(ExistingResult->getResult().getInst()->getParent() == BB && "Instruction invalidated?"); ++NumCacheDirtyNonLocalPtr; - ScanPos = ExistingResult->getResult().getInst(); + ScanPos = ExistingResult->getResult().getInst()->getIterator(); // Eliminating the dirty entry from 'Cache', so update the reverse info. ValueIsLoadPair CacheKey(Loc.Ptr, isLoad); - RemoveFromReverseMap(ReverseNonLocalPtrDeps, ScanPos, CacheKey); + RemoveFromReverseMap(ReverseNonLocalPtrDeps, &*ScanPos, CacheKey); } else { ++NumUncacheNonLocalPtr; } @@ -1507,7 +1588,7 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) { // the entire block to get to this point. 
MemDepResult NewDirtyVal; if (!RemInst->isTerminator()) - NewDirtyVal = MemDepResult::getDirty(++BasicBlock::iterator(RemInst)); + NewDirtyVal = MemDepResult::getDirty(&*++RemInst->getIterator()); ReverseDepMapType::iterator ReverseDepIt = ReverseLocalDeps.find(RemInst); if (ReverseDepIt != ReverseLocalDeps.end()) { @@ -1614,7 +1695,6 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) { assert(!NonLocalDeps.count(RemInst) && "RemInst got reinserted?"); - AA->deleteValue(RemInst); DEBUG(verifyRemoved(RemInst)); } /// verifyRemoved - Verify that the specified instruction does not occur diff --git a/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp deleted file mode 100644 index 322a9a8..0000000 --- a/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp +++ /dev/null @@ -1,95 +0,0 @@ -//===- NoAliasAnalysis.cpp - Minimal Alias Analysis Impl ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the default implementation of the Alias Analysis interface -// that simply returns "I don't know" for all queries. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -using namespace llvm; - -namespace { - /// NoAA - This class implements the -no-aa pass, which always returns "I - /// don't know" for alias queries. NoAA is unlike other alias analysis - /// implementations, in that it does not chain to a previous analysis. As - /// such it doesn't follow many of the rules that other alias analyses must. - /// - struct NoAA : public ImmutablePass, public AliasAnalysis { - static char ID; // Class identification, replacement for typeinfo - NoAA() : ImmutablePass(ID) { - initializeNoAAPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override {} - - bool doInitialization(Module &M) override { - // Note: NoAA does not call InitializeAliasAnalysis because it's - // special and does not support chaining. - DL = &M.getDataLayout(); - return true; - } - - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override { - return MayAlias; - } - - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override { - return UnknownModRefBehavior; - } - ModRefBehavior getModRefBehavior(const Function *F) override { - return UnknownModRefBehavior; - } - - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override { - return false; - } - ModRefResult getArgModRefInfo(ImmutableCallSite CS, - unsigned ArgIdx) override { - return ModRef; - } - - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override { - return ModRef; - } - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override { - return ModRef; - } - - void deleteValue(Value *V) override {} - void addEscapingUse(Use &U) override {} - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - }; -} // End of anonymous namespace - -// Register this pass... -char NoAA::ID = 0; -INITIALIZE_AG_PASS(NoAA, AliasAnalysis, "no-aa", - "No Alias Analysis (always returns 'may' alias)", - true, true, true) - -ImmutablePass *llvm::createNoAAPass() { return new NoAA(); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp index 3893aab..25f660f 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp @@ -18,66 +18,46 @@ /// used. Naive LLVM IR transformations which would otherwise be /// behavior-preserving may break these assumptions. /// +/// TODO: Theoretically we could check for dependencies between objc_* calls +/// and FMRB_OnlyAccessesArgumentPointees calls or other well-behaved calls. +/// //===----------------------------------------------------------------------===// -#include "ObjCARC.h" -#include "ObjCARCAliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" #include "llvm/PassAnalysisSupport.h" #include "llvm/PassSupport.h" #define DEBUG_TYPE "objc-arc-aa" -namespace llvm { - class Function; - class Value; -} - using namespace llvm; using namespace llvm::objcarc; -// Register this pass... -char ObjCARCAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", - "ObjC-ARC-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { - return new ObjCARCAliasAnalysis(); -} - -bool ObjCARCAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; -} - -void -ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); -} - -AliasResult ObjCARCAliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +AliasResult ObjCARCAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { if (!EnableARCOpts) - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); // First, strip off no-ops, including ObjC-specific no-ops, and try making a // precise alias query. const Value *SA = GetRCIdentityRoot(LocA.Ptr); const Value *SB = GetRCIdentityRoot(LocB.Ptr); AliasResult Result = - AliasAnalysis::alias(MemoryLocation(SA, LocA.Size, LocA.AATags), - MemoryLocation(SB, LocB.Size, LocB.AATags)); + AAResultBase::alias(MemoryLocation(SA, LocA.Size, LocA.AATags), + MemoryLocation(SB, LocB.Size, LocB.AATags)); if (Result != MayAlias) return Result; // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. 
- const Value *UA = GetUnderlyingObjCPtr(SA, *DL); - const Value *UB = GetUnderlyingObjCPtr(SB, *DL); + const Value *UA = GetUnderlyingObjCPtr(SA, DL); + const Value *UB = GetUnderlyingObjCPtr(SB, DL); if (UA != SA || UB != SB) { - Result = AliasAnalysis::alias(MemoryLocation(UA), MemoryLocation(UB)); + Result = AAResultBase::alias(MemoryLocation(UA), MemoryLocation(UB)); // We can't use MustAlias or PartialAlias results here because // GetUnderlyingObjCPtr may return an offsetted pointer value. if (Result == NoAlias) @@ -89,55 +69,47 @@ AliasResult ObjCARCAliasAnalysis::alias(const MemoryLocation &LocA, return MayAlias; } -bool ObjCARCAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { +bool ObjCARCAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { if (!EnableARCOpts) - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); // First, strip off no-ops, including ObjC-specific no-ops, and try making // a precise alias query. const Value *S = GetRCIdentityRoot(Loc.Ptr); - if (AliasAnalysis::pointsToConstantMemory( + if (AAResultBase::pointsToConstantMemory( MemoryLocation(S, Loc.Size, Loc.AATags), OrLocal)) return true; // If that failed, climb to the underlying object, including climbing through // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *U = GetUnderlyingObjCPtr(S, *DL); + const Value *U = GetUnderlyingObjCPtr(S, DL); if (U != S) - return AliasAnalysis::pointsToConstantMemory(MemoryLocation(U), OrLocal); + return AAResultBase::pointsToConstantMemory(MemoryLocation(U), OrLocal); // If that failed, fail. We don't need to chain here, since that's covered // by the earlier precise query. return false; } -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { - // We have nothing to do. Just chain to the next AliasAnalysis. - return AliasAnalysis::getModRefBehavior(CS); -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { +FunctionModRefBehavior ObjCARCAAResult::getModRefBehavior(const Function *F) { if (!EnableARCOpts) - return AliasAnalysis::getModRefBehavior(F); + return AAResultBase::getModRefBehavior(F); switch (GetFunctionClass(F)) { case ARCInstKind::NoopCast: - return DoesNotAccessMemory; + return FMRB_DoesNotAccessMemory; default: break; } - return AliasAnalysis::getModRefBehavior(F); + return AAResultBase::getModRefBehavior(F); } -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { +ModRefInfo ObjCARCAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { if (!EnableARCOpts) - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); switch (GetBasicARCInstKind(CS.getInstruction())) { case ARCInstKind::Retain: @@ -151,18 +123,48 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, // These functions don't access any memory visible to the compiler. // Note that this doesn't include objc_retainBlock, because it updates // pointers when it copies block data. 
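// [Illustrative aside, not part of this commit] alias() above is a two-phase
// refinement: query precisely on pointers stripped to their RC-identity
// roots first; only if that yields MayAlias, climb to the underlying objects
// and retry, keeping nothing but a NoAlias answer (the underlying object may
// be offset from the queried pointer, so Must/PartialAlias no longer hold).
// Schematic sketch with hypothetical strip/underlying/query callables:
enum AliasKind { NoAliasK, MayAliasK, PartialAliasK, MustAliasK };
template <typename Strip, typename Under, typename Query>
AliasKind twoPhaseAlias(const void *A, const void *B, Strip strip,
                        Under underlying, Query query) {
  AliasKind R = query(strip(A), strip(B)); // precise: size/metadata apply
  if (R != MayAliasK)
    return R;
  R = query(underlying(A), underlying(B)); // imprecise: offsets unknown
  return R == NoAliasK ? NoAliasK : MayAliasK; // trust only the safe answer
}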
- return NoModRef; + return MRI_NoModRef; default: break; } - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); +} + +ObjCARCAAResult ObjCARCAA::run(Function &F, AnalysisManager<Function> *AM) { + return ObjCARCAAResult(F.getParent()->getDataLayout(), + AM->getResult<TargetLibraryAnalysis>(F)); +} + +char ObjCARCAA::PassID; + +char ObjCARCAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCAAWrapperPass, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ObjCARCAAWrapperPass, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true) + +ImmutablePass *llvm::createObjCARCAAWrapperPass() { + return new ObjCARCAAWrapperPass(); +} + +ObjCARCAAWrapperPass::ObjCARCAAWrapperPass() : ImmutablePass(ID) { + initializeObjCARCAAWrapperPassPass(*PassRegistry::getPassRegistry()); } -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { - // TODO: Theoretically we could check for dependencies between objc_* calls - // and OnlyAccessesArgumentPointees calls or other well-behaved calls. - return AliasAnalysis::getModRefInfo(CS1, CS2); +bool ObjCARCAAWrapperPass::doInitialization(Module &M) { + Result.reset(new ObjCARCAAResult( + M.getDataLayout(), getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; +} + +bool ObjCARCAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void ObjCARCAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } diff --git a/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp b/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp new file mode 100644 index 0000000..e3e74aa --- /dev/null +++ b/contrib/llvm/lib/Analysis/ObjCARCAnalysisUtils.cpp @@ -0,0 +1,28 @@ +//===- ObjCARCAnalysisUtils.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMObjCARCOpts.a, which +// implements several scalar transformations over the LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; +using namespace llvm::objcarc; + +/// \brief A handy option to enable/disable all ARC Optimizations. 
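// [Illustrative aside, not part of this commit] The option below uses
// cl::opt's external-storage form: the second template argument 'true' says
// the parsed value is written through cl::location() into an ordinary
// global, so hot paths can test a plain bool without pulling in any command
// line machinery. The same pattern with a hypothetical flag:
#include "llvm/Support/CommandLine.h"
static bool MyFeatureEnabled; // plain global; cheap to read anywhere
static llvm::cl::opt<bool, true> MyFeatureFlag(
    "enable-my-feature", llvm::cl::desc("toggle the hypothetical feature"),
    llvm::cl::location(MyFeatureEnabled), llvm::cl::init(true));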
+bool llvm::objcarc::EnableARCOpts; +static cl::opt<bool, true> +EnableARCOptimizations("enable-objc-arc-opts", + cl::desc("enable/disable all ARC Optimizations"), + cl::location(EnableARCOpts), + cl::init(true)); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp index afb873a..133b635 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.cpp +++ b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp @@ -19,7 +19,9 @@ /// //===----------------------------------------------------------------------===// -#include "ObjCARC.h" +#include "llvm/Analysis/ObjCARCInstKind.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Intrinsics.h" using namespace llvm; @@ -91,7 +93,7 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { .Default(ARCInstKind::CallOrUser); // One argument. - const Argument *A0 = AI++; + const Argument *A0 = &*AI++; if (AI == AE) // Argument is a pointer. if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { @@ -129,7 +131,7 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) { } // Two arguments, first is i8**. - const Argument *A1 = AI++; + const Argument *A1 = &*AI++; if (AI == AE) if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) diff --git a/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp new file mode 100644 index 0000000..0f0016f --- /dev/null +++ b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp @@ -0,0 +1,85 @@ +//===- OrderedBasicBlock.cpp --------------------------------- -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the OrderedBasicBlock class. OrderedBasicBlock +// maintains an interface where clients can query if one instruction comes +// before another in a BasicBlock. Since BasicBlock currently lacks a reliable +// way to query relative position between instructions one can use +// OrderedBasicBlock to do such queries. OrderedBasicBlock is lazily built on a +// source BasicBlock and maintains an internal Instruction -> Position map. A +// OrderedBasicBlock instance should be discarded whenever the source +// BasicBlock changes. +// +// It's currently used by the CaptureTracker in order to find relative +// positions of a pair of instructions inside a BasicBlock. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/OrderedBasicBlock.h" +#include "llvm/IR/Instruction.h" +using namespace llvm; + +OrderedBasicBlock::OrderedBasicBlock(const BasicBlock *BasicB) + : NextInstPos(0), BB(BasicB) { + LastInstFound = BB->end(); +} + +/// \brief Given no cached results, find if \p A comes before \p B in \p BB. +/// Cache and number out instruction while walking \p BB. +bool OrderedBasicBlock::comesBefore(const Instruction *A, + const Instruction *B) { + const Instruction *Inst = nullptr; + assert(!(LastInstFound == BB->end() && NextInstPos != 0) && + "Instruction supposed to be in NumberedInsts"); + + // Start the search with the instruction found in the last lookup round. 
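// [Illustrative aside, not part of this commit] comesBefore() numbers
// instructions lazily: a query walks only until it meets A or B, and later
// queries resume from the last numbered position. The same scheme over a
// plain sequence, as a minimal sketch:
#include <cstddef>
#include <unordered_map>
#include <vector>
class LazyOrder {
  const std::vector<int> &Seq;
  std::unordered_map<int, std::size_t> Pos; // element -> position, so far
  std::size_t Next = 0;                     // first unnumbered index
public:
  explicit LazyOrder(const std::vector<int> &S) : Seq(S) {}
  bool comesBefore(int A, int B) { // A, B assumed distinct members of Seq
    auto IA = Pos.find(A), IB = Pos.find(B);
    if (IA != Pos.end() && IB != Pos.end())
      return IA->second < IB->second;
    if (IA != Pos.end())
      return true;  // A numbered already; B must lie further right
    if (IB != Pos.end())
      return false; // symmetric argument
    while (Next != Seq.size()) {
      int V = Seq[Next];
      Pos[V] = Next++; // extend the numbering by one element
      if (V == A) return true;
      if (V == B) return false;
    }
    return false; // unreachable when A and B really are members
  }
};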
+ auto II = BB->begin(); + auto IE = BB->end(); + if (LastInstFound != IE) + II = std::next(LastInstFound); + + // Number all instructions up to the point where we find 'A' or 'B'. + for (; II != IE; ++II) { + Inst = cast<Instruction>(II); + NumberedInsts[Inst] = NextInstPos++; + if (Inst == A || Inst == B) + break; + } + + assert(II != IE && "Instruction not found?"); + assert((Inst == A || Inst == B) && "Should find A or B"); + LastInstFound = II; + return Inst == A; +} + +/// \brief Find out whether \p A dominates \p B, meaning whether \p A +/// comes before \p B in \p BB. This is a simplification that considers +/// cached instruction positions and ignores other basic blocks, being +/// only relevant to compare relative instructions positions inside \p BB. +bool OrderedBasicBlock::dominates(const Instruction *A, const Instruction *B) { + assert(A->getParent() == B->getParent() && + "Instructions must be in the same basic block!"); + + // First we lookup the instructions. If they don't exist, lookup will give us + // back ::end(). If they both exist, we compare the numbers. Otherwise, if NA + // exists and NB doesn't, it means NA must come before NB because we would + // have numbered NB as well if it didn't. The same is true for NB. If it + // exists, but NA does not, NA must come after it. If neither exist, we need + // to number the block and cache the results (by calling comesBefore). + auto NAI = NumberedInsts.find(A); + auto NBI = NumberedInsts.find(B); + if (NAI != NumberedInsts.end() && NBI != NumberedInsts.end()) + return NAI->second < NBI->second; + if (NAI != NumberedInsts.end()) + return true; + if (NBI != NumberedInsts.end()) + return false; + + return comesBefore(A, B); +} diff --git a/contrib/llvm/lib/Analysis/RegionInfo.cpp b/contrib/llvm/lib/Analysis/RegionInfo.cpp index 8cd8534..f59d267 100644 --- a/contrib/llvm/lib/Analysis/RegionInfo.cpp +++ b/contrib/llvm/lib/Analysis/RegionInfo.cpp @@ -21,6 +21,9 @@ #include <algorithm> #include <iterator> #include <set> +#ifndef NDEBUG +#include "llvm/Analysis/RegionPrinter.h" +#endif using namespace llvm; @@ -103,6 +106,12 @@ void RegionInfo::recalculate(Function &F, DominatorTree *DT_, calculate(F); } +#ifndef NDEBUG +void RegionInfo::view() { viewRegion(this); } + +void RegionInfo::viewOnly() { viewRegionOnly(this); } +#endif + //===----------------------------------------------------------------------===// // RegionInfoPass implementation // diff --git a/contrib/llvm/lib/Analysis/RegionPrinter.cpp b/contrib/llvm/lib/Analysis/RegionPrinter.cpp index d7f5109..acb218d 100644 --- a/contrib/llvm/lib/Analysis/RegionPrinter.cpp +++ b/contrib/llvm/lib/Analysis/RegionPrinter.cpp @@ -20,6 +20,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#ifndef NDEBUG +#include "llvm/IR/LegacyPassManager.h" +#endif using namespace llvm; @@ -55,25 +58,22 @@ struct DOTGraphTraits<RegionNode*> : public DefaultDOTGraphTraits { } }; -template<> -struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { +template <> +struct DOTGraphTraits<RegionInfo *> : public DOTGraphTraits<RegionNode *> { DOTGraphTraits (bool isSimple = false) : DOTGraphTraits<RegionNode*>(isSimple) {} - static std::string getGraphName(RegionInfoPass *DT) { - return "Region Graph"; - } + static std::string getGraphName(const RegionInfo *) { return "Region Graph"; } - std::string getNodeLabel(RegionNode *Node, RegionInfoPass *G) { - RegionInfo &RI = G->getRegionInfo(); - return 
DOTGraphTraits<RegionNode*>::getNodeLabel(Node, - reinterpret_cast<RegionNode*>(RI.getTopLevelRegion())); + std::string getNodeLabel(RegionNode *Node, RegionInfo *G) { + return DOTGraphTraits<RegionNode *>::getNodeLabel( + Node, reinterpret_cast<RegionNode *>(G->getTopLevelRegion())); } std::string getEdgeAttributes(RegionNode *srcNode, - GraphTraits<RegionInfo*>::ChildIteratorType CI, RegionInfoPass *G) { - RegionInfo &RI = G->getRegionInfo(); + GraphTraits<RegionInfo *>::ChildIteratorType CI, + RegionInfo *G) { RegionNode *destNode = *CI; if (srcNode->isSubRegion() || destNode->isSubRegion()) @@ -83,7 +83,7 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { BasicBlock *srcBB = srcNode->getNodeAs<BasicBlock>(); BasicBlock *destBB = destNode->getNodeAs<BasicBlock>(); - Region *R = RI.getRegionFor(destBB); + Region *R = G->getRegionFor(destBB); while (R && R->getParent()) if (R->getParent()->getEntry() == destBB) @@ -91,7 +91,7 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { else break; - if (R->getEntry() == destBB && R->contains(srcBB)) + if (R && R->getEntry() == destBB && R->contains(srcBB)) return "constraint=false"; return ""; @@ -99,8 +99,7 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { // Print the cluster of the subregions. This groups the single basic blocks // and adds a different background color for each group. - static void printRegionCluster(const Region &R, - GraphWriter<RegionInfoPass*> &GW, + static void printRegionCluster(const Region &R, GraphWriter<RegionInfo *> &GW, unsigned depth = 0) { raw_ostream &O = GW.getOStream(); O.indent(2 * depth) << "subgraph cluster_" << static_cast<const void*>(&R) @@ -132,50 +131,81 @@ struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> { O.indent(2 * depth) << "}\n"; } - static void addCustomGraphFeatures(const RegionInfoPass* RIP, - GraphWriter<RegionInfoPass*> &GW) { - const RegionInfo &RI = RIP->getRegionInfo(); + static void addCustomGraphFeatures(const RegionInfo *G, + GraphWriter<RegionInfo *> &GW) { raw_ostream &O = GW.getOStream(); O << "\tcolorscheme = \"paired12\"\n"; - printRegionCluster(*RI.getTopLevelRegion(), GW, 4); + printRegionCluster(*G->getTopLevelRegion(), GW, 4); } }; } //end namespace llvm namespace { +struct RegionInfoPassGraphTraits { + static RegionInfo *getGraph(RegionInfoPass *RIP) { + return &RIP->getRegionInfo(); + } +}; + +struct RegionPrinter + : public DOTGraphTraitsPrinter<RegionInfoPass, false, RegionInfo *, + RegionInfoPassGraphTraits> { + static char ID; + RegionPrinter() + : DOTGraphTraitsPrinter<RegionInfoPass, false, RegionInfo *, + RegionInfoPassGraphTraits>("reg", ID) { + initializeRegionPrinterPass(*PassRegistry::getPassRegistry()); + } +}; +char RegionPrinter::ID = 0; + +struct RegionOnlyPrinter + : public DOTGraphTraitsPrinter<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits> { + static char ID; + RegionOnlyPrinter() + : DOTGraphTraitsPrinter<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits>("reg", ID) { + initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry()); + } +}; +char RegionOnlyPrinter::ID = 0; + struct RegionViewer - : public DOTGraphTraitsViewer<RegionInfoPass, false> { + : public DOTGraphTraitsViewer<RegionInfoPass, false, RegionInfo *, + RegionInfoPassGraphTraits> { static char ID; - RegionViewer() : DOTGraphTraitsViewer<RegionInfoPass, false>("reg", ID){ + RegionViewer() + : DOTGraphTraitsViewer<RegionInfoPass, false, 
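// [Illustrative aside, not part of this commit] The printers and viewers
// here are re-keyed from the legacy pass to RegionInfo* itself, bridged by
// a one-function adapter (RegionInfoPassGraphTraits::getGraph). Reduced to
// a sketch with hypothetical Analysis/AnalysisPass types, the shape is:
struct Analysis { /* the graph we actually want to render */ };
struct AnalysisPass {
  Analysis Result;
  Analysis &getResult() { return Result; }
};
struct PassGraphAdapter {
  // The printer/viewer templates call this to unwrap the pass into the
  // graph type that all of the DOT traits are now written against.
  static Analysis *getGraph(AnalysisPass *P) { return &P->getResult(); }
};
template <typename PassT, typename GraphT, typename AdapterT>
void renderViaAdapter(PassT *P) {
  GraphT G = AdapterT::getGraph(P); // every traits lookup now sees GraphT
  (void)G;                          // a real printer would emit DOT here
}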
RegionInfo *, + RegionInfoPassGraphTraits>("reg", ID) { initializeRegionViewerPass(*PassRegistry::getPassRegistry()); } }; char RegionViewer::ID = 0; struct RegionOnlyViewer - : public DOTGraphTraitsViewer<RegionInfoPass, true> { + : public DOTGraphTraitsViewer<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits> { static char ID; - RegionOnlyViewer() : DOTGraphTraitsViewer<RegionInfoPass, true>("regonly", ID) { + RegionOnlyViewer() + : DOTGraphTraitsViewer<RegionInfoPass, true, RegionInfo *, + RegionInfoPassGraphTraits>("regonly", ID) { initializeRegionOnlyViewerPass(*PassRegistry::getPassRegistry()); } }; char RegionOnlyViewer::ID = 0; -struct RegionPrinter - : public DOTGraphTraitsPrinter<RegionInfoPass, false> { - static char ID; - RegionPrinter() : - DOTGraphTraitsPrinter<RegionInfoPass, false>("reg", ID) { - initializeRegionPrinterPass(*PassRegistry::getPassRegistry()); - } -}; -char RegionPrinter::ID = 0; } //end anonymous namespace INITIALIZE_PASS(RegionPrinter, "dot-regions", "Print regions of function to 'dot' file", true, true) +INITIALIZE_PASS( + RegionOnlyPrinter, "dot-regions-only", + "Print regions of function to 'dot' file (with no function bodies)", true, + true) + INITIALIZE_PASS(RegionViewer, "view-regions", "View regions of function", true, true) @@ -183,25 +213,12 @@ INITIALIZE_PASS(RegionOnlyViewer, "view-regions-only", "View regions of function (with no function bodies)", true, true) -namespace { - -struct RegionOnlyPrinter - : public DOTGraphTraitsPrinter<RegionInfoPass, true> { - static char ID; - RegionOnlyPrinter() : - DOTGraphTraitsPrinter<RegionInfoPass, true>("reg", ID) { - initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry()); - } -}; +FunctionPass *llvm::createRegionPrinterPass() { return new RegionPrinter(); } +FunctionPass *llvm::createRegionOnlyPrinterPass() { + return new RegionOnlyPrinter(); } -char RegionOnlyPrinter::ID = 0; -INITIALIZE_PASS(RegionOnlyPrinter, "dot-regions-only", - "Print regions of function to 'dot' file " - "(with no function bodies)", - true, true) - FunctionPass* llvm::createRegionViewerPass() { return new RegionViewer(); } @@ -210,11 +227,41 @@ FunctionPass* llvm::createRegionOnlyViewerPass() { return new RegionOnlyViewer(); } -FunctionPass* llvm::createRegionPrinterPass() { - return new RegionPrinter(); +#ifndef NDEBUG +static void viewRegionInfo(RegionInfo *RI, bool ShortNames) { + assert(RI && "Argument must be non-null"); + + llvm::Function *F = RI->getTopLevelRegion()->getEntry()->getParent(); + std::string GraphName = DOTGraphTraits<RegionInfo *>::getGraphName(RI); + + llvm::ViewGraph(RI, "reg", ShortNames, + Twine(GraphName) + " for '" + F->getName() + "' function"); } -FunctionPass* llvm::createRegionOnlyPrinterPass() { - return new RegionOnlyPrinter(); +static void invokeFunctionPass(const Function *F, FunctionPass *ViewerPass) { + assert(F && "Argument must be non-null"); + assert(!F->isDeclaration() && "Function must have an implementation"); + + // The viewer and analysis passes do not modify anything, so we can safely + // remove the const qualifier + auto NonConstF = const_cast<Function *>(F); + + llvm::legacy::FunctionPassManager FPM(NonConstF->getParent()); + FPM.add(ViewerPass); + FPM.doInitialization(); + FPM.run(*NonConstF); + FPM.doFinalization(); } +void llvm::viewRegion(RegionInfo *RI) { viewRegionInfo(RI, false); } + +void llvm::viewRegion(const Function *F) { + invokeFunctionPass(F, createRegionViewerPass()); +} + +void llvm::viewRegionOnly(RegionInfo *RI) { viewRegionInfo(RI, true); 
} + +void llvm::viewRegionOnly(const Function *F) { + invokeFunctionPass(F, createRegionOnlyViewerPass()); +} +#endif diff --git a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp index 9c7c175..34074ef 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp @@ -83,11 +83,13 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SaveAndRestore.h" #include <algorithm> using namespace llvm; @@ -114,16 +116,6 @@ static cl::opt<bool> VerifySCEV("verify-scev", cl::desc("Verify ScalarEvolution's backedge taken counts (slow)")); -INITIALIZE_PASS_BEGIN(ScalarEvolution, "scalar-evolution", - "Scalar Evolution Analysis", false, true) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(ScalarEvolution, "scalar-evolution", - "Scalar Evolution Analysis", false, true) -char ScalarEvolution::ID = 0; - //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -132,12 +124,11 @@ char ScalarEvolution::ID = 0; // Implementation of the SCEV class. // -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void SCEV::dump() const { print(dbgs()); dbgs() << '\n'; } -#endif void SCEV::print(raw_ostream &OS) const { switch (static_cast<SCEVTypes>(getSCEVType())) { @@ -303,7 +294,7 @@ bool SCEV::isNonConstantNegative() const { if (!SC) return false; // Return true if the value is negative, this matches things like (-42 * V). - return SC->getValue()->getValue().isNegative(); + return SC->getAPInt().isNegative(); } SCEVCouldNotCompute::SCEVCouldNotCompute() : @@ -455,179 +446,179 @@ bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const { //===----------------------------------------------------------------------===// namespace { - /// SCEVComplexityCompare - Return true if the complexity of the LHS is less - /// than the complexity of the RHS. This comparator is used to canonicalize - /// expressions. - class SCEVComplexityCompare { - const LoopInfo *const LI; - public: - explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - - // Return true or false if LHS is less than, or at least RHS, respectively. - bool operator()(const SCEV *LHS, const SCEV *RHS) const { - return compare(LHS, RHS) < 0; - } - - // Return negative, zero, or positive, if LHS is less than, equal to, or - // greater than RHS, respectively. A three-way result allows recursive - // comparisons to be more efficient. - int compare(const SCEV *LHS, const SCEV *RHS) const { - // Fast-path: SCEVs are uniqued so we can do a quick equality check. - if (LHS == RHS) - return 0; - - // Primarily, sort the SCEVs by their getSCEVType(). 
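// [Illustrative aside, not part of this commit] A large share of the hunks
// in this file are one mechanical cleanup: SCEVConstant::getAPInt() replaces
// the two-step getValue()->getValue() chain, reaching the APInt directly
// instead of detouring through the ConstantInt. Sketched against the
// accessor these hunks switch to:
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
static const llvm::APInt &constVal(const llvm::SCEVConstant *SC) {
  return SC->getAPInt(); // previously spelled SC->getValue()->getValue()
}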
- unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); - if (LType != RType) - return (int)LType - (int)RType; - - // Aside from the getSCEVType() ordering, the particular ordering - // isn't very important except that it's beneficial to be consistent, - // so that (a + b) and (b + a) don't end up as different expressions. - switch (static_cast<SCEVTypes>(LType)) { - case scUnknown: { - const SCEVUnknown *LU = cast<SCEVUnknown>(LHS); - const SCEVUnknown *RU = cast<SCEVUnknown>(RHS); - - // Sort SCEVUnknown values with some loose heuristics. TODO: This is - // not as complete as it could be. - const Value *LV = LU->getValue(), *RV = RU->getValue(); - - // Order pointer values after integer values. This helps SCEVExpander - // form GEPs. - bool LIsPointer = LV->getType()->isPointerTy(), - RIsPointer = RV->getType()->isPointerTy(); - if (LIsPointer != RIsPointer) - return (int)LIsPointer - (int)RIsPointer; - - // Compare getValueID values. - unsigned LID = LV->getValueID(), - RID = RV->getValueID(); - if (LID != RID) - return (int)LID - (int)RID; - - // Sort arguments by their position. - if (const Argument *LA = dyn_cast<Argument>(LV)) { - const Argument *RA = cast<Argument>(RV); - unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); - return (int)LArgNo - (int)RArgNo; - } - - // For instructions, compare their loop depth, and their operand - // count. This is pretty loose. - if (const Instruction *LInst = dyn_cast<Instruction>(LV)) { - const Instruction *RInst = cast<Instruction>(RV); - - // Compare loop depths. - const BasicBlock *LParent = LInst->getParent(), - *RParent = RInst->getParent(); - if (LParent != RParent) { - unsigned LDepth = LI->getLoopDepth(LParent), - RDepth = LI->getLoopDepth(RParent); - if (LDepth != RDepth) - return (int)LDepth - (int)RDepth; - } - - // Compare the number of operands. - unsigned LNumOps = LInst->getNumOperands(), - RNumOps = RInst->getNumOperands(); - return (int)LNumOps - (int)RNumOps; - } +/// SCEVComplexityCompare - Return true if the complexity of the LHS is less +/// than the complexity of the RHS. This comparator is used to canonicalize +/// expressions. +class SCEVComplexityCompare { + const LoopInfo *const LI; +public: + explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {} - return 0; - } + // Return true or false if LHS is less than, or at least RHS, respectively. + bool operator()(const SCEV *LHS, const SCEV *RHS) const { + return compare(LHS, RHS) < 0; + } - case scConstant: { - const SCEVConstant *LC = cast<SCEVConstant>(LHS); - const SCEVConstant *RC = cast<SCEVConstant>(RHS); - - // Compare constant values. - const APInt &LA = LC->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); - unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); - if (LBitWidth != RBitWidth) - return (int)LBitWidth - (int)RBitWidth; - return LA.ult(RA) ? -1 : 1; + // Return negative, zero, or positive, if LHS is less than, equal to, or + // greater than RHS, respectively. A three-way result allows recursive + // comparisons to be more efficient. + int compare(const SCEV *LHS, const SCEV *RHS) const { + // Fast-path: SCEVs are uniqued so we can do a quick equality check. + if (LHS == RHS) + return 0; + + // Primarily, sort the SCEVs by their getSCEVType(). 
+ unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType(); + if (LType != RType) + return (int)LType - (int)RType; + + // Aside from the getSCEVType() ordering, the particular ordering + // isn't very important except that it's beneficial to be consistent, + // so that (a + b) and (b + a) don't end up as different expressions. + switch (static_cast<SCEVTypes>(LType)) { + case scUnknown: { + const SCEVUnknown *LU = cast<SCEVUnknown>(LHS); + const SCEVUnknown *RU = cast<SCEVUnknown>(RHS); + + // Sort SCEVUnknown values with some loose heuristics. TODO: This is + // not as complete as it could be. + const Value *LV = LU->getValue(), *RV = RU->getValue(); + + // Order pointer values after integer values. This helps SCEVExpander + // form GEPs. + bool LIsPointer = LV->getType()->isPointerTy(), + RIsPointer = RV->getType()->isPointerTy(); + if (LIsPointer != RIsPointer) + return (int)LIsPointer - (int)RIsPointer; + + // Compare getValueID values. + unsigned LID = LV->getValueID(), + RID = RV->getValueID(); + if (LID != RID) + return (int)LID - (int)RID; + + // Sort arguments by their position. + if (const Argument *LA = dyn_cast<Argument>(LV)) { + const Argument *RA = cast<Argument>(RV); + unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo(); + return (int)LArgNo - (int)RArgNo; } - case scAddRecExpr: { - const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS); - const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS); - - // Compare addrec loop depths. - const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); - if (LLoop != RLoop) { - unsigned LDepth = LLoop->getLoopDepth(), - RDepth = RLoop->getLoopDepth(); + // For instructions, compare their loop depth, and their operand + // count. This is pretty loose. + if (const Instruction *LInst = dyn_cast<Instruction>(LV)) { + const Instruction *RInst = cast<Instruction>(RV); + + // Compare loop depths. + const BasicBlock *LParent = LInst->getParent(), + *RParent = RInst->getParent(); + if (LParent != RParent) { + unsigned LDepth = LI->getLoopDepth(LParent), + RDepth = LI->getLoopDepth(RParent); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; } - // Addrec complexity grows with operand count. - unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; + // Compare the number of operands. + unsigned LNumOps = LInst->getNumOperands(), + RNumOps = RInst->getNumOperands(); + return (int)LNumOps - (int)RNumOps; + } - // Lexicographically compare. - for (unsigned i = 0; i != LNumOps; ++i) { - long X = compare(LA->getOperand(i), RA->getOperand(i)); - if (X != 0) - return X; - } + return 0; + } - return 0; + case scConstant: { + const SCEVConstant *LC = cast<SCEVConstant>(LHS); + const SCEVConstant *RC = cast<SCEVConstant>(RHS); + + // Compare constant values. + const APInt &LA = LC->getAPInt(); + const APInt &RA = RC->getAPInt(); + unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth(); + if (LBitWidth != RBitWidth) + return (int)LBitWidth - (int)RBitWidth; + return LA.ult(RA) ? -1 : 1; + } + + case scAddRecExpr: { + const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS); + const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS); + + // Compare addrec loop depths. 
+ const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); + if (LLoop != RLoop) { + unsigned LDepth = LLoop->getLoopDepth(), + RDepth = RLoop->getLoopDepth(); + if (LDepth != RDepth) + return (int)LDepth - (int)RDepth; } - case scAddExpr: - case scMulExpr: - case scSMaxExpr: - case scUMaxExpr: { - const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS); - const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS); - - // Lexicographically compare n-ary expressions. - unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); - if (LNumOps != RNumOps) - return (int)LNumOps - (int)RNumOps; - - for (unsigned i = 0; i != LNumOps; ++i) { - if (i >= RNumOps) - return 1; - long X = compare(LC->getOperand(i), RC->getOperand(i)); - if (X != 0) - return X; - } + // Addrec complexity grows with operand count. + unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands(); + if (LNumOps != RNumOps) return (int)LNumOps - (int)RNumOps; + + // Lexicographically compare. + for (unsigned i = 0; i != LNumOps; ++i) { + long X = compare(LA->getOperand(i), RA->getOperand(i)); + if (X != 0) + return X; } - case scUDivExpr: { - const SCEVUDivExpr *LC = cast<SCEVUDivExpr>(LHS); - const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS); + return 0; + } + + case scAddExpr: + case scMulExpr: + case scSMaxExpr: + case scUMaxExpr: { + const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS); + const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS); - // Lexicographically compare udiv expressions. - long X = compare(LC->getLHS(), RC->getLHS()); + // Lexicographically compare n-ary expressions. + unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands(); + if (LNumOps != RNumOps) + return (int)LNumOps - (int)RNumOps; + + for (unsigned i = 0; i != LNumOps; ++i) { + if (i >= RNumOps) + return 1; + long X = compare(LC->getOperand(i), RC->getOperand(i)); if (X != 0) return X; - return compare(LC->getRHS(), RC->getRHS()); } + return (int)LNumOps - (int)RNumOps; + } - case scTruncate: - case scZeroExtend: - case scSignExtend: { - const SCEVCastExpr *LC = cast<SCEVCastExpr>(LHS); - const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS); + case scUDivExpr: { + const SCEVUDivExpr *LC = cast<SCEVUDivExpr>(LHS); + const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS); - // Compare cast expressions by operand. - return compare(LC->getOperand(), RC->getOperand()); - } + // Lexicographically compare udiv expressions. + long X = compare(LC->getLHS(), RC->getLHS()); + if (X != 0) + return X; + return compare(LC->getRHS(), RC->getRHS()); + } - case scCouldNotCompute: - llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - } - llvm_unreachable("Unknown SCEV kind!"); + case scTruncate: + case scZeroExtend: + case scSignExtend: { + const SCEVCastExpr *LC = cast<SCEVCastExpr>(LHS); + const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS); + + // Compare cast expressions by operand. + return compare(LC->getOperand(), RC->getOperand()); } - }; -} + + case scCouldNotCompute: + llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); + } + llvm_unreachable("Unknown SCEV kind!"); + } +}; +} // end anonymous namespace /// GroupByComplexity - Given a list of SCEV objects, order them by their /// complexity, and group objects of the same complexity together by value. @@ -675,24 +666,22 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops, } } -namespace { -struct FindSCEVSize { - int Size; - FindSCEVSize() : Size(0) {} - - bool follow(const SCEV *S) { - ++Size; - // Keep looking at all operands of S. 
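// [Illustrative aside, not part of this commit] SCEVComplexityCompare above
// derives its sorting predicate from a three-way compare() so recursive
// operand comparisons can short-circuit on the first difference. The
// skeleton of that idiom over a hypothetical uniqued tree node:
#include <cstddef>
#include <vector>
struct Node {
  int Kind;
  std::vector<const Node *> Ops;
};
struct NodeCompare {
  int compare(const Node *L, const Node *R) const {
    if (L == R)
      return 0; // uniqued nodes: pointer equality settles it immediately
    if (L->Kind != R->Kind)
      return L->Kind - R->Kind; // primary key, mirroring getSCEVType()
    if (L->Ops.size() != R->Ops.size())
      return int(L->Ops.size()) - int(R->Ops.size());
    for (std::size_t I = 0; I != L->Ops.size(); ++I)
      if (int X = compare(L->Ops[I], R->Ops[I]))
        return X; // lexicographic over operands
    return 0;
  }
  bool operator()(const Node *L, const Node *R) const {
    return compare(L, R) < 0; // adapt three-way result to a strict ordering
  }
};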
- return true; - } - bool isDone() const { - return false; - } -}; -} - // Returns the size of the SCEV S. static inline int sizeOfSCEV(const SCEV *S) { + struct FindSCEVSize { + int Size; + FindSCEVSize() : Size(0) {} + + bool follow(const SCEV *S) { + ++Size; + // Keep looking at all operands of S. + return true; + } + bool isDone() const { + return false; + } + }; + FindSCEVSize F; SCEVTraversal<FindSCEVSize> ST(F); ST.visitAll(S); @@ -771,8 +760,8 @@ public: void visitConstant(const SCEVConstant *Numerator) { if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) { - APInt NumeratorVal = Numerator->getValue()->getValue(); - APInt DenominatorVal = D->getValue()->getValue(); + APInt NumeratorVal = Numerator->getAPInt(); + APInt DenominatorVal = D->getAPInt(); uint32_t NumeratorBW = NumeratorVal.getBitWidth(); uint32_t DenominatorBW = DenominatorVal.getBitWidth(); @@ -792,17 +781,15 @@ public: void visitAddRecExpr(const SCEVAddRecExpr *Numerator) { const SCEV *StartQ, *StartR, *StepQ, *StepR; - assert(Numerator->isAffine() && "Numerator should be affine"); + if (!Numerator->isAffine()) + return cannotDivide(Numerator); divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR); divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR); // Bail out if the types do not match. Type *Ty = Denominator->getType(); if (Ty != StartQ->getType() || Ty != StartR->getType() || - Ty != StepQ->getType() || Ty != StepR->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + Ty != StepQ->getType() || Ty != StepR->getType()) + return cannotDivide(Numerator); Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(), Numerator->getNoWrapFlags()); Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(), @@ -818,11 +805,8 @@ public: divide(SE, Op, Denominator, &Q, &R); // Bail out if types do not match. - if (Ty != Q->getType() || Ty != R->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (Ty != Q->getType() || Ty != R->getType()) + return cannotDivide(Numerator); Qs.push_back(Q); Rs.push_back(R); @@ -845,11 +829,8 @@ public: bool FoundDenominatorTerm = false; for (const SCEV *Op : Numerator->operands()) { // Bail out if types do not match. - if (Ty != Op->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (Ty != Op->getType()) + return cannotDivide(Numerator); if (FoundDenominatorTerm) { Qs.push_back(Op); @@ -865,11 +846,8 @@ public: } // Bail out if types do not match. - if (Ty != Q->getType()) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (Ty != Q->getType()) + return cannotDivide(Numerator); FoundDenominatorTerm = true; Qs.push_back(Q); @@ -884,11 +862,8 @@ public: return; } - if (!isa<SCEVUnknown>(Denominator)) { - Quotient = Zero; - Remainder = Numerator; - return; - } + if (!isa<SCEVUnknown>(Denominator)) + return cannotDivide(Numerator); // The Remainder is obtained by replacing Denominator by 0 in Numerator. ValueToValueMap RewriteMap; @@ -908,15 +883,12 @@ public: // Quotient is (Numerator - Remainder) divided by Denominator. const SCEV *Q, *R; const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder); - if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) { - // This SCEV does not seem to simplify: fail the division here. - Quotient = Zero; - Remainder = Numerator; - return; - } + // This SCEV does not seem to simplify: fail the division here. 
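// [Illustrative aside, not part of this commit] The repeated
// "return cannotDivide(Numerator);" bail-outs in these hunks lean on a small
// C++ rule: 'return f();' is valid when both caller and callee return void,
// so setting the failure state and leaving collapses to one statement.
// A minimal sketch:
struct IntDivider {
  int Quotient = 0, Remainder = 0;
  void cannotDivide(int Numerator) { // centralised "give up" result
    Quotient = 0;
    Remainder = Numerator;
  }
  void divide(int Num, int Den) {
    if (Den == 0)
      return cannotDivide(Num); // void return-forwarding, as in the patch
    Quotient = Num / Den;
    Remainder = Num % Den;
  }
};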
+ if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) + return cannotDivide(Numerator); divide(SE, Diff, Denominator, &Q, &R); - assert(R == Zero && - "(Numerator - Remainder) should evenly divide Denominator"); + if (R != Zero) + return cannotDivide(Numerator); Quotient = Q; } @@ -924,11 +896,18 @@ private: SCEVDivision(ScalarEvolution &S, const SCEV *Numerator, const SCEV *Denominator) : SE(S), Denominator(Denominator) { - Zero = SE.getConstant(Denominator->getType(), 0); - One = SE.getConstant(Denominator->getType(), 1); + Zero = SE.getZero(Denominator->getType()); + One = SE.getOne(Denominator->getType()); + + // We generally do not know how to divide Expr by Denominator. We + // initialize the division to a "cannot divide" state to simplify the rest + // of the code. + cannotDivide(Numerator); + } - // By default, we don't know how to divide Expr by Denominator. - // Providing the default here simplifies the rest of the code. + // Convenience function for giving up on the division. We set the quotient to + // be equal to zero and the remainder to be equal to the numerator. + void cannotDivide(const SCEV *Numerator) { Quotient = Zero; Remainder = Numerator; } @@ -1151,8 +1130,8 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op, // If the input value is a chrec scev, truncate the chrec's operands. if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) - Operands.push_back(getTruncateExpr(AddRec->getOperand(i), Ty)); + for (const SCEV *Op : AddRec->operands()) + Operands.push_back(getTruncateExpr(Op, Ty)); return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap); } @@ -1287,7 +1266,9 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, // `Step`: // 1. NSW/NUW flags on the step increment. - const SCEV *PreStart = SE->getAddExpr(DiffOps, SA->getNoWrapFlags()); + auto PreStartFlags = + ScalarEvolution::maskFlags(SA->getNoWrapFlags(), SCEV::FlagNUW); + const SCEV *PreStart = SE->getAddExpr(DiffOps, PreStartFlags); const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>( SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap)); @@ -1322,9 +1303,9 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty, ExtendOpTraits<ExtendOpTy>::getOverflowLimitForStep(Step, &Pred, SE); if (OverflowLimit && - SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) { + SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) return PreStart; - } + return nullptr; } @@ -1390,24 +1371,22 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start, if (!StartC) return false; - APInt StartAI = StartC->getValue()->getValue(); + APInt StartAI = StartC->getAPInt(); for (unsigned Delta : {-2, -1, 1, 2}) { const SCEV *PreStart = getConstant(StartAI - Delta); + FoldingSetNodeID ID; + ID.AddInteger(scAddRecExpr); + ID.AddPointer(PreStart); + ID.AddPointer(Step); + ID.AddPointer(L); + void *IP = nullptr; + const auto *PreAR = + static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); + // Give up if we don't already have the add recurrence we need because // actually constructing an add recurrence is relatively expensive. 
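// [Illustrative aside, not part of this commit] The lookup above profiles
// the would-be add recurrence into a FoldingSetNodeID and calls
// FindNodeOrInsertPos purely as a find, discarding the insert hint, because
// actually constructing the node on a miss is the expensive part. The
// general shape of a find-only FoldingSet query:
#include "llvm/ADT/FoldingSet.h"
struct RecNode : llvm::FoldingSetNode {
  unsigned Kind;
  void *Op;
  RecNode(unsigned K, void *O) : Kind(K), Op(O) {}
  void Profile(llvm::FoldingSetNodeID &ID) const {
    ID.AddInteger(Kind); // uniquing fields, in a fixed order
    ID.AddPointer(Op);
  }
};
static bool alreadyBuilt(llvm::FoldingSet<RecNode> &Set, unsigned Kind,
                         void *Op) {
  llvm::FoldingSetNodeID ID;
  ID.AddInteger(Kind); // must mirror Profile() exactly
  ID.AddPointer(Op);
  void *IP = nullptr; // insert position: requested but deliberately unused
  return Set.FindNodeOrInsertPos(ID, IP) != nullptr;
}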
- const SCEVAddRecExpr *PreAR = [&]() { - FoldingSetNodeID ID; - ID.AddInteger(scAddRecExpr); - ID.AddPointer(PreStart); - ID.AddPointer(Step); - ID.AddPointer(L); - void *IP = nullptr; - return static_cast<SCEVAddRecExpr *>( - this->UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); - }(); - if (PreAR && PreAR->getNoWrapFlags(WrapType)) { // proves (2) const SCEV *DeltaS = getConstant(StartC->getType(), Delta); ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; @@ -1578,6 +1557,18 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, } } + if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) { + // zext((A + B + ...)<nuw>) --> (zext(A) + zext(B) + ...)<nuw> + if (SA->getNoWrapFlags(SCEV::FlagNUW)) { + // If the addition does not unsign overflow then we can, by definition, + // commute the zero extension with the addition operation. + SmallVector<const SCEV *, 4> Ops; + for (const auto *Op : SA->operands()) + Ops.push_back(getZeroExtendExpr(Op, Ty)); + return getAddExpr(Ops, SCEV::FlagNUW); + } + } + // The cast wasn't folded; create an explicit cast node. // Recompute the insert position, as it may have been invalidated. if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S; @@ -1635,14 +1626,14 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, } // sext(C1 + (C2 * x)) --> C1 + sext(C2 * x) if C1 < C2 - if (auto SA = dyn_cast<SCEVAddExpr>(Op)) { + if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) { if (SA->getNumOperands() == 2) { - auto SC1 = dyn_cast<SCEVConstant>(SA->getOperand(0)); - auto SMul = dyn_cast<SCEVMulExpr>(SA->getOperand(1)); + auto *SC1 = dyn_cast<SCEVConstant>(SA->getOperand(0)); + auto *SMul = dyn_cast<SCEVMulExpr>(SA->getOperand(1)); if (SMul && SC1) { - if (auto SC2 = dyn_cast<SCEVConstant>(SMul->getOperand(0))) { - const APInt &C1 = SC1->getValue()->getValue(); - const APInt &C2 = SC2->getValue()->getValue(); + if (auto *SC2 = dyn_cast<SCEVConstant>(SMul->getOperand(0))) { + const APInt &C1 = SC1->getAPInt(); + const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) return getAddExpr(getSignExtendExpr(SC1, Ty), @@ -1650,6 +1641,16 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, } } } + + // sext((A + B + ...)<nsw>) --> (sext(A) + sext(B) + ...)<nsw> + if (SA->getNoWrapFlags(SCEV::FlagNSW)) { + // If the addition does not sign overflow then we can, by definition, + // commute the sign extension with the addition operation. 
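// [Illustrative aside, not part of this commit] Both this sext rule and the
// zext rule above rest on the same arithmetic fact: if the narrow addition
// cannot wrap, extending first and adding wide computes the same value.
// Checked concretely for the unsigned case in i8 -> i16:
#include <cassert>
#include <cstdint>
int main() {
  uint8_t A = 200, B = 40; // 240 fits in i8: the add is <nuw>
  assert(uint16_t(uint8_t(A + B)) == uint16_t(A) + uint16_t(B));
  uint8_t C = 200, D = 100; // 300 wraps to 44: distribution would be wrong
  assert(uint16_t(uint8_t(C + D)) != uint16_t(C) + uint16_t(D));
}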
+ SmallVector<const SCEV *, 4> Ops; + for (const auto *Op : SA->operands()) + Ops.push_back(getSignExtendExpr(Op, Ty)); + return getAddExpr(Ops, SCEV::FlagNSW); + } } // If the input value is a chrec scev, and we can prove that the value // did not overflow the old, smaller, value, we can sign extend all of the @@ -1754,16 +1755,16 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op, // If Start and Step are constants, check if we can apply this // transformation: // sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2 - auto SC1 = dyn_cast<SCEVConstant>(Start); - auto SC2 = dyn_cast<SCEVConstant>(Step); + auto *SC1 = dyn_cast<SCEVConstant>(Start); + auto *SC2 = dyn_cast<SCEVConstant>(Step); if (SC1 && SC2) { - const APInt &C1 = SC1->getValue()->getValue(); - const APInt &C2 = SC2->getValue()->getValue(); + const APInt &C1 = SC1->getAPInt(); + const APInt &C2 = SC2->getAPInt(); if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) && C2.isPowerOf2()) { Start = getSignExtendExpr(Start, Ty); - const SCEV *NewAR = getAddRecExpr(getConstant(AR->getType(), 0), Step, - L, AR->getNoWrapFlags()); + const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L, + AR->getNoWrapFlags()); return getAddExpr(Start, getSignExtendExpr(NewAR, Ty)); } } @@ -1798,7 +1799,7 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op, // Sign-extend negative constants. if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) - if (SC->getValue()->getValue().isNegative()) + if (SC->getAPInt().isNegative()) return getSignExtendExpr(Op, Ty); // Peel off a truncate cast. @@ -1876,7 +1877,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, // Pull a buried constant out to the outside. if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero()) Interesting = true; - AccumulatedConstant += Scale * C->getValue()->getValue(); + AccumulatedConstant += Scale * C->getAPInt(); } // Next comes everything else. We're especially interested in multiplies @@ -1885,7 +1886,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[i]); if (Mul && isa<SCEVConstant>(Mul->getOperand(0))) { APInt NewScale = - Scale * cast<SCEVConstant>(Mul->getOperand(0))->getValue()->getValue(); + Scale * cast<SCEVConstant>(Mul->getOperand(0))->getAPInt(); if (Mul->getNumOperands() == 2 && isa<SCEVAddExpr>(Mul->getOperand(1))) { // A multiplication of a constant with another add; recurse. const SCEVAddExpr *Add = cast<SCEVAddExpr>(Mul->getOperand(1)); @@ -1898,8 +1899,7 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, // the map. SmallVector<const SCEV *, 4> MulOps(Mul->op_begin()+1, Mul->op_end()); const SCEV *Key = SE.getMulExpr(MulOps); - std::pair<DenseMap<const SCEV *, APInt>::iterator, bool> Pair = - M.insert(std::make_pair(Key, NewScale)); + auto Pair = M.insert(std::make_pair(Key, NewScale)); if (Pair.second) { NewOps.push_back(Pair.first->first); } else { @@ -1927,22 +1927,15 @@ CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M, return Interesting; } -namespace { - struct APIntCompare { - bool operator()(const APInt &LHS, const APInt &RHS) const { - return LHS.ult(RHS); - } - }; -} - // We're trying to construct a SCEV of type `Type' with `Ops' as operands and // `OldFlags' as can't-wrap behavior. Infer a more aggressive set of // can't-overflow flags for the operation if possible. 
static SCEV::NoWrapFlags StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, const SmallVectorImpl<const SCEV *> &Ops, - SCEV::NoWrapFlags OldFlags) { + SCEV::NoWrapFlags Flags) { using namespace std::placeholders; + typedef OverflowingBinaryOperator OBO; bool CanAnalyze = Type == scAddExpr || Type == scAddRecExpr || Type == scMulExpr; @@ -1951,18 +1944,42 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW; SCEV::NoWrapFlags SignOrUnsignWrap = - ScalarEvolution::maskFlags(OldFlags, SignOrUnsignMask); + ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); // If FlagNSW is true and all the operands are non-negative, infer FlagNUW. - auto IsKnownNonNegative = - std::bind(std::mem_fn(&ScalarEvolution::isKnownNonNegative), SE, _1); + auto IsKnownNonNegative = [&](const SCEV *S) { + return SE->isKnownNonNegative(S); + }; + + if (SignOrUnsignWrap == SCEV::FlagNSW && all_of(Ops, IsKnownNonNegative)) + Flags = + ScalarEvolution::setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask); - if (SignOrUnsignWrap == SCEV::FlagNSW && - std::all_of(Ops.begin(), Ops.end(), IsKnownNonNegative)) - return ScalarEvolution::setFlags(OldFlags, - (SCEV::NoWrapFlags)SignOrUnsignMask); + SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask); + + if (SignOrUnsignWrap != SignOrUnsignMask && Type == scAddExpr && + Ops.size() == 2 && isa<SCEVConstant>(Ops[0])) { + + // (A + C) --> (A + C)<nsw> if the addition does not sign overflow + // (A + C) --> (A + C)<nuw> if the addition does not unsign overflow + + const APInt &C = cast<SCEVConstant>(Ops[0])->getAPInt(); + if (!(SignOrUnsignWrap & SCEV::FlagNSW)) { + auto NSWRegion = + ConstantRange::makeNoWrapRegion(Instruction::Add, C, OBO::NoSignedWrap); + if (NSWRegion.contains(SE->getSignedRange(Ops[1]))) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); + } + if (!(SignOrUnsignWrap & SCEV::FlagNUW)) { + auto NUWRegion = + ConstantRange::makeNoWrapRegion(Instruction::Add, C, + OBO::NoUnsignedWrap); + if (NUWRegion.contains(SE->getUnsignedRange(Ops[1]))) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); + } + } - return OldFlags; + return Flags; } /// getAddExpr - Get a canonical add expression, or something simpler if @@ -1980,10 +1997,10 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, "SCEVAddExpr operand types don't match!"); #endif - Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags); - // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); + + Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags); // If there are any constants, fold them together. unsigned Idx = 0; @@ -1992,8 +2009,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! 
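// [Illustrative aside, not part of this commit] The makeNoWrapRegion()
// checks above encode simple interval arithmetic: for (x + C) with C > 0,
// signed overflow is impossible exactly when x <= SINT_MAX - C, i.e. the
// signed range of the variable operand fits the shifted interval. A scalar
// sketch in i8 terms, with a hypothetical known range [Lo, Hi]:
#include <cassert>
static bool addIsNswI8(int Lo, int Hi, int C) {
  (void)Lo;             // for C > 0 only the upper end can overflow
  return Hi <= 127 - C; // x + C <= 127 must hold across the whole range
}
int main() {
  assert(addIsNswI8(-128, 117, 10));  // whole range safe: set FlagNSW
  assert(!addIsNswI8(-128, 118, 10)); // 118 + 10 == 128 would wrap
}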
- Ops[0] = getConstant(LHSC->getValue()->getValue() + - RHSC->getValue()->getValue()); + Ops[0] = getConstant(LHSC->getAPInt() + RHSC->getAPInt()); if (Ops.size() == 2) return Ops[0]; Ops.erase(Ops.begin()+1); // Erase the folded element LHSC = cast<SCEVConstant>(Ops[0]); @@ -2063,8 +2079,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, break; } LargeMulOps.push_back(T->getOperand()); - } else if (const SCEVConstant *C = - dyn_cast<SCEVConstant>(M->getOperand(j))) { + } else if (const auto *C = dyn_cast<SCEVConstant>(M->getOperand(j))) { LargeMulOps.push_back(getAnyExtendExpr(C, SrcType)); } else { Ok = false; @@ -2123,24 +2138,28 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant, Ops.data(), Ops.size(), APInt(BitWidth, 1), *this)) { + struct APIntCompare { + bool operator()(const APInt &LHS, const APInt &RHS) const { + return LHS.ult(RHS); + } + }; + // Some interesting folding opportunity is present, so its worthwhile to // re-generate the operands list. Group the operands by constant scale, // to avoid multiplying by the same constant scale multiple times. std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare> MulOpLists; - for (SmallVectorImpl<const SCEV *>::const_iterator I = NewOps.begin(), - E = NewOps.end(); I != E; ++I) - MulOpLists[M.find(*I)->second].push_back(*I); + for (const SCEV *NewOp : NewOps) + MulOpLists[M.find(NewOp)->second].push_back(NewOp); // Re-generate the operands list. Ops.clear(); if (AccumulatedConstant != 0) Ops.push_back(getConstant(AccumulatedConstant)); - for (std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare>::iterator - I = MulOpLists.begin(), E = MulOpLists.end(); I != E; ++I) - if (I->first != 0) - Ops.push_back(getMulExpr(getConstant(I->first), - getAddExpr(I->second))); + for (auto &MulOp : MulOpLists) + if (MulOp.first != 0) + Ops.push_back(getMulExpr(getConstant(MulOp.first), + getAddExpr(MulOp.second))); if (Ops.empty()) - return getConstant(Ty, 0); + return getZero(Ty); if (Ops.size() == 1) return Ops[0]; return getAddExpr(Ops); @@ -2168,7 +2187,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end()); InnerMul = getMulExpr(MulOps); } - const SCEV *One = getConstant(Ty, 1); + const SCEV *One = getOne(Ty); const SCEV *AddOne = getAddExpr(One, InnerMul); const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV); if (Ops.size() == 2) return OuterMul; @@ -2279,8 +2298,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops, AddRec->op_end()); for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]); ++OtherIdx) - if (const SCEVAddRecExpr *OtherAddRec = - dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) + if (const auto *OtherAddRec = dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx])) if (OtherAddRec->getLoop() == AddRecLoop) { for (unsigned i = 0, e = OtherAddRec->getNumOperands(); i != e; ++i) { @@ -2388,10 +2406,10 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, "SCEVMulExpr operand types don't match!"); #endif - Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags); - // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); + + Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags); // If there are any constants, fold them together. 
unsigned Idx = 0; @@ -2410,9 +2428,8 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, ++Idx; while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! - ConstantInt *Fold = ConstantInt::get(getContext(), - LHSC->getValue()->getValue() * - RHSC->getValue()->getValue()); + ConstantInt *Fold = + ConstantInt::get(getContext(), LHSC->getAPInt() * RHSC->getAPInt()); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; @@ -2433,23 +2450,19 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1])) { SmallVector<const SCEV *, 4> NewOps; bool AnyFolded = false; - for (SCEVAddRecExpr::op_iterator I = Add->op_begin(), - E = Add->op_end(); I != E; ++I) { - const SCEV *Mul = getMulExpr(Ops[0], *I); + for (const SCEV *AddOp : Add->operands()) { + const SCEV *Mul = getMulExpr(Ops[0], AddOp); if (!isa<SCEVMulExpr>(Mul)) AnyFolded = true; NewOps.push_back(Mul); } if (AnyFolded) return getAddExpr(NewOps); - } - else if (const SCEVAddRecExpr * - AddRec = dyn_cast<SCEVAddRecExpr>(Ops[1])) { + } else if (const auto *AddRec = dyn_cast<SCEVAddRecExpr>(Ops[1])) { // Negation preserves a recurrence's no self-wrap property. SmallVector<const SCEV *, 4> Operands; - for (SCEVAddRecExpr::op_iterator I = AddRec->op_begin(), - E = AddRec->op_end(); I != E; ++I) { - Operands.push_back(getMulExpr(Ops[0], *I)); - } + for (const SCEV *AddRecOp : AddRec->operands()) + Operands.push_back(getMulExpr(Ops[0], AddRecOp)); + return getAddRecExpr(Operands, AddRec->getLoop(), AddRec->getNoWrapFlags(SCEV::FlagNW)); } @@ -2560,7 +2573,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops, SmallVector<const SCEV*, 7> AddRecOps; for (int x = 0, xe = AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) { - const SCEV *Term = getConstant(Ty, 0); + const SCEV *Term = getZero(Ty); for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) { uint64_t Coeff1 = Choose(x, 2*x - y, Overflow); for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1), @@ -2638,11 +2651,11 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, // its operands. // TODO: Generalize this to non-constants by using known-bits information. Type *Ty = LHS->getType(); - unsigned LZ = RHSC->getValue()->getValue().countLeadingZeros(); + unsigned LZ = RHSC->getAPInt().countLeadingZeros(); unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1; // For non-power-of-two values, effectively round the value up to the // nearest power of two. - if (!RHSC->getValue()->getValue().isPowerOf2()) + if (!RHSC->getAPInt().isPowerOf2()) ++MaxShiftAmt; IntegerType *ExtTy = IntegerType::get(getContext(), getTypeSizeInBits(Ty) + MaxShiftAmt); @@ -2650,18 +2663,17 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, if (const SCEVConstant *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this))) { // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded. 
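// [Illustrative aside, not part of this commit] A concrete instance of the
// {X,+,N}/C --> {X/C,+,N/C} fold above: {0,+,8} (0, 8, 16, ...) divided by 4
// is exactly {0,+,2}, because the step is divisible by the divisor; whereas
// {0,+,6} divided by 4 is not a recurrence at all (0, 1, 3, 4, ...).
// Checked in plain integers:
#include <cassert>
int main() {
  for (int I = 0, V = 0, W = 0; I < 8; ++I, V += 8, W += 2)
    assert(V / 4 == W);     // step 8 % 4 == 0: the fold is value-exact
  assert((2 * 6) / 4 == 3); // step 6: iteration 2 gives 3, not {0,+,1}'s 2
}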
- const APInt &StepInt = Step->getValue()->getValue(); - const APInt &DivInt = RHSC->getValue()->getValue(); + const APInt &StepInt = Step->getAPInt(); + const APInt &DivInt = RHSC->getAPInt(); if (!StepInt.urem(DivInt) && getZeroExtendExpr(AR, ExtTy) == getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = AR->getNumOperands(); i != e; ++i) - Operands.push_back(getUDivExpr(AR->getOperand(i), RHS)); - return getAddRecExpr(Operands, AR->getLoop(), - SCEV::FlagNW); + for (const SCEV *Op : AR->operands()) + Operands.push_back(getUDivExpr(Op, RHS)); + return getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagNW); } /// Get a canonical UDivExpr for a recurrence. /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0. @@ -2672,7 +2684,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy), getZeroExtendExpr(Step, ExtTy), AR->getLoop(), SCEV::FlagAnyWrap)) { - const APInt &StartInt = StartC->getValue()->getValue(); + const APInt &StartInt = StartC->getAPInt(); const APInt &StartRem = StartInt.urem(StepInt); if (StartRem != 0) LHS = getAddRecExpr(getConstant(StartInt - StartRem), Step, @@ -2682,8 +2694,8 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, // (A*B)/C --> A*(B/C) if safe and B/C can be folded. if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(LHS)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) - Operands.push_back(getZeroExtendExpr(M->getOperand(i), ExtTy)); + for (const SCEV *Op : M->operands()) + Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands)) // Find an operand that's safely divisible. for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) { @@ -2700,8 +2712,8 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded. if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) { SmallVector<const SCEV *, 4> Operands; - for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) - Operands.push_back(getZeroExtendExpr(A->getOperand(i), ExtTy)); + for (const SCEV *Op : A->operands()) + Operands.push_back(getZeroExtendExpr(Op, ExtTy)); if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) { Operands.clear(); for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) { @@ -2739,8 +2751,8 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) { - APInt A = C1->getValue()->getValue().abs(); - APInt B = C2->getValue()->getValue().abs(); + APInt A = C1->getAPInt().abs(); + APInt B = C2->getAPInt().abs(); uint32_t ABW = A.getBitWidth(); uint32_t BBW = B.getBitWidth(); @@ -2769,8 +2781,7 @@ const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS, if (const SCEVConstant *RHSCst = dyn_cast<SCEVConstant>(RHS)) { // If the mulexpr multiplies by a constant, then that constant must be the // first element of the mulexpr. - if (const SCEVConstant *LHSCst = - dyn_cast<SCEVConstant>(Mul->getOperand(0))) { + if (const auto *LHSCst = dyn_cast<SCEVConstant>(Mul->getOperand(0))) { if (LHSCst == RHSCst) { SmallVector<const SCEV *, 2> Operands; Operands.append(Mul->op_begin() + 1, Mul->op_end()); @@ -2782,10 +2793,10 @@ const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS, // check. 
APInt Factor = gcd(LHSCst, RHSCst); if (!Factor.isIntN(1)) { - LHSCst = cast<SCEVConstant>( - getConstant(LHSCst->getValue()->getValue().udiv(Factor))); - RHSCst = cast<SCEVConstant>( - getConstant(RHSCst->getValue()->getValue().udiv(Factor))); + LHSCst = + cast<SCEVConstant>(getConstant(LHSCst->getAPInt().udiv(Factor))); + RHSCst = + cast<SCEVConstant>(getConstant(RHSCst->getAPInt().udiv(Factor))); SmallVector<const SCEV *, 2> Operands; Operands.push_back(LHSCst); Operands.append(Mul->op_begin() + 1, Mul->op_end()); @@ -2859,22 +2870,19 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands, // Canonicalize nested AddRecs by nesting them in order of loop depth. if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) { const Loop *NestedLoop = NestedAR->getLoop(); - if (L->contains(NestedLoop) ? - (L->getLoopDepth() < NestedLoop->getLoopDepth()) : - (!NestedLoop->contains(L) && - DT->dominates(L->getHeader(), NestedLoop->getHeader()))) { + if (L->contains(NestedLoop) + ? (L->getLoopDepth() < NestedLoop->getLoopDepth()) + : (!NestedLoop->contains(L) && + DT.dominates(L->getHeader(), NestedLoop->getHeader()))) { SmallVector<const SCEV *, 4> NestedOperands(NestedAR->op_begin(), NestedAR->op_end()); Operands[0] = NestedAR->getStart(); // AddRecs require their operands be loop-invariant with respect to their // loops. Don't perform this transformation if it would break this // requirement. - bool AllInvariant = true; - for (unsigned i = 0, e = Operands.size(); i != e; ++i) - if (!isLoopInvariant(Operands[i], L)) { - AllInvariant = false; - break; - } + bool AllInvariant = all_of( + Operands, [&](const SCEV *Op) { return isLoopInvariant(Op, L); }); + if (AllInvariant) { // Create a recurrence for the outer loop with the same step size. // @@ -2884,12 +2892,10 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands, maskFlags(Flags, SCEV::FlagNW | NestedAR->getNoWrapFlags()); NestedOperands[0] = getAddRecExpr(Operands, L, OuterFlags); - AllInvariant = true; - for (unsigned i = 0, e = NestedOperands.size(); i != e; ++i) - if (!isLoopInvariant(NestedOperands[i], NestedLoop)) { - AllInvariant = false; - break; - } + AllInvariant = all_of(NestedOperands, [&](const SCEV *Op) { + return isLoopInvariant(Op, NestedLoop); + }); + if (AllInvariant) { // Ok, both add recurrences are valid after the transformation. // @@ -2936,10 +2942,11 @@ ScalarEvolution::getGEPExpr(Type *PointeeType, const SCEV *BaseExpr, // FIXME(PR23527): Don't blindly transfer the inbounds flag from the GEP // instruction to its SCEV, because the Instruction may be guarded by control // flow and the no-overflow bits may not be valid for the expression in any - // context. + // context. This can be fixed similarly to how these flags are handled for + // adds. SCEV::NoWrapFlags Wrap = InBounds ? SCEV::FlagNSW : SCEV::FlagAnyWrap; - const SCEV *TotalOffset = getConstant(IntPtrTy, 0); + const SCEV *TotalOffset = getZero(IntPtrTy); // The address space is unimportant. The first thing we do on CurTy is getting // its element type. Type *CurTy = PointerType::getUnqual(PointeeType); @@ -2996,7 +3003,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); // If there are any constants, fold them together. 
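(Editorial sketch of the fold performed just below, assuming only LLVM's APInt API; foldSMax is a hypothetical name.)

#include "llvm/ADT/APInt.h"
using namespace llvm;
// Two leading constants collapse via APIntOps::smax, mirroring the
// ConstantInt::get call in the loop below: smax(-3, 5) == 5.
APInt foldSMax() {
  APInt A(32, -3, /*isSigned=*/true), B(32, 5);
  return APIntOps::smax(A, B);
}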
unsigned Idx = 0; @@ -3005,9 +3012,8 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! - ConstantInt *Fold = ConstantInt::get(getContext(), - APIntOps::smax(LHSC->getValue()->getValue(), - RHSC->getValue()->getValue())); + ConstantInt *Fold = ConstantInt::get( + getContext(), APIntOps::smax(LHSC->getAPInt(), RHSC->getAPInt())); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; @@ -3100,7 +3106,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, LI); + GroupByComplexity(Ops, &LI); // If there are any constants, fold them together. unsigned Idx = 0; @@ -3109,9 +3115,8 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) { assert(Idx < Ops.size()); while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) { // We found two constants, fold them together! - ConstantInt *Fold = ConstantInt::get(getContext(), - APIntOps::umax(LHSC->getValue()->getValue(), - RHSC->getValue()->getValue())); + ConstantInt *Fold = ConstantInt::get( + getContext(), APIntOps::umax(LHSC->getAPInt(), RHSC->getAPInt())); Ops[0] = getConstant(Fold); Ops.erase(Ops.begin()+1); // Erase the folded element if (Ops.size() == 1) return Ops[0]; @@ -3200,8 +3205,7 @@ const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) { // We can bypass creating a target-independent // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. - return getConstant(IntTy, - F->getParent()->getDataLayout().getTypeAllocSize(AllocTy)); + return getConstant(IntTy, getDataLayout().getTypeAllocSize(AllocTy)); } const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy, @@ -3211,9 +3215,7 @@ const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy, // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. return getConstant( - IntTy, - F->getParent()->getDataLayout().getStructLayout(STy)->getElementOffset( - FieldNo)); + IntTy, getDataLayout().getStructLayout(STy)->getElementOffset(FieldNo)); } const SCEV *ScalarEvolution::getUnknown(Value *V) { @@ -3255,7 +3257,7 @@ bool ScalarEvolution::isSCEVable(Type *Ty) const { /// for which isSCEVable must return true. uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); - return F->getParent()->getDataLayout().getTypeSizeInBits(Ty); + return getDataLayout().getTypeSizeInBits(Ty); } /// getEffectiveSCEVType - Return a type with the same bitwidth as @@ -3265,20 +3267,20 @@ uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const { Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const { assert(isSCEVable(Ty) && "Type is not SCEVable!"); - if (Ty->isIntegerTy()) { + if (Ty->isIntegerTy()) return Ty; - } // The only other supported type is pointer. 
assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!"); - return F->getParent()->getDataLayout().getIntPtrType(Ty); + return getDataLayout().getIntPtrType(Ty); } const SCEV *ScalarEvolution::getCouldNotCompute() { - return &CouldNotCompute; + return CouldNotCompute.get(); } -namespace { + +bool ScalarEvolution::checkValidity(const SCEV *S) const { // Helper class working with SCEVTraversal to figure out if a SCEV contains // a SCEVUnknown with null value-pointer. FindInvalidSCEVUnknown::FindOne // is set iff we find such a SCEVUnknown. @@ -3300,9 +3302,7 @@ namespace { } bool isDone() const { return FindOne; } }; -} -bool ScalarEvolution::checkValidity(const SCEV *S) const { FindInvalidSCEVUnknown F; SCEVTraversal<FindInvalidSCEVUnknown> ST(F); ST.visitAll(S); @@ -3315,35 +3315,39 @@ bool ScalarEvolution::checkValidity(const SCEV *S) const { const SCEV *ScalarEvolution::getSCEV(Value *V) { assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); + const SCEV *S = getExistingSCEV(V); + if (S == nullptr) { + S = createSCEV(V); + ValueExprMap.insert(std::make_pair(SCEVCallbackVH(V, this), S)); + } + return S; +} + +const SCEV *ScalarEvolution::getExistingSCEV(Value *V) { + assert(isSCEVable(V->getType()) && "Value is not SCEVable!"); + ValueExprMapType::iterator I = ValueExprMap.find_as(V); if (I != ValueExprMap.end()) { const SCEV *S = I->second; if (checkValidity(S)) return S; - else - ValueExprMap.erase(I); + ValueExprMap.erase(I); } - const SCEV *S = createSCEV(V); - - // The process of creating a SCEV for V may have caused other SCEVs - // to have been created, so it's necessary to insert the new entry - // from scratch, rather than trying to remember the insert position - // above. - ValueExprMap.insert(std::make_pair(SCEVCallbackVH(V, this), S)); - return S; + return nullptr; } /// getNegativeSCEV - Return a SCEV corresponding to -V = -1*V /// -const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V) { +const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V, + SCEV::NoWrapFlags Flags) { if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V)) return getConstant( cast<ConstantInt>(ConstantExpr::getNeg(VC->getValue()))); Type *Ty = V->getType(); Ty = getEffectiveSCEVType(Ty); - return getMulExpr(V, - getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty)))); + return getMulExpr( + V, getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty))), Flags); } /// getNotSCEV - Return a SCEV corresponding to ~V = -1-V @@ -3362,15 +3366,40 @@ const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) { /// getMinusSCEV - Return LHS-RHS. Minus is represented in SCEV as A+B*-1. const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags) { - assert(!maskFlags(Flags, SCEV::FlagNUW) && "subtraction does not have NUW"); - // Fast path: X - X --> 0. if (LHS == RHS) - return getConstant(LHS->getType(), 0); + return getZero(LHS->getType()); + + // We represent LHS - RHS as LHS + (-1)*RHS. This transformation + // makes it so that we cannot make much use of NUW. + auto AddFlags = SCEV::FlagAnyWrap; + const bool RHSIsNotMinSigned = + !getSignedRange(RHS).getSignedMin().isMinSignedValue(); + if (maskFlags(Flags, SCEV::FlagNSW) == SCEV::FlagNSW) { + // Let M be the minimum representable signed value. Then (-1)*RHS + // signed-wraps if and only if RHS is M. That can happen even for + // a NSW subtraction because e.g. (-1)*M signed-wraps even though + // -1 - M does not. 
So to transfer NSW from LHS - RHS to LHS + + // (-1)*RHS, we need to prove that RHS != M. + // + // If LHS is non-negative and we know that LHS - RHS does not + // signed-wrap, then RHS cannot be M. So we can rule out signed-wrap + // either by proving that RHS > M or that LHS >= 0. + if (RHSIsNotMinSigned || isKnownNonNegative(LHS)) { + AddFlags = SCEV::FlagNSW; + } + } + + // FIXME: Find a correct way to transfer NSW to (-1)*M when LHS - + // RHS is NSW and LHS >= 0. + // + // The difficulty here is that the NSW flag may have been proven + // relative to a loop that is to be found in a recurrence in LHS and + // not in RHS. Applying NSW to (-1)*M may then let the NSW have a + // larger scope than intended. + auto NegFlags = RHSIsNotMinSigned ? SCEV::FlagNSW : SCEV::FlagAnyWrap; - // X - Y --> X + -Y. - // X -(nsw || nuw) Y --> X + -Y. - return getAddExpr(LHS, getNegativeSCEV(RHS)); + return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags); } /// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the @@ -3513,16 +3542,14 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) { if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) { return getPointerBase(Cast->getOperand()); - } - else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) { + } else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) { const SCEV *PtrOp = nullptr; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - if ((*I)->getType()->isPointerTy()) { + for (const SCEV *NAryOp : NAry->operands()) { + if (NAryOp->getType()->isPointerTy()) { // Cannot find the base of an expression with multiple pointer operands. if (PtrOp) return V; - PtrOp = *I; + PtrOp = NAryOp; } } if (!PtrOp) @@ -3558,8 +3585,7 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { if (!Visited.insert(I).second) continue; - ValueExprMapType::iterator It = - ValueExprMap.find_as(static_cast<Value *>(I)); + auto It = ValueExprMap.find_as(static_cast<Value *>(I)); if (It != ValueExprMap.end()) { const SCEV *Old = It->second; @@ -3587,165 +3613,476 @@ ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) { } } -/// createNodeForPHI - PHI nodes have two cases. Either the PHI node exists in -/// a loop header, making it a potential recurrence, or it doesn't. -/// -const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { - if (const Loop *L = LI->getLoopFor(PN->getParent())) - if (L->getHeader() == PN->getParent()) { - // The loop may have multiple entrances or multiple exits; we can analyze - // this phi as an addrec if it has a unique entry value and a unique - // backedge value. - Value *BEValueV = nullptr, *StartValueV = nullptr; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *V = PN->getIncomingValue(i); - if (L->contains(PN->getIncomingBlock(i))) { - if (!BEValueV) { - BEValueV = V; - } else if (BEValueV != V) { - BEValueV = nullptr; - break; - } - } else if (!StartValueV) { - StartValueV = V; - } else if (StartValueV != V) { - StartValueV = nullptr; - break; - } - } - if (BEValueV && StartValueV) { - // While we are analyzing this PHI node, handle its value symbolically. - const SCEV *SymbolicName = getUnknown(PN); - assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && - "PHI node already processed?"); - ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); - - // Using this symbolic name for the PHI, analyze the value coming around - // the back-edge. 
- const SCEV *BEValue = getSCEV(BEValueV); - - // NOTE: If BEValue is loop invariant, we know that the PHI node just - // has a special value for the first iteration of the loop. - - // If the value coming around the backedge is an add with the symbolic - // value we just inserted, then we found a simple induction variable! - if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) { - // If there is a single occurrence of the symbolic value, replace it - // with a recurrence. - unsigned FoundIndex = Add->getNumOperands(); - for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) - if (Add->getOperand(i) == SymbolicName) - if (FoundIndex == e) { - FoundIndex = i; - break; - } +namespace { +class SCEVInitRewriter : public SCEVRewriteVisitor<SCEVInitRewriter> { +public: + static const SCEV *rewrite(const SCEV *Scev, const Loop *L, + ScalarEvolution &SE) { + SCEVInitRewriter Rewriter(L, SE); + const SCEV *Result = Rewriter.visit(Scev); + return Rewriter.isValid() ? Result : SE.getCouldNotCompute(); + } - if (FoundIndex != Add->getNumOperands()) { - // Create an add with everything but the specified operand. - SmallVector<const SCEV *, 8> Ops; - for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) - if (i != FoundIndex) - Ops.push_back(Add->getOperand(i)); - const SCEV *Accum = getAddExpr(Ops); - - // This is not a valid addrec if the step amount is varying each - // loop iteration, but is not itself an addrec in this loop. - if (isLoopInvariant(Accum, L) || - (isa<SCEVAddRecExpr>(Accum) && - cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) { - SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; - - // If the increment doesn't overflow, then neither the addrec nor - // the post-increment will overflow. - if (const AddOperator *OBO = dyn_cast<AddOperator>(BEValueV)) { - if (OBO->getOperand(0) == PN) { - if (OBO->hasNoUnsignedWrap()) - Flags = setFlags(Flags, SCEV::FlagNUW); - if (OBO->hasNoSignedWrap()) - Flags = setFlags(Flags, SCEV::FlagNSW); - } - } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) { - // If the increment is an inbounds GEP, then we know the address - // space cannot be wrapped around. We cannot make any guarantee - // about signed or unsigned overflow because pointers are - // unsigned but we may have a negative index from the base - // pointer. We can guarantee that no unsigned wrap occurs if the - // indices form a positive value. - if (GEP->isInBounds() && GEP->getOperand(0) == PN) { - Flags = setFlags(Flags, SCEV::FlagNW); - - const SCEV *Ptr = getSCEV(GEP->getPointerOperand()); - if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr))) - Flags = setFlags(Flags, SCEV::FlagNUW); - } + SCEVInitRewriter(const Loop *L, ScalarEvolution &SE) + : SCEVRewriteVisitor(SE), L(L), Valid(true) {} - // We cannot transfer nuw and nsw flags from subtraction - // operations -- sub nuw X, Y is not the same as add nuw X, -Y - // for instance. - } + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant)) + Valid = false; + return Expr; + } - const SCEV *StartVal = getSCEV(StartValueV); - const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); - - // Since the no-wrap flags are on the increment, they apply to the - // post-incremented value as well. - if (isLoopInvariant(Accum, L)) - (void)getAddRecExpr(getAddExpr(StartVal, Accum), - Accum, L, Flags); - - // Okay, for the entire analysis of this edge we assumed the PHI - // to be symbolic. 
We now need to go back and purge all of the - // entries for the scalars that use the symbolic expression. - ForgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; - return PHISCEV; - } + const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + // Only allow AddRecExprs for this loop. + if (Expr->getLoop() == L) + return Expr->getStart(); + Valid = false; + return Expr; + } + + bool isValid() { return Valid; } + +private: + const Loop *L; + bool Valid; +}; + +class SCEVShiftRewriter : public SCEVRewriteVisitor<SCEVShiftRewriter> { +public: + static const SCEV *rewrite(const SCEV *Scev, const Loop *L, + ScalarEvolution &SE) { + SCEVShiftRewriter Rewriter(L, SE); + const SCEV *Result = Rewriter.visit(Scev); + return Rewriter.isValid() ? Result : SE.getCouldNotCompute(); + } + + SCEVShiftRewriter(const Loop *L, ScalarEvolution &SE) + : SCEVRewriteVisitor(SE), L(L), Valid(true) {} + + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + // Only allow AddRecExprs for this loop. + if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant)) + Valid = false; + return Expr; + } + + const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + if (Expr->getLoop() == L && Expr->isAffine()) + return SE.getMinusSCEV(Expr, Expr->getStepRecurrence(SE)); + Valid = false; + return Expr; + } + bool isValid() { return Valid; } + +private: + const Loop *L; + bool Valid; +}; +} // end anonymous namespace + +const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) { + const Loop *L = LI.getLoopFor(PN->getParent()); + if (!L || L->getHeader() != PN->getParent()) + return nullptr; + + // The loop may have multiple entrances or multiple exits; we can analyze + // this phi as an addrec if it has a unique entry value and a unique + // backedge value. + Value *BEValueV = nullptr, *StartValueV = nullptr; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *V = PN->getIncomingValue(i); + if (L->contains(PN->getIncomingBlock(i))) { + if (!BEValueV) { + BEValueV = V; + } else if (BEValueV != V) { + BEValueV = nullptr; + break; + } + } else if (!StartValueV) { + StartValueV = V; + } else if (StartValueV != V) { + StartValueV = nullptr; + break; + } + } + if (BEValueV && StartValueV) { + // While we are analyzing this PHI node, handle its value symbolically. + const SCEV *SymbolicName = getUnknown(PN); + assert(ValueExprMap.find_as(PN) == ValueExprMap.end() && + "PHI node already processed?"); + ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName)); + + // Using this symbolic name for the PHI, analyze the value coming around + // the back-edge. + const SCEV *BEValue = getSCEV(BEValueV); + + // NOTE: If BEValue is loop invariant, we know that the PHI node just + // has a special value for the first iteration of the loop. + + // If the value coming around the backedge is an add with the symbolic + // value we just inserted, then we found a simple induction variable! + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) { + // If there is a single occurrence of the symbolic value, replace it + // with a recurrence. + unsigned FoundIndex = Add->getNumOperands(); + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if (Add->getOperand(i) == SymbolicName) + if (FoundIndex == e) { + FoundIndex = i; + break; } - } else if (const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(BEValue)) { - // Otherwise, this could be a loop like this: - // i = 0; for (j = 1; ..; ++j) { .... 
i = j; } - // In this case, j = {1,+,1} and BEValue is j. - // Because the other in-value of i (0) fits the evolution of BEValue - // i really is an addrec evolution. - if (AddRec->getLoop() == L && AddRec->isAffine()) { - const SCEV *StartVal = getSCEV(StartValueV); - - // If StartVal = j.start - j.stride, we can use StartVal as the - // initial step of the addrec evolution. - if (StartVal == getMinusSCEV(AddRec->getOperand(0), - AddRec->getOperand(1))) { - // FIXME: For constant StartVal, we should be able to infer - // no-wrap flags. - const SCEV *PHISCEV = - getAddRecExpr(StartVal, AddRec->getOperand(1), L, - SCEV::FlagAnyWrap); - - // Okay, for the entire analysis of this edge we assumed the PHI - // to be symbolic. We now need to go back and purge all of the - // entries for the scalars that use the symbolic expression. - ForgetSymbolicName(PN, SymbolicName); - ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; - return PHISCEV; + + if (FoundIndex != Add->getNumOperands()) { + // Create an add with everything but the specified operand. + SmallVector<const SCEV *, 8> Ops; + for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i) + if (i != FoundIndex) + Ops.push_back(Add->getOperand(i)); + const SCEV *Accum = getAddExpr(Ops); + + // This is not a valid addrec if the step amount is varying each + // loop iteration, but is not itself an addrec in this loop. + if (isLoopInvariant(Accum, L) || + (isa<SCEVAddRecExpr>(Accum) && + cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) { + SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; + + // If the increment doesn't overflow, then neither the addrec nor + // the post-increment will overflow. + if (const AddOperator *OBO = dyn_cast<AddOperator>(BEValueV)) { + if (OBO->getOperand(0) == PN) { + if (OBO->hasNoUnsignedWrap()) + Flags = setFlags(Flags, SCEV::FlagNUW); + if (OBO->hasNoSignedWrap()) + Flags = setFlags(Flags, SCEV::FlagNSW); + } + } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) { + // If the increment is an inbounds GEP, then we know the address + // space cannot be wrapped around. We cannot make any guarantee + // about signed or unsigned overflow because pointers are + // unsigned but we may have a negative index from the base + // pointer. We can guarantee that no unsigned wrap occurs if the + // indices form a positive value. + if (GEP->isInBounds() && GEP->getOperand(0) == PN) { + Flags = setFlags(Flags, SCEV::FlagNW); + + const SCEV *Ptr = getSCEV(GEP->getPointerOperand()); + if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr))) + Flags = setFlags(Flags, SCEV::FlagNUW); } + + // We cannot transfer nuw and nsw flags from subtraction + // operations -- sub nuw X, Y is not the same as add nuw X, -Y + // for instance. } + + const SCEV *StartVal = getSCEV(StartValueV); + const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); + + // Since the no-wrap flags are on the increment, they apply to the + // post-incremented value as well. + if (isLoopInvariant(Accum, L)) + (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); + + // Okay, for the entire analysis of this edge we assumed the PHI + // to be symbolic. We now need to go back and purge all of the + // entries for the scalars that use the symbolic expression. + ForgetSymbolicName(PN, SymbolicName); + ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; + return PHISCEV; + } + } + } else { + // Otherwise, this could be a loop like this: + // i = 0; for (j = 1; ..; ++j) { .... i = j; } + // In this case, j = {1,+,1} and BEValue is j. 
+ // Because the other in-value of i (0) fits the evolution of BEValue + // i really is an addrec evolution. + // + // We can generalize this by saying that i is the shifted value of BEValue + // by one iteration: + // PHI(f(0), f({1,+,1})) --> f({0,+,1}) + const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this); + const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this); + if (Shifted != getCouldNotCompute() && + Start != getCouldNotCompute()) { + const SCEV *StartVal = getSCEV(StartValueV); + if (Start == StartVal) { + // Okay, for the entire analysis of this edge we assumed the PHI + // to be symbolic. We now need to go back and purge all of the + // entries for the scalars that use the symbolic expression. + ForgetSymbolicName(PN, SymbolicName); + ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted; + return Shifted; } } } + } + + return nullptr; +} + +// Checks if the SCEV S is available at BB. S is considered available at BB +// if S can be materialized at BB without introducing a fault. +static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S, + BasicBlock *BB) { + struct CheckAvailable { + bool TraversalDone = false; + bool Available = true; + + const Loop *L = nullptr; // The loop BB is in (can be nullptr) + BasicBlock *BB = nullptr; + DominatorTree &DT; + + CheckAvailable(const Loop *L, BasicBlock *BB, DominatorTree &DT) + : L(L), BB(BB), DT(DT) {} + + bool setUnavailable() { + TraversalDone = true; + Available = false; + return false; + } + + bool follow(const SCEV *S) { + switch (S->getSCEVType()) { + case scConstant: case scTruncate: case scZeroExtend: case scSignExtend: + case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr: + // These expressions are available if their operand(s) is/are. + return true; + + case scAddRecExpr: { + // We allow add recurrences that are on the loop BB is in, or some + // outer loop. This guarantees availability because the value of the + // add recurrence at BB is simply the "current" value of the induction + // variable. We can relax this in the future; for instance an add + // recurrence on a sibling dominating loop is also available at BB. + const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop(); + if (L && (ARLoop == L || ARLoop->contains(L))) + return true; + + return setUnavailable(); + } + + case scUnknown: { + // For SCEVUnknown, we check for simple dominance. + const auto *SU = cast<SCEVUnknown>(S); + Value *V = SU->getValue(); + + if (isa<Argument>(V)) + return false; + + if (isa<Instruction>(V) && DT.dominates(cast<Instruction>(V), BB)) + return false; + + return setUnavailable(); + } + + case scUDivExpr: + case scCouldNotCompute: + // We do not try to be smart about these at all. + return setUnavailable(); + } + llvm_unreachable("switch should be fully covered!"); + } + + bool isDone() { return TraversalDone; } + }; + + CheckAvailable CA(L, BB, DT); + SCEVTraversal<CheckAvailable> ST(CA); + + ST.visitAll(S); + return CA.Available; +} + +// Try to match a control flow sequence that branches out at BI and merges back +// at Merge into a "C ? LHS : RHS" select pattern. Return true on a successful +// match. 
+static bool BrPHIToSelect(DominatorTree &DT, BranchInst *BI, PHINode *Merge, + Value *&C, Value *&LHS, Value *&RHS) { + C = BI->getCondition(); + + BasicBlockEdge LeftEdge(BI->getParent(), BI->getSuccessor(0)); + BasicBlockEdge RightEdge(BI->getParent(), BI->getSuccessor(1)); + + if (!LeftEdge.isSingleEdge()) + return false; + + assert(RightEdge.isSingleEdge() && "Follows from LeftEdge.isSingleEdge()"); + + Use &LeftUse = Merge->getOperandUse(0); + Use &RightUse = Merge->getOperandUse(1); + + if (DT.dominates(LeftEdge, LeftUse) && DT.dominates(RightEdge, RightUse)) { + LHS = LeftUse; + RHS = RightUse; + return true; + } + + if (DT.dominates(LeftEdge, RightUse) && DT.dominates(RightEdge, LeftUse)) { + LHS = RightUse; + RHS = LeftUse; + return true; + } + + return false; +} + +const SCEV *ScalarEvolution::createNodeFromSelectLikePHI(PHINode *PN) { + if (PN->getNumIncomingValues() == 2) { + const Loop *L = LI.getLoopFor(PN->getParent()); + + // We don't want to break LCSSA, even in a SCEV expression tree. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (LI.getLoopFor(PN->getIncomingBlock(i)) != L) + return nullptr; + + // Try to match + // + // br %cond, label %left, label %right + // left: + // br label %merge + // right: + // br label %merge + // merge: + // V = phi [ %x, %left ], [ %y, %right ] + // + // as "select %cond, %x, %y" + + BasicBlock *IDom = DT[PN->getParent()]->getIDom()->getBlock(); + assert(IDom && "At least the entry block should dominate PN"); + + auto *BI = dyn_cast<BranchInst>(IDom->getTerminator()); + Value *Cond = nullptr, *LHS = nullptr, *RHS = nullptr; + + if (BI && BI->isConditional() && + BrPHIToSelect(DT, BI, PN, Cond, LHS, RHS) && + IsAvailableOnEntry(L, DT, getSCEV(LHS), PN->getParent()) && + IsAvailableOnEntry(L, DT, getSCEV(RHS), PN->getParent())) + return createNodeForSelectOrPHI(PN, Cond, LHS, RHS); + } + + return nullptr; +} + +const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) { + if (const SCEV *S = createAddRecFromPHI(PN)) + return S; + + if (const SCEV *S = createNodeFromSelectLikePHI(PN)) + return S; // If the PHI has a single incoming value, follow that value, unless the // PHI's incoming blocks are in a different loop, in which case doing so // risks breaking LCSSA form. Instcombine would normally zap these, but // it doesn't have DominatorTree information, so it may miss cases. - if (Value *V = - SimplifyInstruction(PN, F->getParent()->getDataLayout(), TLI, DT, AC)) - if (LI->replacementPreservesLCSSAForm(PN, V)) + if (Value *V = SimplifyInstruction(PN, getDataLayout(), &TLI, &DT, &AC)) + if (LI.replacementPreservesLCSSAForm(PN, V)) return getSCEV(V); // If it's not a loop phi, we can't handle it yet. return getUnknown(PN); } +const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I, + Value *Cond, + Value *TrueVal, + Value *FalseVal) { + // Handle "constant" branch or select. This can occur for instance when a + // loop pass transforms an inner loop and moves on to process the outer loop. + if (auto *CI = dyn_cast<ConstantInt>(Cond)) + return getSCEV(CI->isOne() ? TrueVal : FalseVal); + + // Try to match some simple smax or umax patterns. + auto *ICI = dyn_cast<ICmpInst>(Cond); + if (!ICI) + return getUnknown(I); + + Value *LHS = ICI->getOperand(0); + Value *RHS = ICI->getOperand(1); + + switch (ICI->getPredicate()) { + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: + std::swap(LHS, RHS); + // fall through + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: + // a >s b ? 
a+x : b+x -> smax(a, b)+x + // a >s b ? b+x : a+x -> smin(a, b)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) { + const SCEV *LS = getNoopOrSignExtend(getSCEV(LHS), I->getType()); + const SCEV *RS = getNoopOrSignExtend(getSCEV(RHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, LS); + const SCEV *RDiff = getMinusSCEV(RA, RS); + if (LDiff == RDiff) + return getAddExpr(getSMaxExpr(LS, RS), LDiff); + LDiff = getMinusSCEV(LA, RS); + RDiff = getMinusSCEV(RA, LS); + if (LDiff == RDiff) + return getAddExpr(getSMinExpr(LS, RS), LDiff); + } + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + std::swap(LHS, RHS); + // fall through + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + // a >u b ? a+x : b+x -> umax(a, b)+x + // a >u b ? b+x : a+x -> umin(a, b)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) { + const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); + const SCEV *RS = getNoopOrZeroExtend(getSCEV(RHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, LS); + const SCEV *RDiff = getMinusSCEV(RA, RS); + if (LDiff == RDiff) + return getAddExpr(getUMaxExpr(LS, RS), LDiff); + LDiff = getMinusSCEV(LA, RS); + RDiff = getMinusSCEV(RA, LS); + if (LDiff == RDiff) + return getAddExpr(getUMinExpr(LS, RS), LDiff); + } + break; + case ICmpInst::ICMP_NE: + // n != 0 ? n+x : 1+x -> umax(n, 1)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && + isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { + const SCEV *One = getOne(I->getType()); + const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, LS); + const SCEV *RDiff = getMinusSCEV(RA, One); + if (LDiff == RDiff) + return getAddExpr(getUMaxExpr(One, LS), LDiff); + } + break; + case ICmpInst::ICMP_EQ: + // n == 0 ? 1+x : n+x -> umax(n, 1)+x + if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) && + isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { + const SCEV *One = getOne(I->getType()); + const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType()); + const SCEV *LA = getSCEV(TrueVal); + const SCEV *RA = getSCEV(FalseVal); + const SCEV *LDiff = getMinusSCEV(LA, One); + const SCEV *RDiff = getMinusSCEV(RA, LS); + if (LDiff == RDiff) + return getAddExpr(getUMaxExpr(One, LS), LDiff); + } + break; + default: + break; + } + + return getUnknown(I); +} + /// createNodeForGEP - Expand GEP instructions into add and multiply /// operations. This allows them to be analyzed by regular SCEV code. /// @@ -3769,7 +4106,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { uint32_t ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) - return C->getValue()->getValue().countTrailingZeros(); + return C->getAPInt().countTrailingZeros(); if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(S)) return std::min(GetMinTrailingZeros(T->getOperand()), @@ -3834,8 +4171,8 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { // For a SCEVUnknown, ask ValueTracking. 
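(Editorial sketch of the fact recovered here: a value known to be a multiple of 2^k has at least k trailing zero bits, which the known-bits query below establishes without knowing the value itself.)

#include <bit>
#include <cstdint>
// 40 == 5 * 8 is a multiple of 2^3, so it has at least 3 trailing zeros.
static_assert(std::countr_zero(std::uint32_t{40}) >= 3);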
unsigned BitWidth = getTypeSizeInBits(U->getType()); APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - computeKnownBits(U->getValue(), Zeros, Ones, - F->getParent()->getDataLayout(), 0, AC, nullptr, DT); + computeKnownBits(U->getValue(), Zeros, Ones, getDataLayout(), 0, &AC, + nullptr, &DT); return Zeros.countTrailingOnes(); } @@ -3846,26 +4183,9 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { /// GetRangeFromMetadata - Helper method to assign a range to V from /// metadata present in the IR. static Optional<ConstantRange> GetRangeFromMetadata(Value *V) { - if (Instruction *I = dyn_cast<Instruction>(V)) { - if (MDNode *MD = I->getMetadata(LLVMContext::MD_range)) { - ConstantRange TotalRange( - cast<IntegerType>(I->getType())->getBitWidth(), false); - - unsigned NumRanges = MD->getNumOperands() / 2; - assert(NumRanges >= 1); - - for (unsigned i = 0; i < NumRanges; ++i) { - ConstantInt *Lower = - mdconst::extract<ConstantInt>(MD->getOperand(2 * i + 0)); - ConstantInt *Upper = - mdconst::extract<ConstantInt>(MD->getOperand(2 * i + 1)); - ConstantRange Range(Lower->getValue(), Upper->getValue()); - TotalRange = TotalRange.unionWith(Range); - } - - return TotalRange; - } - } + if (Instruction *I = dyn_cast<Instruction>(V)) + if (MDNode *MD = I->getMetadata(LLVMContext::MD_range)) + return getConstantRangeFromMetadata(*MD); return None; } @@ -3887,7 +4207,7 @@ ScalarEvolution::getRange(const SCEV *S, return I->second; if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) - return setRange(C, SignHint, ConstantRange(C->getValue()->getValue())); + return setRange(C, SignHint, ConstantRange(C->getAPInt())); unsigned BitWidth = getTypeSizeInBits(S->getType()); ConstantRange ConservativeResult(BitWidth, /*isFullSet=*/true); @@ -3965,9 +4285,8 @@ ScalarEvolution::getRange(const SCEV *S, if (AddRec->getNoWrapFlags(SCEV::FlagNUW)) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(AddRec->getStart())) if (!C->getValue()->isZero()) - ConservativeResult = - ConservativeResult.intersectWith( - ConstantRange(C->getValue()->getValue(), APInt(BitWidth, 0))); + ConservativeResult = ConservativeResult.intersectWith( + ConstantRange(C->getAPInt(), APInt(BitWidth, 0))); // If there's no signed wrap, and all the operands have the same sign or // zero, the value won't ever change sign. @@ -4065,18 +4384,18 @@ ScalarEvolution::getRange(const SCEV *S, // Split here to avoid paying the compile-time cost of calling both // computeKnownBits and ComputeNumSignBits. This restriction can be lifted // if needed. - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { // For a SCEVUnknown, ask ValueTracking. 
APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AC, nullptr, DT); + computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, &AC, nullptr, &DT); if (Ones != ~Zeros + 1) ConservativeResult = ConservativeResult.intersectWith(ConstantRange(Ones, ~Zeros + 1)); } else { assert(SignHint == ScalarEvolution::HINT_RANGE_SIGNED && "generalize as needed!"); - unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, AC, nullptr, DT); + unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, &AC, nullptr, &DT); if (NS > 1) ConservativeResult = ConservativeResult.intersectWith( ConstantRange(APInt::getSignedMinValue(BitWidth).ashr(NS - 1), @@ -4089,8 +4408,64 @@ ScalarEvolution::getRange(const SCEV *S, return setRange(S, SignHint, ConservativeResult); } -/// createSCEV - We know that there is no SCEV for the specified value. -/// Analyze the expression. +SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) { + if (isa<ConstantExpr>(V)) return SCEV::FlagAnyWrap; + const BinaryOperator *BinOp = cast<BinaryOperator>(V); + + // Return early if there are no flags to propagate to the SCEV. + SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; + if (BinOp->hasNoUnsignedWrap()) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); + if (BinOp->hasNoSignedWrap()) + Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); + if (Flags == SCEV::FlagAnyWrap) { + return SCEV::FlagAnyWrap; + } + + // Here we check that BinOp is in the header of the innermost loop + // containing BinOp, since we only deal with instructions in the loop + // header. The actual loop we need to check later will come from an add + // recurrence, but getting that requires computing the SCEV of the operands, + // which can be expensive. This check we can do cheaply to rule out some + // cases early. + Loop *innermostContainingLoop = LI.getLoopFor(BinOp->getParent()); + if (innermostContainingLoop == nullptr || + innermostContainingLoop->getHeader() != BinOp->getParent()) + return SCEV::FlagAnyWrap; + + // Only proceed if we can prove that BinOp does not yield poison. + if (!isKnownNotFullPoison(BinOp)) return SCEV::FlagAnyWrap; + + // At this point we know that if V is executed, then it does not wrap + // according to at least one of NSW or NUW. If V is not executed, then we do + // not know if the calculation that V represents would wrap. Multiple + // instructions can map to the same SCEV. If we apply NSW or NUW from V to + // the SCEV, we must guarantee no wrapping for that SCEV also when it is + // derived from other instructions that map to the same SCEV. We cannot make + // that guarantee for cases where V is not executed. So we need to find the + // loop that V is considered in relation to and prove that V is executed for + // every iteration of that loop. That implies that the value that V + // calculates does not wrap anywhere in the loop, so then we can apply the + // flags to the SCEV. + // + // We check isLoopInvariant to disambiguate in case we are adding two + // recurrences from different loops, so that we know which loop to prove + // that V is executed in. 
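(An editorial sketch, in source-level terms, of the guarantee being established: if the flagged operation executes on every iteration of the relevant loop, a wrapping execution would be undefined behavior on every path, so the whole recurrence can be assumed not to wrap. Names below are illustrative.)

int sumOnes(int n) {
  int acc = 0;
  for (int i = 0; i < n; ++i)
    acc += 1; // executes on every iteration; signed wrap here would be UB
  return acc; // so acc may be assumed to reach max(n, 0) without wrapping
}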
+ for (int OpIndex = 0; OpIndex < 2; ++OpIndex) { + const SCEV *Op = getSCEV(BinOp->getOperand(OpIndex)); + if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) { + const int OtherOpIndex = 1 - OpIndex; + const SCEV *OtherOp = getSCEV(BinOp->getOperand(OtherOpIndex)); + if (isLoopInvariant(OtherOp, AddRec->getLoop()) && + isGuaranteedToExecuteForEveryIteration(BinOp, AddRec->getLoop())) + return Flags; + } + } + return SCEV::FlagAnyWrap; +} + +/// createSCEV - We know that there is no SCEV for the specified value. Analyze +/// the expression. /// const SCEV *ScalarEvolution::createSCEV(Value *V) { if (!isSCEVable(V->getType())) @@ -4104,14 +4479,14 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { // reachable. Such instructions don't matter, and they aren't required // to obey basic rules for definitions dominating uses which this // analysis depends on. - if (!DT->isReachableFromEntry(I->getParent())) + if (!DT.isReachableFromEntry(I->getParent())) return getUnknown(V); } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) Opcode = CE->getOpcode(); else if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) return getConstant(CI); else if (isa<ConstantPointerNull>(V)) - return getConstant(V->getType(), 0); + return getZero(V->getType()); else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) return GA->mayBeOverridden() ? getUnknown(V) : getSCEV(GA->getAliasee()); else @@ -4126,47 +4501,79 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { // because it leads to N-1 getAddExpr calls for N ultimate operands. // Instead, gather up all the operands and make a single getAddExpr call. // LLVM IR canonical form means we need only traverse the left operands. - // - // Don't apply this instruction's NSW or NUW flags to the new - // expression. The instruction may be guarded by control flow that the - // no-wrap behavior depends on. Non-control-equivalent instructions can be - // mapped to the same SCEV expression, and it would be incorrect to transfer - // NSW/NUW semantics to those operations. SmallVector<const SCEV *, 4> AddOps; - AddOps.push_back(getSCEV(U->getOperand(1))); - for (Value *Op = U->getOperand(0); ; Op = U->getOperand(0)) { - unsigned Opcode = Op->getValueID() - Value::InstructionVal; - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) + for (Value *Op = U;; Op = U->getOperand(0)) { + U = dyn_cast<Operator>(Op); + unsigned Opcode = U ? U->getOpcode() : 0; + if (!U || (Opcode != Instruction::Add && Opcode != Instruction::Sub)) { + assert(Op != V && "V should be an add"); + AddOps.push_back(getSCEV(Op)); + break; + } + + if (auto *OpSCEV = getExistingSCEV(U)) { + AddOps.push_back(OpSCEV); + break; + } + + // If a NUW or NSW flag can be applied to the SCEV for this + // addition, then compute the SCEV for this addition by itself + // with a separate call to getAddExpr. We need to do that + // instead of pushing the operands of the addition onto AddOps, + // since the flags are only known to apply to this particular + // addition - they may not apply to other additions that can be + // formed with operands from AddOps. 
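(Hedged sketch of the shape being built, written against the SCEV API this patch itself uses; SE, A, B and C are hypothetical stand-ins.) For (a +nsw b) + c, the inner add keeps its flag as a single SCEV instead of having its operands spilled into the outer list:

const SCEV *Inner = SE.getAddExpr(SE.getSCEV(A), SE.getSCEV(B), SCEV::FlagNSW);
const SCEV *Whole = SE.getAddExpr(Inner, SE.getSCEV(C)); // no flags out here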
+ const SCEV *RHS = getSCEV(U->getOperand(1)); + SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(U); + if (Flags != SCEV::FlagAnyWrap) { + const SCEV *LHS = getSCEV(U->getOperand(0)); + if (Opcode == Instruction::Sub) + AddOps.push_back(getMinusSCEV(LHS, RHS, Flags)); + else + AddOps.push_back(getAddExpr(LHS, RHS, Flags)); break; - U = cast<Operator>(Op); - const SCEV *Op1 = getSCEV(U->getOperand(1)); + } + if (Opcode == Instruction::Sub) - AddOps.push_back(getNegativeSCEV(Op1)); + AddOps.push_back(getNegativeSCEV(RHS)); else - AddOps.push_back(Op1); + AddOps.push_back(RHS); } - AddOps.push_back(getSCEV(U->getOperand(0))); return getAddExpr(AddOps); } + case Instruction::Mul: { - // Don't transfer NSW/NUW for the same reason as AddExpr. SmallVector<const SCEV *, 4> MulOps; - MulOps.push_back(getSCEV(U->getOperand(1))); - for (Value *Op = U->getOperand(0); - Op->getValueID() == Instruction::Mul + Value::InstructionVal; - Op = U->getOperand(0)) { - U = cast<Operator>(Op); + for (Value *Op = U;; Op = U->getOperand(0)) { + U = dyn_cast<Operator>(Op); + if (!U || U->getOpcode() != Instruction::Mul) { + assert(Op != V && "V should be a mul"); + MulOps.push_back(getSCEV(Op)); + break; + } + + if (auto *OpSCEV = getExistingSCEV(U)) { + MulOps.push_back(OpSCEV); + break; + } + + SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(U); + if (Flags != SCEV::FlagAnyWrap) { + MulOps.push_back(getMulExpr(getSCEV(U->getOperand(0)), + getSCEV(U->getOperand(1)), Flags)); + break; + } + MulOps.push_back(getSCEV(U->getOperand(1))); } - MulOps.push_back(getSCEV(U->getOperand(0))); return getMulExpr(MulOps); } case Instruction::UDiv: return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1))); case Instruction::Sub: - return getMinusSCEV(getSCEV(U->getOperand(0)), - getSCEV(U->getOperand(1))); + return getMinusSCEV(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1)), + getNoWrapFlagsFromUB(U)); case Instruction::And: // For an expression like x&255 that merely masks off the high bits, // use zext(trunc(x)) as the SCEV expression. @@ -4185,8 +4592,8 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { unsigned TZ = A.countTrailingZeros(); unsigned BitWidth = A.getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(U->getOperand(0), KnownZero, KnownOne, - F->getParent()->getDataLayout(), 0, AC, nullptr, DT); + computeKnownBits(U->getOperand(0), KnownZero, KnownOne, getDataLayout(), + 0, &AC, nullptr, &DT); APInt EffectiveMask = APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ); @@ -4286,9 +4693,18 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { if (SA->getValue().uge(BitWidth)) break; + // It is currently not resolved how to interpret NSW for left + // shift by BitWidth - 1, so we avoid applying flags in that + // case. Remove this check (or this comment) once the situation + // is resolved. See + // http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html + // and http://reviews.llvm.org/D8890 . + auto Flags = SCEV::FlagAnyWrap; + if (SA->getValue().ult(BitWidth - 1)) Flags = getNoWrapFlagsFromUB(U); + Constant *X = ConstantInt::get(getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue())); - return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X)); + return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X), Flags); } break; @@ -4363,94 +4779,13 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { return createNodeForPHI(cast<PHINode>(U)); case Instruction::Select: - // This could be a smax or umax that was lowered earlier. 
- // Try to recover it. - if (ICmpInst *ICI = dyn_cast<ICmpInst>(U->getOperand(0))) { - Value *LHS = ICI->getOperand(0); - Value *RHS = ICI->getOperand(1); - switch (ICI->getPredicate()) { - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: - std::swap(LHS, RHS); - // fall through - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: - // a >s b ? a+x : b+x -> smax(a, b)+x - // a >s b ? b+x : a+x -> smin(a, b)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType())) { - const SCEV *LS = getNoopOrSignExtend(getSCEV(LHS), U->getType()); - const SCEV *RS = getNoopOrSignExtend(getSCEV(RHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, LS); - const SCEV *RDiff = getMinusSCEV(RA, RS); - if (LDiff == RDiff) - return getAddExpr(getSMaxExpr(LS, RS), LDiff); - LDiff = getMinusSCEV(LA, RS); - RDiff = getMinusSCEV(RA, LS); - if (LDiff == RDiff) - return getAddExpr(getSMinExpr(LS, RS), LDiff); - } - break; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: - std::swap(LHS, RHS); - // fall through - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: - // a >u b ? a+x : b+x -> umax(a, b)+x - // a >u b ? b+x : a+x -> umin(a, b)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType())) { - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType()); - const SCEV *RS = getNoopOrZeroExtend(getSCEV(RHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, LS); - const SCEV *RDiff = getMinusSCEV(RA, RS); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(LS, RS), LDiff); - LDiff = getMinusSCEV(LA, RS); - RDiff = getMinusSCEV(RA, LS); - if (LDiff == RDiff) - return getAddExpr(getUMinExpr(LS, RS), LDiff); - } - break; - case ICmpInst::ICMP_NE: - // n != 0 ? n+x : 1+x -> umax(n, 1)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType()) && - isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { - const SCEV *One = getConstant(U->getType(), 1); - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, LS); - const SCEV *RDiff = getMinusSCEV(RA, One); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(One, LS), LDiff); - } - break; - case ICmpInst::ICMP_EQ: - // n == 0 ? 1+x : n+x -> umax(n, 1)+x - if (getTypeSizeInBits(LHS->getType()) <= - getTypeSizeInBits(U->getType()) && - isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) { - const SCEV *One = getConstant(U->getType(), 1); - const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), U->getType()); - const SCEV *LA = getSCEV(U->getOperand(1)); - const SCEV *RA = getSCEV(U->getOperand(2)); - const SCEV *LDiff = getMinusSCEV(LA, One); - const SCEV *RDiff = getMinusSCEV(RA, LS); - if (LDiff == RDiff) - return getAddExpr(getUMaxExpr(One, LS), LDiff); - } - break; - default: - break; - } - } + // U can also be a select constant expr, which we let fall through. Since + // createNodeForSelect only works for a condition that is an `ICmpInst`, and + // constant expressions cannot have instructions as operands, we'd have + // returned getUnknown for a select constant expression anyway. 
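(Editorial example of the select patterns createNodeForSelectOrPHI recognizes, reached just below; plain C++, names illustrative.)

// "a >s b ? a+x : b+x" is smax(a, b) + x; the umax/umin cases are analogous.
int smaxPlus(int a, int b, int x) {
  return a > b ? a + x : b + x; // == std::max(a, b) + x
}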
+ if (isa<Instruction>(U)) + return createNodeForSelectOrPHI(cast<Instruction>(U), U->getOperand(0), + U->getOperand(1), U->getOperand(2)); default: // We cannot analyze this expression. break; @@ -4534,8 +4869,7 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L, return 1; // Get the trip count from the BE count by adding 1. - const SCEV *TCMul = getAddExpr(ExitCount, - getConstant(ExitCount->getType(), 1)); + const SCEV *TCMul = getAddExpr(ExitCount, getOne(ExitCount->getType())); // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt // to factor simple cases. if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(TCMul)) @@ -4610,10 +4944,10 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { if (!Pair.second) return Pair.first->second; - // ComputeBackedgeTakenCount may allocate memory for its result. Inserting it + // computeBackedgeTakenCount may allocate memory for its result. Inserting it // into the BackedgeTakenCounts map transfers ownership. Otherwise, the result // must be cleared in this scope. - BackedgeTakenInfo Result = ComputeBackedgeTakenCount(L); + BackedgeTakenInfo Result = computeBackedgeTakenCount(L); if (Result.getExact(this) != getCouldNotCompute()) { assert(isLoopInvariant(Result.getExact(this), L) && @@ -4666,7 +5000,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) { } // Re-lookup the insert position, since the call to - // ComputeBackedgeTakenCount above could result in a + // computeBackedgeTakenCount above could result in a // recursive call to getBackedgeTakenInfo (on a different // loop), which would invalidate the iterator computed // earlier. @@ -4744,12 +5078,12 @@ void ScalarEvolution::forgetValue(Value *V) { } /// getExact - Get the exact loop backedge taken count considering all loop -/// exits. A computable result can only be return for loops with a single exit. -/// Returning the minimum taken count among all exits is incorrect because one -/// of the loop's exit limit's may have been skipped. HowFarToZero assumes that -/// the limit of each loop test is never skipped. This is a valid assumption as -/// long as the loop exits via that test. For precise results, it is the -/// caller's responsibility to specify the relevant loop exit using +/// exits. A computable result can only be returned for loops with a single +/// exit. Returning the minimum taken count among all exits is incorrect +/// because one of the loop's exit limits may have been skipped. HowFarToZero +/// assumes that the limit of each loop test is never skipped. This is a valid +/// assumption as long as the loop exits via that test. For precise results, it +/// is the caller's responsibility to specify the relevant loop exit using /// getExact(ExitingBlock, SE). const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const { @@ -4847,10 +5181,10 @@ void ScalarEvolution::BackedgeTakenInfo::clear() { delete[] ExitNotTaken.getNextExit(); } -/// ComputeBackedgeTakenCount - Compute the number of times the backedge +/// computeBackedgeTakenCount - Compute the number of times the backedge /// of the specified loop will execute. ScalarEvolution::BackedgeTakenInfo -ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { +ScalarEvolution::computeBackedgeTakenCount(const Loop *L) { SmallVector<BasicBlock *, 8> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); @@ -4864,7 +5198,7 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { // and compute maxBECount. 
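(Editorial illustration for the per-exit walk below: each exiting block gets its own limit, and the conservative maximum is taken over the computable ones. Names illustrative.)

// Exit 1 is taken after max(n, 0) iterations; exit 2 after at most 101.
void twoExits(int n) {
  for (int i = 0;; ++i) {
    if (i >= n) break;   // per-exit limit: max(n, 0)
    if (i == 100) break; // per-exit limit: at most 101
  }
}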
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { BasicBlock *ExitBB = ExitingBlocks[i]; - ExitLimit EL = ComputeExitLimit(L, ExitBB); + ExitLimit EL = computeExitLimit(L, ExitBB); // 1. For each exit that can be computed, add an entry to ExitCounts. // CouldComputeBECount is true only if all exits can be computed. @@ -4885,7 +5219,7 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { // MaxBECount is conservatively the maximum EL.Max, where CouldNotCompute is // considered greater than any computable EL.Max. if (EL.Max != getCouldNotCompute() && Latch && - DT->dominates(ExitBB, Latch)) { + DT.dominates(ExitBB, Latch)) { if (!MustExitMaxBECount) MustExitMaxBECount = EL.Max; else { @@ -4906,13 +5240,11 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount); } -/// ComputeExitLimit - Compute the number of times the backedge of the specified -/// loop will execute if it exits via the specified block. ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { +ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { - // Okay, we've chosen an exiting block. See what condition causes us to - // exit at this block and remember the exit block and whether all other targets + // Okay, we've chosen an exiting block. See what condition causes us to exit + // at this block and remember the exit block and whether all other targets // lead to the loop header. bool MustExecuteLoopHeader = true; BasicBlock *Exit = nullptr; @@ -4952,8 +5284,7 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { if (!Pred) return getCouldNotCompute(); TerminatorInst *PredTerm = Pred->getTerminator(); - for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) { - BasicBlock *PredSucc = PredTerm->getSuccessor(i); + for (const BasicBlock *PredSucc : PredTerm->successors()) { if (PredSucc == BB) continue; // If the predecessor has a successor that isn't BB and isn't @@ -4976,19 +5307,19 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { if (BranchInst *BI = dyn_cast<BranchInst>(Term)) { assert(BI->isConditional() && "If unconditional, it can't be in loop!"); // Proceed to the next level to examine the exit condition expression. - return ComputeExitLimitFromCond(L, BI->getCondition(), BI->getSuccessor(0), + return computeExitLimitFromCond(L, BI->getCondition(), BI->getSuccessor(0), BI->getSuccessor(1), /*ControlsExit=*/IsOnlyExit); } if (SwitchInst *SI = dyn_cast<SwitchInst>(Term)) - return ComputeExitLimitFromSingleExitSwitch(L, SI, Exit, + return computeExitLimitFromSingleExitSwitch(L, SI, Exit, /*ControlsExit=*/IsOnlyExit); return getCouldNotCompute(); } -/// ComputeExitLimitFromCond - Compute the number of times the +/// computeExitLimitFromCond - Compute the number of times the /// backedge of the specified loop will execute if its exit condition /// were a conditional branch of ExitCond, TBB, and FBB. /// @@ -4997,7 +5328,7 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { /// condition is true and can infer that failing to meet the condition prior to /// integer wraparound results in undefined behavior. 
ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, +ScalarEvolution::computeExitLimitFromCond(const Loop *L, Value *ExitCond, BasicBlock *TBB, BasicBlock *FBB, @@ -5007,9 +5338,9 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, if (BO->getOpcode() == Instruction::And) { // Recurse on the operands of the and. bool EitherMayExit = L->contains(TBB); - ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); @@ -5042,9 +5373,9 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, if (BO->getOpcode() == Instruction::Or) { // Recurse on the operands of the or. bool EitherMayExit = L->contains(FBB); - ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); @@ -5079,7 +5410,7 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, // With an icmp, it may be feasible to compute an exact backedge-taken count. // Proceed to the next level to examine the icmp. if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond)) - return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit); + return computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit); // Check for a constant condition. These are normally stripped out by // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to @@ -5091,18 +5422,15 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, return getCouldNotCompute(); else // The backedge is never taken. - return getConstant(CI->getType(), 0); + return getZero(CI->getType()); } // If it's not an integer or pointer comparison then compute it the hard way. - return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); + return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); } -/// ComputeExitLimitFromICmp - Compute the number of times the -/// backedge of the specified loop will execute if its exit condition -/// were a conditional branch of the ICmpInst ExitCond, TBB, and FBB. 
ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, +ScalarEvolution::computeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond, BasicBlock *TBB, BasicBlock *FBB, @@ -5119,11 +5447,16 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0))) if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) { ExitLimit ItCnt = - ComputeLoadConstantCompareExitLimit(LI, RHS, L, Cond); + computeLoadConstantCompareExitLimit(LI, RHS, L, Cond); if (ItCnt.hasAnyInfo()) return ItCnt; } + ExitLimit ShiftEL = computeShiftCompareExitLimit( + ExitCond->getOperand(0), ExitCond->getOperand(1), L, Cond); + if (ShiftEL.hasAnyInfo()) + return ShiftEL; + const SCEV *LHS = getSCEV(ExitCond->getOperand(0)); const SCEV *RHS = getSCEV(ExitCond->getOperand(1)); @@ -5149,7 +5482,7 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, if (AddRec->getLoop() == L) { // Form the constant range. ConstantRange CompRange( - ICmpInst::makeConstantRange(Cond, RHSC->getValue()->getValue())); + ICmpInst::makeConstantRange(Cond, RHSC->getAPInt())); const SCEV *Ret = AddRec->getNumIterationsInRange(CompRange, *this); if (!isa<SCEVCouldNotCompute>(Ret)) return Ret; @@ -5183,21 +5516,13 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, break; } default: -#if 0 - dbgs() << "ComputeBackedgeTakenCount "; - if (ExitCond->getOperand(0)->getType()->isUnsigned()) - dbgs() << "[unsigned] "; - dbgs() << *LHS << " " - << Instruction::getOpcodeName(Instruction::ICmp) - << " " << *RHS << "\n"; -#endif break; } - return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); + return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB)); } ScalarEvolution::ExitLimit -ScalarEvolution::ComputeExitLimitFromSingleExitSwitch(const Loop *L, +ScalarEvolution::computeExitLimitFromSingleExitSwitch(const Loop *L, SwitchInst *Switch, BasicBlock *ExitingBlock, bool ControlsExit) { @@ -5230,11 +5555,11 @@ EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C, return cast<SCEVConstant>(Val)->getValue(); } -/// ComputeLoadConstantCompareExitLimit - Given an exit condition of +/// computeLoadConstantCompareExitLimit - Given an exit condition of /// 'icmp op load X, cst', try to see if we can compute the backedge /// execution count. ScalarEvolution::ExitLimit -ScalarEvolution::ComputeLoadConstantCompareExitLimit( +ScalarEvolution::computeLoadConstantCompareExitLimit( LoadInst *LI, Constant *RHS, const Loop *L, @@ -5303,11 +5628,6 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit( Result = ConstantExpr::getICmp(predicate, Result, RHS); if (!isa<ConstantInt>(Result)) break; // Couldn't decide for sure if (cast<ConstantInt>(Result)->getValue().isMinValue()) { -#if 0 - dbgs() << "\n***\n*** Computed loop count " << *ItCst - << "\n*** From global " << *GV << "*** BB: " << *L->getHeader() - << "***\n"; -#endif ++NumArrayLenItCounts; return getConstant(ItCst); // Found terminating iteration! 
} @@ -5315,6 +5635,149 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit( return getCouldNotCompute(); } +ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit( + Value *LHS, Value *RHSV, const Loop *L, ICmpInst::Predicate Pred) { + ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV); + if (!RHS) + return getCouldNotCompute(); + + const BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return getCouldNotCompute(); + + const BasicBlock *Predecessor = L->getLoopPredecessor(); + if (!Predecessor) + return getCouldNotCompute(); + + // Return true if V is of the form "LHS `shift_op` <positive constant>". + // Return LHS in OutLHS and shift_op in OutOpCode. + auto MatchPositiveShift = + [](Value *V, Value *&OutLHS, Instruction::BinaryOps &OutOpCode) { + + using namespace PatternMatch; + + ConstantInt *ShiftAmt; + if (match(V, m_LShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) + OutOpCode = Instruction::LShr; + else if (match(V, m_AShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) + OutOpCode = Instruction::AShr; + else if (match(V, m_Shl(m_Value(OutLHS), m_ConstantInt(ShiftAmt)))) + OutOpCode = Instruction::Shl; + else + return false; + + return ShiftAmt->getValue().isStrictlyPositive(); + }; + + // Recognize a "shift recurrence" either of the form %iv or of %iv.shifted in + // + // loop: + // %iv = phi i32 [ %iv.shifted, %loop ], [ %val, %preheader ] + // %iv.shifted = lshr i32 %iv, <positive constant> + // + // Return true on a successful match. Return the corresponding PHI node (%iv + // above) in PNOut and the opcode of the shift operation in OpCodeOut. + auto MatchShiftRecurrence = + [&](Value *V, PHINode *&PNOut, Instruction::BinaryOps &OpCodeOut) { + Optional<Instruction::BinaryOps> PostShiftOpCode; + + { + Instruction::BinaryOps OpC; + Value *V; + + // If we encounter a shift instruction, "peel off" the shift operation, + // and remember that we did so. Later when we inspect %iv's backedge + // value, we will make sure that the backedge value uses the same + // operation. + // + // Note: the peeled shift operation does not have to be the same + // instruction as the one feeding into the PHI's backedge value. We only + // really care about it being the same *kind* of shift instruction -- + // that's all that is required for our later inferences to hold. + if (MatchPositiveShift(LHS, V, OpC)) { + PostShiftOpCode = OpC; + LHS = V; + } + } + + PNOut = dyn_cast<PHINode>(LHS); + if (!PNOut || PNOut->getParent() != L->getHeader()) + return false; + + Value *BEValue = PNOut->getIncomingValueForBlock(Latch); + Value *OpLHS; + + return + // The backedge value for the PHI node must be a shift by a positive + // amount + MatchPositiveShift(BEValue, OpLHS, OpCodeOut) && + + // of the PHI node itself + OpLHS == PNOut && + + // and the kind of shift should match the kind of shift we peeled + // off, if any. + (!PostShiftOpCode.hasValue() || *PostShiftOpCode == OpCodeOut); + }; + + PHINode *PN; + Instruction::BinaryOps OpCode; + if (!MatchShiftRecurrence(LHS, PN, OpCode)) + return getCouldNotCompute(); + + const DataLayout &DL = getDataLayout(); + + // The key rationale for this optimization is that for some kinds of shift + // recurrences, the value of the recurrence "stabilizes" to either 0 or -1 + // within a finite number of iterations. 
If the condition guarding the + // backedge (in the sense that the backedge is taken if the condition is true) + // is false for the value the shift recurrence stabilizes to, then we know + // that the backedge is taken only a finite number of times. + + ConstantInt *StableValue = nullptr; + switch (OpCode) { + default: + llvm_unreachable("Impossible case!"); + + case Instruction::AShr: { + // {K,ashr,<positive-constant>} stabilizes to signum(K) in at most + // bitwidth(K) iterations. + Value *FirstValue = PN->getIncomingValueForBlock(Predecessor); + bool KnownZero, KnownOne; + ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr, + Predecessor->getTerminator(), &DT); + auto *Ty = cast<IntegerType>(RHS->getType()); + if (KnownZero) + StableValue = ConstantInt::get(Ty, 0); + else if (KnownOne) + StableValue = ConstantInt::get(Ty, -1, true); + else + return getCouldNotCompute(); + + break; + } + case Instruction::LShr: + case Instruction::Shl: + // Both {K,lshr,<positive-constant>} and {K,shl,<positive-constant>} + // stabilize to 0 in at most bitwidth(K) iterations. + StableValue = ConstantInt::get(cast<IntegerType>(RHS->getType()), 0); + break; + } + + auto *Result = + ConstantFoldCompareInstOperands(Pred, StableValue, RHS, DL, &TLI); + assert(Result->getType()->isIntegerTy(1) && + "Otherwise cannot be an operand to a branch instruction"); + + if (Result->isZeroValue()) { + unsigned BitWidth = getTypeSizeInBits(RHS->getType()); + const SCEV *UpperBound = + getConstant(getEffectiveSCEVType(RHS->getType()), BitWidth); + return ExitLimit(getCouldNotCompute(), UpperBound); + } + + return getCouldNotCompute(); +} /// CanConstantFold - Return true if we can constant fold an instruction of the /// specified type, assuming that all operands were constants. @@ -5356,12 +5819,10 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L, // Otherwise, we can evaluate this instruction if all of its operands are // constant or derived from a PHI node themselves. PHINode *PHI = nullptr; - for (Instruction::op_iterator OpI = UseInst->op_begin(), - OpE = UseInst->op_end(); OpI != OpE; ++OpI) { - - if (isa<Constant>(*OpI)) continue; + for (Value *Op : UseInst->operands()) { + if (isa<Constant>(Op)) continue; - Instruction *OpInst = dyn_cast<Instruction>(*OpI); + Instruction *OpInst = dyn_cast<Instruction>(Op); if (!OpInst || !canConstantEvolve(OpInst, L)) return nullptr; PHINode *P = dyn_cast<PHINode>(OpInst); @@ -5395,9 +5856,8 @@ static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) { Instruction *I = dyn_cast<Instruction>(V); if (!I || !canConstantEvolve(I, L)) return nullptr; - if (PHINode *PN = dyn_cast<PHINode>(I)) { + if (PHINode *PN = dyn_cast<PHINode>(I)) return PN; - } // Record non-constant instructions contained by the loop. DenseMap<Instruction *, PHINode *> PHIMap; @@ -5454,6 +5914,30 @@ static Constant *EvaluateExpression(Value *V, const Loop *L, TLI); } + +// If every incoming value to PN except the one for BB is a specific Constant, +// return that, else return nullptr. 
+static Constant *getOtherIncomingValue(PHINode *PN, BasicBlock *BB) { + Constant *IncomingVal = nullptr; + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + if (PN->getIncomingBlock(i) == BB) + continue; + + auto *CurrentVal = dyn_cast<Constant>(PN->getIncomingValue(i)); + if (!CurrentVal) + return nullptr; + + if (IncomingVal != CurrentVal) { + if (IncomingVal) + return nullptr; + IncomingVal = CurrentVal; + } + } + + return IncomingVal; +} + /// getConstantEvolutionLoopExitValue - If we know that the specified Phi is /// in the header of its containing loop, we know the loop executes a /// constant number of times, and the PHI node is just a recurrence @@ -5462,8 +5946,7 @@ Constant * ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, const APInt &BEs, const Loop *L) { - DenseMap<PHINode*, Constant*>::const_iterator I = - ConstantEvolutionLoopExitValue.find(PN); + auto I = ConstantEvolutionLoopExitValue.find(PN); if (I != ConstantEvolutionLoopExitValue.end()) return I->second; @@ -5476,22 +5959,21 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); - // Since the loop is canonicalized, the PHI node must have two entries. One - // entry must be a constant (coming in from outside of the loop), and the - // second must be derived from the same PHI. - bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1)); - PHINode *PHI = nullptr; - for (BasicBlock::iterator I = Header->begin(); - (PHI = dyn_cast<PHINode>(I)); ++I) { - Constant *StartCST = - dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge)); + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return nullptr; + + for (auto &I : *Header) { + PHINode *PHI = dyn_cast<PHINode>(&I); + if (!PHI) break; + auto *StartCST = getOtherIncomingValue(PHI, Latch); if (!StartCST) continue; CurrentIterVals[PHI] = StartCST; } if (!CurrentIterVals.count(PN)) return RetVal = nullptr; - Value *BEValue = PN->getIncomingValue(SecondIsBackedge); + Value *BEValue = PN->getIncomingValueForBlock(Latch); // Execute the loop symbolically to determine the exit value. if (BEs.getActiveBits() >= 32) @@ -5499,7 +5981,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, unsigned NumIterations = BEs.getZExtValue(); // must be in range unsigned IterationNum = 0; - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); for (; ; ++IterationNum) { if (IterationNum == NumIterations) return RetVal = CurrentIterVals[PN]; // Got exit value! @@ -5508,7 +5990,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, // EvaluateExpression adds non-phi values to the CurrentIterVals map. DenseMap<Instruction *, Constant *> NextIterVals; Constant *NextPHI = - EvaluateExpression(BEValue, L, CurrentIterVals, DL, TLI); + EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); if (!NextPHI) return nullptr; // Couldn't evaluate! NextIterVals[PN] = NextPHI; @@ -5519,23 +6001,21 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, // cease to be able to evaluate one of them or if they stop evolving, // because that doesn't necessarily prevent us from computing PN. 
SmallVector<std::pair<PHINode *, Constant *>, 8> PHIsToCompute; - for (DenseMap<Instruction *, Constant *>::const_iterator - I = CurrentIterVals.begin(), E = CurrentIterVals.end(); I != E; ++I){ - PHINode *PHI = dyn_cast<PHINode>(I->first); + for (const auto &I : CurrentIterVals) { + PHINode *PHI = dyn_cast<PHINode>(I.first); if (!PHI || PHI == PN || PHI->getParent() != Header) continue; - PHIsToCompute.push_back(std::make_pair(PHI, I->second)); + PHIsToCompute.emplace_back(PHI, I.second); } // We use two distinct loops because EvaluateExpression may invalidate any // iterators into CurrentIterVals. - for (SmallVectorImpl<std::pair<PHINode *, Constant*> >::const_iterator - I = PHIsToCompute.begin(), E = PHIsToCompute.end(); I != E; ++I) { - PHINode *PHI = I->first; + for (const auto &I : PHIsToCompute) { + PHINode *PHI = I.first; Constant *&NextPHI = NextIterVals[PHI]; if (!NextPHI) { // Not already computed. - Value *BEValue = PHI->getIncomingValue(SecondIsBackedge); - NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, TLI); + Value *BEValue = PHI->getIncomingValueForBlock(Latch); + NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } - if (NextPHI != I->second) + if (NextPHI != I.second) StoppedEvolving = false; } @@ -5548,12 +6028,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, } } -/// ComputeExitCountExhaustively - If the loop is known to execute a -/// constant number of times (the condition evolves only from constants), -/// try to evaluate a few iterations of the loop until we get the exit -/// condition gets a value of ExitWhen (true or false). If we cannot -/// evaluate the trip count of the loop, return getCouldNotCompute(). -const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, +const SCEV *ScalarEvolution::computeExitCountExhaustively(const Loop *L, Value *Cond, bool ExitWhen) { PHINode *PN = getConstantEvolvingPHI(Cond, L); @@ -5567,14 +6042,14 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, BasicBlock *Header = L->getHeader(); assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!"); - // One entry must be a constant (coming in from outside of the loop), and the - // second must be derived from the same PHI. - bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1)); - PHINode *PHI = nullptr; - for (BasicBlock::iterator I = Header->begin(); - (PHI = dyn_cast<PHINode>(I)); ++I) { - Constant *StartCST = - dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge)); + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "Should follow from NumIncomingValues == 2!"); + + for (auto &I : *Header) { + PHINode *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + auto *StartCST = getOtherIncomingValue(PHI, Latch); if (!StartCST) continue; CurrentIterVals[PHI] = StartCST; } @@ -5585,10 +6060,10 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, // the loop symbolically to determine when the condition gets a value of // "ExitWhen". unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis. - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); for (unsigned IterationNum = 0; IterationNum != MaxIterations;++IterationNum){ - ConstantInt *CondVal = dyn_cast_or_null<ConstantInt>( - EvaluateExpression(Cond, L, CurrentIterVals, DL, TLI)); + auto *CondVal = dyn_cast_or_null<ConstantInt>( + EvaluateExpression(Cond, L, CurrentIterVals, DL, &TLI)); // Couldn't symbolically evaluate. 
if (!CondVal) return getCouldNotCompute(); @@ -5605,20 +6080,17 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, // calling EvaluateExpression on them because that may invalidate iterators // into CurrentIterVals. SmallVector<PHINode *, 8> PHIsToCompute; - for (DenseMap<Instruction *, Constant *>::const_iterator - I = CurrentIterVals.begin(), E = CurrentIterVals.end(); I != E; ++I){ - PHINode *PHI = dyn_cast<PHINode>(I->first); + for (const auto &I : CurrentIterVals) { + PHINode *PHI = dyn_cast<PHINode>(I.first); if (!PHI || PHI->getParent() != Header) continue; PHIsToCompute.push_back(PHI); } - for (SmallVectorImpl<PHINode *>::const_iterator I = PHIsToCompute.begin(), - E = PHIsToCompute.end(); I != E; ++I) { - PHINode *PHI = *I; + for (PHINode *PHI : PHIsToCompute) { Constant *&NextPHI = NextIterVals[PHI]; if (NextPHI) continue; // Already computed! - Value *BEValue = PHI->getIncomingValue(SecondIsBackedge); - NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, TLI); + Value *BEValue = PHI->getIncomingValueForBlock(Latch); + NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI); } CurrentIterVals.swap(NextIterVals); } @@ -5638,22 +6110,22 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, /// In the case that a relevant loop exit value cannot be computed, the /// original value V is returned. const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { + SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values = + ValuesAtScopes[V]; // Check to see if we've folded this expression at this loop before. - SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values = ValuesAtScopes[V]; - for (unsigned u = 0; u < Values.size(); u++) { - if (Values[u].first == L) - return Values[u].second ? Values[u].second : V; - } - Values.push_back(std::make_pair(L, static_cast<const SCEV *>(nullptr))); + for (auto &LS : Values) + if (LS.first == L) + return LS.second ? LS.second : V; + + Values.emplace_back(L, nullptr); + // Otherwise compute it. const SCEV *C = computeSCEVAtScope(V, L); - SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values2 = ValuesAtScopes[V]; - for (unsigned u = Values2.size(); u > 0; u--) { - if (Values2[u - 1].first == L) { - Values2[u - 1].second = C; + for (auto &LS : reverse(ValuesAtScopes[V])) + if (LS.first == L) { + LS.second = C; break; } - } return C; } @@ -5763,7 +6235,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // exit value from the loop without using SCEVs. if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V)) { if (Instruction *I = dyn_cast<Instruction>(SU->getValue())) { - const Loop *LI = (*this->LI)[I->getParent()]; + const Loop *LI = this->LI[I->getParent()]; if (LI && LI->getParentLoop() == L) // Looking for loop exit value. if (PHINode *PN = dyn_cast<PHINode>(I)) if (PN->getParent() == LI->getHeader()) { @@ -5777,9 +6249,8 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // Okay, we know how many times the containing loop executes. If // this is a constant evolving PHI node, get the final value at // the specified iteration number. 
- Constant *RV = getConstantEvolutionLoopExitValue(PN, - BTCC->getValue()->getValue(), - LI); + Constant *RV = + getConstantEvolutionLoopExitValue(PN, BTCC->getAPInt(), LI); if (RV) return getSCEV(RV); } } @@ -5791,8 +6262,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { if (CanConstantFold(I)) { SmallVector<Constant *, 4> Operands; bool MadeImprovement = false; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - Value *Op = I->getOperand(i); + for (Value *Op : I->operands()) { if (Constant *C = dyn_cast<Constant>(Op)) { Operands.push_back(C); continue; @@ -5821,16 +6291,16 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) { // Check to see if getSCEVAtScope actually made an improvement. if (MadeImprovement) { Constant *C = nullptr; - const DataLayout &DL = F->getParent()->getDataLayout(); + const DataLayout &DL = getDataLayout(); if (const CmpInst *CI = dyn_cast<CmpInst>(I)) C = ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0], - Operands[1], DL, TLI); + Operands[1], DL, &TLI); else if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { if (!LI->isVolatile()) C = ConstantFoldLoadFromConstPtr(Operands[0], DL); } else C = ConstantFoldInstOperands(I->getOpcode(), I->getType(), Operands, - DL, TLI); + DL, &TLI); if (!C) return V; return getSCEV(C); } @@ -6021,10 +6491,10 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { return std::make_pair(CNC, CNC); } - uint32_t BitWidth = LC->getValue()->getValue().getBitWidth(); - const APInt &L = LC->getValue()->getValue(); - const APInt &M = MC->getValue()->getValue(); - const APInt &N = NC->getValue()->getValue(); + uint32_t BitWidth = LC->getAPInt().getBitWidth(); + const APInt &L = LC->getAPInt(); + const APInt &M = MC->getAPInt(); + const APInt &N = NC->getAPInt(); APInt Two(BitWidth, 2); APInt Four(BitWidth, 4); @@ -6103,10 +6573,6 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first); const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second); if (R1 && R2) { -#if 0 - dbgs() << "HFTZ: " << *V << " - sol#1: " << *R1 - << " sol#2: " << *R2 << "\n"; -#endif // Pick the smallest positive root value. if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(CmpInst::ICMP_ULT, @@ -6160,7 +6626,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { // For negative steps (counting down to zero): // N = Start/-Step // First compute the unsigned distance from zero in the direction of Step. - bool CountDown = StepC->getValue()->getValue().isNegative(); + bool CountDown = StepC->getAPInt().isNegative(); const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start); // Handle unitary steps, which cannot wraparound. @@ -6185,13 +6651,53 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { // done by counting and comparing the number of trailing zeros of Step and // Distance. if (!CountDown) { - const APInt &StepV = StepC->getValue()->getValue(); + const APInt &StepV = StepC->getAPInt(); // StepV.isPowerOf2() returns true if StepV is a positive power of two. It // also returns true if StepV is maximally negative (eg, INT_MIN), but that // case is not handled as this code is guarded by !CountDown. 
if (StepV.isPowerOf2() && - GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) - return getUDivExactExpr(Distance, Step); + GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) { + // Here we've constrained the equation to be of the form + // + // 2^(N + k) * Distance' = (StepV == 2^N) * X (mod 2^W) ... (0) + // + // where we're operating on a W bit wide integer domain and k is + // non-negative. The smallest unsigned solution for X is the trip count. + // + // (0) is equivalent to: + // + // 2^(N + k) * Distance' - 2^N * X = L * 2^W + // <=> 2^N(2^k * Distance' - X) = L * 2^(W - N) * 2^N + // <=> 2^k * Distance' - X = L * 2^(W - N) + // <=> 2^k * Distance' = L * 2^(W - N) + X ... (1) + // + // The smallest X satisfying (1) is the unsigned remainder of dividing the + // LHS by 2^(W - N). + // + // <=> X = 2^k * Distance' URem 2^(W - N) ... (2) + // + // E.g. say we're solving + // + // 2 * Val = 2 * X (in i8) ... (3) + // + // then from (2), we get X = Val URem i8 128 (k = 0 in this case). + // + // Note: It is tempting to solve (3) by setting X = Val, but Val is not + // necessarily the smallest unsigned value of X that satisfies (3). + // E.g. if Val is i8 -127 then the smallest value of X that satisfies (3) + // is i8 1, not i8 -127. + + const auto *ModuloResult = getUDivExactExpr(Distance, Step); + + // Since SCEV does not have a URem node, we construct one using a truncate + // and a zero extend. + + unsigned NarrowWidth = StepV.getBitWidth() - StepV.countTrailingZeros(); + auto *NarrowTy = IntegerType::get(getContext(), NarrowWidth); + auto *WideTy = Distance->getType(); + + return getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy); + } } // If the condition controls loop exit (the loop exits only if the expression // is true), use the number of times the loop exits. @@ -6207,8 +6713,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) { // Then, try to solve the above equation provided that Start is constant. if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) - return SolveLinEquationWithOverflow(StepC->getValue()->getValue(), - -StartC->getValue()->getValue(), + return SolveLinEquationWithOverflow(StepC->getAPInt(), -StartC->getAPInt(), *this); return getCouldNotCompute(); } @@ -6226,7 +6731,7 @@ ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) { // already. If so, the backedge will execute zero times. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) { if (!C->getValue()->isNullValue()) - return getConstant(C->getType(), 0); + return getZero(C->getType()); return getCouldNotCompute(); // Otherwise it will loop infinitely. } @@ -6251,7 +6756,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) { // A loop's header is defined to be a block that dominates the loop. // If the header has a unique predecessor outside the loop, it must be // a block that has exactly one successor that can reach the loop. - if (Loop *L = LI->getLoopFor(BB)) + if (Loop *L = LI.getLoopFor(BB)) return std::make_pair(L->getLoopPredecessor(), L->getHeader()); return std::pair<BasicBlock *, BasicBlock *>(); } @@ -6267,13 +6772,20 @@ static bool HasSameValue(const SCEV *A, const SCEV *B) { // Quick check to see if they are the same SCEV. if (A == B) return true; + auto ComputesEqualValues = [](const Instruction *A, const Instruction *B) { + // Not all instructions that are "identical" compute the same value. 
For + // instance, two distinct alloca instructions allocating the same type are + // identical and do not read memory, but compute distinct values. + return A->isIdenticalTo(B) && (isa<BinaryOperator>(A) || isa<GetElementPtrInst>(A)); + }; + // Otherwise, if they're both SCEVUnknown, it's possible that they hold // two different instructions with the same value. Check for this case. if (const SCEVUnknown *AU = dyn_cast<SCEVUnknown>(A)) if (const SCEVUnknown *BU = dyn_cast<SCEVUnknown>(B)) if (const Instruction *AI = dyn_cast<Instruction>(AU->getValue())) if (const Instruction *BI = dyn_cast<Instruction>(BU->getValue())) - if (AI->isIdenticalTo(BI) && !AI->mayReadFromMemory()) + if (ComputesEqualValues(AI, BI)) return true; // Otherwise assume they may have a different value. @@ -6324,7 +6836,7 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, // If there's a constant operand, canonicalize comparisons with boundary // cases, and canonicalize *-or-equal comparisons to regular comparisons. if (const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS)) { - const APInt &RA = RC->getValue()->getValue(); + const APInt &RA = RC->getAPInt(); switch (Pred) { default: llvm_unreachable("Unexpected ICmpInst::Predicate value!"); case ICmpInst::ICMP_EQ: @@ -6515,16 +7027,14 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred, Pred = ICmpInst::ICMP_ULT; Changed = true; } else if (!getUnsignedRange(LHS).getUnsignedMin().isMinValue()) { - LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS, - SCEV::FlagNUW); + LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS); Pred = ICmpInst::ICMP_ULT; Changed = true; } break; case ICmpInst::ICMP_UGE: if (!getUnsignedRange(RHS).getUnsignedMin().isMinValue()) { - RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS, - SCEV::FlagNUW); + RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS); Pred = ICmpInst::ICMP_UGT; Changed = true; } else if (!getUnsignedRange(LHS).getUnsignedMax().isMaxValue()) { @@ -6612,10 +7122,140 @@ bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred, if (LeftGuarded && RightGuarded) return true; + if (isKnownPredicateViaSplitting(Pred, LHS, RHS)) + return true; + // Otherwise see what can be done with known constant ranges. return isKnownPredicateWithRanges(Pred, LHS, RHS); } +bool ScalarEvolution::isMonotonicPredicate(const SCEVAddRecExpr *LHS, + ICmpInst::Predicate Pred, + bool &Increasing) { + bool Result = isMonotonicPredicateImpl(LHS, Pred, Increasing); + +#ifndef NDEBUG + // Verify an invariant: inverting the predicate should turn a monotonically + // increasing change into a monotonically decreasing one, and vice versa. + bool IncreasingSwapped; + bool ResultSwapped = isMonotonicPredicateImpl( + LHS, ICmpInst::getSwappedPredicate(Pred), IncreasingSwapped); + + assert(Result == ResultSwapped && "should be able to analyze both!"); + if (ResultSwapped) + assert(Increasing == !IncreasingSwapped && + "monotonicity should flip as we flip the predicate"); +#endif + + return Result; +} + +bool ScalarEvolution::isMonotonicPredicateImpl(const SCEVAddRecExpr *LHS, + ICmpInst::Predicate Pred, + bool &Increasing) { + + // A zero step value for LHS means the induction variable is essentially a + // loop invariant value. 
We don't really depend on the predicate actually + // flipping from false to true (for increasing predicates, and the other way + // around for decreasing predicates), all we care about is that *if* the + // predicate changes then it only changes from false to true. + // + // A zero step value in itself is not very useful, but there may be places + // where SCEV can prove X >= 0 but not prove X > 0, so it is helpful to be + // as general as possible. + + switch (Pred) { + default: + return false; // Conservative answer + + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + if (!LHS->getNoWrapFlags(SCEV::FlagNUW)) + return false; + + Increasing = Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE; + return true; + + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: { + if (!LHS->getNoWrapFlags(SCEV::FlagNSW)) + return false; + + const SCEV *Step = LHS->getStepRecurrence(*this); + + if (isKnownNonNegative(Step)) { + Increasing = Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE; + return true; + } + + if (isKnownNonPositive(Step)) { + Increasing = Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE; + return true; + } + + return false; + } + + } + + llvm_unreachable("switch has default clause!"); +} + +bool ScalarEvolution::isLoopInvariantPredicate( + ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L, + ICmpInst::Predicate &InvariantPred, const SCEV *&InvariantLHS, + const SCEV *&InvariantRHS) { + + // If there is a loop-invariant, force it into the RHS, otherwise bail out. + if (!isLoopInvariant(RHS, L)) { + if (!isLoopInvariant(LHS, L)) + return false; + + std::swap(LHS, RHS); + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + const SCEVAddRecExpr *ArLHS = dyn_cast<SCEVAddRecExpr>(LHS); + if (!ArLHS || ArLHS->getLoop() != L) + return false; + + bool Increasing; + if (!isMonotonicPredicate(ArLHS, Pred, Increasing)) + return false; + + // If the predicate "ArLHS `Pred` RHS" monotonically increases from false to + // true as the loop iterates, and the backedge is control dependent on + // "ArLHS `Pred` RHS" == true then we can reason as follows: + // + // * if the predicate was false in the first iteration then the predicate + // is never evaluated again, since the loop exits without taking the + // backedge. + // * if the predicate was true in the first iteration then it will + // continue to be true for all future iterations since it is + // monotonically increasing. + // + // For both the above possibilities, we can replace the loop varying + // predicate with its value on the first iteration of the loop (which is + // loop invariant). + // + // A similar reasoning applies for a monotonically decreasing predicate, by + // replacing true with false and false with true in the above two bullets. + + auto P = Increasing ? 
Pred : ICmpInst::getInversePredicate(Pred); + + if (!isLoopBackedgeGuardedByCond(L, P, LHS, RHS)) + return false; + + InvariantPred = Pred; + InvariantLHS = ArLHS->getStart(); + InvariantRHS = RHS; + return true; +} + bool ScalarEvolution::isKnownPredicateWithRanges(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { @@ -6690,6 +7330,84 @@ ScalarEvolution::isKnownPredicateWithRanges(ICmpInst::Predicate Pred, return false; } +bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, + const SCEV *LHS, + const SCEV *RHS) { + + // Match Result to (X + Y)<ExpectedFlags> where Y is a constant integer. + // Return Y via OutY. + auto MatchBinaryAddToConst = + [this](const SCEV *Result, const SCEV *X, APInt &OutY, + SCEV::NoWrapFlags ExpectedFlags) { + const SCEV *NonConstOp, *ConstOp; + SCEV::NoWrapFlags FlagsPresent; + + if (!splitBinaryAdd(Result, ConstOp, NonConstOp, FlagsPresent) || + !isa<SCEVConstant>(ConstOp) || NonConstOp != X) + return false; + + OutY = cast<SCEVConstant>(ConstOp)->getAPInt(); + return (FlagsPresent & ExpectedFlags) == ExpectedFlags; + }; + + APInt C; + + switch (Pred) { + default: + break; + + case ICmpInst::ICMP_SGE: + std::swap(LHS, RHS); + case ICmpInst::ICMP_SLE: + // X s<= (X + C)<nsw> if C >= 0 + if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative()) + return true; + + // (X + C)<nsw> s<= X if C <= 0 + if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && + !C.isStrictlyPositive()) + return true; + break; + + case ICmpInst::ICMP_SGT: + std::swap(LHS, RHS); + case ICmpInst::ICMP_SLT: + // X s< (X + C)<nsw> if C > 0 + if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && + C.isStrictlyPositive()) + return true; + + // (X + C)<nsw> s< X if C < 0 + if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && C.isNegative()) + return true; + break; + } + + return false; +} + +bool ScalarEvolution::isKnownPredicateViaSplitting(ICmpInst::Predicate Pred, + const SCEV *LHS, + const SCEV *RHS) { + if (Pred != ICmpInst::ICMP_ULT || ProvingSplitPredicate) + return false; + + // Allowing an arbitrary number of activations of isKnownPredicateViaSplitting + // on the stack can result in exponential time complexity. + SaveAndRestore<bool> Restore(ProvingSplitPredicate, true); + + // If L >= 0 then I `ult` L <=> I >= 0 && I `slt` L + // + // To prove L >= 0 we use isKnownNonNegative whereas to prove I >= 0 we use + // isKnownPredicate. isKnownPredicate is more powerful, but also more + // expensive; and using isKnownNonNegative(RHS) is sufficient for most of the + // interesting cases seen in practice. We can consider "upgrading" L >= 0 to + // use isKnownPredicate later if needed. + return isKnownNonNegative(RHS) && + isKnownPredicate(CmpInst::ICMP_SGE, LHS, getZero(LHS->getType())) && + isKnownPredicate(CmpInst::ICMP_SLT, LHS, RHS); +} + /// isLoopBackedgeGuardedByCond - Test whether the backedge of the loop is /// protected by a conditional between LHS and RHS. This is used /// to eliminate casts. @@ -6715,46 +7433,49 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L, LoopContinuePredicate->getSuccessor(0) != L->getHeader())) return true; + // We don't want more than one activation of the following loops on the stack + // -- that can lead to O(n!) time complexity. + if (WalkingBEDominatingConds) + return false; + + SaveAndRestore<bool> ClearOnExit(WalkingBEDominatingConds, true); + + // See if we can exploit a trip count to prove the predicate. 
+ const auto &BETakenInfo = getBackedgeTakenInfo(L); + const SCEV *LatchBECount = BETakenInfo.getExact(Latch, this); + if (LatchBECount != getCouldNotCompute()) { + // We know that Latch branches back to the loop header exactly + // LatchBECount times. This means the backedge condition at Latch is + // equivalent to "{0,+,1} u< LatchBECount". + Type *Ty = LatchBECount->getType(); + auto NoWrapFlags = SCEV::NoWrapFlags(SCEV::FlagNUW | SCEV::FlagNW); + const SCEV *LoopCounter = + getAddRecExpr(getZero(Ty), getOne(Ty), L, NoWrapFlags); + if (isImpliedCond(Pred, LHS, RHS, ICmpInst::ICMP_ULT, LoopCounter, + LatchBECount)) + return true; + } + // Check conditions due to any @llvm.assume intrinsics. - for (auto &AssumeVH : AC->assumptions()) { + for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast<CallInst>(AssumeVH); - if (!DT->dominates(CI, Latch->getTerminator())) + if (!DT.dominates(CI, Latch->getTerminator())) continue; if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false)) return true; } - struct ClearWalkingBEDominatingCondsOnExit { - ScalarEvolution &SE; - - explicit ClearWalkingBEDominatingCondsOnExit(ScalarEvolution &SE) - : SE(SE){}; - - ~ClearWalkingBEDominatingCondsOnExit() { - SE.WalkingBEDominatingConds = false; - } - }; - - // We don't want more than one activation of the following loop on the stack - // -- that can lead to O(n!) time complexity. - if (WalkingBEDominatingConds) - return false; - - WalkingBEDominatingConds = true; - ClearWalkingBEDominatingCondsOnExit ClearOnExit(*this); - // If the loop is not reachable from the entry block, we risk running into an // infinite loop as we walk up into the dom tree. These loops do not matter // anyway, so we just return a conservative answer when we see them. - if (!DT->isReachableFromEntry(L->getHeader())) + if (!DT.isReachableFromEntry(L->getHeader())) return false; - for (DomTreeNode *DTN = (*DT)[Latch], *HeaderDTN = (*DT)[L->getHeader()]; - DTN != HeaderDTN; - DTN = DTN->getIDom()) { + for (DomTreeNode *DTN = DT[Latch], *HeaderDTN = DT[L->getHeader()]; + DTN != HeaderDTN; DTN = DTN->getIDom()) { assert(DTN && "should reach the loop header before reaching the root!"); @@ -6778,7 +7499,7 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L, // We're constructively (and conservatively) enumerating edges within the // loop body that dominate the latch. The dominator tree better agree // with us on this: - assert(DT->dominates(DominatingEdge, Latch) && "should be!"); + assert(DT.dominates(DominatingEdge, Latch) && "should be!"); if (isImpliedCond(Pred, LHS, RHS, Condition, BB != ContinuePredicate->getSuccessor(0))) @@ -6823,11 +7544,11 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, } // Check conditions due to any @llvm.assume intrinsics. - for (auto &AssumeVH : AC->assumptions()) { + for (auto &AssumeVH : AC.assumptions()) { if (!AssumeVH) continue; auto *CI = cast<CallInst>(AssumeVH); - if (!DT->dominates(CI, L->getHeader())) + if (!DT.dominates(CI, L->getHeader())) continue; if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false)) @@ -6837,6 +7558,7 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, return false; } +namespace { /// RAII wrapper to prevent recursive application of isImpliedCond. /// ScalarEvolution's PendingLoopPredicates set must be empty unless we are /// currently evaluating isImpliedCond. 
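To make the trip-count reasoning in the hunk above concrete, here is a minimal sketch (a hypothetical C++ input loop, not part of the patch). Once an exact backedge-taken count is known for the latch, the canonical counter {0,+,1} is u< that count on every iteration that takes the backedge, which is exactly the query isLoopBackedgeGuardedByCond answers -- no dominating conditional branch needs to be found:

    // Hypothetical input. SCEV computes an exact backedge-taken count
    // from the latch condition "i != n", so any predicate implied by
    // "{0,+,1} u< backedge-taken-count" (for example, that i stays
    // below n whenever the backedge is taken) is now provable directly.
    void walk(long *a, unsigned n) {
      for (unsigned i = 0; i != n; ++i)
        a[i] += 1;
    }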
@@ -6854,6 +7576,7 @@ struct MarkPendingLoopPredicate { LoopPreds.erase(Cond); } }; +} // end anonymous namespace /// isImpliedCond - Test whether the condition described by Pred, LHS, /// and RHS is true whenever the given Cond value evaluates to true. @@ -6892,6 +7615,14 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *FoundLHS = getSCEV(ICI->getOperand(0)); const SCEV *FoundRHS = getSCEV(ICI->getOperand(1)); + return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS); +} + +bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, + ICmpInst::Predicate FoundPred, + const SCEV *FoundLHS, + const SCEV *FoundRHS) { // Balance the types. if (getTypeSizeInBits(LHS->getType()) < getTypeSizeInBits(FoundLHS->getType())) { @@ -6947,6 +7678,13 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, RHS, LHS, FoundLHS, FoundRHS); } + // Unsigned comparison is the same as signed comparison when both the operands + // are non-negative. + if (CmpInst::isUnsigned(FoundPred) && + CmpInst::getSignedPredicate(FoundPred) == Pred && + isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); + // Check if we can make progress by sharpening ranges. if (FoundPred == ICmpInst::ICMP_NE && (isa<SCEVConstant>(FoundLHS) || isa<SCEVConstant>(FoundRHS))) { @@ -6970,7 +7708,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, APInt Min = ICmpInst::isSigned(Pred) ? getSignedRange(V).getSignedMin() : getUnsignedRange(V).getUnsignedMin(); - if (Min == C->getValue()->getValue()) { + if (Min == C->getAPInt()) { // Given (V >= Min && V != Min) we conclude V >= (Min + 1). // This is true even if (Min + 1) wraps around -- in case of // wraparound, (Min + 1) < Min, so (V >= Min => V >= (Min + 1)). @@ -7021,6 +7759,149 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, return false; } +bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr, + const SCEV *&L, const SCEV *&R, + SCEV::NoWrapFlags &Flags) { + const auto *AE = dyn_cast<SCEVAddExpr>(Expr); + if (!AE || AE->getNumOperands() != 2) + return false; + + L = AE->getOperand(0); + R = AE->getOperand(1); + Flags = AE->getNoWrapFlags(); + return true; +} + +bool ScalarEvolution::computeConstantDifference(const SCEV *Less, + const SCEV *More, + APInt &C) { + // We avoid subtracting expressions here because this function is usually + // fairly deep in the call stack (i.e. is called many times). + + if (isa<SCEVAddRecExpr>(Less) && isa<SCEVAddRecExpr>(More)) { + const auto *LAR = cast<SCEVAddRecExpr>(Less); + const auto *MAR = cast<SCEVAddRecExpr>(More); + + if (LAR->getLoop() != MAR->getLoop()) + return false; + + // We look at affine expressions only; not for correctness but to keep + // getStepRecurrence cheap. 
+ if (!LAR->isAffine() || !MAR->isAffine()) + return false; + + if (LAR->getStepRecurrence(*this) != MAR->getStepRecurrence(*this)) + return false; + + Less = LAR->getStart(); + More = MAR->getStart(); + + // fall through + } + + if (isa<SCEVConstant>(Less) && isa<SCEVConstant>(More)) { + const auto &M = cast<SCEVConstant>(More)->getAPInt(); + const auto &L = cast<SCEVConstant>(Less)->getAPInt(); + C = M - L; + return true; + } + + const SCEV *L, *R; + SCEV::NoWrapFlags Flags; + if (splitBinaryAdd(Less, L, R, Flags)) + if (const auto *LC = dyn_cast<SCEVConstant>(L)) + if (R == More) { + C = -(LC->getAPInt()); + return true; + } + + if (splitBinaryAdd(More, L, R, Flags)) + if (const auto *LC = dyn_cast<SCEVConstant>(L)) + if (R == Less) { + C = LC->getAPInt(); + return true; + } + + return false; +} + +bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow( + ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, + const SCEV *FoundLHS, const SCEV *FoundRHS) { + if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_ULT) + return false; + + const auto *AddRecLHS = dyn_cast<SCEVAddRecExpr>(LHS); + if (!AddRecLHS) + return false; + + const auto *AddRecFoundLHS = dyn_cast<SCEVAddRecExpr>(FoundLHS); + if (!AddRecFoundLHS) + return false; + + // We'd like to let SCEV reason about control dependencies, so we constrain + // both the inequalities to be about add recurrences on the same loop. This + // way we can use isLoopEntryGuardedByCond later. + + const Loop *L = AddRecFoundLHS->getLoop(); + if (L != AddRecLHS->getLoop()) + return false; + + // FoundLHS u< FoundRHS u< -C => (FoundLHS + C) u< (FoundRHS + C) ... (1) + // + // FoundLHS s< FoundRHS s< INT_MIN - C => (FoundLHS + C) s< (FoundRHS + C) + // ... (2) + // + // Informal proof for (2), assuming (1) [*]: + // + // We'll also assume (A s< B) <=> ((A + INT_MIN) u< (B + INT_MIN)) ... (3)[**] + // + // Then + // + // FoundLHS s< FoundRHS s< INT_MIN - C + // <=> (FoundLHS + INT_MIN) u< (FoundRHS + INT_MIN) u< -C [ using (3) ] + // <=> (FoundLHS + INT_MIN + C) u< (FoundRHS + INT_MIN + C) [ using (1) ] + // <=> (FoundLHS + INT_MIN + C + INT_MIN) s< + // (FoundRHS + INT_MIN + C + INT_MIN) [ using (3) ] + // <=> FoundLHS + C s< FoundRHS + C + // + // [*]: (1) can be proved by ruling out overflow. + // + // [**]: This can be proved by analyzing all the four possibilities: + // (A s< 0, B s< 0), (A s< 0, B s>= 0), (A s>= 0, B s< 0) and + // (A s>= 0, B s>= 0). + // + // Note: + // Despite (2), "FoundRHS s< INT_MIN - C" does not mean that "FoundRHS + C" + // will not sign underflow. For instance, say FoundLHS = (i8 -128), FoundRHS + // = (i8 -127) and C = (i8 -100). Then INT_MIN - C = (i8 -28), and FoundRHS + // s< (INT_MIN - C). Lack of sign overflow / underflow in "FoundRHS + C" is + // neither necessary nor sufficient to prove "(FoundLHS + C) s< (FoundRHS + + // C)". + + APInt LDiff, RDiff; + if (!computeConstantDifference(FoundLHS, LHS, LDiff) || + !computeConstantDifference(FoundRHS, RHS, RDiff) || + LDiff != RDiff) + return false; + + if (LDiff == 0) + return true; + + APInt FoundRHSLimit; + + if (Pred == CmpInst::ICMP_ULT) { + FoundRHSLimit = -RDiff; + } else { + assert(Pred == CmpInst::ICMP_SLT && "Checked above!"); + FoundRHSLimit = APInt::getSignedMinValue(getTypeSizeInBits(RHS->getType())) - RDiff; + } + + // Try to prove (1) or (2), as needed. 
+ return isLoopEntryGuardedByCond(L, Pred, FoundRHS, + getConstant(FoundRHSLimit)); +} + /// isImpliedCondOperands - Test whether the condition described by Pred, /// LHS, and RHS is true whenever the condition described by Pred, FoundLHS, /// and FoundRHS is true. @@ -7031,6 +7912,9 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; + if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS)) + return true; + return isImpliedCondOperandsHelper(Pred, LHS, RHS, FoundLHS, FoundRHS) || // ~x < ~y --> x > y @@ -7043,17 +7927,13 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, /// If Expr computes ~A, return A else return nullptr static const SCEV *MatchNotExpr(const SCEV *Expr) { const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr); - if (!Add || Add->getNumOperands() != 2) return nullptr; - - const SCEVConstant *AddLHS = dyn_cast<SCEVConstant>(Add->getOperand(0)); - if (!(AddLHS && AddLHS->getValue()->getValue().isAllOnesValue())) + if (!Add || Add->getNumOperands() != 2 || + !Add->getOperand(0)->isAllOnesValue()) return nullptr; const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1)); - if (!AddRHS || AddRHS->getNumOperands() != 2) return nullptr; - - const SCEVConstant *MulLHS = dyn_cast<SCEVConstant>(AddRHS->getOperand(0)); - if (!(MulLHS && MulLHS->getValue()->getValue().isAllOnesValue())) + if (!AddRHS || AddRHS->getNumOperands() != 2 || + !AddRHS->getOperand(0)->isAllOnesValue()) return nullptr; return AddRHS->getOperand(1); @@ -7067,8 +7947,7 @@ static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr, const MaxExprType *MaxExpr = dyn_cast<MaxExprType>(MaybeMaxExpr); if (!MaxExpr) return false; - auto It = std::find(MaxExpr->op_begin(), MaxExpr->op_end(), Candidate); - return It != MaxExpr->op_end(); + return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end(); } @@ -7084,6 +7963,38 @@ static bool IsMinConsistingOf(ScalarEvolution &SE, return IsMaxConsistingOf<MaxExprType>(MaybeMaxExpr, SE.getNotSCEV(Candidate)); } +static bool IsKnownPredicateViaAddRecStart(ScalarEvolution &SE, + ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS) { + + // If both sides are affine addrecs for the same loop, with equal + // steps, and we know the recurrences don't wrap, then we only + // need to check the predicate on the starting values. + + if (!ICmpInst::isRelational(Pred)) + return false; + + const SCEVAddRecExpr *LAR = dyn_cast<SCEVAddRecExpr>(LHS); + if (!LAR) + return false; + const SCEVAddRecExpr *RAR = dyn_cast<SCEVAddRecExpr>(RHS); + if (!RAR) + return false; + if (LAR->getLoop() != RAR->getLoop()) + return false; + if (!LAR->isAffine() || !RAR->isAffine()) + return false; + + if (LAR->getStepRecurrence(SE) != RAR->getStepRecurrence(SE)) + return false; + + SCEV::NoWrapFlags NW = ICmpInst::isSigned(Pred) ? + SCEV::FlagNSW : SCEV::FlagNUW; + if (!LAR->getNoWrapFlags(NW) || !RAR->getNoWrapFlags(NW)) + return false; + + return SE.isKnownPredicate(Pred, LAR->getStart(), RAR->getStart()); +} /// Is LHS `Pred` RHS true on the virtue of LHS or RHS being a Min or Max /// expression? 
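The addrec-start rule above has a simple arithmetic reading: two affine recurrences in the same loop with equal steps and matching no-wrap flags keep a loop-invariant difference, so the predicate on the starting values decides the predicate on every iteration. A sketch of the corresponding source pattern (hypothetical example, for illustration only):

    #include <cassert>

    // Both counters advance by the same step (4) and neither overflows,
    // so comparing {2,+,4}<nsw> s< {10,+,4}<nsw> reduces to comparing
    // the starts: 2 s< 10. The difference q - p is loop-invariant.
    void addrec_start_demo() {
      for (int p = 2, q = 10; p < 1000; p += 4, q += 4)
        assert(p < q); // provable from the starting values alone
    }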
@@ -7129,7 +8040,9 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred, auto IsKnownPredicateFull = [this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) { return isKnownPredicateWithRanges(Pred, LHS, RHS) || - IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS); + IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) || + IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) || + isKnownPredicateViaNoOverflow(Pred, LHS, RHS); }; switch (Pred) { @@ -7185,7 +8098,7 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred, !isa<SCEVConstant>(AddLHS->getOperand(0))) return false; - APInt ConstFoundRHS = cast<SCEVConstant>(FoundRHS)->getValue()->getValue(); + APInt ConstFoundRHS = cast<SCEVConstant>(FoundRHS)->getAPInt(); // `FoundLHSRange` is the range we know `FoundLHS` to be in by virtue of the // antecedent "`FoundLHS` `Pred` `FoundRHS`". @@ -7194,13 +8107,12 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred, // Since `LHS` is `FoundLHS` + `AddLHS->getOperand(0)`, we can compute a range // for `LHS`: - APInt Addend = - cast<SCEVConstant>(AddLHS->getOperand(0))->getValue()->getValue(); + APInt Addend = cast<SCEVConstant>(AddLHS->getOperand(0))->getAPInt(); ConstantRange LHSRange = FoundLHSRange.add(ConstantRange(Addend)); // We can also compute the range of values for `LHS` that satisfy the // consequent, "`LHS` `Pred` `RHS`": - APInt ConstRHS = cast<SCEVConstant>(RHS)->getValue()->getValue(); + APInt ConstRHS = cast<SCEVConstant>(RHS)->getAPInt(); ConstantRange SatisfyingLHSRange = ConstantRange::makeSatisfyingICmpRegion(Pred, ConstRHS); @@ -7217,7 +8129,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, if (NoWrap) return false; unsigned BitWidth = getTypeSizeInBits(RHS->getType()); - const SCEV *One = getConstant(Stride->getType(), 1); + const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MaxRHS = getSignedRange(RHS).getSignedMax(); @@ -7246,7 +8158,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, if (NoWrap) return false; unsigned BitWidth = getTypeSizeInBits(RHS->getType()); - const SCEV *One = getConstant(Stride->getType(), 1); + const SCEV *One = getOne(Stride->getType()); if (IsSigned) { APInt MinRHS = getSignedRange(RHS).getSignedMin(); @@ -7271,7 +8183,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, // stride and presence of the equality in the comparison. const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step, bool Equality) { - const SCEV *One = getConstant(Step->getType(), 1); + const SCEV *One = getOne(Step->getType()); Delta = Equality ? 
getAddExpr(Delta, Step) : getAddExpr(Delta, getMinusSCEV(Step, One)); return getUDivExpr(Delta, Step); @@ -7324,7 +8236,7 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS, // overflow, in which case if RHS - Start is a constant, we don't need to // do a max operation since we can just figure it out statically if (NoWrap && isa<SCEVConstant>(Diff)) { - APInt D = dyn_cast<const SCEVConstant>(Diff)->getValue()->getValue(); + APInt D = dyn_cast<const SCEVConstant>(Diff)->getAPInt(); if (D.isNegative()) End = Start; } else @@ -7405,7 +8317,7 @@ ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS, // overflow, in which case if RHS - Start is a constant, we don't need to // do a max operation since we can just figure it out statically if (NoWrap && isa<SCEVConstant>(Diff)) { - APInt D = dyn_cast<const SCEVConstant>(Diff)->getValue()->getValue(); + APInt D = dyn_cast<const SCEVConstant>(Diff)->getAPInt(); if (!D.isNegative()) End = Start; } else @@ -7460,23 +8372,20 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(getStart())) if (!SC->getValue()->isZero()) { SmallVector<const SCEV *, 4> Operands(op_begin(), op_end()); - Operands[0] = SE.getConstant(SC->getType(), 0); + Operands[0] = SE.getZero(SC->getType()); const SCEV *Shifted = SE.getAddRecExpr(Operands, getLoop(), getNoWrapFlags(FlagNW)); - if (const SCEVAddRecExpr *ShiftedAddRec = - dyn_cast<SCEVAddRecExpr>(Shifted)) + if (const auto *ShiftedAddRec = dyn_cast<SCEVAddRecExpr>(Shifted)) return ShiftedAddRec->getNumIterationsInRange( - Range.subtract(SC->getValue()->getValue()), SE); + Range.subtract(SC->getAPInt()), SE); // This is strange and shouldn't happen. return SE.getCouldNotCompute(); } // The only time we can solve this is when we have all constant indices. // Otherwise, we cannot determine the overflow conditions. - for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - if (!isa<SCEVConstant>(getOperand(i))) - return SE.getCouldNotCompute(); - + if (any_of(operands(), [](const SCEV *Op) { return !isa<SCEVConstant>(Op); })) + return SE.getCouldNotCompute(); // Okay at this point we know that all elements of the chrec are constants and // that the start element is zero. @@ -7485,7 +8394,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, // iteration exits. unsigned BitWidth = SE.getTypeSizeInBits(getType()); if (!Range.contains(APInt(BitWidth, 0))) - return SE.getConstant(getType(), 0); + return SE.getZero(getType()); if (isAffine()) { // If this is an affine expression then we have this situation: @@ -7496,7 +8405,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range, // If A is negative then the lower of the range is the last possible loop // value. Also note that we already checked for a full range. APInt One(BitWidth,1); - APInt A = cast<SCEVConstant>(getOperand(1))->getValue()->getValue(); + APInt A = cast<SCEVConstant>(getOperand(1))->getAPInt(); APInt End = A.sge(One) ? (Range.getUpper() - One) : Range.getLower(); // The exit value should be (End+A)/A. 
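computeBECount, shown a few hunks above, boils down to one unsigned division: without the equality adjustment it computes a ceiling division by pre-adding Step - 1, and with it a floor division plus one by pre-adding Step. A minimal sketch of the arithmetic (plain C++, assuming Delta + Step does not overflow and Step is non-zero):

#include <cassert>
#include <cstdint>

// Mirrors computeBECount(Delta, Step, Equality):
//   Equality == false:  ceil(Delta / Step)      == (Delta + Step - 1) / Step
//   Equality == true :  floor(Delta / Step) + 1 == (Delta + Step) / Step
static uint64_t beCount(uint64_t Delta, uint64_t Step, bool Equality) {
  Delta += Equality ? Step : Step - 1;
  return Delta / Step;
}

int main() {
  assert(beCount(10, 3, false) == 4); // ceil(10/3)
  assert(beCount(9, 3, false) == 3);  // exact multiple: no rounding up
  assert(beCount(9, 3, true) == 4);   // inclusive bound: 9/3 + 1
  return 0;
}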
@@ -7528,15 +8437,13 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
                                              FlagAnyWrap);
 
     // Next, solve the constructed addrec
-    std::pair<const SCEV *,const SCEV *> Roots =
-      SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
+    auto Roots = SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
     const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
     const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
     if (R1) {
       // Pick the smallest positive root value.
-      if (ConstantInt *CB =
-          dyn_cast<ConstantInt>(ConstantExpr::getICmp(ICmpInst::ICMP_ULT,
-                                                      R1->getValue(), R2->getValue()))) {
+      if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(
+              ICmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) {
         if (!CB->getZExtValue())
           std::swap(R1, R2); // R1 is the minimum root now.
@@ -7549,7 +8456,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
       if (Range.contains(R1Val->getValue())) {
         // The next iteration must be out of the range...
         ConstantInt *NextVal =
-            ConstantInt::get(SE.getContext(), R1->getValue()->getValue()+1);
+            ConstantInt::get(SE.getContext(), R1->getAPInt() + 1);
 
         R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
         if (!Range.contains(R1Val->getValue()))
@@ -7560,7 +8467,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
       // If R1 was not in the range, then it is a good return value.  Make
       // sure that R1-1 WAS in the range though, just in case.
       ConstantInt *NextVal =
-          ConstantInt::get(SE.getContext(), R1->getValue()->getValue()-1);
+          ConstantInt::get(SE.getContext(), R1->getAPInt() - 1);
       R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
       if (Range.contains(R1Val->getValue()))
         return R1;
@@ -7644,9 +8551,84 @@ struct SCEVCollectTerms {
   }
   bool isDone() const { return false; }
 };
+
+// Check if a SCEV contains an AddRecExpr.
+struct SCEVHasAddRec {
+  bool &ContainsAddRec;
+
+  SCEVHasAddRec(bool &ContainsAddRec) : ContainsAddRec(ContainsAddRec) {
+    ContainsAddRec = false;
+  }
+
+  bool follow(const SCEV *S) {
+    if (isa<SCEVAddRecExpr>(S)) {
+      ContainsAddRec = true;
+
+      // Stop recursion: we found an AddRec, so there is no need to walk its
+      // operands.
+      return false;
+    }
+
+    // Keep looking.
+    return true;
+  }
+  bool isDone() const { return false; }
+};
+
+// Find factors that are multiplied with an expression that (possibly as a
+// subexpression) contains an AddRecExpr. In the expression:
+//
+//   8 * (100 + %p * %q * (%a + {0, +, 1}_loop))
+//
+// "%p * %q" are factors multiplied by the expression "(%a + {0, +, 1}_loop)"
+// that contains the AddRec {0, +, 1}_loop. %p * %q are likely to be array size
+// parameters as they form a product with an induction variable.
+//
+// This collector expects all array size parameters to be in the same MulExpr.
+// It might be necessary to later add support for collecting parameters that
+// are spread over different nested MulExprs.
+struct SCEVCollectAddRecMultiplies {
+  SmallVectorImpl<const SCEV *> &Terms;
+  ScalarEvolution &SE;
+
+  SCEVCollectAddRecMultiplies(SmallVectorImpl<const SCEV *> &T, ScalarEvolution &SE)
+      : Terms(T), SE(SE) {}
+
+  bool follow(const SCEV *S) {
+    if (auto *Mul = dyn_cast<SCEVMulExpr>(S)) {
+      bool HasAddRec = false;
+      SmallVector<const SCEV *, 0> Operands;
+      for (auto Op : Mul->operands()) {
+        if (isa<SCEVUnknown>(Op)) {
+          Operands.push_back(Op);
+        } else {
+          bool ContainsAddRec;
+          SCEVHasAddRec HasAddRecVisitor(ContainsAddRec);
+          visitAll(Op, HasAddRecVisitor);
+          HasAddRec |= ContainsAddRec;
+        }
+      }
+      if (Operands.size() == 0)
+        return true;
+
+      if (!HasAddRec)
+        return false;
+
+      Terms.push_back(SE.getMulExpr(Operands));
+      // Stop recursion: once we collected a term, do not walk its operands.
+      return false;
+    }
+
+    // Keep looking.
+    return true;
+  }
+  bool isDone() const { return false; }
+};
 }
 
-/// Find parametric terms in this SCEVAddRecExpr.
+/// Find parametric terms in this SCEVAddRecExpr. We look for parameters in
+/// two places:
+///   1) The strides of AddRec expressions.
+///   2) Unknowns that are multiplied with AddRec expressions.
 void ScalarEvolution::collectParametricTerms(const SCEV *Expr,
                                             SmallVectorImpl<const SCEV *> &Terms) {
   SmallVector<const SCEV *, 4> Strides;
@@ -7669,6 +8651,9 @@ void ScalarEvolution::collectParametricTerms(const SCEV *Expr,
       for (const SCEV *T : Terms)
         dbgs() << *T << "\n";
     });
+
+  SCEVCollectAddRecMultiplies MulCollector(Terms, *this);
+  visitAll(Expr, MulCollector);
 }
 
 static bool findArrayDimensionsRec(ScalarEvolution &SE,
@@ -7718,30 +8703,28 @@ static bool findArrayDimensionsRec(ScalarEvolution &SE,
   return true;
 }
 
-namespace {
-struct FindParameter {
-  bool FoundParameter;
-  FindParameter() : FoundParameter(false) {}
-
-  bool follow(const SCEV *S) {
-    if (isa<SCEVUnknown>(S)) {
-      FoundParameter = true;
-      // Stop recursion: we found a parameter.
-      return false;
-    }
-    // Keep looking.
-    return true;
-  }
-  bool isDone() const {
-    // Stop recursion if we have found a parameter.
-    return FoundParameter;
-  }
-};
-}
-
 // Returns true when S contains at least a SCEVUnknown parameter.
 static inline bool containsParameters(const SCEV *S) {
+  struct FindParameter {
+    bool FoundParameter;
+    FindParameter() : FoundParameter(false) {}
+
+    bool follow(const SCEV *S) {
+      if (isa<SCEVUnknown>(S)) {
+        FoundParameter = true;
+        // Stop recursion: we found a parameter.
+        return false;
+      }
+      // Keep looking.
+      return true;
+    }
+    bool isDone() const {
+      // Stop recursion if we have found a parameter.
+      return FoundParameter;
+    }
+  };
+
   FindParameter F;
   SCEVTraversal<FindParameter> ST(F);
   ST.visitAll(S);
@@ -7829,11 +8812,13 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
 
   ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
 
-  // Divide all terms by the element size.
+  // Try to divide all terms by the element size. If a term is not divisible
+  // by the element size, proceed with the original term.
for (const SCEV *&Term : Terms) { const SCEV *Q, *R; SCEVDivision::divide(SE, Term, ElementSize, &Q, &R); - Term = Q; + if (!Q->isZero()) + Term = Q; } SmallVector<const SCEV *, 4> NewTerms; @@ -7875,7 +8860,7 @@ void ScalarEvolution::computeAccessFunctions( if (Sizes.empty()) return; - if (auto AR = dyn_cast<SCEVAddRecExpr>(Expr)) + if (auto *AR = dyn_cast<SCEVAddRecExpr>(Expr)) if (!AR->isAffine()) return; @@ -8059,58 +9044,55 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se) // ScalarEvolution Class Implementation //===----------------------------------------------------------------------===// -ScalarEvolution::ScalarEvolution() - : FunctionPass(ID), WalkingBEDominatingConds(false), ValuesAtScopes(64), - LoopDispositions(64), BlockDispositions(64), FirstUnknown(nullptr) { - initializeScalarEvolutionPass(*PassRegistry::getPassRegistry()); -} - -bool ScalarEvolution::runOnFunction(Function &F) { - this->F = &F; - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - return false; -} - -void ScalarEvolution::releaseMemory() { +ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI, + AssumptionCache &AC, DominatorTree &DT, + LoopInfo &LI) + : F(F), TLI(TLI), AC(AC), DT(DT), LI(LI), + CouldNotCompute(new SCEVCouldNotCompute()), + WalkingBEDominatingConds(false), ProvingSplitPredicate(false), + ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64), + FirstUnknown(nullptr) {} + +ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg) + : F(Arg.F), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT), LI(Arg.LI), + CouldNotCompute(std::move(Arg.CouldNotCompute)), + ValueExprMap(std::move(Arg.ValueExprMap)), + WalkingBEDominatingConds(false), ProvingSplitPredicate(false), + BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)), + ConstantEvolutionLoopExitValue( + std::move(Arg.ConstantEvolutionLoopExitValue)), + ValuesAtScopes(std::move(Arg.ValuesAtScopes)), + LoopDispositions(std::move(Arg.LoopDispositions)), + BlockDispositions(std::move(Arg.BlockDispositions)), + UnsignedRanges(std::move(Arg.UnsignedRanges)), + SignedRanges(std::move(Arg.SignedRanges)), + UniqueSCEVs(std::move(Arg.UniqueSCEVs)), + UniquePreds(std::move(Arg.UniquePreds)), + SCEVAllocator(std::move(Arg.SCEVAllocator)), + FirstUnknown(Arg.FirstUnknown) { + Arg.FirstUnknown = nullptr; +} + +ScalarEvolution::~ScalarEvolution() { // Iterate through all the SCEVUnknown instances and call their // destructors, so that they release their references to their values. - for (SCEVUnknown *U = FirstUnknown; U; U = U->Next) - U->~SCEVUnknown(); + for (SCEVUnknown *U = FirstUnknown; U;) { + SCEVUnknown *Tmp = U; + U = U->Next; + Tmp->~SCEVUnknown(); + } FirstUnknown = nullptr; ValueExprMap.clear(); // Free any extra memory created for ExitNotTakenInfo in the unlikely event // that a loop had multiple computable exits. 
- for (DenseMap<const Loop*, BackedgeTakenInfo>::iterator I = - BackedgeTakenCounts.begin(), E = BackedgeTakenCounts.end(); - I != E; ++I) { - I->second.clear(); - } + for (auto &BTCI : BackedgeTakenCounts) + BTCI.second.clear(); assert(PendingLoopPredicates.empty() && "isImpliedCond garbage"); assert(!WalkingBEDominatingConds && "isLoopBackedgeGuardedByCond garbage!"); - - BackedgeTakenCounts.clear(); - ConstantEvolutionLoopExitValue.clear(); - ValuesAtScopes.clear(); - LoopDispositions.clear(); - BlockDispositions.clear(); - UnsignedRanges.clear(); - SignedRanges.clear(); - UniqueSCEVs.clear(); - SCEVAllocator.Reset(); -} - -void ScalarEvolution::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequiredTransitive<LoopInfoWrapperPass>(); - AU.addRequiredTransitive<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); + assert(!ProvingSplitPredicate && "ProvingSplitPredicate garbage!"); } bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) { @@ -8152,7 +9134,7 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE, OS << "\n"; } -void ScalarEvolution::print(raw_ostream &OS, const Module *) const { +void ScalarEvolution::print(raw_ostream &OS) const { // ScalarEvolution's implementation of the print method is to print // out SCEV values of all instructions that are interesting. Doing // this potentially causes it to create new SCEV objects though, @@ -8162,13 +9144,13 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const { ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this); OS << "Classifying expressions for: "; - F->printAsOperand(OS, /*PrintType=*/false); + F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; - for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) - if (isSCEVable(I->getType()) && !isa<CmpInst>(*I)) { - OS << *I << '\n'; + for (Instruction &I : instructions(F)) + if (isSCEVable(I.getType()) && !isa<CmpInst>(I)) { + OS << I << '\n'; OS << " --> "; - const SCEV *SV = SE.getSCEV(&*I); + const SCEV *SV = SE.getSCEV(&I); SV->print(OS); if (!isa<SCEVCouldNotCompute>(SV)) { OS << " U: "; @@ -8177,7 +9159,7 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const { SE.getSignedRange(SV).print(OS); } - const Loop *L = LI->getLoopFor((*I).getParent()); + const Loop *L = LI.getLoopFor(I.getParent()); const SCEV *AtUse = SE.getSCEVAtScope(SV, L); if (AtUse != SV) { @@ -8205,9 +9187,9 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const { } OS << "Determining loop execution counts for: "; - F->printAsOperand(OS, /*PrintType=*/false); + F.printAsOperand(OS, /*PrintType=*/false); OS << "\n"; - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + for (LoopInfo::iterator I = LI.begin(), E = LI.end(); I != E; ++I) PrintLoopInfo(OS, &SE, *I); } @@ -8260,9 +9242,8 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { // This recurrence is variant w.r.t. L if any of its operands // are variant. - for (SCEVAddRecExpr::op_iterator I = AR->op_begin(), E = AR->op_end(); - I != E; ++I) - if (!isLoopInvariant(*I, L)) + for (auto *Op : AR->operands()) + if (!isLoopInvariant(Op, L)) return LoopVariant; // Otherwise it's loop-invariant. 
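One detail of the rewritten destructor above is easy to miss: the replaced loop advanced with U = U->Next in the for-increment, after U->~SCEVUnknown() had already run, so it read a link out of a just-destroyed node; the new form fetches the link first. The same pattern in miniature (plain C++; Node and destroyAll are illustrative names):

// Tear down an intrusive singly-linked list: read the Next pointer before
// destroying the node, as ~ScalarEvolution now does for its SCEVUnknown chain.
struct Node { Node *Next = nullptr; };

static void destroyAll(Node *Head) {
  for (Node *N = Head; N;) {
    Node *Tmp = N;
    N = N->Next; // grab the link while the node is still alive
    delete Tmp;  // only then release the node
  }
}

int main() {
  Node *B = new Node;
  Node *A = new Node;
  A->Next = B;
  destroyAll(A);
  return 0;
}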
@@ -8272,11 +9253,9 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { case scMulExpr: case scUMaxExpr: case scSMaxExpr: { - const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S); bool HasVarying = false; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - LoopDisposition D = getLoopDisposition(*I, L); + for (auto *Op : cast<SCEVNAryExpr>(S)->operands()) { + LoopDisposition D = getLoopDisposition(Op, L); if (D == LoopVariant) return LoopVariant; if (D == LoopComputable) @@ -8300,7 +9279,7 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) { // invariant if they are not contained in the specified loop. // Instructions are never considered invariant in the function body // (null loop) because they are defined within the "loop". - if (Instruction *I = dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) + if (auto *I = dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) return (L && !L->contains(I)) ? LoopInvariant : LoopVariant; return LoopInvariant; case scCouldNotCompute: @@ -8351,7 +9330,7 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { // produces the addrec's value is a PHI, and a PHI effectively properly // dominates its entire containing block. const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(S); - if (!DT->dominates(AR->getLoop()->getHeader(), BB)) + if (!DT.dominates(AR->getLoop()->getHeader(), BB)) return DoesNotDominateBlock; } // FALL THROUGH into SCEVNAryExpr handling. @@ -8361,9 +9340,8 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { case scSMaxExpr: { const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S); bool Proper = true; - for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end(); - I != E; ++I) { - BlockDisposition D = getBlockDisposition(*I, BB); + for (const SCEV *NAryOp : NAry->operands()) { + BlockDisposition D = getBlockDisposition(NAryOp, BB); if (D == DoesNotDominateBlock) return DoesNotDominateBlock; if (D == DominatesBlock) @@ -8388,7 +9366,7 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) { if (I->getParent() == BB) return DominatesBlock; - if (DT->properlyDominates(I->getParent(), BB)) + if (DT.properlyDominates(I->getParent(), BB)) return ProperlyDominatesBlock; return DoesNotDominateBlock; } @@ -8407,24 +9385,22 @@ bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) { return getBlockDisposition(S, BB) == ProperlyDominatesBlock; } -namespace { -// Search for a SCEV expression node within an expression tree. -// Implements SCEVTraversal::Visitor. -struct SCEVSearch { - const SCEV *Node; - bool IsFound; +bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { + // Search for a SCEV expression node within an expression tree. + // Implements SCEVTraversal::Visitor. 
+ struct SCEVSearch { + const SCEV *Node; + bool IsFound; - SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} + SCEVSearch(const SCEV *N): Node(N), IsFound(false) {} - bool follow(const SCEV *S) { - IsFound |= (S == Node); - return !IsFound; - } - bool isDone() const { return IsFound; } -}; -} + bool follow(const SCEV *S) { + IsFound |= (S == Node); + return !IsFound; + } + bool isDone() const { return IsFound; } + }; -bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const { SCEVSearch Search(Op); visitAll(S, Search); return Search.IsFound; @@ -8463,43 +9439,39 @@ static void replaceSubString(std::string &Str, StringRef From, StringRef To) { /// getLoopBackedgeTakenCounts - Helper method for verifyAnalysis. static void getLoopBackedgeTakenCounts(Loop *L, VerifyMap &Map, ScalarEvolution &SE) { - for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I) { - getLoopBackedgeTakenCounts(*I, Map, SE); // recurse. - - std::string &S = Map[L]; - if (S.empty()) { - raw_string_ostream OS(S); - SE.getBackedgeTakenCount(L)->print(OS); + std::string &S = Map[L]; + if (S.empty()) { + raw_string_ostream OS(S); + SE.getBackedgeTakenCount(L)->print(OS); - // false and 0 are semantically equivalent. This can happen in dead loops. - replaceSubString(OS.str(), "false", "0"); - // Remove wrap flags, their use in SCEV is highly fragile. - // FIXME: Remove this when SCEV gets smarter about them. - replaceSubString(OS.str(), "<nw>", ""); - replaceSubString(OS.str(), "<nsw>", ""); - replaceSubString(OS.str(), "<nuw>", ""); - } + // false and 0 are semantically equivalent. This can happen in dead loops. + replaceSubString(OS.str(), "false", "0"); + // Remove wrap flags, their use in SCEV is highly fragile. + // FIXME: Remove this when SCEV gets smarter about them. + replaceSubString(OS.str(), "<nw>", ""); + replaceSubString(OS.str(), "<nsw>", ""); + replaceSubString(OS.str(), "<nuw>", ""); } -} -void ScalarEvolution::verifyAnalysis() const { - if (!VerifySCEV) - return; + for (auto *R : reverse(*L)) + getLoopBackedgeTakenCounts(R, Map, SE); // recurse. +} +void ScalarEvolution::verify() const { ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this); // Gather stringified backedge taken counts for all loops using SCEV's caches. // FIXME: It would be much better to store actual values instead of strings, // but SCEV pointers will change if we drop the caches. VerifyMap BackedgeDumpsOld, BackedgeDumpsNew; - for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) + for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) getLoopBackedgeTakenCounts(*I, BackedgeDumpsOld, SE); - // Gather stringified backedge taken counts for all loops without using - // SCEV's caches. - SE.releaseMemory(); - for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) - getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE); + // Gather stringified backedge taken counts for all loops using a fresh + // ScalarEvolution object. + ScalarEvolution SE2(F, TLI, AC, DT, LI); + for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) + getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE2); // Now compare whether they're the same with and without caches. This allows // verifying that no pass changed the cache. @@ -8532,3 +9504,238 @@ void ScalarEvolution::verifyAnalysis() const { // TODO: Verify more things. 
} + +char ScalarEvolutionAnalysis::PassID; + +ScalarEvolution ScalarEvolutionAnalysis::run(Function &F, + AnalysisManager<Function> *AM) { + return ScalarEvolution(F, AM->getResult<TargetLibraryAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F), + AM->getResult<DominatorTreeAnalysis>(F), + AM->getResult<LoopAnalysis>(F)); +} + +PreservedAnalyses +ScalarEvolutionPrinterPass::run(Function &F, AnalysisManager<Function> *AM) { + AM->getResult<ScalarEvolutionAnalysis>(F).print(OS); + return PreservedAnalyses::all(); +} + +INITIALIZE_PASS_BEGIN(ScalarEvolutionWrapperPass, "scalar-evolution", + "Scalar Evolution Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ScalarEvolutionWrapperPass, "scalar-evolution", + "Scalar Evolution Analysis", false, true) +char ScalarEvolutionWrapperPass::ID = 0; + +ScalarEvolutionWrapperPass::ScalarEvolutionWrapperPass() : FunctionPass(ID) { + initializeScalarEvolutionWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool ScalarEvolutionWrapperPass::runOnFunction(Function &F) { + SE.reset(new ScalarEvolution( + F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), + getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo())); + return false; +} + +void ScalarEvolutionWrapperPass::releaseMemory() { SE.reset(); } + +void ScalarEvolutionWrapperPass::print(raw_ostream &OS, const Module *) const { + SE->print(OS); +} + +void ScalarEvolutionWrapperPass::verifyAnalysis() const { + if (!VerifySCEV) + return; + + SE->verify(); +} + +void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive<AssumptionCacheTracker>(); + AU.addRequiredTransitive<LoopInfoWrapperPass>(); + AU.addRequiredTransitive<DominatorTreeWrapperPass>(); + AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>(); +} + +const SCEVPredicate * +ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS, + const SCEVConstant *RHS) { + FoldingSetNodeID ID; + // Unique this node based on the arguments + ID.AddInteger(SCEVPredicate::P_Equal); + ID.AddPointer(LHS); + ID.AddPointer(RHS); + void *IP = nullptr; + if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP)) + return S; + SCEVEqualPredicate *Eq = new (SCEVAllocator) + SCEVEqualPredicate(ID.Intern(SCEVAllocator), LHS, RHS); + UniquePreds.InsertNode(Eq, IP); + return Eq; +} + +namespace { +class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> { +public: + static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE, + SCEVUnionPredicate &A) { + SCEVPredicateRewriter Rewriter(SE, A); + return Rewriter.visit(Scev); + } + + SCEVPredicateRewriter(ScalarEvolution &SE, SCEVUnionPredicate &P) + : SCEVRewriteVisitor(SE), P(P) {} + + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + auto ExprPreds = P.getPredicatesForExpr(Expr); + for (auto *Pred : ExprPreds) + if (const auto *IPred = dyn_cast<const SCEVEqualPredicate>(Pred)) + if (IPred->getLHS() == Expr) + return IPred->getRHS(); + + return Expr; + } + +private: + SCEVUnionPredicate &P; +}; +} // end anonymous namespace + +const SCEV *ScalarEvolution::rewriteUsingPredicate(const SCEV *Scev, + SCEVUnionPredicate &Preds) { + return SCEVPredicateRewriter::rewrite(Scev, *this, Preds); +} + 
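getEqualPredicate and rewriteUsingPredicate, added above, amount to a small contract: record "this unknown equals this constant" assumptions, then substitute them wherever the unknown appears in an expression. The substitution step in miniature (plain C++ with strings standing in for SCEVUnknown nodes; a model of the idea, not the LLVM API):

#include <cassert>
#include <map>
#include <string>

// A toy union predicate: a set of symbol == constant assumptions, applied to
// a symbolic leaf the way SCEVPredicateRewriter::visitUnknown replaces a
// SCEVUnknown that has a matching SCEVEqualPredicate.
int main() {
  std::map<std::string, int> Preds{{"n", 42}}; // assume %n == 42
  auto RewriteLeaf = [&](const std::string &Sym, int Unrewritten) {
    auto It = Preds.find(Sym);
    return It == Preds.end() ? Unrewritten : It->second;
  };
  assert(RewriteLeaf("n", -1) == 42); // predicate applies: leaf replaced
  assert(RewriteLeaf("m", -1) == -1); // no predicate: leaf left alone
  return 0;
}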
+/// SCEV predicates
+SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID,
+                             SCEVPredicateKind Kind)
+    : FastID(ID), Kind(Kind) {}
+
+SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID,
+                                       const SCEVUnknown *LHS,
+                                       const SCEVConstant *RHS)
+    : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {}
+
+bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const {
+  const auto *Op = dyn_cast<const SCEVEqualPredicate>(N);
+
+  if (!Op)
+    return false;
+
+  return Op->LHS == LHS && Op->RHS == RHS;
+}
+
+bool SCEVEqualPredicate::isAlwaysTrue() const { return false; }
+
+const SCEV *SCEVEqualPredicate::getExpr() const { return LHS; }
+
+void SCEVEqualPredicate::print(raw_ostream &OS, unsigned Depth) const {
+  OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n";
+}
+
+/// Union predicates don't get cached, so create a dummy set ID for them.
+SCEVUnionPredicate::SCEVUnionPredicate()
+    : SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) {}
+
+bool SCEVUnionPredicate::isAlwaysTrue() const {
+  return all_of(Preds,
+                [](const SCEVPredicate *I) { return I->isAlwaysTrue(); });
+}
+
+ArrayRef<const SCEVPredicate *>
+SCEVUnionPredicate::getPredicatesForExpr(const SCEV *Expr) {
+  auto I = SCEVToPreds.find(Expr);
+  if (I == SCEVToPreds.end())
+    return ArrayRef<const SCEVPredicate *>();
+  return I->second;
+}
+
+bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const {
+  if (const auto *Set = dyn_cast<const SCEVUnionPredicate>(N))
+    return all_of(Set->Preds,
+                  [this](const SCEVPredicate *I) { return this->implies(I); });
+
+  auto ScevPredsIt = SCEVToPreds.find(N->getExpr());
+  if (ScevPredsIt == SCEVToPreds.end())
+    return false;
+  auto &SCEVPreds = ScevPredsIt->second;
+
+  return any_of(SCEVPreds,
+                [N](const SCEVPredicate *I) { return I->implies(N); });
+}
+
+const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; }
+
+void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const {
+  for (auto Pred : Preds)
+    Pred->print(OS, Depth);
+}
+
+void SCEVUnionPredicate::add(const SCEVPredicate *N) {
+  if (const auto *Set = dyn_cast<const SCEVUnionPredicate>(N)) {
+    for (auto Pred : Set->Preds)
+      add(Pred);
+    return;
+  }
+
+  if (implies(N))
+    return;
+
+  const SCEV *Key = N->getExpr();
+  assert(Key && "Only SCEVUnionPredicate doesn't have an "
+                "associated expression!");
+
+  SCEVToPreds[Key].push_back(N);
+  Preds.push_back(N);
+}
+
+PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE)
+    : SE(SE), Generation(0) {}
+
+const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) {
+  const SCEV *Expr = SE.getSCEV(V);
+  RewriteEntry &Entry = RewriteMap[Expr];
+
+  // If we already have an entry and the version matches, return it.
+  if (Entry.second && Generation == Entry.first)
+    return Entry.second;
+
+  // We found an entry but it's stale. Rewrite the stale entry
+  // according to the current predicate.
+  if (Entry.second)
+    Expr = Entry.second;
+
+  const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, Preds);
+  Entry = {Generation, NewSCEV};
+
+  return NewSCEV;
+}
+
+void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
+  if (Preds.implies(&Pred))
+    return;
+  Preds.add(&Pred);
+  updateGeneration();
+}
+
+const SCEVUnionPredicate &PredicatedScalarEvolution::getUnionPredicate() const {
+  return Preds;
+}
+
+void PredicatedScalarEvolution::updateGeneration() {
+  // If the generation number wrapped, recompute everything.
+ if (++Generation == 0) { + for (auto &II : RewriteMap) { + const SCEV *Rewritten = II.second.second; + II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, Preds)}; + } + } +} diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp index 6bc0d85..2e50c80 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp @@ -19,125 +19,42 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; -namespace { - /// ScalarEvolutionAliasAnalysis - This is a simple alias analysis - /// implementation that uses ScalarEvolution to answer queries. - class ScalarEvolutionAliasAnalysis : public FunctionPass, - public AliasAnalysis { - ScalarEvolution *SE; - - public: - static char ID; // Class identification, replacement for typeinfo - ScalarEvolutionAliasAnalysis() : FunctionPass(ID), SE(nullptr) { - initializeScalarEvolutionAliasAnalysisPass( - *PassRegistry::getPassRegistry()); - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(AnalysisID PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } - - private: - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnFunction(Function &F) override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - - Value *GetBaseValue(const SCEV *S); - }; -} // End of anonymous namespace - -// Register this pass... -char ScalarEvolutionAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS_BEGIN(ScalarEvolutionAliasAnalysis, AliasAnalysis, "scev-aa", - "ScalarEvolution-based Alias Analysis", false, true, false) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_AG_PASS_END(ScalarEvolutionAliasAnalysis, AliasAnalysis, "scev-aa", - "ScalarEvolution-based Alias Analysis", false, true, false) - -FunctionPass *llvm::createScalarEvolutionAliasAnalysisPass() { - return new ScalarEvolutionAliasAnalysis(); -} - -void -ScalarEvolutionAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredTransitive<ScalarEvolution>(); - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); -} - -bool -ScalarEvolutionAliasAnalysis::runOnFunction(Function &F) { - InitializeAliasAnalysis(this, &F.getParent()->getDataLayout()); - SE = &getAnalysis<ScalarEvolution>(); - return false; -} - -/// GetBaseValue - Given an expression, try to find a -/// base value. Return null is none was found. -Value * -ScalarEvolutionAliasAnalysis::GetBaseValue(const SCEV *S) { - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { - // In an addrec, assume that the base will be in the start, rather - // than the step. - return GetBaseValue(AR->getStart()); - } else if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) { - // If there's a pointer operand, it'll be sorted at the end of the list. 
- const SCEV *Last = A->getOperand(A->getNumOperands()-1); - if (Last->getType()->isPointerTy()) - return GetBaseValue(Last); - } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { - // This is a leaf node. - return U->getValue(); - } - // No Identified object found. - return nullptr; -} - -AliasResult ScalarEvolutionAliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +AliasResult SCEVAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { // If either of the memory references is empty, it doesn't matter what the // pointer values are. This allows the code below to ignore this special // case. if (LocA.Size == 0 || LocB.Size == 0) return NoAlias; - // This is ScalarEvolutionAliasAnalysis. Get the SCEVs! - const SCEV *AS = SE->getSCEV(const_cast<Value *>(LocA.Ptr)); - const SCEV *BS = SE->getSCEV(const_cast<Value *>(LocB.Ptr)); + // This is SCEVAAResult. Get the SCEVs! + const SCEV *AS = SE.getSCEV(const_cast<Value *>(LocA.Ptr)); + const SCEV *BS = SE.getSCEV(const_cast<Value *>(LocB.Ptr)); // If they evaluate to the same expression, it's a MustAlias. - if (AS == BS) return MustAlias; + if (AS == BS) + return MustAlias; // If something is known about the difference between the two addresses, // see if it's enough to prove a NoAlias. - if (SE->getEffectiveSCEVType(AS->getType()) == - SE->getEffectiveSCEVType(BS->getType())) { - unsigned BitWidth = SE->getTypeSizeInBits(AS->getType()); + if (SE.getEffectiveSCEVType(AS->getType()) == + SE.getEffectiveSCEVType(BS->getType())) { + unsigned BitWidth = SE.getTypeSizeInBits(AS->getType()); APInt ASizeInt(BitWidth, LocA.Size); APInt BSizeInt(BitWidth, LocB.Size); // Compute the difference between the two pointers. - const SCEV *BA = SE->getMinusSCEV(BS, AS); + const SCEV *BA = SE.getMinusSCEV(BS, AS); // Test whether the difference is known to be great enough that memory of // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt // are non-zero, which is special-cased above. - if (ASizeInt.ule(SE->getUnsignedRange(BA).getUnsignedMin()) && - (-BSizeInt).uge(SE->getUnsignedRange(BA).getUnsignedMax())) + if (ASizeInt.ule(SE.getUnsignedRange(BA).getUnsignedMin()) && + (-BSizeInt).uge(SE.getUnsignedRange(BA).getUnsignedMax())) return NoAlias; // Folding the subtraction while preserving range information can be tricky @@ -145,13 +62,13 @@ AliasResult ScalarEvolutionAliasAnalysis::alias(const MemoryLocation &LocA, // and try again to see if things fold better that way. // Compute the difference between the two pointers. - const SCEV *AB = SE->getMinusSCEV(AS, BS); + const SCEV *AB = SE.getMinusSCEV(AS, BS); // Test whether the difference is known to be great enough that memory of // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt // are non-zero, which is special-cased above. - if (BSizeInt.ule(SE->getUnsignedRange(AB).getUnsignedMin()) && - (-ASizeInt).uge(SE->getUnsignedRange(AB).getUnsignedMax())) + if (BSizeInt.ule(SE.getUnsignedRange(AB).getUnsignedMin()) && + (-ASizeInt).uge(SE.getUnsignedRange(AB).getUnsignedMax())) return NoAlias; } @@ -170,5 +87,62 @@ AliasResult ScalarEvolutionAliasAnalysis::alias(const MemoryLocation &LocA, return NoAlias; // Forward the query to the next analysis. - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); +} + +/// Given an expression, try to find a base value. +/// +/// Returns null if none was found. 
+Value *SCEVAAResult::GetBaseValue(const SCEV *S) { + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + // In an addrec, assume that the base will be in the start, rather + // than the step. + return GetBaseValue(AR->getStart()); + } else if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) { + // If there's a pointer operand, it'll be sorted at the end of the list. + const SCEV *Last = A->getOperand(A->getNumOperands() - 1); + if (Last->getType()->isPointerTy()) + return GetBaseValue(Last); + } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { + // This is a leaf node. + return U->getValue(); + } + // No Identified object found. + return nullptr; +} + +SCEVAAResult SCEVAA::run(Function &F, AnalysisManager<Function> *AM) { + return SCEVAAResult(AM->getResult<TargetLibraryAnalysis>(F), + AM->getResult<ScalarEvolutionAnalysis>(F)); +} + +char SCEVAA::PassID; + +char SCEVAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(SCEVAAWrapperPass, "scev-aa", + "ScalarEvolution-based Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(SCEVAAWrapperPass, "scev-aa", + "ScalarEvolution-based Alias Analysis", false, true) + +FunctionPass *llvm::createSCEVAAWrapperPass() { + return new SCEVAAWrapperPass(); +} + +SCEVAAWrapperPass::SCEVAAWrapperPass() : FunctionPass(ID) { + initializeSCEVAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool SCEVAAWrapperPass::runOnFunction(Function &F) { + Result.reset( + new SCEVAAResult(getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<ScalarEvolutionWrapperPass>().getSE())); + return false; +} + +void SCEVAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp index fee2a2d..921403d 100644 --- a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp @@ -63,7 +63,7 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, // Create a new cast, and leave the old cast in place in case // it is being used as an insert point. Clear its operand // so that it doesn't hold anything live. - Ret = CastInst::Create(Op, V, Ty, "", IP); + Ret = CastInst::Create(Op, V, Ty, "", &*IP); Ret->takeName(CI); CI->replaceAllUsesWith(Ret); CI->setOperand(0, UndefValue::get(V->getType())); @@ -75,17 +75,39 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, // Create a new cast. if (!Ret) - Ret = CastInst::Create(Op, V, Ty, V->getName(), IP); + Ret = CastInst::Create(Op, V, Ty, V->getName(), &*IP); // We assert at the end of the function since IP might point to an // instruction with different dominance properties than a cast // (an invoke for example) and not dominate BIP (but the cast does). 
- assert(SE.DT->dominates(Ret, BIP)); + assert(SE.DT.dominates(Ret, &*BIP)); rememberInstruction(Ret); return Ret; } +static BasicBlock::iterator findInsertPointAfter(Instruction *I, + BasicBlock *MustDominate) { + BasicBlock::iterator IP = ++I->getIterator(); + if (auto *II = dyn_cast<InvokeInst>(I)) + IP = II->getNormalDest()->begin(); + + while (isa<PHINode>(IP)) + ++IP; + + while (IP->isEHPad()) { + if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) { + ++IP; + } else if (isa<CatchSwitchInst>(IP)) { + IP = MustDominate->getFirstInsertionPt(); + } else { + llvm_unreachable("unexpected eh pad!"); + } + } + + return IP; +} + /// InsertNoopCastOfTo - Insert a cast of V to the specified type, /// which must be possible with a noop cast, doing what we can to share /// the casts. @@ -135,19 +157,14 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) { while ((isa<BitCastInst>(IP) && isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) && cast<BitCastInst>(IP)->getOperand(0) != A) || - isa<DbgInfoIntrinsic>(IP) || - isa<LandingPadInst>(IP)) + isa<DbgInfoIntrinsic>(IP)) ++IP; return ReuseOrCreateCast(A, Ty, Op, IP); } // Cast the instruction immediately after the instruction. Instruction *I = cast<Instruction>(V); - BasicBlock::iterator IP = I; ++IP; - if (InvokeInst *II = dyn_cast<InvokeInst>(I)) - IP = II->getNormalDest()->begin(); - while (isa<PHINode>(IP) || isa<LandingPadInst>(IP)) - ++IP; + BasicBlock::iterator IP = findInsertPointAfter(I, Builder.GetInsertBlock()); return ReuseOrCreateCast(I, Ty, Op, IP); } @@ -174,7 +191,7 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, ScanLimit++; if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS && IP->getOperand(1) == RHS) - return IP; + return &*IP; if (IP == BlockBegin) break; } } @@ -184,13 +201,13 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, BuilderType::InsertPointGuard Guard(Builder); // Move the insertion point out of as many loops as we can. - while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) { + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) break; // Ok, move up a level. - Builder.SetInsertPoint(Preheader, Preheader->getTerminator()); + Builder.SetInsertPoint(Preheader->getTerminator()); } // If we haven't found this binop, insert it. @@ -229,19 +246,15 @@ static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder, // Check for divisibility. if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) { ConstantInt *CI = - ConstantInt::get(SE.getContext(), - C->getValue()->getValue().sdiv( - FC->getValue()->getValue())); + ConstantInt::get(SE.getContext(), C->getAPInt().sdiv(FC->getAPInt())); // If the quotient is zero and the remainder is non-zero, reject // the value at this scale. It will be considered for subsequent // smaller scales. if (!CI->isZero()) { const SCEV *Div = SE.getConstant(CI); S = Div; - Remainder = - SE.getAddExpr(Remainder, - SE.getConstant(C->getValue()->getValue().srem( - FC->getValue()->getValue()))); + Remainder = SE.getAddExpr( + Remainder, SE.getConstant(C->getAPInt().srem(FC->getAPInt()))); return true; } } @@ -254,10 +267,9 @@ static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder, // of the given factor. If so, we can factor it. 
const SCEVConstant *FC = cast<SCEVConstant>(Factor); if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0))) - if (!C->getValue()->getValue().srem(FC->getValue()->getValue())) { + if (!C->getAPInt().srem(FC->getAPInt())) { SmallVector<const SCEV *, 4> NewMulOps(M->op_begin(), M->op_end()); - NewMulOps[0] = SE.getConstant( - C->getValue()->getValue().sdiv(FC->getValue()->getValue())); + NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt())); S = SE.getMulExpr(NewMulOps); return true; } @@ -402,8 +414,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, const SCEV *ElSize = SE.getSizeOfExpr(IntPtrTy, ElTy); if (!ElSize->isZero()) { SmallVector<const SCEV *, 8> NewOps; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - const SCEV *Op = Ops[i]; + for (const SCEV *Op : Ops) { const SCEV *Remainder = SE.getConstant(Ty, 0); if (FactorOutConstant(Op, Remainder, ElSize, SE, DL)) { // Op now has ElSize factored out. @@ -414,7 +425,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, } else { // The operand was not divisible, so add it to the list of operands // we'll scan next iteration. - NewOps.push_back(Ops[i]); + NewOps.push_back(Op); } } // If we made any changes, update Ops. @@ -483,7 +494,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace())); assert(!isa<Instruction>(V) || - SE.DT->dominates(cast<Instruction>(V), Builder.GetInsertPoint())); + SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint())); // Expand the operands for a plain byte offset. Value *Idx = expandCodeFor(SE.getAddExpr(Ops), Ty); @@ -508,7 +519,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, ScanLimit++; if (IP->getOpcode() == Instruction::GetElementPtr && IP->getOperand(0) == V && IP->getOperand(1) == Idx) - return IP; + return &*IP; if (IP == BlockBegin) break; } } @@ -517,13 +528,13 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, BuilderType::InsertPointGuard Guard(Builder); // Move the insertion point out of as many loops as we can. - while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) { + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) break; // Ok, move up a level. - Builder.SetInsertPoint(Preheader, Preheader->getTerminator()); + Builder.SetInsertPoint(Preheader->getTerminator()); } // Emit a GEP. @@ -537,16 +548,13 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, BuilderType::InsertPoint SaveInsertPt = Builder.saveIP(); // Move the insertion point out of as many loops as we can. - while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) { + while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) { if (!L->isLoopInvariant(V)) break; - bool AnyIndexNotLoopInvariant = false; - for (SmallVectorImpl<Value *>::const_iterator I = GepIndices.begin(), - E = GepIndices.end(); I != E; ++I) - if (!L->isLoopInvariant(*I)) { - AnyIndexNotLoopInvariant = true; - break; - } + bool AnyIndexNotLoopInvariant = + std::any_of(GepIndices.begin(), GepIndices.end(), + [L](Value *Op) { return !L->isLoopInvariant(Op); }); + if (AnyIndexNotLoopInvariant) break; @@ -554,7 +562,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, if (!Preheader) break; // Ok, move up a level. 
- Builder.SetInsertPoint(Preheader, Preheader->getTerminator()); + Builder.SetInsertPoint(Preheader->getTerminator()); } // Insert a pretty getelementptr. Note that this GEP is not marked inbounds, @@ -563,9 +571,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, Value *Casted = V; if (V->getType() != PTy) Casted = InsertNoopCastOfTo(Casted, PTy); - Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, - GepIndices, - "scevgep"); + Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep"); Ops.push_back(SE.getUnknown(GEP)); rememberInstruction(GEP); @@ -593,8 +599,7 @@ static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B, /// expression, according to PickMostRelevantLoop. const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { // Test whether we've already computed the most relevant loop for this SCEV. - std::pair<DenseMap<const SCEV *, const Loop *>::iterator, bool> Pair = - RelevantLoops.insert(std::make_pair(S, nullptr)); + auto Pair = RelevantLoops.insert(std::make_pair(S, nullptr)); if (!Pair.second) return Pair.first->second; @@ -603,7 +608,7 @@ const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { return nullptr; if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (const Instruction *I = dyn_cast<Instruction>(U->getValue())) - return Pair.first->second = SE.LI->getLoopFor(I->getParent()); + return Pair.first->second = SE.LI.getLoopFor(I->getParent()); // A non-instruction has no relevant loops. return nullptr; } @@ -611,9 +616,8 @@ const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { const Loop *L = nullptr; if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) L = AR->getLoop(); - for (SCEVNAryExpr::op_iterator I = N->op_begin(), E = N->op_end(); - I != E; ++I) - L = PickMostRelevantLoop(L, getRelevantLoop(*I), *SE.DT); + for (const SCEV *Op : N->operands()) + L = PickMostRelevantLoop(L, getRelevantLoop(Op), SE.DT); return RelevantLoops[N] = L; } if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) { @@ -621,10 +625,8 @@ const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { return RelevantLoops[C] = Result; } if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { - const Loop *Result = - PickMostRelevantLoop(getRelevantLoop(D->getLHS()), - getRelevantLoop(D->getRHS()), - *SE.DT); + const Loop *Result = PickMostRelevantLoop( + getRelevantLoop(D->getLHS()), getRelevantLoop(D->getRHS()), SE.DT); return RelevantLoops[D] = Result; } llvm_unreachable("Unexpected SCEV type!"); @@ -679,13 +681,12 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { // Sort by loop. Use a stable sort so that constants follow non-constants and // pointer operands precede non-pointer operands. - std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(*SE.DT)); + std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(SE.DT)); // Emit instructions to add all the operands. Hoist as much as possible // out of loops, and form meaningful getelementptrs where possible. Value *Sum = nullptr; - for (SmallVectorImpl<std::pair<const Loop *, const SCEV *> >::iterator - I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ) { + for (auto I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E;) { const Loop *CurLoop = I->first; const SCEV *Op = I->second; if (!Sum) { @@ -747,14 +748,13 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I)); // Sort by loop. Use a stable sort so that constants follow non-constants. 
- std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(*SE.DT)); + std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(SE.DT)); // Emit instructions to mul all the operands. Hoist as much as possible // out of loops. Value *Prod = nullptr; - for (SmallVectorImpl<std::pair<const Loop *, const SCEV *> >::iterator - I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ++I) { - const SCEV *Op = I->second; + for (const auto &I : OpsAndLoops) { + const SCEV *Op = I.second; if (!Prod) { // This is the first operand. Just expand it. Prod = expand(Op); @@ -788,7 +788,7 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { Value *LHS = expandCodeFor(S->getLHS(), Ty); if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) { - const APInt &RHS = SC->getValue()->getValue(); + const APInt &RHS = SC->getAPInt(); if (RHS.isPowerOf2()) return InsertBinop(Instruction::LShr, LHS, ConstantInt::get(Ty, RHS.logBase2())); @@ -834,7 +834,7 @@ bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV, for (User::op_iterator OI = IncV->op_begin()+1, OE = IncV->op_end(); OI != OE; ++OI) if (Instruction *OInst = dyn_cast<Instruction>(OI)) - if (!SE.DT->dominates(OInst, IVIncInsertPos)) + if (!SE.DT.dominates(OInst, IVIncInsertPos)) return false; } // Advance to the next instruction. @@ -873,19 +873,18 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, case Instruction::Add: case Instruction::Sub: { Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1)); - if (!OInst || SE.DT->dominates(OInst, InsertPos)) + if (!OInst || SE.DT.dominates(OInst, InsertPos)) return dyn_cast<Instruction>(IncV->getOperand(0)); return nullptr; } case Instruction::BitCast: return dyn_cast<Instruction>(IncV->getOperand(0)); case Instruction::GetElementPtr: - for (Instruction::op_iterator I = IncV->op_begin()+1, E = IncV->op_end(); - I != E; ++I) { + for (auto I = IncV->op_begin() + 1, E = IncV->op_end(); I != E; ++I) { if (isa<Constant>(*I)) continue; if (Instruction *OInst = dyn_cast<Instruction>(*I)) { - if (!SE.DT->dominates(OInst, InsertPos)) + if (!SE.DT.dominates(OInst, InsertPos)) return nullptr; } if (allowScale) { @@ -912,13 +911,16 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, /// it available to other uses in this loop. Recursively hoist any operands, /// until we reach a value that dominates InsertPos. bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { - if (SE.DT->dominates(IncV, InsertPos)) + if (SE.DT.dominates(IncV, InsertPos)) return true; // InsertPos must itself dominate IncV so that IncV's new position satisfies // its existing users. - if (isa<PHINode>(InsertPos) - || !SE.DT->dominates(InsertPos->getParent(), IncV->getParent())) + if (isa<PHINode>(InsertPos) || + !SE.DT.dominates(InsertPos->getParent(), IncV->getParent())) + return false; + + if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos)) return false; // Check that the chain of IV operands leading back to Phi can be hoisted. @@ -930,11 +932,10 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { // IncV is safe to hoist. 
IVIncs.push_back(IncV); IncV = Oper; - if (SE.DT->dominates(IncV, InsertPos)) + if (SE.DT.dominates(IncV, InsertPos)) break; } - for (SmallVectorImpl<Instruction*>::reverse_iterator I = IVIncs.rbegin(), - E = IVIncs.rend(); I != E; ++I) { + for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) { (*I)->moveBefore(InsertPos); } return true; @@ -1002,7 +1003,7 @@ static void hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist, } /// \brief Check whether we can cheaply express the requested SCEV in terms of -/// the available PHI SCEV by truncation and/or invertion of the step. +/// the available PHI SCEV by truncation and/or inversion of the step. static bool canBeCheaplyTransformed(ScalarEvolution &SE, const SCEVAddRecExpr *Phi, const SCEVAddRecExpr *Requested, @@ -1084,12 +1085,13 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, // Only try partially matching scevs that need truncation and/or // step-inversion if we know this loop is outside the current loop. - bool TryNonMatchingSCEV = IVIncInsertLoop && - SE.DT->properlyDominates(LatchBlock, IVIncInsertLoop->getHeader()); + bool TryNonMatchingSCEV = + IVIncInsertLoop && + SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader()); - for (BasicBlock::iterator I = L->getHeader()->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) { - if (!SE.isSCEVable(PN->getType())) + for (auto &I : *L->getHeader()) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN || !SE.isSCEVable(PN->getType())) continue; const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PN)); @@ -1142,7 +1144,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, // Potentially, move the increment. We have made sure in // isExpandedAddRecExprPHI or hoistIVInc that this is possible. if (L == IVIncInsertLoop) - hoistBeforePos(SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch); + hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch); // Ok, the add recurrence looks usable. // Remember this PHI, even in post-inc mode. @@ -1167,13 +1169,13 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, PostIncLoops.clear(); // Expand code for the start value. - Value *StartV = expandCodeFor(Normalized->getStart(), ExpandTy, - L->getHeader()->begin()); + Value *StartV = + expandCodeFor(Normalized->getStart(), ExpandTy, &L->getHeader()->front()); // StartV must be hoisted into L's preheader to dominate the new phi. assert(!isa<Instruction>(StartV) || - SE.DT->properlyDominates(cast<Instruction>(StartV)->getParent(), - L->getHeader())); + SE.DT.properlyDominates(cast<Instruction>(StartV)->getParent(), + L->getHeader())); // Expand code for the step value. Do this before creating the PHI so that PHI // reuse code doesn't see an incomplete PHI. @@ -1185,7 +1187,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, if (useSubtract) Step = SE.getNegativeSCEV(Step); // Expand the step somewhere that dominates the loop header. - Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin()); + Value *StepV = expandCodeFor(Step, IntTy, &L->getHeader()->front()); // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if // we actually do emit an addition. 
It does not apply if we emit a @@ -1249,9 +1251,8 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { if (PostIncLoops.count(L)) { PostIncLoopSet Loops; Loops.insert(L); - Normalized = - cast<SCEVAddRecExpr>(TransformForPostIncUse(Normalize, S, nullptr, - nullptr, Loops, SE, *SE.DT)); + Normalized = cast<SCEVAddRecExpr>(TransformForPostIncUse( + Normalize, S, nullptr, nullptr, Loops, SE, SE.DT)); } // Strip off any non-loop-dominating component from the addrec start. @@ -1301,9 +1302,9 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // For an expansion to use the postinc form, the client must call // expandCodeFor with an InsertPoint that is either outside the PostIncLoop // or dominated by IVIncInsertPos. - if (isa<Instruction>(Result) - && !SE.DT->dominates(cast<Instruction>(Result), - Builder.GetInsertPoint())) { + if (isa<Instruction>(Result) && + !SE.DT.dominates(cast<Instruction>(Result), + &*Builder.GetInsertPoint())) { // The induction variable's postinc expansion does not dominate this use. // IVUsers tries to prevent this case, so it is rare. However, it can // happen when an IVUser outside the loop is not dominated by the latch @@ -1321,7 +1322,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { { // Expand the step somewhere that dominates the loop header. BuilderType::InsertPointGuard Guard(Builder); - StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin()); + StepV = expandCodeFor(Step, IntTy, &L->getHeader()->front()); } Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract); } @@ -1395,13 +1396,9 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(), S->getNoWrapFlags(SCEV::FlagNW))); BasicBlock::iterator NewInsertPt = - std::next(BasicBlock::iterator(cast<Instruction>(V))); - BuilderType::InsertPointGuard Guard(Builder); - while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt) || - isa<LandingPadInst>(NewInsertPt)) - ++NewInsertPt; + findInsertPointAfter(cast<Instruction>(V), Builder.GetInsertBlock()); V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr, - NewInsertPt); + &*NewInsertPt); return V; } @@ -1442,7 +1439,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { BasicBlock *Header = L->getHeader(); pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header); CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar", - Header->begin()); + &Header->front()); rememberInstruction(CanonicalIV); SmallSet<BasicBlock *, 4> PredSeen; @@ -1587,7 +1584,8 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty, Instruction *IP) { - Builder.SetInsertPoint(IP->getParent(), IP); + assert(IP); + Builder.SetInsertPoint(IP); return expandCodeFor(SH, Ty); } @@ -1605,8 +1603,8 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) { Value *SCEVExpander::expand(const SCEV *S) { // Compute an insertion point for this SCEV object. Hoist the instructions // as far out in the loop nest as possible. 
- Instruction *InsertPt = Builder.GetInsertPoint(); - for (Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock()); ; + Instruction *InsertPt = &*Builder.GetInsertPoint(); + for (Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock());; L = L->getParentLoop()) if (SE.isLoopInvariant(S, L)) { if (!L) break; @@ -1616,30 +1614,29 @@ Value *SCEVExpander::expand(const SCEV *S) { // LSR sets the insertion point for AddRec start/step values to the // block start to simplify value reuse, even though it's an invalid // position. SCEVExpander must correct for this in all cases. - InsertPt = L->getHeader()->getFirstInsertionPt(); + InsertPt = &*L->getHeader()->getFirstInsertionPt(); } } else { // If the SCEV is computable at this level, insert it into the header // after the PHIs (and after any other instructions that we've inserted // there) so that it is guaranteed to dominate any user inside the loop. if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L)) - InsertPt = L->getHeader()->getFirstInsertionPt(); + InsertPt = &*L->getHeader()->getFirstInsertionPt(); while (InsertPt != Builder.GetInsertPoint() && (isInsertedInstruction(InsertPt) || isa<DbgInfoIntrinsic>(InsertPt))) { - InsertPt = std::next(BasicBlock::iterator(InsertPt)); + InsertPt = &*std::next(InsertPt->getIterator()); } break; } // Check to see if we already expanded this here. - std::map<std::pair<const SCEV *, Instruction *>, TrackingVH<Value> >::iterator - I = InsertedExpressions.find(std::make_pair(S, InsertPt)); + auto I = InsertedExpressions.find(std::make_pair(S, InsertPt)); if (I != InsertedExpressions.end()) return I->second; BuilderType::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(InsertPt->getParent(), InsertPt); + Builder.SetInsertPoint(InsertPt); // Expand the expression into instructions. Value *V = visit(S); @@ -1677,8 +1674,8 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L, // Emit code for it. BuilderType::InsertPointGuard Guard(Builder); - PHINode *V = cast<PHINode>(expandCodeFor(H, nullptr, - L->getHeader()->begin())); + PHINode *V = + cast<PHINode>(expandCodeFor(H, nullptr, &L->getHeader()->front())); return V; } @@ -1694,10 +1691,13 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, const TargetTransformInfo *TTI) { // Find integer phis in order of increasing width. SmallVector<PHINode*, 8> Phis; - for (BasicBlock::iterator I = L->getHeader()->begin(); - PHINode *Phi = dyn_cast<PHINode>(I); ++I) { - Phis.push_back(Phi); + for (auto &I : *L->getHeader()) { + if (auto *PN = dyn_cast<PHINode>(&I)) + Phis.push_back(PN); + else + break; } + if (TTI) std::sort(Phis.begin(), Phis.end(), [](Value *LHS, Value *RHS) { // Put pointers at the back and make sure pointer < pointer = false. @@ -1711,13 +1711,23 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT, DenseMap<const SCEV *, PHINode *> ExprToIVMap; // Process phis from wide to narrow. Map wide phis to their truncation // so narrow phis can reuse them. - for (SmallVectorImpl<PHINode*>::const_iterator PIter = Phis.begin(), - PEnd = Phis.end(); PIter != PEnd; ++PIter) { - PHINode *Phi = *PIter; + for (PHINode *Phi : Phis) { + auto SimplifyPHINode = [&](PHINode *PN) -> Value * { + if (Value *V = SimplifyInstruction(PN, DL, &SE.TLI, &SE.DT, &SE.AC)) + return V; + if (!SE.isSCEVable(PN->getType())) + return nullptr; + auto *Const = dyn_cast<SCEVConstant>(SE.getSCEV(PN)); + if (!Const) + return nullptr; + return Const->getValue(); + }; // Fold constant phis. 
They may be congruent to other constant phis and
    // would confuse the logic below that expects proper IVs.
-    if (Value *V = SimplifyInstruction(Phi, DL, SE.TLI, SE.DT, SE.AC)) {
+    if (Value *V = SimplifyPHINode(Phi)) {
+      if (V->getType() != Phi->getType())
+        continue;
       Phi->replaceAllUsesWith(V);
       DeadInsts.emplace_back(Phi);
       ++NumElim;
@@ -1784,7 +1794,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
         if (OrigInc->getType() != IsomorphicInc->getType()) {
           Instruction *IP = nullptr;
           if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
-            IP = PN->getParent()->getFirstInsertionPt();
+            IP = &*PN->getParent()->getFirstInsertionPt();
           else
             IP = OrigInc->getNextNode();
@@ -1802,7 +1812,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
       ++NumElim;
       Value *NewIV = OrigPhiRef;
       if (OrigPhiRef->getType() != Phi->getType()) {
-        IRBuilder<> Builder(L->getHeader()->getFirstInsertionPt());
+        IRBuilder<> Builder(&*L->getHeader()->getFirstInsertionPt());
         Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
         NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName);
       }
@@ -1812,8 +1822,46 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
   return NumElim;
 }
 
+Value *SCEVExpander::findExistingExpansion(const SCEV *S,
+                                           const Instruction *At, Loop *L) {
+  using namespace llvm::PatternMatch;
+
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // Look for a suitable value in simple conditions at the loop exits.
+  for (BasicBlock *BB : ExitingBlocks) {
+    ICmpInst::Predicate Pred;
+    Instruction *LHS, *RHS;
+    BasicBlock *TrueBB, *FalseBB;
+
+    if (!match(BB->getTerminator(),
+               m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
+                    TrueBB, FalseBB)))
+      continue;
+
+    if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At))
+      return LHS;
+
+    if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At))
+      return RHS;
+  }
+
+  // There is potential to make this significantly smarter, but this simple
+  // heuristic already gets some interesting cases.
+
+  // Cannot find a suitable value.
+  return nullptr;
+}
+
 bool SCEVExpander::isHighCostExpansionHelper(
-    const SCEV *S, Loop *L, SmallPtrSetImpl<const SCEV *> &Processed) {
+    const SCEV *S, Loop *L, const Instruction *At,
+    SmallPtrSetImpl<const SCEV *> &Processed) {
+
+  // If we can find an existing value for this SCEV available at the point "At"
+  // then consider the expression cheap.
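// Illustrative sketch, not part of this change: the match above fires on
// exiting blocks whose terminator is a conditional branch on an integer
// compare of two instructions. A caller holding an insertion point can then
// reuse a compare operand instead of emitting new IR (InsertPt is a
// hypothetical name):
//
//   if (Value *V = findExistingExpansion(S, InsertPt, L))
//     return V; // S is already computed by one of the loop's exit tests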
+  if (At && findExistingExpansion(S, At, L) != nullptr)
+    return false;
 
   // Zero/One operand expressions
   switch (S->getSCEVType()) {
@@ -1821,14 +1869,14 @@ bool SCEVExpander::isHighCostExpansionHelper(
   case scConstant:
     return false;
   case scTruncate:
-    return isHighCostExpansionHelper(cast<SCEVTruncateExpr>(S)->getOperand(), L,
-                                     Processed);
+    return isHighCostExpansionHelper(cast<SCEVTruncateExpr>(S)->getOperand(),
+                                     L, At, Processed);
   case scZeroExtend:
     return isHighCostExpansionHelper(cast<SCEVZeroExtendExpr>(S)->getOperand(),
-                                     L, Processed);
+                                     L, At, Processed);
   case scSignExtend:
     return isHighCostExpansionHelper(cast<SCEVSignExtendExpr>(S)->getOperand(),
-                                     L, Processed);
+                                     L, At, Processed);
   }
 
   if (!Processed.insert(S).second)
@@ -1836,10 +1884,10 @@ bool SCEVExpander::isHighCostExpansionHelper(
 
   if (auto *UDivExpr = dyn_cast<SCEVUDivExpr>(S)) {
     // If the divisor is a power of two and the SCEV type fits in a native
-    // integer, consider the divison cheap irrespective of whether it occurs in
+    // integer, consider the division cheap irrespective of whether it occurs in
     // the user code since it can be lowered into a right shift.
     if (auto *SC = dyn_cast<SCEVConstant>(UDivExpr->getRHS()))
-      if (SC->getValue()->getValue().isPowerOf2()) {
+      if (SC->getAPInt().isPowerOf2()) {
         const DataLayout &DL =
             L->getHeader()->getParent()->getParent()->getDataLayout();
         unsigned Width = cast<IntegerType>(UDivExpr->getType())->getBitWidth();
@@ -1855,22 +1903,14 @@ bool SCEVExpander::isHighCostExpansionHelper(
     if (!ExitingBB)
       return true;
 
-    BranchInst *ExitingBI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
-    if (!ExitingBI || !ExitingBI->isConditional())
+    // At the beginning of this function we already tried to find an existing
+    // value for plain 'S'. Now try to lookup 'S + 1' since it is a common
+    // pattern involving division. This is just a simple search heuristic.
+    if (!At)
+      At = &ExitingBB->back();
+    if (!findExistingExpansion(
+            SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), At, L))
       return true;
-
-    ICmpInst *OrigCond = dyn_cast<ICmpInst>(ExitingBI->getCondition());
-    if (!OrigCond)
-      return true;
-
-    const SCEV *RHS = SE.getSCEV(OrigCond->getOperand(1));
-    RHS = SE.getMinusSCEV(RHS, SE.getConstant(RHS->getType(), 1));
-    if (RHS != S) {
-      const SCEV *LHS = SE.getSCEV(OrigCond->getOperand(0));
-      LHS = SE.getMinusSCEV(LHS, SE.getConstant(LHS->getType(), 1));
-      if (LHS != S)
-        return true;
-    }
   }
 
   // HowManyLessThans uses a Max expression whenever the loop is not guarded by
@@ -1882,11 +1922,9 @@ bool SCEVExpander::isHighCostExpansionHelper(
   // BackedgeTakenCount. They may already exist in program code, and if not,
   // they are not too expensive to rematerialize.
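// Illustrative note, not part of this change: the 'S + 1' probe above
// reflects how exit tests are usually written. For a loop
//
//   for (i = 0; i != n; ++i)
//
// the backedge-taken count S is n - 1, while the exiting block compares
// against n itself, so looking up S + 1 finds the existing value n even
// though S never appears verbatim in the IR.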
if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(S)) {
-    for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
-         I != E; ++I) {
-      if (isHighCostExpansionHelper(*I, L, Processed))
+    for (auto *Op : NAry->operands())
+      if (isHighCostExpansionHelper(Op, L, At, Processed))
         return true;
-    }
   }
 
   // If we haven't recognized an expensive SCEV pattern, assume it's an
@@ -1894,6 +1932,43 @@ bool SCEVExpander::isHighCostExpansionHelper(
   return false;
 }
 
+Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
+                                            Instruction *IP) {
+  assert(IP);
+  switch (Pred->getKind()) {
+  case SCEVPredicate::P_Union:
+    return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP);
+  case SCEVPredicate::P_Equal:
+    return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP);
+  }
+  llvm_unreachable("Unknown SCEV predicate type");
+}
+
+Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
+                                          Instruction *IP) {
+  Value *Expr0 = expandCodeFor(Pred->getLHS(), Pred->getLHS()->getType(), IP);
+  Value *Expr1 = expandCodeFor(Pred->getRHS(), Pred->getRHS()->getType(), IP);
+
+  Builder.SetInsertPoint(IP);
+  auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check");
+  return I;
+}
+
+Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
+                                          Instruction *IP) {
+  auto *BoolType = IntegerType::get(IP->getContext(), 1);
+  Value *Check = ConstantInt::getNullValue(BoolType);
+
+  // Loop over all checks in this set.
+  for (auto Pred : Union->getPredicates()) {
+    auto *NextCheck = expandCodeForPredicate(Pred, IP);
+    Builder.SetInsertPoint(IP);
+    Check = Builder.CreateOr(Check, NextCheck);
+  }
+
+  return Check;
+}
+
 namespace {
 // Search for a SCEV subexpression that is not safe to expand.  Any expression
 // that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
index b238fe4..b7fd5d5 100644
--- a/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -109,7 +109,7 @@ TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
       SmallVector<const SCEV *, 8> Operands;
       const Loop *L = AR->getLoop();
       // The addrec conceptually uses its operands at loop entry.
-      Instruction *LUser = L->getHeader()->begin();
+      Instruction *LUser = &L->getHeader()->front();
       // Transform each operand.
       for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
            I != E; ++I) {
diff --git a/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp
index a5fca3e..486f3a5 100644
--- a/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp
+++ b/contrib/llvm/lib/Analysis/ScopedNoAliasAA.cpp
@@ -26,28 +26,29 @@
 // ... = load %ptr2, !alias.scope !{ !scope1, !scope2 }, !noalias !{ !scope1 }
 //
 // When evaluating an aliasing query, if one of the instructions is associated
-// has a set of noalias scopes in some domain that is superset of the alias
+// with a set of noalias scopes in some domain that is a superset of the alias
 // scopes in that domain of some other instruction, then the two memory
 // accesses are assumed not to alias.
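// Illustrative sketch, not part of this change (PtrA/PtrB and the tag
// bundles are hypothetical): a client consults this result through the usual
// alias-analysis query interface,
//
//   ScopedNoAliasAAResult AAR(TLI);
//   AliasResult AR = AAR.alias(MemoryLocation(PtrA, SizeA, TagsA),
//                              MemoryLocation(PtrB, SizeB, TagsB));
//
// and receives NoAlias whenever the scope check described above succeeds in
// either direction.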
// //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" + using namespace llvm; // A handy option for disabling scoped no-alias functionality. The same effect // can also be achieved by stripping the associated metadata tags from IR, but // this option is sometimes more convenient. -static cl::opt<bool> -EnableScopedNoAlias("enable-scoped-noalias", cl::init(true)); +static cl::opt<bool> EnableScopedNoAlias("enable-scoped-noalias", + cl::init(true)); namespace { /// AliasScopeNode - This is a simple wrapper around an MDNode which provides @@ -57,7 +58,7 @@ class AliasScopeNode { const MDNode *Node; public: - AliasScopeNode() : Node(0) {} + AliasScopeNode() : Node(nullptr) {} explicit AliasScopeNode(const MDNode *N) : Node(N) {} /// getNode - Get the MDNode for this AliasScopeNode. @@ -70,79 +71,74 @@ public: return dyn_cast_or_null<MDNode>(Node->getOperand(1)); } }; +} // end of anonymous namespace -/// ScopedNoAliasAA - This is a simple alias analysis -/// implementation that uses scoped-noalias metadata to answer queries. -class ScopedNoAliasAA : public ImmutablePass, public AliasAnalysis { -public: - static char ID; // Class identification, replacement for typeinfo - ScopedNoAliasAA() : ImmutablePass(ID) { - initializeScopedNoAliasAAPass(*PassRegistry::getPassRegistry()); - } +AliasResult ScopedNoAliasAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + if (!EnableScopedNoAlias) + return AAResultBase::alias(LocA, LocB); - bool doInitialization(Module &M) override; + // Get the attached MDNodes. + const MDNode *AScopes = LocA.AATags.Scope, *BScopes = LocB.AATags.Scope; - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(const void *PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; - } + const MDNode *ANoAlias = LocA.AATags.NoAlias, *BNoAlias = LocB.AATags.NoAlias; -protected: - bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const; - void collectMDInDomain(const MDNode *List, const MDNode *Domain, - SmallPtrSetImpl<const MDNode *> &Nodes) const; - -private: - void getAnalysisUsage(AnalysisUsage &AU) const override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal) override; - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - ModRefBehavior getModRefBehavior(const Function *F) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; -}; -} // End of anonymous namespace + if (!mayAliasInScopes(AScopes, BNoAlias)) + return NoAlias; -// Register this pass... 
-char ScopedNoAliasAA::ID = 0; -INITIALIZE_AG_PASS(ScopedNoAliasAA, AliasAnalysis, "scoped-noalias", - "Scoped NoAlias Alias Analysis", false, true, false) + if (!mayAliasInScopes(BScopes, ANoAlias)) + return NoAlias; -ImmutablePass *llvm::createScopedNoAliasAAPass() { - return new ScopedNoAliasAA(); + // If they may alias, chain to the next AliasAnalysis. + return AAResultBase::alias(LocA, LocB); } -bool ScopedNoAliasAA::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; +ModRefInfo ScopedNoAliasAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { + if (!EnableScopedNoAlias) + return AAResultBase::getModRefInfo(CS, Loc); + + if (!mayAliasInScopes(Loc.AATags.Scope, CS.getInstruction()->getMetadata( + LLVMContext::MD_noalias))) + return MRI_NoModRef; + + if (!mayAliasInScopes( + CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), + Loc.AATags.NoAlias)) + return MRI_NoModRef; + + return AAResultBase::getModRefInfo(CS, Loc); } -void -ScopedNoAliasAA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); +ModRefInfo ScopedNoAliasAAResult::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + if (!EnableScopedNoAlias) + return AAResultBase::getModRefInfo(CS1, CS2); + + if (!mayAliasInScopes( + CS1.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), + CS2.getInstruction()->getMetadata(LLVMContext::MD_noalias))) + return MRI_NoModRef; + + if (!mayAliasInScopes( + CS2.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), + CS1.getInstruction()->getMetadata(LLVMContext::MD_noalias))) + return MRI_NoModRef; + + return AAResultBase::getModRefInfo(CS1, CS2); } -void -ScopedNoAliasAA::collectMDInDomain(const MDNode *List, const MDNode *Domain, - SmallPtrSetImpl<const MDNode *> &Nodes) const { +void ScopedNoAliasAAResult::collectMDInDomain( + const MDNode *List, const MDNode *Domain, + SmallPtrSetImpl<const MDNode *> &Nodes) const { for (unsigned i = 0, ie = List->getNumOperands(); i != ie; ++i) if (const MDNode *MD = dyn_cast<MDNode>(List->getOperand(i))) if (AliasScopeNode(MD).getDomain() == Domain) Nodes.insert(MD); } -bool -ScopedNoAliasAA::mayAliasInScopes(const MDNode *Scopes, - const MDNode *NoAlias) const { +bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes, + const MDNode *NoAlias) const { if (!Scopes || !NoAlias) return true; @@ -177,76 +173,40 @@ ScopedNoAliasAA::mayAliasInScopes(const MDNode *Scopes, return true; } -AliasResult ScopedNoAliasAA::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { - if (!EnableScopedNoAlias) - return AliasAnalysis::alias(LocA, LocB); - - // Get the attached MDNodes. 
- const MDNode *AScopes = LocA.AATags.Scope, - *BScopes = LocB.AATags.Scope; +ScopedNoAliasAAResult ScopedNoAliasAA::run(Function &F, + AnalysisManager<Function> *AM) { + return ScopedNoAliasAAResult(AM->getResult<TargetLibraryAnalysis>(F)); +} - const MDNode *ANoAlias = LocA.AATags.NoAlias, - *BNoAlias = LocB.AATags.NoAlias; +char ScopedNoAliasAA::PassID; - if (!mayAliasInScopes(AScopes, BNoAlias)) - return NoAlias; - - if (!mayAliasInScopes(BScopes, ANoAlias)) - return NoAlias; +char ScopedNoAliasAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(ScopedNoAliasAAWrapperPass, "scoped-noalias", + "Scoped NoAlias Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ScopedNoAliasAAWrapperPass, "scoped-noalias", + "Scoped NoAlias Alias Analysis", false, true) - // If they may alias, chain to the next AliasAnalysis. - return AliasAnalysis::alias(LocA, LocB); +ImmutablePass *llvm::createScopedNoAliasAAWrapperPass() { + return new ScopedNoAliasAAWrapperPass(); } -bool ScopedNoAliasAA::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); +ScopedNoAliasAAWrapperPass::ScopedNoAliasAAWrapperPass() : ImmutablePass(ID) { + initializeScopedNoAliasAAWrapperPassPass(*PassRegistry::getPassRegistry()); } -AliasAnalysis::ModRefBehavior -ScopedNoAliasAA::getModRefBehavior(ImmutableCallSite CS) { - return AliasAnalysis::getModRefBehavior(CS); +bool ScopedNoAliasAAWrapperPass::doInitialization(Module &M) { + Result.reset(new ScopedNoAliasAAResult( + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; } -AliasAnalysis::ModRefBehavior -ScopedNoAliasAA::getModRefBehavior(const Function *F) { - return AliasAnalysis::getModRefBehavior(F); +bool ScopedNoAliasAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; } -AliasAnalysis::ModRefResult -ScopedNoAliasAA::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { - if (!EnableScopedNoAlias) - return AliasAnalysis::getModRefInfo(CS, Loc); - - if (!mayAliasInScopes(Loc.AATags.Scope, CS.getInstruction()->getMetadata( - LLVMContext::MD_noalias))) - return NoModRef; - - if (!mayAliasInScopes( - CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), - Loc.AATags.NoAlias)) - return NoModRef; - - return AliasAnalysis::getModRefInfo(CS, Loc); -} - -AliasAnalysis::ModRefResult -ScopedNoAliasAA::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) { - if (!EnableScopedNoAlias) - return AliasAnalysis::getModRefInfo(CS1, CS2); - - if (!mayAliasInScopes( - CS1.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), - CS2.getInstruction()->getMetadata(LLVMContext::MD_noalias))) - return NoModRef; - - if (!mayAliasInScopes( - CS2.getInstruction()->getMetadata(LLVMContext::MD_alias_scope), - CS1.getInstruction()->getMetadata(LLVMContext::MD_noalias))) - return NoModRef; - - return AliasAnalysis::getModRefInfo(CS1, CS2); +void ScopedNoAliasAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } - diff --git a/contrib/llvm/lib/Analysis/SparsePropagation.cpp b/contrib/llvm/lib/Analysis/SparsePropagation.cpp index edd82f5..f5a927b 100644 --- a/contrib/llvm/lib/Analysis/SparsePropagation.cpp +++ b/contrib/llvm/lib/Analysis/SparsePropagation.cpp @@ -328,17 +328,17 @@ void SparseSolver::Solve(Function &F) { void SparseSolver::Print(Function &F, raw_ostream &OS) const { OS << "\nFUNCTION: " << F.getName() << "\n"; - 
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    if (!BBExecutable.count(BB))
+  for (auto &BB : F) {
+    if (!BBExecutable.count(&BB))
       OS << "INFEASIBLE: ";
     OS << "\t";
-    if (BB->hasName())
-      OS << BB->getName() << ":\n";
+    if (BB.hasName())
+      OS << BB.getName() << ":\n";
     else
       OS << "; anon bb\n";
-    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-      LatticeFunc->PrintValue(getLatticeState(I), OS);
-      OS << *I << "\n";
+    for (auto &I : BB) {
+      LatticeFunc->PrintValue(getLatticeState(&I), OS);
+      OS << I << "\n";
     }
 
     OS << "\n";
diff --git a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 635c50c..ce38819 100644
--- a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -52,19 +52,27 @@ static bool hasSinCosPiStret(const Triple &T) {
 /// specified target triple. This should be carefully written so that a missing
 /// target triple gets a sane set of defaults.
 static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
-                       const char *const *StandardNames) {
-#ifndef NDEBUG
+                       ArrayRef<const char *> StandardNames) {
   // Verify that the StandardNames array is in alphabetical order.
-  for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
-    if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
-      llvm_unreachable("TargetLibraryInfoImpl function names must be sorted");
+  assert(std::is_sorted(StandardNames.begin(), StandardNames.end(),
+                        [](const char *LHS, const char *RHS) {
+                          return strcmp(LHS, RHS) < 0;
+                        }) &&
+         "TargetLibraryInfoImpl function names must be sorted");
+
+  if (T.getArch() == Triple::r600 ||
+      T.getArch() == Triple::amdgcn) {
+    TLI.setUnavailable(LibFunc::ldexp);
+    TLI.setUnavailable(LibFunc::ldexpf);
+    TLI.setUnavailable(LibFunc::ldexpl);
   }
-#endif // !NDEBUG
 
-  // There are no library implementations of mempcy and memset for AMD gpus and
+  // There are no library implementations of memcpy and memset for AMD GPUs and
   // these can be difficult to lower in the backend.
   if (T.getArch() == Triple::r600 ||
-      T.getArch() == Triple::amdgcn) {
+      T.getArch() == Triple::amdgcn ||
+      T.getArch() == Triple::wasm32 ||
+      T.getArch() == Triple::wasm64) {
     TLI.setUnavailable(LibFunc::memcpy);
     TLI.setUnavailable(LibFunc::memset);
     TLI.setUnavailable(LibFunc::memset_pattern16);
@@ -72,13 +80,14 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
   }
 
   // memset_pattern16 is only available on iOS 3.0 and Mac OS X 10.5 and later.
+  // All versions of watchOS support it.
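// Illustrative sketch, not part of this change: transforms consume these
// availability bits through TargetLibraryInfo::has(), e.g.
//
//   if (TLI.has(LibFunc::memset_pattern16))
//     ;  // only then is it safe to form a memset_pattern16 call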
if (T.isMacOSX()) { if (T.isMacOSXVersionLT(10, 5)) TLI.setUnavailable(LibFunc::memset_pattern16); } else if (T.isiOS()) { if (T.isOSVersionLT(3, 0)) TLI.setUnavailable(LibFunc::memset_pattern16); - } else { + } else if (!T.isWatchOS()) { TLI.setUnavailable(LibFunc::memset_pattern16); } @@ -286,8 +295,13 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, } break; case Triple::IOS: + case Triple::TvOS: + case Triple::WatchOS: TLI.setUnavailable(LibFunc::exp10l); - if (T.isOSVersionLT(7, 0)) { + if (!T.isWatchOS() && (T.isOSVersionLT(7, 0) || + (T.isOSVersionLT(9, 0) && + (T.getArch() == Triple::x86 || + T.getArch() == Triple::x86_64)))) { TLI.setUnavailable(LibFunc::exp10); TLI.setUnavailable(LibFunc::exp10f); } else { @@ -311,12 +325,14 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, // ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and // Linux (GLIBC): // http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/ffsl.3.html - // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsl.c + // http://svn.freebsd.org/base/head/lib/libc/string/ffsl.c // http://www.gnu.org/software/gnulib/manual/html_node/ffsl.html switch (T.getOS()) { case Triple::Darwin: case Triple::MacOSX: case Triple::IOS: + case Triple::TvOS: + case Triple::WatchOS: case Triple::FreeBSD: case Triple::Linux: break; @@ -325,9 +341,14 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, } // ffsll is available on at least FreeBSD and Linux (GLIBC): - // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsll.c + // http://svn.freebsd.org/base/head/lib/libc/string/ffsll.c // http://www.gnu.org/software/gnulib/manual/html_node/ffsll.html switch (T.getOS()) { + case Triple::Darwin: + case Triple::MacOSX: + case Triple::IOS: + case Triple::TvOS: + case Triple::WatchOS: case Triple::FreeBSD: case Triple::Linux: break; @@ -335,6 +356,16 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc::ffsll); } + // The following functions are available on at least FreeBSD: + // http://svn.freebsd.org/base/head/lib/libc/string/fls.c + // http://svn.freebsd.org/base/head/lib/libc/string/flsl.c + // http://svn.freebsd.org/base/head/lib/libc/string/flsll.c + if (!T.isOSFreeBSD()) { + TLI.setUnavailable(LibFunc::fls); + TLI.setUnavailable(LibFunc::flsl); + TLI.setUnavailable(LibFunc::flsll); + } + // The following functions are available on at least Linux: if (!T.isOSLinux()) { TLI.setUnavailable(LibFunc::dunder_strdup); diff --git a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp index 7d1c3fb..9c1d3fd 100644 --- a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -46,30 +46,37 @@ TargetTransformInfo &TargetTransformInfo::operator=(TargetTransformInfo &&RHS) { return *this; } -unsigned TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty, - Type *OpTy) const { - return TTIImpl->getOperationCost(Opcode, Ty, OpTy); +int TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty, + Type *OpTy) const { + int Cost = TTIImpl->getOperationCost(Opcode, Ty, OpTy); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCallCost(FunctionType *FTy, - int NumArgs) const { - return TTIImpl->getCallCost(FTy, NumArgs); +int TargetTransformInfo::getCallCost(FunctionType *FTy, int NumArgs) const { + int 
Cost = TTIImpl->getCallCost(FTy, NumArgs);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
-unsigned
-TargetTransformInfo::getCallCost(const Function *F,
-                                 ArrayRef<const Value *> Arguments) const {
-  return TTIImpl->getCallCost(F, Arguments);
+int TargetTransformInfo::getCallCost(const Function *F,
+                                     ArrayRef<const Value *> Arguments) const {
+  int Cost = TTIImpl->getCallCost(F, Arguments);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
-unsigned
-TargetTransformInfo::getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<const Value *> Arguments) const {
-  return TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
+int TargetTransformInfo::getIntrinsicCost(
+    Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
+  int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
-unsigned TargetTransformInfo::getUserCost(const User *U) const {
-  return TTIImpl->getUserCost(U);
+int TargetTransformInfo::getUserCost(const User *U) const {
+  int Cost = TTIImpl->getUserCost(U);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
 bool TargetTransformInfo::hasBranchDivergence() const {
@@ -106,14 +113,20 @@ bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                         Scale, AddrSpace);
 }
 
-bool TargetTransformInfo::isLegalMaskedStore(Type *DataType,
-                                             int Consecutive) const {
-  return TTIImpl->isLegalMaskedStore(DataType, Consecutive);
+bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
+  return TTIImpl->isLegalMaskedStore(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
+  return TTIImpl->isLegalMaskedLoad(DataType);
 }
 
-bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType,
-                                            int Consecutive) const {
-  return TTIImpl->isLegalMaskedLoad(DataType, Consecutive);
+bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
+  return TTIImpl->isLegalMaskedGather(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
+  return TTIImpl->isLegalMaskedScatter(DataType);
 }
 
 int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
@@ -121,8 +134,10 @@ int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                               bool HasBaseReg,
                                               int64_t Scale,
                                               unsigned AddrSpace) const {
-  return TTIImpl->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
-                                       Scale, AddrSpace);
+  int Cost = TTIImpl->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
+                                           Scale, AddrSpace);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
 }
 
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
@@ -153,6 +168,10 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
+  return TTIImpl->enableInterleavedAccessVectorization();
+}
+
 TargetTransformInfo::PopcntSupportKind
 TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return TTIImpl->getPopcntSupport(IntTyWidthInBit);
@@ -162,22 +181,30 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
   return TTIImpl->haveFastSqrt(Ty);
 }
 
-unsigned TargetTransformInfo::getFPOpCost(Type *Ty) const {
-  return TTIImpl->getFPOpCost(Ty);
+int TargetTransformInfo::getFPOpCost(Type *Ty) const {
+  int Cost =
TTIImpl->getFPOpCost(Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const { - return TTIImpl->getIntImmCost(Imm, Ty); +int TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const { + int Cost = TTIImpl->getIntImmCost(Imm, Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) const { - return TTIImpl->getIntImmCost(Opcode, Idx, Imm, Ty); +int TargetTransformInfo::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) const { + int Cost = TTIImpl->getIntImmCost(Opcode, Idx, Imm, Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) const { - return TTIImpl->getIntImmCost(IID, Idx, Imm, Ty); +int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) const { + int Cost = TTIImpl->getIntImmCost(IID, Idx, Imm, Ty); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { @@ -192,81 +219,122 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } -unsigned TargetTransformInfo::getArithmeticInstrCost( +int TargetTransformInfo::getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo) const { - return TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, - int Index, Type *SubTp) const { - return TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp); +int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index, + Type *SubTp) const { + int Cost = TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { - return TTIImpl->getCastInstrCost(Opcode, Dst, Src); +int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCFInstrCost(unsigned Opcode) const { - return TTIImpl->getCFInstrCost(Opcode); +int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const { + int Cost = TTIImpl->getCFInstrCost(Opcode); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - return TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy); +int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return 
Cost; } -unsigned TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { - return TTIImpl->getVectorInstrCost(Opcode, Val, Index); +int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + int Cost = TTIImpl->getVectorInstrCost(Opcode, Val, Index); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - return TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); +int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned -TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); +int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + int Cost = + TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + +int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) const { + int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getInterleavedMemoryOpCost( +int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace) const { - return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned -TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type *> Tys) const { - return TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys); +int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Type *> Tys) const { + int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy, - ArrayRef<Type *> Tys) const { - return TTIImpl->getCallInstrCost(F, RetTy, Tys); +int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Value *> Args) const { + int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + +int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy, + ArrayRef<Type *> Tys) const { + int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { return TTIImpl->getNumberOfParts(Tp); } -unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp, - bool IsComplex) const { - 
return TTIImpl->getAddressComputationCost(Tp, IsComplex); +int TargetTransformInfo::getAddressComputationCost(Type *Tp, + bool IsComplex) const { + int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } -unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwiseForm) const { - return TTIImpl->getReductionCost(Opcode, Ty, IsPairwiseForm); +int TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty, + bool IsPairwiseForm) const { + int Cost = TTIImpl->getReductionCost(Opcode, Ty, IsPairwiseForm); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; } unsigned @@ -284,9 +352,9 @@ Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic( return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType); } -bool TargetTransformInfo::hasCompatibleFunctionAttributes( - const Function *Caller, const Function *Callee) const { - return TTIImpl->hasCompatibleFunctionAttributes(Caller, Callee); +bool TargetTransformInfo::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + return TTIImpl->areInlineCompatible(Caller, Callee); } TargetTransformInfo::Concept::~Concept() {} @@ -294,16 +362,16 @@ TargetTransformInfo::Concept::~Concept() {} TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} TargetIRAnalysis::TargetIRAnalysis( - std::function<Result(Function &)> TTICallback) + std::function<Result(const Function &)> TTICallback) : TTICallback(TTICallback) {} -TargetIRAnalysis::Result TargetIRAnalysis::run(Function &F) { +TargetIRAnalysis::Result TargetIRAnalysis::run(const Function &F) { return TTICallback(F); } char TargetIRAnalysis::PassID; -TargetIRAnalysis::Result TargetIRAnalysis::getDefaultTTI(Function &F) { +TargetIRAnalysis::Result TargetIRAnalysis::getDefaultTTI(const Function &F) { return Result(F.getParent()->getDataLayout()); } @@ -327,7 +395,7 @@ TargetTransformInfoWrapperPass::TargetTransformInfoWrapperPass( *PassRegistry::getPassRegistry()); } -TargetTransformInfo &TargetTransformInfoWrapperPass::getTTI(Function &F) { +TargetTransformInfo &TargetTransformInfoWrapperPass::getTTI(const Function &F) { TTI = TIRA.run(F); return *TTI; } diff --git a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index 4e9c6f6..9f92391 100644 --- a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -70,7 +70,7 @@ // A a; // } B; // -// For an acess to B.a.s, we attach !5 (a path tag node) to the load/store +// For an access to B.a.s, we attach !5 (a path tag node) to the load/store // instruction. The base type is !4 (struct B), the access type is !2 (scalar // type short) and the offset is 4. // @@ -121,15 +121,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/Passes.h" -#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/ADT/SetVector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/SetVector.h" using namespace llvm; // A handy option for disabling TBAA functionality. 
The same effect can also be @@ -138,199 +136,138 @@ using namespace llvm; static cl::opt<bool> EnableTBAA("enable-tbaa", cl::init(true)); namespace { - /// TBAANode - This is a simple wrapper around an MDNode which provides a - /// higher-level interface by hiding the details of how alias analysis - /// information is encoded in its operands. - class TBAANode { - const MDNode *Node; - - public: - TBAANode() : Node(nullptr) {} - explicit TBAANode(const MDNode *N) : Node(N) {} - - /// getNode - Get the MDNode for this TBAANode. - const MDNode *getNode() const { return Node; } - - /// getParent - Get this TBAANode's Alias tree parent. - TBAANode getParent() const { - if (Node->getNumOperands() < 2) - return TBAANode(); - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); - if (!P) - return TBAANode(); - // Ok, this node has a valid parent. Return it. - return TBAANode(P); - } - - /// TypeIsImmutable - Test if this TBAANode represents a type for objects - /// which are not modified (by any means) in the context where this - /// AliasAnalysis is relevant. - bool TypeIsImmutable() const { - if (Node->getNumOperands() < 3) - return false; - ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(2)); - if (!CI) - return false; - return CI->getValue()[0]; - } - }; - - /// This is a simple wrapper around an MDNode which provides a - /// higher-level interface by hiding the details of how alias analysis - /// information is encoded in its operands. - class TBAAStructTagNode { - /// This node should be created with createTBAAStructTagNode. - const MDNode *Node; +/// TBAANode - This is a simple wrapper around an MDNode which provides a +/// higher-level interface by hiding the details of how alias analysis +/// information is encoded in its operands. +class TBAANode { + const MDNode *Node; + +public: + TBAANode() : Node(nullptr) {} + explicit TBAANode(const MDNode *N) : Node(N) {} + + /// getNode - Get the MDNode for this TBAANode. + const MDNode *getNode() const { return Node; } + + /// getParent - Get this TBAANode's Alias tree parent. + TBAANode getParent() const { + if (Node->getNumOperands() < 2) + return TBAANode(); + MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); + if (!P) + return TBAANode(); + // Ok, this node has a valid parent. Return it. + return TBAANode(P); + } - public: - explicit TBAAStructTagNode(const MDNode *N) : Node(N) {} + /// TypeIsImmutable - Test if this TBAANode represents a type for objects + /// which are not modified (by any means) in the context where this + /// AliasAnalysis is relevant. + bool TypeIsImmutable() const { + if (Node->getNumOperands() < 3) + return false; + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(2)); + if (!CI) + return false; + return CI->getValue()[0]; + } +}; - /// Get the MDNode for this TBAAStructTagNode. - const MDNode *getNode() const { return Node; } +/// This is a simple wrapper around an MDNode which provides a +/// higher-level interface by hiding the details of how alias analysis +/// information is encoded in its operands. +class TBAAStructTagNode { + /// This node should be created with createTBAAStructTagNode. 
+ const MDNode *Node; - const MDNode *getBaseType() const { - return dyn_cast_or_null<MDNode>(Node->getOperand(0)); - } - const MDNode *getAccessType() const { - return dyn_cast_or_null<MDNode>(Node->getOperand(1)); - } - uint64_t getOffset() const { - return mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue(); - } - /// TypeIsImmutable - Test if this TBAAStructTagNode represents a type for - /// objects which are not modified (by any means) in the context where this - /// AliasAnalysis is relevant. - bool TypeIsImmutable() const { - if (Node->getNumOperands() < 4) - return false; - ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(3)); - if (!CI) - return false; - return CI->getValue()[0]; - } - }; - - /// This is a simple wrapper around an MDNode which provides a - /// higher-level interface by hiding the details of how alias analysis - /// information is encoded in its operands. - class TBAAStructTypeNode { - /// This node should be created with createTBAAStructTypeNode. - const MDNode *Node; - - public: - TBAAStructTypeNode() : Node(nullptr) {} - explicit TBAAStructTypeNode(const MDNode *N) : Node(N) {} - - /// Get the MDNode for this TBAAStructTypeNode. - const MDNode *getNode() const { return Node; } - - /// Get this TBAAStructTypeNode's field in the type DAG with - /// given offset. Update the offset to be relative to the field type. - TBAAStructTypeNode getParent(uint64_t &Offset) const { - // Parent can be omitted for the root node. - if (Node->getNumOperands() < 2) - return TBAAStructTypeNode(); +public: + explicit TBAAStructTagNode(const MDNode *N) : Node(N) {} - // Fast path for a scalar type node and a struct type node with a single - // field. - if (Node->getNumOperands() <= 3) { - uint64_t Cur = Node->getNumOperands() == 2 - ? 0 - : mdconst::extract<ConstantInt>(Node->getOperand(2)) - ->getZExtValue(); - Offset -= Cur; - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); - if (!P) - return TBAAStructTypeNode(); - return TBAAStructTypeNode(P); - } + /// Get the MDNode for this TBAAStructTagNode. + const MDNode *getNode() const { return Node; } - // Assume the offsets are in order. We return the previous field if - // the current offset is bigger than the given offset. - unsigned TheIdx = 0; - for (unsigned Idx = 1; Idx < Node->getNumOperands(); Idx += 2) { - uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(Idx + 1)) - ->getZExtValue(); - if (Cur > Offset) { - assert(Idx >= 3 && - "TBAAStructTypeNode::getParent should have an offset match!"); - TheIdx = Idx - 2; - break; - } - } - // Move along the last field. - if (TheIdx == 0) - TheIdx = Node->getNumOperands() - 2; - uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(TheIdx + 1)) - ->getZExtValue(); + const MDNode *getBaseType() const { + return dyn_cast_or_null<MDNode>(Node->getOperand(0)); + } + const MDNode *getAccessType() const { + return dyn_cast_or_null<MDNode>(Node->getOperand(1)); + } + uint64_t getOffset() const { + return mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue(); + } + /// TypeIsImmutable - Test if this TBAAStructTagNode represents a type for + /// objects which are not modified (by any means) in the context where this + /// AliasAnalysis is relevant. 
+ bool TypeIsImmutable() const { + if (Node->getNumOperands() < 4) + return false; + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(3)); + if (!CI) + return false; + return CI->getValue()[0]; + } +}; + +/// This is a simple wrapper around an MDNode which provides a +/// higher-level interface by hiding the details of how alias analysis +/// information is encoded in its operands. +class TBAAStructTypeNode { + /// This node should be created with createTBAAStructTypeNode. + const MDNode *Node; + +public: + TBAAStructTypeNode() : Node(nullptr) {} + explicit TBAAStructTypeNode(const MDNode *N) : Node(N) {} + + /// Get the MDNode for this TBAAStructTypeNode. + const MDNode *getNode() const { return Node; } + + /// Get this TBAAStructTypeNode's field in the type DAG with + /// given offset. Update the offset to be relative to the field type. + TBAAStructTypeNode getParent(uint64_t &Offset) const { + // Parent can be omitted for the root node. + if (Node->getNumOperands() < 2) + return TBAAStructTypeNode(); + + // Fast path for a scalar type node and a struct type node with a single + // field. + if (Node->getNumOperands() <= 3) { + uint64_t Cur = Node->getNumOperands() == 2 + ? 0 + : mdconst::extract<ConstantInt>(Node->getOperand(2)) + ->getZExtValue(); Offset -= Cur; - MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(TheIdx)); + MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1)); if (!P) return TBAAStructTypeNode(); return TBAAStructTypeNode(P); } - }; -} - -namespace { - /// TypeBasedAliasAnalysis - This is a simple alias analysis - /// implementation that uses TypeBased to answer queries. - class TypeBasedAliasAnalysis : public ImmutablePass, - public AliasAnalysis { - public: - static char ID; // Class identification, replacement for typeinfo - TypeBasedAliasAnalysis() : ImmutablePass(ID) { - initializeTypeBasedAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - bool doInitialization(Module &M) override; - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - void *getAdjustedAnalysisPointer(const void *PI) override { - if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; - return this; + // Assume the offsets are in order. We return the previous field if + // the current offset is bigger than the given offset. 
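// Illustrative example, not part of this change (the node is hypothetical):
// for a struct type node with two fields,
//
//   !{ !"S", !int, i64 0, !double, i64 4 }
//
// a query with Offset == 6 finds no field starting beyond 6, falls through
// to the last field at offset 4, and rewrites Offset to 6 - 4 == 2, i.e.
// the offset within that field's type.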
+ unsigned TheIdx = 0; + for (unsigned Idx = 1; Idx < Node->getNumOperands(); Idx += 2) { + uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(Idx + 1)) + ->getZExtValue(); + if (Cur > Offset) { + assert(Idx >= 3 && + "TBAAStructTypeNode::getParent should have an offset match!"); + TheIdx = Idx - 2; + break; + } } - - bool Aliases(const MDNode *A, const MDNode *B) const; - bool PathAliases(const MDNode *A, const MDNode *B) const; - - private: - void getAnalysisUsage(AnalysisUsage &AU) const override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override; - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - ModRefBehavior getModRefBehavior(const Function *F) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; - }; -} // End of anonymous namespace - -// Register this pass... -char TypeBasedAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(TypeBasedAliasAnalysis, AliasAnalysis, "tbaa", - "Type-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createTypeBasedAliasAnalysisPass() { - return new TypeBasedAliasAnalysis(); -} - -bool TypeBasedAliasAnalysis::doInitialization(Module &M) { - InitializeAliasAnalysis(this, &M.getDataLayout()); - return true; -} - -void -TypeBasedAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); + // Move along the last field. + if (TheIdx == 0) + TheIdx = Node->getNumOperands() - 2; + uint64_t Cur = mdconst::extract<ConstantInt>(Node->getOperand(TheIdx + 1)) + ->getZExtValue(); + Offset -= Cur; + MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(TheIdx)); + if (!P) + return TBAAStructTypeNode(); + return TBAAStructTypeNode(P); + } +}; } /// Check the first operand of the tbaa tag node, if it is a MDNode, we treat @@ -342,145 +279,36 @@ static bool isStructPathTBAA(const MDNode *MD) { return isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3; } -/// Aliases - Test whether the type represented by A may alias the -/// type represented by B. -bool -TypeBasedAliasAnalysis::Aliases(const MDNode *A, - const MDNode *B) const { - // Make sure that both MDNodes are struct-path aware. - if (isStructPathTBAA(A) && isStructPathTBAA(B)) - return PathAliases(A, B); - - // Keep track of the root node for A and B. - TBAANode RootA, RootB; - - // Climb the tree from A to see if we reach B. - for (TBAANode T(A); ; ) { - if (T.getNode() == B) - // B is an ancestor of A. - return true; - - RootA = T; - T = T.getParent(); - if (!T.getNode()) - break; - } - - // Climb the tree from B to see if we reach A. - for (TBAANode T(B); ; ) { - if (T.getNode() == A) - // A is an ancestor of B. - return true; - - RootB = T; - T = T.getParent(); - if (!T.getNode()) - break; - } - - // Neither node is an ancestor of the other. - - // If they have different roots, they're part of different potentially - // unrelated type systems, so we must be conservative. - if (RootA.getNode() != RootB.getNode()) - return true; - - // If they have the same root, then we've proved there's no alias. - return false; -} - -/// Test whether the struct-path tag represented by A may alias the -/// struct-path tag represented by B. 
-bool -TypeBasedAliasAnalysis::PathAliases(const MDNode *A, - const MDNode *B) const { - // Verify that both input nodes are struct-path aware. - assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware."); - assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware."); - - // Keep track of the root node for A and B. - TBAAStructTypeNode RootA, RootB; - TBAAStructTagNode TagA(A), TagB(B); - - // TODO: We need to check if AccessType of TagA encloses AccessType of - // TagB to support aggregate AccessType. If yes, return true. - - // Start from the base type of A, follow the edge with the correct offset in - // the type DAG and adjust the offset until we reach the base type of B or - // until we reach the Root node. - // Compare the adjusted offset once we have the same base. - - // Climb the type DAG from base type of A to see if we reach base type of B. - const MDNode *BaseA = TagA.getBaseType(); - const MDNode *BaseB = TagB.getBaseType(); - uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset(); - for (TBAAStructTypeNode T(BaseA); ; ) { - if (T.getNode() == BaseB) - // Base type of A encloses base type of B, check if the offsets match. - return OffsetA == OffsetB; - - RootA = T; - // Follow the edge with the correct offset, OffsetA will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetA); - if (!T.getNode()) - break; - } - - // Reset OffsetA and climb the type DAG from base type of B to see if we reach - // base type of A. - OffsetA = TagA.getOffset(); - for (TBAAStructTypeNode T(BaseB); ; ) { - if (T.getNode() == BaseA) - // Base type of B encloses base type of A, check if the offsets match. - return OffsetA == OffsetB; - - RootB = T; - // Follow the edge with the correct offset, OffsetB will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetB); - if (!T.getNode()) - break; - } - - // Neither node is an ancestor of the other. - - // If they have different roots, they're part of different potentially - // unrelated type systems, so we must be conservative. - if (RootA.getNode() != RootB.getNode()) - return true; - - // If they have the same root, then we've proved there's no alias. - return false; -} - -AliasResult TypeBasedAliasAnalysis::alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) { +AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { if (!EnableTBAA) - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must // be conservative. const MDNode *AM = LocA.AATags.TBAA; - if (!AM) return AliasAnalysis::alias(LocA, LocB); + if (!AM) + return AAResultBase::alias(LocA, LocB); const MDNode *BM = LocB.AATags.TBAA; - if (!BM) return AliasAnalysis::alias(LocA, LocB); + if (!BM) + return AAResultBase::alias(LocA, LocB); // If they may alias, chain to the next AliasAnalysis. if (Aliases(AM, BM)) - return AliasAnalysis::alias(LocA, LocB); + return AAResultBase::alias(LocA, LocB); // Otherwise return a definitive result. 
return NoAlias; } -bool TypeBasedAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) { +bool TypeBasedAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { if (!EnableTBAA) - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); const MDNode *M = Loc.AATags.TBAA; - if (!M) return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + if (!M) + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); // If this is an "immutable" type, we can assume the pointer is pointing // to constant memory. @@ -488,80 +316,82 @@ bool TypeBasedAliasAnalysis::pointsToConstantMemory(const MemoryLocation &Loc, (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable())) return true; - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } -AliasAnalysis::ModRefBehavior -TypeBasedAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { +FunctionModRefBehavior +TypeBasedAAResult::getModRefBehavior(ImmutableCallSite CS) { if (!EnableTBAA) - return AliasAnalysis::getModRefBehavior(CS); + return AAResultBase::getModRefBehavior(CS); - ModRefBehavior Min = UnknownModRefBehavior; + FunctionModRefBehavior Min = FMRB_UnknownModRefBehavior; // If this is an "immutable" type, we can assume the call doesn't write // to memory. if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) || (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable())) - Min = OnlyReadsMemory; + Min = FMRB_OnlyReadsMemory; - return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min); + return FunctionModRefBehavior(AAResultBase::getModRefBehavior(CS) & Min); } -AliasAnalysis::ModRefBehavior -TypeBasedAliasAnalysis::getModRefBehavior(const Function *F) { +FunctionModRefBehavior TypeBasedAAResult::getModRefBehavior(const Function *F) { // Functions don't have metadata. Just chain to the next implementation. 
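Note how getModRefBehavior above folds its own finding into the chained result with a plain bitwise AND: FunctionModRefBehavior is a lattice encoded as flag bits, so intersecting two conservative answers can only remove capabilities, never add them. A toy encoding of the idea (the enumerator names and values here are illustrative, not LLVM's):

#include <cassert>

// Toy mod/ref lattice encoded as bit flags, in the spirit of
// FunctionModRefBehavior (values illustrative, not LLVM's).
enum Behavior : unsigned {
  DoesNotAccessMemory = 0,
  ReadsMemory = 1,  // may read
  WritesMemory = 2, // may write
  UnknownBehavior = ReadsMemory | WritesMemory,
};

// Intersecting two conservative answers with '&' keeps only the effects
// both sources still allow, so extra knowledge can only tighten the result.
static Behavior intersect(Behavior A, Behavior B) { return Behavior(A & B); }

int main() {
  // The base analysis knows nothing; TBAA proves the accessed type is
  // immutable, i.e. the call cannot write through it ("Min" in the diff).
  Behavior Base = UnknownBehavior;
  Behavior Min = ReadsMemory; // plays the role of FMRB_OnlyReadsMemory
  assert(intersect(Base, Min) == ReadsMemory);
}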
- return AliasAnalysis::getModRefBehavior(F); + return AAResultBase::getModRefBehavior(F); } -AliasAnalysis::ModRefResult -TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) { +ModRefInfo TypeBasedAAResult::getModRefInfo(ImmutableCallSite CS, + const MemoryLocation &Loc) { if (!EnableTBAA) - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); if (const MDNode *L = Loc.AATags.TBAA) if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if (!Aliases(L, M)) - return NoModRef; + return MRI_NoModRef; - return AliasAnalysis::getModRefInfo(CS, Loc); + return AAResultBase::getModRefInfo(CS, Loc); } -AliasAnalysis::ModRefResult -TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { +ModRefInfo TypeBasedAAResult::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { if (!EnableTBAA) - return AliasAnalysis::getModRefInfo(CS1, CS2); + return AAResultBase::getModRefInfo(CS1, CS2); if (const MDNode *M1 = CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if (const MDNode *M2 = CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa)) if (!Aliases(M1, M2)) - return NoModRef; + return MRI_NoModRef; - return AliasAnalysis::getModRefInfo(CS1, CS2); + return AAResultBase::getModRefInfo(CS1, CS2); } bool MDNode::isTBAAVtableAccess() const { if (!isStructPathTBAA(this)) { - if (getNumOperands() < 1) return false; + if (getNumOperands() < 1) + return false; if (MDString *Tag1 = dyn_cast<MDString>(getOperand(0))) { - if (Tag1->getString() == "vtable pointer") return true; + if (Tag1->getString() == "vtable pointer") + return true; } return false; } // For struct-path aware TBAA, we use the access type of the tag. - if (getNumOperands() < 2) return false; + if (getNumOperands() < 2) + return false; MDNode *Tag = cast_or_null<MDNode>(getOperand(1)); - if (!Tag) return false; + if (!Tag) + return false; if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) { - if (Tag1->getString() == "vtable pointer") return true; + if (Tag1->getString() == "vtable pointer") + return true; } - return false; + return false; } MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { @@ -575,9 +405,11 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { bool StructPath = isStructPathTBAA(A) && isStructPathTBAA(B); if (StructPath) { A = cast_or_null<MDNode>(A->getOperand(1)); - if (!A) return nullptr; + if (!A) + return nullptr; B = cast_or_null<MDNode>(B->getOperand(1)); - if (!B) return nullptr; + if (!B) + return nullptr; } SmallSetVector<MDNode *, 4> PathA; @@ -604,7 +436,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { int IB = PathB.size() - 1; MDNode *Ret = nullptr; - while (IA >= 0 && IB >=0) { + while (IA >= 0 && IB >= 0) { if (PathA[IA] == PathB[IB]) Ret = PathA[IA]; else @@ -644,3 +476,147 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const { N.NoAlias = getMetadata(LLVMContext::MD_noalias); } +/// Aliases - Test whether the type represented by A may alias the +/// type represented by B. +bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const { + // Make sure that both MDNodes are struct-path aware. + if (isStructPathTBAA(A) && isStructPathTBAA(B)) + return PathAliases(A, B); + + // Keep track of the root node for A and B. + TBAANode RootA, RootB; + + // Climb the tree from A to see if we reach B. + for (TBAANode T(A);;) { + if (T.getNode() == B) + // B is an ancestor of A. 
+ return true; + + RootA = T; + T = T.getParent(); + if (!T.getNode()) + break; + } + + // Climb the tree from B to see if we reach A. + for (TBAANode T(B);;) { + if (T.getNode() == A) + // A is an ancestor of B. + return true; + + RootB = T; + T = T.getParent(); + if (!T.getNode()) + break; + } + + // Neither node is an ancestor of the other. + + // If they have different roots, they're part of different potentially + // unrelated type systems, so we must be conservative. + if (RootA.getNode() != RootB.getNode()) + return true; + + // If they have the same root, then we've proved there's no alias. + return false; +} + +/// Test whether the struct-path tag represented by A may alias the +/// struct-path tag represented by B. +bool TypeBasedAAResult::PathAliases(const MDNode *A, const MDNode *B) const { + // Verify that both input nodes are struct-path aware. + assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware."); + assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware."); + + // Keep track of the root node for A and B. + TBAAStructTypeNode RootA, RootB; + TBAAStructTagNode TagA(A), TagB(B); + + // TODO: We need to check if AccessType of TagA encloses AccessType of + // TagB to support aggregate AccessType. If yes, return true. + + // Start from the base type of A, follow the edge with the correct offset in + // the type DAG and adjust the offset until we reach the base type of B or + // until we reach the Root node. + // Compare the adjusted offset once we have the same base. + + // Climb the type DAG from base type of A to see if we reach base type of B. + const MDNode *BaseA = TagA.getBaseType(); + const MDNode *BaseB = TagB.getBaseType(); + uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset(); + for (TBAAStructTypeNode T(BaseA);;) { + if (T.getNode() == BaseB) + // Base type of A encloses base type of B, check if the offsets match. + return OffsetA == OffsetB; + + RootA = T; + // Follow the edge with the correct offset, OffsetA will be adjusted to + // be relative to the field type. + T = T.getParent(OffsetA); + if (!T.getNode()) + break; + } + + // Reset OffsetA and climb the type DAG from base type of B to see if we reach + // base type of A. + OffsetA = TagA.getOffset(); + for (TBAAStructTypeNode T(BaseB);;) { + if (T.getNode() == BaseA) + // Base type of B encloses base type of A, check if the offsets match. + return OffsetA == OffsetB; + + RootB = T; + // Follow the edge with the correct offset, OffsetB will be adjusted to + // be relative to the field type. + T = T.getParent(OffsetB); + if (!T.getNode()) + break; + } + + // Neither node is an ancestor of the other. + + // If they have different roots, they're part of different potentially + // unrelated type systems, so we must be conservative. + if (RootA.getNode() != RootB.getNode()) + return true; + + // If they have the same root, then we've proved there's no alias. 
+ return false; +} + +TypeBasedAAResult TypeBasedAA::run(Function &F, AnalysisManager<Function> *AM) { + return TypeBasedAAResult(AM->getResult<TargetLibraryAnalysis>(F)); +} + +char TypeBasedAA::PassID; + +char TypeBasedAAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(TypeBasedAAWrapperPass, "tbaa", + "Type-Based Alias Analysis", false, true) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(TypeBasedAAWrapperPass, "tbaa", "Type-Based Alias Analysis", + false, true) + +ImmutablePass *llvm::createTypeBasedAAWrapperPass() { + return new TypeBasedAAWrapperPass(); +} + +TypeBasedAAWrapperPass::TypeBasedAAWrapperPass() : ImmutablePass(ID) { + initializeTypeBasedAAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +bool TypeBasedAAWrapperPass::doInitialization(Module &M) { + Result.reset(new TypeBasedAAResult( + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI())); + return false; +} + +bool TypeBasedAAWrapperPass::doFinalization(Module &M) { + Result.reset(); + return false; +} + +void TypeBasedAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); +} diff --git a/contrib/llvm/lib/Analysis/ValueTracking.cpp b/contrib/llvm/lib/Analysis/ValueTracking.cpp index fa0d779..a83e207 100644 --- a/contrib/llvm/lib/Analysis/ValueTracking.cpp +++ b/contrib/llvm/lib/Analysis/ValueTracking.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/ValueTracking.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -43,7 +44,7 @@ const unsigned MaxDepth = 6; /// Enable an experimental feature to leverage information about dominating /// conditions to compute known bits. The individual options below control how -/// hard we search. The defaults are choosen to be fairly aggressive. If you +/// hard we search. The defaults are chosen to be fairly aggressive. If you /// run into compile time problems when testing, scale them back and report /// your findings. static cl::opt<bool> EnableDomConditions("value-tracking-dom-conditions", @@ -58,12 +59,12 @@ static cl::opt<unsigned> DomConditionsMaxDepth("dom-conditions-max-depth", /// conditions? static cl::opt<unsigned> DomConditionsMaxDomBlocks("dom-conditions-dom-blocks", cl::Hidden, - cl::init(20000)); + cl::init(20)); // Controls the number of uses of the value searched for possible // dominating comparisons. static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses", - cl::Hidden, cl::init(2000)); + cl::Hidden, cl::init(20)); // If true, don't consider only compares whose only use is a branch. 
static cl::opt<bool> DomConditionsSingleCmpUse("dom-conditions-single-cmp-use", @@ -185,6 +186,25 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, return ::isKnownNonZero(V, DL, Depth, Query(AC, safeCxtI(V, CxtI), DT)); } +bool llvm::isKnownNonNegative(Value *V, const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + bool NonNegative, Negative; + ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT); + return NonNegative; +} + +static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL, + const Query &Q); + +bool llvm::isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + return ::isKnownNonEqual(V1, V2, DL, Query(AC, + safeCxtI(V1, safeCxtI(V2, CxtI)), + DT)); +} + static bool MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout &DL, unsigned Depth, const Query &Q); @@ -320,7 +340,7 @@ static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW, } // If low bits are zero in either operand, output low known-0 bits. - // Also compute a conserative estimate for high known-0 bits. + // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. KnownOne.clearAllBits(); @@ -347,26 +367,30 @@ static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW, } void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, - APInt &KnownZero) { + APInt &KnownZero, + APInt &KnownOne) { unsigned BitWidth = KnownZero.getBitWidth(); unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1); - // Use the high end of the ranges to find leading zeros. - unsigned MinLeadingZeros = BitWidth; + KnownZero.setAllBits(); + KnownOne.setAllBits(); + for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); - if (Range.isWrappedSet()) - MinLeadingZeros = 0; // -1 has no zeros - unsigned LeadingZeros = (Upper->getValue() - 1).countLeadingZeros(); - MinLeadingZeros = std::min(LeadingZeros, MinLeadingZeros); - } - KnownZero = APInt::getHighBitsSet(BitWidth, MinLeadingZeros); + // The first CommonPrefixBits of all values in Range are equal. + unsigned CommonPrefixBits = + (Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros(); + + APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits); + KnownOne &= Range.getUnsignedMax() & Mask; + KnownZero &= ~Range.getUnsignedMax() & Mask; + } } static bool isEphemeralValueOf(Instruction *I, const Value *E) { @@ -374,20 +398,20 @@ static bool isEphemeralValueOf(Instruction *I, const Value *E) { SmallPtrSet<const Value *, 32> Visited; SmallPtrSet<const Value *, 16> EphValues; + // The instruction defining an assumption's condition itself is always + // considered ephemeral to that assumption (even if it has other + // non-ephemeral users). See r246696's test case for an example. + if (std::find(I->op_begin(), I->op_end(), E) != I->op_end()) + return true; + while (!WorkSet.empty()) { const Value *V = WorkSet.pop_back_val(); if (!Visited.insert(V).second) continue; // If all uses of this value are ephemeral, then so is this value. 
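The rewritten computeKnownBitsFromRangeMetadata above rests on one observation: every value in a non-wrapping unsigned range shares exactly the leading bits on which the range's minimum and maximum agree, and xor-ing the endpoints and counting leading zeros measures that common prefix. A self-contained 8-bit illustration for a single range (plain integers standing in for APInt):

#include <cassert>
#include <cstdint>

// Leading zeros of an 8-bit value (8 for zero).
static unsigned clz8(uint8_t V) {
  unsigned N = 0;
  for (int Bit = 7; Bit >= 0 && !(V >> Bit & 1); --Bit)
    ++N;
  return N;
}

// Known bits implied by a non-wrapping unsigned range [Lo, Hi]: all
// values agree on the leading bits where Lo and Hi agree.
static void knownBitsFromRange(uint8_t Lo, uint8_t Hi, uint8_t &KnownZero,
                               uint8_t &KnownOne) {
  unsigned CommonPrefixBits = clz8(Lo ^ Hi);
  uint8_t Mask = CommonPrefixBits >= 8
                     ? 0xFF
                     : uint8_t(0xFF << (8 - CommonPrefixBits));
  KnownOne = Hi & Mask;            // prefix bits set in every value
  KnownZero = uint8_t(~Hi) & Mask; // prefix bits clear in every value
}

int main() {
  uint8_t KZ, KO;
  knownBitsFromRange(0x40, 0x47, KZ, KO); // values are 01000xxx
  assert(KO == 0x40 && KZ == 0xB8);
}

For multiple ranges the diff starts both sets at all-ones and intersects each range's contribution; the sketch covers only one range.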
- bool FoundNEUse = false;
- for (const User *I : V->users())
- if (!EphValues.count(I)) {
- FoundNEUse = true;
- break;
- }
-
- if (!FoundNEUse) {
+ if (std::all_of(V->user_begin(), V->user_end(),
+ [&](const User *U) { return EphValues.count(U); })) {
 if (V == E)
 return true;
@@ -447,7 +471,7 @@ static bool isValidAssumeForContext(Value *V, const Query &Q) {
 for (BasicBlock::const_iterator I =
 std::next(BasicBlock::const_iterator(Q.CxtI)),
 IE(Inv); I != IE; ++I)
- if (!isSafeToSpeculativelyExecute(I) && !isAssumeLikeIntrinsic(I))
+ if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
 return false;
 return !isEphemeralValueOf(Inv, Q.CxtI);
@@ -464,14 +488,14 @@ static bool isValidAssumeForContext(Value *V, const Query &Q) {
 // of the block); the common case is that the assume will come first.
 for (BasicBlock::iterator I = std::next(BasicBlock::iterator(Inv)),
 IE = Inv->getParent()->end(); I != IE; ++I)
- if (I == Q.CxtI)
+ if (&*I == Q.CxtI)
 return true;
 // The context must come first...
 for (BasicBlock::const_iterator I =
 std::next(BasicBlock::const_iterator(Q.CxtI)),
 IE(Inv); I != IE; ++I)
- if (!isSafeToSpeculativelyExecute(I) && !isAssumeLikeIntrinsic(I))
+ if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
 return false;
 return !isEphemeralValueOf(Inv, Q.CxtI);
@@ -601,6 +625,11 @@ static void computeKnownBitsFromDominatingCondition(Value *V, APInt &KnownZero,
 if (!Q.DT || !Q.CxtI)
 return;
 Instruction *Cxt = const_cast<Instruction *>(Q.CxtI);
+ // The context instruction might be in a statically unreachable block. If
+ // so, asking dominator queries may yield surprising results. (e.g. the block
+ // may not have a dom tree node)
+ if (!Q.DT->isReachableFromEntry(Cxt->getParent()))
+ return;
 // Avoid useless work
 if (auto VI = dyn_cast<Instruction>(V))
@@ -647,7 +676,9 @@ static void computeKnownBitsFromDominatingCondition(Value *V, APInt &KnownZero,
 // instruction. Finding a condition where one path dominates the context
 // isn't enough because both the true and false cases could merge before
 // the context instruction we're actually interested in. Instead, we need
- // to ensure that the taken *edge* dominates the context instruction.
+ // to ensure that the taken *edge* dominates the context instruction. We
+ // know that the edge must be reachable since we started from a reachable
+ // block.
 BasicBlock *BB0 = BI->getSuccessor(0);
 BasicBlockEdge Edge(BI->getParent(), BB0);
 if (!Edge.isSingleEdge() || !Q.DT->dominates(Edge, Q.CxtI->getParent()))
@@ -941,6 +972,90 @@ static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
 }
 }
+// Compute known bits from a shift operator, including those with a
+// non-constant shift amount. KnownZero and KnownOne are the outputs of this
+// function. KnownZero2 and KnownOne2 are pre-allocated temporaries with the
+// same bit width as KnownZero and KnownOne. KZF and KOF are operator-specific
+// functors that, given the known-zero or known-one bits respectively, and a
+// shift amount, compute the implied known-zero or known-one bits of the shift
+// operator's result respectively for that shift amount. The results from calling
+// KZF and KOF are conservatively combined for all permitted shift amounts.
+template <typename KZFunctor, typename KOFunctor> +static void computeKnownBitsFromShiftOperator(Operator *I, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2, + const DataLayout &DL, unsigned Depth, const Query &Q, + KZFunctor KZF, KOFunctor KOF) { + unsigned BitWidth = KnownZero.getBitWidth(); + + if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { + unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1); + + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); + KnownZero = KZF(KnownZero, ShiftAmt); + KnownOne = KOF(KnownOne, ShiftAmt); + return; + } + + computeKnownBits(I->getOperand(1), KnownZero, KnownOne, DL, Depth + 1, Q); + + // Note: We cannot use KnownZero.getLimitedValue() here, because if + // BitWidth > 64 and any upper bits are known, we'll end up returning the + // limit value (which implies all bits are known). + uint64_t ShiftAmtKZ = KnownZero.zextOrTrunc(64).getZExtValue(); + uint64_t ShiftAmtKO = KnownOne.zextOrTrunc(64).getZExtValue(); + + // It would be more-clearly correct to use the two temporaries for this + // calculation. Reusing the APInts here to prevent unnecessary allocations. + KnownZero.clearAllBits(), KnownOne.clearAllBits(); + + // If we know the shifter operand is nonzero, we can sometimes infer more + // known bits. However this is expensive to compute, so be lazy about it and + // only compute it when absolutely necessary. + Optional<bool> ShifterOperandIsNonZero; + + // Early exit if we can't constrain any well-defined shift amount. + if (!(ShiftAmtKZ & (BitWidth - 1)) && !(ShiftAmtKO & (BitWidth - 1))) { + ShifterOperandIsNonZero = + isKnownNonZero(I->getOperand(1), DL, Depth + 1, Q); + if (!*ShifterOperandIsNonZero) + return; + } + + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, Depth + 1, Q); + + KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) { + // Combine the shifted known input bits only for those shift amounts + // compatible with its known constraints. + if ((ShiftAmt & ~ShiftAmtKZ) != ShiftAmt) + continue; + if ((ShiftAmt | ShiftAmtKO) != ShiftAmt) + continue; + // If we know the shifter is nonzero, we may be able to infer more known + // bits. This check is sunk down as far as possible to avoid the expensive + // call to isKnownNonZero if the cheaper checks above fail. + if (ShiftAmt == 0) { + if (!ShifterOperandIsNonZero.hasValue()) + ShifterOperandIsNonZero = + isKnownNonZero(I->getOperand(1), DL, Depth + 1, Q); + if (*ShifterOperandIsNonZero) + continue; + } + + KnownZero &= KZF(KnownZero2, ShiftAmt); + KnownOne &= KOF(KnownOne2, ShiftAmt); + } + + // If there are no compatible shift amounts, then we've proven that the shift + // amount must be >= the BitWidth, and the result is undefined. We could + // return anything we'd like, but we need to make sure the sets of known bits + // stay disjoint (it should be better for some other code to actually + // propagate the undef than to pick a value here using known bits). 
+ if ((KnownZero & KnownOne) != 0) + KnownZero.clearAllBits(), KnownOne.clearAllBits(); +} + static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, APInt &KnownOne, const DataLayout &DL, unsigned Depth, const Query &Q) { @@ -951,7 +1066,7 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, default: break; case Instruction::Load: if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range)) - computeKnownBitsFromRangeMetadata(*MD, KnownZero); + computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. @@ -962,6 +1077,22 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, KnownOne &= KnownOne2; // Output known-0 are known to be clear if zero in either the LHS | RHS. KnownZero |= KnownZero2; + + // and(x, add (x, -1)) is a common idiom that always clears the low bit; + // here we handle the more general case of adding any odd number by + // matching the form add(x, add(x, y)) where y is odd. + // TODO: This could be generalized to clearing any bit set in y where the + // following bit is known to be unset in y. + Value *Y = nullptr; + if (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)), + m_Value(Y))) || + match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)), + m_Value(Y)))) { + APInt KnownZero3(BitWidth, 0), KnownOne3(BitWidth, 0); + computeKnownBits(Y, KnownZero3, KnownOne3, DL, Depth + 1, Q); + if (KnownOne3.countTrailingOnes() > 0) + KnownZero |= APInt::getLowBitsSet(BitWidth, 1); + } break; } case Instruction::Or: { @@ -1050,7 +1181,8 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, } case Instruction::BitCast: { Type *SrcTy = I->getOperand(0)->getType(); - if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && + if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy() || + SrcTy->isFloatingPointTy()) && // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { @@ -1077,48 +1209,54 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); break; } - case Instruction::Shl: + case Instruction::Shl: { // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 - if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { - uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); - KnownZero <<= ShiftAmt; - KnownOne <<= ShiftAmt; - KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0 - } + auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { + return (KnownZero << ShiftAmt) | + APInt::getLowBitsSet(BitWidth, ShiftAmt); // Low bits known 0. + }; + + auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { + return KnownOne << ShiftAmt; + }; + + computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, + KnownZero2, KnownOne2, DL, Depth, Q, + KZF, KOF); break; - case Instruction::LShr: + } + case Instruction::LShr: { // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 - if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { - // Compute the new bits that are at the top now. - uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); - - // Unsigned shift right. 
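computeKnownBitsFromShiftOperator, completed above, handles non-constant shift amounts by enumerating every amount compatible with the shift operand's known bits and intersecting the per-amount results produced by the KZF/KOF functors. A compact 8-bit model of that enumeration for shl (toy code, not the LLVM API):

#include <cassert>
#include <cstdint>

// Toy 8-bit model of the enumeration for shl. AmtKZ/AmtKO are the
// known-zero/known-one bits of the shift amount; a concrete amount S is
// feasible iff it sets no bit known zero and every bit known one.
static void knownBitsShl(uint8_t ValKZ, uint8_t ValKO, uint8_t AmtKZ,
                         uint8_t AmtKO, uint8_t &KZ, uint8_t &KO) {
  KZ = 0xFF;
  KO = 0xFF; // start from "all known", intersect per feasible amount
  for (unsigned S = 0; S < 8; ++S) {
    if (S & AmtKZ)
      continue; // S has a bit set that is known zero
    if ((S & AmtKO) != AmtKO)
      continue; // S misses a bit that is known one
    // The operator-specific KZF/KOF step for shl:
    KZ &= uint8_t((ValKZ << S) | ((1u << S) - 1)); // low S bits become zero
    KO &= uint8_t(ValKO << S);
  }
  // Like the diff, clear both sets if no amount was feasible, since KZ
  // and KO would otherwise overlap.
  if (KZ & KO)
    KZ = KO = 0;
}

int main() {
  uint8_t KZ, KO;
  // Value is exactly 1; the shift amount is known odd and < 4, so 1 or 3.
  knownBitsShl(/*ValKZ=*/0xFE, /*ValKO=*/0x01,
               /*AmtKZ=*/0xFC, /*AmtKO=*/0x01, KZ, KO);
  assert(KZ == 0xF5 && KO == 0x00); // the result is 2 or 8
}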
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); - KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); - KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); - // high bits known zero. - KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - } + auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { + return APIntOps::lshr(KnownZero, ShiftAmt) | + // High bits known zero. + APInt::getHighBitsSet(BitWidth, ShiftAmt); + }; + + auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { + return APIntOps::lshr(KnownOne, ShiftAmt); + }; + + computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, + KnownZero2, KnownOne2, DL, Depth, Q, + KZF, KOF); break; - case Instruction::AShr: + } + case Instruction::AShr: { // (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 - if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { - // Compute the new bits that are at the top now. - uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { + return APIntOps::ashr(KnownZero, ShiftAmt); + }; - // Signed shift right. - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, DL, Depth + 1, Q); - KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); - KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); + auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { + return APIntOps::ashr(KnownOne, ShiftAmt); + }; - APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); - if (KnownZero[BitWidth-ShiftAmt-1]) // New bits are known zero. - KnownZero |= HighBits; - else if (KnownOne[BitWidth-ShiftAmt-1]) // New bits are known one. - KnownOne |= HighBits; - } + computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, + KnownZero2, KnownOne2, DL, Depth, Q, + KZF, KOF); break; + } case Instruction::Sub: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, @@ -1336,13 +1474,19 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, case Instruction::Call: case Instruction::Invoke: if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range)) - computeKnownBitsFromRangeMetadata(*MD, KnownZero); + computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); // If a range metadata is attached to this IntrinsicInst, intersect the // explicit range specified by the metadata and the implicit range of // the intrinsic. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { default: break; + case Intrinsic::bswap: + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, + Depth + 1, Q); + KnownZero |= KnownZero2.byteSwap(); + KnownOne |= KnownOne2.byteSwap(); + break; case Intrinsic::ctlz: case Intrinsic::cttz: { unsigned LowBits = Log2_32(BitWidth)+1; @@ -1353,8 +1497,24 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, break; } case Intrinsic::ctpop: { - unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, DL, + Depth + 1, Q); + // We can bound the space the count needs. Also, bits known to be zero + // can't contribute to the population. 
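The ctpop case that follows turns a cardinality bound into known bits: if at most N of the input bits can possibly be set, the population count is at most N, so every result bit above the leading bit of N is known zero. A small check of that arithmetic at an assumed 32-bit width:

#include <cassert>
#include <cstdint>

// Leading zeros of a 32-bit value (32 for zero).
static unsigned clz32(uint32_t V) {
  unsigned N = 0;
  for (int Bit = 31; Bit >= 0 && !(V >> Bit & 1); --Bit)
    ++N;
  return N;
}

int main() {
  const unsigned BitWidth = 32;
  // Suppose known-zero bits already rule out 12 of the 32 input bits.
  unsigned BitsPossiblySet = BitWidth - 12; // 20
  // Same bound as the diff: popcount <= 20, and 20 needs only 5 bits,
  // so the top clz32(20) = 27 bits of the result are known zero.
  unsigned LeadingZeros = clz32(BitsPossiblySet);
  assert(LeadingZeros == 27);
  assert((uint64_t(1) << (BitWidth - LeadingZeros)) - 1 >= BitsPossiblySet);
}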
+ unsigned BitsPossiblySet = BitWidth - KnownZero2.countPopulation(); + unsigned LeadingZeros = + APInt(BitWidth, BitsPossiblySet).countLeadingZeros(); + assert(LeadingZeros <= BitWidth); + KnownZero |= APInt::getHighBitsSet(BitWidth, LeadingZeros); + KnownOne &= ~KnownZero; + // TODO: we could bound KnownOne using the lower bound on the number + // of bits which might be set provided by popcnt KnownOne2. + break; + } + case Intrinsic::fabs: { + Type *Ty = II->getType(); + APInt SignBit = APInt::getSignBit(Ty->getScalarSizeInBits()); + KnownZero |= APInt::getSplat(Ty->getPrimitiveSizeInBits(), SignBit); break; } case Intrinsic::x86_sse42_crc32_64_64: @@ -1394,6 +1554,46 @@ static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, } } +static unsigned getAlignment(const Value *V, const DataLayout &DL) { + unsigned Align = 0; + if (auto *GO = dyn_cast<GlobalObject>(V)) { + Align = GO->getAlignment(); + if (Align == 0) { + if (auto *GVar = dyn_cast<GlobalVariable>(GO)) { + Type *ObjectType = GVar->getType()->getElementType(); + if (ObjectType->isSized()) { + // If the object is defined in the current Module, we'll be giving + // it the preferred alignment. Otherwise, we have to assume that it + // may only have the minimum ABI alignment. + if (GVar->isStrongDefinitionForLinker()) + Align = DL.getPreferredAlignment(GVar); + else + Align = DL.getABITypeAlignment(ObjectType); + } + } + } + } else if (const Argument *A = dyn_cast<Argument>(V)) { + Align = A->getType()->isPointerTy() ? A->getParamAlignment() : 0; + + if (!Align && A->hasStructRetAttr()) { + // An sret parameter has at least the ABI alignment of the return type. + Type *EltTy = cast<PointerType>(A->getType())->getElementType(); + if (EltTy->isSized()) + Align = DL.getABITypeAlignment(EltTy); + } + } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) + Align = AI->getAlignment(); + else if (auto CS = ImmutableCallSite(V)) + Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex); + else if (const LoadInst *LI = dyn_cast<LoadInst>(V)) + if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) { + ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0)); + Align = CI->getLimitedValue(); + } + + return Align; +} + /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. /// @@ -1416,8 +1616,9 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned BitWidth = KnownZero.getBitWidth(); assert((V->getType()->isIntOrIntVectorTy() || + V->getType()->isFPOrFPVectorTy() || V->getType()->getScalarType()->isPointerTy()) && - "Not integer or pointer type!"); + "Not integer, floating point, or pointer type!"); assert((DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) && (!V->getType()->isIntOrIntVectorTy() || V->getType()->getScalarSizeInBits() == BitWidth) && @@ -1454,59 +1655,6 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, return; } - // The address of an aligned GlobalValue has trailing zeros. - if (auto *GO = dyn_cast<GlobalObject>(V)) { - unsigned Align = GO->getAlignment(); - if (Align == 0) { - if (auto *GVar = dyn_cast<GlobalVariable>(GO)) { - Type *ObjectType = GVar->getType()->getElementType(); - if (ObjectType->isSized()) { - // If the object is defined in the current Module, we'll be giving - // it the preferred alignment. Otherwise, we have to assume that it - // may only have the minimum ABI alignment. 
- if (GVar->isStrongDefinitionForLinker()) - Align = DL.getPreferredAlignment(GVar); - else - Align = DL.getABITypeAlignment(ObjectType); - } - } - } - if (Align > 0) - KnownZero = APInt::getLowBitsSet(BitWidth, - countTrailingZeros(Align)); - else - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); - return; - } - - if (Argument *A = dyn_cast<Argument>(V)) { - unsigned Align = A->getType()->isPointerTy() ? A->getParamAlignment() : 0; - - if (!Align && A->hasStructRetAttr()) { - // An sret parameter has at least the ABI alignment of the return type. - Type *EltTy = cast<PointerType>(A->getType())->getElementType(); - if (EltTy->isSized()) - Align = DL.getABITypeAlignment(EltTy); - } - - if (Align) - KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)); - else - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); - - // Don't give up yet... there might be an assumption that provides more - // information... - computeKnownBitsFromAssume(V, KnownZero, KnownOne, DL, Depth, Q); - - // Or a dominating condition for that matter - if (EnableDomConditions && Depth <= DomConditionsMaxDepth) - computeKnownBitsFromDominatingCondition(V, KnownZero, KnownOne, DL, - Depth, Q); - return; - } - // Start out not knowing anything. KnownZero.clearAllBits(); KnownOne.clearAllBits(); @@ -1525,6 +1673,14 @@ void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, if (Operator *I = dyn_cast<Operator>(V)) computeKnownBitsFromOperator(I, KnownZero, KnownOne, DL, Depth, Q); + + // Aligned pointers have trailing zeros - refine KnownZero set + if (V->getType()->isPointerTy()) { + unsigned Align = getAlignment(V, DL); + if (Align) + KnownZero |= APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)); + } + // computeKnownBitsFromAssume and computeKnownBitsFromDominatingCondition // strictly refines KnownZero and KnownOne. Therefore, we run them after // computeKnownBitsFromOperator. @@ -1587,9 +1743,10 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth, return false; Value *X = nullptr, *Y = nullptr; - // A shift of a power of two is a power of two or zero. + // A shift left or a logical shift right of a power of two is a power of two + // or zero. if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) || - match(V, m_Shr(m_Value(X), m_Value())))) + match(V, m_LShr(m_Value(X), m_Value())))) return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q, DL); if (ZExtInst *ZI = dyn_cast<ZExtInst>(V)) @@ -1812,6 +1969,23 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, ComputeSignBit(X, XKnownNonNegative, XKnownNegative, DL, Depth, Q); if (XKnownNegative) return true; + + // If the shifter operand is a constant, and all of the bits shifted + // out are known to be zero, and X is known non-zero then at least one + // non-zero bit must remain. + if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) { + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(X, KnownZero, KnownOne, DL, Depth, Q); + + auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); + // Is there a known one in the portion not shifted out? + if (KnownOne.countLeadingZeros() < BitWidth - ShiftVal) + return true; + // Are all the bits to be shifted out known zero? + if (KnownZero.countTrailingOnes() >= ShiftVal) + return isKnownNonZero(X, DL, Depth, Q); + } } // div exact can only produce a zero if the dividend is zero. 
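The alignment refinement above is worth seeing in isolation: a pointer known to be Align-byte aligned is a multiple of Align, so its low log2(Align) bits are zero, exactly the APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)) bits the diff ORs into KnownZero. A plain-integer sketch:

#include <cassert>
#include <cstdint>

static unsigned countTrailingZeros32(uint32_t V) {
  unsigned N = 0;
  while (N < 32 && !(V >> N & 1))
    ++N;
  return N;
}

int main() {
  uint32_t KnownZero = 0; // whatever earlier analysis produced
  unsigned Align = 16;    // pointer known 16-byte aligned
  // Same refinement as the diff: the low log2(Align) bits must be zero.
  KnownZero |= (1u << countTrailingZeros32(Align)) - 1;
  assert(KnownZero == 0xF);
  uintptr_t P = 0x1230; // any multiple of 16 satisfies the claim
  assert((P & KnownZero) == 0);
}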
else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) { @@ -1871,6 +2045,26 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, isKnownNonZero(SI->getFalseValue(), DL, Depth, Q)) return true; } + // PHI + else if (PHINode *PN = dyn_cast<PHINode>(V)) { + // Try and detect a recurrence that monotonically increases from a + // starting value, as these are common as induction variables. + if (PN->getNumIncomingValues() == 2) { + Value *Start = PN->getIncomingValue(0); + Value *Induction = PN->getIncomingValue(1); + if (isa<ConstantInt>(Induction) && !isa<ConstantInt>(Start)) + std::swap(Start, Induction); + if (ConstantInt *C = dyn_cast<ConstantInt>(Start)) { + if (!C->isZero() && !C->isNegative()) { + ConstantInt *X; + if ((match(Induction, m_NSWAdd(m_Specific(PN), m_ConstantInt(X))) || + match(Induction, m_NUWAdd(m_Specific(PN), m_ConstantInt(X)))) && + !X->isNegative()) + return true; + } + } + } + } if (!BitWidth) return false; APInt KnownZero(BitWidth, 0); @@ -1879,6 +2073,51 @@ bool isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, return KnownOne != 0; } +/// Return true if V2 == V1 + X, where X is known non-zero. +static bool isAddOfNonZero(Value *V1, Value *V2, const DataLayout &DL, + const Query &Q) { + BinaryOperator *BO = dyn_cast<BinaryOperator>(V1); + if (!BO || BO->getOpcode() != Instruction::Add) + return false; + Value *Op = nullptr; + if (V2 == BO->getOperand(0)) + Op = BO->getOperand(1); + else if (V2 == BO->getOperand(1)) + Op = BO->getOperand(0); + else + return false; + return isKnownNonZero(Op, DL, 0, Q); +} + +/// Return true if it is known that V1 != V2. +static bool isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL, + const Query &Q) { + if (V1->getType()->isVectorTy() || V1 == V2) + return false; + if (V1->getType() != V2->getType()) + // We can't look through casts yet. + return false; + if (isAddOfNonZero(V1, V2, DL, Q) || isAddOfNonZero(V2, V1, DL, Q)) + return true; + + if (IntegerType *Ty = dyn_cast<IntegerType>(V1->getType())) { + // Are any known bits in V1 contradictory to known bits in V2? If V1 + // has a known zero where V2 has a known one, they must not be equal. + auto BitWidth = Ty->getBitWidth(); + APInt KnownZero1(BitWidth, 0); + APInt KnownOne1(BitWidth, 0); + computeKnownBits(V1, KnownZero1, KnownOne1, DL, 0, Q); + APInt KnownZero2(BitWidth, 0); + APInt KnownOne2(BitWidth, 0); + computeKnownBits(V2, KnownZero2, KnownOne2, DL, 0, Q); + + auto OppositeBits = (KnownZero1 & KnownOne2) | (KnownZero2 & KnownOne1); + if (OppositeBits.getBoolValue()) + return true; + } + return false; +} + /// Return true if 'V & Mask' is known to be zero. We use this predicate to /// simplify operations downstream. Mask is known to be zero for bits that V /// cannot have. @@ -2317,6 +2556,9 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) { switch (I->getOpcode()) { default: break; + // Unsigned integers are always nonnegative. + case Instruction::UIToFP: + return true; case Instruction::FMul: // x*x is always non-negative or a NaN. 
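The bit-level half of isKnownNonEqual above is the simplest possible disequality proof: if one value has a known zero in a position where the other has a known one, the two values must differ in that bit. Modeled on 8-bit known-bits pairs (toy types, not LLVM's):

#include <cassert>
#include <cstdint>

// Known bits for an 8-bit value: KZ bits are provably 0, KO provably 1.
struct Known {
  uint8_t KZ, KO;
};

// Two values cannot be equal if one is known 0 where the other is known 1.
static bool knownNonEqual(Known A, Known B) {
  uint8_t OppositeBits = (A.KZ & B.KO) | (B.KZ & A.KO);
  return OppositeBits != 0;
}

int main() {
  Known Even = {0x01, 0x00}; // low bit known zero (e.g. x << 1)
  Known Odd = {0x00, 0x01};  // low bit known one (e.g. y | 1)
  assert(knownNonEqual(Even, Odd));       // they differ in bit 0
  Known Unknown = {0x00, 0x00};
  assert(!knownNonEqual(Even, Unknown));  // no contradiction: may be equal
}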
if (I->getOperand(0) == I->getOperand(1)) @@ -2327,6 +2569,9 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) { case Instruction::FRem: return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) && CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); + case Instruction::Select: + return CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1) && + CannotBeOrderedLessThanZero(I->getOperand(2), Depth+1); case Instruction::FPExt: case Instruction::FPTrunc: // Widening/narrowing never change sign. @@ -2335,6 +2580,12 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) switch (II->getIntrinsicID()) { default: break; + case Intrinsic::maxnum: + return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) || + CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); + case Intrinsic::minnum: + return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) && + CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::fabs: @@ -2545,7 +2796,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, } // This insert value inserts something else than what we are looking for. - // See if the (aggregrate) value inserted into has the value we are + // See if the (aggregate) value inserted into has the value we are // looking for, then. if (*req_idx != *i) return FindInsertedValue(I->getAggregateOperand(), idx_range, @@ -2560,7 +2811,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, } if (ExtractValueInst *I = dyn_cast<ExtractValueInst>(V)) { - // If we're extracting a value from an aggregrate that was extracted from + // If we're extracting a value from an aggregate that was extracted from // something else, we can extract from that something else directly instead. // However, we will need to chain I's indices with the requested indices. @@ -2591,7 +2842,12 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL) { unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType()); APInt ByteOffset(BitWidth, 0); - while (1) { + + // We walk up the defs but use a visited set to handle unreachable code. In + // that case, we stop after accumulating the cycle once (not that it + // matters). + SmallPtrSet<Value *, 16> Visited; + while (Visited.insert(Ptr).second) { if (Ptr->getType()->isVectorTy()) break; @@ -2935,20 +3191,42 @@ static bool isDereferenceableFromAttribute(const Value *V, const DataLayout &DL, return isDereferenceableFromAttribute(V, Offset, Ty, DL, CtxI, DT, TLI); } -/// Return true if Value is always a dereferenceable pointer. 
-/// +static bool isAligned(const Value *Base, APInt Offset, unsigned Align, + const DataLayout &DL) { + APInt BaseAlign(Offset.getBitWidth(), getAlignment(Base, DL)); + + if (!BaseAlign) { + Type *Ty = Base->getType()->getPointerElementType(); + if (!Ty->isSized()) + return false; + BaseAlign = DL.getABITypeAlignment(Ty); + } + + APInt Alignment(Offset.getBitWidth(), Align); + + assert(Alignment.isPowerOf2() && "must be a power of 2!"); + return BaseAlign.uge(Alignment) && !(Offset & (Alignment-1)); +} + +static bool isAligned(const Value *Base, unsigned Align, const DataLayout &DL) { + Type *Ty = Base->getType(); + assert(Ty->isSized() && "must be sized"); + APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0); + return isAligned(Base, Offset, Align, DL); +} + /// Test if V is always a pointer to allocated and suitably aligned memory for /// a simple load or store. -static bool isDereferenceablePointer(const Value *V, const DataLayout &DL, - const Instruction *CtxI, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - SmallPtrSetImpl<const Value *> &Visited) { +static bool isDereferenceableAndAlignedPointer( + const Value *V, unsigned Align, const DataLayout &DL, + const Instruction *CtxI, const DominatorTree *DT, + const TargetLibraryInfo *TLI, SmallPtrSetImpl<const Value *> &Visited) { // Note that it is not safe to speculate into a malloc'd region because // malloc may return null. - // These are obviously ok. - if (isa<AllocaInst>(V)) return true; + // These are obviously ok if aligned. + if (isa<AllocaInst>(V)) + return isAligned(V, Align, DL); // It's not always safe to follow a bitcast, for example: // bitcast i8* (alloca i8) to i32* @@ -2963,21 +3241,22 @@ static bool isDereferenceablePointer(const Value *V, const DataLayout &DL, if (STy->isSized() && DTy->isSized() && (DL.getTypeStoreSize(STy) >= DL.getTypeStoreSize(DTy)) && (DL.getABITypeAlignment(STy) >= DL.getABITypeAlignment(DTy))) - return isDereferenceablePointer(BC->getOperand(0), DL, CtxI, - DT, TLI, Visited); + return isDereferenceableAndAlignedPointer(BC->getOperand(0), Align, DL, + CtxI, DT, TLI, Visited); } // Global variables which can't collapse to null are ok. if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) - return !GV->hasExternalWeakLinkage(); + if (!GV->hasExternalWeakLinkage()) + return isAligned(V, Align, DL); // byval arguments are okay. if (const Argument *A = dyn_cast<Argument>(V)) if (A->hasByValAttr()) - return true; - + return isAligned(V, Align, DL); + if (isDereferenceableFromAttribute(V, DL, CtxI, DT, TLI)) - return true; + return isAligned(V, Align, DL); // For GEPs, determine if the indexing lands within the allocated object. if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { @@ -2985,61 +3264,76 @@ static bool isDereferenceablePointer(const Value *V, const DataLayout &DL, Type *Ty = VTy->getPointerElementType(); const Value *Base = GEP->getPointerOperand(); - // Conservatively require that the base pointer be fully dereferenceable. + // Conservatively require that the base pointer be fully dereferenceable + // and aligned. if (!Visited.insert(Base).second) return false; - if (!isDereferenceablePointer(Base, DL, CtxI, - DT, TLI, Visited)) + if (!isDereferenceableAndAlignedPointer(Base, Align, DL, CtxI, DT, TLI, + Visited)) return false; - + APInt Offset(DL.getPointerTypeSizeInBits(VTy), 0); if (!GEP->accumulateConstantOffset(DL, Offset)) return false; - - // Check if the load is within the bounds of the underlying object. 
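The new isAligned helper above boils down to two integer facts for a power-of-two Align: the base pointer's own alignment must cover the requested one, and the constant offset must be a multiple of it, tested as Offset & (Align - 1). A direct plain-integer translation:

#include <cassert>
#include <cstdint>

// Mirrors isAligned(Base, Offset, Align) for power-of-two Align: the base
// alignment covers Align, and the offset keeps the low bits clear.
static bool isAligned(uint64_t BaseAlign, uint64_t Offset, uint64_t Align) {
  assert((Align & (Align - 1)) == 0 && "must be a power of 2!");
  return BaseAlign >= Align && (Offset & (Align - 1)) == 0;
}

int main() {
  assert(isAligned(/*BaseAlign=*/16, /*Offset=*/8, /*Align=*/8));
  assert(!isAligned(16, 4, 8)); // offset 4 breaks 8-byte alignment
  assert(!isAligned(4, 0, 8));  // base itself not aligned enough
}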
+ + // Check if the load is within the bounds of the underlying object + // and offset is aligned. uint64_t LoadSize = DL.getTypeStoreSize(Ty); Type *BaseType = Base->getType()->getPointerElementType(); - return (Offset + LoadSize).ule(DL.getTypeAllocSize(BaseType)); + assert(isPowerOf2_32(Align) && "must be a power of 2!"); + return (Offset + LoadSize).ule(DL.getTypeAllocSize(BaseType)) && + !(Offset & APInt(Offset.getBitWidth(), Align-1)); } // For gc.relocate, look through relocations - if (const IntrinsicInst *I = dyn_cast<IntrinsicInst>(V)) - if (I->getIntrinsicID() == Intrinsic::experimental_gc_relocate) { - GCRelocateOperands RelocateInst(I); - return isDereferenceablePointer(RelocateInst.getDerivedPtr(), DL, CtxI, - DT, TLI, Visited); - } + if (const GCRelocateInst *RelocateInst = dyn_cast<GCRelocateInst>(V)) + return isDereferenceableAndAlignedPointer( + RelocateInst->getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited); if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V)) - return isDereferenceablePointer(ASC->getOperand(0), DL, CtxI, - DT, TLI, Visited); + return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, DL, + CtxI, DT, TLI, Visited); // If we don't know, assume the worst. return false; } -bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL, - const Instruction *CtxI, - const DominatorTree *DT, - const TargetLibraryInfo *TLI) { +bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, + const DataLayout &DL, + const Instruction *CtxI, + const DominatorTree *DT, + const TargetLibraryInfo *TLI) { // When dereferenceability information is provided by a dereferenceable // attribute, we know exactly how many bytes are dereferenceable. If we can // determine the exact offset to the attributed variable, we can use that // information here. Type *VTy = V->getType(); Type *Ty = VTy->getPointerElementType(); + + // Require ABI alignment for loads without alignment specification + if (Align == 0) + Align = DL.getABITypeAlignment(Ty); + if (Ty->isSized()) { APInt Offset(DL.getTypeStoreSizeInBits(VTy), 0); const Value *BV = V->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); - + if (Offset.isNonNegative()) - if (isDereferenceableFromAttribute(BV, Offset, Ty, DL, - CtxI, DT, TLI)) + if (isDereferenceableFromAttribute(BV, Offset, Ty, DL, CtxI, DT, TLI) && + isAligned(BV, Offset, Align, DL)) return true; } SmallPtrSet<const Value *, 32> Visited; - return ::isDereferenceablePointer(V, DL, CtxI, DT, TLI, Visited); + return ::isDereferenceableAndAlignedPointer(V, Align, DL, CtxI, DT, TLI, + Visited); +} + +bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL, + const Instruction *CtxI, + const DominatorTree *DT, + const TargetLibraryInfo *TLI) { + return isDereferenceableAndAlignedPointer(V, 1, DL, CtxI, DT, TLI); } bool llvm::isSafeToSpeculativelyExecute(const Value *V, @@ -3089,10 +3383,15 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, const LoadInst *LI = cast<LoadInst>(Inst); if (!LI->isUnordered() || // Speculative load may create a race that did not exist in the source. - LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread)) + LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeThread) || + // Speculative load may load data from dirty regions. 
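For the GEP case just above, in-bounds dereferenceability is pure offset arithmetic: the access's last byte must not pass the end of the underlying object, and the accumulated offset must respect the requested alignment. In plain integers (the diff does this in APInt, which also sidesteps overflow):

#include <cassert>
#include <cstdint>

// Mirrors the GEP check: the access [Offset, Offset + LoadSize) stays
// inside an object of AllocSize bytes, at the requested alignment.
// Note: the real code's APInt ule cannot wrap; with plain integers
// Offset + LoadSize could overflow for adversarial inputs.
static bool gepAccessOk(uint64_t Offset, uint64_t LoadSize,
                        uint64_t AllocSize, uint64_t Align) {
  return Offset + LoadSize <= AllocSize && (Offset & (Align - 1)) == 0;
}

int main() {
  // struct { int a[4]; }: 16 bytes, loading a 4-byte int.
  assert(gepAccessOk(/*Offset=*/12, /*LoadSize=*/4, /*AllocSize=*/16, 4));
  assert(!gepAccessOk(16, 4, 16, 4)); // one element past the end
  assert(!gepAccessOk(2, 4, 16, 4));  // misaligned offset
}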
+ LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeAddress)) return false; const DataLayout &DL = LI->getModule()->getDataLayout(); - return isDereferenceablePointer(LI->getPointerOperand(), DL, CtxI, DT, TLI); + return isDereferenceableAndAlignedPointer( + LI->getPointerOperand(), LI->getAlignment(), DL, CtxI, DT, TLI); } case Instruction::Call: { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { @@ -3147,16 +3446,27 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, case Instruction::Switch: case Instruction::Unreachable: case Instruction::Fence: - case Instruction::LandingPad: case Instruction::AtomicRMW: case Instruction::AtomicCmpXchg: + case Instruction::LandingPad: case Instruction::Resume: + case Instruction::CatchSwitch: + case Instruction::CatchPad: + case Instruction::CatchRet: + case Instruction::CleanupPad: + case Instruction::CleanupRet: return false; // Misc instructions which have effects } } +bool llvm::mayBeMemoryDependent(const Instruction &I) { + return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I); +} + /// Return true if we know that the specified value is never null. bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { + assert(V->getType()->isPointerTy() && "V must be pointer type"); + // Alloca never returns null, malloc might. if (isa<AllocaInst>(V)) return true; @@ -3164,9 +3474,12 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { if (const Argument *A = dyn_cast<Argument>(V)) return A->hasByValOrInAllocaAttr() || A->hasNonNullAttr(); - // Global values are not null unless extern weak. + // A global variable in address space 0 is non null unless extern weak. + // Other address spaces may have null as a valid address for a global, + // so we can't assume anything. if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - return !GV->hasExternalWeakLinkage(); + return !GV->hasExternalWeakLinkage() && + GV->getType()->getAddressSpace() == 0; // A Load tagged w/nonnull metadata is never null. if (const LoadInst *LI = dyn_cast<LoadInst>(V)) @@ -3176,16 +3489,14 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { if (CS.isReturnNonNull()) return true; - // operator new never returns null. 
- if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true))
- return true;
-
 return false;
 }
 static bool isKnownNonNullFromDominatingCondition(const Value *V,
 const Instruction *CtxI,
 const DominatorTree *DT) {
+ assert(V->getType()->isPointerTy() && "V must be pointer type");
+
 unsigned NumUsesExplored = 0;
 for (auto U : V->users()) {
 // Avoid massive lists
@@ -3316,40 +3627,339 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
 return OverflowResult::MayOverflow;
 }
-static SelectPatternFlavor matchSelectPattern(ICmpInst::Predicate Pred,
+static OverflowResult computeOverflowForSignedAdd(
+ Value *LHS, Value *RHS, AddOperator *Add, const DataLayout &DL,
+ AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) {
+ if (Add && Add->hasNoSignedWrap()) {
+ return OverflowResult::NeverOverflows;
+ }
+
+ bool LHSKnownNonNegative, LHSKnownNegative;
+ bool RHSKnownNonNegative, RHSKnownNegative;
+ ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
+ AC, CxtI, DT);
+ ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
+ AC, CxtI, DT);
+
+ if ((LHSKnownNonNegative && RHSKnownNegative) ||
+ (LHSKnownNegative && RHSKnownNonNegative)) {
+ // The sign bits are opposite: this CANNOT overflow.
+ return OverflowResult::NeverOverflows;
+ }
+
+ // The remaining code needs Add to be available. Return early if it is not.
+ if (!Add)
+ return OverflowResult::MayOverflow;
+
+ // If the sign of Add is the same as at least one of the operands, this add
+ // CANNOT overflow. This is particularly useful when the sum is
+ // @llvm.assume'ed non-negative rather than proved so from analyzing its
+ // operands.
+ bool LHSOrRHSKnownNonNegative =
+ (LHSKnownNonNegative || RHSKnownNonNegative);
+ bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative);
+ if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
+ bool AddKnownNonNegative, AddKnownNegative;
+ ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL,
+ /*Depth=*/0, AC, CxtI, DT);
+ if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) ||
+ (AddKnownNegative && LHSOrRHSKnownNegative)) {
+ return OverflowResult::NeverOverflows;
+ }
+ }
+
+ return OverflowResult::MayOverflow;
+}
+
+OverflowResult llvm::computeOverflowForSignedAdd(AddOperator *Add,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ return ::computeOverflowForSignedAdd(Add->getOperand(0), Add->getOperand(1),
+ Add, DL, AC, CxtI, DT);
+}
+
+OverflowResult llvm::computeOverflowForSignedAdd(Value *LHS, Value *RHS,
+ const DataLayout &DL,
+ AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ return ::computeOverflowForSignedAdd(LHS, RHS, nullptr, DL, AC, CxtI, DT);
+}
+
+bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
+ // FIXME: This conservative implementation can be relaxed. E.g. most
+ // atomic operations are guaranteed to terminate on most platforms
+ // and most functions terminate.
+
+ return !I->isAtomic() && // atomics may never succeed on some platforms
+ !isa<CallInst>(I) && // could throw and might not terminate
+ !isa<InvokeInst>(I) && // might not terminate and could throw to
+ // non-successor (see bug 24185 for details).
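The opposite-sign rule in computeOverflowForSignedAdd above is the classic two's-complement fact that adding a non-negative and a negative operand always stays in range, because the exact sum lies strictly between the two operands. It can be checked exhaustively at 8 bits:

#include <cassert>

int main() {
  // If the sign bits are opposite, signed addition cannot overflow:
  // the true sum lies between the operands, hence in [INT8_MIN, INT8_MAX].
  for (int A = -128; A <= 127; ++A)
    for (int B = -128; B <= 127; ++B)
      if ((A >= 0) != (B >= 0)) {
        int Sum = A + B; // exact in 'int'
        assert(Sum >= -128 && Sum <= 127);
      }
  return 0;
}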
+ !isa<ResumeInst>(I) && // has no successors + !isa<ReturnInst>(I); // has no successors +} + +bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I, + const Loop *L) { + // The loop header is guaranteed to be executed for every iteration. + // + // FIXME: Relax this constraint to cover all basic blocks that are + // guaranteed to be executed at every iteration. + if (I->getParent() != L->getHeader()) return false; + + for (const Instruction &LI : *L->getHeader()) { + if (&LI == I) return true; + if (!isGuaranteedToTransferExecutionToSuccessor(&LI)) return false; + } + llvm_unreachable("Instruction not contained in its own parent basic block."); +} + +bool llvm::propagatesFullPoison(const Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Xor: + case Instruction::Trunc: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + // These operations all propagate poison unconditionally. Note that poison + // is not any particular value, so xor or subtraction of poison with + // itself still yields poison, not zero. + return true; + + case Instruction::AShr: + case Instruction::SExt: + // For these operations, one bit of the input is replicated across + // multiple output bits. A replicated poison bit is still poison. + return true; + + case Instruction::Shl: { + // Left shift *by* a poison value is poison. The number of + // positions to shift is unsigned, so no negative values are + // possible there. Left shift by zero places preserves poison. So + // it only remains to consider left shift of poison by a positive + // number of places. + // + // A left shift by a positive number of places leaves the lowest order bit + // non-poisoned. However, if such a shift has a no-wrap flag, then we can + // make the poison operand violate that flag, yielding a fresh full-poison + // value. + auto *OBO = cast<OverflowingBinaryOperator>(I); + return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap(); + } + + case Instruction::Mul: { + // A multiplication by zero yields a non-poison zero result, so we need to + // rule out zero as an operand. Conservatively, multiplication by a + // non-zero constant is not multiplication by zero. + // + // Multiplication by a non-zero constant can leave some bits + // non-poisoned. For example, a multiplication by 2 leaves the lowest + // order bit unpoisoned. So we need to consider that. + // + // Multiplication by 1 preserves poison. If the multiplication has a + // no-wrap flag, then we can make the poison operand violate that flag + // when multiplied by any integer other than 0 and 1. + auto *OBO = cast<OverflowingBinaryOperator>(I); + if (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) { + for (Value *V : OBO->operands()) { + if (auto *CI = dyn_cast<ConstantInt>(V)) { + // A ConstantInt cannot yield poison, so we can assume that it is + // the other operand that is poison. + return !CI->isZero(); + } + } + } + return false; + } + + case Instruction::GetElementPtr: + // A GEP implicitly represents a sequence of additions, subtractions, + // truncations, sign extensions and multiplications. The multiplications + // are by the non-zero sizes of some set of types, so we do not have to be + // concerned with multiplication by zero. If the GEP is in-bounds, then + // these operations are implicitly no-signed-wrap so poison is propagated + // by the arguments above for Add, Sub, Trunc, SExt and Mul. 
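isGuaranteedToExecuteForEveryIteration above combines two facts: the loop header runs on every iteration, and execution certainly reaches I if every earlier instruction in the header is guaranteed to transfer control to its successor. The same scan over a toy instruction list (types here are illustrative):

#include <cassert>
#include <vector>

// Toy instruction: 'Transfers' plays the role of
// isGuaranteedToTransferExecutionToSuccessor.
struct Inst {
  bool Transfers;
};

// I executes on every iteration iff everything before it in the header
// is guaranteed to fall through to its successor.
static bool executesEveryIteration(const std::vector<Inst> &Header,
                                   unsigned I) {
  for (unsigned J = 0; J < I; ++J)
    if (!Header[J].Transfers)
      return false; // an earlier call/atomic may throw or never terminate
  return true;
}

int main() {
  std::vector<Inst> Header = {{true}, {false /* e.g. a call */}, {true}};
  assert(executesEveryIteration(Header, 1));  // before the call: fine
  assert(!executesEveryIteration(Header, 2)); // after the call: no guarantee
}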
+ return cast<GEPOperator>(I)->isInBounds(); + + default: + return false; + } +} + +const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) { + switch (I->getOpcode()) { + case Instruction::Store: + return cast<StoreInst>(I)->getPointerOperand(); + + case Instruction::Load: + return cast<LoadInst>(I)->getPointerOperand(); + + case Instruction::AtomicCmpXchg: + return cast<AtomicCmpXchgInst>(I)->getPointerOperand(); + + case Instruction::AtomicRMW: + return cast<AtomicRMWInst>(I)->getPointerOperand(); + + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return I->getOperand(1); + + default: + return nullptr; + } +} + +bool llvm::isKnownNotFullPoison(const Instruction *PoisonI) { + // We currently only look for uses of poison values within the same basic + // block, as that makes it easier to guarantee that the uses will be + // executed given that PoisonI is executed. + // + // FIXME: Expand this to consider uses beyond the same basic block. To do + // this, look out for the distinction between post-dominance and strong + // post-dominance. + const BasicBlock *BB = PoisonI->getParent(); + + // Set of instructions that we have proved will yield poison if PoisonI + // does. + SmallSet<const Value *, 16> YieldsPoison; + YieldsPoison.insert(PoisonI); + + for (BasicBlock::const_iterator I = PoisonI->getIterator(), E = BB->end(); + I != E; ++I) { + if (&*I != PoisonI) { + const Value *NotPoison = getGuaranteedNonFullPoisonOp(&*I); + if (NotPoison != nullptr && YieldsPoison.count(NotPoison)) return true; + if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) + return false; + } + + // Mark poison that propagates from I through uses of I. + if (YieldsPoison.count(&*I)) { + for (const User *User : I->users()) { + const Instruction *UserI = cast<Instruction>(User); + if (UserI->getParent() == BB && propagatesFullPoison(UserI)) + YieldsPoison.insert(User); + } + } + } + return false; +} + +static bool isKnownNonNaN(Value *V, FastMathFlags FMF) { + if (FMF.noNaNs()) + return true; + + if (auto *C = dyn_cast<ConstantFP>(V)) + return !C->isNaN(); + return false; +} + +static bool isKnownNonZero(Value *V) { + if (auto *C = dyn_cast<ConstantFP>(V)) + return !C->isZero(); + return false; +} + +static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred, + FastMathFlags FMF, Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal, Value *&LHS, Value *&RHS) { LHS = CmpLHS; RHS = CmpRHS; - // (icmp X, Y) ? X : Y - if (TrueVal == CmpLHS && FalseVal == CmpRHS) { - switch (Pred) { - default: return SPF_UNKNOWN; // Equality. - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMAX; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMAX; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMIN; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMIN; + // If the predicate is an "or-equal" (FP) predicate, then signed zeroes may + // return inconsistent results between implementations. + // (0.0 <= -0.0) ? 0.0 : -0.0 // Returns 0.0 + // minNum(0.0, -0.0) // May return -0.0 or 0.0 (IEEE 754-2008 5.3.1) + // Therefore we behave conservatively and only proceed if at least one of the + // operands is known to not be zero, or if we don't care about signed zeroes. 
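The signed-zero guard above can be reproduced with concrete IEEE values: since 0.0 == -0.0, an or-equal compare-and-select pins down which zero is returned, while an fmin-style operation may return either zero, so rewriting one form into the other can flip the sign of zero unless nsz holds or one operand is proven nonzero. A small demonstration:

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  double A = 0.0, B = -0.0;
  assert(A == B); // IEEE comparison treats the two zeros as equal...
  // ...so the or-equal select always takes the true arm and returns +0.0:
  double SelectMin = (A <= B) ? A : B;
  // fmin may return either zero (IEEE 754-2008 5.3.1), so turning the
  // select into a min operation could change the sign of the result.
  double LibMin = std::fmin(A, B);
  std::printf("select: sign %d, fmin: sign %d\n",
              (int)std::signbit(SelectMin), (int)std::signbit(LibMin));
}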
+ switch (Pred) { + default: break; + case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLE: + case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULE: + if (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) && + !isKnownNonZero(CmpRHS)) + return {SPF_UNKNOWN, SPNB_NA, false}; + } + + SelectPatternNaNBehavior NaNBehavior = SPNB_NA; + bool Ordered = false; + + // When given one NaN and one non-NaN input: + // - maxnum/minnum (C99 fmaxf()/fminf()) return the non-NaN input. + // - A simple C99 (a < b ? a : b) construction will return 'b' (as the + // ordered comparison fails), which could be NaN or non-NaN. + // so here we discover exactly what NaN behavior is required/accepted. + if (CmpInst::isFPPredicate(Pred)) { + bool LHSSafe = isKnownNonNaN(CmpLHS, FMF); + bool RHSSafe = isKnownNonNaN(CmpRHS, FMF); + + if (LHSSafe && RHSSafe) { + // Both operands are known non-NaN. + NaNBehavior = SPNB_RETURNS_ANY; + } else if (CmpInst::isOrdered(Pred)) { + // An ordered comparison will return false when given a NaN, so it + // returns the RHS. + Ordered = true; + if (LHSSafe) + // LHS is non-NaN, so if RHS is NaN then NaN will be returned. + NaNBehavior = SPNB_RETURNS_NAN; + else if (RHSSafe) + NaNBehavior = SPNB_RETURNS_OTHER; + else + // Completely unsafe. + return {SPF_UNKNOWN, SPNB_NA, false}; + } else { + Ordered = false; + // An unordered comparison will return true when given a NaN, so it + // returns the LHS. + if (LHSSafe) + // LHS is non-NaN, so if RHS is NaN then non-NaN will be returned. + NaNBehavior = SPNB_RETURNS_OTHER; + else if (RHSSafe) + NaNBehavior = SPNB_RETURNS_NAN; + else + // Completely unsafe. + return {SPF_UNKNOWN, SPNB_NA, false}; } } - // (icmp X, Y) ? Y : X if (TrueVal == CmpRHS && FalseVal == CmpLHS) { + std::swap(CmpLHS, CmpRHS); + Pred = CmpInst::getSwappedPredicate(Pred); + if (NaNBehavior == SPNB_RETURNS_NAN) + NaNBehavior = SPNB_RETURNS_OTHER; + else if (NaNBehavior == SPNB_RETURNS_OTHER) + NaNBehavior = SPNB_RETURNS_NAN; + Ordered = !Ordered; + } + + // ([if]cmp X, Y) ? X : Y + if (TrueVal == CmpLHS && FalseVal == CmpRHS) { switch (Pred) { - default: return SPF_UNKNOWN; // Equality. + default: return {SPF_UNKNOWN, SPNB_NA, false}; // Equality. case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_UGE: return SPF_UMIN; + case ICmpInst::ICMP_UGE: return {SPF_UMAX, SPNB_NA, false}; case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: return SPF_SMIN; + case ICmpInst::ICMP_SGE: return {SPF_SMAX, SPNB_NA, false}; case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_ULE: return SPF_UMAX; + case ICmpInst::ICMP_ULE: return {SPF_UMIN, SPNB_NA, false}; case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: return SPF_SMAX; + case ICmpInst::ICMP_SLE: return {SPF_SMIN, SPNB_NA, false}; + case FCmpInst::FCMP_UGT: + case FCmpInst::FCMP_UGE: + case FCmpInst::FCMP_OGT: + case FCmpInst::FCMP_OGE: return {SPF_FMAXNUM, NaNBehavior, Ordered}; + case FCmpInst::FCMP_ULT: + case FCmpInst::FCMP_ULE: + case FCmpInst::FCMP_OLT: + case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered}; } } @@ -3360,13 +3970,13 @@ static SelectPatternFlavor matchSelectPattern(ICmpInst::Predicate Pred, // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X if (Pred == ICmpInst::ICMP_SGT && (C1->isZero() || C1->isMinusOne())) { - return (CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS; + return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false}; } // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? 
X : -X if (Pred == ICmpInst::ICMP_SLT && (C1->isZero() || C1->isOne())) { - return (CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS; + return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false}; } } @@ -3377,24 +3987,36 @@ static SelectPatternFlavor matchSelectPattern(ICmpInst::Predicate Pred, match(CmpLHS, m_Not(m_Specific(TrueVal))))) { LHS = TrueVal; RHS = FalseVal; - return SPF_SMIN; + return {SPF_SMIN, SPNB_NA, false}; } } } // TODO: (X > 4) ? X : 5 --> (X >= 5) ? X : 5 --> MAX(X, 5) - return SPF_UNKNOWN; + return {SPF_UNKNOWN, SPNB_NA, false}; } -static Constant *lookThroughCast(ICmpInst *CmpI, Value *V1, Value *V2, - Instruction::CastOps *CastOp) { +static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2, + Instruction::CastOps *CastOp) { CastInst *CI = dyn_cast<CastInst>(V1); Constant *C = dyn_cast<Constant>(V2); - if (!CI || !C) + CastInst *CI2 = dyn_cast<CastInst>(V2); + if (!CI) return nullptr; *CastOp = CI->getOpcode(); + if (CI2) { + // If V1 and V2 are both the same cast from the same type, we can look + // through V1. + if (CI2->getOpcode() == CI->getOpcode() && + CI2->getSrcTy() == CI->getSrcTy()) + return CI2->getOperand(0); + return nullptr; + } else if (!C) { + return nullptr; + } + if (isa<SExtInst>(CI) && CmpI->isSigned()) { Constant *T = ConstantExpr::getTrunc(C, CI->getSrcTy()); // This is only valid if the truncated value can be sign-extended @@ -3409,39 +4031,200 @@ static Constant *lookThroughCast(ICmpInst *CmpI, Value *V1, Value *V2, if (isa<TruncInst>(CI)) return ConstantExpr::getIntegerCast(C, CI->getSrcTy(), CmpI->isSigned()); + if (isa<FPToUIInst>(CI)) + return ConstantExpr::getUIToFP(C, CI->getSrcTy(), true); + + if (isa<FPToSIInst>(CI)) + return ConstantExpr::getSIToFP(C, CI->getSrcTy(), true); + + if (isa<UIToFPInst>(CI)) + return ConstantExpr::getFPToUI(C, CI->getSrcTy(), true); + + if (isa<SIToFPInst>(CI)) + return ConstantExpr::getFPToSI(C, CI->getSrcTy(), true); + + if (isa<FPTruncInst>(CI)) + return ConstantExpr::getFPExtend(C, CI->getSrcTy(), true); + + if (isa<FPExtInst>(CI)) + return ConstantExpr::getFPTrunc(C, CI->getSrcTy(), true); + return nullptr; } -SelectPatternFlavor llvm::matchSelectPattern(Value *V, +SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp) { SelectInst *SI = dyn_cast<SelectInst>(V); - if (!SI) return SPF_UNKNOWN; + if (!SI) return {SPF_UNKNOWN, SPNB_NA, false}; - ICmpInst *CmpI = dyn_cast<ICmpInst>(SI->getCondition()); - if (!CmpI) return SPF_UNKNOWN; + CmpInst *CmpI = dyn_cast<CmpInst>(SI->getCondition()); + if (!CmpI) return {SPF_UNKNOWN, SPNB_NA, false}; - ICmpInst::Predicate Pred = CmpI->getPredicate(); + CmpInst::Predicate Pred = CmpI->getPredicate(); Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); + FastMathFlags FMF; + if (isa<FPMathOperator>(CmpI)) + FMF = CmpI->getFastMathFlags(); // Bail out early. if (CmpI->isEquality()) - return SPF_UNKNOWN; + return {SPF_UNKNOWN, SPNB_NA, false}; // Deal with type mismatches. 
if (CastOp && CmpLHS->getType() != TrueVal->getType()) { - if (Constant *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp)) - return ::matchSelectPattern(Pred, CmpLHS, CmpRHS, + if (Value *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp)) + return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, cast<CastInst>(TrueVal)->getOperand(0), C, LHS, RHS); - if (Constant *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) - return ::matchSelectPattern(Pred, CmpLHS, CmpRHS, + if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) + return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, C, cast<CastInst>(FalseVal)->getOperand(0), LHS, RHS); } - return ::matchSelectPattern(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, + return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS); } + +ConstantRange llvm::getConstantRangeFromMetadata(MDNode &Ranges) { + const unsigned NumRanges = Ranges.getNumOperands() / 2; + assert(NumRanges >= 1 && "Must have at least one range!"); + assert(Ranges.getNumOperands() % 2 == 0 && "Must be a sequence of pairs"); + + auto *FirstLow = mdconst::extract<ConstantInt>(Ranges.getOperand(0)); + auto *FirstHigh = mdconst::extract<ConstantInt>(Ranges.getOperand(1)); + + ConstantRange CR(FirstLow->getValue(), FirstHigh->getValue()); + + for (unsigned i = 1; i < NumRanges; ++i) { + auto *Low = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0)); + auto *High = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1)); + + // Note: unionWith will potentially create a range that contains values not + // contained in any of the original N ranges. + CR = CR.unionWith(ConstantRange(Low->getValue(), High->getValue())); + } + + return CR; +} + +/// Return true if "icmp Pred LHS RHS" is always true. +static bool isTruePredicate(CmpInst::Predicate Pred, Value *LHS, Value *RHS, + const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!"); + if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS) + return true; + + switch (Pred) { + default: + return false; + + case CmpInst::ICMP_SLE: { + const APInt *C; + + // LHS s<= LHS +_{nsw} C if C >= 0 + if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C)))) + return !C->isNegative(); + return false; + } + + case CmpInst::ICMP_ULE: { + const APInt *C; + + // LHS u<= LHS +_{nuw} C for any C + if (match(RHS, m_NUWAdd(m_Specific(LHS), m_APInt(C)))) + return true; + + // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB) + auto MatchNUWAddsToSameValue = [&](Value *A, Value *B, Value *&X, + const APInt *&CA, const APInt *&CB) { + if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) && + match(B, m_NUWAdd(m_Specific(X), m_APInt(CB)))) + return true; + + // If X & C == 0 then (X | C) == X +_{nuw} C + if (match(A, m_Or(m_Value(X), m_APInt(CA))) && + match(B, m_Or(m_Specific(X), m_APInt(CB)))) { + unsigned BitWidth = CA->getBitWidth(); + APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); + computeKnownBits(X, KnownZero, KnownOne, DL, Depth + 1, AC, CxtI, DT); + + if ((KnownZero & *CA) == *CA && (KnownZero & *CB) == *CB) + return true; + } + + return false; + }; + + Value *X; + const APInt *CLHS, *CRHS; + if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS)) + return CLHS->ule(*CRHS); + + return false; + } + } +} + +/// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred +/// ALHS ARHS" is true. 
+static bool isImpliedCondOperands(CmpInst::Predicate Pred, Value *ALHS, + Value *ARHS, Value *BLHS, Value *BRHS, + const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT) { + switch (Pred) { + default: + return false; + + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SLE: + return isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth, AC, CxtI, + DT) && + isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth, AC, CxtI, + DT); + + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + return isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth, AC, CxtI, + DT) && + isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth, AC, CxtI, + DT); + } +} + +bool llvm::isImpliedCondition(Value *LHS, Value *RHS, const DataLayout &DL, + unsigned Depth, AssumptionCache *AC, + const Instruction *CxtI, + const DominatorTree *DT) { + assert(LHS->getType() == RHS->getType() && "mismatched type"); + Type *OpTy = LHS->getType(); + assert(OpTy->getScalarType()->isIntegerTy(1)); + + // LHS ==> RHS by definition + if (LHS == RHS) return true; + + if (OpTy->isVectorTy()) + // TODO: extending the code below to handle vectors + return false; + assert(OpTy->isIntegerTy(1) && "implied by above"); + + ICmpInst::Predicate APred, BPred; + Value *ALHS, *ARHS; + Value *BLHS, *BRHS; + + if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS))) || + !match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS)))) + return false; + + if (APred == BPred) + return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth, AC, + CxtI, DT); + + return false; +} diff --git a/contrib/llvm/lib/Analysis/VectorUtils.cpp b/contrib/llvm/lib/Analysis/VectorUtils.cpp index 8c671ef..4b244ec 100644 --- a/contrib/llvm/lib/Analysis/VectorUtils.cpp +++ b/contrib/llvm/lib/Analysis/VectorUtils.cpp @@ -11,13 +11,20 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" +#include "llvm/IR/Constants.h" + +using namespace llvm; +using namespace llvm::PatternMatch; /// \brief Identify if the intrinsic is trivially vectorizable. /// This method returns true if the intrinsic's argument types are all @@ -79,7 +86,7 @@ bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, /// d) call should only reads memory. /// If all these condition is met then return ValidIntrinsicID /// else return not_intrinsic. -llvm::Intrinsic::ID +Intrinsic::ID llvm::checkUnaryFloatSignature(const CallInst &I, Intrinsic::ID ValidIntrinsicID) { if (I.getNumArgOperands() != 1 || @@ -98,7 +105,7 @@ llvm::checkUnaryFloatSignature(const CallInst &I, /// d) call should only reads memory. /// If all these condition is met then return ValidIntrinsicID /// else return not_intrinsic. -llvm::Intrinsic::ID +Intrinsic::ID llvm::checkBinaryFloatSignature(const CallInst &I, Intrinsic::ID ValidIntrinsicID) { if (I.getNumArgOperands() != 2 || @@ -114,8 +121,8 @@ llvm::checkBinaryFloatSignature(const CallInst &I, /// \brief Returns intrinsic ID for call. /// For the input call instruction it finds mapping intrinsic and returns /// its ID, in case it does not found it return not_intrinsic. 
-llvm::Intrinsic::ID llvm::getIntrinsicIDForCall(CallInst *CI, - const TargetLibraryInfo *TLI) { +Intrinsic::ID llvm::getIntrinsicIDForCall(CallInst *CI, + const TargetLibraryInfo *TLI) { // If we have an intrinsic call, check if it is trivially vectorizable. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { Intrinsic::ID ID = II->getIntrinsicID(); @@ -228,8 +235,7 @@ unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) { cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); // Walk backwards and try to peel off zeros. - while (LastOperand > 1 && - match(Gep->getOperand(LastOperand), llvm::PatternMatch::m_Zero())) { + while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { // Find the type we're currently indexing into. gep_type_iterator GEPTI = gep_type_begin(Gep); std::advance(GEPTI, LastOperand - 1); @@ -247,8 +253,7 @@ unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) { /// \brief If the argument is a GEP, then returns the operand identified by /// getGEPInductionOperand. However, if there is some other non-loop-invariant /// operand, it returns that instead. -llvm::Value *llvm::stripGetElementPtr(llvm::Value *Ptr, ScalarEvolution *SE, - Loop *Lp) { +Value *llvm::stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); if (!GEP) return Ptr; @@ -265,8 +270,8 @@ llvm::Value *llvm::stripGetElementPtr(llvm::Value *Ptr, ScalarEvolution *SE, } /// \brief If a value has only one user that is a CastInst, return it. -llvm::Value *llvm::getUniqueCastUse(llvm::Value *Ptr, Loop *Lp, Type *Ty) { - llvm::Value *UniqueCast = nullptr; +Value *llvm::getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { + Value *UniqueCast = nullptr; for (User *U : Ptr->users()) { CastInst *CI = dyn_cast<CastInst>(U); if (CI && CI->getType() == Ty) { @@ -281,16 +286,15 @@ llvm::Value *llvm::getUniqueCastUse(llvm::Value *Ptr, Loop *Lp, Type *Ty) { /// \brief Get the stride of a pointer access in a loop. Looks for symbolic /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise. -llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, - Loop *Lp) { - const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); +Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { + auto *PtrTy = dyn_cast<PointerType>(Ptr->getType()); if (!PtrTy || PtrTy->isAggregateType()) return nullptr; // Try to remove a gep instruction to make the pointer (actually index at this // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the // pointer, otherwise, we are analyzing the index. - llvm::Value *OrigPtr = Ptr; + Value *OrigPtr = Ptr; // The size of the pointer access. int64_t PtrAccessSize = 1; @@ -320,8 +324,7 @@ llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, if (M->getOperand(0)->getSCEVType() != scConstant) return nullptr; - const APInt &APStepVal = - cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue(); + const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt(); // Huge step value - give up. 
if (APStepVal.getBitWidth() > 64) @@ -346,7 +349,7 @@ llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, if (!U) return nullptr; - llvm::Value *Stride = U->getValue(); + Value *Stride = U->getValue(); if (!Lp->isLoopInvariant(Stride)) return nullptr; @@ -361,7 +364,7 @@ llvm::Value *llvm::getStrideFromPointer(llvm::Value *Ptr, ScalarEvolution *SE, /// \brief Given a vector and an element number, see if the scalar value is /// already around as a register, for example if it were inserted then extracted /// from the vector. -llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) { +Value *llvm::findScalarElement(Value *V, unsigned EltNo) { assert(V->getType()->isVectorTy() && "Not looking at a vector?"); VectorType *VTy = cast<VectorType>(V->getType()); unsigned Width = VTy->getNumElements(); @@ -399,14 +402,166 @@ llvm::Value *llvm::findScalarElement(llvm::Value *V, unsigned EltNo) { // Extract a value from a vector add operation with a constant zero. Value *Val = nullptr; Constant *Con = nullptr; - if (match(V, - llvm::PatternMatch::m_Add(llvm::PatternMatch::m_Value(Val), - llvm::PatternMatch::m_Constant(Con)))) { + if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) if (Constant *Elt = Con->getAggregateElement(EltNo)) if (Elt->isNullValue()) return findScalarElement(Val, EltNo); - } // Otherwise, we don't know. return nullptr; } + +/// \brief Get splat value if the input is a splat vector or return nullptr. +/// This function is not fully general. It checks only 2 cases: +/// the input value is (1) a splat constants vector or (2) a sequence +/// of instructions that broadcast a single value into a vector. +/// +const llvm::Value *llvm::getSplatValue(const Value *V) { + + if (auto *C = dyn_cast<Constant>(V)) + if (isa<VectorType>(V->getType())) + return C->getSplatValue(); + + auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V); + if (!ShuffleInst) + return nullptr; + // All-zero (or undef) shuffle mask elements. + for (int MaskElt : ShuffleInst->getShuffleMask()) + if (MaskElt != 0 && MaskElt != -1) + return nullptr; + // The first shuffle source is 'insertelement' with index 0. + auto *InsertEltInst = + dyn_cast<InsertElementInst>(ShuffleInst->getOperand(0)); + if (!InsertEltInst || !isa<ConstantInt>(InsertEltInst->getOperand(2)) || + !cast<ConstantInt>(InsertEltInst->getOperand(2))->isNullValue()) + return nullptr; + + return InsertEltInst->getOperand(1); +} + +MapVector<Instruction *, uint64_t> +llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB, + const TargetTransformInfo *TTI) { + + // DemandedBits will give us every value's live-out bits. But we want + // to ensure no extra casts would need to be inserted, so every DAG + // of connected values must have the same minimum bitwidth. + EquivalenceClasses<Value *> ECs; + SmallVector<Value *, 16> Worklist; + SmallPtrSet<Value *, 4> Roots; + SmallPtrSet<Value *, 16> Visited; + DenseMap<Value *, uint64_t> DBits; + SmallPtrSet<Instruction *, 4> InstructionSet; + MapVector<Instruction *, uint64_t> MinBWs; + + // Determine the roots. We work bottom-up, from truncs or icmps. + bool SeenExtFromIllegalType = false; + for (auto *BB : Blocks) + for (auto &I : *BB) { + InstructionSet.insert(&I); + + if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) && + !TTI->isTypeLegal(I.getOperand(0)->getType())) + SeenExtFromIllegalType = true; + + // Only deal with non-vector integers up to 64-bits wide. 
+ if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) && + !I.getType()->isVectorTy() && + I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) { + // Don't make work for ourselves. If we know the loaded type is legal, + // don't add it to the worklist. + if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType())) + continue; + + Worklist.push_back(&I); + Roots.insert(&I); + } + } + // Early exit. + if (Worklist.empty() || (TTI && !SeenExtFromIllegalType)) + return MinBWs; + + // Now proceed breadth-first, unioning values together. + while (!Worklist.empty()) { + Value *Val = Worklist.pop_back_val(); + Value *Leader = ECs.getOrInsertLeaderValue(Val); + + if (Visited.count(Val)) + continue; + Visited.insert(Val); + + // Non-instructions terminate a chain successfully. + if (!isa<Instruction>(Val)) + continue; + Instruction *I = cast<Instruction>(Val); + + // If we encounter a type that is larger than 64 bits, we can't represent + // it so bail out. + if (DB.getDemandedBits(I).getBitWidth() > 64) + return MapVector<Instruction *, uint64_t>(); + + uint64_t V = DB.getDemandedBits(I).getZExtValue(); + DBits[Leader] |= V; + + // Casts, loads and instructions outside of our range terminate a chain + // successfully. + if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) || + !InstructionSet.count(I)) + continue; + + // Unsafe casts terminate a chain unsuccessfully. We can't do anything + // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to + // transform anything that relies on them. + if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) || + !I->getType()->isIntegerTy()) { + DBits[Leader] |= ~0ULL; + continue; + } + + // We don't modify the types of PHIs. Reductions will already have been + // truncated if possible, and inductions' sizes will have been chosen by + // indvars. + if (isa<PHINode>(I)) + continue; + + if (DBits[Leader] == ~0ULL) + // All bits demanded, no point continuing. + continue; + + for (Value *O : cast<User>(I)->operands()) { + ECs.unionSets(Leader, O); + Worklist.push_back(O); + } + } + + // Now we've discovered all values, walk them to see if there are + // any users we didn't see. If there are, we can't optimize that + // chain. 
+ for (auto &I : DBits) + for (auto *U : I.first->users()) + if (U->getType()->isIntegerTy() && DBits.count(U) == 0) + DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL; + + for (auto I = ECs.begin(), E = ECs.end(); I != E; ++I) { + uint64_t LeaderDemandedBits = 0; + for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) + LeaderDemandedBits |= DBits[*MI]; + + uint64_t MinBW = (sizeof(LeaderDemandedBits) * 8) - + llvm::countLeadingZeros(LeaderDemandedBits); + // Round up to a power of 2 + if (!isPowerOf2_64((uint64_t)MinBW)) + MinBW = NextPowerOf2(MinBW); + for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) { + if (!isa<Instruction>(*MI)) + continue; + Type *Ty = (*MI)->getType(); + if (Roots.count(*MI)) + Ty = cast<Instruction>(*MI)->getOperand(0)->getType(); + if (MinBW < Ty->getScalarSizeInBits()) + MinBWs[cast<Instruction>(*MI)] = MinBW; + } + } + + return MinBWs; +} diff --git a/contrib/llvm/lib/AsmParser/LLLexer.cpp b/contrib/llvm/lib/AsmParser/LLLexer.cpp index 5c4bab7..26eca23 100644 --- a/contrib/llvm/lib/AsmParser/LLLexer.cpp +++ b/contrib/llvm/lib/AsmParser/LLLexer.cpp @@ -105,7 +105,7 @@ void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End, Pair[1] += hexDigitValue(*Buffer); } Pair[0] = 0; - for (int i=0; i<16; i++, Buffer++) { + for (int i = 0; i < 16 && Buffer != End; i++, Buffer++) { Pair[0] *= 16; Pair[0] += hexDigitValue(*Buffer); } @@ -523,9 +523,14 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(zeroinitializer); KEYWORD(undef); KEYWORD(null); + KEYWORD(none); KEYWORD(to); + KEYWORD(caller); + KEYWORD(within); + KEYWORD(from); KEYWORD(tail); KEYWORD(musttail); + KEYWORD(notail); KEYWORD(target); KEYWORD(triple); KEYWORD(unwind); @@ -586,6 +591,10 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(preserve_mostcc); KEYWORD(preserve_allcc); KEYWORD(ghccc); + KEYWORD(x86_intrcc); + KEYWORD(hhvmcc); + KEYWORD(hhvm_ccc); + KEYWORD(cxx_fast_tlscc); KEYWORD(cc); KEYWORD(c); @@ -601,6 +610,8 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(convergent); KEYWORD(dereferenceable); KEYWORD(dereferenceable_or_null); + KEYWORD(inaccessiblememonly); + KEYWORD(inaccessiblemem_or_argmemonly); KEYWORD(inlinehint); KEYWORD(inreg); KEYWORD(jumptable); @@ -613,6 +624,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(noduplicate); KEYWORD(noimplicitfloat); KEYWORD(noinline); + KEYWORD(norecurse); KEYWORD(nonlazybind); KEYWORD(nonnull); KEYWORD(noredzone); @@ -690,6 +702,7 @@ lltok::Kind LLLexer::LexIdentifier() { TYPEKEYWORD("label", Type::getLabelTy(Context)); TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); + TYPEKEYWORD("token", Type::getTokenTy(Context)); #undef TYPEKEYWORD // Keywords for instructions. 
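The "round up to a power of 2" step in computeMinimumValueSizes, in the VectorUtils hunk above, is easy to check by hand. Here is a minimal standalone sketch of that arithmetic; the helpers are plain-C++ stand-ins for llvm::countLeadingZeros and llvm::NextPowerOf2 (so the snippet compiles without LLVM headers), and the demanded-bits masks are made up for illustration.

    // Standalone model of the MinBW computation in computeMinimumValueSizes.
    #include <cassert>
    #include <cstdint>

    // Index of the highest demanded bit, plus one. Equivalent to
    // 64 - countLeadingZeros(Demanded) for a 64-bit mask.
    static unsigned bitsNeeded(uint64_t Demanded) {
      unsigned MinBW = 0;
      while (Demanded) {
        Demanded >>= 1;
        ++MinBW;
      }
      return MinBW;
    }

    // Round up to a power of two; widths that are already powers of two
    // are left alone, matching the isPowerOf2_64 guard in the pass.
    static unsigned roundUpToPow2(unsigned MinBW) {
      unsigned BW = 1;
      while (BW < MinBW)
        BW <<= 1;
      return BW;
    }

    int main() {
      // A chain whose members demand only the low five bits (mask 0x1F)
      // is recorded as needing an 8-bit type, not a 5-bit one.
      assert(bitsNeeded(0x1F) == 5);
      assert(roundUpToPow2(bitsNeeded(0x1F)) == 8);
      // A leader marked ~0ULL (all bits demanded) cannot be narrowed.
      assert(roundUpToPow2(bitsNeeded(~0ULL)) == 64);
      return 0;
    }

The net effect is the same as the pass's MinBW/NextPowerOf2 sequence: non-power-of-two widths round up, exact powers of two pass through unchanged.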
@@ -749,6 +762,11 @@ lltok::Kind LLLexer::LexIdentifier() { INSTKEYWORD(extractvalue, ExtractValue); INSTKEYWORD(insertvalue, InsertValue); INSTKEYWORD(landingpad, LandingPad); + INSTKEYWORD(cleanupret, CleanupRet); + INSTKEYWORD(catchret, CatchRet); + INSTKEYWORD(catchswitch, CatchSwitch); + INSTKEYWORD(catchpad, CatchPad); + INSTKEYWORD(cleanuppad, CleanupPad); #undef INSTKEYWORD #define DWKEYWORD(TYPE, TOKEN) \ @@ -763,6 +781,7 @@ lltok::Kind LLLexer::LexIdentifier() { DWKEYWORD(VIRTUALITY, DwarfVirtuality); DWKEYWORD(LANG, DwarfLang); DWKEYWORD(OP, DwarfOp); + DWKEYWORD(MACINFO, DwarfMacinfo); #undef DWKEYWORD if (Keyword.startswith("DIFlag")) { diff --git a/contrib/llvm/lib/AsmParser/LLParser.cpp b/contrib/llvm/lib/AsmParser/LLParser.cpp index 1c6e7bd..3471a2d 100644 --- a/contrib/llvm/lib/AsmParser/LLParser.cpp +++ b/contrib/llvm/lib/AsmParser/LLParser.cpp @@ -13,6 +13,7 @@ #include "LLParser.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/AsmParser/SlotMapping.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/CallingConv.h" @@ -48,6 +49,32 @@ bool LLParser::Run() { ValidateEndOfModule(); } +bool LLParser::parseStandaloneConstantValue(Constant *&C, + const SlotMapping *Slots) { + restoreParsingState(Slots); + Lex.Lex(); + + Type *Ty = nullptr; + if (ParseType(Ty) || parseConstantValue(Ty, C)) + return true; + if (Lex.getKind() != lltok::Eof) + return Error(Lex.getLoc(), "expected end of string"); + return false; +} + +void LLParser::restoreParsingState(const SlotMapping *Slots) { + if (!Slots) + return; + NumberedVals = Slots->GlobalValues; + NumberedMetadata = Slots->MetadataNodes; + for (const auto &I : Slots->NamedTypes) + NamedTypes.insert( + std::make_pair(I.getKey(), std::make_pair(I.second, LocTy()))); + for (const auto &I : Slots->Types) + NumberedTypes.insert( + std::make_pair(I.first, std::make_pair(I.second, LocTy()))); +} + /// ValidateEndOfModule - Do final validity and sanity checks at the end of the /// module. bool LLParser::ValidateEndOfModule() { @@ -158,7 +185,7 @@ bool LLParser::ValidateEndOfModule() { // Look for intrinsic functions and CallInst that need to be upgraded for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ) - UpgradeCallsToIntrinsic(FI++); // must be post-increment, as we remove + UpgradeCallsToIntrinsic(&*FI++); // must be post-increment, as we remove UpgradeDebugInfo(*M); @@ -169,6 +196,10 @@ bool LLParser::ValidateEndOfModule() { // the mapping from LLParser as it doesn't need it anymore. 
Slots->GlobalValues = std::move(NumberedVals); Slots->MetadataNodes = std::move(NumberedMetadata); + for (const auto &I : NamedTypes) + Slots->NamedTypes.insert(std::make_pair(I.getKey(), I.second.first)); + for (const auto &I : NumberedTypes) + Slots->Types.insert(std::make_pair(I.first, I.second.first)); return false; } @@ -647,6 +678,12 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L, return Error(NameLoc, "symbol with local linkage must have default visibility"); + Type *Ty; + LocTy ExplicitTypeLoc = Lex.getLoc(); + if (ParseType(Ty) || + ParseToken(lltok::comma, "expected comma after alias's type")) + return true; + Constant *Aliasee; LocTy AliaseeLoc = Lex.getLoc(); if (Lex.getKind() != lltok::kw_bitcast && @@ -669,11 +706,35 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L, auto *PTy = dyn_cast<PointerType>(AliaseeType); if (!PTy) return Error(AliaseeLoc, "An alias must have pointer type"); + unsigned AddrSpace = PTy->getAddressSpace(); + + if (Ty != PTy->getElementType()) + return Error( + ExplicitTypeLoc, + "explicit pointee type doesn't match operand's pointee type"); + + GlobalValue *GVal = nullptr; + + // See if the alias was forward referenced, if so, prepare to replace the + // forward reference. + if (!Name.empty()) { + GVal = M->getNamedValue(Name); + if (GVal) { + if (!ForwardRefVals.erase(Name)) + return Error(NameLoc, "redefinition of global '@" + Name + "'"); + } + } else { + auto I = ForwardRefValIDs.find(NumberedVals.size()); + if (I != ForwardRefValIDs.end()) { + GVal = I->second.first; + ForwardRefValIDs.erase(I); + } + } // Okay, create the alias but do not insert it into the module yet. std::unique_ptr<GlobalAlias> GA( - GlobalAlias::create(PTy, (GlobalValue::LinkageTypes)Linkage, Name, - Aliasee, /*Parent*/ nullptr)); + GlobalAlias::create(Ty, AddrSpace, (GlobalValue::LinkageTypes)Linkage, + Name, Aliasee, /*Parent*/ nullptr)); GA->setThreadLocalMode(TLM); GA->setVisibility((GlobalValue::VisibilityTypes)Visibility); GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass); @@ -682,27 +743,17 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L, if (Name.empty()) NumberedVals.push_back(GA.get()); - // See if this value already exists in the symbol table. If so, it is either - // a redefinition or a definition of a forward reference. - if (GlobalValue *Val = M->getNamedValue(Name)) { - // See if this was a redefinition. If so, there is no entry in - // ForwardRefVals. - std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefVals.find(Name); - if (I == ForwardRefVals.end()) - return Error(NameLoc, "redefinition of global named '@" + Name + "'"); - - // Otherwise, this was a definition of forward ref. Verify that types - // agree. - if (Val->getType() != GA->getType()) - return Error(NameLoc, - "forward reference and definition of alias have different types"); + if (GVal) { + // Verify that types agree. + if (GVal->getType() != GA->getType()) + return Error( + ExplicitTypeLoc, + "forward reference and definition of alias have different types"); // If they agree, just RAUW the old value with the alias and remove the // forward ref info. - Val->replaceAllUsesWith(GA.get()); - Val->eraseFromParent(); - ForwardRefVals.erase(I); + GVal->replaceAllUsesWith(GA.get()); + GVal->eraseFromParent(); } // Insert into the module, we know its name won't collide now. 
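ParseAlias now reads an explicit pointee type and hands it, together with the aliasee's address space, straight to GlobalAlias::create. A minimal sketch of that creation path follows, using the same overload the parser calls in the hunk above; the module contents and the names "g" and "g_alias" are hypothetical.

    // Creating an alias with an explicit value type and address space,
    // mirroring the GlobalAlias::create call in ParseAlias above.
    #include "llvm/IR/GlobalAlias.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    static GlobalAlias *makeAlias(Module &M) {
      Type *Int32Ty = Type::getInt32Ty(M.getContext());

      // The aliasee: an ordinary i32 global in address space 0.
      auto *G = new GlobalVariable(M, Int32Ty, /*isConstant=*/false,
                                   GlobalValue::ExternalLinkage,
                                   /*Initializer=*/nullptr, "g");

      // As in the parser: Ty must equal the aliasee's pointee type, and
      // the address space is taken from the aliasee's pointer type.
      return GlobalAlias::create(Int32Ty, /*AddrSpace=*/0,
                                 GlobalValue::ExternalLinkage, "g_alias",
                                 G, &M);
    }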
@@ -767,12 +818,11 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, if (!Name.empty()) { GVal = M->getNamedValue(Name); if (GVal) { - if (!ForwardRefVals.erase(Name) || !isa<GlobalValue>(GVal)) + if (!ForwardRefVals.erase(Name)) return Error(NameLoc, "redefinition of global '@" + Name + "'"); } } else { - std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefValIDs.find(NumberedVals.size()); + auto I = ForwardRefValIDs.find(NumberedVals.size()); if (I != ForwardRefValIDs.end()) { GVal = I->second.first; ForwardRefValIDs.erase(I); @@ -903,14 +953,8 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, } // Target-dependent attributes: case lltok::StringConstant: { - std::string Attr = Lex.getStrVal(); - Lex.Lex(); - std::string Val; - if (EatIfPresent(lltok::equal) && - ParseStringConstant(Val)) + if (ParseStringAttribute(B)) return true; - - B.addAttribute(Attr, Val); continue; } @@ -951,6 +995,10 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break; case lltok::kw_cold: B.addAttribute(Attribute::Cold); break; case lltok::kw_convergent: B.addAttribute(Attribute::Convergent); break; + case lltok::kw_inaccessiblememonly: + B.addAttribute(Attribute::InaccessibleMemOnly); break; + case lltok::kw_inaccessiblemem_or_argmemonly: + B.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); break; case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break; case lltok::kw_jumptable: B.addAttribute(Attribute::JumpTable); break; case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break; @@ -963,6 +1011,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break; case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break; case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break; + case lltok::kw_norecurse: B.addAttribute(Attribute::NoRecurse); break; case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break; case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break; case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break; @@ -1015,6 +1064,17 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, // GlobalValue Reference/Resolution Routines. //===----------------------------------------------------------------------===// +static inline GlobalValue *createGlobalFwdRef(Module *M, PointerType *PTy, + const std::string &Name) { + if (auto *FT = dyn_cast<FunctionType>(PTy->getElementType())) + return Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M); + else + return new GlobalVariable(*M, PTy->getElementType(), false, + GlobalValue::ExternalWeakLinkage, nullptr, Name, + nullptr, GlobalVariable::NotThreadLocal, + PTy->getAddressSpace()); +} + /// GetGlobalVal - Get a value with the specified name or ID, creating a /// forward reference record if needed. This can return null if the value /// exists but does not have the right type. @@ -1033,8 +1093,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, // If this is a forward reference for the value, see if we already created a // forward ref record. 
if (!Val) { - std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefVals.find(Name); + auto I = ForwardRefVals.find(Name); if (I != ForwardRefVals.end()) Val = I->second.first; } @@ -1048,15 +1107,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, } // Otherwise, create a new forward reference for this value and remember it. - GlobalValue *FwdVal; - if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType())) - FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M); - else - FwdVal = new GlobalVariable(*M, PTy->getElementType(), false, - GlobalValue::ExternalWeakLinkage, nullptr, Name, - nullptr, GlobalVariable::NotThreadLocal, - PTy->getAddressSpace()); - + GlobalValue *FwdVal = createGlobalFwdRef(M, PTy, Name); ForwardRefVals[Name] = std::make_pair(FwdVal, Loc); return FwdVal; } @@ -1073,8 +1124,7 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { // If this is a forward reference for the value, see if we already created a // forward ref record. if (!Val) { - std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator - I = ForwardRefValIDs.find(ID); + auto I = ForwardRefValIDs.find(ID); if (I != ForwardRefValIDs.end()) Val = I->second.first; } @@ -1088,13 +1138,7 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { } // Otherwise, create a new forward reference for this value and remember it. - GlobalValue *FwdVal; - if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType())) - FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, "", M); - else - FwdVal = new GlobalVariable(*M, PTy->getElementType(), false, - GlobalValue::ExternalWeakLinkage, nullptr, ""); - + GlobalValue *FwdVal = createGlobalFwdRef(M, PTy, ""); ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc); return FwdVal; } @@ -1217,6 +1261,19 @@ bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) { ParseToken(lltok::rparen, "expected ')' in address space"); } +/// ParseStringAttribute +/// := StringConstant +/// := StringConstant '=' StringConstant +bool LLParser::ParseStringAttribute(AttrBuilder &B) { + std::string Attr = Lex.getStrVal(); + Lex.Lex(); + std::string Val; + if (EatIfPresent(lltok::equal) && ParseStringConstant(Val)) + return true; + B.addAttribute(Attr, Val); + return false; +} + /// ParseOptionalParamAttrs - Parse a potentially empty list of parameter attributes. bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { bool HaveError = false; @@ -1228,6 +1285,11 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { switch (Token) { default: // End of attributes. return HaveError; + case lltok::StringConstant: { + if (ParseStringAttribute(B)) + return true; + continue; + } case lltok::kw_align: { unsigned Alignment; if (ParseOptionalAlignment(Alignment)) @@ -1309,6 +1371,11 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { switch (Token) { default: // End of attributes. 
return HaveError; + case lltok::StringConstant: { + if (ParseStringAttribute(B)) + return true; + continue; + } case lltok::kw_dereferenceable: { uint64_t Bytes; if (ParseOptionalDerefAttrBytes(lltok::kw_dereferenceable, Bytes)) @@ -1323,6 +1390,13 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { B.addDereferenceableOrNullAttr(Bytes); continue; } + case lltok::kw_align: { + unsigned Alignment; + if (ParseOptionalAlignment(Alignment)) + return true; + B.addAlignmentAttr(Alignment); + continue; + } case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break; case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break; case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break; @@ -1330,7 +1404,6 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break; // Error handling. - case lltok::kw_align: case lltok::kw_byval: case lltok::kw_inalloca: case lltok::kw_nest: @@ -1473,6 +1546,10 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'preserve_mostcc' /// ::= 'preserve_allcc' /// ::= 'ghccc' +/// ::= 'x86_intrcc' +/// ::= 'hhvmcc' +/// ::= 'hhvm_ccc' +/// ::= 'cxx_fast_tlscc' /// ::= 'cc' UINT /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { @@ -1501,6 +1578,10 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break; case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break; case lltok::kw_ghccc: CC = CallingConv::GHC; break; + case lltok::kw_x86_intrcc: CC = CallingConv::X86_INTR; break; + case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break; + case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break; + case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break; case lltok::kw_cc: { Lex.Lex(); return ParseUInt32(CC); @@ -1883,7 +1964,59 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, return false; } +/// ParseOptionalOperandBundles +/// ::= /*empty*/ +/// ::= '[' OperandBundle [, OperandBundle ]* ']' +/// +/// OperandBundle +/// ::= bundle-tag '(' ')' +/// ::= bundle-tag '(' Type Value [, Type Value ]* ')' +/// +/// bundle-tag ::= String Constant +bool LLParser::ParseOptionalOperandBundles( + SmallVectorImpl<OperandBundleDef> &BundleList, PerFunctionState &PFS) { + LocTy BeginLoc = Lex.getLoc(); + if (!EatIfPresent(lltok::lsquare)) + return false; + + while (Lex.getKind() != lltok::rsquare) { + // If this isn't the first operand bundle, we need a comma. + if (!BundleList.empty() && + ParseToken(lltok::comma, "expected ',' in input list")) + return true; + std::string Tag; + if (ParseStringConstant(Tag)) + return true; + + if (ParseToken(lltok::lparen, "expected '(' in operand bundle")) + return true; + + std::vector<Value *> Inputs; + while (Lex.getKind() != lltok::rparen) { + // If this isn't the first input, we need a comma. + if (!Inputs.empty() && + ParseToken(lltok::comma, "expected ',' in input list")) + return true; + + Type *Ty = nullptr; + Value *Input = nullptr; + if (ParseType(Ty) || ParseValue(Ty, Input, PFS)) + return true; + Inputs.push_back(Input); + } + + BundleList.emplace_back(std::move(Tag), std::move(Inputs)); + + Lex.Lex(); // Lex the ')'. + } + + if (BundleList.empty()) + return Error(BeginLoc, "operand bundle set must not be empty"); + + Lex.Lex(); // Lex the ']'. + return false; +} /// ParseArgumentList - Parse the argument list for a function type or function /// prototype. 
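The operand-bundle grammar above corresponds to call-site IR such as: call void @f(i32 %x) [ "deopt"(i32 %x) ]. A rough sketch of the C++ side follows: each bracketed group becomes one OperandBundleDef, built the same way as the parser's emplace_back(Tag, Inputs), and is passed to the call-site constructor. Treat the CallInst::Create overload taking a bundle list as an assumption; it is inferred from the InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args, BundleList) call that appears later in this diff. Callee and Arg are placeholder values.

    // Building a call site that carries one operand bundle, mirroring
    // what ParseOptionalOperandBundles produces.
    #include <string>
    #include <vector>
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/InstrTypes.h"   // OperandBundleDef
    #include "llvm/IR/Instructions.h" // CallInst

    using namespace llvm;

    static CallInst *callWithBundle(Value *Callee, Value *Arg) {
      // One bundle: tag "deopt", single input Arg; the same shape the
      // parser builds with BundleList.emplace_back(Tag, Inputs).
      std::vector<Value *> Inputs;
      Inputs.push_back(Arg);
      SmallVector<OperandBundleDef, 1> Bundles;
      Bundles.emplace_back(std::string("deopt"), std::move(Inputs));

      // Assumed overload (see note above): Create(Callee, Args, Bundles).
      return CallInst::Create(Callee, {Arg}, Bundles);
    }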
@@ -2146,31 +2279,29 @@ LLParser::PerFunctionState::PerFunctionState(LLParser &p, Function &f, : P(p), F(f), FunctionNumber(functionNumber) { // Insert unnamed arguments into the NumberedVals list. - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); - AI != E; ++AI) - if (!AI->hasName()) - NumberedVals.push_back(AI); + for (Argument &A : F.args()) + if (!A.hasName()) + NumberedVals.push_back(&A); } LLParser::PerFunctionState::~PerFunctionState() { // If there were any forward referenced non-basicblock values, delete them. - for (std::map<std::string, std::pair<Value*, LocTy> >::iterator - I = ForwardRefVals.begin(), E = ForwardRefVals.end(); I != E; ++I) - if (!isa<BasicBlock>(I->second.first)) { - I->second.first->replaceAllUsesWith( - UndefValue::get(I->second.first->getType())); - delete I->second.first; - I->second.first = nullptr; - } - for (std::map<unsigned, std::pair<Value*, LocTy> >::iterator - I = ForwardRefValIDs.begin(), E = ForwardRefValIDs.end(); I != E; ++I) - if (!isa<BasicBlock>(I->second.first)) { - I->second.first->replaceAllUsesWith( - UndefValue::get(I->second.first->getType())); - delete I->second.first; - I->second.first = nullptr; - } + for (const auto &P : ForwardRefVals) { + if (isa<BasicBlock>(P.second.first)) + continue; + P.second.first->replaceAllUsesWith( + UndefValue::get(P.second.first->getType())); + delete P.second.first; + } + + for (const auto &P : ForwardRefValIDs) { + if (isa<BasicBlock>(P.second.first)) + continue; + P.second.first->replaceAllUsesWith( + UndefValue::get(P.second.first->getType())); + delete P.second.first; + } } bool LLParser::PerFunctionState::FinishFunction() { @@ -2189,16 +2320,15 @@ bool LLParser::PerFunctionState::FinishFunction() { /// GetVal - Get a value with the specified name or ID, creating a /// forward reference record if needed. This can return null if the value /// exists but does not have the right type. -Value *LLParser::PerFunctionState::GetVal(const std::string &Name, - Type *Ty, LocTy Loc) { +Value *LLParser::PerFunctionState::GetVal(const std::string &Name, Type *Ty, + LocTy Loc) { // Look this name up in the normal function symbol table. Value *Val = F.getValueSymbolTable().lookup(Name); // If this is a forward reference for the value, see if we already created a // forward ref record. if (!Val) { - std::map<std::string, std::pair<Value*, LocTy> >::iterator - I = ForwardRefVals.find(Name); + auto I = ForwardRefVals.find(Name); if (I != ForwardRefVals.end()) Val = I->second.first; } @@ -2222,25 +2352,24 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name, // Otherwise, create a new forward reference for this value and remember it. Value *FwdVal; - if (Ty->isLabelTy()) + if (Ty->isLabelTy()) { FwdVal = BasicBlock::Create(F.getContext(), Name, &F); - else + } else { FwdVal = new Argument(Ty, Name); + } ForwardRefVals[Name] = std::make_pair(FwdVal, Loc); return FwdVal; } -Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, - LocTy Loc) { +Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, LocTy Loc) { // Look this name up in the normal function symbol table. Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : nullptr; // If this is a forward reference for the value, see if we already created a // forward ref record. 
if (!Val) { - std::map<unsigned, std::pair<Value*, LocTy> >::iterator - I = ForwardRefValIDs.find(ID); + auto I = ForwardRefValIDs.find(ID); if (I != ForwardRefValIDs.end()) Val = I->second.first; } @@ -2263,10 +2392,11 @@ Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, // Otherwise, create a new forward reference for this value and remember it. Value *FwdVal; - if (Ty->isLabelTy()) + if (Ty->isLabelTy()) { FwdVal = BasicBlock::Create(F.getContext(), "", &F); - else + } else { FwdVal = new Argument(Ty); + } ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc); return FwdVal; @@ -2295,14 +2425,15 @@ bool LLParser::PerFunctionState::SetInstName(int NameID, return P.Error(NameLoc, "instruction expected to be numbered '%" + Twine(NumberedVals.size()) + "'"); - std::map<unsigned, std::pair<Value*, LocTy> >::iterator FI = - ForwardRefValIDs.find(NameID); + auto FI = ForwardRefValIDs.find(NameID); if (FI != ForwardRefValIDs.end()) { - if (FI->second.first->getType() != Inst->getType()) + Value *Sentinel = FI->second.first; + if (Sentinel->getType() != Inst->getType()) return P.Error(NameLoc, "instruction forward referenced with type '" + getTypeString(FI->second.first->getType()) + "'"); - FI->second.first->replaceAllUsesWith(Inst); - delete FI->second.first; + + Sentinel->replaceAllUsesWith(Inst); + delete Sentinel; ForwardRefValIDs.erase(FI); } @@ -2311,14 +2442,15 @@ bool LLParser::PerFunctionState::SetInstName(int NameID, } // Otherwise, the instruction had a name. Resolve forward refs and set it. - std::map<std::string, std::pair<Value*, LocTy> >::iterator - FI = ForwardRefVals.find(NameStr); + auto FI = ForwardRefVals.find(NameStr); if (FI != ForwardRefVals.end()) { - if (FI->second.first->getType() != Inst->getType()) + Value *Sentinel = FI->second.first; + if (Sentinel->getType() != Inst->getType()) return P.Error(NameLoc, "instruction forward referenced with type '" + getTypeString(FI->second.first->getType()) + "'"); - FI->second.first->replaceAllUsesWith(Inst); - delete FI->second.first; + + Sentinel->replaceAllUsesWith(Inst); + delete Sentinel; ForwardRefVals.erase(FI); } @@ -2421,6 +2553,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { case lltok::kw_null: ID.Kind = ValID::t_Null; break; case lltok::kw_undef: ID.Kind = ValID::t_Undef; break; case lltok::kw_zeroinitializer: ID.Kind = ValID::t_Zero; break; + case lltok::kw_none: ID.Kind = ValID::t_None; break; case lltok::lbrace: { // ValID ::= '{' ConstVector '}' @@ -2430,9 +2563,10 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { ParseToken(lltok::rbrace, "expected end of struct constant")) return true; - ID.ConstantStructElts = new Constant*[Elts.size()]; + ID.ConstantStructElts = make_unique<Constant *[]>(Elts.size()); ID.UIntVal = Elts.size(); - memcpy(ID.ConstantStructElts, Elts.data(), Elts.size()*sizeof(Elts[0])); + memcpy(ID.ConstantStructElts.get(), Elts.data(), + Elts.size() * sizeof(Elts[0])); ID.Kind = ValID::t_ConstantStruct; return false; } @@ -2451,8 +2585,9 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { return true; if (isPackedStruct) { - ID.ConstantStructElts = new Constant*[Elts.size()]; - memcpy(ID.ConstantStructElts, Elts.data(), Elts.size()*sizeof(Elts[0])); + ID.ConstantStructElts = make_unique<Constant *[]>(Elts.size()); + memcpy(ID.ConstantStructElts.get(), Elts.data(), + Elts.size() * sizeof(Elts[0])); ID.UIntVal = Elts.size(); ID.Kind = ValID::t_PackedConstantStruct; return false; @@ -2891,7 +3026,7 @@ bool LLParser::ParseValID(ValID &ID, 
PerFunctionState *PFS) { } } - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; if (!Indices.empty() && !Ty->isSized(&Visited)) return Error(ID.Loc, "base element of getelementptr must be sized"); @@ -3066,6 +3201,11 @@ struct DwarfTagField : public MDUnsignedField { DwarfTagField(dwarf::Tag DefaultTag) : MDUnsignedField(DefaultTag, dwarf::DW_TAG_hi_user) {} }; +struct DwarfMacinfoTypeField : public MDUnsignedField { + DwarfMacinfoTypeField() : MDUnsignedField(0, dwarf::DW_MACINFO_vendor_ext) {} + DwarfMacinfoTypeField(dwarf::MacinfoRecordType DefaultType) + : MDUnsignedField(DefaultType, dwarf::DW_MACINFO_vendor_ext) {} +}; struct DwarfAttEncodingField : public MDUnsignedField { DwarfAttEncodingField() : MDUnsignedField(0, dwarf::DW_ATE_hi_user) {} }; @@ -3159,6 +3299,26 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfTagField &Result) { template <> bool LLParser::ParseMDField(LocTy Loc, StringRef Name, + DwarfMacinfoTypeField &Result) { + if (Lex.getKind() == lltok::APSInt) + return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); + + if (Lex.getKind() != lltok::DwarfMacinfo) + return TokError("expected DWARF macinfo type"); + + unsigned Macinfo = dwarf::getMacinfo(Lex.getStrVal()); + if (Macinfo == dwarf::DW_MACINFO_invalid) + return TokError( + "invalid DWARF macinfo type" + Twine(" '") + Lex.getStrVal() + "'"); + assert(Macinfo <= Result.Max && "Expected valid DWARF macinfo type"); + + Result.assign(Macinfo); + Lex.Lex(); + return false; +} + +template <> +bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfVirtualityField &Result) { if (Lex.getKind() == lltok::APSInt) return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result)); @@ -3569,8 +3729,11 @@ bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) { /// isOptimized: true, flags: "-O2", runtimeVersion: 1, /// splitDebugFilename: "abc.debug", emissionKind: 1, /// enums: !1, retainedTypes: !2, subprograms: !3, -/// globals: !4, imports: !5, dwoId: 0x0abcd) +/// globals: !4, imports: !5, macros: !6, dwoId: 0x0abcd) bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { + if (!IsDistinct) + return Lex.Error("missing 'distinct', required for !DICompileUnit"); + #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ REQUIRED(language, DwarfLangField, ); \ REQUIRED(file, MDField, (/* AllowNull */ false)); \ @@ -3585,16 +3748,16 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { OPTIONAL(subprograms, MDField, ); \ OPTIONAL(globals, MDField, ); \ OPTIONAL(imports, MDField, ); \ + OPTIONAL(macros, MDField, ); \ OPTIONAL(dwoId, MDUnsignedField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DICompileUnit, - (Context, language.Val, file.Val, producer.Val, - isOptimized.Val, flags.Val, runtimeVersion.Val, - splitDebugFilename.Val, emissionKind.Val, enums.Val, - retainedTypes.Val, subprograms.Val, globals.Val, - imports.Val, dwoId.Val)); + Result = DICompileUnit::getDistinct( + Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val, + runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val, + retainedTypes.Val, subprograms.Val, globals.Val, imports.Val, macros.Val, + dwoId.Val); return false; } @@ -3604,9 +3767,10 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) { /// isDefinition: true, scopeLine: 8, containingType: !3, /// virtuality: DW_VIRTUALTIY_pure_virtual, /// virtualIndex: 10, flags: 11, -/// isOptimized: false, function: void ()* @_Z3foov, -/// 
templateParams: !4, declaration: !5, variables: !6) +/// isOptimized: false, templateParams: !4, declaration: !5, +/// variables: !6) bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) { + auto Loc = Lex.getLoc(); #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ OPTIONAL(scope, MDField, ); \ OPTIONAL(name, MDStringField, ); \ @@ -3622,19 +3786,23 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) { OPTIONAL(virtualIndex, MDUnsignedField, (0, UINT32_MAX)); \ OPTIONAL(flags, DIFlagField, ); \ OPTIONAL(isOptimized, MDBoolField, ); \ - OPTIONAL(function, MDConstant, ); \ OPTIONAL(templateParams, MDField, ); \ OPTIONAL(declaration, MDField, ); \ OPTIONAL(variables, MDField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS + if (isDefinition.Val && !IsDistinct) + return Lex.Error( + Loc, + "missing 'distinct', required for !DISubprogram when 'isDefinition'"); + Result = GET_OR_DISTINCT( - DISubprogram, (Context, scope.Val, name.Val, linkageName.Val, file.Val, - line.Val, type.Val, isLocal.Val, isDefinition.Val, - scopeLine.Val, containingType.Val, virtuality.Val, - virtualIndex.Val, flags.Val, isOptimized.Val, function.Val, - templateParams.Val, declaration.Val, variables.Val)); + DISubprogram, + (Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val, + type.Val, isLocal.Val, isDefinition.Val, scopeLine.Val, + containingType.Val, virtuality.Val, virtualIndex.Val, flags.Val, + isOptimized.Val, templateParams.Val, declaration.Val, variables.Val)); return false; } @@ -3685,6 +3853,39 @@ bool LLParser::ParseDINamespace(MDNode *&Result, bool IsDistinct) { return false; } +/// ParseDIMacro: +/// ::= !DIMacro(macinfo: type, line: 9, name: "SomeMacro", value: "SomeValue") +bool LLParser::ParseDIMacro(MDNode *&Result, bool IsDistinct) { +#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + REQUIRED(type, DwarfMacinfoTypeField, ); \ + REQUIRED(line, LineField, ); \ + REQUIRED(name, MDStringField, ); \ + OPTIONAL(value, MDStringField, ); + PARSE_MD_FIELDS(); +#undef VISIT_MD_FIELDS + + Result = GET_OR_DISTINCT(DIMacro, + (Context, type.Val, line.Val, name.Val, value.Val)); + return false; +} + +/// ParseDIMacroFile: +/// ::= !DIMacroFile(line: 9, file: !2, nodes: !3) +bool LLParser::ParseDIMacroFile(MDNode *&Result, bool IsDistinct) { +#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + OPTIONAL(type, DwarfMacinfoTypeField, (dwarf::DW_MACINFO_start_file)); \ + REQUIRED(line, LineField, ); \ + REQUIRED(file, MDField, ); \ + OPTIONAL(nodes, MDField, ); + PARSE_MD_FIELDS(); +#undef VISIT_MD_FIELDS + + Result = GET_OR_DISTINCT(DIMacroFile, + (Context, type.Val, line.Val, file.Val, nodes.Val)); + return false; +} + + /// ParseDIModule: /// ::= !DIModule(scope: !0, name: "SomeModule", configMacros: "-DNDEBUG", /// includePath: "/usr/include", isysroot: "/") @@ -3762,24 +3963,25 @@ bool LLParser::ParseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { } /// ParseDILocalVariable: -/// ::= !DILocalVariable(tag: DW_TAG_arg_variable, scope: !0, name: "foo", +/// ::= !DILocalVariable(arg: 7, scope: !0, name: "foo", +/// file: !1, line: 7, type: !2, arg: 2, flags: 7) +/// ::= !DILocalVariable(scope: !0, name: "foo", /// file: !1, line: 7, type: !2, arg: 2, flags: 7) bool LLParser::ParseDILocalVariable(MDNode *&Result, bool IsDistinct) { #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ - REQUIRED(tag, DwarfTagField, ); \ REQUIRED(scope, MDField, (/* AllowNull */ false)); \ OPTIONAL(name, MDStringField, ); \ + OPTIONAL(arg, MDUnsignedField, (0, UINT16_MAX)); \ OPTIONAL(file, MDField, ); \ 
OPTIONAL(line, LineField, ); \ OPTIONAL(type, MDField, ); \ - OPTIONAL(arg, MDUnsignedField, (0, UINT16_MAX)); \ OPTIONAL(flags, DIFlagField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS Result = GET_OR_DISTINCT(DILocalVariable, - (Context, tag.Val, scope.Val, name.Val, file.Val, - line.Val, type.Val, arg.Val, flags.Val)); + (Context, scope.Val, name.Val, file.Val, line.Val, + type.Val, arg.Val, flags.Val)); return false; } @@ -3969,13 +4171,11 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, V = PFS->GetVal(ID.StrVal, Ty, ID.Loc); return V == nullptr; case ValID::t_InlineAsm: { - PointerType *PTy = dyn_cast<PointerType>(Ty); - FunctionType *FTy = - PTy ? dyn_cast<FunctionType>(PTy->getElementType()) : nullptr; - if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2)) + if (!ID.FTy || !InlineAsm::Verify(ID.FTy, ID.StrVal2)) return Error(ID.Loc, "invalid type for inline asm constraint string"); - V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1, - (ID.UIntVal>>1)&1, (InlineAsm::AsmDialect(ID.UIntVal>>2))); + V = InlineAsm::get(ID.FTy, ID.StrVal, ID.StrVal2, ID.UIntVal & 1, + (ID.UIntVal >> 1) & 1, + (InlineAsm::AsmDialect(ID.UIntVal >> 2))); return false; } case ValID::t_GlobalName: @@ -4035,6 +4235,11 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, return Error(ID.Loc, "invalid type for null constant"); V = Constant::getNullValue(Ty); return false; + case ValID::t_None: + if (!Ty->isTokenTy()) + return Error(ID.Loc, "invalid type for none constant"); + V = Constant::getNullValue(Ty); + return false; case ValID::t_Constant: if (ID.ConstantVal->getType() != Ty) return Error(ID.Loc, "constant expression type mismatch"); @@ -4056,8 +4261,8 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, return Error(ID.Loc, "element " + Twine(i) + " of struct initializer doesn't match struct element type"); - V = ConstantStruct::get(ST, makeArrayRef(ID.ConstantStructElts, - ID.UIntVal)); + V = ConstantStruct::get( + ST, makeArrayRef(ID.ConstantStructElts.get(), ID.UIntVal)); } else return Error(ID.Loc, "constant expression type mismatch"); return false; @@ -4065,11 +4270,35 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, llvm_unreachable("Invalid ValID"); } +bool LLParser::parseConstantValue(Type *Ty, Constant *&C) { + C = nullptr; + ValID ID; + auto Loc = Lex.getLoc(); + if (ParseValID(ID, /*PFS=*/nullptr)) + return true; + switch (ID.Kind) { + case ValID::t_APSInt: + case ValID::t_APFloat: + case ValID::t_Undef: + case ValID::t_Constant: + case ValID::t_ConstantStruct: + case ValID::t_PackedConstantStruct: { + Value *V; + if (ConvertValIDToValue(Ty, ID, V, /*PFS=*/nullptr)) + return true; + assert(isa<Constant>(V) && "Expected a constant value"); + C = cast<Constant>(V); + return false; + } + default: + return Error(Loc, "expected a constant value"); + } +} + bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) { V = nullptr; ValID ID; - return ParseValID(ID, PFS) || - ConvertValIDToValue(Ty, ID, V, PFS); + return ParseValID(ID, PFS) || ConvertValIDToValue(Ty, ID, V, PFS); } bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState *PFS) { @@ -4242,8 +4471,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { if (!FunctionName.empty()) { // If this was a definition of a forward reference, remove the definition // from the forward reference table and fill in the forward ref. 
- std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator FRVI = - ForwardRefVals.find(FunctionName); + auto FRVI = ForwardRefVals.find(FunctionName); if (FRVI != ForwardRefVals.end()) { Fn = M->getFunction(FunctionName); if (!Fn) @@ -4265,8 +4493,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { } else { // If this is a definition of a forward referenced function, make sure the // types agree. - std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator I - = ForwardRefValIDs.find(NumberedVals.size()); + auto I = ForwardRefValIDs.find(NumberedVals.size()); if (I != ForwardRefValIDs.end()) { Fn = cast<Function>(I->second.first); if (Fn->getType() != PFT) @@ -4498,6 +4725,11 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_indirectbr: return ParseIndirectBr(Inst, PFS); case lltok::kw_invoke: return ParseInvoke(Inst, PFS); case lltok::kw_resume: return ParseResume(Inst, PFS); + case lltok::kw_cleanupret: return ParseCleanupRet(Inst, PFS); + case lltok::kw_catchret: return ParseCatchRet(Inst, PFS); + case lltok::kw_catchswitch: return ParseCatchSwitch(Inst, PFS); + case lltok::kw_catchpad: return ParseCatchPad(Inst, PFS); + case lltok::kw_cleanuppad: return ParseCleanupPad(Inst, PFS); // Binary Operators. case lltok::kw_add: case lltok::kw_sub: @@ -4580,6 +4812,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_call: return ParseCall(Inst, PFS, CallInst::TCK_None); case lltok::kw_tail: return ParseCall(Inst, PFS, CallInst::TCK_Tail); case lltok::kw_musttail: return ParseCall(Inst, PFS, CallInst::TCK_MustTail); + case lltok::kw_notail: return ParseCall(Inst, PFS, CallInst::TCK_NoTail); // Memory. case lltok::kw_alloca: return ParseAlloc(Inst, PFS); case lltok::kw_load: return ParseLoad(Inst, PFS); @@ -4798,15 +5031,15 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { LocTy RetTypeLoc; ValID CalleeID; SmallVector<ParamInfo, 16> ArgList; + SmallVector<OperandBundleDef, 2> BundleList; BasicBlock *NormalBB, *UnwindBB; - if (ParseOptionalCallingConv(CC) || - ParseOptionalReturnAttrs(RetAttrs) || + if (ParseOptionalCallingConv(CC) || ParseOptionalReturnAttrs(RetAttrs) || ParseType(RetType, RetTypeLoc, true /*void allowed*/) || - ParseValID(CalleeID) || - ParseParameterList(ArgList, PFS) || + ParseValID(CalleeID) || ParseParameterList(ArgList, PFS) || ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, NoBuiltinLoc) || + ParseOptionalOperandBundles(BundleList, PFS) || ParseToken(lltok::kw_to, "expected 'to' in invoke") || ParseTypeAndBasicBlock(NormalBB, PFS) || ParseToken(lltok::kw_unwind, "expected 'unwind' in invoke") || @@ -4829,6 +5062,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { Ty = FunctionType::get(RetType, ParamTypes, false); } + CalleeID.FTy = Ty; + // Look up the callee. 
Value *Callee; if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS)) @@ -4880,7 +5115,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { // Finish off the Attribute and check them AttributeSet PAL = AttributeSet::get(Context, Attrs); - InvokeInst *II = InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args); + InvokeInst *II = + InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args, BundleList); II->setCallingConv(CC); II->setAttributes(PAL); ForwardRefAttrGroups[II] = FwdRefAttrGrps; @@ -4900,6 +5136,183 @@ bool LLParser::ParseResume(Instruction *&Inst, PerFunctionState &PFS) { return false; } +bool LLParser::ParseExceptionArgs(SmallVectorImpl<Value *> &Args, + PerFunctionState &PFS) { + if (ParseToken(lltok::lsquare, "expected '[' in catchpad/cleanuppad")) + return true; + + while (Lex.getKind() != lltok::rsquare) { + // If this isn't the first argument, we need a comma. + if (!Args.empty() && + ParseToken(lltok::comma, "expected ',' in argument list")) + return true; + + // Parse the argument. + LocTy ArgLoc; + Type *ArgTy = nullptr; + if (ParseType(ArgTy, ArgLoc)) + return true; + + Value *V; + if (ArgTy->isMetadataTy()) { + if (ParseMetadataAsValue(V, PFS)) + return true; + } else { + if (ParseValue(ArgTy, V, PFS)) + return true; + } + Args.push_back(V); + } + + Lex.Lex(); // Lex the ']'. + return false; +} + +/// ParseCleanupRet +/// ::= 'cleanupret' from Value unwind ('to' 'caller' | TypeAndValue) +bool LLParser::ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS) { + Value *CleanupPad = nullptr; + + if (ParseToken(lltok::kw_from, "expected 'from' after cleanupret")) + return true; + + if (ParseValue(Type::getTokenTy(Context), CleanupPad, PFS)) + return true; + + if (ParseToken(lltok::kw_unwind, "expected 'unwind' in cleanupret")) + return true; + + BasicBlock *UnwindBB = nullptr; + if (Lex.getKind() == lltok::kw_to) { + Lex.Lex(); + if (ParseToken(lltok::kw_caller, "expected 'caller' in cleanupret")) + return true; + } else { + if (ParseTypeAndBasicBlock(UnwindBB, PFS)) { + return true; + } + } + + Inst = CleanupReturnInst::Create(CleanupPad, UnwindBB); + return false; +} + +/// ParseCatchRet +/// ::= 'catchret' from Parent Value 'to' TypeAndValue +bool LLParser::ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS) { + Value *CatchPad = nullptr; + + if (ParseToken(lltok::kw_from, "expected 'from' after catchret")) + return true; + + if (ParseValue(Type::getTokenTy(Context), CatchPad, PFS)) + return true; + + BasicBlock *BB; + if (ParseToken(lltok::kw_to, "expected 'to' in catchret") || + ParseTypeAndBasicBlock(BB, PFS)) + return true; + + Inst = CatchReturnInst::Create(CatchPad, BB); + return false; +} + +/// ParseCatchSwitch +/// ::= 'catchswitch' within Parent +bool LLParser::ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS) { + Value *ParentPad; + LocTy BBLoc; + + if (ParseToken(lltok::kw_within, "expected 'within' after catchswitch")) + return true; + + if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar && + Lex.getKind() != lltok::LocalVarID) + return TokError("expected scope value for catchswitch"); + + if (ParseValue(Type::getTokenTy(Context), ParentPad, PFS)) + return true; + + if (ParseToken(lltok::lsquare, "expected '[' with catchswitch labels")) + return true; + + SmallVector<BasicBlock *, 32> Table; + do { + BasicBlock *DestBB; + if (ParseTypeAndBasicBlock(DestBB, PFS)) + return true; + Table.push_back(DestBB); + } while (EatIfPresent(lltok::comma)); + + if 
(ParseToken(lltok::rsquare, "expected ']' after catchswitch labels")) + return true; + + if (ParseToken(lltok::kw_unwind, + "expected 'unwind' after catchswitch scope")) + return true; + + BasicBlock *UnwindBB = nullptr; + if (EatIfPresent(lltok::kw_to)) { + if (ParseToken(lltok::kw_caller, "expected 'caller' in catchswitch")) + return true; + } else { + if (ParseTypeAndBasicBlock(UnwindBB, PFS)) + return true; + } + + auto *CatchSwitch = + CatchSwitchInst::Create(ParentPad, UnwindBB, Table.size()); + for (BasicBlock *DestBB : Table) + CatchSwitch->addHandler(DestBB); + Inst = CatchSwitch; + return false; +} + +/// ParseCatchPad +/// ::= 'catchpad' ParamList 'to' TypeAndValue 'unwind' TypeAndValue +bool LLParser::ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS) { + Value *CatchSwitch = nullptr; + + if (ParseToken(lltok::kw_within, "expected 'within' after catchpad")) + return true; + + if (Lex.getKind() != lltok::LocalVar && Lex.getKind() != lltok::LocalVarID) + return TokError("expected scope value for catchpad"); + + if (ParseValue(Type::getTokenTy(Context), CatchSwitch, PFS)) + return true; + + SmallVector<Value *, 8> Args; + if (ParseExceptionArgs(Args, PFS)) + return true; + + Inst = CatchPadInst::Create(CatchSwitch, Args); + return false; +} + +/// ParseCleanupPad +/// ::= 'cleanuppad' within Parent ParamList +bool LLParser::ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS) { + Value *ParentPad = nullptr; + + if (ParseToken(lltok::kw_within, "expected 'within' after cleanuppad")) + return true; + + if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar && + Lex.getKind() != lltok::LocalVarID) + return TokError("expected scope value for cleanuppad"); + + if (ParseValue(Type::getTokenTy(Context), ParentPad, PFS)) + return true; + + SmallVector<Value *, 8> Args; + if (ParseExceptionArgs(Args, PFS)) + return true; + + Inst = CleanupPadInst::Create(ParentPad, Args); + return false; +} + //===----------------------------------------------------------------------===// // Binary Operators. 
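For orientation, the textual IR accepted by the five new exception-handling parsers above looks roughly like this; a minimal illustrative sketch, not an excerpt from the patch (the labels, %cs/%cp/%clp, @_ZTIi and %slot are invented here, and the catchpad/cleanuppad operand lists are free-form per ParseExceptionArgs):

    dispatch:
      %cs = catchswitch within none [label %handler] unwind to caller
    handler:
      %cp = catchpad within %cs [i8** @_ZTIi, i32 0, i8** %slot]   ; args are (Type Value)*
      catchret from %cp to label %cont
    cleanup:
      %clp = cleanuppad within none []
      cleanupret from %clp unwind to caller

Each pad produces a token-typed value (Type::getTokenTy above), which is why 'within' must be followed by either 'none' or a plain SSA name.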
//===----------------------------------------------------------------------===// @@ -5196,12 +5609,14 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { } /// ParseCall -/// ::= 'call' OptionalCallingConv OptionalAttrs Type Value -/// ParameterList OptionalAttrs -/// ::= 'tail' 'call' OptionalCallingConv OptionalAttrs Type Value -/// ParameterList OptionalAttrs -/// ::= 'musttail' 'call' OptionalCallingConv OptionalAttrs Type Value -/// ParameterList OptionalAttrs +/// ::= 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs +/// ::= 'tail' 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs +/// ::= 'musttail' 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs +/// ::= 'notail' 'call' OptionalFastMathFlags OptionalCallingConv +/// OptionalAttrs Type Value ParameterList OptionalAttrs bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, CallInst::TailCallKind TCK) { AttrBuilder RetAttrs, FnAttrs; @@ -5212,20 +5627,29 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, LocTy RetTypeLoc; ValID CalleeID; SmallVector<ParamInfo, 16> ArgList; + SmallVector<OperandBundleDef, 2> BundleList; LocTy CallLoc = Lex.getLoc(); - if ((TCK != CallInst::TCK_None && - ParseToken(lltok::kw_call, "expected 'tail call'")) || - ParseOptionalCallingConv(CC) || - ParseOptionalReturnAttrs(RetAttrs) || + if (TCK != CallInst::TCK_None && + ParseToken(lltok::kw_call, + "expected 'tail call', 'musttail call', or 'notail call'")) + return true; + + FastMathFlags FMF = EatFastMathFlagsIfPresent(); + + if (ParseOptionalCallingConv(CC) || ParseOptionalReturnAttrs(RetAttrs) || ParseType(RetType, RetTypeLoc, true /*void allowed*/) || ParseValID(CalleeID) || ParseParameterList(ArgList, PFS, TCK == CallInst::TCK_MustTail, PFS.getFunction().isVarArg()) || - ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, - BuiltinLoc)) + ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, BuiltinLoc) || + ParseOptionalOperandBundles(BundleList, PFS)) return true; + if (FMF.any() && !RetType->isFPOrFPVectorTy()) + return Error(CallLoc, "fast-math-flags specified for call without " + "floating-point scalar or vector return type"); + // If RetType is a non-function pointer type, then this is the short syntax // for the call, which means that RetType is just the return type. Infer the // rest of the function argument types from the arguments that are present. @@ -5242,6 +5666,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, Ty = FunctionType::get(RetType, ParamTypes, false); } + CalleeID.FTy = Ty; + // Look up the callee. 
Value *Callee; if (ConvertValIDToValue(PointerType::getUnqual(Ty), CalleeID, Callee, &PFS)) @@ -5293,9 +5719,11 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, // Finish off the Attribute and check them AttributeSet PAL = AttributeSet::get(Context, Attrs); - CallInst *CI = CallInst::Create(Ty, Callee, Args); + CallInst *CI = CallInst::Create(Ty, Callee, Args, BundleList); CI->setTailCallKind(TCK); CI->setCallingConv(CC); + if (FMF.any()) + CI->setFastMathFlags(FMF); CI->setAttributes(PAL); ForwardRefAttrGroups[CI] = FwdRefAttrGrps; Inst = CI; @@ -5614,7 +6042,7 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) { Indices.push_back(Val); } - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; if (!Indices.empty() && !Ty->isSized(&Visited)) return Error(Loc, "base element of getelementptr must be sized"); diff --git a/contrib/llvm/lib/AsmParser/LLParser.h b/contrib/llvm/lib/AsmParser/LLParser.h index 6e57b3e..f61a5e5 100644 --- a/contrib/llvm/lib/AsmParser/LLParser.h +++ b/contrib/llvm/lib/AsmParser/LLParser.h @@ -46,29 +46,32 @@ namespace llvm { /// or a symbolic (%var) reference. This is just a discriminated union. struct ValID { enum { - t_LocalID, t_GlobalID, // ID in UIntVal. - t_LocalName, t_GlobalName, // Name in StrVal. - t_APSInt, t_APFloat, // Value in APSIntVal/APFloatVal. - t_Null, t_Undef, t_Zero, // No value. - t_EmptyArray, // No value: [] - t_Constant, // Value in ConstantVal. - t_InlineAsm, // Value in StrVal/StrVal2/UIntVal. - t_ConstantStruct, // Value in ConstantStructElts. - t_PackedConstantStruct // Value in ConstantStructElts. - } Kind; + t_LocalID, t_GlobalID, // ID in UIntVal. + t_LocalName, t_GlobalName, // Name in StrVal. + t_APSInt, t_APFloat, // Value in APSIntVal/APFloatVal. + t_Null, t_Undef, t_Zero, t_None, // No value. + t_EmptyArray, // No value: [] + t_Constant, // Value in ConstantVal. + t_InlineAsm, // Value in FTy/StrVal/StrVal2/UIntVal. + t_ConstantStruct, // Value in ConstantStructElts. + t_PackedConstantStruct // Value in ConstantStructElts. + } Kind = t_LocalID; LLLexer::LocTy Loc; unsigned UIntVal; + FunctionType *FTy = nullptr; std::string StrVal, StrVal2; APSInt APSIntVal; - APFloat APFloatVal; + APFloat APFloatVal{0.0}; Constant *ConstantVal; - Constant **ConstantStructElts; - - ValID() : Kind(t_LocalID), APFloatVal(0.0) {} - ~ValID() { - if (Kind == t_ConstantStruct || Kind == t_PackedConstantStruct) - delete [] ConstantStructElts; + std::unique_ptr<Constant *[]> ConstantStructElts; + + ValID() = default; + ValID(const ValID &RHS) + : Kind(RHS.Kind), Loc(RHS.Loc), UIntVal(RHS.UIntVal), FTy(RHS.FTy), + StrVal(RHS.StrVal), StrVal2(RHS.StrVal2), APSIntVal(RHS.APSIntVal), + APFloatVal(RHS.APFloatVal), ConstantVal(RHS.ConstantVal) { + assert(!RHS.ConstantStructElts); } bool operator<(const ValID &RHS) const { @@ -143,6 +146,8 @@ namespace llvm { Slots(Slots), BlockAddressPFS(nullptr) {} bool Run(); + bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots); + LLVMContext &getContext() { return Context; } private: @@ -154,6 +159,10 @@ namespace llvm { return Error(Lex.getLoc(), Msg); } + /// Restore the internal name and slot mappings using the mappings that + /// were created at an earlier parsing stage. + void restoreParsingState(const SlotMapping *Slots); + /// GetGlobalVal - Get a value with the specified name or ID, creating a /// forward reference record if needed. This can return null if the value /// exists but does not have the right type. 
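To make the revised call/invoke grammar concrete, here is an illustrative sketch of forms the parser now accepts (@callee, @dot, @g, @h and the bundle operands are invented for the example; 'fast' is only accepted because the return type is floating-point, per the check in ParseCall):

    notail call void @callee()                      ; new TCK_NoTail marker
    %r = call fast float @dot(float %a, float %b)   ; fast-math flags on a call
    call void @g() [ "deopt"(i32 13, i64 %x) ]      ; operand bundle after attributes
    invoke void @h() [ "funclet"(token %cp) ]
            to label %normal unwind label %lpad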
@@ -210,6 +219,8 @@ namespace llvm { return ParseUInt64(Val); } + bool ParseStringAttribute(AttrBuilder &B); + bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); bool parseOptionalUnnamedAddr(bool &UnnamedAddr) { @@ -343,10 +354,12 @@ namespace llvm { bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, PerFunctionState *PFS); + bool parseConstantValue(Type *Ty, Constant *&C); bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS); bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) { return ParseValue(Ty, V, &PFS); } + bool ParseValue(Type *Ty, Value *&V, LocTy &Loc, PerFunctionState &PFS) { Loc = Lex.getLoc(); @@ -381,6 +394,13 @@ namespace llvm { bool IsMustTailCall = false, bool InVarArgsFunc = false); + bool + ParseOptionalOperandBundles(SmallVectorImpl<OperandBundleDef> &BundleList, + PerFunctionState &PFS); + + bool ParseExceptionArgs(SmallVectorImpl<Value *> &Args, + PerFunctionState &PFS); + // Constant Parsing. bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr); bool ParseGlobalValue(Type *Ty, Constant *&V); @@ -441,6 +461,11 @@ namespace llvm { bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS); bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS); bool ParseResume(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); bool ParseArithmetic(Instruction *&I, PerFunctionState &PFS, unsigned Opc, unsigned OperandType); diff --git a/contrib/llvm/lib/AsmParser/LLToken.h b/contrib/llvm/lib/AsmParser/LLToken.h index 691f085..29a7f16 100644 --- a/contrib/llvm/lib/AsmParser/LLToken.h +++ b/contrib/llvm/lib/AsmParser/LLToken.h @@ -49,10 +49,14 @@ namespace lltok { kw_external, kw_thread_local, kw_localdynamic, kw_initialexec, kw_localexec, kw_zeroinitializer, - kw_undef, kw_null, + kw_undef, kw_null, kw_none, kw_to, + kw_caller, + kw_within, + kw_from, kw_tail, kw_musttail, + kw_notail, kw_target, kw_triple, kw_unwind, @@ -96,6 +100,9 @@ namespace lltok { kw_webkit_jscc, kw_anyregcc, kw_preserve_mostcc, kw_preserve_allcc, kw_ghccc, + kw_x86_intrcc, + kw_hhvmcc, kw_hhvm_ccc, + kw_cxx_fast_tlscc, // Attributes: kw_attributes, @@ -109,6 +116,8 @@ namespace lltok { kw_convergent, kw_dereferenceable, kw_dereferenceable_or_null, + kw_inaccessiblememonly, + kw_inaccessiblemem_or_argmemonly, kw_inlinehint, kw_inreg, kw_jumptable, @@ -121,6 +130,7 @@ namespace lltok { kw_noduplicate, kw_noimplicitfloat, kw_noinline, + kw_norecurse, kw_nonlazybind, kw_nonnull, kw_noredzone, @@ -177,7 +187,8 @@ namespace lltok { kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter, kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_resume, - kw_unreachable, + kw_unreachable, kw_cleanupret, kw_catchswitch, kw_catchret, kw_catchpad, + kw_cleanuppad, kw_alloca, kw_load, kw_store, kw_fence, kw_cmpxchg, kw_atomicrmw, kw_getelementptr, @@ -209,6 +220,7 @@ namespace lltok { DwarfLang, // DW_LANG_foo DwarfOp, // DW_OP_foo DIFlag, // DIFlagFoo + DwarfMacinfo, // DW_MACINFO_foo // Type valued tokens (TyVal). 
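The DwarfMacinfo token class added above carries the DW_MACINFO_* constants consumed by the ParseDIMacro/ParseDIMacroFile routines earlier in this diff; in textual IR that corresponds to nodes roughly like the following (the metadata numbers are placeholders):

    !0 = !DIMacro(type: DW_MACINFO_define, line: 9, name: "SomeMacro", value: "SomeValue")
    !1 = !DIMacroFile(line: 9, file: !2, nodes: !3)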
Type, diff --git a/contrib/llvm/lib/AsmParser/Parser.cpp b/contrib/llvm/lib/AsmParser/Parser.cpp index 9145a54..4e55e62 100644 --- a/contrib/llvm/lib/AsmParser/Parser.cpp +++ b/contrib/llvm/lib/AsmParser/Parser.cpp @@ -66,3 +66,15 @@ std::unique_ptr<Module> llvm::parseAssemblyString(StringRef AsmString, MemoryBufferRef F(AsmString, "<string>"); return parseAssembly(F, Err, Context, Slots); } + +Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err, + const Module &M, const SlotMapping *Slots) { + SourceMgr SM; + std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Asm); + SM.AddNewSourceBuffer(std::move(Buf), SMLoc()); + Constant *C; + if (LLParser(Asm, SM, Err, const_cast<Module *>(&M)) + .parseStandaloneConstantValue(C, Slots)) + return nullptr; + return C; +} diff --git a/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp index 289c76e..385c18a 100644 --- a/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp +++ b/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm-c/BitReader.h" +#include "llvm-c/Core.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/LLVMContext.h" @@ -22,12 +23,25 @@ using namespace llvm; /* Builds a module from the bitcode in the specified memory buffer, returning a reference to the module via the OutModule parameter. Returns 0 on success. Optionally returns a human-readable error message via OutMessage. */ -LLVMBool LLVMParseBitcode(LLVMMemoryBufferRef MemBuf, - LLVMModuleRef *OutModule, char **OutMessage) { +LLVMBool LLVMParseBitcode(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutModule, + char **OutMessage) { return LLVMParseBitcodeInContext(wrap(&getGlobalContext()), MemBuf, OutModule, OutMessage); } +LLVMBool LLVMParseBitcode2(LLVMMemoryBufferRef MemBuf, + LLVMModuleRef *OutModule) { + return LLVMParseBitcodeInContext2(wrap(&getGlobalContext()), MemBuf, + OutModule); +} + +static void diagnosticHandler(const DiagnosticInfo &DI, void *C) { + auto *Message = reinterpret_cast<std::string *>(C); + raw_string_ostream Stream(*Message); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); +} + LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutModule, @@ -35,18 +49,36 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, MemoryBufferRef Buf = unwrap(MemBuf)->getMemBufferRef(); LLVMContext &Ctx = *unwrap(ContextRef); + LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler = + Ctx.getDiagnosticHandler(); + void *OldDiagnosticContext = Ctx.getDiagnosticContext(); std::string Message; - raw_string_ostream Stream(Message); - DiagnosticPrinterRawOStream DP(Stream); + Ctx.setDiagnosticHandler(diagnosticHandler, &Message, true); + + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = parseBitcodeFile(Buf, Ctx); + + Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext, true); - ErrorOr<std::unique_ptr<Module>> ModuleOrErr = parseBitcodeFile( - Buf, Ctx, [&](const DiagnosticInfo &DI) { DI.print(DP); }); if (ModuleOrErr.getError()) { - if (OutMessage) { - Stream.flush(); + if (OutMessage) *OutMessage = strdup(Message.c_str()); - } - *OutModule = wrap((Module*)nullptr); + *OutModule = wrap((Module *)nullptr); + return 1; + } + + *OutModule = wrap(ModuleOrErr.get().release()); + return 0; +} + +LLVMBool LLVMParseBitcodeInContext2(LLVMContextRef ContextRef, + LLVMMemoryBufferRef MemBuf, + 
LLVMModuleRef *OutModule) { + MemoryBufferRef Buf = unwrap(MemBuf)->getMemBufferRef(); + LLVMContext &Ctx = *unwrap(ContextRef); + + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = parseBitcodeFile(Buf, Ctx); + if (ModuleOrErr.getError()) { + *OutModule = wrap((Module *)nullptr); return 1; } @@ -59,26 +91,50 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, Optionally returns a human-readable error message via OutMessage. */ LLVMBool LLVMGetBitcodeModuleInContext(LLVMContextRef ContextRef, LLVMMemoryBufferRef MemBuf, - LLVMModuleRef *OutM, - char **OutMessage) { + LLVMModuleRef *OutM, char **OutMessage) { + LLVMContext &Ctx = *unwrap(ContextRef); + LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler = + Ctx.getDiagnosticHandler(); + void *OldDiagnosticContext = Ctx.getDiagnosticContext(); + std::string Message; + Ctx.setDiagnosticHandler(diagnosticHandler, &Message, true); std::unique_ptr<MemoryBuffer> Owner(unwrap(MemBuf)); ErrorOr<std::unique_ptr<Module>> ModuleOrErr = - getLazyBitcodeModule(std::move(Owner), *unwrap(ContextRef)); + getLazyBitcodeModule(std::move(Owner), Ctx); Owner.release(); + Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext, true); - if (std::error_code EC = ModuleOrErr.getError()) { + if (ModuleOrErr.getError()) { *OutM = wrap((Module *)nullptr); if (OutMessage) - *OutMessage = strdup(EC.message().c_str()); + *OutMessage = strdup(Message.c_str()); return 1; } *OutM = wrap(ModuleOrErr.get().release()); return 0; +} + +LLVMBool LLVMGetBitcodeModuleInContext2(LLVMContextRef ContextRef, + LLVMMemoryBufferRef MemBuf, + LLVMModuleRef *OutM) { + LLVMContext &Ctx = *unwrap(ContextRef); + std::unique_ptr<MemoryBuffer> Owner(unwrap(MemBuf)); + + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = + getLazyBitcodeModule(std::move(Owner), Ctx); + Owner.release(); + if (ModuleOrErr.getError()) { + *OutM = wrap((Module *)nullptr); + return 1; + } + + *OutM = wrap(ModuleOrErr.get().release()); + return 0; } LLVMBool LLVMGetBitcodeModule(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM, @@ -87,20 +143,7 @@ LLVMBool LLVMGetBitcodeModule(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM, OutMessage); } -/* Deprecated: Use LLVMGetBitcodeModuleInContext instead. */ -LLVMBool LLVMGetBitcodeModuleProviderInContext(LLVMContextRef ContextRef, - LLVMMemoryBufferRef MemBuf, - LLVMModuleProviderRef *OutMP, - char **OutMessage) { - return LLVMGetBitcodeModuleInContext(ContextRef, MemBuf, - reinterpret_cast<LLVMModuleRef*>(OutMP), - OutMessage); -} - -/* Deprecated: Use LLVMGetBitcodeModule instead. 
*/ -LLVMBool LLVMGetBitcodeModuleProvider(LLVMMemoryBufferRef MemBuf, - LLVMModuleProviderRef *OutMP, - char **OutMessage) { - return LLVMGetBitcodeModuleProviderInContext(LLVMGetGlobalContext(), MemBuf, - OutMP, OutMessage); +LLVMBool LLVMGetBitcodeModule2(LLVMMemoryBufferRef MemBuf, + LLVMModuleRef *OutM) { + return LLVMGetBitcodeModuleInContext2(LLVMGetGlobalContext(), MemBuf, OutM); } diff --git a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index c04e8b9..2ad4b32 100644 --- a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/FunctionInfo.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/DataStream.h" #include "llvm/Support/ManagedStatic.h" @@ -93,35 +94,35 @@ public: void resolveConstantForwardRefs(); }; -class BitcodeReaderMDValueList { +class BitcodeReaderMetadataList { unsigned NumFwdRefs; bool AnyFwdRefs; unsigned MinFwdRef; unsigned MaxFwdRef; - std::vector<TrackingMDRef> MDValuePtrs; + std::vector<TrackingMDRef> MetadataPtrs; LLVMContext &Context; public: - BitcodeReaderMDValueList(LLVMContext &C) + BitcodeReaderMetadataList(LLVMContext &C) : NumFwdRefs(0), AnyFwdRefs(false), Context(C) {} // vector compatibility methods - unsigned size() const { return MDValuePtrs.size(); } - void resize(unsigned N) { MDValuePtrs.resize(N); } - void push_back(Metadata *MD) { MDValuePtrs.emplace_back(MD); } - void clear() { MDValuePtrs.clear(); } - Metadata *back() const { return MDValuePtrs.back(); } - void pop_back() { MDValuePtrs.pop_back(); } - bool empty() const { return MDValuePtrs.empty(); } + unsigned size() const { return MetadataPtrs.size(); } + void resize(unsigned N) { MetadataPtrs.resize(N); } + void push_back(Metadata *MD) { MetadataPtrs.emplace_back(MD); } + void clear() { MetadataPtrs.clear(); } + Metadata *back() const { return MetadataPtrs.back(); } + void pop_back() { MetadataPtrs.pop_back(); } + bool empty() const { return MetadataPtrs.empty(); } Metadata *operator[](unsigned i) const { - assert(i < MDValuePtrs.size()); - return MDValuePtrs[i]; + assert(i < MetadataPtrs.size()); + return MetadataPtrs[i]; } void shrinkTo(unsigned N) { assert(N <= size() && "Invalid shrinkTo request!"); - MDValuePtrs.resize(N); + MetadataPtrs.resize(N); } Metadata *getValueFwdRef(unsigned Idx); @@ -131,17 +132,27 @@ public: class BitcodeReader : public GVMaterializer { LLVMContext &Context; - DiagnosticHandlerFunction DiagnosticHandler; Module *TheModule = nullptr; std::unique_ptr<MemoryBuffer> Buffer; std::unique_ptr<BitstreamReader> StreamFile; BitstreamCursor Stream; + // Next offset to start scanning for lazy parsing of function bodies. uint64_t NextUnreadBit = 0; + // Last function offset found in the VST. + uint64_t LastFunctionBlockBit = 0; bool SeenValueSymbolTable = false; + uint64_t VSTOffset = 0; + // Contains an arbitrary and optional string identifying the bitcode producer + std::string ProducerIdentification; + // Number of module level metadata records specified by the + // MODULE_CODE_METADATA_VALUES record. + unsigned NumModuleMDs = 0; + // Support older bitcode without the MODULE_CODE_METADATA_VALUES record. 
+ bool SeenModuleValuesRecord = false; std::vector<Type*> TypeList; BitcodeReaderValueList ValueList; - BitcodeReaderMDValueList MDValueList; + BitcodeReaderMetadataList MetadataList; std::vector<Comdat *> ComdatList; SmallVector<Instruction *, 64> InstructionList; @@ -157,7 +168,7 @@ class BitcodeReader : public GVMaterializer { /// is thus not represented here. As such all indices are off by one. std::vector<AttributeSet> MAttributes; - /// \brief The set of attribute groups. + /// The set of attribute groups. std::map<unsigned, AttributeSet> MAttributeGroups; /// While parsing a function body, this is a list of the basic blocks for the @@ -208,23 +219,24 @@ class BitcodeReader : public GVMaterializer { /// (e.g.) blockaddress forward references. bool WillMaterializeAllForwardRefs = false; - /// Functions that have block addresses taken. This is usually empty. - SmallPtrSet<const Function *, 4> BlockAddressesTaken; - /// True if any Metadata block has been materialized. bool IsMetadataMaterialized = false; bool StripDebugInfo = false; + /// Functions that need to be matched with subprograms when upgrading old + /// metadata. + SmallDenseMap<Function *, DISubprogram *, 16> FunctionsWithSPs; + + std::vector<std::string> BundleTags; + public: std::error_code error(BitcodeError E, const Twine &Message); std::error_code error(BitcodeError E); std::error_code error(const Twine &Message); - BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler); - BitcodeReader(LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler); + BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context); + BitcodeReader(LLVMContext &Context); ~BitcodeReader() override { freeState(); } std::error_code materializeForwardReferencedFunctions(); @@ -233,11 +245,9 @@ public: void releaseBuffer(); - bool isDematerializable(const GlobalValue *GV) const override; std::error_code materialize(GlobalValue *GV) override; - std::error_code materializeModule(Module *M) override; + std::error_code materializeModule() override; std::vector<StructType *> getIdentifiedStructTypes() const override; - void dematerialize(GlobalValue *GV) override; /// \brief Main interface to parsing a bitcode buffer. /// \returns true if an error occurred. @@ -249,6 +259,9 @@ public: /// \returns true if an error occurred. ErrorOr<std::string> parseTriple(); + /// Cheap mechanism to just extract the identification block out of bitcode. + ErrorOr<std::string> parseIdentificationBlock(); + static uint64_t decodeSignRotatedValue(uint64_t V); /// Materialize any deferred Metadata block. @@ -256,7 +269,20 @@ public: void setStripDebugInfo() override; + /// Save the mapping between the metadata values and the corresponding + /// value id that were recorded in the MetadataList during parsing. If + /// OnlyTempMD is true, then only record those entries that are still + /// temporary metadata. This interface is used when metadata linking is + /// performed as a postpass, such as during function importing. + void saveMetadataList(DenseMap<const Metadata *, unsigned> &MetadataToIDs, + bool OnlyTempMD) override; + private: + /// Parse the "IDENTIFICATION_BLOCK_ID" block, populate the + // ProducerIdentification data member, and do some basic enforcement on the + // "epoch" encoded in the bitcode. 
+ std::error_code parseBitcodeVersion(); + std::vector<StructType *> IdentifiedStructTypes; StructType *createIdentifiedStructType(LLVMContext &Context, StringRef Name); StructType *createIdentifiedStructType(LLVMContext &Context); @@ -268,7 +294,7 @@ private: return ValueList.getValueFwdRef(ID, Ty); } Metadata *getFnMetadataByID(unsigned ID) { - return MDValueList.getValueFwdRef(ID); + return MetadataList.getValueFwdRef(ID); } BasicBlock *getBasicBlock(unsigned ID) const { if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID @@ -351,21 +377,28 @@ private: /// a corresponding error code. std::error_code parseAlignmentValue(uint64_t Exponent, unsigned &Alignment); std::error_code parseAttrKind(uint64_t Code, Attribute::AttrKind *Kind); - std::error_code parseModule(bool Resume, bool ShouldLazyLoadMetadata = false); + std::error_code parseModule(uint64_t ResumeBit, + bool ShouldLazyLoadMetadata = false); std::error_code parseAttributeBlock(); std::error_code parseAttributeGroupBlock(); std::error_code parseTypeTable(); std::error_code parseTypeTableBody(); + std::error_code parseOperandBundleTags(); - std::error_code parseValueSymbolTable(); + ErrorOr<Value *> recordValue(SmallVectorImpl<uint64_t> &Record, + unsigned NameIndex, Triple &TT); + std::error_code parseValueSymbolTable(uint64_t Offset = 0); std::error_code parseConstants(); + std::error_code rememberAndSkipFunctionBodies(); std::error_code rememberAndSkipFunctionBody(); /// Save the positions of the Metadata blocks and skip parsing the blocks. std::error_code rememberAndSkipMetadata(); std::error_code parseFunctionBody(Function *F); std::error_code globalCleanup(); std::error_code resolveGlobalAndAliasInits(); - std::error_code parseMetadata(); + std::error_code parseMetadata(bool ModuleLevel = false); + std::error_code parseMetadataKinds(); + std::error_code parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record); std::error_code parseMetadataAttachment(Function &F); ErrorOr<std::string> parseModuleTriple(); std::error_code parseUseLists(); @@ -376,6 +409,94 @@ private: Function *F, DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator); }; + +/// Class to manage reading and parsing function summary index bitcode +/// files/sections. +class FunctionIndexBitcodeReader { + DiagnosticHandlerFunction DiagnosticHandler; + + /// Eventually points to the function index built during parsing. + FunctionInfoIndex *TheIndex = nullptr; + + std::unique_ptr<MemoryBuffer> Buffer; + std::unique_ptr<BitstreamReader> StreamFile; + BitstreamCursor Stream; + + /// \brief Used to indicate whether we are doing lazy parsing of summary data. + /// + /// If false, the summary section is fully parsed into the index during + /// the initial parse. Otherwise, if true, the caller is expected to + /// invoke \a readFunctionSummary for each summary needed, and the summary + /// section is thus parsed lazily. + bool IsLazy = false; + + /// Used to indicate whether caller only wants to check for the presence + /// of the function summary bitcode section. All blocks are skipped, + /// but the SeenFuncSummary boolean is set. + bool CheckFuncSummaryPresenceOnly = false; + + /// Indicates whether we have encountered a function summary section + /// yet during parsing, used when checking if file contains function + /// summary section. + bool SeenFuncSummary = false; + + /// \brief Map populated during function summary section parsing, and + /// consumed during ValueSymbolTable parsing. 
+ /// + /// Used to correlate summary records with VST entries. For the per-module + /// index this maps the ValueID to the parsed function summary, and + /// for the combined index this maps the summary record's bitcode + /// offset to the function summary (since in the combined index the + /// VST records do not hold value IDs but rather hold the function + /// summary record offset). + DenseMap<uint64_t, std::unique_ptr<FunctionSummary>> SummaryMap; + + /// Map populated during module path string table parsing, from the + /// module ID to a string reference owned by the index's module + /// path string table, used to correlate with combined index function + /// summary records. + DenseMap<uint64_t, StringRef> ModuleIdMap; + +public: + std::error_code error(BitcodeError E, const Twine &Message); + std::error_code error(BitcodeError E); + std::error_code error(const Twine &Message); + + FunctionIndexBitcodeReader(MemoryBuffer *Buffer, + DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy = false, + bool CheckFuncSummaryPresenceOnly = false); + FunctionIndexBitcodeReader(DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy = false, + bool CheckFuncSummaryPresenceOnly = false); + ~FunctionIndexBitcodeReader() { freeState(); } + + void freeState(); + + void releaseBuffer(); + + /// Check if the parser has encountered a function summary section. + bool foundFuncSummary() { return SeenFuncSummary; } + + /// \brief Main interface to parsing a bitcode buffer. + /// \returns true if an error occurred. + std::error_code parseSummaryIndexInto(std::unique_ptr<DataStreamer> Streamer, + FunctionInfoIndex *I); + + /// \brief Interface for parsing a function summary lazily. + std::error_code parseFunctionSummary(std::unique_ptr<DataStreamer> Streamer, + FunctionInfoIndex *I, + size_t FunctionSummaryOffset); + +private: + std::error_code parseModule(); + std::error_code parseValueSymbolTable(); + std::error_code parseEntireSummary(); + std::error_code parseModuleStringTable(); + std::error_code initStream(std::unique_ptr<DataStreamer> Streamer); + std::error_code initStreamFromBuffer(); + std::error_code initLazyStream(std::unique_ptr<DataStreamer> Streamer); +}; } // namespace BitcodeDiagnosticInfo::BitcodeDiagnosticInfo(std::error_code EC, @@ -397,43 +518,51 @@ static std::error_code error(DiagnosticHandlerFunction DiagnosticHandler, return error(DiagnosticHandler, EC, EC.message()); } -static std::error_code error(DiagnosticHandlerFunction DiagnosticHandler, +static std::error_code error(LLVMContext &Context, std::error_code EC, const Twine &Message) { - return error(DiagnosticHandler, - make_error_code(BitcodeError::CorruptedBitcode), Message); + return error([&](const DiagnosticInfo &DI) { Context.diagnose(DI); }, EC, + Message); +} + +static std::error_code error(LLVMContext &Context, std::error_code EC) { + return error(Context, EC, EC.message()); +} + +static std::error_code error(LLVMContext &Context, const Twine &Message) { + return error(Context, make_error_code(BitcodeError::CorruptedBitcode), + Message); } std::error_code BitcodeReader::error(BitcodeError E, const Twine &Message) { - return ::error(DiagnosticHandler, make_error_code(E), Message); + if (!ProducerIdentification.empty()) { + return ::error(Context, make_error_code(E), + Message + " (Producer: '" + ProducerIdentification + + "' Reader: 'LLVM " + LLVM_VERSION_STRING "')"); + } + return ::error(Context, make_error_code(E), Message); } std::error_code BitcodeReader::error(const Twine &Message) { - return 
::error(DiagnosticHandler, - make_error_code(BitcodeError::CorruptedBitcode), Message); + if (!ProducerIdentification.empty()) { + return ::error(Context, make_error_code(BitcodeError::CorruptedBitcode), + Message + " (Producer: '" + ProducerIdentification + + "' Reader: 'LLVM " + LLVM_VERSION_STRING "')"); + } + return ::error(Context, make_error_code(BitcodeError::CorruptedBitcode), + Message); } std::error_code BitcodeReader::error(BitcodeError E) { - return ::error(DiagnosticHandler, make_error_code(E)); -} - -static DiagnosticHandlerFunction getDiagHandler(DiagnosticHandlerFunction F, - LLVMContext &C) { - if (F) - return F; - return [&C](const DiagnosticInfo &DI) { C.diagnose(DI); }; + return ::error(Context, make_error_code(E)); } -BitcodeReader::BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) - : Context(Context), - DiagnosticHandler(getDiagHandler(DiagnosticHandler, Context)), - Buffer(Buffer), ValueList(Context), MDValueList(Context) {} +BitcodeReader::BitcodeReader(MemoryBuffer *Buffer, LLVMContext &Context) + : Context(Context), Buffer(Buffer), ValueList(Context), + MetadataList(Context) {} -BitcodeReader::BitcodeReader(LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) - : Context(Context), - DiagnosticHandler(getDiagHandler(DiagnosticHandler, Context)), - Buffer(nullptr), ValueList(Context), MDValueList(Context) {} +BitcodeReader::BitcodeReader(LLVMContext &Context) + : Context(Context), Buffer(nullptr), ValueList(Context), + MetadataList(Context) {} std::error_code BitcodeReader::materializeForwardReferencedFunctions() { if (WillMaterializeAllForwardRefs) @@ -472,7 +601,7 @@ void BitcodeReader::freeState() { Buffer = nullptr; std::vector<Type*>().swap(TypeList); ValueList.clear(); - MDValueList.clear(); + MetadataList.clear(); std::vector<Comdat *>().swap(ComdatList); std::vector<AttributeSet>().swap(MAttributes); @@ -779,6 +908,8 @@ void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) { OldV->replaceAllUsesWith(V); delete PrevVal; } + + return; } @@ -904,7 +1035,7 @@ void BitcodeReaderValueList::resolveConstantForwardRefs() { } } -void BitcodeReaderMDValueList::assignValue(Metadata *MD, unsigned Idx) { +void BitcodeReaderMetadataList::assignValue(Metadata *MD, unsigned Idx) { if (Idx == size()) { push_back(MD); return; @@ -913,7 +1044,7 @@ void BitcodeReaderMDValueList::assignValue(Metadata *MD, unsigned Idx) { if (Idx >= size()) resize(Idx+1); - TrackingMDRef &OldMD = MDValuePtrs[Idx]; + TrackingMDRef &OldMD = MetadataPtrs[Idx]; if (!OldMD) { OldMD.reset(MD); return; @@ -925,11 +1056,11 @@ void BitcodeReaderMDValueList::assignValue(Metadata *MD, unsigned Idx) { --NumFwdRefs; } -Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) { +Metadata *BitcodeReaderMetadataList::getValueFwdRef(unsigned Idx) { if (Idx >= size()) resize(Idx + 1); - if (Metadata *MD = MDValuePtrs[Idx]) + if (Metadata *MD = MetadataPtrs[Idx]) return MD; // Track forward refs to be resolved later. @@ -944,11 +1075,11 @@ Metadata *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) { // Create and return a placeholder, which will later be RAUW'd. Metadata *MD = MDNode::getTemporary(Context, None).release(); - MDValuePtrs[Idx].reset(MD); + MetadataPtrs[Idx].reset(MD); return MD; } -void BitcodeReaderMDValueList::tryToResolveCycles() { +void BitcodeReaderMetadataList::tryToResolveCycles() { if (!AnyFwdRefs) // Nothing to do. 
return; @@ -959,7 +1090,7 @@ void BitcodeReaderMDValueList::tryToResolveCycles() { // Resolve any cycles. for (unsigned I = MinFwdRef, E = MaxFwdRef + 1; I != E; ++I) { - auto &MD = MDValuePtrs[I]; + auto &MD = MetadataPtrs[I]; auto *N = dyn_cast_or_null<MDNode>(MD); if (!N) continue; @@ -1102,6 +1233,10 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Cold; case bitc::ATTR_KIND_CONVERGENT: return Attribute::Convergent; + case bitc::ATTR_KIND_INACCESSIBLEMEM_ONLY: + return Attribute::InaccessibleMemOnly; + case bitc::ATTR_KIND_INACCESSIBLEMEM_OR_ARGMEMONLY: + return Attribute::InaccessibleMemOrArgMemOnly; case bitc::ATTR_KIND_INLINE_HINT: return Attribute::InlineHint; case bitc::ATTR_KIND_IN_REG: @@ -1126,6 +1261,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::NoImplicitFloat; case bitc::ATTR_KIND_NO_INLINE: return Attribute::NoInline; + case bitc::ATTR_KIND_NO_RECURSE: + return Attribute::NoRecurse; case bitc::ATTR_KIND_NON_LAZY_BIND: return Attribute::NonLazyBind; case bitc::ATTR_KIND_NON_NULL: @@ -1360,6 +1497,9 @@ std::error_code BitcodeReader::parseTypeTableBody() { case bitc::TYPE_CODE_X86_MMX: // X86_MMX ResultTy = Type::getX86_MMXTy(Context); break; + case bitc::TYPE_CODE_TOKEN: // TOKEN + ResultTy = Type::getTokenTy(Context); + break; case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width] if (Record.size() < 1) return error("Invalid record"); @@ -1524,7 +1664,107 @@ std::error_code BitcodeReader::parseTypeTableBody() { } } -std::error_code BitcodeReader::parseValueSymbolTable() { +std::error_code BitcodeReader::parseOperandBundleTags() { + if (Stream.EnterSubBlock(bitc::OPERAND_BUNDLE_TAGS_BLOCK_ID)) + return error("Invalid record"); + + if (!BundleTags.empty()) + return error("Invalid multiple blocks"); + + SmallVector<uint64_t, 64> Record; + + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Tags are implicitly mapped to integers by their order. + + if (Stream.readRecord(Entry.ID, Record) != bitc::OPERAND_BUNDLE_TAG) + return error("Invalid record"); + + // OPERAND_BUNDLE_TAG: [strchr x N] + BundleTags.emplace_back(); + if (convertToString(Record, 0, BundleTags.back())) + return error("Invalid record"); + Record.clear(); + } +} + +/// Associate a value with its name from the given index in the provided record. 
+ErrorOr<Value *> BitcodeReader::recordValue(SmallVectorImpl<uint64_t> &Record, + unsigned NameIndex, Triple &TT) { + SmallString<128> ValueName; + if (convertToString(Record, NameIndex, ValueName)) + return error("Invalid record"); + unsigned ValueID = Record[0]; + if (ValueID >= ValueList.size() || !ValueList[ValueID]) + return error("Invalid record"); + Value *V = ValueList[ValueID]; + + StringRef NameStr(ValueName.data(), ValueName.size()); + if (NameStr.find_first_of(0) != StringRef::npos) + return error("Invalid value name"); + V->setName(NameStr); + auto *GO = dyn_cast<GlobalObject>(V); + if (GO) { + if (GO->getComdat() == reinterpret_cast<Comdat *>(1)) { + if (TT.isOSBinFormatMachO()) + GO->setComdat(nullptr); + else + GO->setComdat(TheModule->getOrInsertComdat(V->getName())); + } + } + return V; +} + +/// Parse the value symbol table at either the current parsing location or +/// at the given bit offset if provided. +std::error_code BitcodeReader::parseValueSymbolTable(uint64_t Offset) { + uint64_t CurrentBit; + // Pass in the Offset to distinguish between calling for the module-level + // VST (where we want to jump to the VST offset) and the function-level + // VST (where we don't). + if (Offset > 0) { + // Save the current parsing location so we can jump back at the end + // of the VST read. + CurrentBit = Stream.GetCurrentBitNo(); + Stream.JumpToBit(Offset * 32); +#ifndef NDEBUG + // Do some checking if we are in debug mode. + BitstreamEntry Entry = Stream.advance(); + assert(Entry.Kind == BitstreamEntry::SubBlock); + assert(Entry.ID == bitc::VALUE_SYMTAB_BLOCK_ID); +#else + // In NDEBUG mode ignore the output so we don't get an unused variable + // warning. + Stream.advance(); +#endif + } + + // Compute the delta between the bitcode indices in the VST (the word offset + // to the word-aligned ENTER_SUBBLOCK for the function block, and that + // expected by the lazy reader. The reader's EnterSubBlock expects to have + // already read the ENTER_SUBBLOCK code (size getAbbrevIDWidth) and BlockID + // (size BlockIDWidth). Note that we access the stream's AbbrevID width here + // just before entering the VST subblock because: 1) the EnterSubBlock + // changes the AbbrevID width; 2) the VST block is nested within the same + // outer MODULE_BLOCK as the FUNCTION_BLOCKs and therefore have the same + // AbbrevID width before calling EnterSubBlock; and 3) when we want to + // jump to the FUNCTION_BLOCK using this offset later, we don't want + // to rely on the stream's AbbrevID width being that of the MODULE_BLOCK. + unsigned FuncBitcodeOffsetDelta = + Stream.getAbbrevIDWidth() + bitc::BlockIDWidth; + if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID)) return error("Invalid record"); @@ -1542,6 +1782,8 @@ std::error_code BitcodeReader::parseValueSymbolTable() { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: + if (Offset > 0) + Stream.JumpToBit(CurrentBit); return std::error_code(); case BitstreamEntry::Record: // The interesting case. @@ -1554,23 +1796,39 @@ std::error_code BitcodeReader::parseValueSymbolTable() { default: // Default behavior: unknown type. 
break; case bitc::VST_CODE_ENTRY: { // VST_ENTRY: [valueid, namechar x N] - if (convertToString(Record, 1, ValueName)) - return error("Invalid record"); - unsigned ValueID = Record[0]; - if (ValueID >= ValueList.size() || !ValueList[ValueID]) - return error("Invalid record"); - Value *V = ValueList[ValueID]; - - V->setName(StringRef(ValueName.data(), ValueName.size())); - if (auto *GO = dyn_cast<GlobalObject>(V)) { - if (GO->getComdat() == reinterpret_cast<Comdat *>(1)) { - if (TT.isOSBinFormatMachO()) - GO->setComdat(nullptr); - else - GO->setComdat(TheModule->getOrInsertComdat(V->getName())); - } + ErrorOr<Value *> ValOrErr = recordValue(Record, 1, TT); + if (std::error_code EC = ValOrErr.getError()) + return EC; + ValOrErr.get(); + break; + } + case bitc::VST_CODE_FNENTRY: { + // VST_FNENTRY: [valueid, offset, namechar x N] + ErrorOr<Value *> ValOrErr = recordValue(Record, 2, TT); + if (std::error_code EC = ValOrErr.getError()) + return EC; + Value *V = ValOrErr.get(); + + auto *GO = dyn_cast<GlobalObject>(V); + if (!GO) { + // If this is an alias, need to get the actual Function object + // it aliases, in order to set up the DeferredFunctionInfo entry below. + auto *GA = dyn_cast<GlobalAlias>(V); + if (GA) + GO = GA->getBaseObject(); + assert(GO); } - ValueName.clear(); + + uint64_t FuncWordOffset = Record[1]; + Function *F = dyn_cast<Function>(GO); + assert(F); + uint64_t FuncBitOffset = FuncWordOffset * 32; + DeferredFunctionInfo[F] = FuncBitOffset + FuncBitcodeOffsetDelta; + // Set the LastFunctionBlockBit to point to the last function block. + // Later when parsing is resumed after function materialization, + // we can simply skip that last function block. + if (FuncBitOffset > LastFunctionBlockBit) + LastFunctionBlockBit = FuncBitOffset; break; } case bitc::VST_CODE_BBENTRY: { @@ -1588,19 +1846,51 @@ std::error_code BitcodeReader::parseValueSymbolTable() { } } +/// Parse a single METADATA_KIND record, inserting result in MDKindMap. +std::error_code +BitcodeReader::parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record) { + if (Record.size() < 2) + return error("Invalid record"); + + unsigned Kind = Record[0]; + SmallString<8> Name(Record.begin() + 1, Record.end()); + + unsigned NewKind = TheModule->getMDKindID(Name.str()); + if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) + return error("Conflicting METADATA_KIND records"); + return std::error_code(); +} + static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; } -std::error_code BitcodeReader::parseMetadata() { +/// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing +/// module level metadata. +std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) { IsMetadataMaterialized = true; - unsigned NextMDValueNo = MDValueList.size(); + unsigned NextMetadataNo = MetadataList.size(); + if (ModuleLevel && SeenModuleValuesRecord) { + // Now that we are parsing the module level metadata, we want to restart + // the numbering of the MD values, and replace temp MD created earlier + // with their real values. If we saw a METADATA_VALUE record then we + // would have set the MetadataList size to the number specified in that + // record, to support parsing function-level metadata first, and we need + // to reset back to 0 to fill the MetadataList in with the parsed module + // The function-level metadata parsing should have reset the MetadataList + // size back to the value reported by the METADATA_VALUE record, saved in + // NumModuleMDs. 
+ assert(NumModuleMDs == MetadataList.size() && + "Expected MetadataList to only contain module level values"); + NextMetadataNo = 0; + } if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID)) return error("Invalid record"); SmallVector<uint64_t, 64> Record; - auto getMD = - [&](unsigned ID) -> Metadata *{ return MDValueList.getValueFwdRef(ID); }; + auto getMD = [&](unsigned ID) -> Metadata * { + return MetadataList.getValueFwdRef(ID); + }; auto getMDOrNull = [&](unsigned ID) -> Metadata *{ if (ID) return getMD(ID - 1); @@ -1624,7 +1914,10 @@ std::error_code BitcodeReader::parseMetadata() { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - MDValueList.tryToResolveCycles(); + MetadataList.tryToResolveCycles(); + assert((!(ModuleLevel && SeenModuleValuesRecord) || + NumModuleMDs == MetadataList.size()) && + "Inconsistent bitcode: METADATA_VALUES mismatch"); return std::error_code(); case BitstreamEntry::Record: // The interesting case. @@ -1652,7 +1945,8 @@ std::error_code BitcodeReader::parseMetadata() { unsigned Size = Record.size(); NamedMDNode *NMD = TheModule->getOrInsertNamedMetadata(Name); for (unsigned i = 0; i != Size; ++i) { - MDNode *MD = dyn_cast_or_null<MDNode>(MDValueList.getValueFwdRef(Record[i])); + MDNode *MD = + dyn_cast_or_null<MDNode>(MetadataList.getValueFwdRef(Record[i])); if (!MD) return error("Invalid record"); NMD->addOperand(MD); @@ -1669,7 +1963,7 @@ std::error_code BitcodeReader::parseMetadata() { // If this isn't a LocalAsMetadata record, we're dropping it. This used // to be legal, but there's no upgrade path. auto dropRecord = [&] { - MDValueList.assignValue(MDNode::get(Context, None), NextMDValueNo++); + MetadataList.assignValue(MDNode::get(Context, None), NextMetadataNo++); }; if (Record.size() != 2) { dropRecord(); @@ -1682,9 +1976,9 @@ std::error_code BitcodeReader::parseMetadata() { break; } - MDValueList.assignValue( + MetadataList.assignValue( LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_OLD_NODE: { @@ -1699,7 +1993,7 @@ std::error_code BitcodeReader::parseMetadata() { if (!Ty) return error("Invalid record"); if (Ty->isMetadataTy()) - Elts.push_back(MDValueList.getValueFwdRef(Record[i+1])); + Elts.push_back(MetadataList.getValueFwdRef(Record[i + 1])); else if (!Ty->isVoidTy()) { auto *MD = ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty)); @@ -1709,7 +2003,7 @@ std::error_code BitcodeReader::parseMetadata() { } else Elts.push_back(nullptr); } - MDValueList.assignValue(MDNode::get(Context, Elts), NextMDValueNo++); + MetadataList.assignValue(MDNode::get(Context, Elts), NextMetadataNo++); break; } case bitc::METADATA_VALUE: { @@ -1720,9 +2014,9 @@ std::error_code BitcodeReader::parseMetadata() { if (Ty->isMetadataTy() || Ty->isVoidTy()) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_DISTINCT_NODE: @@ -1732,10 +2026,10 @@ std::error_code BitcodeReader::parseMetadata() { SmallVector<Metadata *, 8> Elts; Elts.reserve(Record.size()); for (unsigned ID : Record) - Elts.push_back(ID ? MDValueList.getValueFwdRef(ID - 1) : nullptr); - MDValueList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts) - : MDNode::get(Context, Elts), - NextMDValueNo++); + Elts.push_back(ID ? 
MetadataList.getValueFwdRef(ID - 1) : nullptr); + MetadataList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts) + : MDNode::get(Context, Elts), + NextMetadataNo++); break; } case bitc::METADATA_LOCATION: { @@ -1744,13 +2038,13 @@ std::error_code BitcodeReader::parseMetadata() { unsigned Line = Record[1]; unsigned Column = Record[2]; - MDNode *Scope = cast<MDNode>(MDValueList.getValueFwdRef(Record[3])); + MDNode *Scope = cast<MDNode>(MetadataList.getValueFwdRef(Record[3])); Metadata *InlinedAt = - Record[4] ? MDValueList.getValueFwdRef(Record[4] - 1) : nullptr; - MDValueList.assignValue( + Record[4] ? MetadataList.getValueFwdRef(Record[4] - 1) : nullptr; + MetadataList.assignValue( GET_OR_DISTINCT(DILocation, Record[0], (Context, Line, Column, Scope, InlinedAt)), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_GENERIC_DEBUG: { @@ -1766,63 +2060,65 @@ std::error_code BitcodeReader::parseMetadata() { auto *Header = getMDString(Record[3]); SmallVector<Metadata *, 8> DwarfOps; for (unsigned I = 4, E = Record.size(); I != E; ++I) - DwarfOps.push_back(Record[I] ? MDValueList.getValueFwdRef(Record[I] - 1) - : nullptr); - MDValueList.assignValue(GET_OR_DISTINCT(GenericDINode, Record[0], - (Context, Tag, Header, DwarfOps)), - NextMDValueNo++); + DwarfOps.push_back( + Record[I] ? MetadataList.getValueFwdRef(Record[I] - 1) : nullptr); + MetadataList.assignValue( + GET_OR_DISTINCT(GenericDINode, Record[0], + (Context, Tag, Header, DwarfOps)), + NextMetadataNo++); break; } case bitc::METADATA_SUBRANGE: { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DISubrange, Record[0], (Context, Record[1], unrotateSign(Record[2]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_ENUMERATOR: { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue(GET_OR_DISTINCT(DIEnumerator, Record[0], - (Context, unrotateSign(Record[1]), - getMDString(Record[2]))), - NextMDValueNo++); + MetadataList.assignValue( + GET_OR_DISTINCT( + DIEnumerator, Record[0], + (Context, unrotateSign(Record[1]), getMDString(Record[2]))), + NextMetadataNo++); break; } case bitc::METADATA_BASIC_TYPE: { if (Record.size() != 6) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIBasicType, Record[0], (Context, Record[1], getMDString(Record[2]), Record[3], Record[4], Record[5])), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_DERIVED_TYPE: { if (Record.size() != 12) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIDerivedType, Record[0], (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), Record[4], getMDOrNull(Record[5]), getMDOrNull(Record[6]), Record[7], Record[8], Record[9], Record[10], getMDOrNull(Record[11]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_COMPOSITE_TYPE: { if (Record.size() != 16) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DICompositeType, Record[0], (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), Record[4], @@ -1831,17 +2127,17 @@ std::error_code BitcodeReader::parseMetadata() { getMDOrNull(Record[11]), Record[12], getMDOrNull(Record[13]), getMDOrNull(Record[14]), getMDString(Record[15]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_SUBROUTINE_TYPE: { if (Record.size() != 3) return 
error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DISubroutineType, Record[0], (Context, Record[1], getMDOrNull(Record[2]))), - NextMDValueNo++); + NextMetadataNo++); break; } @@ -1849,12 +2145,12 @@ std::error_code BitcodeReader::parseMetadata() { if (Record.size() != 6) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIModule, Record[0], (Context, getMDOrNull(Record[1]), - getMDString(Record[2]), getMDString(Record[3]), - getMDString(Record[4]), getMDString(Record[5]))), - NextMDValueNo++); + getMDString(Record[2]), getMDString(Record[3]), + getMDString(Record[4]), getMDString(Record[5]))), + NextMetadataNo++); break; } @@ -1862,185 +2158,260 @@ std::error_code BitcodeReader::parseMetadata() { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIFile, Record[0], (Context, getMDString(Record[1]), getMDString(Record[2]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_COMPILE_UNIT: { - if (Record.size() < 14 || Record.size() > 15) + if (Record.size() < 14 || Record.size() > 16) return error("Invalid record"); - MDValueList.assignValue( - GET_OR_DISTINCT( - DICompileUnit, Record[0], - (Context, Record[1], getMDOrNull(Record[2]), - getMDString(Record[3]), Record[4], getMDString(Record[5]), - Record[6], getMDString(Record[7]), Record[8], - getMDOrNull(Record[9]), getMDOrNull(Record[10]), - getMDOrNull(Record[11]), getMDOrNull(Record[12]), - getMDOrNull(Record[13]), Record.size() == 14 ? 0 : Record[14])), - NextMDValueNo++); + // Ignore Record[0], which indicates whether this compile unit is + // distinct. It's always distinct. + MetadataList.assignValue( + DICompileUnit::getDistinct( + Context, Record[1], getMDOrNull(Record[2]), + getMDString(Record[3]), Record[4], getMDString(Record[5]), + Record[6], getMDString(Record[7]), Record[8], + getMDOrNull(Record[9]), getMDOrNull(Record[10]), + getMDOrNull(Record[11]), getMDOrNull(Record[12]), + getMDOrNull(Record[13]), + Record.size() <= 15 ? 0 : getMDOrNull(Record[15]), + Record.size() <= 14 ? 0 : Record[14]), + NextMetadataNo++); break; } case bitc::METADATA_SUBPROGRAM: { - if (Record.size() != 19) - return error("Invalid record"); - - MDValueList.assignValue( - GET_OR_DISTINCT( - DISubprogram, Record[0], - (Context, getMDOrNull(Record[1]), getMDString(Record[2]), - getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], - getMDOrNull(Record[6]), Record[7], Record[8], Record[9], - getMDOrNull(Record[10]), Record[11], Record[12], Record[13], - Record[14], getMDOrNull(Record[15]), getMDOrNull(Record[16]), - getMDOrNull(Record[17]), getMDOrNull(Record[18]))), - NextMDValueNo++); + if (Record.size() != 18 && Record.size() != 19) + return error("Invalid record"); + + bool HasFn = Record.size() == 19; + DISubprogram *SP = GET_OR_DISTINCT( + DISubprogram, + Record[0] || Record[8], // All definitions should be distinct. + (Context, getMDOrNull(Record[1]), getMDString(Record[2]), + getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], + getMDOrNull(Record[6]), Record[7], Record[8], Record[9], + getMDOrNull(Record[10]), Record[11], Record[12], Record[13], + Record[14], getMDOrNull(Record[15 + HasFn]), + getMDOrNull(Record[16 + HasFn]), getMDOrNull(Record[17 + HasFn]))); + MetadataList.assignValue(SP, NextMetadataNo++); + + // Upgrade sp->function mapping to function->sp mapping. 
+ if (HasFn && Record[15]) { + if (auto *CMD = dyn_cast<ConstantAsMetadata>(getMDOrNull(Record[15]))) + if (auto *F = dyn_cast<Function>(CMD->getValue())) { + if (F->isMaterializable()) + // Defer until materialized; unmaterialized functions may not have + // metadata. + FunctionsWithSPs[F] = SP; + else if (!F->empty()) + F->setSubprogram(SP); + } + } break; } case bitc::METADATA_LEXICAL_BLOCK: { if (Record.size() != 5) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DILexicalBlock, Record[0], (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]), Record[3], Record[4])), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_LEXICAL_BLOCK_FILE: { if (Record.size() != 4) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DILexicalBlockFile, Record[0], (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]), Record[3])), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_NAMESPACE: { if (Record.size() != 5) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DINamespace, Record[0], (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]), getMDString(Record[3]), Record[4])), - NextMDValueNo++); + NextMetadataNo++); + break; + } + case bitc::METADATA_MACRO: { + if (Record.size() != 5) + return error("Invalid record"); + + MetadataList.assignValue( + GET_OR_DISTINCT(DIMacro, Record[0], + (Context, Record[1], Record[2], + getMDString(Record[3]), getMDString(Record[4]))), + NextMetadataNo++); + break; + } + case bitc::METADATA_MACRO_FILE: { + if (Record.size() != 5) + return error("Invalid record"); + + MetadataList.assignValue( + GET_OR_DISTINCT(DIMacroFile, Record[0], + (Context, Record[1], Record[2], + getMDOrNull(Record[3]), getMDOrNull(Record[4]))), + NextMetadataNo++); break; } case bitc::METADATA_TEMPLATE_TYPE: { if (Record.size() != 3) return error("Invalid record"); - MDValueList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter, - Record[0], - (Context, getMDString(Record[1]), - getMDOrNull(Record[2]))), - NextMDValueNo++); + MetadataList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter, + Record[0], + (Context, getMDString(Record[1]), + getMDOrNull(Record[2]))), + NextMetadataNo++); break; } case bitc::METADATA_TEMPLATE_VALUE: { if (Record.size() != 5) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DITemplateValueParameter, Record[0], (Context, Record[1], getMDString(Record[2]), getMDOrNull(Record[3]), getMDOrNull(Record[4]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_GLOBAL_VAR: { if (Record.size() != 11) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIGlobalVariable, Record[0], (Context, getMDOrNull(Record[1]), getMDString(Record[2]), getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], getMDOrNull(Record[6]), Record[7], Record[8], getMDOrNull(Record[9]), getMDOrNull(Record[10]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_LOCAL_VAR: { // 10th field is for the obsoleted 'inlinedAt:' field. - if (Record.size() != 9 && Record.size() != 10) + if (Record.size() < 8 || Record.size() > 10) return error("Invalid record"); - MDValueList.assignValue( + // 2nd field used to be an artificial tag, either DW_TAG_auto_variable or + // DW_TAG_arg_variable.
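// Aside: the reader cannot call F->setSubprogram() on a function whose body
// has not been materialized yet, so the upgrade above parks the mapping in
// FunctionsWithSPs and finishes the job in materialize() later in this patch.
// A minimal standalone sketch of that defer-until-ready pattern follows; the
// types are toys, not the LLVM API, and only the FunctionsWithSPs and
// setSubprogram names are borrowed from the patch.

#include <cassert>
#include <map>
#include <string>

// Toy stand-ins for llvm::DISubprogram and llvm::Function.
struct Subprogram { std::string Name; };

struct Function {
  bool Materialized = false;
  const Subprogram *SP = nullptr;
  void setSubprogram(const Subprogram *S) { SP = S; }
};

// Upgrades that could not be applied eagerly.
static std::map<Function *, const Subprogram *> FunctionsWithSPs;

// Called while parsing metadata: attach now if the body is already
// available, otherwise remember the attachment for materialization time.
void recordSubprogram(Function &F, const Subprogram &SP) {
  if (!F.Materialized)
    FunctionsWithSPs[&F] = &SP; // defer: body not parsed yet
  else
    F.setSubprogram(&SP);       // apply immediately
}

// Called after a function body is parsed: finish any deferred attachment.
void finishMaterialize(Function &F) {
  F.Materialized = true;
  auto It = FunctionsWithSPs.find(&F);
  if (It != FunctionsWithSPs.end())
    F.setSubprogram(It->second);
}

int main() {
  Function F;
  Subprogram SP{"main"};
  recordSubprogram(F, SP); // deferred: F is not materialized yet
  assert(!F.SP);
  finishMaterialize(F);
  assert(F.SP == &SP);
}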
+ bool HasTag = Record.size() > 8; + MetadataList.assignValue( GET_OR_DISTINCT(DILocalVariable, Record[0], - (Context, Record[1], getMDOrNull(Record[2]), - getMDString(Record[3]), getMDOrNull(Record[4]), - Record[5], getMDOrNull(Record[6]), Record[7], - Record[8])), - NextMDValueNo++); + (Context, getMDOrNull(Record[1 + HasTag]), + getMDString(Record[2 + HasTag]), + getMDOrNull(Record[3 + HasTag]), Record[4 + HasTag], + getMDOrNull(Record[5 + HasTag]), Record[6 + HasTag], + Record[7 + HasTag])), + NextMetadataNo++); break; } case bitc::METADATA_EXPRESSION: { if (Record.size() < 1) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIExpression, Record[0], (Context, makeArrayRef(Record).slice(1))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_OBJC_PROPERTY: { if (Record.size() != 8) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIObjCProperty, Record[0], (Context, getMDString(Record[1]), getMDOrNull(Record[2]), Record[3], getMDString(Record[4]), getMDString(Record[5]), Record[6], getMDOrNull(Record[7]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_IMPORTED_ENTITY: { if (Record.size() != 6) return error("Invalid record"); - MDValueList.assignValue( + MetadataList.assignValue( GET_OR_DISTINCT(DIImportedEntity, Record[0], (Context, Record[1], getMDOrNull(Record[2]), getMDOrNull(Record[3]), Record[4], getMDString(Record[5]))), - NextMDValueNo++); + NextMetadataNo++); break; } case bitc::METADATA_STRING: { std::string String(Record.begin(), Record.end()); llvm::UpgradeMDStringConstant(String); Metadata *MD = MDString::get(Context, String); - MDValueList.assignValue(MD, NextMDValueNo++); + MetadataList.assignValue(MD, NextMetadataNo++); break; } case bitc::METADATA_KIND: { - if (Record.size() < 2) - return error("Invalid record"); + // Support older bitcode files that had METADATA_KIND records in a + // block with METADATA_BLOCK_ID. + if (std::error_code EC = parseMetadataKindRecord(Record)) + return EC; + break; + } + } + } +#undef GET_OR_DISTINCT +} - unsigned Kind = Record[0]; - SmallString<8> Name(Record.begin()+1, Record.end()); +/// Parse the metadata kinds out of the METADATA_KIND_BLOCK. +std::error_code BitcodeReader::parseMetadataKinds() { + if (Stream.EnterSubBlock(bitc::METADATA_KIND_BLOCK_ID)) + return error("Invalid record"); - unsigned NewKind = TheModule->getMDKindID(Name.str()); - if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second) - return error("Conflicting METADATA_KIND records"); + SmallVector<uint64_t, 64> Record; + + // Read all the records. + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. + Record.clear(); + unsigned Code = Stream.readRecord(Entry.ID, Record); + switch (Code) { + default: // Default behavior: ignore. 
+ break; + case bitc::METADATA_KIND: { + if (std::error_code EC = parseMetadataKindRecord(Record)) + return EC; break; } } } -#undef GET_OR_DISTINCT } /// Decode a signed value stored with the sign bit in the LSB for dense VBR @@ -2283,8 +2654,6 @@ std::error_code BitcodeReader::parseConstants() { return error("Invalid record"); Type *EltTy = cast<SequentialType>(CurTy)->getElementType(); - unsigned Size = Record.size(); - if (EltTy->isIntegerTy(8)) { SmallVector<uint8_t, 16> Elts(Record.begin(), Record.end()); if (isa<VectorType>(CurTy)) @@ -2309,21 +2678,24 @@ std::error_code BitcodeReader::parseConstants() { V = ConstantDataVector::get(Context, Elts); else V = ConstantDataArray::get(Context, Elts); + } else if (EltTy->isHalfTy()) { + SmallVector<uint16_t, 16> Elts(Record.begin(), Record.end()); + if (isa<VectorType>(CurTy)) + V = ConstantDataVector::getFP(Context, Elts); + else + V = ConstantDataArray::getFP(Context, Elts); } else if (EltTy->isFloatTy()) { - SmallVector<float, 16> Elts(Size); - std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToFloat); + SmallVector<uint32_t, 16> Elts(Record.begin(), Record.end()); if (isa<VectorType>(CurTy)) - V = ConstantDataVector::get(Context, Elts); + V = ConstantDataVector::getFP(Context, Elts); else - V = ConstantDataArray::get(Context, Elts); + V = ConstantDataArray::getFP(Context, Elts); } else if (EltTy->isDoubleTy()) { - SmallVector<double, 16> Elts(Size); - std::transform(Record.begin(), Record.end(), Elts.begin(), - BitsToDouble); + SmallVector<uint64_t, 16> Elts(Record.begin(), Record.end()); if (isa<VectorType>(CurTy)) - V = ConstantDataVector::get(Context, Elts); + V = ConstantDataVector::getFP(Context, Elts); else - V = ConstantDataArray::get(Context, Elts); + V = ConstantDataArray::getFP(Context, Elts); } else { return error("Invalid type for value"); } @@ -2410,11 +2782,12 @@ std::error_code BitcodeReader::parseConstants() { Type *SelectorTy = Type::getInt1Ty(Context); - // If CurTy is a vector of length n, then Record[0] must be a <n x i1> - // vector. Otherwise, it must be a single bit. + // The selector might be an i1 or an <n x i1> + // Get the type from the ValueList before getting a forward ref. if (VectorType *VTy = dyn_cast<VectorType>(CurTy)) - SelectorTy = VectorType::get(Type::getInt1Ty(Context), - VTy->getNumElements()); + if (Value *V = ValueList[Record[0]]) + if (SelectorTy != V->getType()) + SelectorTy = VectorType::get(SelectorTy, VTy->getNumElements()); V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0], SelectorTy), @@ -2567,9 +2940,6 @@ std::error_code BitcodeReader::parseConstants() { if (!Fn) return error("Invalid record"); - // Don't let Fn get dematerialized. - BlockAddressesTaken.insert(Fn); - // If the function is already parsed we can insert the block address right // away. BasicBlock *BB; @@ -2584,7 +2954,7 @@ std::error_code BitcodeReader::parseConstants() { return error("Invalid ID"); ++BBI; } - BB = BBI; + BB = &*BBI; } else { // Otherwise insert a placeholder and remember it so it can be inserted // when the function is parsed. 
@@ -2652,7 +3022,7 @@ std::error_code BitcodeReader::parseUseLists() { V = ValueList[ID]; unsigned NumUses = 0; SmallDenseMap<const Use *, unsigned, 16> Order; - for (const Use &U : V->uses()) { + for (const Use &U : V->materialized_uses()) { if (++NumUses > Record.size()) break; Order[&U] = Record[NumUses - 1]; @@ -2688,7 +3058,7 @@ std::error_code BitcodeReader::materializeMetadata() { for (uint64_t BitPos : DeferredMetadataInfo) { // Move the bit stream to the saved position. Stream.JumpToBit(BitPos); - if (std::error_code EC = parseMetadata()) + if (std::error_code EC = parseMetadata(true)) return EC; } DeferredMetadataInfo.clear(); @@ -2697,6 +3067,35 @@ std::error_code BitcodeReader::materializeMetadata() { void BitcodeReader::setStripDebugInfo() { StripDebugInfo = true; } +void BitcodeReader::saveMetadataList( + DenseMap<const Metadata *, unsigned> &MetadataToIDs, bool OnlyTempMD) { + for (unsigned ID = 0; ID < MetadataList.size(); ++ID) { + Metadata *MD = MetadataList[ID]; + auto *N = dyn_cast_or_null<MDNode>(MD); + assert((!N || (N->isResolved() || N->isTemporary())) && + "Found non-resolved non-temp MDNode while saving metadata"); + // Save all values if !OnlyTempMD, otherwise just the temporary metadata. + // Note that in the !OnlyTempMD case we need to save all Metadata, not + // just MDNode, as we may have references to other types of module-level + // metadata (e.g. ValueAsMetadata) from instructions. + if (!OnlyTempMD || (N && N->isTemporary())) { + // Will call this after materializing each function, in order to + // handle remapping of the function's instructions/metadata. + // See if we already have an entry in that case. + if (OnlyTempMD && MetadataToIDs.count(MD)) { + assert(MetadataToIDs[MD] == ID && "Inconsistent metadata value id"); + continue; + } + if (N && N->isTemporary()) + // Ensure that we assert if someone tries to RAUW this temporary + // metadata while it is the key of a map. The flag will be set back + // to true when the saved metadata list is destroyed. + N->setCanReplace(false); + MetadataToIDs[MD] = ID; + } + } +} + /// When we see the block for a function body, remember where it is and then /// skip it. This lets us lazily deserialize the functions. std::error_code BitcodeReader::rememberAndSkipFunctionBody() { @@ -2709,6 +3108,9 @@ std::error_code BitcodeReader::rememberAndSkipFunctionBody() { // Save the current stream state. uint64_t CurBit = Stream.GetCurrentBitNo(); + assert( + (DeferredFunctionInfo[Fn] == 0 || DeferredFunctionInfo[Fn] == CurBit) && + "Mismatch between VST and scanned function offsets"); DeferredFunctionInfo[Fn] = CurBit; // Skip over the function block for now. @@ -2741,10 +3143,91 @@ std::error_code BitcodeReader::globalCleanup() { return std::error_code(); } -std::error_code BitcodeReader::parseModule(bool Resume, +/// Support for lazy parsing of function bodies. This is required if we +/// either have an old bitcode file without a VST forward declaration record, +/// or if we have an anonymous function being materialized, since anonymous +/// functions do not have a name and are therefore not in the VST. +std::error_code BitcodeReader::rememberAndSkipFunctionBodies() { + Stream.JumpToBit(NextUnreadBit); + + if (Stream.AtEndOfStream()) + return error("Could not find function in stream"); + + if (!SeenFirstFunctionBody) + return error("Trying to materialize functions before seeing function blocks"); + + // An old bitcode file with the symbol table at the end would have + // finished the parse greedily. 
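// Aside on saveMetadataList above: it snapshots MetadataList into a
// Metadata*-to-ID map so function importing can remap IDs later; with
// OnlyTempMD it records only temporary (forward-declared) nodes and pins
// them against RAUW while they serve as map keys. A toy version of that
// keyed-snapshot pattern in plain C++, with Node standing in for MDNode:

#include <cassert>
#include <map>
#include <vector>

// Toy node: "temporary" nodes are forward declarations that may later be
// replaced; non-temporary nodes are final.
struct Node {
  bool Temporary;
  bool CanReplace = true;
};

// Snapshot the list into Node*->ID, optionally keeping only temporaries.
// Temporary keys are pinned (CanReplace = false), mirroring the
// setCanReplace(false) guard above.
void saveList(const std::vector<Node *> &List,
              std::map<const Node *, unsigned> &NodeToID, bool OnlyTemp) {
  for (unsigned ID = 0; ID < List.size(); ++ID) {
    Node *N = List[ID];
    if (OnlyTemp && !N->Temporary)
      continue;
    auto It = NodeToID.find(N);
    if (It != NodeToID.end()) { // already saved on an earlier pass
      assert(It->second == ID && "Inconsistent ID");
      continue;
    }
    if (N->Temporary)
      N->CanReplace = false; // pin while used as a key
    NodeToID[N] = ID;
  }
}

int main() {
  Node A{false}, B{true};
  std::vector<Node *> List{&A, &B};
  std::map<const Node *, unsigned> M;
  saveList(List, M, /*OnlyTemp=*/true);
  assert(M.size() == 1 && M.at(&B) == 1 && !B.CanReplace);
}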
+ assert(SeenValueSymbolTable); + + SmallVector<uint64_t, 64> Record; + + while (1) { + BitstreamEntry Entry = Stream.advance(); + switch (Entry.Kind) { + default: + return error("Expect SubBlock"); + case BitstreamEntry::SubBlock: + switch (Entry.ID) { + default: + return error("Expect function block"); + case bitc::FUNCTION_BLOCK_ID: + if (std::error_code EC = rememberAndSkipFunctionBody()) + return EC; + NextUnreadBit = Stream.GetCurrentBitNo(); + return std::error_code(); + } + } + } +} + +std::error_code BitcodeReader::parseBitcodeVersion() { + if (Stream.EnterSubBlock(bitc::IDENTIFICATION_BLOCK_ID)) + return error("Invalid record"); + + // Read all the records. + SmallVector<uint64_t, 64> Record; + while (1) { + BitstreamEntry Entry = Stream.advance(); + + switch (Entry.Kind) { + default: + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. + Record.clear(); + unsigned BitCode = Stream.readRecord(Entry.ID, Record); + switch (BitCode) { + default: // Default behavior: reject + return error("Invalid value"); + case bitc::IDENTIFICATION_CODE_STRING: { // IDENTIFICATION: [strchr x + // N] + convertToString(Record, 0, ProducerIdentification); + break; + } + case bitc::IDENTIFICATION_CODE_EPOCH: { // EPOCH: [epoch#] + unsigned epoch = (unsigned)Record[0]; + if (epoch != bitc::BITCODE_CURRENT_EPOCH) { + return error( + Twine("Incompatible epoch: Bitcode '") + Twine(epoch) + + "' vs current: '" + Twine(bitc::BITCODE_CURRENT_EPOCH) + "'"); + } + } + } + } +} + +std::error_code BitcodeReader::parseModule(uint64_t ResumeBit, bool ShouldLazyLoadMetadata) { - if (Resume) - Stream.JumpToBit(NextUnreadBit); + if (ResumeBit) + Stream.JumpToBit(ResumeBit); else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) return error("Invalid record"); @@ -2785,9 +3268,23 @@ std::error_code BitcodeReader::parseModule(bool Resume, return EC; break; case bitc::VALUE_SYMTAB_BLOCK_ID: - if (std::error_code EC = parseValueSymbolTable()) - return EC; - SeenValueSymbolTable = true; + if (!SeenValueSymbolTable) { + // Either this is an old form VST without function index and an + // associated VST forward declaration record (which would have caused + // the VST to be jumped to and parsed before it was encountered + // normally in the stream), or there were no function blocks to + // trigger an earlier parsing of the VST. + assert(VSTOffset == 0 || FunctionsWithBodies.empty()); + if (std::error_code EC = parseValueSymbolTable()) + return EC; + SeenValueSymbolTable = true; + } else { + // We must have had a VST forward declaration record, which caused + // the parser to jump to and parse the VST earlier. 
+ assert(VSTOffset > 0); + if (Stream.SkipBlock()) + return error("Invalid record"); + } break; case bitc::CONSTANTS_BLOCK_ID: if (std::error_code EC = parseConstants()) @@ -2802,7 +3299,11 @@ std::error_code BitcodeReader::parseModule(bool Resume, break; } assert(DeferredMetadataInfo.empty() && "Unexpected deferred metadata"); - if (std::error_code EC = parseMetadata()) + if (std::error_code EC = parseMetadata(true)) + return EC; + break; + case bitc::METADATA_KIND_BLOCK_ID: + if (std::error_code EC = parseMetadataKinds()) return EC; break; case bitc::FUNCTION_BLOCK_ID: @@ -2815,8 +3316,39 @@ std::error_code BitcodeReader::parseModule(bool Resume, SeenFirstFunctionBody = true; } + if (VSTOffset > 0) { + // If we have a VST forward declaration record, make sure we + // parse the VST now if we haven't already. It is needed to + // set up the DeferredFunctionInfo vector for lazy reading. + if (!SeenValueSymbolTable) { + if (std::error_code EC = + BitcodeReader::parseValueSymbolTable(VSTOffset)) + return EC; + SeenValueSymbolTable = true; + // Fall through so that we record the NextUnreadBit below. + // This is necessary in case we have an anonymous function that + // is later materialized. Since it will not have a VST entry we + // need to fall back to the lazy parse to find its offset. + } else { + // If we have a VST forward declaration record, but have already + // parsed the VST (just above, when the first function body was + // encountered here), then we are resuming the parse after + // materializing functions. The ResumeBit points to the + // start of the last function block recorded in the + // DeferredFunctionInfo map. Skip it. + if (Stream.SkipBlock()) + return error("Invalid record"); + continue; + } + } + + // Support older bitcode files that did not have the function + // index in the VST, nor a VST forward declaration record, as + // well as anonymous functions that do not have VST entries. + // Build the DeferredFunctionInfo vector on the fly. if (std::error_code EC = rememberAndSkipFunctionBody()) return EC; + // Suspend parsing when we reach the function bodies. Subsequent // materialization calls will resume it when necessary. If the bitcode // file is old, the symbol table will be at the end instead and will not @@ -2830,6 +3362,10 @@ std::error_code BitcodeReader::parseModule(bool Resume, if (std::error_code EC = parseUseLists()) return EC; break; + case bitc::OPERAND_BUNDLE_TAGS_BLOCK_ID: + if (std::error_code EC = parseOperandBundleTags()) + return EC; + break; } continue; @@ -2840,7 +3376,8 @@ std::error_code BitcodeReader::parseModule(bool Resume, // Read a record. - switch (Stream.readRecord(Entry.ID, Record)) { + auto BitCode = Stream.readRecord(Entry.ID, Record); + switch (BitCode) { default: break; // Default behavior, ignore unknown content. 
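// The VSTOFFSET/DeferredFunctionInfo machinery above reduces to one idea:
// record where each function body starts, skip it, and seek back only when a
// caller asks for that function. A toy index-then-seek reader; apart from the
// Deferred map's role, the record layout and names here are ours, not the
// bitcode format's.

#include <cassert>
#include <map>
#include <string>
#include <vector>

// Toy layout: the "module" is a sequence of (name, body) records; bodies are
// expensive to parse, so the first pass only notes where each one lives.
struct Record { std::string Name, Body; };

struct LazyReader {
  std::vector<Record> Stream;             // stands in for the bitstream
  std::map<std::string, size_t> Deferred; // name -> offset ("bit position")

  // First pass: remember the offset of every body and skip it, as
  // rememberAndSkipFunctionBody does for FUNCTION_BLOCKs.
  void scan() {
    for (size_t I = 0; I < Stream.size(); ++I)
      Deferred[Stream[I].Name] = I;
  }

  // On-demand materialization: jump straight to the saved offset. With a
  // VST forward declaration the offsets arrive up front; without one they
  // are discovered by lazy scanning, but the lookup is the same.
  const std::string &materialize(const std::string &Name) {
    return Stream[Deferred.at(Name)].Body; // JumpToBit analogue
  }
};

int main() {
  LazyReader R;
  R.Stream = {{"f", "body-of-f"}, {"g", "body-of-g"}};
  R.scan();
  assert(R.materialize("g") == "body-of-g");
}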
case bitc::MODULE_CODE_VERSION: { // VERSION: [version#] if (Record.size() < 1) @@ -3012,11 +3549,14 @@ std::error_code BitcodeReader::parseModule(bool Resume, auto *FTy = dyn_cast<FunctionType>(Ty); if (!FTy) return error("Invalid type for value"); + auto CC = static_cast<CallingConv::ID>(Record[1]); + if (CC & ~CallingConv::MaxID) + return error("Invalid calling convention ID"); Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage, "", TheModule); - Func->setCallingConv(static_cast<CallingConv::ID>(Record[1])); + Func->setCallingConv(CC); bool isProto = Record[2]; uint64_t RawLinkage = Record[3]; Func->setLinkage(getDecodedLinkage(RawLinkage)); @@ -3079,35 +3619,51 @@ std::error_code BitcodeReader::parseModule(bool Resume, } break; } - // ALIAS: [alias type, aliasee val#, linkage] - // ALIAS: [alias type, aliasee val#, linkage, visibility, dllstorageclass] - case bitc::MODULE_CODE_ALIAS: { - if (Record.size() < 3) + // ALIAS: [alias type, addrspace, aliasee val#, linkage] + // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass] + case bitc::MODULE_CODE_ALIAS: + case bitc::MODULE_CODE_ALIAS_OLD: { + bool NewRecord = BitCode == bitc::MODULE_CODE_ALIAS; + if (Record.size() < (3 + (unsigned)NewRecord)) return error("Invalid record"); - Type *Ty = getTypeByID(Record[0]); + unsigned OpNum = 0; + Type *Ty = getTypeByID(Record[OpNum++]); if (!Ty) return error("Invalid record"); - auto *PTy = dyn_cast<PointerType>(Ty); - if (!PTy) - return error("Invalid type for value"); - auto *NewGA = - GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule); + unsigned AddrSpace; + if (!NewRecord) { + auto *PTy = dyn_cast<PointerType>(Ty); + if (!PTy) + return error("Invalid type for value"); + Ty = PTy->getElementType(); + AddrSpace = PTy->getAddressSpace(); + } else { + AddrSpace = Record[OpNum++]; + } + + auto Val = Record[OpNum++]; + auto Linkage = Record[OpNum++]; + auto *NewGA = GlobalAlias::create( + Ty, AddrSpace, getDecodedLinkage(Linkage), "", TheModule); // Old bitcode files didn't have visibility field. // Local linkage must have default visibility. - if (Record.size() > 3 && !NewGA->hasLocalLinkage()) - // FIXME: Change to an error if non-default in 4.0. - NewGA->setVisibility(getDecodedVisibility(Record[3])); - if (Record.size() > 4) - NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[4])); + if (OpNum != Record.size()) { + auto VisInd = OpNum++; + if (!NewGA->hasLocalLinkage()) + // FIXME: Change to an error if non-default in 4.0. 
+ NewGA->setVisibility(getDecodedVisibility(Record[VisInd])); + } + if (OpNum != Record.size()) + NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++])); else - upgradeDLLImportExportLinkage(NewGA, Record[2]); - if (Record.size() > 5) - NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[5])); - if (Record.size() > 6) - NewGA->setUnnamedAddr(Record[6]); + upgradeDLLImportExportLinkage(NewGA, Linkage); + if (OpNum != Record.size()) + NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++])); + if (OpNum != Record.size()) + NewGA->setUnnamedAddr(Record[OpNum++]); ValueList.push_back(NewGA); - AliasInits.push_back(std::make_pair(NewGA, Record[1])); + AliasInits.push_back(std::make_pair(NewGA, Val)); break; } /// MODULE_CODE_PURGEVALS: [numvals] @@ -3117,11 +3673,52 @@ std::error_code BitcodeReader::parseModule(bool Resume, return error("Invalid record"); ValueList.shrinkTo(Record[0]); break; + /// MODULE_CODE_VSTOFFSET: [offset] + case bitc::MODULE_CODE_VSTOFFSET: + if (Record.size() < 1) + return error("Invalid record"); + VSTOffset = Record[0]; + break; + /// MODULE_CODE_METADATA_VALUES: [numvals] + case bitc::MODULE_CODE_METADATA_VALUES: + if (Record.size() < 1) + return error("Invalid record"); + assert(!IsMetadataMaterialized); + // This record contains the number of metadata values in the module-level + // METADATA_BLOCK. It is used to support lazy parsing of metadata as + // a postpass, where we will parse function-level metadata first. + // This is needed because the ids of metadata are assigned implicitly + // based on their ordering in the bitcode, with the function-level + // metadata ids starting after the module-level metadata ids. Otherwise, + // we would have to parse the module-level metadata block to prime the + // MetadataList when we are lazy loading metadata during function + // importing. Initialize the MetadataList size here based on the + // record value, regardless of whether we are doing lazy metadata + // loading, so that we have consistent handling and assertion + // checking in parseMetadata for module-level metadata. + NumModuleMDs = Record[0]; + SeenModuleValuesRecord = true; + assert(MetadataList.size() == 0); + MetadataList.resize(NumModuleMDs); + break; } Record.clear(); } } +/// Helper to read the header common to all bitcode files. +static bool hasValidBitcodeHeader(BitstreamCursor &Stream) { + // Sniff for the signature. + if (Stream.Read(8) != 'B' || + Stream.Read(8) != 'C' || + Stream.Read(4) != 0x0 || + Stream.Read(4) != 0xC || + Stream.Read(4) != 0xE || + Stream.Read(4) != 0xD) + return false; + return true; +} + std::error_code BitcodeReader::parseBitcodeInto(std::unique_ptr<DataStreamer> Streamer, Module *M, bool ShouldLazyLoadMetadata) { @@ -3131,12 +3728,7 @@ BitcodeReader::parseBitcodeInto(std::unique_ptr<DataStreamer> Streamer, return EC; // Sniff for the signature. 
- if (Stream.Read(8) != 'B' || - Stream.Read(8) != 'C' || - Stream.Read(4) != 0x0 || - Stream.Read(4) != 0xC || - Stream.Read(4) != 0xE || - Stream.Read(4) != 0xD) + if (!hasValidBitcodeHeader(Stream)) return error("Invalid bitcode signature"); // We expect a number of well-defined blocks, though we don't necessarily @@ -3153,8 +3745,13 @@ BitcodeReader::parseBitcodeInto(std::unique_ptr<DataStreamer> Streamer, if (Entry.Kind != BitstreamEntry::SubBlock) return error("Malformed block"); + if (Entry.ID == bitc::IDENTIFICATION_BLOCK_ID) { + parseBitcodeVersion(); + continue; + } + if (Entry.ID == bitc::MODULE_BLOCK_ID) - return parseModule(false, ShouldLazyLoadMetadata); + return parseModule(0, ShouldLazyLoadMetadata); if (Stream.SkipBlock()) return error("Invalid record"); @@ -3204,12 +3801,7 @@ ErrorOr<std::string> BitcodeReader::parseTriple() { return EC; // Sniff for the signature. - if (Stream.Read(8) != 'B' || - Stream.Read(8) != 'C' || - Stream.Read(4) != 0x0 || - Stream.Read(4) != 0xC || - Stream.Read(4) != 0xE || - Stream.Read(4) != 0xD) + if (!hasValidBitcodeHeader(Stream)) return error("Invalid bitcode signature"); // We expect a number of well-defined blocks, though we don't necessarily @@ -3239,6 +3831,41 @@ ErrorOr<std::string> BitcodeReader::parseTriple() { } } +ErrorOr<std::string> BitcodeReader::parseIdentificationBlock() { + if (std::error_code EC = initStream(nullptr)) + return EC; + + // Sniff for the signature. + if (!hasValidBitcodeHeader(Stream)) + return error("Invalid bitcode signature"); + + // We expect a number of well-defined blocks, though we don't necessarily + // need to understand them all. + while (1) { + BitstreamEntry Entry = Stream.advance(); + switch (Entry.Kind) { + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + + case BitstreamEntry::SubBlock: + if (Entry.ID == bitc::IDENTIFICATION_BLOCK_ID) { + if (std::error_code EC = parseBitcodeVersion()) + return EC; + return ProducerIdentification; + } + // Ignore other sub-blocks. + if (Stream.SkipBlock()) + return error("Malformed block"); + continue; + case BitstreamEntry::Record: + Stream.skipRecord(Entry.ID); + continue; + } + } +} + /// Parse metadata attachments. std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID)) @@ -3274,7 +3901,7 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { auto K = MDKindMap.find(Record[I]); if (K == MDKindMap.end()) return error("Invalid ID"); - Metadata *MD = MDValueList.getValueFwdRef(Record[I + 1]); + Metadata *MD = MetadataList.getValueFwdRef(Record[I + 1]); F.setMetadata(K->second, cast<MDNode>(MD)); } continue; @@ -3288,7 +3915,7 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { MDKindMap.find(Kind); if (I == MDKindMap.end()) return error("Invalid ID"); - Metadata *Node = MDValueList.getValueFwdRef(Record[i + 1]); + Metadata *Node = MetadataList.getValueFwdRef(Record[i + 1]); if (isa<LocalAsMetadata>(Node)) // Drop the attachment. This used to be legal, but there's no // upgrade path. 
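// The newly factored hasValidBitcodeHeader reads two 8-bit characters and
// four 4-bit nibbles. Because the bitstream packs the low nibble of each
// byte first, the check is equivalent to matching the raw bytes
// 'B' 'C' 0xC0 0xDE at the start of the buffer. A sketch of that byte-level
// check; the helper name is ours.

#include <cassert>
#include <cstddef>
#include <cstdint>

bool looksLikeBitcode(const uint8_t *Buf, size_t Size) {
  // 'B', 'C', then nibbles 0x0/0xC and 0xE/0xD packed low-first.
  return Size >= 4 && Buf[0] == 'B' && Buf[1] == 'C' &&
         Buf[2] == 0xC0 && Buf[3] == 0xDE;
}

int main() {
  const uint8_t Good[] = {'B', 'C', 0xC0, 0xDE, 0x35};
  const uint8_t Bad[] = {0x7F, 'E', 'L', 'F'};
  assert(looksLikeBitcode(Good, sizeof(Good)));
  assert(!looksLikeBitcode(Bad, sizeof(Bad)));
}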
@@ -3303,17 +3930,17 @@ std::error_code BitcodeReader::parseMetadataAttachment(Function &F) { } } -static std::error_code typeCheckLoadStoreInst(DiagnosticHandlerFunction DH, - Type *ValType, Type *PtrType) { +static std::error_code typeCheckLoadStoreInst(Type *ValType, Type *PtrType) { + LLVMContext &Context = PtrType->getContext(); if (!isa<PointerType>(PtrType)) - return error(DH, "Load/Store operand is not a pointer type"); + return error(Context, "Load/Store operand is not a pointer type"); Type *ElemType = cast<PointerType>(PtrType)->getElementType(); if (ValType && ValType != ElemType) - return error(DH, "Explicit load/store type does not match pointee type of " - "pointer operand"); + return error(Context, "Explicit load/store type does not match pointee " + "type of pointer operand"); if (!PointerType::isLoadableOrStorableType(ElemType)) - return error(DH, "Cannot load/store from pointer"); + return error(Context, "Cannot load/store from pointer"); return std::error_code(); } @@ -3324,11 +3951,11 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { InstructionList.clear(); unsigned ModuleValueListSize = ValueList.size(); - unsigned ModuleMDValueListSize = MDValueList.size(); + unsigned ModuleMetadataListSize = MetadataList.size(); // Add all the function arguments to the value table. - for(Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) - ValueList.push_back(I); + for (Argument &I : F->args()) + ValueList.push_back(&I); unsigned NextValueNo = ValueList.size(); BasicBlock *CurBB = nullptr; @@ -3344,6 +3971,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { return nullptr; }; + std::vector<OperandBundleDef> OperandBundles; + // Read all the records. SmallVector<uint64_t, 64> Record; while (1) { @@ -3452,8 +4081,10 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { unsigned ScopeID = Record[2], IAID = Record[3]; MDNode *Scope = nullptr, *IA = nullptr; - if (ScopeID) Scope = cast<MDNode>(MDValueList.getValueFwdRef(ScopeID-1)); - if (IAID) IA = cast<MDNode>(MDValueList.getValueFwdRef(IAID-1)); + if (ScopeID) + Scope = cast<MDNode>(MetadataList.getValueFwdRef(ScopeID - 1)); + if (IAID) + IA = cast<MDNode>(MetadataList.getValueFwdRef(IAID - 1)); LastLoc = DebugLoc::get(Line, Col, Scope, IA); I->setDebugLoc(LastLoc); I = nullptr; @@ -3515,7 +4146,10 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { CurBB->getInstList().push_back(Temp); } } else { - I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy); + auto CastOp = (Instruction::CastOps)Opc; + if (!CastInst::castIsValid(CastOp, Op, ResTy)) + return error("Invalid cast"); + I = CastInst::Create(CastOp, Op, ResTy); } InstructionList.push_back(I); break; @@ -3811,6 +4445,110 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { } break; } + case bitc::FUNC_CODE_INST_CLEANUPRET: { // CLEANUPRET: [val] or [val,bb#] + if (Record.size() != 1 && Record.size() != 2) + return error("Invalid record"); + unsigned Idx = 0; + Value *CleanupPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + if (!CleanupPad) + return error("Invalid record"); + BasicBlock *UnwindDest = nullptr; + if (Record.size() == 2) { + UnwindDest = getBasicBlock(Record[Idx++]); + if (!UnwindDest) + return error("Invalid record"); + } + + I = CleanupReturnInst::Create(CleanupPad, UnwindDest); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_CATCHRET: { // CATCHRET: [val,bb#] + if (Record.size() != 2) + return error("Invalid 
record"); + unsigned Idx = 0; + Value *CatchPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + if (!CatchPad) + return error("Invalid record"); + BasicBlock *BB = getBasicBlock(Record[Idx++]); + if (!BB) + return error("Invalid record"); + + I = CatchReturnInst::Create(CatchPad, BB); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_CATCHSWITCH: { // CATCHSWITCH: [tok,num,(bb)*,bb?] + // We must have, at minimum, the outer scope and the number of arguments. + if (Record.size() < 2) + return error("Invalid record"); + + unsigned Idx = 0; + + Value *ParentPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + + unsigned NumHandlers = Record[Idx++]; + + SmallVector<BasicBlock *, 2> Handlers; + for (unsigned Op = 0; Op != NumHandlers; ++Op) { + BasicBlock *BB = getBasicBlock(Record[Idx++]); + if (!BB) + return error("Invalid record"); + Handlers.push_back(BB); + } + + BasicBlock *UnwindDest = nullptr; + if (Idx + 1 == Record.size()) { + UnwindDest = getBasicBlock(Record[Idx++]); + if (!UnwindDest) + return error("Invalid record"); + } + + if (Record.size() != Idx) + return error("Invalid record"); + + auto *CatchSwitch = + CatchSwitchInst::Create(ParentPad, UnwindDest, NumHandlers); + for (BasicBlock *Handler : Handlers) + CatchSwitch->addHandler(Handler); + I = CatchSwitch; + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_CATCHPAD: + case bitc::FUNC_CODE_INST_CLEANUPPAD: { // [tok,num,(ty,val)*] + // We must have, at minimum, the outer scope and the number of arguments. + if (Record.size() < 2) + return error("Invalid record"); + + unsigned Idx = 0; + + Value *ParentPad = + getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + + unsigned NumArgOperands = Record[Idx++]; + + SmallVector<Value *, 2> Args; + for (unsigned Op = 0; Op != NumArgOperands; ++Op) { + Value *Val; + if (getValueTypePair(Record, Idx, NextValueNo, Val)) + return error("Invalid record"); + Args.push_back(Val); + } + + if (Record.size() != Idx) + return error("Invalid record"); + + if (BitCode == bitc::FUNC_CODE_INST_CLEANUPPAD) + I = CleanupPadInst::Create(ParentPad, Args); + else + I = CatchPadInst::Create(ParentPad, Args); + InstructionList.push_back(I); + break; + } case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...] // Check magic if ((Record[0] >> 16) == SWITCH_INST_MAGIC) { @@ -3973,10 +4711,11 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { } } - I = InvokeInst::Create(Callee, NormalBB, UnwindBB, Ops); + I = InvokeInst::Create(Callee, NormalBB, UnwindBB, Ops, OperandBundles); + OperandBundles.clear(); InstructionList.push_back(I); - cast<InvokeInst>(I) - ->setCallingConv(static_cast<CallingConv::ID>(~(1U << 13) & CCInfo)); + cast<InvokeInst>(I)->setCallingConv( + static_cast<CallingConv::ID>(CallingConv::MaxID & CCInfo)); cast<InvokeInst>(I)->setAttributes(PAL); break; } @@ -4081,6 +4820,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { uint64_t AlignRecord = Record[3]; const uint64_t InAllocaMask = uint64_t(1) << 5; const uint64_t ExplicitTypeMask = uint64_t(1) << 6; + // Reserve bit 7 for SwiftError flag. 
+ // const uint64_t SwiftErrorMask = uint64_t(1) << 7; const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask; bool InAlloca = AlignRecord & InAllocaMask; Type *Ty = getTypeByID(Record[0]); @@ -4115,8 +4856,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 3 == Record.size()) Ty = getTypeByID(Record[OpNum++]); - if (std::error_code EC = - typeCheckLoadStoreInst(DiagnosticHandler, Ty, Op->getType())) + if (std::error_code EC = typeCheckLoadStoreInst(Ty, Op->getType())) return EC; if (!Ty) Ty = cast<PointerType>(Op->getType())->getElementType(); @@ -4140,8 +4880,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 5 == Record.size()) Ty = getTypeByID(Record[OpNum++]); - if (std::error_code EC = - typeCheckLoadStoreInst(DiagnosticHandler, Ty, Op->getType())) + if (std::error_code EC = typeCheckLoadStoreInst(Ty, Op->getType())) return EC; if (!Ty) Ty = cast<PointerType>(Op->getType())->getElementType(); @@ -4175,8 +4914,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { OpNum + 2 != Record.size()) return error("Invalid record"); - if (std::error_code EC = typeCheckLoadStoreInst( - DiagnosticHandler, Val->getType(), Ptr->getType())) + if (std::error_code EC = + typeCheckLoadStoreInst(Val->getType(), Ptr->getType())) return EC; unsigned Align; if (std::error_code EC = parseAlignmentValue(Record[OpNum], Align)) @@ -4199,8 +4938,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { OpNum + 4 != Record.size()) return error("Invalid record"); - if (std::error_code EC = typeCheckLoadStoreInst( - DiagnosticHandler, Val->getType(), Ptr->getType())) + if (std::error_code EC = + typeCheckLoadStoreInst(Val->getType(), Ptr->getType())) return EC; AtomicOrdering Ordering = getDecodedOrdering(Record[OpNum + 2]); if (Ordering == NotAtomic || Ordering == Acquire || @@ -4237,8 +4976,8 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { return error("Invalid record"); SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 2]); - if (std::error_code EC = typeCheckLoadStoreInst( - DiagnosticHandler, Cmp->getType(), Ptr->getType())) + if (std::error_code EC = + typeCheckLoadStoreInst(Cmp->getType(), Ptr->getType())) return EC; AtomicOrdering FailureOrdering; if (Record.size() < 7) @@ -4299,7 +5038,7 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { break; } case bitc::FUNC_CODE_INST_CALL: { - // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...] + // CALL: [paramattrs, cc, fmf, fnty, fnid, arg0, arg1...] 
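// Several records in this file pack independent flags into one word: the
// alloca record above reserves bits 5, 6 and (now) 7 of its alignment
// operand for inalloca, explicit-type and future swifterror markers, and the
// CCInfo word decoded just below packs tail-call kind, explicit-type and FMF
// bits around the calling convention. A sketch of that pack/unpack
// discipline; the bit positions mirror the alloca masks above, everything
// else is illustrative.

#include <cassert>
#include <cstdint>

constexpr uint64_t InAllocaMask = uint64_t(1) << 5;
constexpr uint64_t ExplicitTypeMask = uint64_t(1) << 6;
constexpr uint64_t SwiftErrorMask = uint64_t(1) << 7; // reserved above
constexpr uint64_t AlignMask = (uint64_t(1) << 5) - 1;

uint64_t pack(unsigned Log2Align, bool InAlloca, bool ExplicitType) {
  return (Log2Align & AlignMask) | (InAlloca ? InAllocaMask : 0) |
         (ExplicitType ? ExplicitTypeMask : 0);
}

void unpack(uint64_t Word, unsigned &Log2Align, bool &InAlloca,
            bool &ExplicitType) {
  Log2Align = Word & AlignMask;
  InAlloca = Word & InAllocaMask;
  ExplicitType = Word & ExplicitTypeMask;
  // A careful reader leaves room to reject bits it does not understand:
  assert(!(Word & SwiftErrorMask) && "reserved bit set");
}

int main() {
  unsigned A; bool IA, ET;
  unpack(pack(4, true, false), A, IA, ET);
  assert(A == 4 && IA && !ET); // 2^4 = 16-byte alignment round-trips
}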
if (Record.size() < 3) return error("Invalid record"); @@ -4307,8 +5046,15 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { AttributeSet PAL = getAttributes(Record[OpNum++]); unsigned CCInfo = Record[OpNum++]; + FastMathFlags FMF; + if ((CCInfo >> bitc::CALL_FMF) & 1) { + FMF = getDecodedFastMathFlags(Record[OpNum++]); + if (!FMF.any()) + return error("Fast math flags indicator set for call with no FMF"); + } + FunctionType *FTy = nullptr; - if (CCInfo >> 15 & 1 && + if (CCInfo >> bitc::CALL_EXPLICIT_TYPE & 1 && !(FTy = dyn_cast<FunctionType>(getTypeByID(Record[OpNum++])))) return error("Explicit call type is not a function type"); @@ -4354,17 +5100,26 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { } } - I = CallInst::Create(FTy, Callee, Args); + I = CallInst::Create(FTy, Callee, Args, OperandBundles); + OperandBundles.clear(); InstructionList.push_back(I); cast<CallInst>(I)->setCallingConv( - static_cast<CallingConv::ID>((~(1U << 14) & CCInfo) >> 1)); + static_cast<CallingConv::ID>((0x7ff & CCInfo) >> bitc::CALL_CCONV)); CallInst::TailCallKind TCK = CallInst::TCK_None; - if (CCInfo & 1) + if (CCInfo & 1 << bitc::CALL_TAIL) TCK = CallInst::TCK_Tail; - if (CCInfo & (1 << 14)) + if (CCInfo & (1 << bitc::CALL_MUSTTAIL)) TCK = CallInst::TCK_MustTail; + if (CCInfo & (1 << bitc::CALL_NOTAIL)) + TCK = CallInst::TCK_NoTail; cast<CallInst>(I)->setTailCallKind(TCK); cast<CallInst>(I)->setAttributes(PAL); + if (FMF.any()) { + if (!isa<FPMathOperator>(I)) + return error("Fast-math-flags specified for call without " + "floating-point scalar or vector return type"); + I->setFastMathFlags(FMF); + } break; } case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty] @@ -4379,6 +5134,28 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { InstructionList.push_back(I); break; } + + case bitc::FUNC_CODE_OPERAND_BUNDLE: { + // A call or an invoke can be optionally prefixed with some variable + // number of operand bundle blocks. These blocks are read into + // OperandBundles and consumed at the next call or invoke instruction. + + if (Record.size() < 1 || Record[0] >= BundleTags.size()) + return error("Invalid record"); + + std::vector<Value *> Inputs; + + unsigned OpNum = 1; + while (OpNum != Record.size()) { + Value *Op; + if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + return error("Invalid record"); + Inputs.push_back(Op); + } + + OperandBundles.emplace_back(BundleTags[Record[0]], std::move(Inputs)); + continue; + } } // Add instruction to end of current BB. If there is no current BB, reject @@ -4387,6 +5164,10 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { delete I; return error("Invalid instruction with no BB"); } + if (!OperandBundles.empty()) { + delete I; + return error("Operand bundles found with no consumer"); + } CurBB->getInstList().push_back(I); // If this was a terminator instruction, move to the next block. @@ -4402,6 +5183,9 @@ std::error_code BitcodeReader::parseFunctionBody(Function *F) { OutOfRecordLoop: + if (!OperandBundles.empty()) + return error("Operand bundles found with no consumer"); + // Check the function list for unresolved values. if (Argument *A = dyn_cast<Argument>(ValueList.back())) { if (!A->getParent()) { @@ -4421,7 +5205,7 @@ OutOfRecordLoop: // Trim the value list down to the size it was before we parsed this function. 
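// The FUNC_CODE_OPERAND_BUNDLE records above accumulate into OperandBundles
// and are consumed by the next call or invoke; anything left over is
// rejected ("Operand bundles found with no consumer"). A toy of that
// accumulate-then-consume protocol; the types are stand-ins for
// OperandBundleDef, not the LLVM classes.

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy bundle: a tag plus its inputs.
struct Bundle {
  std::string Tag;
  std::vector<int> Inputs;
};

struct Reader {
  std::vector<Bundle> Pending; // filled by bundle records

  void readBundleRecord(std::string Tag, std::vector<int> Inputs) {
    Pending.push_back(Bundle{std::move(Tag), std::move(Inputs)});
  }

  // The next call/invoke takes every pending bundle with it.
  std::vector<Bundle> readCallRecord() {
    return std::exchange(Pending, {});
  }

  // At scope end, unconsumed bundles are a malformed-input error.
  bool finish() const { return Pending.empty(); }
};

int main() {
  Reader R;
  R.readBundleRecord("deopt", {1, 2, 3});
  auto Bundles = R.readCallRecord(); // the call consumes the bundle
  assert(Bundles.size() == 1 && Bundles[0].Tag == "deopt");
  assert(R.finish());                // nothing left dangling
}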
ValueList.shrinkTo(ModuleValueListSize); - MDValueList.shrinkTo(ModuleMDValueListSize); + MetadataList.shrinkTo(ModuleMetadataListSize); std::vector<BasicBlock*>().swap(FunctionBBs); return std::error_code(); } @@ -4431,11 +5215,14 @@ std::error_code BitcodeReader::findFunctionInStream( Function *F, DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator) { while (DeferredFunctionInfoIterator->second == 0) { - if (Stream.AtEndOfStream()) - return error("Could not find function in stream"); - // ParseModule will parse the next body in the stream and set its - // position in the DeferredFunctionInfo map. - if (std::error_code EC = parseModule(true)) + // This is the fallback handling for the old format bitcode that + // didn't contain the function index in the VST, or when we have + // an anonymous function which would not have a VST entry. + // Assert that we have one of those two cases. + assert(VSTOffset == 0 || !F->hasName()); + // Parse the next body in the stream and set its position in the + // DeferredFunctionInfo map. + if (std::error_code EC = rememberAndSkipFunctionBodies()) return EC; } return std::error_code(); @@ -4448,8 +5235,12 @@ std::error_code BitcodeReader::findFunctionInStream( void BitcodeReader::releaseBuffer() { Buffer.release(); } std::error_code BitcodeReader::materialize(GlobalValue *GV) { - if (std::error_code EC = materializeMetadata()) - return EC; + // In older bitcode we must materialize the metadata before parsing + // any functions, in order to set up the MetadataList properly. + if (!SeenModuleValuesRecord) { + if (std::error_code EC = materializeMetadata()) + return EC; + } Function *F = dyn_cast<Function>(GV); // If it's not a function or is already material, ignore the request. @@ -4476,7 +5267,8 @@ std::error_code BitcodeReader::materialize(GlobalValue *GV) { // Upgrade any old intrinsic calls in the function. for (auto &I : UpgradedIntrinsics) { - for (auto UI = I.first->user_begin(), UE = I.first->user_end(); UI != UE;) { + for (auto UI = I.first->materialized_user_begin(), UE = I.first->user_end(); + UI != UE;) { User *U = *UI; ++UI; if (CallInst *CI = dyn_cast<CallInst>(U)) @@ -4484,41 +5276,16 @@ std::error_code BitcodeReader::materialize(GlobalValue *GV) { } } + // Finish fn->subprogram upgrade for materialized functions. + if (DISubprogram *SP = FunctionsWithSPs.lookup(F)) + F->setSubprogram(SP); + // Bring in any functions that this function forward-referenced via // blockaddresses. return materializeForwardReferencedFunctions(); } -bool BitcodeReader::isDematerializable(const GlobalValue *GV) const { - const Function *F = dyn_cast<Function>(GV); - if (!F || F->isDeclaration()) - return false; - - // Dematerializing F would leave dangling references that wouldn't be - // reconnected on re-materialization. - if (BlockAddressesTaken.count(F)) - return false; - - return DeferredFunctionInfo.count(const_cast<Function*>(F)); -} - -void BitcodeReader::dematerialize(GlobalValue *GV) { - Function *F = dyn_cast<Function>(GV); - // If this function isn't dematerializable, this is a noop. - if (!F || !isDematerializable(F)) - return; - - assert(DeferredFunctionInfo.count(F) && "No info to read function later?"); - - // Just forget the function body, we can remat it later. 
- F->dropAllReferences(); - F->setIsMaterializable(true); -} - -std::error_code BitcodeReader::materializeModule(Module *M) { - assert(M == TheModule && - "Can only Materialize the Module this BitcodeReader is attached to."); - +std::error_code BitcodeReader::materializeModule() { if (std::error_code EC = materializeMetadata()) return EC; @@ -4527,16 +5294,16 @@ std::error_code BitcodeReader::materializeModule(Module *M) { // Iterate over the module, deserializing any functions that are still on // disk. - for (Module::iterator F = TheModule->begin(), E = TheModule->end(); - F != E; ++F) { - if (std::error_code EC = materialize(F)) + for (Function &F : *TheModule) { + if (std::error_code EC = materialize(&F)) return EC; } - // At this point, if there are any function bodies, the current bit is - // pointing to the END_BLOCK record after them. Now make sure the rest - // of the bits in the module have been read. - if (NextUnreadBit) - parseModule(true); + // At this point, if there are any function bodies, parse the rest of + // the bits in the module past the last function block we have recorded + // through either lazy scanning or the VST. + if (LastFunctionBlockBit || NextUnreadBit) + parseModule(LastFunctionBlockBit > NextUnreadBit ? LastFunctionBlockBit + : NextUnreadBit); // Check that all block address forward references got resolved (as we // promised above). @@ -4561,7 +5328,7 @@ std::error_code BitcodeReader::materializeModule(Module *M) { for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++) UpgradeInstWithTBAATag(InstsWithTBAATag[I]); - UpgradeDebugInfo(*M); + UpgradeDebugInfo(*TheModule); return std::error_code(); } @@ -4622,6 +5389,416 @@ BitcodeReader::initLazyStream(std::unique_ptr<DataStreamer> Streamer) { return std::error_code(); } +std::error_code FunctionIndexBitcodeReader::error(BitcodeError E, + const Twine &Message) { + return ::error(DiagnosticHandler, make_error_code(E), Message); +} + +std::error_code FunctionIndexBitcodeReader::error(const Twine &Message) { + return ::error(DiagnosticHandler, + make_error_code(BitcodeError::CorruptedBitcode), Message); +} + +std::error_code FunctionIndexBitcodeReader::error(BitcodeError E) { + return ::error(DiagnosticHandler, make_error_code(E)); +} + +FunctionIndexBitcodeReader::FunctionIndexBitcodeReader( + MemoryBuffer *Buffer, DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy, bool CheckFuncSummaryPresenceOnly) + : DiagnosticHandler(DiagnosticHandler), Buffer(Buffer), IsLazy(IsLazy), + CheckFuncSummaryPresenceOnly(CheckFuncSummaryPresenceOnly) {} + +FunctionIndexBitcodeReader::FunctionIndexBitcodeReader( + DiagnosticHandlerFunction DiagnosticHandler, bool IsLazy, + bool CheckFuncSummaryPresenceOnly) + : DiagnosticHandler(DiagnosticHandler), Buffer(nullptr), IsLazy(IsLazy), + CheckFuncSummaryPresenceOnly(CheckFuncSummaryPresenceOnly) {} + +void FunctionIndexBitcodeReader::freeState() { Buffer = nullptr; } + +void FunctionIndexBitcodeReader::releaseBuffer() { Buffer.release(); } + +// Specialized value symbol table parser used when reading function index +// blocks where we don't actually create global values. +// At the end of this routine the function index is populated with a map +// from function name to FunctionInfo. The function info contains +// the function block's bitcode offset as well as the offset into the +// function summary section. 
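// The FunctionIndexBitcodeReader introduced here builds exactly the map its
// comment describes: function name -> FunctionInfo, each entry holding a
// bitcode offset plus, when summaries are parsed eagerly, a FunctionSummary
// (e.g. an instruction count) that ThinLTO importing can consult without
// loading the whole module. A toy of that index shape; the types below are
// stand-ins, not the LLVM classes.

#include <cassert>
#include <cstdint>
#include <map>
#include <memory>
#include <string>

// Toy per-function summary, like the fields read from FS_CODE_PERMODULE_ENTRY.
struct Summary {
  unsigned InstCount;
  std::string ModulePath;
};

// Toy FunctionInfo: where the body lives, plus an optional summary.
struct FuncInfo {
  uint64_t BitcodeOffset;
  std::unique_ptr<Summary> Sum; // null when parsing lazily
};

int main() {
  std::map<std::string, FuncInfo> Index; // name -> info, as in the VST parse
  FuncInfo FI{/*offset*/ 4096, std::make_unique<Summary>(Summary{42, "a.o"})};
  Index.emplace("foo", std::move(FI));

  // An importer can now rank candidates by summary without reading a.o:
  const FuncInfo &Foo = Index.at("foo");
  assert(Foo.Sum && Foo.Sum->InstCount == 42 && Foo.BitcodeOffset == 4096);
}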
+std::error_code FunctionIndexBitcodeReader::parseValueSymbolTable() { + if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID)) + return error("Invalid record"); + + SmallVector<uint64_t, 64> Record; + + // Read all the records for this value table. + SmallString<128> ValueName; + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. + Record.clear(); + switch (Stream.readRecord(Entry.ID, Record)) { + default: // Default behavior: ignore (e.g. VST_CODE_BBENTRY records). + break; + case bitc::VST_CODE_FNENTRY: { + // VST_FNENTRY: [valueid, offset, namechar x N] + if (convertToString(Record, 2, ValueName)) + return error("Invalid record"); + unsigned ValueID = Record[0]; + uint64_t FuncOffset = Record[1]; + std::unique_ptr<FunctionInfo> FuncInfo = + llvm::make_unique<FunctionInfo>(FuncOffset); + if (foundFuncSummary() && !IsLazy) { + DenseMap<uint64_t, std::unique_ptr<FunctionSummary>>::iterator SMI = + SummaryMap.find(ValueID); + assert(SMI != SummaryMap.end() && "Summary info not found"); + FuncInfo->setFunctionSummary(std::move(SMI->second)); + } + TheIndex->addFunctionInfo(ValueName, std::move(FuncInfo)); + + ValueName.clear(); + break; + } + case bitc::VST_CODE_COMBINED_FNENTRY: { + // VST_COMBINED_FNENTRY: [offset, namechar x N] + if (convertToString(Record, 1, ValueName)) + return error("Invalid record"); + uint64_t FuncSummaryOffset = Record[0]; + std::unique_ptr<FunctionInfo> FuncInfo = + llvm::make_unique<FunctionInfo>(FuncSummaryOffset); + if (foundFuncSummary() && !IsLazy) { + DenseMap<uint64_t, std::unique_ptr<FunctionSummary>>::iterator SMI = + SummaryMap.find(FuncSummaryOffset); + assert(SMI != SummaryMap.end() && "Summary info not found"); + FuncInfo->setFunctionSummary(std::move(SMI->second)); + } + TheIndex->addFunctionInfo(ValueName, std::move(FuncInfo)); + + ValueName.clear(); + break; + } + } + } +} + +// Parse just the blocks needed for function index building out of the module. +// At the end of this routine the function Index is populated with a map +// from function name to FunctionInfo. The function info contains +// either the parsed function summary information (when parsing summaries +// eagerly), or just the function summary record's offset +// if parsing lazily (IsLazy). +std::error_code FunctionIndexBitcodeReader::parseModule() { + if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID)) + return error("Invalid record"); + + // Read the function index for this module. + while (1) { + BitstreamEntry Entry = Stream.advance(); + + switch (Entry.Kind) { + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + + case BitstreamEntry::SubBlock: + if (CheckFuncSummaryPresenceOnly) { + if (Entry.ID == bitc::FUNCTION_SUMMARY_BLOCK_ID) { + SeenFuncSummary = true; + // No need to parse the rest since we found the summary. + return std::error_code(); + } + if (Stream.SkipBlock()) + return error("Invalid record"); + continue; + } + switch (Entry.ID) { + default: // Skip unknown content. + if (Stream.SkipBlock()) + return error("Invalid record"); + break; + case bitc::BLOCKINFO_BLOCK_ID: + // Need to parse these to get abbrev ids (e.g. 
for VST) + if (Stream.ReadBlockInfoBlock()) + return error("Malformed block"); + break; + case bitc::VALUE_SYMTAB_BLOCK_ID: + if (std::error_code EC = parseValueSymbolTable()) + return EC; + break; + case bitc::FUNCTION_SUMMARY_BLOCK_ID: + SeenFuncSummary = true; + if (IsLazy) { + // Lazy parsing of summary info, skip it. + if (Stream.SkipBlock()) + return error("Invalid record"); + } else if (std::error_code EC = parseEntireSummary()) + return EC; + break; + case bitc::MODULE_STRTAB_BLOCK_ID: + if (std::error_code EC = parseModuleStringTable()) + return EC; + break; + } + continue; + + case BitstreamEntry::Record: + Stream.skipRecord(Entry.ID); + continue; + } + } +} + +// Eagerly parse the entire function summary block (i.e. for all functions +// in the index). This populates the FunctionSummary objects in +// the index. +std::error_code FunctionIndexBitcodeReader::parseEntireSummary() { + if (Stream.EnterSubBlock(bitc::FUNCTION_SUMMARY_BLOCK_ID)) + return error("Invalid record"); + + SmallVector<uint64_t, 64> Record; + + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. The record format depends on whether this + // is a per-module index or a combined index file. In the per-module + // case the records contain the associated value's ID for correlation + // with VST entries. In the combined index the correlation is done + // via the bitcode offset of the summary records (which were saved + // in the combined index VST entries). The records also contain + // information used for ThinLTO renaming and importing. + Record.clear(); + uint64_t CurRecordBit = Stream.GetCurrentBitNo(); + switch (Stream.readRecord(Entry.ID, Record)) { + default: // Default behavior: ignore. + break; + // FS_PERMODULE_ENTRY: [valueid, islocal, instcount] + case bitc::FS_CODE_PERMODULE_ENTRY: { + unsigned ValueID = Record[0]; + bool IsLocal = Record[1]; + unsigned InstCount = Record[2]; + std::unique_ptr<FunctionSummary> FS = + llvm::make_unique<FunctionSummary>(InstCount); + FS->setLocalFunction(IsLocal); + // The module path string ref set in the summary must be owned by the + // index's module string table. Since we don't have a module path + // string table section in the per-module index, we create a single + // module path string table entry with an empty (0) ID to take + // ownership. + FS->setModulePath( + TheIndex->addModulePath(Buffer->getBufferIdentifier(), 0)); + SummaryMap[ValueID] = std::move(FS); + break; + } + // FS_COMBINED_ENTRY: [modid, instcount] + case bitc::FS_CODE_COMBINED_ENTRY: { + uint64_t ModuleId = Record[0]; + unsigned InstCount = Record[1]; + std::unique_ptr<FunctionSummary> FS = + llvm::make_unique<FunctionSummary>(InstCount); + FS->setModulePath(ModuleIdMap[ModuleId]); + SummaryMap[CurRecordBit] = std::move(FS); + break; + } + } + } + llvm_unreachable("Exit infinite loop"); +} + +// Parse the module string table block into the Index. +// This populates the ModulePathStringTable map in the index. 
+std::error_code FunctionIndexBitcodeReader::parseModuleStringTable() { + if (Stream.EnterSubBlock(bitc::MODULE_STRTAB_BLOCK_ID)) + return error("Invalid record"); + + SmallVector<uint64_t, 64> Record; + + SmallString<128> ModulePath; + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: + return std::error_code(); + case BitstreamEntry::Record: + // The interesting case. + break; + } + + Record.clear(); + switch (Stream.readRecord(Entry.ID, Record)) { + default: // Default behavior: ignore. + break; + case bitc::MST_CODE_ENTRY: { + // MST_ENTRY: [modid, namechar x N] + if (convertToString(Record, 1, ModulePath)) + return error("Invalid record"); + uint64_t ModuleId = Record[0]; + StringRef ModulePathInMap = TheIndex->addModulePath(ModulePath, ModuleId); + ModuleIdMap[ModuleId] = ModulePathInMap; + ModulePath.clear(); + break; + } + } + } + llvm_unreachable("Exit infinite loop"); +} + +// Parse the function info index from the bitcode streamer into the given index. +std::error_code FunctionIndexBitcodeReader::parseSummaryIndexInto( + std::unique_ptr<DataStreamer> Streamer, FunctionInfoIndex *I) { + TheIndex = I; + + if (std::error_code EC = initStream(std::move(Streamer))) + return EC; + + // Sniff for the signature. + if (!hasValidBitcodeHeader(Stream)) + return error("Invalid bitcode signature"); + + // We expect a number of well-defined blocks, though we don't necessarily + // need to understand them all. + while (1) { + if (Stream.AtEndOfStream()) { + // We didn't really read a proper Module block. + return error("Malformed block"); + } + + BitstreamEntry Entry = + Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs); + + if (Entry.Kind != BitstreamEntry::SubBlock) + return error("Malformed block"); + + // If we see a MODULE_BLOCK, parse it to find the blocks needed for + // building the function summary index. + if (Entry.ID == bitc::MODULE_BLOCK_ID) + return parseModule(); + + if (Stream.SkipBlock()) + return error("Invalid record"); + } +} + +// Parse the function information at the given offset in the buffer into +// the index. Used to support lazy parsing of function summaries from the +// combined index during importing. +// TODO: This function is not yet complete as it won't have a consumer +// until ThinLTO function importing is added. +std::error_code FunctionIndexBitcodeReader::parseFunctionSummary( + std::unique_ptr<DataStreamer> Streamer, FunctionInfoIndex *I, + size_t FunctionSummaryOffset) { + TheIndex = I; + + if (std::error_code EC = initStream(std::move(Streamer))) + return EC; + + // Sniff for the signature. + if (!hasValidBitcodeHeader(Stream)) + return error("Invalid bitcode signature"); + + Stream.JumpToBit(FunctionSummaryOffset); + + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + default: + return error("Malformed block"); + case BitstreamEntry::Record: + // The expected case. + break; + } + + // TODO: Read a record. This interface will be completed when ThinLTO + // importing is added so that it can be tested. 
+ SmallVector<uint64_t, 64> Record; + switch (Stream.readRecord(Entry.ID, Record)) { + case bitc::FS_CODE_COMBINED_ENTRY: + default: + return error("Invalid record"); + } + + return std::error_code(); +} + +std::error_code +FunctionIndexBitcodeReader::initStream(std::unique_ptr<DataStreamer> Streamer) { + if (Streamer) + return initLazyStream(std::move(Streamer)); + return initStreamFromBuffer(); +} + +std::error_code FunctionIndexBitcodeReader::initStreamFromBuffer() { + const unsigned char *BufPtr = (const unsigned char *)Buffer->getBufferStart(); + const unsigned char *BufEnd = BufPtr + Buffer->getBufferSize(); + + if (Buffer->getBufferSize() & 3) + return error("Invalid bitcode signature"); + + // If we have a wrapper header, parse it and ignore the non-bc file contents. + // The magic number is 0x0B17C0DE stored in little endian. + if (isBitcodeWrapper(BufPtr, BufEnd)) + if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true)) + return error("Invalid bitcode wrapper header"); + + StreamFile.reset(new BitstreamReader(BufPtr, BufEnd)); + Stream.init(&*StreamFile); + + return std::error_code(); +} + +std::error_code FunctionIndexBitcodeReader::initLazyStream( + std::unique_ptr<DataStreamer> Streamer) { + // Check and strip off the bitcode wrapper; BitstreamReader expects never to + // see it. + auto OwnedBytes = + llvm::make_unique<StreamingMemoryObject>(std::move(Streamer)); + StreamingMemoryObject &Bytes = *OwnedBytes; + StreamFile = llvm::make_unique<BitstreamReader>(std::move(OwnedBytes)); + Stream.init(&*StreamFile); + + unsigned char buf[16]; + if (Bytes.readBytes(buf, 16, 0) != 16) + return error("Invalid bitcode signature"); + + if (!isBitcode(buf, buf + 16)) + return error("Invalid bitcode signature"); + + if (isBitcodeWrapper(buf, buf + 4)) { + const unsigned char *bitcodeStart = buf; + const unsigned char *bitcodeEnd = buf + 16; + SkipBitcodeWrapperHeader(bitcodeStart, bitcodeEnd, false); + Bytes.dropLeadingBytes(bitcodeStart - buf); + Bytes.setKnownObjectSize(bitcodeEnd - bitcodeStart); + } + return std::error_code(); +} + namespace { class BitcodeErrorCategoryType : public std::error_category { const char *name() const LLVM_NOEXCEPT override { @@ -4669,7 +5846,7 @@ getBitcodeModuleImpl(std::unique_ptr<DataStreamer> Streamer, StringRef Name, if (MaterializeAll) { // Read in the entire module, and destroy the BitcodeReader. - if (std::error_code EC = M->materializeAllPermanently()) + if (std::error_code EC = M->materializeAll()) return cleanupOnError(EC); } else { // Resolve forward references from blockaddresses. 
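Both initStream variants above have to cope with the Darwin wrapper around raw bitcode. As the writer-side comment later in this diff notes, the wrapper magic 0x0B17C0DE is stored little-endian, so detection amounts to a four-byte compare. A hedged sketch of that check follows; checkWrapper is a hypothetical name, and LLVM's real entry point is the isBitcodeWrapper() call used above.

static bool checkWrapper(const unsigned char *BufPtr,
                         const unsigned char *BufEnd) {
  // 0x0B17C0DE stored little-endian: DE C0 17 0B in memory.
  return BufEnd - BufPtr >= 4 && BufPtr[0] == 0xDE && BufPtr[1] == 0xC0 &&
         BufPtr[2] == 0x17 && BufPtr[3] == 0x0B;
}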
@@ -4690,10 +5867,8 @@ getBitcodeModuleImpl(std::unique_ptr<DataStreamer> Streamer, StringRef Name, static ErrorOr<std::unique_ptr<Module>> getLazyBitcodeModuleImpl(std::unique_ptr<MemoryBuffer> &&Buffer, LLVMContext &Context, bool MaterializeAll, - DiagnosticHandlerFunction DiagnosticHandler, bool ShouldLazyLoadMetadata = false) { - BitcodeReader *R = - new BitcodeReader(Buffer.get(), Context, DiagnosticHandler); + BitcodeReader *R = new BitcodeReader(Buffer.get(), Context); ErrorOr<std::unique_ptr<Module>> Ret = getBitcodeModuleImpl(nullptr, Buffer->getBufferIdentifier(), R, Context, @@ -4705,41 +5880,124 @@ getLazyBitcodeModuleImpl(std::unique_ptr<MemoryBuffer> &&Buffer, return Ret; } -ErrorOr<std::unique_ptr<Module>> llvm::getLazyBitcodeModule( - std::unique_ptr<MemoryBuffer> &&Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler, bool ShouldLazyLoadMetadata) { +ErrorOr<std::unique_ptr<Module>> +llvm::getLazyBitcodeModule(std::unique_ptr<MemoryBuffer> &&Buffer, + LLVMContext &Context, bool ShouldLazyLoadMetadata) { return getLazyBitcodeModuleImpl(std::move(Buffer), Context, false, - DiagnosticHandler, ShouldLazyLoadMetadata); + ShouldLazyLoadMetadata); } -ErrorOr<std::unique_ptr<Module>> llvm::getStreamedBitcodeModule( - StringRef Name, std::unique_ptr<DataStreamer> Streamer, - LLVMContext &Context, DiagnosticHandlerFunction DiagnosticHandler) { +ErrorOr<std::unique_ptr<Module>> +llvm::getStreamedBitcodeModule(StringRef Name, + std::unique_ptr<DataStreamer> Streamer, + LLVMContext &Context) { std::unique_ptr<Module> M = make_unique<Module>(Name, Context); - BitcodeReader *R = new BitcodeReader(Context, DiagnosticHandler); + BitcodeReader *R = new BitcodeReader(Context); return getBitcodeModuleImpl(std::move(Streamer), Name, R, Context, false, false); } -ErrorOr<std::unique_ptr<Module>> -llvm::parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) { +ErrorOr<std::unique_ptr<Module>> llvm::parseBitcodeFile(MemoryBufferRef Buffer, + LLVMContext &Context) { std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false); - return getLazyBitcodeModuleImpl(std::move(Buf), Context, true, - DiagnosticHandler); + return getLazyBitcodeModuleImpl(std::move(Buf), Context, true); // TODO: Restore the use-lists to the in-memory state when the bitcode was // written. We must defer until the Module has been fully materialized. } -std::string -llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer, LLVMContext &Context, - DiagnosticHandlerFunction DiagnosticHandler) { +std::string llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer, + LLVMContext &Context) { std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false); - auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context, - DiagnosticHandler); + auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context); ErrorOr<std::string> Triple = R->parseTriple(); if (Triple.getError()) return ""; return Triple.get(); } + +std::string llvm::getBitcodeProducerString(MemoryBufferRef Buffer, + LLVMContext &Context) { + std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false); + BitcodeReader R(Buf.release(), Context); + ErrorOr<std::string> ProducerString = R.parseIdentificationBlock(); + if (ProducerString.getError()) + return ""; + return ProducerString.get(); +} + +// Parse the specified bitcode buffer, returning the function info index. +// If IsLazy is false, parse the entire function summary into +// the index. 
+// the index. Otherwise skip the function summary section, and only create
+// an index object with a map from function name to function summary offset.
+// The index is used to perform lazy function summary reading later.
+ErrorOr<std::unique_ptr<FunctionInfoIndex>>
+llvm::getFunctionInfoIndex(MemoryBufferRef Buffer,
+                           DiagnosticHandlerFunction DiagnosticHandler,
+                           bool IsLazy) {
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, IsLazy);
+
+  auto Index = llvm::make_unique<FunctionInfoIndex>();
+
+  auto cleanupOnError = [&](std::error_code EC) {
+    R.releaseBuffer(); // Never take ownership on error.
+    return EC;
+  };
+
+  if (std::error_code EC = R.parseSummaryIndexInto(nullptr, Index.get()))
+    return cleanupOnError(EC);
+
+  Buf.release(); // The FunctionIndexBitcodeReader owns it now.
+  return std::move(Index);
+}
+
+// Check if the given bitcode buffer contains a function summary block.
+bool llvm::hasFunctionSummary(MemoryBufferRef Buffer,
+                              DiagnosticHandlerFunction DiagnosticHandler) {
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler, false, true);
+
+  auto cleanupOnError = [&](std::error_code EC) {
+    R.releaseBuffer(); // Never take ownership on error.
+    return false;
+  };
+
+  if (std::error_code EC = R.parseSummaryIndexInto(nullptr, nullptr))
+    return cleanupOnError(EC);
+
+  Buf.release(); // The FunctionIndexBitcodeReader owns it now.
+  return R.foundFuncSummary();
+}
+
+// This method supports lazy reading of function summary data from the combined
+// index during ThinLTO function importing. When reading the combined index
+// file, getFunctionInfoIndex is first invoked with IsLazy=true.
+// Then this method is called for each function considered for importing,
+// to parse the summary information for the given function name into
+// the index.
+std::error_code llvm::readFunctionSummary(
+    MemoryBufferRef Buffer, DiagnosticHandlerFunction DiagnosticHandler,
+    StringRef FunctionName, std::unique_ptr<FunctionInfoIndex> Index) {
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  FunctionIndexBitcodeReader R(Buf.get(), DiagnosticHandler);
+
+  auto cleanupOnError = [&](std::error_code EC) {
+    R.releaseBuffer(); // Never take ownership on error.
+    return EC;
+  };
+
+  // Look up the given function name in the FunctionMap, which may
+  // contain a list of function infos in the case of a COMDAT. Walk through
+  // and parse each function summary info at the function summary offset
+  // recorded when parsing the value symbol table.
+  for (const auto &FI : Index->getFunctionInfoList(FunctionName)) {
+    size_t FunctionSummaryOffset = FI->bitcodeIndex();
+    if (std::error_code EC =
+            R.parseFunctionSummary(nullptr, Index.get(), FunctionSummaryOffset))
+      return cleanupOnError(EC);
+  }
+
+  Buf.release(); // The FunctionIndexBitcodeReader owns it now.
+ return std::error_code(); +} diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 1a70ba5..a899a0c 100644 --- a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -13,14 +13,18 @@ #include "llvm/Bitcode/ReaderWriter.h" #include "ValueEnumerator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Bitcode/BitstreamWriter.h" #include "llvm/Bitcode/LLVMBitCodes.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/UseListOrder.h" @@ -174,6 +178,10 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_IN_ALLOCA; case Attribute::Cold: return bitc::ATTR_KIND_COLD; + case Attribute::InaccessibleMemOnly: + return bitc::ATTR_KIND_INACCESSIBLEMEM_ONLY; + case Attribute::InaccessibleMemOrArgMemOnly: + return bitc::ATTR_KIND_INACCESSIBLEMEM_OR_ARGMEMONLY; case Attribute::InlineHint: return bitc::ATTR_KIND_INLINE_HINT; case Attribute::InReg: @@ -198,6 +206,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NO_IMPLICIT_FLOAT; case Attribute::NoInline: return bitc::ATTR_KIND_NO_INLINE; + case Attribute::NoRecurse: + return bitc::ATTR_KIND_NO_RECURSE; case Attribute::NonLazyBind: return bitc::ATTR_KIND_NON_LAZY_BIND; case Attribute::NonNull: @@ -405,6 +415,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) { case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break; case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; case Type::X86_MMXTyID: Code = bitc::TYPE_CODE_X86_MMX; break; + case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break; case Type::IntegerTyID: // INTEGER: [width] Code = bitc::TYPE_CODE_INTEGER; @@ -573,10 +584,41 @@ static void writeComdats(const ValueEnumerator &VE, BitstreamWriter &Stream) { } } -// Emit top-level description of module, including target triple, inline asm, -// descriptors for global variables, and function prototype info. -static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, - BitstreamWriter &Stream) { +/// Write a record that will eventually hold the word offset of the +/// module-level VST. For now the offset is 0, which will be backpatched +/// after the real VST is written. Returns the bit offset to backpatch. +static uint64_t WriteValueSymbolTableForwardDecl(const ValueSymbolTable &VST, + BitstreamWriter &Stream) { + if (VST.empty()) + return 0; + + // Write a placeholder value in for the offset of the real VST, + // which is written after the function blocks so that it can include + // the offset of each function. The placeholder offset will be + // updated when the real VST is written. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_VSTOFFSET)); + // Blocks are 32-bit aligned, so we can use a 32-bit word offset to + // hold the real VST offset. Must use fixed instead of VBR as we don't + // know how many VBR chunks to reserve ahead of time. 
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); + unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(Abbv); + + // Emit the placeholder + uint64_t Vals[] = {bitc::MODULE_CODE_VSTOFFSET, 0}; + Stream.EmitRecordWithAbbrev(VSTOffsetAbbrev, Vals); + + // Compute and return the bit offset to the placeholder, which will be + // patched when the real VST is written. We can simply subtract the 32-bit + // fixed size from the current bit number to get the location to backpatch. + return Stream.GetCurrentBitNo() - 32; +} + +/// Emit top-level description of module, including target triple, inline asm, +/// descriptors for global variables, and function prototype info. +/// Returns the bit offset to backpatch with the location of the real VST. +static uint64_t WriteModuleInfo(const Module *M, const ValueEnumerator &VE, + BitstreamWriter &Stream) { // Emit various pieces of data attached to a module. if (!M->getTargetTriple().empty()) WriteStringRecord(bitc::MODULE_CODE_TRIPLE, M->getTargetTriple(), @@ -725,7 +767,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, // Emit the alias information. for (const GlobalAlias &A : M->aliases()) { // ALIAS: [alias type, aliasee val#, linkage, visibility] - Vals.push_back(VE.getTypeID(A.getType())); + Vals.push_back(VE.getTypeID(A.getValueType())); + Vals.push_back(A.getType()->getAddressSpace()); Vals.push_back(VE.getValueID(A.getAliasee())); Vals.push_back(getEncodedLinkage(A)); Vals.push_back(getEncodedVisibility(A)); @@ -736,6 +779,25 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse); Vals.clear(); } + + // Write a record indicating the number of module-level metadata IDs + // This is needed because the ids of metadata are assigned implicitly + // based on their ordering in the bitcode, with the function-level + // metadata ids starting after the module-level metadata ids. For + // function importing where we lazy load the metadata as a postpass, + // we want to avoid parsing the module-level metadata before parsing + // the imported functions. 
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_METADATA_VALUES)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); + unsigned MDValsAbbrev = Stream.EmitAbbrev(Abbv); + Vals.push_back(VE.numMDs()); + Stream.EmitRecord(bitc::MODULE_CODE_METADATA_VALUES, Vals, MDValsAbbrev); + Vals.clear(); + + uint64_t VSTOffsetPlaceholder = + WriteValueSymbolTableForwardDecl(M->getValueSymbolTable(), Stream); + return VSTOffsetPlaceholder; } static uint64_t GetOptimizationFlags(const Value *V) { @@ -943,7 +1005,8 @@ static void WriteDICompileUnit(const DICompileUnit *N, BitstreamWriter &Stream, SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { - Record.push_back(N->isDistinct()); + assert(N->isDistinct() && "Expected distinct compile units"); + Record.push_back(/* IsDistinct */ true); Record.push_back(N->getSourceLanguage()); Record.push_back(VE.getMetadataOrNullID(N->getFile())); Record.push_back(VE.getMetadataOrNullID(N->getRawProducer())); @@ -958,6 +1021,7 @@ static void WriteDICompileUnit(const DICompileUnit *N, Record.push_back(VE.getMetadataOrNullID(N->getGlobalVariables().get())); Record.push_back(VE.getMetadataOrNullID(N->getImportedEntities().get())); Record.push_back(N->getDWOId()); + Record.push_back(VE.getMetadataOrNullID(N->getMacros().get())); Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev); Record.clear(); @@ -982,7 +1046,6 @@ static void WriteDISubprogram(const DISubprogram *N, const ValueEnumerator &VE, Record.push_back(N->getVirtualIndex()); Record.push_back(N->getFlags()); Record.push_back(N->isOptimized()); - Record.push_back(VE.getMetadataOrNullID(N->getRawFunction())); Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get())); Record.push_back(VE.getMetadataOrNullID(N->getDeclaration())); Record.push_back(VE.getMetadataOrNullID(N->getVariables().get())); @@ -1034,6 +1097,33 @@ static void WriteDINamespace(const DINamespace *N, const ValueEnumerator &VE, Record.clear(); } +static void WriteDIMacro(const DIMacro *N, const ValueEnumerator &VE, + BitstreamWriter &Stream, + SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { + Record.push_back(N->isDistinct()); + Record.push_back(N->getMacinfoType()); + Record.push_back(N->getLine()); + Record.push_back(VE.getMetadataOrNullID(N->getRawName())); + Record.push_back(VE.getMetadataOrNullID(N->getRawValue())); + + Stream.EmitRecord(bitc::METADATA_MACRO, Record, Abbrev); + Record.clear(); +} + +static void WriteDIMacroFile(const DIMacroFile *N, const ValueEnumerator &VE, + BitstreamWriter &Stream, + SmallVectorImpl<uint64_t> &Record, + unsigned Abbrev) { + Record.push_back(N->isDistinct()); + Record.push_back(N->getMacinfoType()); + Record.push_back(N->getLine()); + Record.push_back(VE.getMetadataOrNullID(N->getFile())); + Record.push_back(VE.getMetadataOrNullID(N->getElements().get())); + + Stream.EmitRecord(bitc::METADATA_MACRO_FILE, Record, Abbrev); + Record.clear(); +} + static void WriteDIModule(const DIModule *N, const ValueEnumerator &VE, BitstreamWriter &Stream, SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { @@ -1100,7 +1190,6 @@ static void WriteDILocalVariable(const DILocalVariable *N, SmallVectorImpl<uint64_t> &Record, unsigned Abbrev) { Record.push_back(N->isDistinct()); - Record.push_back(N->getTag()); Record.push_back(VE.getMetadataOrNullID(N->getScope())); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); Record.push_back(VE.getMetadataOrNullID(N->getFile())); @@ -1310,16 +1399,15 @@ static void WriteMetadataAttachment(const 
Function &F,
     Record.clear();
   }
 
-  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
-         I != E; ++I) {
+  for (const BasicBlock &BB : F)
+    for (const Instruction &I : BB) {
       MDs.clear();
-      I->getAllMetadataOtherThanDebugLoc(MDs);
+      I.getAllMetadataOtherThanDebugLoc(MDs);
 
       // If no metadata, ignore instruction.
       if (MDs.empty()) continue;
 
-      Record.push_back(VE.getInstructionID(I));
+      Record.push_back(VE.getInstructionID(&I));
 
       for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
         Record.push_back(MDs[i].first);
@@ -1342,7 +1430,7 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
   if (Names.empty()) return;
 
-  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+  Stream.EnterSubblock(bitc::METADATA_KIND_BLOCK_ID, 3);
 
   for (unsigned MDKindID = 0, e = Names.size(); MDKindID != e; ++MDKindID) {
     Record.push_back(MDKindID);
@@ -1356,6 +1444,33 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
   Stream.ExitBlock();
 }
 
+static void WriteOperandBundleTags(const Module *M, BitstreamWriter &Stream) {
+  // Write operand bundle tags.
+  //
+  // OPERAND_BUNDLE_TAGS_BLOCK_ID : N x OPERAND_BUNDLE_TAG
+  //
+  // OPERAND_BUNDLE_TAG - [strchr x N]
+
+  SmallVector<StringRef, 8> Tags;
+  M->getOperandBundleTags(Tags);
+
+  if (Tags.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::OPERAND_BUNDLE_TAGS_BLOCK_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+
+  for (auto Tag : Tags) {
+    Record.append(Tag.begin(), Tag.end());
+
+    Stream.EmitRecord(bitc::OPERAND_BUNDLE_TAG, Record, 0);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
 static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
   if ((int64_t)V >= 0)
     Vals.push_back(V << 1);
@@ -1515,19 +1630,10 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
     if (isa<IntegerType>(EltTy)) {
       for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
         Record.push_back(CDS->getElementAsInteger(i));
-    } else if (EltTy->isFloatTy()) {
-      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
-        union { float F; uint32_t I; };
-        F = CDS->getElementAsFloat(i);
-        Record.push_back(I);
-      }
     } else {
-      assert(EltTy->isDoubleTy() && "Unknown ConstantData element type");
-      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
-        union { double F; uint64_t I; };
-        F = CDS->getElementAsDouble(i);
-        Record.push_back(I);
-      }
+      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
+        Record.push_back(
+            CDS->getElementAsAPFloat(i).bitcastToAPInt().getLimitedValue());
     }
   } else if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
              isa<ConstantVector>(C)) {
@@ -1664,6 +1770,23 @@ static bool PushValueAndType(const Value *V, unsigned InstID,
   return false;
 }
 
+static void WriteOperandBundles(BitstreamWriter &Stream, ImmutableCallSite CS,
+                                unsigned InstID, ValueEnumerator &VE) {
+  SmallVector<unsigned, 64> Record;
+  LLVMContext &C = CS.getInstruction()->getContext();
+
+  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
+    const auto &Bundle = CS.getOperandBundleAt(i);
+    Record.push_back(C.getOperandBundleTagID(Bundle.getTagName()));
+
+    for (auto &Input : Bundle.Inputs)
+      PushValueAndType(Input, InstID, Record, VE);
+
+    Stream.EmitRecord(bitc::FUNC_CODE_OPERAND_BUNDLE, Record);
+    Record.clear();
+  }
+}
+
 /// pushValue - Like PushValueAndType, but where the type of the value is
 /// omitted (perhaps it was already encoded in an earlier operand).
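// Editor's sketch (not part of the patch): WriteOperandBundles above emits one
// FUNC_CODE_OPERAND_BUNDLE record per bundle, the tag id first and then each
// input via PushValueAndType. countBundleInputs is a hypothetical helper that
// shows the same traversal from the IR side; pushValue, documented just above,
// follows it.
static unsigned countBundleInputs(ImmutableCallSite CS) {
  unsigned NumInputs = 0;
  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
    const auto &Bundle = CS.getOperandBundleAt(i);
    NumInputs += Bundle.Inputs.size(); // One PushValueAndType call per input.
  }
  return NumInputs;
}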
static void pushValue(const Value *V, unsigned InstID, @@ -1806,10 +1929,9 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); pushValue(SI.getCondition(), InstID, Vals, VE); Vals.push_back(VE.getValueID(SI.getDefaultDest())); - for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end(); - i != e; ++i) { - Vals.push_back(VE.getValueID(i.getCaseValue())); - Vals.push_back(VE.getValueID(i.getCaseSuccessor())); + for (SwitchInst::ConstCaseIt Case : SI.cases()) { + Vals.push_back(VE.getValueID(Case.getCaseValue())); + Vals.push_back(VE.getValueID(Case.getCaseSuccessor())); } } break; @@ -1826,6 +1948,10 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, const InvokeInst *II = cast<InvokeInst>(&I); const Value *Callee = II->getCalledValue(); FunctionType *FTy = II->getFunctionType(); + + if (II->hasOperandBundles()) + WriteOperandBundles(Stream, II, InstID, VE); + Code = bitc::FUNC_CODE_INST_INVOKE; Vals.push_back(VE.getAttributeID(II->getAttributes())); @@ -1851,6 +1977,49 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Code = bitc::FUNC_CODE_INST_RESUME; PushValueAndType(I.getOperand(0), InstID, Vals, VE); break; + case Instruction::CleanupRet: { + Code = bitc::FUNC_CODE_INST_CLEANUPRET; + const auto &CRI = cast<CleanupReturnInst>(I); + pushValue(CRI.getCleanupPad(), InstID, Vals, VE); + if (CRI.hasUnwindDest()) + Vals.push_back(VE.getValueID(CRI.getUnwindDest())); + break; + } + case Instruction::CatchRet: { + Code = bitc::FUNC_CODE_INST_CATCHRET; + const auto &CRI = cast<CatchReturnInst>(I); + pushValue(CRI.getCatchPad(), InstID, Vals, VE); + Vals.push_back(VE.getValueID(CRI.getSuccessor())); + break; + } + case Instruction::CleanupPad: + case Instruction::CatchPad: { + const auto &FuncletPad = cast<FuncletPadInst>(I); + Code = isa<CatchPadInst>(FuncletPad) ? bitc::FUNC_CODE_INST_CATCHPAD + : bitc::FUNC_CODE_INST_CLEANUPPAD; + pushValue(FuncletPad.getParentPad(), InstID, Vals, VE); + + unsigned NumArgOperands = FuncletPad.getNumArgOperands(); + Vals.push_back(NumArgOperands); + for (unsigned Op = 0; Op != NumArgOperands; ++Op) + PushValueAndType(FuncletPad.getArgOperand(Op), InstID, Vals, VE); + break; + } + case Instruction::CatchSwitch: { + Code = bitc::FUNC_CODE_INST_CATCHSWITCH; + const auto &CatchSwitch = cast<CatchSwitchInst>(I); + + pushValue(CatchSwitch.getParentPad(), InstID, Vals, VE); + + unsigned NumHandlers = CatchSwitch.getNumHandlers(); + Vals.push_back(NumHandlers); + for (const BasicBlock *CatchPadBB : CatchSwitch.handlers()) + Vals.push_back(VE.getValueID(CatchPadBB)); + + if (CatchSwitch.hasUnwindDest()) + Vals.push_back(VE.getValueID(CatchSwitch.getUnwindDest())); + break; + } case Instruction::Unreachable: Code = bitc::FUNC_CODE_INST_UNREACHABLE; AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV; @@ -1902,6 +2071,8 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, assert(AlignRecord < 1 << 5 && "alignment greater than 1 << 64"); AlignRecord |= AI.isUsedWithInAlloca() << 5; AlignRecord |= 1 << 6; + // Reserve bit 7 for SwiftError flag. 
+ // AlignRecord |= AI.isSwiftError() << 7; Vals.push_back(AlignRecord); break; } @@ -1971,11 +2142,23 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, const CallInst &CI = cast<CallInst>(I); FunctionType *FTy = CI.getFunctionType(); + if (CI.hasOperandBundles()) + WriteOperandBundles(Stream, &CI, InstID, VE); + Code = bitc::FUNC_CODE_INST_CALL; Vals.push_back(VE.getAttributeID(CI.getAttributes())); - Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall()) | - unsigned(CI.isMustTailCall()) << 14 | 1 << 15); + + unsigned Flags = GetOptimizationFlags(&I); + Vals.push_back(CI.getCallingConv() << bitc::CALL_CCONV | + unsigned(CI.isTailCall()) << bitc::CALL_TAIL | + unsigned(CI.isMustTailCall()) << bitc::CALL_MUSTTAIL | + 1 << bitc::CALL_EXPLICIT_TYPE | + unsigned(CI.isNoTailCall()) << bitc::CALL_NOTAIL | + unsigned(Flags != 0) << bitc::CALL_FMF); + if (Flags != 0) + Vals.push_back(Flags); + Vals.push_back(VE.getTypeID(FTy)); PushValueAndType(CI.getCalledValue(), InstID, Vals, VE); // Callee @@ -2008,56 +2191,149 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Vals.clear(); } -// Emit names for globals/functions etc. -static void WriteValueSymbolTable(const ValueSymbolTable &VST, - const ValueEnumerator &VE, - BitstreamWriter &Stream) { - if (VST.empty()) return; +enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 }; + +/// Determine the encoding to use for the given string name and length. +static StringEncoding getStringEncoding(const char *Str, unsigned StrLen) { + bool isChar6 = true; + for (const char *C = Str, *E = C + StrLen; C != E; ++C) { + if (isChar6) + isChar6 = BitCodeAbbrevOp::isChar6(*C); + if ((unsigned char)*C & 128) + // don't bother scanning the rest. + return SE_Fixed8; + } + if (isChar6) + return SE_Char6; + else + return SE_Fixed7; +} + +/// Emit names for globals/functions etc. The VSTOffsetPlaceholder, +/// BitcodeStartBit and FunctionIndex are only passed for the module-level +/// VST, where we are including a function bitcode index and need to +/// backpatch the VST forward declaration record. +static void WriteValueSymbolTable( + const ValueSymbolTable &VST, const ValueEnumerator &VE, + BitstreamWriter &Stream, uint64_t VSTOffsetPlaceholder = 0, + uint64_t BitcodeStartBit = 0, + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> *FunctionIndex = + nullptr) { + if (VST.empty()) { + // WriteValueSymbolTableForwardDecl should have returned early as + // well. Ensure this handling remains in sync by asserting that + // the placeholder offset is not set. + assert(VSTOffsetPlaceholder == 0); + return; + } + + if (VSTOffsetPlaceholder > 0) { + // Get the offset of the VST we are writing, and backpatch it into + // the VST forward declaration record. + uint64_t VSTOffset = Stream.GetCurrentBitNo(); + // The BitcodeStartBit was the stream offset of the actual bitcode + // (e.g. excluding any initial darwin header). + VSTOffset -= BitcodeStartBit; + assert((VSTOffset & 31) == 0 && "VST block not 32-bit aligned"); + Stream.BackpatchWord(VSTOffsetPlaceholder, VSTOffset / 32); + } + Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4); + // For the module-level VST, add abbrev Ids for the VST_CODE_FNENTRY + // records, which are not used in the per-function VSTs. + unsigned FnEntry8BitAbbrev; + unsigned FnEntry7BitAbbrev; + unsigned FnEntry6BitAbbrev; + if (VSTOffsetPlaceholder > 0) { + // 8-bit fixed-width VST_FNENTRY function strings. 
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); + FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 7-bit fixed width VST_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); + FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 6-bit char6 VST_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + } + // FIXME: Set up the abbrev, we know how many values there are! // FIXME: We know if the type names can use 7-bit ascii. SmallVector<unsigned, 64> NameVals; - for (ValueSymbolTable::const_iterator SI = VST.begin(), SE = VST.end(); - SI != SE; ++SI) { - - const ValueName &Name = *SI; - + for (const ValueName &Name : VST) { // Figure out the encoding to use for the name. - bool is7Bit = true; - bool isChar6 = true; - for (const char *C = Name.getKeyData(), *E = C+Name.getKeyLength(); - C != E; ++C) { - if (isChar6) - isChar6 = BitCodeAbbrevOp::isChar6(*C); - if ((unsigned char)*C & 128) { - is7Bit = false; - break; // don't bother scanning the rest. - } - } + StringEncoding Bits = + getStringEncoding(Name.getKeyData(), Name.getKeyLength()); unsigned AbbrevToUse = VST_ENTRY_8_ABBREV; + NameVals.push_back(VE.getValueID(Name.getValue())); + + Function *F = dyn_cast<Function>(Name.getValue()); + if (!F) { + // If value is an alias, need to get the aliased base object to + // see if it is a function. + auto *GA = dyn_cast<GlobalAlias>(Name.getValue()); + if (GA && GA->getBaseObject()) + F = dyn_cast<Function>(GA->getBaseObject()); + } // VST_ENTRY: [valueid, namechar x N] + // VST_FNENTRY: [valueid, funcoffset, namechar x N] // VST_BBENTRY: [bbid, namechar x N] unsigned Code; - if (isa<BasicBlock>(SI->getValue())) { + if (isa<BasicBlock>(Name.getValue())) { Code = bitc::VST_CODE_BBENTRY; - if (isChar6) + if (Bits == SE_Char6) AbbrevToUse = VST_BBENTRY_6_ABBREV; + } else if (F && !F->isDeclaration()) { + // Must be the module-level VST, where we pass in the Index and + // have a VSTOffsetPlaceholder. The function-level VST should not + // contain any Function symbols. + assert(FunctionIndex); + assert(VSTOffsetPlaceholder > 0); + + // Save the word offset of the function (from the start of the + // actual bitcode written to the stream). 
+ assert(FunctionIndex->count(F) == 1); + uint64_t BitcodeIndex = + (*FunctionIndex)[F]->bitcodeIndex() - BitcodeStartBit; + assert((BitcodeIndex & 31) == 0 && "function block not 32-bit aligned"); + NameVals.push_back(BitcodeIndex / 32); + + Code = bitc::VST_CODE_FNENTRY; + AbbrevToUse = FnEntry8BitAbbrev; + if (Bits == SE_Char6) + AbbrevToUse = FnEntry6BitAbbrev; + else if (Bits == SE_Fixed7) + AbbrevToUse = FnEntry7BitAbbrev; } else { Code = bitc::VST_CODE_ENTRY; - if (isChar6) + if (Bits == SE_Char6) AbbrevToUse = VST_ENTRY_6_ABBREV; - else if (is7Bit) + else if (Bits == SE_Fixed7) AbbrevToUse = VST_ENTRY_7_ABBREV; } - NameVals.push_back(VE.getValueID(SI->getValue())); - for (const char *P = Name.getKeyData(), - *E = Name.getKeyData()+Name.getKeyLength(); P != E; ++P) - NameVals.push_back((unsigned char)*P); + for (const auto P : Name.getKey()) + NameVals.push_back((unsigned char)P); // Emit the finished record. Stream.EmitRecord(Code, NameVals, AbbrevToUse); @@ -2066,6 +2342,66 @@ static void WriteValueSymbolTable(const ValueSymbolTable &VST, Stream.ExitBlock(); } +/// Emit function names and summary offsets for the combined index +/// used by ThinLTO. +static void WriteCombinedValueSymbolTable(const FunctionInfoIndex &Index, + BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4); + + // 8-bit fixed-width VST_COMBINED_FNENTRY function strings. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); + unsigned FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 7-bit fixed width VST_COMBINED_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); + unsigned FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv); + + // 6-bit char6 VST_COMBINED_FNENTRY function strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_FNENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + unsigned FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + + // FIXME: We know if the type names can use 7-bit ascii. + SmallVector<unsigned, 64> NameVals; + + for (const auto &FII : Index) { + for (const auto &FI : FII.getValue()) { + NameVals.push_back(FI->bitcodeIndex()); + + StringRef FuncName = FII.first(); + + // Figure out the encoding to use for the name. + StringEncoding Bits = getStringEncoding(FuncName.data(), FuncName.size()); + + // VST_COMBINED_FNENTRY: [funcsumoffset, namechar x N] + unsigned AbbrevToUse = FnEntry8BitAbbrev; + if (Bits == SE_Char6) + AbbrevToUse = FnEntry6BitAbbrev; + else if (Bits == SE_Fixed7) + AbbrevToUse = FnEntry7BitAbbrev; + + for (const auto P : FuncName) + NameVals.push_back((unsigned char)P); + + // Emit the finished record. 
+ Stream.EmitRecord(bitc::VST_CODE_COMBINED_FNENTRY, NameVals, AbbrevToUse); + NameVals.clear(); + } + } + Stream.ExitBlock(); +} + static void WriteUseList(ValueEnumerator &VE, UseListOrder &&Order, BitstreamWriter &Stream) { assert(Order.Shuffle.size() >= 2 && "Shuffle too small"); @@ -2100,9 +2436,34 @@ static void WriteUseListBlock(const Function *F, ValueEnumerator &VE, Stream.ExitBlock(); } -/// WriteFunction - Emit a function body to the module stream. -static void WriteFunction(const Function &F, ValueEnumerator &VE, - BitstreamWriter &Stream) { +/// \brief Save information for the given function into the function index. +/// +/// At a minimum this saves the bitcode index of the function record that +/// was just written. However, if we are emitting function summary information, +/// for example for ThinLTO, then a \a FunctionSummary object is created +/// to hold the provided summary information. +static void SaveFunctionInfo( + const Function &F, + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex, + unsigned NumInsts, uint64_t BitcodeIndex, bool EmitFunctionSummary) { + std::unique_ptr<FunctionSummary> FuncSummary; + if (EmitFunctionSummary) { + FuncSummary = llvm::make_unique<FunctionSummary>(NumInsts); + FuncSummary->setLocalFunction(F.hasLocalLinkage()); + } + FunctionIndex[&F] = + llvm::make_unique<FunctionInfo>(BitcodeIndex, std::move(FuncSummary)); +} + +/// Emit a function body to the module stream. +static void WriteFunction( + const Function &F, ValueEnumerator &VE, BitstreamWriter &Stream, + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex, + bool EmitFunctionSummary) { + // Save the bitcode index of the start of this function block for recording + // in the VST. + uint64_t BitcodeIndex = Stream.GetCurrentBitNo(); + Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 4); VE.incorporateFunction(F); @@ -2128,6 +2489,7 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE, bool NeedsMetadataAttachment = F.hasMetadata(); DILocation *LastDL = nullptr; + unsigned NumInsts = 0; // Finally, emit all the instructions, in order. for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) @@ -2135,6 +2497,9 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE, I != E; ++I) { WriteInstruction(*I, InstID, VE, Stream, Vals); + if (!isa<DbgInfoIntrinsic>(I)) + ++NumInsts; + if (!I->getType()->isVoidTy()) ++InstID; @@ -2171,6 +2536,9 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE, WriteUseListBlock(&F, VE, Stream); VE.purgeFunction(); Stream.ExitBlock(); + + SaveFunctionInfo(F, FunctionIndex, NumInsts, BitcodeIndex, + EmitFunctionSummary); } // Emit blockinfo, which defines the standard abbreviations etc. @@ -2348,9 +2716,183 @@ static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) { Stream.ExitBlock(); } +/// Write the module path strings, currently only used when generating +/// a combined index file. +static void WriteModStrings(const FunctionInfoIndex &I, + BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::MODULE_STRTAB_BLOCK_ID, 3); + + // TODO: See which abbrev sizes we actually need to emit + + // 8-bit fixed-width MST_ENTRY strings. 
+ BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); + unsigned Abbrev8Bit = Stream.EmitAbbrev(Abbv); + + // 7-bit fixed width MST_ENTRY strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); + unsigned Abbrev7Bit = Stream.EmitAbbrev(Abbv); + + // 6-bit char6 MST_ENTRY strings. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + unsigned Abbrev6Bit = Stream.EmitAbbrev(Abbv); + + SmallVector<unsigned, 64> NameVals; + for (const StringMapEntry<uint64_t> &MPSE : I.modPathStringEntries()) { + StringEncoding Bits = + getStringEncoding(MPSE.getKey().data(), MPSE.getKey().size()); + unsigned AbbrevToUse = Abbrev8Bit; + if (Bits == SE_Char6) + AbbrevToUse = Abbrev6Bit; + else if (Bits == SE_Fixed7) + AbbrevToUse = Abbrev7Bit; + + NameVals.push_back(MPSE.getValue()); + + for (const auto P : MPSE.getKey()) + NameVals.push_back((unsigned char)P); + + // Emit the finished record. + Stream.EmitRecord(bitc::MST_CODE_ENTRY, NameVals, AbbrevToUse); + NameVals.clear(); + } + Stream.ExitBlock(); +} + +// Helper to emit a single function summary record. +static void WritePerModuleFunctionSummaryRecord( + SmallVector<unsigned, 64> &NameVals, FunctionSummary *FS, unsigned ValueID, + unsigned FSAbbrev, BitstreamWriter &Stream) { + assert(FS); + NameVals.push_back(ValueID); + NameVals.push_back(FS->isLocalFunction()); + NameVals.push_back(FS->instCount()); + + // Emit the finished record. + Stream.EmitRecord(bitc::FS_CODE_PERMODULE_ENTRY, NameVals, FSAbbrev); + NameVals.clear(); +} + +/// Emit the per-module function summary section alongside the rest of +/// the module's bitcode. +static void WritePerModuleFunctionSummary( + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> &FunctionIndex, + const Module *M, const ValueEnumerator &VE, BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::FUNCTION_SUMMARY_BLOCK_ID, 3); + + // Abbrev for FS_CODE_PERMODULE_ENTRY. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_CODE_PERMODULE_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // islocal + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + unsigned FSAbbrev = Stream.EmitAbbrev(Abbv); + + SmallVector<unsigned, 64> NameVals; + for (auto &I : FunctionIndex) { + // Skip anonymous functions. We will emit a function summary for + // any aliases below. 
+ if (!I.first->hasName()) + continue; + + WritePerModuleFunctionSummaryRecord( + NameVals, I.second->functionSummary(), + VE.getValueID(M->getValueSymbolTable().lookup(I.first->getName())), + FSAbbrev, Stream); + } + + for (const GlobalAlias &A : M->aliases()) { + if (!A.getBaseObject()) + continue; + const Function *F = dyn_cast<Function>(A.getBaseObject()); + if (!F || F->isDeclaration()) + continue; + + assert(FunctionIndex.count(F) == 1); + WritePerModuleFunctionSummaryRecord( + NameVals, FunctionIndex[F]->functionSummary(), + VE.getValueID(M->getValueSymbolTable().lookup(A.getName())), FSAbbrev, + Stream); + } + + Stream.ExitBlock(); +} + +/// Emit the combined function summary section into the combined index +/// file. +static void WriteCombinedFunctionSummary(const FunctionInfoIndex &I, + BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::FUNCTION_SUMMARY_BLOCK_ID, 3); + + // Abbrev for FS_CODE_COMBINED_ENTRY. + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_CODE_COMBINED_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount + unsigned FSAbbrev = Stream.EmitAbbrev(Abbv); + + SmallVector<unsigned, 64> NameVals; + for (const auto &FII : I) { + for (auto &FI : FII.getValue()) { + FunctionSummary *FS = FI->functionSummary(); + assert(FS); + + NameVals.push_back(I.getModuleId(FS->modulePath())); + NameVals.push_back(FS->instCount()); + + // Record the starting offset of this summary entry for use + // in the VST entry. Add the current code size since the + // reader will invoke readRecord after the abbrev id read. + FI->setBitcodeIndex(Stream.GetCurrentBitNo() + Stream.GetAbbrevIDWidth()); + + // Emit the finished record. + Stream.EmitRecord(bitc::FS_CODE_COMBINED_ENTRY, NameVals, FSAbbrev); + NameVals.clear(); + } + } + + Stream.ExitBlock(); +} + +// Create the "IDENTIFICATION_BLOCK_ID" containing a single string with the +// current llvm version, and a record for the epoch number. +static void WriteIdentificationBlock(const Module *M, BitstreamWriter &Stream) { + Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5); + + // Write the "user readable" string identifying the bitcode producer + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_STRING)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); + auto StringAbbrev = Stream.EmitAbbrev(Abbv); + WriteStringRecord(bitc::IDENTIFICATION_CODE_STRING, + "LLVM" LLVM_VERSION_STRING, StringAbbrev, Stream); + + // Write the epoch version + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_EPOCH)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); + auto EpochAbbrev = Stream.EmitAbbrev(Abbv); + SmallVector<unsigned, 1> Vals = {bitc::BITCODE_CURRENT_EPOCH}; + Stream.EmitRecord(bitc::IDENTIFICATION_CODE_EPOCH, Vals, EpochAbbrev); + Stream.ExitBlock(); +} + /// WriteModule - Emit the specified module to the bitstream. 
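// Editor's sketch (not part of the patch): the VSTOFFSET machinery in this
// diff is classic backpatching. WriteValueSymbolTableForwardDecl emits a
// 32-bit zero and returns its bit position; once the real VST location is
// known, WriteValueSymbolTable patches that word in place. backpatchVSTOffset
// is a hypothetical refactoring of the patch-in-place step, using only the
// BitstreamWriter calls that appear in this patch. WriteModule, documented
// just above, drives the sequence.
static void backpatchVSTOffset(BitstreamWriter &Stream, uint64_t Placeholder,
                               uint64_t BitcodeStartBit) {
  // Blocks are 32-bit aligned, so the offset is stored in 32-bit words,
  // relative to the start of the actual bitcode (after any darwin header).
  uint64_t VSTOffset = Stream.GetCurrentBitNo() - BitcodeStartBit;
  Stream.BackpatchWord(Placeholder, VSTOffset / 32);
}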
static void WriteModule(const Module *M, BitstreamWriter &Stream, - bool ShouldPreserveUseListOrder) { + bool ShouldPreserveUseListOrder, + uint64_t BitcodeStartBit, bool EmitFunctionSummary) { Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3); SmallVector<unsigned, 1> Vals; @@ -2377,7 +2919,7 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream, // Emit top-level description of module, including target triple, inline asm, // descriptors for global variables, and function prototype info. - WriteModuleInfo(M, VE, Stream); + uint64_t VSTOffsetPlaceholder = WriteModuleInfo(M, VE, Stream); // Emit constants. WriteModuleConstants(VE, Stream); @@ -2388,17 +2930,25 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream, // Emit metadata. WriteModuleMetadataStore(M, Stream); - // Emit names for globals/functions etc. - WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream); - // Emit module-level use-lists. if (VE.shouldPreserveUseListOrder()) WriteUseListBlock(nullptr, VE, Stream); + WriteOperandBundleTags(M, Stream); + // Emit function bodies. + DenseMap<const Function *, std::unique_ptr<FunctionInfo>> FunctionIndex; for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) if (!F->isDeclaration()) - WriteFunction(*F, VE, Stream); + WriteFunction(*F, VE, Stream, FunctionIndex, EmitFunctionSummary); + + // Need to write after the above call to WriteFunction which populates + // the summary information in the index. + if (EmitFunctionSummary) + WritePerModuleFunctionSummary(FunctionIndex, M, VE, Stream); + + WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream, + VSTOffsetPlaceholder, BitcodeStartBit, &FunctionIndex); Stream.ExitBlock(); } @@ -2473,10 +3023,22 @@ static void EmitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer, Buffer.push_back(0); } +/// Helper to write the header common to all bitcode files. +static void WriteBitcodeHeader(BitstreamWriter &Stream) { + // Emit the file header. + Stream.Emit((unsigned)'B', 8); + Stream.Emit((unsigned)'C', 8); + Stream.Emit(0x0, 4); + Stream.Emit(0xC, 4); + Stream.Emit(0xE, 4); + Stream.Emit(0xD, 4); +} + /// WriteBitcodeToFile - Write the specified module to the specified output /// stream. void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, - bool ShouldPreserveUseListOrder) { + bool ShouldPreserveUseListOrder, + bool EmitFunctionSummary) { SmallVector<char, 0> Buffer; Buffer.reserve(256*1024); @@ -2489,17 +3051,20 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, // Emit the module into the buffer. { BitstreamWriter Stream(Buffer); + // Save the start bit of the actual bitcode, in case there is space + // saved at the start for the darwin header above. The reader stream + // will start at the bitcode, and we need the offset of the VST + // to line up. + uint64_t BitcodeStartBit = Stream.GetCurrentBitNo(); // Emit the file header. - Stream.Emit((unsigned)'B', 8); - Stream.Emit((unsigned)'C', 8); - Stream.Emit(0x0, 4); - Stream.Emit(0xC, 4); - Stream.Emit(0xE, 4); - Stream.Emit(0xD, 4); + WriteBitcodeHeader(Stream); + + WriteIdentificationBlock(M, Stream); // Emit the module. - WriteModule(M, Stream, ShouldPreserveUseListOrder); + WriteModule(M, Stream, ShouldPreserveUseListOrder, BitcodeStartBit, + EmitFunctionSummary); } if (TT.isOSDarwin()) @@ -2508,3 +3073,38 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out, // Write the generated bitstream to "Out". 
Out.write((char*)&Buffer.front(), Buffer.size()); } + +// Write the specified function summary index to the given raw output stream, +// where it will be written in a new bitcode block. This is used when +// writing the combined index file for ThinLTO. +void llvm::WriteFunctionSummaryToFile(const FunctionInfoIndex &Index, + raw_ostream &Out) { + SmallVector<char, 0> Buffer; + Buffer.reserve(256 * 1024); + + BitstreamWriter Stream(Buffer); + + // Emit the bitcode header. + WriteBitcodeHeader(Stream); + + Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3); + + SmallVector<unsigned, 1> Vals; + unsigned CurVersion = 1; + Vals.push_back(CurVersion); + Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals); + + // Write the module paths in the combined index. + WriteModStrings(Index, Stream); + + // Write the function summary combined index records. + WriteCombinedFunctionSummary(Index, Stream); + + // Need a special VST writer for the combined index (we don't have a + // real VST and real values when this is invoked). + WriteCombinedValueSymbolTable(Index, Stream); + + Stream.ExitBlock(); + + Out.write((char *)&Buffer.front(), Buffer.size()); +} diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp index 3165743..24de99a 100644 --- a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp +++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp @@ -19,7 +19,7 @@ using namespace llvm; PreservedAnalyses BitcodeWriterPass::run(Module &M) { - WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder); + WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, EmitFunctionSummary); return PreservedAnalyses::all(); } @@ -27,17 +27,21 @@ namespace { class WriteBitcodePass : public ModulePass { raw_ostream &OS; // raw_ostream to print on bool ShouldPreserveUseListOrder; + bool EmitFunctionSummary; public: static char ID; // Pass identification, replacement for typeid - explicit WriteBitcodePass(raw_ostream &o, bool ShouldPreserveUseListOrder) + explicit WriteBitcodePass(raw_ostream &o, bool ShouldPreserveUseListOrder, + bool EmitFunctionSummary) : ModulePass(ID), OS(o), - ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} + ShouldPreserveUseListOrder(ShouldPreserveUseListOrder), + EmitFunctionSummary(EmitFunctionSummary) {} const char *getPassName() const override { return "Bitcode Writer"; } bool runOnModule(Module &M) override { - WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder); + WriteBitcodeToFile(&M, OS, ShouldPreserveUseListOrder, + EmitFunctionSummary); return false; } }; @@ -46,6 +50,8 @@ namespace { char WriteBitcodePass::ID = 0; ModulePass *llvm::createBitcodeWriterPass(raw_ostream &Str, - bool ShouldPreserveUseListOrder) { - return new WriteBitcodePass(Str, ShouldPreserveUseListOrder); + bool ShouldPreserveUseListOrder, + bool EmitFunctionSummary) { + return new WriteBitcodePass(Str, ShouldPreserveUseListOrder, + EmitFunctionSummary); } diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 44dd604..e07563b 100644 --- a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -87,15 +87,9 @@ static OrderMap orderModule(const Module &M) { if (!isa<GlobalValue>(A.getAliasee())) orderValue(A.getAliasee(), OM); for (const Function &F : M) { - if (F.hasPrefixData()) - if (!isa<GlobalValue>(F.getPrefixData())) - orderValue(F.getPrefixData(), OM); - if (F.hasPrologueData()) - if 
(!isa<GlobalValue>(F.getPrologueData())) - orderValue(F.getPrologueData(), OM); - if (F.hasPersonalityFn()) - if (!isa<GlobalValue>(F.getPersonalityFn())) - orderValue(F.getPersonalityFn(), OM); + for (const Use &U : F.operands()) + if (!isa<GlobalValue>(U.get())) + orderValue(U.get(), OM); } OM.LastGlobalConstantID = OM.size(); @@ -273,12 +267,8 @@ static UseListOrderStack predictUseListOrder(const Module &M) { for (const GlobalAlias &A : M.aliases()) predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack); for (const Function &F : M) { - if (F.hasPrefixData()) - predictValueUseListOrder(F.getPrefixData(), nullptr, OM, Stack); - if (F.hasPrologueData()) - predictValueUseListOrder(F.getPrologueData(), nullptr, OM, Stack); - if (F.hasPersonalityFn()) - predictValueUseListOrder(F.getPersonalityFn(), nullptr, OM, Stack); + for (const Use &U : F.operands()) + predictValueUseListOrder(U.get(), nullptr, OM, Stack); } return Stack; @@ -321,20 +311,10 @@ ValueEnumerator::ValueEnumerator(const Module &M, for (const GlobalAlias &GA : M.aliases()) EnumerateValue(GA.getAliasee()); - // Enumerate the prefix data constants. + // Enumerate any optional Function data. for (const Function &F : M) - if (F.hasPrefixData()) - EnumerateValue(F.getPrefixData()); - - // Enumerate the prologue data constants. - for (const Function &F : M) - if (F.hasPrologueData()) - EnumerateValue(F.getPrologueData()); - - // Enumerate the personality functions. - for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->hasPersonalityFn()) - EnumerateValue(I->getPersonalityFn()); + for (const Use &U : F.operands()) + EnumerateValue(U.get()); // Enumerate the metadata type. // @@ -425,7 +405,7 @@ unsigned ValueEnumerator::getValueID(const Value *V) const { void ValueEnumerator::dump() const { print(dbgs(), ValueMap, "Default"); dbgs() << '\n'; - print(dbgs(), MDValueMap, "MetaData"); + print(dbgs(), MetadataMap, "MetaData"); dbgs() << '\n'; } @@ -512,10 +492,8 @@ void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) { /// Insert all of the values referenced by named metadata in the specified /// module. void ValueEnumerator::EnumerateNamedMetadata(const Module &M) { - for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), - E = M.named_metadata_end(); - I != E; ++I) - EnumerateNamedMDNode(I); + for (const auto &I : M.named_metadata()) + EnumerateNamedMDNode(&I); } void ValueEnumerator::EnumerateNamedMDNode(const NamedMDNode *MD) { @@ -544,7 +522,7 @@ void ValueEnumerator::EnumerateMetadata(const Metadata *MD) { // EnumerateMDNodeOperands() from re-visiting MD in a cyclic graph. // // Return early if there's already an ID. - if (!MDValueMap.insert(std::make_pair(MD, 0)).second) + if (!MetadataMap.insert(std::make_pair(MD, 0)).second) return; // Visit operands first to minimize RAUW. @@ -557,10 +535,10 @@ void ValueEnumerator::EnumerateMetadata(const Metadata *MD) { HasDILocation |= isa<DILocation>(MD); HasGenericDINode |= isa<GenericDINode>(MD); - // Replace the dummy ID inserted above with the correct one. MDValueMap may + // Replace the dummy ID inserted above with the correct one. MetadataMap may // have changed by inserting operands, so we need a fresh lookup here. 
MDs.push_back(MD); - MDValueMap[MD] = MDs.size(); + MetadataMap[MD] = MDs.size(); } /// EnumerateFunctionLocalMetadata - Incorporate function-local metadata @@ -568,12 +546,12 @@ void ValueEnumerator::EnumerateMetadata(const Metadata *MD) { void ValueEnumerator::EnumerateFunctionLocalMetadata( const LocalAsMetadata *Local) { // Check to see if it's already in! - unsigned &MDValueID = MDValueMap[Local]; - if (MDValueID) + unsigned &MetadataID = MetadataMap[Local]; + if (MetadataID) return; MDs.push_back(Local); - MDValueID = MDs.size(); + MetadataID = MDs.size(); EnumerateValue(Local->getValue()); @@ -729,23 +707,20 @@ void ValueEnumerator::incorporateFunction(const Function &F) { NumModuleMDs = MDs.size(); // Adding function arguments to the value table. - for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); - I != E; ++I) - EnumerateValue(I); + for (const auto &I : F.args()) + EnumerateValue(&I); FirstFuncConstantID = Values.size(); // Add all function-level constants to the value table. - for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) - for (User::const_op_iterator OI = I->op_begin(), E = I->op_end(); - OI != E; ++OI) { - if ((isa<Constant>(*OI) && !isa<GlobalValue>(*OI)) || - isa<InlineAsm>(*OI)) - EnumerateValue(*OI); + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) + for (const Use &OI : I.operands()) { + if ((isa<Constant>(OI) && !isa<GlobalValue>(OI)) || isa<InlineAsm>(OI)) + EnumerateValue(OI); } - BasicBlocks.push_back(BB); - ValueMap[BB] = BasicBlocks.size(); + BasicBlocks.push_back(&BB); + ValueMap[&BB] = BasicBlocks.size(); } // Optimize the constant layout. @@ -759,18 +734,17 @@ void ValueEnumerator::incorporateFunction(const Function &F) { SmallVector<LocalAsMetadata *, 8> FnLocalMDVector; // Add all of the instructions. - for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) { - for (User::const_op_iterator OI = I->op_begin(), E = I->op_end(); - OI != E; ++OI) { - if (auto *MD = dyn_cast<MetadataAsValue>(&*OI)) + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + for (const Use &OI : I.operands()) { + if (auto *MD = dyn_cast<MetadataAsValue>(&OI)) if (auto *Local = dyn_cast<LocalAsMetadata>(MD->getMetadata())) // Enumerate metadata after the instructions they might refer to.
FnLocalMDVector.push_back(Local); } - if (!I->getType()->isVoidTy()) - EnumerateValue(I); + if (!I.getType()->isVoidTy()) + EnumerateValue(&I); } } @@ -784,7 +758,7 @@ void ValueEnumerator::purgeFunction() { for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i) ValueMap.erase(Values[i].first); for (unsigned i = NumModuleMDs, e = MDs.size(); i != e; ++i) - MDValueMap.erase(MDs[i]); + MetadataMap.erase(MDs[i]); for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i) ValueMap.erase(BasicBlocks[i]); @@ -797,8 +771,8 @@ void ValueEnumerator::purgeFunction() { static void IncorporateFunctionInfoGlobalBBIDs(const Function *F, DenseMap<const BasicBlock*, unsigned> &IDMap) { unsigned Counter = 0; - for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - IDMap[BB] = ++Counter; + for (const BasicBlock &BB : *F) + IDMap[&BB] = ++Counter; } /// getGlobalBasicBlockID - This returns the function-specific ID for the diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h index 92d166e..9fb8325 100644 --- a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h +++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h @@ -63,7 +63,7 @@ private: std::vector<const Metadata *> MDs; SmallVector<const LocalAsMetadata *, 8> FunctionLocalMDs; typedef DenseMap<const Metadata *, unsigned> MetadataMapType; - MetadataMapType MDValueMap; + MetadataMapType MetadataMap; bool HasMDString; bool HasDILocation; bool HasGenericDINode; @@ -93,7 +93,7 @@ private: /// before incorporation. unsigned NumModuleValues; - /// When a function is incorporated, this is the size of the MDValues list + /// When a function is incorporated, this is the size of the Metadatas list /// before incorporation. unsigned NumModuleMDs; @@ -117,8 +117,9 @@ public: return ID - 1; } unsigned getMetadataOrNullID(const Metadata *MD) const { - return MDValueMap.lookup(MD); + return MetadataMap.lookup(MD); } + unsigned numMDs() const { return MDs.size(); } bool hasMDString() const { return HasMDString; } bool hasDILocation() const { return HasDILocation; } diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 5fe4c4b..4060db7 100644 --- a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -142,16 +142,15 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { assert(!State); State = new AggressiveAntiDepState(TRI->getNumRegs(), BB); - bool IsReturnBlock = (!BB->empty() && BB->back().isReturn()); + bool IsReturnBlock = BB->isReturnBlock(); std::vector<unsigned> &KillIndices = State->GetKillIndices(); std::vector<unsigned> &DefIndices = State->GetDefIndices(); // Examine the live-in regs of all successors. for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + for (const auto &LI : (*SI)->liveins()) { + for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; State->UnionGroups(Reg, 0); KillIndices[Reg] = BB->size(); @@ -365,9 +364,11 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // If MI's defs have a special allocation requirement, don't allow // any def registers to be changed. 
Also assume all registers - // defined in a call must not be changed (ABI). + // defined in a call must not be changed (ABI). Inline assembly may + // reference either system calls or the register directly. Skip it until we + // can tell user specified registers from compiler-specified. if (MI->isCall() || MI->hasExtraDefRegAllocReq() || - TII->isPredicated(MI)) { + TII->isPredicated(MI) || MI->isInlineAsm()) { DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)"); State->UnionGroups(Reg, 0); } @@ -429,6 +430,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // If MI's uses have special allocation requirement, don't allow // any use registers to be changed. Also assume all registers // used in a call must not be changed (ABI). + // Inline Assembly register uses also cannot be safely changed. // FIXME: The issue with predicated instruction is more complex. We are being // conservative here because the kill markers cannot be trusted after // if-conversion: @@ -444,7 +446,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // changed. bool Special = MI->isCall() || MI->hasExtraSrcRegAllocReq() || - TII->isPredicated(MI); + TII->isPredicated(MI) || MI->isInlineAsm(); // Scan the register uses for this instruction and update // live-ranges, groups and RegRefs. @@ -509,15 +511,8 @@ BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) { // Check all references that need rewriting for Reg. For each, use // the corresponding register class to narrow the set of registers // that are appropriate for renaming. - std::pair<std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator, - std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator> - Range = State->GetRegRefs().equal_range(Reg); - for (std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator Q = Range.first, - QE = Range.second; Q != QE; ++Q) { - const TargetRegisterClass *RC = Q->second.RC; + for (const auto &Q : make_range(State->GetRegRefs().equal_range(Reg))) { + const TargetRegisterClass *RC = Q.second.RC; if (!RC) continue; BitVector RCBV = TRI->getAllocatableSet(MF, RC); @@ -685,9 +680,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( // We cannot rename 'Reg' to 'NewReg' if one of the uses of 'Reg' also // defines 'NewReg' via an early-clobber operand. - auto Range = RegRefs.equal_range(Reg); - for (auto Q = Range.first, QE = Range.second; Q != QE; ++Q) { - auto UseMI = Q->second.Operand->getParent(); + for (const auto &Q : make_range(RegRefs.equal_range(Reg))) { + MachineInstr *UseMI = Q.second.Operand->getParent(); int Idx = UseMI->findRegisterDefOperandIdx(NewReg, false, true, TRI); if (Idx == -1) continue; @@ -698,6 +692,20 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( } } + // Also, we cannot rename 'Reg' to 'NewReg' if the instruction defining + // 'Reg' is an early-clobber define and that instruction also uses + // 'NewReg'. + for (const auto &Q : make_range(RegRefs.equal_range(Reg))) { + if (!Q.second.Operand->isDef() || !Q.second.Operand->isEarlyClobber()) + continue; + + MachineInstr *DefMI = Q.second.Operand->getParent(); + if (DefMI->readsRegister(NewReg, TRI)) { + DEBUG(dbgs() << "(ec)"); + goto next_super_reg; + } + } + + // Record that 'Reg' can be renamed to 'NewReg'.
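The added loop above enforces the mirror image of the early-clobber check before it: renaming Reg to NewReg is also illegal when Reg's own definition is an early-clobber operand and the defining instruction reads NewReg, because an early-clobber def is written before the instruction's uses are consumed. A reduced sketch of that predicate (hypothetical helper, not part of the patch), before the RenameMap insertion below records the candidate:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/Target/TargetRegisterInfo.h"
    using namespace llvm;

    // True when renaming the register defined by the early-clobber operand
    // MO to NewReg would let the write clobber a value the same instruction
    // still needs to read.
    static bool earlyClobberRenameClashes(const MachineOperand &MO,
                                          unsigned NewReg,
                                          const TargetRegisterInfo *TRI) {
      if (!MO.isDef() || !MO.isEarlyClobber())
        return false;
      return MO.getParent()->readsRegister(NewReg, TRI);
    }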
RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg)); } @@ -920,23 +928,16 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // Update the references to the old register CurrReg to // refer to the new register NewReg. - std::pair<std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator, - std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator> - Range = RegRefs.equal_range(CurrReg); - for (std::multimap<unsigned, - AggressiveAntiDepState::RegisterReference>::iterator - Q = Range.first, QE = Range.second; Q != QE; ++Q) { - Q->second.Operand->setReg(NewReg); + for (const auto &Q : make_range(RegRefs.equal_range(CurrReg))) { + Q.second.Operand->setReg(NewReg); // If the SU for the instruction being updated has debug // information related to the anti-dependency register, make // sure to update that as well. - const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()]; + const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()]; if (!SU) continue; for (DbgValueVector::iterator DVI = DbgValues.begin(), DVE = DbgValues.end(); DVI != DVE; ++DVI) - if (DVI->second == Q->second.Operand->getParent()) + if (DVI->second == Q.second.Operand->getParent()) UpdateDbgValue(DVI->first, AntiDepReg, NewReg); } diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp index dc9bcff..40451c0 100644 --- a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp +++ b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp @@ -29,12 +29,13 @@ using namespace llvm; // Compare VirtRegMap::getRegAllocPref(). AllocationOrder::AllocationOrder(unsigned VirtReg, const VirtRegMap &VRM, - const RegisterClassInfo &RegClassInfo) + const RegisterClassInfo &RegClassInfo, + const LiveRegMatrix *Matrix) : Pos(0) { const MachineFunction &MF = VRM.getMachineFunction(); const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo(); Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg)); - TRI->getRegAllocationHints(VirtReg, Order, Hints, MF, &VRM); + TRI->getRegAllocationHints(VirtReg, Order, Hints, MF, &VRM, Matrix); rewind(); DEBUG({ diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.h b/contrib/llvm/lib/CodeGen/AllocationOrder.h index 02b2d92..2aee3a6 100644 --- a/contrib/llvm/lib/CodeGen/AllocationOrder.h +++ b/contrib/llvm/lib/CodeGen/AllocationOrder.h @@ -24,6 +24,7 @@ namespace llvm { class RegisterClassInfo; class VirtRegMap; +class LiveRegMatrix; class LLVM_LIBRARY_VISIBILITY AllocationOrder { SmallVector<MCPhysReg, 16> Hints; @@ -37,7 +38,8 @@ public: /// @param RegClassInfo Information about reserved and allocatable registers. AllocationOrder(unsigned VirtReg, const VirtRegMap &VRM, - const RegisterClassInfo &RegClassInfo); + const RegisterClassInfo &RegClassInfo, + const LiveRegMatrix *Matrix); /// Get the allocation order without reordered hints. 
ArrayRef<MCPhysReg> getOrder() const { return Order; } diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp index 98d4c8a..75579a2 100644 --- a/contrib/llvm/lib/CodeGen/Analysis.cpp +++ b/contrib/llvm/lib/CodeGen/Analysis.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -25,6 +26,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/GlobalStatus.h" @@ -515,7 +517,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) { if (isa<DbgInfoIntrinsic>(BBI)) continue; if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() || - !isSafeToSpeculativelyExecute(BBI)) + !isSafeToSpeculativelyExecute(&*BBI)) return false; } @@ -643,3 +645,97 @@ bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) { return !GS.IsCompared; } + +static void collectFuncletMembers( + DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet, + const MachineBasicBlock *MBB) { + // Add this MBB to our funclet. + auto P = FuncletMembership.insert(std::make_pair(MBB, Funclet)); + + // Don't revisit blocks. + if (!P.second) { + assert(P.first->second == Funclet && "MBB is part of two funclets!"); + return; + } + + bool IsReturn = false; + int NumTerminators = 0; + for (const MachineInstr &MI : MBB->terminators()) { + IsReturn |= MI.isReturn(); + ++NumTerminators; + } + assert((!IsReturn || NumTerminators == 1) && + "Expected only one terminator when a return is present!"); + + // Returns are boundaries where funclet transfer can occur, don't follow + // successors. + if (IsReturn) + return; + + for (const MachineBasicBlock *SMBB : MBB->successors()) + if (!SMBB->isEHPad()) + collectFuncletMembers(FuncletMembership, Funclet, SMBB); +} + +DenseMap<const MachineBasicBlock *, int> +llvm::getFuncletMembership(const MachineFunction &MF) { + DenseMap<const MachineBasicBlock *, int> FuncletMembership; + + // We don't have anything to do if there aren't any EH pads. + if (!MF.getMMI().hasEHFunclets()) + return FuncletMembership; + + int EntryBBNumber = MF.front().getNumber(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())); + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + SmallVector<const MachineBasicBlock *, 16> FuncletBlocks; + SmallVector<const MachineBasicBlock *, 16> UnreachableBlocks; + SmallVector<const MachineBasicBlock *, 16> SEHCatchPads; + SmallVector<std::pair<const MachineBasicBlock *, int>, 16> CatchRetSuccessors; + for (const MachineBasicBlock &MBB : MF) { + if (MBB.isEHFuncletEntry()) { + FuncletBlocks.push_back(&MBB); + } else if (IsSEH && MBB.isEHPad()) { + SEHCatchPads.push_back(&MBB); + } else if (MBB.pred_empty()) { + UnreachableBlocks.push_back(&MBB); + } + + MachineBasicBlock::const_iterator MBBI = MBB.getFirstTerminator(); + // CatchPads are not funclets for SEH so do not consider CatchRet to + // transfer control to another funclet. + if (MBBI->getOpcode() != TII->getCatchReturnOpcode()) + continue; + + // FIXME: SEH CatchPads are not necessarily in the parent function: + // they could be inside a finally block. 
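getFuncletMembership, whose body continues below, amounts to a series of flood fills: the function entry, each funclet entry, and each catchret target seeds collectFuncletMembers, which colors blocks along successor edges, stops at returns and EH pads, and asserts if a block would receive two colors. A toy model of the coloring over a plain successor graph (names hypothetical):

    #include <cassert>
    #include <map>
    #include <vector>

    struct Block { std::vector<const Block *> Succs; bool IsReturn = false; };

    static void colorFunclet(std::map<const Block *, int> &Color, int Funclet,
                             const Block *B) {
      if (!Color.insert({B, Funclet}).second) {
        assert(Color.at(B) == Funclet && "block is part of two funclets");
        return; // Already colored; don't revisit.
      }
      if (B->IsReturn)
        return; // A return ends the funclet; don't follow successors.
      for (const Block *S : B->Succs)
        colorFunclet(Color, Funclet, S);
    }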
+ const MachineBasicBlock *Successor = MBBI->getOperand(0).getMBB(); + const MachineBasicBlock *SuccessorColor = MBBI->getOperand(1).getMBB(); + CatchRetSuccessors.push_back( + {Successor, IsSEH ? EntryBBNumber : SuccessorColor->getNumber()}); + } + + // We don't have anything to do if there aren't any EH pads. + if (FuncletBlocks.empty()) + return FuncletMembership; + + // Identify all the basic blocks reachable from the function entry. + collectFuncletMembers(FuncletMembership, EntryBBNumber, &MF.front()); + // All blocks not part of a funclet are in the parent function. + for (const MachineBasicBlock *MBB : UnreachableBlocks) + collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB); + // Next, identify all the blocks inside the funclets. + for (const MachineBasicBlock *MBB : FuncletBlocks) + collectFuncletMembers(FuncletMembership, MBB->getNumber(), MBB); + // SEH CatchPads aren't really funclets, handle them separately. + for (const MachineBasicBlock *MBB : SEHCatchPads) + collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB); + // Finally, identify all the targets of a catchret. + for (std::pair<const MachineBasicBlock *, int> CatchRetPair : + CatchRetSuccessors) + collectFuncletMembers(FuncletMembership, CatchRetPair.second, + CatchRetPair.first); + return FuncletMembership; +} diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp index 0bad795..ade2d71 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -73,7 +73,6 @@ void ARMException::endFunction(const MachineFunction *MF) { const Function *Per = nullptr; if (F->hasPersonalityFn()) Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts()); - assert(!MMI->getPersonality() || Per == MMI->getPersonality()); bool forceEmitPersonality = F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) && F->needsUnwindTableEntry(); @@ -115,9 +114,7 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding) { Entry = TypeInfos.size(); } - for (std::vector<const GlobalValue *>::const_reverse_iterator - I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) { - const GlobalValue *GV = *I; + for (const GlobalValue *GV : reverse(TypeInfos)) { if (VerboseAsm) Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--)); Asm->EmitTTypeReference(GV, TTypeEncoding); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 125047e..5f67d3d 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -135,11 +135,14 @@ const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const { return *TM.getObjFileLowering(); } -/// getDataLayout - Return information about data layout. const DataLayout &AsmPrinter::getDataLayout() const { - return *TM.getDataLayout(); + return MMI->getModule()->getDataLayout(); } +// Do not use the cached DataLayout because some clients use it without a Module +// (llvm-dsymutil, llvm-dwarfdump). +unsigned AsmPrinter::getPointerSize() const { return TM.getPointerSize(); } + const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const { assert(MF && "getSubtargetInfo requires a valid MachineFunction!"); return MF->getSubtarget<MCSubtargetInfo>(); } @@ -189,14 +192,26 @@ bool AsmPrinter::doInitialization(Module &M) { // use the directive, where it would need the same conditionalization // anyway.
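The doInitialization hunk below replaces the old two-way macOS/iOS branch with a four-way choice of version-min directive, since watchOS and tvOS now carry their own deployment targets. The selection logic, condensed into a sketch (the MCVM_* values and Triple accessors are the real ones; the helper itself is hypothetical):

    #include "llvm/ADT/Triple.h"
    #include "llvm/MC/MCDirectives.h"
    using namespace llvm;

    // Returns the directive kind and fills in the version triple; a Major of
    // zero tells the caller to emit nothing.
    static MCVersionMinType pickVersionMin(const Triple &TT, unsigned &Major,
                                           unsigned &Minor, unsigned &Update) {
      if (TT.isWatchOS()) {
        TT.getWatchOSVersion(Major, Minor, Update);
        return MCVM_WatchOSVersionMin;
      }
      if (TT.isTvOS()) { // tvOS shares the iOS version numbering.
        TT.getiOSVersion(Major, Minor, Update);
        return MCVM_TvOSVersionMin;
      }
      if (TT.isMacOSX()) {
        if (!TT.getMacOSXVersion(Major, Minor, Update))
          Major = 0; // Unparseable version: suppress the directive.
        return MCVM_OSXVersionMin;
      }
      TT.getiOSVersion(Major, Minor, Update);
      return MCVM_IOSVersionMin;
    }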
Triple TT(getTargetTriple()); - if (TT.isOSDarwin()) { + // If there is a version specified, Major will be non-zero. + if (TT.isOSDarwin() && TT.getOSMajorVersion() != 0) { unsigned Major, Minor, Update; - TT.getOSVersion(Major, Minor, Update); - // If there is a version specified, Major will be non-zero. - if (Major) - OutStreamer->EmitVersionMin((TT.isMacOSX() ? - MCVM_OSXVersionMin : MCVM_IOSVersionMin), - Major, Minor, Update); + MCVersionMinType VersionType; + if (TT.isWatchOS()) { + VersionType = MCVM_WatchOSVersionMin; + TT.getWatchOSVersion(Major, Minor, Update); + } else if (TT.isTvOS()) { + VersionType = MCVM_TvOSVersionMin; + TT.getiOSVersion(Major, Minor, Update); + } else if (TT.isMacOSX()) { + VersionType = MCVM_OSXVersionMin; + if (!TT.getMacOSXVersion(Major, Minor, Update)) + Major = 0; + } else { + VersionType = MCVM_IOSVersionMin; + TT.getiOSVersion(Major, Minor, Update); + } + if (Major != 0) + OutStreamer->EmitVersionMin(VersionType, Major, Minor, Update); } // Allow the target to emit any magic that it wants at the start of the file. @@ -224,28 +239,20 @@ bool AsmPrinter::doInitialization(Module &M) { TM.getTargetFeatureString())); OutStreamer->AddComment("Start of file scope inline assembly"); OutStreamer->AddBlankLine(); - EmitInlineAsm(M.getModuleInlineAsm()+"\n", *STI, TM.Options.MCOptions); + EmitInlineAsm(M.getModuleInlineAsm()+"\n", + OutContext.getSubtargetCopy(*STI), TM.Options.MCOptions); OutStreamer->AddComment("End of file scope inline assembly"); OutStreamer->AddBlankLine(); } if (MAI->doesSupportDebugInformation()) { - bool skip_dwarf = false; - if (TM.getTargetTriple().isKnownWindowsMSVCEnvironment()) { + bool EmitCodeView = MMI->getModule()->getCodeViewFlag(); + if (EmitCodeView && TM.getTargetTriple().isKnownWindowsMSVCEnvironment()) { Handlers.push_back(HandlerInfo(new WinCodeViewLineTables(this), DbgTimerName, CodeViewLineTablesGroupName)); - // FIXME: Don't emit DWARF debug info if there's at least one function - // with AddressSanitizer instrumentation. - // This is a band-aid fix for PR22032. - for (auto &F : M.functions()) { - if (F.hasFnAttribute(Attribute::SanitizeAddress)) { - skip_dwarf = true; - break; - } - } } - if (!skip_dwarf) { + if (!EmitCodeView || MMI->getModule()->getDwarfVersion()) { DD = new DwarfDebug(this, &M); Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName)); } @@ -340,8 +347,51 @@ MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const { return TM.getSymbol(GV, *Mang); } +static MCSymbol *getOrCreateEmuTLSControlSym(MCSymbol *GVSym, MCContext &C) { + return C.getOrCreateSymbol(Twine("__emutls_v.") + GVSym->getName()); +} + +static MCSymbol *getOrCreateEmuTLSInitSym(MCSymbol *GVSym, MCContext &C) { + return C.getOrCreateSymbol(Twine("__emutls_t.") + GVSym->getName()); +} + +/// EmitEmulatedTLSControlVariable - Emit the control variable for an emulated TLS variable. 
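EmitEmulatedTLSControlVariable, defined below, materializes the record that the emutls runtime expects for every thread-local variable x: four pointer-sized words under the name __emutls_v.x giving the variable's size, its alignment, a zero word for the runtime's lazily assigned slot, and either null or a pointer to an __emutls_t.x template holding the initial value. In C++ terms the emitted object corresponds roughly to:

    #include <cstddef>

    // Approximate shape of the __emutls_v.<name> record; illustrative only,
    // since the AsmPrinter below writes the four words directly.
    struct EmuTLSControl {
      std::size_t Size;  // byte size of the TLS variable
      std::size_t Align; // 1 << AlignLog, its required alignment
      std::size_t Index; // 0 at compile time; the runtime's slot once used
      const void *Templ; // __emutls_t.<name>, or null when zero-initialized
    };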
+void AsmPrinter::EmitEmulatedTLSControlVariable(const GlobalVariable *GV, + MCSymbol *EmittedSym, + bool AllZeroInitValue) { + MCSection *TLSVarSection = getObjFileLowering().getDataSection(); + OutStreamer->SwitchSection(TLSVarSection); + MCSymbol *GVSym = getSymbol(GV); + EmitLinkage(GV, EmittedSym); // same linkage as GV + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + unsigned AlignLog = getGVAlignmentLog2(GV, DL); + unsigned WordSize = DL.getPointerSize(); + unsigned Alignment = DL.getPointerABIAlignment(); + EmitAlignment(Log2_32(Alignment)); + OutStreamer->EmitLabel(EmittedSym); + OutStreamer->EmitIntValue(Size, WordSize); + OutStreamer->EmitIntValue((1 << AlignLog), WordSize); + OutStreamer->EmitIntValue(0, WordSize); + if (GV->hasInitializer() && !AllZeroInitValue) { + OutStreamer->EmitSymbolValue( + getOrCreateEmuTLSInitSym(GVSym, OutContext), WordSize); + } else + OutStreamer->EmitIntValue(0, WordSize); + if (MAI->hasDotTypeDotSizeDirective()) + OutStreamer->emitELFSize(cast<MCSymbolELF>(EmittedSym), + MCConstantExpr::create(4 * WordSize, OutContext)); + OutStreamer->AddBlankLine(); // End of the __emutls_v.* variable. +} + /// EmitGlobalVariable - Emit the specified global variable to the .s file. void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + bool IsEmuTLSVar = + GV->getThreadLocalMode() != llvm::GlobalVariable::NotThreadLocal && + TM.Options.EmulatedTLS; + assert(!(IsEmuTLSVar && GV->hasCommonLinkage()) && + "No emulated TLS variables in the common section"); + if (GV->hasInitializer()) { // Check to see if this is a special global used by LLVM, if so, emit it. if (EmitSpecialLLVMGlobal(GV)) @@ -352,7 +402,9 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GlobalGOTEquivs.count(getSymbol(GV))) return; - if (isVerbose()) { + if (isVerbose() && !IsEmuTLSVar) { + // When printing the control variable __emutls_v.*, + // we don't need to print the original TLS variable name. GV->printAsOperand(OutStreamer->GetCommentOS(), /*PrintType=*/false, GV->getParent()); OutStreamer->GetCommentOS() << '\n'; @@ -360,7 +412,12 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { } MCSymbol *GVSym = getSymbol(GV); - EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + MCSymbol *EmittedSym = IsEmuTLSVar ? + getOrCreateEmuTLSControlSym(GVSym, OutContext) : GVSym; + // getOrCreateEmuTLSControlSym only creates the symbol with name and default attributes. + // GV's or GVSym's attributes will be used for the EmittedSym. + + EmitVisibility(EmittedSym, GV->getVisibility(), !GV->isDeclaration()); if (!GV->hasInitializer()) // External globals require no extra code. return; @@ -371,17 +428,29 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { "' is already defined"); if (MAI->hasDotTypeDotSizeDirective()) - OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject); + OutStreamer->EmitSymbolAttribute(EmittedSym, MCSA_ELF_TypeObject); SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM); - const DataLayout *DL = TM.getDataLayout(); - uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType()); + const DataLayout &DL = GV->getParent()->getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); // If the alignment is specified, we *must* obey it. 
Overaligning a global // with a specified alignment is a prompt way to break globals emitted to // sections and expected to be contiguous (e.g. ObjC metadata). - unsigned AlignLog = getGVAlignmentLog2(GV, *DL); + unsigned AlignLog = getGVAlignmentLog2(GV, DL); + + bool AllZeroInitValue = false; + const Constant *InitValue = GV->getInitializer(); + if (isa<ConstantAggregateZero>(InitValue)) + AllZeroInitValue = true; + else { + const ConstantInt *InitIntValue = dyn_cast<ConstantInt>(InitValue); + if (InitIntValue && InitIntValue->isZero()) + AllZeroInitValue = true; + } + if (IsEmuTLSVar) + EmitEmulatedTLSControlVariable(GV, EmittedSym, AllZeroInitValue); for (const HandlerInfo &HI : Handlers) { NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); @@ -390,6 +459,8 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Handle common and BSS local symbols (.lcomm). if (GVKind.isCommon() || GVKind.isBSSLocal()) { + assert(!(IsEmuTLSVar && GVKind.isCommon()) && + "No emulated TLS variables in the common section"); if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. unsigned Align = 1 << AlignLog; @@ -434,12 +505,21 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { return; } - MCSection *TheSection = + if (IsEmuTLSVar && AllZeroInitValue) + return; // No need of initialization values. + + MCSymbol *EmittedInitSym = IsEmuTLSVar ? + getOrCreateEmuTLSInitSym(GVSym, OutContext) : GVSym; + // getOrCreateEmuTLSInitSym only creates the symbol with name and default attributes. + // GV's or GVSym's attributes will be used for the EmittedInitSym. + + MCSection *TheSection = IsEmuTLSVar ? + getObjFileLowering().getReadOnlySection() : getObjFileLowering().SectionForGlobal(GV, GVKind, *Mang, TM); // Handle the zerofill directive on darwin, which is a special form of BSS // emission. - if (GVKind.isBSSExtern() && MAI->hasMachoZeroFillDirective()) { + if (GVKind.isBSSExtern() && MAI->hasMachoZeroFillDirective() && !IsEmuTLSVar) { if (Size == 0) Size = 1; // zerofill of 0 bytes is undefined. // .globl _foo @@ -459,7 +539,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // TLOF class. This will also make it more obvious that stuff like // MCStreamer::EmitTBSSSymbol is macho specific and only called from macho // specific code. 
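For Mach-O targets the hunk below keeps the two-object scheme the comment describes: the initializer lands in a <name>$tlv$init symbol, while the visible symbol is a three-word thread-local-variable descriptor. A sketch of that descriptor's layout (illustrative; the words are emitted directly):

    // Darwin TLV descriptor, one per thread-local symbol.
    struct TLVDescriptor {
      void *Thunk; // _tlv_bootstrap: ensures runtime support is linked in
      void *Spare; // zero in the object file; owned by the runtime
      void *Init;  // the <name>$tlv$init symbol with the initial value
    };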
- if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective()) { + if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective() && !IsEmuTLSVar) { // Emit the .tbss symbol MCSymbol *MangSym = OutContext.getOrCreateSymbol(GVSym->getName() + Twine("$tlv$init")); @@ -473,7 +553,8 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { EmitAlignment(AlignLog, GV); OutStreamer->EmitLabel(MangSym); - EmitGlobalConstant(GV->getInitializer()); + EmitGlobalConstant(GV->getParent()->getDataLayout(), + GV->getInitializer()); } OutStreamer->AddBlankLine(); @@ -490,7 +571,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // - __tlv_bootstrap - used to make sure support exists // - spare pointer, used when mapped by the runtime // - pointer to mangled symbol above with initializer - unsigned PtrSize = DL->getPointerTypeSize(GV->getType()); + unsigned PtrSize = DL.getPointerTypeSize(GV->getType()); OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"), PtrSize); OutStreamer->EmitIntValue(0, PtrSize); @@ -502,16 +583,18 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { OutStreamer->SwitchSection(TheSection); - EmitLinkage(GV, GVSym); + // emutls_t.* symbols are only used in the current compilation unit. + if (!IsEmuTLSVar) + EmitLinkage(GV, EmittedInitSym); EmitAlignment(AlignLog, GV); - OutStreamer->EmitLabel(GVSym); + OutStreamer->EmitLabel(EmittedInitSym); - EmitGlobalConstant(GV->getInitializer()); + EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); if (MAI->hasDotTypeDotSizeDirective()) // .size foo, 42 - OutStreamer->emitELFSize(cast<MCSymbolELF>(GVSym), + OutStreamer->emitELFSize(cast<MCSymbolELF>(EmittedInitSym), MCConstantExpr::create(Size, OutContext)); OutStreamer->AddBlankLine(); @@ -545,7 +628,7 @@ void AsmPrinter::EmitFunctionHeader() { // Emit the prefix data. if (F->hasPrefixData()) - EmitGlobalConstant(F->getPrefixData()); + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData()); // Emit the CurrentFnSym. This is a virtual function to allow targets to // do their wild and crazy things as required. @@ -580,7 +663,7 @@ void AsmPrinter::EmitFunctionHeader() { // Emit the prologue data. if (F->hasPrologueData()) - EmitGlobalConstant(F->getPrologueData()); + EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrologueData()); } /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the @@ -640,19 +723,27 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { /// that is an implicit def. void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const { unsigned RegNo = MI->getOperand(0).getReg(); - OutStreamer->AddComment(Twine("implicit-def: ") + - MMI->getContext().getRegisterInfo()->getName(RegNo)); + + SmallString<128> Str; + raw_svector_ostream OS(Str); + OS << "implicit-def: " + << PrintReg(RegNo, MF->getSubtarget().getRegisterInfo()); + + OutStreamer->AddComment(OS.str()); OutStreamer->AddBlankLine(); } static void emitKill(const MachineInstr *MI, AsmPrinter &AP) { - std::string Str = "kill:"; + std::string Str; + raw_string_ostream OS(Str); + OS << "kill:"; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &Op = MI->getOperand(i); assert(Op.isReg() && "KILL instruction must have only register operands"); - Str += ' '; - Str += AP.MMI->getContext().getRegisterInfo()->getName(Op.getReg()); - Str += (Op.isDef() ? 
"<def>" : "<kill>"); + OS << ' ' + << PrintReg(Op.getReg(), + AP.MF->getSubtarget().getRegisterInfo()) + << (Op.isDef() ? "<def>" : "<kill>"); } AP.OutStreamer->AddComment(Str); AP.OutStreamer->AddBlankLine(); @@ -688,6 +779,31 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm(); int64_t Offset = Deref ? MI->getOperand(1).getImm() : 0; + for (unsigned i = 0; i < Expr->getNumElements(); ++i) { + if (Deref) { + // We currently don't support extra Offsets or derefs after the first + // one. Bail out early instead of emitting an incorrect comment + OS << " [complex expression]"; + AP.OutStreamer->emitRawComment(OS.str()); + return true; + } + uint64_t Op = Expr->getElement(i); + if (Op == dwarf::DW_OP_deref) { + Deref = true; + continue; + } else if (Op == dwarf::DW_OP_bit_piece) { + // There can't be any operands after this in a valid expression + break; + } + uint64_t ExtraOffset = Expr->getElement(i++); + if (Op == dwarf::DW_OP_plus) + Offset += ExtraOffset; + else { + assert(Op == dwarf::DW_OP_minus); + Offset -= ExtraOffset; + } + } + // Register or immediate value. Register 0 means undef. if (MI->getOperand(0).isFPImm()) { APFloat APF = APFloat(MI->getOperand(0).getFPImm()->getValueAPF()); @@ -727,7 +843,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { } if (Deref) OS << '['; - OS << AP.MMI->getContext().getRegisterInfo()->getName(Reg); + OS << PrintReg(Reg, AP.MF->getSubtarget().getRegisterInfo()); } if (Deref) @@ -888,7 +1004,7 @@ void AsmPrinter::EmitFunctionBody() { EmitFunctionBodyEnd(); if (!MMI->getLandingPads().empty() || MMI->hasDebugInfo() || - MAI->hasDotTypeDotSizeDirective()) { + MMI->hasEHFunclets() || MAI->hasDotTypeDotSizeDirective()) { // Create a symbol for the end of function. CurrentFnEnd = createTempSymbol("func_end"); OutStreamer->EmitLabel(CurrentFnEnd); @@ -1047,20 +1163,17 @@ bool AsmPrinter::doFinalization(Module &M) { // Output stubs for external and common global variables. MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { - OutStreamer->SwitchSection(TLOF.getDataRelSection()); - const DataLayout *DL = TM.getDataLayout(); + OutStreamer->SwitchSection(TLOF.getDataSection()); + const DataLayout &DL = M.getDataLayout(); for (const auto &Stub : Stubs) { OutStreamer->EmitLabel(Stub.first); OutStreamer->EmitSymbolValue(Stub.second.getPointer(), - DL->getPointerSize()); + DL.getPointerSize()); } } } - // Make sure we wrote out everything we need. - OutStreamer->Flush(); - // Finalize debug and EH information. for (const HandlerInfo &HI : Handlers) { NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, @@ -1103,10 +1216,29 @@ bool AsmPrinter::doFinalization(Module &M) { else assert(Alias.hasLocalLinkage() && "Invalid alias linkage"); + // Set the symbol type to function if the alias has a function type. + // This affects codegen when the aliasee is not a function. + if (Alias.getType()->getPointerElementType()->isFunctionTy()) + OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction); + EmitVisibility(Name, Alias.getVisibility()); // Emit the directives as assignments aka .set: OutStreamer->EmitAssignment(Name, lowerConstant(Alias.getAliasee())); + + // If the aliasee does not correspond to a symbol in the output, i.e. the + // alias is not of an object or the aliased object is private, then set the + // size of the alias symbol from the type of the alias. 
We don't do this in + // other situations as the alias and aliasee having differing types but same + // size may be intentional. + const GlobalObject *BaseObject = Alias.getBaseObject(); + if (MAI->hasDotTypeDotSizeDirective() && Alias.getValueType()->isSized() && + (!BaseObject || BaseObject->hasPrivateLinkage())) { + const DataLayout &DL = M.getDataLayout(); + uint64_t Size = DL.getTypeAllocSize(Alias.getValueType()); + OutStreamer->emitELFSize(cast<MCSymbolELF>(Name), + MCConstantExpr::create(Size, OutContext)); + } } GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>(); @@ -1120,16 +1252,16 @@ bool AsmPrinter::doFinalization(Module &M) { // Emit __morestack address if needed for indirect calls. if (MMI->usesMorestackAddr()) { - MCSection *ReadOnlySection = - getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(), - /*C=*/nullptr); + MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant( + getDataLayout(), SectionKind::getReadOnly(), + /*C=*/nullptr); OutStreamer->SwitchSection(ReadOnlySection); MCSymbol *AddrSymbol = OutContext.getOrCreateSymbol(StringRef("__morestack_addr")); OutStreamer->EmitLabel(AddrSymbol); - unsigned PtrSize = TM.getDataLayout()->getPointerSize(0); + unsigned PtrSize = M.getDataLayout().getPointerSize(0); OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("__morestack"), PtrSize); } @@ -1169,7 +1301,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurExceptionSym = nullptr; bool NeedsLocalForSize = MAI->needsLocalForSize(); if (!MMI->getLandingPads().empty() || MMI->hasDebugInfo() || - NeedsLocalForSize) { + MMI->hasEHFunclets() || NeedsLocalForSize) { CurrentFnBegin = createTempSymbol("func_begin"); if (NeedsLocalForSize) CurrentFnSymForSize = CurrentFnBegin; @@ -1206,14 +1338,14 @@ void AsmPrinter::EmitConstantPool() { const MachineConstantPoolEntry &CPE = CP[i]; unsigned Align = CPE.getAlignment(); - SectionKind Kind = - CPE.getSectionKind(TM.getDataLayout()); + SectionKind Kind = CPE.getSectionKind(&getDataLayout()); const Constant *C = nullptr; if (!CPE.isMachineConstantPoolEntry()) C = CPE.Val.ConstVal; - MCSection *S = getObjFileLowering().getSectionForConstant(Kind, C); + MCSection *S = + getObjFileLowering().getSectionForConstant(getDataLayout(), Kind, C); // The number of sections are small, just do a linear search from the // last section to the first. @@ -1260,14 +1392,13 @@ void AsmPrinter::EmitConstantPool() { OutStreamer->EmitZeros(NewOffset - Offset); Type *Ty = CPE.getType(); - Offset = NewOffset + - TM.getDataLayout()->getTypeAllocSize(Ty); + Offset = NewOffset + getDataLayout().getTypeAllocSize(Ty); OutStreamer->EmitLabel(Sym); if (CPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(CPE.Val.MachineCPVal); else - EmitGlobalConstant(CPE.Val.ConstVal); + EmitGlobalConstant(getDataLayout(), CPE.Val.ConstVal); } } } @@ -1276,7 +1407,7 @@ void AsmPrinter::EmitConstantPool() { /// by the current function to the current output stream. 
/// void AsmPrinter::EmitJumpTableInfo() { - const DataLayout *DL = MF->getTarget().getDataLayout(); + const DataLayout &DL = MF->getDataLayout(); const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); if (!MJTI) return; if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return; @@ -1296,8 +1427,7 @@ void AsmPrinter::EmitJumpTableInfo() { OutStreamer->SwitchSection(ReadOnlySection); } - EmitAlignment(Log2_32( - MJTI->getEntryAlignment(*TM.getDataLayout()))); + EmitAlignment(Log2_32(MJTI->getEntryAlignment(DL))); // Jump tables in code sections are marked with a data_region directive // where that's supported. @@ -1335,7 +1465,7 @@ void AsmPrinter::EmitJumpTableInfo() { // before each jump table. The first label is never referenced, but tells // the assembler and linker the extents of the jump table object. The // second label is actually referenced by the code. - if (JTInDiffSection && DL->hasLinkerPrivateGlobalPrefix()) + if (JTInDiffSection && DL.hasLinkerPrivateGlobalPrefix()) // FIXME: This doesn't have to have any specific name, just any randomly // named and numbered 'l' label would work. Simplify GetJTISymbol. OutStreamer->EmitLabel(GetJTISymbol(JTI, true)); @@ -1409,8 +1539,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, assert(Value && "Unknown entry kind!"); - unsigned EntrySize = - MJTI->getEntrySize(*TM.getDataLayout()); + unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); OutStreamer->EmitValue(Value, EntrySize); } @@ -1435,7 +1564,8 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) { assert(GV->hasInitializer() && "Not a special LLVM global!"); if (GV->getName() == "llvm.global_ctors") { - EmitXXStructorList(GV->getInitializer(), /* isCtor */ true); + EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(), + /* isCtor */ true); if (TM.getRelocationModel() == Reloc::Static && MAI->hasStaticCtorDtorReferenceInStaticMode()) { @@ -1447,7 +1577,8 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) { } if (GV->getName() == "llvm.global_dtors") { - EmitXXStructorList(GV->getInitializer(), /* isCtor */ false); + EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(), + /* isCtor */ false); if (TM.getRelocationModel() == Reloc::Static && MAI->hasStaticCtorDtorReferenceInStaticMode()) { @@ -1485,7 +1616,8 @@ struct Structor { /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init /// priority. -void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { +void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List, + bool isCtor) { // Should be an array of '{ int, void ()* }' structs. The first value is the // init priority. 
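EmitXXStructorList, continuing below, parses that array into (priority, function) records, stable-sorts by priority, and only then emits the pointers. A freestanding sketch of the ordering step (Structor mirrors the local struct in this file; EmitPtr is a hypothetical emission callback):

    #include <algorithm>
    #include <vector>

    struct Structor { int Priority; const void *Func; };

    // Stable sort: entries with equal priority keep their source order, so
    // same-priority constructors still run in declaration order.
    static void emitInPriorityOrder(std::vector<Structor> &Structors,
                                    void (*EmitPtr)(const void *)) {
      std::stable_sort(Structors.begin(), Structors.end(),
                       [](const Structor &L, const Structor &R) {
                         return L.Priority < R.Priority;
                       });
      for (const Structor &S : Structors)
        EmitPtr(S.Func);
    }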
if (!isa<ConstantArray>(List)) return; @@ -1520,8 +1652,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { } // Emit the function pointers in the target-specific order - const DataLayout *DL = TM.getDataLayout(); - unsigned Align = Log2_32(DL->getPointerPrefAlignment()); + unsigned Align = Log2_32(DL.getPointerPrefAlignment()); std::stable_sort(Structors.begin(), Structors.end(), [](const Structor &L, const Structor &R) { return L.Priority < R.Priority; }); @@ -1542,7 +1673,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { OutStreamer->SwitchSection(OutputSection); if (OutStreamer->getCurrentSection() != OutStreamer->getPreviousSection()) EmitAlignment(Align); - EmitXXStructor(S.Func); + EmitXXStructor(DL, S.Func); } } @@ -1621,8 +1752,7 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, // void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const { if (GV) - NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(), - NumBits); + NumBits = getGVAlignmentLog2(GV, GV->getParent()->getDataLayout(), NumBits); if (NumBits == 0) return; // 1-byte aligned: no need to emit alignment. @@ -1668,7 +1798,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { // If the code isn't optimized, there may be outstanding folding // opportunities. Attempt to fold the expression using DataLayout as a // last resort before giving up. - if (Constant *C = ConstantFoldConstantExpression(CE, *TM.getDataLayout())) + if (Constant *C = ConstantFoldConstantExpression(CE, getDataLayout())) if (C != CE) return lowerConstant(C); @@ -1682,11 +1812,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { report_fatal_error(OS.str()); } case Instruction::GetElementPtr: { - const DataLayout &DL = *TM.getDataLayout(); - // Generate a symbolic expression for the byte address - APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0); - cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI); + APInt OffsetAI(getDataLayout().getPointerTypeSizeInBits(CE->getType()), 0); + cast<GEPOperator>(CE)->accumulateConstantOffset(getDataLayout(), OffsetAI); const MCExpr *Base = lowerConstant(CE->getOperand(0)); if (!OffsetAI) @@ -1707,7 +1835,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { return lowerConstant(CE->getOperand(0)); case Instruction::IntToPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Handle casts to pointers by changing them into casts to the appropriate // integer type. This promotes constant folding and simplifies this code. @@ -1718,7 +1846,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { } case Instruction::PtrToInt: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Support only foldable casts to/from pointers that can be eliminated by // changing the pointer to the appropriately sized integer type. @@ -1769,10 +1897,13 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { } } -static void emitGlobalConstantImpl(const Constant *C, AsmPrinter &AP, +static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C, + AsmPrinter &AP, const Constant *BaseCV = nullptr, uint64_t Offset = 0); +static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP); + /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. 
If it is not a repeated sequence, return -1. @@ -1789,9 +1920,9 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) { /// isRepeatedByteSequence - Determine whether the given value is /// composed of a repeated sequence of identical bytes and return the /// byte value. If it is not a repeated sequence, return -1. -static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) { +static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { - uint64_t Size = TM.getDataLayout()->getTypeAllocSizeInBits(V->getType()); + uint64_t Size = DL.getTypeAllocSizeInBits(V->getType()); assert(Size % 8 == 0); // Extend the element to take zero padding into account. @@ -1806,7 +1937,7 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) { // byte. assert(CA->getNumOperands() != 0 && "Should be a CAZ"); Constant *Op0 = CA->getOperand(0); - int Byte = isRepeatedByteSequence(Op0, TM); + int Byte = isRepeatedByteSequence(Op0, DL); if (Byte == -1) return -1; @@ -1823,15 +1954,14 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) { return -1; } -static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS, - AsmPrinter &AP){ +static void emitGlobalConstantDataSequential(const DataLayout &DL, + const ConstantDataSequential *CDS, + AsmPrinter &AP) { // See if we can aggregate this into a .fill, if so, emit it as such. - int Value = isRepeatedByteSequence(CDS, AP.TM); + int Value = isRepeatedByteSequence(CDS, DL); if (Value != -1) { - uint64_t Bytes = - AP.TM.getDataLayout()->getTypeAllocSize( - CDS->getType()); + uint64_t Bytes = DL.getTypeAllocSize(CDS->getType()); // Don't emit a 1-byte object as a .fill. if (Bytes > 1) return AP.OutStreamer->EmitFill(Bytes, Value); @@ -1851,37 +1981,11 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS, AP.OutStreamer->EmitIntValue(CDS->getElementAsInteger(i), ElementByteSize); } - } else if (ElementByteSize == 4) { - // FP Constants are printed as integer constants to avoid losing - // precision. - assert(CDS->getElementType()->isFloatTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - float F; - uint32_t I; - }; - - F = CDS->getElementAsFloat(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "float " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 4); - } } else { - assert(CDS->getElementType()->isDoubleTy()); - for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { - union { - double F; - uint64_t I; - }; - - F = CDS->getElementAsDouble(i); - if (AP.isVerbose()) - AP.OutStreamer->GetCommentOS() << "double " << F << '\n'; - AP.OutStreamer->EmitIntValue(I, 8); - } + for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) + emitGlobalConstantFP(cast<ConstantFP>(CDS->getElementAsConstant(I)), AP); } - const DataLayout &DL = *AP.TM.getDataLayout(); unsigned Size = DL.getTypeAllocSize(CDS->getType()); unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) * CDS->getNumElements(); @@ -1890,12 +1994,12 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS, } -static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP, +static void emitGlobalConstantArray(const DataLayout &DL, + const ConstantArray *CA, AsmPrinter &AP, const Constant *BaseCV, uint64_t Offset) { // See if we can aggregate some values. Make sure it can be // represented as a series of bytes of the constant value. 
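isRepeatedByteSequence is what lets the hunk below collapse a whole aggregate into a single .fill directive: when every byte of the constant has the same value, only that byte and the total size need to be encoded. A self-contained model of the integer case (the real code also widens the element to its alloc size so zero padding participates in the comparison):

    #include <cstdint>

    // Return the repeated byte of an integer spanning Bytes bytes, or -1 if
    // the bytes differ. E.g. 0x01010101 -> 1, but 0x00000102 -> -1.
    static int repeatedByte(uint64_t V, unsigned Bytes) {
      int Byte = int(V & 0xFF);
      for (unsigned I = 1; I != Bytes; ++I)
        if (int(uint8_t(V >> (8 * I))) != Byte)
          return -1;
      return Byte;
    }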
- int Value = isRepeatedByteSequence(CA, AP.TM); - const DataLayout &DL = *AP.TM.getDataLayout(); + int Value = isRepeatedByteSequence(CA, DL); if (Value != -1) { uint64_t Bytes = DL.getTypeAllocSize(CA->getType()); @@ -1903,17 +2007,17 @@ static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP, } else { for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) { - emitGlobalConstantImpl(CA->getOperand(i), AP, BaseCV, Offset); + emitGlobalConstantImpl(DL, CA->getOperand(i), AP, BaseCV, Offset); Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType()); } } } -static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) { +static void emitGlobalConstantVector(const DataLayout &DL, + const ConstantVector *CV, AsmPrinter &AP) { for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i) - emitGlobalConstantImpl(CV->getOperand(i), AP); + emitGlobalConstantImpl(DL, CV->getOperand(i), AP); - const DataLayout &DL = *AP.TM.getDataLayout(); unsigned Size = DL.getTypeAllocSize(CV->getType()); unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) * CV->getType()->getNumElements(); @@ -1921,21 +2025,21 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) { AP.OutStreamer->EmitZeros(Padding); } -static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP, +static void emitGlobalConstantStruct(const DataLayout &DL, + const ConstantStruct *CS, AsmPrinter &AP, const Constant *BaseCV, uint64_t Offset) { // Print the fields in successive locations. Pad to align if needed! - const DataLayout *DL = AP.TM.getDataLayout(); - unsigned Size = DL->getTypeAllocSize(CS->getType()); - const StructLayout *Layout = DL->getStructLayout(CS->getType()); + unsigned Size = DL.getTypeAllocSize(CS->getType()); + const StructLayout *Layout = DL.getStructLayout(CS->getType()); uint64_t SizeSoFar = 0; for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) { const Constant *Field = CS->getOperand(i); // Print the actual field value. - emitGlobalConstantImpl(Field, AP, BaseCV, Offset+SizeSoFar); + emitGlobalConstantImpl(DL, Field, AP, BaseCV, Offset + SizeSoFar); // Check if padding is needed and insert one or more 0s. - uint64_t FieldSize = DL->getTypeAllocSize(Field->getType()); + uint64_t FieldSize = DL.getTypeAllocSize(Field->getType()); uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1)) - Layout->getElementOffset(i)) - FieldSize; SizeSoFar += FieldSize + PadSize; @@ -1974,8 +2078,7 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) { // PPC's long double has odd notions of endianness compared to how LLVM // handles it: p[0] goes first for *big* endian on PPC. - if (AP.TM.getDataLayout()->isBigEndian() && - !CFP->getType()->isPPC_FP128Ty()) { + if (AP.getDataLayout().isBigEndian() && !CFP->getType()->isPPC_FP128Ty()) { int Chunk = API.getNumWords() - 1; if (TrailingBytes) @@ -1993,13 +2096,13 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) { } // Emit the tail padding for the long double. 
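The tail padding emitted just below exists because a type's store size can be smaller than its alloc size: x86_fp80, for instance, stores 10 bytes of data but occupies 12 or 16 bytes depending on ABI alignment, and the gap must be zero-filled so the next constant starts at its proper offset. The computation in isolation (DL assumed to be the module's DataLayout):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"
    #include <cstdint>

    // Bytes of zeros needed after the raw value of Ty has been written.
    static uint64_t tailPadding(const llvm::DataLayout &DL, llvm::Type *Ty) {
      return DL.getTypeAllocSize(Ty) - DL.getTypeStoreSize(Ty);
    }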
- const DataLayout &DL = *AP.TM.getDataLayout(); + const DataLayout &DL = AP.getDataLayout(); AP.OutStreamer->EmitZeros(DL.getTypeAllocSize(CFP->getType()) - DL.getTypeStoreSize(CFP->getType())); } static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { - const DataLayout *DL = AP.TM.getDataLayout(); + const DataLayout &DL = AP.getDataLayout(); unsigned BitWidth = CI->getBitWidth(); // Copy the value as we may massage the layout for constants whose bit width @@ -2016,7 +2119,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { // Big endian: // * Record the extra bits to emit. // * Realign the raw data to emit the chunks of 64-bits. - if (DL->isBigEndian()) { + if (DL.isBigEndian()) { // Basically the structure of the raw data is a chunk of 64-bits cells: // 0 1 BitWidth / 64 // [chunk1][chunk2] ... [chunkN]. @@ -2037,7 +2140,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { // quantities at a time. const uint64_t *RawData = Realigned.getRawData(); for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) { - uint64_t Val = DL->isBigEndian() ? RawData[e - i - 1] : RawData[i]; + uint64_t Val = DL.isBigEndian() ? RawData[e - i - 1] : RawData[i]; AP.OutStreamer->EmitIntValue(Val, 8); } @@ -2045,8 +2148,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) { // Emit the extra bits after the 64-bits chunks. // Emit a directive that fills the expected size. - uint64_t Size = AP.TM.getDataLayout()->getTypeAllocSize( - CI->getType()); + uint64_t Size = AP.getDataLayout().getTypeAllocSize(CI->getType()); Size -= (BitWidth / 64) * 8; assert(Size && Size * 8 >= ExtraBitsSize && (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize))) @@ -2094,7 +2196,7 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME, if (!AP.GlobalGOTEquivs.count(GOTEquivSym)) return; - const GlobalValue *BaseGV = dyn_cast<GlobalValue>(BaseCst); + const GlobalValue *BaseGV = dyn_cast_or_null<GlobalValue>(BaseCst); if (!BaseGV) return; @@ -2149,10 +2251,10 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME, AP.GlobalGOTEquivs[GOTEquivSym] = std::make_pair(GV, NumUses); } -static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP, - const Constant *BaseCV, uint64_t Offset) { - const DataLayout *DL = AP.TM.getDataLayout(); - uint64_t Size = DL->getTypeAllocSize(CV->getType()); +static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, + AsmPrinter &AP, const Constant *BaseCV, + uint64_t Offset) { + uint64_t Size = DL.getTypeAllocSize(CV->getType()); // Globals with sub-elements such as combinations of arrays and structs // are handled recursively by emitGlobalConstantImpl. 
Keep track of the @@ -2189,32 +2291,32 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP, } if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV)) - return emitGlobalConstantDataSequential(CDS, AP); + return emitGlobalConstantDataSequential(DL, CDS, AP); if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV)) - return emitGlobalConstantArray(CVA, AP, BaseCV, Offset); + return emitGlobalConstantArray(DL, CVA, AP, BaseCV, Offset); if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV)) - return emitGlobalConstantStruct(CVS, AP, BaseCV, Offset); + return emitGlobalConstantStruct(DL, CVS, AP, BaseCV, Offset); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) { // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of // vectors). if (CE->getOpcode() == Instruction::BitCast) - return emitGlobalConstantImpl(CE->getOperand(0), AP); + return emitGlobalConstantImpl(DL, CE->getOperand(0), AP); if (Size > 8) { // If the constant expression's size is greater than 64-bits, then we have // to emit the value in chunks. Try to constant fold the value and emit it // that way. - Constant *New = ConstantFoldConstantExpression(CE, *DL); + Constant *New = ConstantFoldConstantExpression(CE, DL); if (New && New != CE) - return emitGlobalConstantImpl(New, AP); + return emitGlobalConstantImpl(DL, New, AP); } } if (const ConstantVector *V = dyn_cast<ConstantVector>(CV)) - return emitGlobalConstantVector(V, AP); + return emitGlobalConstantVector(DL, V, AP); // Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it // through the streamer with EmitValue. @@ -2230,11 +2332,10 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP, } /// EmitGlobalConstant - Print a general LLVM constant to the .s file. -void AsmPrinter::EmitGlobalConstant(const Constant *CV) { - uint64_t Size = - TM.getDataLayout()->getTypeAllocSize(CV->getType()); +void AsmPrinter::EmitGlobalConstant(const DataLayout &DL, const Constant *CV) { + uint64_t Size = DL.getTypeAllocSize(CV->getType()); if (Size) - emitGlobalConstantImpl(CV, *this); + emitGlobalConstantImpl(DL, CV, *this); else if (MAI->hasSubsectionsViaSymbols()) { // If the global has zero size, emit a single byte so that two labels don't // look like they are at the same location. @@ -2272,10 +2373,10 @@ MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const { /// GetCPISymbol - Return the symbol for the specified constant pool entry. MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const { - const DataLayout *DL = TM.getDataLayout(); - return OutContext.getOrCreateSymbol - (Twine(DL->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber()) - + "_" + Twine(CPID)); + const DataLayout &DL = getDataLayout(); + return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "CPI" + Twine(getFunctionNumber()) + "_" + + Twine(CPID)); } /// GetJTISymbol - Return the symbol for the specified jump table entry. @@ -2286,10 +2387,10 @@ MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const { /// GetJTSetSymbol - Return the symbol for the specified jump table .set /// FIXME: privatize to AsmPrinter.
MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const { - const DataLayout *DL = TM.getDataLayout(); - return OutContext.getOrCreateSymbol - (Twine(DL->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" + - Twine(UID) + "_set_" + Twine(MBBID)); + const DataLayout &DL = getDataLayout(); + return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + Twine(getFunctionNumber()) + "_" + + Twine(UID) + "_set_" + Twine(MBBID)); } MCSymbol *AsmPrinter::getSymbolWithGlobalValueBase(const GlobalValue *GV, @@ -2301,7 +2402,7 @@ MCSymbol *AsmPrinter::getSymbolWithGlobalValueBase(const GlobalValue *GV, /// Return the MCSymbol for the specified ExternalSymbol. MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const { SmallString<60> NameStr; - Mangler::getNameWithPrefix(NameStr, Sym, *TM.getDataLayout()); + Mangler::getNameWithPrefix(NameStr, Sym, getDataLayout()); return OutContext.getOrCreateSymbol(NameStr); } @@ -2376,6 +2477,14 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB, /// MachineBasicBlock, an alignment (if present) and a comment describing /// it if appropriate. void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { + // End the previous funclet and start a new one. + if (MBB.isEHFuncletEntry()) { + for (const HandlerInfo &HI : Handlers) { + HI.Handler->endFunclet(); + HI.Handler->beginFunclet(MBB); + } + } + // Emit an alignment directive for this block, if needed. if (unsigned Align = MBB.getAlignment()) EmitAlignment(Align); @@ -2389,20 +2498,28 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { if (isVerbose()) OutStreamer->AddComment("Block address taken"); - for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB)) - OutStreamer->EmitLabel(Sym); + // MBBs can have their address taken as part of CodeGen without having + // their corresponding BB's address taken in IR + if (BB->hasAddressTaken()) + for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB)) + OutStreamer->EmitLabel(Sym); } // Print some verbose block comments. if (isVerbose()) { - if (const BasicBlock *BB = MBB.getBasicBlock()) - if (BB->hasName()) - OutStreamer->AddComment("%" + BB->getName()); + if (const BasicBlock *BB = MBB.getBasicBlock()) { + if (BB->hasName()) { + BB->printAsOperand(OutStreamer->GetCommentOS(), + /*PrintType=*/false, BB->getModule()); + OutStreamer->GetCommentOS() << '\n'; + } + } emitBasicBlockLoopComments(MBB, LI, *this); } // Print the main label for the block. - if (MBB.pred_empty() || isBlockOnlyReachableByFallthrough(&MBB)) { + if (MBB.pred_empty() || + (isBlockOnlyReachableByFallthrough(&MBB) && !MBB.isEHFuncletEntry())) { if (isVerbose()) { // NOTE: Want this comment at start of line, don't emit with AddComment. OutStreamer->emitRawComment(" BB#" + Twine(MBB.getNumber()) + ":", false); @@ -2440,7 +2557,7 @@ bool AsmPrinter:: isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { // If this is a landing pad, it isn't a fall through. If it has no preds, // then nothing falls through to it. - if (MBB->isLandingPad() || MBB->pred_empty()) + if (MBB->isEHPad() || MBB->pred_empty()) return false; // If there isn't exactly one predecessor, it can't be a fall through. 
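The symbol helpers changed above all build names the same way: the DataLayout's private-global prefix, a fixed tag, then function-local counters, so constant-pool entry 1 of function 3 becomes .LCPI3_1 with the usual ELF prefix (or LCPI3_1 under Mach-O's plain L). A sketch of the construction (prefix values illustrative):

    #include <string>

    // Mirrors GetCPISymbol's name construction: <prefix>CPI<fn>_<id>.
    static std::string cpiSymbolName(const std::string &PrivatePrefix,
                                     unsigned FnNumber, unsigned CPID) {
      return PrivatePrefix + "CPI" + std::to_string(FnNumber) + "_" +
             std::to_string(CPID);
    }
    // cpiSymbolName(".L", 3, 1) == ".LCPI3_1" on a typical ELF target.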
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index ad180b6..504c5d2 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -47,7 +47,7 @@ void AsmPrinter::EmitSLEB128(int64_t Value, const char *Desc) const { OutStreamer->EmitSLEB128IntValue(Value); } -/// EmitULEB128 - emit the specified signed leb128 value. +/// EmitULEB128 - emit the specified unsigned leb128 value. void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc, unsigned PadTo) const { if (isVerbose() && Desc) @@ -56,18 +56,6 @@ void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc, OutStreamer->EmitULEB128IntValue(Value, PadTo); } -/// EmitCFAByte - Emit a .byte 42 directive for a DW_CFA_xxx value. -void AsmPrinter::EmitCFAByte(unsigned Val) const { - if (isVerbose()) { - if (Val >= dwarf::DW_CFA_offset && Val < dwarf::DW_CFA_offset + 64) - OutStreamer->AddComment("DW_CFA_offset + Reg (" + - Twine(Val - dwarf::DW_CFA_offset) + ")"); - else - OutStreamer->AddComment(dwarf::CallFrameString(Val)); - } - OutStreamer->EmitIntValue(Val, 1); -} - static const char *DecodeDWARFEncoding(unsigned Encoding) { switch (Encoding) { case dwarf::DW_EH_PE_absptr: @@ -134,7 +122,7 @@ unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const { default: llvm_unreachable("Invalid encoded value."); case dwarf::DW_EH_PE_absptr: - return TM.getDataLayout()->getPointerSize(); + return MF->getDataLayout().getPointerSize(); case dwarf::DW_EH_PE_udata2: return 2; case dwarf::DW_EH_PE_udata4: @@ -228,6 +216,9 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { case MCCFIInstruction::OpDefCfaOffset: OutStreamer->EmitCFIDefCfaOffset(Inst.getOffset()); break; + case MCCFIInstruction::OpAdjustCfaOffset: + OutStreamer->EmitCFIAdjustCfaOffset(Inst.getOffset()); + break; case MCCFIInstruction::OpDefCfa: OutStreamer->EmitCFIDefCfa(Inst.getRegister(), Inst.getOffset()); break; @@ -246,6 +237,12 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { case MCCFIInstruction::OpSameValue: OutStreamer->EmitCFISameValue(Inst.getRegister()); break; + case MCCFIInstruction::OpGnuArgsSize: + OutStreamer->EmitCFIGnuArgsSize(Inst.getOffset()); + break; + case MCCFIInstruction::OpEscape: + OutStreamer->EmitCFIEscape(Inst.getValues()); + break; } } @@ -284,17 +281,10 @@ void AsmPrinter::emitDwarfDIE(const DIE &Die) const { } } -void -AsmPrinter::emitDwarfAbbrevs(const std::vector<DIEAbbrev *>& Abbrevs) const { - // For each abbrevation. - for (const DIEAbbrev *Abbrev : Abbrevs) { - // Emit the abbrevations code (base 1 index.) - EmitULEB128(Abbrev->getNumber(), "Abbreviation Code"); - - // Emit the abbreviations data. - Abbrev->Emit(this); - } +void AsmPrinter::emitDwarfAbbrev(const DIEAbbrev &Abbrev) const { + // Emit the abbreviations code (base 1 index.) + EmitULEB128(Abbrev.getNumber(), "Abbreviation Code"); - // Mark end of abbreviations. - EmitULEB128(0, "EOM(3)"); + // Emit the abbreviations data. 
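// Abbreviation codes, like most variable-size integers in DWARF, are written
// with the EmitULEB128 helper this file wraps. A standalone sketch of the
// encoding itself (illustrative only, not LLVM's implementation):
#include <cstdint>
#include <vector>

static void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = Value & 0x7f; // take the low seven bits
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;              // continuation bit: more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
}
// The DWARF spec's worked example: 624485 encodes as 0xE5 0x8E 0x26.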
+ Abbrev.Emit(this); } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h index f1efe9d..e59961f 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h @@ -19,6 +19,7 @@ namespace llvm { +class MachineBasicBlock; class MachineFunction; class MachineInstr; class MCSymbol; @@ -50,6 +51,11 @@ public: /// beginFunction at all. virtual void endFunction(const MachineFunction *MF) = 0; + /// \brief Emit target-specific EH funclet machinery. + virtual void beginFunclet(const MachineBasicBlock &MBB, + MCSymbol *Sym = nullptr) {} + virtual void endFunclet() {} + /// \brief Process beginning of an instruction. virtual void beginInstruction(const MachineInstr *MI) = 0; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 793e629..4171657 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -127,19 +127,13 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, std::unique_ptr<MCAsmParser> Parser( createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI)); - // Create a temporary copy of the original STI because the parser may modify - // it. For example, when switching between arm and thumb mode. If the target - // needs to emit code to return to the original state it can do so in - // emitInlineAsmEnd(). - MCSubtargetInfo TmpSTI = STI; - // We create a new MCInstrInfo here since we might be at the module level // and not have a MachineFunction to initialize the TargetInstrInfo from and // we only need MCInstrInfo for asm parsing. We create one unconditionally // because it's not subtarget dependent. std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo()); std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser( - TmpSTI, *Parser, *MII, MCOptions)); + STI, *Parser, *MII, MCOptions)); if (!TAP) report_fatal_error("Inline asm not supported by this streamer because" " we don't have an asm parser for this target\n"); @@ -154,7 +148,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, // Don't implicitly switch to the text section before the asm. int Res = Parser->Run(/*NoInitialTextSection*/ true, /*NoFinalize*/ true); - emitInlineAsmEnd(STI, &TmpSTI); + emitInlineAsmEnd(STI, &TAP->getSTI()); if (Res && !HasDiagHandler) report_fatal_error("Error parsing inline asm\n"); } @@ -512,9 +506,9 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { /// for their own strange codes. 
void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS, const char *Code) const { - const DataLayout *DL = TM.getDataLayout(); if (!strcmp(Code, "private")) { - OS << DL->getPrivateGlobalPrefix(); + const DataLayout &DL = MF->getDataLayout(); + OS << DL.getPrivateGlobalPrefix(); } else if (!strcmp(Code, "comment")) { OS << MAI->getCommentString(); } else if (!strcmp(Code, "uid")) { diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h index 0cc829f..df1997b 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h @@ -24,16 +24,19 @@ namespace llvm { class ByteStreamer { - public: - virtual ~ByteStreamer() {} + protected: + ~ByteStreamer() = default; + ByteStreamer(const ByteStreamer&) = default; + ByteStreamer() = default; + public: // For now we're just handling the calls we need for dwarf emission/hashing. virtual void EmitInt8(uint8_t Byte, const Twine &Comment = "") = 0; virtual void EmitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0; virtual void EmitULEB128(uint64_t DWord, const Twine &Comment = "") = 0; }; -class APByteStreamer : public ByteStreamer { +class APByteStreamer final : public ByteStreamer { private: AsmPrinter &AP; @@ -53,7 +56,7 @@ public: } }; -class HashingByteStreamer : public ByteStreamer { +class HashingByteStreamer final : public ByteStreamer { private: DIEHash &Hash; public: @@ -69,7 +72,7 @@ class HashingByteStreamer : public ByteStreamer { } }; -class BufferByteStreamer : public ByteStreamer { +class BufferByteStreamer final : public ByteStreamer { private: SmallVectorImpl<char> &Buffer; SmallVectorImpl<std::string> &Comments; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index 46dbc76..7b0cdbd 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -32,6 +32,39 @@ using namespace llvm; //===----------------------------------------------------------------------===// +// EmittingAsmStreamer Implementation +//===----------------------------------------------------------------------===// +unsigned EmittingAsmStreamer::emitULEB128(uint64_t Value, const char *Desc, + unsigned PadTo) { + AP->EmitULEB128(Value, Desc, PadTo); + return 0; +} + +unsigned EmittingAsmStreamer::emitInt8(unsigned char Value) { + AP->EmitInt8(Value); + return 0; +} + +unsigned EmittingAsmStreamer::emitBytes(StringRef Data) { + AP->OutStreamer->EmitBytes(Data); + return 0; +} + +//===----------------------------------------------------------------------===// +// SizeReporterAsmStreamer Implementation +//===----------------------------------------------------------------------===// +unsigned SizeReporterAsmStreamer::emitULEB128(uint64_t Value, const char *Desc, + unsigned PadTo) { + return getULEB128Size(Value); +} + +unsigned SizeReporterAsmStreamer::emitInt8(unsigned char Value) { return 1; } + +unsigned SizeReporterAsmStreamer::emitBytes(StringRef Data) { + return Data.size(); +} + +//===----------------------------------------------------------------------===// // DIEAbbrevData Implementation //===----------------------------------------------------------------------===// @@ -86,7 +119,7 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { AP->EmitULEB128(0, "EOM(2)"); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEAbbrev::print(raw_ostream &O) { O << "Abbreviation @" << format("0x%lx", (long)(intptr_t)this) @@ -104,12 +137,13 @@ void 
DIEAbbrev::print(raw_ostream &O) { << '\n'; } } + +LLVM_DUMP_METHOD void DIEAbbrev::dump() { print(dbgs()); } -#endif DIEAbbrev DIE::generateAbbrev() const { DIEAbbrev Abbrev(Tag, hasChildren()); - for (const DIEValue &V : Values) + for (const DIEValue &V : values()) Abbrev.AddAttribute(V.getAttribute(), V.getForm()); return Abbrev; } @@ -144,36 +178,35 @@ DIEValue DIE::findAttribute(dwarf::Attribute Attribute) const { return DIEValue(); } -#ifndef NDEBUG -void DIE::print(raw_ostream &O, unsigned IndentCount) const { - const std::string Indent(IndentCount, ' '); - bool isBlock = getTag() == 0; - - if (!isBlock) { - O << Indent - << "Die: " - << format("0x%lx", (long)(intptr_t)this) - << ", Offset: " << Offset - << ", Size: " << Size << "\n"; - - O << Indent - << dwarf::TagString(getTag()) - << " " - << dwarf::ChildrenString(hasChildren()) << "\n"; - } else { - O << "Size: " << Size << "\n"; - } +LLVM_DUMP_METHOD +static void printValues(raw_ostream &O, const DIEValueList &Values, + StringRef Type, unsigned Size, unsigned IndentCount) { + O << Type << ": Size: " << Size << "\n"; - IndentCount += 2; unsigned I = 0; - for (const auto &V : Values) { + const std::string Indent(IndentCount, ' '); + for (const auto &V : Values.values()) { O << Indent; + O << "Blk[" << I++ << "]"; + O << " " << dwarf::FormEncodingString(V.getForm()) << " "; + V.print(O); + O << "\n"; + } +} - if (!isBlock) - O << dwarf::AttributeString(V.getAttribute()); - else - O << "Blk[" << I++ << "]"; +LLVM_DUMP_METHOD +void DIE::print(raw_ostream &O, unsigned IndentCount) const { + const std::string Indent(IndentCount, ' '); + O << Indent << "Die: " << format("0x%lx", (long)(intptr_t) this) + << ", Offset: " << Offset << ", Size: " << Size << "\n"; + + O << Indent << dwarf::TagString(getTag()) << " " + << dwarf::ChildrenString(hasChildren()) << "\n"; + IndentCount += 2; + for (const auto &V : values()) { + O << Indent; + O << dwarf::AttributeString(V.getAttribute()); O << " " << dwarf::FormEncodingString(V.getForm()) << " "; V.print(O); O << "\n"; @@ -183,13 +216,13 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const { for (const auto &Child : children()) Child.print(O, IndentCount + 4); - if (!isBlock) O << "\n"; + O << "\n"; } +LLVM_DUMP_METHOD void DIE::dump() { print(dbgs()); } -#endif void DIEValue::EmitValue(const AsmPrinter *AP) const { switch (Ty) { @@ -215,7 +248,7 @@ unsigned DIEValue::SizeOf(const AsmPrinter *AP) const { llvm_unreachable("Unknown DIE kind"); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEValue::print(raw_ostream &O) const { switch (Ty) { case isNone: @@ -228,10 +261,10 @@ void DIEValue::print(raw_ostream &O) const { } } +LLVM_DUMP_METHOD void DIEValue::dump() const { print(dbgs()); } -#endif //===----------------------------------------------------------------------===// // DIEInteger Implementation @@ -264,7 +297,8 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return; case dwarf::DW_FORM_addr: - Size = Asm->getDataLayout().getPointerSize(); break; + Size = Asm->getPointerSize(); + break; case dwarf::DW_FORM_ref_addr: Size = SizeOf(Asm, dwarf::DW_FORM_ref_addr); break; @@ -294,21 +328,21 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_GNU_addr_index: return getULEB128Size(Integer); case dwarf::DW_FORM_udata: return getULEB128Size(Integer); case dwarf::DW_FORM_sdata: return 
getSLEB128Size(Integer); - case dwarf::DW_FORM_addr: return AP->getDataLayout().getPointerSize(); + case dwarf::DW_FORM_addr: + return AP->getPointerSize(); case dwarf::DW_FORM_ref_addr: if (AP->OutStreamer->getContext().getDwarfVersion() == 2) - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); return sizeof(int32_t); default: llvm_unreachable("DIE Value form not supported yet"); } } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEInteger::print(raw_ostream &O) const { O << "Int: " << (int64_t)Integer << " 0x"; O.write_hex(Integer); } -#endif //===----------------------------------------------------------------------===// // DIEExpr Implementation @@ -326,12 +360,11 @@ unsigned DIEExpr::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; } -#endif //===----------------------------------------------------------------------===// // DIELabel Implementation @@ -352,12 +385,11 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIELabel::print(raw_ostream &O) const { O << "Lbl: " << Label->getName(); } -#endif //===----------------------------------------------------------------------===// // DIEDelta Implementation @@ -375,14 +407,13 @@ unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { if (Form == dwarf::DW_FORM_data4) return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEDelta::print(raw_ostream &O) const { O << "Del: " << LabelHi->getName() << "-" << LabelLo->getName(); } -#endif //===----------------------------------------------------------------------===// // DIEString Implementation @@ -431,11 +462,10 @@ unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return DIEInteger(S.getOffset()).SizeOf(AP, Form); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEString::print(raw_ostream &O) const { O << "String: " << S.getString(); } -#endif //===----------------------------------------------------------------------===// // DIEEntry Implementation @@ -472,15 +502,14 @@ unsigned DIEEntry::getRefAddrSize(const AsmPrinter *AP) { const DwarfDebug *DD = AP->getDwarfDebug(); assert(DD && "Expected Dwarf Debug info to be available"); if (DD->getDwarfVersion() == 2) - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); return sizeof(int32_t); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEEntry::print(raw_ostream &O) const { O << format("Die: 0x%lx", (long)(intptr_t)&Entry); } -#endif //===----------------------------------------------------------------------===// // DIETypeSignature Implementation @@ -491,11 +520,10 @@ void DIETypeSignature::EmitValue(const AsmPrinter *Asm, Asm->OutStreamer->EmitIntValue(Unit->getTypeSignature(), 8); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIETypeSignature::print(raw_ostream &O) const { O << format("Type Unit: 0x%lx", 
Unit->getTypeSignature()); } -#endif //===----------------------------------------------------------------------===// // DIELoc Implementation @@ -505,7 +533,7 @@ void DIETypeSignature::print(raw_ostream &O) const { /// unsigned DIELoc::ComputeSize(const AsmPrinter *AP) const { if (!Size) { - for (const auto &V : Values) + for (const auto &V : values()) Size += V.SizeOf(AP); } @@ -525,7 +553,7 @@ void DIELoc::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { Asm->EmitULEB128(Size); break; } - for (const auto &V : Values) + for (const auto &V : values()) V.EmitValue(Asm); } @@ -543,12 +571,10 @@ unsigned DIELoc::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { } } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIELoc::print(raw_ostream &O) const { - O << "ExprLoc: "; - DIE::print(O, 5); + printValues(O, *this, "ExprLoc", Size, 5); } -#endif //===----------------------------------------------------------------------===// // DIEBlock Implementation @@ -558,7 +584,7 @@ void DIELoc::print(raw_ostream &O) const { /// unsigned DIEBlock::ComputeSize(const AsmPrinter *AP) const { if (!Size) { - for (const auto &V : Values) + for (const auto &V : values()) Size += V.SizeOf(AP); } @@ -576,7 +602,7 @@ void DIEBlock::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { case dwarf::DW_FORM_block: Asm->EmitULEB128(Size); break; } - for (const auto &V : Values) + for (const auto &V : values()) V.EmitValue(Asm); } @@ -592,12 +618,10 @@ unsigned DIEBlock::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { } } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIEBlock::print(raw_ostream &O) const { - O << "Blk: "; - DIE::print(O, 5); + printValues(O, *this, "Blk", Size, 5); } -#endif //===----------------------------------------------------------------------===// // DIELocList Implementation @@ -608,7 +632,7 @@ unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { return 4; if (Form == dwarf::DW_FORM_sec_offset) return 4; - return AP->getDataLayout().getPointerSize(); + return AP->getPointerSize(); } /// EmitValue - Emit label value. @@ -619,6 +643,5 @@ void DIELocList::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { AP->emitDwarfSymbolReference(Label, /*ForceOffset*/ DD->useSplitDwarf()); } -#ifndef NDEBUG +LLVM_DUMP_METHOD void DIELocList::print(raw_ostream &O) const { O << "LocList: " << Index; } -#endif diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp index 5e60156..0201065 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -470,38 +470,6 @@ void DIEHash::computeHash(const DIE &Die) { } /// This is based on the type signature computation given in section 7.27 of the -/// DWARF4 standard. It is the md5 hash of a flattened description of the DIE -/// with the exception that we are hashing only the context and the name of the -/// type. -uint64_t DIEHash::computeDIEODRSignature(const DIE &Die) { - - // Add the contexts to the hash. We won't be computing the ODR hash for - // function local types so it's safe to use the generic context hashing - // algorithm here. - // FIXME: If we figure out how to account for linkage in some way we could - // actually do this with a slight modification to the parent hash algorithm. - if (const DIE *Parent = Die.getParent()) - addParentContext(*Parent); - - // Add the current DIE information. - - // Add the DWARF tag of the DIE. - addULEB128(Die.getTag()); - - // Add the name of the type to the hash. 
- addString(getDIEStringAttr(Die, dwarf::DW_AT_name)); - - // Now get the result. - MD5::MD5Result Result; - Hash.final(Result); - - // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. - return support::endian::read64le(Result + 8); -} - -/// This is based on the type signature computation given in section 7.27 of the /// DWARF4 standard. It is an md5 hash of the flattened description of the DIE /// with the inclusion of the full CU and all top level CU entities. // TODO: Initialize the type chain at 0 instead of 1 for CU signatures. diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h index 833ca02..44f0ce8 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h @@ -84,9 +84,6 @@ class DIEHash { public: DIEHash(AsmPrinter *A = nullptr) : AP(A) {} - /// \brief Computes the ODR signature. - uint64_t computeDIEODRSignature(const DIE &Die); - /// \brief Computes the CU signature. uint64_t computeCUSignature(const DIE &Die); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h index afffa83..bbe5324 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H #define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H + +#include "DebugLocStream.h" #include "llvm/ADT/SmallString.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" @@ -17,7 +19,6 @@ namespace llvm { class AsmPrinter; -class DebugLocStream; /// \brief This struct describes location entries emitted in the .debug_loc /// section. diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp index f8cdde2..4ad3e18 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp @@ -41,7 +41,7 @@ void DwarfAccelTable::AddName(DwarfStringPoolEntryRef Name, const DIE *die, DIEs.Values.push_back(new (Allocator) HashDataContents(die, Flags)); } -void DwarfAccelTable::ComputeBucketCount(void) { +void DwarfAccelTable::ComputeBucketCount() { // First get the number of unique hashes. 
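// Bucket sizing for the accelerator table starts from the number of *unique*
// hash values. A rough standalone sketch of that first step, assuming the
// same sort-then-dedupe approach (not the exact LLVM code):
#include <algorithm>
#include <cstdint>
#include <vector>

static size_t countUniqueHashes(std::vector<uint32_t> Hashes) {
  std::sort(Hashes.begin(), Hashes.end());
  // std::unique compacts adjacent duplicates; the distance to the new end is
  // the unique count.
  return static_cast<size_t>(
      std::unique(Hashes.begin(), Hashes.end()) - Hashes.begin());
}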
std::vector<uint32_t> uniques(Data.size()); for (size_t i = 0, e = Data.size(); i < e; ++i) diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 2c212c7..6665c16 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -78,12 +78,11 @@ void DwarfCFIException::endModule() { return; // Emit references to all used personality functions - const std::vector<const Function*> &Personalities = MMI->getPersonalities(); - for (size_t i = 0, e = Personalities.size(); i != e; ++i) { - if (!Personalities[i]) + for (const Function *Personality : MMI->getPersonalities()) { + if (!Personality) continue; - MCSymbol *Sym = Asm->getSymbol(Personalities[i]); - TLOF.emitPersonalityValue(*Asm->OutStreamer, Asm->TM, Sym); + MCSymbol *Sym = Asm->getSymbol(Personality); + TLOF.emitPersonalityValue(*Asm->OutStreamer, Asm->getDataLayout(), Sym); } } @@ -108,7 +107,6 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) { const Function *Per = nullptr; if (F->hasPersonalityFn()) Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts()); - assert(!MMI->getPersonality() || Per == MMI->getPersonality()); // Emit a personality function even when there are no landing pads bool forceEmitPersonality = diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index fc54a29..725063a 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -151,28 +151,33 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( DIELoc *Loc = new (DIEValueAllocator) DIELoc; const MCSymbol *Sym = Asm->getSymbol(Global); if (Global->isThreadLocal()) { - // FIXME: Make this work with -gsplit-dwarf. - unsigned PointerSize = Asm->getDataLayout().getPointerSize(); - assert((PointerSize == 4 || PointerSize == 8) && - "Add support for other sizes if necessary"); - // Based on GCC's support for TLS: - if (!DD->useSplitDwarf()) { - // 1) Start with a constNu of the appropriate pointer size - addUInt(*Loc, dwarf::DW_FORM_data1, - PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u); - // 2) containing the (relocated) offset of the TLS variable - // within the module's TLS block. - addExpr(*Loc, dwarf::DW_FORM_udata, - Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + if (Asm->TM.Options.EmulatedTLS) { + // TODO: add debug info for emulated thread local mode. } else { - addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); - addUInt(*Loc, dwarf::DW_FORM_udata, - DD->getAddressPool().getIndex(Sym, /* TLS */ true)); + // FIXME: Make this work with -gsplit-dwarf. + unsigned PointerSize = Asm->getDataLayout().getPointerSize(); + assert((PointerSize == 4 || PointerSize == 8) && + "Add support for other sizes if necessary"); + // Based on GCC's support for TLS: + if (!DD->useSplitDwarf()) { + // 1) Start with a constNu of the appropriate pointer size + addUInt(*Loc, dwarf::DW_FORM_data1, PointerSize == 4 + ? dwarf::DW_OP_const4u + : dwarf::DW_OP_const8u); + // 2) containing the (relocated) offset of the TLS variable + // within the module's TLS block. 
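// Steps 1) and 2) above, plus the TLS-lookup opcode added below as step 3),
// assemble a two-operation location expression. Assuming 4-byte pointers, a
// little-endian target, and the standard (non-GNU) opcode, the raw expression
// bytes would be:
#include <cstdint>
#include <vector>

static std::vector<uint8_t> tlsLocationExpr(uint32_t OffsetInTLSBlock) {
  std::vector<uint8_t> Expr;
  Expr.push_back(0x0c);                       // DW_OP_const4u
  for (int i = 0; i < 4; ++i)                 // 4-byte little-endian operand
    Expr.push_back((OffsetInTLSBlock >> (8 * i)) & 0xff);
  Expr.push_back(0x9b);                       // DW_OP_form_tls_address
  return Expr; // e.g. offset 16 gives: 0c 10 00 00 00 9b
}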
+ addExpr(*Loc, dwarf::DW_FORM_udata, + Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym)); + } else { + addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index); + addUInt(*Loc, dwarf::DW_FORM_udata, + DD->getAddressPool().getIndex(Sym, /* TLS */ true)); + } + // 3) followed by an OP to make the debugger do a TLS lookup. + addUInt(*Loc, dwarf::DW_FORM_data1, + DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address + : dwarf::DW_OP_form_tls_address); } - // 3) followed by an OP to make the debugger do a TLS lookup. - addUInt(*Loc, dwarf::DW_FORM_data1, - DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address - : dwarf::DW_OP_form_tls_address); } else { DD->addArangeLabel(SymbolCU(this, Sym)); addOpAddress(*Loc, Sym); @@ -338,9 +343,9 @@ void DwarfCompileUnit::constructScopeDIE( // Skip imported directives in gmlt-like data. if (!includeMinimalInlineScopes()) { // There is no need to emit empty lexical block DIE. - for (const auto &E : DD->findImportedEntitiesForScope(DS)) + for (const auto *IE : ImportedEntities[DS]) Children.push_back( - constructImportedEntityDIE(cast<DIImportedEntity>(E.second))); + constructImportedEntityDIE(cast<DIImportedEntity>(IE))); } // If there are only other scopes as children, put them directly in the @@ -435,6 +440,9 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) { addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None, getOrCreateSourceID(IA->getFilename(), IA->getDirectory())); addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, IA->getLine()); + if (IA->getDiscriminator()) + addUInt(*ScopeDIE, dwarf::DW_AT_GNU_discriminator, None, + IA->getDiscriminator()); // Add name to the name table, we do this here because we're guaranteed // to have concrete versions of our DW_TAG_inlined_subprogram nodes. @@ -517,8 +525,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, unsigned FrameReg = 0; const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering(); int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg); - assert(Expr != DV.getExpression().end() && - "Wrong number of expressions"); + assert(Expr != DV.getExpression().end() && "Wrong number of expressions"); DwarfExpr.AddMachineRegIndirect(FrameReg, Offset); DwarfExpr.AddExpression((*Expr)->expr_op_begin(), (*Expr)->expr_op_end()); ++Expr; @@ -597,8 +604,8 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, return ObjectPointer; } -void -DwarfCompileUnit::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { +void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( + LexicalScope *Scope) { DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()]; if (AbsDef) return; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 509c943..2e28467 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -39,6 +39,12 @@ class DwarfCompileUnit : public DwarfUnit { /// The start of the unit within its section. MCSymbol *LabelBegin; + typedef llvm::SmallVector<const MDNode *, 8> ImportedEntityList; + typedef llvm::DenseMap<const MDNode *, ImportedEntityList> + ImportedEntityMap; + + ImportedEntityMap ImportedEntities; + /// GlobalNames - A map of globally visible named entities for this unit. 
StringMap<const DIE *> GlobalNames; @@ -98,6 +104,10 @@ public: unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override; + void addImportedEntity(const DIImportedEntity* IE) { + ImportedEntities[IE->getScope()].push_back(IE); + } + /// addRange - Add an address range to the list of ranges for this unit. void addRange(RangeSpan Range); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 7d03a39..a4fb07e 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -104,6 +105,14 @@ DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden, clEnumVal(Disable, "Disabled"), clEnumValEnd), cl::init(Default)); +static cl::opt<DefaultOnOff> +DwarfLinkageNames("dwarf-linkage-names", cl::Hidden, + cl::desc("Emit DWARF linkage-name attributes."), + cl::values(clEnumVal(Default, "Default for platform"), + clEnumVal(Enable, "Enabled"), + clEnumVal(Disable, "Disabled"), clEnumValEnd), + cl::init(Default)); + static const char *const DWARFGroupName = "DWARF Emission"; static const char *const DbgTimerName = "DWARF Debug Writer"; @@ -176,9 +185,9 @@ const DIType *DbgVariable::getType() const { if (tag == dwarf::DW_TAG_pointer_type) subType = resolve(cast<DIDerivedType>(Ty)->getBaseType()); - auto Elements = cast<DICompositeTypeBase>(subType)->getElements(); + auto Elements = cast<DICompositeType>(subType)->getElements(); for (unsigned i = 0, N = Elements.size(); i < N; ++i) { - auto *DT = cast<DIDerivedTypeBase>(Elements[i]); + auto *DT = cast<DIDerivedType>(Elements[i]); if (getName() == DT->getName()) return resolve(DT->getBaseType()); } @@ -194,45 +203,67 @@ static LLVM_CONSTEXPR DwarfAccelTable::Atom TypeAtoms[] = { DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) : Asm(A), MMI(Asm->MMI), DebugLocs(A->OutStreamer->isVerboseAsm()), PrevLabel(nullptr), InfoHolder(A, "info_string", DIEValueAllocator), - UsedNonDefaultText(false), SkeletonHolder(A, "skel_string", DIEValueAllocator), IsDarwin(Triple(A->getTargetTriple()).isOSDarwin()), - IsPS4(Triple(A->getTargetTriple()).isPS4()), AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)), AccelObjC(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)), AccelNamespace(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)), - AccelTypes(TypeAtoms) { + AccelTypes(TypeAtoms), DebuggerTuning(DebuggerKind::Default) { CurFn = nullptr; CurMI = nullptr; + Triple TT(Asm->getTargetTriple()); + + // Make sure we know our "debugger tuning." The target option takes + // precedence; fall back to triple-based defaults. + if (Asm->TM.Options.DebuggerTuning != DebuggerKind::Default) + DebuggerTuning = Asm->TM.Options.DebuggerTuning; + else if (IsDarwin) + DebuggerTuning = DebuggerKind::LLDB; + else if (TT.isPS4CPU()) + DebuggerTuning = DebuggerKind::SCE; + else + DebuggerTuning = DebuggerKind::GDB; - // Turn on accelerator tables for Darwin by default, pubnames by - // default for non-Darwin/PS4, and handle split dwarf. + // Turn on accelerator tables for LLDB by default. 
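// The tuning logic above reduces to "an explicit option wins, otherwise infer
// from the target triple"; the per-feature defaults that follow (accelerator
// tables, split DWARF, pubsections, linkage names) then key off the tuneFor*()
// predicates. A standalone sketch of the precedence (enum values illustrative):
enum class Debugger { Default, GDB, LLDB, SCE };

static Debugger pickTuning(Debugger Requested, bool IsDarwin, bool IsPS4) {
  if (Requested != Debugger::Default)
    return Requested;        // explicit request from TargetOptions wins
  if (IsDarwin)
    return Debugger::LLDB;   // Darwin defaults to LLDB
  if (IsPS4)
    return Debugger::SCE;    // PS4 defaults to the SCE debugger
  return Debugger::GDB;      // everyone else defaults to GDB
}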
if (DwarfAccelTables == Default) - HasDwarfAccelTables = IsDarwin; + HasDwarfAccelTables = tuneForLLDB(); else HasDwarfAccelTables = DwarfAccelTables == Enable; + // Handle split DWARF. Off by default for now. if (SplitDwarf == Default) HasSplitDwarf = false; else HasSplitDwarf = SplitDwarf == Enable; + // Pubnames/pubtypes on by default for GDB. if (DwarfPubSections == Default) - HasDwarfPubSections = !IsDarwin && !IsPS4; + HasDwarfPubSections = tuneForGDB(); else HasDwarfPubSections = DwarfPubSections == Enable; + // SCE does not use linkage names. + if (DwarfLinkageNames == Default) + UseLinkageNames = !tuneForSCE(); + else + UseLinkageNames = DwarfLinkageNames == Enable; + unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion; DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber : MMI->getModule()->getDwarfVersion(); + // Use dwarf 4 by default if nothing is requested. + DwarfVersion = DwarfVersion ? DwarfVersion : dwarf::DWARF_VERSION; - // Darwin and PS4 use the standard TLS opcode (defined in DWARF 3). - // Everybody else uses GNU's. - UseGNUTLSOpcode = !(IsDarwin || IsPS4) || DwarfVersion < 3; + // Work around a GDB bug. GDB doesn't support the standard opcode; + // SCE doesn't support GNU's; LLDB prefers the standard opcode, which + // is defined as of DWARF 3. + // See GDB bug 11616 - DW_OP_form_tls_address is unimplemented + // https://sourceware.org/bugzilla/show_bug.cgi?id=11616 + UseGNUTLSOpcode = tuneForGDB() || DwarfVersion < 3; Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion); @@ -300,18 +331,6 @@ void DwarfDebug::addSubprogramNames(const DISubprogram *SP, DIE &Die) { } } -/// isSubprogramContext - Return true if Context is either a subprogram -/// or another context nested inside a subprogram. -bool DwarfDebug::isSubprogramContext(const MDNode *Context) { - if (!Context) - return false; - if (isa<DISubprogram>(Context)) - return true; - if (auto *T = dyn_cast<DIType>(Context)) - return isSubprogramContext(resolve(T->getScope())); - return false; -} - /// Check whether we should create a DIE for the given Scope, return true /// if we don't create a DIE (the corresponding DIE is null). bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) { @@ -416,6 +435,16 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { else NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection()); + if (DIUnit->getDWOId()) { + // This CU is either a clang module DWO or a skeleton CU. + NewCU.addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, + DIUnit->getDWOId()); + if (!DIUnit->getSplitDebugFilename().empty()) + // This is a prefabricated skeleton CU. + NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name, + DIUnit->getSplitDebugFilename()); + } + CUMap.insert(std::make_pair(DIUnit, &NewCU)); CUDieMap.insert(std::make_pair(&Die, &NewCU)); return NewCU; @@ -436,8 +465,6 @@ void DwarfDebug::beginModule() { const Module *M = MMI->getModule(); - FunctionDIs = makeSubprogramMap(*M); - NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (!CU_Nodes) return; @@ -449,12 +476,7 @@ void DwarfDebug::beginModule() { auto *CUNode = cast<DICompileUnit>(N); DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode); for (auto *IE : CUNode->getImportedEntities()) - ScopesWithImportedEntities.push_back(std::make_pair(IE->getScope(), IE)); - // Stable sort to preserve the order of appearance of imported entities. - // This is to avoid out-of-order processing of interdependent declarations - // within the same scope, e.g. 
{ namespace A = base; namespace B = A; } - std::stable_sort(ScopesWithImportedEntities.begin(), - ScopesWithImportedEntities.end(), less_first()); + CU.addImportedEntity(IE); for (auto *GV : CUNode->getGlobalVariables()) CU.getOrCreateGlobalVariableDIE(GV); for (auto *SP : CUNode->getSubprograms()) @@ -467,7 +489,10 @@ void DwarfDebug::beginModule() { for (auto *Ty : CUNode->getRetainedTypes()) { // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. - CU.getOrCreateTypeDIE(cast<DIType>(resolve(Ty->getRef()))); + DIType *RT = cast<DIType>(resolve(Ty->getRef())); + if (!RT->isExternalTypeRef()) + // There is no point in force-emitting a forward declaration. + CU.getOrCreateTypeDIE(RT); } // Emit imported_modules last so that the relevant context is already // available. @@ -536,6 +561,8 @@ void DwarfDebug::finalizeModuleInfo() { // Collect info for variables that were optimized out. collectDeadVariables(); + unsigned MacroOffset = 0; + std::unique_ptr<AsmStreamerBase> AS(new SizeReporterAsmStreamer(Asm)); // Handle anything that needs to be done on a per-unit basis after // all other generation. for (const auto &P : CUMap) { @@ -588,6 +615,15 @@ void DwarfDebug::finalizeModuleInfo() { U.setBaseAddress(TheCU.getRanges().front().getStart()); U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges()); } + + auto *CUNode = cast<DICompileUnit>(P.first); + if (CUNode->getMacros()) { + // Compile Unit has macros, emit "DW_AT_macro_info" attribute. + U.addUInt(U.getUnitDie(), dwarf::DW_AT_macro_info, + dwarf::DW_FORM_sec_offset, MacroOffset); + // Update macro section offset + MacroOffset += handleMacroNodes(AS.get(), CUNode->getMacros(), U); + } } // Compute DIE offsets and sizes. @@ -631,6 +667,9 @@ void DwarfDebug::endModule() { // Emit info into a debug ranges section. emitDebugRanges(); + // Emit info into a debug macinfo section. + emitDebugMacinfo(); + if (useSplitDwarf()) { emitDebugStrDWO(); emitDebugInfoDWO(); @@ -1061,12 +1100,8 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { for (const auto &MBB : *MF) for (const auto &MI : MBB) if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && - MI.getDebugLoc()) { - // Did the target forget to set the FrameSetup flag for CFI insns? - assert(!MI.isCFIInstruction() && - "First non-frame-setup instruction is a CFI instruction."); + MI.getDebugLoc()) return MI.getDebugLoc(); - } return DebugLoc(); } @@ -1079,8 +1114,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { if (!MMI->hasDebugInfo()) return; - auto DI = FunctionDIs.find(MF->getFunction()); - if (DI == FunctionDIs.end()) + auto DI = MF->getFunction()->getSubprogram(); + if (!DI) return; // Grab the lexical scopes for the function, if we don't have any of those @@ -1127,7 +1162,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { // The first mention of a function argument gets the CurrentFnBegin // label, so arguments are visible when breaking at function entry. 
const DILocalVariable *DIVar = Ranges.front().first->getDebugVariable(); - if (DIVar->getTag() == dwarf::DW_TAG_arg_variable && + if (DIVar->isParameter() && getDISubprogram(DIVar->getScope())->describes(MF->getFunction())) { LabelsBeforeInsn[Ranges.front().first] = Asm->getFunctionBegin(); if (Ranges.front().first->getDebugExpression()->isBitPiece()) { @@ -1171,7 +1206,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { "endFunction should be called with the same function as beginFunction"); if (!MMI->hasDebugInfo() || LScopes.empty() || - !FunctionDIs.count(MF->getFunction())) { + !MF->getFunction()->getSubprogram()) { // If we don't have a lexical scope for this function then there will // be a hole in the range information. Keep note of this by setting the // previously used section to nullptr. @@ -1812,6 +1847,70 @@ void DwarfDebug::emitDebugRanges() { } } +unsigned DwarfDebug::handleMacroNodes(AsmStreamerBase *AS, + DIMacroNodeArray Nodes, + DwarfCompileUnit &U) { + unsigned Size = 0; + for (auto *MN : Nodes) { + if (auto *M = dyn_cast<DIMacro>(MN)) + Size += emitMacro(AS, *M); + else if (auto *F = dyn_cast<DIMacroFile>(MN)) + Size += emitMacroFile(AS, *F, U); + else + llvm_unreachable("Unexpected DI type!"); + } + return Size; +} + +unsigned DwarfDebug::emitMacro(AsmStreamerBase *AS, DIMacro &M) { + int Size = 0; + Size += AS->emitULEB128(M.getMacinfoType()); + Size += AS->emitULEB128(M.getLine()); + StringRef Name = M.getName(); + StringRef Value = M.getValue(); + Size += AS->emitBytes(Name); + if (!Value.empty()) { + // There should be one space between macro name and macro value. + Size += AS->emitInt8(' '); + Size += AS->emitBytes(Value); + } + Size += AS->emitInt8('\0'); + return Size; +} + +unsigned DwarfDebug::emitMacroFile(AsmStreamerBase *AS, DIMacroFile &F, + DwarfCompileUnit &U) { + int Size = 0; + assert(F.getMacinfoType() == dwarf::DW_MACINFO_start_file); + Size += AS->emitULEB128(dwarf::DW_MACINFO_start_file); + Size += AS->emitULEB128(F.getLine()); + DIFile *File = F.getFile(); + unsigned FID = + U.getOrCreateSourceID(File->getFilename(), File->getDirectory()); + Size += AS->emitULEB128(FID); + Size += handleMacroNodes(AS, F.getElements(), U); + Size += AS->emitULEB128(dwarf::DW_MACINFO_end_file); + return Size; +} + +// Emit visible names into a debug macinfo section. +void DwarfDebug::emitDebugMacinfo() { + if (MCSection *Macinfo = Asm->getObjFileLowering().getDwarfMacinfoSection()) { + // Start the dwarf macinfo section. + Asm->OutStreamer->SwitchSection(Macinfo); + } + std::unique_ptr<AsmStreamerBase> AS(new EmittingAsmStreamer(Asm)); + for (const auto &P : CUMap) { + auto &TheCU = *P.second; + auto *SkCU = TheCU.getSkeleton(); + DwarfCompileUnit &U = SkCU ? *SkCU : TheCU; + auto *CUNode = cast<DICompileUnit>(P.first); + handleMacroNodes(AS.get(), CUNode->getMacros(), U); + } + Asm->OutStreamer->AddComment("End Of Macro List Mark"); + Asm->EmitInt8(0); +} + // DWARF5 Experimental Separate Dwarf emitters. void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, @@ -1863,7 +1962,7 @@ void DwarfDebug::emitDebugLineDWO() { assert(useSplitDwarf() && "No split dwarf?"); Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfLineDWOSection()); - SplitTypeUnitFileTable.Emit(*Asm->OutStreamer); + SplitTypeUnitFileTable.Emit(*Asm->OutStreamer, MCDwarfLineTableParams()); } // Emit the .debug_str.dwo section for separated dwarf. 
This contains the @@ -1884,7 +1983,7 @@ MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) { return &SplitTypeUnitFileTable; } -static uint64_t makeTypeSignature(StringRef Identifier) { +uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) { MD5 Hash; Hash.update(Identifier); // ... take the least significant 8 bytes and return those. Our MD5 diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 01f34c6..460c186 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -33,6 +33,7 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MachineLocation.h" #include "llvm/Support/Allocator.h" +#include "llvm/Target/TargetOptions.h" #include <memory> namespace llvm { @@ -49,24 +50,6 @@ class DwarfUnit; class MachineModuleInfo; //===----------------------------------------------------------------------===// -/// This class is used to record source line correspondence. -class SrcLineInfo { - unsigned Line; // Source line number. - unsigned Column; // Source column. - unsigned SourceID; // Source ID number. - MCSymbol *Label; // Label in code ID number. -public: - SrcLineInfo(unsigned L, unsigned C, unsigned S, MCSymbol *label) - : Line(L), Column(C), SourceID(S), Label(label) {} - - // Accessors - unsigned getLine() const { return Line; } - unsigned getColumn() const { return Column; } - unsigned getSourceID() const { return SourceID; } - MCSymbol *getLabel() const { return Label; } -}; - -//===----------------------------------------------------------------------===// /// This class is used to track local variable information. /// /// Variables can be created from allocas, in which case they're generated from @@ -127,14 +110,14 @@ public: // Accessors. const DILocalVariable *getVariable() const { return Var; } const DILocation *getInlinedAt() const { return IA; } - const ArrayRef<const DIExpression *> getExpression() const { return Expr; } + ArrayRef<const DIExpression *> getExpression() const { return Expr; } void setDIE(DIE &D) { TheDIE = &D; } DIE *getDIE() const { return TheDIE; } void setDebugLocListIndex(unsigned O) { DebugLocListIndex = O; } unsigned getDebugLocListIndex() const { return DebugLocListIndex; } StringRef getName() const { return Var->getName(); } const MachineInstr *getMInsn() const { return MInsn; } - const ArrayRef<int> getFrameIndex() const { return FrameIndex; } + ArrayRef<int> getFrameIndex() const { return FrameIndex; } void addMMIEntry(const DbgVariable &V) { assert(DebugLocListIndex == ~0U && !MInsn && "not an MMI entry"); @@ -156,7 +139,8 @@ public: // Translate tag to proper Dwarf tag. dwarf::Tag getTag() const { - if (Var->getTag() == dwarf::DW_TAG_arg_variable) + // FIXME: Why don't we just infer this tag and store it all along? + if (Var->isParameter()) return dwarf::DW_TAG_formal_parameter; return dwarf::DW_TAG_variable; @@ -282,11 +266,6 @@ class DwarfDebug : public AsmPrinterHandler { /// Holders for the various debug information flags that we might need to /// have exposed. See accessor functions below for description. - /// Holder for imported entities. - typedef SmallVector<std::pair<const MDNode *, const MDNode *>, 32> - ImportedEntityMap; - ImportedEntityMap ScopesWithImportedEntities; - /// Map from MDNodes for user-defined types to the type units that /// describe them. 
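// makeTypeSignature, now exposed on DwarfDebug, MD5-hashes the unique
// identifier and keeps only the least-significant 8 bytes of the digest;
// those signatures identify the type units tracked in the map below. A
// standalone sketch of the byte extraction (LLVM reads it with
// support::endian::read64le(Result + 8)):
#include <cstdint>

static uint64_t low64OfDigest(const uint8_t Digest[16]) {
  // The low 8 bytes of the little-endian digest are Digest[8..15], least
  // significant first; fold them into a host integer.
  uint64_t Sig = 0;
  for (int i = 7; i >= 0; --i)
    Sig = (Sig << 8) | Digest[8 + i];
  return Sig;
}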
DenseMap<const MDNode *, const DwarfTypeUnit *> DwarfTypeUnits; @@ -298,16 +277,12 @@ class DwarfDebug : public AsmPrinterHandler { /// Whether to emit the pubnames/pubtypes sections. bool HasDwarfPubSections; - /// Whether or not to use AT_ranges for compilation units. - bool HasCURanges; - - /// Whether we emitted a function into a section other than the - /// default text. - bool UsedNonDefaultText; - /// Whether to use the GNU TLS opcode (instead of the standard opcode). bool UseGNUTLSOpcode; + /// Whether to emit DW_AT_[MIPS_]linkage_name. + bool UseLinkageNames; + /// Version of dwarf we're emitting. unsigned DwarfVersion; @@ -338,7 +313,6 @@ class DwarfDebug : public AsmPrinterHandler { /// True iff there are multiple CUs in this module. bool SingleCU; bool IsDarwin; - bool IsPS4; AddressPool AddrPool; @@ -347,7 +321,8 @@ class DwarfDebug : public AsmPrinterHandler { DwarfAccelTable AccelNamespace; DwarfAccelTable AccelTypes; - DenseMap<const Function *, DISubprogram *> FunctionDIs; + // Identify a debugger for "tuning" the debug info. + DebuggerKind DebuggerTuning; MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &); @@ -372,12 +347,6 @@ class DwarfDebug : public AsmPrinterHandler { /// Construct a DIE for this abstract scope. void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); - /// Compute the size and offset of a DIE given an incoming Offset. - unsigned computeSizeAndOffset(DIE *Die, unsigned Offset); - - /// Compute the size and offset of all the DIEs. - void computeSizeAndOffsets(); - /// Collect info for variables that were optimized out. void collectDeadVariables(); @@ -431,20 +400,25 @@ class DwarfDebug : public AsmPrinterHandler { /// Emit visible names into a debug str section. void emitDebugStr(); - /// Emit visible names into a debug loc section. + /// Emit variable locations into a debug loc section. void emitDebugLoc(); - /// Emit visible names into a debug loc dwo section. + /// Emit variable locations into a debug loc dwo section. void emitDebugLocDWO(); - /// Emit visible names into a debug aranges section. + /// Emit address ranges into a debug aranges section. void emitDebugARanges(); - /// Emit visible names into a debug ranges section. + /// Emit address ranges into a debug ranges section. void emitDebugRanges(); - /// Emit inline info using custom format. - void emitDebugInlineInfo(); + /// Emit macros into a debug macinfo section. + void emitDebugMacinfo(); + unsigned emitMacro(AsmStreamerBase *AS, DIMacro &M); + unsigned emitMacroFile(AsmStreamerBase *AS, DIMacroFile &F, + DwarfCompileUnit &U); + unsigned handleMacroNodes(AsmStreamerBase *AS, DIMacroNodeArray Nodes, + DwarfCompileUnit &U); /// DWARF 5 Experimental Split Dwarf Emitters @@ -456,10 +430,6 @@ class DwarfDebug : public AsmPrinterHandler { /// section. DwarfCompileUnit &constructSkeletonCU(const DwarfCompileUnit &CU); - /// Construct the split debug info compile unit for the debug info - /// section. - DwarfTypeUnit &constructSkeletonTU(DwarfTypeUnit &TU); - /// Emit the debug info dwo section. void emitDebugInfoDWO(); @@ -544,6 +514,9 @@ public: /// Process end of an instruction. void endInstruction() override; + /// Perform an MD5 checksum of \p Identifier and return the lower 64 bits. + static uint64_t makeTypeSignature(StringRef Identifier); + /// Add a DIE to the set of types that we're going to pull into /// type units. 
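// Alongside the type-unit helpers, the macinfo emitters declared here
// serialize each macro as: macinfo type (ULEB128), line (ULEB128), then the
// name, one space, and the value when present, terminated by a single NUL.
// For "#define LIMIT 5" on line 3 the payload would be (an illustrative
// example, not captured output):
//
//   0x01  0x03  'L' 'I' 'M' 'I' 'T'  ' '  '5'  0x00
//    |     |
//    |     +-- line number 3 as ULEB128
//    +-- DW_MACINFO_define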
void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier, @@ -558,10 +531,22 @@ public: SymSize[Sym] = Size; } + /// Returns whether to emit DW_AT_[MIPS_]linkage_name. + bool useLinkageNames() const { return UseLinkageNames; } + /// Returns whether to use DW_OP_GNU_push_tls_address, instead of the /// standard DW_OP_form_tls_address opcode bool useGNUTLSOpcode() const { return UseGNUTLSOpcode; } + /// \defgroup DebuggerTuning Predicates to tune DWARF for a given debugger. + /// + /// Returns whether we are "tuning" for a given debugger. + /// @{ + bool tuneForGDB() const { return DebuggerTuning == DebuggerKind::GDB; } + bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; } + bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; } + /// @} + // Experimental DWARF5 features. /// Returns whether or not to emit tables that dwarf consumers can @@ -604,9 +589,6 @@ public: DwarfCompileUnit *lookupUnit(const DIE *CU) const { return CUDieMap.lookup(CU); } - /// isSubprogramContext - Return true if Context is either a subprogram - /// or another context nested inside a subprogram. - bool isSubprogramContext(const MDNode *Context); void addSubprogramNames(const DISubprogram *SP, DIE &Die); @@ -622,14 +604,6 @@ public: const MachineFunction *getCurrentFunction() const { return CurFn; } - iterator_range<ImportedEntityMap::const_iterator> - findImportedEntitiesForScope(const MDNode *Scope) const { - return make_range(std::equal_range( - ScopesWithImportedEntities.begin(), ScopesWithImportedEntities.end(), - std::pair<const MDNode *, const MDNode *>(Scope, nullptr), - less_first())); - } - /// A helper function to check whether the DIE for a given Scope is /// going to be null. bool isLexicalScopeDIENull(LexicalScope *Scope); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index a2799b8..7b5b831 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -211,12 +211,15 @@ bool DwarfExpression::AddMachineRegExpression(const DIExpression *Expr, return AddMachineRegPiece(MachineReg, SizeInBits, getOffsetOrZero(OffsetInBits, PieceOffsetInBits)); } - case dwarf::DW_OP_plus: { - // [DW_OP_reg,Offset,DW_OP_plus,DW_OP_deref] --> [DW_OP_breg,Offset]. + case dwarf::DW_OP_plus: + case dwarf::DW_OP_minus: { + // [DW_OP_reg,Offset,DW_OP_plus, DW_OP_deref] --> [DW_OP_breg, Offset]. + // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset]. auto N = I.getNext(); if (N != E && N->getOp() == dwarf::DW_OP_deref) { unsigned Offset = I->getArg(0); - ValidReg = AddMachineRegIndirect(MachineReg, Offset); + ValidReg = AddMachineRegIndirect( + MachineReg, I->getOp() == dwarf::DW_OP_plus ? Offset : -Offset); std::advance(I, 2); break; } else @@ -255,6 +258,12 @@ void DwarfExpression::AddExpression(DIExpression::expr_op_iterator I, EmitOp(dwarf::DW_OP_plus_uconst); EmitUnsigned(I->getArg(0)); break; + case dwarf::DW_OP_minus: + // There is no OP_minus_uconst. 
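// DWARF provides DW_OP_plus_uconst but no subtracting counterpart, so "minus
// a constant" lowers to the two-operation sequence emitted just below:
//
//   DW_OP_constu <c>   // push the constant operand
//   DW_OP_minus        // pop two values, push their difference
//
// e.g. "the value in R, minus 8" becomes: <ops for R>, DW_OP_constu 8,
// DW_OP_minus.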
+ EmitOp(dwarf::DW_OP_constu); + EmitUnsigned(I->getArg(0)); + EmitOp(dwarf::DW_OP_minus); + break; case dwarf::DW_OP_deref: EmitOp(dwarf::DW_OP_deref); break; diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 3555822..d75fea5 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -192,18 +192,19 @@ void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) { DIEInteger(1)); } -void DwarfUnit::addUInt(DIE &Die, dwarf::Attribute Attribute, +void DwarfUnit::addUInt(DIEValueList &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); Die.addValue(DIEValueAllocator, Attribute, *Form, DIEInteger(Integer)); } -void DwarfUnit::addUInt(DIE &Block, dwarf::Form Form, uint64_t Integer) { +void DwarfUnit::addUInt(DIEValueList &Block, dwarf::Form Form, + uint64_t Integer) { addUInt(Block, (dwarf::Attribute)0, Form, Integer); } -void DwarfUnit::addSInt(DIE &Die, dwarf::Attribute Attribute, +void DwarfUnit::addSInt(DIEValueList &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, int64_t Integer) { if (!Form) Form = DIEInteger::BestForm(true, Integer); @@ -222,9 +223,10 @@ void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute, DIEString(DU->getStringPool().getEntry(*Asm, String))); } -DIE::value_iterator DwarfUnit::addLabel(DIE &Die, dwarf::Attribute Attribute, - dwarf::Form Form, - const MCSymbol *Label) { +DIEValueList::value_iterator DwarfUnit::addLabel(DIEValueList &Die, + dwarf::Attribute Attribute, + dwarf::Form Form, + const MCSymbol *Label) { return Die.addValue(DIEValueAllocator, Attribute, Form, DIELabel(Label)); } @@ -277,6 +279,13 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type) { dwarf::DW_FORM_ref_sig8, DIETypeSignature(Type)); } +void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, + StringRef Identifier) { + uint64_t Signature = DD->makeTypeSignature(Identifier); + Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8, + DIEInteger(Signature)); +} + void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry) { const DIE *DieCU = Die.getUnitOrNull(); @@ -292,8 +301,6 @@ void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, } DIE &DwarfUnit::createAndAddDIE(unsigned Tag, DIE &Parent, const DINode *N) { - assert(Tag != dwarf::DW_TAG_auto_variable && - Tag != dwarf::DW_TAG_arg_variable); DIE &Die = Parent.addChild(DIE::get(DIEValueAllocator, (dwarf::Tag)Tag)); if (N) insertDIE(N, &Die); @@ -445,7 +452,7 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, // Find the __forwarding field and the variable field in the __Block_byref // struct. - DINodeArray Fields = cast<DICompositeTypeBase>(TmpTy)->getElements(); + DINodeArray Fields = cast<DICompositeType>(TmpTy)->getElements(); const DIDerivedType *varField = nullptr; const DIDerivedType *forwardingField = nullptr; @@ -506,34 +513,35 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die, /// Return true if type encoding is unsigned. static bool isUnsignedDIType(DwarfDebug *DD, const DIType *Ty) { - if (auto *DTy = dyn_cast<DIDerivedTypeBase>(Ty)) { + if (auto *CTy = dyn_cast<DICompositeType>(Ty)) { + // FIXME: Enums without a fixed underlying type have unknown signedness + // here, leading to incorrectly emitted constants. 
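// An assumed illustration of that FIXME, not observed output: a C enum such
// as "enum E { Big = 0x80000000 };" records no signedness in its metadata, so
// answering "signed" here can route Big through a signed DWARF form even
// though the source value is unsigned.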
+ if (CTy->getTag() == dwarf::DW_TAG_enumeration_type) + return false; + + // (Pieces of) aggregate types that get hacked apart by SROA may be + // represented by a constant. Encode them as unsigned bytes. + return true; + } + + if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { dwarf::Tag T = (dwarf::Tag)Ty->getTag(); // Encode pointer constants as unsigned bytes. This is used at least for // null pointer constant emission. - // (Pieces of) aggregate types that get hacked apart by SROA may also be - // represented by a constant. Encode them as unsigned bytes. // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed // here, but accept them for now due to a bug in SROA producing bogus // dbg.values. - if (T == dwarf::DW_TAG_array_type || - T == dwarf::DW_TAG_class_type || - T == dwarf::DW_TAG_pointer_type || + if (T == dwarf::DW_TAG_pointer_type || T == dwarf::DW_TAG_ptr_to_member_type || T == dwarf::DW_TAG_reference_type || - T == dwarf::DW_TAG_rvalue_reference_type || - T == dwarf::DW_TAG_structure_type || - T == dwarf::DW_TAG_union_type) + T == dwarf::DW_TAG_rvalue_reference_type) return true; assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type || T == dwarf::DW_TAG_volatile_type || - T == dwarf::DW_TAG_restrict_type || - T == dwarf::DW_TAG_enumeration_type); - if (DITypeRef Deriv = DTy->getBaseType()) - return isUnsignedDIType(DD, DD->resolve(Deriv)); - // FIXME: Enums without a fixed underlying type have unknown signedness - // here, leading to incorrectly emitted constants. - assert(DTy->getTag() == dwarf::DW_TAG_enumeration_type); - return false; + T == dwarf::DW_TAG_restrict_type); + DITypeRef Deriv = DTy->getBaseType(); + assert(Deriv && "Expected valid base type"); + return isUnsignedDIType(DD, DD->resolve(Deriv)); } auto *BTy = cast<DIBasicType>(Ty); @@ -659,7 +667,7 @@ void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) { } void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) { - if (!LinkageName.empty()) + if (!LinkageName.empty() && DD->useLinkageNames()) addString(Die, DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name : dwarf::DW_AT_MIPS_linkage_name, @@ -685,6 +693,8 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { return getOrCreateNameSpace(NS); if (auto *SP = dyn_cast<DISubprogram>(Context)) return getOrCreateSubprogramDIE(SP); + if (auto *M = dyn_cast<DIModule>(Context)) + return getOrCreateModule(M); return getDIE(Context); } @@ -700,7 +710,8 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) { constructTypeDIE(TyDIE, cast<DICompositeType>(Ty)); - updateAcceleratorTables(Context, Ty, TyDIE); + if (!Ty->isExternalTypeRef()) + updateAcceleratorTables(Context, Ty, TyDIE); return &TyDIE; } @@ -753,7 +764,7 @@ void DwarfUnit::updateAcceleratorTables(const DIScope *Context, const DIType *Ty, const DIE &TyDIE) { if (!Ty->getName().empty() && !Ty->isForwardDecl()) { bool IsImplementation = 0; - if (auto *CT = dyn_cast<DICompositeTypeBase>(Ty)) { + if (auto *CT = dyn_cast<DICompositeType>(Ty)) { // A runtime language of 0 actually means C/C++ and that any // non-negative value is some version of Objective-C/C++. IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete(); @@ -795,8 +806,7 @@ std::string DwarfUnit::getParentContextString(const DIScope *Context) const { // Reverse iterate over our list to go from the outermost construct to the // innermost. 
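// The rewritten loop below walks the collected scope chain outermost-first,
// appending "Name::" for each named scope and printing unnamed namespaces as
// "(anonymous namespace)". A rough standalone sketch of the string it builds:
#include <string>
#include <vector>

static std::string scopeQualifier(const std::vector<std::string> &OuterFirst) {
  // OuterFirst holds scope names ordered outermost to innermost, with unnamed
  // namespaces already replaced by "(anonymous namespace)".
  std::string CS;
  for (const std::string &Name : OuterFirst) {
    if (Name.empty())
      continue;        // skip scopes with no printable name
    CS += Name;
    CS += "::";        // e.g. yields "outer::inner::"
  }
  return CS;
}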
- for (auto I = Parents.rbegin(), E = Parents.rend(); I != E; ++I) { - const DIScope *Ctx = *I; + for (const DIScope *Ctx : make_range(Parents.rbegin(), Parents.rend())) { StringRef Name = Ctx->getName(); if (Name.empty() && isa<DINamespace>(Ctx)) Name = "(anonymous namespace)"; @@ -843,7 +853,9 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { // Add size if non-zero (derived types might be zero-sized.) if (Size && Tag != dwarf::DW_TAG_pointer_type - && Tag != dwarf::DW_TAG_ptr_to_member_type) + && Tag != dwarf::DW_TAG_ptr_to_member_type + && Tag != dwarf::DW_TAG_reference_type + && Tag != dwarf::DW_TAG_rvalue_reference_type) addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size); if (Tag == dwarf::DW_TAG_ptr_to_member_type) @@ -899,6 +911,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) { } void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { + if (CTy->isExternalTypeRef()) { + StringRef Identifier = CTy->getIdentifier(); + assert(!Identifier.empty() && "external type ref without identifier"); + addFlag(Buffer, dwarf::DW_AT_declaration); + return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier); + } + // Add name if not anonymous or intermediate type. StringRef Name = CTy->getName(); @@ -1134,6 +1153,14 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, "definition DIE was created in " "getOrCreateSubprogramDIE"); DeclLinkageName = SPDecl->getLinkageName(); + unsigned DeclID = + getOrCreateSourceID(SPDecl->getFilename(), SPDecl->getDirectory()); + unsigned DefID = getOrCreateSourceID(SP->getFilename(), SP->getDirectory()); + if (DeclID != DefID) + addUInt(SPDie, dwarf::DW_AT_decl_file, None, DefID); + + if (SP->getLine() != SPDecl->getLine()) + addUInt(SPDie, dwarf::DW_AT_decl_line, None, SP->getLine()); } // Add function template parameters. @@ -1180,11 +1207,10 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, Language == dwarf::DW_LANG_ObjC)) addFlag(SPDie, dwarf::DW_AT_prototyped); - const DISubroutineType *SPTy = SP->getType(); - assert(SPTy->getTag() == dwarf::DW_TAG_subroutine_type && - "the type of a subprogram should be a subroutine"); + DITypeRefArray Args; + if (const DISubroutineType *SPTy = SP->getType()) + Args = SPTy->getTypeArray(); - auto Args = SPTy->getTypeArray(); // Add a return type. If this is a type like a C/C++ void type we don't add a // return type. if (Args.size()) diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 44d9d22..82760bf 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -113,13 +113,6 @@ protected: DwarfUnit(unsigned UID, dwarf::Tag, const DICompileUnit *CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); - /// Add a string attribute data and value. - /// - /// This is guaranteed to be in the local string pool instead of indirected. - void addLocalString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); - - void addIndexedString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); - bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); public: @@ -162,9 +155,6 @@ public: virtual void addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) {} - /// Add a new name to the namespace accelerator table. - void addAccelNamespace(StringRef Name, const DIE &Die); - /// Returns the DIE map slot for the specified debug variable. 
/// 
/// We delegate the request to DwarfDebug when the MDNode can be part of the 
@@ -186,14 +176,14 @@ public: 
 void addFlag(DIE &Die, dwarf::Attribute Attribute); 
 /// Add an unsigned integer attribute data and value. 
- void addUInt(DIE &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, 
- uint64_t Integer); 
+ void addUInt(DIEValueList &Die, dwarf::Attribute Attribute, 
+ Optional<dwarf::Form> Form, uint64_t Integer); 
- void addUInt(DIE &Block, dwarf::Form Form, uint64_t Integer); 
+ void addUInt(DIEValueList &Block, dwarf::Form Form, uint64_t Integer); 
 /// Add a signed integer attribute data and value. 
- void addSInt(DIE &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, 
- int64_t Integer); 
+ void addSInt(DIEValueList &Die, dwarf::Attribute Attribute, 
+ Optional<dwarf::Form> Form, int64_t Integer); 
 void addSInt(DIELoc &Die, Optional<dwarf::Form> Form, int64_t Integer); 
@@ -206,8 +196,10 @@ public: 
 void addString(DIE &Die, dwarf::Attribute Attribute, StringRef Str); 
 /// Add a Dwarf label attribute data and value. 
- DIE::value_iterator addLabel(DIE &Die, dwarf::Attribute Attribute, 
- dwarf::Form Form, const MCSymbol *Label); 
+ DIEValueList::value_iterator addLabel(DIEValueList &Die, 
+ dwarf::Attribute Attribute, 
+ dwarf::Form Form, 
+ const MCSymbol *Label); 
 void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label); 
@@ -228,7 +220,11 @@ public: 
 /// Add a DIE attribute data and value. 
 void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry); 
+ /// Add a type's DW_AT_signature and set the declaration flag. 
 void addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type); 
+ /// Add an attribute containing the type signature for a unique identifier. 
+ void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute, 
+ StringRef Identifier); 
 /// Add block data. 
 void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block); 
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp 
index 49ef8d3..e24dcb1 100644 
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp 
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp 
@@ -662,9 +662,8 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) { 
 Entry = TypeInfos.size(); 
 } 
- for (std::vector<const GlobalValue *>::const_reverse_iterator 
- I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) { 
- const GlobalValue *GV = *I; 
+ for (const GlobalValue *GV : make_range(TypeInfos.rbegin(), 
+ TypeInfos.rend())) { 
 if (VerboseAsm) 
 Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--)); 
 Asm->EmitTTypeReference(GV, TTypeEncoding); 
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h 
index e42e082..c6a0e9d 100644 
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h 
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h 
@@ -76,10 +76,6 @@ protected: 
 SmallVectorImpl<ActionEntry> &Actions, 
 SmallVectorImpl<unsigned> &FirstActions); 
- /// Return `true' if this is a call to a function marked `nounwind'. Return 
- /// `false' otherwise. 
- bool callToNoUnwindFunction(const MachineInstr *MI); 
- 
 void computePadMap(const SmallVectorImpl<const LandingPadInfo *> &LandingPads, 
 RangeMapType &PadMap); 
@@ -131,6 +127,10 @@ public: 
 void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {} 
 void beginInstruction(const MachineInstr *MI) override {} 
 void endInstruction() override {} 
+ 
+ /// Return `true' if this is a call to a function marked `nounwind'. Return 
+ /// `false' otherwise. 
+ static bool callToNoUnwindFunction(const MachineInstr *MI); }; } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp index eb9e4c1..6a023b9 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp @@ -48,7 +48,7 @@ void llvm::linkErlangGCPrinter() {} void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { MCStreamer &OS = *AP.OutStreamer; - unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize(); + unsigned IntPtrSize = M.getDataLayout().getPointerSize(); // Put this in a custom .note section. OS.SwitchSection( diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index 2ceec61..c09ef6a 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -93,7 +93,7 @@ void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info, /// void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) { - unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize(); + unsigned IntPtrSize = M.getDataLayout().getPointerSize(); AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getTextSection()); EmitCamlGlobal(M, AP, "code_end"); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp index 6610ac7..1e2f55b 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp @@ -27,15 +27,15 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { auto *Scope = cast<DIScope>(S); StringRef Dir = Scope->getDirectory(), Filename = Scope->getFilename(); - char *&Result = DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; - if (Result) - return Result; + std::string &Filepath = + DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; + if (!Filepath.empty()) + return Filepath; // Clang emits directory and relative filename info into the IR, but CodeView // operates on full paths. We could change Clang to emit full paths too, but // that would increase the IR size and probably not needed for other users. // For now, just concatenate and canonicalize the path here. - std::string Filepath; if (Filename.find(':') == 1) Filepath = Filename; else @@ -74,8 +74,7 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos) Filepath.erase(Cursor, 1); - Result = strdup(Filepath.c_str()); - return StringRef(Result); + return Filepath; } void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, @@ -83,13 +82,24 @@ void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, const MDNode *Scope = DL.getScope(); if (!Scope) return; + unsigned LineNumber = DL.getLine(); + // Skip this line if it is longer than the maximum we can record. + if (LineNumber > COFF::CVL_MaxLineNumber) + return; + + unsigned ColumnNumber = DL.getCol(); + // Truncate the column number if it is longer than the maximum we can record. + if (ColumnNumber > COFF::CVL_MaxColumnNumber) + ColumnNumber = 0; + StringRef Filename = getFullFilepath(Scope); // Skip this instruction if it has the same file:line as the previous one. 
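A note on getFullFilepath above: it now caches std::string values directly in DirAndFilenameToFilepathMap instead of strdup'ed C strings that had to be freed in the destructor. Its textual canonicalization can be sketched in isolation; this simplified model joins and normalizes the path the same way but omits the "\.\" and "\..\" folding the real code also performs:

    #include <algorithm>
    #include <iostream>
    #include <string>

    // Join dir + file unless the file already carries a drive letter, turn
    // slashes into backslashes, then collapse doubled backslashes.
    std::string fullFilepath(const std::string &Dir, const std::string &File) {
      std::string Path = (File.find(':') == 1) ? File : Dir + "\\" + File;
      std::replace(Path.begin(), Path.end(), '/', '\\');
      size_t Cursor = 0;
      while ((Cursor = Path.find("\\\\", Cursor)) != std::string::npos)
        Path.erase(Cursor, 1);
      return Path;
    }

    int main() {
      std::cout << fullFilepath("c:/src//llvm", "lib\\File.cpp") << '\n';
      // -> c:\src\llvm\lib\File.cpp
    }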
assert(CurFn); if (!CurFn->Instrs.empty()) { const InstrInfoTy &LastInstr = InstrInfo[CurFn->Instrs.back()]; - if (LastInstr.Filename == Filename && LastInstr.LineNumber == DL.getLine()) + if (LastInstr.Filename == Filename && LastInstr.LineNumber == LineNumber && + LastInstr.ColumnNumber == ColumnNumber) return; } FileNameRegistry.add(Filename); @@ -97,7 +107,7 @@ void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, MCSymbol *MCL = Asm->MMI->getContext().createTempSymbol(); Asm->OutStreamer->EmitLabel(MCL); CurFn->Instrs.push_back(MCL); - InstrInfo[MCL] = InstrInfoTy(Filename, DL.getLine(), DL.getCol()); + InstrInfo[MCL] = InstrInfoTy(Filename, LineNumber, ColumnNumber); } WinCodeViewLineTables::WinCodeViewLineTables(AsmPrinter *AP) @@ -253,7 +263,7 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { } FilenameSegmentLengths[LastSegmentEnd] = FI.Instrs.size() - LastSegmentEnd; - // Emit a line table subsection, requred to do PC-to-file:line lookup. + // Emit a line table subsection, required to do PC-to-file:line lookup. Asm->OutStreamer->AddComment("Line table subsection for " + Twine(FuncName)); Asm->EmitInt32(COFF::DEBUG_LINE_TABLE_SUBSECTION); MCSymbol *LineTableBegin = Asm->MMI->getContext().createTempSymbol(), @@ -283,8 +293,9 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { ColSegEnd = ColSegI + FilenameSegmentLengths[LastSegmentStart]; ColSegI != ColSegEnd; ++ColSegI) { unsigned ColumnNumber = InstrInfo[FI.Instrs[ColSegI]].ColumnNumber; + assert(ColumnNumber <= COFF::CVL_MaxColumnNumber); Asm->EmitInt16(ColumnNumber); // Start column - Asm->EmitInt16(ColumnNumber); // End column + Asm->EmitInt16(0); // End column } Asm->OutStreamer->EmitLabel(FileSegmentEnd); }; @@ -321,7 +332,10 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { // The first PC with the given linenumber and the linenumber itself. EmitLabelDiff(*Asm->OutStreamer, Fn, Instr); - Asm->EmitInt32(InstrInfo[Instr].LineNumber); + uint32_t LineNumber = InstrInfo[Instr].LineNumber; + assert(LineNumber <= COFF::CVL_MaxLineNumber); + uint32_t LineData = LineNumber | COFF::CVL_IsStatement; + Asm->EmitInt32(LineData); } FinishPreviousChunk(); diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h index 43d1a43..78068e0 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h @@ -98,7 +98,7 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { } } FileNameRegistry; - typedef std::map<std::pair<StringRef, StringRef>, char *> + typedef std::map<std::pair<StringRef, StringRef>, std::string> DirAndFilenameToFilepathMapTy; DirAndFilenameToFilepathMapTy DirAndFilenameToFilepathMap; StringRef getFullFilepath(const MDNode *S); @@ -116,14 +116,6 @@ class LLVM_LIBRARY_VISIBILITY WinCodeViewLineTables : public AsmPrinterHandler { public: WinCodeViewLineTables(AsmPrinter *Asm); - ~WinCodeViewLineTables() override { - for (DirAndFilenameToFilepathMapTy::iterator - I = DirAndFilenameToFilepathMap.begin(), - E = DirAndFilenameToFilepathMap.end(); - I != E; ++I) - free(I->second); - } - void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {} /// \brief Emit the COFF section that holds the line table information. 
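The clamping added in maybeRecordLocation and the packing in emitDebugInfoForFunction come down to this: lines above COFF::CVL_MaxLineNumber are skipped entirely, oversized columns are truncated to zero so the row survives, and the emitted 32-bit datum is the line number with the statement bit set. A sketch with the constants spelled out (the values mirror LLVM's COFF::CVL_* definitions at this revision, so treat them as illustrative):

    #include <cstdint>
    #include <optional>

    constexpr uint32_t MaxLineNumber = (1u << 24) - 1; // 24-bit line field
    constexpr uint32_t IsStatement = 1u << 31;         // statement flag
    constexpr uint32_t MaxColumnNumber = UINT16_MAX;   // 16-bit columns

    // Returns the 32-bit line datum, or nothing when the location cannot be
    // recorded at all.
    std::optional<uint32_t> packLine(uint32_t Line, uint32_t &Column) {
      if (Line > MaxLineNumber)
        return std::nullopt; // too large to encode: drop the location
      if (Column > MaxColumnNumber)
        Column = 0;          // truncate the column, keep the row
      return Line | IsStatement;
    }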
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index a2b9316..4da5b58 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCWin64EH.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" @@ -37,6 +38,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; WinException::WinException(AsmPrinter *A) : EHStreamer(A) { @@ -62,9 +64,9 @@ void WinException::beginFunction(const MachineFunction *MF) { // If any landing pads survive, we need an EH table. bool hasLandingPads = !MMI->getLandingPads().empty(); + bool hasEHFunclets = MMI->hasEHFunclets(); const Function *F = MF->getFunction(); - const Function *ParentF = MMI->getWinEHParent(F); shouldEmitMoves = Asm->needsSEHMoves(); @@ -78,49 +80,23 @@ void WinException::beginFunction(const MachineFunction *MF) { F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) && F->needsUnwindTableEntry(); - shouldEmitPersonality = forceEmitPersonality || (hasLandingPads && - PerEncoding != dwarf::DW_EH_PE_omit && Per); + shouldEmitPersonality = + forceEmitPersonality || ((hasLandingPads || hasEHFunclets) && + PerEncoding != dwarf::DW_EH_PE_omit && Per); unsigned LSDAEncoding = TLOF.getLSDAEncoding(); shouldEmitLSDA = shouldEmitPersonality && LSDAEncoding != dwarf::DW_EH_PE_omit; - // If we're not using CFI, we don't want the CFI or the personality. If - // WinEHPrepare outlined something, we should emit the LSDA. + // If we're not using CFI, we don't want the CFI or the personality, but we + // might want EH tables if we had EH pads. if (!Asm->MAI->usesWindowsCFI()) { - bool HasOutlinedChildren = - F->hasFnAttribute("wineh-parent") && F == ParentF; - shouldEmitLSDA = HasOutlinedChildren; + shouldEmitLSDA = hasEHFunclets; shouldEmitPersonality = false; return; } - // If this was an outlined handler, we need to define the label corresponding - // to the offset of the parent frame relative to the stack pointer after the - // prologue. - if (F != ParentF) { - WinEHFuncInfo &FuncInfo = MMI->getWinEHFuncInfo(ParentF); - auto I = FuncInfo.CatchHandlerParentFrameObjOffset.find(F); - if (I != FuncInfo.CatchHandlerParentFrameObjOffset.end()) { - MCSymbol *HandlerTypeParentFrameOffset = - Asm->OutContext.getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(F->getName())); - - // Emit a symbol assignment. - Asm->OutStreamer->EmitAssignment( - HandlerTypeParentFrameOffset, - MCConstantExpr::create(I->second, Asm->OutContext)); - } - } - - if (shouldEmitMoves || shouldEmitPersonality) - Asm->OutStreamer->EmitWinCFIStartProc(Asm->CurrentFnSym); - - if (shouldEmitPersonality) { - const MCSymbol *PersHandlerSym = - TLOF.getCFIPersonalitySymbol(Per, *Asm->Mang, Asm->TM, MMI); - Asm->OutStreamer->EmitWinEHHandler(PersHandlerSym, true, true); - } + beginFunclet(MF->front(), Asm->CurrentFnSym); } /// endFunction - Gather and emit post-function exception information. 
@@ -134,43 +110,158 @@ void WinException::endFunction(const MachineFunction *MF) {
 if (F->hasPersonalityFn())
 Per = classifyEHPersonality(F->getPersonalityFn());
- // Get rid of any dead landing pads if we're not using a Windows EH scheme. In
- // Windows EH schemes, the landing pad is not actually reachable. It only
- // exists so that we can emit the right table data.
- if (!isMSVCEHPersonality(Per))
+ // Get rid of any dead landing pads if we're not using funclets. In funclet
+ // schemes, the landing pad is not actually reachable. It only exists so
+ // that we can emit the right table data.
+ if (!isFuncletEHPersonality(Per))
 MMI->TidyLandingPads();
+ endFunclet();
+
+ // endFunclet will emit the necessary .xdata tables for x64 SEH.
+ if (Per == EHPersonality::MSVC_Win64SEH && MMI->hasEHFunclets())
+ return;
+
 if (shouldEmitPersonality || shouldEmitLSDA) {
 Asm->OutStreamer->PushSection();
- if (shouldEmitMoves || shouldEmitPersonality) {
- // Emit an UNWIND_INFO struct describing the prologue.
- Asm->OutStreamer->EmitWinEHHandlerData();
- } else {
- // Just switch sections to the right xdata section. This use of
- // CurrentFnSym assumes that we only emit the LSDA when ending the parent
- // function.
- MCSection *XData = WinEH::UnwindEmitter::getXDataSection(
- Asm->CurrentFnSym, Asm->OutContext);
- Asm->OutStreamer->SwitchSection(XData);
- }
+ // Just switch sections to the right xdata section. This use of CurrentFnSym
+ // assumes that we only emit the LSDA when ending the parent function.
+ MCSection *XData = WinEH::UnwindEmitter::getXDataSection(Asm->CurrentFnSym,
+ Asm->OutContext);
+ Asm->OutStreamer->SwitchSection(XData);
 // Emit the tables appropriate to the personality function in use. If we
 // don't recognize the personality, assume it uses an Itanium-style LSDA.
 if (Per == EHPersonality::MSVC_Win64SEH)
- emitCSpecificHandlerTable();
+ emitCSpecificHandlerTable(MF);
 else if (Per == EHPersonality::MSVC_X86SEH)
 emitExceptHandlerTable(MF);
 else if (Per == EHPersonality::MSVC_CXX)
 emitCXXFrameHandler3Table(MF);
+ else if (Per == EHPersonality::CoreCLR)
+ emitCLRExceptionTable(MF);
 else
 emitExceptionTable();
 Asm->OutStreamer->PopSection();
 }
+}
+
+/// Retrieve the MCSymbol for a GlobalValue or MachineBasicBlock.
+static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
+ const MachineBasicBlock *MBB) {
+ if (!MBB)
+ return nullptr;
+ assert(MBB->isEHFuncletEntry());
+
+ // Give catches and cleanups a name based off of their parent function and
+ // their funclet entry block's number.
+ const MachineFunction *MF = MBB->getParent();
+ const Function *F = MF->getFunction();
+ StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ MCContext &Ctx = MF->getContext();
+ StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
+ return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
+ Twine(MBB->getNumber()) + "@?0?" +
+ FuncLinkageName + "@4HA");
+}
+
+void WinException::beginFunclet(const MachineBasicBlock &MBB,
+ MCSymbol *Sym) {
+ CurrentFuncletEntry = &MBB;
+
+ const Function *F = Asm->MF->getFunction();
+ // If a symbol was not provided for the funclet, invent one.
+ if (!Sym) {
+ Sym = getMCSymbolForMBB(Asm, &MBB);
+
+ // Describe our funclet symbol as a function with internal linkage. 
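getMCSymbolForMBB above assembles an MSVC-compatible mangled name for each funclet from the handler kind, the entry block number, and the parent's linkage name. The same construction with plain std::string; the parent name in the usage example is a hypothetical MSVC mangling for void f():

    #include <iostream>
    #include <string>

    // "?<catch|dtor>$<block#>@?0?<parent>@4HA", as built above with Twine.
    std::string funcletSymbol(bool IsCleanup, int BlockNumber,
                              const std::string &ParentLinkageName) {
      const std::string Prefix = IsCleanup ? "dtor" : "catch";
      return "?" + Prefix + "$" + std::to_string(BlockNumber) + "@?0?" +
             ParentLinkageName + "@4HA";
    }

    int main() {
      std::cout << funcletSymbol(false, 1, "?f@@YAXXZ") << '\n';
      // -> ?catch$1@?0??f@@YAXXZ@4HA
    }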
+ Asm->OutStreamer->BeginCOFFSymbolDef(Sym); + Asm->OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + Asm->OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + Asm->OutStreamer->EndCOFFSymbolDef(); + + // We want our funclet's entry point to be aligned such that no nops will be + // present after the label. + Asm->EmitAlignment(std::max(Asm->MF->getAlignment(), MBB.getAlignment()), + F); + + // Now that we've emitted the alignment directive, point at our funclet. + Asm->OutStreamer->EmitLabel(Sym); + } + + // Mark 'Sym' as starting our funclet. if (shouldEmitMoves || shouldEmitPersonality) + Asm->OutStreamer->EmitWinCFIStartProc(Sym); + + if (shouldEmitPersonality) { + const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); + const Function *PerFn = nullptr; + + // Determine which personality routine we are using for this funclet. + if (F->hasPersonalityFn()) + PerFn = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts()); + const MCSymbol *PersHandlerSym = + TLOF.getCFIPersonalitySymbol(PerFn, *Asm->Mang, Asm->TM, MMI); + + // Classify the personality routine so that we may reason about it. + EHPersonality Per = EHPersonality::Unknown; + if (F->hasPersonalityFn()) + Per = classifyEHPersonality(F->getPersonalityFn()); + + // Do not emit a .seh_handler directive if it is a C++ cleanup funclet. + if (Per != EHPersonality::MSVC_CXX || + !CurrentFuncletEntry->isCleanupFuncletEntry()) + Asm->OutStreamer->EmitWinEHHandler(PersHandlerSym, true, true); + } +} + +void WinException::endFunclet() { + // No funclet to process? Great, we have nothing to do. + if (!CurrentFuncletEntry) + return; + + if (shouldEmitMoves || shouldEmitPersonality) { + const Function *F = Asm->MF->getFunction(); + EHPersonality Per = EHPersonality::Unknown; + if (F->hasPersonalityFn()) + Per = classifyEHPersonality(F->getPersonalityFn()); + + // The .seh_handlerdata directive implicitly switches section, push the + // current section so that we may return to it. + Asm->OutStreamer->PushSection(); + + // Emit an UNWIND_INFO struct describing the prologue. + Asm->OutStreamer->EmitWinEHHandlerData(); + + if (Per == EHPersonality::MSVC_CXX && shouldEmitPersonality && + !CurrentFuncletEntry->isCleanupFuncletEntry()) { + // If this is a C++ catch funclet (or the parent function), + // emit a reference to the LSDA for the parent function. + StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol( + Twine("$cppxdata$", FuncLinkageName)); + Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4); + } else if (Per == EHPersonality::MSVC_Win64SEH && MMI->hasEHFunclets() && + !CurrentFuncletEntry->isEHFuncletEntry()) { + // If this is the parent function in Win64 SEH, emit the LSDA immediately + // following .seh_handlerdata. + emitCSpecificHandlerTable(Asm->MF); + } + + // Switch back to the previous section now that we are done writing to + // .xdata. + Asm->OutStreamer->PopSection(); + + // Emit a .seh_endproc directive to mark the end of the function. Asm->OutStreamer->EmitWinCFIEndProc(); + } + + // Let's make sure we don't try to end the same funclet twice. 
+ CurrentFuncletEntry = nullptr; } const MCExpr *WinException::create32bitRef(const MCSymbol *Value) { @@ -188,6 +279,202 @@ const MCExpr *WinException::create32bitRef(const GlobalValue *GV) { return create32bitRef(Asm->getSymbol(GV)); } +const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) { + return MCBinaryExpr::createAdd(create32bitRef(Label), + MCConstantExpr::create(1, Asm->OutContext), + Asm->OutContext); +} + +const MCExpr *WinException::getOffset(const MCSymbol *OffsetOf, + const MCSymbol *OffsetFrom) { + return MCBinaryExpr::createSub( + MCSymbolRefExpr::create(OffsetOf, Asm->OutContext), + MCSymbolRefExpr::create(OffsetFrom, Asm->OutContext), Asm->OutContext); +} + +const MCExpr *WinException::getOffsetPlusOne(const MCSymbol *OffsetOf, + const MCSymbol *OffsetFrom) { + return MCBinaryExpr::createAdd(getOffset(OffsetOf, OffsetFrom), + MCConstantExpr::create(1, Asm->OutContext), + Asm->OutContext); +} + +int WinException::getFrameIndexOffset(int FrameIndex, + const WinEHFuncInfo &FuncInfo) { + const TargetFrameLowering &TFI = *Asm->MF->getSubtarget().getFrameLowering(); + unsigned UnusedReg; + if (Asm->MAI->usesWindowsCFI()) + return TFI.getFrameIndexReferenceFromSP(*Asm->MF, FrameIndex, UnusedReg); + // For 32-bit, offsets should be relative to the end of the EH registration + // node. For 64-bit, it's relative to SP at the end of the prologue. + assert(FuncInfo.EHRegNodeEndOffset != INT_MAX); + int Offset = TFI.getFrameIndexReference(*Asm->MF, FrameIndex, UnusedReg); + Offset += FuncInfo.EHRegNodeEndOffset; + return Offset; +} + +namespace { + +/// Top-level state used to represent unwind to caller +const int NullState = -1; + +struct InvokeStateChange { + /// EH Label immediately after the last invoke in the previous state, or + /// nullptr if the previous state was the null state. + const MCSymbol *PreviousEndLabel; + + /// EH label immediately before the first invoke in the new state, or nullptr + /// if the new state is the null state. + const MCSymbol *NewStartLabel; + + /// State of the invoke following NewStartLabel, or NullState to indicate + /// the presence of calls which may unwind to caller. + int NewState; +}; + +/// Iterator that reports all the invoke state changes in a range of machine +/// basic blocks. Changes to the null state are reported whenever a call that +/// may unwind to caller is encountered. The MBB range is expected to be an +/// entire function or funclet, and the start and end of the range are treated +/// as being in the NullState even if there's not an unwind-to-caller call +/// before the first invoke or after the last one (i.e., the first state change +/// reported is the first change to something other than NullState, and a +/// change back to NullState is always reported at the end of iteration). 
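Before the iterator itself, it may help to see the stream it produces on a toy input. A minimal model of the coalescing described above, with labels elided and per-call EH states given directly (the input sequence is invented for illustration):

    #include <iostream>
    #include <vector>

    int main() {
      const int NullState = -1; // unwind to caller
      // EH state of each call in layout order; -1 marks calls that may
      // unwind to the caller.
      const std::vector<int> CallStates = {2, 2, NullState, 1, 1, 0};
      int Current = NullState;
      for (int S : CallStates) {
        if (S == Current)
          continue; // same state: the current range simply grows
        std::cout << "change to state " << S << '\n';
        Current = S;
      }
      // A change back to the null state is always reported at the end.
      if (Current != NullState)
        std::cout << "change to state " << NullState << '\n';
    }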
+class InvokeStateChangeIterator {
+ InvokeStateChangeIterator(const WinEHFuncInfo &EHInfo,
+ MachineFunction::const_iterator MFI,
+ MachineFunction::const_iterator MFE,
+ MachineBasicBlock::const_iterator MBBI,
+ int BaseState)
+ : EHInfo(EHInfo), MFI(MFI), MFE(MFE), MBBI(MBBI), BaseState(BaseState) {
+ LastStateChange.PreviousEndLabel = nullptr;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ scan();
+ }
+
+public:
+ static iterator_range<InvokeStateChangeIterator>
+ range(const WinEHFuncInfo &EHInfo, MachineFunction::const_iterator Begin,
+ MachineFunction::const_iterator End, int BaseState = NullState) {
+ // Reject empty ranges to simplify bookkeeping by ensuring that we can get
+ // the end of the last block.
+ assert(Begin != End);
+ auto BlockBegin = Begin->begin();
+ auto BlockEnd = std::prev(End)->end();
+ return make_range(
+ InvokeStateChangeIterator(EHInfo, Begin, End, BlockBegin, BaseState),
+ InvokeStateChangeIterator(EHInfo, End, End, BlockEnd, BaseState));
+ }
+
+ // Iterator methods.
+ bool operator==(const InvokeStateChangeIterator &O) const {
+ assert(BaseState == O.BaseState);
+ // Must be visiting same block.
+ if (MFI != O.MFI)
+ return false;
+ // Must be visiting same instr.
+ if (MBBI != O.MBBI)
+ return false;
+ // At end of block/instr iteration, we can still have two distinct states:
+ // one to report the final EndLabel, and another indicating the end of the
+ // state change iteration. Check for CurrentEndLabel equality to
+ // distinguish these.
+ return CurrentEndLabel == O.CurrentEndLabel;
+ }
+
+ bool operator!=(const InvokeStateChangeIterator &O) const {
+ return !operator==(O);
+ }
+ InvokeStateChange &operator*() { return LastStateChange; }
+ InvokeStateChange *operator->() { return &LastStateChange; }
+ InvokeStateChangeIterator &operator++() { return scan(); }
+
+private:
+ InvokeStateChangeIterator &scan();
+
+ const WinEHFuncInfo &EHInfo;
+ const MCSymbol *CurrentEndLabel = nullptr;
+ MachineFunction::const_iterator MFI;
+ MachineFunction::const_iterator MFE;
+ MachineBasicBlock::const_iterator MBBI;
+ InvokeStateChange LastStateChange;
+ bool VisitingInvoke = false;
+ int BaseState;
+};
+
+} // end anonymous namespace
+
+InvokeStateChangeIterator &InvokeStateChangeIterator::scan() {
+ bool IsNewBlock = false;
+ for (; MFI != MFE; ++MFI, IsNewBlock = true) {
+ if (IsNewBlock)
+ MBBI = MFI->begin();
+ for (auto MBBE = MFI->end(); MBBI != MBBE; ++MBBI) {
+ const MachineInstr &MI = *MBBI;
+ if (!VisitingInvoke && LastStateChange.NewState != BaseState &&
+ MI.isCall() && !EHStreamer::callToNoUnwindFunction(&MI)) {
+ // Indicate a change of state to the null state. We don't have
+ // start/end EH labels handy but the caller won't expect them for
+ // null state regions.
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ CurrentEndLabel = nullptr;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+
+ // All other state changes are at EH labels before/after invokes. 
+ if (!MI.isEHLabel())
+ continue;
+ MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+ if (Label == CurrentEndLabel) {
+ VisitingInvoke = false;
+ continue;
+ }
+ auto InvokeMapIter = EHInfo.LabelToStateMap.find(Label);
+ // Ignore EH labels that aren't the ones inserted before an invoke
+ if (InvokeMapIter == EHInfo.LabelToStateMap.end())
+ continue;
+ auto &StateAndEnd = InvokeMapIter->second;
+ int NewState = StateAndEnd.first;
+ // Keep track of the fact that we're between EH start/end labels so
+ // we know not to treat the invoke we'll see as unwinding to caller.
+ VisitingInvoke = true;
+ if (NewState == LastStateChange.NewState) {
+ // The state isn't actually changing here. Record the new end and
+ // keep going.
+ CurrentEndLabel = StateAndEnd.second;
+ continue;
+ }
+ // Found a state change to report
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = Label;
+ LastStateChange.NewState = NewState;
+ // Start keeping track of the new current end
+ CurrentEndLabel = StateAndEnd.second;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+ }
+ // Iteration hit the end of the block range.
+ if (LastStateChange.NewState != BaseState) {
+ // Report the end of the last new state
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ // Leave CurrentEndLabel non-null to distinguish this state from end.
+ assert(CurrentEndLabel != nullptr);
+ return *this;
+ }
+ // We've reported all state changes and hit the end state.
+ CurrentEndLabel = nullptr;
+ return *this;
+}
+
 /// Emit the language-specific data that __C_specific_handler expects. This
 /// handler lives in the x64 Microsoft C runtime and allows catching or cleaning
 /// up after faults with __try, __except, and __finally. The typeinfo values
@@ -216,135 +503,156 @@ const MCExpr *WinException::create32bitRef(const GlobalValue *GV) {
 /// imagerel32 LabelLPad; // Zero means __finally.
 /// } Entries[NumEntries];
 /// };
-void WinException::emitCSpecificHandlerTable() {
- const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
-
- // Simplifying assumptions for first implementation:
- // - Cleanups are not implemented.
- // - Filters are not implemented.
-
- // The Itanium LSDA table sorts similar landing pads together to simplify the
- // actions table, but we don't need that.
- SmallVector<const LandingPadInfo *, 64> LandingPads;
- LandingPads.reserve(PadInfos.size());
- for (const auto &LP : PadInfos)
- LandingPads.push_back(&LP);
-
- // Compute label ranges for call sites as we would for the Itanium LSDA, but
- // use an all zero action table because we aren't using these actions.
- SmallVector<unsigned, 64> FirstActions;
- FirstActions.resize(LandingPads.size());
- SmallVector<CallSiteEntry, 64> CallSites;
- computeCallSiteTable(CallSites, LandingPads, FirstActions);
-
- MCSymbol *EHFuncBeginSym = Asm->getFunctionBegin();
- MCSymbol *EHFuncEndSym = Asm->getFunctionEnd();
-
- // Emit the number of table entries.
- unsigned NumEntries = 0;
- for (const CallSiteEntry &CSE : CallSites) {
- if (!CSE.LPad)
- continue; // Ignore gaps. 
- NumEntries += CSE.LPad->SEHHandlers.size(); +void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) { + auto &OS = *Asm->OutStreamer; + MCContext &Ctx = Asm->OutContext; + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); + + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; + + // Emit a label assignment with the SEH frame offset so we can use it for + // llvm.x86.seh.recoverfp. + StringRef FLinkageName = + GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + MCSymbol *ParentFrameOffset = + Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); + const MCExpr *MCOffset = + MCConstantExpr::create(FuncInfo.SEHSetFrameOffset, Ctx); + Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset); + + // Use the assembler to compute the number of table entries through label + // difference and division. + MCSymbol *TableBegin = + Ctx.createTempSymbol("lsda_begin", /*AlwaysAddSuffix=*/true); + MCSymbol *TableEnd = + Ctx.createTempSymbol("lsda_end", /*AlwaysAddSuffix=*/true); + const MCExpr *LabelDiff = getOffset(TableEnd, TableBegin); + const MCExpr *EntrySize = MCConstantExpr::create(16, Ctx); + const MCExpr *EntryCount = MCBinaryExpr::createDiv(LabelDiff, EntrySize, Ctx); + AddComment("Number of call sites"); + OS.EmitValue(EntryCount, 4); + + OS.EmitLabel(TableBegin); + + // Iterate over all the invoke try ranges. Unlike MSVC, LLVM currently only + // models exceptions from invokes. LLVM also allows arbitrary reordering of + // the code, so our tables end up looking a bit different. Rather than + // trying to match MSVC's tables exactly, we emit a denormalized table. For + // each range of invokes in the same state, we emit table entries for all + // the actions that would be taken in that state. This means our tables are + // slightly bigger, which is OK. + const MCSymbol *LastStartLabel = nullptr; + int LastEHState = -1; + // Break out before we enter into a finally funclet. + // FIXME: We need to emit separate EH tables for cleanups. + MachineFunction::const_iterator End = MF->end(); + MachineFunction::const_iterator Stop = std::next(MF->begin()); + while (Stop != End && !Stop->isEHFuncletEntry()) + ++Stop; + for (const auto &StateChange : + InvokeStateChangeIterator::range(FuncInfo, MF->begin(), Stop)) { + // Emit all the actions for the state we just transitioned out of + // if it was not the null state + if (LastEHState != -1) + emitSEHActionsForRange(FuncInfo, LastStartLabel, + StateChange.PreviousEndLabel, LastEHState); + LastStartLabel = StateChange.NewStartLabel; + LastEHState = StateChange.NewState; } - Asm->OutStreamer->EmitIntValue(NumEntries, 4); - // If there are no actions, we don't need to iterate again. - if (NumEntries == 0) - return; + OS.EmitLabel(TableEnd); +} - // Emit the four-label records for each call site entry. The table has to be - // sorted in layout order, and the call sites should already be sorted. - for (const CallSiteEntry &CSE : CallSites) { - // Ignore gaps. Unlike the Itanium model, unwinding through a frame without - // an EH table entry will propagate the exception rather than terminating - // the program. - if (!CSE.LPad) - continue; - const LandingPadInfo *LPad = CSE.LPad; - - // Compute the label range. We may reuse the function begin and end labels - // rather than forming new ones. - const MCExpr *Begin = - create32bitRef(CSE.BeginLabel ? 
CSE.BeginLabel : EHFuncBeginSym); - const MCExpr *End; - if (CSE.EndLabel) { - // The interval is half-open, so we have to add one to include the return - // address of the last invoke in the range. - End = MCBinaryExpr::createAdd(create32bitRef(CSE.EndLabel), - MCConstantExpr::create(1, Asm->OutContext), - Asm->OutContext); +void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo, + const MCSymbol *BeginLabel, + const MCSymbol *EndLabel, int State) { + auto &OS = *Asm->OutStreamer; + MCContext &Ctx = Asm->OutContext; + + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; + + assert(BeginLabel && EndLabel); + while (State != -1) { + const SEHUnwindMapEntry &UME = FuncInfo.SEHUnwindMap[State]; + const MCExpr *FilterOrFinally; + const MCExpr *ExceptOrNull; + auto *Handler = UME.Handler.get<MachineBasicBlock *>(); + if (UME.IsFinally) { + FilterOrFinally = create32bitRef(getMCSymbolForMBB(Asm, Handler)); + ExceptOrNull = MCConstantExpr::create(0, Ctx); } else { - End = create32bitRef(EHFuncEndSym); + // For an except, the filter can be 1 (catch-all) or a function + // label. + FilterOrFinally = UME.Filter ? create32bitRef(UME.Filter) + : MCConstantExpr::create(1, Ctx); + ExceptOrNull = create32bitRef(Handler->getSymbol()); } - // Emit an entry for each action. - for (SEHHandler Handler : LPad->SEHHandlers) { - Asm->OutStreamer->EmitValue(Begin, 4); - Asm->OutStreamer->EmitValue(End, 4); - - // Emit the filter or finally function pointer, if present. Otherwise, - // emit '1' to indicate a catch-all. - const Function *F = Handler.FilterOrFinally; - if (F) - Asm->OutStreamer->EmitValue(create32bitRef(Asm->getSymbol(F)), 4); - else - Asm->OutStreamer->EmitIntValue(1, 4); - - // Emit the recovery address, if present. Otherwise, this must be a - // finally. - const BlockAddress *BA = Handler.RecoverBA; - if (BA) - Asm->OutStreamer->EmitValue( - create32bitRef(Asm->GetBlockAddressSymbol(BA)), 4); - else - Asm->OutStreamer->EmitIntValue(0, 4); - } + AddComment("LabelStart"); + OS.EmitValue(getLabelPlusOne(BeginLabel), 4); + AddComment("LabelEnd"); + OS.EmitValue(getLabelPlusOne(EndLabel), 4); + AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction" + : "CatchAll"); + OS.EmitValue(FilterOrFinally, 4); + AddComment(UME.IsFinally ? "Null" : "ExceptionHandler"); + OS.EmitValue(ExceptOrNull, 4); + + assert(UME.ToState < State && "states should decrease"); + State = UME.ToState; } } void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { const Function *F = MF->getFunction(); - const Function *ParentF = MMI->getWinEHParent(F); auto &OS = *Asm->OutStreamer; - WinEHFuncInfo &FuncInfo = MMI->getWinEHFuncInfo(ParentF); + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); - StringRef ParentLinkageName = - GlobalValue::getRealLinkageName(ParentF->getName()); + StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable; MCSymbol *FuncInfoXData = nullptr; if (shouldEmitPersonality) { - FuncInfoXData = Asm->OutContext.getOrCreateSymbol( - Twine("$cppxdata$", ParentLinkageName)); - OS.EmitValue(create32bitRef(FuncInfoXData), 4); - - extendIP2StateTable(MF, ParentF, FuncInfo); - - // Defer emission until we've visited the parent function and all the catch - // handlers. Cleanups don't contribute to the ip2state table, so don't count - // them. 
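The replacement emitSEHActionsForRange above walks the SEHUnwindMap chain from the innermost state outward until it reaches -1, emitting one table entry per enclosing level. The traversal in isolation, over an invented toy unwind map:

    #include <cassert>
    #include <iostream>
    #include <vector>

    // Stand-in for SEHUnwindMapEntry with only the fields the walk needs.
    struct UnwindEntry { int ToState; bool IsFinally; };

    void emitActions(const std::vector<UnwindEntry> &Map, int State) {
      while (State != -1) {
        const UnwindEntry &UME = Map[State];
        std::cout << (UME.IsFinally ? "finally" : "except")
                  << " action for state " << State << '\n';
        assert(UME.ToState < State && "states should decrease");
        State = UME.ToState;
      }
    }

    int main() {
      // State 2 nests inside state 1, which nests inside state 0.
      emitActions({{-1, false}, {0, true}, {1, false}}, 2);
    }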
- if (ParentF != F && !FuncInfo.CatchHandlerMaxState.count(F)) - return; - ++FuncInfo.NumIPToStateFuncsVisited; - if (FuncInfo.NumIPToStateFuncsVisited != FuncInfo.CatchHandlerMaxState.size()) - return; + // If we're 64-bit, emit a pointer to the C++ EH data, and build a map from + // IPs to state numbers. + FuncInfoXData = + Asm->OutContext.getOrCreateSymbol(Twine("$cppxdata$", FuncLinkageName)); + computeIP2StateTable(MF, FuncInfo, IPToStateTable); } else { - FuncInfoXData = Asm->OutContext.getOrCreateLSDASymbol(ParentLinkageName); - emitEHRegistrationOffsetLabel(FuncInfo, ParentLinkageName); + FuncInfoXData = Asm->OutContext.getOrCreateLSDASymbol(FuncLinkageName); } + int UnwindHelpOffset = 0; + if (Asm->MAI->usesWindowsCFI()) + UnwindHelpOffset = + getFrameIndexOffset(FuncInfo.UnwindHelpFrameIdx, FuncInfo); + MCSymbol *UnwindMapXData = nullptr; MCSymbol *TryBlockMapXData = nullptr; MCSymbol *IPToStateXData = nullptr; - if (!FuncInfo.UnwindMap.empty()) + if (!FuncInfo.CxxUnwindMap.empty()) UnwindMapXData = Asm->OutContext.getOrCreateSymbol( - Twine("$stateUnwindMap$", ParentLinkageName)); + Twine("$stateUnwindMap$", FuncLinkageName)); if (!FuncInfo.TryBlockMap.empty()) - TryBlockMapXData = Asm->OutContext.getOrCreateSymbol( - Twine("$tryMap$", ParentLinkageName)); - if (!FuncInfo.IPToStateList.empty()) - IPToStateXData = Asm->OutContext.getOrCreateSymbol( - Twine("$ip2state$", ParentLinkageName)); + TryBlockMapXData = + Asm->OutContext.getOrCreateSymbol(Twine("$tryMap$", FuncLinkageName)); + if (!IPToStateTable.empty()) + IPToStateXData = + Asm->OutContext.getOrCreateSymbol(Twine("$ip2state$", FuncLinkageName)); + + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; // FuncInfo { // uint32_t MagicNumber @@ -363,17 +671,38 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { // EHFlags & 4 -> The function is noexcept(true), unwinding can't continue. 
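For reference, the record described by the comment block above, rendered as a plain struct with fields in emission order. UnwindHelp is only emitted when Windows CFI is in use (x64), so the x86 record omits that slot:

    #include <cstdint>

    struct FuncInfo {
      uint32_t MagicNumber;  // 0x19930522
      uint32_t MaxState;     // CxxUnwindMap.size()
      uint32_t UnwindMap;    // imagerel32
      uint32_t NumTryBlocks;
      uint32_t TryBlockMap;  // imagerel32
      uint32_t IPMapEntries;
      uint32_t IPToStateMap; // imagerel32
      int32_t  UnwindHelp;   // frame offset; present on x64 only
      uint32_t ESTypeList;   // 0 here
      uint32_t EHFlags;      // 1 here
    };
    static_assert(sizeof(FuncInfo) == 40, "ten packed 32-bit fields");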
OS.EmitValueToAlignment(4); OS.EmitLabel(FuncInfoXData); - OS.EmitIntValue(0x19930522, 4); // MagicNumber - OS.EmitIntValue(FuncInfo.UnwindMap.size(), 4); // MaxState - OS.EmitValue(create32bitRef(UnwindMapXData), 4); // UnwindMap - OS.EmitIntValue(FuncInfo.TryBlockMap.size(), 4); // NumTryBlocks - OS.EmitValue(create32bitRef(TryBlockMapXData), 4); // TryBlockMap - OS.EmitIntValue(FuncInfo.IPToStateList.size(), 4); // IPMapEntries - OS.EmitValue(create32bitRef(IPToStateXData), 4); // IPToStateMap - if (Asm->MAI->usesWindowsCFI()) - OS.EmitIntValue(FuncInfo.UnwindHelpFrameOffset, 4); // UnwindHelp - OS.EmitIntValue(0, 4); // ESTypeList - OS.EmitIntValue(1, 4); // EHFlags + + AddComment("MagicNumber"); + OS.EmitIntValue(0x19930522, 4); + + AddComment("MaxState"); + OS.EmitIntValue(FuncInfo.CxxUnwindMap.size(), 4); + + AddComment("UnwindMap"); + OS.EmitValue(create32bitRef(UnwindMapXData), 4); + + AddComment("NumTryBlocks"); + OS.EmitIntValue(FuncInfo.TryBlockMap.size(), 4); + + AddComment("TryBlockMap"); + OS.EmitValue(create32bitRef(TryBlockMapXData), 4); + + AddComment("IPMapEntries"); + OS.EmitIntValue(IPToStateTable.size(), 4); + + AddComment("IPToStateXData"); + OS.EmitValue(create32bitRef(IPToStateXData), 4); + + if (Asm->MAI->usesWindowsCFI()) { + AddComment("UnwindHelp"); + OS.EmitIntValue(UnwindHelpOffset, 4); + } + + AddComment("ESTypeList"); + OS.EmitIntValue(0, 4); + + AddComment("EHFlags"); + OS.EmitIntValue(1, 4); // UnwindMapEntry { // int32_t ToState; @@ -381,9 +710,14 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { // }; if (UnwindMapXData) { OS.EmitLabel(UnwindMapXData); - for (const WinEHUnwindMapEntry &UME : FuncInfo.UnwindMap) { - OS.EmitIntValue(UME.ToState, 4); // ToState - OS.EmitValue(create32bitRef(UME.Cleanup), 4); // Action + for (const CxxUnwindMapEntry &UME : FuncInfo.CxxUnwindMap) { + MCSymbol *CleanupSym = + getMCSymbolForMBB(Asm, UME.Cleanup.dyn_cast<MachineBasicBlock *>()); + AddComment("ToState"); + OS.EmitIntValue(UME.ToState, 4); + + AddComment("Action"); + OS.EmitValue(create32bitRef(CleanupSym), 4); } } @@ -398,33 +732,49 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { OS.EmitLabel(TryBlockMapXData); SmallVector<MCSymbol *, 1> HandlerMaps; for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) { - WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; - MCSymbol *HandlerMapXData = nullptr; + const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; + MCSymbol *HandlerMapXData = nullptr; if (!TBME.HandlerArray.empty()) HandlerMapXData = Asm->OutContext.getOrCreateSymbol(Twine("$handlerMap$") .concat(Twine(I)) .concat("$") - .concat(ParentLinkageName)); - + .concat(FuncLinkageName)); HandlerMaps.push_back(HandlerMapXData); - int CatchHigh = -1; - for (WinEHHandlerType &HT : TBME.HandlerArray) - CatchHigh = - std::max(CatchHigh, FuncInfo.CatchHandlerMaxState[HT.Handler]); - - assert(TBME.TryLow <= TBME.TryHigh); - OS.EmitIntValue(TBME.TryLow, 4); // TryLow - OS.EmitIntValue(TBME.TryHigh, 4); // TryHigh - OS.EmitIntValue(CatchHigh, 4); // CatchHigh - OS.EmitIntValue(TBME.HandlerArray.size(), 4); // NumCatches - OS.EmitValue(create32bitRef(HandlerMapXData), 4); // HandlerArray + // TBMEs should form intervals. 
+ assert(0 <= TBME.TryLow && "bad trymap interval"); + assert(TBME.TryLow <= TBME.TryHigh && "bad trymap interval"); + assert(TBME.TryHigh < TBME.CatchHigh && "bad trymap interval"); + assert(TBME.CatchHigh < int(FuncInfo.CxxUnwindMap.size()) && + "bad trymap interval"); + + AddComment("TryLow"); + OS.EmitIntValue(TBME.TryLow, 4); + + AddComment("TryHigh"); + OS.EmitIntValue(TBME.TryHigh, 4); + + AddComment("CatchHigh"); + OS.EmitIntValue(TBME.CatchHigh, 4); + + AddComment("NumCatches"); + OS.EmitIntValue(TBME.HandlerArray.size(), 4); + + AddComment("HandlerArray"); + OS.EmitValue(create32bitRef(HandlerMapXData), 4); + } + + // All funclets use the same parent frame offset currently. + unsigned ParentFrameOffset = 0; + if (shouldEmitPersonality) { + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + ParentFrameOffset = TFI->getWinEHParentFrameOffset(*MF); } for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) { - WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; + const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I]; MCSymbol *HandlerMapXData = HandlerMaps[I]; if (!HandlerMapXData) continue; @@ -438,32 +788,34 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { OS.EmitLabel(HandlerMapXData); for (const WinEHHandlerType &HT : TBME.HandlerArray) { // Get the frame escape label with the offset of the catch object. If - // the index is -1, then there is no catch object, and we should emit an - // offset of zero, indicating that no copy will occur. + // the index is INT_MAX, then there is no catch object, and we should + // emit an offset of zero, indicating that no copy will occur. const MCExpr *FrameAllocOffsetRef = nullptr; - if (HT.CatchObjRecoverIdx >= 0) { - MCSymbol *FrameAllocOffset = - Asm->OutContext.getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(ParentF->getName()), - HT.CatchObjRecoverIdx); - FrameAllocOffsetRef = MCSymbolRefExpr::create( - FrameAllocOffset, MCSymbolRefExpr::VK_None, Asm->OutContext); + if (HT.CatchObj.FrameIndex != INT_MAX) { + int Offset = getFrameIndexOffset(HT.CatchObj.FrameIndex, FuncInfo); + FrameAllocOffsetRef = MCConstantExpr::create(Offset, Asm->OutContext); } else { FrameAllocOffsetRef = MCConstantExpr::create(0, Asm->OutContext); } - OS.EmitIntValue(HT.Adjectives, 4); // Adjectives - OS.EmitValue(create32bitRef(HT.TypeDescriptor), 4); // Type - OS.EmitValue(FrameAllocOffsetRef, 4); // CatchObjOffset - OS.EmitValue(create32bitRef(HT.Handler), 4); // Handler + MCSymbol *HandlerSym = + getMCSymbolForMBB(Asm, HT.Handler.dyn_cast<MachineBasicBlock *>()); + + AddComment("Adjectives"); + OS.EmitIntValue(HT.Adjectives, 4); + + AddComment("Type"); + OS.EmitValue(create32bitRef(HT.TypeDescriptor), 4); + + AddComment("CatchObjOffset"); + OS.EmitValue(FrameAllocOffsetRef, 4); + + AddComment("Handler"); + OS.EmitValue(create32bitRef(HandlerSym), 4); if (shouldEmitPersonality) { - MCSymbol *ParentFrameOffset = - Asm->OutContext.getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(HT.Handler->getName())); - const MCSymbolRefExpr *ParentFrameOffsetRef = MCSymbolRefExpr::create( - ParentFrameOffset, Asm->OutContext); - OS.EmitValue(ParentFrameOffsetRef, 4); // ParentFrameOffset + AddComment("ParentFrameOffset"); + OS.EmitIntValue(ParentFrameOffset, 4); } } } @@ -475,87 +827,65 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { // }; if (IPToStateXData) { OS.EmitLabel(IPToStateXData); - for (auto &IPStatePair : FuncInfo.IPToStateList) { - 
OS.EmitValue(create32bitRef(IPStatePair.first), 4); // IP - OS.EmitIntValue(IPStatePair.second, 4); // State + for (auto &IPStatePair : IPToStateTable) { + AddComment("IP"); + OS.EmitValue(IPStatePair.first, 4); + AddComment("ToState"); + OS.EmitIntValue(IPStatePair.second, 4); } } } -void WinException::extendIP2StateTable(const MachineFunction *MF, - const Function *ParentF, - WinEHFuncInfo &FuncInfo) { - const Function *F = MF->getFunction(); - - // The Itanium LSDA table sorts similar landing pads together to simplify the - // actions table, but we don't need that. - SmallVector<const LandingPadInfo *, 64> LandingPads; - const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads(); - LandingPads.reserve(PadInfos.size()); - for (const auto &LP : PadInfos) - LandingPads.push_back(&LP); - - RangeMapType PadMap; - computePadMap(LandingPads, PadMap); - - // The end label of the previous invoke or nounwind try-range. - MCSymbol *LastLabel = Asm->getFunctionBegin(); - - // Whether there is a potentially throwing instruction (currently this means - // an ordinary call) between the end of the previous try-range and now. - bool SawPotentiallyThrowing = false; - - int LastEHState = -2; - - // The parent function and the catch handlers contribute to the 'ip2state' - // table. - - // Include ip2state entries for the beginning of the main function and - // for catch handler functions. - if (F == ParentF) { - FuncInfo.IPToStateList.push_back(std::make_pair(LastLabel, -1)); - LastEHState = -1; - } else if (FuncInfo.HandlerBaseState.count(F)) { - FuncInfo.IPToStateList.push_back( - std::make_pair(LastLabel, FuncInfo.HandlerBaseState[F])); - LastEHState = FuncInfo.HandlerBaseState[F]; - } - for (const auto &MBB : *MF) { - for (const auto &MI : MBB) { - if (!MI.isEHLabel()) { - if (MI.isCall()) - SawPotentiallyThrowing |= !callToNoUnwindFunction(&MI); - continue; +void WinException::computeIP2StateTable( + const MachineFunction *MF, const WinEHFuncInfo &FuncInfo, + SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable) { + + for (MachineFunction::const_iterator FuncletStart = MF->begin(), + FuncletEnd = MF->begin(), + End = MF->end(); + FuncletStart != End; FuncletStart = FuncletEnd) { + // Find the end of the funclet + while (++FuncletEnd != End) { + if (FuncletEnd->isEHFuncletEntry()) { + break; } + } - // End of the previous try-range? - MCSymbol *BeginLabel = MI.getOperand(0).getMCSymbol(); - if (BeginLabel == LastLabel) - SawPotentiallyThrowing = false; - - // Beginning of a new try-range? - RangeMapType::const_iterator L = PadMap.find(BeginLabel); - if (L == PadMap.end()) - // Nope, it was just some random label. - continue; - - const PadRange &P = L->second; - const LandingPadInfo *LandingPad = LandingPads[P.PadIndex]; - assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] && - "Inconsistent landing pad map!"); - - // FIXME: Should this be using FuncInfo.HandlerBaseState? - if (SawPotentiallyThrowing && LastEHState != -1) { - FuncInfo.IPToStateList.push_back(std::make_pair(LastLabel, -1)); - SawPotentiallyThrowing = false; - LastEHState = -1; - } + // Don't emit ip2state entries for cleanup funclets. Any interesting + // exceptional actions in cleanups must be handled in a separate IR + // function. 
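computeIP2StateTable here, like the CLR table writer further down, walks the function as a sequence of [FuncletStart, FuncletEnd) ranges delimited by blocks marked isEHFuncletEntry. The range discovery on its own, with blocks reduced to a flag vector (block 0 is the parent function's entry):

    #include <iostream>
    #include <vector>

    int main() {
      const std::vector<bool> IsFuncletEntry = {false, false, true, false, true};
      const size_t End = IsFuncletEntry.size();
      size_t Start = 0;
      while (Start != End) {
        size_t Next = Start + 1; // a range always contains its entry block
        while (Next != End && !IsFuncletEntry[Next])
          ++Next;
        std::cout << "funclet blocks [" << Start << ", " << Next << ")\n";
        Start = Next;
      }
    }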
+ if (FuncletStart->isCleanupFuncletEntry()) + continue; - if (LandingPad->WinEHState != LastEHState) - FuncInfo.IPToStateList.push_back( - std::make_pair(BeginLabel, LandingPad->WinEHState)); - LastEHState = LandingPad->WinEHState; - LastLabel = LandingPad->EndLabels[P.RangeIndex]; + MCSymbol *StartLabel; + int BaseState; + if (FuncletStart == MF->begin()) { + BaseState = NullState; + StartLabel = Asm->getFunctionBegin(); + } else { + auto *FuncletPad = + cast<FuncletPadInst>(FuncletStart->getBasicBlock()->getFirstNonPHI()); + assert(FuncInfo.FuncletBaseStateMap.count(FuncletPad) != 0); + BaseState = FuncInfo.FuncletBaseStateMap.find(FuncletPad)->second; + StartLabel = getMCSymbolForMBB(Asm, &*FuncletStart); + } + assert(StartLabel && "need local function start label"); + IPToStateTable.push_back( + std::make_pair(create32bitRef(StartLabel), BaseState)); + + for (const auto &StateChange : InvokeStateChangeIterator::range( + FuncInfo, FuncletStart, FuncletEnd, BaseState)) { + // Compute the label to report as the start of this entry; use the EH + // start label for the invoke if we have one, otherwise (this is a call + // which may unwind to our caller and does not have an EH start label, so) + // use the previous end label. + const MCSymbol *ChangeLabel = StateChange.NewStartLabel; + if (!ChangeLabel) + ChangeLabel = StateChange.PreviousEndLabel; + // Emit an entry indicating that PCs after 'Label' have this EH state. + IPToStateTable.push_back( + std::make_pair(getLabelPlusOne(ChangeLabel), StateChange.NewState)); + // FIXME: assert that NewState is between CatchLow and CatchHigh. } } } @@ -566,15 +896,15 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo, // registration in order to recover the parent frame pointer. Now that we know // we've code generated the parent, we can emit the label assignment that // those helpers use to get the offset of the registration node. - assert(FuncInfo.EHRegNodeEscapeIndex != INT_MAX && - "no EH reg node localescape index"); + MCContext &Ctx = Asm->OutContext; MCSymbol *ParentFrameOffset = - Asm->OutContext.getOrCreateParentFrameOffsetSymbol(FLinkageName); - MCSymbol *RegistrationOffsetSym = Asm->OutContext.getOrCreateFrameAllocSymbol( - FLinkageName, FuncInfo.EHRegNodeEscapeIndex); - const MCExpr *RegistrationOffsetSymRef = - MCSymbolRefExpr::create(RegistrationOffsetSym, Asm->OutContext); - Asm->OutStreamer->EmitAssignment(ParentFrameOffset, RegistrationOffsetSymRef); + Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); + unsigned UnusedReg; + const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering(); + int64_t Offset = TFI->getFrameIndexReference( + *Asm->MF, FuncInfo.EHRegNodeFrameIndex, UnusedReg); + const MCExpr *MCOffset = MCConstantExpr::create(Offset, Ctx); + Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset); } /// Emit the language-specific data that _except_handler3 and 4 expect. This is @@ -585,7 +915,13 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) { const Function *F = MF->getFunction(); StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName()); - WinEHFuncInfo &FuncInfo = MMI->getWinEHFuncInfo(F); + bool VerboseAsm = OS.isVerboseAsm(); + auto AddComment = [&](const Twine &Comment) { + if (VerboseAsm) + OS.AddComment(Comment); + }; + + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName); // Emit the __ehtable label that we use for llvm.x86.seh.lsda. 
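The scope table emitted for _except_handler3/4 in the next hunk writes one ToState/filter/handler triple per state in SEHUnwindMap, remapping the -1 "unwind to caller" sentinel to the shifted base state when the GS cookie header is present. A toy rendering (the entries themselves are invented for illustration):

    #include <iostream>
    #include <vector>

    struct SEHEntry { int ToState; bool IsFinally; const char *FilterOrHandler; };

    int main() {
      const int BaseState = -2; // -1 when no GS cookie header is emitted
      const std::vector<SEHEntry> UnwindMap = {
          {-1, false, "_filter$0"}, {0, true, "_finally$1"}};
      for (const SEHEntry &UME : UnwindMap) {
        int ToState = UME.ToState == -1 ? BaseState : UME.ToState;
        std::cout << "ToState " << ToState << ", "
                  << (UME.IsFinally ? "finally " : "filter ")
                  << UME.FilterOrHandler << '\n';
      }
    }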
@@ -611,58 +947,291 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) { // // Only the EHCookieOffset field appears to vary, and it appears to be the // offset from the final saved SP value to the retaddr. + AddComment("GSCookieOffset"); OS.EmitIntValue(-2, 4); + AddComment("GSCookieXOROffset"); OS.EmitIntValue(0, 4); // FIXME: Calculate. + AddComment("EHCookieOffset"); OS.EmitIntValue(9999, 4); + AddComment("EHCookieXOROffset"); OS.EmitIntValue(0, 4); BaseState = -2; } - // Build a list of pointers to LandingPadInfos and then sort by WinEHState. - const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads(); - SmallVector<const LandingPadInfo *, 4> LPads; - LPads.reserve((PadInfos.size())); - for (const LandingPadInfo &LPInfo : PadInfos) - LPads.push_back(&LPInfo); - std::sort(LPads.begin(), LPads.end(), - [](const LandingPadInfo *L, const LandingPadInfo *R) { - return L->WinEHState < R->WinEHState; - }); - - // For each action in each lpad, emit one of these: - // struct ScopeTableEntry { - // int32_t EnclosingLevel; - // int32_t (__cdecl *Filter)(); - // void *HandlerOrFinally; - // }; - // - // The "outermost" action will use BaseState as its enclosing level. Each - // other action will refer to the previous state as its enclosing level. - int CurState = 0; - for (const LandingPadInfo *LPInfo : LPads) { - int EnclosingLevel = BaseState; - assert(CurState + int(LPInfo->SEHHandlers.size()) - 1 == - LPInfo->WinEHState && - "gaps in the SEH scope table"); - for (auto I = LPInfo->SEHHandlers.rbegin(), E = LPInfo->SEHHandlers.rend(); - I != E; ++I) { - const SEHHandler &Handler = *I; - const BlockAddress *BA = Handler.RecoverBA; - const Function *F = Handler.FilterOrFinally; - assert(F && "cannot catch all in 32-bit SEH without filter function"); - const MCExpr *FilterOrNull = - create32bitRef(BA ? Asm->getSymbol(F) : nullptr); - const MCExpr *ExceptOrFinally = create32bitRef( - BA ? Asm->GetBlockAddressSymbol(BA) : Asm->getSymbol(F)); - - OS.EmitIntValue(EnclosingLevel, 4); - OS.EmitValue(FilterOrNull, 4); - OS.EmitValue(ExceptOrFinally, 4); - - // The next state unwinds to this state. - EnclosingLevel = CurState; - CurState++; + assert(!FuncInfo.SEHUnwindMap.empty()); + for (const SEHUnwindMapEntry &UME : FuncInfo.SEHUnwindMap) { + auto *Handler = UME.Handler.get<MachineBasicBlock *>(); + const MCSymbol *ExceptOrFinally = + UME.IsFinally ? getMCSymbolForMBB(Asm, Handler) : Handler->getSymbol(); + // -1 is usually the base state for "unwind to caller", but for + // _except_handler4 it's -2. Do that replacement here if necessary. + int ToState = UME.ToState == -1 ? BaseState : UME.ToState; + AddComment("ToState"); + OS.EmitIntValue(ToState, 4); + AddComment(UME.IsFinally ? "Null" : "FilterFunction"); + OS.EmitValue(create32bitRef(UME.Filter), 4); + AddComment(UME.IsFinally ? 
"FinallyFunclet" : "ExceptionHandler"); + OS.EmitValue(create32bitRef(ExceptOrFinally), 4); + } +} + +static int getTryRank(const WinEHFuncInfo &FuncInfo, int State) { + int Rank = 0; + while (State != -1) { + ++Rank; + State = FuncInfo.ClrEHUnwindMap[State].TryParentState; + } + return Rank; +} + +static int getTryAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) { + int LeftRank = getTryRank(FuncInfo, Left); + int RightRank = getTryRank(FuncInfo, Right); + + while (LeftRank < RightRank) { + Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState; + --RightRank; + } + + while (RightRank < LeftRank) { + Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState; + --LeftRank; + } + + while (Left != Right) { + Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState; + Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState; + } + + return Left; +} + +void WinException::emitCLRExceptionTable(const MachineFunction *MF) { + // CLR EH "states" are really just IDs that identify handlers/funclets; + // states, handlers, and funclets all have 1:1 mappings between them, and a + // handler/funclet's "state" is its index in the ClrEHUnwindMap. + MCStreamer &OS = *Asm->OutStreamer; + const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); + MCSymbol *FuncBeginSym = Asm->getFunctionBegin(); + MCSymbol *FuncEndSym = Asm->getFunctionEnd(); + + // A ClrClause describes a protected region. + struct ClrClause { + const MCSymbol *StartLabel; // Start of protected region + const MCSymbol *EndLabel; // End of protected region + int State; // Index of handler protecting the protected region + int EnclosingState; // Index of funclet enclosing the protected region + }; + SmallVector<ClrClause, 8> Clauses; + + // Build a map from handler MBBs to their corresponding states (i.e. their + // indices in the ClrEHUnwindMap). + int NumStates = FuncInfo.ClrEHUnwindMap.size(); + assert(NumStates > 0 && "Don't need exception table!"); + DenseMap<const MachineBasicBlock *, int> HandlerStates; + for (int State = 0; State < NumStates; ++State) { + MachineBasicBlock *HandlerBlock = + FuncInfo.ClrEHUnwindMap[State].Handler.get<MachineBasicBlock *>(); + HandlerStates[HandlerBlock] = State; + // Use this loop through all handlers to verify our assumption (used in + // the MinEnclosingState computation) that enclosing funclets have lower + // state numbers than their enclosed funclets. + assert(FuncInfo.ClrEHUnwindMap[State].HandlerParentState < State && + "ill-formed state numbering"); + } + // Map the main function to the NullState. + HandlerStates[&MF->front()] = NullState; + + // Write out a sentinel indicating the end of the standard (Windows) xdata + // and the start of the additional (CLR) info. + OS.EmitIntValue(0xffffffff, 4); + // Write out the number of funclets + OS.EmitIntValue(NumStates, 4); + + // Walk the machine blocks/instrs, computing and emitting a few things: + // 1. Emit a list of the offsets to each handler entry, in lexical order. + // 2. Compute a map (EndSymbolMap) from each funclet to the symbol at its end. + // 3. Compute the list of ClrClauses, in the required order (inner before + // outer, earlier before later; the order by which a forward scan with + // early termination will find the innermost enclosing clause covering + // a given address). + // 4. A map (MinClauseMap) from each handler index to the index of the + // outermost funclet/function which contains a try clause targeting the + // key handler. This will be used to determine IsDuplicate-ness when + // emitting ClrClauses. 
The NullState value is used to indicate that the + // top-level function contains a try clause targeting the key handler. + // HandlerStack is a stack of (PendingStartLabel, PendingState) pairs for + // try regions we entered before entering the PendingState try but which + // we haven't yet exited. + SmallVector<std::pair<const MCSymbol *, int>, 4> HandlerStack; + // EndSymbolMap and MinClauseMap are maps described above. + std::unique_ptr<MCSymbol *[]> EndSymbolMap(new MCSymbol *[NumStates]); + SmallVector<int, 4> MinClauseMap((size_t)NumStates, NumStates); + + // Visit the root function and each funclet. + for (MachineFunction::const_iterator FuncletStart = MF->begin(), + FuncletEnd = MF->begin(), + End = MF->end(); + FuncletStart != End; FuncletStart = FuncletEnd) { + int FuncletState = HandlerStates[&*FuncletStart]; + // Find the end of the funclet + MCSymbol *EndSymbol = FuncEndSym; + while (++FuncletEnd != End) { + if (FuncletEnd->isEHFuncletEntry()) { + EndSymbol = getMCSymbolForMBB(Asm, &*FuncletEnd); + break; + } } + // Emit the function/funclet end and, if this is a funclet (and not the + // root function), record it in the EndSymbolMap. + OS.EmitValue(getOffset(EndSymbol, FuncBeginSym), 4); + if (FuncletState != NullState) { + // Record the end of the handler. + EndSymbolMap[FuncletState] = EndSymbol; + } + + // Walk the state changes in this function/funclet and compute its clauses. + // Funclets always start in the null state. + const MCSymbol *CurrentStartLabel = nullptr; + int CurrentState = NullState; + assert(HandlerStack.empty()); + for (const auto &StateChange : + InvokeStateChangeIterator::range(FuncInfo, FuncletStart, FuncletEnd)) { + // Close any try regions we're not still under + int StillPendingState = + getTryAncestor(FuncInfo, CurrentState, StateChange.NewState); + while (CurrentState != StillPendingState) { + assert(CurrentState != NullState && + "Failed to find still-pending state!"); + // Close the pending clause + Clauses.push_back({CurrentStartLabel, StateChange.PreviousEndLabel, + CurrentState, FuncletState}); + // Now the next-outer try region is current + CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].TryParentState; + // Pop the new start label from the handler stack if we've exited all + // inner try regions of the corresponding try region. + if (HandlerStack.back().second == CurrentState) + CurrentStartLabel = HandlerStack.pop_back_val().first; + } + + if (StateChange.NewState != CurrentState) { + // For each clause we're starting, update the MinClauseMap so we can + // know which is the topmost funclet containing a clause targeting + // it. + for (int EnteredState = StateChange.NewState; + EnteredState != CurrentState; + EnteredState = + FuncInfo.ClrEHUnwindMap[EnteredState].TryParentState) { + int &MinEnclosingState = MinClauseMap[EnteredState]; + if (FuncletState < MinEnclosingState) + MinEnclosingState = FuncletState; + } + // Save the previous current start/label on the stack and update to + // the newly-current start/state. + HandlerStack.emplace_back(CurrentStartLabel, CurrentState); + CurrentStartLabel = StateChange.NewStartLabel; + CurrentState = StateChange.NewState; + } + } + assert(HandlerStack.empty()); + } + + // Now emit the clause info, starting with the number of clauses. 
+ OS.EmitIntValue(Clauses.size(), 4); + for (ClrClause &Clause : Clauses) { + // Emit a CORINFO_EH_CLAUSE : + /* + struct CORINFO_EH_CLAUSE + { + CORINFO_EH_CLAUSE_FLAGS Flags; // actually a CorExceptionFlag + DWORD TryOffset; + DWORD TryLength; // actually TryEndOffset + DWORD HandlerOffset; + DWORD HandlerLength; // actually HandlerEndOffset + union + { + DWORD ClassToken; // use for catch clauses + DWORD FilterOffset; // use for filter clauses + }; + }; + + enum CORINFO_EH_CLAUSE_FLAGS + { + CORINFO_EH_CLAUSE_NONE = 0, + CORINFO_EH_CLAUSE_FILTER = 0x0001, // This clause is for a filter + CORINFO_EH_CLAUSE_FINALLY = 0x0002, // This clause is a finally clause + CORINFO_EH_CLAUSE_FAULT = 0x0004, // This clause is a fault clause + }; + typedef enum CorExceptionFlag + { + COR_ILEXCEPTION_CLAUSE_NONE, + COR_ILEXCEPTION_CLAUSE_FILTER = 0x0001, // This is a filter clause + COR_ILEXCEPTION_CLAUSE_FINALLY = 0x0002, // This is a finally clause + COR_ILEXCEPTION_CLAUSE_FAULT = 0x0004, // This is a fault clause + COR_ILEXCEPTION_CLAUSE_DUPLICATED = 0x0008, // duplicated clause. This + // clause was duplicated + // to a funclet which was + // pulled out of line + } CorExceptionFlag; + */ + // Add 1 to the start/end of the EH clause; the IP associated with a + // call when the runtime does its scan is the IP of the next instruction + // (the one to which control will return after the call), so we need + // to add 1 to the end of the clause to cover that offset. We also add + // 1 to the start of the clause to make sure that the ranges reported + // for all clauses are disjoint. Note that we'll need some additional + // logic when machine traps are supported, since in that case the IP + // that the runtime uses is the offset of the faulting instruction + // itself; if such an instruction immediately follows a call but the + // two belong to different clauses, we'll need to insert a nop between + // them so the runtime can distinguish the point to which the call will + // return from the point at which the fault occurs. + + const MCExpr *ClauseBegin = + getOffsetPlusOne(Clause.StartLabel, FuncBeginSym); + const MCExpr *ClauseEnd = getOffsetPlusOne(Clause.EndLabel, FuncBeginSym); + + const ClrEHUnwindMapEntry &Entry = FuncInfo.ClrEHUnwindMap[Clause.State]; + MachineBasicBlock *HandlerBlock = Entry.Handler.get<MachineBasicBlock *>(); + MCSymbol *BeginSym = getMCSymbolForMBB(Asm, HandlerBlock); + const MCExpr *HandlerBegin = getOffset(BeginSym, FuncBeginSym); + MCSymbol *EndSym = EndSymbolMap[Clause.State]; + const MCExpr *HandlerEnd = getOffset(EndSym, FuncBeginSym); + + uint32_t Flags = 0; + switch (Entry.HandlerType) { + case ClrHandlerType::Catch: + // Leaving bits 0-2 clear indicates catch. + break; + case ClrHandlerType::Filter: + Flags |= 1; + break; + case ClrHandlerType::Finally: + Flags |= 2; + break; + case ClrHandlerType::Fault: + Flags |= 4; + break; + } + if (Clause.EnclosingState != MinClauseMap[Clause.State]) { + // This is a "duplicate" clause; the handler needs to be entered from a + // frame above the one holding the invoke. 
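+ // In CorExceptionFlag terms this sets COR_ILEXCEPTION_CLAUSE_DUPLICATED
+ // (0x8). MinClauseMap[Clause.State] is the outermost funclet containing a
+ // try region that targets this handler, so if the clause being emitted
+ // lives in a strictly deeper funclet, the runtime can only reach the
+ // handler through an enclosing frame, and the duplicate marker tells it so.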
+ assert(Clause.EnclosingState > MinClauseMap[Clause.State]); + Flags |= 8; + } + OS.EmitIntValue(Flags, 4); + + // Write the clause start/end + OS.EmitValue(ClauseBegin, 4); + OS.EmitValue(ClauseEnd, 4); + + // Write out the handler start/end + OS.EmitValue(HandlerBegin, 4); + OS.EmitValue(HandlerEnd, 4); + + // Write out the type token or filter offset + assert(Entry.HandlerType != ClrHandlerType::Filter && "NYI: filters"); + OS.EmitIntValue(Entry.TypeToken, 4); } } diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h index 669c9cc..acb3010 100644 --- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h +++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h @@ -21,6 +21,7 @@ class Function; class GlobalValue; class MachineFunction; class MCExpr; +class Value; struct WinEHFuncInfo; class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { @@ -36,7 +37,14 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { /// True if this is a 64-bit target and we should use image relative offsets. bool useImageRel32 = false; - void emitCSpecificHandlerTable(); + /// Pointer to the current funclet entry BB. + const MachineBasicBlock *CurrentFuncletEntry = nullptr; + + void emitCSpecificHandlerTable(const MachineFunction *MF); + + void emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo, + const MCSymbol *BeginLabel, + const MCSymbol *EndLabel, int State); /// Emit the EH table data for 32-bit and 64-bit functions using /// the __CxxFrameHandler3 personality. @@ -47,8 +55,11 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { /// tables. void emitExceptHandlerTable(const MachineFunction *MF); - void extendIP2StateTable(const MachineFunction *MF, const Function *ParentF, - WinEHFuncInfo &FuncInfo); + void emitCLRExceptionTable(const MachineFunction *MF); + + void computeIP2StateTable( + const MachineFunction *MF, const WinEHFuncInfo &FuncInfo, + SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable); /// Emits the label used with llvm.x86.seh.recoverfp, which is used by /// outlined funclets. @@ -57,6 +68,16 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer { const MCExpr *create32bitRef(const MCSymbol *Value); const MCExpr *create32bitRef(const GlobalValue *GV); + const MCExpr *getLabelPlusOne(const MCSymbol *Label); + const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom); + const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf, + const MCSymbol *OffsetFrom); + + /// Gets the offset that we should use in a table for a stack object with the + /// given index. For targets using CFI (Win64, etc), this is relative to the + /// established SP at the end of the prologue. For targets without CFI (Win32 + /// only), it is relative to the frame pointer. + int getFrameIndexOffset(int FrameIndex, const WinEHFuncInfo &FuncInfo); public: //===--------------------------------------------------------------------===// @@ -74,6 +95,10 @@ public: /// Gather and emit post-function exception information. void endFunction(const MachineFunction *) override; + + /// \brief Emit target-specific EH funclet machinery. 
+ void beginFunclet(const MachineBasicBlock &MBB, MCSymbol *Sym) override;
+ void endFunclet() override;
};
}
diff --git a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 530ab46..d12fdb2 100644
--- a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -8,10 +8,14 @@
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
-// either (intrinsic-based) load-linked/store-conditional loops or AtomicCmpXchg.
+// target-specific instructions that implement the same semantics in a way
+// that better fits the target backend. This can include the use of either
+// (intrinsic-based) load-linked/store-conditional loops, AtomicCmpXchg, or
+// type coercions.
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -20,6 +24,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -44,13 +49,17 @@ namespace {
private:
bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
bool IsStore, bool IsLoad);
- bool expandAtomicLoad(LoadInst *LI);
+ IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
+ LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
+ bool tryExpandAtomicLoad(LoadInst *LI);
bool expandAtomicLoadToLL(LoadInst *LI);
bool expandAtomicLoadToCmpXchg(LoadInst *LI);
+ StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
bool expandAtomicStore(StoreInst *SI);
bool tryExpandAtomicRMW(AtomicRMWInst *AI);
- bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
- bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI);
+ bool expandAtomicOpToLLSC(
+ Instruction *I, Value *Addr, AtomicOrdering MemOpOrder,
+ std::function<Value *(IRBuilder<> &, Value *)> PerformOp);
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
bool isIdempotentRMW(AtomicRMWInst *AI);
bool simplifyIdempotentRMW(AtomicRMWInst *AI);
@@ -108,7 +117,7 @@ bool AtomicExpand::runOnFunction(Function &F) {
FenceOrdering = RMWI->getOrdering();
RMWI->setOrdering(Monotonic);
IsStore = IsLoad = true;
- } else if (CASI && !TLI->hasLoadLinkedStoreConditional() &&
+ } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
(isAtLeastRelease(CASI->getSuccessOrdering()) ||
isAtLeastAcquire(CASI->getSuccessOrdering()))) {
// If a compare and swap is lowered to LL/SC, we can do smarter fence
@@ -126,10 +135,28 @@
}
}
- if (LI && TLI->shouldExpandAtomicLoadInIR(LI)) {
- MadeChange |= expandAtomicLoad(LI);
- } else if (SI && TLI->shouldExpandAtomicStoreInIR(SI)) {
- MadeChange |= expandAtomicStore(SI);
+ if (LI) {
+ if (LI->getType()->isFloatingPointTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ LI = convertAtomicLoadToIntegerType(LI);
+ assert(LI->getType()->isIntegerTy() && "invariant broken");
+ MadeChange = true;
+ }
+
+ MadeChange |= tryExpandAtomicLoad(LI);
+ } else if (SI) {
+ if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ SI = convertAtomicStoreToIntegerType(SI);
+ assert(SI->getValueOperand()->getType()->isIntegerTy() &&
+ "invariant broken");
+ MadeChange = true;
+ }
+
+ if (TLI->shouldExpandAtomicStoreInIR(SI))
+ MadeChange |= expandAtomicStore(SI);
} else if (RMWI) {
// There are two different ways of expanding RMW instructions:
// - into a load if it is idempotent
@@ -141,7 +168,7 @@
} else {
MadeChange |= tryExpandAtomicRMW(RMWI);
}
- } else if (CASI && TLI->hasLoadLinkedStoreConditional()) {
+ } else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI)) {
MadeChange |= expandAtomicCmpXchg(CASI);
}
}
@@ -169,11 +196,56 @@ bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
return (LeadingFence || TrailingFence);
}
-bool AtomicExpand::expandAtomicLoad(LoadInst *LI) {
- if (TLI->hasLoadLinkedStoreConditional())
+/// Get the iX type with the same bitwidth as T.
+IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
+ const DataLayout &DL) {
+ EVT VT = TLI->getValueType(DL, T);
+ unsigned BitWidth = VT.getStoreSizeInBits();
+ assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
+ return IntegerType::get(T->getContext(), BitWidth);
+}
+
+/// Convert an atomic load of a non-integral type to an integer load of the
+/// equivalent bitwidth. See the function comment on
+/// convertAtomicStoreToIntegerType for background.
+LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
+ auto *M = LI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(LI->getType(),
+ M->getDataLayout());
+
+ IRBuilder<> Builder(LI);
+
+ Value *Addr = LI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ auto *NewLI = Builder.CreateLoad(NewAddr);
+ NewLI->setAlignment(LI->getAlignment());
+ NewLI->setVolatile(LI->isVolatile());
+ NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
+ DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
+
+ Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+ LI->replaceAllUsesWith(NewVal);
+ LI->eraseFromParent();
+ return NewLI;
+}
+
+bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
+ switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::LLSC:
+ return expandAtomicOpToLLSC(
+ LI, LI->getPointerOperand(), LI->getOrdering(),
+ [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
+ case TargetLoweringBase::AtomicExpansionKind::LLOnly:
return expandAtomicLoadToLL(LI);
- else
+ case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
return expandAtomicLoadToCmpXchg(LI);
+ }
+ llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}
bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
@@ -184,6 +256,7 @@ bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
// to be single-copy atomic by ARM is an ldrexd (A3.5.3).
Value *Val = TLI->emitLoadLinked(Builder, LI->getPointerOperand(),
LI->getOrdering());
+ TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
LI->replaceAllUsesWith(Val);
LI->eraseFromParent();
@@ -209,6 +282,35 @@ bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
return true;
}
+/// Convert an atomic store of a non-integral type to an integer store of the
+/// equivalent bitwidth. We used to not support floating point or vector
+/// atomics in the IR at all. The backends learned to deal with the bitcast
+/// idiom because that was the only way of expressing the notion of an atomic
+/// float or vector store. The long term plan is to teach each backend to
+/// instruction select from the original atomic store, but as a migration
+/// mechanism, we convert back to the old format which the backends understand.
+/// Each backend will need individual work to recognize the new format.
+StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
+ IRBuilder<> Builder(SI);
+ auto *M = SI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
+ M->getDataLayout());
+ Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
+
+ Value *Addr = SI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
+ NewSI->setAlignment(SI->getAlignment());
+ NewSI->setVolatile(SI->isVolatile());
+ NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
+ DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
+ SI->eraseFromParent();
+ return NewSI;
+}
+
bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
// This function is only called on atomic stores that are too large to be
// atomic if implemented as a native store.
So we replace them by an @@ -226,23 +328,15 @@ bool AtomicExpand::expandAtomicStore(StoreInst *SI) { return tryExpandAtomicRMW(AI); } -bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { - case TargetLoweringBase::AtomicRMWExpansionKind::None: - return false; - case TargetLoweringBase::AtomicRMWExpansionKind::LLSC: { - assert(TLI->hasLoadLinkedStoreConditional() && - "TargetLowering requested we expand AtomicRMW instruction into " - "load-linked/store-conditional combos, but such instructions aren't " - "supported"); - - return expandAtomicRMWToLLSC(AI); - } - case TargetLoweringBase::AtomicRMWExpansionKind::CmpXChg: { - return expandAtomicRMWToCmpXchg(AI); - } - } - llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); +static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr, + Value *Loaded, Value *NewVal, + AtomicOrdering MemOpOrder, + Value *&Success, Value *&NewLoaded) { + Value* Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, MemOpOrder, + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + Success = Builder.CreateExtractValue(Pair, 1, "success"); + NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); } /// Emit IR to implement the given atomicrmw operation on values in registers, @@ -282,10 +376,28 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } } -bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { - AtomicOrdering MemOpOrder = AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); +bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { + switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + case TargetLoweringBase::AtomicExpansionKind::None: + return false; + case TargetLoweringBase::AtomicExpansionKind::LLSC: + return expandAtomicOpToLLSC(AI, AI->getPointerOperand(), AI->getOrdering(), + [&](IRBuilder<> &Builder, Value *Loaded) { + return performAtomicOp(AI->getOperation(), + Builder, Loaded, + AI->getValOperand()); + }); + case TargetLoweringBase::AtomicExpansionKind::CmpXChg: + return expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); + default: + llvm_unreachable("Unhandled case in tryExpandAtomicRMW"); + } +} + +bool AtomicExpand::expandAtomicOpToLLSC( + Instruction *I, Value *Addr, AtomicOrdering MemOpOrder, + std::function<Value *(IRBuilder<> &, Value *)> PerformOp) { + BasicBlock *BB = I->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); @@ -303,11 +415,11 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { // atomicrmw.end: // fence? // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *ExitBB = BB->splitBasicBlock(I->getIterator(), "atomicrmw.end"); BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); + // This grabs the DebugLoc from I. + IRBuilder<> Builder(I); // The split call above "helpfully" added a branch at the end of BB (to the // wrong place), but we might want a fence too. 
It's easiest to just remove @@ -320,8 +432,7 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(LoopBB); Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + Value *NewVal = PerformOp(Builder, Loaded); Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); @@ -331,72 +442,8 @@ bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - AI->replaceAllUsesWith(Loaded); - AI->eraseFromParent(); - - return true; -} - -bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) { - AtomicOrdering MemOpOrder = - AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] - // %init_loaded = load atomic iN* %addr - // br label %loop - // loop: - // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] - // %new = some_op iN %loaded, %incr - // %pair = cmpxchg iN* %addr, iN %loaded, iN %new - // %new_loaded = extractvalue { iN, i1 } %pair, 0 - // %success = extractvalue { iN, i1 } %pair, 1 - // br i1 %success, label %atomicrmw.end, label %loop - // atomicrmw.end: - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we want a load. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - LoadInst *InitLoaded = Builder.CreateLoad(Addr); - // Atomics require at least natural alignment. - InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. - Builder.SetInsertPoint(LoopBB); - PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); - Loaded->addIncoming(InitLoaded, BB); - - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); - - Value *Pair = Builder.CreateAtomicCmpXchg( - Addr, Loaded, NewVal, MemOpOrder, - AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); - Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); - Loaded->addIncoming(NewLoaded, LoopBB); - - Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); - Builder.CreateCondBr(Success, ExitBB, LoopBB); - - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - - AI->replaceAllUsesWith(NewLoaded); - AI->eraseFromParent(); + I->replaceAllUsesWith(Loaded); + I->eraseFromParent(); return true; } @@ -424,7 +471,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // %loaded = @load.linked(%addr) // %should_store = icmp eq %loaded, %desired // br i1 %should_store, label %cmpxchg.trystore, - // label %cmpxchg.failure + // label %cmpxchg.nostore // cmpxchg.trystore: // %stored = @store_conditional(%new, %addr) // %success = icmp eq i32 %stored, 0 @@ -432,6 +479,9 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // cmpxchg.success: // fence? 
// br label %cmpxchg.end + // cmpxchg.nostore: + // @load_linked_fail_balance()? + // br label %cmpxchg.failure // cmpxchg.failure: // fence? // br label %cmpxchg.end @@ -440,9 +490,10 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0 // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1 // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end"); + BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end"); auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB); - auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB); + auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB); + auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB); auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB); auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB); @@ -466,7 +517,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // If the cmpxchg doesn't actually need any ordering when it fails, we can // jump straight past that fence instruction (if it exists). - Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); + Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB); Builder.SetInsertPoint(TryStoreBB); Value *StoreSuccess = TLI->emitStoreConditional( @@ -482,6 +533,13 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { /*IsLoad=*/true); Builder.CreateBr(ExitBB); + Builder.SetInsertPoint(NoStoreBB); + // In the failing case, where we don't execute the store-conditional, the + // target might want to balance out the load-linked with a dedicated + // instruction (e.g., on ARM, clearing the exclusive monitor). + TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder); + Builder.CreateBr(FailureBB); + Builder.SetInsertPoint(FailureBB); TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, /*IsLoad=*/true); @@ -556,9 +614,77 @@ bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { - if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad)) - expandAtomicLoad(ResultingLoad); + tryExpandAtomicLoad(ResultingLoad); return true; } return false; } + +bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, + CreateCmpXchgInstFun CreateCmpXchg) { + assert(AI); + + AtomicOrdering MemOpOrder = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI->getIterator(), "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. 
+ IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + // Atomics require at least natural alignment. + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *NewLoaded = nullptr; + Value *Success = nullptr; + + CreateCmpXchg(Builder, Addr, Loaded, NewVal, MemOpOrder, + Success, NewLoaded); + assert(Success && NewLoaded); + + Loaded->addIncoming(NewLoaded, LoopBB); + + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + + AI->replaceAllUsesWith(NewLoaded); + AI->eraseFromParent(); + + return true; +} diff --git a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp index db00910..a67e194 100644 --- a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -33,6 +33,6 @@ cl::opt<unsigned> cl::desc("Threshold for partial unrolling"), cl::Hidden); -BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, Function &F) +BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp index 6182667..df5cac5 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp +++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp @@ -12,7 +12,8 @@ // it then removes. // // Note that this pass must be run after register allocation, it cannot handle -// SSA form. +// SSA form. It also must handle virtual registers for targets that emit virtual +// ISA (e.g. NVPTX). // //===----------------------------------------------------------------------===// @@ -20,6 +21,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -95,7 +97,7 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { // TailMerge can create jump into if branches that make CFG irreducible for // HW that requires structurized CFG. bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && - PassConfig->getEnableTailMerge(); + PassConfig->getEnableTailMerge(); BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, getAnalysis<MachineBlockFrequencyInfo>(), getAnalysis<MachineBranchProbabilityInfo>()); @@ -132,6 +134,7 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) { // Remove the block. 
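// The funclet-membership map is keyed by MachineBasicBlock pointers, so the
// erased block's entry has to be dropped as well or later lookups would see
// a dangling key (hence the FuncletMembership.erase added just below).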
MF->erase(MBB); + FuncletMembership.erase(MBB); } /// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def @@ -150,9 +153,13 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) { if (!I->isImplicitDef()) break; unsigned Reg = I->getOperand(0).getReg(); - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - ImpDefRegs.insert(*SubRegs); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + ImpDefRegs.insert(*SubRegs); + } else { + ImpDefRegs.insert(Reg); + } ++I; } if (ImpDefRegs.empty()) @@ -163,8 +170,7 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) { if (!TII->isUnpredicatedTerminator(I)) return false; // See if it uses any of the implicitly defined registers. - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &MO = I->getOperand(i); + for (const MachineOperand &MO : I->operands()) { if (!MO.isReg() || !MO.isUse()) continue; unsigned Reg = MO.getReg(); @@ -208,14 +214,17 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // Fix CFG. The later algorithms expect it to be right. bool MadeChange = false; - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; I++) { - MachineBasicBlock *MBB = I, *TBB = nullptr, *FBB = nullptr; + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; - if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true)) - MadeChange |= MBB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty()); - MadeChange |= OptimizeImpDefsBlock(MBB); + if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond, true)) + MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty()); + MadeChange |= OptimizeImpDefsBlock(&MBB); } + // Recalculate funclet membership. + FuncletMembership = getFuncletMembership(MF); + bool MadeChangeThisIteration = true; while (MadeChangeThisIteration) { MadeChangeThisIteration = TailMergeBlocks(MF); @@ -235,12 +244,9 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // Walk the function to find jump tables that are live. BitVector JTIsLive(JTI->getJumpTables().size()); - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); - BB != E; ++BB) { - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ++I) - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) { - MachineOperand &Op = I->getOperand(op); + for (const MachineBasicBlock &BB : MF) { + for (const MachineInstr &I : BB) + for (const MachineOperand &Op : I.operands()) { if (!Op.isJTI()) continue; // Remember that this JT is live. @@ -365,7 +371,7 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, } // Back past possible debugging pseudos at beginning of block. This matters // when one block differs from the other only by whether debugging pseudos - // are present at the beginning. (This way, the various checks later for + // are present at the beginning. (This way, the various checks later for // I1==MBB1->begin() work as expected.) if (I1 == MBB1->begin() && I2 != MBB2->begin()) { --I2; @@ -426,7 +432,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineFunction &MF = *CurMBB.getParent(); // Create the fall-through block. 
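// (Note the pattern in the pair of lines below: the implicit
// MachineBasicBlock-to-iterator conversion is gone from the ilist API in
// this import, so code now asks the block for its iterator explicitly.)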
- MachineFunction::iterator MBBI = &CurMBB; + MachineFunction::iterator MBBI = CurMBB.getIterator(); MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(BB); CurMBB.getParent()->insert(++MBBI, NewMBB); @@ -445,6 +451,11 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, // For targets that use the register scavenger, we must maintain LiveIns. MaintainLiveIns(&CurMBB, NewMBB); + // Add the new block to the funclet. + const auto &FuncletI = FuncletMembership.find(&CurMBB); + if (FuncletI != FuncletMembership.end()) + FuncletMembership[NewMBB] = FuncletI->second; + return NewMBB; } @@ -479,7 +490,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, DebugLoc dl; // FIXME: this is nowhere if (I != MF->end() && !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) { - MachineBasicBlock *NextBB = I; + MachineBasicBlock *NextBB = &*I; if (TBB == NextBB && !Cond.empty() && !FBB) { if (!TII->ReverseBranchCondition(Cond)) { TII->RemoveBranch(*CurMBB); @@ -549,14 +560,23 @@ static unsigned CountTerminators(MachineBasicBlock *MBB, /// and decide if it would be profitable to merge those tails. Return the /// length of the common tail and iterators to the first common instruction /// in each block. -static bool ProfitableToMerge(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2, - unsigned minCommonTailLength, - unsigned &CommonTailLen, - MachineBasicBlock::iterator &I1, - MachineBasicBlock::iterator &I2, - MachineBasicBlock *SuccBB, - MachineBasicBlock *PredBB) { +static bool +ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, + unsigned minCommonTailLength, unsigned &CommonTailLen, + MachineBasicBlock::iterator &I1, + MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, + MachineBasicBlock *PredBB, + DenseMap<const MachineBasicBlock *, int> &FuncletMembership) { + // It is never profitable to tail-merge blocks from two different funclets. + if (!FuncletMembership.empty()) { + auto Funclet1 = FuncletMembership.find(MBB1); + assert(Funclet1 != FuncletMembership.end()); + auto Funclet2 = FuncletMembership.find(MBB2); + assert(Funclet2 != FuncletMembership.end()); + if (Funclet1->second != Funclet2->second) + return false; + } + CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2); if (CommonTailLen == 0) return false; @@ -600,12 +620,8 @@ static bool ProfitableToMerge(MachineBasicBlock *MBB1, // branch instruction, which is likely to be smaller than the 2 // instructions that would be deleted in the merge. 
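// optForSize() in the rewritten check below is slightly broader than the
// old explicit attribute test: it also returns true for minsize functions.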
MachineFunction *MF = MBB1->getParent(); - if (EffectiveTailLen >= 2 && - MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - (I1 == MBB1->begin() || I2 == MBB2->begin())) - return true; - - return false; + return EffectiveTailLen >= 2 && MF->getFunction()->optForSize() && + (I1 == MBB1->begin() || I2 == MBB2->begin()); } /// ComputeSameTails - Look through all the blocks in MergePotentials that have @@ -634,7 +650,8 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash, if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(), minCommonTailLength, CommonTailLen, TrialBBI1, TrialBBI2, - SuccBB, PredBB)) { + SuccBB, PredBB, + FuncletMembership)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; @@ -727,18 +744,6 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, return true; } -static bool hasIdenticalMMOs(const MachineInstr *MI1, const MachineInstr *MI2) { - auto I1 = MI1->memoperands_begin(), E1 = MI1->memoperands_end(); - auto I2 = MI2->memoperands_begin(), E2 = MI2->memoperands_end(); - if ((E1 - I1) != (E2 - I2)) - return false; - for (; I1 != E1; ++I1, ++I2) { - if (**I1 != **I2) - return false; - } - return true; -} - static void removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, MachineBasicBlock &MBBCommon) { @@ -775,8 +780,7 @@ removeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos, assert(MBBICommon->isIdenticalTo(&*MBBI) && "Expected matching MIIs!"); if (MBBICommon->mayLoad() || MBBICommon->mayStore()) - if (!hasIdenticalMMOs(&*MBBI, &*MBBICommon)) - MBBICommon->clearMemRefs(); + MBBICommon->setMemRefs(MBBICommon->mergeMemRefsWith(*MBBI)); ++MBBI; ++MBBICommon; @@ -840,8 +844,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, // block, which we can't jump to), we can treat all blocks with this same // tail at once. Use PredBB if that is one of the possibilities, as that // will not introduce any extra branches. - MachineBasicBlock *EntryBB = MergePotentials.begin()->getBlock()-> - getParent()->begin(); + MachineBasicBlock *EntryBB = + &MergePotentials.front().getBlock()->getParent()->front(); unsigned commonTailIndex = SameTails.size(); // If there are two blocks, check to see if one can be made to fall through // into the other. @@ -917,12 +921,11 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // First find blocks with no successors. 
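// Each candidate is keyed by HashEndOfMBB, so only blocks whose final
// instructions hash alike are ever compared in detail later.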
MergePotentials.clear(); - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); - I != E && MergePotentials.size() < TailMergeThreshold; ++I) { - if (TriedMerging.count(I)) - continue; - if (I->succ_empty()) - MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(I), I)); + for (MachineBasicBlock &MBB : MF) { + if (MergePotentials.size() == TailMergeThreshold) + break; + if (!TriedMerging.count(&MBB) && MBB.succ_empty()) + MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(&MBB), &MBB)); } // If this is a large problem, avoid visiting the same basic blocks @@ -958,13 +961,13 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { I != E; ++I) { if (I->pred_size() < 2) continue; SmallPtrSet<MachineBasicBlock *, 8> UniquePreds; - MachineBasicBlock *IBB = I; - MachineBasicBlock *PredBB = std::prev(I); + MachineBasicBlock *IBB = &*I; + MachineBasicBlock *PredBB = &*std::prev(I); MergePotentials.clear(); - for (MachineBasicBlock::pred_iterator P = I->pred_begin(), - E2 = I->pred_end(); - P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) { - MachineBasicBlock *PBB = *P; + for (MachineBasicBlock *PBB : I->predecessors()) { + if (MergePotentials.size() == TailMergeThreshold) + break; + if (TriedMerging.count(PBB)) continue; @@ -977,7 +980,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { continue; // Skip blocks which may jump to a landing pad. Can't tail merge these. - if (PBB->getLandingPadSuccessor()) + if (PBB->hasEHPadSuccessor()) continue; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; @@ -990,18 +993,21 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (TII->ReverseBranchCondition(NewCond)) continue; // This is the QBB case described above - if (!FBB) - FBB = std::next(MachineFunction::iterator(PBB)); + if (!FBB) { + auto Next = ++PBB->getIterator(); + if (Next != MF.end()) + FBB = &*Next; + } } // Failing case: the only way IBB can be reached from PBB is via // exception handling. Happens for landing pads. Would be nice to have // a bit in the edge so we didn't have to do all this. - if (IBB->isLandingPad()) { - MachineFunction::iterator IP = PBB; IP++; + if (IBB->isEHPad()) { + MachineFunction::iterator IP = ++PBB->getIterator(); MachineBasicBlock *PredNextBB = nullptr; if (IP != MF.end()) - PredNextBB = IP; + PredNextBB = &*IP; if (!TBB) { if (IBB != PredNextBB) // fallthrough continue; @@ -1027,7 +1033,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { NewCond, dl); } - MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); + MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), PBB)); } } @@ -1042,7 +1048,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // Reinsert an unconditional branch if needed. The 1 below can occur as a // result of removing blocks in TryTailMergeBlocks. 
- PredBB = std::prev(I); // this may have been changed in TryTailMergeBlocks + PredBB = &*std::prev(I); // this may have been changed in TryTailMergeBlocks if (MergePotentials.size() == 1 && MergePotentials.begin()->getBlock() != PredBB) FixTail(MergePotentials.begin()->getBlock(), IBB, TII); @@ -1080,13 +1086,19 @@ void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) { if (TailMBB.succ_size() <= 1) return; - auto MaxEdgeFreq = *std::max_element(EdgeFreqLs.begin(), EdgeFreqLs.end()); - uint64_t Scale = MaxEdgeFreq.getFrequency() / UINT32_MAX + 1; + auto SumEdgeFreq = + std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0)) + .getFrequency(); auto EdgeFreq = EdgeFreqLs.begin(); - for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); - SuccI != SuccE; ++SuccI, ++EdgeFreq) - TailMBB.setSuccWeight(SuccI, EdgeFreq->getFrequency() / Scale); + if (SumEdgeFreq > 0) { + for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end(); + SuccI != SuccE; ++SuccI, ++EdgeFreq) { + auto Prob = BranchProbability::getBranchProbability( + EdgeFreq->getFrequency(), SumEdgeFreq); + TailMBB.setSuccProbability(SuccI, Prob); + } + } } //===----------------------------------------------------------------------===// @@ -1098,10 +1110,12 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) { // Make sure blocks are numbered in order MF.RenumberBlocks(); + // Renumbering blocks alters funclet membership, recalculate it. + FuncletMembership = getFuncletMembership(MF); for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end(); I != E; ) { - MachineBasicBlock *MBB = I++; + MachineBasicBlock *MBB = &*I++; MadeChange |= OptimizeBlock(MBB); // If it is dead, remove it. @@ -1111,6 +1125,7 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) { ++NumDeadBlocks; } } + return MadeChange; } @@ -1167,20 +1182,31 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { MachineFunction &MF = *MBB->getParent(); ReoptimizeBlock: - MachineFunction::iterator FallThrough = MBB; + MachineFunction::iterator FallThrough = MBB->getIterator(); ++FallThrough; + // Make sure MBB and FallThrough belong to the same funclet. + bool SameFunclet = true; + if (!FuncletMembership.empty() && FallThrough != MF.end()) { + auto MBBFunclet = FuncletMembership.find(MBB); + assert(MBBFunclet != FuncletMembership.end()); + auto FallThroughFunclet = FuncletMembership.find(&*FallThrough); + assert(FallThroughFunclet != FuncletMembership.end()); + SameFunclet = MBBFunclet->second == FallThroughFunclet->second; + } + // If this block is empty, make everyone use its fall-through, not the block // explicitly. Landing pads should not do this since the landing-pad table // points to this block. Blocks with their addresses taken shouldn't be // optimized away. - if (IsEmptyBlock(MBB) && !MBB->isLandingPad() && !MBB->hasAddressTaken()) { + if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() && + SameFunclet) { // Dead block? Leave for cleanup later. if (MBB->pred_empty()) return MadeChange; if (FallThrough == MF.end()) { // TODO: Simplify preds to not branch here if possible! - } else if (FallThrough->isLandingPad()) { + } else if (FallThrough->isEHPad()) { // Don't rewrite to a landing pad fallthough. That could lead to the case // where a BB jumps to more than one landing pad. // TODO: Is it ever worth rewriting predecessors which don't already @@ -1190,12 +1216,12 @@ ReoptimizeBlock: // instead. 
while (!MBB->pred_empty()) { MachineBasicBlock *Pred = *(MBB->pred_end()-1); - Pred->ReplaceUsesOfBlockWith(MBB, FallThrough); + Pred->ReplaceUsesOfBlockWith(MBB, &*FallThrough); } // If MBB was the target of a jump table, update jump tables to go to the // fallthrough instead. if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo()) - MJTI->ReplaceMBBInJumpTables(MBB, FallThrough); + MJTI->ReplaceMBBInJumpTables(MBB, &*FallThrough); MadeChange = true; } return MadeChange; @@ -1237,7 +1263,7 @@ ReoptimizeBlock: // AnalyzeBranch. if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 && PrevBB.succ_size() == 1 && - !MBB->hasAddressTaken() && !MBB->isLandingPad()) { + !MBB->hasAddressTaken() && !MBB->isEHPad()) { DEBUG(dbgs() << "\nMerging into block: " << PrevBB << "From MBB: " << *MBB); // Remove redundant DBG_VALUEs first. @@ -1333,7 +1359,7 @@ ReoptimizeBlock: TII->InsertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl); // Move this block to the end of the function. - MBB->moveAfter(--MF.end()); + MBB->moveAfter(&MF.back()); MadeChange = true; ++NumBranchOpts; return MadeChange; @@ -1371,7 +1397,7 @@ ReoptimizeBlock: // other blocks across it. if (CurTBB && CurCond.empty() && !CurFBB && IsBranchOnlyBlock(MBB) && CurTBB != MBB && - !MBB->hasAddressTaken()) { + !MBB->hasAddressTaken() && !MBB->isEHPad()) { DebugLoc dl = getBranchDebugLoc(*MBB); // This block may contain just an unconditional branch. Because there can // be 'non-branch terminators' in the block, try removing the branch and @@ -1468,14 +1494,11 @@ ReoptimizeBlock: // see if it has a fall-through into its successor. bool CurFallsThru = MBB->canFallThrough(); - if (!MBB->isLandingPad()) { + if (!MBB->isEHPad()) { // Check all the predecessors of this block. If one of them has no fall // throughs, move this block right after it. - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - E = MBB->pred_end(); PI != E; ++PI) { + for (MachineBasicBlock *PredBB : MBB->predecessors()) { // Analyze the branch at the end of the pred. - MachineBasicBlock *PredBB = *PI; - MachineFunction::iterator PredFallthrough = PredBB; ++PredFallthrough; MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector<MachineOperand, 4> PredCond; if (PredBB != MBB && !PredBB->canFallThrough() && @@ -1493,8 +1516,7 @@ ReoptimizeBlock: // B elsewhere // next: if (CurFallsThru) { - MachineBasicBlock *NextBB = - std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*std::next(MBB->getIterator()); CurCond.clear(); TII->InsertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc()); } @@ -1507,11 +1529,9 @@ ReoptimizeBlock: if (!CurFallsThru) { // Check all successors to see if we can move this block before it. - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - E = MBB->succ_end(); SI != E; ++SI) { + for (MachineBasicBlock *SuccBB : MBB->successors()) { // Analyze the branch at the end of the block before the succ. - MachineBasicBlock *SuccBB = *SI; - MachineFunction::iterator SuccPrev = SuccBB; --SuccPrev; + MachineFunction::iterator SuccPrev = --SuccBB->getIterator(); // If this block doesn't already fall-through to that successor, and if // the succ doesn't already have a block that can fall through into it, @@ -1519,7 +1539,7 @@ ReoptimizeBlock: // fallthrough to happen. 
if (SuccBB != MBB && &*SuccPrev != MBB && !SuccPrev->canFallThrough() && !CurUnAnalyzable && - !SuccBB->isLandingPad()) { + !SuccBB->isEHPad()) { MBB->moveBefore(SuccBB); MadeChange = true; goto ReoptimizeBlock; @@ -1531,10 +1551,18 @@ ReoptimizeBlock: // removed, move this block to the end of the function. MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr; SmallVector<MachineOperand, 4> PrevCond; + // We're looking for cases where PrevBB could possibly fall through to + // FallThrough, but if FallThrough is an EH pad that wouldn't be useful + // so here we skip over any EH pads so we might have a chance to find + // a branch target from PrevBB. + while (FallThrough != MF.end() && FallThrough->isEHPad()) + ++FallThrough; + // Now check to see if the current block is sitting between PrevBB and + // a block to which it could fall through. if (FallThrough != MF.end() && !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) && - PrevBB.isSuccessor(FallThrough)) { - MBB->moveAfter(--MF.end()); + PrevBB.isSuccessor(&*FallThrough)) { + MBB->moveAfter(&MF.back()); MadeChange = true; return MadeChange; } @@ -1553,7 +1581,7 @@ ReoptimizeBlock: bool BranchFolder::HoistCommonCode(MachineFunction &MF) { bool MadeChange = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) { - MachineBasicBlock *MBB = I++; + MachineBasicBlock *MBB = &*I++; MadeChange |= HoistCommonCodeInSuccs(MBB); } @@ -1564,15 +1592,23 @@ bool BranchFolder::HoistCommonCode(MachineFunction &MF) { /// its 'true' successor. static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, MachineBasicBlock *TrueBB) { - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), - E = BB->succ_end(); SI != E; ++SI) { - MachineBasicBlock *SuccBB = *SI; + for (MachineBasicBlock *SuccBB : BB->successors()) if (SuccBB != TrueBB) return SuccBB; - } return nullptr; } +template <class Container> +static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI, + Container &Set) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Set.insert(*AI); + } else { + Set.insert(Reg); + } +} + /// findHoistingInsertPosAndDeps - Find the location to move common instructions /// in successors to. The location is usually just before the terminator, /// however if the terminator is a conditional branch and its previous @@ -1590,16 +1626,14 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, if (!TII->isUnpredicatedTerminator(Loc)) return MBB->end(); - for (unsigned i = 0, e = Loc->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = Loc->getOperand(i); + for (const MachineOperand &MO : Loc->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; if (MO.isUse()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Uses.insert(*AI); + addRegAndItsAliases(Reg, TRI, Uses); } else { if (!MO.isDead()) // Don't try to hoist code in the rare case the terminator defines a @@ -1608,8 +1642,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, // If the terminator defines a register, make sure we don't hoist // the instruction whose def might be clobbered by the terminator. 
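// addRegAndItsAliases (defined above) handles both register kinds:
// physical registers expand to all of their aliases, while virtual
// registers, which have no alias sets, are inserted directly.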
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Defs.insert(*AI); + addRegAndItsAliases(Reg, TRI, Defs); } } @@ -1626,8 +1659,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, --PI; bool IsDef = false; - for (unsigned i = 0, e = PI->getNumOperands(); !IsDef && i != e; ++i) { - const MachineOperand &MO = PI->getOperand(i); + for (const MachineOperand &MO : PI->operands()) { // If PI has a regmask operand, it is probably a call. Separate away. if (MO.isRegMask()) return Loc; @@ -1636,8 +1668,10 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, unsigned Reg = MO.getReg(); if (!Reg) continue; - if (Uses.count(Reg)) + if (Uses.count(Reg)) { IsDef = true; + break; + } } if (!IsDef) // The condition setting instruction is not just before the conditional @@ -1657,23 +1691,22 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, // Find out what registers are live. Note this routine is ignoring other live // registers which are only used by instructions in successor blocks. - for (unsigned i = 0, e = PI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = PI->getOperand(i); + for (const MachineOperand &MO : PI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; if (MO.isUse()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Uses.insert(*AI); + addRegAndItsAliases(Reg, TRI, Uses); } else { if (Uses.erase(Reg)) { - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) - Uses.erase(*SubRegs); // Use sub-registers to be conservative + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) + Uses.erase(*SubRegs); // Use sub-registers to be conservative + } } - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - Defs.insert(*AI); + addRegAndItsAliases(Reg, TRI, Defs); } } @@ -1737,8 +1770,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { break; bool IsSafe = true; - for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { - MachineOperand &MO = TIB->getOperand(i); + for (MachineOperand &MO : TIB->operands()) { // Don't attempt to hoist instructions with register masks. if (MO.isRegMask()) { IsSafe = false; @@ -1793,28 +1825,29 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { break; // Remove kills from LocalDefsSet, these registers had short live ranges. - for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { - MachineOperand &MO = TIB->getOperand(i); + for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isUse() || !MO.isKill()) continue; unsigned Reg = MO.getReg(); if (!Reg || !LocalDefsSet.count(Reg)) continue; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - LocalDefsSet.erase(*AI); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + LocalDefsSet.erase(*AI); + } else { + LocalDefsSet.erase(Reg); + } } // Track local defs so we can update liveins. 
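// Defs are recorded twice: LocalDefs is an ordered list used for the final
// live-in update, while LocalDefsSet is alias-expanded for the membership
// tests above.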
- for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) { - MachineOperand &MO = TIB->getOperand(i); + for (const MachineOperand &MO : TIB->operands()) { if (!MO.isReg() || !MO.isDef() || MO.isDead()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; LocalDefs.push_back(Reg); - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - LocalDefsSet.insert(*AI); + addRegAndItsAliases(Reg, TRI, LocalDefsSet); } HasDups = true; diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h index 46c05dc..d759d53 100644 --- a/contrib/llvm/lib/CodeGen/BranchFolding.h +++ b/contrib/llvm/lib/CodeGen/BranchFolding.h @@ -54,6 +54,7 @@ namespace llvm { typedef std::vector<MergePotentialsElt>::iterator MPIterator; std::vector<MergePotentialsElt> MergePotentials; SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging; + DenseMap<const MachineBasicBlock *, int> FuncletMembership; class SameTailElt { MPIterator MPIter; diff --git a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp index d08fae0..abc655a 100644 --- a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -24,6 +25,7 @@ using namespace llvm; void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineFunction &MF, + VirtRegMap *VRM, const MachineLoopInfo &MLI, const MachineBlockFrequencyInfo &MBFI, VirtRegAuxInfo::NormalizingFn norm) { @@ -31,7 +33,7 @@ void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS, << "********** Function: " << MF.getName() << '\n'); MachineRegisterInfo &MRI = MF.getRegInfo(); - VirtRegAuxInfo VRAI(MF, LIS, MLI, MBFI, norm); + VirtRegAuxInfo VRAI(MF, LIS, VRM, MLI, MBFI, norm); for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) { unsigned Reg = TargetRegisterInfo::index2VirtReg(i); if (MRI.reg_nodbg_empty(Reg)) @@ -74,7 +76,10 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg, // Check if all values in LI are rematerializable static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, + VirtRegMap *VRM, const TargetInstrInfo &TII) { + unsigned Reg = LI.reg; + unsigned Original = VRM ? VRM->getOriginal(Reg) : 0; for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { const VNInfo *VNI = *I; @@ -86,6 +91,36 @@ static bool isRematerializable(const LiveInterval &LI, MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); assert(MI && "Dead valno in interval"); + // Trace copies introduced by live range splitting. The inline + // spiller can rematerialize through these copies, so the spill + // weight must reflect this. + if (VRM) { + while (MI->isFullCopy()) { + // The copy destination must match the interval register. + if (MI->getOperand(0).getReg() != Reg) + return false; + + // Get the source register. + Reg = MI->getOperand(1).getReg(); + + // If the original (pre-splitting) registers match this + // copy came from a split. + if (!TargetRegisterInfo::isVirtualRegister(Reg) || + VRM->getOriginal(Reg) != Original) + return false; + + // Follow the copy live-in value. 
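// The copy-chain walk being built up here, condensed into one place (names
// follow the hunk above; a null-check stands in for the assert): keep
// stepping through full copies while they stay inside the same original
// (pre-splitting) register, then test whatever instruction the walk lands
// on for rematerializability.
while (MI->isFullCopy()) {
  if (MI->getOperand(0).getReg() != Reg)   // copy must define this interval
    return false;
  Reg = MI->getOperand(1).getReg();        // step to the copy's source
  if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
      VRM->getOriginal(Reg) != Original)   // source is not a sibling split
    return false;
  const VNInfo *SrcVNI = LIS.getInterval(Reg).Query(VNI->def).valueIn();
  if (!SrcVNI || SrcVNI->isPHIDef())       // merged values cannot be traced
    return false;
  MI = LIS.getInstructionFromIndex(SrcVNI->def);
  VNI = SrcVNI;
}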
+ const LiveInterval &SrcLI = LIS.getInterval(Reg); + LiveQueryResult SrcQ = SrcLI.Query(VNI->def); + VNI = SrcQ.valueIn(); + assert(VNI && "Copy from non-existing value"); + if (VNI->isPHIDef()) + return false; + MI = LIS.getInstructionFromIndex(VNI->def); + assert(MI && "Dead valno in interval"); + } + } + if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis())) return false; } @@ -188,7 +223,7 @@ VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { // it is a preferred candidate for spilling. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. - if (isRematerializable(li, LIS, *MF.getSubtarget().getInstrInfo())) + if (isRematerializable(li, LIS, VRM, *MF.getSubtarget().getInstrInfo())) totalWeight *= 0.5F; li.weight = normalize(totalWeight, li.getSize(), numInstr); diff --git a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp index fb29b1d..23c0d54 100644 --- a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp @@ -32,6 +32,7 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, CallOrPrologue(Unknown) { // No stack is used. StackOffset = 0; + MaxStackArgAlign = 1; clearByValRegsInfo(); UsedRegs.resize((TRI.getNumRegs()+31)/32); @@ -192,6 +193,7 @@ static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) { void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, MVT VT, CCAssignFn Fn) { unsigned SavedStackOffset = StackOffset; + unsigned SavedMaxStackArgAlign = MaxStackArgAlign; unsigned NumLocs = Locs.size(); // Set the 'inreg' flag if it is used for this calling convention. @@ -223,6 +225,7 @@ void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, // as allocated so that future queries don't return the same registers, i.e. // when i64 and f64 are both passed in GPRs. 
StackOffset = SavedStackOffset; + MaxStackArgAlign = SavedMaxStackArgAlign; Locs.resize(NumLocs); } diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp index 155c5ec..dc13b5b 100644 --- a/contrib/llvm/lib/CodeGen/CodeGen.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp @@ -29,6 +29,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeExpandISelPseudosPass(Registry); initializeExpandPostRAPass(Registry); initializeFinalizeMachineBundlesPass(Registry); + initializeFuncletLayoutPass(Registry); initializeGCMachineCodeAnalysisPass(Registry); initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); @@ -66,6 +67,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeSlotIndexesPass(Registry); initializeStackColoringPass(Registry); initializeStackMapLivenessPass(Registry); + initializeLiveDebugValuesPass(Registry); initializeStackProtectorPass(Registry); initializeStackSlotColoringPass(Registry); initializeTailDuplicatePassPass(Registry); diff --git a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp index 6ab6acc..03e5778 100644 --- a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -63,6 +64,9 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " "computations were sunk"); STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); +STATISTIC(NumAndsAdded, + "Number of and mask instructions added to form ext loads"); +STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); @@ -109,25 +113,18 @@ static cl::opt<bool> StressExtLdPromotion( namespace { typedef SmallPtrSet<Instruction *, 16> SetOfInstrs; -struct TypeIsSExt { - Type *Ty; - bool IsSExt; - TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {} -}; +typedef PointerIntPair<Type *, 1, bool> TypeIsSExt; typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy; class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. const TargetMachine *TM; const TargetLowering *TLI; const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; - /// CurInstIterator - As we scan instructions optimizing them, this is the - /// next instruction to optimize. Xforms that can invalidate this should - /// update it. + /// As we scan instructions optimizing them, this is the next instruction + /// to optimize. Transforms that can invalidate this should update it. BasicBlock::iterator CurInstIterator; /// Keeps track of non-local addresses that have been sunk into a block. @@ -141,10 +138,10 @@ class TypePromotionTransaction; /// promotion for the current function. InstrToOrigTy PromotedInsts; - /// ModifiedDT - If CFG is modified in anyway. + /// True if CFG is modified in any way. 
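// The save/restore discipline getRemainingRegParmsForType depends on,
// condensed from the hunk above: every piece of CCState that speculative
// allocation may touch is captured first and rolled back afterwards, and
// this patch adds the running MaxStackArgAlign to that set.
unsigned SavedStackOffset = StackOffset;
unsigned SavedMaxStackArgAlign = MaxStackArgAlign;
unsigned NumLocs = Locs.size();
// ... repeatedly invoke the CCAssignFn to harvest candidate registers ...
StackOffset = SavedStackOffset;           // undo speculative stack allocation
MaxStackArgAlign = SavedMaxStackArgAlign; // undo alignment bookkeeping
Locs.resize(NumLocs);                     // drop speculative CCValAssigns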
bool ModifiedDT; - /// OptSize - True if optimizing for size. + /// True if optimizing for size. bool OptSize; /// DataLayout for the Function being processed. @@ -167,30 +164,33 @@ class TypePromotionTransaction; } private: - bool EliminateFallThrough(Function &F); - bool EliminateMostlyEmptyBlocks(Function &F); - bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; - void EliminateMostlyEmptyBlock(BasicBlock *BB); - bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT); - bool OptimizeInst(Instruction *I, bool& ModifiedDT); - bool OptimizeMemoryInst(Instruction *I, Value *Addr, + bool eliminateFallThrough(Function &F); + bool eliminateMostlyEmptyBlocks(Function &F); + bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; + void eliminateMostlyEmptyBlock(BasicBlock *BB); + bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT); + bool optimizeInst(Instruction *I, bool& ModifiedDT); + bool optimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy, unsigned AS); - bool OptimizeInlineAsmInst(CallInst *CS); - bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT); - bool MoveExtToFormExtLoad(Instruction *&I); - bool OptimizeExtUses(Instruction *I); - bool OptimizeSelectInst(SelectInst *SI); - bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI); - bool OptimizeExtractElementInst(Instruction *Inst); - bool DupRetToEnableTailCallOpts(BasicBlock *BB); - bool PlaceDbgValues(Function &F); + bool optimizeInlineAsmInst(CallInst *CS); + bool optimizeCallInst(CallInst *CI, bool& ModifiedDT); + bool moveExtToFormExtLoad(Instruction *&I); + bool optimizeExtUses(Instruction *I); + bool optimizeLoadExt(LoadInst *I); + bool optimizeSelectInst(SelectInst *SI); + bool optimizeShuffleVectorInst(ShuffleVectorInst *SI); + bool optimizeSwitchInst(SwitchInst *CI); + bool optimizeExtractElementInst(Instruction *Inst); + bool dupRetToEnableTailCallOpts(BasicBlock *BB); + bool placeDbgValues(Function &F); bool sinkAndCmp(Function &F); - bool ExtLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, + bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI, Instruction *&Inst, const SmallVectorImpl<Instruction *> &Exts, unsigned CreatedInstCost); bool splitBranchCondition(Function &F); bool simplifyOffsetableRelocate(Instruction &I); + void stripInvariantGroupMetadata(Instruction &I); }; } @@ -218,25 +218,31 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLI = TM->getSubtargetImpl(F)->getTargetLowering(); TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - OptSize = F.hasFnAttribute(Attribute::OptimizeForSize); + OptSize = F.optForSize(); /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. if (!OptSize && TLI && TLI->isSlowDivBypassed()) { const DenseMap<unsigned int, unsigned int> &BypassWidths = TLI->getBypassSlowDivWidths(); - for (Function::iterator I = F.begin(); I != F.end(); I++) - EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); + BasicBlock* BB = &*F.begin(); + while (BB != nullptr) { + // bypassSlowDivision may create new BBs, but we don't want to reapply the + // optimization to those blocks. + BasicBlock* Next = BB->getNextNode(); + EverMadeChange |= bypassSlowDivision(BB, BypassWidths); + BB = Next; + } } // Eliminate blocks that contain only PHI nodes and an // unconditional branch. 
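// The iteration idiom the new bypassSlowDivision loop uses, shown in
// isolation (F is an arbitrary Function here): capture the next block
// before transforming the current one, so blocks the transform creates are
// never revisited.
for (BasicBlock *BB = &*F.begin(); BB != nullptr;) {
  BasicBlock *Next = BB->getNextNode(); // safe against insertions after BB
  // ... transform BB, possibly splitting it or adding new blocks ...
  BB = Next;
}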
- EverMadeChange |= EliminateMostlyEmptyBlocks(F); + EverMadeChange |= eliminateMostlyEmptyBlocks(F); // llvm.dbg.value is far away from the value then iSel may not be able // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. - EverMadeChange |= PlaceDbgValues(F); + EverMadeChange |= placeDbgValues(F); // If there is a mask, compare against zero, and branch that can be combined // into a single target instruction, push the mask and compare into branch @@ -251,9 +257,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { while (MadeChange) { MadeChange = false; for (Function::iterator I = F.begin(); I != F.end(); ) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; bool ModifiedDTOnIteration = false; - MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration); + MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); // Restart BB iteration if the dominator tree of the Function was changed if (ModifiedDTOnIteration) @@ -296,7 +302,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // Merge pairs of basic blocks with unconditional branches, connected by // a single edge. if (EverMadeChange || MadeChange) - MadeChange |= EliminateFallThrough(F); + MadeChange |= eliminateFallThrough(F); EverMadeChange |= MadeChange; } @@ -314,14 +320,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) { return EverMadeChange; } -/// EliminateFallThrough - Merge basic blocks which are connected -/// by a single edge, where one of the basic blocks has a single successor -/// pointing to the other basic block, which has a single predecessor. -bool CodeGenPrepare::EliminateFallThrough(Function &F) { +/// Merge basic blocks which are connected by a single edge, where one of the +/// basic blocks has a single successor pointing to the other basic block, +/// which has a single predecessor. +bool CodeGenPrepare::eliminateFallThrough(Function &F) { bool Changed = false; // Scan all of the blocks in the function, except for the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; // If the destination block has a single pred, then this is a trivial // edge, just collapse it. BasicBlock *SinglePred = BB->getSinglePredecessor(); @@ -342,22 +348,21 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { BB->moveBefore(&BB->getParent()->getEntryBlock()); // We have erased a block. Update the iterator. - I = BB; + I = BB->getIterator(); } } return Changed; } -/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes, -/// debug info directives, and an unconditional branch. Passes before isel -/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for -/// isel. Start by eliminating these blocks so we can split them the way we -/// want them. -bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { +/// Eliminate blocks that contain only PHI nodes, debug info directives, and an +/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split +/// edges in ways that are non-optimal for isel. Start by eliminating these +/// blocks so we can split them the way we want them. +bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) { bool MadeChange = false; // Note that this intentionally skips the entry block. for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; // If this block doesn't end with an uncond branch, ignore it. 
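// An IR-level picture (illustrative labels) of the blocks this routine
// eliminates: nothing but PHIs, optional debug intrinsics, and a single
// unconditional branch, so the block can be folded into its destination
// once canMergeBlocks approves.
//
//   bb:                                   ; preds = %a, %b
//     %p = phi i32 [ 0, %a ], [ 1, %b ]
//     br label %dest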
BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); @@ -366,7 +371,7 @@ bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { // If the instruction before the branch (skipping debug info) isn't a phi // node, then other stuff is happening here. - BasicBlock::iterator BBI = BI; + BasicBlock::iterator BBI = BI->getIterator(); if (BBI != BB->begin()) { --BBI; while (isa<DbgInfoIntrinsic>(BBI)) { @@ -383,19 +388,19 @@ bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { if (DestBB == BB) continue; - if (!CanMergeBlocks(BB, DestBB)) + if (!canMergeBlocks(BB, DestBB)) continue; - EliminateMostlyEmptyBlock(BB); + eliminateMostlyEmptyBlock(BB); MadeChange = true; } return MadeChange; } -/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a -/// single uncond branch between them, and BB contains no other non-phi +/// Return true if we can merge BB into DestBB if there is a single +/// unconditional branch between them, and BB contains no other non-phi /// instructions. -bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, +bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const { // We only want to eliminate blocks whose phi nodes are used by phi nodes in // the successor. If there are more complex condition (e.g. preheaders), @@ -461,9 +466,9 @@ bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, } -/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and -/// an unconditional branch in it. -void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { +/// Eliminate a basic block that has only phi's and an unconditional branch in +/// it. +void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { BranchInst *BI = cast<BranchInst>(BB->getTerminator()); BasicBlock *DestBB = BI->getSuccessor(0); @@ -527,19 +532,17 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { // Computes a map of base pointer relocation instructions to corresponding // derived pointer relocation instructions given a vector of all relocate calls static void computeBaseDerivedRelocateMap( - const SmallVectorImpl<User *> &AllRelocateCalls, - DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> & - RelocateInstMap) { + const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls, + DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> + &RelocateInstMap) { // Collect information in two maps: one primarily for locating the base object // while filling the second map; the second map is the final structure holding // a mapping between Base and corresponding Derived relocate calls - DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap; - for (auto &U : AllRelocateCalls) { - GCRelocateOperands ThisRelocate(U); - IntrinsicInst *I = cast<IntrinsicInst>(U); - auto K = std::make_pair(ThisRelocate.getBasePtrIndex(), - ThisRelocate.getDerivedPtrIndex()); - RelocateIdxMap.insert(std::make_pair(K, I)); + DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap; + for (auto *ThisRelocate : AllRelocateCalls) { + auto K = std::make_pair(ThisRelocate->getBasePtrIndex(), + ThisRelocate->getDerivedPtrIndex()); + RelocateIdxMap.insert(std::make_pair(K, ThisRelocate)); } for (auto &Item : RelocateIdxMap) { std::pair<unsigned, unsigned> Key = Item.first; @@ -547,7 +550,7 @@ static void computeBaseDerivedRelocateMap( // Base relocation: nothing to insert continue; - IntrinsicInst *I = Item.second; + GCRelocateInst *I = Item.second; auto BaseKey = 
std::make_pair(Key.first, Key.first); // We're iterating over RelocateIdxMap so we cannot modify it. @@ -580,22 +583,27 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, // Takes a RelocatedBase (base pointer relocation instruction) and Targets to // replace, computes a replacement, and affects it. static bool -simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, - const SmallVectorImpl<IntrinsicInst *> &Targets) { +simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, + const SmallVectorImpl<GCRelocateInst *> &Targets) { bool MadeChange = false; - for (auto &ToReplace : Targets) { - GCRelocateOperands MasterRelocate(RelocatedBase); - GCRelocateOperands ThisRelocate(ToReplace); - - assert(ThisRelocate.getBasePtrIndex() == MasterRelocate.getBasePtrIndex() && + for (GCRelocateInst *ToReplace : Targets) { + assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() && "Not relocating a derived object of the original base object"); - if (ThisRelocate.getBasePtrIndex() == ThisRelocate.getDerivedPtrIndex()) { + if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) { // A duplicate relocate call. TODO: coalesce duplicates. continue; } - Value *Base = ThisRelocate.getBasePtr(); - auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.getDerivedPtr()); + if (RelocatedBase->getParent() != ToReplace->getParent()) { + // Base and derived relocates are in different basic blocks. + // In this case transform is only valid when base dominates derived + // relocate. However it would be too expensive to check dominance + // for each such relocate, so we skip the whole transformation. + continue; + } + + Value *Base = ToReplace->getBasePtr(); + auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr()); if (!Derived || Derived->getPointerOperand() != Base) continue; @@ -631,21 +639,20 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, // In this case, we can not find the bitcast any more. So we insert a new bitcast // no matter there is already one or not. In this way, we can handle all cases, and // the extra bitcast should be optimized away in later passes. - Instruction *ActualRelocatedBase = RelocatedBase; + Value *ActualRelocatedBase = RelocatedBase; if (RelocatedBase->getType() != Base->getType()) { ActualRelocatedBase = - cast<Instruction>(Builder.CreateBitCast(RelocatedBase, Base->getType())); + Builder.CreateBitCast(RelocatedBase, Base->getType()); } Value *Replacement = Builder.CreateGEP( Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV)); - Instruction *ReplacementInst = cast<Instruction>(Replacement); Replacement->takeName(ToReplace); // If the newly generated derived pointer's type does not match the original derived // pointer's type, cast the new derived pointer to match it. Same reasoning as above. 
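// A hedged IR-level sketch of the rewrite performed here (names are
// illustrative, not taken from a real test case): instead of relocating the
// derived pointer separately, recompute it as the same small-constant GEP
// applied to the relocated base.
//
//   %base.rel    = <relocate of %base over the statepoint>
//   %derived.rel = <relocate of %derived>            ; replaced by:
//   %derived.new = getelementptr i8, i8* %base.rel, i64 <small const offset>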
- Instruction *ActualReplacement = ReplacementInst; - if (ReplacementInst->getType() != ToReplace->getType()) { + Value *ActualReplacement = Replacement; + if (Replacement->getType() != ToReplace->getType()) { ActualReplacement = - cast<Instruction>(Builder.CreateBitCast(ReplacementInst, ToReplace->getType())); + Builder.CreateBitCast(Replacement, ToReplace->getType()); } ToReplace->replaceAllUsesWith(ActualReplacement); ToReplace->eraseFromParent(); @@ -674,12 +681,12 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase, // %val = load %ptr' bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { bool MadeChange = false; - SmallVector<User *, 2> AllRelocateCalls; + SmallVector<GCRelocateInst *, 2> AllRelocateCalls; for (auto *U : I.users()) - if (isGCRelocate(dyn_cast<Instruction>(U))) + if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U)) // Collect all the relocate calls associated with a statepoint - AllRelocateCalls.push_back(U); + AllRelocateCalls.push_back(Relocate); // We need at least one base pointer relocation + one derived pointer // relocation to mangle @@ -688,7 +695,7 @@ bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { // RelocateInstMap is a mapping from the base relocate instruction to the // corresponding derived relocate instructions - DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap; + DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap; computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap); if (RelocateInstMap.empty()) return false; @@ -723,6 +730,12 @@ static bool SinkCast(CastInst *CI) { // Preincrement use iterator so we don't invalidate it. ++UI; + // If the block selected to receive the cast is an EH pad that does not + // allow non-PHI instructions before the terminator, we can't sink the + // cast. + if (UserBB->getTerminator()->isEHPad()) + continue; + // If this user is in the same block as the cast, don't change the cast. if (UserBB == DefBB) continue; @@ -731,9 +744,9 @@ static bool SinkCast(CastInst *CI) { if (!InsertedCast) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedCast = - CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", - InsertPt); + assert(InsertPt != UserBB->end()); + InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), + CI->getType(), "", &*InsertPt); } // Replace a use of the cast with a use of the new cast. @@ -751,10 +764,9 @@ static bool SinkCast(CastInst *CI) { return MadeChange; } -/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop -/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC), -/// sink it into user blocks to reduce the number of virtual -/// registers that must be created and coalesced. +/// If the specified cast instruction is a noop copy (e.g. it's casting from +/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to +/// reduce the number of virtual registers that must be created and coalesced. /// /// Return true if any changes are made. /// @@ -789,8 +801,8 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, return SinkCast(CI); } -/// CombineUAddWithOverflow - try to combine CI into a call to the -/// llvm.uadd.with.overflow intrinsic if possible. +/// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if +/// possible. /// /// Return true if any changes were made.
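// A hedged IR illustration of the combine named above: an add feeding an
// unsigned "did it wrap" compare becomes one intrinsic call whose two
// results replace the add and the compare.
//
//   %add = add i32 %x, %y
//   %cmp = icmp ult i32 %add, %x                   ; overflow check
// becomes:
//   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
//   %add  = extractvalue { i32, i1 } %uadd, 0
//   %cmp  = extractvalue { i32, i1 } %uadd, 1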
static bool CombineUAddWithOverflow(CmpInst *CI) { @@ -818,7 +830,7 @@ static bool CombineUAddWithOverflow(CmpInst *CI) { assert(*AddI->user_begin() == CI && "expected!"); #endif - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty); auto *InsertPt = AddI->hasOneUse() ? CI : AddI; @@ -836,16 +848,16 @@ static bool CombineUAddWithOverflow(CmpInst *CI) { return true; } -/// SinkCmpExpression - Sink the given CmpInst into user blocks to reduce -/// the number of virtual registers that must be created and coalesced. This is -/// a clear win except on targets with multiple condition code registers -/// (PowerPC), where it might lose; some adjustment may be wanted there. +/// Sink the given CmpInst into user blocks to reduce the number of virtual +/// registers that must be created and coalesced. This is a clear win except on +/// targets with multiple condition code registers (PowerPC), where it might +/// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. static bool SinkCmpExpression(CmpInst *CI) { BasicBlock *DefBB = CI->getParent(); - /// InsertedCmp - Only insert a cmp in each block once. + /// Only insert a cmp in each block once. DenseMap<BasicBlock*, CmpInst*> InsertedCmps; bool MadeChange = false; @@ -872,10 +884,10 @@ static bool SinkCmpExpression(CmpInst *CI) { if (!InsertedCmp) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + assert(InsertPt != UserBB->end()); InsertedCmp = - CmpInst::Create(CI->getOpcode(), - CI->getPredicate(), CI->getOperand(0), - CI->getOperand(1), "", InsertPt); + CmpInst::Create(CI->getOpcode(), CI->getPredicate(), + CI->getOperand(0), CI->getOperand(1), "", &*InsertPt); } // Replace a use of the cmp with a use of the new cmp. @@ -903,8 +915,8 @@ static bool OptimizeCmpExpression(CmpInst *CI) { return false; } -/// isExtractBitsCandidateUse - Check if the candidates could -/// be combined with shift instruction, which includes: +/// Check if the candidates could be combined with a shift instruction, which +/// includes: /// 1. Truncate instruction /// 2. And instruction and the imm is a mask of the low bits: /// imm & (imm+1) == 0 @@ -922,8 +934,7 @@ static bool isExtractBitsCandidateUse(Instruction *User) { return true; } -/// SinkShiftAndTruncate - sink both shift and truncate instruction -/// to the use of truncate's BB. +/// Sink both shift and truncate instruction to the use of truncate's BB. 
static bool SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts, @@ -970,20 +981,22 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, if (!InsertedShift && !InsertedTrunc) { BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt(); + assert(InsertPt != TruncUserBB->end()); // Sink the shift if (ShiftI->getOpcode() == Instruction::AShr) - InsertedShift = - BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); else - InsertedShift = - BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); // Sink the trunc BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt(); TruncInsertPt++; + assert(TruncInsertPt != TruncUserBB->end()); InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift, - TruncI->getType(), "", TruncInsertPt); + TruncI->getType(), "", &*TruncInsertPt); MadeChange = true; @@ -993,10 +1006,10 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, return MadeChange; } -/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if -/// the uses could potentially be combined with this shift instruction and -/// generate BitExtract instruction. It will only be applied if the architecture -/// supports BitExtract instruction. Here is an example: +/// Sink the shift *right* instruction into user blocks if the uses could +/// potentially be combined with this shift instruction and generate BitExtract +/// instruction. It will only be applied if the architecture supports BitExtract +/// instruction. 
Here is an example: /// BB1: /// %x.extract.shift = lshr i64 %arg1, 32 /// BB2: @@ -1067,13 +1080,14 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, if (!InsertedShift) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + assert(InsertPt != UserBB->end()); if (ShiftI->getOpcode() == Instruction::AShr) - InsertedShift = - BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); else - InsertedShift = - BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, + "", &*InsertPt); MadeChange = true; } @@ -1089,12 +1103,12 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, return MadeChange; } -// ScalarizeMaskedLoad() translates masked load intrinsic, like +// Translate a masked load intrinsic like // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, // <16 x i1> %mask, <16 x i32> %passthru) -// to a chain of basic blocks, whith loading element one-by-one if +// to a chain of basic blocks, with loading element one-by-one if // the appropriate mask bit is set -// +// // %1 = bitcast i8* %addr to i32* // %2 = extractelement <16 x i1> %mask, i32 0 // %3 = icmp eq i1 %2, true @@ -1126,35 +1140,68 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, // static void ScalarizeMaskedLoad(CallInst *CI) { Value *Ptr = CI->getArgOperand(0); - Value *Src0 = CI->getArgOperand(3); + Value *Alignment = CI->getArgOperand(1); Value *Mask = CI->getArgOperand(2); - VectorType *VecType = dyn_cast<VectorType>(CI->getType()); - Type *EltTy = VecType->getElementType(); + Value *Src0 = CI->getArgOperand(3); + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + VectorType *VecType = dyn_cast<VectorType>(CI->getType()); assert(VecType && "Unexpected return type of masked load intrinsic"); + Type *EltTy = CI->getType()->getVectorElementType(); + IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); BasicBlock *CondBlock = nullptr; BasicBlock *PrevIfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); + Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa<Constant>(Mask) && + cast<Constant>(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. 
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr from i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); + unsigned VectorWidth = VecType->getNumElements(); + Value *UndefVal = UndefValue::get(VecType); // The result vector Value *VResult = UndefVal; + if (isa<ConstantVector>(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + PHINode *Phi = nullptr; Value *PrevPhi = UndefVal; - unsigned VectorWidth = VecType->getNumElements(); for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration @@ -1182,16 +1229,17 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // %Elt = load i32* %EltAddr // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx // - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); Builder.SetInsertPoint(InsertPt); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst* Load = Builder.CreateLoad(Gep, false); + LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -1208,7 +1256,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) { CI->eraseFromParent(); } -// ScalarizeMaskedStore() translates masked store intrinsic, like +// Translate a masked store intrinsic, like // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, // <16 x i1> %mask) // to a chain of basic blocks, that stores element one-by-one if @@ -1224,12 +1272,12 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // %5 = getelementptr i32* %1, i32 0 // store i32 %4, i32* %5 // br label %else -// +// // else: ; preds = %0, %cond.store // %6 = extractelement <16 x i1> %mask, i32 1 // %7 = icmp eq i1 %6, true // br i1 %7, label %cond.store1, label %else2 -// +// // cond.store1: ; preds = %else // %8 = extractelement <16 x i32> %val, i32 1 // %9 = getelementptr i32* %1, i32 1 @@ -1237,34 +1285,61 @@ static void ScalarizeMaskedLoad(CallInst *CI) { // br label %else2 // . . .
static void ScalarizeMaskedStore(CallInst *CI) { - Value *Ptr = CI->getArgOperand(1); Value *Src = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); Value *Mask = CI->getArgOperand(3); + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); VectorType *VecType = dyn_cast<VectorType>(Src->getType()); - Type *EltTy = VecType->getElementType(); - assert(VecType && "Unexpected data type in masked store intrinsic"); + Type *EltTy = VecType->getElementType(); + IRBuilder<> Builder(CI->getContext()); Instruction *InsertPt = CI; BasicBlock *IfBlock = CI->getParent(); Builder.SetInsertPoint(InsertPt); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + // Short-cut if the mask is all-true. + bool IsAllOnesMask = isa<Constant>(Mask) && + cast<Constant>(Mask)->isAllOnesValue(); + + if (IsAllOnesMask) { + Builder.CreateAlignedStore(Src, Ptr, AlignVal); + CI->eraseFromParent(); + return; + } + + // Adjust alignment for the scalar instruction. + AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); // Bitcast %addr from i8* to EltTy* Type *NewPtrType = EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace()); Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); + + if (isa<ConstantVector>(Mask)) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); + Value *Gep = + Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { // Fill the "else" block, created in the previous iteration // // %mask_1 = extractelement <16 x i1> %mask, i32 Idx // %to_store = icmp eq i1 %mask_1, true - // br i1 %to_load, label %cond.store, label %else + // br i1 %to_store, label %cond.store, label %else // Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, @@ -1276,13 +1351,259 @@ static void ScalarizeMaskedStore(CallInst *CI) { // %EltAddr = getelementptr i32* %1, i32 0 // %store i32 %OneElt, i32* %EltAddr // - BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + BasicBlock *CondBlock = + IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); Builder.SetInsertPoint(InsertPt); - + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); Value *Gep = Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateStore(OneElt, Gep); + Builder.CreateAlignedStore(OneElt, Gep, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = + CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +// Translate a masked gather intrinsic like +// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, +// <16 x i1> %Mask, <16 x i32> %Src) +// to a chain of basic blocks, with loading element one-by-one if +// the appropriate mask bit is set +// +// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1>
%Mask, i32 0 +// % ToLoad0 = icmp eq i1 % Mask0, true +// br i1 % ToLoad0, label %cond.load, label %else +// +// cond.load: +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// % Load0 = load i32, i32* % Ptr0, align 4 +// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 +// br label %else +// +// else: +// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] +// % Mask1 = extractelement <16 x i1> %Mask, i32 1 +// % ToLoad1 = icmp eq i1 % Mask1, true +// br i1 % ToLoad1, label %cond.load1, label %else2 +// +// cond.load1: +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// % Load1 = load i32, i32* % Ptr1, align 4 +// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 +// br label %else2 +// . . . +// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src +// ret <16 x i32> %Result +static void ScalarizeMaskedGather(CallInst *CI) { + Value *Ptrs = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast<VectorType>(CI->getType()); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + unsigned VectorWidth = VecType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + bool IsConstMask = isa<ConstantVector>(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, + Builder.getInt32(Idx), + "Res" + Twine(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %ToLoad1 = icmp eq i1 %Mask1, true + // br i1 %ToLoad1, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, + Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToLoad" + Twine(Idx)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load 
= Builder.CreateAlignedLoad(Ptr, AlignVal, + "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), + "Res" + Twine(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// Translate a masked scatter intrinsic, like +// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, +// <16 x i1> %Mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set. +// +// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> % Mask, i32 0 +// % ToStore0 = icmp eq i1 % Mask0, true +// br i1 %ToStore0, label %cond.store, label %else +// +// cond.store: +// % Elt0 = extractelement <16 x i32> %Src, i32 0 +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// store i32 %Elt0, i32* % Ptr0, align 4 +// br label %else +// +// else: +// % Mask1 = extractelement <16 x i1> % Mask, i32 1 +// % ToStore1 = icmp eq i1 % Mask1, true +// br i1 % ToStore1, label %cond.store1, label %else2 +// +// cond.store1: +// % Elt1 = extractelement <16 x i32> %Src, i32 1 +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// store i32 % Elt1, i32* % Ptr1, align 4 +// br label %else2 +// . . . +static void ScalarizeMaskedScatter(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptrs = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + assert(isa<VectorType>(Src->getType()) && + "Unexpected data type in masked scatter intrinsic"); + assert(isa<VectorType>(Ptrs->getType()) && + isa<PointerType>(Ptrs->getType()->getVectorElementType()) && + "Vector of pointers is expected in masked scatter intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue(); + unsigned VectorWidth = Src->getType()->getVectorNumElements(); + + // Shorten the way if the mask is a vector of constants. 
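// How the branchy gather expansion above stitches lanes together, shown
// schematically (names follow the comments in the hunk): each "else" block
// starts with a PHI merging the vector produced when the previous lane was
// loaded with the vector that bypassed it, so VResult always holds every
// lane gathered so far.
//
//   else:
//     %res.phi.else = phi <16 x i32> [ %Res0, %cond.load ], [ undef, %0 ]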
+ bool IsConstMask = isa<ConstantVector>(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + // Fill the "else" block, created in the previous iteration + // + // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx + // % ToStore = icmp eq i1 % Mask1, true + // br i1 % ToStore, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, + Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = + Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToStore" + Twine(Idx)); + + // Create "cond" block + // + // % Elt1 = extractelement <16 x i32> %Src, i32 1 + // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 + // %store i32 % Elt1, i32* % Ptr1 + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); @@ -1295,7 +1616,86 @@ static void ScalarizeMaskedStore(CallInst *CI) { CI->eraseFromParent(); } -bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { +/// If counting leading or trailing zeros is an expensive operation and a zero +/// input is defined, add a check for zero to avoid calling the intrinsic. +/// +/// We want to transform: +/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false) +/// +/// into: +/// entry: +/// %cmpz = icmp eq i64 %A, 0 +/// br i1 %cmpz, label %cond.end, label %cond.false +/// cond.false: +/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true) +/// br label %cond.end +/// cond.end: +/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] +/// +/// If the transform is performed, return true and set ModifiedDT to true. +static bool despeculateCountZeros(IntrinsicInst *CountZeros, + const TargetLowering *TLI, + const DataLayout *DL, + bool &ModifiedDT) { + if (!TLI || !DL) + return false; + + // If a zero input is undefined, it doesn't make sense to despeculate that. + if (match(CountZeros->getOperand(1), m_One())) + return false; + + // If it's cheap to speculate, there's nothing to do. + auto IntrinsicID = CountZeros->getIntrinsicID(); + if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) || + (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz())) + return false; + + // Only handle legal scalar cases. Anything else requires too much work. + Type *Ty = CountZeros->getType(); + unsigned SizeInBits = Ty->getPrimitiveSizeInBits(); + if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize()) + return false; + + // The intrinsic will be sunk behind a compare against zero and branch. + BasicBlock *StartBlock = CountZeros->getParent(); + BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); + + // Create another block after the count zero intrinsic. 
A PHI will be added + // in this block to select the result of the intrinsic or the bit-width + // constant if the input to the intrinsic is zero. + BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); + BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); + + // Set up a builder to create a compare, conditional branch, and PHI. + IRBuilder<> Builder(CountZeros->getContext()); + Builder.SetInsertPoint(StartBlock->getTerminator()); + Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc()); + + // Replace the unconditional branch that was created by the first split with + // a compare against zero and a conditional branch. + Value *Zero = Constant::getNullValue(Ty); + Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz"); + Builder.CreateCondBr(Cmp, EndBlock, CallBlock); + StartBlock->getTerminator()->eraseFromParent(); + + // Create a PHI in the end block to select either the output of the intrinsic + // or the bit width of the operand. + Builder.SetInsertPoint(&EndBlock->front()); + PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); + CountZeros->replaceAllUsesWith(PN); + Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); + PN->addIncoming(BitWidth, StartBlock); + PN->addIncoming(CountZeros, CallBlock); + + // We are explicitly handling the zero case, so we can set the intrinsic's + // undefined zero argument to 'true'. This will also prevent reprocessing the + // intrinsic; we only despeculate when a zero input is defined. + CountZeros->setArgOperand(1, Builder.getTrue()); + ModifiedDT = true; + return true; +} + +bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); // Lower inline assembly if we can. @@ -1311,7 +1711,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return true; } // Sink address computing for memory operands into the block. - if (OptimizeInlineAsmInst(CI)) + if (optimizeInlineAsmInst(CI)) return true; } @@ -1372,14 +1772,14 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { // Substituting this can cause recursive simplifications, which can // invalidate our iterator. Use a WeakVH to hold onto it in case this // happens. - WeakVH IterHandle(CurInstIterator); + WeakVH IterHandle(&*CurInstIterator); replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr); // If the iterator instruction was recursively deleted, start over at the // start of the block. 
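// The control-flow surgery despeculateCountZeros performs, condensed (names
// from the hunk above): two splits isolate the intrinsic in its own block,
// then the unconditional branch the first split created is replaced by the
// zero test.
BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
Builder.SetInsertPoint(StartBlock->getTerminator());
Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
Builder.CreateCondBr(Cmp, EndBlock, CallBlock); // zero input skips the call
StartBlock->getTerminator()->eraseFromParent(); // drop the split's old br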
- if (IterHandle != CurInstIterator) { + if (IterHandle != CurInstIterator.getNodePtrUnchecked()) { CurInstIterator = BB->begin(); SunkAddrs.clear(); } @@ -1387,7 +1787,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { } case Intrinsic::masked_load: { // Scalarize unsupported vector masked load - if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) { + if (!TTI->isLegalMaskedLoad(CI->getType())) { ScalarizeMaskedLoad(CI); ModifiedDT = true; return true; @@ -1395,13 +1795,29 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return false; } case Intrinsic::masked_store: { - if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { ScalarizeMaskedStore(CI); ModifiedDT = true; return true; } return false; } + case Intrinsic::masked_gather: { + if (!TTI->isLegalMaskedGather(CI->getType())) { + ScalarizeMaskedGather(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_scatter: { + if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { + ScalarizeMaskedScatter(CI); + ModifiedDT = true; + return true; + } + return false; + } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0)); @@ -1415,6 +1831,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { InsertedInsts.insert(ExtVal); return true; } + case Intrinsic::invariant_group_barrier: + II->replaceAllUsesWith(II->getArgOperand(0)); + II->eraseFromParent(); + return true; + + case Intrinsic::cttz: + case Intrinsic::ctlz: + // If counting zeros is expensive, try to avoid it. + return despeculateCountZeros(II, TLI, DL, ModifiedDT); } if (TLI) { @@ -1426,7 +1851,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { Type *AccessTy; if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace)) while (!PtrOps.empty()) - if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) + if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace)) return true; } } @@ -1447,9 +1872,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { return false; } -/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return -/// instructions to the predecessor to enable tail call optimizations. The -/// case it is currently looking for is: +/// Look for opportunities to duplicate return instructions to the predecessor +/// to enable tail call optimizations. The case it is currently looking for is: /// @code /// bb0: /// %tmp0 = tail call i32 @f0() @@ -1478,7 +1902,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { /// %tmp2 = tail call i32 @f2() /// ret i32 %tmp2 /// @endcode -bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { +bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) { if (!TLI) return false; @@ -1597,7 +2021,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { namespace { -/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode +/// This is an extended version of TargetLowering::AddrMode /// which holds actual Value*'s for register values. struct ExtAddrMode : public TargetLowering::AddrMode { Value *BaseReg; @@ -1709,10 +2133,10 @@ class TypePromotionTransaction { public: /// \brief Record the position of \p Inst. 
InsertionHandler(Instruction *Inst) { - BasicBlock::iterator It = Inst; + BasicBlock::iterator It = Inst->getIterator(); HasPrevInstruction = (It != (Inst->getParent()->begin())); if (HasPrevInstruction) - Point.PrevInst = --It; + Point.PrevInst = &*--It; else Point.BB = Inst->getParent(); } @@ -1724,7 +2148,7 @@ class TypePromotionTransaction { Inst->removeFromParent(); Inst->insertAfter(Point.PrevInst); } else { - Instruction *Position = Point.BB->getFirstInsertionPt(); + Instruction *Position = &*Point.BB->getFirstInsertionPt(); if (Inst->getParent()) Inst->moveBefore(Position); else @@ -1797,7 +2221,7 @@ class TypePromotionTransaction { Value *Val = Inst->getOperand(It); OriginalValues.push_back(Val); // Set a dummy one. - // We could use OperandSetter here, but that would implied an overhead + // We could use OperandSetter here, but that would imply an overhead // that we are not willing to pay. Inst->setOperand(It, UndefValue::get(Val->getType())); } @@ -2111,7 +2535,7 @@ class AddressingModeMatcher { unsigned AddrSpace; Instruction *MemoryInst; - /// AddrMode - This is the addressing mode that we're building up. This is + /// This is the addressing mode that we're building up. This is /// part of the return value of this addressing mode matching stuff. ExtAddrMode &AddrMode; @@ -2122,9 +2546,8 @@ class AddressingModeMatcher { /// The ongoing transaction where every action should be registered. TypePromotionTransaction &TPT; - /// IgnoreProfitability - This is set to true when we should not do - /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode - /// always returns true. + /// This is set to true when we should not do profitability checks. + /// When true, IsProfitableToFoldIntoAddressingMode always returns true. bool IgnoreProfitability; AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI, @@ -2143,7 +2566,7 @@ class AddressingModeMatcher { } public: - /// Match - Find the maximal addressing mode that a load/store of V can fold, + /// Find the maximal addressing mode that a load/store of V can fold, /// give an access type of AccessTy. This returns a list of involved /// instructions in AddrModeInsts. /// \p InsertedInsts The instructions inserted by other CodeGenPrepare @@ -2161,32 +2584,32 @@ public: bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS, MemoryInst, Result, InsertedInsts, - PromotedInsts, TPT).MatchAddr(V, 0); + PromotedInsts, TPT).matchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); return Result; } private: - bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); - bool MatchAddr(Value *V, unsigned Depth); - bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, + bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); + bool matchAddr(Value *V, unsigned Depth); + bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, bool *MovedAway = nullptr); - bool IsProfitableToFoldIntoAddressingMode(Instruction *I, + bool isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter); - bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); - bool IsPromotionProfitable(unsigned NewCost, unsigned OldCost, + bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2); + bool isPromotionProfitable(unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const; }; -/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. 
+/// Try adding ScaleReg*Scale to the current addressing mode. /// Return true and update AddrMode if this addr mode is legal for the target, /// false if not. -bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, +bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth) { // If Scale is 1, then this is the same as adding ScaleReg to the addressing // mode. Just process that directly. if (Scale == 1) - return MatchAddr(ScaleReg, Depth); + return matchAddr(ScaleReg, Depth); // If the scale is 0, it takes nothing to add this. if (Scale == 0) @@ -2233,9 +2656,9 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, return true; } -/// MightBeFoldableInst - This is a little filter, which returns true if an -/// addressing computation involving I might be folded into a load/store -/// accessing it. This doesn't need to be perfect, but needs to accept at least +/// This is a little filter, which returns true if an addressing computation +/// involving I might be folded into a load/store accessing it. +/// This doesn't need to be perfect, but needs to accept at least /// the set of instructions that MatchOperationAddr can. static bool MightBeFoldableInst(Instruction *I) { switch (I->getOpcode()) { @@ -2301,9 +2724,7 @@ class TypePromotionHelper { /// \brief Utility function to determine if \p OpIdx should be promoted when /// promoting \p Inst. static bool shouldExtOperand(const Instruction *Inst, int OpIdx) { - if (isa<SelectInst>(Inst) && OpIdx == 0) - return false; - return true; + return !(isa<SelectInst>(Inst) && OpIdx == 0); } /// \brief Utility function to promote the operand of \p Ext when this @@ -2413,8 +2834,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, Value *OpndVal = Inst->getOperand(0); // Check if we can use this operand in the extension. - // If the type is larger than the result type of the extension, - // we cannot. + // If the type is larger than the result type of the extension, we cannot. if (!OpndVal->getType()->isIntegerTy() || OpndVal->getType()->getIntegerBitWidth() > ConsideredExtType->getIntegerBitWidth()) @@ -2433,18 +2853,16 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // #1 get the type of the operand and check the kind of the extended bits. const Type *OpndType; InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd); - if (It != PromotedInsts.end() && It->second.IsSExt == IsSExt) - OpndType = It->second.Ty; + if (It != PromotedInsts.end() && It->second.getInt() == IsSExt) + OpndType = It->second.getPointer(); else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd))) OpndType = Opnd->getOperand(0)->getType(); else return false; - // #2 check that the truncate just drop extended bits. - if (Inst->getType()->getIntegerBitWidth() >= OpndType->getIntegerBitWidth()) - return true; - - return false; + // #2 check that the truncate just drops extended bits. + return Inst->getType()->getIntegerBitWidth() >= + OpndType->getIntegerBitWidth(); } TypePromotionHelper::Action TypePromotionHelper::getAction( @@ -2553,7 +2971,7 @@ Value *TypePromotionHelper::promoteOperandForOther( } TPT.replaceAllUsesWith(ExtOpnd, Trunc); - // Restore the operand of Ext (which has been replace by the previous call + // Restore the operand of Ext (which has been replaced by the previous call // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext. 
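      // (Illustration: Trunc was created as a truncate of Ext itself, so the
      // replaceAllUsesWith above also rewrote Ext's own operand to Trunc,
      // leaving Ext = ext(Trunc) while Trunc = trunc(Ext); restoring the
      // operand below breaks that cycle.)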
TPT.setOperand(Ext, 0, ExtOpnd);
 }
@@ -2631,8 +3049,7 @@ Value *TypePromotionHelper::promoteOperandForOther(
   return ExtOpnd;
 }
 
-/// IsPromotionProfitable - Check whether or not promoting an instruction
-/// to a wider type was profitable.
+/// Check whether or not promoting an instruction to a wider type is profitable.
 /// \p NewCost gives the cost of extension instructions created by the
 /// promotion.
 /// \p OldCost gives the cost of extension instructions before the promotion
 /// plus the number of instructions that have been
 /// matched in the addressing mode during the promotion.
 /// \p PromotedOperand is the value that has been promoted.
 /// \return True if the promotion is profitable, false otherwise.
-bool AddressingModeMatcher::IsPromotionProfitable(
+bool AddressingModeMatcher::isPromotionProfitable(
     unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
   DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
   // The cost of the new extensions is greater than the cost of the
@@ -2656,9 +3073,9 @@ Value *TypePromotionHelper::promoteOperandForOther(
   return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
 }
 
-/// MatchOperationAddr - Given an instruction or constant expr, see if we can
-/// fold the operation into the addressing mode. If so, update the addressing
-/// mode and return true, otherwise return false without modifying AddrMode.
+/// Given an instruction or constant expr, see if we can fold the operation
+/// into the addressing mode. If so, update the addressing mode and return
+/// true, otherwise return false without modifying AddrMode.
 /// If \p MovedAway is not NULL, it contains the information of whether or
 /// not AddrInst has to be folded into the addressing mode on success.
 /// If \p MovedAway == true, \p AddrInst will not be part of the addressing
@@ -2667,7 +3084,7 @@ bool AddressingModeMatcher::IsPromotionProfitable(
 /// This state can happen when AddrInst is a sext, since it may be moved away.
 /// Therefore, AddrInst may not be valid when MovedAway is true and it must
 /// not be referenced anymore.
-bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
+bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
                                                unsigned Depth,
                                                bool *MovedAway) {
   // Avoid exponential behavior on extremely deep expression trees.
@@ -2680,13 +3097,13 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
   switch (Opcode) {
   case Instruction::PtrToInt:
     // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return MatchAddr(AddrInst->getOperand(0), Depth);
+    return matchAddr(AddrInst->getOperand(0), Depth);
   case Instruction::IntToPtr: {
     auto AS = AddrInst->getType()->getPointerAddressSpace();
     auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
     // This inttoptr is a no-op if the integer type is pointer sized.
     if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+      return matchAddr(AddrInst->getOperand(0), Depth);
     return false;
   }
   case Instruction::BitCast:
@@ -2698,14 +3115,14 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     // and we don't want to mess around with them. Assume it knows what it
     // is doing.
AddrInst->getOperand(0)->getType() != AddrInst->getType())
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+      return matchAddr(AddrInst->getOperand(0), Depth);
     return false;
   case Instruction::AddrSpaceCast: {
     unsigned SrcAS
       = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
     unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
     if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+      return matchAddr(AddrInst->getOperand(0), Depth);
     return false;
   }
   case Instruction::Add: {
@@ -2719,8 +3136,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
 
-    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(0), Depth+1))
+    if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
+        matchAddr(AddrInst->getOperand(0), Depth+1))
       return true;
 
     // Restore the old addr mode info.
@@ -2729,8 +3146,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     TPT.rollback(LastKnownGood);
 
     // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
-    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(1), Depth+1))
+    if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
+        matchAddr(AddrInst->getOperand(1), Depth+1))
       return true;
 
     // Otherwise we definitely can't merge the ADD in.
@@ -2752,7 +3169,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     if (Opcode == Instruction::Shl)
       Scale = 1LL << Scale;
 
-    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+    return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
   }
   case Instruction::GetElementPtr: {
     // Scan the GEP. We check whether it contains constant offsets and at most
@@ -2791,7 +3208,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     if (ConstantOffset == 0 ||
         TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
       // Check to see if we can fold the base pointer in too.
-      if (MatchAddr(AddrInst->getOperand(0), Depth+1))
+      if (matchAddr(AddrInst->getOperand(0), Depth+1))
         return true;
     }
     AddrMode.BaseOffs -= ConstantOffset;
@@ -2806,7 +3223,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     AddrMode.BaseOffs += ConstantOffset;
 
     // Match the base operand of the GEP.
-    if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
+    if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
       // If it couldn't be matched, just stuff the value in a register.
       if (AddrMode.HasBaseReg) {
         AddrMode = BackupAddrMode;
@@ -2818,7 +3235,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     }
 
     // Match the remaining variable portion of the GEP.
-    if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+    if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
                           Depth)) {
       // If it couldn't be matched, try stuffing the base into a register
       // instead of matching it, and retrying the match of the scale.
@@ -2829,7 +3246,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
       AddrMode.HasBaseReg = true;
       AddrMode.BaseReg = AddrInst->getOperand(0);
       AddrMode.BaseOffs += ConstantOffset;
-      if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
+      if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
                             VariableScale, Depth)) {
         // If even that didn't work, bail.
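        // (Illustrative example of the shape matched by this GEP case: in
        //   %p = getelementptr [16 x i32], [16 x i32]* %base, i64 0, i64 %i
        // the constant part of the offset accumulates into AddrMode.BaseOffs
        // and %i becomes the scaled component with Scale = 4, the element
        // size; the IR names here are made up for the example.)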
AddrMode = BackupAddrMode; @@ -2879,12 +3296,12 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); - if (!MatchAddr(PromotedOperand, Depth) || - // The total of the new cost is equals to the cost of the created + if (!matchAddr(PromotedOperand, Depth) || + // The total of the new cost is equal to the cost of the created // instructions. - // The total of the old cost is equals to the cost of the extension plus + // The total of the old cost is equal to the cost of the extension plus // what we have saved in the addressing mode. - !IsPromotionProfitable(CreatedInstsCost, + !isPromotionProfitable(CreatedInstsCost, ExtCost + (AddrModeInsts.size() - OldSize), PromotedOperand)) { AddrMode = BackupAddrMode; @@ -2899,12 +3316,12 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, return false; } -/// MatchAddr - If we can, try to add the value of 'Addr' into the current -/// addressing mode. If Addr can't be added to AddrMode this returns false and -/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type -/// or intptr_t for the target. +/// If we can, try to add the value of 'Addr' into the current addressing mode. +/// If Addr can't be added to AddrMode this returns false and leaves AddrMode +/// unmodified. This assumes that Addr is either a pointer type or intptr_t +/// for the target. /// -bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { +bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { // Start a transaction at this point that we will rollback if the matching // fails. TypePromotionTransaction::ConstRestorationPt LastKnownGood = @@ -2929,8 +3346,8 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { // Check to see if it is possible to fold this operation. bool MovedAway = false; - if (MatchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { - // This instruction may have been move away. If so, there is nothing + if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) { + // This instruction may have been moved away. If so, there is nothing // to check here. if (MovedAway) return true; @@ -2938,7 +3355,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { // *profitable* to do so. We use a simple cost model to avoid increasing // register pressure too much. if (I->hasOneUse() || - IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { + isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { AddrModeInsts.push_back(I); return true; } @@ -2950,7 +3367,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { TPT.rollback(LastKnownGood); } } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) { - if (MatchOperationAddr(CE, CE->getOpcode(), Depth)) + if (matchOperationAddr(CE, CE->getOpcode(), Depth)) return true; TPT.rollback(LastKnownGood); } else if (isa<ConstantPointerNull>(Addr)) { @@ -2983,9 +3400,8 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { return false; } -/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified -/// inline asm call are due to memory operands. If so, return true, otherwise -/// return false. +/// Check to see if all uses of OpVal by the specified inline asm call are due +/// to memory operands. If so, return true, otherwise return false. 
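/// For example (made-up asm, illustrative only): in
///   call void asm sideeffect "incl $0", "=*m"(i32* %p)
/// the only use of %p is an indirect memory operand, so this returns true
/// and %p's address computation becomes a candidate for optimizeMemoryInst.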
static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, const TargetMachine &TM) { const Function *F = CI->getParent()->getParent(); @@ -3011,8 +3427,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, return true; } -/// FindAllMemoryUses - Recursively walk all the uses of I until we find a -/// memory use. If we find an obviously non-foldable instruction, return true. +/// Recursively walk all the uses of I until we find a memory use. +/// If we find an obviously non-foldable instruction, return true. /// Add the ultimately found memory instructions to MemoryUses. static bool FindAllMemoryUses( Instruction *I, @@ -3059,11 +3475,11 @@ static bool FindAllMemoryUses( return false; } -/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at -/// the use site that we're folding it into. If so, there is no cost to -/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values -/// that we know are live at the instruction already. -bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, +/// Return true if Val is already known to be live at the use site that we're +/// folding it into. If so, there is no cost to include it in the addressing +/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the +/// instruction already. +bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, Value *KnownLive2) { // If Val is either of the known-live values, we know it is live! if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2) @@ -3085,11 +3501,11 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, return Val->isUsedInBasicBlock(MemoryInst->getParent()); } -/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing -/// mode of the machine to fold the specified instruction into a load or store -/// that ultimately uses it. However, the specified instruction has multiple -/// uses. Given this, it may actually increase register pressure to fold it -/// into the load. For example, consider this code: +/// It is possible for the addressing mode of the machine to fold the specified +/// instruction into a load or store that ultimately uses it. +/// However, the specified instruction has multiple uses. +/// Given this, it may actually increase register pressure to fold it +/// into the load. For example, consider this code: /// /// X = ... /// Y = X+1 @@ -3107,7 +3523,7 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, /// X was live across 'load Z' for other reasons, we actually *would* want to /// fold the addressing mode in the Z case. This would make Y die earlier. bool AddressingModeMatcher:: -IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, +isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) { if (IgnoreProfitability) return true; @@ -3124,9 +3540,9 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. 
-  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+  if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
     BaseReg = nullptr;
-  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+  if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
     ScaledReg = nullptr;
 
   // If folding this instruction (and its subexprs) didn't extend any live
@@ -3171,7 +3587,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
                                      MemoryInst, Result, InsertedInsts,
                                      PromotedInsts, TPT);
     Matcher.IgnoreProfitability = true;
-    bool Success = Matcher.MatchAddr(Address, 0);
+    bool Success = Matcher.matchAddr(Address, 0);
     (void)Success;
     assert(Success && "Couldn't select *anything*?");
     // The match was to check the profitability, the changes made are not
@@ -3192,7 +3608,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
 }
 
 } // end anonymous namespace
 
-/// IsNonLocalValue - Return true if the specified values are defined in a
+/// Return true if the specified values are defined in a
 /// different basic block than BB.
 static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
   if (Instruction *I = dyn_cast<Instruction>(V))
@@ -3200,16 +3616,15 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
   return false;
 }
 
-/// OptimizeMemoryInst - Load and Store Instructions often have
-/// addressing modes that can do significant amounts of computation. As such,
-/// instruction selection will try to get the load or store to do as much
-/// computation as possible for the program. The problem is that isel can only
-/// see within a single block. As such, we sink as much legal addressing mode
-/// stuff into the block as possible.
+/// Load and Store Instructions often have addressing modes that can do
+/// significant amounts of computation. As such, instruction selection will try
+/// to get the load or store to do as much computation as possible for the
+/// program. The problem is that isel can only see within a single block. As
+/// such, we sink as much legal addressing mode work into the block as possible.
 ///
 /// This method is used to optimize both load/store and inline asms with memory
 /// operands.
-bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                                         Type *AccessTy, unsigned AddrSpace) {
   Value *Repl = Addr;
 
@@ -3530,12 +3945,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   if (Repl->use_empty()) {
     // This can cause recursive deletion, which can invalidate our iterator.
     // Use a WeakVH to hold onto it in case this happens.
-    WeakVH IterHandle(CurInstIterator);
+    WeakVH IterHandle(&*CurInstIterator);
     BasicBlock *BB = CurInstIterator->getParent();
 
     RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
 
-    if (IterHandle != CurInstIterator) {
+    if (IterHandle != CurInstIterator.getNodePtrUnchecked()) {
       // If the iterator instruction was recursively deleted, start over at the
       // start of the block.
       CurInstIterator = BB->begin();
@@ -3546,10 +3961,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   return true;
 }
 
-/// OptimizeInlineAsmInst - If there are any memory operands, use
-/// OptimizeMemoryInst to sink their address computing into the block when
-/// possible / profitable.
-bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
+/// If there are any memory operands, use optimizeMemoryInst to sink their
+/// address computing into the block when possible / profitable.
+bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
   bool MadeChange = false;
 
   const TargetRegisterInfo *TRI =
@@ -3566,7 +3980,7 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
         OpInfo.isIndirect) {
       Value *OpVal = CS->getArgOperand(ArgNo++);
-      MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
+      MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
     } else if (OpInfo.Type == InlineAsm::isInput)
       ArgNo++;
   }
@@ -3646,7 +4060,7 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
 /// %add = add nuw i64 %zext, 4
 /// \endcode
 /// Thanks to the promotion, we can match zext(load i32*) to i64.
-bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
+bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
                                     LoadInst *&LI, Instruction *&Inst,
                                     const SmallVectorImpl<Instruction *> &Exts,
                                     unsigned CreatedInstsCost = 0) {
@@ -3696,7 +4110,7 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
   }
   // The promotion is profitable.
   // Check if it exposes an ext(load).
-  (void)ExtLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
+  (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
   if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
              // If we have created a new extension, i.e., now we have two
              // extensions. We must make sure one of them is merged with
@@ -3713,13 +4127,13 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
   return false;
 }
 
-/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
-/// basic block as the load, unless conditions are unfavorable. This allows
-/// SelectionDAG to fold the extend into the load.
+/// Move a zext or sext fed by a load into the same basic block as the load,
+/// unless conditions are unfavorable. This allows SelectionDAG to fold the
+/// extend into the load.
 /// \p I[in/out] the extension may be modified during the process if some
 /// promotions apply.
 ///
-bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
+bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
   // Try to promote a chain of computation if it allows to form
   // an extended load.
   TypePromotionTransaction TPT;
@@ -3730,7 +4144,7 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
   // Look for a load being extended.
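  // (Illustrative only: starting from
  //    bb0:  %ld = load i32, i32* %p
  //    bb1:  %z  = zext i32 %ld to i64
  //  a successful match ends with %z, after any promotions, sitting next to
  //  %ld in bb0 so isel can fold the pair into one extending load.)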
LoadInst *LI = nullptr; Instruction *OldExt = I; - bool HasPromoted = ExtLdPromotion(TPT, LI, I, Exts); + bool HasPromoted = extLdPromotion(TPT, LI, I, Exts); if (!LI || !I) { assert(!HasPromoted && !LI && "If we did not match any load instruction " "the code must remain the same"); @@ -3780,7 +4194,7 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) { return true; } -bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { +bool CodeGenPrepare::optimizeExtUses(Instruction *I) { BasicBlock *DefBB = I->getParent(); // If the result of a {s|z}ext and its source are both live out, rewrite all @@ -3838,7 +4252,8 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { if (!InsertedTrunc) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); + assert(InsertPt != UserBB->end()); + InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt); InsertedInsts.insert(InsertedTrunc); } @@ -3851,9 +4266,202 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { return MadeChange; } -/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be -/// turned into an explicit branch. -static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { +// Find loads whose uses only use some of the loaded value's bits. Add an "and" +// just after the load if the target can fold this into one extload instruction, +// with the hope of eliminating some of the other later "and" instructions using +// the loaded value. "and"s that are made trivially redundant by the insertion +// of the new "and" are removed by this function, while others (e.g. those whose +// path from the load goes through a phi) are left for isel to potentially +// remove. +// +// For example: +// +// b0: +// x = load i32 +// ... +// b1: +// y = and x, 0xff +// z = use y +// +// becomes: +// +// b0: +// x = load i32 +// x' = and x, 0xff +// ... +// b1: +// z = use x' +// +// whereas: +// +// b0: +// x1 = load i32 +// ... +// b1: +// x2 = load i32 +// ... +// b2: +// x = phi x1, x2 +// y = and x, 0xff +// +// becomes (after a call to optimizeLoadExt for each load): +// +// b0: +// x1 = load i32 +// x1' = and x1, 0xff +// ... +// b1: +// x2 = load i32 +// x2' = and x2, 0xff +// ... +// b2: +// x = phi x1', x2' +// y = and x, 0xff +// + +bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { + + if (!Load->isSimple() || + !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy())) + return false; + + // Skip loads we've already transformed or have no reason to transform. + if (Load->hasOneUse()) { + User *LoadUser = *Load->user_begin(); + if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() && + !dyn_cast<PHINode>(LoadUser)) + return false; + } + + // Look at all uses of Load, looking through phis, to determine how many bits + // of the loaded value are needed. + SmallVector<Instruction *, 8> WorkList; + SmallPtrSet<Instruction *, 16> Visited; + SmallVector<Instruction *, 8> AndsToMaybeRemove; + for (auto *U : Load->users()) + WorkList.push_back(cast<Instruction>(U)); + + EVT LoadResultVT = TLI->getValueType(*DL, Load->getType()); + unsigned BitWidth = LoadResultVT.getSizeInBits(); + APInt DemandBits(BitWidth, 0); + APInt WidestAndBits(BitWidth, 0); + + while (!WorkList.empty()) { + Instruction *I = WorkList.back(); + WorkList.pop_back(); + + // Break use-def graph loops. + if (!Visited.insert(I).second) + continue; + + // For a PHI node, push all of its users. 
+ if (auto *Phi = dyn_cast<PHINode>(I)) { + for (auto *U : Phi->users()) + WorkList.push_back(cast<Instruction>(U)); + continue; + } + + switch (I->getOpcode()) { + case llvm::Instruction::And: { + auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!AndC) + return false; + APInt AndBits = AndC->getValue(); + DemandBits |= AndBits; + // Keep track of the widest and mask we see. + if (AndBits.ugt(WidestAndBits)) + WidestAndBits = AndBits; + if (AndBits == WidestAndBits && I->getOperand(0) == Load) + AndsToMaybeRemove.push_back(I); + break; + } + + case llvm::Instruction::Shl: { + auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1)); + if (!ShlC) + return false; + uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1); + auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt); + DemandBits |= ShlDemandBits; + break; + } + + case llvm::Instruction::Trunc: { + EVT TruncVT = TLI->getValueType(*DL, I->getType()); + unsigned TruncBitWidth = TruncVT.getSizeInBits(); + auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth); + DemandBits |= TruncBits; + break; + } + + default: + return false; + } + } + + uint32_t ActiveBits = DemandBits.getActiveBits(); + // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the + // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example, + // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but + // (and (load x) 1) is not matched as a single instruction, rather as a LDR + // followed by an AND. + // TODO: Look into removing this restriction by fixing backends to either + // return false for isLoadExtLegal for i1 or have them select this pattern to + // a single instruction. + // + // Also avoid hoisting if we didn't see any ands with the exact DemandBits + // mask, since these are the only ands that will be removed by isel. + if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) || + WidestAndBits != DemandBits) + return false; + + LLVMContext &Ctx = Load->getType()->getContext(); + Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits); + EVT TruncVT = TLI->getValueType(*DL, TruncTy); + + // Reject cases that won't be matched as extloads. + if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() || + !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) + return false; + + IRBuilder<> Builder(Load->getNextNode()); + auto *NewAnd = dyn_cast<Instruction>( + Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits))); + + // Replace all uses of load with new and (except for the use of load in the + // new and itself). + Load->replaceAllUsesWith(NewAnd); + NewAnd->setOperand(0, Load); + + // Remove any and instructions that are now redundant. + for (auto *And : AndsToMaybeRemove) + // Check that the and mask is the same as the one we decided to put on the + // new and. + if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) { + And->replaceAllUsesWith(NewAnd); + if (&*CurInstIterator == And) + CurInstIterator = std::next(And->getIterator()); + And->eraseFromParent(); + ++NumAndUses; + } + + ++NumAndsAdded; + return true; +} + +/// Check if V (an operand of a select instruction) is an expensive instruction +/// that is only used once. +static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { + auto *I = dyn_cast<Instruction>(V); + // If it's safe to speculatively execute, then it should not have side + // effects; therefore, it's safe to sink and possibly *not* execute. 
+ return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && + TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive; +} + +/// Returns true if a SelectInst should be turned into an explicit branch. +static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI, + SelectInst *SI) { // FIXME: This should use the same heuristics as IfConversion to determine // whether a select is better represented as a branch. This requires that // branch probability metadata is preserved for the select, which is not the @@ -3861,28 +4469,36 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); - // If the branch is predicted right, an out of order CPU can avoid blocking on - // the compare. Emit cmovs on compares with a memory operand as branches to - // avoid stalls on the load from memory. If the compare has more than one use - // there's probably another cmov or setcc around so it's not worth emitting a - // branch. - if (!Cmp) + // If a branch is predictable, an out-of-order CPU can avoid blocking on its + // comparison condition. If the compare has more than one use, there's + // probably another cmov or setcc around, so it's not worth emitting a branch. + if (!Cmp || !Cmp->hasOneUse()) return false; Value *CmpOp0 = Cmp->getOperand(0); Value *CmpOp1 = Cmp->getOperand(1); - // We check that the memory operand has one use to avoid uses of the loaded - // value directly after the compare, making branches unprofitable. - return Cmp->hasOneUse() && - ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || - (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())); + // Emit "cmov on compare with a memory operand" as a branch to avoid stalls + // on a load from memory. But if the load is used more than once, do not + // change the select to a branch because the load is probably needed + // regardless of whether the branch is taken or not. + if ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || + (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())) + return true; + + // If either operand of the select is expensive and only needed on one side + // of the select, we should form a branch. + if (sinkSelectOperand(TTI, SI->getTrueValue()) || + sinkSelectOperand(TTI, SI->getFalseValue())) + return true; + + return false; } /// If we have a SelectInst that will likely profit from branch prediction, /// turn it into a branch. -bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { +bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); // Can we convert the 'select' to CF ? @@ -3902,34 +4518,97 @@ bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { // We have efficient codegen support for the select instruction. // Check if it is profitable to keep this 'select'. 
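    // (Illustrative only: given
    //    %div = fdiv float %a, %b          ; expensive and single-use
    //    %sel = select i1 %cmp, float %div, float 2.0
    //  sinkSelectOperand(%div) holds, so the select is lowered to a branch
    //  and %div is sunk into the taken side instead of being executed
    //  unconditionally.)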
if (!TLI->isPredictableSelectExpensive() || - !isFormingBranchFromSelectProfitable(SI)) + !isFormingBranchFromSelectProfitable(TTI, SI)) return false; } ModifiedDT = true; + // Transform a sequence like this: + // start: + // %cmp = cmp uge i32 %a, %b + // %sel = select i1 %cmp, i32 %c, i32 %d + // + // Into: + // start: + // %cmp = cmp uge i32 %a, %b + // br i1 %cmp, label %select.true, label %select.false + // select.true: + // br label %select.end + // select.false: + // br label %select.end + // select.end: + // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] + // + // In addition, we may sink instructions that produce %c or %d from + // the entry block into the destination(s) of the new branch. + // If the true or false blocks do not contain a sunken instruction, that + // block and its branch may be optimized away. In that case, one side of the + // first branch will point directly to select.end, and the corresponding PHI + // predecessor block will be the start block. + // First, we split the block containing the select into 2 blocks. BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); - BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); - // Create a new block serving as the landing pad for the branch. - BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid", - NextBlock->getParent(), NextBlock); - - // Move the unconditional branch from the block with the select in it into our - // landing pad block. + // Delete the unconditional branch that was just created by the split. StartBlock->getTerminator()->eraseFromParent(); - BranchInst::Create(NextBlock, SmallBlock); + + // These are the new basic blocks for the conditional branch. + // At least one will become an actual new basic block. + BasicBlock *TrueBlock = nullptr; + BasicBlock *FalseBlock = nullptr; + + // Sink expensive instructions into the conditional blocks to avoid executing + // them speculatively. + if (sinkSelectOperand(TTI, SI->getTrueValue())) { + TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", + EndBlock->getParent(), EndBlock); + auto *TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + auto *TrueInst = cast<Instruction>(SI->getTrueValue()); + TrueInst->moveBefore(TrueBranch); + } + if (sinkSelectOperand(TTI, SI->getFalseValue())) { + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); + auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); + auto *FalseInst = cast<Instruction>(SI->getFalseValue()); + FalseInst->moveBefore(FalseBranch); + } + + // If there was nothing to sink, then arbitrarily choose the 'false' side + // for a new input value to the PHI. + if (TrueBlock == FalseBlock) { + assert(TrueBlock == nullptr && + "Unexpected basic block transform while optimizing select"); + + FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + EndBlock->getParent(), EndBlock); + BranchInst::Create(EndBlock, FalseBlock); + } // Insert the real conditional branch based on the original condition. - BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI); + // If we did not create a new block for one of the 'true' or 'false' paths + // of the condition, it means that side of the branch goes to the end block + // directly and the path originates from the start block from the point of + // view of the new PHI. 
+ if (TrueBlock == nullptr) { + BranchInst::Create(EndBlock, FalseBlock, SI->getCondition(), SI); + TrueBlock = StartBlock; + } else if (FalseBlock == nullptr) { + BranchInst::Create(TrueBlock, EndBlock, SI->getCondition(), SI); + FalseBlock = StartBlock; + } else { + BranchInst::Create(TrueBlock, FalseBlock, SI->getCondition(), SI); + } // The select itself is replaced with a PHI Node. - PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin()); + PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); PN->takeName(SI); - PN->addIncoming(SI->getTrueValue(), StartBlock); - PN->addIncoming(SI->getFalseValue(), SmallBlock); + PN->addIncoming(SI->getTrueValue(), TrueBlock); + PN->addIncoming(SI->getFalseValue(), FalseBlock); + SI->replaceAllUsesWith(PN); SI->eraseFromParent(); @@ -3955,7 +4634,7 @@ static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases /// it's often worth sinking a shufflevector splat down to its use so that /// codegen can spot all lanes are identical. -bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { +bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { BasicBlock *DefBB = SVI->getParent(); // Only do this xform if variable vector shifts are particularly expensive. @@ -3987,9 +4666,10 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { if (!InsertedShuffle) { BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - InsertedShuffle = new ShuffleVectorInst(SVI->getOperand(0), - SVI->getOperand(1), - SVI->getOperand(2), "", InsertPt); + assert(InsertPt != UserBB->end()); + InsertedShuffle = + new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), + SVI->getOperand(2), "", &*InsertPt); } UI->replaceUsesOfWith(SVI, InsertedShuffle); @@ -4005,6 +4685,49 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { return MadeChange; } +bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { + if (!TLI || !DL) + return false; + + Value *Cond = SI->getCondition(); + Type *OldType = Cond->getType(); + LLVMContext &Context = Cond->getContext(); + MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType)); + unsigned RegWidth = RegType.getSizeInBits(); + + if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth()) + return false; + + // If the register width is greater than the type width, expand the condition + // of the switch instruction and each case constant to the width of the + // register. By widening the type of the switch condition, subsequent + // comparisons (for case comparisons) will not need to be extended to the + // preferred register width, so we will potentially eliminate N-1 extends, + // where N is the number of cases in the switch. + auto *NewType = Type::getIntNTy(Context, RegWidth); + + // Zero-extend the switch condition and case constants unless the switch + // condition is a function argument that is already being sign-extended. + // In that case, we can avoid an unnecessary mask/extension by sign-extending + // everything instead. 
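  // (Illustrative only, assuming 32-bit general-purpose registers: a switch
  //  on an i8 condition becomes
  //    %wide = zext i8 %cond to i32
  //    switch i32 %wide, label %dflt [ i32 1, label %bb1
  //                                    i32 2, label %bb2 ]
  //  so isel no longer widens %cond separately for each case comparison.)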
+  Instruction::CastOps ExtType = Instruction::ZExt;
+  if (auto *Arg = dyn_cast<Argument>(Cond))
+    if (Arg->hasSExtAttr())
+      ExtType = Instruction::SExt;
+
+  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
+  ExtInst->insertBefore(SI);
+  SI->setCondition(ExtInst);
+  for (SwitchInst::CaseIt Case : SI->cases()) {
+    APInt NarrowConst = Case.getCaseValue()->getValue();
+    APInt WideConst = (ExtType == Instruction::ZExt) ?
+                      NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
+    Case.setValue(ConstantInt::get(Context, WideConst));
+  }
+
+  return true;
+}
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move extractelement transitions downward.
@@ -4138,7 +4861,7 @@ class VectorPromoteHelper {
   /// \brief Generate a constant vector with \p Val with the same
   /// number of elements as the transition.
   /// \p UseSplat defines whether or not \p Val should be replicated
-  /// accross the whole vector.
+  /// across the whole vector.
   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
   /// otherwise we generate a vector with as many undef as possible:
   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
@@ -4320,7 +5043,7 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
 /// Some targets can do store(extractelement) with one instruction.
 /// Try to push the extractelement towards the stores when the target
 /// has this feature and this is profitable.
-bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
   unsigned CombineCost = UINT_MAX;
   if (DisableStoreExtract || !TLI ||
       (!StressStoreExtract &&
@@ -4372,7 +5095,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
   return false;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
if (InsertedInsts.count(I))
@@ -4413,8 +5136,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
         TargetLowering::TypeExpandInteger) {
       return SinkCast(CI);
     } else {
-      bool MadeChange = MoveExtToFormExtLoad(I);
-      return MadeChange | OptimizeExtUses(I);
+      bool MadeChange = moveExtToFormExtLoad(I);
+      return MadeChange | optimizeExtUses(I);
     }
   }
   return false;
@@ -4425,17 +5148,21 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
     return OptimizeCmpExpression(CI);
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    stripInvariantGroupMetadata(*LI);
     if (TLI) {
+      bool Modified = optimizeLoadExt(LI);
       unsigned AS = LI->getPointerAddressSpace();
-      return OptimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+      return Modified;
     }
     return false;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    stripInvariantGroupMetadata(*SI);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
-      return OptimizeMemoryInst(I, SI->getOperand(1),
+      return optimizeMemoryInst(I, SI->getOperand(1),
                                 SI->getOperand(0)->getType(), AS);
     }
     return false;
@@ -4460,23 +5187,26 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC, ModifiedDT);
+      optimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI, ModifiedDT);
+    return optimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
-    return OptimizeSelectInst(SI);
+    return optimizeSelectInst(SI);
 
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
-    return OptimizeShuffleVectorInst(SVI);
+    return optimizeShuffleVectorInst(SVI);
+
+  if (auto *Switch = dyn_cast<SwitchInst>(I))
+    return optimizeSwitchInst(Switch);
 
   if (isa<ExtractElementInst>(I))
-    return OptimizeExtractElementInst(I);
+    return optimizeExtractElementInst(I);
 
   return false;
 }
@@ -4484,17 +5214,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
   while (CurInstIterator != BB.end()) {
-    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
     if (ModifiedDT)
       return true;
   }
-  MadeChange |= DupRetToEnableTailCallOpts(&BB);
+  MadeChange |= dupRetToEnableTailCallOpts(&BB);
 
   return MadeChange;
 }
@@ -4502,12 +5232,12 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
 // llvm.dbg.value is far away from the value then iSel may not be able to
 // handle it properly. iSel will drop llvm.dbg.value if it can not
 // find a node corresponding to the value.
-bool CodeGenPrepare::PlaceDbgValues(Function &F) {
+bool CodeGenPrepare::placeDbgValues(Function &F) {
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
     Instruction *PrevNonDbgInst = nullptr;
     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
-      Instruction *Insn = BI++;
+      Instruction *Insn = &*BI++;
       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
       // Leave dbg.values that refer to an alloca alone.
These
      // intrinsics describe the address of a variable (= the alloca)
@@ -4521,10 +5251,14 @@ bool CodeGenPrepare::PlaceDbgValues(Function &F) {
       Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
 
       if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
+        // If VI is a phi in a block with an EHPad terminator, we can't insert
+        // after it.
+        if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
+          continue;
         DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
         DVI->removeFromParent();
         if (isa<PHINode>(VI))
-          DVI->insertBefore(VI->getParent()->getFirstInsertionPt());
+          DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
         else
           DVI->insertAfter(VI);
         MadeChange = true;
@@ -4548,7 +5282,7 @@ bool CodeGenPrepare::sinkAndCmp(Function &F) {
     return false;
   bool MadeChange = false;
   for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
-    BasicBlock *BB = I++;
+    BasicBlock *BB = &*I++;
 
     // Does this BB end with the following?
     //   %andVal = and %val, #single-bit-set
@@ -4671,6 +5405,10 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
       continue;
 
+    auto *Br1 = cast<BranchInst>(BB.getTerminator());
+    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
+      continue;
+
     unsigned Opc;
     Value *Cond1, *Cond2;
     if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
@@ -4697,7 +5435,6 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
     // Update original basic block by using the first condition directly by the
     // branch instruction and removing the no longer needed and/or instruction.
-    auto *Br1 = cast<BranchInst>(BB.getTerminator());
     Br1->setCondition(Cond1);
     LogicOp->eraseFromParent();
 
@@ -4828,3 +5565,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
   }
   return MadeChange;
 }
+
+void CodeGenPrepare::stripInvariantGroupMetadata(Instruction &I) {
+  if (auto *InvariantMD = I.getMetadata(LLVMContext::MD_invariant_group))
+    I.dropUnknownNonDebugMetadata(InvariantMD->getMetadataID());
+}
diff --git a/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp b/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp
index 28c97ba..ff7c0d5 100644
--- a/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp
+++ b/contrib/llvm/lib/CodeGen/CoreCLRGC.cpp
@@ -38,9 +38,9 @@ public:
     UsesMetadata = false;
     CustomRoots = false;
   }
-  Optional<bool> isGCManagedPointer(const Value *V) const override {
+  Optional<bool> isGCManagedPointer(const Type *Ty) const override {
     // Method is only valid on pointer typed values.
-    PointerType *PT = cast<PointerType>(V->getType());
+    const PointerType *PT = cast<PointerType>(Ty);
     // We pick addrspace(1) as our GC managed heap.
     return (1 == PT->getAddressSpace());
   }
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index dba280f..c924ba3 100644
--- a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -52,14 +52,13 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   // Clear "do not change" set.
   KeepRegs.reset();
 
-  bool IsReturnBlock = (BBSize != 0 && BB->back().isReturn());
+  bool IsReturnBlock = BB->isReturnBlock();
 
   // Examine the live-in regs of all successors.
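  // (The loop below reflects the new MachineBasicBlock::liveins() interface,
  //  which yields RegisterMaskPair entries rather than bare register numbers,
  //  hence the explicit LI.PhysReg reads in the updated code.)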
for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { + for (const auto &LI : (*SI)->liveins()) { + for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); KillIndices[Reg] = BBSize; diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp index 0a188c0..af6b6a3 100644 --- a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp +++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp @@ -31,10 +31,39 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; -DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, const int (*SIT)[2], +// -------------------------------------------------------------------- +// Definitions shared between DFAPacketizer.cpp and DFAPacketizerEmitter.cpp + +namespace { + DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) { + return (Inp << DFA_MAX_RESOURCES) | FuncUnits; + } + + /// Return the DFAInput for an instruction class input vector. + /// This function is used in both DFAPacketizer.cpp and in + /// DFAPacketizerEmitter.cpp. + DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) { + DFAInput InsnInput = 0; + assert ((InsnClass.size() <= DFA_MAX_RESTERMS) && + "Exceeded maximum number of DFA terms"); + for (auto U : InsnClass) + InsnInput = addDFAFuncUnits(InsnInput, U); + return InsnInput; + } +} +// -------------------------------------------------------------------- + +DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, + const DFAStateInput (*SIT)[2], const unsigned *SET): InstrItins(I), CurrentState(0), DFAStateInputTable(SIT), - DFAStateEntryTable(SET) {} + DFAStateEntryTable(SET) { + // Make sure DFA types are large enough for the number of terms & resources. + assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAInput)) + && "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput"); + assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)) + && "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput"); +} // @@ -60,26 +89,42 @@ void DFAPacketizer::ReadTable(unsigned int state) { DFAStateInputTable[i][1]; } +// +// getInsnInput - Return the DFAInput for an instruction class. +// +DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) { + // Note: this logic must match that in DFAPacketizerDefs.h for input vectors. + DFAInput InsnInput = 0; + unsigned i = 0; + for (const InstrStage *IS = InstrItins->beginStage(InsnClass), + *IE = InstrItins->endStage(InsnClass); IS != IE; ++IS, ++i) { + InsnInput = addDFAFuncUnits(InsnInput, IS->getUnits()); + assert ((i < DFA_MAX_RESTERMS) && "Exceeded maximum number of DFA inputs"); + } + return InsnInput; +} + +// getInsnInput - Return the DFAInput for an instruction class input vector. +DFAInput DFAPacketizer::getInsnInput(const std::vector<unsigned> &InsnClass) { + return getDFAInsnInput(InsnClass); +} // canReserveResources - Check if the resources occupied by a MCInstrDesc // are available in the current state. 
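// (Illustrative only: with DFA_MAX_RESOURCES bits per term, an instruction
//  class whose two stages use functional-unit masks 0x3 and 0x8 packs into
//    addDFAFuncUnits(addDFAFuncUnits(0, 0x3), 0x8)
//      == (0x3 << DFA_MAX_RESOURCES) | 0x8
//  so each stage occupies its own bit-field of the composite DFAInput key.)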
bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) {
   unsigned InsnClass = MID->getSchedClass();
-  const llvm::InstrStage *IS = InstrItins->beginStage(InsnClass);
-  unsigned FuncUnits = IS->getUnits();
-  UnsignPair StateTrans = UnsignPair(CurrentState, FuncUnits);
+  DFAInput InsnInput = getInsnInput(InsnClass);
+  UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
   ReadTable(CurrentState);
   return (CachedTable.count(StateTrans) != 0);
 }
 
-
 // reserveResources - Reserve the resources occupied by a MCInstrDesc and
 // change the current state to reflect that change.
 void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
   unsigned InsnClass = MID->getSchedClass();
-  const llvm::InstrStage *IS = InstrItins->beginStage(InsnClass);
-  unsigned FuncUnits = IS->getUnits();
-  UnsignPair StateTrans = UnsignPair(CurrentState, FuncUnits);
+  DFAInput InsnInput = getInsnInput(InsnClass);
+  UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
   ReadTable(CurrentState);
   assert(CachedTable.count(StateTrans) != 0);
   CurrentState = CachedTable[StateTrans];
@@ -104,32 +149,35 @@ namespace llvm {
 // DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides
 // Schedule method to build the dependence graph.
 class DefaultVLIWScheduler : public ScheduleDAGInstrs {
+private:
+  AliasAnalysis *AA;
 public:
   DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
-                       bool IsPostRA);
+                       AliasAnalysis *AA);
   // Schedule - Actual scheduling work.
   void schedule() override;
 };
 }
 
 DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
-                                           MachineLoopInfo &MLI, bool IsPostRA)
-    : ScheduleDAGInstrs(MF, &MLI, IsPostRA) {
+                                           MachineLoopInfo &MLI,
+                                           AliasAnalysis *AA)
+    : ScheduleDAGInstrs(MF, &MLI), AA(AA) {
   CanHandleTerminators = true;
 }
 
 void DefaultVLIWScheduler::schedule() {
   // Build the scheduling graph.
-  buildSchedGraph(nullptr);
+  buildSchedGraph(AA);
 }
 
 // VLIWPacketizerList Ctor
 VLIWPacketizerList::VLIWPacketizerList(MachineFunction &MF,
-                                       MachineLoopInfo &MLI, bool IsPostRA)
-    : MF(MF) {
+                                       MachineLoopInfo &MLI, AliasAnalysis *AA)
+    : MF(MF), AA(AA) {
   TII = MF.getSubtarget().getInstrInfo();
   ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget());
-  VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, IsPostRA);
+  VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, AA);
 }
 
 // VLIWPacketizerList Dtor
@@ -147,7 +195,7 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, MachineInstr *MI) {
   if (CurrentPacketMIs.size() > 1) {
     MachineInstr *MIFirst = CurrentPacketMIs.front();
-    finalizeBundle(*MBB, MIFirst, MI);
+    finalizeBundle(*MBB, MIFirst->getIterator(), MI->getIterator());
   }
   CurrentPacketMIs.clear();
   ResourceTracker->clearResources();
@@ -191,7 +239,7 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
       // Ask DFA if machine resource is available for MI.
       bool ResourceAvail = ResourceTracker->canReserveResources(MI);
 
-      if (ResourceAvail) {
+      if (ResourceAvail && shouldAddToPacket(MI)) {
         // Dependency check for MI with instructions in CurrentPacketMIs.
         for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
              VE = CurrentPacketMIs.end(); VI != VE; ++VI) {
@@ -210,7 +258,8 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
         } // !isLegalToPacketizeTogether.
       } // For all instructions in CurrentPacketMIs.
     } else {
-      // End the packet if resource is not available.
+      // End the packet if resource is not available, or if the instruction
+      // should not be added to the current packet.
endPacket(MBB, MI);
     }
diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 941129b..b11b497 100644
--- a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -101,26 +101,22 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
   // Loop over all instructions in all blocks, from bottom to top, so that it's
   // more likely that chains of dependent but ultimately dead instructions will
   // be cleaned up.
-  for (MachineFunction::reverse_iterator I = MF.rbegin(), E = MF.rend();
-       I != E; ++I) {
-    MachineBasicBlock *MBB = &*I;
-
+  for (MachineBasicBlock &MBB : make_range(MF.rbegin(), MF.rend())) {
     // Start out assuming that reserved registers are live out of this block.
     LivePhysRegs = MRI->getReservedRegs();
 
     // Add live-ins from successors to LivePhysRegs. Normally, physregs are not
     // live across blocks, but some targets (x86) can have flags live out of a
     // block.
-    for (MachineBasicBlock::succ_iterator S = MBB->succ_begin(),
-           E = MBB->succ_end(); S != E; S++)
-      for (MachineBasicBlock::livein_iterator LI = (*S)->livein_begin();
-           LI != (*S)->livein_end(); LI++)
-        LivePhysRegs.set(*LI);
+    for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+           E = MBB.succ_end(); S != E; S++)
+      for (const auto &LI : (*S)->liveins())
+        LivePhysRegs.set(LI.PhysReg);
 
     // Now scan the instructions and delete dead ones, tracking physreg
     // liveness as we go.
-    for (MachineBasicBlock::reverse_iterator MII = MBB->rbegin(),
-           MIE = MBB->rend(); MII != MIE; ) {
+    for (MachineBasicBlock::reverse_iterator MII = MBB.rbegin(),
+           MIE = MBB.rend(); MII != MIE; ) {
       MachineInstr *MI = &*MII;
 
       // If the instruction is dead, delete it!
@@ -132,7 +128,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
         MI->eraseFromParentAndMarkDBGValuesForRemoval();
         AnyChanges = true;
         ++NumDeletes;
-        MIE = MBB->rend();
+        MIE = MBB.rend();
         // MII is now pointing to the next instruction to process,
         // so don't increment it.
         continue;
diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index e019dfb..eae78a9 100644
--- a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -192,9 +192,9 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
   if (Resumes.empty())
     return false;
 
-  // Check the personality, don't do anything if it's for MSVC.
+  // Check the personality, don't do anything if it's funclet-based.
   EHPersonality Pers = classifyEHPersonality(Fn.getPersonalityFn());
-  if (isMSVCEHPersonality(Pers))
+  if (isFuncletEHPersonality(Pers))
     return false;
 
   LLVMContext &Ctx = Fn.getContext();
diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
index fbc4d97..f3536d7 100644
--- a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -538,11 +538,11 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
   // Fix up the CFG, temporarily leave Head without any successors.
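  // (The boolean added to removeSuccessor below is NormalizeSuccProbs;
  //  passing true renormalizes the branch probabilities of the remaining
  //  successor edges once the edge is removed.)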
Head->removeSuccessor(TBB); - Head->removeSuccessor(FBB); + Head->removeSuccessor(FBB, true); if (TBB != Tail) - TBB->removeSuccessor(Tail); + TBB->removeSuccessor(Tail, true); if (FBB != Tail) - FBB->removeSuccessor(Tail); + FBB->removeSuccessor(Tail, true); // Fix up Head's terminators. // It should become a single branch or a fallthrough. diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp index 5b09cf1..c550008 100644 --- a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp +++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp @@ -375,9 +375,8 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // This is the entry block. if (MBB->pred_empty()) { - for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(), - e = MBB->livein_end(); i != e; ++i) { - for (int rx : regIndices(*i)) { + for (const auto &LI : MBB->liveins()) { + for (int rx : regIndices(LI.PhysReg)) { // Treat function live-ins as if they were defined just before the first // instruction. Usually, function arguments are set up immediately // before the call. @@ -559,12 +558,11 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) { MachineInstr *UndefMI = UndefReads.back().first; unsigned OpIdx = UndefReads.back().second; - for (MachineBasicBlock::reverse_iterator I = MBB->rbegin(), E = MBB->rend(); - I != E; ++I) { + for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { // Update liveness, including the current instruction's defs. - LiveRegSet.stepBackward(*I); + LiveRegSet.stepBackward(I); - if (UndefMI == &*I) { + if (UndefMI == &I) { if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) TII->breakPartialRegDependency(UndefMI, OpIdx, TRI); @@ -733,12 +731,13 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { // If no relevant registers are used in the function, we can skip it // completely. bool anyregs = false; - for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end(); - I != E; ++I) - if (MF->getRegInfo().isPhysRegUsed(*I)) { + const MachineRegisterInfo &MRI = mf.getRegInfo(); + for (unsigned Reg : *RC) { + if (MRI.isPhysRegUsed(Reg)) { anyregs = true; break; } + } if (!anyregs) return false; // Initialize the AliasMap on the first use. @@ -752,7 +751,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { AliasMap[*AI].push_back(i); } - MachineBasicBlock *Entry = MF->begin(); + MachineBasicBlock *Entry = &*MF->begin(); ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry); SmallVector<MachineBasicBlock*, 16> Loops; for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator @@ -761,22 +760,19 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { enterBasicBlock(MBB); if (SeenUnknownBackEdge) Loops.push_back(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) - visitInstr(I); + for (MachineInstr &MI : *MBB) + visitInstr(&MI); processUndefReads(MBB); leaveBasicBlock(MBB); } // Visit all the loop blocks again in order to merge DomainValues from // back-edges. 
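The reverse walk in processUndefReads above, with its LiveRegSet.stepBackward(I) per instruction, is the standard backward liveness transfer. Stepping a live set over an instruction, from below it to above it, is just the gen/kill rule; a toy version over plain register IDs (ignoring register aliasing, which the real LivePhysRegs resolves through the target register info):

#include <set>
#include <vector>

struct ToyInstr {
  std::vector<unsigned> Defs; // registers written by the instruction
  std::vector<unsigned> Uses; // registers read by the instruction
};

// Compute live-before from live-after: live = (live \ defs) U uses.
void stepBackward(std::set<unsigned> &Live, const ToyInstr &I) {
  for (unsigned R : I.Defs)
    Live.erase(R); // a def ends liveness above this point
  for (unsigned R : I.Uses)
    Live.insert(R); // a use requires the register to be live above it
}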
- for (unsigned i = 0, e = Loops.size(); i != e; ++i) { - MachineBasicBlock *MBB = Loops[i]; + for (MachineBasicBlock *MBB : Loops) { enterBasicBlock(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) - if (!I->isDebugValue()) - processDefs(I, false); + for (MachineInstr &MI : *MBB) + if (!MI.isDebugValue()) + processDefs(&MI, false); processUndefReads(MBB); leaveBasicBlock(MBB); } diff --git a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp index 55e809e..90ddac9 100644 --- a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp +++ b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp @@ -50,7 +50,7 @@ bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { // Iterate through each instruction in the function, looking for pseudos. for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; + MachineBasicBlock *MBB = &*I; for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end(); MBBI != MBBE; ) { MachineInstr *MI = MBBI++; @@ -63,7 +63,7 @@ bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { // The expansion may involve new basic blocks. if (NewMBB != MBB) { MBB = NewMBB; - I = NewMBB; + I = NewMBB->getIterator(); MBBI = NewMBB->begin(); MBBE = NewMBB->end(); } diff --git a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp new file mode 100644 index 0000000..8b2f505 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp @@ -0,0 +1,55 @@ +//===-- FuncletLayout.cpp - Contiguously lay out funclets -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements basic block placement transformations which result in +// funclets being contiguous. +// +//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +using namespace llvm; + +#define DEBUG_TYPE "funclet-layout" + +namespace { +class FuncletLayout : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + FuncletLayout() : MachineFunctionPass(ID) { + initializeFuncletLayoutPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; +}; +} + +char FuncletLayout::ID = 0; +char &llvm::FuncletLayoutID = FuncletLayout::ID; +INITIALIZE_PASS(FuncletLayout, "funclet-layout", + "Contiguously Lay Out Funclets", false, false) + +bool FuncletLayout::runOnMachineFunction(MachineFunction &F) { + DenseMap<const MachineBasicBlock *, int> FuncletMembership = + getFuncletMembership(F); + if (FuncletMembership.empty()) + return false; + + F.sort([&](MachineBasicBlock &X, MachineBasicBlock &Y) { + auto FuncletX = FuncletMembership.find(&X); + auto FuncletY = FuncletMembership.find(&Y); + assert(FuncletX != FuncletMembership.end()); + assert(FuncletY != FuncletMembership.end()); + return FuncletX->second < FuncletY->second; + }); + + // Conservatively assume we changed something. 
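The F.sort call above is the whole trick of the new FuncletLayout pass: blocks are stably sorted by a precomputed funclet number, so blocks of the same funclet become contiguous while keeping their original relative order. The comparator pattern in isolation, with a toy block type and an assumed rank map:

#include <algorithm>
#include <cassert>
#include <map>
#include <vector>

struct ToyBlock { int ID; };

void groupByFunclet(std::vector<ToyBlock *> &Blocks,
                    const std::map<const ToyBlock *, int> &FuncletNum) {
  std::stable_sort(Blocks.begin(), Blocks.end(),
                   [&](const ToyBlock *X, const ToyBlock *Y) {
                     auto FX = FuncletNum.find(X);
                     auto FY = FuncletNum.find(Y);
                     // As in the pass, every block must have a funclet number.
                     assert(FX != FuncletNum.end() && FY != FuncletNum.end());
                     return FX->second < FY->second;
                   });
}

The stable sort matters: within one funclet, the existing layout (and thus fallthrough structure) is preserved.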
+ return true; +} diff --git a/contrib/llvm/lib/CodeGen/GCRootLowering.cpp b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp index d8edd7e..484d317 100644 --- a/contrib/llvm/lib/CodeGen/GCRootLowering.cpp +++ b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp @@ -158,7 +158,7 @@ static bool InsertRootInitializers(Function &F, AllocaInst **Roots, // Search for initializers in the initial BB. SmallPtrSet<AllocaInst *, 16> InitedRoots; - for (; !CouldBecomeSafePoint(IP); ++IP) + for (; !CouldBecomeSafePoint(&*IP); ++IP) if (StoreInst *SI = dyn_cast<StoreInst>(IP)) if (AllocaInst *AI = dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts())) @@ -320,7 +320,9 @@ void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) { if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) { RI = FI->removeStackRoot(RI); } else { - RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num); + unsigned FrameReg; // FIXME: surely GCRoot ought to store the + // register that the offset is from? + RI->StackOffset = TFI->getFrameIndexReference(MF, RI->Num, FrameReg); ++RI; } } diff --git a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp index 6f9e839..dd9a840 100644 --- a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp @@ -108,10 +108,9 @@ EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, // FIXME: this could be a transitional option, and we probably need to remove // it if only we are sure this optimization could always benefit all targets. -static cl::opt<bool> +static cl::opt<cl::boolOrDefault> EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden, - cl::desc("Enable global merge pass on external linkage"), - cl::init(false)); + cl::desc("Enable global merge pass on external linkage")); STATISTIC(NumMerged, "Number of globals merged"); namespace { @@ -129,11 +128,14 @@ namespace { /// FIXME: This could learn about optsize, and be used in the cost model. bool OnlyOptimizeForSize; + /// Whether we should merge global variables that have external linkage. + bool MergeExternalGlobals; + bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const; /// \brief Merge everything in \p Globals for which the corresponding bit /// in \p GlobalSet is set. - bool doMerge(SmallVectorImpl<GlobalVariable *> &Globals, + bool doMerge(const SmallVectorImpl<GlobalVariable *> &Globals, const BitVector &GlobalSet, Module &M, bool isConst, unsigned AddrSpace) const; @@ -158,9 +160,11 @@ namespace { static char ID; // Pass identification, replacement for typeid. 
explicit GlobalMerge(const TargetMachine *TM = nullptr, unsigned MaximalOffset = 0, - bool OnlyOptimizeForSize = false) + bool OnlyOptimizeForSize = false, + bool MergeExternalGlobals = false) : FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset), - OnlyOptimizeForSize(OnlyOptimizeForSize) { + OnlyOptimizeForSize(OnlyOptimizeForSize), + MergeExternalGlobals(MergeExternalGlobals) { initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -189,14 +193,11 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { auto &DL = M.getDataLayout(); // FIXME: Find better heuristics - std::stable_sort( - Globals.begin(), Globals.end(), - [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) { - Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); - Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); - - return (DL.getTypeAllocSize(Ty1) < DL.getTypeAllocSize(Ty2)); - }); + std::stable_sort(Globals.begin(), Globals.end(), + [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) { + return DL.getTypeAllocSize(GV1->getValueType()) < + DL.getTypeAllocSize(GV2->getValueType()); + }); // If we want to just blindly group all globals together, do so. if (!GlobalMergeGroupByUse) { @@ -207,7 +208,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, // If we want to be smarter, look at all uses of each global, to try to // discover all sets of globals used together, and how many times each of - // these sets occured. + // these sets occurred. // // Keep this reasonably efficient, by having an append-only list of all sets // discovered so far (UsedGlobalSet), and mapping each "together-ness" unit of @@ -302,8 +303,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Function *ParentFn = I->getParent()->getParent(); // If we're only optimizing for size, ignore non-minsize functions. - if (OnlyOptimizeForSize && - !ParentFn->hasFnAttribute(Attribute::MinSize)) + if (OnlyOptimizeForSize && !ParentFn->optForMinSize()) continue; size_t UGSIdx = GlobalUsesByFunction[ParentFn]; @@ -406,15 +406,14 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, return Changed; } -bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable *> &Globals, +bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals, const BitVector &GlobalSet, Module &M, bool isConst, unsigned AddrSpace) const { + assert(Globals.size() > 1); Type *Int32Ty = Type::getInt32Ty(M.getContext()); auto &DL = M.getDataLayout(); - assert(Globals.size() > 1); - DEBUG(dbgs() << " Trying to merge set, starts with #" << GlobalSet.find_first() << "\n"); @@ -425,58 +424,44 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable *> &Globals, std::vector<Type*> Tys; std::vector<Constant*> Inits; - bool HasExternal = false; - GlobalVariable *TheFirstExternal = 0; for (j = i; j != -1; j = GlobalSet.find_next(j)) { - Type *Ty = Globals[j]->getType()->getElementType(); + Type *Ty = Globals[j]->getValueType(); MergedSize += DL.getTypeAllocSize(Ty); if (MergedSize > MaxOffset) { break; } Tys.push_back(Ty); Inits.push_back(Globals[j]->getInitializer()); - - if (Globals[j]->hasExternalLinkage() && !HasExternal) { - HasExternal = true; - TheFirstExternal = Globals[j]; - } } - // If merged variables doesn't have external linkage, we needn't to expose - // the symbol after merging. - GlobalValue::LinkageTypes Linkage = HasExternal - ? 
GlobalValue::ExternalLinkage - : GlobalValue::InternalLinkage; - StructType *MergedTy = StructType::get(M.getContext(), Tys); Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); - // If merged variables have external linkage, we use symbol name of the - // first variable merged as the suffix of global symbol name. This would - // be able to avoid the link-time naming conflict for globalm symbols. GlobalVariable *MergedGV = new GlobalVariable( - M, MergedTy, isConst, Linkage, MergedInit, - HasExternal ? "_MergedGlobals_" + TheFirstExternal->getName() - : "_MergedGlobals", - nullptr, GlobalVariable::NotThreadLocal, AddrSpace); + M, MergedTy, isConst, GlobalValue::PrivateLinkage, MergedInit, + "_MergedGlobals", nullptr, GlobalVariable::NotThreadLocal, AddrSpace); - for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k)) { + for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) { GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage(); std::string Name = Globals[k]->getName(); Constant *Idx[2] = { ConstantInt::get(Int32Ty, 0), - ConstantInt::get(Int32Ty, idx++) + ConstantInt::get(Int32Ty, idx), }; Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, Idx); Globals[k]->replaceAllUsesWith(GEP); Globals[k]->eraseFromParent(); - if (Linkage != GlobalValue::InternalLinkage) { - // Generate a new alias... - auto *PTy = cast<PointerType>(GEP->getType()); - GlobalAlias::create(PTy, Linkage, Name, GEP, &M); + // When the linkage is not internal we must emit an alias for the original + // variable name as it may be accessed from another object. On non-Mach-O + // we can also emit an alias for internal linkage as it's safe to do so. + // It's not safe on Mach-O as the alias (and thus the portion of the + // MergedGlobals variable) may be dead stripped at link time. + if (Linkage != GlobalValue::InternalLinkage || + !TM->getTargetTriple().isOSBinFormatMachO()) { + GlobalAlias::create(Tys[idx], AddrSpace, Linkage, Name, GEP, &M); } NumMerged++; @@ -535,61 +520,57 @@ bool GlobalMerge::doInitialization(Module &M) { setMustKeepGlobalVariables(M); // Grab all non-const globals. - for (Module::global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { + for (auto &GV : M.globals()) { // Merge is safe for "normal" internal or external globals only - if (I->isDeclaration() || I->isThreadLocal() || I->hasSection()) + if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection()) continue; - if (!(EnableGlobalMergeOnExternal && I->hasExternalLinkage()) && - !I->hasInternalLinkage()) + if (!(MergeExternalGlobals && GV.hasExternalLinkage()) && + !GV.hasInternalLinkage()) continue; - PointerType *PT = dyn_cast<PointerType>(I->getType()); + PointerType *PT = dyn_cast<PointerType>(GV.getType()); assert(PT && "Global variable is not a pointer!"); unsigned AddressSpace = PT->getAddressSpace(); // Ignore fancy-aligned globals for now. - unsigned Alignment = DL.getPreferredAlignment(I); - Type *Ty = I->getType()->getElementType(); + unsigned Alignment = DL.getPreferredAlignment(&GV); + Type *Ty = GV.getValueType(); if (Alignment > DL.getABITypeAlignment(Ty)) continue; // Ignore all 'special' globals. 
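To make the doMerge rewrite above concrete: each merged global becomes field k of the private _MergedGlobals struct, reachable through a constant inbounds GEP with indices {0, k}, and non-internal names survive as aliases onto those GEPs. A condensed two-global version using the same 3.8-era API calls that appear in the hunk (the alias and Mach-O handling, and all error checking, elided):

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static GlobalVariable *mergeTwo(Module &M, GlobalVariable *A, GlobalVariable *B) {
  Type *Int32Ty = Type::getInt32Ty(M.getContext());
  StructType *MergedTy =
      StructType::get(M.getContext(), {A->getValueType(), B->getValueType()});
  Constant *MergedInit =
      ConstantStruct::get(MergedTy, {A->getInitializer(), B->getInitializer()});
  auto *MergedGV = new GlobalVariable(M, MergedTy, /*isConstant=*/false,
                                      GlobalValue::PrivateLinkage, MergedInit,
                                      "_MergedGlobals");
  GlobalVariable *Parts[] = {A, B};
  for (unsigned Idx = 0; Idx != 2; ++Idx) {
    Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0),
                          ConstantInt::get(Int32Ty, Idx)};
    Constant *GEP =
        ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, GEPIdx);
    Parts[Idx]->replaceAllUsesWith(GEP); // every use now indexes the struct
    Parts[Idx]->eraseFromParent();
  }
  return MergedGV;
}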
- if (I->getName().startswith("llvm.") || - I->getName().startswith(".llvm.")) + if (GV.getName().startswith("llvm.") || + GV.getName().startswith(".llvm.")) continue; // Ignore all "required" globals: - if (isMustKeepGlobalVariable(I)) + if (isMustKeepGlobalVariable(&GV)) continue; if (DL.getTypeAllocSize(Ty) < MaxOffset) { - if (TargetLoweringObjectFile::getKindForGlobal(I, *TM).isBSSLocal()) - BSSGlobals[AddressSpace].push_back(I); - else if (I->isConstant()) - ConstGlobals[AddressSpace].push_back(I); + if (TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal()) + BSSGlobals[AddressSpace].push_back(&GV); + else if (GV.isConstant()) + ConstGlobals[AddressSpace].push_back(&GV); else - Globals[AddressSpace].push_back(I); + Globals[AddressSpace].push_back(&GV); } } - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = Globals.begin(), E = Globals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); + for (auto &P : Globals) + if (P.second.size() > 1) + Changed |= doMerge(P.second, M, false, P.first); - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, false, I->first); + for (auto &P : BSSGlobals) + if (P.second.size() > 1) + Changed |= doMerge(P.second, M, false, P.first); if (EnableGlobalMergeOnConst) - for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator - I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I) - if (I->second.size() > 1) - Changed |= doMerge(I->second, M, true, I->first); + for (auto &P : ConstGlobals) + if (P.second.size() > 1) + Changed |= doMerge(P.second, M, true, P.first); return Changed; } @@ -604,6 +585,9 @@ bool GlobalMerge::doFinalization(Module &M) { } Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset, - bool OnlyOptimizeForSize) { - return new GlobalMerge(TM, Offset, OnlyOptimizeForSize); + bool OnlyOptimizeForSize, + bool MergeExternalByDefault) { + bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ? 
+ MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE); + return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal); } diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp index ee0532b..c38c9d2 100644 --- a/contrib/llvm/lib/CodeGen/IfConversion.cpp +++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp @@ -32,6 +32,7 @@ #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> using namespace llvm; @@ -190,10 +191,10 @@ namespace { private: bool ReverseBranchCondition(BBInfo &BBI); bool ValidSimple(BBInfo &TrueBBI, unsigned &Dups, - const BranchProbability &Prediction) const; + BranchProbability Prediction) const; bool ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, bool FalseBranch, unsigned &Dups, - const BranchProbability &Prediction) const; + BranchProbability Prediction) const; bool ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI, unsigned &Dups1, unsigned &Dups2) const; void ScanInstructions(BBInfo &BBI); @@ -218,7 +219,7 @@ namespace { bool MeetIfcvtSizeLimit(MachineBasicBlock &BB, unsigned Cycle, unsigned Extra, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { return Cycle > 0 && TII->isProfitableToIfCvt(BB, Cycle, Extra, Prediction); } @@ -227,7 +228,7 @@ namespace { unsigned TCycle, unsigned TExtra, MachineBasicBlock &FBB, unsigned FCycle, unsigned FExtra, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { return TCycle > 0 && FCycle > 0 && TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra, Prediction); @@ -462,11 +463,11 @@ bool IfConverter::ReverseBranchCondition(BBInfo &BBI) { /// getNextBlock - Returns the next block in the function blocks ordering. If /// it is the end, returns NULL. static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) { - MachineFunction::iterator I = BB; + MachineFunction::iterator I = BB->getIterator(); MachineFunction::iterator E = BB->getParent()->end(); if (++I == E) return nullptr; - return I; + return &*I; } /// ValidSimple - Returns true if the 'true' block (along with its @@ -474,7 +475,7 @@ static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) { /// number of instructions that the ifcvt would need to duplicate if performed /// in Dups. bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { Dups = 0; if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone) return false; @@ -501,7 +502,7 @@ bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups, /// if performed in 'Dups'. bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, bool FalseBranch, unsigned &Dups, - const BranchProbability &Prediction) const { + BranchProbability Prediction) const { Dups = 0; if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone) return false; @@ -530,10 +531,10 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, MachineBasicBlock *TExit = FalseBranch ? TrueBBI.FalseBB : TrueBBI.TrueBB; if (!TExit && blockAlwaysFallThrough(TrueBBI)) { - MachineFunction::iterator I = TrueBBI.BB; + MachineFunction::iterator I = TrueBBI.BB->getIterator(); if (++I == TrueBBI.BB->getParent()->end()) return false; - TExit = I; + TExit = &*I; } return TExit && TExit == FalseBBI.BB; } @@ -948,10 +949,8 @@ void IfConverter::AnalyzeBlock(MachineBasicBlock *MBB, /// candidates. 
void IfConverter::AnalyzeBlocks(MachineFunction &MF, std::vector<IfcvtToken*> &Tokens) { - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *BB = I; - AnalyzeBlock(BB, Tokens); - } + for (auto &BB : MF) + AnalyzeBlock(&BB, Tokens); // Sort to favor more complex ifcvt scheme. std::stable_sort(Tokens.begin(), Tokens.end(), IfcvtTokenCmp); @@ -961,14 +960,14 @@ void IfConverter::AnalyzeBlocks(MachineFunction &MF, /// that all the intervening blocks are empty (given BB can fall through to its /// next block). static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) { - MachineFunction::iterator PI = BB; + MachineFunction::iterator PI = BB->getIterator(); MachineFunction::iterator I = std::next(PI); - MachineFunction::iterator TI = ToBB; + MachineFunction::iterator TI = ToBB->getIterator(); MachineFunction::iterator E = BB->getParent()->end(); while (I != TI) { // Check isSuccessor to avoid case where the next block is empty, but // it's not a successor. - if (I == E || !I->empty() || !PI->isSuccessor(I)) + if (I == E || !I->empty() || !PI->isSuccessor(&*I)) return false; PI = I++; } @@ -1114,7 +1113,7 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { // RemoveExtraEdges won't work if the block has an unanalyzable branch, so // explicitly remove CvtBBI as a successor. - BBI.BB->removeSuccessor(CvtBBI->BB); + BBI.BB->removeSuccessor(CvtBBI->BB, true); } else { RemoveKills(CvtBBI->BB->begin(), CvtBBI->BB->end(), DontKill, *TRI); PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond); @@ -1153,28 +1152,6 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { return true; } -/// Scale down weights to fit into uint32_t. NewTrue is the new weight -/// for successor TrueBB, and NewFalse is the new weight for successor -/// FalseBB. -static void ScaleWeights(uint64_t NewTrue, uint64_t NewFalse, - MachineBasicBlock *MBB, - const MachineBasicBlock *TrueBB, - const MachineBasicBlock *FalseBB, - const MachineBranchProbabilityInfo *MBPI) { - uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) { - if (*SI == TrueBB) - MBB->setSuccWeight(SI, (uint32_t)(NewTrue / Scale)); - else if (*SI == FalseBB) - MBB->setSuccWeight(SI, (uint32_t)(NewFalse / Scale)); - else - MBB->setSuccWeight(SI, MBPI->getEdgeWeight(MBB, SI) / Scale); - } -} - /// IfConvertTriangle - If convert a triangle sub-CFG. /// bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { @@ -1231,16 +1208,14 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); bool HasEarlyExit = CvtBBI->FalseBB != nullptr; - uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; - uint32_t WeightScale = 0; + BranchProbability CvtNext, CvtFalse, BBNext, BBCvt; if (HasEarlyExit) { - // Get weights before modifying CvtBBI->BB and BBI.BB. - CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB); - CvtFalse = MBPI->getEdgeWeight(CvtBBI->BB, CvtBBI->FalseBB); - BBNext = MBPI->getEdgeWeight(BBI.BB, NextBBI->BB); - BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB); - SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale); + // Get probabilities before modifying CvtBBI->BB and BBI.BB. 
+ CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB); + CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB); + BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB); + BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB); } if (CvtBBI->BB->pred_size() > 1) { @@ -1251,7 +1226,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // RemoveExtraEdges won't work if the block has an unanalyzable branch, so // explicitly remove CvtBBI as a successor. - BBI.BB->removeSuccessor(CvtBBI->BB); + BBI.BB->removeSuccessor(CvtBBI->BB, true); } else { // Predicate the 'true' block after removing its branch. CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB); @@ -1268,22 +1243,23 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); + + // Update the edge probability for both CvtBBI->FalseBB and NextBBI. + // NewNext = New_Prob(BBI.BB, NextBBI->BB) = + // Prob(BBI.BB, NextBBI->BB) + + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB) + // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) = + // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB) + auto NewTrueBB = getNextBlock(BBI.BB); + auto NewNext = BBNext + BBCvt * CvtNext; + auto NewTrueBBIter = + std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB); + if (NewTrueBBIter != BBI.BB->succ_end()) + BBI.BB->setSuccProbability(NewTrueBBIter, NewNext); + + auto NewFalse = BBCvt * CvtFalse; TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); - BBI.BB->addSuccessor(CvtBBI->FalseBB); - // Update the edge weight for both CvtBBI->FalseBB and NextBBI. - // New_Weight(BBI.BB, NextBBI->BB) = - // Weight(BBI.BB, NextBBI->BB) * getSumForBlock(CvtBBI->BB) + - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, NextBBI->BB) - // New_Weight(BBI.BB, CvtBBI->FalseBB) = - // Weight(BBI.BB, CvtBBI->BB) * Weight(CvtBBI->BB, CvtBBI->FalseBB) - - uint64_t NewNext = BBNext * SumWeight + (BBCvt * CvtNext) / WeightScale; - uint64_t NewFalse = (BBCvt * CvtFalse) / WeightScale; - // We need to scale down all weights of BBI.BB to fit uint32_t. - // Here BBI.BB is connected to CvtBBI->FalseBB and will fall through to - // the next block. - ScaleWeights(NewNext, NewFalse, BBI.BB, getNextBlock(BBI.BB), - CvtBBI->FalseBB, MBPI); + BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse); } // Merge in the 'false' block if the 'false' block has no other @@ -1526,7 +1502,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { - BBI.BB->addSuccessor(TailBB); + BBI.BB->addSuccessor(TailBB, BranchProbability::getOne()); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } @@ -1536,7 +1512,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, // which can happen here if TailBB is unanalyzable and is merged, so // explicitly remove BBI1 and BBI2 as successors. BBI.BB->removeSuccessor(BBI1->BB); - BBI.BB->removeSuccessor(BBI2->BB); + BBI.BB->removeSuccessor(BBI2->BB, true); RemoveExtraEdges(BBI); // Update block info. 
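The deleted ScaleWeights machinery above is what the switch from integer weights to BranchProbability buys back: probabilities compose directly. Reaching NextBBI after the triangle is converted means either falling through (BBNext) or going through the converted block and then leaving it the right way (BBCvt * CvtNext), hence NewNext = BBNext + BBCvt * CvtNext and NewFalse = BBCvt * CvtFalse. The algebra with a toy fixed-denominator fraction (the real class works in saturating fixed point):

#include <cstdint>

constexpr uint64_t ProbDenom = 1ULL << 31; // shared denominator

struct Prob { uint64_t N; }; // value = N / ProbDenom

inline Prob operator*(Prob A, Prob B) { return {A.N * B.N / ProbDenom}; }
inline Prob operator+(Prob A, Prob B) { return {A.N + B.N}; } // assumes sum <= 1

// New_Prob(BB, Next) = Prob(BB, Next) + Prob(BB, Cvt) * Prob(Cvt, Next)
inline Prob probThroughOrAround(Prob BBNext, Prob BBCvt, Prob CvtNext) {
  return BBNext + BBCvt * CvtNext;
}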
@@ -1686,25 +1662,94 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { ToBBI.BB->splice(ToBBI.BB->end(), FromBBI.BB, FromBBI.BB->begin(), FromBBI.BB->end()); - std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(), - FromBBI.BB->succ_end()); + // Force normalizing the successors' probabilities of ToBBI.BB to convert all + // unknown probabilities into known ones. + // FIXME: This usage is too tricky and in the future we would like to + // eliminate all unknown probabilities in MBB. + ToBBI.BB->normalizeSuccProbs(); + + SmallVector<MachineBasicBlock *, 4> FromSuccs(FromBBI.BB->succ_begin(), + FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; + // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when + // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB. + auto To2FromProb = BranchProbability::getZero(); + if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) { + To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB); + // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the + // edge probability being merged to other edges when this edge is removed + // later. + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB), + BranchProbability::getZero()); + } - for (unsigned i = 0, e = Succs.size(); i != e; ++i) { - MachineBasicBlock *Succ = Succs[i]; + for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) { + MachineBasicBlock *Succ = FromSuccs[i]; // Fallthrough edge can't be transferred. if (Succ == FallThrough) continue; + + auto NewProb = BranchProbability::getZero(); + if (AddEdges) { + // Calculate the edge probability for the edge from ToBBI.BB to Succ, + // which is a portion of the edge probability from FromBBI.BB to Succ. The + // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if + // FromBBI is a successor of ToBBI.BB. See comment below for exception). + NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ); + + // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This + // only happens when if-converting a diamond CFG and FromBBI.BB is the + // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we + // could just use the probabilities on FromBBI.BB's out-edges when adding + // new successors. + if (!To2FromProb.isZero()) + NewProb *= To2FromProb; + } + FromBBI.BB->removeSuccessor(Succ); - if (AddEdges && !ToBBI.BB->isSuccessor(Succ)) - ToBBI.BB->addSuccessor(Succ); + + if (AddEdges) { + // If the edge from ToBBI.BB to Succ already exists, update the + // probability of this edge by adding NewProb to it. An example is shown + // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we + // don't have to set C as A's successor as it already is. We only need to + // update the edge probability on A->C. Note that B will not be + // immediately removed from A's successors. It is possible that B->D is + // not removed either if D is a fallthrough of B. Later the edge A->D + // (generated here) and B->D will be combined into one edge. To maintain + // correct edge probability of this combined edge, we need to set the edge + // probability of A->B to zero, which is already done above. The edge + // probability on A->D is calculated by scaling the original probability + // on A->B by the probability of B->D.
+ // + // Before ifcvt: After ifcvt (assume B->D is kept): + // + // A A + // /| /|\ + // / B / B| + // | /| | || + // |/ | | |/ + // C D C D + // + if (ToBBI.BB->isSuccessor(Succ)) + ToBBI.BB->setSuccProbability( + std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ), + MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb); + else + ToBBI.BB->addSuccessor(Succ, NewProb); + } } // Now FromBBI always falls through to the next block! if (NBB && !FromBBI.BB->isSuccessor(NBB)) FromBBI.BB->addSuccessor(NBB); + // Normalize the probabilities of ToBBI.BB's successors with all adjustment + // we've done above. + ToBBI.BB->normalizeSuccProbs(); + ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end()); FromBBI.Predicate.clear(); diff --git a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp index 93e0487..39c1b9f 100644 --- a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -38,6 +38,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -107,6 +108,98 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; }; + +/// \brief Detect re-ordering hazards and dependencies. +/// +/// This class keeps track of defs and uses, and can be queried if a given +/// machine instruction can be re-ordered from after the machine instructions +/// seen so far to before them. +class HazardDetector { + DenseSet<unsigned> RegDefs; + DenseSet<unsigned> RegUses; + const TargetRegisterInfo &TRI; + bool hasSeenClobber; + +public: + explicit HazardDetector(const TargetRegisterInfo &TRI) : + TRI(TRI), hasSeenClobber(false) {} + + /// \brief Make a note of \p MI for later queries to isSafeToHoist. + /// + /// May clobber this HazardDetector instance. \see isClobbered. + void rememberInstruction(MachineInstr *MI); + + /// \brief Return true if it is safe to hoist \p MI from after all the + /// instructions seen so far (via rememberInstruction) to before it. + bool isSafeToHoist(MachineInstr *MI); + + /// \brief Return true if this instance of HazardDetector has been clobbered + /// (i.e. has no more useful information). + /// + /// A HazardDetector is clobbered when it sees a construct it cannot + /// understand, and it would have to return a conservative answer for all + /// future queries. Having a separate clobbered state lets the client code + /// bail early, without making queries about all of the future instructions + /// (which would have returned the most conservative answer anyway). + /// + /// Calling rememberInstruction or isSafeToHoist on a clobbered HazardDetector + /// is an error. + bool isClobbered() { return hasSeenClobber; } +}; +} + + +void HazardDetector::rememberInstruction(MachineInstr *MI) { + assert(!isClobbered() && + "Don't add instructions to a clobbered hazard detector"); + + if (MI->mayStore() || MI->hasUnmodeledSideEffects()) { + hasSeenClobber = true; + return; + } + + for (auto *MMO : MI->memoperands()) { + // Right now we don't want to worry about LLVM's memory model.
+ if (!MMO->isUnordered()) { + hasSeenClobber = true; + return; + } + } + + for (auto &MO : MI->operands()) { + if (!MO.isReg() || !MO.getReg()) + continue; + + if (MO.isDef()) + RegDefs.insert(MO.getReg()); + else + RegUses.insert(MO.getReg()); + } +} + +bool HazardDetector::isSafeToHoist(MachineInstr *MI) { + assert(!isClobbered() && "isSafeToHoist cannot do anything useful!"); + + // Right now we don't want to worry about LLVM's memory model. This can be + // made more precise later. + for (auto *MMO : MI->memoperands()) + if (!MMO->isUnordered()) + return false; + + for (auto &MO : MI->operands()) { + if (MO.isReg() && MO.getReg()) { + for (unsigned Reg : RegDefs) + if (TRI.regsOverlap(Reg, MO.getReg())) + return false; // We found a write-after-write or read-after-write + + if (MO.isDef()) + for (unsigned Reg : RegUses) + if (TRI.regsOverlap(Reg, MO.getReg())) + return false; // We found a write-after-read + } + } + + return true; } bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) { @@ -132,10 +225,10 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( MachineBasicBlock &MBB, SmallVectorImpl<NullCheck> &NullCheckList) { typedef TargetInstrInfo::MachineBranchPredicate MachineBranchPredicate; - MDNode *BranchMD = - MBB.getBasicBlock() - ? MBB.getBasicBlock()->getTerminator()->getMetadata("make.implicit") - : nullptr; + MDNode *BranchMD = nullptr; + if (auto *BB = MBB.getBasicBlock()) + BranchMD = BB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit); + if (!BranchMD) return false; @@ -188,7 +281,7 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( // // we want to end up with // - // Def = TrappingLoad (%RAX + <offset>), LblNull + // Def = FaultingLoad (%RAX + <offset>), LblNull // jmp LblNotNull ;; explicit or fallthrough // // LblNotNull: @@ -199,38 +292,34 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( // LblNull: // callq throw_NullPointerException // + // + // To see why this is legal, consider the two possibilities: + // + // 1. %RAX is null: since we constrain <offset> to be less than PageSize, the + // load instruction dereferences the null page, causing a segmentation + // fault. + // + // 2. %RAX is not null: in this case we know that the load cannot fault, as + // otherwise the load would've faulted in the original program too and the + // original program would've been undefined. + // + // This reasoning cannot be extended to justify hoisting through arbitrary + // control flow. For instance, in the example below (in pseudo-C) + // + // if (ptr == null) { throw_npe(); unreachable; } + // if (some_cond) { return 42; } + // v = ptr->field; // LD + // ... + // + // we cannot (without code duplication) use the load marked "LD" to null check + // ptr -- clause (2) above does not apply in this case. In the above program + // the safety of ptr->field can be dependent on some_cond; and, for instance, + // ptr could be some non-null invalid reference that never gets loaded from + // because some_cond is always true. unsigned PointerReg = MBP.LHS.getReg(); - // As we scan NotNullSucc for a suitable load instruction, we keep track of - // the registers defined and used by the instructions we scan past. This bit - // of information lets us decide if it is legal to hoist the load instruction - // we find (if we do find such an instruction) to before NotNullSucc. - DenseSet<unsigned> RegDefs, RegUses; - - // Returns true if it is safe to reorder MI to before NotNullSucc. 
- auto IsSafeToHoist = [&](MachineInstr *MI) { - // Right now we don't want to worry about LLVM's memory model. This can be - // made more precise later. - for (auto *MMO : MI->memoperands()) - if (!MMO->isUnordered()) - return false; - - for (auto &MO : MI->operands()) { - if (MO.isReg() && MO.getReg()) { - for (unsigned Reg : RegDefs) - if (TRI->regsOverlap(Reg, MO.getReg())) - return false; // We found a write-after-write or read-after-write - - if (MO.isDef()) - for (unsigned Reg : RegUses) - if (TRI->regsOverlap(Reg, MO.getReg())) - return false; // We found a write-after-read - } - } - - return true; - }; + HazardDetector HD(*TRI); for (auto MII = NotNullSucc->begin(), MIE = NotNullSucc->end(); MII != MIE; ++MII) { @@ -238,37 +327,16 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( unsigned BaseReg, Offset; if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) if (MI->mayLoad() && !MI->isPredicable() && BaseReg == PointerReg && - Offset < PageSize && MI->getDesc().getNumDefs() == 1 && - IsSafeToHoist(MI)) { + Offset < PageSize && MI->getDesc().getNumDefs() <= 1 && + HD.isSafeToHoist(MI)) { NullCheckList.emplace_back(MI, MBP.ConditionDef, &MBB, NotNullSucc, NullSucc); return true; } - // MI did not match our criteria for conversion to a trapping load. Check - // if we can continue looking. - - if (MI->mayStore() || MI->hasUnmodeledSideEffects()) + HD.rememberInstruction(MI); + if (HD.isClobbered()) return false; - - for (auto *MMO : MI->memoperands()) - // Right now we don't want to worry about LLVM's memory model. - if (!MMO->isUnordered()) - return false; - - // It _may_ be okay to reorder a later load instruction across MI. Make a - // note of its operands so that we can make the legality check if we find a - // suitable load instruction: - - for (auto &MO : MI->operands()) { - if (!MO.isReg() || !MO.getReg()) - continue; - - if (MO.isDef()) - RegDefs.insert(MO.getReg()); - else - RegUses.insert(MO.getReg()); - } } return false; @@ -281,14 +349,19 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks( MachineInstr *ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB, MCSymbol *HandlerLabel) { + const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for + // all targets. 
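The refactored scan above fixes the shape of the HazardDetector protocol: query the hoist candidate first, otherwise remember the instruction, and bail as soon as the detector is clobbered. As a skeleton, with an opaque instruction type standing in for MachineInstr (the member functions are the ones defined earlier in this file):

#include <vector>

struct Instr; // opaque stand-in for MachineInstr

// Interface shape of the HazardDetector defined above.
struct Detector {
  void rememberInstruction(Instr *I);
  bool isSafeToHoist(Instr *I);
  bool isClobbered();
};

// Returns the first candidate that may be hoisted above everything before it.
Instr *findHoistable(Detector &HD, const std::vector<Instr *> &Block,
                     bool (*IsCandidate)(Instr *)) {
  for (Instr *I : Block) {
    if (IsCandidate(I) && HD.isSafeToHoist(I))
      return I;
    HD.rememberInstruction(I); // I now blocks later hoists it conflicts with
    if (HD.isClobbered())
      break; // nothing after a clobber can be reasoned about
  }
  return nullptr;
}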
+ DebugLoc DL; unsigned NumDefs = LoadMI->getDesc().getNumDefs(); - assert(NumDefs == 1 && "other cases unhandled!"); - (void)NumDefs; + assert(NumDefs <= 1 && "other cases unhandled!"); - unsigned DefReg = LoadMI->defs().begin()->getReg(); - assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 && - "expected exactly one def!"); + unsigned DefReg = NoRegister; + if (NumDefs != 0) { + DefReg = LoadMI->defs().begin()->getReg(); + assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 && + "expected exactly one def!"); + } auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg) .addSym(HandlerLabel) diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp index 9989f23..e310132 100644 --- a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp @@ -141,7 +141,7 @@ public: InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm) : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()), LSS(pass.getAnalysis<LiveStacks>()), - AA(&pass.getAnalysis<AliasAnalysis>()), + AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()), MDT(pass.getAnalysis<MachineDominatorTree>()), Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm), MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()), @@ -329,8 +329,8 @@ static raw_ostream &operator<<(raw_ostream &OS, if (SVI.KillsSource) OS << " kill"; OS << " deps["; - for (unsigned i = 0, e = SVI.Deps.size(); i != e; ++i) - OS << ' ' << SVI.Deps[i]->id << '@' << SVI.Deps[i]->def; + for (VNInfo *Dep : SVI.Deps) + OS << ' ' << Dep->id << '@' << Dep->def; OS << " ]"; if (SVI.DefMI) OS << " def: " << *SVI.DefMI; @@ -383,9 +383,8 @@ void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter, bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg); unsigned SpillDepth = ~0u; - for (TinyPtrVector<VNInfo*>::iterator DepI = Deps->begin(), - DepE = Deps->end(); DepI != DepE; ++DepI) { - SibValueMap::iterator DepSVI = SibValues.find(*DepI); + for (VNInfo *Dep : *Deps) { + SibValueMap::iterator DepSVI = SibValues.find(Dep); assert(DepSVI != SibValues.end() && "Dependent value not in SibValues"); SibValueInfo &DepSV = DepSVI->second; if (!DepSV.SpillMBB) @@ -566,12 +565,11 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, // Create entries for all the PHIs. Don't add them to the worklist, we // are processing all of them in one go here. - for (unsigned i = 0, e = PHIs.size(); i != e; ++i) - SibValues.insert(std::make_pair(PHIs[i], SibValueInfo(Reg, PHIs[i]))); + for (VNInfo *PHI : PHIs) + SibValues.insert(std::make_pair(PHI, SibValueInfo(Reg, PHI))); // Add every PHI as a dependent of all the non-PHIs. - for (unsigned i = 0, e = NonPHIs.size(); i != e; ++i) { - VNInfo *NonPHI = NonPHIs[i]; + for (VNInfo *NonPHI : NonPHIs) { // Known value? Try an insertion. 
std::tie(SVI, Inserted) = SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI))); @@ -654,8 +652,7 @@ void InlineSpiller::analyzeSiblingValues() { return; LiveInterval &OrigLI = LIS.getInterval(Original); - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); for (LiveInterval::const_vni_iterator VI = LI.vni_begin(), VE = LI.vni_end(); VI != VE; ++VI) { @@ -831,9 +828,8 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) { if (VNI->isPHIDef()) { MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def); - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PE = MBB->pred_end(); PI != PE; ++PI) { - VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(*PI)); + for (MachineBasicBlock *P : MBB->predecessors()) { + VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(P)); if (PVNI) WorkList.push_back(std::make_pair(LI, PVNI)); } @@ -920,8 +916,8 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, << *LIS.getInstructionFromIndex(DefIdx)); // Replace operands - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second); + for (const auto &OpPair : Ops) { + MachineOperand &MO = OpPair.first->getOperand(OpPair.second); if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) { MO.setReg(NewVReg); MO.setIsKill(); @@ -944,8 +940,7 @@ void InlineSpiller::reMaterializeAll() { // Try to remat before all uses of snippets. bool anyRemat = false; - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); for (MachineRegisterInfo::reg_bundle_iterator RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end(); @@ -963,8 +958,7 @@ void InlineSpiller::reMaterializeAll() { return; // Remove any values that were completely rematted. - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); for (LiveInterval::vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { @@ -989,8 +983,7 @@ void InlineSpiller::reMaterializeAll() { // Get rid of deleted and empty intervals. unsigned ResultPos = 0; - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { - unsigned Reg = RegsToSpill[i]; + for (unsigned Reg : RegsToSpill) { if (!LIS.hasInterval(Reg)) continue; @@ -1098,9 +1091,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops, // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied // operands. SmallVector<unsigned, 8> FoldOps; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - unsigned Idx = Ops[i].second; - assert(MI == Ops[i].first && "Instruction conflict during operand folding"); + for (const auto &OpPair : Ops) { + unsigned Idx = OpPair.second; + assert(MI == OpPair.first && "Instruction conflict during operand folding"); MachineOperand &MO = MI->getOperand(Idx); if (MO.isImplicit()) { ImpReg = MO.getReg(); @@ -1139,7 +1132,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops, continue; MIBundleOperands::PhysRegInfo RI = MIBundleOperands(FoldMI).analyzePhysReg(Reg, &TRI); - if (RI.Defines) + if (RI.FullyDefined) continue; // FoldMI does not define this physreg. Remove the LI segment. 
assert(MO->isDead() && "Cannot fold physreg def"); @@ -1152,10 +1145,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops, // Insert any new instructions other than FoldMI into the LIS maps. assert(!MIS.empty() && "Unexpected empty span of instructions!"); - for (MachineBasicBlock::iterator MII = MIS.begin(), End = MIS.end(); - MII != End; ++MII) - if (&*MII != FoldMI) - LIS.InsertMachineInstrInMaps(&*MII); + for (MachineInstr &MI : MIS) + if (&MI != FoldMI) + LIS.InsertMachineInstrInMaps(&MI); // TII.foldMemoryOperand may have left some implicit operands on the // instruction. Strip them. @@ -1301,11 +1293,11 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { // Rewrite instruction operands. bool hasLiveDef = false; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second); + for (const auto &OpPair : Ops) { + MachineOperand &MO = OpPair.first->getOperand(OpPair.second); MO.setReg(NewVReg); if (MO.isUse()) { - if (!Ops[i].first->isRegTiedToDefOperand(Ops[i].second)) + if (!OpPair.first->isRegTiedToDefOperand(OpPair.second)) MO.setIsKill(); } else { if (!MO.isDead()) @@ -1335,14 +1327,14 @@ void InlineSpiller::spillAll() { VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot); assert(StackInt->getNumValNums() == 1 && "Bad stack interval values"); - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) - StackInt->MergeSegmentsInAsValue(LIS.getInterval(RegsToSpill[i]), + for (unsigned Reg : RegsToSpill) + StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg), StackInt->getValNumInfo(0)); DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n'); // Spill around uses of all RegsToSpill. - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) - spillAroundUses(RegsToSpill[i]); + for (unsigned Reg : RegsToSpill) + spillAroundUses(Reg); // Hoisted spills may cause dead code. if (!DeadDefs.empty()) { @@ -1351,9 +1343,9 @@ void InlineSpiller::spillAll() { } // Finally delete the SnippetCopies. - for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) { + for (unsigned Reg : RegsToSpill) { for (MachineRegisterInfo::reg_instr_iterator - RI = MRI.reg_instr_begin(RegsToSpill[i]), E = MRI.reg_instr_end(); + RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end(); RI != E; ) { MachineInstr *MI = &*(RI++); assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy"); @@ -1364,8 +1356,8 @@ void InlineSpiller::spillAll() { } // Delete all spilled registers. 
- for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) - Edit->eraseVirtReg(RegsToSpill[i]); + for (unsigned Reg : RegsToSpill) + Edit->eraseVirtReg(Reg); } void InlineSpiller::spill(LiveRangeEdit &edit) { diff --git a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp index fd5749b..f8cc247 100644 --- a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp +++ b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp @@ -144,7 +144,8 @@ void InterferenceCache::Entry::update(unsigned MBBNum) { PrevPos = Start; } - MachineFunction::const_iterator MFI = MF->getBlockNumbered(MBBNum); + MachineFunction::const_iterator MFI = + MF->getBlockNumbered(MBBNum)->getIterator(); BlockInterference *BI = &Blocks[MBBNum]; ArrayRef<SlotIndex> RegMaskSlots; ArrayRef<const uint32_t*> RegMaskBits; diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 53c8adc..724f1d6 100644 --- a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -52,7 +52,7 @@ using namespace llvm; static cl::opt<bool> LowerInterleavedAccesses( "lower-interleaved-accesses", cl::desc("Enable lowering interleaved accesses to intrinsics"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static unsigned MaxFactor; // The maximum supported interleave factor. @@ -271,7 +271,7 @@ bool InterleavedAccess::runOnFunction(Function &F) { SmallVector<Instruction *, 32> DeadInsts; bool Changed = false; - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { if (LoadInst *LI = dyn_cast<LoadInst>(&I)) Changed |= lowerInterleavedLoad(LI, DeadInsts); diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp index 2c95e9e..2962f87 100644 --- a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp +++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp @@ -35,24 +35,24 @@ static void EnsureFunctionExists(Module &M, const char *Name, M.getOrInsertFunction(Name, FunctionType::get(RetTy, ParamTys, false)); } -static void EnsureFPIntrinsicsExist(Module &M, Function *Fn, +static void EnsureFPIntrinsicsExist(Module &M, Function &Fn, const char *FName, const char *DName, const char *LDName) { // Insert definitions for all the floating point types. - switch((int)Fn->arg_begin()->getType()->getTypeID()) { + switch((int)Fn.arg_begin()->getType()->getTypeID()) { case Type::FloatTyID: - EnsureFunctionExists(M, FName, Fn->arg_begin(), Fn->arg_end(), + EnsureFunctionExists(M, FName, Fn.arg_begin(), Fn.arg_end(), Type::getFloatTy(M.getContext())); break; case Type::DoubleTyID: - EnsureFunctionExists(M, DName, Fn->arg_begin(), Fn->arg_end(), + EnsureFunctionExists(M, DName, Fn.arg_begin(), Fn.arg_end(), Type::getDoubleTy(M.getContext())); break; case Type::X86_FP80TyID: case Type::FP128TyID: case Type::PPC_FP128TyID: - EnsureFunctionExists(M, LDName, Fn->arg_begin(), Fn->arg_end(), - Fn->arg_begin()->getType()); + EnsureFunctionExists(M, LDName, Fn.arg_begin(), Fn.arg_end(), + Fn.arg_begin()->getType()); break; } } @@ -67,7 +67,7 @@ static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI, Type *RetTy) { // If we haven't already looked up this function, check to see if the // program already contains a function with this name. - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); // Get or insert the definition now. 
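The function being patched here, ReplaceCallWith, is the canonical lower-an-intrinsic-to-a-libcall sequence: derive a function type from the call's operands, getOrInsertFunction, build the replacement call in place, and forward the uses. Condensed into one self-contained helper using the same 3.8-era API (with the deletion of the old call folded in; the real code leaves that to its caller and caches the looked-up function):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static CallInst *lowerToLibcall(CallInst *CI, const char *Name) {
  Module *M = CI->getModule();
  SmallVector<Type *, 8> ParamTys;
  SmallVector<Value *, 8> Args;
  for (Value *Op : CI->arg_operands()) {
    ParamTys.push_back(Op->getType());
    Args.push_back(Op);
  }
  Constant *Callee = M->getOrInsertFunction(
      Name, FunctionType::get(CI->getType(), ParamTys, /*isVarArg=*/false));
  IRBuilder<> Builder(CI); // insert the new call right before the old one
  CallInst *NewCI = Builder.CreateCall(Callee, Args);
  NewCI->takeName(CI);
  if (!CI->use_empty())
    CI->replaceAllUsesWith(NewCI);
  CI->eraseFromParent();
  return NewCI;
}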
std::vector<Type *> ParamTys; for (ArgIt I = ArgBegin; I != ArgEnd; ++I) @@ -75,7 +75,7 @@ static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI, Constant* FCache = M->getOrInsertFunction(NewFn, FunctionType::get(RetTy, ParamTys, false)); - IRBuilder<> Builder(CI->getParent(), CI); + IRBuilder<> Builder(CI->getParent(), CI->getIterator()); SmallVector<Value *, 8> Args(ArgBegin, ArgEnd); CallInst *NewCI = Builder.CreateCall(FCache, Args); NewCI->setName(CI->getName()); @@ -94,20 +94,20 @@ static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI, void IntrinsicLowering::AddPrototypes(Module &M) { LLVMContext &Context = M.getContext(); - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->isDeclaration() && !I->use_empty()) - switch (I->getIntrinsicID()) { + for (auto &F : M) + if (F.isDeclaration() && !F.use_empty()) + switch (F.getIntrinsicID()) { default: break; case Intrinsic::setjmp: - EnsureFunctionExists(M, "setjmp", I->arg_begin(), I->arg_end(), + EnsureFunctionExists(M, "setjmp", F.arg_begin(), F.arg_end(), Type::getInt32Ty(M.getContext())); break; case Intrinsic::longjmp: - EnsureFunctionExists(M, "longjmp", I->arg_begin(), I->arg_end(), + EnsureFunctionExists(M, "longjmp", F.arg_begin(), F.arg_end(), Type::getVoidTy(M.getContext())); break; case Intrinsic::siglongjmp: - EnsureFunctionExists(M, "abort", I->arg_end(), I->arg_end(), + EnsureFunctionExists(M, "abort", F.arg_end(), F.arg_end(), Type::getVoidTy(M.getContext())); break; case Intrinsic::memcpy: @@ -132,31 +132,31 @@ void IntrinsicLowering::AddPrototypes(Module &M) { DL.getIntPtrType(Context), nullptr); break; case Intrinsic::sqrt: - EnsureFPIntrinsicsExist(M, I, "sqrtf", "sqrt", "sqrtl"); + EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl"); break; case Intrinsic::sin: - EnsureFPIntrinsicsExist(M, I, "sinf", "sin", "sinl"); + EnsureFPIntrinsicsExist(M, F, "sinf", "sin", "sinl"); break; case Intrinsic::cos: - EnsureFPIntrinsicsExist(M, I, "cosf", "cos", "cosl"); + EnsureFPIntrinsicsExist(M, F, "cosf", "cos", "cosl"); break; case Intrinsic::pow: - EnsureFPIntrinsicsExist(M, I, "powf", "pow", "powl"); + EnsureFPIntrinsicsExist(M, F, "powf", "pow", "powl"); break; case Intrinsic::log: - EnsureFPIntrinsicsExist(M, I, "logf", "log", "logl"); + EnsureFPIntrinsicsExist(M, F, "logf", "log", "logl"); break; case Intrinsic::log2: - EnsureFPIntrinsicsExist(M, I, "log2f", "log2", "log2l"); + EnsureFPIntrinsicsExist(M, F, "log2f", "log2", "log2l"); break; case Intrinsic::log10: - EnsureFPIntrinsicsExist(M, I, "log10f", "log10", "log10l"); + EnsureFPIntrinsicsExist(M, F, "log10f", "log10", "log10l"); break; case Intrinsic::exp: - EnsureFPIntrinsicsExist(M, I, "expf", "exp", "expl"); + EnsureFPIntrinsicsExist(M, F, "expf", "exp", "expl"); break; case Intrinsic::exp2: - EnsureFPIntrinsicsExist(M, I, "exp2f", "exp2", "exp2l"); + EnsureFPIntrinsicsExist(M, F, "exp2f", "exp2", "exp2l"); break; } } @@ -167,8 +167,8 @@ static Value *LowerBSWAP(LLVMContext &Context, Value *V, Instruction *IP) { assert(V->getType()->isIntegerTy() && "Can't bswap a non-integer type!"); unsigned BitSize = V->getType()->getPrimitiveSizeInBits(); - - IRBuilder<> Builder(IP->getParent(), IP); + + IRBuilder<> Builder(IP); switch(BitSize) { default: llvm_unreachable("Unhandled type size of value to byteswap!"); @@ -268,7 +268,7 @@ static Value *LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP) { 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL }; - IRBuilder<> Builder(IP->getParent(), IP); + IRBuilder<> Builder(IP); 
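The 64-bit mask constants just above drive the classic tree-reduction popcount that LowerCTPOP expands into IR: at each of six levels, adjacent fields are masked and added, doubling the field width. The same computation for one 64-bit word in plain C++:

#include <cstdint>

uint64_t popcount64(uint64_t V) {
  const uint64_t Masks[] = {
      0x5555555555555555ULL, //  1-bit fields
      0x3333333333333333ULL, //  2-bit fields
      0x0F0F0F0F0F0F0F0FULL, //  4-bit fields
      0x00FF00FF00FF00FFULL, //  8-bit fields
      0x0000FFFF0000FFFFULL, // 16-bit fields
      0x00000000FFFFFFFFULL, // 32-bit fields
  };
  for (unsigned I = 0, Shift = 1; I != 6; ++I, Shift <<= 1)
    V = (V & Masks[I]) + ((V >> Shift) & Masks[I]); // sum adjacent fields
  return V; // e.g. popcount64(0xFFULL) == 8
}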
unsigned BitSize = V->getType()->getPrimitiveSizeInBits(); unsigned WordSize = (BitSize + 63) / 64; @@ -301,7 +301,7 @@ static Value *LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP) { /// instruction IP. static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) { - IRBuilder<> Builder(IP->getParent(), IP); + IRBuilder<> Builder(IP); unsigned BitSize = V->getType()->getPrimitiveSizeInBits(); for (unsigned i = 1; i < BitSize; i <<= 1) { @@ -338,7 +338,7 @@ static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname, } void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { - IRBuilder<> Builder(CI->getParent(), CI); + IRBuilder<> Builder(CI); LLVMContext &Context = CI->getContext(); const Function *Callee = CI->getCalledFunction(); @@ -424,6 +424,13 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; } + case Intrinsic::get_dynamic_area_offset: + errs() << "WARNING: this target does not support the custom llvm.get." + "dynamic.area.offset. It is being lowered to a constant 0\n"; + // Just lower it to a constant 0 because for most targets + // @llvm.get.dynamic.area.offset is lowered to zero. + CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0)); + break; case Intrinsic::returnaddress: case Intrinsic::frameaddress: errs() << "WARNING: this target does not support the llvm." @@ -589,7 +596,7 @@ bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) { return false; // Okay, we can do this xform, do so now. - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty); Value *Op = CI->getArgOperand(0); diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp index 37299eb..1c27377 100644 --- a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp +++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp @@ -82,7 +82,7 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, } TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(BasicTTIImpl(this, F)); }); } @@ -125,9 +125,10 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM, PM.add(new MachineFunctionAnalysis(*TM, MFInitializer)); // Enable FastISel with -fast, but allow that to be overridden. + TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE); if (EnableFastISelOption == cl::BOU_TRUE || (TM->getOptLevel() == CodeGenOpt::None && - EnableFastISelOption != cl::BOU_FALSE)) + TM->getO0WantsFastISel())) TM->setFastISel(true); // Ask the target for an isel. @@ -202,6 +203,7 @@ bool LLVMTargetMachine::addPassesToEmitFile( Triple T(getTargetTriple().str()); AsmStreamer.reset(getTarget().createMCObjectStreamer( T, *Context, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll, + Options.MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ true)); break; } @@ -254,6 +256,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, const MCSubtargetInfo &STI = *getMCSubtargetInfo(); std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer( T, *Ctx, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll, + Options.MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ true)); // Create the AsmPrinter, which takes ownership of AsmStreamer if successful. 
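Before the new LiveDebugValues file below, one subtlety from the LLVMTargetMachine hunk above: the -O0 FastISel preference is now recorded on the TargetMachine first (setO0WantsFastISel) and read back through getO0WantsFastISel, so later consumers observe the same decision instead of re-deriving it from the raw flag. The decision itself reduces to a small helper (a sketch of the logic in the hunk, not an LLVM API):

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Explicit -fast-isel wins outright; at -O0, FastISel defaults to on unless
// the flag explicitly disabled it; at higher opt levels it stays off here.
static bool shouldEnableFastISel(cl::boolOrDefault Flag, bool OptLevelIsNone) {
  bool O0WantsFastISel = (Flag != cl::BOU_FALSE); // recorded on the TM above
  return Flag == cl::BOU_TRUE || (OptLevelIsNone && O0WantsFastISel);
}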
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp new file mode 100644 index 0000000..b9937e5 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -0,0 +1,416 @@ +//===------ LiveDebugValues.cpp - Tracking Debug Value MIs ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// This pass implements a data flow analysis that propagates debug location +/// information by inserting additional DBG_VALUE instructions into the machine +/// instruction stream. The pass internally builds debug location liveness +/// ranges to determine the points where additional DBG_VALUEs need to be +/// inserted. +/// +/// This is a separate pass from DbgValueHistoryCalculator to facilitate +/// testing and improve modularity. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <queue> +#include <list> + +using namespace llvm; + +#define DEBUG_TYPE "live-debug-values" + +STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); + +namespace { + +class LiveDebugValues : public MachineFunctionPass { + +private: + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + + typedef std::pair<const DILocalVariable *, const DILocation *> + InlinedVariable; + + /// A potentially inlined instance of a variable. + struct DebugVariable { + const DILocalVariable *Var; + const DILocation *InlinedAt; + + DebugVariable(const DILocalVariable *_var, const DILocation *_inlinedAt) + : Var(_var), InlinedAt(_inlinedAt) {} + + bool operator==(const DebugVariable &DV) const { + return (Var == DV.Var) && (InlinedAt == DV.InlinedAt); + } + }; + + /// Member variables and functions for Range Extension across basic blocks. + struct VarLoc { + DebugVariable Var; + const MachineInstr *MI; // MachineInstr should be a DBG_VALUE instr. + + VarLoc(DebugVariable _var, const MachineInstr *_mi) : Var(_var), MI(_mi) {} + + bool operator==(const VarLoc &V) const; + }; + + typedef std::list<VarLoc> VarLocList; + typedef SmallDenseMap<const MachineBasicBlock *, VarLocList> VarLocInMBB; + + void transferDebugValue(MachineInstr &MI, VarLocList &OpenRanges); + void transferRegisterDef(MachineInstr &MI, VarLocList &OpenRanges); + bool transferTerminatorInst(MachineInstr &MI, VarLocList &OpenRanges, + VarLocInMBB &OutLocs); + bool transfer(MachineInstr &MI, VarLocList &OpenRanges, VarLocInMBB &OutLocs); + + bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs); + + bool ExtendRanges(MachineFunction &MF); + +public: + static char ID; + + /// Default construct and initialize the pass. + LiveDebugValues(); + + /// Tell the pass manager which passes we depend on and what + /// information we preserve. 
+ void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Print to ostream with a message. + void printVarLocInMBB(const VarLocInMBB &V, const char *msg, + raw_ostream &Out) const; + + /// Calculate the liveness information for the given machine function. + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // namespace + +//===----------------------------------------------------------------------===// +// Implementation +//===----------------------------------------------------------------------===// + +char LiveDebugValues::ID = 0; +char &llvm::LiveDebugValuesID = LiveDebugValues::ID; +INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis", + false, false) + +/// Default construct and initialize the pass. +LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) { + initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry()); +} + +/// Tell the pass manager which passes we depend on and what information we +/// preserve. +void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); +} + +// \brief If @MI is a DBG_VALUE with debug value described by a defined +// register, returns the number of this register. Otherwise, returns 0. +static unsigned isDescribedByReg(const MachineInstr &MI) { + assert(MI.isDebugValue()); + assert(MI.getNumOperands() == 4); + // If the location of the variable is described using a register (directly or + // indirectly), this register is always the first operand. + return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0; +} + +// \brief This function takes two DBG_VALUE instructions and returns true +// if their offsets are equal; otherwise returns false. +static bool areOffsetsEqual(const MachineInstr &MI1, const MachineInstr &MI2) { + assert(MI1.isDebugValue()); + assert(MI1.getNumOperands() == 4); + + assert(MI2.isDebugValue()); + assert(MI2.getNumOperands() == 4); + + if (!MI1.isIndirectDebugValue() && !MI2.isIndirectDebugValue()) + return true; + + // Check if both MIs are indirect and they are equal. + if (MI1.isIndirectDebugValue() && MI2.isIndirectDebugValue()) + return MI1.getOperand(1).getImm() == MI2.getOperand(1).getImm(); + + return false; +} + +//===----------------------------------------------------------------------===// +// Debug Range Extension Implementation +//===----------------------------------------------------------------------===// + +void LiveDebugValues::printVarLocInMBB(const VarLocInMBB &V, const char *msg, + raw_ostream &Out) const { + Out << "Printing " << msg << ":\n"; + for (const auto &L : V) { + Out << "MBB: " << L.first->getName() << ":\n"; + for (const auto &VLL : L.second) { + Out << " Var: " << VLL.Var.Var->getName(); + Out << " MI: "; + (*VLL.MI).dump(); + Out << "\n"; + } + } + Out << "\n"; +} + +bool LiveDebugValues::VarLoc::operator==(const VarLoc &V) const { + return (Var == V.Var) && (isDescribedByReg(*MI) == isDescribedByReg(*V.MI)) && + (areOffsetsEqual(*MI, *V.MI)); +} + +/// End all previous ranges related to @MI and start a new range from @MI +/// if it is a DBG_VALUE instr. +void LiveDebugValues::transferDebugValue(MachineInstr &MI, + VarLocList &OpenRanges) { + if (!MI.isDebugValue()) + return; + const DILocalVariable *RawVar = MI.getDebugVariable(); + assert(RawVar->isValidLocationForIntrinsic(MI.getDebugLoc()) && + "Expected inlined-at fields to agree"); + DebugVariable Var(RawVar, MI.getDebugLoc()->getInlinedAt()); + + // End all previous ranges of Var. 
+ OpenRanges.erase( + std::remove_if(OpenRanges.begin(), OpenRanges.end(), + [&](const VarLoc &V) { return (Var == V.Var); }), + OpenRanges.end()); + + // Add Var to OpenRanges from this DBG_VALUE. + // TODO: Currently handles DBG_VALUE which has only reg as location. + if (isDescribedByReg(MI)) { + VarLoc V(Var, &MI); + OpenRanges.push_back(std::move(V)); + } +} + +/// A definition of a register may mark the end of a range. +void LiveDebugValues::transferRegisterDef(MachineInstr &MI, + VarLocList &OpenRanges) { + for (const MachineOperand &MO : MI.operands()) { + if (!(MO.isReg() && MO.isDef() && MO.getReg() && + TRI->isPhysicalRegister(MO.getReg()))) + continue; + // Remove ranges of all aliased registers. + for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) + OpenRanges.erase(std::remove_if(OpenRanges.begin(), OpenRanges.end(), + [&](const VarLoc &V) { + return (*RAI == + isDescribedByReg(*V.MI)); + }), + OpenRanges.end()); + } +} + +/// Terminate all open ranges at the end of the current basic block. +bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI, + VarLocList &OpenRanges, + VarLocInMBB &OutLocs) { + bool Changed = false; + const MachineBasicBlock *CurMBB = MI.getParent(); + if (!(MI.isTerminator() || (&MI == &CurMBB->instr_back()))) + return false; + + if (OpenRanges.empty()) + return false; + + VarLocList &VLL = OutLocs[CurMBB]; + + for (auto OR : OpenRanges) { + // Copy OpenRanges to OutLocs, if not already present. + assert(OR.MI->isDebugValue()); + DEBUG(dbgs() << "Add to OutLocs: "; OR.MI->dump();); + if (std::find_if(VLL.begin(), VLL.end(), + [&](const VarLoc &V) { return (OR == V); }) == VLL.end()) { + VLL.push_back(std::move(OR)); + Changed = true; + } + } + OpenRanges.clear(); + return Changed; +} + +/// This routine applies the transfer functions to @MI, updating OpenRanges +/// and OutLocs. +bool LiveDebugValues::transfer(MachineInstr &MI, VarLocList &OpenRanges, + VarLocInMBB &OutLocs) { + bool Changed = false; + transferDebugValue(MI, OpenRanges); + transferRegisterDef(MI, OpenRanges); + Changed = transferTerminatorInst(MI, OpenRanges, OutLocs); + return Changed; +} + +/// This routine joins the analysis results of all incoming edges in @MBB by +/// inserting a new DBG_VALUE instruction at the start of @MBB, if the same +/// source variable resides in the same location in all the predecessors of +/// @MBB. +bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, + VarLocInMBB &InLocs) { + DEBUG(dbgs() << "join MBB: " << MBB.getName() << "\n"); + bool Changed = false; + + VarLocList InLocsT; // Temporary incoming locations. + + // For all predecessors of this MBB, find the set of VarLocs that can be + // joined. + for (auto p : MBB.predecessors()) { + auto OL = OutLocs.find(p); + // The join is null if OutLocs is empty for any of the predecessors. + if (OL == OutLocs.end()) + return false; + + // Just copy over the Out locs to incoming locs for the first predecessor. + if (p == *MBB.pred_begin()) { + InLocsT = OL->second; + continue; + } + + // Join with this predecessor. + VarLocList &VLL = OL->second; + InLocsT.erase( + std::remove_if(InLocsT.begin(), InLocsT.end(), [&](VarLoc &ILT) { + return (std::find_if(VLL.begin(), VLL.end(), [&](const VarLoc &V) { + return (ILT == V); + }) == VLL.end()); + }), InLocsT.end()); + } + + if (InLocsT.empty()) + return false; + + VarLocList &ILL = InLocs[&MBB]; + + // Insert DBG_VALUE instructions, if not already inserted. 
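The per-predecessor step above is a plain erase-remove intersection: after seeding InLocsT from the first predecessor, any location missing from a later predecessor's out-set is dropped, and the loop that follows materializes DBG_VALUEs for the survivors. The same shape in isolation (a sketch; std::list stands in for VarLocList, and the helper name is illustrative only):

    #include <algorithm>
    #include <list>

    // Keep only the elements of InLocsT that also occur in the
    // predecessor's out-set VLL; applied once per predecessor, this
    // leaves the intersection of all predecessor out-sets.
    template <typename T>
    static void joinWithPredecessor(std::list<T> &InLocsT,
                                    const std::list<T> &VLL) {
      InLocsT.remove_if([&](const T &Loc) {
        return std::find(VLL.begin(), VLL.end(), Loc) == VLL.end();
      });
    }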
+ for (auto ILT : InLocsT) { + if (std::find_if(ILL.begin(), ILL.end(), [&](const VarLoc &I) { + return (ILT == I); + }) == ILL.end()) { + // This VarLoc is not found in InLocs, i.e., it is not yet inserted. So a + // new range is started for the var from the MBB's beginning by inserting + // a new DBG_VALUE. transfer() will end this range as appropriate. + const MachineInstr *DMI = ILT.MI; + MachineInstr *MI = + BuildMI(MBB, MBB.instr_begin(), DMI->getDebugLoc(), DMI->getDesc(), + DMI->isIndirectDebugValue(), DMI->getOperand(0).getReg(), 0, + DMI->getDebugVariable(), DMI->getDebugExpression()); + if (DMI->isIndirectDebugValue()) + MI->getOperand(1).setImm(DMI->getOperand(1).getImm()); + DEBUG(dbgs() << "Inserted: "; MI->dump();); + ++NumInserted; + Changed = true; + + VarLoc V(ILT.Var, MI); + ILL.push_back(std::move(V)); + } + } + return Changed; +} + +/// Calculate the liveness information for the given machine function and +/// extend ranges across basic blocks. +bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { + + DEBUG(dbgs() << "\nDebug Range Extension\n"); + + bool Changed = false; + bool OLChanged = false; + bool MBBJoined = false; + + VarLocList OpenRanges; // Ranges that are open until end of bb. + VarLocInMBB OutLocs; // Ranges that exist beyond bb. + VarLocInMBB InLocs; // Ranges that are incoming after joining. + + DenseMap<unsigned int, MachineBasicBlock *> OrderToBB; + DenseMap<MachineBasicBlock *, unsigned int> BBToOrder; + std::priority_queue<unsigned int, std::vector<unsigned int>, + std::greater<unsigned int>> Worklist; + std::priority_queue<unsigned int, std::vector<unsigned int>, + std::greater<unsigned int>> Pending; + // Initialize every MBB with OutLocs. + for (auto &MBB : MF) + for (auto &MI : MBB) + transfer(MI, OpenRanges, OutLocs); + DEBUG(printVarLocInMBB(OutLocs, "OutLocs after initialization", dbgs())); + + ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + unsigned int RPONumber = 0; + for (auto RI = RPOT.begin(), RE = RPOT.end(); RI != RE; ++RI) { + OrderToBB[RPONumber] = *RI; + BBToOrder[*RI] = RPONumber; + Worklist.push(RPONumber); + ++RPONumber; + } + + // This is a standard "union of predecessor outs" dataflow problem. + // To solve it, we perform join() and transfer() using the two-worklist method + // until the ranges converge. + // Ranges have converged when both worklists are empty. + while (!Worklist.empty() || !Pending.empty()) { + // We track what is on the pending worklist to avoid inserting the same + // thing twice. We could avoid this with a custom priority queue, but this + // is probably not worth it. 
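Stripped of the VarLoc bookkeeping, the loop that follows is a generic two-worklist fixed-point driver over blocks numbered in reverse post-order. A self-contained sketch of that skeleton (block indices stand in for MachineBasicBlocks; all names are illustrative, not part of the patch):

    #include <functional>
    #include <queue>
    #include <set>
    #include <vector>

    // Re-run JoinAndTransfer on blocks in RPO until no out-set changes;
    // successors of changed blocks are queued on Pending for the next
    // round, so convergence is reached when both queues drain.
    static void solveJoinOverPreds(
        unsigned NumBlocks,
        const std::function<bool(unsigned)> &JoinAndTransfer,
        const std::function<std::vector<unsigned>(unsigned)> &Successors) {
      std::priority_queue<unsigned, std::vector<unsigned>,
                          std::greater<unsigned>> Worklist, Pending;
      for (unsigned B = 0; B != NumBlocks; ++B)
        Worklist.push(B); // Block numbers are assumed to be RPO numbers.
      while (!Worklist.empty() || !Pending.empty()) {
        std::set<unsigned> OnPending; // Avoid queueing a block twice.
        while (!Worklist.empty()) {
          unsigned B = Worklist.top();
          Worklist.pop();
          if (JoinAndTransfer(B))
            for (unsigned S : Successors(B))
              if (OnPending.insert(S).second)
                Pending.push(S);
        }
        Worklist.swap(Pending); // Pending is empty after the swap.
      }
    }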
+ SmallPtrSet<MachineBasicBlock *, 16> OnPending; + while (!Worklist.empty()) { + MachineBasicBlock *MBB = OrderToBB[Worklist.top()]; + Worklist.pop(); + MBBJoined = join(*MBB, OutLocs, InLocs); + + if (MBBJoined) { + MBBJoined = false; + Changed = true; + for (auto &MI : *MBB) + OLChanged |= transfer(MI, OpenRanges, OutLocs); + DEBUG(printVarLocInMBB(OutLocs, "OutLocs after propagating", dbgs())); + DEBUG(printVarLocInMBB(InLocs, "InLocs after propagating", dbgs())); + + if (OLChanged) { + OLChanged = false; + for (auto s : MBB->successors()) + if (!OnPending.count(s)) { + OnPending.insert(s); + Pending.push(BBToOrder[s]); + } + } + } + } + Worklist.swap(Pending); + // At this point, pending must be empty, since it was just the empty + // worklist + assert(Pending.empty() && "Pending should be empty"); + } + + DEBUG(printVarLocInMBB(OutLocs, "Final OutLocs", dbgs())); + DEBUG(printVarLocInMBB(InLocs, "Final InLocs", dbgs())); + return Changed; +} + +bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getSubtarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + + bool Changed = false; + + Changed |= ExtendRanges(MF); + + return Changed; +} diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp index 1571551..6dac7db 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -91,9 +91,7 @@ public: bool dominates(MachineBasicBlock *MBB) { if (LBlocks.empty()) LS.getMachineBasicBlocks(DL, LBlocks); - if (LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB)) - return true; - return false; + return LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB); } }; } // end anonymous namespace @@ -512,7 +510,7 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) { bool Changed = false; for (MachineFunction::iterator MFI = mf.begin(), MFE = mf.end(); MFI != MFE; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end(); MBBI != MBBE;) { if (!MBBI->isDebugValue()) { @@ -536,65 +534,49 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) { return Changed; } -void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, - LiveRange *LR, const VNInfo *VNI, - SmallVectorImpl<SlotIndex> *Kills, +/// We only propagate DBG_VALUES locally here. LiveDebugValues performs a +/// data-flow analysis to propagate them beyond basic block boundaries. +void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, LiveRange *LR, + const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills, LiveIntervals &LIS, MachineDominatorTree &MDT, UserValueScopes &UVS) { - SmallVector<SlotIndex, 16> Todo; - Todo.push_back(Idx); - do { - SlotIndex Start = Todo.pop_back_val(); - MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start); - SlotIndex Stop = LIS.getMBBEndIdx(MBB); - LocMap::iterator I = locInts.find(Start); - - // Limit to VNI's live range. - bool ToEnd = true; - if (LR && VNI) { - LiveInterval::Segment *Segment = LR->getSegmentContaining(Start); - if (!Segment || Segment->valno != VNI) { - if (Kills) - Kills->push_back(Start); - continue; - } - if (Segment->end < Stop) - Stop = Segment->end, ToEnd = false; - } - - // There could already be a short def at Start. - if (I.valid() && I.start() <= Start) { - // Stop when meeting a different location or an already extended interval. - Start = Start.getNextSlot(); - if (I.value() != LocNo || I.stop() != Start) - continue; - // This is a one-slot placeholder. 
Just skip it. - ++I; + SlotIndex Start = Idx; + MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start); + SlotIndex Stop = LIS.getMBBEndIdx(MBB); + LocMap::iterator I = locInts.find(Start); + + // Limit to VNI's live range. + bool ToEnd = true; + if (LR && VNI) { + LiveInterval::Segment *Segment = LR->getSegmentContaining(Start); + if (!Segment || Segment->valno != VNI) { + if (Kills) + Kills->push_back(Start); + return; } + if (Segment->end < Stop) + Stop = Segment->end, ToEnd = false; + } - // Limited by the next def. - if (I.valid() && I.start() < Stop) - Stop = I.start(), ToEnd = false; - // Limited by VNI's live range. - else if (!ToEnd && Kills) - Kills->push_back(Stop); + // There could already be a short def at Start. + if (I.valid() && I.start() <= Start) { + // Stop when meeting a different location or an already extended interval. + Start = Start.getNextSlot(); + if (I.value() != LocNo || I.stop() != Start) + return; + // This is a one-slot placeholder. Just skip it. + ++I; + } - if (Start >= Stop) - continue; + // Limited by the next def. + if (I.valid() && I.start() < Stop) + Stop = I.start(), ToEnd = false; + // Limited by VNI's live range. + else if (!ToEnd && Kills) + Kills->push_back(Stop); + if (Start < Stop) I.insert(Start, Stop, LocNo); - - // If we extended to the MBB end, propagate down the dominator tree. - if (!ToEnd) - continue; - const std::vector<MachineDomTreeNode*> &Children = - MDT.getNode(MBB)->getChildren(); - for (unsigned i = 0, e = Children.size(); i != e; ++i) { - MachineBasicBlock *MBB = Children[i]->getBlock(); - if (UVS.dominates(MBB)) - Todo.push_back(LIS.getMBBStartIdx(MBB)); - } - } while (!Todo.empty()); } void @@ -763,7 +745,7 @@ static void removeDebugValues(MachineFunction &mf) { bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) { if (!EnableLDV) return false; - if (!FunctionDIs.count(mf.getFunction())) { + if (!mf.getFunction()->getSubprogram()) { removeDebugValues(mf); return false; } @@ -1004,11 +986,11 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS, SlotIndex Stop = I.stop(); unsigned LocNo = I.value(); DEBUG(dbgs() << "\t[" << Start << ';' << Stop << "):" << LocNo); - MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start); - SlotIndex MBBEnd = LIS.getMBBEndIdx(MBB); + MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator(); + SlotIndex MBBEnd = LIS.getMBBEndIdx(&*MBB); DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd); - insertDebugValue(MBB, Start, LocNo, LIS, TII); + insertDebugValue(&*MBB, Start, LocNo, LIS, TII); // This interval may span multiple basic blocks. // Insert a DBG_VALUE into each one. 
while(Stop > MBBEnd) { @@ -1016,9 +998,9 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS, Start = MBBEnd; if (++MBB == MFEnd) break; - MBBEnd = LIS.getMBBEndIdx(MBB); + MBBEnd = LIS.getMBBEndIdx(&*MBB); DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd); - insertDebugValue(MBB, Start, LocNo, LIS, TII); + insertDebugValue(&*MBB, Start, LocNo, LIS, TII); } DEBUG(dbgs() << '\n'); if (MBB == MFEnd) @@ -1047,7 +1029,6 @@ void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) { } bool LiveDebugVariables::doInitialization(Module &M) { - FunctionDIs = makeSubprogramMap(M); return Pass::doInitialization(M); } diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h index 694aa17..3d36f4d 100644 --- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h +++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h @@ -33,7 +33,6 @@ class VirtRegMap; class LLVM_LIBRARY_VISIBILITY LiveDebugVariables : public MachineFunctionPass { void *pImpl; - DenseMap<const Function *, DISubprogram *> FunctionDIs; public: static char ID; // Pass identification, replacement for typeid diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp index d75e441..bb34883 100644 --- a/contrib/llvm/lib/CodeGen/LiveInterval.cpp +++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp @@ -26,7 +26,6 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> @@ -865,7 +864,7 @@ void LiveInterval::constructMainRangeFromSubranges( // - If any of the subranges is live at a point the main liverange has to be // live too, conversely if no subrange is live the main range mustn't be // live either. - // We do this by scannig through all the subranges simultaneously creating new + // We do this by scanning through all the subranges simultaneously creating new // segments in the main range as segments start/ends come up in the subranges. assert(hasSubRanges() && "expected subranges to be present"); assert(segments.empty() && valnos.empty() && "expected empty main range"); @@ -889,7 +888,7 @@ void LiveInterval::constructMainRangeFromSubranges( Segment CurrentSegment; bool ConstructingSegment = false; bool NeedVNIFixup = false; - unsigned ActiveMask = 0; + LaneBitmask ActiveMask = 0; SlotIndex Pos = First; while (true) { SlotIndex NextPos = Last; @@ -899,7 +898,7 @@ void LiveInterval::constructMainRangeFromSubranges( END_SEGMENT, } Event = NOTHING; // Which subregister lanes are affected by the current event. - unsigned EventMask = 0; + LaneBitmask EventMask = 0; // Whether a BEGIN_SEGMENT is also a valno definition point. bool IsDef = false; // Find the next begin or end of a subrange segment. Combine masks if we @@ -1066,7 +1065,7 @@ void LiveInterval::print(raw_ostream &OS) const { super::print(OS); // Print subranges for (const SubRange &SR : subranges()) { - OS << format(" L%04X ", SR.LaneMask) << SR; + OS << " L" << PrintLaneMask(SR.LaneMask) << ' ' << SR; } } @@ -1101,8 +1100,8 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const { super::verify(); // Make sure SubRanges are fine and LaneMasks are disjoint. - unsigned Mask = 0; - unsigned MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg) : ~0u; + LaneBitmask Mask = 0; + LaneBitmask MaxMask = MRI != nullptr ? 
MRI->getMaxLaneMaskForVReg(reg) : ~0u; for (const SubRange &SR : subranges()) { // The subrange lane mask should be disjoint from any previous subrange masks. assert((Mask & SR.LaneMask) == 0); @@ -1110,6 +1109,8 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const { // The subrange mask should be contained in the maximum lane mask for the vreg. assert((Mask & ~MaxMask) == 0); + // Empty subranges must be removed. + assert(!SR.empty()); SR.verify(); // Main liverange should cover subrange. @@ -1327,15 +1328,15 @@ void LiveRangeUpdater::flush() { LR->verify(); } -unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { +unsigned ConnectedVNInfoEqClasses::Classify(const LiveRange &LR) { // Create initial equivalence classes. EqClass.clear(); - EqClass.grow(LI->getNumValNums()); + EqClass.grow(LR.getNumValNums()); const VNInfo *used = nullptr, *unused = nullptr; // Determine connections. - for (const VNInfo *VNI : LI->valnos) { + for (const VNInfo *VNI : LR.valnos) { // Group all unused values into one class. if (VNI->isUnused()) { if (unused) @@ -1350,14 +1351,14 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { // Connect to values live out of predecessors. for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PE = MBB->pred_end(); PI != PE; ++PI) - if (const VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(*PI))) + if (const VNInfo *PVNI = LR.getVNInfoBefore(LIS.getMBBEndIdx(*PI))) EqClass.join(VNI->id, PVNI->id); } else { // Normal value defined by an instruction. Check for two-addr redef. // FIXME: This could be coincidental. Should we really check for a tied // operand constraint? // Note that VNI->def may be a use slot for an early clobber def. - if (const VNInfo *UVNI = LI->getVNInfoBefore(VNI->def)) + if (const VNInfo *UVNI = LR.getVNInfoBefore(VNI->def)) EqClass.join(VNI->id, UVNI->id); } } @@ -1370,11 +1371,42 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { return EqClass.getNumClasses(); } -void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[], - MachineRegisterInfo &MRI) { - assert(LIV[0] && "LIV[0] must be set"); - LiveInterval &LI = *LIV[0]; +template<typename LiveRangeT, typename EqClassesT> +static void DistributeRange(LiveRangeT &LR, LiveRangeT *SplitLRs[], + EqClassesT VNIClasses) { + // Move segments to new intervals. + LiveRange::iterator J = LR.begin(), E = LR.end(); + while (J != E && VNIClasses[J->valno->id] == 0) + ++J; + for (LiveRange::iterator I = J; I != E; ++I) { + if (unsigned eq = VNIClasses[I->valno->id]) { + assert((SplitLRs[eq-1]->empty() || SplitLRs[eq-1]->expiredAt(I->start)) && + "New intervals should be empty"); + SplitLRs[eq-1]->segments.push_back(*I); + } else + *J++ = *I; + } + LR.segments.erase(J, E); + + // Transfer VNInfos to their new owners and renumber them. + unsigned j = 0, e = LR.getNumValNums(); + while (j != e && VNIClasses[j] == 0) + ++j; + for (unsigned i = j; i != e; ++i) { + VNInfo *VNI = LR.getValNumInfo(i); + if (unsigned eq = VNIClasses[i]) { + VNI->id = SplitLRs[eq-1]->getNumValNums(); + SplitLRs[eq-1]->valnos.push_back(VNI); + } else { + VNI->id = j; + LR.valnos[j++] = VNI; + } + } + LR.valnos.resize(j); +} +void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[], + MachineRegisterInfo &MRI) { // Rewrite instructions. for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LI.reg), RE = MRI.reg_end(); RI != RE;) { @@ -1396,38 +1428,41 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[], // NULL. 
If the use is tied to a def, VNI will be the defined value. if (!VNI) continue; - MO.setReg(LIV[getEqClass(VNI)]->reg); - } - - // Move runs to new intervals. - LiveInterval::iterator J = LI.begin(), E = LI.end(); - while (J != E && EqClass[J->valno->id] == 0) - ++J; - for (LiveInterval::iterator I = J; I != E; ++I) { - if (unsigned eq = EqClass[I->valno->id]) { - assert((LIV[eq]->empty() || LIV[eq]->expiredAt(I->start)) && - "New intervals should be empty"); - LIV[eq]->segments.push_back(*I); - } else - *J++ = *I; + if (unsigned EqClass = getEqClass(VNI)) + MO.setReg(LIV[EqClass-1]->reg); } - // TODO: do not cheat anymore by simply cleaning all subranges - LI.clearSubRanges(); - LI.segments.erase(J, E); - // Transfer VNInfos to their new owners and renumber them. - unsigned j = 0, e = LI.getNumValNums(); - while (j != e && EqClass[j] == 0) - ++j; - for (unsigned i = j; i != e; ++i) { - VNInfo *VNI = LI.getValNumInfo(i); - if (unsigned eq = EqClass[i]) { - VNI->id = LIV[eq]->getNumValNums(); - LIV[eq]->valnos.push_back(VNI); - } else { - VNI->id = j; - LI.valnos[j++] = VNI; + // Distribute subregister liveranges. + if (LI.hasSubRanges()) { + unsigned NumComponents = EqClass.getNumClasses(); + SmallVector<unsigned, 8> VNIMapping; + SmallVector<LiveInterval::SubRange*, 8> SubRanges; + BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); + for (LiveInterval::SubRange &SR : LI.subranges()) { + // Create new subranges in the split intervals and construct a mapping + // for the VNInfos in the subrange. + unsigned NumValNos = SR.valnos.size(); + VNIMapping.clear(); + VNIMapping.reserve(NumValNos); + SubRanges.clear(); + SubRanges.resize(NumComponents-1, nullptr); + for (unsigned I = 0; I < NumValNos; ++I) { + const VNInfo &VNI = *SR.valnos[I]; + const VNInfo *MainRangeVNI = LI.getVNInfoAt(VNI.def); + assert(MainRangeVNI != nullptr + && "SubRange def must have corresponding main range def"); + unsigned ComponentNum = getEqClass(MainRangeVNI); + VNIMapping.push_back(ComponentNum); + if (ComponentNum > 0 && SubRanges[ComponentNum-1] == nullptr) { + SubRanges[ComponentNum-1] + = LIV[ComponentNum-1]->createSubRange(Allocator, SR.LaneMask); + } + } + DistributeRange(SR, SubRanges.data(), VNIMapping); } + LI.removeEmptySubRanges(); } - LI.valnos.resize(j); + + // Distribute main liverange. 
+ DistributeRange(LI, LIV, EqClass); } diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp index c00b010..a506e05 100644 --- a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -48,7 +47,7 @@ char LiveIntervals::ID = 0; char &llvm::LiveIntervalsID = LiveIntervals::ID; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) @@ -76,8 +75,8 @@ cl::opt<bool> UseSegmentSetForPhysRegs( void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); // LiveVariables isn't really required by this analysis, it is only required // here to make sure it is live during TwoAddressInstructionPass and // PHIElimination. This is temporary. @@ -124,7 +123,7 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { MRI = &MF->getRegInfo(); TRI = MF->getSubtarget().getRegisterInfo(); TII = MF->getSubtarget().getInstrInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); Indexes = &getAnalysis<SlotIndexes>(); DomTree = &getAnalysis<MachineDominatorTree>(); @@ -198,9 +197,16 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) { void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LRCalc && "LRCalc not initialized."); assert(LI.empty() && "Should only compute empty intervals."); + bool ShouldTrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(LI.reg); LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); - LRCalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg)); - computeDeadValues(LI, nullptr); + LRCalc->calculate(LI, ShouldTrackSubRegLiveness); + bool SeparatedComponents = computeDeadValues(LI, nullptr); + if (SeparatedComponents) { + assert(ShouldTrackSubRegLiveness + && "Separated components should only occur for unused subreg defs"); + SmallVector<LiveInterval*, 8> SplitLIs; + splitSeparateComponents(LI, SplitLIs); + } } void LiveIntervals::computeVirtRegs() { @@ -216,19 +222,31 @@ void LiveIntervals::computeRegMasks() { RegMaskBlocks.resize(MF->getNumBlockIDs()); // Find all instructions with regmask operands. - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()]; + for (MachineBasicBlock &MBB : *MF) { + std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()]; RMB.first = RegMaskSlots.size(); - for (MachineBasicBlock::iterator MI = MBB->begin(), ME = MBB->end(); - MI != ME; ++MI) - for (const MachineOperand &MO : MI->operands()) { + + // Some block starts, such as EH funclets, create masks. 
+ if (const uint32_t *Mask = MBB.getBeginClobberMask(TRI)) { + RegMaskSlots.push_back(Indexes->getMBBStartIdx(&MBB)); + RegMaskBits.push_back(Mask); + } + + for (MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { if (!MO.isRegMask()) continue; - RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot()); - RegMaskBits.push_back(MO.getRegMask()); + RegMaskSlots.push_back(Indexes->getInstructionIndex(&MI).getRegSlot()); + RegMaskBits.push_back(MO.getRegMask()); } + } + + // Some block ends, such as funclet returns, create masks. + if (const uint32_t *Mask = MBB.getEndClobberMask(TRI)) { + RegMaskSlots.push_back(Indexes->getMBBEndIdx(&MBB)); + RegMaskBits.push_back(Mask); + } + // Compute the number of register mask instructions in this block. RMB.second = RegMaskSlots.size() - RMB.first; } @@ -296,18 +314,17 @@ void LiveIntervals::computeLiveInRegUnits() { // Check all basic blocks for live-ins. for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); MFI != MFE; ++MFI) { - const MachineBasicBlock *MBB = MFI; + const MachineBasicBlock *MBB = &*MFI; // We only care about ABI blocks: Entry + landing pads. - if ((MFI != MF->begin() && !MBB->isLandingPad()) || MBB->livein_empty()) + if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty()) continue; // Create phi-defs at Begin for all live-in registers. SlotIndex Begin = Indexes->getMBBStartIdx(MBB); DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber()); - for (MachineBasicBlock::livein_iterator LII = MBB->livein_begin(), - LIE = MBB->livein_end(); LII != LIE; ++LII) { - for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) { + for (const auto &LI : MBB->liveins()) { + for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = *Units; LiveRange *LR = RegUnitRanges[Unit]; if (!LR) { @@ -396,9 +413,6 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, } } -/// shrinkToUses - After removing some uses of a register, shrink its live -/// range to just the remaining uses. This method does not compute reaching -/// defs for new uses, and it doesn't remove dead defs. bool LiveIntervals::shrinkToUses(LiveInterval *li, SmallVectorImpl<MachineInstr*> *dead) { DEBUG(dbgs() << "Shrink: " << *li << '\n'); @@ -406,9 +420,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, && "Can only shrink virtual registers"); // Shrink subregister live ranges. + bool NeedsCleanup = false; for (LiveInterval::SubRange &S : li->subranges()) { shrinkToUses(S, li->reg); + if (S.empty()) + NeedsCleanup = true; } + if (NeedsCleanup) + li->removeEmptySubRanges(); // Find all the values used, including PHI kills. ShrinkToUsesWorkList WorkList; @@ -456,7 +475,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, bool LiveIntervals::computeDeadValues(LiveInterval &LI, SmallVectorImpl<MachineInstr*> *dead) { - bool PHIRemoved = false; + bool MayHaveSplitComponents = false; for (auto VNI : LI.valnos) { if (VNI->isUnused()) continue; @@ -466,10 +485,13 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI, // Is the register live before? Otherwise we may have to add a read-undef // flag for subregister defs. 
- if (MRI->shouldTrackSubRegLiveness(LI.reg)) { + bool DeadBeforeDef = false; + unsigned VReg = LI.reg; + if (MRI->shouldTrackSubRegLiveness(VReg)) { if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) { MachineInstr *MI = getInstructionFromIndex(Def); - MI->addRegisterDefReadUndef(LI.reg); + MI->setRegisterDefReadUndef(VReg); + DeadBeforeDef = true; } } @@ -480,19 +502,27 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI, VNI->markUnused(); LI.removeSegment(I); DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n"); - PHIRemoved = true; + MayHaveSplitComponents = true; } else { // This is a dead def. Make sure the instruction knows. MachineInstr *MI = getInstructionFromIndex(Def); assert(MI && "No instruction defining live value"); - MI->addRegisterDead(LI.reg, TRI); + MI->addRegisterDead(VReg, TRI); + + // If we have a dead def that is completely separate from the rest of + // the liverange then we rewrite it to use a different VReg to not violate + // the rule that the liveness of a virtual register forms a connected + // component. This should only happen if subregister liveness is tracked. + if (DeadBeforeDef) + MayHaveSplitComponents = true; + if (dead && MI->allDefsAreDead()) { DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI); dead->push_back(MI); } } } - return PHIRemoved; + return MayHaveSplitComponents; } void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) @@ -512,8 +542,8 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) // Maybe the operand is for a subregister we don't care about. unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { - unsigned SubRegMask = TRI->getSubRegIndexLaneMask(SubReg); - if ((SubRegMask & SR.LaneMask) == 0) + LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg); + if ((LaneMask & SR.LaneMask) == 0) continue; } // We only need to visit each instruction once. @@ -712,7 +742,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // assign R0L to %vreg1, and R0 to %vreg2 because the low 32bits of R0 // are actually never written by %vreg2. After assignment the <kill> // flag at the read instruction is invalid. - unsigned DefinedLanesMask; + LaneBitmask DefinedLanesMask; if (!SRs.empty()) { // Compute a mask of lanes that are defined. DefinedLanesMask = 0; @@ -736,7 +766,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { continue; if (MO.isUse()) { // Reading any undefined lanes? - unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); if ((UseMask & ~DefinedLanesMask) != 0) goto CancelKill; } else if (MO.getSubReg() == 0) { @@ -944,7 +974,7 @@ public: LiveInterval &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { unsigned SubReg = MO.getSubReg(); - unsigned LaneMask = TRI.getSubRegIndexLaneMask(SubReg); + LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubReg); for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask) == 0) continue; @@ -968,7 +998,7 @@ public: private: /// Update a single live range, assuming an instruction has been moved from /// OldIdx to NewIdx. 
- void updateRange(LiveRange &LR, unsigned Reg, unsigned LaneMask) { + void updateRange(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) { if (!Updated.insert(&LR).second) return; DEBUG({ @@ -976,7 +1006,7 @@ private: if (TargetRegisterInfo::isVirtualRegister(Reg)) { dbgs() << PrintReg(Reg); if (LaneMask != 0) - dbgs() << format(" L%04X", LaneMask); + dbgs() << " L" << PrintLaneMask(LaneMask); } else { dbgs() << PrintRegUnit(Reg, &TRI); } @@ -1098,7 +1128,7 @@ private: /// Hoist kill to NewIdx, then scan for last kill between NewIdx and /// OldIdx. /// - void handleMoveUp(LiveRange &LR, unsigned Reg, unsigned LaneMask) { + void handleMoveUp(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) { // First look for a kill at OldIdx. LiveRange::iterator I = LR.find(OldIdx.getBaseIndex()); LiveRange::iterator E = LR.end(); @@ -1175,7 +1205,7 @@ private: } // Return the last use of reg between NewIdx and OldIdx. - SlotIndex findLastUseBefore(unsigned Reg, unsigned LaneMask) { + SlotIndex findLastUseBefore(unsigned Reg, LaneBitmask LaneMask) { if (TargetRegisterInfo::isVirtualRegister(Reg)) { SlotIndex LastUse = NewIdx; @@ -1255,7 +1285,7 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin, const MachineBasicBlock::iterator End, const SlotIndex endIdx, LiveRange &LR, const unsigned Reg, - const unsigned LaneMask) { + LaneBitmask LaneMask) { LiveInterval::iterator LII = LR.find(endIdx); SlotIndex lastUseIdx; if (LII != LR.end() && LII->start < endIdx) @@ -1282,7 +1312,7 @@ void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin, continue; unsigned SubReg = MO.getSubReg(); - unsigned Mask = TRI->getSubRegIndexLaneMask(SubReg); + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg); if ((Mask & LaneMask) == 0) continue; @@ -1412,3 +1442,20 @@ void LiveIntervals::removeVRegDefAt(LiveInterval &LI, SlotIndex Pos) { } LI.removeEmptySubRanges(); } + +void LiveIntervals::splitSeparateComponents(LiveInterval &LI, + SmallVectorImpl<LiveInterval*> &SplitLIs) { + ConnectedVNInfoEqClasses ConEQ(*this); + unsigned NumComp = ConEQ.Classify(LI); + if (NumComp <= 1) + return; + DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n'); + unsigned Reg = LI.reg; + const TargetRegisterClass *RegClass = MRI->getRegClass(Reg); + for (unsigned I = 1; I < NumComp; ++I) { + unsigned NewVReg = MRI->createVirtualRegister(RegClass); + LiveInterval &NewLI = createEmptyInterval(NewVReg); + SplitLIs.push_back(&NewLI); + } + ConEQ.Distribute(LI, SplitLIs.data(), *MRI); +} diff --git a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp index cbd98e3..efbbcbe 100644 --- a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp +++ b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp @@ -68,7 +68,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) { /// Simulates liveness when stepping forward over an instruction (bundle): Remove /// killed uses, add defs. This is not the recommended way, because it depends -/// on accurate kill flags. If possible use stepBackwards() instead of this +/// on accurate kill flags. If possible use stepBackward() instead of this /// function. void LivePhysRegs::stepForward(const MachineInstr &MI, SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) { @@ -128,8 +128,8 @@ void LivePhysRegs::dump() const { /// Add live-in registers of basic block \p MBB to \p LiveRegs. 
static void addLiveIns(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB) { - for (unsigned Reg : make_range(MBB.livein_begin(), MBB.livein_end())) - LiveRegs.addReg(Reg); + for (const auto &LI : MBB.liveins()) + LiveRegs.addReg(LI.PhysReg); } /// Add pristine registers to the given \p LiveRegs. This function removes @@ -147,11 +147,19 @@ static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF, } void LivePhysRegs::addLiveOuts(const MachineBasicBlock *MBB, - bool AddPristines) { - if (AddPristines) { + bool AddPristinesAndCSRs) { + if (AddPristinesAndCSRs) { const MachineFunction &MF = *MBB->getParent(); addPristines(*this, MF, *TRI); + if (!MBB->isReturnBlock()) { + // The return block has no successors whose live-ins we could merge + // below. So instead we add the callee saved registers manually. + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) + addReg(*I); + } } + + // To get the live-outs we simply merge the live-ins of all successors. for (const MachineBasicBlock *Succ : MBB->successors()) ::addLiveIns(*this, *Succ); } diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp index bb2877a..c408615 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -64,23 +64,23 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) { unsigned SubReg = MO.getSubReg(); if (LI.hasSubRanges() || (SubReg != 0 && TrackSubRegs)) { - unsigned Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg) - : MRI->getMaxLaneMaskForVReg(Reg); + LaneBitmask Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg) + : MRI->getMaxLaneMaskForVReg(Reg); // If this is the first time we see a subregister def, initialize // subranges by creating a copy of the main range. if (!LI.hasSubRanges() && !LI.empty()) { - unsigned ClassMask = MRI->getMaxLaneMaskForVReg(Reg); + LaneBitmask ClassMask = MRI->getMaxLaneMaskForVReg(Reg); LI.createSubRangeFrom(*Alloc, ClassMask, LI); } for (LiveInterval::SubRange &S : LI.subranges()) { // A Mask for subregs common to the existing subrange and current def. - unsigned Common = S.LaneMask & Mask; + LaneBitmask Common = S.LaneMask & Mask; if (Common == 0) continue; // A Mask for subregs covered by the subrange but not the current def. - unsigned LRest = S.LaneMask & ~Mask; + LaneBitmask LRest = S.LaneMask & ~Mask; LiveInterval::SubRange *CommonRange; if (LRest != 0) { // Split current subrange into Common and LRest ranges. @@ -138,7 +138,8 @@ void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) { } -void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, unsigned Mask) { +void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, + LaneBitmask Mask) { // Visit all operands that read Reg. This may include partial defs. const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo(); for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) { @@ -157,7 +158,7 @@ void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, unsigned Mask) { continue; unsigned SubReg = MO.getSubReg(); if (SubReg != 0) { - unsigned SubRegMask = TRI.getSubRegIndexLaneMask(SubReg); + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg); // Ignore uses not covering the current subrange. 
if ((SubRegMask & Mask) == 0) continue; diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h index 34d9953..ff38c68 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h +++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h @@ -129,7 +129,7 @@ class LiveRangeCalc { /// /// All uses must be jointly dominated by existing liveness. PHI-defs are /// inserted as needed to preserve SSA form. - void extendToUses(LiveRange &LR, unsigned Reg, unsigned LaneMask); + void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask); /// Reset Map and Seen fields. void resetLiveOutMap(); diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp index 08bbe0c..5ce364a 100644 --- a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -226,7 +226,7 @@ bool LiveRangeEdit::useIsKill(const LiveInterval &LI, return true; const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); unsigned SubReg = MO.getSubReg(); - unsigned LaneMask = TRI.getSubRegIndexLaneMask(SubReg); + LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubReg); for (const LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask) != 0 && S.Query(Idx).isKill()) return true; @@ -349,8 +349,9 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, ToShrink.pop_back(); if (foldAsLoad(LI, Dead)) continue; + unsigned VReg = LI->reg; if (TheDelegate) - TheDelegate->LRE_WillShrinkVirtReg(LI->reg); + TheDelegate->LRE_WillShrinkVirtReg(VReg); if (!LIS.shrinkToUses(LI, &Dead)) continue; @@ -360,7 +361,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // them results in incorrect code. bool BeingSpilled = false; for (unsigned i = 0, e = RegsBeingSpilled.size(); i != e; ++i) { - if (LI->reg == RegsBeingSpilled[i]) { + if (VReg == RegsBeingSpilled[i]) { BeingSpilled = true; break; } @@ -370,29 +371,21 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // LI may have been separated, create new intervals. LI->RenumberValues(); - ConnectedVNInfoEqClasses ConEQ(LIS); - unsigned NumComp = ConEQ.Classify(LI); - if (NumComp <= 1) - continue; - ++NumFracRanges; - bool IsOriginal = VRM && VRM->getOriginal(LI->reg) == LI->reg; - DEBUG(dbgs() << NumComp << " components: " << *LI << '\n'); - SmallVector<LiveInterval*, 8> Dups(1, LI); - for (unsigned i = 1; i != NumComp; ++i) { - Dups.push_back(&createEmptyIntervalFrom(LI->reg)); + SmallVector<LiveInterval*, 8> SplitLIs; + LIS.splitSeparateComponents(*LI, SplitLIs); + if (!SplitLIs.empty()) + ++NumFracRanges; + + unsigned Original = VRM ? VRM->getOriginal(VReg) : 0; + for (const LiveInterval *SplitLI : SplitLIs) { // If LI is an original interval that hasn't been split yet, make the new // intervals their own originals instead of referring to LI. The original // interval must contain all the split products, and LI doesn't. 
- if (IsOriginal) - VRM->setIsSplitFromReg(Dups.back()->reg, 0); + if (Original != VReg && Original != 0) + VRM->setIsSplitFromReg(SplitLI->reg, Original); if (TheDelegate) - TheDelegate->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); + TheDelegate->LRE_DidCloneVirtReg(SplitLI->reg, VReg); } - ConEQ.Distribute(&Dups[0], MRI); - DEBUG({ - for (unsigned i = 0; i != NumComp; ++i) - dbgs() << '\t' << *Dups[i] << '\n'; - }); } } @@ -411,7 +404,7 @@ void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF, const MachineLoopInfo &Loops, const MachineBlockFrequencyInfo &MBFI) { - VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI); + VirtRegAuxInfo VRAI(MF, LIS, VRM, Loops, MBFI); for (unsigned I = 0, Size = size(); I < Size; ++I) { LiveInterval &LI = LIS.getInterval(get(I)); if (MRI.recomputeRegClass(LI.reg)) diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp index 9ea031d..7ee87c1 100644 --- a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -15,12 +15,11 @@ #include "RegisterCoalescer.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -49,7 +48,6 @@ void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const { bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); - MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); VRM = &getAnalysis<VirtRegMap>(); @@ -78,7 +76,7 @@ bool foreachUnit(const TargetRegisterInfo *TRI, LiveInterval &VRegInterval, if (VRegInterval.hasSubRanges()) { for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { unsigned Unit = (*Units).first; - unsigned Mask = (*Units).second; + LaneBitmask Mask = (*Units).second; for (LiveInterval::SubRange &S : VRegInterval.subranges()) { if (S.LaneMask & Mask) { if (Func(Unit, S)) @@ -101,7 +99,6 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) { << " to " << PrintReg(PhysReg, TRI) << ':'); assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment"); VRM->assignVirt2Phys(VirtReg.reg, PhysReg); - MRI->setPhysRegUsed(PhysReg); foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) { diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp index b355393..06b86d8 100644 --- a/contrib/llvm/lib/CodeGen/LiveVariables.cpp +++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp @@ -522,11 +522,15 @@ void LiveVariables::runOnInstr(MachineInstr *MI, continue; unsigned MOReg = MO.getReg(); if (MO.isUse()) { - MO.setIsKill(false); + if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) && + MRI->isReserved(MOReg))) + MO.setIsKill(false); if (MO.readsReg()) UseRegs.push_back(MOReg); } else /*MO.isDef()*/ { - MO.setIsDead(false); + if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) && + MRI->isReserved(MOReg))) + MO.setIsDead(false); DefRegs.push_back(MOReg); } } @@ -559,11 +563,10 @@ void LiveVariables::runOnInstr(MachineInstr *MI, void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) { // Mark live-in registers as live-in. 
SmallVector<unsigned, 4> Defs; - for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(), - EE = MBB->livein_end(); II != EE; ++II) { - assert(TargetRegisterInfo::isPhysicalRegister(*II) && + for (const auto &LI : MBB->liveins()) { + assert(TargetRegisterInfo::isPhysicalRegister(LI.PhysReg) && "Cannot have a live-in virtual register!"); - HandlePhysRegDef(*II, nullptr, Defs); + HandlePhysRegDef(LI.PhysReg, nullptr, Defs); } // Loop over all of the instructions, processing them. @@ -599,14 +602,12 @@ void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) { for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { MachineBasicBlock *SuccMBB = *SI; - if (SuccMBB->isLandingPad()) + if (SuccMBB->isEHPad()) continue; - for (MachineBasicBlock::livein_iterator LI = SuccMBB->livein_begin(), - LE = SuccMBB->livein_end(); LI != LE; ++LI) { - unsigned LReg = *LI; - if (!TRI->isInAllocatableClass(LReg)) + for (const auto &LI : SuccMBB->liveins()) { + if (!TRI->isInAllocatableClass(LI.PhysReg)) // Ignore other live-ins, e.g. those that are live into landing pads. - LiveOuts.insert(LReg); + LiveOuts.insert(LI.PhysReg); } } @@ -640,7 +641,7 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { // function. This guarantees that we will see the definition of a virtual // register before its uses due to dominance properties of SSA (except for PHI // nodes, which are treated as a special case). - MachineBasicBlock *Entry = MF->begin(); + MachineBasicBlock *Entry = &MF->front(); SmallPtrSet<MachineBasicBlock*,16> Visited; for (MachineBasicBlock *MBB : depth_first_ext(Entry, Visited)) { diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 8378429..eb60005 100644 --- a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -325,7 +325,7 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) { // Sort the frame references by local offset array_pod_sort(FrameReferenceInsns.begin(), FrameReferenceInsns.end()); - MachineBasicBlock *Entry = Fn.begin(); + MachineBasicBlock *Entry = &Fn.front(); unsigned BaseReg = 0; int64_t BaseOffset = 0; diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 482c33a..28f9d4e 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MILexer.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include <cctype> @@ -54,15 +55,132 @@ public: } // end anonymous namespace +MIToken &MIToken::reset(TokenKind Kind, StringRef Range) { + this->Kind = Kind; + this->Range = Range; + return *this; +} + +MIToken &MIToken::setStringValue(StringRef StrVal) { + StringValue = StrVal; + return *this; +} + +MIToken &MIToken::setOwnedStringValue(std::string StrVal) { + StringValueStorage = std::move(StrVal); + StringValue = StringValueStorage; + return *this; +} + +MIToken &MIToken::setIntegerValue(APSInt IntVal) { + this->IntVal = std::move(IntVal); + return *this; +} + /// Skip the leading whitespace characters and return the updated cursor. 
static Cursor skipWhitespace(Cursor C) { - while (isspace(C.peek())) + while (isblank(C.peek())) + C.advance(); + return C; +} + +static bool isNewlineChar(char C) { return C == '\n' || C == '\r'; } + +/// Skip a line comment and return the updated cursor. +static Cursor skipComment(Cursor C) { + if (C.peek() != ';') + return C; + while (!isNewlineChar(C.peek()) && !C.isEOF()) C.advance(); return C; } +/// Return true if the given character satisfies the following regular +/// expression: [-a-zA-Z$._0-9] static bool isIdentifierChar(char C) { - return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.'; + return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.' || + C == '$'; +} + +/// Unescapes the given string value. +/// +/// Expects the string value to be quoted. +static std::string unescapeQuotedString(StringRef Value) { + assert(Value.front() == '"' && Value.back() == '"'); + Cursor C = Cursor(Value.substr(1, Value.size() - 2)); + + std::string Str; + Str.reserve(C.remaining().size()); + while (!C.isEOF()) { + char Char = C.peek(); + if (Char == '\\') { + if (C.peek(1) == '\\') { + // Two '\' become one + Str += '\\'; + C.advance(2); + continue; + } + if (isxdigit(C.peek(1)) && isxdigit(C.peek(2))) { + Str += hexDigitValue(C.peek(1)) * 16 + hexDigitValue(C.peek(2)); + C.advance(3); + continue; + } + } + Str += Char; + C.advance(); + } + return Str; +} + +/// Lex a string constant using the following regular expression: \"[^\"]*\" +static Cursor lexStringConstant( + Cursor C, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + assert(C.peek() == '"'); + for (C.advance(); C.peek() != '"'; C.advance()) { + if (C.isEOF() || isNewlineChar(C.peek())) { + ErrorCallback( + C.location(), + "end of machine instruction reached before the closing '\"'"); + return None; + } + } + C.advance(); + return C; +} + +static Cursor lexName( + Cursor C, MIToken &Token, MIToken::TokenKind Type, unsigned PrefixLength, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + auto Range = C; + C.advance(PrefixLength); + if (C.peek() == '"') { + if (Cursor R = lexStringConstant(C, ErrorCallback)) { + StringRef String = Range.upto(R); + Token.reset(Type, String) + .setOwnedStringValue( + unescapeQuotedString(String.drop_front(PrefixLength))); + return R; + } + Token.reset(MIToken::Error, Range.remaining()); + return Range; + } + while (isIdentifierChar(C.peek())) + C.advance(); + Token.reset(Type, Range.upto(C)) + .setStringValue(Range.upto(C).drop_front(PrefixLength)); + return C; +} + +static Cursor maybeLexIntegerType(Cursor C, MIToken &Token) { + if (C.peek() != 'i' || !isdigit(C.peek(1))) + return None; + auto Range = C; + C.advance(); // Skip 'i' + while (isdigit(C.peek())) + C.advance(); + Token.reset(MIToken::IntegerType, Range.upto(C)); + return C; } static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { @@ -70,32 +188,70 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("_", MIToken::underscore) .Case("implicit", MIToken::kw_implicit) .Case("implicit-def", MIToken::kw_implicit_define) + .Case("def", MIToken::kw_def) .Case("dead", MIToken::kw_dead) .Case("killed", MIToken::kw_killed) .Case("undef", MIToken::kw_undef) + .Case("internal", MIToken::kw_internal) + .Case("early-clobber", MIToken::kw_early_clobber) + .Case("debug-use", MIToken::kw_debug_use) + .Case("tied-def", MIToken::kw_tied_def) + .Case("frame-setup", MIToken::kw_frame_setup) + .Case("debug-location", MIToken::kw_debug_location) 
+ .Case(".cfi_same_value", MIToken::kw_cfi_same_value) + .Case(".cfi_offset", MIToken::kw_cfi_offset) + .Case(".cfi_def_cfa_register", MIToken::kw_cfi_def_cfa_register) + .Case(".cfi_def_cfa_offset", MIToken::kw_cfi_def_cfa_offset) + .Case(".cfi_def_cfa", MIToken::kw_cfi_def_cfa) + .Case("blockaddress", MIToken::kw_blockaddress) + .Case("target-index", MIToken::kw_target_index) + .Case("half", MIToken::kw_half) + .Case("float", MIToken::kw_float) + .Case("double", MIToken::kw_double) + .Case("x86_fp80", MIToken::kw_x86_fp80) + .Case("fp128", MIToken::kw_fp128) + .Case("ppc_fp128", MIToken::kw_ppc_fp128) + .Case("target-flags", MIToken::kw_target_flags) + .Case("volatile", MIToken::kw_volatile) + .Case("non-temporal", MIToken::kw_non_temporal) + .Case("invariant", MIToken::kw_invariant) + .Case("align", MIToken::kw_align) + .Case("stack", MIToken::kw_stack) + .Case("got", MIToken::kw_got) + .Case("jump-table", MIToken::kw_jump_table) + .Case("constant-pool", MIToken::kw_constant_pool) + .Case("call-entry", MIToken::kw_call_entry) + .Case("liveout", MIToken::kw_liveout) + .Case("address-taken", MIToken::kw_address_taken) + .Case("landing-pad", MIToken::kw_landing_pad) + .Case("liveins", MIToken::kw_liveins) + .Case("successors", MIToken::kw_successors) .Default(MIToken::Identifier); } static Cursor maybeLexIdentifier(Cursor C, MIToken &Token) { - if (!isalpha(C.peek()) && C.peek() != '_') + if (!isalpha(C.peek()) && C.peek() != '_' && C.peek() != '.') return None; auto Range = C; while (isIdentifierChar(C.peek())) C.advance(); auto Identifier = Range.upto(C); - Token = MIToken(getIdentifierKind(Identifier), Identifier); + Token.reset(getIdentifierKind(Identifier), Identifier) + .setStringValue(Identifier); return C; } static Cursor maybeLexMachineBasicBlock( Cursor C, MIToken &Token, function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { - if (!C.remaining().startswith("%bb.")) + bool IsReference = C.remaining().startswith("%bb."); + if (!IsReference && !C.remaining().startswith("bb.")) return None; auto Range = C; - C.advance(4); // Skip '%bb.' + unsigned PrefixLength = IsReference ? 4 : 3; + C.advance(PrefixLength); // Skip '%bb.' or 'bb.' if (!isdigit(C.peek())) { - Token = MIToken(MIToken::Error, C.remaining()); + Token.reset(MIToken::Error, C.remaining()); ErrorCallback(C.location(), "expected a number after '%bb.'"); return C; } @@ -103,26 +259,103 @@ static Cursor maybeLexMachineBasicBlock( while (isdigit(C.peek())) C.advance(); StringRef Number = NumberRange.upto(C); - unsigned StringOffset = 4 + Number.size(); // Drop '%bb.<id>' + unsigned StringOffset = PrefixLength + Number.size(); // Drop '%bb.<id>' if (C.peek() == '.') { C.advance(); // Skip '.' ++StringOffset; while (isIdentifierChar(C.peek())) C.advance(); } - Token = MIToken(MIToken::MachineBasicBlock, Range.upto(C), APSInt(Number), - StringOffset); + Token.reset(IsReference ? 
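[Note: maybeLexMachineBasicBlock() above accepts both the '%bb.<id>' reference form and the new 'bb.<id>' label form, each with an optional '.<name>' suffix, and records both the numeric id and the trailing name on the token. A rough standalone equivalent over a plain std::string, for illustration only:

#include <cctype>
#include <string>

struct BBLabel { bool IsReference; unsigned ID; std::string Name; };

// Returns the number of characters consumed, or 0 on no match.
size_t scanBBLabel(const std::string &S, BBLabel &Out) {
  size_t I = 0;
  Out.IsReference = S.compare(0, 4, "%bb.") == 0;
  if (Out.IsReference)
    I = 4;
  else if (S.compare(0, 3, "bb.") == 0)
    I = 3;
  else
    return 0;
  if (I >= S.size() || !isdigit((unsigned char)S[I]))
    return 0; // the real parser reports "expected a number after '%bb.'"
  Out.ID = 0;
  while (I < S.size() && isdigit((unsigned char)S[I]))
    Out.ID = Out.ID * 10 + (S[I++] - '0');
  Out.Name.clear();
  if (I < S.size() && S[I] == '.') { // optional '.<name>' suffix
    ++I;
    while (I < S.size() && (isalnum((unsigned char)S[I]) || S[I] == '_' ||
                            S[I] == '-' || S[I] == '.' || S[I] == '$'))
      Out.Name += S[I++];
  }
  return I;
}
]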
MIToken::MachineBasicBlock + : MIToken::MachineBasicBlockLabel, + Range.upto(C)) + .setIntegerValue(APSInt(Number)) + .setStringValue(Range.upto(C).drop_front(StringOffset)); + return C; +} + +static Cursor maybeLexIndex(Cursor C, MIToken &Token, StringRef Rule, + MIToken::TokenKind Kind) { + if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size()))) + return None; + auto Range = C; + C.advance(Rule.size()); + auto NumberRange = C; + while (isdigit(C.peek())) + C.advance(); + Token.reset(Kind, Range.upto(C)).setIntegerValue(APSInt(NumberRange.upto(C))); + return C; +} + +static Cursor maybeLexIndexAndName(Cursor C, MIToken &Token, StringRef Rule, + MIToken::TokenKind Kind) { + if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size()))) + return None; + auto Range = C; + C.advance(Rule.size()); + auto NumberRange = C; + while (isdigit(C.peek())) + C.advance(); + StringRef Number = NumberRange.upto(C); + unsigned StringOffset = Rule.size() + Number.size(); + if (C.peek() == '.') { + C.advance(); + ++StringOffset; + while (isIdentifierChar(C.peek())) + C.advance(); + } + Token.reset(Kind, Range.upto(C)) + .setIntegerValue(APSInt(Number)) + .setStringValue(Range.upto(C).drop_front(StringOffset)); return C; } +static Cursor maybeLexJumpTableIndex(Cursor C, MIToken &Token) { + return maybeLexIndex(C, Token, "%jump-table.", MIToken::JumpTableIndex); +} + +static Cursor maybeLexStackObject(Cursor C, MIToken &Token) { + return maybeLexIndexAndName(C, Token, "%stack.", MIToken::StackObject); +} + +static Cursor maybeLexFixedStackObject(Cursor C, MIToken &Token) { + return maybeLexIndex(C, Token, "%fixed-stack.", MIToken::FixedStackObject); +} + +static Cursor maybeLexConstantPoolItem(Cursor C, MIToken &Token) { + return maybeLexIndex(C, Token, "%const.", MIToken::ConstantPoolItem); +} + +static Cursor maybeLexIRBlock( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + const StringRef Rule = "%ir-block."; + if (!C.remaining().startswith(Rule)) + return None; + if (isdigit(C.peek(Rule.size()))) + return maybeLexIndex(C, Token, Rule, MIToken::IRBlock); + return lexName(C, Token, MIToken::NamedIRBlock, Rule.size(), ErrorCallback); +} + +static Cursor maybeLexIRValue( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + const StringRef Rule = "%ir."; + if (!C.remaining().startswith(Rule)) + return None; + if (isdigit(C.peek(Rule.size()))) + return maybeLexIndex(C, Token, Rule, MIToken::IRValue); + return lexName(C, Token, MIToken::NamedIRValue, Rule.size(), ErrorCallback); +} + static Cursor lexVirtualRegister(Cursor C, MIToken &Token) { auto Range = C; C.advance(); // Skip '%' auto NumberRange = C; while (isdigit(C.peek())) C.advance(); - Token = MIToken(MIToken::VirtualRegister, Range.upto(C), - APSInt(NumberRange.upto(C))); + Token.reset(MIToken::VirtualRegister, Range.upto(C)) + .setIntegerValue(APSInt(NumberRange.upto(C))); return C; } @@ -135,41 +368,112 @@ static Cursor maybeLexRegister(Cursor C, MIToken &Token) { C.advance(); // Skip '%' while (isIdentifierChar(C.peek())) C.advance(); - Token = MIToken(MIToken::NamedRegister, Range.upto(C), - /*StringOffset=*/1); // Drop the '%' + Token.reset(MIToken::NamedRegister, Range.upto(C)) + .setStringValue(Range.upto(C).drop_front(1)); // Drop the '%' return C; } -static Cursor maybeLexGlobalValue(Cursor C, MIToken &Token) { +static Cursor maybeLexGlobalValue( + Cursor C, MIToken &Token, + 
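[Note: the '%jump-table.', '%stack.', '%fixed-stack.' and '%const.' rules above are all instances of one shape: a fixed prefix that must be immediately followed by at least one digit. '%ir-block.' and '%ir.' reuse the same shape and fall back to a named form via lexName() when no digit follows. A hedged sketch of that shared helper, standalone rather than Cursor-based:

#include <cctype>
#include <string>

bool scanPrefixedIndex(const std::string &S, const std::string &Rule,
                       unsigned &Index, size_t &Consumed) {
  // Match only when the prefix is immediately followed by a digit, exactly
  // as maybeLexIndex() checks C.peek(Rule.size()).
  if (S.compare(0, Rule.size(), Rule) != 0 || Rule.size() >= S.size() ||
      !isdigit((unsigned char)S[Rule.size()]))
    return false;
  size_t I = Rule.size();
  Index = 0;
  while (I < S.size() && isdigit((unsigned char)S[I]))
    Index = Index * 10 + (S[I++] - '0');
  Consumed = I;
  return true;
}
]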
function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { if (C.peek() != '@') return None; + if (!isdigit(C.peek(1))) + return lexName(C, Token, MIToken::NamedGlobalValue, /*PrefixLength=*/1, + ErrorCallback); auto Range = C; - C.advance(); // Skip the '@' - // TODO: add support for quoted names. - if (!isdigit(C.peek())) { - while (isIdentifierChar(C.peek())) - C.advance(); - Token = MIToken(MIToken::NamedGlobalValue, Range.upto(C), - /*StringOffset=*/1); // Drop the '@' - return C; - } + C.advance(1); // Skip the '@' auto NumberRange = C; while (isdigit(C.peek())) C.advance(); - Token = - MIToken(MIToken::GlobalValue, Range.upto(C), APSInt(NumberRange.upto(C))); + Token.reset(MIToken::GlobalValue, Range.upto(C)) + .setIntegerValue(APSInt(NumberRange.upto(C))); return C; } -static Cursor maybeLexIntegerLiteral(Cursor C, MIToken &Token) { +static Cursor maybeLexExternalSymbol( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + if (C.peek() != '$') + return None; + return lexName(C, Token, MIToken::ExternalSymbol, /*PrefixLength=*/1, + ErrorCallback); +} + +static bool isValidHexFloatingPointPrefix(char C) { + return C == 'H' || C == 'K' || C == 'L' || C == 'M'; +} + +static Cursor maybeLexHexFloatingPointLiteral(Cursor C, MIToken &Token) { + if (C.peek() != '0' || C.peek(1) != 'x') + return None; + Cursor Range = C; + C.advance(2); // Skip '0x' + if (isValidHexFloatingPointPrefix(C.peek())) + C.advance(); + while (isxdigit(C.peek())) + C.advance(); + Token.reset(MIToken::FloatingPointLiteral, Range.upto(C)); + return C; +} + +static Cursor lexFloatingPointLiteral(Cursor Range, Cursor C, MIToken &Token) { + C.advance(); + // Skip over [0-9]*([eE][-+]?[0-9]+)? + while (isdigit(C.peek())) + C.advance(); + if ((C.peek() == 'e' || C.peek() == 'E') && + (isdigit(C.peek(1)) || + ((C.peek(1) == '-' || C.peek(1) == '+') && isdigit(C.peek(2))))) { + C.advance(2); + while (isdigit(C.peek())) + C.advance(); + } + Token.reset(MIToken::FloatingPointLiteral, Range.upto(C)); + return C; +} + +static Cursor maybeLexNumericalLiteral(Cursor C, MIToken &Token) { if (!isdigit(C.peek()) && (C.peek() != '-' || !isdigit(C.peek(1)))) return None; auto Range = C; C.advance(); while (isdigit(C.peek())) C.advance(); + if (C.peek() == '.') + return lexFloatingPointLiteral(Range, C, Token); StringRef StrVal = Range.upto(C); - Token = MIToken(MIToken::IntegerLiteral, StrVal, APSInt(StrVal)); + Token.reset(MIToken::IntegerLiteral, StrVal).setIntegerValue(APSInt(StrVal)); + return C; +} + +static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) { + return StringSwitch<MIToken::TokenKind>(Identifier) + .Case("!tbaa", MIToken::md_tbaa) + .Case("!alias.scope", MIToken::md_alias_scope) + .Case("!noalias", MIToken::md_noalias) + .Case("!range", MIToken::md_range) + .Default(MIToken::Error); +} + +static Cursor maybeLexExlaim( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + if (C.peek() != '!') + return None; + auto Range = C; + C.advance(1); + if (isdigit(C.peek()) || !isIdentifierChar(C.peek())) { + Token.reset(MIToken::exclaim, Range.upto(C)); + return C; + } + while (isIdentifierChar(C.peek())) + C.advance(); + StringRef StrVal = Range.upto(C); + Token.reset(getMetadataKeywordKind(StrVal), StrVal); + if (Token.isError()) + ErrorCallback(Token.location(), + "use of unknown metadata keyword '" + StrVal + "'"); return C; } @@ -181,44 +485,119 @@ static MIToken::TokenKind 
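[Note: maybeLexNumericalLiteral() above commits to a floating-point literal only when a '.' follows the integer digits, and lexFloatingPointLiteral() then accepts an optional [eE][-+]?[0-9]+ exponent; hex float literals with an H/K/L/M size prefix are handled by a separate rule. A standalone sketch of that classification:

#include <cctype>
#include <string>

enum class NumKind { None, Integer, Float };

NumKind scanNumber(const std::string &S, size_t &Len) {
  size_t I = 0;
  if (I < S.size() && S[I] == '-')
    ++I;
  size_t Digits = I;
  while (I < S.size() && isdigit((unsigned char)S[I]))
    ++I;
  if (I == Digits)
    return NumKind::None; // '-' alone is not a literal
  if (I >= S.size() || S[I] != '.') {
    Len = I;
    return NumKind::Integer;
  }
  ++I; // consume '.', committing to a float
  while (I < S.size() && isdigit((unsigned char)S[I]))
    ++I;
  // Optional exponent: [eE][-+]?[0-9]+ (sign requires a following digit).
  if (I < S.size() && (S[I] == 'e' || S[I] == 'E')) {
    size_t J = I + 1;
    if (J < S.size() && (S[J] == '+' || S[J] == '-'))
      ++J;
    if (J < S.size() && isdigit((unsigned char)S[J])) {
      I = J;
      while (I < S.size() && isdigit((unsigned char)S[I]))
        ++I;
    }
  }
  Len = I;
  return NumKind::Float;
}
]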
symbolToken(char C) { return MIToken::equal; case ':': return MIToken::colon; + case '(': + return MIToken::lparen; + case ')': + return MIToken::rparen; + case '{': + return MIToken::lbrace; + case '}': + return MIToken::rbrace; + case '+': + return MIToken::plus; + case '-': + return MIToken::minus; default: return MIToken::Error; } } static Cursor maybeLexSymbol(Cursor C, MIToken &Token) { - auto Kind = symbolToken(C.peek()); + MIToken::TokenKind Kind; + unsigned Length = 1; + if (C.peek() == ':' && C.peek(1) == ':') { + Kind = MIToken::coloncolon; + Length = 2; + } else + Kind = symbolToken(C.peek()); if (Kind == MIToken::Error) return None; auto Range = C; + C.advance(Length); + Token.reset(Kind, Range.upto(C)); + return C; +} + +static Cursor maybeLexNewline(Cursor C, MIToken &Token) { + if (!isNewlineChar(C.peek())) + return None; + auto Range = C; + C.advance(); + Token.reset(MIToken::Newline, Range.upto(C)); + return C; +} + +static Cursor maybeLexEscapedIRValue( + Cursor C, MIToken &Token, + function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { + if (C.peek() != '`') + return None; + auto Range = C; + C.advance(); + auto StrRange = C; + while (C.peek() != '`') { + if (C.isEOF() || isNewlineChar(C.peek())) { + ErrorCallback( + C.location(), + "end of machine instruction reached before the closing '`'"); + Token.reset(MIToken::Error, Range.remaining()); + return C; + } + C.advance(); + } + StringRef Value = StrRange.upto(C); C.advance(); - Token = MIToken(Kind, Range.upto(C)); + Token.reset(MIToken::QuotedIRValue, Range.upto(C)).setStringValue(Value); return C; } StringRef llvm::lexMIToken( StringRef Source, MIToken &Token, function_ref<void(StringRef::iterator Loc, const Twine &)> ErrorCallback) { - auto C = skipWhitespace(Cursor(Source)); + auto C = skipComment(skipWhitespace(Cursor(Source))); if (C.isEOF()) { - Token = MIToken(MIToken::Eof, C.remaining()); + Token.reset(MIToken::Eof, C.remaining()); return C.remaining(); } - if (Cursor R = maybeLexIdentifier(C, Token)) + if (Cursor R = maybeLexIntegerType(C, Token)) return R.remaining(); if (Cursor R = maybeLexMachineBasicBlock(C, Token, ErrorCallback)) return R.remaining(); + if (Cursor R = maybeLexIdentifier(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexJumpTableIndex(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexStackObject(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexFixedStackObject(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexConstantPoolItem(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexIRBlock(C, Token, ErrorCallback)) + return R.remaining(); + if (Cursor R = maybeLexIRValue(C, Token, ErrorCallback)) + return R.remaining(); if (Cursor R = maybeLexRegister(C, Token)) return R.remaining(); - if (Cursor R = maybeLexGlobalValue(C, Token)) + if (Cursor R = maybeLexGlobalValue(C, Token, ErrorCallback)) + return R.remaining(); + if (Cursor R = maybeLexExternalSymbol(C, Token, ErrorCallback)) return R.remaining(); - if (Cursor R = maybeLexIntegerLiteral(C, Token)) + if (Cursor R = maybeLexHexFloatingPointLiteral(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexNumericalLiteral(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexExlaim(C, Token, ErrorCallback)) return R.remaining(); if (Cursor R = maybeLexSymbol(C, Token)) return R.remaining(); + if (Cursor R = maybeLexNewline(C, Token)) + return R.remaining(); + if (Cursor R = maybeLexEscapedIRValue(C, Token, ErrorCallback)) + return R.remaining(); - Token = 
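[Note: lexMIToken() above is a fixed-order chain of maybe-lexers in which the first rule that matches wins, so the ordering is load-bearing: integer types are tried before identifiers, and '::' before ':'. A compressed sketch of that dispatch shape with hypothetical rule functions, not the real driver:

#include <string>
#include <vector>

struct Tok {
  int Kind = 0;
  std::string Text;
};

// A rule consumes characters and fills the token; 0 means "no match".
using Rule = size_t (*)(const std::string &, Tok &);

size_t lexOne(const std::string &Src, const std::vector<Rule> &Rules,
              Tok &T) {
  for (Rule R : Rules)
    if (size_t N = R(Src, T)) // first matching rule wins
      return N;               // caller resumes at Src.substr(N)
  T = Tok{-1, Src};           // error token covering the rest
  return Src.size();
}
]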
MIToken(MIToken::Error, C.remaining()); + Token.reset(MIToken::Error, C.remaining()); ErrorCallback(C.location(), Twine("unexpected character '") + Twine(C.peek()) + "'"); return C.remaining(); diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h index 55460b5..ff54aa3 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -30,50 +30,119 @@ struct MIToken { // Markers Eof, Error, + Newline, // Tokens with no info. comma, equal, underscore, colon, + coloncolon, + exclaim, + lparen, + rparen, + lbrace, + rbrace, + plus, + minus, // Keywords kw_implicit, kw_implicit_define, + kw_def, kw_dead, kw_killed, kw_undef, + kw_internal, + kw_early_clobber, + kw_debug_use, + kw_tied_def, + kw_frame_setup, + kw_debug_location, + kw_cfi_same_value, + kw_cfi_offset, + kw_cfi_def_cfa_register, + kw_cfi_def_cfa_offset, + kw_cfi_def_cfa, + kw_blockaddress, + kw_target_index, + kw_half, + kw_float, + kw_double, + kw_x86_fp80, + kw_fp128, + kw_ppc_fp128, + kw_target_flags, + kw_volatile, + kw_non_temporal, + kw_invariant, + kw_align, + kw_stack, + kw_got, + kw_jump_table, + kw_constant_pool, + kw_call_entry, + kw_liveout, + kw_address_taken, + kw_landing_pad, + kw_liveins, + kw_successors, + + // Named metadata keywords + md_tbaa, + md_alias_scope, + md_noalias, + md_range, // Identifier tokens Identifier, + IntegerType, NamedRegister, + MachineBasicBlockLabel, MachineBasicBlock, + StackObject, + FixedStackObject, NamedGlobalValue, GlobalValue, + ExternalSymbol, // Other tokens IntegerLiteral, - VirtualRegister + FloatingPointLiteral, + VirtualRegister, + ConstantPoolItem, + JumpTableIndex, + NamedIRBlock, + IRBlock, + NamedIRValue, + IRValue, + QuotedIRValue // `<constant value>` }; private: TokenKind Kind; - unsigned StringOffset; StringRef Range; + StringRef StringValue; + std::string StringValueStorage; APSInt IntVal; public: - MIToken(TokenKind Kind, StringRef Range, unsigned StringOffset = 0) - : Kind(Kind), StringOffset(StringOffset), Range(Range) {} + MIToken() : Kind(Error) {} - MIToken(TokenKind Kind, StringRef Range, const APSInt &IntVal, - unsigned StringOffset = 0) - : Kind(Kind), StringOffset(StringOffset), Range(Range), IntVal(IntVal) {} + MIToken &reset(TokenKind Kind, StringRef Range); + + MIToken &setStringValue(StringRef StrVal); + MIToken &setOwnedStringValue(std::string StrVal); + MIToken &setIntegerValue(APSInt IntVal); TokenKind kind() const { return Kind; } bool isError() const { return Kind == Error; } + bool isNewlineOrEOF() const { return Kind == Newline || Kind == Eof; } + + bool isErrorOrEOF() const { return Kind == Error || Kind == Eof; } + bool isRegister() const { return Kind == NamedRegister || Kind == underscore || Kind == VirtualRegister; @@ -81,7 +150,14 @@ public: bool isRegisterFlag() const { return Kind == kw_implicit || Kind == kw_implicit_define || - Kind == kw_dead || Kind == kw_killed || Kind == kw_undef; + Kind == kw_def || Kind == kw_dead || Kind == kw_killed || + Kind == kw_undef || Kind == kw_internal || + Kind == kw_early_clobber || Kind == kw_debug_use; + } + + bool isMemoryOperandFlag() const { + return Kind == kw_volatile || Kind == kw_non_temporal || + Kind == kw_invariant; } bool is(TokenKind K) const { return Kind == K; } @@ -90,13 +166,19 @@ public: StringRef::iterator location() const { return Range.begin(); } - StringRef stringValue() const { return Range.drop_front(StringOffset); } + StringRef range() const { return Range; } + + /// Return the 
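[Note: the reworked MIToken above stores its string value as a reference into the source buffer in the common case and switches to owned storage only for values produced by unescaping (setOwnedStringValue). A sketch of that two-mode storage; a production version must also re-point the view after the object is moved or copied, which is omitted here:

#include <string>
#include <string_view>

class Token {
  int Kind = 0;
  std::string_view Range;   // always points into the original source
  std::string_view Value;   // string value; may alias Range or Storage
  std::string Storage;      // backing store for unescaped (owned) values

public:
  Token &reset(int K, std::string_view R) {
    Kind = K; Range = R; Value = {}; Storage.clear();
    return *this;
  }
  Token &setStringValue(std::string_view V) { Value = V; return *this; }
  Token &setOwnedStringValue(std::string V) {
    Storage = std::move(V);
    Value = Storage; // view into our own storage
    return *this;
  }
  std::string_view stringValue() const { return Value; }
};
]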
token's string value. + StringRef stringValue() const { return StringValue; } const APSInt &integerValue() const { return IntVal; } bool hasIntegerValue() const { return Kind == IntegerLiteral || Kind == MachineBasicBlock || - Kind == GlobalValue || Kind == VirtualRegister; + Kind == MachineBasicBlockLabel || Kind == StackObject || + Kind == FixedStackObject || Kind == GlobalValue || + Kind == VirtualRegister || Kind == ConstantPoolItem || + Kind == JumpTableIndex || Kind == IRBlock || Kind == IRValue; } }; diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp index c000112..f2f6584 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -14,12 +14,20 @@ #include "MIParser.h" #include "MILexer.h" #include "llvm/ADT/StringMap.h" +#include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSlotTracker.h" +#include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -30,15 +38,20 @@ using namespace llvm; namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source -/// range. -struct MachineOperandWithLocation { +/// range and other attributes. +struct ParsedMachineOperand { MachineOperand Operand; StringRef::iterator Begin; StringRef::iterator End; - - MachineOperandWithLocation(const MachineOperand &Operand, - StringRef::iterator Begin, StringRef::iterator End) - : Operand(Operand), Begin(Begin), End(End) {} + Optional<unsigned> TiedDefIdx; + + ParsedMachineOperand(const MachineOperand &Operand, StringRef::iterator Begin, + StringRef::iterator End, Optional<unsigned> &TiedDefIdx) + : Operand(Operand), Begin(Begin), End(End), TiedDefIdx(TiedDefIdx) { + if (TiedDefIdx) + assert(Operand.isReg() && Operand.isUse() && + "Only used register operands can be tied"); + } }; class MIParser { @@ -58,6 +71,16 @@ class MIParser { StringMap<const uint32_t *> Names2RegMasks; /// Maps from subregister names to subregister indices. StringMap<unsigned> Names2SubRegIndices; + /// Maps from slot numbers to function's unnamed basic blocks. + DenseMap<unsigned, const BasicBlock *> Slots2BasicBlocks; + /// Maps from slot numbers to function's unnamed values. + DenseMap<unsigned, const Value *> Slots2Values; + /// Maps from target index names to target indices. + StringMap<int> Names2TargetIndices; + /// Maps from direct target flag names to the direct target flag values. + StringMap<unsigned> Names2DirectTargetFlags; + /// Maps from direct target flag names to the bitmask target flag values. + StringMap<unsigned> Names2BitmaskTargetFlags; public: MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error, @@ -76,19 +99,66 @@ public: /// This function always return true. 
bool error(StringRef::iterator Loc, const Twine &Msg); + bool + parseBasicBlockDefinitions(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots); + bool parseBasicBlocks(); bool parse(MachineInstr *&MI); - bool parseMBB(MachineBasicBlock *&MBB); - bool parseNamedRegister(unsigned &Reg); + bool parseStandaloneMBB(MachineBasicBlock *&MBB); + bool parseStandaloneNamedRegister(unsigned &Reg); + bool parseStandaloneVirtualRegister(unsigned &Reg); + bool parseStandaloneStackObject(int &FI); + bool parseStandaloneMDNode(MDNode *&Node); + + bool + parseBasicBlockDefinition(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots); + bool parseBasicBlock(MachineBasicBlock &MBB); + bool parseBasicBlockLiveins(MachineBasicBlock &MBB); + bool parseBasicBlockSuccessors(MachineBasicBlock &MBB); bool parseRegister(unsigned &Reg); bool parseRegisterFlag(unsigned &Flags); bool parseSubRegisterIndex(unsigned &SubReg); - bool parseRegisterOperand(MachineOperand &Dest, bool IsDef = false); + bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx); + bool parseRegisterOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx, bool IsDef = false); bool parseImmediateOperand(MachineOperand &Dest); + bool parseIRConstant(StringRef::iterator Loc, StringRef Source, + const Constant *&C); + bool parseIRConstant(StringRef::iterator Loc, const Constant *&C); + bool parseTypedImmediateOperand(MachineOperand &Dest); + bool parseFPImmediateOperand(MachineOperand &Dest); bool parseMBBReference(MachineBasicBlock *&MBB); bool parseMBBOperand(MachineOperand &Dest); + bool parseStackFrameIndex(int &FI); + bool parseStackObjectOperand(MachineOperand &Dest); + bool parseFixedStackFrameIndex(int &FI); + bool parseFixedStackObjectOperand(MachineOperand &Dest); + bool parseGlobalValue(GlobalValue *&GV); bool parseGlobalAddressOperand(MachineOperand &Dest); - bool parseMachineOperand(MachineOperand &Dest); + bool parseConstantPoolIndexOperand(MachineOperand &Dest); + bool parseJumpTableIndexOperand(MachineOperand &Dest); + bool parseExternalSymbolOperand(MachineOperand &Dest); + bool parseMDNode(MDNode *&Node); + bool parseMetadataOperand(MachineOperand &Dest); + bool parseCFIOffset(int &Offset); + bool parseCFIRegister(unsigned &Reg); + bool parseCFIOperand(MachineOperand &Dest); + bool parseIRBlock(BasicBlock *&BB, const Function &F); + bool parseBlockAddressOperand(MachineOperand &Dest); + bool parseTargetIndexOperand(MachineOperand &Dest); + bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); + bool parseMachineOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx); + bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx); + bool parseOffset(int64_t &Offset); + bool parseAlignment(unsigned &Alignment); + bool parseOperandsOffset(MachineOperand &Op); + bool parseIRValue(const Value *&V); + bool parseMemoryOperandFlag(unsigned &Flags); + bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV); + bool parseMachinePointerInfo(MachinePointerInfo &Dest); + bool parseMachineMemoryOperand(MachineMemOperand *&Dest); private: /// Convert the integer literal in the current token into an unsigned integer. @@ -96,15 +166,31 @@ private: /// Return true if an error occurred. bool getUnsigned(unsigned &Result); + /// Convert the integer literal in the current token into an uint64. + /// + /// Return true if an error occurred. + bool getUint64(uint64_t &Result); + + /// If the current token is of the given kind, consume it and return false. 
+ /// Otherwise report an error and return true. + bool expectAndConsume(MIToken::TokenKind TokenKind); + + /// If the current token is of the given kind, consume it and return true. + /// Otherwise return false. + bool consumeIfPresent(MIToken::TokenKind TokenKind); + void initNames2InstrOpCodes(); /// Try to convert an instruction name to an opcode. Return true if the /// instruction name is invalid. bool parseInstrName(StringRef InstrName, unsigned &OpCode); - bool parseInstruction(unsigned &OpCode); + bool parseInstruction(unsigned &OpCode, unsigned &Flags); + + bool assignRegisterTies(MachineInstr &MI, + ArrayRef<ParsedMachineOperand> Operands); - bool verifyImplicitOperands(ArrayRef<MachineOperandWithLocation> Operands, + bool verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands, const MCInstrDesc &MCID); void initNames2Regs(); @@ -126,6 +212,34 @@ private: /// /// Return 0 if the name isn't a subregister index class. unsigned getSubRegIndex(StringRef Name); + + const BasicBlock *getIRBlock(unsigned Slot); + const BasicBlock *getIRBlock(unsigned Slot, const Function &F); + + const Value *getIRValue(unsigned Slot); + + void initNames2TargetIndices(); + + /// Try to convert a name of target index to the corresponding target index. + /// + /// Return true if the name isn't a name of a target index. + bool getTargetIndex(StringRef Name, int &Index); + + void initNames2DirectTargetFlags(); + + /// Try to convert a name of a direct target flag to the corresponding + /// target flag. + /// + /// Return true if the name isn't a name of a direct flag. + bool getDirectTargetFlag(StringRef Name, unsigned &Flag); + + void initNames2BitmaskTargetFlags(); + + /// Try to convert a name of a bitmask target flag to the corresponding + /// target flag. + /// + /// Return true if the name isn't a name of a bitmask target flag. + bool getBitmaskTargetFlag(StringRef Name, unsigned &Flag); }; } // end anonymous namespace @@ -134,7 +248,7 @@ MIParser::MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error, StringRef Source, const PerFunctionMIParsingState &PFS, const SlotMapping &IRSlots) : SM(SM), MF(MF), Error(Error), Source(Source), CurrentSource(Source), - Token(MIToken::Error, StringRef()), PFS(PFS), IRSlots(IRSlots) {} + PFS(PFS), IRSlots(IRSlots) {} void MIParser::lex() { CurrentSource = lexMIToken( @@ -146,49 +260,378 @@ bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); } bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size())); - Error = SMDiagnostic( - SM, SMLoc(), - SM.getMemoryBuffer(SM.getMainFileID())->getBufferIdentifier(), 1, - Loc - Source.data(), SourceMgr::DK_Error, Msg.str(), Source, None, None); + const MemoryBuffer &Buffer = *SM.getMemoryBuffer(SM.getMainFileID()); + if (Loc >= Buffer.getBufferStart() && Loc <= Buffer.getBufferEnd()) { + // Create an ordinary diagnostic when the source manager's buffer is the + // source string. + Error = SM.GetMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Error, Msg); + return true; + } + // Create a diagnostic for a YAML string literal. 
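[Note: expectAndConsume() and consumeIfPresent(), declared above, deliberately use opposite return conventions: the former returns true on *error* (matching the parser's error-returning style), the latter returns true on *success*. A tiny runnable sketch of the pair:

#include <string>
#include <vector>

struct MiniParser {
  std::vector<int> Toks; // token kinds; 0 reserved for EOF
  size_t Pos = 0;
  std::string Err;

  int cur() const { return Pos < Toks.size() ? Toks[Pos] : 0; }
  bool error(std::string Msg) { Err = std::move(Msg); return true; }

  // Returns true on *error*, consuming the token on success.
  bool expectAndConsume(int Kind) {
    if (cur() != Kind)
      return error("expected token kind " + std::to_string(Kind));
    ++Pos;
    return false;
  }
  // Returns true on *success*; never reports an error.
  bool consumeIfPresent(int Kind) {
    if (cur() != Kind)
      return false;
    ++Pos;
    return true;
  }
};
]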
+ Error = SMDiagnostic(SM, SMLoc(), Buffer.getBufferIdentifier(), 1, + Loc - Source.data(), SourceMgr::DK_Error, Msg.str(), + Source, None, None); return true; } -bool MIParser::parse(MachineInstr *&MI) { +static const char *toString(MIToken::TokenKind TokenKind) { + switch (TokenKind) { + case MIToken::comma: + return "','"; + case MIToken::equal: + return "'='"; + case MIToken::colon: + return "':'"; + case MIToken::lparen: + return "'('"; + case MIToken::rparen: + return "')'"; + default: + return "<unknown token>"; + } +} + +bool MIParser::expectAndConsume(MIToken::TokenKind TokenKind) { + if (Token.isNot(TokenKind)) + return error(Twine("expected ") + toString(TokenKind)); + lex(); + return false; +} + +bool MIParser::consumeIfPresent(MIToken::TokenKind TokenKind) { + if (Token.isNot(TokenKind)) + return false; + lex(); + return true; +} + +bool MIParser::parseBasicBlockDefinition( + DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) { + assert(Token.is(MIToken::MachineBasicBlockLabel)); + unsigned ID = 0; + if (getUnsigned(ID)) + return true; + auto Loc = Token.location(); + auto Name = Token.stringValue(); + lex(); + bool HasAddressTaken = false; + bool IsLandingPad = false; + unsigned Alignment = 0; + BasicBlock *BB = nullptr; + if (consumeIfPresent(MIToken::lparen)) { + do { + // TODO: Report an error when multiple same attributes are specified. + switch (Token.kind()) { + case MIToken::kw_address_taken: + HasAddressTaken = true; + lex(); + break; + case MIToken::kw_landing_pad: + IsLandingPad = true; + lex(); + break; + case MIToken::kw_align: + if (parseAlignment(Alignment)) + return true; + break; + case MIToken::IRBlock: + // TODO: Report an error when both name and ir block are specified. + if (parseIRBlock(BB, *MF.getFunction())) + return true; + lex(); + break; + default: + break; + } + } while (consumeIfPresent(MIToken::comma)); + if (expectAndConsume(MIToken::rparen)) + return true; + } + if (expectAndConsume(MIToken::colon)) + return true; + + if (!Name.empty()) { + BB = dyn_cast_or_null<BasicBlock>( + MF.getFunction()->getValueSymbolTable().lookup(Name)); + if (!BB) + return error(Loc, Twine("basic block '") + Name + + "' is not defined in the function '" + + MF.getName() + "'"); + } + auto *MBB = MF.CreateMachineBasicBlock(BB); + MF.insert(MF.end(), MBB); + bool WasInserted = MBBSlots.insert(std::make_pair(ID, MBB)).second; + if (!WasInserted) + return error(Loc, Twine("redefinition of machine basic block with id #") + + Twine(ID)); + if (Alignment) + MBB->setAlignment(Alignment); + if (HasAddressTaken) + MBB->setHasAddressTaken(); + MBB->setIsEHPad(IsLandingPad); + return false; +} + +bool MIParser::parseBasicBlockDefinitions( + DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) { + lex(); + // Skip until the first machine basic block. + while (Token.is(MIToken::Newline)) + lex(); + if (Token.isErrorOrEOF()) + return Token.isError(); + if (Token.isNot(MIToken::MachineBasicBlockLabel)) + return error("expected a basic block definition before instructions"); + unsigned BraceDepth = 0; + do { + if (parseBasicBlockDefinition(MBBSlots)) + return true; + bool IsAfterNewline = false; + // Skip until the next machine basic block. 
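[Note: the first parsing pass above checks only structure: bundle braces must balance within the body, and a basic-block label is legal only at the start of a line. A compressed sketch of those two checks, with single-character stand-ins for tokens ('L' = MBB label, 'n' = newline):

#include <string>

bool checkStructure(const std::string &Toks, std::string &Err) {
  unsigned BraceDepth = 0;
  bool AfterNewline = true; // start of input counts as a line start
  for (char T : Toks) {
    if (T == 'L' && !AfterNewline) {
      Err = "basic block definition should be located at the start of "
            "the line";
      return false;
    }
    AfterNewline = (T == 'n');
    if (T == '{')
      ++BraceDepth;
    else if (T == '}') {
      if (!BraceDepth) { Err = "extraneous closing brace ('}')"; return false; }
      --BraceDepth;
    }
  }
  if (BraceDepth) { Err = "expected '}'"; return false; }
  return true;
}
]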
+ while (true) { + if ((Token.is(MIToken::MachineBasicBlockLabel) && IsAfterNewline) || + Token.isErrorOrEOF()) + break; + else if (Token.is(MIToken::MachineBasicBlockLabel)) + return error("basic block definition should be located at the start of " + "the line"); + else if (consumeIfPresent(MIToken::Newline)) { + IsAfterNewline = true; + continue; + } + IsAfterNewline = false; + if (Token.is(MIToken::lbrace)) + ++BraceDepth; + if (Token.is(MIToken::rbrace)) { + if (!BraceDepth) + return error("extraneous closing brace ('}')"); + --BraceDepth; + } + lex(); + } + // Verify that we closed all of the '{' at the end of a file or a block. + if (!Token.isError() && BraceDepth) + return error("expected '}'"); // FIXME: Report a note that shows '{'. + } while (!Token.isErrorOrEOF()); + return Token.isError(); +} + +bool MIParser::parseBasicBlockLiveins(MachineBasicBlock &MBB) { + assert(Token.is(MIToken::kw_liveins)); + lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (Token.isNewlineOrEOF()) // Allow an empty list of liveins. + return false; + do { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + unsigned Reg = 0; + if (parseRegister(Reg)) + return true; + MBB.addLiveIn(Reg); + lex(); + } while (consumeIfPresent(MIToken::comma)); + return false; +} + +bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) { + assert(Token.is(MIToken::kw_successors)); lex(); + if (expectAndConsume(MIToken::colon)) + return true; + if (Token.isNewlineOrEOF()) // Allow an empty list of successors. + return false; + do { + if (Token.isNot(MIToken::MachineBasicBlock)) + return error("expected a machine basic block reference"); + MachineBasicBlock *SuccMBB = nullptr; + if (parseMBBReference(SuccMBB)) + return true; + lex(); + unsigned Weight = 0; + if (consumeIfPresent(MIToken::lparen)) { + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected an integer literal after '('"); + if (getUnsigned(Weight)) + return true; + lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + } + MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight)); + } while (consumeIfPresent(MIToken::comma)); + MBB.normalizeSuccProbs(); + return false; +} +bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) { + // Skip the definition. + assert(Token.is(MIToken::MachineBasicBlockLabel)); + lex(); + if (consumeIfPresent(MIToken::lparen)) { + while (Token.isNot(MIToken::rparen) && !Token.isErrorOrEOF()) + lex(); + consumeIfPresent(MIToken::rparen); + } + consumeIfPresent(MIToken::colon); + + // Parse the liveins and successors. + // N.B: Multiple lists of successors and liveins are allowed and they're + // merged into one. + // Example: + // liveins: %edi + // liveins: %esi + // + // is equivalent to + // liveins: %edi, %esi + while (true) { + if (Token.is(MIToken::kw_successors)) { + if (parseBasicBlockSuccessors(MBB)) + return true; + } else if (Token.is(MIToken::kw_liveins)) { + if (parseBasicBlockLiveins(MBB)) + return true; + } else if (consumeIfPresent(MIToken::Newline)) { + continue; + } else + break; + if (!Token.isNewlineOrEOF()) + return error("expected line break at the end of a list"); + lex(); + } + + // Parse the instructions. 
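[Note: successor weights above are read as raw integers into BranchProbability::getRaw() and then normalized via normalizeSuccProbs(). LLVM's BranchProbability is fixed-point rather than floating-point; this double-based sketch only illustrates the normalization idea, with a uniform fallback when no weights were given:

#include <cstdint>
#include <vector>

std::vector<double> normalizeWeights(const std::vector<uint64_t> &Raw) {
  uint64_t Sum = 0;
  for (uint64_t W : Raw)
    Sum += W;
  std::vector<double> Probs;
  Probs.reserve(Raw.size());
  for (uint64_t W : Raw)
    Probs.push_back(Sum ? double(W) / double(Sum)
                        : 1.0 / double(Raw.empty() ? 1 : Raw.size()));
  return Probs;
}
]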
+ bool IsInBundle = false; + MachineInstr *PrevMI = nullptr; + while (true) { + if (Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof)) + return false; + else if (consumeIfPresent(MIToken::Newline)) + continue; + if (consumeIfPresent(MIToken::rbrace)) { + // The first parsing pass should verify that all closing '}' have an + // opening '{'. + assert(IsInBundle); + IsInBundle = false; + continue; + } + MachineInstr *MI = nullptr; + if (parse(MI)) + return true; + MBB.insert(MBB.end(), MI); + if (IsInBundle) { + PrevMI->setFlag(MachineInstr::BundledSucc); + MI->setFlag(MachineInstr::BundledPred); + } + PrevMI = MI; + if (Token.is(MIToken::lbrace)) { + if (IsInBundle) + return error("nested instruction bundles are not allowed"); + lex(); + // This instruction is the start of the bundle. + MI->setFlag(MachineInstr::BundledSucc); + IsInBundle = true; + if (!Token.is(MIToken::Newline)) + // The next instruction can be on the same line. + continue; + } + assert(Token.isNewlineOrEOF() && "MI is not fully parsed"); + lex(); + } + return false; +} + +bool MIParser::parseBasicBlocks() { + lex(); + // Skip until the first machine basic block. + while (Token.is(MIToken::Newline)) + lex(); + if (Token.isErrorOrEOF()) + return Token.isError(); + // The first parsing pass should have verified that this token is a MBB label + // in the 'parseBasicBlockDefinitions' method. + assert(Token.is(MIToken::MachineBasicBlockLabel)); + do { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB)) + return true; + if (parseBasicBlock(*MBB)) + return true; + // The method 'parseBasicBlock' should parse the whole block until the next + // block or the end of file. + assert(Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof)); + } while (Token.isNot(MIToken::Eof)); + return false; +} + +bool MIParser::parse(MachineInstr *&MI) { // Parse any register operands before '=' - // TODO: Allow parsing of multiple operands before '=' MachineOperand MO = MachineOperand::CreateImm(0); - SmallVector<MachineOperandWithLocation, 8> Operands; - if (Token.isRegister() || Token.isRegisterFlag()) { + SmallVector<ParsedMachineOperand, 8> Operands; + while (Token.isRegister() || Token.isRegisterFlag()) { auto Loc = Token.location(); - if (parseRegisterOperand(MO, /*IsDef=*/true)) + Optional<unsigned> TiedDefIdx; + if (parseRegisterOperand(MO, TiedDefIdx, /*IsDef=*/true)) return true; - Operands.push_back(MachineOperandWithLocation(MO, Loc, Token.location())); - if (Token.isNot(MIToken::equal)) - return error("expected '='"); + Operands.push_back( + ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx)); + if (Token.isNot(MIToken::comma)) + break; lex(); } - - unsigned OpCode; - if (Token.isError() || parseInstruction(OpCode)) + if (!Operands.empty() && expectAndConsume(MIToken::equal)) return true; - // TODO: Parse the instruction flags and memory operands. + unsigned OpCode, Flags = 0; + if (Token.isError() || parseInstruction(OpCode, Flags)) + return true; // Parse the remaining machine operands. 
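[Note: inside a '{ ... }' bundle above, each newly parsed instruction gets the BundledPred flag and its predecessor the BundledSucc flag, chaining the bundle together. The same linking over a toy instruction type:

#include <vector>

struct Instr {
  bool BundledPred = false;
  bool BundledSucc = false;
};

void linkBundle(std::vector<Instr> &Bundle) {
  for (size_t I = 1; I < Bundle.size(); ++I) {
    Bundle[I - 1].BundledSucc = true; // previous instruction continues...
    Bundle[I].BundledPred = true;     // ...into this one
  }
}
]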
- while (Token.isNot(MIToken::Eof)) { + while (!Token.isNewlineOrEOF() && Token.isNot(MIToken::kw_debug_location) && + Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); - if (parseMachineOperand(MO)) + Optional<unsigned> TiedDefIdx; + if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) return true; - Operands.push_back(MachineOperandWithLocation(MO, Loc, Token.location())); - if (Token.is(MIToken::Eof)) + Operands.push_back( + ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx)); + if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) || + Token.is(MIToken::lbrace)) break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine operand"); lex(); } + DebugLoc DebugLocation; + if (Token.is(MIToken::kw_debug_location)) { + lex(); + if (Token.isNot(MIToken::exclaim)) + return error("expected a metadata node after 'debug-location'"); + MDNode *Node = nullptr; + if (parseMDNode(Node)) + return true; + DebugLocation = DebugLoc(Node); + } + + // Parse the machine memory operands. + SmallVector<MachineMemOperand *, 2> MemOperands; + if (Token.is(MIToken::coloncolon)) { + lex(); + while (!Token.isNewlineOrEOF()) { + MachineMemOperand *MemOp = nullptr; + if (parseMachineMemoryOperand(MemOp)) + return true; + MemOperands.push_back(MemOp); + if (Token.isNewlineOrEOF()) + break; + if (Token.isNot(MIToken::comma)) + return error("expected ',' before the next machine memory operand"); + lex(); + } + } + const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode); if (!MCID.isVariadic()) { // FIXME: Move the implicit operand verification to the machine verifier. @@ -197,13 +640,22 @@ bool MIParser::parse(MachineInstr *&MI) { } // TODO: Check for extraneous machine operands. 
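[Note: pieced together from the parsing order above, the shape of one machine instruction line is roughly the following. This is an informal reconstruction from the code, not an official MIR grammar:

MILine ::= [ RegOperand ( "," RegOperand )* "=" ]
           [ "frame-setup" ] Opcode
           [ Operand ( "," Operand )* ]
           [ "debug-location" "!" MetadataId ]
           [ "::" MemOperand ( "," MemOperand )* ]
]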
- MI = MF.CreateMachineInstr(MCID, DebugLoc(), /*NoImplicit=*/true); + MI = MF.CreateMachineInstr(MCID, DebugLocation, /*NoImplicit=*/true); + MI->setFlags(Flags); for (const auto &Operand : Operands) MI->addOperand(MF, Operand.Operand); + if (assignRegisterTies(*MI, Operands)) + return true; + if (MemOperands.empty()) + return false; + MachineInstr::mmo_iterator MemRefs = + MF.allocateMemRefsArray(MemOperands.size()); + std::copy(MemOperands.begin(), MemOperands.end(), MemRefs); + MI->setMemRefs(MemRefs, MemRefs + MemOperands.size()); return false; } -bool MIParser::parseMBB(MachineBasicBlock *&MBB) { +bool MIParser::parseStandaloneMBB(MachineBasicBlock *&MBB) { lex(); if (Token.isNot(MIToken::MachineBasicBlock)) return error("expected a machine basic block reference"); @@ -216,18 +668,52 @@ bool MIParser::parseMBB(MachineBasicBlock *&MBB) { return false; } -bool MIParser::parseNamedRegister(unsigned &Reg) { +bool MIParser::parseStandaloneNamedRegister(unsigned &Reg) { lex(); if (Token.isNot(MIToken::NamedRegister)) return error("expected a named register"); if (parseRegister(Reg)) - return 0; + return true; + lex(); + if (Token.isNot(MIToken::Eof)) + return error("expected end of string after the register reference"); + return false; +} + +bool MIParser::parseStandaloneVirtualRegister(unsigned &Reg) { + lex(); + if (Token.isNot(MIToken::VirtualRegister)) + return error("expected a virtual register"); + if (parseRegister(Reg)) + return true; lex(); if (Token.isNot(MIToken::Eof)) return error("expected end of string after the register reference"); return false; } +bool MIParser::parseStandaloneStackObject(int &FI) { + lex(); + if (Token.isNot(MIToken::StackObject)) + return error("expected a stack object"); + if (parseStackFrameIndex(FI)) + return true; + if (Token.isNot(MIToken::Eof)) + return error("expected end of string after the stack object reference"); + return false; +} + +bool MIParser::parseStandaloneMDNode(MDNode *&Node) { + lex(); + if (Token.isNot(MIToken::exclaim)) + return error("expected a metadata node"); + if (parseMDNode(Node)) + return true; + if (Token.isNot(MIToken::Eof)) + return error("expected end of string after the metadata node"); + return false; +} + static const char *printImplicitRegisterFlag(const MachineOperand &MO) { assert(MO.isImplicit()); return MO.isDef() ? "implicit-def" : "implicit"; @@ -239,8 +725,18 @@ static std::string getRegisterName(const TargetRegisterInfo *TRI, return StringRef(TRI->getName(Reg)).lower(); } -bool MIParser::verifyImplicitOperands( - ArrayRef<MachineOperandWithLocation> Operands, const MCInstrDesc &MCID) { +/// Return true if the parsed machine operands contain a given machine operand. +static bool isImplicitOperandIn(const MachineOperand &ImplicitOperand, + ArrayRef<ParsedMachineOperand> Operands) { + for (const auto &I : Operands) { + if (ImplicitOperand.isIdenticalTo(I.Operand)) + return true; + } + return false; +} + +bool MIParser::verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands, + const MCInstrDesc &MCID) { if (MCID.isCall()) // We can't verify call instructions as they can contain arbitrary implicit // register and register mask operands. @@ -249,48 +745,32 @@ bool MIParser::verifyImplicitOperands( // Gather all the expected implicit operands. 
SmallVector<MachineOperand, 4> ImplicitOperands; if (MCID.ImplicitDefs) - for (const uint16_t *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID.ImplicitUses) - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) ImplicitOperands.push_back( MachineOperand::CreateReg(*ImpUses, false, true)); const auto *TRI = MF.getSubtarget().getRegisterInfo(); assert(TRI && "Expected target register info"); - size_t I = ImplicitOperands.size(), J = Operands.size(); - while (I) { - --I; - if (J) { - --J; - const auto &ImplicitOperand = ImplicitOperands[I]; - const auto &Operand = Operands[J].Operand; - if (ImplicitOperand.isIdenticalTo(Operand)) - continue; - if (Operand.isReg() && Operand.isImplicit()) { - return error(Operands[J].Begin, - Twine("expected an implicit register operand '") + - printImplicitRegisterFlag(ImplicitOperand) + " %" + - getRegisterName(TRI, ImplicitOperand.getReg()) + "'"); - } - } - // TODO: Fix source location when Operands[J].end is right before '=', i.e: - // insead of reporting an error at this location: - // %eax = MOV32r0 - // ^ - // report the error at the following location: - // %eax = MOV32r0 - // ^ - return error(J < Operands.size() ? Operands[J].End : Token.location(), + for (const auto &I : ImplicitOperands) { + if (isImplicitOperandIn(I, Operands)) + continue; + return error(Operands.empty() ? Token.location() : Operands.back().End, Twine("missing implicit register operand '") + - printImplicitRegisterFlag(ImplicitOperands[I]) + " %" + - getRegisterName(TRI, ImplicitOperands[I].getReg()) + "'"); + printImplicitRegisterFlag(I) + " %" + + getRegisterName(TRI, I.getReg()) + "'"); } return false; } -bool MIParser::parseInstruction(unsigned &OpCode) { +bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { + if (Token.is(MIToken::kw_frame_setup)) { + Flags |= MachineInstr::FrameSetup; + lex(); + } if (Token.isNot(MIToken::Identifier)) return error("expected a machine instruction"); StringRef InstrName = Token.stringValue(); @@ -330,6 +810,7 @@ bool MIParser::parseRegister(unsigned &Reg) { } bool MIParser::parseRegisterFlag(unsigned &Flags) { + const unsigned OldFlags = Flags; switch (Token.kind()) { case MIToken::kw_implicit: Flags |= RegState::Implicit; @@ -337,6 +818,9 @@ bool MIParser::parseRegisterFlag(unsigned &Flags) { case MIToken::kw_implicit_define: Flags |= RegState::ImplicitDefine; break; + case MIToken::kw_def: + Flags |= RegState::Define; + break; case MIToken::kw_dead: Flags |= RegState::Dead; break; @@ -346,11 +830,22 @@ bool MIParser::parseRegisterFlag(unsigned &Flags) { case MIToken::kw_undef: Flags |= RegState::Undef; break; - // TODO: report an error when we specify the same flag more than once. - // TODO: parse the other register flags. + case MIToken::kw_internal: + Flags |= RegState::InternalRead; + break; + case MIToken::kw_early_clobber: + Flags |= RegState::EarlyClobber; + break; + case MIToken::kw_debug_use: + Flags |= RegState::Debug; + break; default: llvm_unreachable("The current token should be a register flag"); } + if (OldFlags == Flags) + // We know that the same flag is specified more than once when the flags + // weren't modified. 
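[Note: the duplicate-flag check above relies on OR being idempotent: re-adding an already-set flag leaves the mask unchanged, so comparing against the previous value detects the repetition without a separate "seen" set. A runnable sketch; the flag values are invented for illustration, and the trick assumes each keyword contributes at least one fresh bit:

#include <cstdio>

enum RegFlag : unsigned { Implicit = 1, Dead = 2, Killed = 4, Undef = 8 };

bool addFlag(unsigned &Flags, unsigned F, const char *Name) {
  unsigned Old = Flags;
  Flags |= F;
  if (Flags == Old) { // nothing changed => flag was already present
    std::fprintf(stderr, "duplicate '%s' register flag\n", Name);
    return true; // error
  }
  return false;
}
]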
+ return error("duplicate '" + Token.stringValue() + "' register flag"); lex(); return false; } @@ -368,7 +863,59 @@ bool MIParser::parseSubRegisterIndex(unsigned &SubReg) { return false; } -bool MIParser::parseRegisterOperand(MachineOperand &Dest, bool IsDef) { +bool MIParser::parseRegisterTiedDefIndex(unsigned &TiedDefIdx) { + if (!consumeIfPresent(MIToken::kw_tied_def)) + return error("expected 'tied-def' after '('"); + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected an integer literal after 'tied-def'"); + if (getUnsigned(TiedDefIdx)) + return true; + lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + return false; +} + +bool MIParser::assignRegisterTies(MachineInstr &MI, + ArrayRef<ParsedMachineOperand> Operands) { + SmallVector<std::pair<unsigned, unsigned>, 4> TiedRegisterPairs; + for (unsigned I = 0, E = Operands.size(); I != E; ++I) { + if (!Operands[I].TiedDefIdx) + continue; + // The parser ensures that this operand is a register use, so we just have + // to check the tied-def operand. + unsigned DefIdx = Operands[I].TiedDefIdx.getValue(); + if (DefIdx >= E) + return error(Operands[I].Begin, + Twine("use of invalid tied-def operand index '" + + Twine(DefIdx) + "'; instruction has only ") + + Twine(E) + " operands"); + const auto &DefOperand = Operands[DefIdx].Operand; + if (!DefOperand.isReg() || !DefOperand.isDef()) + // FIXME: add note with the def operand. + return error(Operands[I].Begin, + Twine("use of invalid tied-def operand index '") + + Twine(DefIdx) + "'; the operand #" + Twine(DefIdx) + + " isn't a defined register"); + // Check that the tied-def operand wasn't tied elsewhere. + for (const auto &TiedPair : TiedRegisterPairs) { + if (TiedPair.first == DefIdx) + return error(Operands[I].Begin, + Twine("the tied-def operand #") + Twine(DefIdx) + + " is already tied with another register operand"); + } + TiedRegisterPairs.push_back(std::make_pair(DefIdx, I)); + } + // FIXME: Verify that for non INLINEASM instructions, the def and use tied + // indices must be less than tied max. + for (const auto &TiedPair : TiedRegisterPairs) + MI.tieOperands(TiedPair.first, TiedPair.second); + return false; +} + +bool MIParser::parseRegisterOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx, + bool IsDef) { unsigned Reg; unsigned Flags = IsDef ? RegState::Define : 0; while (Token.isRegisterFlag()) { @@ -385,10 +932,17 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest, bool IsDef) { if (parseSubRegisterIndex(SubReg)) return true; } + if ((Flags & RegState::Define) == 0 && consumeIfPresent(MIToken::lparen)) { + unsigned Idx; + if (parseRegisterTiedDefIndex(Idx)) + return true; + TiedDefIdx = Idx; + } Dest = MachineOperand::CreateReg( Reg, Flags & RegState::Define, Flags & RegState::Implicit, Flags & RegState::Kill, Flags & RegState::Dead, Flags & RegState::Undef, - /*isEarlyClobber=*/false, SubReg); + Flags & RegState::EarlyClobber, SubReg, Flags & RegState::Debug, + Flags & RegState::InternalRead); return false; } @@ -396,13 +950,55 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { assert(Token.is(MIToken::IntegerLiteral)); const APSInt &Int = Token.integerValue(); if (Int.getMinSignedBits() > 64) - // TODO: Replace this with an error when we can parse CIMM Machine Operands. 
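[Note: assignRegisterTies() above validates each '(tied-def N)' annotation before applying it: the index must be in range, must name a register def, and no def may be tied twice. A standalone sketch of that validation over a toy operand type:

#include <optional>
#include <string>
#include <utility>
#include <vector>

struct Operand {
  bool IsDef = false;
  std::optional<unsigned> TiedDefIdx; // only meaningful on uses
};

bool assignTies(std::vector<Operand> &Ops, std::string &Err) {
  std::vector<std::pair<unsigned, unsigned>> Pairs; // (def, use)
  for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
    if (!Ops[I].TiedDefIdx)
      continue;
    unsigned Def = *Ops[I].TiedDefIdx;
    if (Def >= E || !Ops[Def].IsDef) {
      Err = "use of invalid tied-def operand index";
      return true;
    }
    for (auto &P : Pairs)
      if (P.first == Def) {
        Err = "tied-def operand is already tied with another register "
              "operand";
        return true;
      }
    Pairs.emplace_back(Def, I);
  }
  // ...apply the pairs (MI.tieOperands in the real parser).
  return false;
}
]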
- llvm_unreachable("Can't parse large integer literals yet!"); + return error("integer literal is too large to be an immediate operand"); Dest = MachineOperand::CreateImm(Int.getExtValue()); lex(); return false; } +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { + auto Source = StringValue.str(); // The source has to be null terminated. + SMDiagnostic Err; + C = parseConstantValue(Source.c_str(), Err, *MF.getFunction()->getParent(), + &IRSlots); + if (!C) + return error(Loc + Err.getColumnNo(), Err.getMessage()); + return false; +} + +bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { + if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) + return true; + lex(); + return false; +} + +bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::IntegerType)); + auto Loc = Token.location(); + lex(); + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected an integer literal"); + const Constant *C = nullptr; + if (parseIRConstant(Loc, C)) + return true; + Dest = MachineOperand::CreateCImm(cast<ConstantInt>(C)); + return false; +} + +bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { + auto Loc = Token.location(); + lex(); + if (Token.isNot(MIToken::FloatingPointLiteral)) + return error("expected a floating point literal"); + const Constant *C = nullptr; + if (parseIRConstant(Loc, C)) + return true; + Dest = MachineOperand::CreateFPImm(cast<ConstantFP>(C)); + return false; +} + bool MIParser::getUnsigned(unsigned &Result) { assert(Token.hasIntegerValue() && "Expected a token with an integer value"); const uint64_t Limit = uint64_t(std::numeric_limits<unsigned>::max()) + 1; @@ -414,7 +1010,8 @@ bool MIParser::getUnsigned(unsigned &Result) { } bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { - assert(Token.is(MIToken::MachineBasicBlock)); + assert(Token.is(MIToken::MachineBasicBlock) || + Token.is(MIToken::MachineBasicBlockLabel)); unsigned Number; if (getUnsigned(Number)) return true; @@ -438,16 +1035,66 @@ bool MIParser::parseMBBOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { +bool MIParser::parseStackFrameIndex(int &FI) { + assert(Token.is(MIToken::StackObject)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto ObjectInfo = PFS.StackObjectSlots.find(ID); + if (ObjectInfo == PFS.StackObjectSlots.end()) + return error(Twine("use of undefined stack object '%stack.") + Twine(ID) + + "'"); + StringRef Name; + if (const auto *Alloca = + MF.getFrameInfo()->getObjectAllocation(ObjectInfo->second)) + Name = Alloca->getName(); + if (!Token.stringValue().empty() && Token.stringValue() != Name) + return error(Twine("the name of the stack object '%stack.") + Twine(ID) + + "' isn't '" + Token.stringValue() + "'"); + lex(); + FI = ObjectInfo->second; + return false; +} + +bool MIParser::parseStackObjectOperand(MachineOperand &Dest) { + int FI; + if (parseStackFrameIndex(FI)) + return true; + Dest = MachineOperand::CreateFI(FI); + return false; +} + +bool MIParser::parseFixedStackFrameIndex(int &FI) { + assert(Token.is(MIToken::FixedStackObject)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto ObjectInfo = PFS.FixedStackObjectSlots.find(ID); + if (ObjectInfo == PFS.FixedStackObjectSlots.end()) + return error(Twine("use of undefined fixed stack object '%fixed-stack.") + + Twine(ID) + "'"); + lex(); + FI = ObjectInfo->second; + return false; +} + +bool 
MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { + int FI; + if (parseFixedStackFrameIndex(FI)) + return true; + Dest = MachineOperand::CreateFI(FI); + return false; +} + +bool MIParser::parseGlobalValue(GlobalValue *&GV) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - auto Name = Token.stringValue(); const Module *M = MF.getFunction()->getParent(); - if (const auto *GV = M->getNamedValue(Name)) { - Dest = MachineOperand::CreateGA(GV, /*Offset=*/0); - break; - } - return error(Twine("use of undefined global value '@") + Name + "'"); + GV = M->getNamedValue(Token.stringValue()); + if (!GV) + return error(Twine("use of undefined global value '") + Token.range() + + "'"); + break; } case MIToken::GlobalValue: { unsigned GVIdx; @@ -456,36 +1103,323 @@ bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { if (GVIdx >= IRSlots.GlobalValues.size()) return error(Twine("use of undefined global value '@") + Twine(GVIdx) + "'"); - Dest = MachineOperand::CreateGA(IRSlots.GlobalValues[GVIdx], - /*Offset=*/0); + GV = IRSlots.GlobalValues[GVIdx]; break; } default: llvm_unreachable("The current token should be a global value"); } - // TODO: Parse offset and target flags. + return false; +} + +bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { + GlobalValue *GV = nullptr; + if (parseGlobalValue(GV)) + return true; + lex(); + Dest = MachineOperand::CreateGA(GV, /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseConstantPoolIndexOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::ConstantPoolItem)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto ConstantInfo = PFS.ConstantPoolSlots.find(ID); + if (ConstantInfo == PFS.ConstantPoolSlots.end()) + return error("use of undefined constant '%const." + Twine(ID) + "'"); + lex(); + Dest = MachineOperand::CreateCPI(ID, /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseJumpTableIndexOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::JumpTableIndex)); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto JumpTableEntryInfo = PFS.JumpTableSlots.find(ID); + if (JumpTableEntryInfo == PFS.JumpTableSlots.end()) + return error("use of undefined jump table '%jump-table." + Twine(ID) + "'"); + lex(); + Dest = MachineOperand::CreateJTI(JumpTableEntryInfo->second); + return false; +} + +bool MIParser::parseExternalSymbolOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::ExternalSymbol)); + const char *Symbol = MF.createExternalSymbolName(Token.stringValue()); + lex(); + Dest = MachineOperand::CreateES(Symbol); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseMDNode(MDNode *&Node) { + assert(Token.is(MIToken::exclaim)); + auto Loc = Token.location(); + lex(); + if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned()) + return error("expected metadata id after '!'"); + unsigned ID; + if (getUnsigned(ID)) + return true; + auto NodeInfo = IRSlots.MetadataNodes.find(ID); + if (NodeInfo == IRSlots.MetadataNodes.end()) + return error(Loc, "use of undefined metadata '!" 
+ Twine(ID) + "'"); + lex(); + Node = NodeInfo->second.get(); + return false; +} + +bool MIParser::parseMetadataOperand(MachineOperand &Dest) { + MDNode *Node = nullptr; + if (parseMDNode(Node)) + return true; + Dest = MachineOperand::CreateMetadata(Node); + return false; +} + +bool MIParser::parseCFIOffset(int &Offset) { + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected a cfi offset"); + if (Token.integerValue().getMinSignedBits() > 32) + return error("expected a 32 bit integer (the cfi offset is too large)"); + Offset = (int)Token.integerValue().getExtValue(); + lex(); + return false; +} + +bool MIParser::parseCFIRegister(unsigned &Reg) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a cfi register"); + unsigned LLVMReg; + if (parseRegister(LLVMReg)) + return true; + const auto *TRI = MF.getSubtarget().getRegisterInfo(); + assert(TRI && "Expected target register info"); + int DwarfReg = TRI->getDwarfRegNum(LLVMReg, true); + if (DwarfReg < 0) + return error("invalid DWARF register"); + Reg = (unsigned)DwarfReg; + lex(); + return false; +} + +bool MIParser::parseCFIOperand(MachineOperand &Dest) { + auto Kind = Token.kind(); + lex(); + auto &MMI = MF.getMMI(); + int Offset; + unsigned Reg; + unsigned CFIIndex; + switch (Kind) { + case MIToken::kw_cfi_same_value: + if (parseCFIRegister(Reg)) + return true; + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createSameValue(nullptr, Reg)); + break; + case MIToken::kw_cfi_offset: + if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) || + parseCFIOffset(Offset)) + return true; + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, Reg, Offset)); + break; + case MIToken::kw_cfi_def_cfa_register: + if (parseCFIRegister(Reg)) + return true; + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + break; + case MIToken::kw_cfi_def_cfa_offset: + if (parseCFIOffset(Offset)) + return true; + // NB: MCCFIInstruction::createDefCfaOffset negates the offset. + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -Offset)); + break; + case MIToken::kw_cfi_def_cfa: + if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) || + parseCFIOffset(Offset)) + return true; + // NB: MCCFIInstruction::createDefCfa negates the offset. + CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset)); + break; + default: + // TODO: Parse the other CFI operands. 
+ llvm_unreachable("The current token should be a cfi operand"); + } + Dest = MachineOperand::CreateCFIIndex(CFIIndex); + return false; +} + +bool MIParser::parseIRBlock(BasicBlock *&BB, const Function &F) { + switch (Token.kind()) { + case MIToken::NamedIRBlock: { + BB = dyn_cast_or_null<BasicBlock>( + F.getValueSymbolTable().lookup(Token.stringValue())); + if (!BB) + return error(Twine("use of undefined IR block '") + Token.range() + "'"); + break; + } + case MIToken::IRBlock: { + unsigned SlotNumber = 0; + if (getUnsigned(SlotNumber)) + return true; + BB = const_cast<BasicBlock *>(getIRBlock(SlotNumber, F)); + if (!BB) + return error(Twine("use of undefined IR block '%ir-block.") + + Twine(SlotNumber) + "'"); + break; + } + default: + llvm_unreachable("The current token should be an IR block reference"); + } + return false; +} + +bool MIParser::parseBlockAddressOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_blockaddress)); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + if (Token.isNot(MIToken::GlobalValue) && + Token.isNot(MIToken::NamedGlobalValue)) + return error("expected a global value"); + GlobalValue *GV = nullptr; + if (parseGlobalValue(GV)) + return true; + auto *F = dyn_cast<Function>(GV); + if (!F) + return error("expected an IR function reference"); + lex(); + if (expectAndConsume(MIToken::comma)) + return true; + BasicBlock *BB = nullptr; + if (Token.isNot(MIToken::IRBlock) && Token.isNot(MIToken::NamedIRBlock)) + return error("expected an IR block reference"); + if (parseIRBlock(BB, *F)) + return true; + lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateBA(BlockAddress::get(F, BB), /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_target_index)); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + if (Token.isNot(MIToken::Identifier)) + return error("expected the name of the target index"); + int Index = 0; + if (getTargetIndex(Token.stringValue(), Index)) + return error("use of undefined target index '" + Token.stringValue() + "'"); lex(); + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateTargetIndex(unsigned(Index), /*Offset=*/0); + if (parseOperandsOffset(Dest)) + return true; + return false; +} + +bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { + assert(Token.is(MIToken::kw_liveout)); + const auto *TRI = MF.getSubtarget().getRegisterInfo(); + assert(TRI && "Expected target register info"); + uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs()); + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + while (true) { + if (Token.isNot(MIToken::NamedRegister)) + return error("expected a named register"); + unsigned Reg = 0; + if (parseRegister(Reg)) + return true; + lex(); + Mask[Reg / 32] |= 1U << (Reg % 32); + // TODO: Report an error if the same register is used more than once. 
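+ // The mask is an array of 32-bit words, so for, e.g., register number 37
+ // the statement above sets bit 37 % 32 = 5 in word 37 / 32 = 1, i.e.
+ // Mask[1] |= 1U << 5.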
+ if (Token.isNot(MIToken::comma)) + break; + lex(); + } + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = MachineOperand::CreateRegLiveOut(Mask); return false; } -bool MIParser::parseMachineOperand(MachineOperand &Dest) { +bool MIParser::parseMachineOperand(MachineOperand &Dest, + Optional<unsigned> &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: case MIToken::kw_implicit_define: + case MIToken::kw_def: case MIToken::kw_dead: case MIToken::kw_killed: case MIToken::kw_undef: + case MIToken::kw_internal: + case MIToken::kw_early_clobber: + case MIToken::kw_debug_use: case MIToken::underscore: case MIToken::NamedRegister: case MIToken::VirtualRegister: - return parseRegisterOperand(Dest); + return parseRegisterOperand(Dest, TiedDefIdx); case MIToken::IntegerLiteral: return parseImmediateOperand(Dest); + case MIToken::IntegerType: + return parseTypedImmediateOperand(Dest); + case MIToken::kw_half: + case MIToken::kw_float: + case MIToken::kw_double: + case MIToken::kw_x86_fp80: + case MIToken::kw_fp128: + case MIToken::kw_ppc_fp128: + return parseFPImmediateOperand(Dest); case MIToken::MachineBasicBlock: return parseMBBOperand(Dest); + case MIToken::StackObject: + return parseStackObjectOperand(Dest); + case MIToken::FixedStackObject: + return parseFixedStackObjectOperand(Dest); case MIToken::GlobalValue: case MIToken::NamedGlobalValue: return parseGlobalAddressOperand(Dest); + case MIToken::ConstantPoolItem: + return parseConstantPoolIndexOperand(Dest); + case MIToken::JumpTableIndex: + return parseJumpTableIndexOperand(Dest); + case MIToken::ExternalSymbol: + return parseExternalSymbolOperand(Dest); + case MIToken::exclaim: + return parseMetadataOperand(Dest); + case MIToken::kw_cfi_same_value: + case MIToken::kw_cfi_offset: + case MIToken::kw_cfi_def_cfa_register: + case MIToken::kw_cfi_def_cfa_offset: + case MIToken::kw_cfi_def_cfa: + return parseCFIOperand(Dest); + case MIToken::kw_blockaddress: + return parseBlockAddressOperand(Dest); + case MIToken::kw_target_index: + return parseTargetIndexOperand(Dest); + case MIToken::kw_liveout: + return parseLiveoutRegisterMaskOperand(Dest); case MIToken::Error: return true; case MIToken::Identifier: @@ -496,12 +1430,314 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest) { } // fallthrough default: - // TODO: parse the other machine operands. + // FIXME: Parse the MCSymbol machine operand. return error("expected a machine operand"); } return false; } +bool MIParser::parseMachineOperandAndTargetFlags( + MachineOperand &Dest, Optional<unsigned> &TiedDefIdx) { + unsigned TF = 0; + bool HasTargetFlags = false; + if (Token.is(MIToken::kw_target_flags)) { + HasTargetFlags = true; + lex(); + if (expectAndConsume(MIToken::lparen)) + return true; + if (Token.isNot(MIToken::Identifier)) + return error("expected the name of the target flag"); + if (getDirectTargetFlag(Token.stringValue(), TF)) { + if (getBitmaskTargetFlag(Token.stringValue(), TF)) + return error("use of undefined target flag '" + Token.stringValue() + + "'"); + } + lex(); + while (Token.is(MIToken::comma)) { + lex(); + if (Token.isNot(MIToken::Identifier)) + return error("expected the name of the target flag"); + unsigned BitFlag = 0; + if (getBitmaskTargetFlag(Token.stringValue(), BitFlag)) + return error("use of undefined target flag '" + Token.stringValue() + + "'"); + // TODO: Report an error when using a duplicate bit target flag. 
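+ // For a hypothetical operand like "target-flags(<direct>, <bit0>, <bit1>)
+ // @g", the direct flag parsed above seeds TF and each bitmask flag is
+ // OR'd in below; the accepted flag names are supplied by the target via
+ // getSerializableDirectMachineOperandTargetFlags and its bitmask variant.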
+ TF |= BitFlag;
+ lex();
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ auto Loc = Token.location();
+ if (parseMachineOperand(Dest, TiedDefIdx))
+ return true;
+ if (!HasTargetFlags)
+ return false;
+ if (Dest.isReg())
+ return error(Loc, "register operands can't have target flags");
+ Dest.setTargetFlags(TF);
+ return false;
+}
+
+bool MIParser::parseOffset(int64_t &Offset) {
+ if (Token.isNot(MIToken::plus) && Token.isNot(MIToken::minus))
+ return false;
+ StringRef Sign = Token.range();
+ bool IsNegative = Token.is(MIToken::minus);
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after '" + Sign + "'");
+ if (Token.integerValue().getMinSignedBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Offset = Token.integerValue().getExtValue();
+ if (IsNegative)
+ Offset = -Offset;
+ lex();
+ return false;
+}
+
+bool MIParser::parseAlignment(unsigned &Alignment) {
+ assert(Token.is(MIToken::kw_align));
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected an integer literal after 'align'");
+ if (getUnsigned(Alignment))
+ return true;
+ lex();
+ return false;
+}
+
+bool MIParser::parseOperandsOffset(MachineOperand &Op) {
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Op.setOffset(Offset);
+ return false;
+}
+
+bool MIParser::parseIRValue(const Value *&V) {
+ switch (Token.kind()) {
+ case MIToken::NamedIRValue: {
+ V = MF.getFunction()->getValueSymbolTable().lookup(Token.stringValue());
+ break;
+ }
+ case MIToken::IRValue: {
+ unsigned SlotNumber = 0;
+ if (getUnsigned(SlotNumber))
+ return true;
+ V = getIRValue(SlotNumber);
+ break;
+ }
+ case MIToken::NamedGlobalValue:
+ case MIToken::GlobalValue: {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ V = GV;
+ break;
+ }
+ case MIToken::QuotedIRValue: {
+ const Constant *C = nullptr;
+ if (parseIRConstant(Token.location(), Token.stringValue(), C))
+ return true;
+ V = C;
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be an IR value reference");
+ }
+ if (!V)
+ return error(Twine("use of undefined IR value '") + Token.range() + "'");
+ return false;
+}
+
+bool MIParser::getUint64(uint64_t &Result) {
+ assert(Token.hasIntegerValue());
+ if (Token.integerValue().getActiveBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Result = Token.integerValue().getZExtValue();
+ return false;
+}
+
+bool MIParser::parseMemoryOperandFlag(unsigned &Flags) {
+ const unsigned OldFlags = Flags;
+ switch (Token.kind()) {
+ case MIToken::kw_volatile:
+ Flags |= MachineMemOperand::MOVolatile;
+ break;
+ case MIToken::kw_non_temporal:
+ Flags |= MachineMemOperand::MONonTemporal;
+ break;
+ case MIToken::kw_invariant:
+ Flags |= MachineMemOperand::MOInvariant;
+ break;
+ // TODO: Parse the target-specific memory operand flags.
+ default:
+ llvm_unreachable("The current token should be a memory operand flag");
+ }
+ if (OldFlags == Flags)
+ // We know that the same flag is specified more than once when the flags
+ // weren't modified.
+ return error("duplicate '" + Token.stringValue() + "' memory operand flag"); + lex(); + return false; +} + +bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { + switch (Token.kind()) { + case MIToken::kw_stack: + PSV = MF.getPSVManager().getStack(); + break; + case MIToken::kw_got: + PSV = MF.getPSVManager().getGOT(); + break; + case MIToken::kw_jump_table: + PSV = MF.getPSVManager().getJumpTable(); + break; + case MIToken::kw_constant_pool: + PSV = MF.getPSVManager().getConstantPool(); + break; + case MIToken::FixedStackObject: { + int FI; + if (parseFixedStackFrameIndex(FI)) + return true; + PSV = MF.getPSVManager().getFixedStack(FI); + // The token was already consumed, so use return here instead of break. + return false; + } + case MIToken::kw_call_entry: { + lex(); + switch (Token.kind()) { + case MIToken::GlobalValue: + case MIToken::NamedGlobalValue: { + GlobalValue *GV = nullptr; + if (parseGlobalValue(GV)) + return true; + PSV = MF.getPSVManager().getGlobalValueCallEntry(GV); + break; + } + case MIToken::ExternalSymbol: + PSV = MF.getPSVManager().getExternalSymbolCallEntry( + MF.createExternalSymbolName(Token.stringValue())); + break; + default: + return error( + "expected a global value or an external symbol after 'call-entry'"); + } + break; + } + default: + llvm_unreachable("The current token should be pseudo source value"); + } + lex(); + return false; +} + +bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { + if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || + Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || + Token.is(MIToken::FixedStackObject) || Token.is(MIToken::kw_call_entry)) { + const PseudoSourceValue *PSV = nullptr; + if (parseMemoryPseudoSourceValue(PSV)) + return true; + int64_t Offset = 0; + if (parseOffset(Offset)) + return true; + Dest = MachinePointerInfo(PSV, Offset); + return false; + } + if (Token.isNot(MIToken::NamedIRValue) && Token.isNot(MIToken::IRValue) && + Token.isNot(MIToken::GlobalValue) && + Token.isNot(MIToken::NamedGlobalValue) && + Token.isNot(MIToken::QuotedIRValue)) + return error("expected an IR value reference"); + const Value *V = nullptr; + if (parseIRValue(V)) + return true; + if (!V->getType()->isPointerTy()) + return error("expected a pointer IR value"); + lex(); + int64_t Offset = 0; + if (parseOffset(Offset)) + return true; + Dest = MachinePointerInfo(V, Offset); + return false; +} + +bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { + if (expectAndConsume(MIToken::lparen)) + return true; + unsigned Flags = 0; + while (Token.isMemoryOperandFlag()) { + if (parseMemoryOperandFlag(Flags)) + return true; + } + if (Token.isNot(MIToken::Identifier) || + (Token.stringValue() != "load" && Token.stringValue() != "store")) + return error("expected 'load' or 'store' memory operation"); + if (Token.stringValue() == "load") + Flags |= MachineMemOperand::MOLoad; + else + Flags |= MachineMemOperand::MOStore; + lex(); + + if (Token.isNot(MIToken::IntegerLiteral)) + return error("expected the size integer literal after memory operation"); + uint64_t Size; + if (getUint64(Size)) + return true; + lex(); + + const char *Word = Flags & MachineMemOperand::MOLoad ? 
"from" : "into"; + if (Token.isNot(MIToken::Identifier) || Token.stringValue() != Word) + return error(Twine("expected '") + Word + "'"); + lex(); + + MachinePointerInfo Ptr = MachinePointerInfo(); + if (parseMachinePointerInfo(Ptr)) + return true; + unsigned BaseAlignment = Size; + AAMDNodes AAInfo; + MDNode *Range = nullptr; + while (consumeIfPresent(MIToken::comma)) { + switch (Token.kind()) { + case MIToken::kw_align: + if (parseAlignment(BaseAlignment)) + return true; + break; + case MIToken::md_tbaa: + lex(); + if (parseMDNode(AAInfo.TBAA)) + return true; + break; + case MIToken::md_alias_scope: + lex(); + if (parseMDNode(AAInfo.Scope)) + return true; + break; + case MIToken::md_noalias: + lex(); + if (parseMDNode(AAInfo.NoAlias)) + return true; + break; + case MIToken::md_range: + lex(); + if (parseMDNode(Range)) + return true; + break; + // TODO: Report an error on duplicate metadata nodes. + default: + return error("expected 'align' or '!tbaa' or '!alias.scope' or " + "'!noalias' or '!range'"); + } + } + if (expectAndConsume(MIToken::rparen)) + return true; + Dest = + MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range); + return false; +} + void MIParser::initNames2InstrOpCodes() { if (!Names2InstrOpCodes.empty()) return; @@ -583,18 +1819,162 @@ unsigned MIParser::getSubRegIndex(StringRef Name) { return SubRegInfo->getValue(); } -bool llvm::parseMachineInstr(MachineInstr *&MI, SourceMgr &SM, - MachineFunction &MF, StringRef Src, - const PerFunctionMIParsingState &PFS, - const SlotMapping &IRSlots, SMDiagnostic &Error) { - return MIParser(SM, MF, Error, Src, PFS, IRSlots).parse(MI); +static void initSlots2BasicBlocks( + const Function &F, + DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (auto &BB : F) { + if (BB.hasName()) + continue; + int Slot = MST.getLocalSlot(&BB); + if (Slot == -1) + continue; + Slots2BasicBlocks.insert(std::make_pair(unsigned(Slot), &BB)); + } +} + +static const BasicBlock *getIRBlockFromSlot( + unsigned Slot, + const DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) { + auto BlockInfo = Slots2BasicBlocks.find(Slot); + if (BlockInfo == Slots2BasicBlocks.end()) + return nullptr; + return BlockInfo->second; +} + +const BasicBlock *MIParser::getIRBlock(unsigned Slot) { + if (Slots2BasicBlocks.empty()) + initSlots2BasicBlocks(*MF.getFunction(), Slots2BasicBlocks); + return getIRBlockFromSlot(Slot, Slots2BasicBlocks); +} + +const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { + if (&F == MF.getFunction()) + return getIRBlock(Slot); + DenseMap<unsigned, const BasicBlock *> CustomSlots2BasicBlocks; + initSlots2BasicBlocks(F, CustomSlots2BasicBlocks); + return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); +} + +static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, + DenseMap<unsigned, const Value *> &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. 
+static void initSlots2Values(const Function &F, + DenseMap<unsigned, const Value *> &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + +const Value *MIParser::getIRValue(unsigned Slot) { + if (Slots2Values.empty()) + initSlots2Values(*MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; +} + +void MIParser::initNames2TargetIndices() { + if (!Names2TargetIndices.empty()) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "Expected target instruction info"); + auto Indices = TII->getSerializableTargetIndices(); + for (const auto &I : Indices) + Names2TargetIndices.insert(std::make_pair(StringRef(I.second), I.first)); +} + +bool MIParser::getTargetIndex(StringRef Name, int &Index) { + initNames2TargetIndices(); + auto IndexInfo = Names2TargetIndices.find(Name); + if (IndexInfo == Names2TargetIndices.end()) + return true; + Index = IndexInfo->second; + return false; +} + +void MIParser::initNames2DirectTargetFlags() { + if (!Names2DirectTargetFlags.empty()) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "Expected target instruction info"); + auto Flags = TII->getSerializableDirectMachineOperandTargetFlags(); + for (const auto &I : Flags) + Names2DirectTargetFlags.insert( + std::make_pair(StringRef(I.second), I.first)); +} + +bool MIParser::getDirectTargetFlag(StringRef Name, unsigned &Flag) { + initNames2DirectTargetFlags(); + auto FlagInfo = Names2DirectTargetFlags.find(Name); + if (FlagInfo == Names2DirectTargetFlags.end()) + return true; + Flag = FlagInfo->second; + return false; +} + +void MIParser::initNames2BitmaskTargetFlags() { + if (!Names2BitmaskTargetFlags.empty()) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "Expected target instruction info"); + auto Flags = TII->getSerializableBitmaskMachineOperandTargetFlags(); + for (const auto &I : Flags) + Names2BitmaskTargetFlags.insert( + std::make_pair(StringRef(I.second), I.first)); +} + +bool MIParser::getBitmaskTargetFlag(StringRef Name, unsigned &Flag) { + initNames2BitmaskTargetFlags(); + auto FlagInfo = Names2BitmaskTargetFlags.find(Name); + if (FlagInfo == Names2BitmaskTargetFlags.end()) + return true; + Flag = FlagInfo->second; + return false; +} + +bool llvm::parseMachineBasicBlockDefinitions(MachineFunction &MF, StringRef Src, + PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, + SMDiagnostic &Error) { + SourceMgr SM; + SM.AddNewSourceBuffer( + MemoryBuffer::getMemBuffer(Src, "", /*RequiresNullTerminator=*/false), + SMLoc()); + return MIParser(SM, MF, Error, Src, PFS, IRSlots) + .parseBasicBlockDefinitions(PFS.MBBSlots); +} + +bool llvm::parseMachineInstructions(MachineFunction &MF, StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, + SMDiagnostic &Error) { + SourceMgr SM; + SM.AddNewSourceBuffer( + MemoryBuffer::getMemBuffer(Src, "", /*RequiresNullTerminator=*/false), + SMLoc()); + return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseBasicBlocks(); } bool llvm::parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM, MachineFunction &MF, StringRef Src, const 
PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots, SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseMBB(MBB);
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseStandaloneMBB(MBB);
}

bool llvm::parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
@@ -602,5 +1982,30 @@ bool llvm::parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM,
const PerFunctionMIParsingState &PFS,
const SlotMapping &IRSlots,
SMDiagnostic &Error) {
- return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseNamedRegister(Reg);
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots)
+ .parseStandaloneNamedRegister(Reg);
+}
+
+bool llvm::parseVirtualRegisterReference(unsigned &Reg, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots,
+ SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots)
+ .parseStandaloneVirtualRegister(Reg);
+}
+
+bool llvm::parseStackObjectReference(int &FI, SourceMgr &SM,
+ MachineFunction &MF, StringRef Src,
+ const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots, SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots)
+ .parseStandaloneStackObject(FI);
+}
+
+bool llvm::parseMDNode(MDNode *&Node, SourceMgr &SM, MachineFunction &MF,
+ StringRef Src, const PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots, SMDiagnostic &Error) {
+ return MIParser(SM, MF, Error, Src, PFS, IRSlots).parseStandaloneMDNode(Node);
}
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
index fca4c4e..8aef704 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
@@ -19,9 +19,11 @@
namespace llvm {

+class BasicBlock;
class MachineBasicBlock;
class MachineInstr;
class MachineFunction;
+class MDNode;
struct SlotMapping;
class SMDiagnostic;
class SourceMgr;
@@ -29,11 +31,42 @@ class SourceMgr;
struct PerFunctionMIParsingState {
DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
DenseMap<unsigned, unsigned> VirtualRegisterSlots;
+ DenseMap<unsigned, int> FixedStackObjectSlots;
+ DenseMap<unsigned, int> StackObjectSlots;
+ DenseMap<unsigned, unsigned> ConstantPoolSlots;
+ DenseMap<unsigned, unsigned> JumpTableSlots;
};

-bool parseMachineInstr(MachineInstr *&MI, SourceMgr &SM, MachineFunction &MF,
- StringRef Src, const PerFunctionMIParsingState &PFS,
- const SlotMapping &IRSlots, SMDiagnostic &Error);
+/// Parse the machine basic block definitions, and skip the machine
+/// instructions.
+///
+/// This function runs the first parsing pass on the machine function's body.
+/// It parses only the machine basic block definitions and creates the machine
+/// basic blocks in the given machine function.
+///
+/// The machine instructions aren't parsed during the first pass because not
+/// all of the machine basic blocks are defined yet, which makes it impossible
+/// to resolve the machine basic block references.
+///
+/// Return true if an error occurred.
+bool parseMachineBasicBlockDefinitions(MachineFunction &MF, StringRef Src,
+ PerFunctionMIParsingState &PFS,
+ const SlotMapping &IRSlots,
+ SMDiagnostic &Error);
+
+/// Parse the machine instructions.
+///
+/// This function runs the second parsing pass on the machine function's body.
+/// It skips the machine basic block definitions and parses only the machine
+/// instructions and basic block attributes like liveins and successors.
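+///
+/// For example, an instruction in bb.0 may reference %bb.3 before that
+/// block's definition has been seen; deferring instruction parsing to the
+/// second pass guarantees that every such forward reference can be resolved.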
+/// +/// The second parsing pass assumes that the first parsing pass already ran +/// on the given source string. +/// +/// Return true if an error occurred. +bool parseMachineInstructions(MachineFunction &MF, StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, SMDiagnostic &Error); bool parseMBBReference(MachineBasicBlock *&MBB, SourceMgr &SM, MachineFunction &MF, StringRef Src, @@ -46,6 +79,21 @@ bool parseNamedRegisterReference(unsigned &Reg, SourceMgr &SM, const SlotMapping &IRSlots, SMDiagnostic &Error); +bool parseVirtualRegisterReference(unsigned &Reg, SourceMgr &SM, + MachineFunction &MF, StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, + SMDiagnostic &Error); + +bool parseStackObjectReference(int &FI, SourceMgr &SM, MachineFunction &MF, + StringRef Src, + const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, SMDiagnostic &Error); + +bool parseMDNode(MDNode *&Node, SourceMgr &SM, MachineFunction &MF, + StringRef Src, const PerFunctionMIParsingState &PFS, + const SlotMapping &IRSlots, SMDiagnostic &Error); + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 16b0e16..422efbc 100644 --- a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -20,8 +20,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/IR/BasicBlock.h" @@ -95,30 +97,53 @@ public: /// Return true if error occurred. bool initializeMachineFunction(MachineFunction &MF); - /// Initialize the machine basic block using it's YAML representation. - /// - /// Return true if an error occurred. 
- bool initializeMachineBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, - const yaml::MachineBasicBlock &YamlMBB, - const PerFunctionMIParsingState &PFS); + bool initializeRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS); + + void inferRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF); + + bool initializeFrameInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS); + + bool parseCalleeSavedRegister(MachineFunction &MF, + PerFunctionMIParsingState &PFS, + std::vector<CalleeSavedInfo> &CSIInfo, + const yaml::StringValue &RegisterSource, + int FrameIdx); + + bool parseStackObjectsDebugInfo(MachineFunction &MF, + PerFunctionMIParsingState &PFS, + const yaml::MachineStackObject &Object, + int FrameIdx); - bool - initializeRegisterInfo(const MachineFunction &MF, - MachineRegisterInfo &RegInfo, - const yaml::MachineFunction &YamlMF, - DenseMap<unsigned, unsigned> &VirtualRegisterSlots); + bool initializeConstantPool(MachineConstantPool &ConstantPool, + const yaml::MachineFunction &YamlMF, + const MachineFunction &MF, + DenseMap<unsigned, unsigned> &ConstantPoolSlots); - bool initializeFrameInfo(MachineFrameInfo &MFI, - const yaml::MachineFunction &YamlMF); + bool initializeJumpTableInfo(MachineFunction &MF, + const yaml::MachineJumpTable &YamlJTI, + PerFunctionMIParsingState &PFS); private: + bool parseMDNode(MDNode *&Node, const yaml::StringValue &Source, + MachineFunction &MF, const PerFunctionMIParsingState &PFS); + + bool parseMBBReference(MachineBasicBlock *&MBB, + const yaml::StringValue &Source, MachineFunction &MF, + const PerFunctionMIParsingState &PFS); + /// Return a MIR diagnostic converted from an MI string diagnostic. SMDiagnostic diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange); - /// Return a MIR diagnostic converted from an LLVM assembly diagnostic. - SMDiagnostic diagFromLLVMAssemblyDiag(const SMDiagnostic &Error, - SMRange SourceRange); + /// Return a MIR diagnostic converted from a diagnostic located in a YAML + /// block scalar string. + SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error, + SMRange SourceRange); /// Create an empty function with the given name. 
void createDummyFunction(StringRef Name, Module &M); @@ -200,7 +225,7 @@ std::unique_ptr<Module> MIRParserImpl::parse() { M = parseAssembly(MemoryBufferRef(BSN->getValue(), Filename), Error, Context, &IRSlots); if (!M) { - reportDiagnostic(diagFromLLVMAssemblyDiag(Error, BSN->getSourceRange())); + reportDiagnostic(diagFromBlockStringDiag(Error, BSN->getSourceRange())); return M; } In.nextDocument(); @@ -261,88 +286,56 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); MF.setHasInlineAsm(YamlMF.HasInlineAsm); PerFunctionMIParsingState PFS; - if (initializeRegisterInfo(MF, MF.getRegInfo(), YamlMF, - PFS.VirtualRegisterSlots)) - return true; - if (initializeFrameInfo(*MF.getFrameInfo(), YamlMF)) + if (initializeRegisterInfo(MF, YamlMF, PFS)) return true; - - const auto &F = *MF.getFunction(); - for (const auto &YamlMBB : YamlMF.BasicBlocks) { - const BasicBlock *BB = nullptr; - const yaml::StringValue &Name = YamlMBB.Name; - if (!Name.Value.empty()) { - BB = dyn_cast_or_null<BasicBlock>( - F.getValueSymbolTable().lookup(Name.Value)); - if (!BB) - return error(Name.SourceRange.Start, - Twine("basic block '") + Name.Value + - "' is not defined in the function '" + MF.getName() + - "'"); - } - auto *MBB = MF.CreateMachineBasicBlock(BB); - MF.insert(MF.end(), MBB); - bool WasInserted = - PFS.MBBSlots.insert(std::make_pair(YamlMBB.ID, MBB)).second; - if (!WasInserted) - return error(Twine("redefinition of machine basic block with id #") + - Twine(YamlMBB.ID)); - } - - if (YamlMF.BasicBlocks.empty()) - return error(Twine("machine function '") + Twine(MF.getName()) + - "' requires at least one machine basic block in its body"); - // Initialize the machine basic blocks after creating them all so that the - // machine instructions parser can resolve the MBB references. - unsigned I = 0; - for (const auto &YamlMBB : YamlMF.BasicBlocks) { - if (initializeMachineBasicBlock(MF, *MF.getBlockNumbered(I++), YamlMBB, - PFS)) + if (!YamlMF.Constants.empty()) { + auto *ConstantPool = MF.getConstantPool(); + assert(ConstantPool && "Constant pool must be created"); + if (initializeConstantPool(*ConstantPool, YamlMF, MF, + PFS.ConstantPoolSlots)) return true; } - return false; -} -bool MIRParserImpl::initializeMachineBasicBlock( - MachineFunction &MF, MachineBasicBlock &MBB, - const yaml::MachineBasicBlock &YamlMBB, - const PerFunctionMIParsingState &PFS) { - MBB.setAlignment(YamlMBB.Alignment); - if (YamlMBB.AddressTaken) - MBB.setHasAddressTaken(); - MBB.setIsLandingPad(YamlMBB.IsLandingPad); SMDiagnostic Error; - // Parse the successors. - for (const auto &MBBSource : YamlMBB.Successors) { - MachineBasicBlock *SuccMBB = nullptr; - if (parseMBBReference(SuccMBB, SM, MF, MBBSource.Value, PFS, IRSlots, - Error)) - return error(Error, MBBSource.SourceRange); - // TODO: Report an error when adding the same successor more than once. - MBB.addSuccessor(SuccMBB); - } - // Parse the liveins. - for (const auto &LiveInSource : YamlMBB.LiveIns) { - unsigned Reg = 0; - if (parseNamedRegisterReference(Reg, SM, MF, LiveInSource.Value, PFS, - IRSlots, Error)) - return error(Error, LiveInSource.SourceRange); - MBB.addLiveIn(Reg); + if (parseMachineBasicBlockDefinitions(MF, YamlMF.Body.Value.Value, PFS, + IRSlots, Error)) { + reportDiagnostic( + diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange)); + return true; } - // Parse the instructions. 
- for (const auto &MISource : YamlMBB.Instructions) { - MachineInstr *MI = nullptr; - if (parseMachineInstr(MI, SM, MF, MISource.Value, PFS, IRSlots, Error)) - return error(Error, MISource.SourceRange); - MBB.insert(MBB.end(), MI); + + if (MF.empty()) + return error(Twine("machine function '") + Twine(MF.getName()) + + "' requires at least one machine basic block in its body"); + // Initialize the frame information after creating all the MBBs so that the + // MBB references in the frame information can be resolved. + if (initializeFrameInfo(MF, YamlMF, PFS)) + return true; + // Initialize the jump table after creating all the MBBs so that the MBB + // references can be resolved. + if (!YamlMF.JumpTableInfo.Entries.empty() && + initializeJumpTableInfo(MF, YamlMF.JumpTableInfo, PFS)) + return true; + // Parse the machine instructions after creating all of the MBBs so that the + // parser can resolve the MBB references. + if (parseMachineInstructions(MF, YamlMF.Body.Value.Value, PFS, IRSlots, + Error)) { + reportDiagnostic( + diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange)); + return true; } + inferRegisterInfo(MF, YamlMF); + // FIXME: This is a temporary workaround until the reserved registers can be + // serialized. + MF.getRegInfo().freezeReservedRegs(MF); + MF.verify(); return false; } -bool MIRParserImpl::initializeRegisterInfo( - const MachineFunction &MF, MachineRegisterInfo &RegInfo, - const yaml::MachineFunction &YamlMF, - DenseMap<unsigned, unsigned> &VirtualRegisterSlots) { +bool MIRParserImpl::initializeRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS) { + MachineRegisterInfo &RegInfo = MF.getRegInfo(); assert(RegInfo.isSSA()); if (!YamlMF.IsSSA) RegInfo.leaveSSA(); @@ -351,6 +344,7 @@ bool MIRParserImpl::initializeRegisterInfo( RegInfo.invalidateLiveness(); RegInfo.enableSubRegLiveness(YamlMF.TracksSubRegLiveness); + SMDiagnostic Error; // Parse the virtual register information. for (const auto &VReg : YamlMF.VirtualRegisters) { const auto *RC = getRegClass(MF, VReg.Class.Value); @@ -359,15 +353,71 @@ bool MIRParserImpl::initializeRegisterInfo( Twine("use of undefined register class '") + VReg.Class.Value + "'"); unsigned Reg = RegInfo.createVirtualRegister(RC); - // TODO: Report an error when the same virtual register with the same ID is - // redefined. - VirtualRegisterSlots.insert(std::make_pair(VReg.ID, Reg)); + if (!PFS.VirtualRegisterSlots.insert(std::make_pair(VReg.ID.Value, Reg)) + .second) + return error(VReg.ID.SourceRange.Start, + Twine("redefinition of virtual register '%") + + Twine(VReg.ID.Value) + "'"); + if (!VReg.PreferredRegister.Value.empty()) { + unsigned PreferredReg = 0; + if (parseNamedRegisterReference(PreferredReg, SM, MF, + VReg.PreferredRegister.Value, PFS, + IRSlots, Error)) + return error(Error, VReg.PreferredRegister.SourceRange); + RegInfo.setSimpleHint(Reg, PreferredReg); + } } + + // Parse the liveins. + for (const auto &LiveIn : YamlMF.LiveIns) { + unsigned Reg = 0; + if (parseNamedRegisterReference(Reg, SM, MF, LiveIn.Register.Value, PFS, + IRSlots, Error)) + return error(Error, LiveIn.Register.SourceRange); + unsigned VReg = 0; + if (!LiveIn.VirtualRegister.Value.empty()) { + if (parseVirtualRegisterReference( + VReg, SM, MF, LiveIn.VirtualRegister.Value, PFS, IRSlots, Error)) + return error(Error, LiveIn.VirtualRegister.SourceRange); + } + RegInfo.addLiveIn(Reg, VReg); + } + + // Parse the callee saved register mask. 
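+ // The serialized list names the registers that are preserved, while
+ // MachineRegisterInfo tracks the inverse (the used physical register
+ // mask), hence the flip() below: listing, e.g., only '%rbx' as callee
+ // saved would mark every other register in the mask as used.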
+ BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size()); + if (!YamlMF.CalleeSavedRegisters) + return false; + for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) { + unsigned Reg = 0; + if (parseNamedRegisterReference(Reg, SM, MF, RegSource.Value, PFS, IRSlots, + Error)) + return error(Error, RegSource.SourceRange); + CalleeSavedRegisterMask[Reg] = true; + } + RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip()); return false; } -bool MIRParserImpl::initializeFrameInfo(MachineFrameInfo &MFI, - const yaml::MachineFunction &YamlMF) { +void MIRParserImpl::inferRegisterInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF) { + if (YamlMF.CalleeSavedRegisters) + return; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + MF.getRegInfo().addPhysRegsUsedFromRegMask(MO.getRegMask()); + } + } + } +} + +bool MIRParserImpl::initializeFrameInfo(MachineFunction &MF, + const yaml::MachineFunction &YamlMF, + PerFunctionMIParsingState &PFS) { + MachineFrameInfo &MFI = *MF.getFrameInfo(); + const Function &F = *MF.getFunction(); const yaml::MachineFrameInfo &YamlMFI = YamlMF.FrameInfo; MFI.setFrameAddressIsTaken(YamlMFI.IsFrameAddressTaken); MFI.setReturnAddressIsTaken(YamlMFI.IsReturnAddressTaken); @@ -383,7 +433,20 @@ bool MIRParserImpl::initializeFrameInfo(MachineFrameInfo &MFI, MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment); MFI.setHasVAStart(YamlMFI.HasVAStart); MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc); + if (!YamlMFI.SavePoint.Value.empty()) { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB, YamlMFI.SavePoint, MF, PFS)) + return true; + MFI.setSavePoint(MBB); + } + if (!YamlMFI.RestorePoint.Value.empty()) { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB, YamlMFI.RestorePoint, MF, PFS)) + return true; + MFI.setRestorePoint(MBB); + } + std::vector<CalleeSavedInfo> CSIInfo; // Initialize the fixed frame objects. for (const auto &Object : YamlMF.FixedStackObjects) { int ObjectIdx; @@ -393,27 +456,190 @@ bool MIRParserImpl::initializeFrameInfo(MachineFrameInfo &MFI, else ObjectIdx = MFI.CreateFixedSpillStackObject(Object.Size, Object.Offset); MFI.setObjectAlignment(ObjectIdx, Object.Alignment); - // TODO: Store the mapping between fixed object IDs and object indices to - // parse fixed stack object references correctly. + if (!PFS.FixedStackObjectSlots.insert(std::make_pair(Object.ID.Value, + ObjectIdx)) + .second) + return error(Object.ID.SourceRange.Start, + Twine("redefinition of fixed stack object '%fixed-stack.") + + Twine(Object.ID.Value) + "'"); + if (parseCalleeSavedRegister(MF, PFS, CSIInfo, Object.CalleeSavedRegister, + ObjectIdx)) + return true; } // Initialize the ordinary frame objects. 
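+ // Each serialized object looks roughly like
+ //   { id: 0, name: buf, type: default, offset: -12, size: 4, alignment: 4 }
+ // (exact key spelling per MIRYamlMapping); the loop below recreates the
+ // frame index and records the id -> index mapping in StackObjectSlots.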
for (const auto &Object : YamlMF.StackObjects) { int ObjectIdx; + const AllocaInst *Alloca = nullptr; + const yaml::StringValue &Name = Object.Name; + if (!Name.Value.empty()) { + Alloca = dyn_cast_or_null<AllocaInst>( + F.getValueSymbolTable().lookup(Name.Value)); + if (!Alloca) + return error(Name.SourceRange.Start, + "alloca instruction named '" + Name.Value + + "' isn't defined in the function '" + F.getName() + + "'"); + } if (Object.Type == yaml::MachineStackObject::VariableSized) - ObjectIdx = - MFI.CreateVariableSizedObject(Object.Alignment, /*Alloca=*/nullptr); + ObjectIdx = MFI.CreateVariableSizedObject(Object.Alignment, Alloca); else ObjectIdx = MFI.CreateStackObject( Object.Size, Object.Alignment, - Object.Type == yaml::MachineStackObject::SpillSlot); + Object.Type == yaml::MachineStackObject::SpillSlot, Alloca); MFI.setObjectOffset(ObjectIdx, Object.Offset); - // TODO: Store the mapping between object IDs and object indices to parse - // stack object references correctly. + if (!PFS.StackObjectSlots.insert(std::make_pair(Object.ID.Value, ObjectIdx)) + .second) + return error(Object.ID.SourceRange.Start, + Twine("redefinition of stack object '%stack.") + + Twine(Object.ID.Value) + "'"); + if (parseCalleeSavedRegister(MF, PFS, CSIInfo, Object.CalleeSavedRegister, + ObjectIdx)) + return true; + if (Object.LocalOffset) + MFI.mapLocalFrameObject(ObjectIdx, Object.LocalOffset.getValue()); + if (parseStackObjectsDebugInfo(MF, PFS, Object, ObjectIdx)) + return true; + } + MFI.setCalleeSavedInfo(CSIInfo); + if (!CSIInfo.empty()) + MFI.setCalleeSavedInfoValid(true); + + // Initialize the various stack object references after initializing the + // stack objects. + if (!YamlMFI.StackProtector.Value.empty()) { + SMDiagnostic Error; + int FI; + if (parseStackObjectReference(FI, SM, MF, YamlMFI.StackProtector.Value, PFS, + IRSlots, Error)) + return error(Error, YamlMFI.StackProtector.SourceRange); + MFI.setStackProtectorIndex(FI); + } + return false; +} + +bool MIRParserImpl::parseCalleeSavedRegister( + MachineFunction &MF, PerFunctionMIParsingState &PFS, + std::vector<CalleeSavedInfo> &CSIInfo, + const yaml::StringValue &RegisterSource, int FrameIdx) { + if (RegisterSource.Value.empty()) + return false; + unsigned Reg = 0; + SMDiagnostic Error; + if (parseNamedRegisterReference(Reg, SM, MF, RegisterSource.Value, PFS, + IRSlots, Error)) + return error(Error, RegisterSource.SourceRange); + CSIInfo.push_back(CalleeSavedInfo(Reg, FrameIdx)); + return false; +} + +/// Verify that given node is of a certain type. Return true on error. +template <typename T> +static bool typecheckMDNode(T *&Result, MDNode *Node, + const yaml::StringValue &Source, + StringRef TypeString, MIRParserImpl &Parser) { + if (!Node) + return false; + Result = dyn_cast<T>(Node); + if (!Result) + return Parser.error(Source.SourceRange.Start, + "expected a reference to a '" + TypeString + + "' metadata node"); + return false; +} + +bool MIRParserImpl::parseStackObjectsDebugInfo( + MachineFunction &MF, PerFunctionMIParsingState &PFS, + const yaml::MachineStackObject &Object, int FrameIdx) { + // Debug information can only be attached to stack objects; Fixed stack + // objects aren't supported. 
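+ // (Fixed objects occupy the negative frame indices, ordinary stack
+ // objects the non-negative ones, which is what the assertion below
+ // checks.)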
+ assert(FrameIdx >= 0 && "Expected a stack object frame index"); + MDNode *Var = nullptr, *Expr = nullptr, *Loc = nullptr; + if (parseMDNode(Var, Object.DebugVar, MF, PFS) || + parseMDNode(Expr, Object.DebugExpr, MF, PFS) || + parseMDNode(Loc, Object.DebugLoc, MF, PFS)) + return true; + if (!Var && !Expr && !Loc) + return false; + DILocalVariable *DIVar = nullptr; + DIExpression *DIExpr = nullptr; + DILocation *DILoc = nullptr; + if (typecheckMDNode(DIVar, Var, Object.DebugVar, "DILocalVariable", *this) || + typecheckMDNode(DIExpr, Expr, Object.DebugExpr, "DIExpression", *this) || + typecheckMDNode(DILoc, Loc, Object.DebugLoc, "DILocation", *this)) + return true; + MF.getMMI().setVariableDbgInfo(DIVar, DIExpr, unsigned(FrameIdx), DILoc); + return false; +} + +bool MIRParserImpl::parseMDNode(MDNode *&Node, const yaml::StringValue &Source, + MachineFunction &MF, + const PerFunctionMIParsingState &PFS) { + if (Source.Value.empty()) + return false; + SMDiagnostic Error; + if (llvm::parseMDNode(Node, SM, MF, Source.Value, PFS, IRSlots, Error)) + return error(Error, Source.SourceRange); + return false; +} + +bool MIRParserImpl::initializeConstantPool( + MachineConstantPool &ConstantPool, const yaml::MachineFunction &YamlMF, + const MachineFunction &MF, + DenseMap<unsigned, unsigned> &ConstantPoolSlots) { + const auto &M = *MF.getFunction()->getParent(); + SMDiagnostic Error; + for (const auto &YamlConstant : YamlMF.Constants) { + const Constant *Value = dyn_cast_or_null<Constant>( + parseConstantValue(YamlConstant.Value.Value, Error, M)); + if (!Value) + return error(Error, YamlConstant.Value.SourceRange); + unsigned Alignment = + YamlConstant.Alignment + ? YamlConstant.Alignment + : M.getDataLayout().getPrefTypeAlignment(Value->getType()); + unsigned Index = ConstantPool.getConstantPoolIndex(Value, Alignment); + if (!ConstantPoolSlots.insert(std::make_pair(YamlConstant.ID.Value, Index)) + .second) + return error(YamlConstant.ID.SourceRange.Start, + Twine("redefinition of constant pool item '%const.") + + Twine(YamlConstant.ID.Value) + "'"); } return false; } +bool MIRParserImpl::initializeJumpTableInfo( + MachineFunction &MF, const yaml::MachineJumpTable &YamlJTI, + PerFunctionMIParsingState &PFS) { + MachineJumpTableInfo *JTI = MF.getOrCreateJumpTableInfo(YamlJTI.Kind); + for (const auto &Entry : YamlJTI.Entries) { + std::vector<MachineBasicBlock *> Blocks; + for (const auto &MBBSource : Entry.Blocks) { + MachineBasicBlock *MBB = nullptr; + if (parseMBBReference(MBB, MBBSource.Value, MF, PFS)) + return true; + Blocks.push_back(MBB); + } + unsigned Index = JTI->createJumpTableIndex(Blocks); + if (!PFS.JumpTableSlots.insert(std::make_pair(Entry.ID.Value, Index)) + .second) + return error(Entry.ID.SourceRange.Start, + Twine("redefinition of jump table entry '%jump-table.") + + Twine(Entry.ID.Value) + "'"); + } + return false; +} + +bool MIRParserImpl::parseMBBReference(MachineBasicBlock *&MBB, + const yaml::StringValue &Source, + MachineFunction &MF, + const PerFunctionMIParsingState &PFS) { + SMDiagnostic Error; + if (llvm::parseMBBReference(MBB, SM, MF, Source.Value, PFS, IRSlots, Error)) + return error(Error, Source.SourceRange); + return false; +} + SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, SMRange SourceRange) { assert(SourceRange.isValid() && "Invalid source range"); @@ -430,8 +656,8 @@ SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error, Error.getFixIts()); } -SMDiagnostic MIRParserImpl::diagFromLLVMAssemblyDiag(const SMDiagnostic 
&Error, - SMRange SourceRange) { +SMDiagnostic MIRParserImpl::diagFromBlockStringDiag(const SMDiagnostic &Error, + SMRange SourceRange) { assert(SourceRange.isValid()); // Translate the location of the error from the location in the llvm IR string diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp index d5cf924..175cb0d 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp @@ -14,13 +14,20 @@ #include "MIRPrinter.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/YAMLTraits.h" @@ -31,11 +38,38 @@ using namespace llvm; namespace { +/// This structure describes how to print out stack object references. +struct FrameIndexOperand { + std::string Name; + unsigned ID; + bool IsFixed; + + FrameIndexOperand(StringRef Name, unsigned ID, bool IsFixed) + : Name(Name.str()), ID(ID), IsFixed(IsFixed) {} + + /// Return an ordinary stack object reference. + static FrameIndexOperand create(StringRef Name, unsigned ID) { + return FrameIndexOperand(Name, ID, /*IsFixed=*/false); + } + + /// Return a fixed stack object reference. + static FrameIndexOperand createFixed(unsigned ID) { + return FrameIndexOperand("", ID, /*IsFixed=*/true); + } +}; + +} // end anonymous namespace + +namespace llvm { + /// This class prints out the machine functions using the MIR serialization /// format. class MIRPrinter { raw_ostream &OS; DenseMap<const uint32_t *, unsigned> RegisterMaskIds; + /// Maps from stack object indices to operand indices which will be used when + /// printing frame index machine operands. 
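+ /// For example, a fixed object at frame index -1 may print as
+ /// '%fixed-stack.0', while an ordinary object at index 0 named 'x' prints
+ /// as '%stack.0.x' (see printStackObjectReference).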
+ DenseMap<int, FrameIndexOperand> StackObjectOperandMapping; public: MIRPrinter(raw_ostream &OS) : OS(OS) {} @@ -44,11 +78,16 @@ public: void convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo, const TargetRegisterInfo *TRI); - void convert(yaml::MachineFrameInfo &YamlMFI, const MachineFrameInfo &MFI); - void convert(ModuleSlotTracker &MST, yaml::MachineBasicBlock &YamlMBB, - const MachineBasicBlock &MBB); + void convert(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, + const MachineFrameInfo &MFI); + void convert(yaml::MachineFunction &MF, + const MachineConstantPool &ConstantPool); + void convert(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, + const MachineJumpTableInfo &JTI); void convertStackObjects(yaml::MachineFunction &MF, - const MachineFrameInfo &MFI); + const MachineFrameInfo &MFI, MachineModuleInfo &MMI, + ModuleSlotTracker &MST, + const TargetRegisterInfo *TRI); private: void initRegisterMaskIds(const MachineFunction &MF); @@ -60,18 +99,32 @@ class MIPrinter { raw_ostream &OS; ModuleSlotTracker &MST; const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds; + const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping; public: MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST, - const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds) - : OS(OS), MST(MST), RegisterMaskIds(RegisterMaskIds) {} + const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds, + const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping) + : OS(OS), MST(MST), RegisterMaskIds(RegisterMaskIds), + StackObjectOperandMapping(StackObjectOperandMapping) {} + + void print(const MachineBasicBlock &MBB); void print(const MachineInstr &MI); void printMBBReference(const MachineBasicBlock &MBB); - void print(const MachineOperand &Op, const TargetRegisterInfo *TRI); + void printIRBlockReference(const BasicBlock &BB); + void printIRValueReference(const Value &V); + void printStackObjectReference(int FrameIndex); + void printOffset(int64_t Offset); + void printTargetFlags(const MachineOperand &Op); + void print(const MachineOperand &Op, const TargetRegisterInfo *TRI, + unsigned I, bool ShouldPrintRegisterTies, bool IsDef = false); + void print(const MachineMemOperand &Op); + + void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI); }; -} // end anonymous namespace +} // end namespace llvm namespace llvm { namespace yaml { @@ -103,6 +156,12 @@ static void printReg(unsigned Reg, raw_ostream &OS, llvm_unreachable("Can't print this kind of register yet"); } +static void printReg(unsigned Reg, yaml::StringValue &Dest, + const TargetRegisterInfo *TRI) { + raw_string_ostream OS(Dest.Value); + printReg(Reg, OS, TRI); +} + void MIRPrinter::print(const MachineFunction &MF) { initRegisterMaskIds(MF); @@ -112,23 +171,25 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); YamlMF.HasInlineAsm = MF.hasInlineAsm(); convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); - convert(YamlMF.FrameInfo, *MF.getFrameInfo()); - convertStackObjects(YamlMF, *MF.getFrameInfo()); - - int I = 0; ModuleSlotTracker MST(MF.getFunction()->getParent()); + MST.incorporateFunction(*MF.getFunction()); + convert(MST, YamlMF.FrameInfo, *MF.getFrameInfo()); + convertStackObjects(YamlMF, *MF.getFrameInfo(), MF.getMMI(), MST, + MF.getSubtarget().getRegisterInfo()); + if (const auto *ConstantPool = MF.getConstantPool()) + convert(YamlMF, *ConstantPool); + if (const auto *JumpTableInfo = MF.getJumpTableInfo()) + 
convert(MST, YamlMF.JumpTableInfo, *JumpTableInfo); + raw_string_ostream StrOS(YamlMF.Body.Value.Value); + bool IsNewlineNeeded = false; for (const auto &MBB : MF) { - // TODO: Allow printing of non sequentially numbered MBBs. - // This is currently needed as the basic block references get their index - // from MBB.getNumber(), thus it should be sequential so that the parser can - // map back to the correct MBBs when parsing the output. - assert(MBB.getNumber() == I++ && - "Can't print MBBs that aren't sequentially numbered"); - (void)I; - yaml::MachineBasicBlock YamlMBB; - convert(MST, YamlMBB, MBB); - YamlMF.BasicBlocks.push_back(YamlMBB); + if (IsNewlineNeeded) + StrOS << "\n"; + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .print(MBB); + IsNewlineNeeded = true; } + StrOS.flush(); yaml::Output Out(OS); Out << YamlMF; } @@ -147,11 +208,38 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, VReg.ID = I; VReg.Class = StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower(); + unsigned PreferredReg = RegInfo.getSimpleHint(Reg); + if (PreferredReg) + printReg(PreferredReg, VReg.PreferredRegister, TRI); MF.VirtualRegisters.push_back(VReg); } + + // Print the live ins. + for (auto I = RegInfo.livein_begin(), E = RegInfo.livein_end(); I != E; ++I) { + yaml::MachineFunctionLiveIn LiveIn; + printReg(I->first, LiveIn.Register, TRI); + if (I->second) + printReg(I->second, LiveIn.VirtualRegister, TRI); + MF.LiveIns.push_back(LiveIn); + } + // The used physical register mask is printed as an inverted callee saved + // register mask. + const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask(); + if (UsedPhysRegMask.none()) + return; + std::vector<yaml::FlowStringValue> CalleeSavedRegisters; + for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) { + if (!UsedPhysRegMask[I]) { + yaml::FlowStringValue Reg; + printReg(I, Reg, TRI); + CalleeSavedRegisters.push_back(Reg); + } + } + MF.CalleeSavedRegisters = CalleeSavedRegisters; } -void MIRPrinter::convert(yaml::MachineFrameInfo &YamlMFI, +void MIRPrinter::convert(ModuleSlotTracker &MST, + yaml::MachineFrameInfo &YamlMFI, const MachineFrameInfo &MFI) { YamlMFI.IsFrameAddressTaken = MFI.isFrameAddressTaken(); YamlMFI.IsReturnAddressTaken = MFI.isReturnAddressTaken(); @@ -166,10 +254,23 @@ void MIRPrinter::convert(yaml::MachineFrameInfo &YamlMFI, YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment(); YamlMFI.HasVAStart = MFI.hasVAStart(); YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc(); + if (MFI.getSavePoint()) { + raw_string_ostream StrOS(YamlMFI.SavePoint.Value); + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .printMBBReference(*MFI.getSavePoint()); + } + if (MFI.getRestorePoint()) { + raw_string_ostream StrOS(YamlMFI.RestorePoint.Value); + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .printMBBReference(*MFI.getRestorePoint()); + } } void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, - const MachineFrameInfo &MFI) { + const MachineFrameInfo &MFI, + MachineModuleInfo &MMI, + ModuleSlotTracker &MST, + const TargetRegisterInfo *TRI) { // Process fixed stack objects. unsigned ID = 0; for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { @@ -177,7 +278,7 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, continue; yaml::FixedMachineStackObject YamlObject; - YamlObject.ID = ID++; + YamlObject.ID = ID; YamlObject.Type = MFI.isSpillSlotObjectIndex(I) ? 
yaml::FixedMachineStackObject::SpillSlot : yaml::FixedMachineStackObject::DefaultType; @@ -187,8 +288,8 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, YamlObject.IsImmutable = MFI.isImmutableObjectIndex(I); YamlObject.IsAliased = MFI.isAliasedObjectIndex(I); MF.FixedStackObjects.push_back(YamlObject); - // TODO: Store the mapping between fixed object IDs and object indices to - // print the fixed stack object references correctly. + StackObjectOperandMapping.insert( + std::make_pair(I, FrameIndexOperand::createFixed(ID++))); } // Process ordinary stack objects. @@ -198,7 +299,10 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, continue; yaml::MachineStackObject YamlObject; - YamlObject.ID = ID++; + YamlObject.ID = ID; + if (const auto *Alloca = MFI.getObjectAllocation(I)) + YamlObject.Name.Value = + Alloca->hasName() ? Alloca->getName() : "<unnamed alloca>"; YamlObject.Type = MFI.isSpillSlotObjectIndex(I) ? yaml::MachineStackObject::SpillSlot : MFI.isVariableSizedObjectIndex(I) @@ -209,47 +313,100 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF, YamlObject.Alignment = MFI.getObjectAlignment(I); MF.StackObjects.push_back(YamlObject); - // TODO: Store the mapping between object IDs and object indices to print - // the stack object references correctly. + StackObjectOperandMapping.insert(std::make_pair( + I, FrameIndexOperand::create(YamlObject.Name.Value, ID++))); + } + + for (const auto &CSInfo : MFI.getCalleeSavedInfo()) { + yaml::StringValue Reg; + printReg(CSInfo.getReg(), Reg, TRI); + auto StackObjectInfo = StackObjectOperandMapping.find(CSInfo.getFrameIdx()); + assert(StackObjectInfo != StackObjectOperandMapping.end() && + "Invalid stack object index"); + const FrameIndexOperand &StackObject = StackObjectInfo->second; + if (StackObject.IsFixed) + MF.FixedStackObjects[StackObject.ID].CalleeSavedRegister = Reg; + else + MF.StackObjects[StackObject.ID].CalleeSavedRegister = Reg; + } + for (unsigned I = 0, E = MFI.getLocalFrameObjectCount(); I < E; ++I) { + auto LocalObject = MFI.getLocalFrameObjectMap(I); + auto StackObjectInfo = StackObjectOperandMapping.find(LocalObject.first); + assert(StackObjectInfo != StackObjectOperandMapping.end() && + "Invalid stack object index"); + const FrameIndexOperand &StackObject = StackObjectInfo->second; + assert(!StackObject.IsFixed && "Expected a locally mapped stack object"); + MF.StackObjects[StackObject.ID].LocalOffset = LocalObject.second; + } + + // Print the stack object references in the frame information class after + // converting the stack objects. + if (MFI.hasStackProtectorIndex()) { + raw_string_ostream StrOS(MF.FrameInfo.StackProtector.Value); + MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping) + .printStackObjectReference(MFI.getStackProtectorIndex()); + } + + // Print the debug variable information. 
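+ // Each entry ties a DILocalVariable, DIExpression and DILocation to a
+ // stack slot; they end up on the serialized stack object as metadata
+ // references (roughly "di-variable: '!42', di-expression: ...,
+ // di-location: ..."), resolved against the embedded LLVM IR module.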
+ for (MachineModuleInfo::VariableDbgInfo &DebugVar : + MMI.getVariableDbgInfo()) { + auto StackObjectInfo = StackObjectOperandMapping.find(DebugVar.Slot); + assert(StackObjectInfo != StackObjectOperandMapping.end() && + "Invalid stack object index"); + const FrameIndexOperand &StackObject = StackObjectInfo->second; + assert(!StackObject.IsFixed && "Expected a non-fixed stack object"); + auto &Object = MF.StackObjects[StackObject.ID]; + { + raw_string_ostream StrOS(Object.DebugVar.Value); + DebugVar.Var->printAsOperand(StrOS, MST); + } + { + raw_string_ostream StrOS(Object.DebugExpr.Value); + DebugVar.Expr->printAsOperand(StrOS, MST); + } + { + raw_string_ostream StrOS(Object.DebugLoc.Value); + DebugVar.Loc->printAsOperand(StrOS, MST); + } } } -void MIRPrinter::convert(ModuleSlotTracker &MST, - yaml::MachineBasicBlock &YamlMBB, - const MachineBasicBlock &MBB) { - assert(MBB.getNumber() >= 0 && "Invalid MBB number"); - YamlMBB.ID = (unsigned)MBB.getNumber(); - // TODO: Serialize unnamed BB references. - if (const auto *BB = MBB.getBasicBlock()) - YamlMBB.Name.Value = BB->hasName() ? BB->getName() : "<unnamed bb>"; - else - YamlMBB.Name.Value = ""; - YamlMBB.Alignment = MBB.getAlignment(); - YamlMBB.AddressTaken = MBB.hasAddressTaken(); - YamlMBB.IsLandingPad = MBB.isLandingPad(); - for (const auto *SuccMBB : MBB.successors()) { +void MIRPrinter::convert(yaml::MachineFunction &MF, + const MachineConstantPool &ConstantPool) { + unsigned ID = 0; + for (const MachineConstantPoolEntry &Constant : ConstantPool.getConstants()) { + // TODO: Serialize target specific constant pool entries. + if (Constant.isMachineConstantPoolEntry()) + llvm_unreachable("Can't print target specific constant pool entries yet"); + + yaml::MachineConstantPoolValue YamlConstant; std::string Str; raw_string_ostream StrOS(Str); - MIPrinter(StrOS, MST, RegisterMaskIds).printMBBReference(*SuccMBB); - YamlMBB.Successors.push_back(StrOS.str()); + Constant.Val.ConstVal->printAsOperand(StrOS); + YamlConstant.ID = ID++; + YamlConstant.Value = StrOS.str(); + YamlConstant.Alignment = Constant.getAlignment(); + MF.Constants.push_back(YamlConstant); } - // Print the live in registers. - const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - assert(TRI && "Expected target register info"); - for (auto I = MBB.livein_begin(), E = MBB.livein_end(); I != E; ++I) { +} + +void MIRPrinter::convert(ModuleSlotTracker &MST, + yaml::MachineJumpTable &YamlJTI, + const MachineJumpTableInfo &JTI) { + YamlJTI.Kind = JTI.getEntryKind(); + unsigned ID = 0; + for (const auto &Table : JTI.getJumpTables()) { std::string Str; - raw_string_ostream StrOS(Str); - printReg(*I, StrOS, TRI); - YamlMBB.LiveIns.push_back(StrOS.str()); - } - // Print the machine instructions. 
- YamlMBB.Instructions.reserve(MBB.size());
- std::string Str;
- for (const auto &MI : MBB) {
- raw_string_ostream StrOS(Str);
- MIPrinter(StrOS, MST, RegisterMaskIds).print(MI);
- YamlMBB.Instructions.push_back(StrOS.str());
- Str.clear();
+ yaml::MachineJumpTable::Entry Entry;
+ Entry.ID = ID++;
+ for (const auto *MBB : Table.MBBs) {
+ raw_string_ostream StrOS(Str);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printMBBReference(*MBB);
+ Entry.Blocks.push_back(StrOS.str());
+ Str.clear();
+ }
+ YamlJTI.Entries.push_back(Entry);
 }
}

@@ -260,26 +417,137 @@ void MIRPrinter::initRegisterMaskIds(const MachineFunction &MF) {
 RegisterMaskIds.insert(std::make_pair(Mask, I++));
}

+void MIPrinter::print(const MachineBasicBlock &MBB) {
+ assert(MBB.getNumber() >= 0 && "Invalid MBB number");
+ OS << "bb." << MBB.getNumber();
+ bool HasAttributes = false;
+ if (const auto *BB = MBB.getBasicBlock()) {
+ if (BB->hasName()) {
+ OS << "." << BB->getName();
+ } else {
+ HasAttributes = true;
+ OS << " (";
+ int Slot = MST.getLocalSlot(BB);
+ if (Slot == -1)
+ OS << "<ir-block badref>";
+ else
+ OS << (Twine("%ir-block.") + Twine(Slot)).str();
+ }
+ }
+ if (MBB.hasAddressTaken()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "address-taken";
+ HasAttributes = true;
+ }
+ if (MBB.isEHPad()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "landing-pad";
+ HasAttributes = true;
+ }
+ if (MBB.getAlignment()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "align " << MBB.getAlignment();
+ HasAttributes = true;
+ }
+ if (HasAttributes)
+ OS << ")";
+ OS << ":\n";
+
+ bool HasLineAttributes = false;
+ // Print the successors
+ if (!MBB.succ_empty()) {
+ OS.indent(2) << "successors: ";
+ for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) {
+ if (I != MBB.succ_begin())
+ OS << ", ";
+ printMBBReference(**I);
+ if (MBB.hasSuccessorProbabilities())
+ OS << '(' << MBB.getSuccProbability(I) << ')';
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ // Print the live in registers.
+ const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ if (!MBB.livein_empty()) {
+ OS.indent(2) << "liveins: ";
+ bool First = true;
+ for (const auto &LI : MBB.liveins()) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ printReg(LI.PhysReg, OS, TRI);
+ if (LI.LaneMask != ~0u)
+ OS << ':' << PrintLaneMask(LI.LaneMask);
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ if (HasLineAttributes)
+ OS << "\n";
+ bool IsInBundle = false;
+ for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; ++I) {
+ const MachineInstr &MI = *I;
+ if (IsInBundle && !MI.isInsideBundle()) {
+ OS.indent(2) << "}\n";
+ IsInBundle = false;
+ }
+ OS.indent(IsInBundle ? 4 : 2);
+ print(MI);
+ if (!IsInBundle && MI.getFlag(MachineInstr::BundledSucc)) {
+ OS << " {";
+ IsInBundle = true;
+ }
+ OS << "\n";
+ }
+ if (IsInBundle)
+ OS.indent(2) << "}\n";
+}
+
+/// Return true when an instruction has a tied register that can't be
+/// determined by the instruction's descriptor.
+static bool hasComplexRegisterTies(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) {
+ const auto &Operand = MI.getOperand(I);
+ if (!Operand.isReg() || Operand.isDef())
+ // Ignore the defined registers as MCID marks only the uses as tied.
+ continue;
+ int ExpectedTiedIdx = MCID.getOperandConstraint(I, MCOI::TIED_TO);
+ int TiedIdx = Operand.isTied() ?
int(MI.findTiedOperandIdx(I)) : -1; + if (ExpectedTiedIdx != TiedIdx) + return true; + } + return false; +} + void MIPrinter::print(const MachineInstr &MI) { const auto &SubTarget = MI.getParent()->getParent()->getSubtarget(); const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); assert(TII && "Expected target instruction info"); + if (MI.isCFIInstruction()) + assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); + bool ShouldPrintRegisterTies = hasComplexRegisterTies(MI); unsigned I = 0, E = MI.getNumOperands(); for (; I < E && MI.getOperand(I).isReg() && MI.getOperand(I).isDef() && !MI.getOperand(I).isImplicit(); ++I) { if (I) OS << ", "; - print(MI.getOperand(I), TRI); + print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies, /*IsDef=*/true); } if (I) OS << " = "; + if (MI.getFlag(MachineInstr::FrameSetup)) + OS << "frame-setup "; OS << TII->getName(MI.getOpcode()); - // TODO: Print the instruction flags, machine mem operands. if (I < E) OS << ' '; @@ -287,9 +555,27 @@ void MIPrinter::print(const MachineInstr &MI) { for (; I < E; ++I) { if (NeedComma) OS << ", "; - print(MI.getOperand(I), TRI); + print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies); NeedComma = true; } + + if (MI.getDebugLoc()) { + if (NeedComma) + OS << ','; + OS << " debug-location "; + MI.getDebugLoc()->printAsOperand(OS, MST); + } + + if (!MI.memoperands_empty()) { + OS << " :: "; + bool NeedComma = false; + for (const auto *Op : MI.memoperands()) { + if (NeedComma) + OS << ", "; + print(*Op); + NeedComma = true; + } + } } void MIPrinter::printMBBReference(const MachineBasicBlock &MBB) { @@ -300,32 +586,225 @@ void MIPrinter::printMBBReference(const MachineBasicBlock &MBB) { } } -void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI) { +static void printIRSlotNumber(raw_ostream &OS, int Slot) { + if (Slot == -1) + OS << "<badref>"; + else + OS << Slot; +} + +void MIPrinter::printIRBlockReference(const BasicBlock &BB) { + OS << "%ir-block."; + if (BB.hasName()) { + printLLVMNameWithoutPrefix(OS, BB.getName()); + return; + } + const Function *F = BB.getParent(); + int Slot; + if (F == MST.getCurrentFunction()) { + Slot = MST.getLocalSlot(&BB); + } else { + ModuleSlotTracker CustomMST(F->getParent(), + /*ShouldInitializeAllMetadata=*/false); + CustomMST.incorporateFunction(*F); + Slot = CustomMST.getLocalSlot(&BB); + } + printIRSlotNumber(OS, Slot); +} + +void MIPrinter::printIRValueReference(const Value &V) { + if (isa<GlobalValue>(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa<Constant>(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + printIRSlotNumber(OS, MST.getLocalSlot(&V)); +} + +void MIPrinter::printStackObjectReference(int FrameIndex) { + auto ObjectInfo = StackObjectOperandMapping.find(FrameIndex); + assert(ObjectInfo != StackObjectOperandMapping.end() && + "Invalid frame index"); + const FrameIndexOperand &Operand = ObjectInfo->second; + if (Operand.IsFixed) { + OS << "%fixed-stack." << Operand.ID; + return; + } + OS << "%stack." << Operand.ID; + if (!Operand.Name.empty()) + OS << '.' 
<< Operand.Name; +} + +void MIPrinter::printOffset(int64_t Offset) { + if (Offset == 0) + return; + if (Offset < 0) { + OS << " - " << -Offset; + return; + } + OS << " + " << Offset; +} + +static const char *getTargetFlagName(const TargetInstrInfo *TII, unsigned TF) { + auto Flags = TII->getSerializableDirectMachineOperandTargetFlags(); + for (const auto &I : Flags) { + if (I.first == TF) { + return I.second; + } + } + return nullptr; +} + +void MIPrinter::printTargetFlags(const MachineOperand &Op) { + if (!Op.getTargetFlags()) + return; + const auto *TII = + Op.getParent()->getParent()->getParent()->getSubtarget().getInstrInfo(); + assert(TII && "expected instruction info"); + auto Flags = TII->decomposeMachineOperandsTargetFlags(Op.getTargetFlags()); + OS << "target-flags("; + const bool HasDirectFlags = Flags.first; + const bool HasBitmaskFlags = Flags.second; + if (!HasDirectFlags && !HasBitmaskFlags) { + OS << "<unknown>) "; + return; + } + if (HasDirectFlags) { + if (const auto *Name = getTargetFlagName(TII, Flags.first)) + OS << Name; + else + OS << "<unknown target flag>"; + } + if (!HasBitmaskFlags) { + OS << ") "; + return; + } + bool IsCommaNeeded = HasDirectFlags; + unsigned BitMask = Flags.second; + auto BitMasks = TII->getSerializableBitmaskMachineOperandTargetFlags(); + for (const auto &Mask : BitMasks) { + // Check if the flag's bitmask has the bits of the current mask set. + if ((BitMask & Mask.first) == Mask.first) { + if (IsCommaNeeded) + OS << ", "; + IsCommaNeeded = true; + OS << Mask.second; + // Clear the bits which were serialized from the flag's bitmask. + BitMask &= ~(Mask.first); + } + } + if (BitMask) { + // When the resulting flag's bitmask isn't zero, we know that we didn't + // serialize all of the bit flags. + if (IsCommaNeeded) + OS << ", "; + OS << "<unknown bitmask target flag>"; + } + OS << ") "; +} + +static const char *getTargetIndexName(const MachineFunction &MF, int Index) { + const auto *TII = MF.getSubtarget().getInstrInfo(); + assert(TII && "expected instruction info"); + auto Indices = TII->getSerializableTargetIndices(); + for (const auto &I : Indices) { + if (I.first == Index) { + return I.second; + } + } + return nullptr; +} + +void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, + unsigned I, bool ShouldPrintRegisterTies, bool IsDef) { + printTargetFlags(Op); switch (Op.getType()) { case MachineOperand::MO_Register: - // TODO: Print the other register flags. if (Op.isImplicit()) OS << (Op.isDef() ? "implicit-def " : "implicit "); + else if (!IsDef && Op.isDef()) + // Print the 'def' flag only when the operand is defined after '='. + OS << "def "; + if (Op.isInternalRead()) + OS << "internal "; if (Op.isDead()) OS << "dead "; if (Op.isKill()) OS << "killed "; if (Op.isUndef()) OS << "undef "; + if (Op.isEarlyClobber()) + OS << "early-clobber "; + if (Op.isDebug()) + OS << "debug-use "; printReg(Op.getReg(), OS, TRI); // Print the sub register. 
if (Op.getSubReg() != 0) OS << ':' << TRI->getSubRegIndexName(Op.getSubReg()); + if (ShouldPrintRegisterTies && Op.isTied() && !Op.isDef()) + OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(I) << ")"; break; case MachineOperand::MO_Immediate: OS << Op.getImm(); break; + case MachineOperand::MO_CImmediate: + Op.getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); + break; + case MachineOperand::MO_FPImmediate: + Op.getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST); + break; case MachineOperand::MO_MachineBasicBlock: printMBBReference(*Op.getMBB()); break; + case MachineOperand::MO_FrameIndex: + printStackObjectReference(Op.getIndex()); + break; + case MachineOperand::MO_ConstantPoolIndex: + OS << "%const." << Op.getIndex(); + printOffset(Op.getOffset()); + break; + case MachineOperand::MO_TargetIndex: { + OS << "target-index("; + if (const auto *Name = getTargetIndexName( + *Op.getParent()->getParent()->getParent(), Op.getIndex())) + OS << Name; + else + OS << "<unknown>"; + OS << ')'; + printOffset(Op.getOffset()); + break; + } + case MachineOperand::MO_JumpTableIndex: + OS << "%jump-table." << Op.getIndex(); + break; + case MachineOperand::MO_ExternalSymbol: + OS << '$'; + printLLVMNameWithoutPrefix(OS, Op.getSymbolName()); + printOffset(Op.getOffset()); + break; case MachineOperand::MO_GlobalAddress: Op.getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST); - // TODO: Print offset and target flags. + printOffset(Op.getOffset()); + break; + case MachineOperand::MO_BlockAddress: + OS << "blockaddress("; + Op.getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false, + MST); + OS << ", "; + printIRBlockReference(*Op.getBlockAddress()->getBasicBlock()); + OS << ')'; + printOffset(Op.getOffset()); break; case MachineOperand::MO_RegisterMask: { auto RegMaskInfo = RegisterMaskIds.find(Op.getRegMask()); @@ -335,9 +814,157 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI) { llvm_unreachable("Can't print this machine register mask yet."); break; } + case MachineOperand::MO_RegisterLiveOut: { + const uint32_t *RegMask = Op.getRegLiveOut(); + OS << "liveout("; + bool IsCommaNeeded = false; + for (unsigned Reg = 0, E = TRI->getNumRegs(); Reg < E; ++Reg) { + if (RegMask[Reg / 32] & (1U << (Reg % 32))) { + if (IsCommaNeeded) + OS << ", "; + printReg(Reg, OS, TRI); + IsCommaNeeded = true; + } + } + OS << ")"; + break; + } + case MachineOperand::MO_Metadata: + Op.getMetadata()->printAsOperand(OS, MST); + break; + case MachineOperand::MO_MCSymbol: + OS << "<mcsymbol " << *Op.getMCSymbol() << ">"; + break; + case MachineOperand::MO_CFIIndex: { + const auto &MMI = Op.getParent()->getParent()->getParent()->getMMI(); + print(MMI.getFrameInstructions()[Op.getCFIIndex()], TRI); + break; + } + } +} + +void MIPrinter::print(const MachineMemOperand &Op) { + OS << '('; + // TODO: Print operand's target specific flags. + if (Op.isVolatile()) + OS << "volatile "; + if (Op.isNonTemporal()) + OS << "non-temporal "; + if (Op.isInvariant()) + OS << "invariant "; + if (Op.isLoad()) + OS << "load "; + else { + assert(Op.isStore() && "Non load machine operand must be a store"); + OS << "store "; + } + OS << Op.getSize() << (Op.isLoad() ? 
" from " : " into "); + if (const Value *Val = Op.getValue()) { + printIRValueReference(*Val); + } else { + const PseudoSourceValue *PVal = Op.getPseudoValue(); + assert(PVal && "Expected a pseudo source value"); + switch (PVal->kind()) { + case PseudoSourceValue::Stack: + OS << "stack"; + break; + case PseudoSourceValue::GOT: + OS << "got"; + break; + case PseudoSourceValue::JumpTable: + OS << "jump-table"; + break; + case PseudoSourceValue::ConstantPool: + OS << "constant-pool"; + break; + case PseudoSourceValue::FixedStack: + printStackObjectReference( + cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex()); + break; + case PseudoSourceValue::GlobalValueCallEntry: + OS << "call-entry "; + cast<GlobalValuePseudoSourceValue>(PVal)->getValue()->printAsOperand( + OS, /*PrintType=*/false, MST); + break; + case PseudoSourceValue::ExternalSymbolCallEntry: + OS << "call-entry $"; + printLLVMNameWithoutPrefix( + OS, cast<ExternalSymbolPseudoSourceValue>(PVal)->getSymbol()); + break; + } + } + printOffset(Op.getOffset()); + if (Op.getBaseAlignment() != Op.getSize()) + OS << ", align " << Op.getBaseAlignment(); + auto AAInfo = Op.getAAInfo(); + if (AAInfo.TBAA) { + OS << ", !tbaa "; + AAInfo.TBAA->printAsOperand(OS, MST); + } + if (AAInfo.Scope) { + OS << ", !alias.scope "; + AAInfo.Scope->printAsOperand(OS, MST); + } + if (AAInfo.NoAlias) { + OS << ", !noalias "; + AAInfo.NoAlias->printAsOperand(OS, MST); + } + if (Op.getRanges()) { + OS << ", !range "; + Op.getRanges()->printAsOperand(OS, MST); + } + OS << ')'; +} + +static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, + const TargetRegisterInfo *TRI) { + int Reg = TRI->getLLVMRegNum(DwarfReg, true); + if (Reg == -1) { + OS << "<badreg>"; + return; + } + printReg(Reg, OS, TRI); +} + +void MIPrinter::print(const MCCFIInstruction &CFI, + const TargetRegisterInfo *TRI) { + switch (CFI.getOperation()) { + case MCCFIInstruction::OpSameValue: + OS << ".cfi_same_value "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpOffset: + OS << ".cfi_offset "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", " << CFI.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: + OS << ".cfi_def_cfa_register "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpDefCfaOffset: + OS << ".cfi_def_cfa_offset "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + OS << CFI.getOffset(); + break; + case MCCFIInstruction::OpDefCfa: + OS << ".cfi_def_cfa "; + if (CFI.getLabel()) + OS << "<mcsymbol> "; + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", " << CFI.getOffset(); + break; default: - // TODO: Print the other machine operands. - llvm_unreachable("Can't print this machine operand at the moment"); + // TODO: Print the other CFI Operations. 
+ OS << "<unserializable cfi operation>"; + break; } } diff --git a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp index 13d61e6..8e7566a 100644 --- a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp +++ b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp @@ -40,7 +40,7 @@ struct MIRPrintingPass : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } - virtual bool runOnMachineFunction(MachineFunction &MF) override { + bool runOnMachineFunction(MachineFunction &MF) override { std::string Str; raw_string_ostream StrOS(Str); printMIR(StrOS, MF); @@ -48,7 +48,7 @@ struct MIRPrintingPass : public MachineFunctionPass { return false; } - virtual bool doFinalization(Module &M) override { + bool doFinalization(Module &M) override { printMIR(OS, M); OS << MachineFunctions; return false; diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp index 5d3f7eb..85d544d 100644 --- a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -38,22 +39,21 @@ using namespace llvm; #define DEBUG_TYPE "codegen" -MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb) - : BB(bb), Number(-1), xParent(&mf), Alignment(0), IsLandingPad(false), - AddressTaken(false), CachedMCSymbol(nullptr) { +MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B) + : BB(B), Number(-1), xParent(&MF) { Insts.Parent = this; } MachineBasicBlock::~MachineBasicBlock() { } -/// getSymbol - Return the MCSymbol for this basic block. -/// +/// Return the MCSymbol for this basic block. MCSymbol *MachineBasicBlock::getSymbol() const { if (!CachedMCSymbol) { const MachineFunction *MF = getParent(); MCContext &Ctx = MF->getContext(); const char *Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix(); + assert(getNumber() >= 0 && "cannot get label for unreachable MBB"); CachedMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB" + Twine(MF->getFunctionNumber()) + "_" + Twine(getNumber())); @@ -68,9 +68,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) { return OS; } -/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the -/// parent pointer of the MBB, the MBB numbering, and any instructions in the -/// MBB to be on the right operand list for registers. +/// When an MBB is added to an MF, we need to update the parent pointer of the +/// MBB, the MBB numbering, and any instructions in the MBB to be on the right +/// operand list for registers. /// /// MBBs start out as #-1. When a MBB is added to a MachineFunction, it /// gets the next available unique MBB number. If it is removed from a @@ -91,10 +91,8 @@ void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) { N->Number = -1; } - -/// addNodeToList (MI) - When we add an instruction to a basic block -/// list, we update its parent pointer and add its operands from reg use/def -/// lists if appropriate. +/// When we add an instruction to a basic block list, we update its parent +/// pointer and add its operands from reg use/def lists if appropriate. 
void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) { assert(!N->getParent() && "machine instruction already in a basic block"); N->setParent(Parent); @@ -105,9 +103,8 @@ void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) { N->AddRegOperandsToUseLists(MF->getRegInfo()); } -/// removeNodeFromList (MI) - When we remove an instruction from a basic block -/// list, we update its parent pointer and remove its operands from reg use/def -/// lists if appropriate. +/// When we remove an instruction from a basic block list, we update its parent +/// pointer and remove its operands from reg use/def lists if appropriate. void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) { assert(N->getParent() && "machine instruction not in a basic block"); @@ -118,23 +115,22 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) { N->setParent(nullptr); } -/// transferNodesFromList (MI) - When moving a range of instructions from one -/// MBB list to another, we need to update the parent pointers and the use/def -/// lists. +/// When moving a range of instructions from one MBB list to another, we need to +/// update the parent pointers and the use/def lists. void ilist_traits<MachineInstr>:: -transferNodesFromList(ilist_traits<MachineInstr> &fromList, - ilist_iterator<MachineInstr> first, - ilist_iterator<MachineInstr> last) { - assert(Parent->getParent() == fromList.Parent->getParent() && +transferNodesFromList(ilist_traits<MachineInstr> &FromList, + ilist_iterator<MachineInstr> First, + ilist_iterator<MachineInstr> Last) { + assert(Parent->getParent() == FromList.Parent->getParent() && "MachineInstr parent mismatch!"); // Splice within the same MBB -> no change. - if (Parent == fromList.Parent) return; + if (Parent == FromList.Parent) return; // If splicing between two blocks within the same function, just update the // parent pointers. 
- for (; first != last; ++first) - first->setParent(Parent); + for (; First != Last; ++First) + First->setParent(Parent); } void ilist_traits<MachineInstr>::deleteNode(MachineInstr* MI) { @@ -208,11 +204,18 @@ const MachineBasicBlock *MachineBasicBlock::getLandingPadSuccessor() const { if (succ_size() > 2) return nullptr; for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I) - if ((*I)->isLandingPad()) + if ((*I)->isEHPad()) return *I; return nullptr; } +bool MachineBasicBlock::hasEHPadSuccessor() const { + for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I) + if ((*I)->isEHPad()) + return true; + return false; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineBasicBlock::dump() const { print(dbgs()); @@ -271,7 +274,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, LBB->printAsOperand(OS, /*PrintType=*/false, MST); Comma = ", "; } - if (isLandingPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; } + if (isEHPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; } if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; } if (Alignment) OS << Comma << "Align " << Alignment << " (" << (1u << Alignment) @@ -283,8 +286,11 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (!livein_empty()) { if (Indexes) OS << '\t'; OS << " Live Ins:"; - for (livein_iterator I = livein_begin(),E = livein_end(); I != E; ++I) - OS << ' ' << PrintReg(*I, TRI); + for (const auto &LI : make_range(livein_begin(), livein_end())) { + OS << ' ' << PrintReg(LI.PhysReg, TRI); + if (LI.LaneMask != ~0u) + OS << ':' << PrintLaneMask(LI.LaneMask); + } OS << '\n'; } // Print the preds of this block according to the CFG. @@ -298,8 +304,8 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, for (const_instr_iterator I = instr_begin(); I != instr_end(); ++I) { if (Indexes) { - if (Indexes->hasIndex(I)) - OS << Indexes->getInstructionIndex(I); + if (Indexes->hasIndex(&*I)) + OS << Indexes->getInstructionIndex(&*I); OS << '\t'; } OS << '\t'; @@ -314,35 +320,63 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << " Successors according to CFG:"; for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << " BB#" << (*SI)->getNumber(); - if (!Weights.empty()) - OS << '(' << *getWeightIterator(SI) << ')'; + if (!Probs.empty()) + OS << '(' << *getProbabilityIterator(SI) << ')'; } OS << '\n'; } } -void MachineBasicBlock::printAsOperand(raw_ostream &OS, bool /*PrintType*/) const { +void MachineBasicBlock::printAsOperand(raw_ostream &OS, + bool /*PrintType*/) const { OS << "BB#" << getNumber(); } -void MachineBasicBlock::removeLiveIn(unsigned Reg) { - std::vector<unsigned>::iterator I = - std::find(LiveIns.begin(), LiveIns.end(), Reg); - if (I != LiveIns.end()) +void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) { + LiveInVector::iterator I = std::find_if( + LiveIns.begin(), LiveIns.end(), + [Reg] (const RegisterMaskPair &LI) { return LI.PhysReg == Reg; }); + if (I == LiveIns.end()) + return; + + I->LaneMask &= ~LaneMask; + if (I->LaneMask == 0) LiveIns.erase(I); } -bool MachineBasicBlock::isLiveIn(unsigned Reg) const { - livein_iterator I = std::find(livein_begin(), livein_end(), Reg); - return I != livein_end(); +bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const { + livein_iterator I = std::find_if( + LiveIns.begin(), LiveIns.end(), + [Reg] (const RegisterMaskPair &LI) { return LI.PhysReg == 
Reg; });
+ return I != livein_end() && (I->LaneMask & LaneMask) != 0;
+}
+
+void MachineBasicBlock::sortUniqueLiveIns() {
+ std::sort(LiveIns.begin(), LiveIns.end(),
+ [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
+ return LI0.PhysReg < LI1.PhysReg;
+ });
+ // Liveins are sorted by physreg; now we can merge their lanemasks.
+ LiveInVector::const_iterator I = LiveIns.begin();
+ LiveInVector::const_iterator J;
+ LiveInVector::iterator Out = LiveIns.begin();
+ for (; I != LiveIns.end(); ++Out, I = J) {
+ unsigned PhysReg = I->PhysReg;
+ LaneBitmask LaneMask = I->LaneMask;
+ for (J = std::next(I); J != LiveIns.end() && J->PhysReg == PhysReg; ++J)
+ LaneMask |= J->LaneMask;
+ Out->PhysReg = PhysReg;
+ Out->LaneMask = LaneMask;
+ }
+ LiveIns.erase(Out, LiveIns.end());
+}

unsigned
-MachineBasicBlock::addLiveIn(unsigned PhysReg, const TargetRegisterClass *RC) {
+MachineBasicBlock::addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC) {
 assert(getParent() && "MBB must be inserted in function");
 assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Expected physreg");
 assert(RC && "Register class is required");
- assert((isLandingPad() || this == &getParent()->front()) &&
+ assert((isEHPad() || this == &getParent()->front()) &&
 "Only the entry block and landing pads can have physreg live ins");

 bool LiveIn = isLiveIn(PhysReg);
@@ -370,12 +404,11 @@ MachineBasicBlock::addLiveIn(unsigned PhysReg, const TargetRegisterClass *RC) {
}

void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
- getParent()->splice(NewAfter, this);
+ getParent()->splice(NewAfter->getIterator(), getIterator());
}

void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
- MachineFunction::iterator BBI = NewBefore;
- getParent()->splice(++BBI, this);
+ getParent()->splice(++NewBefore->getIterator(), getIterator());
}

void MachineBasicBlock::updateTerminator() {
@@ -385,7 +418,7 @@ void MachineBasicBlock::updateTerminator() {
 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
 SmallVector<MachineOperand, 4> Cond;
- DebugLoc dl; // FIXME: this is nowhere
+ DebugLoc DL; // FIXME: this is nowhere
 bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond);
 (void) B;
 assert(!B && "UpdateTerminators requires analyzable predecessors!");
@@ -400,7 +433,7 @@ void MachineBasicBlock::updateTerminator() {
 // its layout successor, insert a branch. First we have to locate the
 // only non-landing-pad successor, as that is the fallthrough block.
 for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
- if ((*SI)->isLandingPad())
+ if ((*SI)->isEHPad())
 continue;
 assert(!TBB && "Found more than one non-landing-pad successor!");
 TBB = *SI;
@@ -414,7 +447,7 @@ void MachineBasicBlock::updateTerminator() {
 // Finally update the unconditional successor to be reached via a branch
 // if it would not be reached by fallthrough.
 if (!isLayoutSuccessor(TBB))
- TII->InsertBranch(*this, TBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
 }
 } else {
 if (FBB) {
@@ -425,10 +458,10 @@ void MachineBasicBlock::updateTerminator() {
 if (TII->ReverseBranchCondition(Cond))
 return;
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, FBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, FBB, nullptr, Cond, DL);
 } else if (isLayoutSuccessor(FBB)) {
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, TBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
 }
 } else {
 // Walk through the successors and find the successor which is not
@@ -436,7 +469,7 @@ void MachineBasicBlock::updateTerminator() {
 // as the fallthrough successor.
 MachineBasicBlock *FallthroughBB = nullptr;
 for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
- if ((*SI)->isLandingPad() || *SI == TBB)
+ if ((*SI)->isEHPad() || *SI == TBB)
 continue;
 assert(!FallthroughBB && "Found more than one fallthrough successor.");
 FallthroughBB = *SI;
@@ -445,14 +478,14 @@ void MachineBasicBlock::updateTerminator() {
 // We fallthrough to the same basic block as the conditional jump
 // targets. Remove the conditional jump, leaving unconditional
 // fallthrough.
- // FIXME: This does not seem like a reasonable pattern to support, but it
- // has been seen in the wild coming out of degenerate ARM test cases.
+ // FIXME: This does not seem like a reasonable pattern to support, but
+ // it has been seen in the wild coming out of degenerate ARM test cases.
 TII->RemoveBranch(*this);

 // Finally update the unconditional successor to be reached via a branch
 // if it would not be reached by fallthrough.
 if (!isLayoutSuccessor(TBB))
- TII->InsertBranch(*this, TBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
 return;
 }
@@ -461,55 +494,69 @@ void MachineBasicBlock::updateTerminator() {
 if (TII->ReverseBranchCondition(Cond)) {
 // We can't reverse the condition, add an unconditional branch.
 Cond.clear();
- TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
 return;
 }
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, dl);
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
 } else if (!isLayoutSuccessor(FallthroughBB)) {
 TII->RemoveBranch(*this);
- TII->InsertBranch(*this, TBB, FallthroughBB, Cond, dl);
+ TII->InsertBranch(*this, TBB, FallthroughBB, Cond, DL);
 }
 }
 }
}

-void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ, uint32_t weight) {
-
- // If we see non-zero value for the first time it means we actually use Weight
- // list, so we fill all Weights with 0's.
- if (weight != 0 && Weights.empty())
- Weights.resize(Successors.size());
-
- if (weight != 0 || !Weights.empty())
- Weights.push_back(weight);
-
- Successors.push_back(succ);
- succ->addPredecessor(this);
- }
+void MachineBasicBlock::validateSuccProbs() const {
+#ifndef NDEBUG
+ int64_t Sum = 0;
+ for (auto Prob : Probs)
+ Sum += Prob.getNumerator();
+ // Due to precision issues, we assume that the sum of probabilities is one if
+ // the difference between the sum of their numerators and the denominator is
+ // no greater than the number of successors.
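To see the tolerance concretely: with fixed-point numerators over a 2^31 denominator (the representation BranchProbability uses in this revision), three successors of probability 1/3 each round to numerator 715827883, and the numerators sum to 2^31 + 1 — off by exactly one unit, within the one-unit-per-successor slack the assertion below allows. A small self-contained check of that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t D = UINT64_C(1) << 31; // the fixed-point denominator
      const uint64_t Third = (D + 1) / 3;   // round(2^31 / 3) = 715827883
      uint64_t Sum = 3 * Third;             // 2147483649 == D + 1

      // Off by one unit, inside the "number of successors" slack granted
      // for rounding error.
      uint64_t Diff = Sum > D ? Sum - D : D - Sum;
      assert(Diff <= 3);
      return 0;
    }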
+ assert((uint64_t)std::abs(Sum - BranchProbability::getDenominator()) <=
+ Probs.size() &&
+ "The sum of successors' probabilities exceeds one.");
+#endif // NDEBUG
+}

-void MachineBasicBlock::removeSuccessor(MachineBasicBlock *succ) {
- succ->removePredecessor(this);
- succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
- assert(I != Successors.end() && "Not a current successor!");
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
+ BranchProbability Prob) {
+ // Probability list is either empty (if successor list isn't empty, this means
+ // disabled optimization) or has the same size as successor list.
+ if (!(Probs.empty() && !Successors.empty()))
+ Probs.push_back(Prob);
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}

- // If Weight list is empty it means we don't use it (disabled optimization).
- if (!Weights.empty()) {
- weight_iterator WI = getWeightIterator(I);
- Weights.erase(WI);
- }
+void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
+ // We need to make sure the probability list is either empty or has the same
+ // size as the successor list. When this function is called, we can safely
+ // delete all probabilities in the list.
+ Probs.clear();
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}

- Successors.erase(I);
+void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ,
+ bool NormalizeSuccProbs) {
+ succ_iterator I = std::find(Successors.begin(), Successors.end(), Succ);
+ removeSuccessor(I, NormalizeSuccProbs);
}

MachineBasicBlock::succ_iterator
-MachineBasicBlock::removeSuccessor(succ_iterator I) {
+MachineBasicBlock::removeSuccessor(succ_iterator I, bool NormalizeSuccProbs) {
 assert(I != Successors.end() && "Not a current successor!");

- // If Weight list is empty it means we don't use it (disabled optimization).
- if (!Weights.empty()) {
- weight_iterator WI = getWeightIterator(I);
- Weights.erase(WI);
+ // If probability list is empty it means we don't use it (disabled
+ // optimization).
+ if (!Probs.empty()) {
+ probability_iterator WI = getProbabilityIterator(I);
+ Probs.erase(WI);
+ if (NormalizeSuccProbs)
+ normalizeSuccProbs();
 }

 (*I)->removePredecessor(this);
@@ -537,74 +584,77 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
 }
 }
 assert(OldI != E && "Old is not a successor of this block");
- Old->removePredecessor(this);

 // If New isn't already a successor, let it take Old's place.
 if (NewI == E) {
+ Old->removePredecessor(this);
 New->addPredecessor(this);
 *OldI = New;
 return;
 }

 // New is already a successor.
- // Update its weight instead of adding a duplicate edge.
+ // Update its probability instead of adding a duplicate edge.
+ if (!Probs.empty()) { + auto ProbIter = getProbabilityIterator(NewI); + if (!ProbIter->isUnknown()) + *ProbIter += *getProbabilityIterator(OldI); } - Successors.erase(OldI); + removeSuccessor(OldI); } -void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) { - Predecessors.push_back(pred); +void MachineBasicBlock::addPredecessor(MachineBasicBlock *Pred) { + Predecessors.push_back(Pred); } -void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) { - pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), pred); +void MachineBasicBlock::removePredecessor(MachineBasicBlock *Pred) { + pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), Pred); assert(I != Predecessors.end() && "Pred is not a predecessor of this block!"); Predecessors.erase(I); } -void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) { - if (this == fromMBB) +void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) { + if (this == FromMBB) return; - while (!fromMBB->succ_empty()) { - MachineBasicBlock *Succ = *fromMBB->succ_begin(); - uint32_t Weight = 0; + while (!FromMBB->succ_empty()) { + MachineBasicBlock *Succ = *FromMBB->succ_begin(); - // If Weight list is empty it means we don't use it (disabled optimization). - if (!fromMBB->Weights.empty()) - Weight = *fromMBB->Weights.begin(); + // If probability list is empty it means we don't use it (disabled optimization). + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); - addSuccessor(Succ, Weight); - fromMBB->removeSuccessor(Succ); + FromMBB->removeSuccessor(Succ); } } void -MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { - if (this == fromMBB) +MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) { + if (this == FromMBB) return; - while (!fromMBB->succ_empty()) { - MachineBasicBlock *Succ = *fromMBB->succ_begin(); - uint32_t Weight = 0; - if (!fromMBB->Weights.empty()) - Weight = *fromMBB->Weights.begin(); - addSuccessor(Succ, Weight); - fromMBB->removeSuccessor(Succ); + while (!FromMBB->succ_empty()) { + MachineBasicBlock *Succ = *FromMBB->succ_begin(); + if (!FromMBB->Probs.empty()) { + auto Prob = *FromMBB->Probs.begin(); + addSuccessor(Succ, Prob); + } else + addSuccessorWithoutProb(Succ); + FromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. for (MachineBasicBlock::instr_iterator MI = Succ->instr_begin(), ME = Succ->instr_end(); MI != ME && MI->isPHI(); ++MI) for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) { MachineOperand &MO = MI->getOperand(i); - if (MO.getMBB() == fromMBB) + if (MO.getMBB() == FromMBB) MO.setMBB(this); } } + normalizeSuccProbs(); } bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const { @@ -621,14 +671,14 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const { } bool MachineBasicBlock::canFallThrough() { - MachineFunction::iterator Fallthrough = this; + MachineFunction::iterator Fallthrough = getIterator(); ++Fallthrough; // If FallthroughBlock is off the end of the function, it can't fall through. if (Fallthrough == getParent()->end()) return false; // If FallthroughBlock isn't a successor, no fallthrough is possible. - if (!isSuccessor(Fallthrough)) + if (!isSuccessor(&*Fallthrough)) return false; // Analyze the branches, if any, at the end of the block. 
@@ -666,11 +716,11 @@ MachineBasicBlock * MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // Splitting the critical edge to a landing pad block is non-trivial. Don't do // it in this generic function. - if (Succ->isLandingPad()) + if (Succ->isEHPad()) return nullptr; MachineFunction *MF = getParent(); - DebugLoc dl; // FIXME: this is nowhere + DebugLoc DL; // FIXME: this is nowhere // Performance might be harmed on HW that implements branching using exec mask // where both sides of the branches are always executed. @@ -719,7 +769,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (LV) for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) { - MachineInstr *MI = I; + MachineInstr *MI = &*I; for (MachineInstr::mop_iterator OI = MI->operands_begin(), OE = MI->operands_end(); OI != OE; ++OI) { if (!OI->isReg() || OI->getReg() == 0 || @@ -739,7 +789,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (LIS) { for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) { - MachineInstr *MI = I; + MachineInstr *MI = &*I; for (MachineInstr::mop_iterator OI = MI->operands_begin(), OE = MI->operands_end(); OI != OE; ++OI) { @@ -761,7 +811,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (Indexes) { for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) - Terminators.push_back(I); + Terminators.push_back(&*I); } updateTerminator(); @@ -770,7 +820,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { SmallVector<MachineInstr*, 4> NewTerminators; for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); I != E; ++I) - NewTerminators.push_back(I); + NewTerminators.push_back(&*I); for (SmallVectorImpl<MachineInstr*>::iterator I = Terminators.begin(), E = Terminators.end(); I != E; ++I) { @@ -784,17 +834,16 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { NMBB->addSuccessor(Succ); if (!NMBB->isLayoutSuccessor(Succ)) { Cond.clear(); - MF->getSubtarget().getInstrInfo()->InsertBranch(*NMBB, Succ, nullptr, Cond, - dl); + TII->InsertBranch(*NMBB, Succ, nullptr, Cond, DL); if (Indexes) { for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end(); I != E; ++I) { // Some instructions may have been moved to NMBB by updateTerminator(), // so we first remove any instruction that already has an index. - if (Indexes->hasIndex(I)) - Indexes->removeMachineInstrFromMaps(I); - Indexes->insertMachineInstrInMaps(I); + if (Indexes->hasIndex(&*I)) + Indexes->removeMachineInstrFromMaps(&*I); + Indexes->insertMachineInstrInMaps(&*I); } } } @@ -808,9 +857,8 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { i->getOperand(ni+1).setMBB(NMBB); // Inherit live-ins from the successor - for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(), - E = Succ->livein_end(); I != E; ++I) - NMBB->addLiveIn(*I); + for (const auto &LI : Succ->liveins()) + NMBB->addLiveIn(LI); // Update LiveVariables. 
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -822,7 +870,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (!(--I)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false)) continue; if (TargetRegisterInfo::isVirtualRegister(Reg)) - LV->getVarInfo(Reg).Kills.push_back(I); + LV->getVarInfo(Reg).Kills.push_back(&*I); DEBUG(dbgs() << "Restored terminator kill: " << *I); break; } @@ -834,10 +882,10 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (LIS) { // After splitting the edge and updating SlotIndexes, live intervals may be // in one of two situations, depending on whether this block was the last in - // the function. If the original block was the last in the function, all live - // intervals will end prior to the beginning of the new split block. If the - // original block was not at the end of the function, all live intervals will - // extend to the end of the new split block. + // the function. If the original block was the last in the function, all + // live intervals will end prior to the beginning of the new split block. If + // the original block was not at the end of the function, all live intervals + // will extend to the end of the new split block. bool isLastMBB = std::next(MachineFunction::iterator(NMBB)) == getParent()->end(); @@ -861,7 +909,8 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { LiveInterval &LI = LIS->getInterval(Reg); VNInfo *VNI = LI.getVNInfoAt(PrevIndex); - assert(VNI && "PHI sources should be live out of their predecessors."); + assert(VNI && + "PHI sources should be live out of their predecessors."); LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI)); } } @@ -941,7 +990,7 @@ static void unbundleSingleMI(MachineInstr *MI) { MachineBasicBlock::instr_iterator MachineBasicBlock::erase(MachineBasicBlock::instr_iterator I) { - unbundleSingleMI(I); + unbundleSingleMI(&*I); return Insts.erase(I); } @@ -964,25 +1013,22 @@ MachineBasicBlock::insert(instr_iterator I, MachineInstr *MI) { return Insts.insert(I, MI); } -/// removeFromParent - This method unlinks 'this' from the containing function, -/// and returns it, but does not delete it. +/// This method unlinks 'this' from the containing function, and returns it, but +/// does not delete it. MachineBasicBlock *MachineBasicBlock::removeFromParent() { assert(getParent() && "Not embedded in a function!"); getParent()->remove(this); return this; } - -/// eraseFromParent - This method unlinks 'this' from the containing function, -/// and deletes it. +/// This method unlinks 'this' from the containing function, and deletes it. void MachineBasicBlock::eraseFromParent() { assert(getParent() && "Not embedded in a function!"); getParent()->erase(this); } - -/// ReplaceUsesOfBlockWith - Given a machine basic block that branched to -/// 'Old', change the code and CFG so that it branches to 'New' instead. +/// Given a machine basic block that branched to 'Old', change the code and CFG +/// so that it branches to 'New' instead. void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old, MachineBasicBlock *New) { assert(Old != New && "Cannot replace self with self!"); @@ -1004,46 +1050,44 @@ void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old, replaceSuccessor(Old, New); } -/// CorrectExtraCFGEdges - Various pieces of code can cause excess edges in the -/// CFG to be inserted. 
If we have proven that MBB can only branch to DestA and -/// DestB, remove any other MBB successors from the CFG. DestA and DestB can be -/// null. +/// Various pieces of code can cause excess edges in the CFG to be inserted. If +/// we have proven that MBB can only branch to DestA and DestB, remove any other +/// MBB successors from the CFG. DestA and DestB can be null. /// /// Besides DestA and DestB, retain other edges leading to LandingPads /// (currently there can be only one; we don't check or require that here). /// Note it is possible that DestA and/or DestB are LandingPads. bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, MachineBasicBlock *DestB, - bool isCond) { + bool IsCond) { // The values of DestA and DestB frequently come from a call to the // 'TargetInstrInfo::AnalyzeBranch' method. We take our meaning of the initial // values from there. // // 1. If both DestA and DestB are null, then the block ends with no branches // (it falls through to its successor). - // 2. If DestA is set, DestB is null, and isCond is false, then the block ends + // 2. If DestA is set, DestB is null, and IsCond is false, then the block ends // with only an unconditional branch. - // 3. If DestA is set, DestB is null, and isCond is true, then the block ends + // 3. If DestA is set, DestB is null, and IsCond is true, then the block ends // with a conditional branch that falls through to a successor (DestB). - // 4. If DestA and DestB is set and isCond is true, then the block ends with a + // 4. If DestA and DestB is set and IsCond is true, then the block ends with a // conditional branch followed by an unconditional branch. DestA is the // 'true' destination and DestB is the 'false' destination. bool Changed = false; - MachineFunction::iterator FallThru = - std::next(MachineFunction::iterator(this)); + MachineFunction::iterator FallThru = std::next(getIterator()); if (!DestA && !DestB) { // Block falls through to successor. - DestA = FallThru; - DestB = FallThru; + DestA = &*FallThru; + DestB = &*FallThru; } else if (DestA && !DestB) { - if (isCond) + if (IsCond) // Block ends in conditional jump that falls through to successor. - DestB = FallThru; + DestB = &*FallThru; } else { - assert(DestA && DestB && isCond && + assert(DestA && DestB && IsCond && "CFG in a bad state. Cannot correct CFG edges"); } @@ -1054,7 +1098,7 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, while (SI != succ_end()) { const MachineBasicBlock *MBB = *SI; if (!SeenMBBs.insert(MBB).second || - (MBB != DestA && MBB != DestB && !MBB->isLandingPad())) { + (MBB != DestA && MBB != DestB && !MBB->isEHPad())) { // This is a superfluous edge, remove it. SI = removeSuccessor(SI); Changed = true; @@ -1063,11 +1107,13 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, } } + if (Changed) + normalizeSuccProbs(); return Changed; } -/// findDebugLoc - find the next valid DebugLoc starting at MBBI, skipping -/// any DBG_VALUE instructions. Return UnknownLoc if there is none. +/// Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE +/// instructions. Return UnknownLoc if there is none. DebugLoc MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { DebugLoc DL; @@ -1083,45 +1129,60 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { return DL; } -/// getSuccWeight - Return weight of the edge from this block to MBB. 
-///
-uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const {
- if (Weights.empty())
- return 0;
-
- return *getWeightIterator(Succ);
+/// Return probability of the edge from this block to MBB.
+BranchProbability
+MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
+ if (Probs.empty())
+ return BranchProbability(1, succ_size());
+
+ const auto &Prob = *getProbabilityIterator(Succ);
+ if (Prob.isUnknown()) {
+ // For unknown probabilities, collect the sum of all known ones, and evenly
+ // distribute the complement of the sum to each unknown probability.
+ unsigned KnownProbNum = 0;
+ auto Sum = BranchProbability::getZero();
+ for (auto &P : Probs) {
+ if (!P.isUnknown()) {
+ Sum += P;
+ KnownProbNum++;
+ }
+ }
+ return Sum.getCompl() / (Probs.size() - KnownProbNum);
+ } else
+ return Prob;
}

-/// Set successor weight of a given iterator.
-void MachineBasicBlock::setSuccWeight(succ_iterator I, uint32_t weight) {
- if (Weights.empty())
+/// Set successor probability of a given iterator.
+void MachineBasicBlock::setSuccProbability(succ_iterator I,
+ BranchProbability Prob) {
+ assert(!Prob.isUnknown());
+ if (Probs.empty())
 return;
- *getWeightIterator(I) = weight;
+ *getProbabilityIterator(I) = Prob;
}

-/// getWeightIterator - Return wight iterator corresonding to the I successor
-/// iterator
-MachineBasicBlock::weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::succ_iterator I) {
- assert(Weights.size() == Successors.size() && "Async weight list!");
- size_t index = std::distance(Successors.begin(), I);
- assert(index < Weights.size() && "Not a current successor!");
- return Weights.begin() + index;
+/// Return probability iterator corresponding to the I successor iterator
+MachineBasicBlock::const_probability_iterator
+MachineBasicBlock::getProbabilityIterator(
+ MachineBasicBlock::const_succ_iterator I) const {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
+ const size_t index = std::distance(Successors.begin(), I);
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
}

-/// getWeightIterator - Return wight iterator corresonding to the I successor
-/// iterator
-MachineBasicBlock::const_weight_iterator MachineBasicBlock::
-getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
- assert(Weights.size() == Successors.size() && "Async weight list!");
+/// Return probability iterator corresponding to the I successor iterator.
+MachineBasicBlock::probability_iterator
+MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
 const size_t index = std::distance(Successors.begin(), I);
- assert(index < Weights.size() && "Not a current successor!");
- return Weights.begin() + index;
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
}

/// Return whether (physical) register "Reg" has been <def>ined and not <kill>ed
/// as of just before "MI".
-///
+///
/// Search is localised to a neighborhood of
/// Neighborhood instructions before (searching for defs or kills) and N
/// instructions after (searching just for defs) MI.
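The neighborhood search this comment describes — and which the hunk below rewrites in terms of the new PhysRegInfo fields — scans backwards first and stops at the first event that decides the question: a dead def or a kill proves the register dead, any other def or a read proves it live, and exhausting the budget leaves the answer unknown. A simplified standalone model of the backward scan (the types here are invented stand-ins for analyzePhysReg's summary):

    #include <vector>

    enum LivenessQueryResult { LQR_Live, LQR_Dead, LQR_Unknown };

    // Invented per-instruction summary of the queried register's fate.
    struct RegEffect {
      bool DeadDef = false; // full def that is marked dead
      bool Defined = false; // writes at least part of the register
      bool Killed = false;  // full kill (or clobber)
      bool Read = false;    // reads the register
    };

    // Scan backwards over at most Neighborhood instructions before Pos,
    // mirroring the order of checks in the rewritten loop.
    LivenessQueryResult scanBack(const std::vector<RegEffect> &Insts,
                                 unsigned Pos, unsigned Neighborhood) {
      while (Pos > 0 && Neighborhood-- > 0) {
        const RegEffect &E = Insts[--Pos];
        if (E.DeadDef)
          return LQR_Dead;
        if (E.Defined)
          return LQR_Live;
        if (E.Killed)
          return LQR_Dead;
        if (E.Read)
          return LQR_Live;
      }
      return LQR_Unknown;
    }

    int main() {
      std::vector<RegEffect> Insts(4);
      Insts[1].Defined = true; // the register is defined...
      Insts[2].Killed = true;  // ...and killed before instruction 3
      return scanBack(Insts, 3, 10) == LQR_Dead ? 0 : 1;
    }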
@@ -1138,33 +1199,33 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
 do {
 --I;

- MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MachineOperandIteratorBase::PhysRegInfo Info =
 ConstMIOperands(I).analyzePhysReg(Reg, TRI);

- if (Analysis.Defines)
- // Outputs happen after inputs so they take precedence if both are
- // present.
- return Analysis.DefinesDead ? LQR_Dead : LQR_Live;
+ // Defs happen after uses so they take precedence if both are present.

- if (Analysis.Kills || Analysis.Clobbers)
- // Register killed, so isn't live.
+ // Register is dead after a dead def of the full register.
+ if (Info.DeadDef)
 return LQR_Dead;
-
- else if (Analysis.ReadsOverlap)
- // Defined or read without a previous kill - live.
- return Analysis.Reads ? LQR_Live : LQR_OverlappingLive;
-
+ // Register is (at least partially) live after a def.
+ if (Info.Defined)
+ return LQR_Live;
+ // Register is dead after a full kill or clobber and no def.
+ if (Info.Killed || Info.Clobbered)
+ return LQR_Dead;
+ // Register must be live if we read it.
+ if (Info.Read)
+ return LQR_Live;
 } while (I != begin() && --N > 0);
 }

 // Did we get to the start of the block?
 if (I == begin()) {
 // If so, the register's state is definitely defined by the live-in state.
- for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true);
- RAI.isValid(); ++RAI) {
+ for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true); RAI.isValid();
+ ++RAI)
 if (isLiveIn(*RAI))
- return (*RAI == Reg) ? LQR_Live : LQR_OverlappingLive;
- }
+ return LQR_Live;

 return LQR_Dead;
 }

@@ -1176,16 +1237,14 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
 // If this is the last insn in the block, don't search forwards.
 if (I != end()) {
 for (++I; I != end() && N > 0; ++I, --N) {
- MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MachineOperandIteratorBase::PhysRegInfo Info =
 ConstMIOperands(I).analyzePhysReg(Reg, TRI);

- if (Analysis.ReadsOverlap)
- // Used, therefore must have been live.
- return (Analysis.Reads) ?
- LQR_Live : LQR_OverlappingLive;
-
- else if (Analysis.Clobbers || Analysis.Defines)
- // Defined (but not read) therefore cannot have been live.
+ // Register is live when we read it here.
+ if (Info.Read)
+ return LQR_Live;
+ // Register is dead if we can fully overwrite or clobber it here.
+ if (Info.FullyDefined || Info.Clobbered)
 return LQR_Dead;
 }
 }
@@ -1193,3 +1252,17 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
 // At this point we have no idea of the liveness of the register.
 return LQR_Unknown;
}
+
+const uint32_t *
+MachineBasicBlock::getBeginClobberMask(const TargetRegisterInfo *TRI) const {
+ // EH funclet entry does not preserve any registers.
+ return isEHFuncletEntry() ? TRI->getNoPreservedMask() : nullptr;
+}
+
+const uint32_t *
+MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const {
+ // If we see a return block with successors, this must be a funclet return,
+ // which does not preserve any registers. If there are no successors, we don't
+ // care what kind of return it is; putting a mask after it is a no-op.
+ return isReturnBlock() && !succ_empty() ?
TRI->getNoPreservedMask() : nullptr;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 9151d99..9119e31 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -57,7 +57,7 @@ struct GraphTraits<MachineBlockFrequencyInfo *> {
 static inline const NodeType *getEntryNode(const MachineBlockFrequencyInfo *G) {
- return G->getFunction()->begin();
+ return &G->getFunction()->front();
 }

 static ChildIteratorType child_begin(const NodeType *N) {
@@ -143,7 +143,7 @@ bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
 if (!MBFI)
 MBFI.reset(new ImplType);
- MBFI->doFunction(&F, &MBPI, &MLI);
+ MBFI->calculate(F, MBPI, MLI);
#ifndef NDEBUG
 if (ViewMachineBlockFreqPropagationDAG != GVDT_None) {
 view();
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 2969bad..f5e3056 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -51,7 +51,7 @@ using namespace llvm;
#define DEBUG_TYPE "block-placement"

STATISTIC(NumCondBranches, "Number of conditional branches");
-STATISTIC(NumUncondBranches, "Number of uncondittional branches");
+STATISTIC(NumUncondBranches, "Number of unconditional branches");
STATISTIC(CondBranchTakenFreq,
 "Potential frequency of taking conditional branches");
STATISTIC(UncondBranchTakenFreq,
@@ -62,6 +62,11 @@ static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
 "blocks in the function."),
 cl::init(0), cl::Hidden);

+static cl::opt<unsigned>
+ AlignAllLoops("align-all-loops",
+ cl::desc("Force the alignment of all loops in the function."),
+ cl::init(0), cl::Hidden);
+
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned> ExitBlockBias(
 "block-placement-exit-block-bias",
@@ -81,6 +86,29 @@ static cl::opt<unsigned> OutlineOptionalThreshold(
 "instruction count below this threshold"),
 cl::init(4), cl::Hidden);

+static cl::opt<unsigned> LoopToColdBlockRatio(
+ "loop-to-cold-block-ratio",
+ cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
+ "(frequency of block) is greater than this ratio"),
+ cl::init(5), cl::Hidden);
+
+static cl::opt<bool>
+ PreciseRotationCost("precise-rotation-cost",
+ cl::desc("Model the cost of loop rotation more "
+ "precisely by using profile data."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> MisfetchCost(
+ "misfetch-cost",
+ cl::desc("Cost that models the probabilistic risk of an instruction "
+ "misfetch due to a jump compared to falling through, whose cost "
+ "is zero."),
+ cl::init(1), cl::Hidden);
+
+static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
+ cl::desc("Cost of jump instructions."),
+ cl::init(1), cl::Hidden);
+
namespace {
class BlockChain;
/// \brief Type for our function-wide basic block -> block chain mapping.
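The two cost flags combine into a simple linear trade-off: keeping a loop unrotated costs a taken jump on every back-edge execution, while rotating it exposes a fall-through whose fetch can still miss with some probability. A toy calculation with invented frequencies (only the shape of the comparison, not rotateLoopWithProfile's actual formula; the constants mirror the flag defaults above):

    #include <cstdint>
    #include <iostream>

    int main() {
      const uint64_t MisfetchCost = 1; // default of -misfetch-cost
      const uint64_t JumpInstCost = 1; // default of -jump-inst-cost

      uint64_t BackEdgeFreq = 1000; // executions that take the extra jump
      uint64_t FallThruFreq = 900;  // executions that fall through but may
                                    // still misfetch

      uint64_t JumpCost = BackEdgeFreq * JumpInstCost;
      uint64_t MisfetchRisk = FallThruFreq * MisfetchCost;

      std::cout << (JumpCost > MisfetchRisk ? "prefer rotation" : "keep layout")
                << "\n";
      return 0;
    }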
@@ -246,9 +274,12 @@ class MachineBlockPlacement : public MachineFunctionPass { const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L, const BlockFilterSet &LoopBlockSet); + BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L); void buildLoopChains(MachineFunction &F, MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); public: @@ -354,31 +385,56 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, const BranchProbability HotProb(4, 5); // 80% MachineBasicBlock *BestSucc = nullptr; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we manually compute probabilities using the edge - // weights. This is suboptimal as it means that the somewhat subtle - // definition of edge weight semantics is encoded here as well. We should - // improve the MBPI interface to efficiently support query patterns such as - // this. - uint32_t BestWeight = 0; - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); - DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); + auto BestProb = BranchProbability::getZero(); + + // Adjust edge probabilities by excluding edges pointing to blocks that are + // either not in BlockFilter or are already in the current chain. Consider the + // following CFG: + // + // --->A + // | / \ + // | B C + // | \ / \ + // ----D E + // + // Assume A->C is very hot (>90%) and C->D has a 50% probability; then, after + // A->C is chosen as a fall-through, D won't be selected as a successor of C + // due to the CFG constraint (the probability of C->D is not greater than + // HotProb). If we exclude E (which is not in BlockFilter) when calculating the + // probability of C->D, D will be selected and we will get A C D B as the + // layout of this loop.
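A rough standalone illustration of the renormalization described in the comment above: the probability mass of skipped successors is removed from the denominator, so the surviving edges are compared against each other rather than against the full distribution. Plain doubles stand in for llvm::BranchProbability, and the numbers are invented:

#include <iostream>
#include <vector>

struct Edge {
  const char *Dest;
  double Prob;  // raw branch probability out of the block
  bool Skipped; // filtered out or already placed in the current chain
};

int main() {
  // Successors of C in the CFG sketched above: D and E at 50% each,
  // with E outside the block filter.
  std::vector<Edge> Succs = {{"D", 0.5, false}, {"E", 0.5, true}};

  double AdjustedSum = 1.0;
  for (const Edge &E : Succs)
    if (E.Skipped)
      AdjustedSum -= E.Prob; // mirrors AdjustedSumProb -= getEdgeProbability

  for (const Edge &E : Succs)
    if (!E.Skipped)
      std::cout << E.Dest << ": " << E.Prob / AdjustedSum << "\n"; // D: 1
}

After renormalization C->D looks certain, so it clears the 80% HotProb threshold even though its raw probability was only 50%.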
+ auto AdjustedSumProb = BranchProbability::getOne(); + SmallVector<MachineBasicBlock *, 4> Successors; for (MachineBasicBlock *Succ : BB->successors()) { - if (BlockFilter && !BlockFilter->count(Succ)) - continue; - BlockChain &SuccChain = *BlockToChain[Succ]; - if (&SuccChain == &Chain) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Already merged!\n"); - continue; - } - if (Succ != *SuccChain.begin()) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n"); - continue; + bool SkipSucc = false; + if (BlockFilter && !BlockFilter->count(Succ)) { + SkipSucc = true; + } else { + BlockChain *SuccChain = BlockToChain[Succ]; + if (SuccChain == &Chain) { + DEBUG(dbgs() << " " << getBlockName(Succ) + << " -> Already merged!\n"); + SkipSucc = true; + } else if (Succ != *SuccChain->begin()) { + DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n"); + continue; + } } + if (SkipSucc) + AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ); + else + Successors.push_back(Succ); + } - uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); + DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); + for (MachineBasicBlock *Succ : Successors) { + BranchProbability SuccProb; + uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); + uint32_t SuccProbD = AdjustedSumProb.getNumerator(); + if (SuccProbN >= SuccProbD) + SuccProb = BranchProbability::getOne(); + else + SuccProb = BranchProbability(SuccProbN, SuccProbD); // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other @@ -406,6 +462,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Only consider successors which are either "hot", or wouldn't violate // any CFG constraints. + BlockChain &SuccChain = *BlockToChain[Succ]; if (SuccChain.LoopPredecessors != 0) { if (SuccProb < HotProb) { DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb @@ -415,8 +472,9 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Make sure that a hot successor doesn't have a globally more // important predecessor. + auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BlockFrequency CandidateEdgeFreq = - MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl(); + MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); bool BadCFGConflict = false; for (MachineBasicBlock *Pred : Succ->predecessors()) { if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || @@ -440,10 +498,10 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, << " (prob)" << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestWeight >= SuccWeight) + if (BestSucc && BestProb >= SuccProb) continue; BestSucc = Succ; - BestWeight = SuccWeight; + BestProb = SuccProb; } return BestSucc; } @@ -505,14 +563,14 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( const BlockFilterSet *BlockFilter) { for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E; ++I) { - if (BlockFilter && !BlockFilter->count(I)) + if (BlockFilter && !BlockFilter->count(&*I)) continue; - if (BlockToChain[I] != &PlacedChain) { + if (BlockToChain[&*I] != &PlacedChain) { PrevUnplacedBlockIt = I; // Now select the head of the chain to which the unplaced block belongs // as the block to place. 
This will force the entire chain to be placed, // and satisfies the requirements of merging chains. - return *BlockToChain[I]->begin(); + return *BlockToChain[&*I]->begin(); } } return nullptr; @@ -672,13 +730,8 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; bool HasLoopingSucc = false; - // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we use the internal weights and manually compute the - // probabilities to avoid quadratic behavior. - uint32_t WeightScale = 0; - uint32_t SumWeight = MBPI->getSumForBlock(MBB, WeightScale); for (MachineBasicBlock *Succ : MBB->successors()) { - if (Succ->isLandingPad()) + if (Succ->isEHPad()) continue; if (Succ == MBB) continue; @@ -690,10 +743,10 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, continue; } - uint32_t SuccWeight = MBPI->getEdgeWeight(MBB, Succ); + auto SuccProb = MBPI->getEdgeProbability(MBB, Succ); if (LoopBlockSet.count(Succ)) { DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> " - << getBlockName(Succ) << " (" << SuccWeight << ")\n"); + << getBlockName(Succ) << " (" << SuccProb << ")\n"); HasLoopingSucc = true; continue; } @@ -705,7 +758,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, BlocksExitingToOuterLoop.insert(MBB); } - BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> " << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] ("; @@ -791,6 +843,188 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } +/// \brief Attempt to rotate a loop based on profile data to reduce branch cost. +/// +/// With profile data, we can determine the cost in terms of missed fall through +/// opportunities when rotating a loop chain and select the best rotation. +/// Basically, there are three kinds of cost to consider for each rotation: +/// 1. The possibly missed fall through edge (if it exists) from BB out of +/// the loop to the loop header. +/// 2. The possibly missed fall through edges (if they exist) from the loop +/// exits to BB out of the loop. +/// 3. The missed fall through edge (if it exists) from the last BB to the +/// first BB in the loop chain. +/// Therefore, the cost for a given rotation is the sum of costs listed above. +/// We select the best rotation with the smallest cost. +void MachineBlockPlacement::rotateLoopWithProfile( + BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto HeaderBB = L.getHeader(); + auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); + auto RotationPos = LoopChain.end(); + + BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency(); + + // A utility lambda that scales up a block frequency by dividing it by a + // branch probability which is the reciprocal of the scale. + auto ScaleBlockFrequency = [](BlockFrequency Freq, + unsigned Scale) -> BlockFrequency { + if (Scale == 0) + return 0; + // Use operator / between BlockFrequency and BranchProbability to implement + // saturating multiplication. + return Freq / BranchProbability(1, Scale); + }; + + // Compute the cost of the missed fall-through edge to the loop header if the + // chain head is not the loop header. 
As we only consider natural loops with + a single header, this computation can be done only once. + BlockFrequency HeaderFallThroughCost(0); + for (auto *Pred : HeaderBB->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + auto EdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB); + auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); + // If the predecessor has only an unconditional jump to the header, we + // need to consider the cost of this jump. + if (Pred->succ_size() == 1) + FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost); + HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost); + } + } + + // Here we collect all exit blocks in the loop, and for each exit we find out + // its hottest exit edge. For each loop rotation, we define the loop exit cost + // as the sum of frequencies of exit edges we collect here, excluding the exit + // edge from the tail of the loop chain. + SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq; + for (auto BB : LoopChain) { + auto LargestExitEdgeProb = BranchProbability::getZero(); + for (auto *Succ : BB->successors()) { + BlockChain *SuccChain = BlockToChain[Succ]; + if (!LoopBlockSet.count(Succ) && + (!SuccChain || Succ == *SuccChain->begin())) { + auto SuccProb = MBPI->getEdgeProbability(BB, Succ); + LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb); + } + } + if (LargestExitEdgeProb > BranchProbability::getZero()) { + auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb; + ExitsWithFreq.emplace_back(BB, ExitFreq); + } + } + + // In this loop we iterate over every block in the loop chain and calculate the + // cost assuming the block is the head of the loop chain. When the loop ends, + // we should have found the best candidate as the loop chain's head. + for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), + EndIter = LoopChain.end(); + Iter != EndIter; Iter++, TailIter++) { + // TailIter is used to track the tail of the loop chain if the block we are + // checking (pointed to by Iter) is the head of the chain. + if (TailIter == LoopChain.end()) + TailIter = LoopChain.begin(); + + auto TailBB = *TailIter; + + // Calculate the cost by putting this BB at the top. + BlockFrequency Cost = 0; + + // If the current BB is not the loop header, we need to take into account the + // cost of the missed fall-through edge from outside of the loop to the + // header. + if (Iter != HeaderIter) + Cost += HeaderFallThroughCost; + + // Collect the loop exit cost by summing up frequencies of all exit edges + // except the one from the chain tail. + for (auto &ExitWithFreq : ExitsWithFreq) + if (TailBB != ExitWithFreq.first) + Cost += ExitWithFreq.second; + + // The cost of breaking the fall-through edge from the tail to the top + // of the loop chain. Here we need to consider three cases: + // 1. If the tail node has only one successor, then we will get an + // additional jmp instruction. So the cost here is (MisfetchCost + + // JumpInstCost) * tail node frequency. + // 2. If the tail node has two successors, then we may still get an + // additional jmp instruction if the layout successor after the loop + // chain is not its CFG successor. Note that the more frequently executed + // jmp instruction will be put ahead of the other one.
Assume the + frequencies of those two branches are x and y, where x is the frequency + of the edge to the chain head; then the cost will be + (x * MisfetchCost + min(x, y) * JumpInstCost) * tail node frequency. + // 3. If the tail node has more than two successors (this rarely happens), + // we won't consider any additional cost. + if (TailBB->isSuccessor(*Iter)) { + auto TailBBFreq = MBFI->getBlockFreq(TailBB); + if (TailBB->succ_size() == 1) + Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(), + MisfetchCost + JumpInstCost); + else if (TailBB->succ_size() == 2) { + auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter); + auto TailToHeadFreq = TailBBFreq * TailToHeadProb; + auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2) + ? TailBBFreq * TailToHeadProb.getCompl() + : TailToHeadFreq; + Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) + + ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost); + } + } + + DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter) + << " to the top: " << Cost.getFrequency() << "\n"); + + if (Cost < SmallestRotationCost) { + SmallestRotationCost = Cost; + RotationPos = Iter; + } + } + + if (RotationPos != LoopChain.end()) { + DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos) + << " to the top\n"); + std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); + } +} + +/// \brief Collect blocks in the given loop that are to be placed. +/// +/// When profile data is available, exclude cold blocks from the returned set; +/// otherwise, collect all blocks in the loop. +MachineBlockPlacement::BlockFilterSet +MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) { + BlockFilterSet LoopBlockSet; + + // Filter cold blocks off from LoopBlockSet when profile data is available. + // Collect the sum of frequencies of incoming edges to the loop header from + // outside. If we treat the loop as a super block, this is the frequency of + // the loop. Then for each block in the loop, we calculate the ratio between + // its frequency and the frequency of the loop. When this ratio is too small, + // don't add the block to the loop chain. If there are outer loops, then this block + // will be merged into the first outer loop chain for which this block is not + // cold anymore. This needs precise profile data and we only do this when + // profile data is available. + if (F.getFunction()->getEntryCount()) { + BlockFrequency LoopFreq(0); + for (auto LoopPred : L.getHeader()->predecessors()) + if (!L.contains(LoopPred)) + LoopFreq += MBFI->getBlockFreq(LoopPred) * + MBPI->getEdgeProbability(LoopPred, L.getHeader()); + + for (MachineBasicBlock *LoopBB : L.getBlocks()) { + auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency(); + if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio) + continue; + LoopBlockSet.insert(LoopBB); + } + } else + LoopBlockSet.insert(L.block_begin(), L.block_end()); + + return LoopBlockSet; +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -805,19 +1039,27 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, buildLoopChains(F, *InnerLoop); SmallVector<MachineBasicBlock *, 16> BlockWorkList; - BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); + BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L); + + // Check if we have profile data for this function.
If so, we will rotate + this loop by modeling costs more precisely, which requires profile data + for a better layout. + bool RotateLoopWithProfile = + PreciseRotationCost && F.getFunction()->getEntryCount(); // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the // predecessors to the header if there is one that will result in strictly // fewer branches in the loop body. - MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet); + // When we use profile data to rotate the loop, this is unnecessary. + MachineBasicBlock *LoopTop = + RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet); // If we selected just the header for the loop top, look for a potentially // profitable exit block in the event that rotating the loop can eliminate // branches by placing an exit edge at the bottom. MachineBasicBlock *ExitingBB = nullptr; - if (LoopTop == L.getHeader()) + if (!RotateLoopWithProfile && LoopTop == L.getHeader()) ExitingBB = findBestLoopExit(F, L, LoopBlockSet); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -828,7 +1070,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, SmallPtrSet<BlockChain *, 4> UpdatedPreds; assert(LoopChain.LoopPredecessors == 0); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : L.getBlocks()) { + + for (MachineBasicBlock *LoopBB : LoopBlockSet) { BlockChain &Chain = *BlockToChain[LoopBB]; if (!UpdatedPreds.insert(&Chain).second) continue; @@ -848,7 +1091,11 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, } buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); - rotateLoop(LoopChain, ExitingBB, LoopBlockSet); + + if (RotateLoopWithProfile) + rotateLoopWithProfile(LoopChain, L, LoopBlockSet); + else + rotateLoop(LoopChain, ExitingBB, LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first. @@ -889,7 +1136,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // the assumptions of the remaining algorithm. SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - MachineBasicBlock *BB = FI; + MachineBasicBlock *BB = &*FI; BlockChain *Chain = new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB); // Also, merge any blocks which we cannot reason about and must preserve @@ -900,8 +1147,8 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) break; - MachineFunction::iterator NextFI(std::next(FI)); - MachineBasicBlock *NextBB = NextFI; + MachineFunction::iterator NextFI = std::next(FI); + MachineBasicBlock *NextBB = &*NextFI; // Ensure that the layout successor is a viable block, as we know that // fallthrough is a possibility. assert(NextFI != FE && "Can't fallthrough past the last block."); @@ -1004,7 +1251,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Update the terminator of the previous block.
if (ChainBB == *FunctionChain.begin()) continue; - MachineBasicBlock *PrevBB = std::prev(MachineFunction::iterator(ChainBB)); + MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB)); // FIXME: It would be awesome if updateTerminator would just return rather // than assert when the branch cannot be analyzed in order to remove this @@ -1035,14 +1282,16 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } // If PrevBB has a two-way branch, try to re-order the branches - // such that we branch to the successor with higher weight first. + // such that we branch to the successor with higher probability first. if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && + MBPI->getEdgeProbability(PrevBB, FBB) > + MBPI->getEdgeProbability(PrevBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " << getBlockName(PrevBB) << "\n"); - DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) - << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); + DEBUG(dbgs() << " Edge probability: " + << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " + << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PrevBB); TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); @@ -1064,13 +1313,14 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // exclusively on the loop info here so that we can align backedges in // unnatural CFGs and backedges that were introduced purely because of the // loop rotations done during this layout pass. + // FIXME: Use Function::optForSize(). if (F.getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) return; if (FunctionChain.begin() == FunctionChain.end()) return; // Empty chain. const BranchProbability ColdProb(1, 5); // 20% - BlockFrequency EntryFreq = MBFI->getBlockFreq(F.begin()); + BlockFrequency EntryFreq = MBFI->getBlockFreq(&F.front()); BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb; for (MachineBasicBlock *ChainBB : FunctionChain) { if (ChainBB == *FunctionChain.begin()) @@ -1084,6 +1334,11 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (!L) continue; + if (AlignAllLoops) { + ChainBB->setAlignment(AlignAllLoops); + continue; + } + unsigned Align = TLI->getPrefLoopAlignment(L); if (!Align) continue; // Don't care about loop alignment. @@ -1224,4 +1479,3 @@ bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) { return false; } - diff --git a/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 6fbc2be..cf6d401 100644 --- a/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -28,91 +28,48 @@ char MachineBranchProbabilityInfo::ID = 0; void MachineBranchProbabilityInfo::anchor() { } -uint32_t MachineBranchProbabilityInfo:: -getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { - // First we compute the sum with 64-bits of precision, ensuring that cannot - // overflow by bounding the number of weights considered. Hopefully no one - // actually needs 2^32 successors.
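For context, the getSumForBlock machinery being deleted here worked roughly as the comment says: sum the 32-bit edge weights in 64 bits, and if the total overflows 32 bits, divide every weight by a common scale chosen so the sum fits. A simplified standalone sketch of that retired scheme (a model, not the LLVM code itself):

#include <cstdint>
#include <iostream>
#include <vector>

// Sum 32-bit weights; if the 64-bit total exceeds UINT32_MAX, pick a divisor
// (Scale) that brings the rescaled sum back into 32 bits.
static std::uint32_t sumWithScale(const std::vector<std::uint32_t> &Weights,
                                  std::uint32_t &Scale) {
  std::uint64_t Sum = 0;
  for (std::uint32_t W : Weights)
    Sum += W;
  Scale = 1;
  if (Sum <= UINT32_MAX)
    return static_cast<std::uint32_t>(Sum);
  Scale = static_cast<std::uint32_t>(Sum / UINT32_MAX) + 1;
  Sum = 0;
  for (std::uint32_t W : Weights)
    Sum += W / Scale;
  return static_cast<std::uint32_t>(Sum);
}

int main() {
  std::uint32_t Scale;
  std::vector<std::uint32_t> Weights = {3000000000u, 3000000000u};
  std::cout << "sum=" << sumWithScale(Weights, Scale)
            << " scale=" << Scale << "\n"; // sum=3000000000 scale=2
}

Every query paid this per-block summation, which is why callers went quadratic and why the interface below now hands out a BranchProbability directly.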
- assert(MBB->succ_size() < UINT32_MAX); - uint64_t Sum = 0; - Scale = 1; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight; - } - - // If the computed sum fits in 32-bits, we're done. - if (Sum <= UINT32_MAX) - return Sum; - - // Otherwise, compute the scale necessary to cause the weights to fit, and - // re-sum with that scale applied. - assert((Sum / UINT32_MAX) < UINT32_MAX); - Scale = (Sum / UINT32_MAX) + 1; - Sum = 0; - for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - Sum += Weight / Scale; - } - assert(Sum <= UINT32_MAX); - return Sum; -} - -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - MachineBasicBlock::const_succ_iterator Dst) const { - uint32_t Weight = Src->getSuccWeight(Dst); - if (!Weight) - return DEFAULT_WEIGHT; - return Weight; +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { + return Src->getSuccProbability(Dst); } -uint32_t MachineBranchProbabilityInfo:: -getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( + const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // This is a linear search. Try to use the const_succ_iterator version when // possible. - return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); + return getEdgeProbability(Src, + std::find(Src->succ_begin(), Src->succ_end(), Dst)); } bool MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% - // FIXME: Compare against a static "hot" BranchProbability. 
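The 4/5 threshold mentioned above is an ordinary rational comparison; a tiny standalone model of the check, with a plain fraction type standing in for llvm::BranchProbability:

#include <cstdint>
#include <iostream>

struct Fraction {
  unsigned N, D; // numerator / denominator
  bool operator>(const Fraction &O) const {
    // Cross-multiply in 64 bits, as a fixed-point probability type would,
    // to avoid overflow.
    return std::uint64_t(N) * O.D > std::uint64_t(O.N) * D;
  }
};

int main() {
  Fraction HotProb{4, 5}; // the 80% hot threshold
  Fraction Edge{9, 10};   // a made-up 90% edge
  std::cout << (Edge > HotProb ? "hot" : "cold") << "\n"; // hot
}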
- return getEdgeProbability(Src, Dst) > BranchProbability(4, 5); + static BranchProbability HotProb(4, 5); + return getEdgeProbability(Src, Dst) > HotProb; } MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { - uint32_t MaxWeight = 0; + auto MaxProb = BranchProbability::getZero(); MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, I); - if (Weight > MaxWeight) { - MaxWeight = Weight; + auto Prob = getEdgeProbability(MBB, I); + if (Prob > MaxProb) { + MaxProb = Prob; MaxSucc = *I; } } - if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) + static BranchProbability HotProb(4, 5); + if (getEdgeProbability(MBB, MaxSucc) >= HotProb) return MaxSucc; return nullptr; } -BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( - const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { - uint32_t Scale = 1; - uint32_t D = getSumForBlock(Src, Scale); - uint32_t N = getEdgeWeight(Src, Dst) / Scale; - - return BranchProbability(N, D); -} - raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability( raw_ostream &OS, const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const { diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp index 87aaaa0..aad376c 100644 --- a/contrib/llvm/lib/CodeGen/MachineCSE.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp @@ -57,7 +57,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreservedID(MachineLoopInfoID); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -111,7 +111,7 @@ char &llvm::MachineCSEID = MachineCSE::ID; INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse", "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineCSE, "machine-cse", "Machine Common Subexpression Elimination", false, false) @@ -122,8 +122,7 @@ INITIALIZE_PASS_END(MachineCSE, "machine-cse", bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, MachineBasicBlock *MBB) { bool Changed = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isUse()) continue; unsigned Reg = MO.getReg(); @@ -186,8 +185,7 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg, return true; bool SeenDef = false; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = I->getOperand(i); + for (const MachineOperand &MO : I->operands()) { if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) SeenDef = true; if (!MO.isReg() || !MO.getReg()) @@ -220,8 +218,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, SmallVectorImpl<unsigned> &PhysDefs, bool &PhysUseDef) const{ // First, add all uses to PhysRefs. 
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef()) continue; unsigned Reg = MO.getReg(); @@ -239,8 +236,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, // (which currently contains only uses), set the PhysUseDef flag. PhysUseDef = false; MachineBasicBlock::const_iterator I = MI; I = std::next(I); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); @@ -311,8 +307,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI, if (I == E) return true; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = I->getOperand(i); + for (const MachineOperand &MO : I->operands()) { // RegMasks go on instructions like calls that clobber lots of physregs. // Don't attempt to CSE across such an instruction. if (MO.isRegMask()) @@ -398,8 +393,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg, // Heuristics #2: If the expression doesn't use a vr and the only uses // of the redundant computation are copies, do not CSE. bool HasVRegUse = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.isUse() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { HasVRegUse = true; @@ -580,9 +574,9 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // Actually perform the elimination. if (DoCSE) { - for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) { - unsigned OldReg = CSEPairs[i].first; - unsigned NewReg = CSEPairs[i].second; + for (std::pair<unsigned, unsigned> &CSEPair : CSEPairs) { + unsigned OldReg = CSEPair.first; + unsigned NewReg = CSEPair.second; // OldReg may have been unused but is used now; clear the Dead flag MachineInstr *Def = MRI->getUniqueVRegDef(NewReg); assert(Def != nullptr && "CSEd register has no unique definition?"); @@ -594,8 +588,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // Go through implicit defs of CSMI and MI; if a def is not dead at MI, // we should make sure it is not dead at CSMI. - for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i) - CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false); + for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate) + CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false); // Go through implicit defs of CSMI and MI, and clear the kill flags on // their uses in all the instructions between CSMI and MI. @@ -685,18 +679,14 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { Node = WorkList.pop_back_val(); Scopes.push_back(Node); const std::vector<MachineDomTreeNode*> &Children = Node->getChildren(); - unsigned NumChildren = Children.size(); - OpenChildren[Node] = NumChildren; - for (unsigned i = 0; i != NumChildren; ++i) { - MachineDomTreeNode *Child = Children[i]; + OpenChildren[Node] = Children.size(); + for (MachineDomTreeNode *Child : Children) WorkList.push_back(Child); - } } while (!WorkList.empty()); // Now perform CSE.
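The Scopes walk above is a standard iterative preorder over the dominator tree: each node records how many children remain open, so the pass knows when a scope can be exited. The same shape, sketched over a plain tree with a hypothetical Node type in place of MachineDomTreeNode:

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Node {
  std::string Name;
  std::vector<Node *> Children;
};

int main() {
  Node C{"C", {}}, B{"B", {&C}}, D{"D", {}}, A{"A", {&B, &D}};

  std::vector<Node *> WorkList{&A}, Scopes;
  std::map<Node *, unsigned> OpenChildren;

  // Same shape as the MachineCSE walk: collect nodes in preorder, keeping a
  // per-node counter that a real pass would decrement to know when to
  // ExitScope.
  do {
    Node *N = WorkList.back();
    WorkList.pop_back();
    Scopes.push_back(N);
    OpenChildren[N] = N->Children.size();
    for (Node *Child : N->Children)
      WorkList.push_back(Child);
  } while (!WorkList.empty());

  for (Node *N : Scopes)
    std::cout << N->Name << " "; // A D B C: preorder, siblings in stack order
  std::cout << "\n";
}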
bool Changed = false; - for (unsigned i = 0, e = Scopes.size(); i != e; ++i) { - MachineDomTreeNode *Node = Scopes[i]; + for (MachineDomTreeNode *Node : Scopes) { MachineBasicBlock *MBB = Node->getBlock(); EnterScope(MBB); Changed |= ProcessBlock(MBB); @@ -714,7 +704,7 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<MachineDominatorTree>(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); return PerformCSE(DT->getRootNode()); diff --git a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp index f33d0e6..fa43c4d 100644 --- a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp @@ -10,6 +10,7 @@ // The machine combiner pass uses machine trace metrics to ensure the combined // instructions do not lengthen the critical path or the resource depth. //===----------------------------------------------------------------------===// + #define DEBUG_TYPE "machine-combiner" #include "llvm/ADT/Statistic.h" @@ -68,10 +69,10 @@ private: MachineTraceMetrics::Trace BlockTrace); bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, - MachineTraceMetrics::Trace BlockTrace, - SmallVectorImpl<MachineInstr *> &InsInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, - bool NewCodeHasLessInsts); + MachineTraceMetrics::Trace BlockTrace, + SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, + MachineCombinerPattern Pattern); bool preservesResourceLen(MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl<MachineInstr *> &InsInstrs, @@ -122,9 +123,9 @@ unsigned MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, MachineTraceMetrics::Trace BlockTrace) { - SmallVector<unsigned, 16> InstrDepth; - assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + assert(TSchedModel.hasInstrSchedModelOrItineraries() && + "Missing machine model\n"); // For each instruction in the new sequence compute the depth based on the // operands. Use the trace information when possible. For new operands which @@ -180,8 +181,8 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, /// \returns Latency of \p NewRoot unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, MachineTraceMetrics::Trace BlockTrace) { - - assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + assert(TSchedModel.hasInstrSchedModelOrItineraries() && + "Missing machine model\n"); // Check each definition in NewRoot and compute the latency unsigned NewRootLatency = 0; @@ -202,62 +203,86 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO, UseMO->findRegisterUseOperandIdx(MO.getReg())); } else { - LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode()); + LatencyOp = TSchedModel.computeInstrLatency(NewRoot); } NewRootLatency = std::max(NewRootLatency, LatencyOp); } return NewRootLatency; } -/// True when the new instruction sequence does not lengthen the critical path -/// and the new sequence has less instructions or the new sequence improves the -/// critical path.
+/// The combiner's goal may differ based on which pattern it is attempting +/// to optimize. +enum class CombinerObjective { + MustReduceDepth, // The data dependency chain must be improved. + Default // The critical path must not be lengthened. +}; + +static CombinerObjective getCombinerObjective(MachineCombinerPattern P) { + // TODO: If C++ ever gets a real enum class, make this part of the + // MachineCombinerPattern class. + switch (P) { + case MachineCombinerPattern::REASSOC_AX_BY: + case MachineCombinerPattern::REASSOC_AX_YB: + case MachineCombinerPattern::REASSOC_XA_BY: + case MachineCombinerPattern::REASSOC_XA_YB: + return CombinerObjective::MustReduceDepth; + default: + return CombinerObjective::Default; + } +} + /// The DAGCombine code sequence ends in MI (Machine Instruction) Root. /// The new code sequence ends in MI NewRoot. A necessary condition for the new /// sequence to replace the old sequence is that it cannot lengthen the critical -/// path. This is decided by the formula: -/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)). -/// If the new sequence has an equal length critical path but does not reduce -/// the number of instructions (NewCodeHasLessInsts is false), then it is not -/// considered an improvement. The slack is the number of cycles Root can be -/// delayed before the critical patch becomes longer. +/// path. The definition of "improve" may be restricted by specifying that the +/// new path improves the data dependency chain (MustReduceDepth). bool MachineCombiner::improvesCriticalPathLen( MachineBasicBlock *MBB, MachineInstr *Root, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl<MachineInstr *> &InsInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, - bool NewCodeHasLessInsts) { - - assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); + MachineCombinerPattern Pattern) { + assert(TSchedModel.hasInstrSchedModelOrItineraries() && + "Missing machine model\n"); // NewRoot is the last instruction in the \p InsInstrs vector. - // Get depth and latency of NewRoot. unsigned NewRootIdx = InsInstrs.size() - 1; MachineInstr *NewRoot = InsInstrs[NewRootIdx]; - unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); - unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); - // Get depth, latency and slack of Root. + // Get depth and latency of NewRoot and Root. + unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; + + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + dbgs() << " NewRootDepth: " << NewRootDepth << "\n"; + dbgs() << " RootDepth: " << RootDepth << "\n"); + + // For a transform such as reassociation, the cost equation is + // conservatively calculated so that we must improve the depth (data + // dependency cycles) in the critical path to proceed with the transform. + // Being conservative also protects against inaccuracies in the underlying + // machine trace metrics and CPU models. + if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) + return NewRootDepth < RootDepth; + + // A more flexible cost calculation for the critical path includes the slack + // of the original code sequence. This may allow the transform to proceed + // even if the instruction depths (data dependency cycles) become worse. 
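To make the two objectives concrete, here is a toy version of the comparison with invented cycle counts; the real numbers come from MachineTraceMetrics:

#include <iostream>

int main() {
  // Hypothetical numbers: the new sequence is one cycle deeper, but the old
  // root had two cycles of slack before the critical path would move, so the
  // transform is still allowed under NewCycle <= OldCycle.
  unsigned NewRootDepth = 5, NewRootLatency = 3;          // NewCycleCount = 8
  unsigned RootDepth = 4, RootLatency = 3, RootSlack = 2; // OldCycleCount = 9
  bool Improves = (NewRootDepth + NewRootLatency) <=
                  (RootDepth + RootLatency + RootSlack);
  std::cout << (Improves ? "substitute" : "keep original") << "\n";
}

Under the MustReduceDepth objective the same numbers would be rejected, since NewRootDepth (5) is not strictly below RootDepth (4); only the slack-based Default objective lets this transform through.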
+ unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); unsigned RootLatency = TSchedModel.computeInstrLatency(Root); unsigned RootSlack = BlockTrace.getInstrSlack(Root); - DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; - dbgs() << " NewRootDepth: " << NewRootDepth - << " NewRootLatency: " << NewRootLatency << "\n"; - dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency - << " RootSlack: " << RootSlack << "\n"; - dbgs() << " NewRootDepth + NewRootLatency " + DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; + dbgs() << " RootLatency: " << RootLatency << "\n"; + dbgs() << " RootSlack: " << RootSlack << "\n"; + dbgs() << " NewRootDepth + NewRootLatency = " << NewRootDepth + NewRootLatency << "\n"; - dbgs() << " RootDepth + RootLatency + RootSlack " + dbgs() << " RootDepth + RootLatency + RootSlack = " << RootDepth + RootLatency + RootSlack << "\n";); unsigned NewCycleCount = NewRootDepth + NewRootLatency; unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; - if (NewCodeHasLessInsts) - return NewCycleCount <= OldCycleCount; - else - return NewCycleCount < OldCycleCount; + return NewCycleCount <= OldCycleCount; } /// helper routine to convert instructions into SC @@ -271,11 +296,14 @@ void MachineCombiner::instr2instrSC( InstrsSC.push_back(SC); } } + /// True when the new instructions do not increase resource length bool MachineCombiner::preservesResourceLen( MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs) { + if (!TSchedModel.hasInstrSchedModel()) + return true; // Compute current resource length @@ -310,7 +338,7 @@ bool MachineCombiner::preservesResourceLen( bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { if (OptSize && (NewSize < OldSize)) return true; - if (!TSchedModel.hasInstrSchedModel()) + if (!TSchedModel.hasInstrSchedModelOrItineraries()) return true; return false; } @@ -332,7 +360,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { auto &MI = *BlockIter++; DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";); - SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Patterns; + SmallVector<MachineCombinerPattern, 16> Patterns; // The motivating example is: // // MUL Other MUL_op1 MUL_op2 Other @@ -358,54 +386,55 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // mostly one pattern, and getMachineCombinerPatterns() can order patterns // based on an internal cost heuristic. 
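A hedged arithmetic sketch of why the motivating MUL+ADD pattern pays off on a target with a fused multiply-add; the latencies below are invented stand-ins for whatever the scheduling model would report:

#include <iostream>

int main() {
  // Invented latencies: MUL = 4 cycles, ADD = 1 cycle, fused MADD = 4 cycles.
  unsigned MulLat = 4, AddLat = 1, MaddLat = 4;

  // Original chain: the ADD must wait for the MUL's result.
  unsigned OldPath = MulLat + AddLat; // 5 cycles

  // Combined: one MADD consumes all three operands directly.
  unsigned NewPath = MaddLat; // 4 cycles

  std::cout << "old=" << OldPath << " new=" << NewPath
            << (NewPath < OldPath ? " -> combine" : " -> keep") << "\n";
}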
- if (TII->getMachineCombinerPatterns(MI, Patterns)) { - for (auto P : Patterns) { - SmallVector<MachineInstr *, 16> InsInstrs; - SmallVector<MachineInstr *, 16> DelInstrs; - DenseMap<unsigned, unsigned> InstrIdxForVirtReg; - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + if (!TII->getMachineCombinerPatterns(MI, Patterns)) + continue; + + for (auto P : Patterns) { + SmallVector<MachineInstr *, 16> InsInstrs; + SmallVector<MachineInstr *, 16> DelInstrs; + DenseMap<unsigned, unsigned> InstrIdxForVirtReg; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + Traces->verifyAnalysis(); + TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, + InstrIdxForVirtReg); + unsigned NewInstCount = InsInstrs.size(); + unsigned OldInstCount = DelInstrs.size(); + // Found pattern, but did not generate alternative sequence. + // This can happen e.g. when an immediate could not be materialized + // in a single instruction. + if (!NewInstCount) + continue; + + // Substitute when we optimize for codesize and the new sequence has + // fewer instructions OR + // the new sequence neither lengthens the critical path nor increases + // resource pressure. + if (doSubstitute(NewInstCount, OldInstCount) || + (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, + InstrIdxForVirtReg, P) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + + Changed = true; + ++NumInstCombined; + + Traces->invalidate(MBB); Traces->verifyAnalysis(); - TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, - InstrIdxForVirtReg); - unsigned NewInstCount = InsInstrs.size(); - unsigned OldInstCount = DelInstrs.size(); - // Found pattern, but did not generate alternative sequence. - // This can happen e.g. when an immediate could not be materialized - // in a single instruction. - if (!NewInstCount) - continue; - // Substitute when we optimize for codesize and the new sequence has - // fewer instructions OR - // the new sequence neither lengthens the critical path nor increases - // resource pressure. - if (doSubstitute(NewInstCount, OldInstCount) || - (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, - InstrIdxForVirtReg, - NewInstCount < OldInstCount) && - preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { - for (auto *InstrPtr : InsInstrs) - MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr); - for (auto *InstrPtr : DelInstrs) - InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - - Changed = true; - ++NumInstCombined; - - Traces->invalidate(MBB); - Traces->verifyAnalysis(); - // Eagerly stop after the first pattern fires. - break; - } else { - // Cleanup instructions of the alternative code sequence. There is no - // use for them. - MachineFunction *MF = MBB->getParent(); - for (auto *InstrPtr : InsInstrs) - MF->DeleteMachineInstr(InstrPtr); - } - InstrIdxForVirtReg.clear(); + // Eagerly stop after the first pattern fires. + break; + } else { + // Cleanup instructions of the alternative code sequence. There is no + // use for them. 
+ MachineFunction *MF = MBB->getParent(); + for (auto *InstrPtr : InsInstrs) + MF->DeleteMachineInstr(InstrPtr); } + InstrIdxForVirtReg.clear(); } } @@ -420,9 +449,8 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { TSchedModel.init(SchedModel, &STI, TII); MRI = &MF.getRegInfo(); Traces = &getAnalysis<MachineTraceMetrics>(); - MinInstr = 0; - - OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + MinInstr = nullptr; + OptSize = MF.getFunction()->optForSize(); DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); if (!TII->useMachineCombiner()) { diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp index 9856e70..ca4bb1c 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunction.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionInitializer.h" @@ -26,6 +27,8 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" @@ -44,6 +47,11 @@ using namespace llvm; #define DEBUG_TYPE "codegen" +static cl::opt<unsigned> + AlignAllFunctions("align-all-functions", + cl::desc("Force the alignment of all functions."), + cl::init(0), cl::Hidden); + void MachineFunctionInitializer::anchor() {} //===----------------------------------------------------------------------===// @@ -79,12 +87,27 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn. + // FIXME: Use Function::optForSize(). if (!Fn->hasFnAttribute(Attribute::OptimizeForSize)) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); + if (AlignAllFunctions) + Alignment = AlignAllFunctions; + FunctionNumber = FunctionNum; JumpTableInfo = nullptr; + + if (isFuncletEHPersonality(classifyEHPersonality( + F->hasPersonalityFn() ? F->getPersonalityFn() : nullptr))) { + WinEHInfo = new (Allocator) WinEHFuncInfo(); + } + + assert(TM.isCompatibleDataLayout(getDataLayout()) && + "Can't create a MachineFunction using a Module with a " + "Target-incompatible DataLayout attached\n"); + + PSVManager = llvm::make_unique<PseudoSourceValueManager>(); } MachineFunction::~MachineFunction() { @@ -117,6 +140,11 @@ MachineFunction::~MachineFunction() { JumpTableInfo->~MachineJumpTableInfo(); Allocator.Deallocate(JumpTableInfo); } + + if (WinEHInfo) { + WinEHInfo->~WinEHFuncInfo(); + Allocator.Deallocate(WinEHInfo); + } } const DataLayout &MachineFunction::getDataLayout() const { @@ -149,7 +177,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { if (MBB == nullptr) MBBI = begin(); else - MBBI = MBB; + MBBI = MBB->getIterator(); // Figure out the block number this should have. 
unsigned BlockNo = 0; @@ -169,7 +197,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { if (MBBNumbering[BlockNo]) MBBNumbering[BlockNo]->setNumber(-1); - MBBNumbering[BlockNo] = MBBI; + MBBNumbering[BlockNo] = &*MBBI; MBBI->setNumber(BlockNo); } } @@ -322,6 +350,13 @@ MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin, return std::make_pair(Result, Result + Num); } +const char *MachineFunction::createExternalSymbolName(StringRef Name) { + char *Dest = Allocator.Allocate<char>(Name.size() + 1); + std::copy(Name.begin(), Name.end(), Dest); + Dest[Name.size()] = 0; + return Dest; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MachineFunction::dump() const { print(dbgs()); @@ -593,10 +628,9 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { BV.set(*CSR); // Saved CSRs are not pristine. - const std::vector<CalleeSavedInfo> &CSI = getCalleeSavedInfo(); - for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) - BV.reset(I->getReg()); + for (auto &I : getCalleeSavedInfo()) + for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) + BV.reset(*S); return BV; } @@ -801,42 +835,26 @@ Type *MachineConstantPoolEntry::getType() const { return Val.ConstVal->getType(); } - -unsigned MachineConstantPoolEntry::getRelocationInfo() const { +bool MachineConstantPoolEntry::needsRelocation() const { if (isMachineConstantPoolEntry()) - return Val.MachineCPVal->getRelocationInfo(); - return Val.ConstVal->getRelocationInfo(); + return true; + return Val.ConstVal->needsRelocation(); } SectionKind MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const { - SectionKind Kind; - switch (getRelocationInfo()) { + if (needsRelocation()) + return SectionKind::getReadOnlyWithRel(); + switch (DL->getTypeAllocSize(getType())) { + case 4: + return SectionKind::getMergeableConst4(); + case 8: + return SectionKind::getMergeableConst8(); + case 16: + return SectionKind::getMergeableConst16(); default: - llvm_unreachable("Unknown section kind"); - case Constant::GlobalRelocations: - Kind = SectionKind::getReadOnlyWithRel(); - break; - case Constant::LocalRelocation: - Kind = SectionKind::getReadOnlyWithRelLocal(); - break; - case Constant::NoRelocation: - switch (DL->getTypeAllocSize(getType())) { - case 4: - Kind = SectionKind::getMergeableConst4(); - break; - case 8: - Kind = SectionKind::getMergeableConst8(); - break; - case 16: - Kind = SectionKind::getMergeableConst16(); - break; - default: - Kind = SectionKind::getReadOnly(); - break; - } + return SectionKind::getReadOnly(); } - return Kind; } MachineConstantPool::~MachineConstantPool() { diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp index aaf06a7..05463fc 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp @@ -13,11 +13,14 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/StackProtector.h" @@ -49,13 +52,16 @@ 
void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const { // passes explicitly. This does not include setPreservesCFG, // because CodeGen overloads that to mean preserving the MachineBasicBlock // CFG in addition to the LLVM IR CFG. - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<BasicAAWrapperPass>(); AU.addPreserved<DominanceFrontier>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<IVUsers>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addPreserved<StackProtector>(); FunctionPass::getAnalysisUsage(AU); diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp index 790f5ac..4f424ff 100644 --- a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -31,7 +31,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { const std::string Banner; MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { } - MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) + MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) : MachineFunctionPass(ID), OS(os), Banner(banner) {} const char *getPassName() const override { return "MachineFunction Printer"; } @@ -42,6 +42,8 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { } bool runOnMachineFunction(MachineFunction &MF) override { + if (!llvm::isFunctionInPrintList(MF.getName())) + return false; OS << "# " << Banner << ":\n"; MF.print(OS, getAnalysisIfAvailable<SlotIndexes>()); return false; diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp index fdc4226..6dca74d 100644 --- a/contrib/llvm/lib/CodeGen/MachineInstr.cpp +++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -43,6 +44,11 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +static cl::opt<bool> PrintWholeRegMask( + "print-whole-regmask", + cl::desc("Print the full contents of regmask operands in IR dumps"), + cl::init(true), cl::Hidden); + //===----------------------------------------------------------------------===// // MachineOperand Implementation //===----------------------------------------------------------------------===// @@ -407,9 +413,26 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (getOffset()) OS << "+" << getOffset(); OS << '>'; break; - case MachineOperand::MO_RegisterMask: - OS << "<regmask>"; + case MachineOperand::MO_RegisterMask: { + unsigned NumRegsInMask = 0; + unsigned NumRegsEmitted = 0; + OS << "<regmask"; + for (unsigned i = 0; i < TRI->getNumRegs(); ++i) { + unsigned MaskWord = i / 32; + unsigned MaskBit = i % 32; + if (getRegMask()[MaskWord] & (1 << MaskBit)) { + if (PrintWholeRegMask || NumRegsEmitted <= 10) { + OS << " " << PrintReg(i, TRI); + NumRegsEmitted++; + } + NumRegsInMask++; + } + } + if (NumRegsEmitted != NumRegsInMask) + OS << " and " << (NumRegsInMask - NumRegsEmitted) << " more..."; + OS << ">"; break; + } case 
MachineOperand::MO_RegisterLiveOut: OS << "<regliveout>"; break; @@ -443,26 +466,28 @@ unsigned MachinePointerInfo::getAddrSpace() const { /// getConstantPool - Return a MachinePointerInfo record that refers to the /// constant pool. -MachinePointerInfo MachinePointerInfo::getConstantPool() { - return MachinePointerInfo(PseudoSourceValue::getConstantPool()); +MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) { + return MachinePointerInfo(MF.getPSVManager().getConstantPool()); } /// getFixedStack - Return a MachinePointerInfo record that refers to /// the specified FrameIndex. -MachinePointerInfo MachinePointerInfo::getFixedStack(int FI, int64_t offset) { - return MachinePointerInfo(PseudoSourceValue::getFixedStack(FI), offset); +MachinePointerInfo MachinePointerInfo::getFixedStack(MachineFunction &MF, + int FI, int64_t Offset) { + return MachinePointerInfo(MF.getPSVManager().getFixedStack(FI), Offset); } -MachinePointerInfo MachinePointerInfo::getJumpTable() { - return MachinePointerInfo(PseudoSourceValue::getJumpTable()); +MachinePointerInfo MachinePointerInfo::getJumpTable(MachineFunction &MF) { + return MachinePointerInfo(MF.getPSVManager().getJumpTable()); } -MachinePointerInfo MachinePointerInfo::getGOT() { - return MachinePointerInfo(PseudoSourceValue::getGOT()); +MachinePointerInfo MachinePointerInfo::getGOT(MachineFunction &MF) { + return MachinePointerInfo(MF.getPSVManager().getGOT()); } -MachinePointerInfo MachinePointerInfo::getStack(int64_t Offset) { - return MachinePointerInfo(PseudoSourceValue::getStack(), Offset); +MachinePointerInfo MachinePointerInfo::getStack(MachineFunction &MF, + int64_t Offset) { + return MachinePointerInfo(MF.getPSVManager().getStack(), Offset); } MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f, @@ -606,10 +631,12 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const { void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { if (MCID->ImplicitDefs) - for (const uint16_t *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; ++ImpDefs) + for (const MCPhysReg *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; + ++ImpDefs) addOperand(MF, MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID->ImplicitUses) - for (const uint16_t *ImpUses = MCID->getImplicitUses(); *ImpUses; ++ImpUses) + for (const MCPhysReg *ImpUses = MCID->getImplicitUses(); *ImpUses; + ++ImpUses) addOperand(MF, MachineOperand::CreateReg(*ImpUses, false, true)); } @@ -839,9 +866,60 @@ void MachineInstr::addMemOperand(MachineFunction &MF, setMemRefs(NewMemRefs, NewMemRefs + NewNum); } +/// Check to see if the MMOs pointed to by the two MemRefs arrays are +/// identical. +static bool hasIdenticalMMOs(const MachineInstr &MI1, const MachineInstr &MI2) { + auto I1 = MI1.memoperands_begin(), E1 = MI1.memoperands_end(); + auto I2 = MI2.memoperands_begin(), E2 = MI2.memoperands_end(); + if ((E1 - I1) != (E2 - I2)) + return false; + for (; I1 != E1; ++I1, ++I2) { + if (**I1 != **I2) + return false; + } + return true; +} + +std::pair<MachineInstr::mmo_iterator, unsigned> +MachineInstr::mergeMemRefsWith(const MachineInstr& Other) { + + // If either of the incoming memrefs is empty, we must be conservative and + // treat this as if we've exhausted our space for memrefs and dropped them. + if (memoperands_empty() || Other.memoperands_empty()) + return std::make_pair(nullptr, 0); + + // If both instructions have identical memrefs, we don't need to merge them.
+ // Since many instructions have a single memref, and we tend to merge things + // like pairs of loads from the same location, this catches a large number of + // cases in practice. + if (hasIdenticalMMOs(*this, Other)) + return std::make_pair(MemRefs, NumMemRefs); + + // TODO: consider uniquing elements within the operand lists to reduce + // space usage and fall back to conservative information less often. + size_t CombinedNumMemRefs = NumMemRefs + Other.NumMemRefs; + + // If we don't have enough room to store this many memrefs, be conservative + // and drop them. Otherwise, we'd fail asserts when trying to add them to + // the new instruction. + if (CombinedNumMemRefs != uint8_t(CombinedNumMemRefs)) + return std::make_pair(nullptr, 0); + + MachineFunction *MF = getParent()->getParent(); + mmo_iterator MemBegin = MF->allocateMemRefsArray(CombinedNumMemRefs); + mmo_iterator MemEnd = std::copy(memoperands_begin(), memoperands_end(), + MemBegin); + MemEnd = std::copy(Other.memoperands_begin(), Other.memoperands_end(), + MemEnd); + assert(MemEnd - MemBegin == (ptrdiff_t)CombinedNumMemRefs && + "missing memrefs"); + + return std::make_pair(MemBegin, CombinedNumMemRefs); +} + bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const { assert(!isBundledWithPred() && "Must be called on bundle header"); - for (MachineBasicBlock::const_instr_iterator MII = this;; ++MII) { + for (MachineBasicBlock::const_instr_iterator MII = getIterator();; ++MII) { if (MII->getDesc().getFlags() & Mask) { if (Type == AnyInBundle) return true; @@ -865,13 +943,13 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other, if (isBundle()) { // Both instructions are bundles, compare MIs inside the bundle. - MachineBasicBlock::const_instr_iterator I1 = *this; + MachineBasicBlock::const_instr_iterator I1 = getIterator(); MachineBasicBlock::const_instr_iterator E1 = getParent()->instr_end(); - MachineBasicBlock::const_instr_iterator I2 = *Other; + MachineBasicBlock::const_instr_iterator I2 = Other->getIterator(); MachineBasicBlock::const_instr_iterator E2= Other->getParent()->instr_end(); while (++I1 != E1 && I1->isInsideBundle()) { ++I2; - if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(I2, Check)) + if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(&*I2, Check)) return false; } } @@ -976,7 +1054,7 @@ unsigned MachineInstr::getNumExplicitOperands() const { void MachineInstr::bundleWithPred() { assert(!isBundledWithPred() && "MI is already bundled with its predecessor"); setFlag(BundledPred); - MachineBasicBlock::instr_iterator Pred = this; + MachineBasicBlock::instr_iterator Pred = getIterator(); --Pred; assert(!Pred->isBundledWithSucc() && "Inconsistent bundle flags"); Pred->setFlag(BundledSucc); @@ -985,7 +1063,7 @@ void MachineInstr::bundleWithPred() { void MachineInstr::bundleWithSucc() { assert(!isBundledWithSucc() && "MI is already bundled with its successor"); setFlag(BundledSucc); - MachineBasicBlock::instr_iterator Succ = this; + MachineBasicBlock::instr_iterator Succ = getIterator(); ++Succ; assert(!Succ->isBundledWithPred() && "Inconsistent bundle flags"); Succ->setFlag(BundledPred); @@ -994,7 +1072,7 @@ void MachineInstr::bundleWithSucc() { void MachineInstr::unbundleFromPred() { assert(isBundledWithPred() && "MI isn't bundled with its predecessor"); clearFlag(BundledPred); - MachineBasicBlock::instr_iterator Pred = this; + MachineBasicBlock::instr_iterator Pred = getIterator(); --Pred; assert(Pred->isBundledWithSucc() && "Inconsistent bundle flags"); 
Pred->clearFlag(BundledSucc); @@ -1003,7 +1081,7 @@ void MachineInstr::unbundleFromPred() { void MachineInstr::unbundleFromSucc() { assert(isBundledWithSucc() && "MI isn't bundled with its successor"); clearFlag(BundledSucc); - MachineBasicBlock::instr_iterator Succ = this; + MachineBasicBlock::instr_iterator Succ = getIterator(); ++Succ; assert(Succ->isBundledWithPred() && "Inconsistent bundle flags"); Succ->clearFlag(BundledPred); @@ -1139,7 +1217,7 @@ const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect( /// Return the number of instructions inside the MI bundle, not counting the /// header instruction. unsigned MachineInstr::getBundleSize() const { - MachineBasicBlock::const_instr_iterator I = this; + MachineBasicBlock::const_instr_iterator I = getIterator(); unsigned Size = 0; while (I->isBundledWithSucc()) ++Size, ++I; @@ -1501,6 +1579,10 @@ bool MachineInstr::hasUnmodeledSideEffects() const { return false; } +bool MachineInstr::isLoadFoldBarrier() const { + return mayStore() || isCall() || hasUnmodeledSideEffects(); +} + /// allDefsAreDead - Return true if all the defs of this instruction are dead. /// bool MachineInstr::allDefsAreDead() const { @@ -1615,7 +1697,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, FirstOp = false; } - for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) { const MachineOperand &MO = getOperand(i); @@ -1706,17 +1787,26 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, } bool HaveSemi = false; - const unsigned PrintableFlags = FrameSetup; + const unsigned PrintableFlags = FrameSetup | FrameDestroy; if (Flags & PrintableFlags) { - if (!HaveSemi) OS << ";"; HaveSemi = true; + if (!HaveSemi) { + OS << ";"; + HaveSemi = true; + } OS << " flags: "; if (Flags & FrameSetup) OS << "FrameSetup"; + + if (Flags & FrameDestroy) + OS << "FrameDestroy"; } if (!memoperands_empty()) { - if (!HaveSemi) OS << ";"; HaveSemi = true; + if (!HaveSemi) { + OS << ";"; + HaveSemi = true; + } OS << " mem:"; for (mmo_iterator i = memoperands_begin(), e = memoperands_end(); @@ -1729,7 +1819,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, // Print the regclass of any virtual registers encountered. if (MRI && !VirtRegs.empty()) { - if (!HaveSemi) OS << ";"; HaveSemi = true; + if (!HaveSemi) { + OS << ";"; + HaveSemi = true; + } for (unsigned i = 0; i != VirtRegs.size(); ++i) { const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]); OS << " " << TRI->getRegClassName(RC) @@ -1748,21 +1841,23 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, // Print debug location information. 
if (isDebugValue() && getOperand(e - 2).isMetadata()) { - if (!HaveSemi) OS << ";"; + if (!HaveSemi) + OS << ";"; auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata()); OS << " line no:" << DV->getLine(); if (auto *InlinedAt = debugLoc->getInlinedAt()) { DebugLoc InlinedAtDL(InlinedAt); if (InlinedAtDL && MF) { OS << " inlined @[ "; - InlinedAtDL.print(OS); + InlinedAtDL.print(OS); OS << " ]"; } } if (isIndirectDebugValue()) OS << " indirect"; } else if (debugLoc && MF) { - if (!HaveSemi) OS << ";"; + if (!HaveSemi) + OS << ";"; OS << " dbg:"; debugLoc.print(OS); } @@ -1902,11 +1997,11 @@ void MachineInstr::clearRegisterDeads(unsigned Reg) { } } -void MachineInstr::addRegisterDefReadUndef(unsigned Reg) { +void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) { for (MachineOperand &MO : operands()) { if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0) continue; - MO.setIsUndef(); + MO.setIsUndef(IsUndef); } } diff --git a/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp index cd820ee..4619daf 100644 --- a/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -293,15 +293,17 @@ MachineOperandIteratorBase::PhysRegInfo MachineOperandIteratorBase::analyzePhysReg(unsigned Reg, const TargetRegisterInfo *TRI) { bool AllDefsDead = true; - PhysRegInfo PRI = {false, false, false, false, false, false}; + PhysRegInfo PRI = {false, false, false, false, false, false, false}; assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "analyzePhysReg not given a physical register!"); for (; isValid(); ++*this) { MachineOperand &MO = deref(); - if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) - PRI.Clobbers = true; // Regmask clobbers Reg. + if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) { + PRI.Clobbered = true; + continue; + } if (!MO.isReg()) continue; @@ -310,33 +312,28 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg, if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg)) continue; - bool IsRegOrSuperReg = MOReg == Reg || TRI->isSubRegister(MOReg, Reg); - bool IsRegOrOverlapping = MOReg == Reg || TRI->regsOverlap(MOReg, Reg); - - if (IsRegOrSuperReg && MO.readsReg()) { - // Reg or a super-reg is read, and perhaps killed also. - PRI.Reads = true; - PRI.Kills = MO.isKill(); - } - - if (IsRegOrOverlapping && MO.readsReg()) { - PRI.ReadsOverlap = true;// Reg or an overlapping register is read. - } - - if (!MO.isDef()) + if (!TRI->regsOverlap(MOReg, Reg)) continue; - if (IsRegOrSuperReg) { - PRI.Defines = true; // Reg or a super-register is defined. + bool Covered = TRI->isSuperRegisterEq(Reg, MOReg); + if (MO.readsReg()) { + PRI.Read = true; + if (Covered) { + PRI.FullyRead = true; + if (MO.isKill()) + PRI.Killed = true; + } + } else if (MO.isDef()) { + PRI.Defined = true; + if (Covered) + PRI.FullyDefined = true; if (!MO.isDead()) AllDefsDead = false; } - if (IsRegOrOverlapping) - PRI.Clobbers = true; // Reg or an overlapping reg is defined. } - if (AllDefsDead && PRI.Defines) - PRI.DefinesDead = true; // Reg or super-register was defined and was dead. 
+ if (AllDefsDead && PRI.FullyDefined) + PRI.DeadDef = true; return PRI; } diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp index e9ea5ed..99a97d2 100644 --- a/contrib/llvm/lib/CodeGen/MachineLICM.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp @@ -138,7 +138,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); AU.addRequired<MachineDominatorTree>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<MachineLoopInfo>(); AU.addPreserved<MachineDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); @@ -153,7 +153,7 @@ namespace { } private: - /// CandidateInfo - Keep track of information about hoisting candidates. + /// Keep track of information about hoisting candidates. struct CandidateInfo { MachineInstr *MI; unsigned Def; @@ -162,149 +162,76 @@ namespace { : MI(mi), Def(def), FI(fi) {} }; - /// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop - /// invariants out to the preheader. void HoistRegionPostRA(); - /// HoistPostRA - When an instruction is found to only use loop invariant - /// operands that is safe to hoist, this instruction is called to do the - /// dirty work. void HoistPostRA(MachineInstr *MI, unsigned Def); - /// ProcessMI - Examine the instruction for potentai LICM candidate. Also - /// gather register def and frame object update information. - void ProcessMI(MachineInstr *MI, - BitVector &PhysRegDefs, - BitVector &PhysRegClobbers, - SmallSet<int, 32> &StoredFIs, + void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, + BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs, SmallVectorImpl<CandidateInfo> &Candidates); - /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the - /// current loop. void AddToLiveIns(unsigned Reg); - /// IsLICMCandidate - Returns true if the instruction may be a suitable - /// candidate for LICM. e.g. If the instruction is a call, then it's - /// obviously not safe to hoist it. bool IsLICMCandidate(MachineInstr &I); - /// IsLoopInvariantInst - Returns true if the instruction is loop - /// invariant. I.e., all virtual register operands are defined outside of - /// the loop, physical registers aren't accessed (explicitly or implicitly), - /// and the instruction is hoistable. - /// bool IsLoopInvariantInst(MachineInstr &I); - /// HasLoopPHIUse - Return true if the specified instruction is used by any - /// phi node in the current loop. bool HasLoopPHIUse(const MachineInstr *MI) const; - /// HasHighOperandLatency - Compute operand latency between a def of 'Reg' - /// and an use in the current loop, return true if the target considered - /// it 'high'. bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const; bool IsCheapInstruction(MachineInstr &MI) const; - /// CanCauseHighRegPressure - Visit BBs from header to current BB, - /// check if hoisting an instruction of the given cost matrix can cause high - /// register pressure. bool CanCauseHighRegPressure(const DenseMap<unsigned, int> &Cost, bool Cheap); - /// UpdateBackTraceRegPressure - Traverse the back trace from header to - /// the current block and update their register pressures to reflect the - /// effect of hoisting MI from the current block to the preheader. void UpdateBackTraceRegPressure(const MachineInstr *MI); - /// IsProfitableToHoist - Return true if it is potentially profitable to - /// hoist the given loop invariant. 
bool IsProfitableToHoist(MachineInstr &MI); - /// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute. - /// If not then a load from this mbb may not be safe to hoist. bool IsGuaranteedToExecute(MachineBasicBlock *BB); void EnterScope(MachineBasicBlock *MBB); void ExitScope(MachineBasicBlock *MBB); - /// ExitScopeIfDone - Destroy scope for the MBB that corresponds to given - /// dominator tree node if its a leaf or all of its children are done. Walk - /// up the dominator tree to destroy ancestors which are now done. - void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap); - - /// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all - /// blocks dominated by the specified header block, and that are in the - /// current loop) in depth first order w.r.t the DominatorTree. This allows - /// us to visit definitions before uses, allowing us to hoist a loop body in - /// one pass without iteration. - /// + void ExitScopeIfDone( + MachineDomTreeNode *Node, + DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren, + DenseMap<MachineDomTreeNode *, MachineDomTreeNode *> &ParentMap); + void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode); + void HoistRegion(MachineDomTreeNode *N, bool IsHeader); - /// SinkIntoLoop - Sink instructions into loops if profitable. This - /// especially tries to prevent register spills caused by register pressure - /// if there is little to no overhead moving instructions into loops. void SinkIntoLoop(); - /// InitRegPressure - Find all virtual register references that are liveout - /// of the preheader to initialize the starting "register pressure". Note - /// this does not count live through (livein but not used) registers. void InitRegPressure(MachineBasicBlock *BB); - /// calcRegisterCost - Calculate the additional register pressure that the - /// registers used in MI cause. - /// - /// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to - /// figure out which usages are live-ins. - /// FIXME: Figure out a way to consider 'RegSeen' from all code paths. DenseMap<unsigned, int> calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef); - /// UpdateRegPressure - Update estimate of register pressure after the - /// specified instruction. void UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef = false); - /// ExtractHoistableLoad - Unfold a load from the given machineinstr if - /// the load itself could be hoisted. Return the unfolded and hoistable - /// load, or null if the load couldn't be unfolded or if it wouldn't - /// be hoistable. MachineInstr *ExtractHoistableLoad(MachineInstr *MI); - /// LookForDuplicate - Find an instruction amount PrevMIs that is a - /// duplicate of MI. Return this instruction if it's found. - const MachineInstr *LookForDuplicate(const MachineInstr *MI, - std::vector<const MachineInstr*> &PrevMIs); + const MachineInstr * + LookForDuplicate(const MachineInstr *MI, + std::vector<const MachineInstr *> &PrevMIs); - /// EliminateCSE - Given a LICM'ed instruction, look for an instruction on - /// the preheader that compute the same value. If it's found, do a RAU on - /// with the definition of the existing instruction rather than hoisting - /// the instruction to the preheader. 
- bool EliminateCSE(MachineInstr *MI, - DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI); + bool EliminateCSE( + MachineInstr *MI, + DenseMap<unsigned, std::vector<const MachineInstr *>>::iterator &CI); - /// MayCSE - Return true if the given instruction will be CSE'd if it's - /// hoisted out of the loop. bool MayCSE(MachineInstr *MI); - /// Hoist - When an instruction is found to only use loop invariant operands - /// that is safe to hoist, this instruction is called to do the dirty work. - /// It returns true if the instruction is hoisted. bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader); - /// InitCSEMap - Initialize the CSE map with instructions that are in the - /// current loop preheader that may become duplicates of instructions that - /// are hoisted out of the loop. void InitCSEMap(MachineBasicBlock *BB); - /// getCurPreheader - Get the preheader for the current loop, splitting - /// a critical edge if needed. MachineBasicBlock *getCurPreheader(); }; } // end anonymous namespace @@ -315,12 +242,11 @@ INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm", "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineLICM, "machinelicm", "Machine Loop Invariant Code Motion", false, false) -/// LoopIsOuterMostWithPredecessor - Test if the given loop is the outer-most -/// loop that has a unique predecessor. +/// Test if the given loop is the outer-most loop that has a unique predecessor. static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { // Check whether this loop even has a unique predecessor. if (!CurLoop->getLoopPredecessor()) @@ -367,7 +293,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { // Get our Loop information... MLI = &getAnalysis<MachineLoopInfo>(); DT = &getAnalysis<MachineDominatorTree>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end()); while (!Worklist.empty()) { @@ -402,15 +328,17 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { return Changed; } -/// InstructionStoresToFI - Return true if instruction stores to the -/// specified frame. +/// Return true if instruction stores to the specified frame. static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { - for (MachineInstr::mmo_iterator o = MI->memoperands_begin(), - oe = MI->memoperands_end(); o != oe; ++o) { - if (!(*o)->isStore() || !(*o)->getPseudoValue()) + // If we lost memory operands, conservatively assume that the instruction + // writes to all slots. + if (MI->memoperands_empty()) + return true; + for (const MachineMemOperand *MemOp : MI->memoperands()) { + if (!MemOp->isStore() || !MemOp->getPseudoValue()) continue; if (const FixedStackPseudoSourceValue *Value = - dyn_cast<FixedStackPseudoSourceValue>((*o)->getPseudoValue())) { + dyn_cast<FixedStackPseudoSourceValue>(MemOp->getPseudoValue())) { if (Value->getFrameIndex() == FI) return true; } @@ -418,7 +346,7 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { return false; } -/// ProcessMI - Examine the instruction for potentai LICM candidate. Also +/// Examine the instruction for a potential LICM candidate. Also /// gather register def and frame object update information.
void MachineLICM::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, @@ -428,8 +356,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI, bool RuledOut = false; bool HasNonInvariantUse = false; unsigned Def = 0; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isFI()) { // Remember if the instruction stores to the frame index. int FI = MO.getIndex(); @@ -506,8 +433,8 @@ void MachineLICM::ProcessMI(MachineInstr *MI, } } -/// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop -/// invariants out to the preheader. +/// Walk the specified region of the CFG and hoist loop invariants out to the +/// preheader. void MachineLICM::HoistRegionPostRA() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) @@ -523,38 +450,30 @@ void MachineLICM::HoistRegionPostRA() { // Walk the entire region, count number of defs for each register, and // collect potential LICM candidates. const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks(); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - MachineBasicBlock *BB = Blocks[i]; - + for (MachineBasicBlock *BB : Blocks) { // If the header of the loop containing this basic block is a landing pad, // then don't try to hoist instructions out of this loop. const MachineLoop *ML = MLI->getLoopFor(BB); - if (ML && ML->getHeader()->isLandingPad()) continue; + if (ML && ML->getHeader()->isEHPad()) continue; // Conservatively treat live-in's as an external def. // FIXME: That means a reload that're reused in successor block(s) will not // be LICM'ed. - for (MachineBasicBlock::livein_iterator I = BB->livein_begin(), - E = BB->livein_end(); I != E; ++I) { - unsigned Reg = *I; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + for (const auto &LI : BB->liveins()) { + for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) PhysRegDefs.set(*AI); } SpeculationState = SpeculateUnknown; - for (MachineBasicBlock::iterator - MII = BB->begin(), E = BB->end(); MII != E; ++MII) { - MachineInstr *MI = &*MII; - ProcessMI(MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); - } + for (MachineInstr &MI : *BB) + ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); } // Gather the registers read / clobbered by the terminator. BitVector TermRegs(NumRegs); MachineBasicBlock::iterator TI = Preheader->getFirstTerminator(); if (TI != Preheader->end()) { - for (unsigned i = 0, e = TI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = TI->getOperand(i); + for (const MachineOperand &MO : TI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); @@ -573,17 +492,16 @@ void MachineLICM::HoistRegionPostRA() { // 3. Make sure candidate def should not clobber // registers read by the terminator. Similarly its def should not be // clobbered by the terminator. 
- for (unsigned i = 0, e = Candidates.size(); i != e; ++i) { - if (Candidates[i].FI != INT_MIN && - StoredFIs.count(Candidates[i].FI)) + for (CandidateInfo &Candidate : Candidates) { + if (Candidate.FI != INT_MIN && + StoredFIs.count(Candidate.FI)) continue; - unsigned Def = Candidates[i].Def; + unsigned Def = Candidate.Def; if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) { bool Safe = true; - MachineInstr *MI = Candidates[i].MI; - for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) { - const MachineOperand &MO = MI->getOperand(j); + MachineInstr *MI = Candidate.MI; + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef() || !MO.getReg()) continue; unsigned Reg = MO.getReg(); @@ -596,24 +514,20 @@ } } if (Safe) - HoistPostRA(MI, Candidates[i].Def); + HoistPostRA(MI, Candidate.Def); } } } -/// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current -/// loop, and make sure it is not killed by any instructions in the loop. +/// Add register 'Reg' to the livein sets of BBs in the current loop, and make +/// sure it is not killed by any instructions in the loop. void MachineLICM::AddToLiveIns(unsigned Reg) { const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks(); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - MachineBasicBlock *BB = Blocks[i]; + for (MachineBasicBlock *BB : Blocks) { if (!BB->isLiveIn(Reg)) BB->addLiveIn(Reg); - for (MachineBasicBlock::iterator - MII = BB->begin(), E = BB->end(); MII != E; ++MII) { - MachineInstr *MI = &*MII; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineInstr &MI : *BB) { + for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue; if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg())) MO.setIsKill(false); @@ -622,9 +536,8 @@ } } } -/// HoistPostRA - When an instruction is found to only use loop invariant -/// operands that is safe to hoist, this instruction is called to do the -/// dirty work. +/// When an instruction is found to only use loop invariant operands that are +/// safe to hoist, this instruction is called to do the dirty work. void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { MachineBasicBlock *Preheader = getCurPreheader(); @@ -646,8 +559,8 @@ Changed = true; } -// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute. -// If not then a load from this mbb may not be safe to hoist. +/// Check if this mbb is guaranteed to execute. If not, then a load from this mbb +/// may not be safe to hoist. bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) { if (SpeculationState != SpeculateUnknown) return SpeculationState == SpeculateFalse; @@ -656,8 +569,8 @@ // Check loop exiting blocks.
SmallVector<MachineBasicBlock*, 8> CurrentLoopExitingBlocks; CurLoop->getExitingBlocks(CurrentLoopExitingBlocks); - for (unsigned i = 0, e = CurrentLoopExitingBlocks.size(); i != e; ++i) - if (!DT->dominates(BB, CurrentLoopExitingBlocks[i])) { + for (MachineBasicBlock *CurrentLoopExitingBlock : CurrentLoopExitingBlocks) + if (!DT->dominates(BB, CurrentLoopExitingBlock)) { SpeculationState = SpeculateTrue; return false; } @@ -679,9 +592,9 @@ void MachineLICM::ExitScope(MachineBasicBlock *MBB) { BackTrace.pop_back(); } -/// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given -/// dominator tree node if its a leaf or all of its children are done. Walk -/// up the dominator tree to destroy ancestors which are now done. +/// Destroy scope for the MBB that corresponds to the given dominator tree node +/// if it's a leaf or all of its children are done. Walk up the dominator tree to +/// destroy ancestors which are now done. void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node, DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) { @@ -701,11 +614,10 @@ void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node, } } -/// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all -/// blocks dominated by the specified header block, and that are in the -/// current loop) in depth first order w.r.t the DominatorTree. This allows -/// us to visit definitions before uses, allowing us to hoist a loop body in -/// one pass without iteration. +/// Walk the specified loop in the CFG (defined by all blocks dominated by the +/// specified header block, and that are in the current loop) in depth first +/// order w.r.t. the DominatorTree. This allows us to visit definitions before +/// uses, allowing us to hoist a loop body in one pass without iteration. /// void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { MachineBasicBlock *Preheader = getCurPreheader(); @@ -727,7 +639,7 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { // If the header of the loop containing this basic block is a landing pad, // then don't try to hoist instructions out of this loop. const MachineLoop *ML = MLI->getLoopFor(BB); - if (ML && ML->getHeader()->isLandingPad()) + if (ML && ML->getHeader()->isEHPad()) continue; // If this subregion is not in the top level loop at all, exit. @@ -764,8 +676,7 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { InitRegPressure(Preheader); // Now perform LICM. - for (unsigned i = 0, e = Scopes.size(); i != e; ++i) { - MachineDomTreeNode *Node = Scopes[i]; + for (MachineDomTreeNode *Node : Scopes) { MachineBasicBlock *MBB = Node->getBlock(); EnterScope(MBB); @@ -786,6 +697,9 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { } } +/// Sink instructions into loops if profitable. This especially tries to prevent +/// register spills caused by register pressure if there is little to no +/// overhead moving instructions into loops. void MachineLICM::SinkIntoLoop() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) @@ -796,8 +710,8 @@ void MachineLICM::SinkIntoLoop() { I != Preheader->instr_end(); ++I) { // We need to ensure that we can safely move this instruction into the loop. // As such, it must not have side-effects, e.g. such as a call has.
- if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(I)) - Candidates.push_back(I); + if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I)) + Candidates.push_back(&*I); } for (MachineInstr *I : Candidates) { @@ -837,9 +751,9 @@ static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) { return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg()); } -/// InitRegPressure - Find all virtual register references that are liveout of -/// the preheader to initialize the starting "register pressure". Note this -/// does not count live through (livein but not used) registers. +/// Find all virtual register references that are liveout of the preheader to +/// initialize the starting "register pressure". Note this does not count live +/// through (livein but not used) registers. void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { std::fill(RegPressure.begin(), RegPressure.end(), 0); @@ -858,8 +772,7 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { UpdateRegPressure(&MI, /*ConsiderUnseenAsDef=*/true); } -/// UpdateRegPressure - Update estimate of register pressure after the -/// specified instruction. +/// Update estimate of register pressure after the specified instruction. void MachineLICM::UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef) { auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef); @@ -872,6 +785,12 @@ void MachineLICM::UpdateRegPressure(const MachineInstr *MI, } } +/// Calculate the additional register pressure that the registers used in MI +/// cause. +/// +/// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to +/// figure out which usages are live-ins. +/// FIXME: Figure out a way to consider 'RegSeen' from all code paths. DenseMap<unsigned, int> MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef) { @@ -915,23 +834,26 @@ MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, return Cost; } -/// isLoadFromGOTOrConstantPool - Return true if this machine instruction -/// loads from global offset table or constant pool. -static bool isLoadFromGOTOrConstantPool(MachineInstr &MI) { +/// Return true if this machine instruction loads from the global offset table +/// or the constant pool. +static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) { assert (MI.mayLoad() && "Expected MI that loads!"); - for (MachineInstr::mmo_iterator I = MI.memoperands_begin(), - E = MI.memoperands_end(); I != E; ++I) { - if (const PseudoSourceValue *PSV = (*I)->getPseudoValue()) { - if (PSV == PSV->getGOT() || PSV == PSV->getConstantPool()) + + // If we lost memory operands, conservatively assume that the instruction + // reads from everything. + if (MI.memoperands_empty()) + return true; + + for (MachineMemOperand *MemOp : MI.memoperands()) + if (const PseudoSourceValue *PSV = MemOp->getPseudoValue()) + if (PSV->isGOT() || PSV->isConstantPool()) return true; - } - } + return false; } -/// IsLICMCandidate - Returns true if the instruction may be a suitable -/// candidate for LICM. e.g. If the instruction is a call, then it's obviously -/// not safe to hoist it. +/// Returns true if the instruction may be a suitable candidate for LICM. +/// E.g., if the instruction is a call, then it's obviously not safe to hoist it. bool MachineLICM::IsLICMCandidate(MachineInstr &I) { // Check if it's safe to move the instruction.
bool DontMoveAcrossStore = true; @@ -944,16 +866,16 @@ bool MachineLICM::IsLICMCandidate(MachineInstr &I) { // from constant memory are not safe to speculate all the time, for example // indexed load from a jump table. // Stores and side effects are already checked by isSafeToMove. - if (I.mayLoad() && !isLoadFromGOTOrConstantPool(I) && + if (I.mayLoad() && !mayLoadFromGOTOrConstantPool(I) && !IsGuaranteedToExecute(I.getParent())) return false; return true; } -/// IsLoopInvariantInst - Returns true if the instruction is loop -/// invariant. I.e., all virtual register operands are defined outside of the -/// loop, physical registers aren't accessed explicitly, and there are no side +/// Returns true if the instruction is loop invariant. +/// I.e., all virtual register operands are defined outside of the loop, +/// physical registers aren't accessed explicitly, and there are no side /// effects that aren't captured by the operands or other flags. /// bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { @@ -961,9 +883,7 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { return false; // The instruction is loop invariant if all of its operands are. - for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = I.getOperand(i); - + for (const MachineOperand &MO : I.operands()) { if (!MO.isReg()) continue; @@ -1007,8 +927,8 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { } -/// HasLoopPHIUse - Return true if the specified instruction is used by a -/// phi node and hoisting it could cause a copy to be inserted. +/// Return true if the specified instruction is used by a phi node and hoisting +/// it could cause a copy to be inserted. bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { SmallVector<const MachineInstr*, 8> Work(1, MI); do { @@ -1042,9 +962,8 @@ bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { return false; } -/// HasHighOperandLatency - Compute operand latency between a def of 'Reg' -/// and an use in the current loop, return true if the target considered -/// it 'high'. +/// Compute operand latency between a def of 'Reg' and a use in the current +/// loop; return true if the target considers it high. bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const { if (MRI->use_nodbg_empty(Reg)) @@ -1074,8 +993,8 @@ bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, return false; } -/// IsCheapInstruction - Return true if the instruction is marked "cheap" or -/// the operand latency between its def and a use is one or less. +/// Return true if the instruction is marked "cheap" or the operand latency +/// between its def and a use is one or less. bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { if (TII->isAsCheapAsAMove(&MI) || MI.isCopyLike()) return true; @@ -1099,9 +1018,8 @@ bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { return isCheap; } -/// CanCauseHighRegPressure - Visit BBs from header to current BB, check -/// if hoisting an instruction of the given cost matrix can cause high -/// register pressure. +/// Visit BBs from header to current BB; check if hoisting an instruction with +/// the given cost matrix can cause high register pressure.
bool MachineLICM::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost, bool CheapInstr) { for (const auto &RPIdAndCost : Cost) { @@ -1124,9 +1042,9 @@ bool MachineLICM::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost, return false; } -/// UpdateBackTraceRegPressure - Traverse the back trace from header to the -/// current block and update their register pressures to reflect the effect -/// of hoisting MI from the current block to the preheader. +/// Traverse the back trace from header to the current block and update their +/// register pressures to reflect the effect of hoisting MI from the current +/// block to the preheader. void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) { // First compute the 'cost' of the instruction, i.e. its contribution // to register pressure. @@ -1139,8 +1057,8 @@ void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) { RP[RPIdAndCost.first] += RPIdAndCost.second; } -/// IsProfitableToHoist - Return true if it is potentially profitable to hoist -/// the given loop invariant. +/// Return true if it is potentially profitable to hoist the given loop +/// invariant. bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { if (MI.isImplicitDef()) return true; @@ -1230,6 +1148,9 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { return true; } +/// Unfold a load from the given machine instruction if the load itself could be +/// hoisted. Return the unfolded and hoistable load, or null if the load +/// couldn't be unfolded or if it wouldn't be hoistable. MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { // Don't unfold simple loads. if (MI->canFoldAsLoad()) @@ -1287,25 +1208,30 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { return NewMIs[0]; } +/// Initialize the CSE map with instructions that are in the current loop +/// preheader that may become duplicates of instructions that are hoisted +/// out of the loop. void MachineLICM::InitCSEMap(MachineBasicBlock *BB) { - for (MachineBasicBlock::iterator I = BB->begin(),E = BB->end(); I != E; ++I) { - const MachineInstr *MI = &*I; - unsigned Opcode = MI->getOpcode(); - CSEMap[Opcode].push_back(MI); - } + for (MachineInstr &MI : *BB) + CSEMap[MI.getOpcode()].push_back(&MI); } +/// Find an instruction among PrevMIs that is a duplicate of MI. +/// Return this instruction if it's found. const MachineInstr* MachineLICM::LookForDuplicate(const MachineInstr *MI, std::vector<const MachineInstr*> &PrevMIs) { - for (unsigned i = 0, e = PrevMIs.size(); i != e; ++i) { - const MachineInstr *PrevMI = PrevMIs[i]; + for (const MachineInstr *PrevMI : PrevMIs) if (TII->produceSameValue(MI, PrevMI, (PreRegAlloc ? MRI : nullptr))) return PrevMI; - } + return nullptr; } +/// Given a LICM'ed instruction, look for an instruction in the preheader that +/// computes the same value. If one is found, replace all uses of MI with the +/// definition of the existing instruction (RAUW) rather than hoisting the +/// instruction to the preheader.
bool MachineLICM::EliminateCSE(MachineInstr *MI, DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI) { // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate @@ -1348,8 +1274,7 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI, } } - for (unsigned i = 0, e = Defs.size(); i != e; ++i) { - unsigned Idx = Defs[i]; + for (unsigned Idx : Defs) { unsigned Reg = MI->getOperand(Idx).getReg(); unsigned DupReg = Dup->getOperand(Idx).getReg(); MRI->replaceRegWith(Reg, DupReg); @@ -1363,8 +1288,8 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI, return false; } -/// MayCSE - Return true if the given instruction will be CSE'd if it's -/// hoisted out of the loop. +/// Return true if the given instruction will be CSE'd if it's hoisted out of +/// the loop. bool MachineLICM::MayCSE(MachineInstr *MI) { unsigned Opcode = MI->getOpcode(); DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator @@ -1377,9 +1302,9 @@ bool MachineLICM::MayCSE(MachineInstr *MI) { return LookForDuplicate(MI, CI->second) != nullptr; } -/// Hoist - When an instruction is found to use only loop invariant operands +/// When an instruction is found to use only loop invariant operands /// that are safe to hoist, this instruction is called to do the dirty work. -/// +/// It returns true if the instruction is hoisted. bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // First check whether we should hoist this instruction. if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) { @@ -1422,11 +1347,9 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // Clear the kill flags of any register this instruction defines, // since they may need to be live throughout the entire loop // rather than just live for part of it. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) if (MO.isReg() && MO.isDef() && !MO.isDead()) MRI->clearKillFlags(MO.getReg()); - } // Add to the CSE map. if (CI != CSEMap.end()) @@ -1441,6 +1364,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { return true; } +/// Get the preheader for the current loop, splitting a critical edge if needed. MachineBasicBlock *MachineLICM::getCurPreheader() { // Determine the block to which to hoist instructions. If we can't find a // suitable loop predecessor, we can't do any hoisting. 
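The MachineLICM hunks above show the recipe this change applies to every CodeGen pass that used the old AliasAnalysis analysis group: require AAResultsWrapperPass instead, and pull the aggregated AAResults out of the wrapper. A minimal sketch of that recipe for a hypothetical legacy pass (MyMachinePass is invented for illustration and is not part of this change):

#include "llvm/Analysis/AliasAnalysis.h"      // AAResults, AAResultsWrapperPass
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

namespace {
// Hypothetical pass showing the post-refactor alias-analysis plumbing.
struct MyMachinePass : public MachineFunctionPass {
  static char ID;
  AAResults *AA = nullptr;

  MyMachinePass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Previously: AU.addRequired<AliasAnalysis>();
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    // Previously: AA = &getAnalysis<AliasAnalysis>();
    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    return false; // The sketch makes no changes.
  }
};
} // end anonymous namespace

char MyMachinePass::ID = 0;

Pass registration moves the same way, as the hunks above show: INITIALIZE_AG_DEPENDENCY(AliasAnalysis) becomes INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass), because the wrapper is an ordinary pass rather than an analysis group.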
diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp index ce6abdd..2f5c9e0 100644 --- a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -37,7 +37,7 @@ char &llvm::MachineLoopInfoID = MachineLoopInfo::ID; bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) { releaseMemory(); - LI.Analyze(getAnalysis<MachineDominatorTree>().getBase()); + LI.analyze(getAnalysis<MachineDominatorTree>().getBase()); return false; } @@ -51,11 +51,11 @@ MachineBasicBlock *MachineLoop::getTopBlock() { MachineBasicBlock *TopMBB = getHeader(); MachineFunction::iterator Begin = TopMBB->getParent()->begin(); if (TopMBB != Begin) { - MachineBasicBlock *PriorMBB = std::prev(MachineFunction::iterator(TopMBB)); + MachineBasicBlock *PriorMBB = &*std::prev(TopMBB->getIterator()); while (contains(PriorMBB)) { TopMBB = PriorMBB; if (TopMBB == Begin) break; - PriorMBB = std::prev(MachineFunction::iterator(TopMBB)); + PriorMBB = &*std::prev(TopMBB->getIterator()); } } return TopMBB; @@ -65,11 +65,12 @@ MachineBasicBlock *MachineLoop::getBottomBlock() { MachineBasicBlock *BotMBB = getHeader(); MachineFunction::iterator End = BotMBB->getParent()->end(); if (BotMBB != std::prev(End)) { - MachineBasicBlock *NextMBB = std::next(MachineFunction::iterator(BotMBB)); + MachineBasicBlock *NextMBB = &*std::next(BotMBB->getIterator()); while (contains(NextMBB)) { BotMBB = NextMBB; - if (BotMBB == std::next(MachineFunction::iterator(BotMBB))) break; - NextMBB = std::next(MachineFunction::iterator(BotMBB)); + if (BotMBB == &*std::next(BotMBB->getIterator())) + break; + NextMBB = &*std::next(BotMBB->getIterator()); } } return BotMBB; diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp index 6a20624..1956a70 100644 --- a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -9,12 +9,12 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/ADT/PointerUnion.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" @@ -35,7 +35,7 @@ char MachineModuleInfo::ID = 0; MachineModuleInfoImpl::~MachineModuleInfoImpl() {} namespace llvm { -class MMIAddrLabelMapCallbackPtr : CallbackVH { +class MMIAddrLabelMapCallbackPtr final : CallbackVH { MMIAddrLabelMap *Map; public: MMIAddrLabelMapCallbackPtr() : Map(nullptr) {} @@ -209,9 +209,8 @@ bool MachineModuleInfo::doInitialization(Module &M) { CurCallSite = 0; CallsEHReturn = false; CallsUnwindInit = false; + HasEHFunclets = false; DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false; - // Always emit some info, by default "no personality" info. 
- Personalities.push_back(nullptr); PersonalityTypeCache = EHPersonality::Unknown; AddrLabelSymbols = nullptr; TheModule = nullptr; @@ -249,6 +248,7 @@ void MachineModuleInfo::EndFunction() { FilterEnds.clear(); CallsEHReturn = false; CallsUnwindInit = false; + HasEHFunclets = false; VariableDbgInfos.clear(); } @@ -314,32 +314,11 @@ MCSymbol *MachineModuleInfo::addLandingPad(MachineBasicBlock *LandingPad) { return LandingPadLabel; } -/// addPersonality - Provide the personality function for the exception -/// information. -void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad, - const Function *Personality) { - LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - LP.Personality = Personality; - addPersonality(Personality); -} - void MachineModuleInfo::addPersonality(const Function *Personality) { for (unsigned i = 0; i < Personalities.size(); ++i) if (Personalities[i] == Personality) return; - - // If this is the first personality we're adding go - // ahead and add it at the beginning. - if (!Personalities[0]) - Personalities[0] = Personality; - else - Personalities.push_back(Personality); -} - -void MachineModuleInfo::addWinEHState(MachineBasicBlock *LandingPad, - int State) { - LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - LP.WinEHState = State; + Personalities.push_back(Personality); } /// addCatchTypeInfo - Provide the catch typeinfo for a landing pad. @@ -481,56 +460,3 @@ try_next:; FilterIds.push_back(0); // terminator return FilterID; } - -/// getPersonality - Return the personality function for the current function. -const Function *MachineModuleInfo::getPersonality() const { - for (const LandingPadInfo &LPI : LandingPads) - if (LPI.Personality) - return LPI.Personality; - return nullptr; -} - -EHPersonality MachineModuleInfo::getPersonalityType() { - if (PersonalityTypeCache == EHPersonality::Unknown) { - if (const Function *F = getPersonality()) - PersonalityTypeCache = classifyEHPersonality(F); - } - return PersonalityTypeCache; -} - -/// getPersonalityIndex - Return unique index for current personality -/// function. NULL/first personality function should always get zero index. -unsigned MachineModuleInfo::getPersonalityIndex() const { - const Function* Personality = nullptr; - - // Scan landing pads. If there is at least one non-NULL personality - use it. - for (unsigned i = 0, e = LandingPads.size(); i != e; ++i) - if (LandingPads[i].Personality) { - Personality = LandingPads[i].Personality; - break; - } - - for (unsigned i = 0, e = Personalities.size(); i < e; ++i) { - if (Personalities[i] == Personality) - return i; - } - - // This will happen if the current personality function is - // in the zero index. 
- return 0; -} - -const Function *MachineModuleInfo::getWinEHParent(const Function *F) const { - StringRef WinEHParentName = - F->getFnAttribute("wineh-parent").getValueAsString(); - if (WinEHParentName.empty() || WinEHParentName == F->getName()) - return F; - return F->getParent()->getFunction(WinEHParentName); -} - -WinEHFuncInfo &MachineModuleInfo::getWinEHFuncInfo(const Function *F) { - auto &Ptr = FuncInfoMap[getWinEHParent(F)]; - if (!Ptr) - Ptr.reset(new WinEHFuncInfo); - return *Ptr; -} diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp index e883ce5..03c82f4 100644 --- a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -27,13 +27,11 @@ void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF) : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true), TracksSubRegLiveness(false) { + unsigned NumRegs = getTargetRegisterInfo()->getNumRegs(); VRegInfo.reserve(256); RegAllocHints.reserve(256); - UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits()); - UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs()); - - // Create the physreg use/def lists. - PhysRegUseDefLists.resize(getTargetRegisterInfo()->getNumRegs(), nullptr); + UsedPhysRegMask.resize(NumRegs); + PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]()); } /// setRegClass - Set the register class of the specified virtual register. @@ -117,6 +115,8 @@ void MachineRegisterInfo::clearVirtRegs() { } #endif VRegInfo.clear(); + for (auto &I : LiveIns) + I.second = 0; } void MachineRegisterInfo::verifyUseList(unsigned Reg) const { @@ -394,8 +394,7 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB, } } -unsigned MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const -{ +LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const { // Lane masks are only defined for vregs. 
assert(TargetRegisterInfo::isVirtualRegister(Reg)); const TargetRegisterClass &TRC = *getRegClass(Reg); @@ -468,11 +467,8 @@ static bool isNoReturnDef(const MachineOperand &MO) { if (MF.getFunction()->hasFnAttribute(Attribute::UWTable)) return false; const Function *Called = getCalledFunction(MI); - if (Called == nullptr || !Called->hasFnAttribute(Attribute::NoReturn) - || !Called->hasFnAttribute(Attribute::NoUnwind)) - return false; - - return true; + return !(Called == nullptr || !Called->hasFnAttribute(Attribute::NoReturn) || + !Called->hasFnAttribute(Attribute::NoUnwind)); } bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg) const { @@ -488,3 +484,15 @@ } return false; } + +bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const { + if (UsedPhysRegMask.test(PhysReg)) + return true; + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + for (MCRegAliasIterator AliasReg(PhysReg, TRI, true); AliasReg.isValid(); + ++AliasReg) { + if (!reg_nodbg_empty(*AliasReg)) + return true; + } + return false; +} diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp index a48e54c..bcee15c 100644 --- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp @@ -49,6 +49,11 @@ DumpCriticalPathLength("misched-dcpl", cl::Hidden, static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, cl::desc("Pop up a window to show MISched dags after they are processed")); +/// In some situations a few uninteresting nodes depend on nearly all other +/// nodes in the graph; provide a cutoff to hide them. +static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden, + cl::desc("Hide nodes with more predecessors/successors than the cutoff")); + static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, cl::desc("Stop scheduling after N instructions"), cl::init(~0U)); @@ -106,7 +111,7 @@ public: void print(raw_ostream &O, const Module* = nullptr) const override; protected: - void scheduleRegions(ScheduleDAGInstrs &Scheduler); + void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags); }; /// MachineScheduler runs after coalescing and before register allocation. @@ -146,7 +151,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID; INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", "Machine Instruction Scheduler", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", @@ -161,7 +166,7 @@ void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequiredID(MachineDominatorsID); AU.addRequired<MachineLoopInfo>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetPassConfig>(); AU.addRequired<SlotIndexes>(); AU.addPreserved<SlotIndexes>(); @@ -315,14 +320,14 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { } else if (!mf.getSubtarget().enableMachineScheduler()) return false; - DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs())); + DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs())); // Initialize the context of the pass.
MF = &mf; MLI = &getAnalysis<MachineLoopInfo>(); MDT = &getAnalysis<MachineDominatorTree>(); PassConfig = &getAnalysis<TargetPassConfig>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LIS = &getAnalysis<LiveIntervals>(); @@ -335,7 +340,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler()); - scheduleRegions(*Scheduler); + scheduleRegions(*Scheduler, false); DEBUG(LIS->dump()); if (VerifyScheduling) @@ -363,7 +368,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler()); - scheduleRegions(*Scheduler); + scheduleRegions(*Scheduler, true); if (VerifyScheduling) MF->verify(this, "After post machine scheduling."); @@ -383,15 +388,14 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { static bool isSchedBoundary(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB, MachineFunction *MF, - const TargetInstrInfo *TII, - bool IsPostRA) { + const TargetInstrInfo *TII) { return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF); } /// Main driver for both MachineScheduler and PostMachineScheduler. -void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { +void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, + bool FixKillFlags) { const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - bool IsPostRA = Scheduler.isPostRA(); // Visit all machine basic blocks. // @@ -400,7 +404,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); MBB != MBBEnd; ++MBB) { - Scheduler.startBlock(MBB); + Scheduler.startBlock(&*MBB); #ifndef NDEBUG if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName()) @@ -429,7 +433,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { // Avoid decrementing RegionEnd for blocks with no terminator. if (RegionEnd != MBB->end() || - isSchedBoundary(std::prev(RegionEnd), MBB, MF, TII, IsPostRA)) { + isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) { --RegionEnd; // Count the boundary instruction. --RemainingInstrs; @@ -440,14 +444,14 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { unsigned NumRegionInstrs = 0; MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingInstrs) { - if (isSchedBoundary(std::prev(I), MBB, MF, TII, IsPostRA)) + if (isSchedBoundary(&*std::prev(I), &*MBB, MF, TII)) break; if (!I->isDebugValue()) ++NumRegionInstrs; } // Notify the scheduler of the region, even if we may skip scheduling // it. Perhaps it still needs to be bundled. - Scheduler.enterRegion(MBB, I, RegionEnd, NumRegionInstrs); + Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs); // Skip empty scheduling regions (0 or 1 schedulable instructions). if (I == RegionEnd || I == std::prev(RegionEnd)) { @@ -456,8 +460,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { Scheduler.exitRegion(); continue; } - DEBUG(dbgs() << "********** " << ((Scheduler.isPostRA()) ? 
"PostRA " : "") - << "MI Scheduling **********\n"); + DEBUG(dbgs() << "********** MI Scheduling **********\n"); DEBUG(dbgs() << MF->getName() << ":BB#" << MBB->getNumber() << " " << MBB->getName() << "\n From: " << *I << " To: "; @@ -484,11 +487,11 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) { } assert(RemainingInstrs == 0 && "Instruction count mismatch!"); Scheduler.finishBlock(); - if (Scheduler.isPostRA()) { - // FIXME: Ideally, no further passes should rely on kill flags. However, - // thumb2 size reduction is currently an exception. - Scheduler.fixupKills(MBB); - } + // FIXME: Ideally, no further passes should rely on kill flags. However, + // thumb2 size reduction is currently an exception, so the PostMIScheduler + // needs to do this. + if (FixKillFlags) + Scheduler.fixupKills(&*MBB); } Scheduler.finalizeSchedule(); } @@ -499,7 +502,7 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { LLVM_DUMP_METHOD void ReadyQueue::dump() { - dbgs() << Name << ": "; + dbgs() << "Queue " << Name << ": "; for (unsigned i = 0, e = Queue.size(); i < e; ++i) dbgs() << Queue[i]->NodeNum << " "; dbgs() << "\n"; @@ -660,6 +663,9 @@ bool ScheduleDAGMI::checkSchedLimit() { /// does not consider liveness or register pressure. It is useful for PostRA /// scheduling and potentially other custom schedulers. void ScheduleDAGMI::schedule() { + DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n"); + DEBUG(SchedImpl->dumpPolicy()); + // Build the DAG. buildSchedGraph(AA); @@ -682,7 +688,11 @@ void ScheduleDAGMI::schedule() { initQueues(TopRoots, BotRoots); bool IsTopNode = false; - while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + while (true) { + DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) break; + assert(!SU->isScheduled && "Node already scheduled"); if (!checkSchedLimit()) break; @@ -900,6 +910,13 @@ void ScheduleDAGMILive::initRegPressure() { updatePressureDiffs(LiveUses); } + DEBUG( + dbgs() << "Top Pressure:\n"; + dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI); + dbgs() << "Bottom Pressure:\n"; + dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI); + ); + assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom"); // Cache the list of excess pressure sets in this region. This will also track @@ -976,18 +993,24 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) { } // RegisterPressureTracker guarantees that readsReg is true for LiveUses. assert(VNI && "No live value at use."); - for (VReg2UseMap::iterator - UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { - SUnit *SU = UI->SU; - DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " - << *SU->getInstr()); + for (const VReg2SUnit &V2SU + : make_range(VRegUses.find(Reg), VRegUses.end())) { + SUnit *SU = V2SU.SU; // If this use comes before the reaching def, it cannot be a last use, so // descrease its pressure change. 
if (!SU->isScheduled && SU != &ExitSU) { LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(SU->getInstr())); - if (LRQ.valueIn() == VNI) - getPressureDiff(SU).addPressureChange(Reg, true, &MRI); + if (LRQ.valueIn() == VNI) { + PressureDiff &PDiff = getPressureDiff(SU); + PDiff.addPressureChange(Reg, true, &MRI); + DEBUG( + dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " + << *SU->getInstr(); + dbgs() << " to "; + PDiff.dump(*TRI); + ); + } } } } @@ -998,12 +1021,14 @@ void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) { /// only includes instructions that have DAG nodes, not scheduling boundaries. /// /// This is a skeletal driver, with all the functionality pushed into helpers, -/// so that it can be easilly extended by experimental schedulers. Generally, +/// so that it can be easily extended by experimental schedulers. Generally, /// implementing MachineSchedStrategy should be sufficient to implement a new /// scheduling algorithm. However, if a scheduler further subclasses /// ScheduleDAGMILive then it will want to override this virtual method in order /// to update any specialized state. void ScheduleDAGMILive::schedule() { + DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n"); + DEBUG(SchedImpl->dumpPolicy()); buildDAGWithRegPressure(); Topo.InitDAGTopologicalSorting(); @@ -1017,8 +1042,16 @@ void ScheduleDAGMILive::schedule() { // This may initialize a DFSResult to be used for queue priority. SchedImpl->initialize(this); - DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this)); + DEBUG( + for (const SUnit &SU : SUnits) { + SU.dumpAll(this); + if (ShouldTrackPressure) { + dbgs() << " Pressure Diff : "; + getPressureDiff(&SU).dump(*TRI); + } + dbgs() << '\n'; + } + ); if (ViewMISchedDAGs) viewGraph(); // Initialize ready queues now that the DAG and priority data are finalized. @@ -1030,7 +1063,11 @@ void ScheduleDAGMILive::schedule() { } bool IsTopNode = false; - while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + while (true) { + DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) break; + assert(!SU->isScheduled && "Node already scheduled"); if (!checkSchedLimit()) break; @@ -1149,14 +1186,15 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { unsigned LiveOutHeight = DefSU->getHeight(); unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency; // Visit all local users of the vreg def. - for (VReg2UseMap::iterator - UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { - if (UI->SU == &ExitSU) + for (const VReg2SUnit &V2SU + : make_range(VRegUses.find(Reg), VRegUses.end())) { + SUnit *SU = V2SU.SU; + if (SU == &ExitSU) continue; // Only consider uses of the phi. LiveQueryResult LRQ = - LI.Query(LIS->getInstructionIndex(UI->SU->getInstr())); + LI.Query(LIS->getInstructionIndex(SU->getInstr())); if (!LRQ.valueIn()->isPHIDef()) continue; @@ -1164,10 +1202,10 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { // overestimate in strange cases. This allows cyclic latency to be // estimated as the minimum slack of the vreg's depth or height. 
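// As a hedged illustration of the bound computed below (numbers invented,
// not taken from any real schedule): suppose DefSU has depth 10 and
// latency 4, the use SU has depth 6 and height 3, and LiveOutHeight is 5.
// Then
//   CyclicLatency = LiveOutDepth - SU->getDepth() = (10 + 4) - 6 = 8
//   LiveInHeight  = SU->getHeight() + DefSU->Latency = 3 + 4 = 7
// and since LiveInHeight > LiveOutHeight with 7 - 5 = 2 < 8, the estimate
// is tightened to CyclicLatency = 2.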
unsigned CyclicLatency = 0; - if (LiveOutDepth > UI->SU->getDepth()) - CyclicLatency = LiveOutDepth - UI->SU->getDepth(); + if (LiveOutDepth > SU->getDepth()) + CyclicLatency = LiveOutDepth - SU->getDepth(); - unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency; + unsigned LiveInHeight = SU->getHeight() + DefSU->Latency; if (LiveInHeight > LiveOutHeight) { if (LiveInHeight - LiveOutHeight < CyclicLatency) CyclicLatency = LiveInHeight - LiveOutHeight; @@ -1176,7 +1214,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { CyclicLatency = 0; DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU(" - << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n"); + << SU->NodeNum << ") = " << CyclicLatency << "c\n"); if (CyclicLatency > MaxCyclicLatency) MaxCyclicLatency = CyclicLatency; } @@ -1203,6 +1241,11 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { // Update top scheduled pressure. TopRPTracker.advance(); assert(TopRPTracker.getPos() == CurrentTop && "out of sync"); + DEBUG( + dbgs() << "Top Pressure:\n"; + dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI); + ); + updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure); } } @@ -1225,6 +1268,11 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { SmallVector<unsigned, 8> LiveUses; BotRPTracker.recede(&LiveUses); assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); + DEBUG( + dbgs() << "Bottom Pressure:\n"; + dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI); + ); + updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure); updatePressureDiffs(LiveUses); } @@ -1349,25 +1397,49 @@ namespace { /// \brief Post-process the DAG to create cluster edges between instructions /// that may be fused by the processor into a single operation. class MacroFusion : public ScheduleDAGMutation { - const TargetInstrInfo *TII; + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; public: - MacroFusion(const TargetInstrInfo *tii): TII(tii) {} + MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) + : TII(TII), TRI(TRI) {} void apply(ScheduleDAGMI *DAG) override; }; } // anonymous +/// Returns true if \p MI reads a register written by \p Other. +static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI, + const MachineInstr &Other) { + for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + + unsigned Reg = MO.getReg(); + if (Other.modifiesRegister(Reg, &TRI)) + return true; + } + return false; +} + /// \brief Callback from DAG postProcessing to create cluster edges to encourage /// fused operations. void MacroFusion::apply(ScheduleDAGMI *DAG) { // For now, assume targets can only fuse with the branch. - MachineInstr *Branch = DAG->ExitSU.getInstr(); + SUnit &ExitSU = DAG->ExitSU; + MachineInstr *Branch = ExitSU.getInstr(); if (!Branch) return; - for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) { - SUnit *SU = &DAG->SUnits[--Idx]; - if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch)) + for (SUnit &SU : DAG->SUnits) { + // SUnits with successors can't be scheduled in front of the ExitSU. + if (!SU.Succs.empty()) + continue; + // We only care if the node writes to a register that the branch reads. + MachineInstr *Pred = SU.getInstr(); + if (!HasDataDep(TRI, *Branch, *Pred)) + continue; + + if (!TII.shouldScheduleAdjacent(Pred, Branch)) continue; // Create a single weak edge from SU to ExitSU.
The only effect is to cause @@ -1376,11 +1448,11 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) { // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling // of SU, we could create an artificial edge from the deepest root, but it // hasn't been needed yet. - bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster)); + bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); (void)Success; assert(Success && "No DAG nodes should be reachable from ExitSU"); - DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n"); + DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n"); break; } } @@ -2277,7 +2349,7 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) { Latency = Cand.SU->getDepth(); break; } - dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); + dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); if (P.isValid()) dbgs() << " " << TRI->getRegPressureSetName(P.getPSet()) << ":" << P.getUnitInc() << " "; @@ -2438,6 +2510,14 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, } } +void GenericScheduler::dumpPolicy() { + dbgs() << "GenericScheduler RegionPolicy: " + << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure + << " OnlyTopDown=" << RegionPolicy.OnlyTopDown + << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp + << "\n"; +} + /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic /// critical path by more cycles than it takes to drain the instruction buffer. /// We estimate an upper bounds on in-flight instructions as: @@ -2499,11 +2579,13 @@ static bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, - GenericSchedulerBase::CandReason Reason) { - int TryRank = TryP.getPSetOrMax(); - int CandRank = CandP.getPSetOrMax(); + GenericSchedulerBase::CandReason Reason, + const TargetRegisterInfo *TRI, + const MachineFunction &MF) { + unsigned TryPSet = TryP.getPSetOrMax(); + unsigned CandPSet = CandP.getPSetOrMax(); // If both candidates affect the same set, go with the smallest increase. - if (TryRank == CandRank) { + if (TryPSet == CandPSet) { return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, Reason); } @@ -2513,6 +2595,13 @@ static bool tryPressure(const PressureChange &TryP, Reason)) { return true; } + + int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) : + std::numeric_limits<int>::max(); + + int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) : + std::numeric_limits<int>::max(); + // If the candidates are decreasing pressure, reverse priority. if (TryP.getUnitInc() < 0) std::swap(TryRank, CandRank); @@ -2597,7 +2686,7 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, } } DEBUG(if (TryCand.RPDelta.Excess.isValid()) - dbgs() << " SU(" << TryCand.SU->NodeNum << ") " + dbgs() << " Try SU(" << TryCand.SU->NodeNum << ") " << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet()) << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n"); @@ -2615,13 +2704,15 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, // Avoid exceeding the target's limit. if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, - TryCand, Cand, RegExcess)) + TryCand, Cand, RegExcess, TRI, + DAG->MF)) return; // Avoid increasing the max critical pressure in the scheduled region. 
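// (Illustrative note on the tryPressure change above, using invented sets:
// when both candidates touch the same pressure set, the smaller unit
// increment wins, e.g. GPR32 +1 beats GPR32 +3; when the sets differ, the
// comparison now falls back to the target-provided
// TRI->getRegPressureSetScore(MF, PSet) ranking, with an invalid
// PressureChange assigned std::numeric_limits<int>::max() so it compares
// as touching no set at all.)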
if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, - TryCand, Cand, RegCritical)) + TryCand, Cand, RegCritical, TRI, + DAG->MF)) return; // For loops that are acyclic path limited, aggressively schedule for latency. @@ -2657,7 +2748,8 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, // Avoid increasing the max pressure of the entire region. if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, - TryCand, Cand, RegMax)) + TryCand, Cand, RegMax, TRI, + DAG->MF)) return; // Avoid critical resource consumption and balance the schedule. @@ -2672,8 +2764,8 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, // Avoid serializing long latency dependence chains. // For acyclic path limited loops, latency was already checked above. - if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited - && tryLatency(TryCand, Cand, Zone)) { + if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency && + !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) { return; } @@ -2727,12 +2819,12 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { IsTopNode = false; - DEBUG(dbgs() << "Pick Bot NOCAND\n"); + DEBUG(dbgs() << "Pick Bot ONLY1\n"); return SU; } if (SUnit *SU = Top.pickOnlyChoice()) { IsTopNode = true; - DEBUG(dbgs() << "Pick Top NOCAND\n"); + DEBUG(dbgs() << "Pick Top ONLY1\n"); return SU; } CandPolicy NoPolicy; @@ -2887,7 +2979,7 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) { if (EnableLoadCluster && DAG->TII->enableClusterLoads()) DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI)); if (EnableMacroFusion) - DAG->addMutation(make_unique<MacroFusion>(DAG->TII)); + DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI)); return DAG; } @@ -3254,12 +3346,10 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits { } static bool isNodeHidden(const SUnit *Node) { - return (Node->Preds.size() > 10 || Node->Succs.size() > 10); - } - - static bool hasNodeAddressLabel(const SUnit *Node, - const ScheduleDAG *Graph) { - return false; + if (ViewMISchedCutoff == 0) + return false; + return (Node->Preds.size() > ViewMISchedCutoff + || Node->Succs.size() > ViewMISchedCutoff); } /// If you want to override the dot attributes printed for a particular diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp index 1b9be50..5e6d619 100644 --- a/contrib/llvm/lib/CodeGen/MachineSink.cpp +++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp @@ -87,7 +87,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MachineDominatorTree>(); AU.addRequired<MachinePostDominatorTree>(); AU.addRequired<MachineLoopInfo>(); @@ -150,7 +150,7 @@ INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink", "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineSinking, "machine-sink", "Machine code sinking", false, false) @@ -268,7 +268,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { PDT = 
&getAnalysis<MachinePostDominatorTree>(); LI = &getAnalysis<MachineLoopInfo>(); MBFI = UseBlockFreqInfo ? &getAnalysis<MachineBlockFrequencyInfo>() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverMadeChange = false; @@ -667,7 +667,7 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, // It's not safe to sink instructions to EH landing pad. Control flow into // landing pad is implicitly defined. - if (SuccToSinkTo && SuccToSinkTo->isLandingPad()) + if (SuccToSinkTo && SuccToSinkTo->isEHPad()) return nullptr; return SuccToSinkTo; @@ -686,7 +686,8 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore, if (!MI->isSafeToMove(AA, SawStore)) return false; - // Convergent operations may only be moved to control equivalent locations. + // Convergent operations may not be made control-dependent on additional + // values. if (MI->isConvergent()) return false; diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp index d9a6b684..f7edacd 100644 --- a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -724,13 +724,12 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, // Update RegUnits to reflect live registers after UseMI. // First kills. - for (unsigned i = 0, e = Kills.size(); i != e; ++i) - for (MCRegUnitIterator Units(Kills[i], TRI); Units.isValid(); ++Units) + for (unsigned Kill : Kills) + for (MCRegUnitIterator Units(Kill, TRI); Units.isValid(); ++Units) RegUnits.erase(*Units); // Second, live defs. - for (unsigned i = 0, e = LiveDefOps.size(); i != e; ++i) { - unsigned DefOp = LiveDefOps[i]; + for (unsigned DefOp : LiveDefOps) { for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg(), TRI); Units.isValid(); ++Units) { LiveRegUnit &LRU = RegUnits[*Units]; @@ -756,8 +755,7 @@ computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) { assert(TBI.HasValidInstrDepths && "Missing depth info"); assert(TBI.HasValidInstrHeights && "Missing height info"); unsigned MaxLen = 0; - for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) { - const LiveInReg &LIR = TBI.LiveIns[i]; + for (const LiveInReg &LIR : TBI.LiveIns) { if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg)) continue; const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg); diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp index ca35ec5..428295e 100644 --- a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" @@ -42,7 +43,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -204,18 +204,19 @@ namespace { void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB); void visitMachineFunctionAfter(); + template <typename T> void report(const char *msg, ilist_iterator<T> I) { + report(msg, &*I); + } void report(const char *msg, const MachineFunction *MF); void report(const char *msg, const 
MachineBasicBlock *MBB); void report(const char *msg, const MachineInstr *MI); void report(const char *msg, const MachineOperand *MO, unsigned MONum); - void report(const char *msg, const MachineFunction *MF, - const LiveInterval &LI); - void report(const char *msg, const MachineBasicBlock *MBB, - const LiveInterval &LI); - void report(const char *msg, const MachineFunction *MF, - const LiveRange &LR, unsigned Reg, unsigned LaneMask); - void report(const char *msg, const MachineBasicBlock *MBB, - const LiveRange &LR, unsigned Reg, unsigned LaneMask); + + void report_context(const LiveInterval &LI) const; + void report_context(const LiveRange &LR, unsigned Reg, + LaneBitmask LaneMask) const; + void report_context(const LiveRange::Segment &S) const; + void report_context(const VNInfo &VNI) const; void verifyInlineAsm(const MachineInstr *MI); @@ -233,9 +234,11 @@ namespace { void verifyLiveRangeSegment(const LiveRange&, const LiveRange::const_iterator I, unsigned, unsigned); - void verifyLiveRange(const LiveRange&, unsigned, unsigned LaneMask = 0); + void verifyLiveRange(const LiveRange&, unsigned, LaneBitmask LaneMask = 0); void verifyStackFrame(); + + void verifySlotIndexes() const; }; struct MachineVerifierPass : public MachineFunctionPass { @@ -273,6 +276,19 @@ void MachineFunction::verify(Pass *p, const char *Banner) const { .runOnMachineFunction(const_cast<MachineFunction&>(*this)); } +void MachineVerifier::verifySlotIndexes() const { + if (Indexes == nullptr) + return; + + // Ensure the IdxMBB list is sorted by slot indexes. + SlotIndex Last; + for (SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin(), + E = Indexes->MBBIndexEnd(); I != E; ++I) { + assert(!Last.isValid() || I->first > Last); + Last = I->first; + } +} + bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { foundErrors = 0; @@ -295,10 +311,12 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { Indexes = PASS->getAnalysisIfAvailable<SlotIndexes>(); } + verifySlotIndexes(); + visitMachineFunctionBefore(); for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end(); MFI!=MFE; ++MFI) { - visitMachineBasicBlockBefore(MFI); + visitMachineBasicBlockBefore(&*MFI); // Keep track of the current bundle header. const MachineInstr *CurBundle = nullptr; // Do we expect the next instruction to be part of the same bundle? @@ -306,7 +324,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(), MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) { - if (MBBI->getParent() != MFI) { + if (MBBI->getParent() != &*MFI) { report("Bad instruction parent pointer", MFI); errs() << "Instruction: " << *MBBI; continue; @@ -315,20 +333,22 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { // Check for consistent bundle flags. if (InBundle && !MBBI->isBundledWithPred()) report("Missing BundledPred flag, " - "BundledSucc was set on predecessor", MBBI); + "BundledSucc was set on predecessor", + &*MBBI); if (!InBundle && MBBI->isBundledWithPred()) report("BundledPred flag is set, " - "but BundledSucc not set on predecessor", MBBI); + "but BundledSucc not set on predecessor", + &*MBBI); // Is this a bundle header? 
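// Schematic of the bundle flags checked below (a three-instruction bundle;
// made-up shape, for orientation only):
//   header: isInsideBundle() == false, isBundledWithSucc() == true
//   middle: isBundledWithPred() == true, isBundledWithSucc() == true
//   last:   isBundledWithPred() == true, isBundledWithSucc() == false
// The InBundle flag tracked by the loop must agree with these pairwise
// flags at every step.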
if (!MBBI->isInsideBundle()) { if (CurBundle) visitMachineBundleAfter(CurBundle); - CurBundle = MBBI; + CurBundle = &*MBBI; visitMachineBundleBefore(CurBundle); } else if (!CurBundle) report("No bundle header", MBBI); - visitMachineInstrBefore(MBBI); + visitMachineInstrBefore(&*MBBI); for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { const MachineInstr &MI = *MBBI; const MachineOperand &Op = MI.getOperand(I); @@ -341,7 +361,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { visitMachineOperand(&Op, I); } - visitMachineInstrAfter(MBBI); + visitMachineInstrAfter(&*MBBI); // Was this the last bundled instruction? InBundle = MBBI->isBundledWithSucc(); @@ -350,7 +370,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { visitMachineBundleAfter(CurBundle); if (InBundle) report("BundledSucc flag set on last instruction in block", &MFI->back()); - visitMachineBasicBlockAfter(MFI); + visitMachineBasicBlockAfter(&*MFI); } visitMachineFunctionAfter(); @@ -375,7 +395,10 @@ void MachineVerifier::report(const char *msg, const MachineFunction *MF) { if (!foundErrors++) { if (Banner) errs() << "# " << Banner << '\n'; - MF->print(errs(), Indexes); + if (LiveInts != nullptr) + LiveInts->print(errs()); + else + MF->print(errs(), Indexes); } errs() << "*** Bad machine code: " << msg << " ***\n" << "- function: " << MF->getName() << "\n"; @@ -399,7 +422,8 @@ void MachineVerifier::report(const char *msg, const MachineInstr *MI) { errs() << "- instruction: "; if (Indexes && Indexes->hasIndex(MI)) errs() << Indexes->getInstructionIndex(MI) << '\t'; - MI->print(errs(), TM); + MI->print(errs(), /*SkipOpers=*/true); + errs() << '\n'; } void MachineVerifier::report(const char *msg, @@ -411,36 +435,24 @@ void MachineVerifier::report(const char *msg, errs() << "\n"; } -void MachineVerifier::report(const char *msg, const MachineFunction *MF, - const LiveInterval &LI) { - report(msg, MF); - errs() << "- interval: " << LI << '\n'; -} - -void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB, - const LiveInterval &LI) { - report(msg, MBB); +void MachineVerifier::report_context(const LiveInterval &LI) const { errs() << "- interval: " << LI << '\n'; } -void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB, - const LiveRange &LR, unsigned Reg, - unsigned LaneMask) { - report(msg, MBB); - errs() << "- liverange: " << LR << '\n'; +void MachineVerifier::report_context(const LiveRange &LR, unsigned Reg, + LaneBitmask LaneMask) const { errs() << "- register: " << PrintReg(Reg, TRI) << '\n'; if (LaneMask != 0) - errs() << "- lanemask: " << format("%04X\n", LaneMask); + errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n'; + errs() << "- liverange: " << LR << '\n'; } -void MachineVerifier::report(const char *msg, const MachineFunction *MF, - const LiveRange &LR, unsigned Reg, - unsigned LaneMask) { - report(msg, MF); - errs() << "- liverange: " << LR << '\n'; - errs() << "- register: " << PrintReg(Reg, TRI) << '\n'; - if (LaneMask != 0) - errs() << "- lanemask: " << format("%04X\n", LaneMask); +void MachineVerifier::report_context(const LiveRange::Segment &S) const { + errs() << "- segment: " << S << '\n'; +} + +void MachineVerifier::report_context(const VNInfo &VNI) const { + errs() << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n"; } void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { @@ -507,11 +519,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MRI->isSSA()) { // If 
this block has allocatable physical registers live-in, check that // it is an entry block or landing pad. - for (MachineBasicBlock::livein_iterator LI = MBB->livein_begin(), - LE = MBB->livein_end(); - LI != LE; ++LI) { - unsigned reg = *LI; - if (isAllocatable(reg) && !MBB->isLandingPad() && + for (const auto &LI : MBB->liveins()) { + if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && MBB != MBB->getParent()->begin()) { report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB); } @@ -522,7 +531,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { SmallPtrSet<MachineBasicBlock*, 4> LandingPadSuccs; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - if ((*I)->isLandingPad()) + if ((*I)->isEHPad()) LandingPadSuccs.insert(*I); if (!FunctionBlocks.count(*I)) report("MBB has successor that isn't part of the function.", MBB); @@ -547,10 +556,12 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { const MCAsmInfo *AsmInfo = TM->getMCAsmInfo(); const BasicBlock *BB = MBB->getBasicBlock(); + const Function *Fn = MF->getFunction(); if (LandingPadSuccs.size() > 1 && !(AsmInfo && AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj && - BB && isa<SwitchInst>(BB->getTerminator()))) + BB && isa<SwitchInst>(BB->getTerminator())) && + !isFuncletEHPersonality(classifyEHPersonality(Fn->getPersonalityFn()))) report("MBB has more than one landing pad successor", MBB); // Call AnalyzeBranch. If it succeeds, there are several more conditions to check. @@ -562,7 +573,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { // check whether its answers match up with reality. if (!TBB && !FBB) { // Block falls through to its successor. - MachineFunction::const_iterator MBBI = MBB; + MachineFunction::const_iterator MBBI = MBB->getIterator(); ++MBBI; if (MBBI == MF->end()) { // It's possible that the block legitimately ends with a noreturn @@ -575,7 +586,7 @@ } else if (MBB->succ_size() != 1+LandingPadSuccs.size()) { report("MBB exits via unconditional fall-through but doesn't have " "exactly one CFG successor!", MBB); - } else if (!MBB->isSuccessor(MBBI)) { + } else if (!MBB->isSuccessor(&*MBBI)) { report("MBB exits via unconditional fall-through but its successor " "differs from its CFG successor!", MBB); } @@ -613,7 +624,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } } else if (TBB && !FBB && !Cond.empty()) { // Block conditionally branches somewhere, otherwise falls through.
- MachineFunction::const_iterator MBBI = MBB; + MachineFunction::const_iterator MBBI = MBB->getIterator(); ++MBBI; if (MBBI == MF->end()) { report("MBB conditionally falls through out of function!", MBB); @@ -628,7 +639,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } else if (MBB->succ_size() != 2) { report("MBB exits via conditional branch/fall-through but doesn't have " "exactly two CFG successors!", MBB); - } else if (!matchPair(MBB->succ_begin(), TBB, MBBI)) { + } else if (!matchPair(MBB->succ_begin(), TBB, &*MBBI)) { report("MBB exits via conditional branch/fall-through but the CFG " "successors don't match the actual successors!", MBB); } @@ -680,13 +691,12 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } regsLive.clear(); - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) { - if (!TargetRegisterInfo::isPhysicalRegister(*I)) { + for (const auto &LI : MBB->liveins()) { + if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { report("MBB live-in list contains non-physical register", MBB); continue; } - for (MCSubRegIterator SubRegs(*I, TRI, /*IncludeSelf=*/true); + for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) regsLive.insert(*SubRegs); } @@ -822,9 +832,12 @@ void MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { const MachineInstr *MI = MO->getParent(); const MCInstrDesc &MCID = MI->getDesc(); + unsigned NumDefs = MCID.getNumDefs(); + if (MCID.getOpcode() == TargetOpcode::PATCHPOINT) + NumDefs = (MONum == 0 && MO->isReg()) ? NumDefs : 0; // The first MCID.NumDefs operands must be explicit register defines - if (MONum < MCID.getNumDefs()) { + if (MONum < NumDefs) { const MCOperandInfo &MCOI = MCID.OpInfo[MONum]; if (!MO->isReg()) report("Explicit definition must be a register", MO, MONum); @@ -972,13 +985,38 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { case MachineOperand::MO_FrameIndex: if (LiveStks && LiveStks->hasInterval(MO->getIndex()) && LiveInts && !LiveInts->isNotInMIMap(MI)) { - LiveInterval &LI = LiveStks->getInterval(MO->getIndex()); + int FI = MO->getIndex(); + LiveInterval &LI = LiveStks->getInterval(FI); SlotIndex Idx = LiveInts->getInstructionIndex(MI); - if (MI->mayLoad() && !LI.liveAt(Idx.getRegSlot(true))) { + + bool stores = MI->mayStore(); + bool loads = MI->mayLoad(); + // For a memory-to-memory move, we need to check if the frame + // index is used for storing or loading, by inspecting the + // memory operands. 
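// Concretely (hypothetical memoperands, not from the patch): a
// memory-to-memory move through frame indexes could carry both
// mem:LD4[FixedStack3] and mem:ST4[FixedStack2]. Only the
// FixedStackPseudoSourceValue attached to each memoperand reveals whether
// the frame index FI under inspection is the load source or the store
// destination, so the loop below narrows the `loads`/`stores` flags
// accordingly.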
+ if (stores && loads) { + for (auto *MMO : MI->memoperands()) { + const PseudoSourceValue *PSV = MMO->getPseudoValue(); + if (PSV == nullptr) continue; + const FixedStackPseudoSourceValue *Value = + dyn_cast<FixedStackPseudoSourceValue>(PSV); + if (Value == nullptr) continue; + if (Value->getFrameIndex() != FI) continue; + + if (MMO->isStore()) + loads = false; + else + stores = false; + break; + } + if (loads == stores) + report("Missing fixed stack memoperand.", MI); + } + if (loads && !LI.liveAt(Idx.getRegSlot(true))) { report("Instruction loads from dead spill slot", MO, MONum); errs() << "Live stack: " << LI << '\n'; } - if (MI->mayStore() && !LI.liveAt(Idx.getRegSlot())) { + if (stores && !LI.liveAt(Idx.getRegSlot())) { report("Instruction stores to dead spill slot", MO, MONum); errs() << "Live stack: " << LI << '\n'; } @@ -1387,40 +1425,39 @@ void MachineVerifier::verifyLiveIntervals() { void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, const VNInfo *VNI, unsigned Reg, - unsigned LaneMask) { + LaneBitmask LaneMask) { if (VNI->isUnused()) return; const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def); if (!DefVNI) { - report("Valno not live at def and not marked unused", MF, LR, Reg, - LaneMask); - errs() << "Valno #" << VNI->id << '\n'; + report("Value not live at VNInfo def and not marked unused", MF); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } if (DefVNI != VNI) { - report("Live segment at def has different valno", MF, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def - << " where valno #" << DefVNI->id << " is live\n"; + report("Live segment at def has different VNInfo", MF); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def); if (!MBB) { - report("Invalid definition index", MF, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def - << " in " << LR << '\n'; + report("Invalid VNInfo definition index", MF); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } if (VNI->isPHIDef()) { if (VNI->def != LiveInts->getMBBStartIdx(MBB)) { - report("PHIDef value is not defined at MBB start", MBB, LR, Reg, - LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def - << ", not at the beginning of BB#" << MBB->getNumber() << '\n'; + report("PHIDef VNInfo is not defined at MBB start", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); } return; } @@ -1428,8 +1465,9 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, // Non-PHI def. const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def); if (!MI) { - report("No instruction at def index", MBB, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + report("No instruction at VNInfo def index", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); return; } @@ -1457,60 +1495,67 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, if (!hasDef) { report("Defining instruction does not modify register", MI); - errs() << "Valno #" << VNI->id << " in " << LR << '\n'; + report_context(LR, Reg, LaneMask); + report_context(*VNI); } // Early clobber defs begin at USE slots, but other defs must begin at // DEF slots. 
if (isEarlyClobber) { if (!VNI->def.isEarlyClobber()) { - report("Early clobber def must be at an early-clobber slot", MBB, LR, - Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + report("Early clobber def must be at an early-clobber slot", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); } } else if (!VNI->def.isRegister()) { - report("Non-PHI, non-early clobber def must be at a register slot", - MBB, LR, Reg, LaneMask); - errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + report("Non-PHI, non-early clobber def must be at a register slot", MBB); + report_context(LR, Reg, LaneMask); + report_context(*VNI); } } } void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const LiveRange::const_iterator I, - unsigned Reg, unsigned LaneMask) { + unsigned Reg, LaneBitmask LaneMask) +{ const LiveRange::Segment &S = *I; const VNInfo *VNI = S.valno; assert(VNI && "Live segment has no valno"); if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) { - report("Foreign valno in live segment", MF, LR, Reg, LaneMask); - errs() << S << " has a bad valno\n"; + report("Foreign valno in live segment", MF); + report_context(LR, Reg, LaneMask); + report_context(S); + report_context(*VNI); } if (VNI->isUnused()) { - report("Live segment valno is marked unused", MF, LR, Reg, LaneMask); - errs() << S << '\n'; + report("Live segment valno is marked unused", MF); + report_context(LR, Reg, LaneMask); + report_context(S); } const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start); if (!MBB) { - report("Bad start of live segment, no basic block", MF, LR, Reg, LaneMask); - errs() << S << '\n'; + report("Bad start of live segment, no basic block", MF); + report_context(LR, Reg, LaneMask); + report_context(S); return; } SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB); if (S.start != MBBStartIdx && S.start != VNI->def) { - report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + report("Live segment must begin at MBB entry or valno def", MBB); + report_context(LR, Reg, LaneMask); + report_context(S); } const MachineBasicBlock *EndMBB = LiveInts->getMBBFromIndex(S.end.getPrevSlot()); if (!EndMBB) { - report("Bad end of live segment, no basic block", MF, LR, Reg, LaneMask); - errs() << S << '\n'; + report("Bad end of live segment, no basic block", MF); + report_context(LR, Reg, LaneMask); + report_context(S); return; } @@ -1527,26 +1572,26 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const MachineInstr *MI = LiveInts->getInstructionFromIndex(S.end.getPrevSlot()); if (!MI) { - report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + report("Live segment doesn't end at a valid instruction", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); return; } // The block slot must refer to a basic block boundary. if (S.end.isBlock()) { - report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + report("Live segment ends at B slot of an instruction", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); } if (S.end.isDead()) { // Segment ends on the dead slot. // That means there must be a dead def. 
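// For example (invented slot indexes): a dead def produces a segment
// printed as [16r,16d:0), starting at the register slot and ending at the
// dead slot of the same instruction, so SlotIndex::isSameInstr(S.start,
// S.end) holds; a segment such as [16r,32d:0) would span two instructions
// and is reported below.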
if (!SlotIndex::isSameInstr(S.start, S.end)) { - report("Live segment ending at dead slot spans instructions", EndMBB, LR, - Reg, LaneMask); - errs() << S << '\n'; + report("Live segment ending at dead slot spans instructions", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); } } @@ -1555,9 +1600,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (S.end.isEarlyClobber()) { if (I+1 == LR.end() || (I+1)->start != S.end) { report("Live segment ending at early clobber slot must be " - "redefined by an EC def in the same instruction", EndMBB, LR, Reg, - LaneMask); - errs() << S << '\n'; + "redefined by an EC def in the same instruction", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); } } @@ -1587,14 +1632,15 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, !hasSubRegDef) { report("Instruction ending live segment doesn't read the register", MI); - errs() << S << " in " << LR << '\n'; + report_context(LR, Reg, LaneMask); + report_context(S); } } } } // Now check all the basic blocks in this live segment. - MachineFunction::const_iterator MFI = MBB; + MachineFunction::const_iterator MFI = MBB->getIterator(); // Is this live segment the beginning of a non-PHIDef VN? if (S.start == VNI->def && !VNI->isPHIDef()) { // Not live-in to any blocks. @@ -1604,10 +1650,10 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, ++MFI; } for (;;) { - assert(LiveInts->isLiveInToMBB(LR, MFI)); + assert(LiveInts->isLiveInToMBB(LR, &*MFI)); // We don't know how to track physregs into a landing pad. if (!TargetRegisterInfo::isVirtualRegister(Reg) && - MFI->isLandingPad()) { + MFI->isEHPad()) { if (&*MFI == EndMBB) break; ++MFI; @@ -1616,7 +1662,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Is VNI a PHI-def in the current block? bool IsPHI = VNI->isPHIDef() && - VNI->def == LiveInts->getMBBStartIdx(MFI); + VNI->def == LiveInts->getMBBStartIdx(&*MFI); // Check that VNI is live-out of all predecessors. for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(), @@ -1626,22 +1672,23 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // All predecessors must have a live-out value. if (!PVNI) { - report("Register not marked live out of predecessor", *PI, LR, Reg, - LaneMask); - errs() << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before " - << PEnd << '\n'; + report("Register not marked live out of predecessor", *PI); + report_context(LR, Reg, LaneMask); + report_context(*VNI); + errs() << " live into BB#" << MFI->getNumber() + << '@' << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " + << PEnd << '\n'; continue; } // Only PHI-defs can take different predecessor values. 
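// Concretely (invented numbering): if a vreg is live into BB#3 with VNI #2
// while predecessor BB#1 has VNI #4 live out at its end slot, that is only
// legal when #2 is a PHI-def at the start of BB#3 merging #4 with the
// other incoming values; any non-PHI mismatch is reported below.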
if (!IsPHI && PVNI != VNI) { - report("Different value live out of predecessor", *PI, LR, Reg, - LaneMask); + report("Different value live out of predecessor", *PI); + report_context(LR, Reg, LaneMask); errs() << "Valno #" << PVNI->id << " live out of BB#" - << (*PI)->getNumber() << '@' << PEnd - << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << '\n'; + << (*PI)->getNumber() << '@' << PEnd << "\nValno #" << VNI->id + << " live into BB#" << MFI->getNumber() << '@' + << LiveInts->getMBBStartIdx(&*MFI) << '\n'; } } if (&*MFI == EndMBB) @@ -1651,7 +1698,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, } void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg, - unsigned LaneMask) { + LaneBitmask LaneMask) { for (const VNInfo *VNI : LR.valnos) verifyLiveRangeValue(LR, VNI, Reg, LaneMask); @@ -1664,24 +1711,35 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { assert(TargetRegisterInfo::isVirtualRegister(Reg)); verifyLiveRange(LI, Reg); - unsigned Mask = 0; - unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg); + LaneBitmask Mask = 0; + LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg); for (const LiveInterval::SubRange &SR : LI.subranges()) { - if ((Mask & SR.LaneMask) != 0) - report("Lane masks of sub ranges overlap in live interval", MF, LI); - if ((SR.LaneMask & ~MaxMask) != 0) - report("Subrange lanemask is invalid", MF, LI); + if ((Mask & SR.LaneMask) != 0) { + report("Lane masks of sub ranges overlap in live interval", MF); + report_context(LI); + } + if ((SR.LaneMask & ~MaxMask) != 0) { + report("Subrange lanemask is invalid", MF); + report_context(LI); + } + if (SR.empty()) { + report("Subrange must not be empty", MF); + report_context(SR, LI.reg, SR.LaneMask); + } Mask |= SR.LaneMask; verifyLiveRange(SR, LI.reg, SR.LaneMask); - if (!LI.covers(SR)) - report("A Subrange is not covered by the main range", MF, LI); + if (!LI.covers(SR)) { + report("A Subrange is not covered by the main range", MF); + report_context(LI); + } } // Check the LI only has one connected component. ConnectedVNInfoEqClasses ConEQ(*LiveInts); - unsigned NumComp = ConEQ.Classify(&LI); + unsigned NumComp = ConEQ.Classify(LI); if (NumComp > 1) { - report("Multiple connected components in live interval", MF, LI); + report("Multiple connected components in live interval", MF); + report_context(LI); for (unsigned comp = 0; comp != NumComp; ++comp) { errs() << comp << ": valnos"; for (LiveInterval::const_vni_iterator I = LI.vni_begin(), diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp index d343301..2c93792 100644 --- a/contrib/llvm/lib/CodeGen/PHIElimination.cpp +++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp @@ -548,7 +548,7 @@ void PHIElimination::analyzePHINodes(const MachineFunction& MF) { bool PHIElimination::SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB, MachineLoopInfo *MLI) { - if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad()) + if (MBB.empty() || !MBB.front().isPHI() || MBB.isEHPad()) return false; // Quick exit for basic blocks without PHIs. const MachineLoop *CurLoop = MLI ? 
MLI->getLoopFor(&MBB) : nullptr; diff --git a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp index 99bbad1..4cabc3a 100644 --- a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp +++ b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp @@ -28,7 +28,7 @@ llvm::findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB, // Usually, we just want to insert the copy before the first terminator // instruction. However, for the edge going to a landing pad, we must insert // the copy before the call/invoke instruction. - if (!SuccMBB->isLandingPad()) + if (!SuccMBB->isEHPad()) return MBB->getFirstTerminator(); // Discover any defs/uses in this basic block. diff --git a/contrib/llvm/lib/CodeGen/ParallelCG.cpp b/contrib/llvm/lib/CodeGen/ParallelCG.cpp new file mode 100644 index 0000000..e73ba02 --- /dev/null +++ b/contrib/llvm/lib/CodeGen/ParallelCG.cpp @@ -0,0 +1,96 @@ +//===-- ParallelCG.cpp ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines functions that can be used for parallel code generation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ParallelCG.h" +#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/thread.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SplitModule.h" + +using namespace llvm; + +static void codegen(Module *M, llvm::raw_pwrite_stream &OS, + const Target *TheTarget, StringRef CPU, StringRef Features, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL, + TargetMachine::CodeGenFileType FileType) { + std::unique_ptr<TargetMachine> TM(TheTarget->createTargetMachine( + M->getTargetTriple(), CPU, Features, Options, RM, CM, OL)); + + legacy::PassManager CodeGenPasses; + if (TM->addPassesToEmitFile(CodeGenPasses, OS, FileType)) + report_fatal_error("Failed to setup codegen"); + CodeGenPasses.run(*M); +} + +std::unique_ptr<Module> +llvm::splitCodeGen(std::unique_ptr<Module> M, + ArrayRef<llvm::raw_pwrite_stream *> OSs, StringRef CPU, + StringRef Features, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, + TargetMachine::CodeGenFileType FileType) { + StringRef TripleStr = M->getTargetTriple(); + std::string ErrMsg; + const Target *TheTarget = TargetRegistry::lookupTarget(TripleStr, ErrMsg); + if (!TheTarget) + report_fatal_error(Twine("Target not found: ") + ErrMsg); + + if (OSs.size() == 1) { + codegen(M.get(), *OSs[0], TheTarget, CPU, Features, Options, RM, CM, + OL, FileType); + return M; + } + + std::vector<thread> Threads; + SplitModule(std::move(M), OSs.size(), [&](std::unique_ptr<Module> MPart) { + // We want to clone the module in a new context to multi-thread the codegen. + // We do it by serializing partition modules to bitcode (while still on the + // main thread, in order to avoid data races) and spinning up new threads + // which deserialize the partitions into separate contexts. 
+ // FIXME: Provide a more direct way to do this in LLVM. + SmallVector<char, 0> BC; + raw_svector_ostream BCOS(BC); + WriteBitcodeToFile(MPart.get(), BCOS); + + llvm::raw_pwrite_stream *ThreadOS = OSs[Threads.size()]; + Threads.emplace_back( + [TheTarget, CPU, Features, Options, RM, CM, OL, FileType, + ThreadOS](const SmallVector<char, 0> &BC) { + LLVMContext Ctx; + ErrorOr<std::unique_ptr<Module>> MOrErr = + parseBitcodeFile(MemoryBufferRef(StringRef(BC.data(), BC.size()), + "<split-module>"), + Ctx); + if (!MOrErr) + report_fatal_error("Failed to read bitcode"); + std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get()); + + codegen(MPartInCtx.get(), *ThreadOS, TheTarget, CPU, Features, + Options, RM, CM, OL, FileType); + }, + // Pass BC using std::move to ensure that it gets moved rather than + // copied into the thread's context. + std::move(BC)); + }); + + for (thread &T : Threads) + T.join(); + + return {}; +} diff --git a/contrib/llvm/lib/CodeGen/Passes.cpp b/contrib/llvm/lib/CodeGen/Passes.cpp index 024d166..873f712 100644 --- a/contrib/llvm/lib/CodeGen/Passes.cpp +++ b/contrib/llvm/lib/CodeGen/Passes.cpp @@ -13,7 +13,11 @@ //===---------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/IR/IRPrintingPasses.h" @@ -52,9 +56,6 @@ static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden, cl::desc("Disable Machine LICM")); static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden, cl::desc("Disable Machine Common Subexpression Elimination")); -static cl::opt<cl::boolOrDefault> - EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden, - cl::desc("enable the shrink-wrapping pass")); static cl::opt<cl::boolOrDefault> OptimizeRegAlloc( "optimize-regalloc", cl::Hidden, cl::desc("Enable optimized register allocation compilation path.")); @@ -95,10 +96,10 @@ PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, // Temporary option to allow experimenting with MachineScheduler as a post-RA // scheduler. Targets can "properly" enable this with -// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); Ideally it -// wouldn't be part of the standard pass pipeline, and the target would just add -// a PostRA scheduling pass wherever it wants. -static cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden, +// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID). +// Targets can return true in targetSchedulesPostRAScheduling() and +// insert a PostRA scheduling pass wherever they want. +cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden, cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)")); // Experimental option to run live interval analysis early.
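As a usage sketch for the new llvm::splitCodeGen entry point added in ParallelCG.cpp above (hypothetical caller code: `M`, `Options`, and the module's target triple are assumed to come from the host tool, and error handling is elided):

    // Split M into three partitions and run codegen for each on its own thread.
    SmallVector<char, 0> Buf0, Buf1, Buf2;
    raw_svector_ostream OS0(Buf0), OS1(Buf1), OS2(Buf2);
    llvm::raw_pwrite_stream *Streams[] = {&OS0, &OS1, &OS2};

    std::unique_ptr<Module> Ret = llvm::splitCodeGen(
        std::move(M), Streams, /*CPU=*/"", /*Features=*/"", Options,
        Reloc::Default, CodeModel::Default, CodeGenOpt::Default,
        TargetMachine::CGFT_ObjectFile);
    // With more than one output stream, splitCodeGen hands the module off to
    // the worker threads and returns a null pointer after joining them.
    assert(!Ret && "ownership was transferred to the workers");
    // Buf0..Buf2 each now hold one object-file partition.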
@@ -188,6 +189,29 @@ char TargetPassConfig::ID = 0; char TargetPassConfig::EarlyTailDuplicateID = 0; char TargetPassConfig::PostRAMachineLICMID = 0; +namespace { +struct InsertedPass { + AnalysisID TargetPassID; + IdentifyingPassPtr InsertedPassID; + bool VerifyAfter; + bool PrintAfter; + + InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID, + bool VerifyAfter, bool PrintAfter) + : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID), + VerifyAfter(VerifyAfter), PrintAfter(PrintAfter) {} + + Pass *getInsertedPass() const { + assert(InsertedPassID.isValid() && "Illegal Pass ID!"); + if (InsertedPassID.isInstance()) + return InsertedPassID.getInstance(); + Pass *NP = Pass::createPass(InsertedPassID.getID()); + assert(NP && "Pass ID not registered"); + return NP; + } +}; +} + namespace llvm { class PassConfigImpl { public: @@ -202,7 +226,7 @@ public: /// Store the pairs of <AnalysisID, AnalysisID> of which the second pass /// is inserted after each instance of the first one. - SmallVector<std::pair<AnalysisID, IdentifyingPassPtr>, 4> InsertedPasses; + SmallVector<InsertedPass, 4> InsertedPasses; }; } // namespace llvm @@ -217,7 +241,7 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), StartBefore(nullptr), StartAfter(nullptr), StopAfter(nullptr), Started(true), Stopped(false), AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), - DisableVerify(false), EnableTailMerge(true), EnableShrinkWrap(false) { + DisableVerify(false), EnableTailMerge(true) { Impl = new PassConfigImpl(); @@ -225,6 +249,10 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) // including this pass itself. initializeCodeGen(*PassRegistry::getPassRegistry()); + // Also register alias analysis passes required by codegen passes. + initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry()); + initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry()); + // Substitute Pseudo Pass IDs for real ones. substitutePass(&EarlyTailDuplicateID, &TailDuplicateID); substitutePass(&PostRAMachineLICMID, &MachineLICMID); @@ -232,14 +260,15 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) /// Insert InsertedPassID pass after TargetPassID. void TargetPassConfig::insertPass(AnalysisID TargetPassID, - IdentifyingPassPtr InsertedPassID) { + IdentifyingPassPtr InsertedPassID, + bool VerifyAfter, bool PrintAfter) { assert(((!InsertedPassID.isInstance() && TargetPassID != InsertedPassID.getID()) || (InsertedPassID.isInstance() && TargetPassID != InsertedPassID.getInstance()->getPassID())) && "Insert a pass after itself!"); - std::pair<AnalysisID, IdentifyingPassPtr> P(TargetPassID, InsertedPassID); - Impl->InsertedPasses.push_back(P); + Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID, VerifyAfter, + PrintAfter); } /// createPassConfig - Create a pass configuration object to be used by @@ -304,21 +333,9 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) { } // Add the passes after the pass P if there is any. 
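// (Illustrative, hypothetical target code: with VerifyAfter/PrintAfter now
// threaded through InsertedPass, a backend's TargetPassConfig subclass can
// request e.g.
//   insertPass(&RegisterCoalescerID, &MyTargetPeepholeID,
//              /*VerifyAfter=*/false, /*PrintAfter=*/false);
// where MyTargetPeepholeID is an invented pass ID, and the loop below will
// schedule the inserted pass with exactly those verification/printing
// settings instead of the old hard-coded false/false.)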
- for (SmallVectorImpl<std::pair<AnalysisID, IdentifyingPassPtr> >::iterator - I = Impl->InsertedPasses.begin(), - E = Impl->InsertedPasses.end(); - I != E; ++I) { - if ((*I).first == PassID) { - assert((*I).second.isValid() && "Illegal Pass ID!"); - Pass *NP; - if ((*I).second.isInstance()) - NP = (*I).second.getInstance(); - else { - NP = Pass::createPass((*I).second.getID()); - assert(NP && "Pass ID not registered"); - } - addPass(NP, false, false); - } + for (auto IP : Impl->InsertedPasses) { + if (IP.TargetPassID == PassID) + addPass(IP.getInsertedPass(), IP.VerifyAfter, IP.PrintAfter); } } else { delete P; @@ -380,10 +397,10 @@ void TargetPassConfig::addIRPasses() { // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. if (UseCFLAA) - addPass(createCFLAliasAnalysisPass()); - addPass(createTypeBasedAliasAnalysisPass()); - addPass(createScopedNoAliasAAPass()); - addPass(createBasicAliasAnalysisPass()); + addPass(createCFLAAWrapperPass()); + addPass(createTypeBasedAAWrapperPass()); + addPass(createScopedNoAliasAAWrapperPass()); + addPass(createBasicAAWrapperPass()); // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. @@ -461,7 +478,7 @@ void TargetPassConfig::addISelPrepare() { // Add both the safe stack and the stack protection passes: each of them will // only protect functions that have corresponding attributes. - addPass(createSafeStackPass()); + addPass(createSafeStackPass(TM)); addPass(createStackProtectorPass(TM)); if (PrintISelInput) @@ -539,8 +556,9 @@ void TargetPassConfig::addMachinePasses() { addPostRegAlloc(); // Insert prolog/epilog code. Eliminate abstract frame index references... - if (getEnableShrinkWrap()) + if (getOptLevel() != CodeGenOpt::None) addPass(&ShrinkWrapID); + addPass(&PrologEpilogCodeInserterID); /// Add passes that optimize machine instructions after register allocation. @@ -557,7 +575,10 @@ void TargetPassConfig::addMachinePasses() { addPass(&ImplicitNullChecksID); // Second pass scheduler. - if (getOptLevel() != CodeGenOpt::None) { + // Let Target optionally insert this pass by itself at some other + // point. + if (getOptLevel() != CodeGenOpt::None && + !TM->targetSchedulesPostRAScheduling()) { if (MISchedPostRA) addPass(&PostMachineSchedulerID); else @@ -576,7 +597,10 @@ void TargetPassConfig::addMachinePasses() { addPreEmitPass(); + addPass(&FuncletLayoutID, false); + addPass(&StackMapLivenessID, false); + addPass(&LiveDebugValuesID, false); AddingMachinePasses = false; } @@ -613,27 +637,12 @@ void TargetPassConfig::addMachineSSAOptimization() { addPass(&MachineCSEID, false); addPass(&MachineSinkingID); - addPass(&PeepholeOptimizerID, false); + addPass(&PeepholeOptimizerID); // Clean-up the dead code that may have been generated by peephole // rewriting. addPass(&DeadMachineInstructionElimID); } -bool TargetPassConfig::getEnableShrinkWrap() const { - switch (EnableShrinkWrapOpt) { - case cl::BOU_UNSET: - return EnableShrinkWrap && getOptLevel() != CodeGenOpt::None; - // If EnableShrinkWrap is set, it takes precedence on whatever the - // target sets. The rational is that we assume we want to test - // something related to shrink-wrapping. 
- case cl::BOU_TRUE: - return true; - case cl::BOU_FALSE: - return false; - } - llvm_unreachable("Invalid shrink-wrapping state"); -} - //===---------------------------------------------------------------------===// /// Register Allocation Pass Configuration //===---------------------------------------------------------------------===// @@ -717,7 +726,8 @@ void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { addPass(&PHIEliminationID, false); addPass(&TwoAddressInstructionPassID, false); - addPass(RegAllocPass); + if (RegAllocPass) + addPass(RegAllocPass); } /// Add standard target-independent passes that are tightly coupled with @@ -748,25 +758,27 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { // PreRA instruction scheduling. addPass(&MachineSchedulerID); - // Add the selected register allocation pass. - addPass(RegAllocPass); + if (RegAllocPass) { + // Add the selected register allocation pass. + addPass(RegAllocPass); - // Allow targets to change the register assignments before rewriting. - addPreRewrite(); + // Allow targets to change the register assignments before rewriting. + addPreRewrite(); - // Finally rewrite virtual registers. - addPass(&VirtRegRewriterID); + // Finally rewrite virtual registers. + addPass(&VirtRegRewriterID); - // Perform stack slot coloring and post-ra machine LICM. - // - // FIXME: Re-enable coloring with register when it's capable of adding - // kill markers. - addPass(&StackSlotColoringID); + // Perform stack slot coloring and post-ra machine LICM. + // + // FIXME: Re-enable coloring with register when it's capable of adding + // kill markers. + addPass(&StackSlotColoringID); - // Run post-ra machine LICM to hoist reloads / remats. - // - // FIXME: can this move into MachineLateOptimization? - addPass(&PostRAMachineLICMID); + // Run post-ra machine LICM to hoist reloads / remats. + // + // FIXME: can this move into MachineLateOptimization? + addPass(&PostRAMachineLICMID); + } } //===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp index ebe05e3..52b42b6 100644 --- a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -43,7 +43,7 @@ // - Optimize Loads: // // Loads that can be folded into a later instruction. A load is foldable -// if it loads to virtual registers and the virtual register defined has +// if it loads to virtual registers and the virtual register defined has // a single use. // // - Optimize Copies and Bitcast (more generally, target specific copies): @@ -98,6 +98,16 @@ static cl::opt<bool> DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false), cl::desc("Disable advanced copy optimization")); +static cl::opt<bool> DisableNAPhysCopyOpt( + "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false), + cl::desc("Disable non-allocatable physical register copy optimization")); + +// Limit the number of PHI instructions to process +// in PeepholeOptimizer::getNextSource. 
+static cl::opt<unsigned> RewritePHILimit( + "rewrite-phi-limit", cl::Hidden, cl::init(10), + cl::desc("Limit the length of PHI chains to lookup")); + STATISTIC(NumReuse, "Number of extension results reused"); STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); @@ -105,8 +115,11 @@ STATISTIC(NumLoadFold, "Number of loads folded"); STATISTIC(NumSelects, "Number of selects optimized"); STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized"); STATISTIC(NumRewrittenCopies, "Number of copies rewritten"); +STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed"); namespace { + class ValueTrackerResult; + class PeepholeOptimizer : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -130,6 +143,10 @@ namespace { } } + /// \brief Track Def -> Use info used for rewriting copies. + typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult> + RewriteMapTy; + private: bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, @@ -137,17 +154,38 @@ namespace { bool optimizeSelect(MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs); bool optimizeCondBranch(MachineInstr *MI); - bool optimizeCopyOrBitcast(MachineInstr *MI); bool optimizeCoalescableCopy(MachineInstr *MI); bool optimizeUncoalescableCopy(MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs); - bool findNextSource(unsigned &Reg, unsigned &SubReg); + bool findNextSource(unsigned Reg, unsigned SubReg, + RewriteMapTy &RewriteMap); bool isMoveImmediate(MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); + + /// \brief If copy instruction \p MI is a virtual register copy, track it in + /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was + /// previously seen as a copy, replace the uses of this copy with the + /// previously seen copy's destination register. + bool foldRedundantCopy(MachineInstr *MI, + SmallSet<unsigned, 4> &CopySrcRegs, + DenseMap<unsigned, MachineInstr *> &CopyMIs); + + /// \brief Is the register \p Reg a non-allocatable physical register? + bool isNAPhysCopy(unsigned Reg); + + /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical + /// register copy, track it in the \p NAPhysToVirtMIs map. If this + /// non-allocatable physical register was previously copied to a virtual + /// registered and hasn't been clobbered, the virt->phys copy can be + /// deleted. + bool foldRedundantNAPhysCopy( + MachineInstr *MI, + DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs); + bool isLoadFoldable(MachineInstr *MI, SmallSet<unsigned, 16> &FoldAsLoadDefCandidates); @@ -171,6 +209,69 @@ namespace { } }; + /// \brief Helper class to hold a reply for ValueTracker queries. Contains the + /// returned sources for a given search and the instructions where the sources + /// were tracked from. + class ValueTrackerResult { + private: + /// Track all sources found by one ValueTracker query. + SmallVector<TargetInstrInfo::RegSubRegPair, 2> RegSrcs; + + /// Instruction using the sources in 'RegSrcs'. 
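The RewriteMapTy introduced above maps each (Reg, SubReg) definition to the sources it was traced to; findNextSource fills it and the rewriters later walk it back. A toy analogue with standard containers (the real code uses a SmallDenseMap keyed by TargetInstrInfo::RegSubRegPair) shows the resolution walk for single-source chains:

#include <map>
#include <vector>

struct RegSub {
  unsigned Reg = 0, Sub = 0;
  bool operator<(const RegSub &O) const {
    return Reg != O.Reg ? Reg < O.Reg : Sub < O.Sub;
  }
};

using ToyRewriteMap = std::map<RegSub, std::vector<RegSub>>;

// Follow the map from Def until a pair with no entry remains: that pair is
// the "next source" a rewriter can use (single-source chains only; PHIs,
// which map to several sources, need the recursive handling shown later).
static RegSub resolve(const ToyRewriteMap &M, RegSub Def) {
  auto It = M.find(Def);
  while (It != M.end() && It->second.size() == 1) {
    Def = It->second.front();
    It = M.find(Def);
  }
  return Def;
}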
+ const MachineInstr *Inst; + + public: + ValueTrackerResult() : Inst(nullptr) {} + ValueTrackerResult(unsigned Reg, unsigned SubReg) : Inst(nullptr) { + addSource(Reg, SubReg); + } + + bool isValid() const { return getNumSources() > 0; } + + void setInst(const MachineInstr *I) { Inst = I; } + const MachineInstr *getInst() const { return Inst; } + + void clear() { + RegSrcs.clear(); + Inst = nullptr; + } + + void addSource(unsigned SrcReg, unsigned SrcSubReg) { + RegSrcs.push_back(TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg)); + } + + void setSource(int Idx, unsigned SrcReg, unsigned SrcSubReg) { + assert(Idx < getNumSources() && "Reg pair source out of index"); + RegSrcs[Idx] = TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg); + } + + int getNumSources() const { return RegSrcs.size(); } + + unsigned getSrcReg(int Idx) const { + assert(Idx < getNumSources() && "Reg source out of index"); + return RegSrcs[Idx].Reg; + } + + unsigned getSrcSubReg(int Idx) const { + assert(Idx < getNumSources() && "SubReg source out of index"); + return RegSrcs[Idx].SubReg; + } + + bool operator==(const ValueTrackerResult &Other) { + if (Other.getInst() != getInst()) + return false; + + if (Other.getNumSources() != getNumSources()) + return false; + + for (int i = 0, e = Other.getNumSources(); i != e; ++i) + if (Other.getSrcReg(i) != getSrcReg(i) || + Other.getSrcSubReg(i) != getSrcSubReg(i)) + return false; + return true; + } + }; + /// \brief Helper class to track the possible sources of a value defined by /// a (chain of) copy related instructions. /// Given a definition (instruction and definition index), this class @@ -213,23 +314,25 @@ namespace { /// \brief Dispatcher to the right underlying implementation of /// getNextSource. - bool getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceImpl(); /// \brief Specialized version of getNextSource for Copy instructions. - bool getNextSourceFromCopy(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromCopy(); /// \brief Specialized version of getNextSource for Bitcast instructions. - bool getNextSourceFromBitcast(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromBitcast(); /// \brief Specialized version of getNextSource for RegSequence /// instructions. - bool getNextSourceFromRegSequence(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromRegSequence(); /// \brief Specialized version of getNextSource for InsertSubreg /// instructions. - bool getNextSourceFromInsertSubreg(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromInsertSubreg(); /// \brief Specialized version of getNextSource for ExtractSubreg /// instructions. - bool getNextSourceFromExtractSubreg(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromExtractSubreg(); /// \brief Specialized version of getNextSource for SubregToReg /// instructions. - bool getNextSourceFromSubregToReg(unsigned &SrcReg, unsigned &SrcSubReg); + ValueTrackerResult getNextSourceFromSubregToReg(); + /// \brief Specialized version of getNextSource for PHI instructions. + ValueTrackerResult getNextSourceFromPHI(); public: /// \brief Create a ValueTracker instance for the value defined by \p Reg. @@ -276,16 +379,10 @@ namespace { /// \brief Following the use-def chain, get the next available source /// for the tracked value. - /// When the returned value is not nullptr, \p SrcReg gives the register - /// that contain the tracked value. 
- /// \note The sub register index returned in \p SrcSubReg must be used - /// on \p SrcReg to access the actual value. - /// \return Unless the returned value is nullptr (i.e., no source found), - /// \p SrcReg gives the register of the next source used in the returned - /// instruction and \p SrcSubReg the sub-register index to be used on that - /// source to get the tracked value. When nullptr is returned, no - /// alternative source has been found. - const MachineInstr *getNextSource(unsigned &SrcReg, unsigned &SrcSubReg); + /// \return A ValueTrackerResult containing a set of registers + /// and sub registers with tracked values. A ValueTrackerResult with + /// an empty set of registers means no source was found. + ValueTrackerResult getNextSource(); /// \brief Get the last register where the initial value can be found. /// Initially this is the register of the definition. @@ -303,11 +400,10 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts", "Peephole Optimizations", false, false) -/// optimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads -/// a single register and writes a single register and it does not modify the -/// source, and if the source value is preserved as a sub-register of the -/// result, then replace all reachable uses of the source with the subreg of the -/// result. +/// If instruction is a copy-like instruction, i.e. it reads a single register +/// and writes a single register and it does not modify the source, and if the +/// source value is preserved as a sub-register of the result, then replace all +/// reachable uses of the source with the subreg of the result. /// /// Do not generate an EXTRACT that is used only in a debug use, as this changes /// the code. Since this code does not currently share EXTRACTs, just ignore all @@ -458,10 +554,10 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, return Changed; } -/// optimizeCmpInstr - If the instruction is a compare and the previous -/// instruction it's comparing against all ready sets (or could be modified to -/// set) the same flag as the compare, then we can remove the comparison and use -/// the flag from the previous instruction. +/// If the instruction is a compare and the previous instruction it's comparing +/// against already sets (or could be modified to set) the same flag as the +/// compare, then we can remove the comparison and use the flag from the +/// previous instruction. bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB) { // If this instruction is a comparison against zero and isn't comparing a @@ -506,88 +602,138 @@ bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) { return TII->optimizeCondBranch(MI); } -/// \brief Check if the registers defined by the pair (RegisterClass, SubReg) -/// share the same register file. -static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, - const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) { - // Same register class. - if (DefRC == SrcRC) - return true; - - // Both operands are sub registers. Check if they share a register class. - unsigned SrcIdx, DefIdx; - if (SrcSubReg && DefSubReg) - return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, - SrcIdx, DefIdx) != nullptr; - // At most one of the register is a sub register, make it Src to avoid - // duplicating the test. 
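The helper being deleted above used a small canonicalization trick worth preserving as a note: when at most one of two symmetric operands can carry a subregister index, swap it onto a fixed side so the one-sided case is written and tested exactly once. A sketch with toy stand-ins for the TargetRegisterInfo queries:

#include <utility>

// Toy compatibility checks standing in for the TargetRegisterInfo calls.
static bool commonSuperClass(int SrcRC, int DefRC, unsigned SrcSub,
                             unsigned DefSub) {
  return SrcRC == DefRC && SrcSub == DefSub;
}
static bool matchingSuperClass(int SrcRC, int DefRC, unsigned SrcSub) {
  return SrcRC == DefRC && SrcSub != 0;
}
static bool commonSubClass(int DefRC, int SrcRC) { return DefRC == SrcRC; }

static bool shareFile(int DefRC, unsigned DefSub, int SrcRC, unsigned SrcSub) {
  if (DefRC == SrcRC)
    return true;
  if (SrcSub && DefSub)
    return commonSuperClass(SrcRC, DefRC, SrcSub, DefSub);
  // At most one side has a subregister index from here on; swap it onto
  // Src so the one-sided case is handled by a single code path.
  if (!SrcSub) {
    std::swap(DefSub, SrcSub);
    std::swap(DefRC, SrcRC);
  }
  if (SrcSub)
    return matchingSuperClass(SrcRC, DefRC, SrcSub);
  return commonSubClass(DefRC, SrcRC);
}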
- if (!SrcSubReg) { - std::swap(DefSubReg, SrcSubReg); - std::swap(DefRC, SrcRC); - } - - // One of the register is a sub register, check if we can get a superclass. - if (SrcSubReg) - return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; - // Plain copy. - return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; -} - /// \brief Try to find the next source that share the same register file /// for the value defined by \p Reg and \p SubReg. -/// When true is returned, \p Reg and \p SubReg are updated with the -/// register number and sub-register index of the new source. +/// When true is returned, the \p RewriteMap can be used by the client to +/// retrieve all Def -> Use along the way up to the next source. Any found +/// Use that is not itself a key for another entry, is the next source to +/// use. During the search for the next source, multiple sources can be found +/// given multiple incoming sources of a PHI instruction. In this case, we +/// look in each PHI source for the next source; all found next sources must +/// share the same register file as \p Reg and \p SubReg. The client should +/// then be capable to rewrite all intermediate PHIs to get the next source. /// \return False if no alternative sources are available. True otherwise. -bool PeepholeOptimizer::findNextSource(unsigned &Reg, unsigned &SubReg) { +bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg, + RewriteMapTy &RewriteMap) { // Do not try to find a new source for a physical register. // So far we do not have any motivating example for doing that. // Thus, instead of maintaining untested code, we will revisit that if // that changes at some point. if (TargetRegisterInfo::isPhysicalRegister(Reg)) return false; - const TargetRegisterClass *DefRC = MRI->getRegClass(Reg); - unsigned DefSubReg = SubReg; - - unsigned Src; - unsigned SrcSubReg; - bool ShouldRewrite = false; - - // Follow the chain of copies until we reach the top of the use-def chain - // or find a more suitable source. - ValueTracker ValTracker(Reg, DefSubReg, *MRI, !DisableAdvCopyOpt, TII); - do { - unsigned CopySrcReg, CopySrcSubReg; - if (!ValTracker.getNextSource(CopySrcReg, CopySrcSubReg)) - break; - Src = CopySrcReg; - SrcSubReg = CopySrcSubReg; - - // Do not extend the live-ranges of physical registers as they add - // constraints to the register allocator. - // Moreover, if we want to extend the live-range of a physical register, - // unlike SSA virtual register, we will have to check that they are not - // redefine before the related use. - if (TargetRegisterInfo::isPhysicalRegister(Src)) - break; - const TargetRegisterClass *SrcRC = MRI->getRegClass(Src); + SmallVector<TargetInstrInfo::RegSubRegPair, 4> SrcToLook; + TargetInstrInfo::RegSubRegPair CurSrcPair(Reg, SubReg); + SrcToLook.push_back(CurSrcPair); + + unsigned PHICount = 0; + while (!SrcToLook.empty() && PHICount < RewritePHILimit) { + TargetInstrInfo::RegSubRegPair Pair = SrcToLook.pop_back_val(); + // As explained above, do not handle physical registers + if (TargetRegisterInfo::isPhysicalRegister(Pair.Reg)) + return false; - // If this source does not incur a cross register bank copy, use it. 
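The rewritten findNextSource trades the old single linear walk for an explicit worklist: single-source chains are still walked upward, but a PHI fans out into one worklist entry per incoming value, bounded by the PHI limit. A self-contained sketch of that shape, with toy types; the real pass additionally detects PHI cycles through the RewriteMap, which is omitted here.

#include <vector>

struct ToyDef {
  std::vector<ToyDef *> Sources; // More than one models a PHI's incoming values.
  bool SameRegFile = false;      // Models "shares the register file".
};

// Returns true if every chain reaches a usable source before the PHI cap.
static bool findUsable(ToyDef *Start, unsigned PHILimit) {
  std::vector<ToyDef *> Worklist{Start};
  unsigned PHICount = 0;
  while (!Worklist.empty()) {
    ToyDef *D = Worklist.back();
    Worklist.pop_back();
    // Walk single-source chains until a usable def, a PHI, or a dead end.
    while (!D->SameRegFile) {
      if (D->Sources.size() > 1) { // PHI: expand each incoming source.
        if (++PHICount >= PHILimit)
          return false;
        Worklist.insert(Worklist.end(), D->Sources.begin(), D->Sources.end());
        break;
      }
      if (D->Sources.empty())
        return false; // One unrewritable chain sinks the whole query.
      D = D->Sources.front();
    }
  }
  return true;
}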
-    ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, DefSubReg, SrcRC,
-                                          SrcSubReg);
-  } while (!ShouldRewrite);
+    CurSrcPair = Pair;
+    ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI,
+                            !DisableAdvCopyOpt, TII);
+    ValueTrackerResult Res;
+    bool ShouldRewrite = false;
+
+    do {
+      // Follow the chain of copies until we reach the top of the use-def chain
+      // or find a more suitable source.
+      Res = ValTracker.getNextSource();
+      if (!Res.isValid())
+        break;
+
+      // Insert the Def -> Use entry for the recently found source.
+      ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
+      if (CurSrcRes.isValid()) {
+        assert(CurSrcRes == Res && "ValueTrackerResult found must match");
+        // An existing entry with multiple sources is a PHI cycle we must avoid.
+        // Otherwise it's an entry with a valid next source we already found.
+        if (CurSrcRes.getNumSources() > 1) {
+          DEBUG(dbgs() << "findNextSource: found PHI cycle, aborting...\n");
+          return false;
+        }
+        break;
+      }
+      RewriteMap.insert(std::make_pair(CurSrcPair, Res));
+
+      // ValueTrackerResult usually has one source unless it's the result from
+      // a PHI instruction. Add the found PHI edges to be looked up further.
+      unsigned NumSrcs = Res.getNumSources();
+      if (NumSrcs > 1) {
+        PHICount++;
+        for (unsigned i = 0; i < NumSrcs; ++i)
+          SrcToLook.push_back(TargetInstrInfo::RegSubRegPair(
+              Res.getSrcReg(i), Res.getSrcSubReg(i)));
+        break;
+      }
 
-  // If we did not find a more suitable source, there is nothing to optimize.
-  if (!ShouldRewrite || Src == Reg)
+      CurSrcPair.Reg = Res.getSrcReg(0);
+      CurSrcPair.SubReg = Res.getSrcSubReg(0);
+      // Do not extend the live-ranges of physical registers as they add
+      // constraints to the register allocator. Moreover, if we want to extend
+      // the live-range of a physical register, unlike SSA virtual register,
+      // we will have to check that they aren't redefined before the related
+      // use.
+      if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
+        return false;
+
+      const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
+      ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
+                                                CurSrcPair.SubReg);
+    } while (!ShouldRewrite);
+
+    // Continue looking for new sources...
+    if (Res.isValid())
+      continue;
+
+    // Do not continue searching for a new source if there's at least
+    // one use-def which cannot be rewritten.
+    if (!ShouldRewrite)
+      return false;
+  }
+
+  if (PHICount >= RewritePHILimit) {
+    DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
     return false;
+  }
 
-  Reg = Src;
-  SubReg = SrcSubReg;
-  return true;
+  // If we did not find a more suitable source, there is nothing to optimize.
+  return CurSrcPair.Reg != Reg;
+}
+
+/// \brief Insert a PHI instruction with incoming edges \p SrcRegs that are
+/// guaranteed to have the same register class. This is necessary whenever we
+/// successfully traverse a PHI instruction and find suitable sources coming
+/// from its edges. By inserting a new PHI, we provide a rewritten PHI def
+/// suitable to be used in a new COPY instruction.
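insertPHI below steps MBBOpIdx by two because a MIR PHI lays out its operands as the def followed by (value, predecessor-block) pairs; getNextSourceFromPHI later in this file walks the same layout. A toy sketch of iterating such pairs, with plain ints standing in for operands:

#include <cassert>
#include <cstddef>
#include <vector>

// Flat toy operand list mirroring the MIR PHI layout:
//   [def, value0, block0, value1, block1, ...]
static void forEachIncoming(const std::vector<int> &Ops,
                            void (*Visit)(int Value, int Block)) {
  assert(Ops.size() % 2 == 1 && "def plus (value, block) pairs");
  for (std::size_t I = 1; I + 1 < Ops.size(); I += 2)
    Visit(Ops[I], Ops[I + 1]); // Value operand, then its predecessor block.
}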
+static MachineInstr *
+insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+          const SmallVectorImpl<TargetInstrInfo::RegSubRegPair> &SrcRegs,
+          MachineInstr *OrigPHI) {
+  assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");
+
+  const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg);
+  unsigned NewVR = MRI->createVirtualRegister(NewRC);
+  MachineBasicBlock *MBB = OrigPHI->getParent();
+  MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(),
+                                    TII->get(TargetOpcode::PHI), NewVR);
+
+  unsigned MBBOpIdx = 2;
+  for (auto RegPair : SrcRegs) {
+    MIB.addReg(RegPair.Reg, 0, RegPair.SubReg);
+    MIB.addMBB(OrigPHI->getOperand(MBBOpIdx).getMBB());
+    // Since we're extending the lifetime of RegPair.Reg, clear the
+    // kill flags to account for that and make sure RegPair.Reg reaches
+    // the new PHI.
+    MRI->clearKillFlags(RegPair.Reg);
+    MBBOpIdx += 2;
+  }
+
+  return MIB;
 }
 
 namespace {
@@ -624,7 +770,7 @@ public:
   /// This source defines the whole definition, i.e.,
   /// (TrackReg, TrackSubReg) = (dst, dstSubIdx).
   ///
-  /// The second and subsequent calls will return false, has there is only one
+  /// The second and subsequent calls will return false, as there is only one
   /// rewritable source.
   ///
   /// \return True if a rewritable source has been found, false otherwise.
@@ -632,9 +778,9 @@ public:
   virtual bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
                                        unsigned &TrackReg,
                                        unsigned &TrackSubReg) {
-    // If CurrentSrcIdx == 1, this means this function has already been
-    // called once. CopyLike has one defintiion and one argument, thus,
-    // there is nothing else to rewrite.
+    // If CurrentSrcIdx == 1, this means this function has already been called
+    // once. CopyLike has one definition and one argument, thus, there is
+    // nothing else to rewrite.
     if (!CopyLike.isCopy() || CurrentSrcIdx == 1)
       return false;
     // This is the first call to getNextRewritableSource.
@@ -653,7 +799,7 @@ public:
 
   /// \brief Rewrite the current source with \p NewReg and \p NewSubReg
   /// if possible.
-  /// \return True if the rewritting was possible, false otherwise.
+  /// \return True if the rewriting was possible, false otherwise.
   virtual bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) {
     if (!CopyLike.isCopy() || CurrentSrcIdx != 1)
       return false;
@@ -662,6 +808,157 @@ public:
     MOSrc.setSubReg(NewSubReg);
     return true;
   }
+
+  /// \brief Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find
+  /// the new source to use for rewrite. If \p HandleMultipleSources is true and
+  /// multiple sources for a given \p Def are found along the way, we found a
+  /// PHI instruction that needs to be rewritten.
+  /// TODO: HandleMultipleSources should be removed once we test PHI handling
+  /// with coalescable copies.
+  TargetInstrInfo::RegSubRegPair
+  getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+               TargetInstrInfo::RegSubRegPair Def,
+               PeepholeOptimizer::RewriteMapTy &RewriteMap,
+               bool HandleMultipleSources = true) {
+
+    TargetInstrInfo::RegSubRegPair LookupSrc(Def.Reg, Def.SubReg);
+    do {
+      ValueTrackerResult Res = RewriteMap.lookup(LookupSrc);
+      // If there are no entries on the map, LookupSrc is the new source.
+      if (!Res.isValid())
+        return LookupSrc;
+
+      // There's only one source for this definition, keep searching...
+ unsigned NumSrcs = Res.getNumSources(); + if (NumSrcs == 1) { + LookupSrc.Reg = Res.getSrcReg(0); + LookupSrc.SubReg = Res.getSrcSubReg(0); + continue; + } + + // TODO: Remove once multiple srcs w/ coalescable copies are supported. + if (!HandleMultipleSources) + break; + + // Multiple sources, recurse into each source to find a new source + // for it. Then, rewrite the PHI accordingly to its new edges. + SmallVector<TargetInstrInfo::RegSubRegPair, 4> NewPHISrcs; + for (unsigned i = 0; i < NumSrcs; ++i) { + TargetInstrInfo::RegSubRegPair PHISrc(Res.getSrcReg(i), + Res.getSrcSubReg(i)); + NewPHISrcs.push_back( + getNewSource(MRI, TII, PHISrc, RewriteMap, HandleMultipleSources)); + } + + // Build the new PHI node and return its def register as the new source. + MachineInstr *OrigPHI = const_cast<MachineInstr *>(Res.getInst()); + MachineInstr *NewPHI = insertPHI(MRI, TII, NewPHISrcs, OrigPHI); + DEBUG(dbgs() << "-- getNewSource\n"); + DEBUG(dbgs() << " Replacing: " << *OrigPHI); + DEBUG(dbgs() << " With: " << *NewPHI); + const MachineOperand &MODef = NewPHI->getOperand(0); + return TargetInstrInfo::RegSubRegPair(MODef.getReg(), MODef.getSubReg()); + + } while (1); + + return TargetInstrInfo::RegSubRegPair(0, 0); + } + + /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap + /// and create a new COPY instruction. More info about RewriteMap in + /// PeepholeOptimizer::findNextSource. Right now this is only used to handle + /// Uncoalescable copies, since they are copy like instructions that aren't + /// recognized by the register allocator. + virtual MachineInstr * + RewriteSource(TargetInstrInfo::RegSubRegPair Def, + PeepholeOptimizer::RewriteMapTy &RewriteMap) { + return nullptr; + } +}; + +/// \brief Helper class to rewrite uncoalescable copy like instructions +/// into new COPY (coalescable friendly) instructions. +class UncoalescableRewriter : public CopyRewriter { +protected: + const TargetInstrInfo &TII; + MachineRegisterInfo &MRI; + /// The number of defs in the bitcast + unsigned NumDefs; + +public: + UncoalescableRewriter(MachineInstr &MI, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI) + : CopyRewriter(MI), TII(TII), MRI(MRI) { + NumDefs = MI.getDesc().getNumDefs(); + } + + /// \brief Get the next rewritable def source (TrackReg, TrackSubReg) + /// All such sources need to be considered rewritable in order to + /// rewrite a uncoalescable copy-like instruction. This method return + /// each definition that must be checked if rewritable. + /// + bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg, + unsigned &TrackReg, + unsigned &TrackSubReg) override { + // Find the next non-dead definition and continue from there. + if (CurrentSrcIdx == NumDefs) + return false; + + while (CopyLike.getOperand(CurrentSrcIdx).isDead()) { + ++CurrentSrcIdx; + if (CurrentSrcIdx == NumDefs) + return false; + } + + // What we track are the alternative sources of the definition. + const MachineOperand &MODef = CopyLike.getOperand(CurrentSrcIdx); + TrackReg = MODef.getReg(); + TrackSubReg = MODef.getSubReg(); + + CurrentSrcIdx++; + return true; + } + + /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap + /// and create a new COPY instruction. More info about RewriteMap in + /// PeepholeOptimizer::findNextSource. Right now this is only used to handle + /// Uncoalescable copies, since they are copy like instructions that aren't + /// recognized by the register allocator. 
+ MachineInstr * + RewriteSource(TargetInstrInfo::RegSubRegPair Def, + PeepholeOptimizer::RewriteMapTy &RewriteMap) override { + assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) && + "We do not rewrite physical registers"); + + // Find the new source to use in the COPY rewrite. + TargetInstrInfo::RegSubRegPair NewSrc = + getNewSource(&MRI, &TII, Def, RewriteMap); + + // Insert the COPY. + const TargetRegisterClass *DefRC = MRI.getRegClass(Def.Reg); + unsigned NewVR = MRI.createVirtualRegister(DefRC); + + MachineInstr *NewCopy = + BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(), + TII.get(TargetOpcode::COPY), NewVR) + .addReg(NewSrc.Reg, 0, NewSrc.SubReg); + + NewCopy->getOperand(0).setSubReg(Def.SubReg); + if (Def.SubReg) + NewCopy->getOperand(0).setIsUndef(); + + DEBUG(dbgs() << "-- RewriteSource\n"); + DEBUG(dbgs() << " Replacing: " << CopyLike); + DEBUG(dbgs() << " With: " << *NewCopy); + MRI.replaceRegWith(Def.Reg, NewVR); + MRI.clearKillFlags(NewVR); + + // We extended the lifetime of NewSrc.Reg, clear the kill flags to + // account for that. + MRI.clearKillFlags(NewSrc.Reg); + + return NewCopy; + } }; /// \brief Specialized rewriter for INSERT_SUBREG instruction. @@ -699,7 +996,7 @@ public: // partial definition. TrackReg = MODef.getReg(); if (MODef.getSubReg()) - // Bails if we have to compose sub-register indices. + // Bail if we have to compose sub-register indices. return false; TrackSubReg = (unsigned)CopyLike.getOperand(3).getImm(); return true; @@ -740,7 +1037,7 @@ public: CurrentSrcIdx = 1; const MachineOperand &MOExtractedReg = CopyLike.getOperand(1); SrcReg = MOExtractedReg.getReg(); - // If we have to compose sub-register indices, bails out. + // If we have to compose sub-register indices, bail out. if (MOExtractedReg.getSubReg()) return false; @@ -818,7 +1115,7 @@ public: } const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx); SrcReg = MOInsertedReg.getReg(); - // If we have to compose sub-register indices, bails out. + // If we have to compose sub-register indices, bail out. if ((SrcSubReg = MOInsertedReg.getSubReg())) return false; @@ -828,7 +1125,7 @@ public: const MachineOperand &MODef = CopyLike.getOperand(0); TrackReg = MODef.getReg(); - // If we have to compose sub-registers, bails. + // If we have to compose sub-registers, bail. return MODef.getSubReg() == 0; } @@ -850,7 +1147,13 @@ public: /// \return A pointer to a dynamically allocated CopyRewriter or nullptr /// if no rewriter works for \p MI. static CopyRewriter *getCopyRewriter(MachineInstr &MI, - const TargetInstrInfo &TII) { + const TargetInstrInfo &TII, + MachineRegisterInfo &MRI) { + // Handle uncoalescable copy-like instructions. + if (MI.isBitcast() || (MI.isRegSequenceLike() || MI.isInsertSubregLike() || + MI.isExtractSubregLike())) + return new UncoalescableRewriter(MI, TII, MRI); + switch (MI.getOpcode()) { default: return nullptr; @@ -874,7 +1177,7 @@ static CopyRewriter *getCopyRewriter(MachineInstr &MI, /// the same register bank. /// New copies issued by this optimization are register allocator /// friendly. This optimization does not remove any copy as it may -/// overconstraint the register allocator, but replaces some operands +/// overconstrain the register allocator, but replaces some operands /// when possible. /// \pre isCoalescableCopy(*MI) is true. /// \return True, when \p MI has been rewritten. False otherwise. 
@@ -889,25 +1192,33 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr *MI) { bool Changed = false; // Get the right rewriter for the current copy. - std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII)); - // If none exists, bails out. + std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII, *MRI)); + // If none exists, bail out. if (!CpyRewriter) return false; // Rewrite each rewritable source. unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg; while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg, TrackSubReg)) { - unsigned NewSrc = TrackReg; - unsigned NewSubReg = TrackSubReg; - // Try to find a more suitable source. - // If we failed to do so, or get the actual source, - // move to the next source. - if (!findNextSource(NewSrc, NewSubReg) || SrcReg == NewSrc) + // Keep track of PHI nodes and its incoming edges when looking for sources. + RewriteMapTy RewriteMap; + // Try to find a more suitable source. If we failed to do so, or get the + // actual source, move to the next source. + if (!findNextSource(TrackReg, TrackSubReg, RewriteMap)) + continue; + + // Get the new source to rewrite. TODO: Only enable handling of multiple + // sources (PHIs) once we have a motivating example and testcases for it. + TargetInstrInfo::RegSubRegPair TrackPair(TrackReg, TrackSubReg); + TargetInstrInfo::RegSubRegPair NewSrc = CpyRewriter->getNewSource( + MRI, TII, TrackPair, RewriteMap, false /* multiple sources */); + if (SrcReg == NewSrc.Reg || NewSrc.Reg == 0) continue; + // Rewrite source. - if (CpyRewriter->RewriteCurrentSource(NewSrc, NewSubReg)) { + if (CpyRewriter->RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) { // We may have extended the live-range of NewSrc, account for that. - MRI->clearKillFlags(NewSrc); + MRI->clearKillFlags(NewSrc.Reg); Changed = true; } } @@ -936,61 +1247,53 @@ bool PeepholeOptimizer::optimizeUncoalescableCopy( assert(MI && isUncoalescableCopy(*MI) && "Invalid argument"); // Check if we can rewrite all the values defined by this instruction. - SmallVector< - std::pair<TargetInstrInfo::RegSubRegPair, TargetInstrInfo::RegSubRegPair>, - 4> RewritePairs; - for (const MachineOperand &MODef : MI->defs()) { - if (MODef.isDead()) - // We can ignore those. - continue; + SmallVector<TargetInstrInfo::RegSubRegPair, 4> RewritePairs; + // Get the right rewriter for the current copy. + std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII, *MRI)); + // If none exists, bail out. + if (!CpyRewriter) + return false; + // Rewrite each rewritable source by generating new COPYs. This works + // differently from optimizeCoalescableCopy since it first makes sure that all + // definitions can be rewritten. + RewriteMapTy RewriteMap; + unsigned Reg, SubReg, CopyDefReg, CopyDefSubReg; + while (CpyRewriter->getNextRewritableSource(Reg, SubReg, CopyDefReg, + CopyDefSubReg)) { // If a physical register is here, this is probably for a good reason. // Do not rewrite that. - if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg())) + if (TargetRegisterInfo::isPhysicalRegister(CopyDefReg)) return false; // If we do not know how to rewrite this definition, there is no point // in trying to kill this instruction. 
- TargetInstrInfo::RegSubRegPair Def(MODef.getReg(), MODef.getSubReg()); - TargetInstrInfo::RegSubRegPair Src = Def; - if (!findNextSource(Src.Reg, Src.SubReg)) + TargetInstrInfo::RegSubRegPair Def(CopyDefReg, CopyDefSubReg); + if (!findNextSource(Def.Reg, Def.SubReg, RewriteMap)) return false; - RewritePairs.push_back(std::make_pair(Def, Src)); + + RewritePairs.push_back(Def); } + // The change is possible for all defs, do it. - for (const auto &PairDefSrc : RewritePairs) { - const auto &Def = PairDefSrc.first; - const auto &Src = PairDefSrc.second; + for (const auto &Def : RewritePairs) { // Rewrite the "copy" in a way the register coalescer understands. - assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) && - "We do not rewrite physical registers"); - const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg); - unsigned NewVR = MRI->createVirtualRegister(DefRC); - MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(TargetOpcode::COPY), - NewVR).addReg(Src.Reg, 0, Src.SubReg); - NewCopy->getOperand(0).setSubReg(Def.SubReg); - if (Def.SubReg) - NewCopy->getOperand(0).setIsUndef(); + MachineInstr *NewCopy = CpyRewriter->RewriteSource(Def, RewriteMap); + assert(NewCopy && "Should be able to always generate a new copy"); LocalMIs.insert(NewCopy); - MRI->replaceRegWith(Def.Reg, NewVR); - MRI->clearKillFlags(NewVR); - // We extended the lifetime of Src. - // Clear the kill flags to account for that. - MRI->clearKillFlags(Src.Reg); } + // MI is now dead. MI->eraseFromParent(); ++NumUncoalescableCopies; return true; } -/// isLoadFoldable - Check whether MI is a candidate for folding into a later -/// instruction. We only fold loads to virtual registers and the virtual -/// register defined has a single use. +/// Check whether MI is a candidate for folding into a later instruction. +/// We only fold loads to virtual registers and the virtual register defined +/// has a single use. bool PeepholeOptimizer::isLoadFoldable( - MachineInstr *MI, - SmallSet<unsigned, 16> &FoldAsLoadDefCandidates) { + MachineInstr *MI, SmallSet<unsigned, 16> &FoldAsLoadDefCandidates) { if (!MI->canFoldAsLoad() || !MI->mayLoad()) return false; const MCInstrDesc &MCID = MI->getDesc(); @@ -1010,9 +1313,9 @@ bool PeepholeOptimizer::isLoadFoldable( return false; } -bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI, - SmallSet<unsigned, 4> &ImmDefRegs, - DenseMap<unsigned, MachineInstr*> &ImmDefMIs) { +bool PeepholeOptimizer::isMoveImmediate( + MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, + DenseMap<unsigned, MachineInstr *> &ImmDefMIs) { const MCInstrDesc &MCID = MI->getDesc(); if (!MI->isMoveImmediate()) return false; @@ -1028,23 +1331,26 @@ bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI, return false; } -/// foldImmediate - Try folding register operands that are defined by move -/// immediate instructions, i.e. a trivial constant folding optimization, if +/// Try folding register operands that are defined by move immediate +/// instructions, i.e. a trivial constant folding optimization, if /// and only if the def and use are in the same BB. 
-bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, - SmallSet<unsigned, 4> &ImmDefRegs, - DenseMap<unsigned, MachineInstr*> &ImmDefMIs) { +bool PeepholeOptimizer::foldImmediate( + MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, + DenseMap<unsigned, MachineInstr *> &ImmDefMIs) { for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; + // Ignore dead implicit defs. + if (MO.isImplicit() && MO.isDead()) + continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; if (ImmDefRegs.count(Reg) == 0) continue; DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg); - assert(II != ImmDefMIs.end()); + assert(II != ImmDefMIs.end() && "couldn't find immediate definition"); if (TII->FoldImmediate(MI, II->second, Reg, MRI)) { ++NumImmFold; return true; @@ -1053,6 +1359,117 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, return false; } +// FIXME: This is very simple and misses some cases which should be handled when +// motivating examples are found. +// +// The copy rewriting logic should look at uses as well as defs and be able to +// eliminate copies across blocks. +// +// Later copies that are subregister extracts will also not be eliminated since +// only the first copy is considered. +// +// e.g. +// %vreg1 = COPY %vreg0 +// %vreg2 = COPY %vreg0:sub1 +// +// Should replace %vreg2 uses with %vreg1:sub1 +bool PeepholeOptimizer::foldRedundantCopy( + MachineInstr *MI, SmallSet<unsigned, 4> &CopySrcRegs, + DenseMap<unsigned, MachineInstr *> &CopyMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); + + unsigned SrcReg = MI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + return false; + + unsigned DstReg = MI->getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + if (CopySrcRegs.insert(SrcReg).second) { + // First copy of this reg seen. + CopyMIs.insert(std::make_pair(SrcReg, MI)); + return false; + } + + MachineInstr *PrevCopy = CopyMIs.find(SrcReg)->second; + + unsigned SrcSubReg = MI->getOperand(1).getSubReg(); + unsigned PrevSrcSubReg = PrevCopy->getOperand(1).getSubReg(); + + // Can't replace different subregister extracts. + if (SrcSubReg != PrevSrcSubReg) + return false; + + unsigned PrevDstReg = PrevCopy->getOperand(0).getReg(); + + // Only replace if the copy register class is the same. + // + // TODO: If we have multiple copies to different register classes, we may want + // to track multiple copies of the same source register. + if (MRI->getRegClass(DstReg) != MRI->getRegClass(PrevDstReg)) + return false; + + MRI->replaceRegWith(DstReg, PrevDstReg); + + // Lifetime of the previous copy has been extended. 
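foldRedundantCopy above is a first-copy-wins scheme: the first COPY from each virtual source is remembered, and a later identical copy is folded by redirecting its uses to the earlier destination. A simplified standalone model with toy types; the real pass also checks subregister indices and register classes before folding.

#include <unordered_map>

struct ToyCopy { unsigned Dst, Src; };

// Returns true if C is redundant; ReplaceWith then holds the register that
// should take over C.Dst's uses.
static bool foldCopyToy(const ToyCopy &C,
                        std::unordered_map<unsigned, unsigned> &FirstDst,
                        unsigned &ReplaceWith) {
  auto [It, New] = FirstDst.try_emplace(C.Src, C.Dst);
  if (New)
    return false;           // First copy of this source: just record it.
  ReplaceWith = It->second; // Later copy: reuse the earlier destination.
  return true;
}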
+ MRI->clearKillFlags(PrevDstReg); + return true; +} + +bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) { + return TargetRegisterInfo::isPhysicalRegister(Reg) && + !MRI->isAllocatable(Reg); +} + +bool PeepholeOptimizer::foldRedundantNAPhysCopy( + MachineInstr *MI, DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs) { + assert(MI->isCopy() && "expected a COPY machine instruction"); + + if (DisableNAPhysCopyOpt) + return false; + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) { + // %vreg = COPY %PHYSREG + // Avoid using a datastructure which can track multiple live non-allocatable + // phys->virt copies since LLVM doesn't seem to do this. + NAPhysToVirtMIs.insert({SrcReg, MI}); + return false; + } + + if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg))) + return false; + + // %PHYSREG = COPY %vreg + auto PrevCopy = NAPhysToVirtMIs.find(DstReg); + if (PrevCopy == NAPhysToVirtMIs.end()) { + // We can't remove the copy: there was an intervening clobber of the + // non-allocatable physical register after the copy to virtual. + DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI + << '\n'); + return false; + } + + unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg(); + if (PrevDstReg == SrcReg) { + // Remove the virt->phys copy: we saw the virtual register definition, and + // the non-allocatable physical register's state hasn't changed since then. + DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n'); + ++NumNAPhysCopies; + return true; + } + + // Potential missed optimization opportunity: we saw a different virtual + // register get a copy of the non-allocatable physical register, and we only + // track one such copy. Avoid getting confused by this new non-allocatable + // physical register definition, and remove it from the tracked copies. + DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n'); + NAPhysToVirtMIs.erase(PrevCopy); + return false; +} + bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipOptnoneFunction(*MF.getFunction())) return false; @@ -1070,9 +1487,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = &*I; - + for (MachineBasicBlock &MBB : MF) { bool SeenMoveImm = false; // During this forward scan, at some point it needs to answer the question @@ -1086,8 +1501,19 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { DenseMap<unsigned, MachineInstr*> ImmDefMIs; SmallSet<unsigned, 16> FoldAsLoadDefCandidates; - for (MachineBasicBlock::iterator - MII = I->begin(), MIE = I->end(); MII != MIE; ) { + // Track when a non-allocatable physical register is copied to a virtual + // register so that useless moves can be removed. + // + // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG` + // without any intervening re-definition of %PHYSREG. + DenseMap<unsigned, MachineInstr *> NAPhysToVirtMIs; + + // Set of virtual registers that are copied from. + SmallSet<unsigned, 4> CopySrcRegs; + DenseMap<unsigned, MachineInstr *> CopySrcMIs; + + for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); + MII != MIE; ) { MachineInstr *MI = &*MII; // We may be erasing MI below, increment MII now. 
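The non-allocatable physical copy folding above boils down to a small state machine: remember each `%vreg = COPY %physreg`, invalidate the entry on any clobber of the physical register, and delete a later `%physreg = COPY %vreg` when it would restore a value the register still holds. A toy model of that state, with plain unsigneds standing in for registers:

#include <unordered_map>

struct NAState {
  std::unordered_map<unsigned, unsigned> PhysToVirt; // physreg -> vreg

  void clobber(unsigned Phys) { PhysToVirt.erase(Phys); }
  void recordPhysToVirt(unsigned Phys, unsigned Virt) {
    PhysToVirt[Phys] = Virt;
  }
  // True when a virt->phys copy restores the tracked value and can
  // therefore be deleted.
  bool isRedundantVirtToPhys(unsigned Virt, unsigned Phys) const {
    auto It = PhysToVirt.find(Phys);
    return It != PhysToVirt.end() && It->second == Virt;
  }
};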
++MII; @@ -1097,20 +1523,60 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { if (MI->isDebugValue()) continue; - // If there exists an instruction which belongs to the following - // categories, we will discard the load candidates. - if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || - MI->isKill() || MI->isInlineAsm() || - MI->hasUnmodeledSideEffects()) { + // If we run into an instruction we can't fold across, discard + // the load candidates. + if (MI->isLoadFoldBarrier()) FoldAsLoadDefCandidates.clear(); + + if (MI->isPosition() || MI->isPHI()) + continue; + + if (!MI->isCopy()) { + for (const auto &Op : MI->operands()) { + // Visit all operands: definitions can be implicit or explicit. + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Op.isDef() && isNAPhysCopy(Reg)) { + const auto &Def = NAPhysToVirtMIs.find(Reg); + if (Def != NAPhysToVirtMIs.end()) { + // A new definition of the non-allocatable physical register + // invalidates previous copies. + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } else if (Op.isRegMask()) { + const uint32_t *RegMask = Op.getRegMask(); + for (auto &RegMI : NAPhysToVirtMIs) { + unsigned Def = RegMI.first; + if (MachineOperand::clobbersPhysReg(RegMask, Def)) { + DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI + << '\n'); + NAPhysToVirtMIs.erase(Def); + } + } + } + } + } + + if (MI->isImplicitDef() || MI->isKill()) + continue; + + if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) { + // Blow away all non-allocatable physical registers knowledge since we + // don't know what's correct anymore. + // + // FIXME: handle explicit asm clobbers. + DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI + << '\n'); + NAPhysToVirtMIs.clear(); continue; } - if (MI->mayStore() || MI->isCall()) - FoldAsLoadDefCandidates.clear(); if ((isUncoalescableCopy(*MI) && optimizeUncoalescableCopy(MI, LocalMIs)) || - (MI->isCompare() && optimizeCmpInstr(MI, MBB)) || + (MI->isCompare() && optimizeCmpInstr(MI, &MBB)) || (MI->isSelect() && optimizeSelect(MI, LocalMIs))) { // MI is deleted. LocalMIs.erase(MI); @@ -1129,17 +1595,26 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { continue; } + if (MI->isCopy() && + (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) || + foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) { + LocalMIs.erase(MI); + MI->eraseFromParent(); + Changed = true; + continue; + } + if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) { SeenMoveImm = true; } else { - Changed |= optimizeExtInstr(MI, MBB, LocalMIs); + Changed |= optimizeExtInstr(MI, &MBB, LocalMIs); // optimizeExtInstr might have created new instructions after MI // and before the already incremented MII. Adjust MII so that the // next iteration sees the new instructions. MII = MI; ++MII; if (SeenMoveImm) - Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs); + Changed |= foldImmediate(MI, &MBB, ImmDefRegs, ImmDefMIs); } // Check whether MI is a load candidate for folding into a later @@ -1190,8 +1665,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { return Changed; } -bool ValueTracker::getNextSourceFromCopy(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromCopy() { assert(Def->isCopy() && "Invalid definition"); // Copy instruction are supposed to be: Def = Src. // If someone breaks this assumption, bad things will happen everywhere. 
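One caution about the regmask loop above: it erases from NAPhysToVirtMIs while range-iterating it, and DenseMap::erase invalidates iterators, so a safer shape gathers the doomed keys first and erases after the walk. A generic sketch over standard containers; LLVM code would typically collect into a SmallVector instead.

#include <unordered_map>
#include <vector>

// Two-phase erase: collect the keys first, erase after the iteration, so
// the walk never touches a container it is mutating.
template <typename Map, typename Pred>
static void erasePhysClobbers(Map &M, Pred Clobbered) {
  std::vector<typename Map::key_type> ToErase;
  for (const auto &KV : M)
    if (Clobbered(KV.first))
      ToErase.push_back(KV.first);
  for (const auto &K : ToErase)
    M.erase(K);
}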
@@ -1199,30 +1673,27 @@ bool ValueTracker::getNextSourceFromCopy(unsigned &SrcReg, if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) // If we look for a different subreg, it means we want a subreg of src. - // Bails as we do not support composing subreg yet. - return false; + // Bails as we do not support composing subregs yet. + return ValueTrackerResult(); // Otherwise, we want the whole source. const MachineOperand &Src = Def->getOperand(1); - SrcReg = Src.getReg(); - SrcSubReg = Src.getSubReg(); - return true; + return ValueTrackerResult(Src.getReg(), Src.getSubReg()); } -bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { assert(Def->isBitcast() && "Invalid definition"); // Bail if there are effects that a plain copy will not expose. if (Def->hasUnmodeledSideEffects()) - return false; + return ValueTrackerResult(); // Bitcasts with more than one def are not supported. if (Def->getDesc().getNumDefs() != 1) - return false; + return ValueTrackerResult(); if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) // If we look for a different subreg, it means we want a subreg of the src. - // Bails as we do not support composing subreg yet. - return false; + // Bails as we do not support composing subregs yet. + return ValueTrackerResult(); unsigned SrcIdx = Def->getNumOperands(); for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; @@ -1230,25 +1701,25 @@ bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcReg, const MachineOperand &MO = Def->getOperand(OpIdx); if (!MO.isReg() || !MO.getReg()) continue; + // Ignore dead implicit defs. + if (MO.isImplicit() && MO.isDead()) + continue; assert(!MO.isDef() && "We should have skipped all the definitions by now"); if (SrcIdx != EndOpIdx) // Multiple sources? - return false; + return ValueTrackerResult(); SrcIdx = OpIdx; } const MachineOperand &Src = Def->getOperand(SrcIdx); - SrcReg = Src.getReg(); - SrcSubReg = Src.getSubReg(); - return true; + return ValueTrackerResult(Src.getReg(), Src.getSubReg()); } -bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { assert((Def->isRegSequence() || Def->isRegSequenceLike()) && "Invalid definition"); if (Def->getOperand(DefIdx).getSubReg()) - // If we are composing subreg, bails out. + // If we are composing subregs, bail out. // The case we are checking is Def.<subreg> = REG_SEQUENCE. // This should almost never happen as the SSA property is tracked at // the register level (as opposed to the subreg level). @@ -1262,16 +1733,16 @@ bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg, // have this case. // If we can ascertain (or force) that this never happens, we could // turn that into an assertion. - return false; + return ValueTrackerResult(); if (!TII) // We could handle the REG_SEQUENCE here, but we do not want to // duplicate the code from the generic TII. - return false; + return ValueTrackerResult(); SmallVector<TargetInstrInfo::RegSubRegPairAndIdx, 8> RegSeqInputRegs; if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs)) - return false; + return ValueTrackerResult(); // We are looking at: // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... 
@@ -1279,41 +1750,38 @@ bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg, for (auto &RegSeqInput : RegSeqInputRegs) { if (RegSeqInput.SubIdx == DefSubReg) { if (RegSeqInput.SubReg) - // Bails if we have to compose sub registers. - return false; + // Bail if we have to compose sub registers. + return ValueTrackerResult(); - SrcReg = RegSeqInput.Reg; - SrcSubReg = RegSeqInput.SubReg; - return true; + return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg); } } // If the subreg we are tracking is super-defined by another subreg, // we could follow this value. However, this would require to compose // the subreg and we do not do that for now. - return false; + return ValueTrackerResult(); } -bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromInsertSubreg() { assert((Def->isInsertSubreg() || Def->isInsertSubregLike()) && "Invalid definition"); if (Def->getOperand(DefIdx).getSubReg()) - // If we are composing subreg, bails out. + // If we are composing subreg, bail out. // Same remark as getNextSourceFromRegSequence. // I.e., this may be turned into an assert. - return false; + return ValueTrackerResult(); if (!TII) // We could handle the REG_SEQUENCE here, but we do not want to // duplicate the code from the generic TII. - return false; + return ValueTrackerResult(); TargetInstrInfo::RegSubRegPair BaseReg; TargetInstrInfo::RegSubRegPairAndIdx InsertedReg; if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg)) - return false; + return ValueTrackerResult(); // We are looking at: // Def = INSERT_SUBREG v0, v1, sub1 @@ -1323,9 +1791,7 @@ bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, // #1 Check if the inserted register matches the required sub index. if (InsertedReg.SubIdx == DefSubReg) { - SrcReg = InsertedReg.Reg; - SrcSubReg = InsertedReg.SubReg; - return true; + return ValueTrackerResult(InsertedReg.Reg, InsertedReg.SubReg); } // #2 Otherwise, if the sub register we are looking for is not partial // defined by the inserted element, we can look through the main @@ -1333,10 +1799,10 @@ bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, const MachineOperand &MODef = Def->getOperand(DefIdx); // If the result register (Def) and the base register (v0) do not // have the same register class or if we have to compose - // subregisters, bails out. + // subregisters, bail out. if (MRI.getRegClass(MODef.getReg()) != MRI.getRegClass(BaseReg.Reg) || BaseReg.SubReg) - return false; + return ValueTrackerResult(); // Get the TRI and check if the inserted sub-register overlaps with the // sub-register we are tracking. @@ -1344,121 +1810,138 @@ bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg, if (!TRI || (TRI->getSubRegIndexLaneMask(DefSubReg) & TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)) != 0) - return false; + return ValueTrackerResult(); // At this point, the value is available in v0 via the same subreg // we used for Def. - SrcReg = BaseReg.Reg; - SrcSubReg = DefSubReg; - return true; + return ValueTrackerResult(BaseReg.Reg, DefSubReg); } -bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromExtractSubreg() { assert((Def->isExtractSubreg() || Def->isExtractSubregLike()) && "Invalid definition"); // We are looking at: // Def = EXTRACT_SUBREG v0, sub0 - // Bails if we have to compose sub registers. 
+ // Bail if we have to compose sub registers. // Indeed, if DefSubReg != 0, we would have to compose it with sub0. if (DefSubReg) - return false; + return ValueTrackerResult(); if (!TII) // We could handle the EXTRACT_SUBREG here, but we do not want to // duplicate the code from the generic TII. - return false; + return ValueTrackerResult(); TargetInstrInfo::RegSubRegPairAndIdx ExtractSubregInputReg; if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg)) - return false; + return ValueTrackerResult(); - // Bails if we have to compose sub registers. + // Bail if we have to compose sub registers. // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0. if (ExtractSubregInputReg.SubReg) - return false; + return ValueTrackerResult(); // Otherwise, the value is available in the v0.sub0. - SrcReg = ExtractSubregInputReg.Reg; - SrcSubReg = ExtractSubregInputReg.SubIdx; - return true; + return ValueTrackerResult(ExtractSubregInputReg.Reg, + ExtractSubregInputReg.SubIdx); } -bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceFromSubregToReg() { assert(Def->isSubregToReg() && "Invalid definition"); // We are looking at: // Def = SUBREG_TO_REG Imm, v0, sub0 - // Bails if we have to compose sub registers. + // Bail if we have to compose sub registers. // If DefSubReg != sub0, we would have to check that all the bits // we track are included in sub0 and if yes, we would have to // determine the right subreg in v0. if (DefSubReg != Def->getOperand(3).getImm()) - return false; - // Bails if we have to compose sub registers. + return ValueTrackerResult(); + // Bail if we have to compose sub registers. // Likewise, if v0.subreg != 0, we would have to compose it with sub0. if (Def->getOperand(2).getSubReg()) - return false; + return ValueTrackerResult(); - SrcReg = Def->getOperand(2).getReg(); - SrcSubReg = Def->getOperand(3).getImm(); - return true; + return ValueTrackerResult(Def->getOperand(2).getReg(), + Def->getOperand(3).getImm()); +} + +/// \brief Explore each PHI incoming operand and return its sources +ValueTrackerResult ValueTracker::getNextSourceFromPHI() { + assert(Def->isPHI() && "Invalid definition"); + ValueTrackerResult Res; + + // If we look for a different subreg, bail as we do not support composing + // subregs yet. + if (Def->getOperand(0).getSubReg() != DefSubReg) + return ValueTrackerResult(); + + // Return all register sources for PHI instructions. + for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) { + auto &MO = Def->getOperand(i); + assert(MO.isReg() && "Invalid PHI instruction"); + Res.addSource(MO.getReg(), MO.getSubReg()); + } + + return Res; } -bool ValueTracker::getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSourceImpl() { assert(Def && "This method needs a valid definition"); assert( (DefIdx < Def->getDesc().getNumDefs() || Def->getDesc().isVariadic()) && Def->getOperand(DefIdx).isDef() && "Invalid DefIdx"); if (Def->isCopy()) - return getNextSourceFromCopy(SrcReg, SrcSubReg); + return getNextSourceFromCopy(); if (Def->isBitcast()) - return getNextSourceFromBitcast(SrcReg, SrcSubReg); + return getNextSourceFromBitcast(); // All the remaining cases involve "complex" instructions. - // Bails if we did not ask for the advanced tracking. + // Bail if we did not ask for the advanced tracking. 
if (!UseAdvancedTracking) - return false; + return ValueTrackerResult(); if (Def->isRegSequence() || Def->isRegSequenceLike()) - return getNextSourceFromRegSequence(SrcReg, SrcSubReg); + return getNextSourceFromRegSequence(); if (Def->isInsertSubreg() || Def->isInsertSubregLike()) - return getNextSourceFromInsertSubreg(SrcReg, SrcSubReg); + return getNextSourceFromInsertSubreg(); if (Def->isExtractSubreg() || Def->isExtractSubregLike()) - return getNextSourceFromExtractSubreg(SrcReg, SrcSubReg); + return getNextSourceFromExtractSubreg(); if (Def->isSubregToReg()) - return getNextSourceFromSubregToReg(SrcReg, SrcSubReg); - return false; + return getNextSourceFromSubregToReg(); + if (Def->isPHI()) + return getNextSourceFromPHI(); + return ValueTrackerResult(); } -const MachineInstr *ValueTracker::getNextSource(unsigned &SrcReg, - unsigned &SrcSubReg) { +ValueTrackerResult ValueTracker::getNextSource() { // If we reach a point where we cannot move up in the use-def chain, // there is nothing we can get. if (!Def) - return nullptr; + return ValueTrackerResult(); - const MachineInstr *PrevDef = nullptr; - // Try to find the next source. - if (getNextSourceImpl(SrcReg, SrcSubReg)) { + ValueTrackerResult Res = getNextSourceImpl(); + if (Res.isValid()) { // Update definition, definition index, and subregister for the // next call of getNextSource. // Update the current register. - Reg = SrcReg; - // Update the return value before moving up in the use-def chain. - PrevDef = Def; + bool OneRegSrc = Res.getNumSources() == 1; + if (OneRegSrc) + Reg = Res.getSrcReg(0); + // Update the result before moving up in the use-def chain + // with the instruction containing the last found sources. + Res.setInst(Def); + // If we can still move up in the use-def chain, move to the next - // defintion. - if (!TargetRegisterInfo::isPhysicalRegister(Reg)) { + // definition. + if (!TargetRegisterInfo::isPhysicalRegister(Reg) && OneRegSrc) { Def = MRI.getVRegDef(Reg); DefIdx = MRI.def_begin(Reg).getOperandNo(); - DefSubReg = SrcSubReg; - return PrevDef; + DefSubReg = Res.getSrcSubReg(0); + return Res; } } // If we end up here, this means we will not be able to find another source - // for the next iteration. - // Make sure any new call to getNextSource bails out early by cutting the - // use-def chain. + // for the next iteration. Make sure any new call to getNextSource bails out + // early by cutting the use-def chain. 
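The contract implemented here is that an invalid result does double duty: it tells the caller "no further source" and guarantees that every later call bails out early, because the tracker cuts its own chain. A toy model of that caller-facing behavior, with std::optional standing in for ValueTrackerResult:

#include <optional>

// Toy chain: each def optionally points at the def above it.
struct ToyDef { const ToyDef *Prev = nullptr; unsigned Reg = 0; };

struct ToyTracker {
  const ToyDef *Def; // nullptr means the chain has been cut.

  std::optional<unsigned> getNextSource() {
    if (!Def)
      return std::nullopt;      // Chain already cut: bail out early.
    const ToyDef *Prev = Def->Prev;
    Def = Prev;                 // Either move up or cut the chain.
    return Prev ? std::optional<unsigned>(Prev->Reg) : std::nullopt;
  }
};

static unsigned lastReachableReg(ToyTracker T, unsigned Start) {
  unsigned Last = Start;
  while (auto Reg = T.getNextSource()) // Stop at the first invalid result.
    Last = *Reg;
  return Last;
}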
Def = nullptr; - return PrevDef; + return Res; } diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp index 6f76116..b95dffd 100644 --- a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -87,7 +87,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetPassConfig>(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); @@ -196,7 +196,7 @@ SchedulePostRATDList::SchedulePostRATDList( const RegisterClassInfo &RCI, TargetSubtargetInfo::AntiDepBreakMode AntiDepMode, SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs) - : ScheduleDAGInstrs(MF, &MLI, /*IsPostRA=*/true), AA(AA), EndIndex(0) { + : ScheduleDAGInstrs(MF, &MLI), AA(AA), EndIndex(0) { const InstrItineraryData *InstrItins = MF.getSubtarget().getInstrItineraryData(); @@ -267,7 +267,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { TII = Fn.getSubtarget().getInstrInfo(); MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); + AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>(); RegClassInfo.runOnMachineFunction(Fn); @@ -302,8 +302,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { CriticalPathRCs); // Loop over all of the basic blocks - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { + for (auto &MBB : Fn) { #ifndef NDEBUG // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod if (DebugDiv > 0) { @@ -311,25 +310,25 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { if (bbcnt++ % DebugDiv != DebugMod) continue; dbgs() << "*** DEBUG scheduling " << Fn.getName() - << ":BB#" << MBB->getNumber() << " ***\n"; + << ":BB#" << MBB.getNumber() << " ***\n"; } #endif // Initialize register live-range state for scheduling in this block. - Scheduler.startBlock(MBB); + Scheduler.startBlock(&MBB); // Schedule each sequence of instructions not interrupted by a label // or anything else that effectively needs to shut down scheduling. - MachineBasicBlock::iterator Current = MBB->end(); - unsigned Count = MBB->size(), CurrentCount = Count; - for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) { + MachineBasicBlock::iterator Current = MBB.end(); + unsigned Count = MBB.size(), CurrentCount = Count; + for (MachineBasicBlock::iterator I = Current; I != MBB.begin();) { MachineInstr *MI = std::prev(I); --Count; // Calls are not scheduling boundaries before register allocation, but // post-ra we don't gain anything by scheduling across calls since we // don't need to worry about register pressure. 
- if (MI->isCall() || TII->isSchedulingBoundary(MI, MBB, Fn)) { - Scheduler.enterRegion(MBB, I, Current, CurrentCount - Count); + if (MI->isCall() || TII->isSchedulingBoundary(MI, &MBB, Fn)) { + Scheduler.enterRegion(&MBB, I, Current, CurrentCount - Count); Scheduler.setEndIndex(CurrentCount); Scheduler.schedule(); Scheduler.exitRegion(); @@ -343,9 +342,9 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Count -= MI->getBundleSize(); } assert(Count == 0 && "Instruction count mismatch!"); - assert((MBB->begin() == Current || CurrentCount != 0) && + assert((MBB.begin() == Current || CurrentCount != 0) && "Instruction count mismatch!"); - Scheduler.enterRegion(MBB, MBB->begin(), Current, CurrentCount); + Scheduler.enterRegion(&MBB, MBB.begin(), Current, CurrentCount); Scheduler.setEndIndex(CurrentCount); Scheduler.schedule(); Scheduler.exitRegion(); @@ -355,7 +354,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Scheduler.finishBlock(); // Update register kills - Scheduler.fixupKills(MBB); + Scheduler.fixupKills(&MBB); } return true; @@ -400,8 +399,12 @@ void SchedulePostRATDList::schedule() { } DEBUG(dbgs() << "********** List Scheduling **********\n"); - DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this)); + DEBUG( + for (const SUnit &SU : SUnits) { + SU.dumpAll(this); + dbgs() << '\n'; + } + ); AvailableQueue.initNodes(SUnits); ListScheduleTopDown(); diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp index 5f81949..d27ea2f 100644 --- a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp @@ -58,7 +58,7 @@ INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<AAResultsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -96,7 +96,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) { // This is a physreg implicit-def. // Look for the first instruction to use or define an alias. - MachineBasicBlock::instr_iterator UserMI = MI; + MachineBasicBlock::instr_iterator UserMI = MI->getIterator(); MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end(); bool Found = false; for (++UserMI; UserMI != UserE; ++UserMI) { @@ -151,7 +151,7 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::instr_iterator MBBI = MFI->instr_begin(), MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) if (MBBI->isImplicitDef()) - WorkList.insert(MBBI); + WorkList.insert(&*MBBI); if (WorkList.empty()) continue; diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 6ca69a1..939c500 100644 --- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -71,8 +71,9 @@ private: // stack frame indexes. unsigned MinCSFrameIndex, MaxCSFrameIndex; - // Save and Restore blocks of the current function. - MachineBasicBlock *SaveBlock; + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. 
+ SmallVector<MachineBasicBlock *, 1> SaveBlocks; SmallVector<MachineBasicBlock *, 4> RestoreBlocks; // Flag to control whether to use the register scavenger to resolve @@ -91,9 +92,6 @@ private: int &SPAdj); void scavengeFrameVirtualRegs(MachineFunction &Fn); void insertPrologEpilogCode(MachineFunction &Fn); - - // Convenience for recognizing return blocks. - bool isReturnBlock(const MachineBasicBlock *MBB) const; }; } // namespace @@ -128,10 +126,6 @@ void PEI::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -bool PEI::isReturnBlock(const MachineBasicBlock* MBB) const { - return (MBB && !MBB->empty() && MBB->back().isReturn()); -} - /// Compute the set of return blocks void PEI::calculateSets(MachineFunction &Fn) { const MachineFrameInfo *MFI = Fn.getFrameInfo(); @@ -142,25 +136,25 @@ void PEI::calculateSets(MachineFunction &Fn) { // Use the points found by shrink-wrapping, if any. if (MFI->getSavePoint()) { - SaveBlock = MFI->getSavePoint(); + SaveBlocks.push_back(MFI->getSavePoint()); assert(MFI->getRestorePoint() && "Both restore and save must be set"); MachineBasicBlock *RestoreBlock = MFI->getRestorePoint(); // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. - if (!RestoreBlock->succ_empty() || isReturnBlock(RestoreBlock)) + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) RestoreBlocks.push_back(RestoreBlock); return; } // Save refs to entry and return blocks. - SaveBlock = Fn.begin(); - for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end(); - MBB != E; ++MBB) - if (isReturnBlock(MBB)) - RestoreBlocks.push_back(MBB); - - return; + SaveBlocks.push_back(&Fn.front()); + for (MachineBasicBlock &MBB : Fn) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } } /// StackObjSet - A set of stack object indexes @@ -195,7 +189,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { // place all spills in the entry block, all restores in return blocks. calculateSets(Fn); - // Add the code to save and restore the callee saved registers + // Add the code to save and restore the callee saved registers. if (!F->hasFnAttribute(Attribute::Naked)) insertCSRSpillsAndRestores(Fn); @@ -237,6 +231,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { } delete RS; + SaveBlocks.clear(); RestoreBlocks.clear(); return true; } @@ -407,7 +402,7 @@ static void updateLiveness(MachineFunction &MF) { const MachineBasicBlock *CurBB = WorkList.pop_back_val(); // By construction, the region that is after the save point is // dominated by the Save and post-dominated by the Restore. - if (CurBB == Save) + if (CurBB == Save && Save != Restore) continue; // Enqueue all the successors not already visited. // Those are by construction either before Save or after Restore. @@ -419,10 +414,13 @@ static void updateLiveness(MachineFunction &MF) { const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - for (MachineBasicBlock *MBB : Visited) + for (MachineBasicBlock *MBB : Visited) { + MCPhysReg Reg = CSI[i].getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. 
- MBB->addLiveIn(CSI[i].getReg()); + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + } } } @@ -446,18 +444,20 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { MachineBasicBlock::iterator I; // Spill using target interface. - I = SaveBlock->begin(); - if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - // Insert the spill to the stack frame. - unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), - RC, TRI); + for (MachineBasicBlock *SaveBlock : SaveBlocks) { + I = SaveBlock->begin(); + if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // Insert the spill to the stack frame. + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), + RC, TRI); + } } + // Update the live-in information of all the blocks up to the save point. + updateLiveness(Fn); } - // Update the live-in information of all the blocks up to the save point. - updateLiveness(Fn); // Restore using target interface. for (MachineBasicBlock *MBB : RestoreBlocks) { @@ -500,7 +500,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { static inline void AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, bool StackGrowsDown, int64_t &Offset, - unsigned &MaxAlign) { + unsigned &MaxAlign, unsigned Skew) { // If the stack grows down, add the object size to find the lowest address. if (StackGrowsDown) Offset += MFI->getObjectSize(FrameIdx); @@ -512,7 +512,7 @@ AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, MaxAlign = std::max(MaxAlign, Align); // Adjust to alignment boundary. - Offset = (Offset + Align - 1) / Align * Align; + Offset = RoundUpToAlignment(Offset, Align, Skew); if (StackGrowsDown) { DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); @@ -530,12 +530,12 @@ static void AssignProtectedObjSet(const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs, MachineFrameInfo *MFI, bool StackGrowsDown, - int64_t &Offset, unsigned &MaxAlign) { + int64_t &Offset, unsigned &MaxAlign, unsigned Skew) { for (StackObjSet::const_iterator I = UnassignedObjs.begin(), E = UnassignedObjs.end(); I != E; ++I) { int i = *I; - AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); ProtectedObjs.insert(i); } } @@ -563,6 +563,9 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { && "Local area offset should be in direction of stack growth"); int64_t Offset = LocalAreaOffset; + // Skew to be applied to alignment. + unsigned Skew = TFI.getStackAlignmentSkew(Fn); + // If there are fixed sized objects that are preallocated in the local area, // non-fixed objects can't be allocated right at the start of local area. 
// We currently don't support filling in holes in between fixed sized @@ -593,7 +596,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { unsigned Align = MFI->getObjectAlignment(i); // Adjust to alignment boundary - Offset = RoundUpToAlignment(Offset, Align); + Offset = RoundUpToAlignment(Offset, Align, Skew); MFI->setObjectOffset(i, -Offset); // Set the computed offset } @@ -602,7 +605,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { for (int i = MaxCSFI; i >= MinCSFI ; --i) { unsigned Align = MFI->getObjectAlignment(i); // Adjust to alignment boundary - Offset = RoundUpToAlignment(Offset, Align); + Offset = RoundUpToAlignment(Offset, Align, Skew); MFI->setObjectOffset(i, Offset); Offset += MFI->getObjectSize(i); @@ -624,7 +627,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { RS->getScavengingFrameIndices(SFIs); for (SmallVectorImpl<int>::iterator I = SFIs.begin(), IE = SFIs.end(); I != IE; ++I) - AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); } // FIXME: Once this is working, then enable flag will change to a target @@ -635,7 +638,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { unsigned Align = MFI->getLocalFrameMaxAlign(); // Adjust to alignment boundary. - Offset = RoundUpToAlignment(Offset, Align); + Offset = RoundUpToAlignment(Offset, Align, Skew); DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); @@ -662,7 +665,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { StackObjSet AddrOfObjs; AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); // Assign large stack objects first. for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { @@ -695,11 +698,11 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { } AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign); + Offset, MaxAlign, Skew); } // Then assign frame offsets to stack objects that are not used to spill @@ -719,7 +722,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { if (ProtectedObjs.count(i)) continue; - AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); } // Make sure the special register scavenging spill slot is closest to the @@ -729,7 +732,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { RS->getScavengingFrameIndices(SFIs); for (SmallVectorImpl<int>::iterator I = SFIs.begin(), IE = SFIs.end(); I != IE; ++I) - AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign); + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); } if (!TFI.targetHandlesStackFrameRounding()) { @@ -754,7 +757,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { // If the frame pointer is eliminated, all frame offsets will be relative to // SP not FP. Align to MaxAlign so this works. StackAlign = std::max(StackAlign, MaxAlign); - Offset = RoundUpToAlignment(Offset, StackAlign); + Offset = RoundUpToAlignment(Offset, StackAlign, Skew); } // Update frame info to pretend that this is part of the stack... 
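The PrologEpilogInserter hunks above thread a new Skew argument through every alignment round-up so frame offsets can be kept congruent to a target-chosen skew rather than to zero. As a rough sketch of the arithmetic these calls rely on (illustrative helper, not the in-tree declaration):

  #include <cassert>
  #include <cstdint>

  // Smallest value >= Value that is congruent to Skew modulo Align.
  // With Skew == 0 this reduces to the usual round-up-to-alignment.
  uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align, uint64_t Skew) {
    assert(Align != 0 && "alignment must be non-zero");
    Skew %= Align;
    return (Value + Align - 1 - Skew) / Align * Align + Skew;
  }

For example, with Align = 16 and Skew = 8 the produced offsets land on 8, 24, 40, and so on, which is the shape a target needs when its stack pointer is biased at function entry.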
@@ -771,18 +774,24 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) { const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); // Add prologue to the function... - TFI.emitPrologue(Fn, *SaveBlock); + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.emitPrologue(Fn, *SaveBlock); // Add epilogue to restore the callee-save registers in each exiting block. for (MachineBasicBlock *RestoreBlock : RestoreBlocks) TFI.emitEpilogue(Fn, *RestoreBlock); + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.inlineStackProbe(Fn, *SaveBlock); + // Emit additional code that is required to support segmented stacks, if // we've been asked for it. This, when linked with a runtime with support // for segmented stacks (libgcc is one), will result in allocating stack // space in small chunks instead of one large contiguous block. - if (Fn.shouldSplitStack()) - TFI.adjustForSegmentedStacks(Fn, *SaveBlock); + if (Fn.shouldSplitStack()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForSegmentedStacks(Fn, *SaveBlock); + } // Emit additional code that is required to explicitly handle the stack in // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The @@ -790,7 +799,8 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) { // different conditional check and another BIF for allocating more stack // space. if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE) - TFI.adjustForHiPEPrologue(Fn, *SaveBlock); + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForHiPEPrologue(Fn, *SaveBlock); } /// replaceFrameIndices - Replace all MO_FrameIndex operands with physical @@ -800,25 +810,6 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); if (!TFI.needsFrameIndexResolution(Fn)) return; - MachineModuleInfo &MMI = Fn.getMMI(); - const Function *F = Fn.getFunction(); - const Function *ParentF = MMI.getWinEHParent(F); - unsigned FrameReg; - if (F == ParentF) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(Fn.getFunction()); - // FIXME: This should be unconditional but we have bugs in the preparation - // pass. - if (FuncInfo.UnwindHelpFrameIdx != INT_MAX) - FuncInfo.UnwindHelpFrameOffset = TFI.getFrameIndexReferenceFromSP( - Fn, FuncInfo.UnwindHelpFrameIdx, FrameReg); - } else if (MMI.hasWinEHFuncInfo(F)) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(Fn.getFunction()); - auto I = FuncInfo.CatchHandlerParentFrameObjIdx.find(F); - if (I != FuncInfo.CatchHandlerParentFrameObjIdx.end()) - FuncInfo.CatchHandlerParentFrameObjOffset[F] = - TFI.getFrameIndexReferenceFromSP(Fn, I->second, FrameReg); - } - // Store SPAdj at exit of a basic block. SmallVector<int, 8> SPState; SPState.resize(Fn.getNumBlockIDs()); @@ -841,12 +832,12 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { } // Handle the unreachable blocks. - for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - if (Reachable.count(BB)) + for (auto &BB : Fn) { + if (Reachable.count(&BB)) // Already handled in DFS traversal. 
continue; int SPAdj = 0; - replaceFrameIndices(BB, Fn, SPAdj); + replaceFrameIndices(&BB, Fn, SPAdj); } } @@ -889,11 +880,11 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, if (!MI->getOperand(i).isFI()) continue; - // Frame indicies in debug values are encoded in a target independent + // Frame indices in debug values are encoded in a target independent // way with simply the frame index and offset rather than any // target-specific addressing mode. if (MI->isDebugValue()) { - assert(i == 0 && "Frame indicies can only appear as the first " + assert(i == 0 && "Frame indices can only appear as the first " "operand of a DBG_VALUE machine instruction"); unsigned Reg; MachineOperand &Offset = MI->getOperand(1); @@ -979,7 +970,7 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Run through the instructions and find any virtual registers. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - RS->enterBasicBlock(BB); + RS->enterBasicBlock(&*BB); int SPAdj = 0; @@ -1026,12 +1017,8 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Replace this reference to the virtual register with the // scratch register. assert (ScratchReg && "Missing scratch register!"); - MachineRegisterInfo &MRI = Fn.getRegInfo(); Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); - // Make sure MRI now accounts this register as used. - MRI.setPhysRegUsed(ScratchReg); - // Because this instruction was processed by the RS before this // register was allocated, make sure that the RS now records the // register as being used. @@ -1044,7 +1031,7 @@ PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // problem because we need the spill code before I: Move I to just // prior to J. if (I != std::prev(J)) { - BB->splice(J, BB, I); + BB->splice(J, &*BB, I); // Before we move I, we need to prepare the RS to visit I again. // Specifically, RS will assert if it sees uses of registers that diff --git a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp index b1c341d..1f46417 100644 --- a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp +++ b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DerivedTypes.h" @@ -22,87 +23,38 @@ #include <map> using namespace llvm; -namespace { -struct PSVGlobalsTy { - // PseudoSourceValues are immutable so don't need locking. - const PseudoSourceValue PSVs[4]; - sys::Mutex Lock; // Guards FSValues, but not the values inside it. 
- std::map<int, const PseudoSourceValue *> FSValues; - - PSVGlobalsTy() : PSVs() {} - ~PSVGlobalsTy() { - for (std::map<int, const PseudoSourceValue *>::iterator - I = FSValues.begin(), E = FSValues.end(); I != E; ++I) { - delete I->second; - } - } -}; - -static ManagedStatic<PSVGlobalsTy> PSVGlobals; - -} // anonymous namespace - -const PseudoSourceValue *PseudoSourceValue::getStack() -{ return &PSVGlobals->PSVs[0]; } -const PseudoSourceValue *PseudoSourceValue::getGOT() -{ return &PSVGlobals->PSVs[1]; } -const PseudoSourceValue *PseudoSourceValue::getJumpTable() -{ return &PSVGlobals->PSVs[2]; } -const PseudoSourceValue *PseudoSourceValue::getConstantPool() -{ return &PSVGlobals->PSVs[3]; } - static const char *const PSVNames[] = { - "Stack", - "GOT", - "JumpTable", - "ConstantPool" -}; + "Stack", "GOT", "JumpTable", "ConstantPool", "FixedStack", + "GlobalValueCallEntry", "ExternalSymbolCallEntry"}; -PseudoSourceValue::PseudoSourceValue(bool isFixed) : isFixed(isFixed) {} +PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {} PseudoSourceValue::~PseudoSourceValue() {} void PseudoSourceValue::printCustom(raw_ostream &O) const { - O << PSVNames[this - PSVGlobals->PSVs]; -} - -const PseudoSourceValue *PseudoSourceValue::getFixedStack(int FI) { - PSVGlobalsTy &PG = *PSVGlobals; - sys::ScopedLock locked(PG.Lock); - const PseudoSourceValue *&V = PG.FSValues[FI]; - if (!V) - V = new FixedStackPseudoSourceValue(FI); - return V; + O << PSVNames[Kind]; } bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const { - if (this == getStack()) + if (isStack()) return false; - if (this == getGOT() || - this == getConstantPool() || - this == getJumpTable()) + if (isGOT() || isConstantPool() || isJumpTable()) return true; llvm_unreachable("Unknown PseudoSourceValue!"); } -bool PseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const { - if (this == getStack() || - this == getGOT() || - this == getConstantPool() || - this == getJumpTable()) +bool PseudoSourceValue::isAliased(const MachineFrameInfo *) const { + if (isStack() || isGOT() || isConstantPool() || isJumpTable()) return false; llvm_unreachable("Unknown PseudoSourceValue!"); } -bool PseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const { - if (this == getGOT() || - this == getConstantPool() || - this == getJumpTable()) - return false; - return true; +bool PseudoSourceValue::mayAlias(const MachineFrameInfo *) const { + return !(isGOT() || isConstantPool() || isJumpTable()); } -bool FixedStackPseudoSourceValue::isConstant(const MachineFrameInfo *MFI) const{ +bool FixedStackPseudoSourceValue::isConstant( + const MachineFrameInfo *MFI) const { return MFI && MFI->isImmutableObjectIndex(FI); } @@ -122,3 +74,69 @@ bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const { void FixedStackPseudoSourceValue::printCustom(raw_ostream &OS) const { OS << "FixedStack" << FI; } + +CallEntryPseudoSourceValue::CallEntryPseudoSourceValue(PSVKind Kind) + : PseudoSourceValue(Kind) {} + +bool CallEntryPseudoSourceValue::isConstant(const MachineFrameInfo *) const { + return false; +} + +bool CallEntryPseudoSourceValue::isAliased(const MachineFrameInfo *) const { + return false; +} + +bool CallEntryPseudoSourceValue::mayAlias(const MachineFrameInfo *) const { + return false; +} + +GlobalValuePseudoSourceValue::GlobalValuePseudoSourceValue( + const GlobalValue *GV) + : CallEntryPseudoSourceValue(GlobalValueCallEntry), GV(GV) {} + +ExternalSymbolPseudoSourceValue::ExternalSymbolPseudoSourceValue(const char *ES) + : 
CallEntryPseudoSourceValue(ExternalSymbolCallEntry), ES(ES) {} + +PseudoSourceValueManager::PseudoSourceValueManager() + : StackPSV(PseudoSourceValue::Stack), GOTPSV(PseudoSourceValue::GOT), + JumpTablePSV(PseudoSourceValue::JumpTable), + ConstantPoolPSV(PseudoSourceValue::ConstantPool) {} + +const PseudoSourceValue *PseudoSourceValueManager::getStack() { + return &StackPSV; +} + +const PseudoSourceValue *PseudoSourceValueManager::getGOT() { return &GOTPSV; } + +const PseudoSourceValue *PseudoSourceValueManager::getConstantPool() { + return &ConstantPoolPSV; +} + +const PseudoSourceValue *PseudoSourceValueManager::getJumpTable() { + return &JumpTablePSV; +} + +const PseudoSourceValue *PseudoSourceValueManager::getFixedStack(int FI) { + std::unique_ptr<FixedStackPseudoSourceValue> &V = FSValues[FI]; + if (!V) + V = llvm::make_unique<FixedStackPseudoSourceValue>(FI); + return V.get(); +} + +const PseudoSourceValue * +PseudoSourceValueManager::getGlobalValueCallEntry(const GlobalValue *GV) { + std::unique_ptr<const GlobalValuePseudoSourceValue> &E = + GlobalCallEntries[GV]; + if (!E) + E = llvm::make_unique<GlobalValuePseudoSourceValue>(GV); + return E.get(); +} + +const PseudoSourceValue * +PseudoSourceValueManager::getExternalSymbolCallEntry(const char *ES) { + std::unique_ptr<const ExternalSymbolPseudoSourceValue> &E = + ExternalCallEntries[ES]; + if (!E) + E = llvm::make_unique<ExternalSymbolPseudoSourceValue>(ES); + return E.get(); +} diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp index 0090332..cfe367d 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -133,8 +133,8 @@ RABasic::RABasic(): MachineFunctionPass(ID) { void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); @@ -223,7 +223,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, SmallVector<unsigned, 8> PhysRegSpillCands; // Check for an available register in this class. - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); while (unsigned PhysReg = Order.next()) { // Check for interference in PhysReg switch (Matrix->checkInterference(VirtReg, PhysReg)) { @@ -276,7 +276,7 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) { getAnalysis<LiveIntervals>(), getAnalysis<LiveRegMatrix>()); - calculateSpillWeightsAndHints(*LIS, *MF, + calculateSpillWeightsAndHints(*LIS, *MF, VRM, getAnalysis<MachineLoopInfo>(), getAnalysis<MachineBlockFrequencyInfo>()); diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp index fd3d4d7..f4c076f 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp @@ -799,10 +799,9 @@ void RAFast::AllocateBasicBlock() { MachineBasicBlock::iterator MII = MBB->begin(); // Add live-in registers as live. 
- for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) - if (MRI->isAllocatable(*I)) - definePhysReg(MII, *I, regReserved); + for (const auto &LI : MBB->liveins()) + if (MRI->isAllocatable(LI.PhysReg)) + definePhysReg(MII, LI.PhysReg, regReserved); SmallVector<unsigned, 8> VirtDead; SmallVector<MachineInstr*, 32> Coalesced; @@ -986,10 +985,6 @@ void RAFast::AllocateBasicBlock() { } } - for (UsedInInstrSet::iterator - I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) - MRI->setRegUnitUsed(*I); - // Track registers defined by instruction - early clobbers and tied uses at // this point. UsedInInstr.clear(); @@ -1050,10 +1045,6 @@ void RAFast::AllocateBasicBlock() { killVirtReg(VirtDead[i]); VirtDead.clear(); - for (UsedInInstrSet::iterator - I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) - MRI->setRegUnitUsed(*I); - if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) { DEBUG(dbgs() << "-- coalescing: " << *MI); Coalesced.push_back(MI); @@ -1103,12 +1094,6 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { AllocateBasicBlock(); } - // Add the clobber lists for all the instructions we skipped earlier. - for (const MCInstrDesc *Desc : SkippedInstrs) - if (const uint16_t *Defs = Desc->getImplicitDefs()) - while (*Defs) - MRI->setPhysRegUsed(*Defs++); - // All machine operands and other references to virtual registers have been // replaced. Remove the virtual registers. MRI->clearVirtRegs(); diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp index 7ebcf7f..945cb9e 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -86,6 +86,14 @@ static cl::opt<bool> EnableLocalReassignment( "may be compile time intensive"), cl::init(false)); +static cl::opt<bool> EnableDeferredSpilling( + "enable-deferred-spilling", cl::Hidden, + cl::desc("Instead of spilling a variable right away, defer the actual " + "code insertion to the end of the allocation. That way the " + "allocator might still find a suitable coloring for this " + "variable because of other evicted variables."), + cl::init(false)); + // FIXME: Find a good default for this flag and remove the flag. static cl::opt<unsigned> CSRFirstTimeCost("regalloc-csr-first-time-cost", @@ -157,6 +165,11 @@ class RAGreedy : public MachineFunctionPass, /// Live range will be spilled. No more splitting will be attempted. RS_Spill, + + /// Live range is in memory. Because of other evictions, it might get moved + /// into a register in the end. + RS_Memory, + /// There is nothing more we can do to this live range. Abort compilation /// if it can't be assigned. RS_Done @@ -414,6 +427,7 @@ const char *const RAGreedy::StageName[] = { "RS_Split", "RS_Split2", "RS_Spill", + "RS_Memory", "RS_Done" }; #endif @@ -447,8 +461,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<MachineBlockFrequencyInfo>(); AU.addPreserved<MachineBlockFrequencyInfo>(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); AU.addRequired<SlotIndexes>(); @@ -536,6 +550,13 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // Unsplit ranges that couldn't be allocated immediately are deferred until // everything else has been allocated.
Prio = Size; + } else if (ExtraRegInfo[Reg].Stage == RS_Memory) { + // Memory operands should be considered last. + // Change the priority such that memory operands are assigned in + // the reverse order that they came in. + // TODO: Make this a member variable and probably do something about hints. + static unsigned MemOp = 0; + Prio = MemOp++; } else { // Giant live ranges fall back to the global assignment heuristic, which // prevents excessive spilling in pathological cases. @@ -637,7 +658,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg, //===----------------------------------------------------------------------===// unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) { - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); unsigned PhysReg; while ((PhysReg = Order.next())) { if (PhysReg == PrevReg) @@ -2450,7 +2471,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, unsigned Depth) { unsigned CostPerUseLimit = ~0u; // First try assigning a free register. - AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo); + AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix); if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) { // When NewVRegs is not empty, we may have made decisions such as evicting // a virtual register, go with the earlier decisions and use the physical @@ -2512,13 +2533,23 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, return PhysReg; // Finally spill VirtReg itself. - NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled); - LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); - spiller().spill(LRE); - setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); + if (EnableDeferredSpilling && getStage(VirtReg) < RS_Memory) { + // TODO: This is experimental and in particular, we do not model + // the live range splitting done by spilling correctly. + // We would need a deep integration with the spiller to do the + // right thing here. Anyway, that is still good for early testing. + setStage(VirtReg, RS_Memory); + DEBUG(dbgs() << "Do as if this register is in memory\n"); + NewVRegs.push_back(VirtReg.reg); + } else { + NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled); + LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this); + spiller().spill(LRE); + setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); - if (VerifyEnabled) - MF->verify(this, "After spilling"); + if (VerifyEnabled) + MF->verify(this, "After spilling"); + } // The live virtual register requesting allocation was spilled, so tell // the caller not to allocate anything during this round.
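To see why the static MemOp counter in the enqueue hunk above yields the documented reverse-arrival order, consider a minimal sketch (toy vreg numbers, with a bare std::priority_queue standing in for the allocator's queue):

  #include <cstdio>
  #include <queue>
  #include <utility>

  int main() {
    // Max-heap of (priority, vreg) pairs, mirroring the (Prio, Reg) pairs
    // the allocator enqueues. Deferred ranges receive Prio = MemOp++.
    std::priority_queue<std::pair<unsigned, unsigned>> Queue;
    unsigned MemOp = 0;
    for (unsigned VReg : {100u, 101u, 102u}) // arrival order
      Queue.push({MemOp++, VReg});
    // The highest priority pops first, so deferred ranges are revisited in
    // reverse arrival order: 102, 101, 100.
    while (!Queue.empty()) {
      std::printf("revisit vreg %u\n", Queue.top().second);
      Queue.pop();
    }
    return 0;
  }

The monotonically increasing priority is what makes the most recently deferred range the first one to be reconsidered once everything else has been colored.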
@@ -2555,7 +2586,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { initializeCSRCost(); - calculateSpillWeightsAndHints(*LIS, mf, *Loops, *MBFI); + calculateSpillWeightsAndHints(*LIS, mf, VRM, *Loops, *MBFI); DEBUG(LIS->dump()); diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp index eeff73d..fd28b05 100644 --- a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -497,8 +498,8 @@ void PBQPRAConstraintList::anchor() {} void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { au.setPreservesCFG(); - au.addRequired<AliasAnalysis>(); - au.addPreserved<AliasAnalysis>(); + au.addRequired<AAResultsWrapperPass>(); + au.addPreserved<AAResultsWrapperPass>(); au.addRequired<SlotIndexes>(); au.addPreserved<SlotIndexes>(); au.addRequired<LiveIntervals>(); @@ -724,11 +725,11 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { MachineBlockFrequencyInfo &MBFI = getAnalysis<MachineBlockFrequencyInfo>(); - calculateSpillWeightsAndHints(LIS, MF, getAnalysis<MachineLoopInfo>(), MBFI, - normalizePBQPSpillWeight); - VirtRegMap &VRM = getAnalysis<VirtRegMap>(); + calculateSpillWeightsAndHints(LIS, MF, &VRM, getAnalysis<MachineLoopInfo>(), + MBFI, normalizePBQPSpillWeight); + std::unique_ptr<Spiller> VRegSpiller(createInlineSpiller(*this, MF, VRM)); MF.getRegInfo().freezeReservedRegs(MF); @@ -805,33 +806,17 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { return true; } -namespace { -// A helper class for printing node and register info in a consistent way -class PrintNodeInfo { -public: - typedef PBQP::RegAlloc::PBQPRAGraph Graph; - typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId; - - PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {} - - void print(raw_ostream &OS) const { +/// Create Printable object for node and register info. 
+static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId, + const PBQP::RegAlloc::PBQPRAGraph &G) { + return Printable([NId, &G](raw_ostream &OS) { const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo(); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); unsigned VReg = G.getNodeMetadata(NId).getVReg(); const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg)); OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')'; - } - -private: - const Graph &G; - NodeId NId; -}; - -inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) { - PR.print(OS); - return OS; + }); } -} // anonymous namespace void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const { for (auto NId : nodeIds()) { diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp index c911b9b..c1ff13e 100644 --- a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -93,7 +92,7 @@ namespace { /// A LaneMask to remember on which subregister live ranges we need to call /// shrinkToUses() later. - unsigned ShrinkMask; + LaneBitmask ShrinkMask; /// True if the main range of the currently coalesced intervals should be /// checked for smaller live intervals. @@ -164,15 +163,13 @@ namespace { /// LaneMask are split as necessary. @p LaneMask are the lanes that /// @p ToMerge will occupy in the coalescer register. @p LI has its subrange /// lanemasks already adjusted to the coalesced register. - /// @returns false if live range conflicts couldn't get resolved. - bool mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, - unsigned LaneMask, CoalescerPair &CP); + void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, + LaneBitmask LaneMask, CoalescerPair &CP); /// Join the liveranges of two subregisters. Joins @p RRange into /// @p LRange, @p RRange may be invalid afterwards. - /// @returns false if live range conflicts couldn't get resolved. - bool joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, - unsigned LaneMask, const CoalescerPair &CP); + void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, + LaneBitmask LaneMask, const CoalescerPair &CP); /// We found a non-trivially-coalescable copy. If the source value number is /// defined by a copy from the destination reg see if we can merge these two @@ -224,30 +221,17 @@ namespace { /// Dst, we can drop \p Copy. bool applyTerminalRule(const MachineInstr &Copy) const; - /// Check whether or not \p LI is composed by multiple connected - /// components and if that is the case, fix that. - void splitNewRanges(LiveInterval *LI) { - ConnectedVNInfoEqClasses ConEQ(*LIS); - unsigned NumComps = ConEQ.Classify(LI); - if (NumComps <= 1) - return; - SmallVector<LiveInterval*, 8> NewComps(1, LI); - for (unsigned i = 1; i != NumComps; ++i) { - unsigned VReg = MRI->createVirtualRegister(MRI->getRegClass(LI->reg)); - NewComps.push_back(&LIS->createEmptyInterval(VReg)); - } - - ConEQ.Distribute(&NewComps[0], *MRI); - } - /// Wrapper method for \see LiveIntervals::shrinkToUses. /// This method does the proper fixing of the live-ranges when the afore /// mentioned method returns true. 
void shrinkToUses(LiveInterval *LI, SmallVectorImpl<MachineInstr * > *Dead = nullptr) { - if (LIS->shrinkToUses(LI, Dead)) - // We may have created multiple connected components, split them. - splitNewRanges(LI); + if (LIS->shrinkToUses(LI, Dead)) { + /// Check whether or not \p LI is composed of multiple connected + /// components and if that is the case, fix that. + SmallVector<LiveInterval*, 8> SplitLIs; + LIS->splitSeparateComponents(*LI, SplitLIs); + } } public: @@ -275,7 +259,7 @@ INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing", INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing", "Simple Register Coalescing", false, false) @@ -453,7 +437,7 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); AU.addPreserved<SlotIndexes>(); @@ -679,14 +663,18 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, unsigned UseOpIdx; if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx)) return false; - unsigned Op1, Op2, NewDstIdx; - if (!TII->findCommutedOpIndices(DefMI, Op1, Op2)) - return false; - if (Op1 == UseOpIdx) - NewDstIdx = Op2; - else if (Op2 == UseOpIdx) - NewDstIdx = Op1; - else + + // FIXME: The code below tries to commute 'UseOpIdx' operand with some other + // commutable operand which is expressed by the 'CommuteAnyOperandIndex' value + // passed to the method. That _other_ operand is chosen by + // the findCommutedOpIndices() method. + // + // That is obviously an area for improvement in case of instructions having + // more than 2 operands. For example, if some instruction has 3 commutable + // operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3, + // op#2<->op#3) of commute transformation should be considered/tried here. + unsigned NewDstIdx = TargetInstrInfo::CommuteAnyOperandIndex; + if (!TII->findCommutedOpIndices(DefMI, UseOpIdx, NewDstIdx)) return false; MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx); @@ -719,7 +707,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, // At this point we have decided that it is legal to do this // transformation. Start by commuting the instruction.
MachineBasicBlock *MBB = DefMI->getParent(); - MachineInstr *NewMI = TII->commuteInstruction(DefMI); + MachineInstr *NewMI = + TII->commuteInstruction(DefMI, false, UseOpIdx, NewDstIdx); if (!NewMI) return false; if (TargetRegisterInfo::isVirtualRegister(IntA.reg) && @@ -804,7 +793,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); if (IntB.hasSubRanges()) { if (!IntA.hasSubRanges()) { - unsigned Mask = MRI->getMaxLaneMaskForVReg(IntA.reg); + LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg); IntA.createSubRangeFrom(Allocator, Mask, IntA); } SlotIndex AIdx = CopyIdx.getRegSlot(true); @@ -812,20 +801,21 @@ VNInfo *ASubValNo = SA.getVNInfoAt(AIdx); assert(ASubValNo != nullptr); - unsigned AMask = SA.LaneMask; + LaneBitmask AMask = SA.LaneMask; for (LiveInterval::SubRange &SB : IntB.subranges()) { - unsigned BMask = SB.LaneMask; - unsigned Common = BMask & AMask; + LaneBitmask BMask = SB.LaneMask; + LaneBitmask Common = BMask & AMask; if (Common == 0) continue; - DEBUG( - dbgs() << format("\t\tCopy+Merge %04X into %04X\n", BMask, Common)); - unsigned BRest = BMask & ~AMask; + DEBUG( dbgs() << "\t\tCopy+Merge " << PrintLaneMask(BMask) + << " into " << PrintLaneMask(Common) << '\n'); + LaneBitmask BRest = BMask & ~AMask; LiveInterval::SubRange *CommonRange; if (BRest != 0) { SB.LaneMask = BRest; - DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", BRest)); + DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest) + << '\n'); // Duplicate SubRange for newly merged common stuff. CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB); } else { @@ -842,7 +832,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, AMask &= ~BMask; } if (AMask != 0) { - DEBUG(dbgs() << format("\t\tNew Lane %04X\n", AMask)); + DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n'); LiveRange *NewRange = IntB.createSubRange(Allocator, AMask); VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator); addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo); @@ -1107,7 +1097,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { const LiveInterval &SrcLI = LIS->getInterval(SrcReg); // CopyMI is undef iff SrcReg is not live before the instruction. if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) { - unsigned SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx); + LaneBitmask SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx); for (const LiveInterval::SubRange &SR : SrcLI.subranges()) { if ((SR.LaneMask & SrcMask) == 0) continue; @@ -1128,7 +1118,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { DstLI.MergeValueNumberInto(VNI, PrevVNI); // The affected subregister segments can be removed.
- unsigned DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx); + LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx); for (LiveInterval::SubRange &SR : DstLI.subranges()) { if ((SR.LaneMask & DstMask) == 0) continue; @@ -1147,7 +1137,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { continue; const MachineInstr &MI = *MO.getParent(); SlotIndex UseIdx = LIS->getInstructionIndex(&MI); - unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); bool isLive; if (UseMask != ~0u && DstLI.hasSubRanges()) { isLive = false; @@ -1213,10 +1203,10 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, if (SubIdx != 0 && MO.isUse() && MRI->shouldTrackSubRegLiveness(DstReg)) { if (!DstInt->hasSubRanges()) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); - unsigned Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg); + LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg); DstInt->createSubRangeFrom(Allocator, Mask, *DstInt); } - unsigned Mask = TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubIdx); bool IsUndef = true; SlotIndex MIIdx = UseMI->isDebugValue() ? LIS->getSlotIndexes()->getIndexBefore(UseMI) @@ -1445,8 +1435,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & ShrinkMask) == 0) continue; - DEBUG(dbgs() << "Shrink LaneUses (Lane " - << format("%04X", S.LaneMask) << ")\n"); + DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask) + << ")\n"); LIS->shrinkToUses(S, LI.reg); } LI.removeEmptySubRanges(); @@ -1644,7 +1634,7 @@ class JoinVals { const unsigned SubIdx; /// The LaneMask that this liverange will occupy the coalesced register. May /// be smaller than the lanemask produced by SubIdx when merging subranges. - const unsigned LaneMask; + const LaneBitmask LaneMask; /// This is true when joining sub register ranges, false when joining main /// ranges. @@ -1699,11 +1689,11 @@ class JoinVals { ConflictResolution Resolution; /// Lanes written by this def, 0 for unanalyzed values. - unsigned WriteLanes; + LaneBitmask WriteLanes; /// Lanes with defined values in this register. Other lanes are undef and /// safe to clobber. - unsigned ValidLanes; + LaneBitmask ValidLanes; /// Value in LI being redefined by this def. VNInfo *RedefVNI; @@ -1744,7 +1734,7 @@ class JoinVals { /// Compute the bitmask of lanes actually written by DefMI. /// Set Redef if there are any partial register definitions that depend on the /// previous value of the register. - unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const; + LaneBitmask computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const; /// Find the ultimate value that VNI was copied from. std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const; @@ -1780,12 +1770,12 @@ class JoinVals { /// entry to TaintedVals. /// /// Returns false if the tainted lanes extend beyond the basic block. - bool taintExtent(unsigned, unsigned, JoinVals&, - SmallVectorImpl<std::pair<SlotIndex, unsigned> >&); + bool taintExtent(unsigned, LaneBitmask, JoinVals&, + SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> >&); /// Return true if MI uses any of the given Lanes from Reg. /// This does not include partial redefinitions of Reg. 
- bool usesLanes(const MachineInstr *MI, unsigned, unsigned, unsigned) const; + bool usesLanes(const MachineInstr *MI, unsigned, unsigned, LaneBitmask) const; /// Determine if ValNo is a copy of a value number in LR or Other.LR that will /// be pruned: @@ -1796,7 +1786,7 @@ class JoinVals { bool isPrunedValue(unsigned ValNo, JoinVals &Other); public: - JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, unsigned LaneMask, + JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, LaneBitmask LaneMask, SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp, LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin, bool TrackSubRegLiveness) @@ -1822,8 +1812,8 @@ public: /// Removes subranges starting at copies that get removed. This sometimes /// happens when undefined subranges are copied around. These ranges contain - /// no usefull information and can be removed. - void pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask); + /// no useful information and can be removed. + void pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask); /// Erase any machine instructions that have been coalesced away. /// Add erased instructions to ErasedInstrs. @@ -1840,9 +1830,9 @@ public: }; } // end anonymous namespace -unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) +LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const { - unsigned L = 0; + LaneBitmask L = 0; for (const MachineOperand &MO : DefMI->operands()) { if (!MO.isReg() || MO.getReg() != Reg || !MO.isDef()) continue; @@ -1879,7 +1869,7 @@ std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain( ValueIn = nullptr; for (const LiveInterval::SubRange &S : LI.subranges()) { // Transform lanemask to a mask in the joined live interval. - unsigned SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask); + LaneBitmask SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask); if ((SMask & LaneMask) == 0) continue; LiveQueryResult LRQ = S.Query(Def); @@ -1928,7 +1918,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { const MachineInstr *DefMI = nullptr; if (VNI->isPHIDef()) { // Conservatively assume that all lanes in a PHI are valid. - unsigned Lanes = SubRangeJoin ? 1 : TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask Lanes = SubRangeJoin ? 1 : TRI->getSubRegIndexLaneMask(SubIdx); V.ValidLanes = V.WriteLanes = Lanes; } else { DefMI = Indexes->getInstructionFromIndex(VNI->def); @@ -2190,8 +2180,8 @@ bool JoinVals::mapValues(JoinVals &Other) { } bool JoinVals:: -taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other, - SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) { +taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other, + SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> > &TaintExtent) { VNInfo *VNI = LR.getValNumInfo(ValNo); MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def); SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB); @@ -2230,7 +2220,7 @@ taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other, } bool JoinVals::usesLanes(const MachineInstr *MI, unsigned Reg, unsigned SubIdx, - unsigned Lanes) const { + LaneBitmask Lanes) const { if (MI->isDebugValue()) return false; for (const MachineOperand &MO : MI->operands()) { @@ -2264,8 +2254,8 @@ bool JoinVals::resolveConflicts(JoinVals &Other) { // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the // join, those lanes will be tainted with a wrong value. Get the extent of // the tainted lanes. 
- unsigned TaintedLanes = V.WriteLanes & OtherV.ValidLanes; - SmallVector<std::pair<SlotIndex, unsigned>, 8> TaintExtent; + LaneBitmask TaintedLanes = V.WriteLanes & OtherV.ValidLanes; + SmallVector<std::pair<SlotIndex, LaneBitmask>, 8> TaintExtent; if (!taintExtent(i, TaintedLanes, Other, TaintExtent)) // Tainted lanes would extend beyond the basic block. return false; @@ -2384,7 +2374,7 @@ void JoinVals::pruneValues(JoinVals &Other, } } -void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask) +void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) { // Look for values being erased. bool DidPrune = false; @@ -2401,7 +2391,7 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask) // copied and we must remove that subrange value as well. VNInfo *ValueOut = Q.valueOutOrDead(); if (ValueOut != nullptr && Q.valueIn() == nullptr) { - DEBUG(dbgs() << "\t\tPrune sublane " << format("%04X", S.LaneMask) + DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask) << " at " << Def << "\n"); LIS->pruneValue(S, Def, nullptr); DidPrune = true; @@ -2410,10 +2400,10 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask) continue; } // If a subrange ends at the copy, then a value was copied but only - // partially used later. Shrink the subregister range apropriately. + // partially used later. Shrink the subregister range appropriately. if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) { - DEBUG(dbgs() << "\t\tDead uses at sublane " - << format("%04X", S.LaneMask) << " at " << Def << "\n"); + DEBUG(dbgs() << "\t\tDead uses at sublane " << PrintLaneMask(S.LaneMask) + << " at " << Def << "\n"); ShrinkMask |= S.LaneMask; } } @@ -2477,8 +2467,8 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs, } } -bool RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, - unsigned LaneMask, +void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, + LaneBitmask LaneMask, const CoalescerPair &CP) { SmallVector<VNInfo*, 16> NewVNInfo; JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask, @@ -2492,13 +2482,15 @@ bool RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, // ranges get mapped to the "overflow" lane mask bit which creates unexpected // interferences. if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals)) { - DEBUG(dbgs() << "*** Couldn't join subrange!\n"); - return false; + // We already determined that it is legal to merge the intervals, so this + // should never fail. + llvm_unreachable("*** Couldn't join subrange!\n"); } if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals)) { - DEBUG(dbgs() << "*** Couldn't join subrange!\n"); - return false; + // We already determined that it is legal to merge the intervals, so this + // should never fail. + llvm_unreachable("*** Couldn't join subrange!\n"); } // The merging algorithm in LiveInterval::join() can't handle conflicting @@ -2521,36 +2513,37 @@ bool RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange, DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n"); if (EndPoints.empty()) - return true; + return; // Recompute the parts of the live range we had to remove because of // CR_Replace conflicts. 
DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: " << LRange << '\n'); LIS->extendToIndices(LRange, EndPoints); - return true; } -bool RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, +void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge, - unsigned LaneMask, CoalescerPair &CP) { + LaneBitmask LaneMask, + CoalescerPair &CP) { BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); for (LiveInterval::SubRange &R : LI.subranges()) { - unsigned RMask = R.LaneMask; + LaneBitmask RMask = R.LaneMask; // LaneMask of subregisters common to subrange R and ToMerge. - unsigned Common = RMask & LaneMask; + LaneBitmask Common = RMask & LaneMask; // There is nothing to do without common subregs. if (Common == 0) continue; - DEBUG(dbgs() << format("\t\tCopy+Merge %04X into %04X\n", RMask, Common)); + DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into " + << PrintLaneMask(Common) << '\n'); // LaneMask of subregisters contained in the R range but not in ToMerge, // they have to split into their own subrange. - unsigned LRest = RMask & ~LaneMask; + LaneBitmask LRest = RMask & ~LaneMask; LiveInterval::SubRange *CommonRange; if (LRest != 0) { R.LaneMask = LRest; - DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", LRest)); + DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n'); // Duplicate SubRange for newly merged common stuff. CommonRange = LI.createSubRangeFrom(Allocator, Common, R); } else { @@ -2559,16 +2552,14 @@ bool RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI, CommonRange = &R; } LiveRange RangeCopy(ToMerge, Allocator); - if (!joinSubRegRanges(*CommonRange, RangeCopy, Common, CP)) - return false; + joinSubRegRanges(*CommonRange, RangeCopy, Common, CP); LaneMask &= ~RMask; } if (LaneMask != 0) { - DEBUG(dbgs() << format("\t\tNew Lane %04X\n", LaneMask)); + DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n'); LI.createSubRangeFrom(Allocator, LaneMask, ToMerge); } - return true; } bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { @@ -2602,15 +2593,15 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { // create initial subranges if necessary. unsigned DstIdx = CP.getDstIdx(); if (!LHS.hasSubRanges()) { - unsigned Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask() - : TRI->getSubRegIndexLaneMask(DstIdx); + LaneBitmask Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask() + : TRI->getSubRegIndexLaneMask(DstIdx); // LHS must support subregs or we wouldn't be in this codepath. assert(Mask != 0); LHS.createSubRangeFrom(Allocator, Mask, LHS); } else if (DstIdx != 0) { // Transform LHS lanemasks to new register class if necessary. for (LiveInterval::SubRange &R : LHS.subranges()) { - unsigned Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask); + LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask); R.LaneMask = Mask; } } @@ -2619,41 +2610,21 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { // Determine lanemasks of RHS in the coalesced register and merge subranges. unsigned SrcIdx = CP.getSrcIdx(); - bool Abort = false; if (!RHS.hasSubRanges()) { - unsigned Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask() - : TRI->getSubRegIndexLaneMask(SrcIdx); - if (!mergeSubRangeInto(LHS, RHS, Mask, CP)) - Abort = true; + LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask() + : TRI->getSubRegIndexLaneMask(SrcIdx); + mergeSubRangeInto(LHS, RHS, Mask, CP); } else { // Pair up subranges and merge. 
for (LiveInterval::SubRange &R : RHS.subranges()) { - unsigned Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask); - if (!mergeSubRangeInto(LHS, R, Mask, CP)) { - Abort = true; - break; - } + LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask); + mergeSubRangeInto(LHS, R, Mask, CP); } } - if (Abort) { - // This shouldn't have happened :-( - // However we are aware of at least one existing problem where we - // can't merge subranges when multiple ranges end up in the - // "overflow bit" 32. As a workaround we drop all subregister ranges - // which means we loose some precision but are back to a well defined - // state. - assert(TargetRegisterInfo::isImpreciseLaneMask( - CP.getNewRC()->getLaneMask()) - && "SubRange merge should only fail when merging into bit 32."); - DEBUG(dbgs() << "\tSubrange join aborted!\n"); - LHS.clearSubRanges(); - RHS.clearSubRanges(); - } else { - DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n"); + DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n"); - LHSVals.pruneSubRegValues(LHS, ShrinkMask); - RHSVals.pruneSubRegValues(LHS, ShrinkMask); - } + LHSVals.pruneSubRegValues(LHS, ShrinkMask); + RHSVals.pruneSubRegValues(LHS, ShrinkMask); } // The merging algorithm in LiveInterval::join() can't handle conflicting @@ -2799,7 +2770,7 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const { !isTerminalReg(DstReg, Copy, MRI)) return false; - // DstReg is a terminal node. Check if it inteferes with any other + // DstReg is a terminal node. Check if it interferes with any other // copy involving SrcReg. const MachineBasicBlock *OrigBB = Copy.getParent(); const LiveInterval &DstLI = LIS->getInterval(DstReg); @@ -2903,8 +2874,8 @@ void RegisterCoalescer::joinAllIntervals() { std::vector<MBBPriorityInfo> MBBs; MBBs.reserve(MF->size()); - for (MachineFunction::iterator I = MF->begin(), E = MF->end();I != E;++I){ - MachineBasicBlock *MBB = I; + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { + MachineBasicBlock *MBB = &*I; MBBs.push_back(MBBPriorityInfo(MBB, Loops->getLoopDepth(MBB), JoinSplitEdges && isSplitEdge(MBB))); } @@ -2943,7 +2914,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { TRI = STI.getRegisterInfo(); TII = STI.getInstrInfo(); LIS = &getAnalysis<LiveIntervals>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); Loops = &getAnalysis<MachineLoopInfo>(); if (EnableGlobalCopies == cl::BOU_UNSET) JoinGlobalCopies = STI.enableJoinGlobalCopies(); @@ -2981,22 +2952,25 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { if (MRI->recomputeRegClass(Reg)) { DEBUG(dbgs() << PrintReg(Reg) << " inflated to " << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n'); + ++NumInflated; + LiveInterval &LI = LIS->getInterval(Reg); - unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg); - if (MaxMask == 0) { + if (LI.hasSubRanges()) { // If the inflated register class does not support subregisters anymore // remove the subranges. - LI.clearSubRanges(); - } else { + if (!MRI->shouldTrackSubRegLiveness(Reg)) { + LI.clearSubRanges(); + } else { #ifndef NDEBUG - // If subranges are still supported, then the same subregs should still - // be supported. - for (LiveInterval::SubRange &S : LI.subranges()) { - assert ((S.LaneMask & ~MaxMask) == 0); - } + LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg); + // If subranges are still supported, then the same subregs + // should still be supported. 
+ for (LiveInterval::SubRange &S : LI.subranges()) { + assert((S.LaneMask & ~MaxMask) == 0); + } #endif + } } - ++NumInflated; } } diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp index c3786e5..f33dc3e 100644 --- a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp @@ -59,12 +59,12 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const { dbgs() << "Max Pressure: "; dumpRegSetPressure(MaxSetPressure, TRI); dbgs() << "Live In: "; - for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveInRegs[i], TRI) << " "; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; dbgs() << "Live Out: "; - for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i) - dbgs() << PrintVRegOrUnit(LiveOutRegs[i], TRI) << " "; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, TRI) << " "; dbgs() << '\n'; } @@ -78,11 +78,13 @@ void RegPressureTracker::dump() const { } void PressureDiff::dump(const TargetRegisterInfo &TRI) const { + const char *sep = ""; for (const PressureChange &Change : *this) { - if (!Change.isValid() || Change.getUnitInc() == 0) - continue; - dbgs() << " " << TRI.getRegPressureSetName(Change.getPSet()) + if (!Change.isValid()) + break; + dbgs() << sep << TRI.getRegPressureSetName(Change.getPSet()) << " " << Change.getUnitInc(); + sep = " "; } dbgs() << '\n'; } @@ -90,22 +92,21 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const { /// Increase the current pressure as impacted by these registers and bump /// the high water mark if needed. void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) { - for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) { - PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]); + for (unsigned RegUnit : RegUnits) { + PSetIterator PSetI = MRI->getPressureSets(RegUnit); unsigned Weight = PSetI.getWeight(); for (; PSetI.isValid(); ++PSetI) { CurrSetPressure[*PSetI] += Weight; - if (CurrSetPressure[*PSetI] > P.MaxSetPressure[*PSetI]) { - P.MaxSetPressure[*PSetI] = CurrSetPressure[*PSetI]; - } + P.MaxSetPressure[*PSetI] = + std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]); } } } /// Simply decrease the current pressure as impacted by these registers. void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> RegUnits) { - for (unsigned I = 0, E = RegUnits.size(); I != E; ++I) - decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I])); + for (unsigned RegUnit : RegUnits) + decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnit)); } /// Clear the result so it can be used for another round of pressure tracking. 
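In the RegisterPressure.cpp hunks above, increaseRegPressure folds the high-water-mark update into std::max, and PressureDiff::dump may now stop at the first invalid slot because zero increments no longer linger (see the addPressureChange hunk further down). The sticky-maximum pattern in isolation, as a standalone sketch with plain vectors standing in for the tracker's pressure arrays:

#include <algorithm>
#include <cassert>
#include <vector>

// Bump the current pressure of a set and fold it into the running maximum
// in one statement, as the increaseRegPressure hunk does.
void increase(std::vector<unsigned> &Curr, std::vector<unsigned> &Max,
              unsigned PSet, unsigned Weight) {
  Curr[PSet] += Weight;
  Max[PSet] = std::max(Max[PSet], Curr[PSet]);
}

int main() {
  std::vector<unsigned> Curr(2, 0), Max(2, 0);
  increase(Curr, Max, 0, 3);
  Curr[0] -= 3;            // pressure drops again...
  increase(Curr, Max, 0, 1);
  assert(Max[0] == 3);     // ...but the high-water mark is sticky
}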
@@ -157,10 +158,22 @@ void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) { LiveInRegs.clear(); } -const LiveRange *RegPressureTracker::getLiveRange(unsigned Reg) const { +void LiveRegSet::init(const MachineRegisterInfo &MRI) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + unsigned NumRegUnits = TRI.getNumRegs(); + unsigned NumVirtRegs = MRI.getNumVirtRegs(); + Regs.setUniverse(NumRegUnits + NumVirtRegs); + this->NumRegUnits = NumRegUnits; +} + +void LiveRegSet::clear() { + Regs.clear(); +} + +static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) { if (TargetRegisterInfo::isVirtualRegister(Reg)) - return &LIS->getInterval(Reg); - return LIS->getCachedRegUnit(Reg); + return &LIS.getInterval(Reg); + return LIS.getCachedRegUnit(Reg); } void RegPressureTracker::reset() { @@ -176,8 +189,7 @@ void RegPressureTracker::reset() { else static_cast<RegionPressure&>(P).reset(); - LiveRegs.PhysRegs.clear(); - LiveRegs.VirtRegs.clear(); + LiveRegs.clear(); UntiedDefs.clear(); } @@ -210,8 +222,7 @@ void RegPressureTracker::init(const MachineFunction *mf, P.MaxSetPressure = CurrSetPressure; - LiveRegs.PhysRegs.setUniverse(TRI->getNumRegs()); - LiveRegs.VirtRegs.setUniverse(MRI->getNumVirtRegs()); + LiveRegs.init(*MRI); if (TrackUntiedDefs) UntiedDefs.setUniverse(MRI->getNumVirtRegs()); } @@ -250,14 +261,8 @@ void RegPressureTracker::closeTop() { static_cast<RegionPressure&>(P).TopPos = CurrPos; assert(P.LiveInRegs.empty() && "inconsistent max pressure result"); - P.LiveInRegs.reserve(LiveRegs.PhysRegs.size() + LiveRegs.VirtRegs.size()); - P.LiveInRegs.append(LiveRegs.PhysRegs.begin(), LiveRegs.PhysRegs.end()); - for (SparseSet<unsigned>::const_iterator I = - LiveRegs.VirtRegs.begin(), E = LiveRegs.VirtRegs.end(); I != E; ++I) - P.LiveInRegs.push_back(*I); - std::sort(P.LiveInRegs.begin(), P.LiveInRegs.end()); - P.LiveInRegs.erase(std::unique(P.LiveInRegs.begin(), P.LiveInRegs.end()), - P.LiveInRegs.end()); + P.LiveInRegs.reserve(LiveRegs.size()); + LiveRegs.appendTo(P.LiveInRegs); } /// Set the boundary for the bottom of the region and summarize live outs. @@ -268,21 +273,14 @@ void RegPressureTracker::closeBottom() { static_cast<RegionPressure&>(P).BottomPos = CurrPos; assert(P.LiveOutRegs.empty() && "inconsistent max pressure result"); - P.LiveOutRegs.reserve(LiveRegs.PhysRegs.size() + LiveRegs.VirtRegs.size()); - P.LiveOutRegs.append(LiveRegs.PhysRegs.begin(), LiveRegs.PhysRegs.end()); - for (SparseSet<unsigned>::const_iterator I = - LiveRegs.VirtRegs.begin(), E = LiveRegs.VirtRegs.end(); I != E; ++I) - P.LiveOutRegs.push_back(*I); - std::sort(P.LiveOutRegs.begin(), P.LiveOutRegs.end()); - P.LiveOutRegs.erase(std::unique(P.LiveOutRegs.begin(), P.LiveOutRegs.end()), - P.LiveOutRegs.end()); + P.LiveOutRegs.reserve(LiveRegs.size()); + LiveRegs.appendTo(P.LiveOutRegs); } /// Finalize the region boundaries and record live ins and live outs. 
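The new LiveRegSet::init above folds physical register units and virtual registers into one SparseSet universe, replacing the separate PhysRegs/VirtRegs sets the tracker carried before; that is what lets closeTop/closeBottom shrink to a single appendTo call. A standalone sketch of the disjoint-index idea, with std::unordered_set standing in for llvm::SparseSet and the virtual-register indexing kept abstract (the real code maps through TargetRegisterInfo):

#include <cassert>
#include <unordered_set>

struct LiveRegSet {
  unsigned NumRegUnits = 0;
  std::unordered_set<unsigned> Regs;

  void init(unsigned NumUnits, unsigned NumVirtRegs) {
    NumRegUnits = NumUnits;
    Regs.reserve(NumUnits + NumVirtRegs); // SparseSet: setUniverse(...)
  }
  // Physical units occupy [0, NumRegUnits); virtreg indices are shifted past
  // them, so one flat set can hold both without collisions.
  unsigned key(unsigned Idx, bool IsVirtual) const {
    return IsVirtual ? NumRegUnits + Idx : Idx;
  }
  bool insert(unsigned Idx, bool IsVirtual) {
    return Regs.insert(key(Idx, IsVirtual)).second;
  }
  bool contains(unsigned Idx, bool IsVirtual) const {
    return Regs.count(key(Idx, IsVirtual)) != 0;
  }
};

int main() {
  LiveRegSet S;
  S.init(/*NumUnits=*/8, /*NumVirtRegs=*/4);
  S.insert(3, /*IsVirtual=*/false); // reg unit 3
  S.insert(3, /*IsVirtual=*/true);  // virtreg index 3 -> key 11, no collision
  assert(S.contains(3, false) && S.contains(3, true));
}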
void RegPressureTracker::closeRegion() { if (!isTopClosed() && !isBottomClosed()) { - assert(LiveRegs.PhysRegs.empty() && LiveRegs.VirtRegs.empty() && - "no region boundary"); + assert(LiveRegs.size() == 0 && "no region boundary"); return; } if (!isBottomClosed()) @@ -299,8 +297,7 @@ void RegPressureTracker::closeRegion() { void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) { LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0); assert(isBottomClosed() && "need bottom-up tracking to intialize."); - for (unsigned i = 0, e = P.LiveOutRegs.size(); i < e; ++i) { - unsigned Reg = P.LiveOutRegs[i]; + for (unsigned Reg : P.LiveOutRegs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !RPTracker.hasUntiedDef(Reg)) { increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg)); @@ -315,69 +312,96 @@ static bool containsReg(ArrayRef<unsigned> RegUnits, unsigned RegUnit) { } namespace { + /// Collect this instruction's unique uses and defs into SmallVectors for /// processing defs and uses in order. /// /// FIXME: always ignore tied opers -class RegisterOperands { - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; +class RegisterOperandsCollector { + RegisterOperands &RegOpers; + const TargetRegisterInfo &TRI; + const MachineRegisterInfo &MRI; bool IgnoreDead; -public: - SmallVector<unsigned, 8> Uses; - SmallVector<unsigned, 8> Defs; - SmallVector<unsigned, 8> DeadDefs; - - RegisterOperands(const TargetRegisterInfo *tri, - const MachineRegisterInfo *mri, bool ID = false): - TRI(tri), MRI(mri), IgnoreDead(ID) {} + RegisterOperandsCollector(RegisterOperands &RegOpers, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) + : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {} + + void collectInstr(const MachineInstr &MI) const { + for (ConstMIBundleOperands OperI(&MI); OperI.isValid(); ++OperI) + collectOperand(*OperI); + + // Remove redundant physreg dead defs. + SmallVectorImpl<unsigned>::iterator I = + std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), + std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); + RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); + } - /// Push this operand's register onto the correct vector. - void collect(const MachineOperand &MO) { + /// Push this operand's register onto the correct vectors. + void collectOperand(const MachineOperand &MO) const { if (!MO.isReg() || !MO.getReg()) return; + unsigned Reg = MO.getReg(); if (MO.readsReg()) - pushRegUnits(MO.getReg(), Uses); + pushRegUnits(Reg, RegOpers.Uses); if (MO.isDef()) { if (MO.isDead()) { if (!IgnoreDead) - pushRegUnits(MO.getReg(), DeadDefs); - } - else - pushRegUnits(MO.getReg(), Defs); + pushRegUnits(Reg, RegOpers.DeadDefs); + } else + pushRegUnits(Reg, RegOpers.Defs); } } -protected: - void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) { + void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) const { if (TargetRegisterInfo::isVirtualRegister(Reg)) { if (containsReg(RegUnits, Reg)) return; RegUnits.push_back(Reg); - } - else if (MRI->isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + } else if (MRI.isAllocatable(Reg)) { + for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) { if (containsReg(RegUnits, *Units)) continue; RegUnits.push_back(*Units); } } } + + friend class llvm::RegisterOperands; }; -} // namespace -/// Collect physical and virtual register operands. 
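collectInstr above prunes DeadDefs entries that also occur in Defs using the erase(remove_if(...)) idiom, spelled through the long-deprecated std::bind1st/std::ptr_fun pair. The same pruning with a lambda, as a standalone sketch:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<unsigned> Defs = {1, 4, 7};
  std::vector<unsigned> DeadDefs = {2, 4, 9, 7};

  // Drop any dead def that is also a live def; remove_if compacts survivors
  // to the front and erase trims the tail.
  DeadDefs.erase(std::remove_if(DeadDefs.begin(), DeadDefs.end(),
                                [&](unsigned R) {
                                  return std::find(Defs.begin(), Defs.end(),
                                                   R) != Defs.end();
                                }),
                 DeadDefs.end());

  assert((DeadDefs == std::vector<unsigned>{2, 9}));
}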
-static void collectOperands(const MachineInstr *MI, - RegisterOperands &RegOpers) { - for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) - RegOpers.collect(*OperI); +} // namespace - // Remove redundant physreg dead defs. - SmallVectorImpl<unsigned>::iterator I = - std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), - std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); - RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); +void RegisterOperands::collect(const MachineInstr &MI, + const TargetRegisterInfo &TRI, + const MachineRegisterInfo &MRI, + bool IgnoreDead) { + RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead); + Collector.collectInstr(MI); +} + +void RegisterOperands::detectDeadDefs(const MachineInstr &MI, + const LiveIntervals &LIS) { + SlotIndex SlotIdx = LIS.getInstructionIndex(&MI); + for (SmallVectorImpl<unsigned>::iterator RI = Defs.begin(); + RI != Defs.end(); /*empty*/) { + unsigned Reg = *RI; + const LiveRange *LR = getLiveRange(LIS, Reg); + if (LR != nullptr) { + LiveQueryResult LRQ = LR->Query(SlotIdx); + if (LRQ.isDeadDef()) { + // LiveIntervals knows this is a dead even though it's MachineOperand is + // not flagged as such. + DeadDefs.push_back(Reg); + RI = Defs.erase(RI); + continue; + } + } + ++RI; + } } /// Initialize an array of N PressureDiffs. @@ -392,6 +416,18 @@ void PressureDiffs::init(unsigned N) { PDiffArray = reinterpret_cast<PressureDiff*>(calloc(N, sizeof(PressureDiff))); } +void PressureDiffs::addInstruction(unsigned Idx, + const RegisterOperands &RegOpers, + const MachineRegisterInfo &MRI) { + PressureDiff &PDiff = (*this)[Idx]; + assert(!PDiff.begin()->isValid() && "stale PDiff"); + for (unsigned Reg : RegOpers.Defs) + PDiff.addPressureChange(Reg, true, &MRI); + + for (unsigned Reg : RegOpers.Uses) + PDiff.addPressureChange(Reg, false, &MRI); +} + /// Add a change in pressure to the pressure diff of a given instruction. void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec, const MachineRegisterInfo *MRI) { @@ -399,7 +435,7 @@ void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec, int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight(); for (; PSetI.isValid(); ++PSetI) { // Find an existing entry in the pressure diff for this PSet. - PressureDiff::iterator I = begin(), E = end(); + PressureDiff::iterator I = nonconst_begin(), E = nonconst_end(); for (; I != E && I->isValid(); ++I) { if (I->getPSet() >= *PSetI) break; @@ -411,30 +447,28 @@ void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec, if (!I->isValid() || I->getPSet() != *PSetI) { PressureChange PTmp = PressureChange(*PSetI); for (PressureDiff::iterator J = I; J != E && PTmp.isValid(); ++J) - std::swap(*J,PTmp); + std::swap(*J, PTmp); } // Update the units for this pressure set. - I->setUnitInc(I->getUnitInc() + Weight); + unsigned NewUnitInc = I->getUnitInc() + Weight; + if (NewUnitInc != 0) { + I->setUnitInc(NewUnitInc); + } else { + // Remove entry + PressureDiff::iterator J; + for (J = std::next(I); J != E && J->isValid(); ++J, ++I) + *I = *J; + if (J != E) + *I = *J; + } } } -/// Record the pressure difference induced by the given operand list. 
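Two related changes land above: detectDeadDefs moves the LiveIntervals dead-def query out of the tracker's hot path, and addPressureChange now erases an entry whose unit increment cancels to zero instead of keeping a stale "+0" record. The cancel-to-zero compaction in isolation, sketched over a sorted vector of (PSet, UnitInc) pairs:

#include <cassert>
#include <utility>
#include <vector>

void addPressureChange(std::vector<std::pair<unsigned, int>> &Diff,
                       unsigned PSet, int Weight) {
  // Find the slot for PSet, keeping the vector sorted by set number.
  auto I = Diff.begin();
  while (I != Diff.end() && I->first < PSet)
    ++I;
  if (I == Diff.end() || I->first != PSet)
    I = Diff.insert(I, {PSet, 0});
  I->second += Weight;
  if (I->second == 0)
    Diff.erase(I); // increments cancelled out: drop the entry entirely
}

int main() {
  std::vector<std::pair<unsigned, int>> Diff;
  addPressureChange(Diff, 2, +3);
  addPressureChange(Diff, 2, -3);
  assert(Diff.empty()); // a net-zero change leaves no stale entry behind
}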
-static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers, - const MachineRegisterInfo *MRI) { - assert(!PDiff.begin()->isValid() && "stale PDiff"); - - for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Defs[i], true, MRI); - - for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i) - PDiff.addPressureChange(RegOpers.Uses[i], false, MRI); -} - /// Force liveness of registers. void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) { - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - if (LiveRegs.insert(Regs[i])) - increaseRegPressure(Regs[i]); + for (unsigned Reg : Regs) { + if (LiveRegs.insert(Reg)) + increaseRegPressure(Reg); } } @@ -465,42 +499,9 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) { /// registers that are both defined and used by the instruction. If a pressure /// difference pointer is provided record the changes is pressure caused by this /// instruction independent of liveness. -bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses, - PressureDiff *PDiff) { - // Check for the top of the analyzable region. - if (CurrPos == MBB->begin()) { - closeRegion(); - return false; - } - if (!isBottomClosed()) - closeBottom(); - - // Open the top of the region using block iterators. - if (!RequireIntervals && isTopClosed()) - static_cast<RegionPressure&>(P).openTop(CurrPos); - - // Find the previous instruction. - do - --CurrPos; - while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); - - if (CurrPos->isDebugValue()) { - closeRegion(); - return false; - } - SlotIndex SlotIdx; - if (RequireIntervals) - SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); - - // Open the top of the region using slot indexes. - if (RequireIntervals && isTopClosed()) - static_cast<IntervalPressure&>(P).openTop(SlotIdx); - - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); - - if (PDiff) - collectPDiff(*PDiff, RegOpers, MRI); +void RegPressureTracker::recede(const RegisterOperands &RegOpers, + SmallVectorImpl<unsigned> *LiveUses) { + assert(!CurrPos->isDebugValue()); // Boost pressure for all dead defs together. increaseRegPressure(RegOpers.DeadDefs); @@ -508,37 +509,23 @@ bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses, // Kill liveness at live defs. // TODO: consider earlyclobbers? - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; - bool DeadDef = false; - if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); - if (LR) { - LiveQueryResult LRQ = LR->Query(SlotIdx); - DeadDef = LRQ.isDeadDef(); - } - } - if (DeadDef) { - // LiveIntervals knows this is a dead even though it's MachineOperand is - // not flagged as such. Since this register will not be recorded as - // live-out, increase its PDiff value to avoid underflowing pressure. - if (PDiff) - PDiff->addPressureChange(Reg, false, MRI); - } else { - if (LiveRegs.erase(Reg)) - decreaseRegPressure(Reg); - else - discoverLiveOut(Reg); - } + for (unsigned Reg : RegOpers.Defs) { + if (LiveRegs.erase(Reg)) + decreaseRegPressure(Reg); + else + discoverLiveOut(Reg); } + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) { // Adjust liveouts if LiveIntervals are available. 
if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); if (LR) { LiveQueryResult LRQ = LR->Query(SlotIdx); if (!LRQ.isKill() && !LRQ.valueDefined()) @@ -552,24 +539,53 @@ bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses, } } if (TrackUntiedDefs) { - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (TargetRegisterInfo::isVirtualRegister(Reg) && !LiveRegs.contains(Reg)) UntiedDefs.insert(Reg); } } - return true; +} + +void RegPressureTracker::recedeSkipDebugValues() { + assert(CurrPos != MBB->begin()); + if (!isBottomClosed()) + closeBottom(); + + // Open the top of the region using block iterators. + if (!RequireIntervals && isTopClosed()) + static_cast<RegionPressure&>(P).openTop(CurrPos); + + // Find the previous instruction. + do + --CurrPos; + while (CurrPos != MBB->begin() && CurrPos->isDebugValue()); + + SlotIndex SlotIdx; + if (RequireIntervals) + SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot(); + + // Open the top of the region using slot indexes. + if (RequireIntervals && isTopClosed()) + static_cast<IntervalPressure&>(P).openTop(SlotIdx); +} + +void RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses) { + recedeSkipDebugValues(); + + const MachineInstr &MI = *CurrPos; + RegisterOperands RegOpers; + RegOpers.collect(MI, *TRI, *MRI); + if (RequireIntervals) + RegOpers.detectDeadDefs(MI, *LIS); + + recede(RegOpers, LiveUses); } /// Advance across the current instruction. -bool RegPressureTracker::advance() { +void RegPressureTracker::advance() { assert(!TrackUntiedDefs && "unsupported mode"); - // Check for the bottom of the analyzable region. - if (CurrPos == MBB->end()) { - closeRegion(); - return false; - } + assert(CurrPos != MBB->end()); if (!isTopClosed()) closeTop(); @@ -585,11 +601,10 @@ bool RegPressureTracker::advance() { static_cast<RegionPressure&>(P).openBottom(CurrPos); } - RegisterOperands RegOpers(TRI, MRI); - collectOperands(CurrPos, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*CurrPos, *TRI, *MRI); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { // Discover live-ins. bool isLive = LiveRegs.contains(Reg); if (!isLive) @@ -597,24 +612,21 @@ bool RegPressureTracker::advance() { // Kill liveness at last uses. bool lastUse = false; if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); lastUse = LR && LR->Query(SlotIdx).isKill(); - } - else { + } else { // Allocatable physregs are always single-use before register rewriting. lastUse = !TargetRegisterInfo::isVirtualRegister(Reg); } if (lastUse && isLive) { LiveRegs.erase(Reg); decreaseRegPressure(Reg); - } - else if (!lastUse && !isLive) + } else if (!lastUse && !isLive) increaseRegPressure(Reg); } // Generate liveness for defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; + for (unsigned Reg : RegOpers.Defs) { if (LiveRegs.insert(Reg)) increaseRegPressure(Reg); } @@ -627,7 +639,6 @@ bool RegPressureTracker::advance() { do ++CurrPos; while (CurrPos != MBB->end() && CurrPos->isDebugValue()); - return true; } /// Find the max change in excess pressure across all sets. 
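The recede() split above (recedeSkipDebugValues, then recede over a caller-supplied RegisterOperands) exists so a client can reuse one operand walk for several purposes. A sketch of the calling sequence, matching the pattern the buildSchedGraph hunk later in this diff adopts; every member function shown appears in this commit, but the wrapper itself is hypothetical and only compiles inside an LLVM tree of this vintage:

#include "llvm/CodeGen/RegisterPressure.h"

using namespace llvm;

void recedeOverInstr(RegPressureTracker &RPTracker, PressureDiffs &PDiffs,
                     unsigned NodeNum, const TargetRegisterInfo &TRI,
                     const MachineRegisterInfo &MRI) {
  RPTracker.recedeSkipDebugValues();             // step CurrPos past DBG_VALUEs
  const MachineInstr &MI = *RPTracker.getPos();
  RegisterOperands RegOpers;
  RegOpers.collect(MI, TRI, MRI);                // one operand walk...
  PDiffs.addInstruction(NodeNum, RegOpers, MRI); // ...feeds pressure diffs
  RPTracker.recede(RegOpers);                    // ...and liveness/pressure
}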
@@ -653,8 +664,7 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec, PDiff = 0; // Under the limit else PDiff = PNew - Limit; // Just exceeded limit. - } - else if (Limit > PNew) + } else if (Limit > PNew) PDiff = Limit - POld; // Just obeyed limit. if (PDiff) { @@ -719,34 +729,19 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). - RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true); - collectOperands(MI, RegOpers); - - // Boost max pressure for all dead defs together. - // Since CurrSetPressure and MaxSetPressure - increaseRegPressure(RegOpers.DeadDefs); - decreaseRegPressure(RegOpers.DeadDefs); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI, /*IgnoreDead=*/true); + assert(RegOpers.DeadDefs.size() == 0); + if (RequireIntervals) + RegOpers.detectDeadDefs(*MI, *LIS); // Kill liveness at live defs. - for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) { - unsigned Reg = RegOpers.Defs[i]; - bool DeadDef = false; - if (RequireIntervals) { - const LiveRange *LR = getLiveRange(Reg); - if (LR) { - SlotIndex SlotIdx = LIS->getInstructionIndex(MI); - LiveQueryResult LRQ = LR->Query(SlotIdx); - DeadDef = LRQ.isDeadDef(); - } - } - if (!DeadDef) { - if (!containsReg(RegOpers.Uses, Reg)) - decreaseRegPressure(Reg); - } + for (unsigned Reg : RegOpers.Defs) { + if (!containsReg(RegOpers.Uses, Reg)) + decreaseRegPressure(Reg); } // Generate liveness for uses. - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (!LiveRegs.contains(Reg)) increaseRegPressure(Reg); } @@ -853,7 +848,8 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, unsigned MNew = MOld; // Ignore DeadDefs here because they aren't captured by PressureChange. unsigned PNew = POld + PDiffI->getUnitInc(); - assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld) && "PSet overflow"); + assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld) + && "PSet overflow/underflow"); if (PNew > MOld) MNew = PNew; // Check if current pressure has exceeded the limit. @@ -892,19 +888,13 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff, } /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx). -static bool findUseBetween(unsigned Reg, - SlotIndex PriorUseIdx, SlotIndex NextUseIdx, - const MachineRegisterInfo *MRI, +static bool findUseBetween(unsigned Reg, SlotIndex PriorUseIdx, + SlotIndex NextUseIdx, const MachineRegisterInfo &MRI, const LiveIntervals *LIS) { - for (MachineRegisterInfo::use_instr_nodbg_iterator - UI = MRI->use_instr_nodbg_begin(Reg), - UE = MRI->use_instr_nodbg_end(); UI != UE; ++UI) { - const MachineInstr* MI = &*UI; - if (MI->isDebugValue()) - continue; - SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); - if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx) - return true; + for (const MachineInstr &MI : MRI.use_nodbg_instructions(Reg)) { + SlotIndex InstSlot = LIS->getInstructionIndex(&MI).getRegSlot(); + if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx) + return true; } return false; } @@ -919,8 +909,8 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { assert(!MI->isDebugValue() && "Expect a nondebug instruction."); // Account for register pressure similar to RegPressureTracker::recede(). 
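The rewritten findUseBetween above scans uses inside the half-open slot window [PriorUseIdx, NextUseIdx), now via the use_nodbg_instructions range so debug values never appear. The window semantics in isolation, with plain unsigned slot numbers standing in for llvm::SlotIndex:

#include <cassert>
#include <vector>

bool findUseBetween(const std::vector<unsigned> &UseSlots,
                    unsigned PriorUseIdx, unsigned NextUseIdx) {
  for (unsigned Slot : UseSlots)
    if (Slot >= PriorUseIdx && Slot < NextUseIdx) // inclusive low, exclusive high
      return true;
  return false;
}

int main() {
  std::vector<unsigned> Uses = {4, 10, 16};
  assert(findUseBetween(Uses, 8, 12));   // 10 lies inside [8, 12)
  assert(!findUseBetween(Uses, 11, 16)); // 16 is excluded: half-open window
}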
- RegisterOperands RegOpers(TRI, MRI); - collectOperands(MI, RegOpers); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI); // Kill liveness at last uses. Assume allocatable physregs are single-use // rather than checking LiveIntervals. @@ -928,21 +918,18 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { if (RequireIntervals) SlotIdx = LIS->getInstructionIndex(MI).getRegSlot(); - for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) { - unsigned Reg = RegOpers.Uses[i]; + for (unsigned Reg : RegOpers.Uses) { if (RequireIntervals) { // FIXME: allow the caller to pass in the list of vreg uses that remain // to be bottom-scheduled to avoid searching uses at each query. SlotIndex CurrIdx = getCurrSlot(); - const LiveRange *LR = getLiveRange(Reg); + const LiveRange *LR = getLiveRange(*LIS, Reg); if (LR) { LiveQueryResult LRQ = LR->Query(SlotIdx); - if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) { + if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, *MRI, LIS)) decreaseRegPressure(Reg); - } } - } - else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + } else if (!TargetRegisterInfo::isVirtualRegister(Reg)) { // Allocatable physregs are always single-use before register rewriting. decreaseRegPressure(Reg); } @@ -966,7 +953,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) { /// This is expensive for an on-the-fly query because it calls /// bumpDownwardPressure to recompute the pressure sets based on current /// liveness. We don't yet have a fast version of downward pressure tracking -/// analagous to getUpwardPressureDelta. +/// analogous to getUpwardPressureDelta. void RegPressureTracker:: getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta, ArrayRef<PressureChange> CriticalPSets, diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp index 4176686..8fa1bf7 100644 --- a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -31,9 +31,12 @@ using namespace llvm; #define DEBUG_TYPE "reg-scavenging" /// setUsed - Set the register units of this register as used. -void RegScavenger::setRegUsed(unsigned Reg) { - for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) - RegUnitsAvailable.reset(*RUI); +void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) { + for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { + LaneBitmask UnitMask = (*RUI).second; + if (UnitMask == 0 || (LaneMask & UnitMask) != 0) + RegUnitsAvailable.reset((*RUI).first); + } } void RegScavenger::initRegState() { @@ -50,9 +53,8 @@ void RegScavenger::initRegState() { return; // Live-in registers are in use. - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) - setRegUsed(*I); + for (const auto &LI : MBB->liveins()) + setRegUsed(LI.PhysReg, LI.LaneMask); // Pristine CSRs are also unavailable. 
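RegScavenger::setRegUsed above gains a lane-mask parameter: a register unit is reserved only if its per-unit mask overlaps the requested lanes, and a unit mask of zero (no lane information) is treated conservatively as always used. A standalone sketch, with (Unit, UnitMask) pairs standing in for MCRegUnitMaskIterator output:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using LaneBitmask = std::uint32_t; // stand-in, as elsewhere in this release

void setRegUsed(std::vector<bool> &UnitsAvailable,
                const std::vector<std::pair<unsigned, LaneBitmask>> &RegUnits,
                LaneBitmask LaneMask) {
  for (const auto &UM : RegUnits)
    if (UM.second == 0 || (LaneMask & UM.second) != 0)
      UnitsAvailable[UM.first] = false; // overlapping or unknown lanes: used
}

int main() {
  std::vector<bool> Avail(4, true);
  // Unit 0 covers lane 0b01, unit 1 covers lane 0b10, unit 2 has no mask info.
  setRegUsed(Avail, {{0, 0b01}, {1, 0b10}, {2, 0}}, /*LaneMask=*/0b01);
  assert(!Avail[0] && Avail[1] && !Avail[2] && Avail[3]);
}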
const MachineFunction &MF = *MBB->getParent(); diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp index 76a7fef..efde61e 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -372,7 +372,6 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { dbgs() << "\n"; } } - dbgs() << "\n"; } #endif diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 390b6d2..11b246a 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -13,12 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/ADT/IntEqClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -51,15 +51,11 @@ static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, - bool IsPostRAFlag, bool RemoveKillFlags, - LiveIntervals *lis) - : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis), - IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags), - CanHandleTerminators(false), FirstDbgValue(nullptr) { - assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals"); + bool RemoveKillFlags) + : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), + RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false), + TrackLaneMasks(false), FirstDbgValue(nullptr) { DbgValues.clear(); - assert(!(IsPostRA && MRI.getNumVirtRegs()) && - "Virtual registers must be removed prior to PostRA scheduling"); const TargetSubtargetInfo &ST = mf.getSubtarget(); SchedModel.init(ST.getSchedModel(), &ST, TII); @@ -230,11 +226,8 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { if (TRI->isPhysicalRegister(Reg)) Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); - else { - assert(!IsPostRA && "Virtual register encountered after regalloc."); - if (MO.readsReg()) // ignore undef operands - addVRegUseDeps(&ExitSU, i); - } + else if (MO.readsReg()) // ignore undef operands + addVRegUseDeps(&ExitSU, i); } } else { // For others, e.g. fallthrough, conditional branch, assume the exit @@ -242,11 +235,9 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { assert(Uses.empty() && "Uses in set before adding deps?"); for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; - if (!Uses.contains(Reg)) - Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); + for (const auto &LI : (*SI)->liveins()) { + if (!Uses.contains(LI.PhysReg)) + Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg)); } } } @@ -371,6 +362,20 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { } } +LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const +{ + unsigned Reg = MO.getReg(); + // No point in tracking lanemasks if we don't have interesting subregisters. 
+ const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + if (!RC.HasDisjunctSubRegs) + return ~0u; + + unsigned SubReg = MO.getSubReg(); + if (SubReg == 0) + return RC.getLaneMask(); + return TRI->getSubRegIndexLaneMask(SubReg); +} + /// addVRegDefDeps - Add register output and data dependencies from this SUnit /// to instructions that occur later in the same scheduling region if they read /// from or write to the virtual register defined at OperIdx. @@ -378,35 +383,106 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { /// TODO: Hoist loop induction variable increments. This has to be /// reevaluated. Generally, IV scheduling should be done before coalescing. void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { - const MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + MachineInstr *MI = SU->getInstr(); + MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + LaneBitmask DefLaneMask; + LaneBitmask KillLaneMask; + if (TrackLaneMasks) { + bool IsKill = MO.getSubReg() == 0 || MO.isUndef(); + DefLaneMask = getLaneMaskForMO(MO); + // If we have a <read-undef> flag, none of the lane values comes from an + // earlier instruction. + KillLaneMask = IsKill ? ~0u : DefLaneMask; + + // Clear undef flag, we'll re-add it later once we know which subregister + // Def is first. + MO.setIsUndef(false); + } else { + DefLaneMask = ~0u; + KillLaneMask = ~0u; + } - // Singly defined vregs do not have output/anti dependencies. - // The current operand is a def, so we have at least one. - // Check here if there are any others... + if (MO.isDead()) { + assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() && + "Dead defs should have no uses"); + } else { + // Add data dependence to all uses we found so far. + const TargetSubtargetInfo &ST = MF.getSubtarget(); + for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg), + E = CurrentVRegUses.end(); I != E; /*empty*/) { + LaneBitmask LaneMask = I->LaneMask; + // Ignore uses of other lanes. + if ((LaneMask & KillLaneMask) == 0) { + ++I; + continue; + } + + if ((LaneMask & DefLaneMask) != 0) { + SUnit *UseSU = I->SU; + MachineInstr *Use = UseSU->getInstr(); + SDep Dep(SU, SDep::Data, Reg); + Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use, + I->OperandIndex)); + ST.adjustSchedDependency(SU, UseSU, Dep); + UseSU->addPred(Dep); + } + + LaneMask &= ~KillLaneMask; + // If we found a Def for all lanes of this use, remove it from the list. + if (LaneMask != 0) { + I->LaneMask = LaneMask; + ++I; + } else + I = CurrentVRegUses.erase(I); + } + } + + // Shortcut: Singly defined vregs do not have output/anti dependencies. if (MRI.hasOneDef(Reg)) return; - // Add output dependence to the next nearest def of this vreg. + // Add output dependence to the next nearest defs of this vreg. // // Unless this definition is dead, the output dependence should be // transitively redundant with antidependencies from this definition's // uses. We're conservative for now until we have a way to guarantee the uses // are not eliminated sometime during scheduling. The output dependence edge // is also useful if output latency exceeds def-use latency. 
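The addVRegDefDeps hunk above retires each pending use lane by lane: a def with mask KillLaneMask consumes the matching lanes of every recorded use, erasing a use only once all of its lanes have found a def. The pruning loop in isolation, as a standalone sketch over bare lane masks:

#include <cassert>
#include <cstdint>
#include <vector>

using LaneBitmask = std::uint32_t;

void pruneUses(std::vector<LaneBitmask> &PendingUseLanes,
               LaneBitmask KillMask) {
  for (auto I = PendingUseLanes.begin(); I != PendingUseLanes.end();) {
    LaneBitmask Remaining = *I & ~KillMask;
    if (Remaining != 0) {
      *I = Remaining; // def covered only part of this use; keep the rest
      ++I;
    } else {
      I = PendingUseLanes.erase(I); // def covered every lane of the use
    }
  }
}

int main() {
  std::vector<LaneBitmask> Uses = {0b11, 0b10, 0b01};
  pruneUses(Uses, /*KillMask=*/0b01);
  assert((Uses == std::vector<LaneBitmask>{0b10, 0b10}));
}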
- VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI == VRegDefs.end()) - VRegDefs.insert(VReg2SUnit(Reg, SU)); - else { - SUnit *DefSU = DefI->SU; - if (DefSU != SU && DefSU != &ExitSU) { - SDep Dep(SU, SDep::Output, Reg); - Dep.setLatency( - SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); - DefSU->addPred(Dep); - } - DefI->SU = SU; + LaneBitmask LaneMask = DefLaneMask; + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for other lanes. + if ((V2SU.LaneMask & LaneMask) == 0) + continue; + // Add an output dependence. + SUnit *DefSU = V2SU.SU; + // Ignore additional defs of the same lanes in one instruction. This can + // happen because lanemasks are shared for targets with too many + // subregisters. We also use some representration tricks/hacks where we + // add super-register defs/uses, to imply that although we only access parts + // of the reg we care about the full one. + if (DefSU == SU) + continue; + SDep Dep(SU, SDep::Output, Reg); + Dep.setLatency( + SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); + DefSU->addPred(Dep); + + // Update current definition. This can get tricky if the def was about a + // bigger lanemask before. We then have to shrink it and create a new + // VReg2SUnit for the non-overlapping part. + LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask; + LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask; + if (NonOverlapMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU)); + V2SU.SU = SU; + V2SU.LaneMask = OverlapMask; } + // If there was no CurrentVRegDefs entry for some lanes yet, create one. + if (LaneMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } /// addVRegUseDeps - Add a register data dependency if the instruction that @@ -416,59 +492,34 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { /// /// TODO: Handle ExitSU "uses" properly. void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { - MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + const MachineInstr *MI = SU->getInstr(); + const MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + + // Remember the use. Data dependencies will be added when we find the def. + LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u; + CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU)); + + // Add antidependences to the following defs of the vreg. + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for unrelated lanes. + LaneBitmask PrevDefLaneMask = V2SU.LaneMask; + if ((PrevDefLaneMask & LaneMask) == 0) + continue; + if (V2SU.SU == SU) + continue; - // Record this local VReg use. - VReg2UseMap::iterator UI = VRegUses.find(Reg); - for (; UI != VRegUses.end(); ++UI) { - if (UI->SU == SU) - break; + V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg)); } - if (UI == VRegUses.end()) - VRegUses.insert(VReg2SUnit(Reg, SU)); - - // Lookup this operand's reaching definition. - assert(LIS && "vreg dependencies requires LiveIntervals"); - LiveQueryResult LRQ - = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI)); - VNInfo *VNI = LRQ.valueIn(); - - // VNI will be valid because MachineOperand::readsReg() is checked by caller. 
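The CurrentVRegDefs update above has a subtle case the old single-def map never faced: a new def may cover only part of an existing entry's lanes, so the entry is split into an overlapping part (retargeted to the new def) and a non-overlapping part (which keeps the old def). The mask arithmetic of that split, checked standalone:

#include <cassert>
#include <cstdint>

using LaneBitmask = std::uint32_t;

int main() {
  LaneBitmask PrevDef = 0b1110, NewDef = 0b0110;
  LaneBitmask Overlap = PrevDef & NewDef;     // lanes now owned by the new def
  LaneBitmask NonOverlap = PrevDef & ~NewDef; // lanes still owned by the old def
  assert(Overlap == 0b0110 && NonOverlap == 0b1000);
  assert((Overlap | NonOverlap) == PrevDef);  // no lanes lost in the split
}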
- assert(VNI && "No value to read by operand"); - MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def); - // Phis and other noninstructions (after coalescing) have a NULL Def. - if (Def) { - SUnit *DefSU = getSUnit(Def); - if (DefSU) { - // The reaching Def lives within this scheduling region. - // Create a data dependence. - SDep dep(DefSU, SDep::Data, Reg); - // Adjust the dependence latency using operand def/use information, then - // allow the target to perform its own adjustments. - int DefOp = Def->findRegisterDefOperandIdx(Reg); - dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx)); - - const TargetSubtargetInfo &ST = MF.getSubtarget(); - ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep)); - SU->addPred(dep); - } - } - - // Add antidependence to the following def of the vreg it uses. - VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI != VRegDefs.end() && DefI->SU != SU) - DefI->SU->addPred(SDep(SU, SDep::Anti, Reg)); } /// Return true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) { - if (MI->isCall() || MI->hasUnmodeledSideEffects() || - (MI->hasOrderedMemoryRef() && - (!MI->mayLoad() || !MI->isInvariantLoad(AA)))) - return true; - return false; + return MI->isCall() || MI->hasUnmodeledSideEffects() || + (MI->hasOrderedMemoryRef() && + (!MI->mayLoad() || !MI->isInvariantLoad(AA))); } // This MI might have either incomplete info, or known to be unsafe @@ -508,7 +559,7 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI, return false; } -/// This returns true if the two MIs need a chain edge betwee them. +/// This returns true if the two MIs need a chain edge between them. /// If these are not even memory operations, we still may need /// chain deps between them. The question really is - could /// these two MIs be reordered during scheduling from memory dependency @@ -670,7 +721,7 @@ static inline void addChainDependency(AliasAnalysis *AA, unsigned TrueMemOrderLatency = 0, bool isNormalMemory = false) { // If this is a false dependency, - // do not add the edge, but rememeber the rejected node. + // do not add the edge, but remember the rejected node. if (MIsNeedChainEdge(AA, MFI, DL, SUa->getInstr(), SUb->getInstr())) { SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier); Dep.setLatency(TrueMemOrderLatency); @@ -685,7 +736,7 @@ static inline void addChainDependency(AliasAnalysis *AA, } } -/// Create an SUnit for each real instruction, numbered in top-down toplological +/// Create an SUnit for each real instruction, numbered in top-down topological /// order. The instruction order A < B, implies that no edge exists from B to A. /// /// Map each real instruction to its SUnit. @@ -743,17 +794,44 @@ void ScheduleDAGInstrs::initSUnits() { } } +void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) { + const MachineInstr *MI = SU->getInstr(); + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (!MO.readsReg()) + continue; + if (TrackLaneMasks && !MO.isUse()) + continue; + + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + // Record this local VReg use. 
+ VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); + for (; UI != VRegUses.end(); ++UI) { + if (UI->SU == SU) + break; + } + if (UI == VRegUses.end()) + VRegUses.insert(VReg2SUnit(Reg, 0, SU)); + } +} + /// If RegPressure is non-null, compute register pressure as a side effect. The /// DAG builder is an efficient place to do it because it already visits /// operands. void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker, - PressureDiffs *PDiffs) { + PressureDiffs *PDiffs, + bool TrackLaneMasks) { const TargetSubtargetInfo &ST = MF.getSubtarget(); bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI : ST.useAA(); AliasAnalysis *AAForDep = UseAA ? AA : nullptr; + this->TrackLaneMasks = TrackLaneMasks; MISUnitMap.clear(); ScheduleDAG::clearDAG(); @@ -766,7 +844,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // We build scheduling units by walking a block's instruction list from bottom // to top. - // Remember where a generic side-effecting instruction is as we procede. + // Remember where a generic side-effecting instruction is as we proceed. SUnit *BarrierChain = nullptr, *AliasChain = nullptr; // Memory references to specific known memory locations are tracked @@ -787,10 +865,14 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.setUniverse(TRI->getNumRegs()); Uses.setUniverse(TRI->getNumRegs()); - assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs"); + assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs"); + assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses"); + unsigned NumVirtRegs = MRI.getNumVirtRegs(); + CurrentVRegDefs.setUniverse(NumVirtRegs); + CurrentVRegUses.setUniverse(NumVirtRegs); + VRegUses.clear(); - VRegDefs.setUniverse(MRI.getNumVirtRegs()); - VRegUses.setUniverse(MRI.getNumVirtRegs()); + VRegUses.setUniverse(NumVirtRegs); // Model data dependencies between instructions being scheduled and the // ExitSU. @@ -814,10 +896,16 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, assert(SU && "No SUnit mapped to this MI"); if (RPTracker) { - PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : nullptr; - RPTracker->recede(/*LiveUses=*/nullptr, PDiff); - assert(RPTracker->getPos() == std::prev(MII) && - "RPTracker can't find MI"); + collectVRegUses(SU); + + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI); + if (PDiffs != nullptr) + PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI); + + RPTracker->recedeSkipDebugValues(); + assert(&*RPTracker->getPos() == MI && "RPTracker in sync"); + RPTracker->recede(RegOpers); } assert( @@ -835,7 +923,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, if (TRI->isPhysicalRegister(Reg)) addPhysRegDeps(SU, j); else { - assert(!IsPostRA && "Virtual register encountered!"); if (MO.isDef()) { HasVRegDef = true; addVRegDefDeps(SU, j); @@ -890,7 +977,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, BarrierChain = SU; // This is a barrier event that acts as a pivotal node in the DAG, // so it is safe to clear list of exposed nodes. 
- adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes, + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes, TrueMemOrderLatency); RejectMemNodes.clear(); NonAliasMemDefs.clear(); @@ -903,27 +990,30 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, unsigned ChainLatency = 0; if (AliasChain->getInstr()->mayLoad()) ChainLatency = TrueMemOrderLatency; - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain, RejectMemNodes, ChainLatency); } AliasChain = SU; for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, PendingLoads[k], RejectMemNodes, TrueMemOrderLatency); for (MapVector<ValueType, std::vector<SUnit *> >::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes); } for (MapVector<ValueType, std::vector<SUnit *> >::iterator I = AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes, TrueMemOrderLatency); } - adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes, + // This call must come after calls to addChainDependency() since it + // consumes the 'RejectMemNodes' list that addChainDependency() possibly + // adds to. + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes, TrueMemOrderLatency); PendingLoads.clear(); AliasMemDefs.clear(); @@ -937,7 +1027,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, BarrierChain->addPred(SDep(SU, SDep::Barrier)); UnderlyingObjectsVector Objs; - getUnderlyingObjectsForInstr(MI, MFI, Objs, *TM.getDataLayout()); + getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout()); if (Objs.empty()) { // Treat all other stores conservatively. @@ -961,7 +1051,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes, 0, true); // If we're not using AA, then we only need one store per object. @@ -986,7 +1076,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end()); if (J != JE) { for (unsigned i = 0, e = J->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, J->second[i], RejectMemNodes, TrueMemOrderLatency, true); J->second.clear(); @@ -996,15 +1086,18 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // Add dependencies from all the PendingLoads, i.e. loads // with no underlying object. 
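The mechanical change repeated throughout this hunk and the next: the DataLayout is taken from the MachineFunction instead of being dereferenced off the TargetMachine, so the per-function layout is consulted directly. A one-function sketch of the replacement (LLVM API of this vintage; MF is assumed to be in scope in the real call sites):

#include "llvm/CodeGen/MachineFunction.h"

const llvm::DataLayout &layoutFor(const llvm::MachineFunction &MF) {
  return MF.getDataLayout(); // was: *TM.getDataLayout()
}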
for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, PendingLoads[k], RejectMemNodes, TrueMemOrderLatency); // Add dependence on alias chain, if needed. if (AliasChain) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain, RejectMemNodes); } - adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, RejectMemNodes, + // This call must come after calls to addChainDependency() since it + // consumes the 'RejectMemNodes' list that addChainDependency() possibly + // adds to. + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, RejectMemNodes, TrueMemOrderLatency); } else if (MI->mayLoad()) { bool MayAlias = true; @@ -1012,7 +1105,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // Invariant load, no chain dependencies needed! } else { UnderlyingObjectsVector Objs; - getUnderlyingObjectsForInstr(MI, MFI, Objs, *TM.getDataLayout()); + getUnderlyingObjectsForInstr(MI, MFI, Objs, MF.getDataLayout()); if (Objs.empty()) { // A load with no underlying object. Depend on all @@ -1020,7 +1113,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, for (MapVector<ValueType, std::vector<SUnit *> >::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes); PendingLoads.push_back(SU); @@ -1044,20 +1137,23 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) for (unsigned i = 0, e = I->second.size(); i != e; ++i) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, I->second[i], RejectMemNodes, 0, true); if (ThisMayAlias) AliasMemUses[V].push_back(SU); else NonAliasMemUses[V].push_back(SU); } - if (MayAlias) - adjustChainDeps(AA, MFI, *TM.getDataLayout(), SU, &ExitSU, - RejectMemNodes, /*Latency=*/0); // Add dependencies on alias and barrier chains, if needed. if (MayAlias && AliasChain) - addChainDependency(AAForDep, MFI, *TM.getDataLayout(), SU, AliasChain, + addChainDependency(AAForDep, MFI, MF.getDataLayout(), SU, AliasChain, RejectMemNodes); + if (MayAlias) + // This call must come after calls to addChainDependency() since it + // consumes the 'RejectMemNodes' list that addChainDependency() + // possibly adds to. + adjustChainDeps(AA, MFI, MF.getDataLayout(), SU, &ExitSU, + RejectMemNodes, /*Latency=*/0); if (BarrierChain) BarrierChain->addPred(SDep(SU, SDep::Barrier)); } @@ -1068,7 +1164,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, Defs.clear(); Uses.clear(); - VRegDefs.clear(); + CurrentVRegDefs.clear(); + CurrentVRegUses.clear(); PendingLoads.clear(); } @@ -1080,11 +1177,9 @@ void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) { // Examine the live-in regs of all successors. for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) { - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), - E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; + for (const auto &LI : (*SI)->liveins()) { // Repeat, for reg and all subregs. 
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); + for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) LiveRegs.set(*SubRegs); } @@ -1103,7 +1198,7 @@ static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg, // Once we set a kill flag on an instruction, we bail out, as otherwise we // might set it on too many operands. We will clear as many flags as we // can though. - MachineBasicBlock::instr_iterator Begin = MI; + MachineBasicBlock::instr_iterator Begin = MI->getIterator(); MachineBasicBlock::instr_iterator End = getBundleEnd(MI); while (Begin != End) { for (MachineOperand &MO : (--End)->operands()) { @@ -1237,7 +1332,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { toggleKillFlag(MI, MO); DEBUG(MI->dump()); DEBUG(if (MI->getOpcode() == TargetOpcode::BUNDLE) { - MachineBasicBlock::instr_iterator Begin = MI; + MachineBasicBlock::instr_iterator Begin = MI->getIterator(); MachineBasicBlock::instr_iterator End = getBundleEnd(MI); while (++Begin != End) DEBUG(Begin->dump()); diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp index b2e4617..1150d26 100644 --- a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -43,9 +43,12 @@ namespace llvm { return (Node->NumPreds > 10 || Node->NumSuccs > 10); } - static bool hasNodeAddressLabel(const SUnit *Node, - const ScheduleDAG *Graph) { - return true; + static std::string getNodeIdentifierLabel(const SUnit *Node, + const ScheduleDAG *Graph) { + std::string R; + raw_string_ostream OS(R); + OS << static_cast<const void *>(Node); + return R; } /// If you want to override the dot attributes printed for a particular diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3b29306..c741982 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -156,13 +156,16 @@ namespace { void deleteAndRecombine(SDNode *N); bool recursivelyDeleteUnusedNodes(SDNode *N); + /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo = true); + /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) { return CombineTo(N, &Res, 1, AddTo); } + /// Replaces all uses of the results of one DAG node with new values. 
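Another recurring migration visible above: MachineInstr no longer converts implicitly to an instruction-list iterator, so the iterator must be requested explicitly. A minimal sketch of the new spelling (LLVM API of this vintage; MI is assumed valid and linked into a block):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

llvm::MachineBasicBlock::instr_iterator instrIterFor(llvm::MachineInstr *MI) {
  return MI->getIterator(); // was: implicit MachineInstr* -> iterator conversion
}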
SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true) { SDValue To[] = { Res0, Res1 }; @@ -233,18 +236,17 @@ namespace { SDValue visitADDE(SDNode *N); SDValue visitSUBE(SDNode *N); SDValue visitMUL(SDNode *N); + SDValue useDivRem(SDNode *N); SDValue visitSDIV(SDNode *N); SDValue visitUDIV(SDNode *N); - SDValue visitSREM(SDNode *N); - SDValue visitUREM(SDNode *N); + SDValue visitREM(SDNode *N); SDValue visitMULHU(SDNode *N); SDValue visitMULHS(SDNode *N); SDValue visitSMUL_LOHI(SDNode *N); SDValue visitUMUL_LOHI(SDNode *N); SDValue visitSMULO(SDNode *N); SDValue visitUMULO(SDNode *N); - SDValue visitSDIVREM(SDNode *N); - SDValue visitUDIVREM(SDNode *N); + SDValue visitIMINMAX(SDNode *N); SDValue visitAND(SDNode *N); SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitOR(SDNode *N); @@ -265,6 +267,7 @@ namespace { SDValue visitVSELECT(SDNode *N); SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); + SDValue visitSETCCE(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); @@ -298,6 +301,10 @@ namespace { SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); + + SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); + SDValue replaceStoreOfFPConstant(StoreSDNode *ST); + SDValue visitSTORE(SDNode *N); SDValue visitINSERT_VECTOR_ELT(SDNode *N); SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); @@ -312,9 +319,11 @@ namespace { SDValue visitMGATHER(SDNode *N); SDValue visitMSCATTER(SDNode *N); SDValue visitFP_TO_FP16(SDNode *N); + SDValue visitFP16_TO_FP(SDNode *N); SDValue visitFADDForFMACombine(SDNode *N); SDValue visitFSUBForFMACombine(SDNode *N); + SDValue visitFMULForFMACombine(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS); @@ -338,14 +347,17 @@ namespace { unsigned HiOp); SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue CombineExtLoad(SDNode *N); + SDValue combineRepeatedFPDivisors(SDNode *N); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); - SDValue BuildReciprocalEstimate(SDValue Op); - SDValue BuildRsqrtEstimate(SDValue Op); - SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations); - SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations); + SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags); + SDValue BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags); + SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations, + SDNodeFlags *Flags); + SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations, + SDNodeFlags *Flags); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); @@ -374,6 +386,10 @@ namespace { /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); + /// Do FindBetterChain for a store and any possibly adjacent stores on + /// consecutive chains. + bool findBetterNeighborChains(StoreSDNode *St); + /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. 
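The visitor-table hunk above folds SREM/UREM into a single visitREM and routes the four integer min/max opcodes to one visitIMINMAX; inside the shared visitor the opcode still distinguishes signedness where it matters. The fall-through grouping in miniature, with a stand-in enum:

#include <cassert>

enum Opcode { SREM, UREM, SMIN, SMAX, UMIN, UMAX };

int dispatch(Opcode Opc) {
  switch (Opc) {
  case SREM:
  case UREM:
    return 1; // visitREM(N): shared logic, branching on Opc internally
  case SMIN:
  case SMAX:
  case UMIN:
  case UMAX:
    return 2; // visitIMINMAX(N)
  }
  return 0;
}

int main() {
  assert(dispatch(SREM) == dispatch(UREM));
  assert(dispatch(SMIN) == dispatch(UMAX));
}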
struct MemOpLink { @@ -388,19 +404,37 @@ namespace { unsigned SequenceNum; }; + /// This is a helper function for visitMUL to check the profitability + /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). + /// MulNode is the original multiply, AddNode is (add x, c1), + /// and ConstNode is c2. + bool isMulAddWithConstProfitable(SDNode *MulNode, + SDValue &AddNode, + SDValue &ConstNode); + /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a /// constant build_vector of the stored constant values in Stores. SDValue getMergedConstantVectorStore(SelectionDAG &DAG, SDLoc SL, ArrayRef<MemOpLink> Stores, + SmallVectorImpl<SDValue> &Chains, EVT Ty) const; + /// This is a helper function for visitAND and visitZERO_EXTEND. Returns + /// true if the (and (load x) c) pattern matches an extload. ExtVT returns + /// the type of the loaded value to be extended. LoadedVT returns the type + /// of the original loaded value. NarrowLoad returns whether the load would + /// need to be narrowed in order to match. + bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, + EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, + bool &NarrowLoad); + /// This is a helper function for MergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store. /// \return True if a merged store was created. bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, - EVT MemVT, unsigned NumElem, + EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector); /// This is a helper function for MergeConsecutiveStores. @@ -409,7 +443,7 @@ namespace { void getStoreMergeAndAliasCandidates( StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes); - + /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return True if some memory operations were changed. @@ -427,9 +461,7 @@ namespace { DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { - auto *F = DAG.getMachineFunction().getFunction(); - ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) || - F->hasFnAttribute(Attribute::MinSize); + ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); } /// Runs the dag combiner on all nodes in the work list @@ -606,6 +638,9 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, assert(Op.hasOneUse() && "Unknown reuse!"); assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); + + const SDNodeFlags *Flags = Op.getNode()->getFlags(); + switch (Op.getOpcode()) { default: llvm_unreachable("Unknown code"); case ISD::ConstantFP: { @@ -623,12 +658,12 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), - Op.getOperand(1)); + Op.getOperand(1), Flags); // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1), - Op.getOperand(0)); + Op.getOperand(0), Flags); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. 
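The DAGCombiner constructor hunk above replaces the two explicit attribute checks with Function::optForSize(), which folds OptimizeForSize and MinSize into a single query. A stand-in sketch of the predicate being consolidated (the real method lives on llvm::Function; this struct only models its logic):

#include <cassert>

struct FuncAttrs {
  bool OptimizeForSize = false;
  bool MinSize = false;
  // Either attribute implies optimizing for size.
  bool optForSize() const { return OptimizeForSize || MinSize; }
};

int main() {
  FuncAttrs F;
  F.MinSize = true;
  assert(F.optForSize());
}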
assert(Options.UnsafeFPMath); @@ -640,7 +675,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, // fold (fneg (fsub A, B)) -> (fsub B, A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - Op.getOperand(1), Op.getOperand(0)); + Op.getOperand(1), Op.getOperand(0), Flags); case ISD::FMUL: case ISD::FDIV: @@ -652,13 +687,13 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), - Op.getOperand(1)); + Op.getOperand(1), Flags); // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), GetNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, Depth+1)); + LegalOperations, Depth+1), Flags); case ISD::FP_EXTEND: case ISD::FSIN: @@ -1216,9 +1251,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) { LegalTypes = Level >= AfterLegalizeTypes; // Add all the dag nodes to the worklist. - for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) - AddToWorklist(I); + for (SDNode &Node : DAG.allnodes()) + AddToWorklist(&Node); // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted, and tracking any @@ -1333,16 +1367,18 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); - case ISD::SREM: return visitSREM(N); - case ISD::UREM: return visitUREM(N); + case ISD::SREM: + case ISD::UREM: return visitREM(N); case ISD::MULHU: return visitMULHU(N); case ISD::MULHS: return visitMULHS(N); case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); case ISD::SMULO: return visitSMULO(N); case ISD::UMULO: return visitUMULO(N); - case ISD::SDIVREM: return visitSDIVREM(N); - case ISD::UDIVREM: return visitUDIVREM(N); + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: return visitIMINMAX(N); case ISD::AND: return visitAND(N); case ISD::OR: return visitOR(N); case ISD::XOR: return visitXOR(N); @@ -1361,6 +1397,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::VSELECT: return visitVSELECT(N); case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); + case ISD::SETCCE: return visitSETCCE(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); @@ -1408,6 +1445,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); + case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); } return SDValue(); } @@ -1470,13 +1508,8 @@ SDValue DAGCombiner::combine(SDNode *N) { // Constant operands are canonicalized to RHS. 
if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) { SDValue Ops[] = {N1, N0}; - SDNode *CSENode; - if (const auto *BinNode = dyn_cast<BinaryWithFlagsSDNode>(N)) { - CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, - &BinNode->Flags); - } else { - CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops); - } + SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, + N->getFlags()); if (CSENode) return SDValue(CSENode, 0); } @@ -1595,26 +1628,6 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { return SDValue(N, 0); // Return N so it doesn't get rechecked! } -static bool isNullConstant(SDValue V) { - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); - return Const != nullptr && Const->isNullValue(); -} - -static bool isNullFPConstant(SDValue V) { - ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(V); - return Const != nullptr && Const->isZero() && !Const->isNegative(); -} - -static bool isAllOnesConstant(SDValue V) { - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); - return Const != nullptr && Const->isAllOnesValue(); -} - -static bool isOneConstant(SDValue V) { - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); - return Const != nullptr && Const->isOne(); -} - /// If \p N is a ConstantSDNode with isOpaque() == false, return it cast to a /// ConstantSDNode pointer; else return nullptr. static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { @@ -1721,22 +1734,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return SDValue(N, 0); // fold (a+b) -> (a|b) iff a and b share no bits. - if (VT.isInteger() && !VT.isVector()) { - APInt LHSZero, LHSOne; - APInt RHSZero, RHSOne; - DAG.computeKnownBits(N0, LHSZero, LHSOne); - - if (LHSZero.getBoolValue()) { - DAG.computeKnownBits(N1, RHSZero, RHSOne); - - // If all possibly-set bits on the LHS are clear on the RHS, return an OR. - // If all possibly-set bits on the RHS are clear on the LHS, return an OR. - if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero){ - if (!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1); - } - } - } + if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && + VT.isInteger() && !VT.isVector() && DAG.haveNoCommonBitsSet(N0, N1)) + return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1); // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && @@ -1971,31 +1971,26 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + SDLoc DL(N); // If the flag result is dead, turn this into an SUB. if (!N->hasAnyUseOfValue(1)) - return CombineTo(N, DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, N1), - DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), - MVT::Glue)); + return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, x) -> 0 + no borrow - if (N0 == N1) { - SDLoc DL(N); + if (N0 == N1) return CombineTo(N, DAG.getConstant(0, DL, VT), - DAG.getNode(ISD::CARRY_FALSE, DL, - MVT::Glue)); - } + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, 0) -> x + no borrow if (isNullConstant(N1)) - return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), - MVT::Glue)); + return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // Canonicalize (sub -1, x) -> ~x, i.e.
(xor x, -1) + no borrow if (isAllOnesConstant(N0)) - return CombineTo(N, DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0), - DAG.getNode(ISD::CARRY_FALSE, SDLoc(N), - MVT::Glue)); + return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), + DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); return SDValue(); } @@ -2130,14 +2125,15 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { } // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2) - if (N1IsConst && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() && - (isConstantSplatVector(N0.getOperand(1).getNode(), Val) || - isa<ConstantSDNode>(N0.getOperand(1)))) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, - DAG.getNode(ISD::MUL, SDLoc(N0), VT, - N0.getOperand(0), N1), - DAG.getNode(ISD::MUL, SDLoc(N1), VT, - N0.getOperand(1), N1)); + if (isConstantIntBuildVectorOrConstantInt(N1) && + N0.getOpcode() == ISD::ADD && + isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) && + isMulAddWithConstProfitable(N, N0, N1)) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, + DAG.getNode(ISD::MUL, SDLoc(N0), VT, + N0.getOperand(0), N1), + DAG.getNode(ISD::MUL, SDLoc(N1), VT, + N0.getOperand(1), N1)); // reassociate mul if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1)) @@ -2146,6 +2142,88 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { return SDValue(); } +/// Return true if divmod libcall is available. +static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, + const TargetLowering &TLI) { + RTLIB::Libcall LC; + switch (Node->getSimpleValueType(0).SimpleTy) { + default: return false; // No libcall for vector types. + case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; + case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; + case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; + case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; + case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; + } + + return TLI.getLibcallName(LC) != nullptr; +} + +/// Issue divrem if both quotient and remainder are needed. +SDValue DAGCombiner::useDivRem(SDNode *Node) { + if (Node->use_empty()) + return SDValue(); // This is a dead node, leave it alone. + + EVT VT = Node->getValueType(0); + if (!TLI.isTypeLegal(VT)) + return SDValue(); + + unsigned Opcode = Node->getOpcode(); + bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM); + + unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; + // If DIVREM is going to get expanded into a libcall, + // but there is no libcall available, then don't combine. + if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) && + !isDivRemLibcallAvailable(Node, isSigned, TLI)) + return SDValue(); + + // If div is legal, it's better to do the normal expansion + unsigned OtherOpcode = 0; + if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) { + OtherOpcode = isSigned ? ISD::SREM : ISD::UREM; + if (TLI.isOperationLegalOrCustom(Opcode, VT)) + return SDValue(); + } else { + OtherOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; + if (TLI.isOperationLegalOrCustom(OtherOpcode, VT)) + return SDValue(); + } + + SDValue Op0 = Node->getOperand(0); + SDValue Op1 = Node->getOperand(1); + SDValue combined; + for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), + UE = Op0.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User == Node || User->use_empty()) + continue; + // Convert the other matching node(s), too; + // otherwise, the DIVREM may get target-legalized into something + // target-specific that we won't be able to recognize. + unsigned UserOpc = User->getOpcode(); + if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) && + User->getOperand(0) == Op0 && + User->getOperand(1) == Op1) { + if (!combined) { + if (UserOpc == OtherOpcode) { + SDVTList VTs = DAG.getVTList(VT, VT); + combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1); + } else if (UserOpc == DivRemOpc) { + combined = SDValue(User, 0); + } else { + assert(UserOpc == Opcode); + continue; + } + } + if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV) + CombineTo(User, combined); + else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM) + CombineTo(User, combined.getValue(1)); + } + } + return combined; +} + SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -2156,26 +2234,26 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; + SDLoc DL(N); + // fold (sdiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque()) - return DAG.FoldConstantArithmetic(ISD::SDIV, SDLoc(N), VT, N0C, N1C); + return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C); // fold (sdiv X, 1) -> X if (N1C && N1C->isOne()) return N0; // fold (sdiv X, -1) -> 0-X - if (N1C && N1C->isAllOnesValue()) { - SDLoc DL(N); + if (N1C && N1C->isAllOnesValue()) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); - } + // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 if (!VT.isVector()) { if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::UDIV, SDLoc(N), N1.getValueType(), - N0, N1); + return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1); } // fold (sdiv X, pow2) -> simple ops after legalize @@ -2186,18 +2264,11 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { !cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact() && (N1C->getAPIntValue().isPowerOf2() || (-N1C->getAPIntValue()).isPowerOf2())) { - // If dividing by powers of two is cheap, then don't perform the following - // fold. - if (TLI.isPow2SDivCheap()) - return SDValue(); - // Target-specific implementation of sdiv x, pow2. - SDValue Res = BuildSDIVPow2(N); - if (Res.getNode()) + if (SDValue Res = BuildSDIVPow2(N)) return Res; unsigned lg2 = N1C->getAPIntValue().countTrailingZeros(); - SDLoc DL(N); // Splat the sign bit into the register SDValue SGN = @@ -2228,15 +2299,23 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { } // If integer divide is expensive and we satisfy the requirements, emit an - // alternate sequence. - if (N1C && !TLI.isIntDivCheap()) { - SDValue Op = BuildSDIV(N); - if (Op.getNode()) return Op; - } + // alternate sequence. Targets may check function attributes for size/speed + // trade-offs. 
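The visitSDIV path above (sdiv X, pow2) splats the sign bit into a rounding bias before shifting, so the arithmetic shift truncates toward zero the way sdiv requires. A scalar rendition in plain C++, with names of our own; it covers only the positive 2^k case, and assumes an arithmetic right shift on signed ints and k in [1, 31]:

    // Sketch of the shift sequence emitted for sdiv X, 2^k.
    #include <cassert>
    #include <cstdint>

    int32_t sdiv_pow2(int32_t X, unsigned k) {
      int32_t Sign = X >> 31;                     // splat the sign bit
      uint32_t Bias = uint32_t(Sign) >> (32 - k); // 2^k - 1 if X < 0, else 0
      return int32_t(uint32_t(X) + Bias) >> k;    // shift now rounds toward zero
    }

    int main() {
      for (int32_t X : {-37, -8, -1, 0, 5, 100})
        assert(sdiv_pow2(X, 3) == X / 8);
      return 0;
    }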
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue Op = BuildSDIV(N)) + return Op; + + // sdiv, srem -> sdivrem + // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true. + // Otherwise, we break the simplification logic in visitREM(). + if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue DivRem = useDivRem(N)) + return DivRem; // undef / X -> 0 if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // X / undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; @@ -2254,26 +2333,26 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; + SDLoc DL(N); + // fold (udiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) - if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, SDLoc(N), VT, + if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, N0C, N1C)) return Folded; // fold (udiv x, (1 << c)) -> x >>u c - if (N1C && !N1C->isOpaque() && N1C->getAPIntValue().isPowerOf2()) { - SDLoc DL(N); + if (N1C && !N1C->isOpaque() && N1C->getAPIntValue().isPowerOf2()) return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(N1C->getAPIntValue().logBase2(), DL, getShiftAmountTy(N0.getValueType()))); - } + // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (N1.getOpcode() == ISD::SHL) { if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) { if (SHC->getAPIntValue().isPowerOf2()) { EVT ADDVT = N1.getOperand(1).getValueType(); - SDLoc DL(N); SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), DAG.getConstant(SHC->getAPIntValue() @@ -2284,15 +2363,23 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { } } } + // fold (udiv x, c) -> alternate - if (N1C && !TLI.isIntDivCheap()) { - SDValue Op = BuildUDIV(N); - if (Op.getNode()) return Op; - } + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue Op = BuildUDIV(N)) + return Op; + + // udiv, urem -> udivrem + // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true. + // Otherwise, we break the simplification logic in visitREM().
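The payoff of the new useDivRem combine: when both the quotient and the remainder of the same operands are live, a single DIVREM node (or a single divmod-style libcall) replaces two separate divisions. A source-level illustration of the shape being targeted; the function name is ours, and std::div merely stands in for the fused operation:

    // Quotient and remainder of the same operands: one divrem, not two divs.
    #include <cstdio>
    #include <cstdlib>

    void print_min_sec(int total) {
      std::div_t qr = std::div(total, 60); // total/60 and total%60, computed together
      std::printf("%d min %d sec\n", qr.quot, qr.rem);
    }

    int main() {
      print_min_sec(125); // prints "2 min 5 sec"
      return 0;
    }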
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue DivRem = useDivRem(N)) + return DivRem; // undef / X -> 0 if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // X / undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; @@ -2300,102 +2387,83 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitSREM(SDNode *N) { +// handles ISD::SREM and ISD::UREM +SDValue DAGCombiner::visitREM(SDNode *N) { + unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + bool isSigned = (Opcode == ISD::SREM); + SDLoc DL(N); - // fold (srem c1, c2) -> c1%c2 + // fold (rem c1, c2) -> c1%c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) - if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::SREM, SDLoc(N), VT, - N0C, N1C)) + if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) return Folded; - // If we know the sign bits of both operands are zero, strength reduce to a - // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15 - if (!VT.isVector()) { - if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::UREM, SDLoc(N), VT, N0, N1); - } - // If X/C can be simplified by the division-by-constant logic, lower - // X%C to the equivalent of X-X/C*C. - if (N1C && !N1C->isNullValue()) { - SDValue Div = DAG.getNode(ISD::SDIV, SDLoc(N), VT, N0, N1); - AddToWorklist(Div.getNode()); - SDValue OptimizedDiv = combine(Div.getNode()); - if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { - SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, - OptimizedDiv, N1); - SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); - AddToWorklist(Mul.getNode()); - return Sub; + if (isSigned) { + // If we know the sign bits of both operands are zero, strength reduce to a + // urem instead. 
Handles (X & 0x0FFFFFFF) %s 16 -> X&15 + if (!VT.isVector()) { + if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } - } - - // undef % X -> 0 - if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); - // X % undef -> undef - if (N1.getOpcode() == ISD::UNDEF) - return N1; - - return SDValue(); -} - -SDValue DAGCombiner::visitUREM(SDNode *N) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - EVT VT = N->getValueType(0); - - // fold (urem c1, c2) -> c1%c2 - ConstantSDNode *N0C = isConstOrConstSplat(N0); - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (N0C && N1C) - if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UREM, SDLoc(N), VT, - N0C, N1C)) - return Folded; - // fold (urem x, pow2) -> (and x, pow2-1) - if (N1C && !N1C->isNullValue() && !N1C->isOpaque() && - N1C->getAPIntValue().isPowerOf2()) { - SDLoc DL(N); - return DAG.getNode(ISD::AND, DL, VT, N0, - DAG.getConstant(N1C->getAPIntValue() - 1, DL, VT)); - } - // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) - if (N1.getOpcode() == ISD::SHL) { - if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) { - if (SHC->getAPIntValue().isPowerOf2()) { - SDLoc DL(N); - SDValue Add = - DAG.getNode(ISD::ADD, DL, VT, N1, + } else { + // fold (urem x, pow2) -> (and x, pow2-1) + if (N1C && !N1C->isNullValue() && !N1C->isOpaque() && + N1C->getAPIntValue().isPowerOf2()) { + return DAG.getNode(ISD::AND, DL, VT, N0, + DAG.getConstant(N1C->getAPIntValue() - 1, DL, VT)); + } + // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) + if (N1.getOpcode() == ISD::SHL) { + if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) { + if (SHC->getAPIntValue().isPowerOf2()) { + SDValue Add = + DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT)); - AddToWorklist(Add.getNode()); - return DAG.getNode(ISD::AND, DL, VT, N0, Add); + AddToWorklist(Add.getNode()); + return DAG.getNode(ISD::AND, DL, VT, N0, Add); + } } } } + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. - if (N1C && !N1C->isNullValue()) { - SDValue Div = DAG.getNode(ISD::UDIV, SDLoc(N), VT, N0, N1); + // To avoid mangling nodes, this simplification requires that the combine() + // call for the speculative DIV must not cause a DIVREM conversion. We guard + // against this by skipping the simplification if isIntDivCheap(). When + // div is not cheap, combine will not return a DIVREM. Regardless, + // checking cheapness here makes sense since the simplification results in + // fatter code. + if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) { + unsigned DivOpcode = isSigned ? 
ISD::SDIV : ISD::UDIV; + SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1); AddToWorklist(Div.getNode()); SDValue OptimizedDiv = combine(Div.getNode()); if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) { - SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, - OptimizedDiv, N1); - SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul); + assert((OptimizedDiv.getOpcode() != ISD::UDIVREM) && + (OptimizedDiv.getOpcode() != ISD::SDIVREM)); + SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1); + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul); AddToWorklist(Mul.getNode()); return Sub; } } + // srem -> sdivrem, urem -> udivrem + if (SDValue DivRem = useDivRem(N)) + return DivRem.getValue(1); + // undef % X -> 0 if (N0.getOpcode() == ISD::UNDEF) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // X % undef -> undef if (N1.getOpcode() == ISD::UNDEF) return N1; @@ -2532,8 +2600,8 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, } SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS); - if (Res.getNode()) return Res; + if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS)) + return Res; EVT VT = N->getValueType(0); SDLoc DL(N); @@ -2563,8 +2631,8 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { } SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU); - if (Res.getNode()) return Res; + if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU)) + return Res; EVT VT = N->getValueType(0); SDLoc DL(N); @@ -2613,16 +2681,26 @@ SDValue DAGCombiner::visitUMULO(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitSDIVREM(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM); - if (Res.getNode()) return Res; +SDValue DAGCombiner::visitIMINMAX(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N)) + return FoldedVOp; - return SDValue(); -} + // fold (minmax c1, c2) -> c1 minmax c2 + ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); + ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); + if (N0C && N1C) + return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C); -SDValue DAGCombiner::visitUDIVREM(SDNode *N) { - SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM); - if (Res.getNode()) return Res; + // canonicalize constant to RHS + if (isConstantIntBuildVectorOrConstantInt(N0) && + !isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); return SDValue(); } @@ -2848,10 +2926,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, if (Result != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, - getSetCCResultType(N0.getSimpleValueType()))))) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); + TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { + EVT CCVT = getSetCCResultType(LL.getValueType()); + if (N0.getValueType() == CCVT || + (!LegalOperations && N0.getValueType() == MVT::i1)) + return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), + LL, LR, Result); + } } } @@ -2887,6 +2968,46 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, return SDValue(); } +bool
DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, + EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT, + bool &NarrowLoad) { + uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits(); + + if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue())) + return false; + + ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); + LoadedVT = LoadN->getMemoryVT(); + + if (ExtVT == LoadedVT && + (!LegalOperations || + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { + // ZEXTLOAD will match without needing to change the size of the value being + // loaded. + NarrowLoad = false; + return true; + } + + // Do not change the width of a volatile load. + if (LoadN->isVolatile()) + return false; + + // Do not generate loads of non-round integer types since these can + // be expensive (and would be wrong if the type is not byte sized). + if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound()) + return false; + + if (LegalOperations && + !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT)) + return false; + + if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT)) + return false; + + NarrowLoad = true; + return true; +} + SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -3079,16 +3200,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) { : cast<LoadSDNode>(N0); if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) { - uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits(); - if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){ - EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); - EVT LoadedVT = LN0->getMemoryVT(); - EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; - - if (ExtVT == LoadedVT && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, - ExtVT))) { - + auto NarrowLoad = false; + EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; + EVT ExtVT, LoadedVT; + if (isAndLoadExtLoad(N1C, LN0, LoadResultTy, ExtVT, LoadedVT, + NarrowLoad)) { + if (!NarrowLoad) { SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), LN0->getBasePtr(), ExtVT, @@ -3096,14 +3213,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { AddToWorklist(N); CombineTo(LN0, NewLoad, NewLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! - } - - // Do not change the width of a volatile load. - // Do not generate loads of non-round integer types since these can - // be expensive (and would be wrong if the type is not byte sized). - if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() && - (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, - ExtVT))) { + } else { EVT PtrType = LN0->getOperand(1).getValueType(); unsigned Alignment = LN0->getAlignment(); @@ -3142,10 +3252,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return Combined; // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) - if (N0.getOpcode() == N1.getOpcode()) { - SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); - if (Tmp.getNode()) return Tmp; - } + if (N0.getOpcode() == N1.getOpcode()) + if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) + return Tmp; // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. 
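isAndLoadExtLoad codifies a simple observation: masking a wide load down to its low bits observes the same value as a narrower zero-extending load of the same address. A hypothetical standalone check, assuming a little-endian host so that the first byte in memory is the low byte:

    // (and (load i32 p), 255) sees the same bits as (zextload i8 p).
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t word = 0xDEADBEEF;
      uint32_t masked = word & 0xFF;  // the AND-of-load form
      uint8_t narrow;
      std::memcpy(&narrow, &word, 1); // the narrow zero-extending load
      assert(masked == uint32_t(narrow));
      return 0;
    }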
@@ -3507,10 +3616,13 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) { if (Result != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && - TLI.isOperationLegal(ISD::SETCC, - getSetCCResultType(N0.getValueType()))))) - return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), - LL, LR, Result); + TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) { + EVT CCVT = getSetCCResultType(LL.getValueType()); + if (N0.getValueType() == CCVT || + (!LegalOperations && N0.getValueType() == MVT::i1)) + return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(), + LL, LR, Result); + } } } @@ -3665,11 +3777,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return Combined; // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) - SDValue BSwap = MatchBSwapHWord(N, N0, N1); - if (BSwap.getNode()) + if (SDValue BSwap = MatchBSwapHWord(N, N0, N1)) return BSwap; - BSwap = MatchBSwapHWordLow(N, N0, N1); - if (BSwap.getNode()) + if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1)) return BSwap; // reassociate or @@ -3690,10 +3800,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } } // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) - if (N0.getOpcode() == N1.getOpcode()) { - SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); - if (Tmp.getNode()) return Tmp; - } + if (N0.getOpcode() == N1.getOpcode()) + if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) + return Tmp; // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) @@ -3710,7 +3819,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { /// Match "(X shl/srl V1) & V2" where V2 may not be present. static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { if (Op.getOpcode() == ISD::AND) { - if (isa<ConstantSDNode>(Op.getOperand(1))) { + if (isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) { Mask = Op.getOperand(1); Op = Op.getOperand(0); } else { @@ -3727,105 +3836,106 @@ static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) { } // Return true if we can prove that, whenever Neg and Pos are both in the -// range [0, OpSize), Neg == (Pos == 0 ? 0 : OpSize - Pos). This means that +// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that // for two opposing shifts shift1 and shift2 and a value X with OpBits bits: // // (or (shift1 X, Neg), (shift2 X, Pos)) // // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate -// in direction shift1 by Neg. The range [0, OpSize) means that we only need +// in direction shift1 by Neg. The range [0, EltSize) means that we only need // to consider shift amounts with defined behavior. -static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned OpSize) { - // If OpSize is a power of 2 then: +static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) { + // If EltSize is a power of 2 then: // - // (a) (Pos == 0 ? 0 : OpSize - Pos) == (OpSize - Pos) & (OpSize - 1) - // (b) Neg == Neg & (OpSize - 1) whenever Neg is in [0, OpSize). + // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1) + // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize). // - // So if OpSize is a power of 2 and Neg is (and Neg', OpSize-1), we check + // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check // for the stronger condition: // - // Neg & (OpSize - 1) == (OpSize - Pos) & (OpSize - 1) [A] + // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A] // - // for all Neg and Pos. 
Since Neg & (OpSize - 1) == Neg' & (OpSize - 1) + // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1) // we can just replace Neg with Neg' for the rest of the function. // // In other cases we check for the even stronger condition: // - // Neg == OpSize - Pos [B] + // Neg == EltSize - Pos [B] // // for all Neg and Pos. Note that the (or ...) then invokes undefined - // behavior if Pos == 0 (and consequently Neg == OpSize). + // behavior if Pos == 0 (and consequently Neg == EltSize). // - // We could actually use [A] whenever OpSize is a power of 2, but the + // We could actually use [A] whenever EltSize is a power of 2, but the // only extra cases that it would match are those uninteresting ones // where Neg and Pos are never in range at the same time. E.g. for - // OpSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) + // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) // as well as (sub 32, Pos), but: // // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) // // always invokes undefined behavior for 32-bit X. // - // Below, Mask == OpSize - 1 when using [A] and is all-ones otherwise. + // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise. unsigned MaskLoBits = 0; - if (Neg.getOpcode() == ISD::AND && - isPowerOf2_64(OpSize) && - Neg.getOperand(1).getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Neg.getOperand(1))->getAPIntValue() == OpSize - 1) { - Neg = Neg.getOperand(0); - MaskLoBits = Log2_64(OpSize); + if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) { + if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) { + if (NegC->getAPIntValue() == EltSize - 1) { + Neg = Neg.getOperand(0); + MaskLoBits = Log2_64(EltSize); + } + } } // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. if (Neg.getOpcode() != ISD::SUB) - return 0; - ConstantSDNode *NegC = dyn_cast<ConstantSDNode>(Neg.getOperand(0)); + return false; + ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); if (!NegC) - return 0; + return false; SDValue NegOp1 = Neg.getOperand(1); - // On the RHS of [A], if Pos is Pos' & (OpSize - 1), just replace Pos with + // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with // Pos'. The truncation is redundant for the purpose of the equality. - if (MaskLoBits && - Pos.getOpcode() == ISD::AND && - Pos.getOperand(1).getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() == OpSize - 1) - Pos = Pos.getOperand(0); + if (MaskLoBits && Pos.getOpcode() == ISD::AND) + if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) + if (PosC->getAPIntValue() == EltSize - 1) + Pos = Pos.getOperand(0); // The condition we need is now: // - // (NegC - NegOp1) & Mask == (OpSize - Pos) & Mask + // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask // // If NegOp1 == Pos then we need: // - // OpSize & Mask == NegC & Mask + // EltSize & Mask == NegC & Mask // // (because "x & Mask" is a truncation and distributes through subtraction). APInt Width; if (Pos == NegOp1) Width = NegC->getAPIntValue(); + // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. 
// Then the condition we want to prove becomes: // - // (NegC - NegOp1) & Mask == (OpSize - (NegOp1 + PosC)) & Mask + // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask // // which, again because "x & Mask" is a truncation, becomes: // - // NegC & Mask == (OpSize - PosC) & Mask - // OpSize & Mask == (NegC + PosC) & Mask - else if (Pos.getOpcode() == ISD::ADD && - Pos.getOperand(0) == NegOp1 && - Pos.getOperand(1).getOpcode() == ISD::Constant) - Width = (cast<ConstantSDNode>(Pos.getOperand(1))->getAPIntValue() + - NegC->getAPIntValue()); - else + // NegC & Mask == (EltSize - PosC) & Mask + // EltSize & Mask == (NegC + PosC) & Mask + else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) { + if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) + Width = PosC->getAPIntValue() + NegC->getAPIntValue(); + else + return false; + } else return false; - // Now we just need to check that OpSize & Mask == Width & Mask. + // Now we just need to check that EltSize & Mask == Width & Mask. if (MaskLoBits) - // Opsize & Mask is 0 since Mask is Opsize - 1. + // EltSize & Mask is 0 since Mask is EltSize - 1. return Width.getLoBits(MaskLoBits) == 0; - return Width == OpSize; + return Width == EltSize; } // A subroutine of MatchRotate used once we have found an OR of two opposite @@ -3845,7 +3955,7 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, // (srl x, (*ext y))) -> // (rotr x, y) or (rotl x, (sub 32, y)) EVT VT = Shifted.getValueType(); - if (matchRotateSub(InnerPos, InnerNeg, VT.getSizeInBits())) { + if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) { bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, HasPos ? Pos : Neg).getNode(); @@ -3888,10 +3998,10 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); std::swap(LHSShift, RHSShift); - std::swap(LHSMask , RHSMask ); + std::swap(LHSMask, RHSMask); } - unsigned OpSizeInBits = VT.getSizeInBits(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue LHSShiftArg = LHSShift.getOperand(0); SDValue LHSShiftAmt = LHSShift.getOperand(1); SDValue RHSShiftArg = RHSShift.getOperand(0); @@ -3899,11 +4009,10 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - if (LHSShiftAmt.getOpcode() == ISD::Constant && - RHSShiftAmt.getOpcode() == ISD::Constant) { - uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue(); - uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue(); - if ((LShVal + RShVal) != OpSizeInBits) + if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) { + uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue(); + uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue(); + if ((LShVal + RShVal) != EltSizeInBits) return nullptr; SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, @@ -3911,18 +4020,23 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { // If there is an AND of either shifted operand, apply it to the result. 
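What matchRotateSub ultimately licenses: an OR of opposing shifts whose amounts sum to the element width is a rotate, and for power-of-two widths the negated amount may be masked with EltSize - 1, which also keeps the Pos == 0 case well defined. A scalar sketch (rotl is our helper, not an LLVM API):

    // (or (shl X, Pos), (srl X, (and (sub 32, Pos), 31))) == rotl(X, Pos)
    #include <cassert>
    #include <cstdint>

    uint32_t rotl(uint32_t X, unsigned Pos) {
      return (X << (Pos & 31)) | (X >> ((32 - Pos) & 31));
    }

    int main() {
      uint32_t X = 0x12345678;
      assert(rotl(X, 8) == 0x34567812);
      assert(rotl(X, 0) == X); // the & 31 mask avoids an undefined shift by 32
      return 0;
    }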
if (LHSMask.getNode() || RHSMask.getNode()) { - APInt Mask = APInt::getAllOnesValue(OpSizeInBits); + APInt AllBits = APInt::getAllOnesValue(EltSizeInBits); + SDValue Mask = DAG.getConstant(AllBits, DL, VT); if (LHSMask.getNode()) { - APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal); - Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits; + APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); + Mask = DAG.getNode(ISD::AND, DL, VT, Mask, + DAG.getNode(ISD::OR, DL, VT, LHSMask, + DAG.getConstant(RHSBits, DL, VT))); } if (RHSMask.getNode()) { - APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal); - Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits; + APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal); + Mask = DAG.getNode(ISD::AND, DL, VT, Mask, + DAG.getNode(ISD::OR, DL, VT, RHSMask, + DAG.getConstant(LHSBits, DL, VT))); } - Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, DL, VT)); + Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); } return Rot.getNode(); @@ -4112,10 +4226,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } // Simplify: xor (op x...), (op y...) -> (op (xor x, y)) - if (N0.getOpcode() == N1.getOpcode()) { - SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N); - if (Tmp.getNode()) return Tmp; - } + if (N0.getOpcode() == N1.getOpcode()) + if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) + return Tmp; // Simplify the expression using non-local knowledge. if (!VT.isVector() && @@ -4434,12 +4547,19 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1); } - if (N1C && !N1C->isOpaque()) { - SDValue NewSHL = visitShiftByConstant(N, N1C); - if (NewSHL.getNode()) - return NewSHL; + // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) + if (N1C && N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse()) { + if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { + if (SDValue Folded = + DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, N0C1, N1C)) + return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Folded); + } } + if (N1C && !N1C->isOpaque()) + if (SDValue NewSHL = visitShiftByConstant(N, N1C)) + return NewSHL; + return SDValue(); } @@ -4583,11 +4703,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); - if (N1C && !N1C->isOpaque()) { - SDValue NewSRA = visitShiftByConstant(N, N1C); - if (NewSRA.getNode()) + if (N1C && !N1C->isOpaque()) + if (SDValue NewSRA = visitShiftByConstant(N, N1C)) return NewSRA; - } return SDValue(); } @@ -4744,8 +4862,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { - SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()); - if (NewOp1.getNode()) + if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); } @@ -4754,15 +4871,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); - if (N1C && !N1C->isOpaque()) { - SDValue NewSRL = visitShiftByConstant(N, N1C); - if (NewSRL.getNode()) + if (N1C && !N1C->isOpaque()) + if (SDValue NewSRL = visitShiftByConstant(N, N1C)) return NewSRL; - } // Attempt to convert a srl of a load into a narrower zero-extending load. 
- SDValue NarrowLoad = ReduceLoadWidth(N); - if (NarrowLoad.getNode()) + if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // Here is a common situation. We want to optimize: @@ -4973,70 +5087,47 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. - // fold selects based on a setcc into other things, such as min/max/abs - if (N0.getOpcode() == ISD::SETCC) { - // select x, y (fcmp lt x, y) -> fminnum x, y - // select x, y (fcmp gt x, y) -> fmaxnum x, y - // - // This is OK if we don't care about what happens if either operand is a - // NaN. - // - - // FIXME: Instead of testing for UnsafeFPMath, this should be checking for - // no signed zeros as well as no nans. - const TargetOptions &Options = DAG.getTarget().Options; - if (Options.UnsafeFPMath && - VT.isFloatingPoint() && N0.hasOneUse() && - DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { - ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - - SDValue FMinMax = - combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), - N1, N2, CC, TLI, DAG); - if (FMinMax) - return FMinMax; - } - - if ((!LegalOperations && - TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || - TLI.isOperationLegal(ISD::SELECT_CC, VT)) - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), - N1, N2, N0.getOperand(2)); - return SimplifySelect(SDLoc(N), N0, N1, N2); - } - if (VT0 == MVT::i1) { - if (TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) { - // select (and Cond0, Cond1), X, Y - // -> select Cond0, (select Cond1, X, Y), Y - if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { - SDValue Cond0 = N0->getOperand(0); - SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), - N1.getValueType(), Cond1, N1, N2); + // The code in this block deals with the following 2 equivalences: + // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) + // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) + // The target can specify its preferred form with the + // shouldNormalizeToSelectSequence() callback. However, we always transform + // to the right form if the inner select already exists in the DAG, and we + // always transform to the left form if we know that the combined condition + // can be optimized further.
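Both equivalences noted above hold for all boolean conditions, which is what lets the combiner pick whichever shape shouldNormalizeToSelectSequence() or an already-existing inner select favors. A small exhaustive check:

    // select(C0&C1, x, y) and select(C0|C1, x, y) against their nested forms.
    #include <cassert>

    int main() {
      int x = 1, y = 2;
      for (bool C0 : {false, true})
        for (bool C1 : {false, true}) {
          assert(((C0 && C1) ? x : y) == (C0 ? (C1 ? x : y) : y));
          assert(((C0 || C1) ? x : y) == (C0 ? x : (C1 ? x : y)));
        }
      return 0;
    }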
+ bool normalizeToSequence + = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); + // select (and Cond0, Cond1), X, Y + // -> select Cond0, (select Cond1, X, Y), Y + if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), + N1.getValueType(), Cond1, N1, N2); + if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, InnerSelect, N2); - } - // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) - if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { - SDValue Cond0 = N0->getOperand(0); - SDValue Cond1 = N0->getOperand(1); - SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), - N1.getValueType(), Cond1, N1, N2); + } + // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) + if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N), + N1.getValueType(), Cond1, N1, N2); + if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N1, InnerSelect); - } } // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y - if (N1->getOpcode() == ISD::SELECT) { + if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) { SDValue N1_0 = N1->getOperand(0); SDValue N1_1 = N1->getOperand(1); SDValue N1_2 = N1->getOperand(2); if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { // Create the actual and node if we can generate good code for it. - if (!TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) { + if (!normalizeToSequence) { SDValue And = DAG.getNode(ISD::AND, SDLoc(N), N0.getValueType(), N0, N1_0); return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), And, @@ -5049,13 +5140,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { } } // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y - if (N2->getOpcode() == ISD::SELECT) { + if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) { SDValue N2_0 = N2->getOperand(0); SDValue N2_1 = N2->getOperand(1); SDValue N2_2 = N2->getOperand(2); if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { // Create the actual or node if we can generate good code for it. - if (!TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT)) { + if (!normalizeToSequence) { SDValue Or = DAG.getNode(ISD::OR, SDLoc(N), N0.getValueType(), N0, N2_0); return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Or, @@ -5069,6 +5160,38 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { } } + // fold selects based on a setcc into other things, such as min/max/abs + if (N0.getOpcode() == ISD::SETCC) { + // select x, y (fcmp lt x, y) -> fminnum x, y + // select x, y (fcmp gt x, y) -> fmaxnum x, y + // + // This is OK if we don't care about what happens if either operand is a + // NaN. + // + + // FIXME: Instead of testing for UnsafeFPMath, this should be checking for + // no signed zeros as well as no nans. 
+ const TargetOptions &Options = DAG.getTarget().Options; + if (Options.UnsafeFPMath && + VT.isFloatingPoint() && N0.hasOneUse() && + DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + + if (SDValue FMinMax = combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), + N0.getOperand(1), N1, N2, CC, + TLI, DAG)) + return FMinMax; + } + + if ((!LegalOperations && + TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || + TLI.isOperationLegal(ISD::SELECT_CC, VT)) + return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), + N1, N2, N0.getOperand(2)); + return SimplifySelect(SDLoc(N), N0, N1, N2); + } + return SDValue(); } @@ -5523,8 +5646,7 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { if (N1.getOpcode() == ISD::CONCAT_VECTORS && N2.getOpcode() == ISD::CONCAT_VECTORS && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { - SDValue CV = ConvertSelectToConcatVector(N, DAG); - if (CV.getNode()) + if (SDValue CV = ConvertSelectToConcatVector(N, DAG)) return CV; } @@ -5580,7 +5702,20 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) { SDLoc(N)); } -/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or +SDValue DAGCombiner::visitSETCCE(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + + // If Carry is false, fold to a regular SETCC. + if (Carry.getOpcode() == ISD::CARRY_FALSE) + return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); + + return SDValue(); +} + +/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or /// a build_vector of constants. /// This function is called by the DAGCombiner when visiting sext/zext/aext /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). 
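The constant fold this doc comment describes is ordinary two's-complement extension of the node's underlying APInt. In plain integer terms:

    // sext and zext of a small constant, spelled with fixed-width ints.
    #include <cassert>
    #include <cstdint>

    int main() {
      int8_t c = -1;                        // i8 0xFF
      assert(int32_t(c) == -1);             // sext i8 -1 -> i32 -1
      assert(uint32_t(uint8_t(c)) == 255u); // zext i8 -1 -> i32 255
      return 0;
    }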
@@ -5837,8 +5972,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n))) - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -6024,7 +6158,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (!VT.isVector()) { EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType()); - if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, SetCCVT)) { + if (!LegalOperations || + TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) { SDLoc DL(N); ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); SDValue SetCC = DAG.getSetCC(DL, SetCCVT, @@ -6120,8 +6255,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -6133,32 +6267,45 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } // fold (zext (truncate x)) -> (and x, mask) - if (N0.getOpcode() == ISD::TRUNCATE && - (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) { - + if (N0.getOpcode() == ISD::TRUNCATE) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n))) - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { - SDNode* oye = N0.getNode()->getOperand(0).getNode(); + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { + SDNode *oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return SDValue(N, 0); // Return N so it doesn't get rechecked! } - SDValue Op = N0.getOperand(0); - if (Op.getValueType().bitsLT(VT)) { - Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } else if (Op.getValueType().bitsGT(VT)) { - Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); + EVT SrcVT = N0.getOperand(0).getValueType(); + EVT MinVT = N0.getValueType(); + + // Try to mask before the extension to avoid having to generate a larger mask, + // possibly over several sub-vectors. 
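The (zext (truncate x)) -> (and x, mask) family, including the new mask-before-extend variant above, preserves exactly the low MinVT bits of the source. A scalar check, assuming i8/i32 for concreteness:

    // Truncate-then-zero-extend is the same as masking the low bits.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xCAFEBABE;
      assert(uint32_t(uint8_t(x)) == (x & 0xFF));        // zext(trunc x) == and x, 255
      assert(uint32_t(uint8_t(x & 0xFF)) == (x & 0xFF)); // masking first changes nothing
      return 0;
    }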
+ if (SrcVT.bitsLT(VT)) { + if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) && + TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) { + SDValue Op = N0.getOperand(0); + Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); + AddToWorklist(Op.getNode()); + return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); + } + } + + if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { + SDValue Op = N0.getOperand(0); + if (SrcVT.bitsLT(VT)) { + Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); + AddToWorklist(Op.getNode()); + } else if (SrcVT.bitsGT(VT)) { + Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); + AddToWorklist(Op.getNode()); + } + return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); } - return DAG.getZeroExtendInReg(Op, SDLoc(N), - N0.getValueType().getScalarType()); } // Fold (zext (and (trunc x), cst)) -> (and x, cst), @@ -6219,6 +6366,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (and/or/xor (load x), cst)) -> // (and/or/xor (zextload x), (zext cst)) + // Unless (and (load x) cst) will match as a zextload already and has + // additional users. if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa<LoadSDNode>(N0.getOperand(0)) && @@ -6229,9 +6378,20 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector<SDNode*, 4> SetCCs; - if (!N0.hasOneUse()) - DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND, - SetCCs, TLI); + if (!N0.hasOneUse()) { + if (N0.getOpcode() == ISD::AND) { + auto *AndC = cast<ConstantSDNode>(N0.getOperand(1)); + auto NarrowLoad = false; + EVT LoadResultTy = AndC->getValueType(0); + EVT ExtVT, LoadedVT; + if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT, LoadedVT, + NarrowLoad)) + DoXform = false; + } + if (DoXform) + DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), + ISD::ZERO_EXTEND, SetCCs, TLI); + } if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), @@ -6378,8 +6538,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue NarrowLoad = ReduceLoadWidth(N0.getNode()); - if (NarrowLoad.getNode()) { + if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode* oye = N0.getNode()->getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); @@ -6546,8 +6705,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { // Watch out for shift count overflow though. if (Amt >= Mask.getBitWidth()) break; APInt NewMask = Mask << Amt; - SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask); - if (SimplifyLHS.getNode()) + if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask)) return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, V.getOperand(1)); } @@ -6685,9 +6843,13 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { uint64_t PtrOff = ShAmt / 8; unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDLoc DL(LN0); + // The original load itself didn't wrap, so an offset within it doesn't. 
+ SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, LN0->getBasePtr(), - DAG.getConstant(PtrOff, DL, PtrType)); + DAG.getConstant(PtrOff, DL, PtrType), + &Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; @@ -6736,8 +6898,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { unsigned VTBits = VT.getScalarType().getSizeInBits(); unsigned EVTBits = EVT.getScalarType().getSizeInBits(); + if (N0.isUndef()) + return DAG.getUNDEF(VT); + // fold (sext_in_reg c1) -> c1 - if (isa<ConstantSDNode>(N0) || N0.getOpcode() == ISD::UNDEF) + if (isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); // If the input is already sign extended, just drop the extension. @@ -6771,8 +6936,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_in_reg (load x)) -> (smaller sextload x) // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits)) - SDValue NarrowLoad = ReduceLoadWidth(N); - if (NarrowLoad.getNode()) + if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24) @@ -6831,29 +6995,6 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { BSwap, N1); } - // Fold a sext_inreg of a build_vector of ConstantSDNodes or undefs - // into a build_vector. - if (ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { - SmallVector<SDValue, 8> Elts; - unsigned NumElts = N0->getNumOperands(); - unsigned ShAmt = VTBits - EVTBits; - - for (unsigned i = 0; i != NumElts; ++i) { - SDValue Op = N0->getOperand(i); - if (Op->getOpcode() == ISD::UNDEF) { - Elts.push_back(Op); - continue; - } - - ConstantSDNode *CurrentND = cast<ConstantSDNode>(Op); - const APInt &C = APInt(VTBits, CurrentND->getAPIntValue().getZExtValue()); - Elts.push_back(DAG.getConstant(C.shl(ShAmt).ashr(ShAmt).getZExtValue(), - SDLoc(Op), Op.getValueType())); - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Elts); - } - return SDValue(); } @@ -6999,9 +7140,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { - SDValue Reduced = ReduceLoadWidth(N); - if (Reduced.getNode()) + if (SDValue Reduced = ReduceLoadWidth(N)) return Reduced; + // Handle the case where the load remains an extending load even // after truncation. if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { @@ -7107,6 +7248,12 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { return SDValue(); } +static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) { + // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi + // and Lo parts; on big-endian machines it doesn't. + return DAG.getDataLayout().isBigEndian() ? 
1 : 0; +} + SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -7173,6 +7320,15 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) + // + // For ppc_fp128: + // fold (bitcast (fneg x)) -> + // flipbit = signbit + // (xor (bitcast x) (build_pair flipbit, flipbit)) + // + // fold (bitcast (fabs x)) -> + // flipbit = (and (extract_element (bitcast x), 0), signbit) + // (xor (bitcast x) (build_pair flipbit, flipbit)) // This often reduces constant pool loads. if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && @@ -7183,6 +7339,29 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(NewConv.getNode()); SDLoc DL(N); + if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { + assert(VT.getSizeInBits() == 128); + SDValue SignBit = DAG.getConstant( + APInt::getSignBit(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); + SDValue FlipBit; + if (N0.getOpcode() == ISD::FNEG) { + FlipBit = SignBit; + AddToWorklist(FlipBit.getNode()); + } else { + assert(N0.getOpcode() == ISD::FABS); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv, + DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), + SDLoc(NewConv))); + AddToWorklist(Hi.getNode()); + FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit); + AddToWorklist(FlipBit.getNode()); + } + SDValue FlipBits = + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); + AddToWorklist(FlipBits.getNode()); + return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); + } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, @@ -7196,6 +7375,13 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // (or (and (bitconvert x), sign), (and cst, (not sign))) // Note that we don't handle (copysign x, cst) because this can always be // folded to an fneg or fabs. 
+ // + // For ppc_fp128: + // fold (bitcast (fcopysign cst, x)) -> + // flipbit = (and (extract_element + // (xor (bitcast cst), (bitcast x)), 0), + // signbit) + // (xor (bitcast cst) (build_pair flipbit, flipbit)) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(0)) && VT.isInteger() && !VT.isVector()) { @@ -7224,6 +7410,30 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { AddToWorklist(X.getNode()); } + if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { + APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2); + SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(0)), VT, + N0.getOperand(0)); + AddToWorklist(Cst.getNode()); + SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0.getOperand(1)), VT, + N0.getOperand(1)); + AddToWorklist(X.getNode()); + SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X); + AddToWorklist(XorResult.getNode()); + SDValue XorResult64 = DAG.getNode( + ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult, + DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), + SDLoc(XorResult))); + AddToWorklist(XorResult64.getNode()); + SDValue FlipBit = + DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64, + DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64)); + AddToWorklist(FlipBit.getNode()); + SDValue FlipBits = + DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); + AddToWorklist(FlipBits.getNode()); + return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); + } APInt SignBit = APInt::getSignBit(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, SDLoc(X), VT)); @@ -7240,11 +7450,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { } // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. - if (N0.getOpcode() == ISD::BUILD_PAIR) { - SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT); - if (CombineLD.getNode()) + if (N0.getOpcode() == ISD::BUILD_PAIR) + if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) return CombineLD; - } // Remove double bitcasts from shuffles - this is often a legacy of // XformToShuffleWithZero being used to combine bitmaskings (of @@ -7257,10 +7465,10 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0); // If operands are a bitcast, peek through if it casts the original VT. - // If operands are a UNDEF or constant, just bitcast back to original VT. + // If operands are a constant, just bitcast back to original VT. auto PeekThroughBitcast = [&](SDValue Op) { if (Op.getOpcode() == ISD::BITCAST && - Op.getOperand(0)->getValueType(0) == VT) + Op.getOperand(0).getValueType() == VT) return SDValue(Op.getOperand(0)); if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) @@ -7431,28 +7639,34 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath); + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && - TLI.isOperationLegal(ISD::FMAD, VT)); + bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. 
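FMAD rounds the product before the add, while FMA rounds only once at the end; that difference is why fusing fmul+fadd is gated on AllowFusion. A small sketch of the two roundings diverging (assumes IEEE doubles and that the compiler does not contract the expression itself; the volatile blocks contraction):

#include <cassert>
#include <cmath>

int main() {
  double E = 1.0 / 134217728.0;              // 2^-27, exactly representable
  double A = 1.0 + E;                        // (1+E)*(1+E) = 1 + 2E + E*E
  volatile double Prod = A * A;              // separate rounding drops the E*E = 2^-54 term
  double TwoRoundings = Prod - 1.0;          // 2^-26 exactly
  double OneRounding = std::fma(A, A, -1.0); // 2^-26 + 2^-54, kept by single rounding
  assert(OneRounding != TwoRoundings);
  return 0;
}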
- bool HasFMA = ((!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::FMA, VT)) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && - UnsafeFPMath); + bool HasFMA = + AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); // Always prefer FMAD to FMA for precision. - unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; + unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. + if (Aggressive && N0.getOpcode() == ISD::FMUL && + N1.getOpcode() == ISD::FMUL) { + if (N0.getNode()->use_size() > N1.getNode()->use_size()) + std::swap(N0, N1); + } + // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (N0.getOpcode() == ISD::FMUL && (Aggressive || N0->hasOneUse())) { @@ -7469,7 +7683,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. - if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); @@ -7495,7 +7709,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { } // More folding opportunities when target permits. - if ((UnsafeFPMath || HasFMAD) && Aggressive) { + if ((AllowFusion || HasFMAD) && Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) if (N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL) { @@ -7518,7 +7732,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { N0)); } - if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( @@ -7608,25 +7822,23 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; - bool UnsafeFPMath = (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath); + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && - TLI.isOperationLegal(ISD::FMAD, VT)); + bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. - bool HasFMA = ((!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::FMA, VT)) && - TLI.isFMAFasterThanFMulAndFAdd(VT) && - UnsafeFPMath); + bool HasFMA = + AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); // Always prefer FMAD to FMA for precision. - unsigned int PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; + unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); bool LookThroughFPExt = TLI.isFPExtFree(VT); @@ -7659,7 +7871,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // Look through FP_EXTEND nodes to do more combining. 
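The basic shape matched above is (fadd (fmul x, y), z) -> (fma x, y, z). A sketch with operands chosen so both forms are exact; for arbitrary inputs the two can differ in the last ulp, which is exactly why the combine requires fusion to be allowed:

#include <cassert>
#include <cmath>

int main() {
  double X = 3.0, Y = 0.5, Z = 0.25;       // all products and sums exact
  assert(std::fma(X, Y, Z) == X * Y + Z);  // (fadd (fmul x, y), z)
  assert(std::fma(X, Y, -Z) == X * Y - Z); // the fsub variant
  return 0;
}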
- if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { @@ -7735,7 +7947,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // More folding opportunities when target permits. - if ((UnsafeFPMath || HasFMAD) && Aggressive) { + if ((AllowFusion || HasFMAD) && Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode && @@ -7765,7 +7977,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N21, N0)); } - if (UnsafeFPMath && LookThroughFPExt) { + if (AllowFusion && LookThroughFPExt) { // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { @@ -7866,14 +8078,97 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { return SDValue(); } +/// Try to perform FMA combining on a given FMUL node. +SDValue DAGCombiner::visitFMULForFMACombine(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc SL(N); + + assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); + + const TargetOptions &Options = DAG.getTarget().Options; + bool AllowFusion = + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); + + // Floating-point multiply-add with intermediate rounding. + bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); + + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = + AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) && + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); + + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return SDValue(); + + // Always prefer FMAD to FMA for precision. + unsigned PreferredFusedOpcode = HasFMAD ? 
ISD::FMAD : ISD::FMA; + bool Aggressive = TLI.enableAggressiveFMAFusion(VT); + + // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y) + // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y)) + auto FuseFADD = [&](SDValue X, SDValue Y) { + if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { + auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); + if (XC1 && XC1->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + if (XC1 && XC1->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + } + return SDValue(); + }; + + if (SDValue FMA = FuseFADD(N0, N1)) + return FMA; + if (SDValue FMA = FuseFADD(N1, N0)) + return FMA; + + // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y) + // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y)) + // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y)) + // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y) + auto FuseFSUB = [&](SDValue X, SDValue Y) { + if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { + auto XC0 = isConstOrConstSplatFP(X.getOperand(0)); + if (XC0 && XC0->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + Y); + if (XC0 && XC0->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + + auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); + if (XC1 && XC1->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y)); + if (XC1 && XC1->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y); + } + return SDValue(); + }; + + if (SDValue FMA = FuseFSUB(N0, N1)) + return FMA; + if (SDValue FMA = FuseFSUB(N1, N0)) + return FMA; + + return SDValue(); +} + SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0); + bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) @@ -7882,23 +8177,23 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N0, N1); + return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (N0CFP && !N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N1, N0); + return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), Flags); // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N1, - GetNegatedExpression(N0, DAG, LegalOperations)); + GetNegatedExpression(N0, DAG, LegalOperations), 
Flags); // If 'unsafe math' is enabled, fold lots of things. if (Options.UnsafeFPMath) { @@ -7907,14 +8202,17 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { bool AllowNewConst = (Level < AfterLegalizeDAG); // fold (fadd A, 0) -> A - if (N1CFP && N1CFP->isZero()) - return N0; + if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1)) + if (N1C->isZero()) + return N0; // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2)) if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() && - isa<ConstantFPSDNode>(N0.getOperand(1))) + isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), - DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1)); + DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, + Flags), + Flags); // If allowed, fold (fadd (fneg x), x) -> 0.0 if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) @@ -7929,64 +8227,64 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // of rounding steps. if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { - ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); - ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1)); + bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); + bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP01, 0), - DAG.getConstantFP(1.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), + DAG.getConstantFP(1.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags); } // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP01, 0), - DAG.getConstantFP(2.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), + DAG.getConstantFP(2.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags); } } if (N1.getOpcode() == ISD::FMUL) { - ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); - ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1)); + bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); + bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP11, 0), - DAG.getConstantFP(1.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), + DAG.getConstantFP(1.0, DL, VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags); } // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N0.getOperand(0)) { - SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, SDValue(CFP11, 0), - DAG.getConstantFP(2.0, DL, VT)); - return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP); + SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), + DAG.getConstantFP(2.0, DL, 
VT), Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags); } } if (N0.getOpcode() == ISD::FADD && AllowNewConst) { - ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); + bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) - if (!CFP && N0.getOperand(0) == N0.getOperand(1) && + if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) { return DAG.getNode(ISD::FMUL, DL, VT, - N1, DAG.getConstantFP(3.0, DL, VT)); + N1, DAG.getConstantFP(3.0, DL, VT), Flags); } } if (N1.getOpcode() == ISD::FADD && AllowNewConst) { - ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); + bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, - N0, DAG.getConstantFP(3.0, DL, VT)); + N0, DAG.getConstantFP(3.0, DL, VT), Flags); } } @@ -7996,15 +8294,14 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { - return DAG.getNode(ISD::FMUL, DL, VT, - N0.getOperand(0), DAG.getConstantFP(4.0, DL, VT)); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), + DAG.getConstantFP(4.0, DL, VT), Flags); } } } // enable-unsafe-fp-math // FADD -> FMA combines: - SDValue Fused = visitFADDForFMACombine(N); - if (Fused) { + if (SDValue Fused = visitFADDForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } @@ -8020,6 +8317,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { EVT VT = N->getValueType(0); SDLoc dl(N); const TargetOptions &Options = DAG.getTarget().Options; + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) @@ -8028,12 +8326,12 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // fold (fsub c1, c2) -> c1-c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FSUB, dl, VT, N0, N1); + return DAG.getNode(ISD::FSUB, dl, VT, N0, N1, Flags); // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, dl, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), Flags); // If 'unsafe math' is enabled, fold lots of things. 
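The unsafe-math reassociations in visitFADD above are plain algebra: (fadd (fmul x, c), x) -> (fmul x, c+1), (fadd (fadd x, x), x) -> (fmul x, 3.0), and so on. They can change rounding for general operands, hence the UnsafeFPMath gate. A sketch with exactly representable values where both sides agree:

#include <cassert>

int main() {
  double X = 1.5, C = 4.0;
  assert(X * C + X == X * (C + 1.0));       // (fadd (fmul x, c), x) -> (fmul x, c+1)
  assert(X * C + (X + X) == X * (C + 2.0)); // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
  assert((X + X) + X == X * 3.0);           // (fadd (fadd x, x), x) -> (fmul x, 3.0)
  assert((X + X) + (X + X) == X * 4.0);     // two (fadd x, x) -> (fmul x, 4.0)
  return 0;
}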
if (Options.UnsafeFPMath) { @@ -8068,8 +8366,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } // FSUB -> FMA combines: - SDValue Fused = visitFSUBForFMACombine(N); - if (Fused) { + if (SDValue Fused = visitFSUBForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } @@ -8085,6 +8382,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) { @@ -8095,12 +8393,12 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // fold (fmul c1, c2) -> c1*c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FMUL, DL, VT, N0, N1); + return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) - return DAG.getNode(ISD::FMUL, DL, VT, N1, N0); + return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags); // fold (fmul A, 1.0) -> A if (N1CFP && N1CFP->isExactlyValue(1.0)) @@ -8129,8 +8427,8 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // the second operand of the outer multiply are constants. if ((N1CFP && isConstOrConstSplatFP(N01)) || (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) { - SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1); - return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts); + SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags); } } } @@ -8139,16 +8437,18 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs // during an early run of DAGCombiner can prevent folding with fmuls // inserted during lowering. - if (N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1)) { + if (N0.getOpcode() == ISD::FADD && + (N0.getOperand(0) == N0.getOperand(1)) && + N0.hasOneUse()) { const SDValue Two = DAG.getConstantFP(2.0, DL, VT); - SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1); - return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts); + SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags); + return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags); } } // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) - return DAG.getNode(ISD::FADD, DL, VT, N0, N0); + return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags); // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) @@ -8163,10 +8463,17 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FMUL, DL, VT, GetNegatedExpression(N0, DAG, LegalOperations), - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), + Flags); } } + // FMUL -> FMA combines: + if (SDValue Fused = visitFMULForFMACombine(N)) { + AddToWorklist(Fused.getNode()); + return Fused; + } + return SDValue(); } @@ -8193,66 +8500,145 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (N1CFP && N1CFP->isZero()) return N2; } + // TODO: The FMA node should have flags that propagate to these nodes. 
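The visitFMA folds below rest on identities like (fma x, 1.0, y) -> (fadd x, y) and, under unsafe math, (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2). A sketch with exact operands (again, the constant-combining forms are only rounding-equivalent in general, hence the gating):

#include <cassert>
#include <cmath>

int main() {
  double X = 2.0, C1 = 3.0, C2 = 5.0, Y = 7.0;
  assert(std::fma(X, 1.0, Y) == X + Y);             // (fma x, 1, y) -> (fadd x, y)
  assert(std::fma(X, -1.0, Y) == Y - X);            // (fma x, -1, y) -> (fadd (fneg x), y)
  assert(std::fma(X, C1, X * C2) == X * (C1 + C2)); // (fma x, c1, (fmul x, c2))
  assert(std::fma(X, C1, X) == X * (C1 + 1.0));     // (fma x, c, x) -> (fmul x, c+1)
  return 0;
}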
if (N0CFP && N0CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); if (N1CFP && N1CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); // Canonicalize (fma c, x, y) -> (fma x, c, y) - if (N0CFP && !N1CFP) + if (isConstantFPBuildVectorOrConstantFP(N0) && + !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); - // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) - if (Options.UnsafeFPMath && N1CFP && - N2.getOpcode() == ISD::FMUL && - N0 == N2.getOperand(0) && - N2.getOperand(1).getOpcode() == ISD::ConstantFP) { - return DAG.getNode(ISD::FMUL, dl, VT, N0, - DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1))); - } + // TODO: FMA nodes should have flags that propagate to the created nodes. + // For now, create a Flags object for use with all unsafe math transforms. + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + if (Options.UnsafeFPMath) { + // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) + if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && + isConstantFPBuildVectorOrConstantFP(N1) && + isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { + return DAG.getNode(ISD::FMUL, dl, VT, N0, + DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1), + &Flags), &Flags); + } - // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) - if (Options.UnsafeFPMath && - N0.getOpcode() == ISD::FMUL && N1CFP && - N0.getOperand(1).getOpcode() == ISD::ConstantFP) { - return DAG.getNode(ISD::FMA, dl, VT, - N0.getOperand(0), - DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1)), - N2); + // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) + if (N0.getOpcode() == ISD::FMUL && + isConstantFPBuildVectorOrConstantFP(N1) && + isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { + return DAG.getNode(ISD::FMA, dl, VT, + N0.getOperand(0), + DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1), + &Flags), + N2); + } } // (fma x, 1, y) -> (fadd x, y) // (fma x, -1, y) -> (fadd (fneg x), y) if (N1CFP) { if (N1CFP->isExactlyValue(1.0)) + // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, dl, VT, N0, N2); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0); AddToWorklist(RHSNeg.getNode()); + // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg); } } - // (fma x, c, x) -> (fmul x, (c+1)) - if (Options.UnsafeFPMath && N1CFP && N0 == N2) - return DAG.getNode(ISD::FMUL, dl, VT, N0, - DAG.getNode(ISD::FADD, dl, VT, - N1, DAG.getConstantFP(1.0, dl, VT))); - - // (fma x, c, (fneg x)) -> (fmul x, (c-1)) - if (Options.UnsafeFPMath && N1CFP && - N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) + if (Options.UnsafeFPMath) { + // (fma x, c, x) -> (fmul x, (c+1)) + if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, dl, VT, N0, - DAG.getNode(ISD::FADD, dl, VT, - N1, DAG.getConstantFP(-1.0, dl, VT))); + DAG.getNode(ISD::FADD, dl, VT, + N1, DAG.getConstantFP(1.0, dl, VT), + &Flags), &Flags); + } + // (fma x, c, (fneg x)) -> (fmul x, (c-1)) + if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { + return DAG.getNode(ISD::FMUL, dl, VT, N0, + DAG.getNode(ISD::FADD, dl, VT, + N1, DAG.getConstantFP(-1.0, dl, VT), + &Flags), &Flags); + } + } return SDValue(); } +// Combine multiple FDIVs with the same divisor into multiple FMULs by the +// reciprocal. 
+// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) +// Notice that this is not always beneficial. One reason is different target +// may have different costs for FDIV and FMUL, so sometimes the cost of two +// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason +// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". +SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { + bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; + const SDNodeFlags *Flags = N->getFlags(); + if (!UnsafeMath && !Flags->hasAllowReciprocal()) + return SDValue(); + + // Skip if current node is a reciprocal. + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + if (N0CFP && N0CFP->isExactlyValue(1.0)) + return SDValue(); + + // Exit early if the target does not want this transform or if there can't + // possibly be enough uses of the divisor to make the transform worthwhile. + SDValue N1 = N->getOperand(1); + unsigned MinUses = TLI.combineRepeatedFPDivisors(); + if (!MinUses || N1->use_size() < MinUses) + return SDValue(); + + // Find all FDIV users of the same divisor. + // Use a set because duplicates may be present in the user list. + SetVector<SDNode *> Users; + for (auto *U : N1->uses()) { + if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { + // This division is eligible for optimization only if global unsafe math + // is enabled or if this division allows reciprocal formation. + if (UnsafeMath || U->getFlags()->hasAllowReciprocal()) + Users.insert(U); + } + } + + // Now that we have the actual number of divisor uses, make sure it meets + // the minimum threshold specified by the target. + if (Users.size() < MinUses) + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); + SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags); + + // Dividend / Divisor -> Dividend * Reciprocal + for (auto *U : Users) { + SDValue Dividend = U->getOperand(0); + if (Dividend != FPOne) { + SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, + Reciprocal, Flags); + CombineTo(U, NewNode); + } else if (U != Reciprocal.getNode()) { + // In the absence of fast-math-flags, this user node is always the + // same node as Reciprocal, but with FMF they may be different nodes. + CombineTo(U, Reciprocal); + } + } + return SDValue(N, 0); // N was replaced. +} + SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8261,6 +8647,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; + SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; // fold vector ops if (VT.isVector()) @@ -8269,7 +8656,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // fold (fdiv c1, c2) -> c1/c2 if (N0CFP && N1CFP) - return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); + return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); if (Options.UnsafeFPMath) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. 
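combineRepeatedFPDivisors trades N divisions for one division plus N multiplies. With a power-of-two divisor the reciprocal is exact and the results match bit-for-bit; for general divisors the two forms may differ in the last ulp, which is why the transform needs UnsafeFPMath or the allow-reciprocal fast-math flag. A sketch of the rewrite:

#include <cassert>

int main() {
  double A = 3.0, B = 5.0, D = 8.0; // D a power of two, so 1/D is exact
  double Recip = 1.0 / D;           // computed once, like the hoisted FDIV
  assert(A / D == A * Recip);       // a / D -> a * recip
  assert(B / D == B * Recip);       // b / D -> b * recip
  return 0;
}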
@@ -8288,28 +8675,30 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) || TLI.isFPImmLegal(Recip, VT))) return DAG.getNode(ISD::FMUL, DL, VT, N0, - DAG.getConstantFP(Recip, DL, VT)); + DAG.getConstantFP(Recip, DL, VT), Flags); } // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { - if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) { - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0), Flags)) { + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0), + Flags)) { RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_ROUND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { - if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) { + if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0), + Flags)) { RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FMUL) { // Look through an FMUL. Even though this won't remove the FDIV directly, @@ -8326,18 +8715,18 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (SqrtOp.getNode()) { // We found a FSQRT, so try to make this fold: // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) - if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) { - RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp); + if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) { + RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags); AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } } // Fold into a reciprocal estimate and multiply instead of a real divide. - if (SDValue RV = BuildReciprocalEstimate(N1)) { + if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) { AddToWorklist(RV.getNode()); - return DAG.getNode(ISD::FMUL, DL, VT, N0, RV); + return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } @@ -8349,52 +8738,13 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, GetNegatedExpression(N0, DAG, LegalOperations), - GetNegatedExpression(N1, DAG, LegalOperations)); + GetNegatedExpression(N1, DAG, LegalOperations), + Flags); } } - // Combine multiple FDIVs with the same divisor into multiple FMULs by the - // reciprocal. - // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) - // Notice that this is not always beneficial. One reason is different target - // may have different costs for FDIV and FMUL, so sometimes the cost of two - // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason - // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". - if (Options.UnsafeFPMath) { - // Skip if current node is a reciprocal. - if (N0CFP && N0CFP->isExactlyValue(1.0)) - return SDValue(); - - // Find all FDIV users of the same divisor. 
- // Use a set because duplicates may be present in the user list. - SetVector<SDNode *> Users; - for (auto *U : N1->uses()) - if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) - Users.insert(U); - - if (TLI.combineRepeatedFPDivisors(Users.size())) { - SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); - // FIXME: This optimization requires some level of fast-math, so the - // created reciprocal node should at least have the 'allowReciprocal' - // fast-math-flag set. - SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1); - - // Dividend / Divisor -> Dividend * Reciprocal - for (auto *U : Users) { - SDValue Dividend = U->getOperand(0); - if (Dividend != FPOne) { - SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, - Reciprocal); - CombineTo(U, NewNode); - } else if (U != Reciprocal.getNode()) { - // In the absence of fast-math-flags, this user node is always the - // same node as Reciprocal, but with FMF they may be different nodes. - CombineTo(U, Reciprocal); - } - } - return SDValue(N, 0); // N was replaced. - } - } + if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N)) + return CombineRepeatedDivisors; return SDValue(); } @@ -8408,7 +8758,8 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) - return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1); + return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, + &cast<BinaryWithFlagsSDNode>(N)->Flags); return SDValue(); } @@ -8417,20 +8768,25 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap()) return SDValue(); + // TODO: FSQRT nodes should have flags that propagate to the created nodes. + // For now, create a Flags object for use with all unsafe math transforms. + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); + // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5) - SDValue RV = BuildRsqrtEstimate(N->getOperand(0)); + SDValue RV = BuildRsqrtEstimate(N->getOperand(0), &Flags); if (!RV) return SDValue(); - + EVT VT = RV.getValueType(); SDLoc DL(N); - RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV); + RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV, &Flags); AddToWorklist(RV.getNode()); // Unfortunately, RV is now NaN if the input was exactly 0. // Select out this case and force the answer to 0. SDValue Zero = DAG.getConstantFP(0.0, DL, VT); - EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + EVT CCVT = getSetCCResultType(VT); SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, N->getOperand(0), Zero, ISD::SETEQ); AddToWorklist(ZeroCmp.getNode()); AddToWorklist(RV.getNode()); @@ -8439,6 +8795,23 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { ZeroCmp, Zero, RV); } +/// copysign(x, fp_extend(y)) -> copysign(x, y) +/// copysign(x, fp_round(y)) -> copysign(x, y) +static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { + SDValue N1 = N->getOperand(1); + if ((N1.getOpcode() == ISD::FP_EXTEND || + N1.getOpcode() == ISD::FP_ROUND)) { + // Do not optimize out type conversion of f128 type yet. + // For some targets like x86_64, configuration is changed to keep one f128 + // value in one SSE register, but instruction selection cannot handle + // FCOPYSIGN on SSE registers yet. 
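visitFSQRT above computes sqrt(x) as x * (1/sqrt(x)) so that a target rsqrt estimate can be used, and then must select 0.0 explicitly because the rewritten form yields 0 * inf = NaN at x == 0. A sketch of both behaviors, assuming IEEE arithmetic:

#include <cassert>
#include <cmath>

int main() {
  double Zero = 0.0;
  assert(std::isnan(Zero * (1.0 / std::sqrt(Zero)))); // 0 * inf, hence the explicit select
  double X = 4.0;                                     // exact case: 1/sqrt(4) == 0.5
  assert(X * (1.0 / std::sqrt(X)) == std::sqrt(X));
  return 0;
}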
+ EVT N1VT = N1->getValueType(0); + EVT N1Op0VT = N1->getOperand(0)->getValueType(0); + return (N1VT == N1Op0VT || N1Op0VT != MVT::f128); + } + return false; +} + SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8482,7 +8855,7 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) - if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND) + if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); @@ -8837,11 +9210,12 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { APFloat CVal = CFP1->getValueAPF(); CVal.changeSign(); if (Level >= AfterLegalizeDAG && - (TLI.isFPImmLegal(CVal, N->getValueType(0)) || - TLI.isOperationLegal(ISD::ConstantFP, N->getValueType(0)))) - return DAG.getNode( - ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1))); + (TLI.isFPImmLegal(CVal, VT) || + TLI.isOperationLegal(ISD::ConstantFP, VT))) + return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, + N0.getOperand(1)), + &cast<BinaryWithFlagsSDNode>(N0)->Flags); } } @@ -8851,20 +9225,20 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + EVT VT = N->getValueType(0); + const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); + const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); - return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), N->getValueType(0)); + return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), VT); } - if (N0CFP) { - EVT VT = N->getValueType(0); - // Canonicalize to constant on RHS. + // Canonicalize to constant on RHS. + if (isConstantFPBuildVectorOrConstantFP(N0) && + !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0); - } return SDValue(); } @@ -8872,20 +9246,20 @@ SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + EVT VT = N->getValueType(0); + const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); + const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); - return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), N->getValueType(0)); + return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), VT); } - if (N0CFP) { - EVT VT = N->getValueType(0); - // Canonicalize to constant on RHS. + // Canonicalize to constant on RHS. + if (isConstantFPBuildVectorOrConstantFP(N0) && + !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0); - } return SDValue(); } @@ -9034,8 +9408,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { SDValue Op1 = TheXor->getOperand(1); if (Op0.getOpcode() == Op1.getOpcode()) { // Avoid missing important xor optimizations. 
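The fneg/fabs/fcopysign bitcast folds in this area (including the ppc_fp128 variants, which apply the same trick to the high double of the pair) all reduce to sign-bit arithmetic, and copysign(x, fp_extend(y)) -> copysign(x, y) holds because the conversion preserves the sign. A sketch over plain doubles:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static uint64_t Bits(double D) { uint64_t U; std::memcpy(&U, &D, 8); return U; }
static double FromBits(uint64_t U) { double D; std::memcpy(&D, &U, 8); return D; }

int main() {
  const uint64_t SignBit = 1ull << 63;
  double X = -3.5;
  assert(FromBits(Bits(X) ^ SignBit) == 3.5);           // fneg: xor the sign bit
  assert(FromBits(Bits(X) & ~SignBit) == std::fabs(X)); // fabs: clear the sign bit
  double C = 2.0;                                       // copysign: (or (and x, sign), (and c, ~sign))
  assert(FromBits((Bits(C) & ~SignBit) | (Bits(X) & SignBit)) == std::copysign(C, X));
  float YF = -1.25f; // only the sign of y matters, so fp_extend(y) is redundant
  assert(std::copysign(C, static_cast<double>(YF)) == (std::signbit(YF) ? -C : C));
  return 0;
}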
- SDValue Tmp = visitXOR(TheXor); - if (Tmp.getNode()) { + if (SDValue Tmp = visitXOR(TheXor)) { if (Tmp.getNode() != TheXor) { DEBUG(dbgs() << "\nReplacing.8 "; TheXor->dump(&DAG); @@ -9722,8 +10095,8 @@ struct LoadedSlice { void addSliceGain(const LoadedSlice &LS) { // Each slice saves a truncate. const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); - if (!TLI.isTruncateFree(LS.Inst->getValueType(0), - LS.Inst->getOperand(0).getValueType())) + if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(), + LS.Inst->getValueType(0))) ++Truncates; // If there is a shift amount, this slice gets rid of it. if (LS.Shift) @@ -10625,30 +10998,109 @@ struct BaseIndexOffset { }; } // namespace +// This is a helper function for visitMUL to check the profitability +// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). +// MulNode is the original multiply, AddNode is (add x, c1), +// and ConstNode is c2. +// +// If the (add x, c1) has multiple uses, we could increase +// the number of adds if we make this transformation. +// It would only be worth doing this if we can remove a +// multiply in the process. Check for that here. +// To illustrate: +// (A + c1) * c3 +// (A + c2) * c3 +// We're checking for cases where we have common "c3 * A" expressions. +bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, + SDValue &AddNode, + SDValue &ConstNode) { + APInt Val; + + // If the add only has one use, this would be OK to do. + if (AddNode.getNode()->hasOneUse()) + return true; + + // Walk all the users of the constant with which we're multiplying. + for (SDNode *Use : ConstNode->uses()) { + + if (Use == MulNode) // This use is the one we're on right now. Skip it. + continue; + + if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. + SDNode *OtherOp; + SDNode *MulVar = AddNode.getOperand(0).getNode(); + + // OtherOp is what we're multiplying against the constant. + if (Use->getOperand(0) == ConstNode) + OtherOp = Use->getOperand(1).getNode(); + else + OtherOp = Use->getOperand(0).getNode(); + + // Check to see if multiply is with the same operand of our "add". + // + // ConstNode = CONST + // Use = ConstNode * A <-- visiting Use. OtherOp is A. + // ... + // AddNode = (A + c1) <-- MulVar is A. + // = AddNode * ConstNode <-- current visiting instruction. + // + // If we make this transformation, we will have a common + // multiply (ConstNode * A) that we can save. + if (OtherOp == MulVar) + return true; + + // Now check to see if a future expansion will give us a common + // multiply. + // + // ConstNode = CONST + // AddNode = (A + c1) + // ... = AddNode * ConstNode <-- current visiting instruction. + // ... + // OtherOp = (A + c2) + // Use = OtherOp * ConstNode <-- visiting Use. + // + // If we make this transformation, we will have a common + // multiply (CONST * A) after we also do the same transformation + // to the "t2" instruction. + if (OtherOp->getOpcode() == ISD::ADD && + isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) && + OtherOp->getOperand(0).getNode() == MulVar) + return true; + } + } + + // Didn't find a case where this would be profitable. 
+ return false; +} + SDValue DAGCombiner::getMergedConstantVectorStore(SelectionDAG &DAG, SDLoc SL, ArrayRef<MemOpLink> Stores, + SmallVectorImpl<SDValue> &Chains, EVT Ty) const { SmallVector<SDValue, 8> BuildVector; - for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) - BuildVector.push_back(cast<StoreSDNode>(Stores[I].MemNode)->getValue()); + for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) { + StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode); + Chains.push_back(St->getChain()); + BuildVector.push_back(St->getValue()); + } return DAG.getNode(ISD::BUILD_VECTOR, SL, Ty, BuildVector); } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, - unsigned NumElem, bool IsConstantSrc, bool UseVector) { + unsigned NumStores, bool IsConstantSrc, bool UseVector) { // Make sure we have something to merge. - if (NumElem < 2) + if (NumStores < 2) return false; int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned LatestNodeUsed = 0; - for (unsigned i=0; i < NumElem; ++i) { + for (unsigned i=0; i < NumStores; ++i) { // Find a chain for the new wide-store operand. Notice that some // of the store nodes that we found may not be selected for inclusion // in the wide store. The chain we use needs to be the chain of the @@ -10657,45 +11109,57 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( LatestNodeUsed = i; } + SmallVector<SDValue, 8> Chains; + // The latest Node in the DAG. LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; SDLoc DL(StoreNodes[0].MemNode); SDValue StoredVal; if (UseVector) { - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); + bool IsVec = MemVT.isVector(); + unsigned Elts = NumStores; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + // Get the type for the merged vector store. + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); + if (IsConstantSrc) { - StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Ty); + StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty); } else { SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElem ; ++i) { + for (unsigned i = 0; i < NumStores; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); SDValue Val = St->getValue(); - // All of the operands of a BUILD_VECTOR must have the same type. + // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type. if (Val.getValueType() != MemVT) return false; Ops.push_back(Val); + Chains.push_back(St->getChain()); } // Build the extracted vector elements back into a vector. - StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops); - } + StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, + DL, Ty, Ops); } } else { // We should always use a vector store when merging extracted vector // elements, so this path implies a store of constants. assert(IsConstantSrc && "Merged vector elements should use vector store"); - unsigned SizeInBits = NumElem * ElementSizeBytes * 8; + unsigned SizeInBits = NumStores * ElementSizeBytes * 8; APInt StoreInt(SizeInBits, 0); // Construct a single integer constant which is made of the smaller // constant inputs. bool IsLE = DAG.getDataLayout().isLittleEndian(); - for (unsigned i = 0; i < NumElem ; ++i) { - unsigned Idx = IsLE ? 
(NumElem - 1 - i) : i; + for (unsigned i = 0; i < NumStores; ++i) { + unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); + Chains.push_back(St->getChain()); + SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { @@ -10712,7 +11176,10 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } - SDValue NewStore = DAG.getStore(LatestOp->getChain(), DL, StoredVal, + assert(!Chains.empty()); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, @@ -10721,7 +11188,7 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( // Replace the last store with the new store CombineTo(LatestOp, NewStore); // Erase all other stores. - for (unsigned i = 0; i < NumElem ; ++i) { + for (unsigned i = 0; i < NumStores; ++i) { if (StoreNodes[i].MemNode == LatestOp) continue; StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); @@ -10743,17 +11210,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( return true; } -static bool allowableAlignment(const SelectionDAG &DAG, - const TargetLowering &TLI, EVT EVTTy, - unsigned AS, unsigned Align) { - if (TLI.allowsMisalignedMemoryAccesses(EVTTy, AS, Align)) - return true; - - Type *Ty = EVTTy.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getPrefTypeAlignment(Ty); - return (Align >= ABIAlignment); -} - void DAGCombiner::getStoreMergeAndAliasCandidates( StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes, SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) { @@ -10775,6 +11231,38 @@ void DAGCombiner::getStoreMergeAndAliasCandidates( EVT MemVT = St->getMemoryVT(); unsigned Seq = 0; StoreSDNode *Index = St; + + + bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA + : DAG.getSubtarget().useAA(); + + if (UseAA) { + // Look at other users of the same chain. Stores on the same chain do not + // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized + // to be on the same chain, so don't bother looking at adjacent chains. + + SDValue Chain = St->getChain(); + for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) { + if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) { + if (I.getOperandNo() != 0) + continue; + + if (OtherST->isVolatile() || OtherST->isIndexed()) + continue; + + if (OtherST->getMemoryVT() != MemVT) + continue; + + BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr()); + + if (Ptr.equalBaseIndex(BasePtr)) + StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++)); + } + } + + return; + } + while (Index) { // If the chain has more than one use, then we can't reorder the mem ops. if (Index != St && !SDValue(Index, 0)->hasOneUse()) @@ -10800,6 +11288,13 @@ void DAGCombiner::getStoreMergeAndAliasCandidates( if (Index->getMemoryVT() != MemVT) break; + // We do not allow under-aligned stores in order to prevent + // overriding stores. NOTE: this is a bad hack. Alignment SHOULD + // be irrelevant here; what MATTERS is that we not move memory + // operations that potentially overlap past each-other. + if (Index->getAlignment() < MemVT.getStoreSize()) + break; + // We found a potential memory operand to merge. 
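In the constant-store path above, the merged value is assembled by shifting each stored constant in; on little-endian targets the loop walks the stores in reverse (Idx = NumStores - 1 - i) so that the first store lands in the low bits. A sketch merging four i8 stores into one i32, assuming a little-endian host:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const unsigned char Vals[4] = {0x11, 0x22, 0x33, 0x44}; // stores at offsets 0..3
  uint32_t StoreInt = 0;
  for (int I = 0; I < 4; ++I) {
    StoreInt <<= 8;
    StoreInt |= Vals[4 - 1 - I]; // little-endian: walk the stores last-to-first
  }
  unsigned char Separate[4], Merged[4];
  std::memcpy(Separate, Vals, 4);    // memory image of the four original i8 stores
  std::memcpy(Merged, &StoreInt, 4); // memory image of one i32 store of 0x44332211
  assert(std::memcmp(Separate, Merged, 4) == 0);
  return 0;
}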
StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++)); @@ -10844,8 +11339,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (ElementSizeBytes * 8 != MemVT.getSizeInBits()) return false; - // Don't merge vectors into wider inputs. - if (MemVT.isVector() || !MemVT.isSimple()) + if (!MemVT.isSimple()) return false; // Perform an early exit check. Do not bother looking at stored values that @@ -10854,9 +11348,16 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { bool IsLoadSrc = isa<LoadSDNode>(StoredVal); bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) || isa<ConstantFPSDNode>(StoredVal); - bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT); + bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); - if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc) + if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) + return false; + + // Don't merge vectors into wider vectors if the source data comes from loads. + // TODO: This restriction can be lifted by using logic similar to the + // ExtractVecSrc case. + if (MemVT.isVector() && IsLoadSrc) return false; // Only look at ends of store sequences. @@ -10868,22 +11369,28 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // We need to make sure that these nodes do not interfere with // any of the store nodes. SmallVector<LSBaseSDNode*, 8> AliasLoadNodes; - + // Save the StoreSDNodes that we find in the chain. SmallVector<MemOpLink, 8> StoreNodes; getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes); - + // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; - // Sort the memory operands according to their distance from the base pointer. + // Sort the memory operands according to their distance from the + // base pointer. As a secondary criteria: make sure stores coming + // later in the code come first in the list. This is important for + // the non-UseAA case, because we're merging stores into the FINAL + // store along a chain which potentially contains aliasing stores. + // Thus, if there are multiple stores to the same address, the last + // one can be considered for merging but not the others. std::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { return LHS.OffsetFromBase < RHS.OffsetFromBase || (LHS.OffsetFromBase == RHS.OffsetFromBase && - LHS.SequenceNum > RHS.SequenceNum); + LHS.SequenceNum < RHS.SequenceNum); }); // Scan the memory operations on the chain and find the first non-consecutive @@ -10900,15 +11407,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { break; } - bool Alias = false; // Check if this store interferes with any of the loads that we found. - for (unsigned ld = 0, lde = AliasLoadNodes.size(); ld < lde; ++ld) - if (isAlias(AliasLoadNodes[ld], StoreNodes[i].MemNode)) { - Alias = true; - break; - } - // We found a load that alias with this store. Stop the sequence. - if (Alias) + // If we find a load that alias with this store. Stop the sequence. + if (std::any_of(AliasLoadNodes.begin(), AliasLoadNodes.end(), + [&](LSBaseSDNode* Ldn) { + return isAlias(Ldn, StoreNodes[i].MemNode); + })) break; // Mark this node as useful. 
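After sorting by distance from the base pointer, the scan above keeps the longest prefix of stores at strictly consecutive offsets and stops at the first gap. A small distillation of that scan; numConsecutive is a hypothetical helper for illustration, not an LLVM API:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Count how many of the sorted offsets step by exactly ElemSize from the base.
static unsigned numConsecutive(std::vector<int64_t> Offsets, int64_t ElemSize) {
  if (Offsets.empty())
    return 0;
  std::sort(Offsets.begin(), Offsets.end());
  unsigned Run = 1;
  for (unsigned I = 1, E = Offsets.size(); I != E; ++I) {
    if (Offsets[I] - Offsets[0] != ElemSize * (int64_t)I)
      break; // first non-consecutive store ends the mergeable run
    Run = I + 1;
  }
  return Run;
}

int main() {
  // Offsets 0, 4, 8 form a mergeable run of 4-byte stores; 20 breaks it.
  assert(numConsecutive({8, 0, 4, 20}, 4) == 3);
  return 0;
}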
@@ -10919,6 +11423,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); + LLVMContext &Context = *DAG.getContext(); + const DataLayout &DL = DAG.getDataLayout(); // Store the constants into memory as one consecutive store. if (IsConstantSrc) { @@ -10940,43 +11446,40 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Find a legal type for the constant store. unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); + EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); + bool IsFast; if (TLI.isTypeLegal(StoreTy) && - allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, - FirstStoreAlign)) { + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) { LastLegalType = i+1; // Or check whether a truncstore is legal. - } else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == + } else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); + TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && - allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS, - FirstStoreAlign)) { + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFast) && + IsFast) { LastLegalType = i + 1; } } - // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); - if (TLI.isTypeLegal(Ty) && - allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) { - LastLegalVectorType = i + 1; + // We only use vectors if the constant is known to be zero or the target + // allows it and the function is not marked with the noimplicitfloat + // attribute. + if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1, + FirstStoreAS)) && + !NoVectors) { + // Find a legal type for the vector store. + EVT Ty = EVT::getVectorVT(Context, MemVT, i+1); + if (TLI.isTypeLegal(Ty) && + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) + LastLegalVectorType = i + 1; } } - - // We only use vectors if the constant is known to be zero or the target - // allows it and the function is not marked with the noimplicitfloat - // attribute. - if (NoVectors) { - LastLegalVectorType = 0; - } else if (NonZero && !TLI.storeOfVectorConstantIsCheap(MemVT, - LastLegalVectorType, - FirstStoreAS)) { - LastLegalVectorType = 0; - } - // Check if we found a legal integer type to store. if (LastLegalType == 0 && LastLegalVectorType == 0) return false; @@ -10990,27 +11493,36 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // When extracting multiple vector elements, try to store them // in one vector store rather than a sequence of scalar stores. - if (IsExtractVecEltSrc) { - unsigned NumElem = 0; + if (IsExtractVecSrc) { + unsigned NumStoresToMerge = 0; + bool IsVec = MemVT.isVector(); for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - SDValue StoredVal = St->getValue(); + unsigned StoreValOpcode = St->getValue().getOpcode(); // This restriction could be loosened. // Bail out if any stored values are not elements extracted from a vector. 
// It should be possible to handle mixed sources, but load sources need // more careful handling (see the block of code below that handles // consecutive loads). - if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT && + StoreValOpcode != ISD::EXTRACT_SUBVECTOR) return false; // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); + unsigned Elts = i + 1; + if (IsVec) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); + bool IsFast; if (TLI.isTypeLegal(Ty) && - allowableAlignment(DAG, TLI, Ty, FirstStoreAS, FirstStoreAlign)) - NumElem = i + 1; + TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, + FirstStoreAlign, &IsFast) && IsFast) + NumStoresToMerge = i + 1; } - return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge, false, true); } @@ -11084,7 +11596,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = FirstLoad->getChain(); for (unsigned i = 1; i < LoadNodes.size(); ++i) { - // All loads much share the same chain. + // All loads must share the same chain. if (LoadNodes[i].MemNode->getChain() != FirstChain) break; @@ -11092,35 +11604,41 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; - // Find a legal type for the vector store. - EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); + EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1); + bool IsFastSt, IsFastLd; if (TLI.isTypeLegal(StoreTy) && - allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) && - allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) { + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalVectorType = i + 1; } // Find a legal type for the integer store. unsigned SizeInBits = (i+1) * ElementSizeBytes * 8; - StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); + StoreTy = EVT::getIntegerVT(Context, SizeInBits); if (TLI.isTypeLegal(StoreTy) && - allowableAlignment(DAG, TLI, StoreTy, FirstStoreAS, FirstStoreAlign) && - allowableAlignment(DAG, TLI, StoreTy, FirstLoadAS, FirstLoadAlign)) + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, + FirstStoreAlign, &IsFastSt) && IsFastSt && + TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, + FirstLoadAlign, &IsFastLd) && IsFastLd) LastLegalIntegerType = i + 1; // Or check whether a truncstore and extload is legal. 
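One pattern worth calling out before the truncating-store alternative that the comment above introduces: every legality test in these hunks has the shape TLI.allowsMemoryAccess(..., &IsFast) && IsFast, which relies on short-circuit evaluation, since the out-parameter is only written when the call itself succeeds. A sketch of the idiom with an invented predicate standing in for the TargetLowering query (the alignment rule here is an assumption, not any real target's):

#include <cstdio>

// Invented stand-in for TargetLowering::allowsMemoryAccess: reports
// whether an access is legal at all and, through the out-parameter,
// whether it is also fast on this imaginary target.
static bool allowsAccess(unsigned SizeInBits, unsigned AlignBytes,
                         bool *IsFast) {
  if (SizeInBits > 128)
    return false;                         // too wide: not legal at all
  *IsFast = AlignBytes * 8 >= SizeInBits; // naturally aligned => fast
  return true;
}

int main() {
  // Mirror of the loops above: keep growing the merged access while it
  // stays both legal and fast; remember the widest size that qualified.
  unsigned LastLegal = 0;
  for (unsigned i = 0; i != 8; ++i) {
    unsigned SizeInBits = (i + 1) * 32;
    bool IsFast;
    if (allowsAccess(SizeInBits, /*AlignBytes=*/8, &IsFast) && IsFast)
      LastLegal = i + 1;
  }
  std::printf("can merge %u consecutive i32 accesses\n", LastLegal);
}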
- else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) == + else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = - TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy); + TLI.getTypeToTransformTo(Context, StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) && - allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstStoreAS, - FirstStoreAlign) && - allowableAlignment(DAG, TLI, LegalizedStoredValueTy, FirstLoadAS, - FirstLoadAlign)) + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstStoreAS, FirstStoreAlign, &IsFastSt) && + IsFastSt && + TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, + FirstLoadAS, FirstLoadAlign, &IsFastLd) && + IsFastLd) LastLegalIntegerType = i+1; } } @@ -11138,6 +11656,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { if (NumElem < 2) return false; + // Collect the chains from all merged stores. + SmallVector<SDValue, 8> MergeStoreChains; + MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain()); + // The latest Node in the DAG. unsigned LatestNodeUsed = 0; for (unsigned i=1; i<NumElem; ++i) { @@ -11147,6 +11669,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // latest store node which is *used* and replaced by the wide store. if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum) LatestNodeUsed = i; + + MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain()); } LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode; @@ -11155,34 +11679,33 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // to memory. EVT JointMemOpVT; if (UseVectorTy) { - JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); + JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem); } else { unsigned SizeInBits = NumElem * ElementSizeBytes * 8; - JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); + JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); } SDLoc LoadDL(LoadNodes[0].MemNode); SDLoc StoreDL(StoreNodes[0].MemNode); + // The merged loads are required to have the same incoming chain, so + // using the first's chain is acceptable. SDValue NewLoad = DAG.getLoad( JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), false, false, false, FirstLoadAlign); + SDValue NewStoreChain = + DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains); + SDValue NewStore = DAG.getStore( - LatestOp->getChain(), StoreDL, NewLoad, FirstInChain->getBasePtr(), + NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), false, false, FirstStoreAlign); - // Replace one of the loads with the new load. - LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[0].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), - SDValue(NewLoad.getNode(), 1)); - - // Remove the rest of the load chains. - for (unsigned i = 1; i < NumElem ; ++i) { - // Replace all chain users of the old load nodes with the chain of the new - // load node. + // Transfer chain users from old loads to the new load. 
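The TokenFactor built from MergeStoreChains is a pure join point in the dependency graph: the merged store must be ordered after every chain that fed one of the original stores. A toy model of that invariant, with chains reduced to integer timestamps (this illustrates the ordering rule, not the SelectionDAG API); the chain-user transfer announced in the comment above continues right after this sketch:

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy model: a "chain" is the timestamp of the last memory operation it
// orders after. Joining several chains (a TokenFactor) must order after
// all of them, i.e. after the latest timestamp.
static int tokenFactor(const std::vector<int> &Chains) {
  return *std::max_element(Chains.begin(), Chains.end());
}

int main() {
  // Incoming chains of the individual stores being merged.
  const std::vector<int> MergeStoreChains = {3, 7, 5};
  std::printf("merged store ordered after t=%d\n",
              tokenFactor(MergeStoreChains));
}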
+ for (unsigned i = 0; i < NumElem; ++i) { LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Ld->getChain()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), + SDValue(NewLoad.getNode(), 1)); } // Replace the last store with the new store. @@ -11200,6 +11723,114 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { return true; } +SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { + SDLoc SL(ST); + SDValue ReplStore; + + // Replace the chain to avoid dependency. + if (ST->isTruncatingStore()) { + ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(), + ST->getBasePtr(), ST->getMemoryVT(), + ST->getMemOperand()); + } else { + ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), + ST->getMemOperand()); + } + + // Create token to keep both nodes around. + SDValue Token = DAG.getNode(ISD::TokenFactor, SL, + MVT::Other, ST->getChain(), ReplStore); + + // Make sure the new and old chains are cleaned up. + AddToWorklist(Token.getNode()); + + // Don't add users to work list. + return CombineTo(ST, Token, false); +} + +SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { + SDValue Value = ST->getValue(); + if (Value.getOpcode() == ISD::TargetConstantFP) + return SDValue(); + + SDLoc DL(ST); + + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); + + const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value); + + // NOTE: If the original store is volatile, this transform must not increase + // the number of stores. For example, on x86-32 an f64 can be stored in one + // processor operation but an i64 (which is not legal) requires two. So the + // transform should not be done in this case. + + SDValue Tmp; + switch (CFP->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unknown FP type"); + case MVT::f16: // We don't do this for these yet. + case MVT::f80: + case MVT::f128: + case MVT::ppcf128: + return SDValue(); + case MVT::f32: + if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { + ; + Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). + bitcastToAPInt().getZExtValue(), SDLoc(CFP), + MVT::i32); + return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); + } + + return SDValue(); + case MVT::f64: + if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && + !ST->isVolatile()) || + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { + ; + Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). + getZExtValue(), SDLoc(CFP), MVT::i64); + return DAG.getStore(Chain, DL, Tmp, + Ptr, ST->getMemOperand()); + } + + if (!ST->isVolatile() && + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { + // Many FP stores are not made apparent until after legalize, e.g. for + // argument passing. Since this is so common, custom legalize the + // 64-bit integer store into two 32-bit stores. 
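replaceStoreOfFPConstant never converts the value; it only reinterprets the constant's bits as an integer of the same width, which is why the f32 case can become a plain i32 store. A host-side sketch of that reinterpretation, using memcpy as the portable stand-in for the bitcastToAPInt call above; the two-halves lowering named in the last comment follows in the next lines:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret an f32 constant as the i32 bit pattern stored in its
// place; memcpy is the portable way to type-pun on the host.
static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  // 'store float 1.0' becomes 'store i32 0x3F800000'.
  std::printf("store float 1.0 -> store i32 0x%08X\n",
              (unsigned)bitsOf(1.0f));
}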
+ uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); + SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); + if (DAG.getDataLayout().isBigEndian()) + std::swap(Lo, Hi); + + unsigned Alignment = ST->getAlignment(); + bool isVolatile = ST->isVolatile(); + bool isNonTemporal = ST->isNonTemporal(); + AAMDNodes AAInfo = ST->getAAInfo(); + + SDValue St0 = DAG.getStore(Chain, DL, Lo, + Ptr, ST->getPointerInfo(), + isVolatile, isNonTemporal, + ST->getAlignment(), AAInfo); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4, DL, Ptr.getValueType())); + Alignment = MinAlign(Alignment, 4U); + SDValue St1 = DAG.getStore(Chain, DL, Hi, + Ptr, ST->getPointerInfo().getWithOffset(4), + isVolatile, isNonTemporal, + Alignment, AAInfo); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + St0, St1); + } + + return SDValue(); + } +} + SDValue DAGCombiner::visitSTORE(SDNode *N) { StoreSDNode *ST = cast<StoreSDNode>(N); SDValue Chain = ST->getChain(); @@ -11227,81 +11858,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (Value.getOpcode() == ISD::UNDEF && ST->isUnindexed()) return Chain; - // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) { - // NOTE: If the original store is volatile, this transform must not increase - // the number of stores. For example, on x86-32 an f64 can be stored in one - // processor operation but an i64 (which is not legal) requires two. So the - // transform should not be done in this case. - if (Value.getOpcode() != ISD::TargetConstantFP) { - SDValue Tmp; - switch (CFP->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unknown FP type"); - case MVT::f16: // We don't do this for these yet. - case MVT::f80: - case MVT::f128: - case MVT::ppcf128: - break; - case MVT::f32: - if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { - ; - Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). - bitcastToAPInt().getZExtValue(), SDLoc(CFP), - MVT::i32); - return DAG.getStore(Chain, SDLoc(N), Tmp, - Ptr, ST->getMemOperand()); - } - break; - case MVT::f64: - if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && - !ST->isVolatile()) || - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { - ; - Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). - getZExtValue(), SDLoc(CFP), MVT::i64); - return DAG.getStore(Chain, SDLoc(N), Tmp, - Ptr, ST->getMemOperand()); - } - - if (!ST->isVolatile() && - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { - // Many FP stores are not made apparent until after legalize, e.g. for - // argument passing. Since this is so common, custom legalize the - // 64-bit integer store into two 32-bit stores. 
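The two-store lowering above splits the 64-bit pattern at bit 32 and lets the target's byte order decide which half lands at the lower address. A standalone sketch of that arithmetic (the BigEndian flag stands in for DataLayout::isBigEndian); the removed duplicate of this lowering continues below:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>

int main() {
  double D = 1.0;
  uint64_t Val;
  std::memcpy(&Val, &D, sizeof(Val)); // the f64 bits as one integer

  // Split into the two i32 halves stored at Ptr and Ptr+4.
  uint32_t Lo = static_cast<uint32_t>(Val & 0xFFFFFFFF);
  uint32_t Hi = static_cast<uint32_t>(Val >> 32);

  // On a big-endian target the low address holds the high half, hence
  // the std::swap in the hunk above.
  const bool BigEndian = false; // assumption for this sketch
  if (BigEndian)
    std::swap(Lo, Hi);

  std::printf("store i32 0x%08X at Ptr, 0x%08X at Ptr+4\n",
              (unsigned)Lo, (unsigned)Hi);
}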
- uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); - SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); - SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); - if (DAG.getDataLayout().isBigEndian()) - std::swap(Lo, Hi); - - unsigned Alignment = ST->getAlignment(); - bool isVolatile = ST->isVolatile(); - bool isNonTemporal = ST->isNonTemporal(); - AAMDNodes AAInfo = ST->getAAInfo(); - - SDLoc DL(N); - - SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo, - Ptr, ST->getPointerInfo(), - isVolatile, isNonTemporal, - ST->getAlignment(), AAInfo); - Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4, DL, Ptr.getValueType())); - Alignment = MinAlign(Alignment, 4U); - SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi, - Ptr, ST->getPointerInfo().getWithOffset(4), - isVolatile, isNonTemporal, - Alignment, AAInfo); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - St0, St1); - } - - break; - } - } - } - // Try to infer better alignment information than the store already has. if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { @@ -11319,8 +11875,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // Try transforming a pair floating point load / store ops to integer // load / store ops. - SDValue NewST = TransformFPLoadStorePair(N); - if (NewST.getNode()) + if (SDValue NewST = TransformFPLoadStorePair(N)) return NewST; bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA @@ -11331,31 +11886,17 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { UseAA = false; #endif if (UseAA && ST->isUnindexed()) { - // Walk up chain skipping non-aliasing memory nodes. - SDValue BetterChain = FindBetterChain(N, Chain); - - // If there is a better chain. - if (Chain != BetterChain) { - SDValue ReplStore; - - // Replace the chain to avoid dependency. - if (ST->isTruncatingStore()) { - ReplStore = DAG.getTruncStore(BetterChain, SDLoc(N), Value, Ptr, - ST->getMemoryVT(), ST->getMemOperand()); - } else { - ReplStore = DAG.getStore(BetterChain, SDLoc(N), Value, Ptr, - ST->getMemOperand()); - } + // FIXME: We should do this even without AA enabled. AA will just allow + // FindBetterChain to work in more situations. The problem with this is that + // any combine that expects memory operations to be on consecutive chains + // first needs to be updated to look for users of the same chain. - // Create token to keep both nodes around. - SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), - MVT::Other, Chain, ReplStore); - - // Make sure the new and old chains are cleaned up. - AddToWorklist(Token.getNode()); - - // Don't add users to work list. - return CombineTo(N, Token, false); + // Walk up chain skipping non-aliasing memory nodes, on this store and any + // adjacent stores. + if (findBetterNeighborChains(ST)) { + // replaceStoreChain uses CombineTo, which handled all of the worklist + // manipulation. Return the original node to not do anything else. + return SDValue(ST, 0); } } @@ -11440,6 +11981,16 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { return SDValue(N, 0); } + // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' + // + // Make sure to do this only after attempting to merge stores in order to + // avoid changing the types of some subset of stores due to visit order, + // preventing their merging. 
+ if (isa<ConstantFPSDNode>(Value)) { + if (SDValue NewSt = replaceStoreOfFPConstant(ST)) + return NewSt; + } + return ReduceLoadOpStoreWidth(N); } @@ -11613,7 +12164,24 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { } SDValue EltNo = N->getOperand(1); - bool ConstEltNo = isa<ConstantSDNode>(EltNo); + ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo); + + // extract_vector_elt (build_vector x, y), 1 -> y + if (ConstEltNo && + InVec.getOpcode() == ISD::BUILD_VECTOR && + TLI.isTypeLegal(VT) && + (InVec.hasOneUse() || + TLI.aggressivelyPreferBuildVectorSources(VT))) { + SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue()); + EVT InEltVT = Elt.getValueType(); + + // Sometimes build_vector's scalar input types do not match result type. + if (NVT == InEltVT) + return Elt; + + // TODO: It may be useful to truncate if free if the build_vector implicitly + // converts. + } // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. // We only perform this optimization before the op legalization phase because @@ -11621,13 +12189,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // patterns. For example on AVX, extracting elements from a wide vector // without using extract_subvector. However, if we can find an underlying // scalar value, then we can always use that. - if (InVec.getOpcode() == ISD::VECTOR_SHUFFLE - && ConstEltNo) { - int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) { int NumElem = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec); // Find the new index to extract from. - int OrigElt = SVOp->getMaskElt(Elt); + int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue()); // Extracting an undef index is undef. if (OrigElt == -1) @@ -12183,12 +12749,90 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Ops)); } -SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { - // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of - // EXTRACT_SUBVECTOR operations. If so, and if the EXTRACT_SUBVECTOR vector - // inputs come from at most two distinct vectors, turn this into a shuffle - // node. +// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR +// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at +// most two distinct vectors the same size as the result, attempt to turn this +// into a legal shuffle. +static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + EVT OpVT = N->getOperand(0).getValueType(); + int NumElts = VT.getVectorNumElements(); + int NumOpElts = OpVT.getVectorNumElements(); + + SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT); + SmallVector<int, 8> Mask; + + for (SDValue Op : N->ops()) { + // Peek through any bitcast. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + + // UNDEF nodes convert to UNDEF shuffle mask values. + if (Op.getOpcode() == ISD::UNDEF) { + Mask.append((unsigned)NumOpElts, -1); + continue; + } + + if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return SDValue(); + + // What vector are we extracting the subvector from and at what index? + SDValue ExtVec = Op.getOperand(0); + + // We want the EVT of the original extraction to correctly scale the + // extraction index. + EVT ExtVT = ExtVec.getValueType(); + + // Peek through any bitcast. 
+ while (ExtVec.getOpcode() == ISD::BITCAST) + ExtVec = ExtVec.getOperand(0); + + // UNDEF nodes convert to UNDEF shuffle mask values. + if (ExtVec.getOpcode() == ISD::UNDEF) { + Mask.append((unsigned)NumOpElts, -1); + continue; + } + + if (!isa<ConstantSDNode>(Op.getOperand(1))) + return SDValue(); + int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + // Ensure that we are extracting a subvector from a vector the same + // size as the result. + if (ExtVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + + // Scale the subvector index to account for any bitcast. + int NumExtElts = ExtVT.getVectorNumElements(); + if (0 == (NumExtElts % NumElts)) + ExtIdx /= (NumExtElts / NumElts); + else if (0 == (NumElts % NumExtElts)) + ExtIdx *= (NumElts / NumExtElts); + else + return SDValue(); + // At most we can reference 2 inputs in the final shuffle. + if (SV0.getOpcode() == ISD::UNDEF || SV0 == ExtVec) { + SV0 = ExtVec; + for (int i = 0; i != NumOpElts; ++i) + Mask.push_back(i + ExtIdx); + } else if (SV1.getOpcode() == ISD::UNDEF || SV1 == ExtVec) { + SV1 = ExtVec; + for (int i = 0; i != NumOpElts; ++i) + Mask.push_back(i + ExtIdx + NumElts); + } else { + return SDValue(); + } + } + + if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT)) + return SDValue(); + + return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), + DAG.getBitcast(VT, SV1), Mask); +} + +SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // If we only have one input vector, we don't need to do any concatenation. if (N->getNumOperands() == 1) return N->getOperand(0); @@ -12289,6 +12933,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { if (SDValue V = combineConcatVectorOfScalars(N, DAG)) return V; + // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. + if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) + if (SDValue V = combineConcatVectorOfExtracts(N, DAG)) + return V; + // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR // nodes often generate nop CONCAT_VECTOR nodes. // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that @@ -12503,7 +13152,7 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { std::all_of(SVN->getMask().begin() + NumElemsPerConcat, SVN->getMask().end(), [](int i) { return i == -1; })) { N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), - ArrayRef<int>(SVN->getMask().begin(), NumElemsPerConcat)); + makeArrayRef(SVN->getMask().begin(), NumElemsPerConcat)); N1 = DAG.getUNDEF(ConcatVT); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); } @@ -12981,6 +13630,21 @@ SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { + SDValue N0 = N->getOperand(0); + + // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) + if (N0->getOpcode() == ISD::AND) { + ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1)); + if (AndConst && AndConst->getAPIntValue() == 0xffff) { + return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), + N0.getOperand(0)); + } + } + + return SDValue(); +} + /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. 
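A detail of combineConcatVectorOfExtracts above that is easy to misread: the extract index is taken in the pre-bitcast element type, so it must be rescaled once the extract is viewed in the concatenation's element count. A reduced sketch of that rescaling, with the EVTs collapsed to plain element counts:

#include <cstdio>

// Rescale a subvector extract index across a bitcast, as the
// ExtIdx /= and ExtIdx *= lines above do. Returns -1 when the two
// element counts do not divide one another and the fold must bail.
static int scaleExtractIndex(int ExtIdx, int NumExtElts, int NumElts) {
  if (NumExtElts % NumElts == 0)
    return ExtIdx / (NumExtElts / NumElts); // wider elements after cast
  if (NumElts % NumExtElts == 0)
    return ExtIdx * (NumElts / NumExtElts); // narrower elements after cast
  return -1;
}

int main() {
  // e.g. an extract at index 4 in a v8i16 view of a vector the concat
  // sees as v4i32: 8 % 4 == 0, so the index halves to 2.
  std::printf("scaled index: %d\n", scaleExtractIndex(4, 8, 4));
}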
==> @@ -13002,34 +13666,76 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { if (RHS.getOpcode() == ISD::BITCAST) RHS = RHS.getOperand(0); - if (RHS.getOpcode() == ISD::BUILD_VECTOR) { + if (RHS.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + EVT RVT = RHS.getValueType(); + unsigned NumElts = RHS.getNumOperands(); + + // Attempt to create a valid clear mask, splitting the mask into + // sub elements and checking to see if each is + // all zeros or all ones - suitable for shuffle masking. + auto BuildClearMask = [&](int Split) { + int NumSubElts = NumElts * Split; + int NumSubBits = RVT.getScalarSizeInBits() / Split; + SmallVector<int, 8> Indices; - unsigned NumElts = RHS.getNumOperands(); + for (int i = 0; i != NumSubElts; ++i) { + int EltIdx = i / Split; + int SubIdx = i % Split; + SDValue Elt = RHS.getOperand(EltIdx); + if (Elt.getOpcode() == ISD::UNDEF) { + Indices.push_back(-1); + continue; + } - for (unsigned i = 0; i != NumElts; ++i) { - SDValue Elt = RHS.getOperand(i); - if (isAllOnesConstant(Elt)) + APInt Bits; + if (isa<ConstantSDNode>(Elt)) + Bits = cast<ConstantSDNode>(Elt)->getAPIntValue(); + else if (isa<ConstantFPSDNode>(Elt)) + Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt(); + else + return SDValue(); + + // Extract the sub element from the constant bit mask. + if (DAG.getDataLayout().isBigEndian()) { + Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits); + } else { + Bits = Bits.lshr(SubIdx * NumSubBits); + } + + if (Split > 1) + Bits = Bits.trunc(NumSubBits); + + if (Bits.isAllOnesValue()) Indices.push_back(i); - else if (isNullConstant(Elt)) - Indices.push_back(NumElts+i); + else if (Bits == 0) + Indices.push_back(i + NumSubElts); else return SDValue(); } // Let's see if the target supports this vector_shuffle. - EVT RVT = RHS.getValueType(); - if (!TLI.isVectorClearMaskLegal(Indices, RVT)) + EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits); + EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts); + if (!TLI.isVectorClearMaskLegal(Indices, ClearVT)) return SDValue(); - // Return the new VECTOR_SHUFFLE node. - EVT EltVT = RVT.getVectorElementType(); - SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(), - DAG.getConstant(0, dl, EltVT)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, RVT, ZeroOps); - LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS); - SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]); - return DAG.getNode(ISD::BITCAST, dl, VT, Shuf); - } + SDValue Zero = DAG.getConstant(0, dl, ClearVT); + return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, dl, + DAG.getBitcast(ClearVT, LHS), + Zero, &Indices[0])); + }; + + // Determine maximum split level (byte level masking). + int MaxSplit = 1; + if (RVT.getScalarSizeInBits() % 8 == 0) + MaxSplit = RVT.getScalarSizeInBits() / 8; + + for (int Split = 1; Split <= MaxSplit; ++Split) + if (RVT.getScalarSizeInBits() % Split == 0) + if (SDValue S = BuildClearMask(Split)) + return S; return SDValue(); } @@ -13041,60 +13747,17 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + SDValue Ops[] = {LHS, RHS}; + // See if we can constant fold the vector operation. + if (SDValue Fold = DAG.FoldConstantVectorArithmetic( + N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) + return Fold; + + // Try to convert a constant mask AND into a shuffle clear mask. 
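BuildClearMask above walks each constant lane Split sub-elements at a time, and the shift amount is endian-dependent because sub-element 0 is the lowest-addressed piece, not the lowest-order bits. A sketch of that extraction with the APInt lshr/trunc pair replaced by plain 64-bit operations; the XformToShuffleWithZero call that the comment above announces follows immediately:

#include <cstdint>
#include <cstdio>

// Extract sub-element SubIdx of a constant lane whose value is Bits,
// where the lane is viewed as Split pieces of NumSubBits bits each.
// Mirrors the lshr/trunc pair in BuildClearMask above.
static uint64_t subElement(uint64_t Bits, int Split, int SubIdx,
                           int NumSubBits, bool BigEndian) {
  int Shift = BigEndian ? (Split - SubIdx - 1) * NumSubBits
                        : SubIdx * NumSubBits;
  Bits >>= Shift;                            // the lshr
  if (Split > 1 && NumSubBits < 64)
    Bits &= (uint64_t(1) << NumSubBits) - 1; // the trunc
  return Bits;
}

int main() {
  // A 32-bit lane 0x0000FFFF split into four bytes, little-endian:
  // bytes 0 and 1 are all-ones (keep lane), bytes 2 and 3 are zero
  // (clear lane), so a byte-level shuffle mask exists.
  for (int SubIdx = 0; SubIdx != 4; ++SubIdx)
    std::printf("byte %d = 0x%02llX\n", SubIdx,
                (unsigned long long)subElement(0x0000FFFFu, 4, SubIdx, 8,
                                               /*BigEndian=*/false));
}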
if (SDValue Shuffle = XformToShuffleWithZero(N)) return Shuffle; - // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold - // this operation. - if (LHS.getOpcode() == ISD::BUILD_VECTOR && - RHS.getOpcode() == ISD::BUILD_VECTOR) { - // Check if both vectors are constants. If not bail out. - if (!(cast<BuildVectorSDNode>(LHS)->isConstant() && - cast<BuildVectorSDNode>(RHS)->isConstant())) - return SDValue(); - - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) { - SDValue LHSOp = LHS.getOperand(i); - SDValue RHSOp = RHS.getOperand(i); - - // Can't fold divide by zero. - if (N->getOpcode() == ISD::SDIV || N->getOpcode() == ISD::UDIV || - N->getOpcode() == ISD::FDIV) { - if (isNullConstant(RHSOp) || (RHSOp.getOpcode() == ISD::ConstantFP && - cast<ConstantFPSDNode>(RHSOp.getNode())->isZero())) - break; - } - - EVT VT = LHSOp.getValueType(); - EVT RVT = RHSOp.getValueType(); - if (RVT != VT) { - // Integer BUILD_VECTOR operands may have types larger than the element - // size (e.g., when the element type is not legal). Prior to type - // legalization, the types may not match between the two BUILD_VECTORS. - // Truncate one of the operands to make them match. - if (RVT.getSizeInBits() > VT.getSizeInBits()) { - RHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, RHSOp); - } else { - LHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), RVT, LHSOp); - VT = RVT; - } - } - SDValue FoldOp = DAG.getNode(N->getOpcode(), SDLoc(LHS), VT, - LHSOp, RHSOp); - if (FoldOp.getOpcode() != ISD::UNDEF && - FoldOp.getOpcode() != ISD::Constant && - FoldOp.getOpcode() != ISD::ConstantFP) - break; - Ops.push_back(FoldOp); - AddToWorklist(FoldOp.getNode()); - } - - if (Ops.size() == LHS.getNumOperands()) - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops); - } - // Type legalization might introduce new shuffles in the DAG. // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) // -> (shuffle (VBinOp (A, B)), Undef, Mask). @@ -13109,7 +13772,8 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { EVT VT = N->getValueType(0); SDValue UndefVector = LHS.getOperand(1); SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT, - LHS.getOperand(0), RHS.getOperand(0)); + LHS.getOperand(0), RHS.getOperand(0), + N->getFlags()); AddUsersToWorklist(N); return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector, &SVN0->getMask()[0]); @@ -13390,9 +14054,10 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); AddToWorklist(CPIdx.getNode()); - return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), false, - false, false, Alignment); + return DAG.getLoad( + TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); } } @@ -13481,8 +14146,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, // Get a SetCC of the condition // NOTE: Don't create a SETCC if it's not legal on this target. if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, - LegalTypes ? 
getSetCCResultType(N0.getValueType()) : MVT::i1)) { + TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) { SDValue Temp, SCC; // cast from setcc result type to select result type if (LegalTypes) { @@ -13514,51 +14178,6 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, } } - // Check to see if this is the equivalent of setcc - // FIXME: Turn all of these into setcc if setcc if setcc is legal - // otherwise, go ahead with the folds. - if (0 && isNullConstant(N3) && isOneConstant(N2)) { - EVT XType = N0.getValueType(); - if (!LegalOperations || - TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(XType))) { - SDValue Res = DAG.getSetCC(DL, getSetCCResultType(XType), N0, N1, CC); - if (Res.getValueType() != VT) - Res = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res); - return Res; - } - - // fold (seteq X, 0) -> (srl (ctlz X, log2(size(X)))) - if (isNullConstant(N1) && CC == ISD::SETEQ && - (!LegalOperations || - TLI.isOperationLegal(ISD::CTLZ, XType))) { - SDValue Ctlz = DAG.getNode(ISD::CTLZ, SDLoc(N0), XType, N0); - return DAG.getNode(ISD::SRL, DL, XType, Ctlz, - DAG.getConstant(Log2_32(XType.getSizeInBits()), - SDLoc(Ctlz), - getShiftAmountTy(Ctlz.getValueType()))); - } - // fold (setgt X, 0) -> (srl (and (-X, ~X), size(X)-1)) - if (isNullConstant(N1) && CC == ISD::SETGT) { - SDLoc DL(N0); - SDValue NegN0 = DAG.getNode(ISD::SUB, DL, - XType, DAG.getConstant(0, DL, XType), N0); - SDValue NotN0 = DAG.getNOT(DL, N0, XType); - return DAG.getNode(ISD::SRL, DL, XType, - DAG.getNode(ISD::AND, DL, XType, NegN0, NotN0), - DAG.getConstant(XType.getSizeInBits() - 1, DL, - getShiftAmountTy(XType))); - } - // fold (setgt X, -1) -> (xor (srl (X, size(X)-1), 1)) - if (isAllOnesConstant(N1) && CC == ISD::SETGT) { - SDLoc DL(N0); - SDValue Sign = DAG.getNode(ISD::SRL, DL, XType, N0, - DAG.getConstant(XType.getSizeInBits() - 1, DL, - getShiftAmountTy(N0.getValueType()))); - return DAG.getNode(ISD::XOR, DL, XType, Sign, DAG.getConstant(1, DL, - XType)); - } - } - // Check to see if this is an integer abs. // select_cc setg[te] X, 0, X, -X -> // select_cc setgt X, -1, X, -X -> @@ -13666,7 +14285,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { return S; } -SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { +SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -13690,16 +14309,16 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { // Newton iterations: Est = Est + Est (1 - Arg * Est) for (unsigned i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est); + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst); + NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst); + Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } } @@ -13716,31 +14335,32 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) { /// X_{i+1} = X_i (1.5 - A X_i^2 / 2) /// As a result, we precompute A/2 prior to the iteration loop. 
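The reciprocal refinement above is textbook Newton-Raphson, Est' = Est + Est*(1 - Arg*Est), which roughly doubles the number of correct bits per step; that is why a handful of iterations suffices after a coarse hardware estimate. A quick host-side demonstration of the convergence (the seed value is an assumption standing in for the target's estimate instruction); the one-constant rsqrt variant described in the comment above is defined next:

#include <cstdio>

int main() {
  const double Arg = 3.0; // compute 1/Arg
  double Est = 0.3;       // coarse hardware-style seed (assumed)

  // Newton iterations: Est = Est + Est * (1 - Arg * Est), exactly the
  // FMUL/FSUB/FMUL/FADD sequence emitted above.
  for (unsigned i = 0; i != 3; ++i) {
    double NewEst = Arg * Est; // FMUL
    NewEst = 1.0 - NewEst;     // FSUB
    NewEst = Est * NewEst;     // FMUL
    Est = Est + NewEst;        // FADD
    std::printf("iter %u: %.17g\n", i, Est);
  }
  // Converges quadratically toward 0.333333...
}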
SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est, - unsigned Iterations) { + unsigned Iterations, + SDNodeFlags *Flags) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that // this entire sequence requires only one FP constant. - SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg); + SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags); AddToWorklist(HalfArg.getNode()); - HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg); + HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags); AddToWorklist(HalfArg.getNode()); // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { - SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst); + NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags); AddToWorklist(NewEst.getNode()); - NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst); + NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags); AddToWorklist(NewEst.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } return Est; @@ -13752,7 +14372,8 @@ SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est, /// => /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est, - unsigned Iterations) { + unsigned Iterations, + SDNodeFlags *Flags) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); @@ -13760,25 +14381,25 @@ SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est, // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { - SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf); + SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags); AddToWorklist(HalfEst.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); AddToWorklist(Est.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags); AddToWorklist(Est.getNode()); - Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree); + Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree, Flags); AddToWorklist(Est.getNode()); - Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst); + Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst, Flags); AddToWorklist(Est.getNode()); } return Est; } -SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { +SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); @@ -13790,8 +14411,8 @@ SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) { AddToWorklist(Est.getNode()); if (Iterations) { Est = UseOneConstNR ? 
- BuildRsqrtNROneConst(Op, Est, Iterations) : - BuildRsqrtNRTwoConst(Op, Est, Iterations); + BuildRsqrtNROneConst(Op, Est, Iterations, Flags) : + BuildRsqrtNRTwoConst(Op, Est, Iterations, Flags); } return Est; } @@ -13955,14 +14576,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SDValue Chain = Chains.pop_back_val(); // For TokenFactor nodes, look at each operand and only continue up the - // chain until we find two aliases. If we've seen two aliases, assume we'll - // find more and revert to original chain since the xform is unlikely to be - // profitable. + // chain until we reach the depth limit. // // FIXME: The depth check could be made to return the last non-aliasing // chain we found before we hit a tokenfactor rather than the original // chain. - if (Depth > 6 || Aliases.size() == 2) { + if (Depth > TLI.getGatherAllAliasesMaxDepth()) { Aliases.clear(); Aliases.push_back(OriginalChain); return; @@ -14094,6 +14713,83 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } +bool DAGCombiner::findBetterNeighborChains(StoreSDNode* St) { + // This holds the base pointer, index, and the offset in bytes from the base + // pointer. + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); + + // We must have a base and an offset. + if (!BasePtr.Base.getNode()) + return false; + + // Do not handle stores to undef base pointers. + if (BasePtr.Base.getOpcode() == ISD::UNDEF) + return false; + + SmallVector<StoreSDNode *, 8> ChainedStores; + ChainedStores.push_back(St); + + // Walk up the chain and look for nodes with offsets from the same + // base pointer. Stop when reaching an instruction with a different kind + // or instruction which has a different base pointer. + StoreSDNode *Index = St; + while (Index) { + // If the chain has more than one use, then we can't reorder the mem ops. + if (Index != St && !SDValue(Index, 0)->hasOneUse()) + break; + + if (Index->isVolatile() || Index->isIndexed()) + break; + + // Find the base pointer and offset for this memory node. + BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr()); + + // Check that the base pointer is the same as the original one. + if (!Ptr.equalBaseIndex(BasePtr)) + break; + + // Find the next memory operand in the chain. If the next operand in the + // chain is a store then move up and continue the scan with the next + // memory operand. If the next operand is a load save it and use alias + // information to check if it interferes with anything. + SDNode *NextInChain = Index->getChain().getNode(); + while (true) { + if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) { + // We found a store node. Use it for the next iteration. + ChainedStores.push_back(STn); + Index = STn; + break; + } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) { + NextInChain = Ldn->getChain().getNode(); + continue; + } else { + Index = nullptr; + break; + } + } + } + + bool MadeChange = false; + SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains; + + for (StoreSDNode *ChainedStore : ChainedStores) { + SDValue Chain = ChainedStore->getChain(); + SDValue BetterChain = FindBetterChain(ChainedStore, Chain); + + if (Chain != BetterChain) { + MadeChange = true; + BetterChains.push_back(std::make_pair(ChainedStore, BetterChain)); + } + } + + // Do all replacements after finding the replacements to make to avoid making + // the chains more complicated by introducing new TokenFactors. 
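findBetterNeighborChains walks up the chain collecting stores that provably share a base pointer, skipping over loads and stopping at anything else; only afterwards are the chains replaced, for the reason the comment above gives. A reduced model of that walk over a singly linked list of memory nodes (Base stands in for BaseIndexOffset::match, and the hasOneUse/volatile guards are omitted); the deferred replacement loop continues below:

#include <cstdio>
#include <vector>

// Reduced memory-node model: each node points up its chain.
struct MemNode {
  enum Kind { Store, Load, Other } K;
  int Base;       // stand-in for BaseIndexOffset's base pointer
  MemNode *Chain; // next node up the chain (null at the entry)
};

int main() {
  // Chain, top to bottom: other <- store(B1) <- load <- store(B1) <- St
  MemNode Top{MemNode::Other, 0, nullptr};
  MemNode S1{MemNode::Store, 1, &Top};
  MemNode L1{MemNode::Load, 2, &S1};
  MemNode S2{MemNode::Store, 1, &L1};
  MemNode St{MemNode::Store, 1, &S2}; // the store we start from

  std::vector<MemNode *> ChainedStores{&St};
  for (MemNode *N = St.Chain; N;) {
    if (N->K == MemNode::Load) { // loads are skipped, not collected
      N = N->Chain;
      continue;
    }
    if (N->K != MemNode::Store || N->Base != St.Base)
      break;                     // different kind or base: stop the scan
    ChainedStores.push_back(N);
    N = N->Chain;
  }
  std::printf("collected %zu chained stores\n", ChainedStores.size());
}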
+ for (auto Replacement : BetterChains) + replaceStoreChain(Replacement.first, Replacement.second); + + return MadeChange; +} + /// This is the entry point for the file. void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, CodeGenOpt::Level OptLevel) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 2b9ba2c..cfbb209 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -118,9 +118,9 @@ bool FastISel::lowerArguments() { for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(), E = FuncInfo.Fn->arg_end(); I != E; ++I) { - DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I); + DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(&*I); assert(VI != LocalValueMap.end() && "Missed an argument?"); - FuncInfo.ValueMap[I] = VI->second; + FuncInfo.ValueMap[&*I] = VI->second; } return true; } @@ -611,7 +611,7 @@ bool FastISel::selectStackmap(const CallInst *I) { // have to worry about calling conventions and target-specific lowering code. // Instead we perform the call lowering right here. // - // CALLSEQ_START(0) + // CALLSEQ_START(0...) // STACKMAP(id, nbytes, ...) // CALLSEQ_END(0, 0) // @@ -647,8 +647,11 @@ bool FastISel::selectStackmap(const CallInst *I) { // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(0); + auto Builder = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)); + const MCInstrDesc &MCID = Builder.getInstr()->getDesc(); + for (unsigned I = 0, E = MCID.getNumOperands(); I < E; ++I) + Builder.addImm(0); // Issue STACKMAP. MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1100,13 +1103,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { // The donothing intrinsic does, well, nothing. case Intrinsic::donothing: return true; - case Intrinsic::eh_actions: { - unsigned ResultReg = getRegForValue(UndefValue::get(II->getType())); - if (!ResultReg) - return false; - updateValueMap(II, ResultReg); - return true; - } case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); assert(DI->getVariable() && "Missing variable"); @@ -1326,12 +1322,38 @@ bool FastISel::selectBitCast(const User *I) { return true; } +// Remove local value instructions starting from the instruction after +// SavedLastLocalValue to the current function insert point. +void FastISel::removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue) +{ + MachineInstr *CurLastLocalValue = getLastLocalValue(); + if (CurLastLocalValue != SavedLastLocalValue) { + // Find the first local value instruction to be deleted. + // This is the instruction after SavedLastLocalValue if it is non-NULL. + // Otherwise it's the first instruction in the block. + MachineBasicBlock::iterator FirstDeadInst(SavedLastLocalValue); + if (SavedLastLocalValue) + ++FirstDeadInst; + else + FirstDeadInst = FuncInfo.MBB->getFirstNonPHI(); + setLastLocalValue(SavedLastLocalValue); + removeDeadCode(FirstDeadInst, FuncInfo.InsertPt); + } +} + bool FastISel::selectInstruction(const Instruction *I) { + MachineInstr *SavedLastLocalValue = getLastLocalValue(); // Just before the terminator instruction, insert instructions to // feed PHI nodes in successor blocks. 
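A small but easy-to-miss fix in the FastISel hunk above: CALLSEQ_START is no longer assumed to take a single immediate; instead one zero is appended per operand that the instruction description declares. The shape of that loop, reduced to a plain operand list (InstrDesc is a stand-in for MCInstrDesc); the terminator and PHI handling that the comment above introduces continues below:

#include <cstdio>
#include <vector>

// Stand-in for the MCInstrDesc of a frame-setup pseudo-instruction.
struct InstrDesc {
  unsigned NumOperands;
};

int main() {
  InstrDesc MCID{2}; // e.g. a CALLSEQ_START taking two immediates
  std::vector<long> Operands;

  // Mirror of the fix: one zero immediate per declared operand,
  // instead of assuming the instruction takes exactly one.
  for (unsigned I = 0, E = MCID.NumOperands; I < E; ++I)
    Operands.push_back(0); // Builder.addImm(0)

  std::printf("emitted %zu zero operands\n", Operands.size());
}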
if (isa<TerminatorInst>(I)) - if (!handlePHINodesInSuccessorBlocks(I->getParent())) + if (!handlePHINodesInSuccessorBlocks(I->getParent())) { + // PHI node handling may have generated local value instructions, + // even though it failed to handle all PHI nodes. + // We remove these instructions because SelectionDAGISel will generate + // them again. + removeDeadLocalValueCode(SavedLastLocalValue); return false; + } DbgLoc = I->getDebugLoc(); @@ -1348,7 +1370,7 @@ bool FastISel::selectInstruction(const Instruction *I) { LibInfo->hasOptimizedCodeGen(Func)) return false; - // Don't handle Intrinsic::trap if a trap funciton is specified. + // Don't handle Intrinsic::trap if a trap function is specified. if (F && F->getIntrinsicID() == Intrinsic::trap && Call->hasFnAttr("trap-func-name")) return false; @@ -1380,8 +1402,12 @@ bool FastISel::selectInstruction(const Instruction *I) { DbgLoc = DebugLoc(); // Undo phi node updates, because they will be added again by SelectionDAG. - if (isa<TerminatorInst>(I)) + if (isa<TerminatorInst>(I)) { + // PHI node handling may have generated local value instructions. + // We remove them because SelectionDAGISel will generate them again. + removeDeadLocalValueCode(SavedLastLocalValue); FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate); + } return false; } @@ -1398,11 +1424,30 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { TII.InsertBranch(*FuncInfo.MBB, MSucc, nullptr, SmallVector<MachineOperand, 0>(), DbgLoc); } - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(FuncInfo.MBB->getBasicBlock(), - MSucc->getBasicBlock()); - FuncInfo.MBB->addSuccessor(MSucc, BranchWeight); + if (FuncInfo.BPI) { + auto BranchProbability = FuncInfo.BPI->getEdgeProbability( + FuncInfo.MBB->getBasicBlock(), MSucc->getBasicBlock()); + FuncInfo.MBB->addSuccessor(MSucc, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(MSucc); +} + +void FastISel::finishCondBranch(const BasicBlock *BranchBB, + MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB) { + // Add TrueMBB as successor unless it is equal to the FalseMBB: This can + // happen in degenerate IR and MachineIR forbids to have a block twice in the + // successor/predecessor lists. + if (TrueMBB != FalseMBB) { + if (FuncInfo.BPI) { + auto BranchProbability = + FuncInfo.BPI->getEdgeProbability(BranchBB, TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(TrueMBB); + } + + fastEmitBranch(FalseMBB, DbgLoc); } /// Emit an FNeg operation. 
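finishCondBranch above has two jobs: never list the same machine block twice as a successor (degenerate IR can branch to one block on both edges, and MachineIR forbids duplicates), and attach an edge probability only when branch-probability info exists. A reduced model of that successor bookkeeping (blocks are integer ids, and the probability printout stands in for addSuccessor):

#include <cstdio>
#include <vector>

struct Block {
  std::vector<int> Successors; // successor ids, duplicates forbidden
};

// HasBPI stands in for "FuncInfo.BPI != nullptr"; Prob is only
// meaningful when it is true.
static void finishCondBranch(Block &BB, int TrueMBB, int FalseMBB,
                             bool HasBPI, double Prob) {
  // Add TrueMBB unless it equals FalseMBB, mirroring the guard above.
  if (TrueMBB != FalseMBB) {
    if (HasBPI)
      std::printf("edge ->%d with probability %.2f\n", TrueMBB, Prob);
    BB.Successors.push_back(TrueMBB);
  }
  BB.Successors.push_back(FalseMBB); // the unconditional fallthrough edge
}

int main() {
  Block BB;
  // Degenerate branch: both edges target block 1; only one entry results.
  finishCondBranch(BB, /*TrueMBB=*/1, /*FalseMBB=*/1, true, 0.5);
  std::printf("successor count: %zu\n", BB.Successors.size()); // prints 1
}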
@@ -1864,21 +1909,18 @@ unsigned FastISel::fastEmitInst_rii(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::fastEmitInst_rf(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, unsigned Op0, - bool Op0IsKill, const ConstantFP *FPImm) { +unsigned FastISel::fastEmitInst_f(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + const ConstantFP *FPImm) { const MCInstrDesc &II = TII.get(MachineInstOpcode); unsigned ResultReg = createResultReg(RC); - Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill)) .addFPImm(FPImm); else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, getKillRegState(Op0IsKill)) .addFPImm(FPImm); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); @@ -1912,35 +1954,6 @@ unsigned FastISel::fastEmitInst_rri(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::fastEmitInst_rrii(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, unsigned Op1, - bool Op1IsKill, uint64_t Imm1, - uint64_t Imm2) { - const MCInstrDesc &II = TII.get(MachineInstOpcode); - - unsigned ResultReg = createResultReg(RC); - Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); - Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); - - if (II.getNumDefs() >= 1) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addReg(Op0, getKillRegState(Op0IsKill)) - .addReg(Op1, getKillRegState(Op1IsKill)) - .addImm(Imm1) - .addImm(Imm2); - else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) - .addReg(Op0, getKillRegState(Op0IsKill)) - .addReg(Op1, getKillRegState(Op1IsKill)) - .addImm(Imm1) - .addImm(Imm2); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); - } - return ResultReg; -} - unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode, const TargetRegisterClass *RC, uint64_t Imm) { unsigned ResultReg = createResultReg(RC); @@ -1957,25 +1970,6 @@ unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode, return ResultReg; } -unsigned FastISel::fastEmitInst_ii(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, uint64_t Imm1, - uint64_t Imm2) { - unsigned ResultReg = createResultReg(RC); - const MCInstrDesc &II = TII.get(MachineInstOpcode); - - if (II.getNumDefs() >= 1) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) - .addImm(Imm1) - .addImm(Imm2); - else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm1) - .addImm(Imm2); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]); - } - return ResultReg; -} - unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index cc306cb..08815ed 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -87,6 +87,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); MachineModuleInfo &MMI = 
MF->getMMI(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); // Check whether the function can return without sret-demotion. SmallVector<ISD::OutputArg, 4> Outs; @@ -103,28 +104,29 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (const AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - // Static allocas can be folded into the initial stack frame adjustment. - if (AI->isStaticAlloca()) { + Type *Ty = AI->getAllocatedType(); + unsigned Align = + std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty), + AI->getAlignment()); + unsigned StackAlign = TFI->getStackAlignment(); + + // Static allocas can be folded into the initial stack frame + // adjustment. For targets that don't realign the stack, don't + // do this if there is an extra alignment requirement. + if (AI->isStaticAlloca() && + (TFI->isStackRealignable() || (Align <= StackAlign))) { const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize()); - Type *Ty = AI->getAllocatedType(); uint64_t TySize = MF->getDataLayout().getTypeAllocSize(Ty); - unsigned Align = - std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty), - AI->getAlignment()); TySize *= CUI->getZExtValue(); // Get total allocated size. if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects. StaticAllocaMap[AI] = MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI); - } else { - unsigned Align = - std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment( - AI->getAllocatedType()), - AI->getAlignment()); - unsigned StackAlign = - MF->getSubtarget().getFrameLowering()->getStackAlignment(); + // FIXME: Overaligned static allocas should be grouped into + // a single dynamic allocation instead of using a separate + // stack allocation for each one. if (Align <= StackAlign) Align = 0; // Inform the Frame Information that we have variable-sized objects. @@ -134,7 +136,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Look for inline asm that clobbers the SP register. if (isa<CallInst>(I) || isa<InvokeInst>(I)) { - ImmutableCallSite CS(I); + ImmutableCallSite CS(&*I); if (isa<InlineAsm>(CS.getCalledValue())) { unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); @@ -163,7 +165,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF->getFrameInfo()->setHasVAStart(true); } - // If we have a musttail call in a variadic funciton, we need to ensure we + // If we have a musttail call in a variadic function, we need to ensure we // forward implicit register parameters. if (const auto *CI = dyn_cast<CallInst>(I)) { if (CI->isMustTailCall() && Fn->isVarArg()) @@ -172,10 +174,9 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Mark values used outside their block as exported, by allocating // a virtual register for them. - if (isUsedOutsideOfDefiningBlock(I)) - if (!isa<AllocaInst>(I) || - !StaticAllocaMap.count(cast<AllocaInst>(I))) - InitializeRegForValue(I); + if (isUsedOutsideOfDefiningBlock(&*I)) + if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(I))) + InitializeRegForValue(&*I); // Collect llvm.dbg.declare information. 
This is done now instead of // during the initial isel pass through the IR so that it is done @@ -205,15 +206,36 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, } // Decide the preferred extend type for a value. - PreferredExtendType[I] = getPreferredExtendForValue(I); + PreferredExtendType[&*I] = getPreferredExtendForValue(&*I); } // Create an initial MachineBasicBlock for each LLVM BasicBlock in F. This // also creates the initial PHI MachineInstrs, though none of the input // operands are populated. for (BB = Fn->begin(); BB != EB; ++BB) { - MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(BB); - MBBMap[BB] = MBB; + // Don't create MachineBasicBlocks for imaginary EH pad blocks. These blocks + // are really data, and no instructions can live here. + if (BB->isEHPad()) { + const Instruction *I = BB->getFirstNonPHI(); + // If this is a non-landingpad EH pad, mark this function as using + // funclets. + // FIXME: SEH catchpads do not create funclets, so we could avoid setting + // this in such cases in order to improve frame layout. + if (!isa<LandingPadInst>(I)) { + MMI.setHasEHFunclets(true); + MF->getFrameInfo()->setHasOpaqueSPAdjustment(true); + } + if (isa<CatchSwitchInst>(I)) { + assert(&*BB->begin() == I && + "WinEHPrepare failed to remove PHIs from imaginary BBs"); + continue; + } + if (isa<FuncletPadInst>(I)) + assert(&*BB->begin() == I && "WinEHPrepare failed to demote PHIs"); + } + + MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(&*BB); + MBBMap[&*BB] = MBB; MF->push_back(MBB); // Transfer the address-taken flag. This is necessary because there could @@ -252,94 +274,62 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Mark landing pad blocks. SmallVector<const LandingPadInst *, 4> LPads; for (BB = Fn->begin(); BB != EB; ++BB) { - if (const auto *Invoke = dyn_cast<InvokeInst>(BB->getTerminator())) - MBBMap[Invoke->getSuccessor(1)]->setIsLandingPad(); - if (BB->isLandingPad()) - LPads.push_back(BB->getLandingPadInst()); + const Instruction *FNP = BB->getFirstNonPHI(); + if (BB->isEHPad() && MBBMap.count(&*BB)) + MBBMap[&*BB]->setIsEHPad(); + if (const auto *LPI = dyn_cast<LandingPadInst>(FNP)) + LPads.push_back(LPI); } - // If this is an MSVC EH personality, we need to do a bit more work. - EHPersonality Personality = EHPersonality::Unknown; - if (Fn->hasPersonalityFn()) - Personality = classifyEHPersonality(Fn->getPersonalityFn()); - if (!isMSVCEHPersonality(Personality)) + // If this personality uses funclets, we need to do a bit more work. + if (!Fn->hasPersonalityFn()) + return; + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (!isFuncletEHPersonality(Personality)) return; - if (Personality == EHPersonality::MSVC_Win64SEH || - Personality == EHPersonality::MSVC_X86SEH) { - addSEHHandlersForLPads(LPads); - } - - WinEHFuncInfo &EHInfo = MMI.getWinEHFuncInfo(&fn); - if (Personality == EHPersonality::MSVC_CXX) { - const Function *WinEHParentFn = MMI.getWinEHParent(&fn); - calculateWinCXXEHStateNumbers(WinEHParentFn, EHInfo); - } - - // Copy the state numbers to LandingPadInfo for the current function, which - // could be a handler or the parent. This should happen for 32-bit SEH and - // C++ EH. 
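Returning to the alloca hunk a little earlier: a static alloca is now folded into the initial stack frame only if the frame can actually satisfy its alignment, meaning the target can realign the stack or the required alignment already fits the default stack alignment. That decision in isolation, with the DataLayout and TargetFrameLowering queries reduced to plain parameters; the removed state-number copying continues below:

#include <algorithm>
#include <cstdio>

// Decide whether a static alloca can be folded into the initial stack
// frame adjustment, per the test added above. PrefAlign stands in for
// DataLayout::getPrefTypeAlignment, StackAlign for
// TargetFrameLowering::getStackAlignment (all in bytes).
static bool foldsIntoFrame(bool IsStaticAlloca, unsigned PrefAlign,
                           unsigned ExplicitAlign, unsigned StackAlign,
                           bool StackRealignable) {
  unsigned Align = std::max(PrefAlign, ExplicitAlign);
  return IsStaticAlloca && (StackRealignable || Align <= StackAlign);
}

int main() {
  // A 32-byte-aligned alloca on a 16-byte-aligned, non-realignable
  // stack must stay a dynamic allocation.
  std::printf("folds: %d\n",
              foldsIntoFrame(true, /*PrefAlign=*/8, /*ExplicitAlign=*/32,
                             /*StackAlign=*/16, /*Realignable=*/false));
}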
- if (Personality == EHPersonality::MSVC_CXX || - Personality == EHPersonality::MSVC_X86SEH) { - for (const LandingPadInst *LP : LPads) { - MachineBasicBlock *LPadMBB = MBBMap[LP->getParent()]; - MMI.addWinEHState(LPadMBB, EHInfo.LandingPadStateMap[LP]); - } - } -} - -void FunctionLoweringInfo::addSEHHandlersForLPads( - ArrayRef<const LandingPadInst *> LPads) { - MachineModuleInfo &MMI = MF->getMMI(); - - // Iterate over all landing pads with llvm.eh.actions calls. - for (const LandingPadInst *LP : LPads) { - const IntrinsicInst *ActionsCall = - dyn_cast<IntrinsicInst>(LP->getNextNode()); - if (!ActionsCall || - ActionsCall->getIntrinsicID() != Intrinsic::eh_actions) - continue; - - // Parse the llvm.eh.actions call we found. - MachineBasicBlock *LPadMBB = MBBMap[LP->getParent()]; - SmallVector<std::unique_ptr<ActionHandler>, 4> Actions; - parseEHActions(ActionsCall, Actions); - - // Iterate EH actions from most to least precedence, which means - // iterating in reverse. - for (auto I = Actions.rbegin(), E = Actions.rend(); I != E; ++I) { - ActionHandler *Action = I->get(); - if (auto *CH = dyn_cast<CatchHandler>(Action)) { - const auto *Filter = - dyn_cast<Function>(CH->getSelector()->stripPointerCasts()); - assert((Filter || CH->getSelector()->isNullValue()) && - "expected function or catch-all"); - const auto *RecoverBA = - cast<BlockAddress>(CH->getHandlerBlockOrFunc()); - MMI.addSEHCatchHandler(LPadMBB, Filter, RecoverBA); + // Calculate state numbers if we haven't already. + WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo(); + if (Personality == EHPersonality::MSVC_CXX) + calculateWinCXXEHStateNumbers(&fn, EHInfo); + else if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&fn, EHInfo); + else if (Personality == EHPersonality::CoreCLR) + calculateClrEHStateNumbers(&fn, EHInfo); + + // Map all BB references in the WinEH data to MBBs. + for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { + for (WinEHHandlerType &H : TBME.HandlerArray) { + if (H.CatchObj.Alloca) { + assert(StaticAllocaMap.count(H.CatchObj.Alloca)); + H.CatchObj.FrameIndex = StaticAllocaMap[H.CatchObj.Alloca]; } else { - assert(isa<CleanupHandler>(Action)); - const auto *Fini = cast<Function>(Action->getHandlerBlockOrFunc()); - MMI.addSEHCleanupHandler(LPadMBB, Fini); + H.CatchObj.FrameIndex = INT_MAX; } + if (H.Handler) + H.Handler = MBBMap[H.Handler.get<const BasicBlock *>()]; } } + for (CxxUnwindMapEntry &UME : EHInfo.CxxUnwindMap) + if (UME.Cleanup) + UME.Cleanup = MBBMap[UME.Cleanup.get<const BasicBlock *>()]; + for (SEHUnwindMapEntry &UME : EHInfo.SEHUnwindMap) { + const BasicBlock *BB = UME.Handler.get<const BasicBlock *>(); + UME.Handler = MBBMap[BB]; + } + for (ClrEHUnwindMapEntry &CME : EHInfo.ClrEHUnwindMap) { + const BasicBlock *BB = CME.Handler.get<const BasicBlock *>(); + CME.Handler = MBBMap[BB]; + } } /// clear - Clear out all the function-specific state. This returns this /// FunctionLoweringInfo to an empty state, ready to be used for a /// different function. 
void FunctionLoweringInfo::clear() { - assert(CatchInfoFound.size() == CatchInfoLost.size() && - "Not all catch info was assigned to a landing pad!"); - MBBMap.clear(); ValueMap.clear(); StaticAllocaMap.clear(); -#ifndef NDEBUG - CatchInfoLost.clear(); - CatchInfoFound.clear(); -#endif LiveOutRegInfo.clear(); VisitedBBs.clear(); ArgDbgValues.clear(); @@ -520,6 +510,17 @@ int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) { return 0; } +unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg( + const Value *CPI, const TargetRegisterClass *RC) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + auto I = CatchPadExceptionPointers.insert({CPI, 0}); + unsigned &VReg = I.first->second; + if (I.second) + VReg = MRI.createVirtualRegister(RC); + assert(VReg && "null vreg in exception pointer table!"); + return VReg; +} + /// ComputeUsesVAFloatArgument - Determine if any floating-point values are /// being passed to this variadic function, and set the MachineModuleInfo's /// usesVAFloatArgument flag if so. This flag is used to emit an undefined @@ -547,10 +548,9 @@ void llvm::ComputeUsesVAFloatArgument(const CallInst &I, /// landingpad instruction and add them to the specified machine module info. void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI, MachineBasicBlock *MBB) { - MMI.addPersonality( - MBB, - cast<Function>( - I.getParent()->getParent()->getPersonalityFn()->stripPointerCasts())); + if (const auto *PF = dyn_cast<Function>( + I.getParent()->getParent()->getPersonalityFn()->stripPointerCasts())) + MMI.addPersonality(PF); if (I.isCleanup()) MMI.addCleanup(MBB); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 5ec1030..a1e2d41 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -139,7 +139,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, UseRC = RC; else if (RC) { const TargetRegisterClass *ComRC = - TRI->getCommonSubClass(UseRC, RC); + TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy); // If multiple uses expect disjoint register classes, we emit // copies in AddRegisterOperand. if (ComRC) diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index fbc8f1e..5d572c4 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -39,6 +39,10 @@ using namespace llvm; #define DEBUG_TYPE "legalizedag" +namespace { + +struct FloatSignAsInt; + //===----------------------------------------------------------------------===// /// This takes an arbitrary SelectionDAG as input and /// hacks on it until the target machine can handle it. This involves @@ -51,7 +55,6 @@ using namespace llvm; /// 'setcc' instruction efficiently, but does support 'brcc' instruction, this /// will attempt merge setcc and brc instructions into brcc's. 
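getCatchPadExceptionPointerVReg above uses the classic insert-then-fill map idiom: try to insert a placeholder, and only allocate when the insertion actually happened. A standalone sketch of the same idiom, with std::unordered_map standing in for MachineRegisterInfo's map and a counter faking register allocation:

#include <unordered_map>

struct VRegCache {
  std::unordered_map<const void *, unsigned> Map;
  unsigned NextVReg = 1; // 0 stays reserved as the "no register" value

  unsigned getOrCreate(const void *Key) {
    auto I = Map.insert({Key, 0});  // no-op if Key was already present
    unsigned &VReg = I.first->second;
    if (I.second)                   // freshly inserted: allocate exactly once
      VReg = NextVReg++;
    return VReg;                    // never 0, mirroring the assert
  }
};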
/// -namespace { class SelectionDAGLegalize { const TargetMachine &TM; const TargetLowering &TLI; @@ -130,7 +133,11 @@ private: SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node); void ExpandDYNAMIC_STACKALLOC(SDNode *Node, SmallVectorImpl<SDValue> &Results); - SDValue ExpandFCOPYSIGN(SDNode *Node); + void getSignAsIntValue(FloatSignAsInt &State, SDLoc DL, SDValue Value) const; + SDValue modifySignAsInt(const FloatSignAsInt &State, SDLoc DL, + SDValue NewIntValue) const; + SDValue ExpandFCOPYSIGN(SDNode *Node) const; + SDValue ExpandFABS(SDNode *Node) const; SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT, SDLoc dl); SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned, @@ -138,6 +145,7 @@ private: SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned, SDLoc dl); + SDValue ExpandBITREVERSE(SDValue Op, SDLoc dl); SDValue ExpandBSWAP(SDValue Op, SDLoc dl); SDValue ExpandBitCount(unsigned Opc, SDValue Op, SDLoc dl); @@ -146,10 +154,11 @@ private: SDValue ExpandVectorBuildThroughStack(SDNode* Node); SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP); + SDValue ExpandConstant(ConstantSDNode *CP); - std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node); - - void ExpandNode(SDNode *Node); + // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall + bool ExpandNode(SDNode *Node); + void ConvertNodeToLibcall(SDNode *Node); void PromoteNode(SDNode *Node); public: @@ -273,17 +282,30 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); if (Extend) { - SDValue Result = - DAG.getExtLoad(ISD::EXTLOAD, dl, OrigVT, - DAG.getEntryNode(), - CPIdx, MachinePointerInfo::getConstantPool(), - VT, false, false, false, Alignment); + SDValue Result = DAG.getExtLoad( + ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT, + false, false, false, Alignment); return Result; } SDValue Result = - DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), false, false, false, - Alignment); + DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); + return Result; +} + +/// Expands the Constant node to a load from the constant pool. +SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) { + SDLoc dl(CP); + EVT VT = CP->getValueType(0); + SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(), + TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); + SDValue Result = + DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); return Result; } @@ -594,13 +616,13 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx, int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); // Store the vector. - SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Tmp1, StackPtr, - MachinePointerInfo::getFixedStack(SPFI), - false, false, 0); + SDValue Ch = DAG.getStore( + DAG.getEntryNode(), dl, Tmp1, StackPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), false, + false, 0); // Truncate or zero extend offset to target pointer type. - unsigned CastOpc = IdxVT.bitsGT(PtrVT) ? 
ISD::TRUNCATE : ISD::ZERO_EXTEND; - Tmp3 = DAG.getNode(CastOpc, dl, PtrVT, Tmp3); + Tmp3 = DAG.getZExtOrTrunc(Tmp3, dl, PtrVT); // Add the offset to the index. unsigned EltSize = EltVT.getSizeInBits()/8; Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3, @@ -610,9 +632,9 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx, Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT, false, false, 0); // Load the updated vector. - return DAG.getLoad(VT, dl, Ch, StackPtr, - MachinePointerInfo::getFixedStack(SPFI), false, false, - false, 0); + return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), SPFI), + false, false, false, 0); } @@ -728,14 +750,12 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { case TargetLowering::Legal: { // If this is an unaligned store and the target doesn't support it, // expand it. + EVT MemVT = ST->getMemoryVT(); unsigned AS = ST->getAddressSpace(); unsigned Align = ST->getAlignment(); - if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - if (Align < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); - } + const DataLayout &DL = DAG.getDataLayout(); + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); break; } case TargetLowering::Custom: { @@ -839,20 +859,16 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); ReplaceNode(SDValue(Node, 0), Result); } else { - switch (TLI.getTruncStoreAction(ST->getValue().getSimpleValueType(), - StVT.getSimpleVT())) { + switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { + EVT MemVT = ST->getMemoryVT(); unsigned AS = ST->getAddressSpace(); unsigned Align = ST->getAlignment(); // If this is an unaligned store and the target doesn't support it, // expand it. - if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DL.getABITypeAlignment(Ty); - if (Align < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); - } + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); break; } case TargetLowering::Custom: { @@ -895,17 +911,14 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { switch (TLI.getOperationAction(Node->getOpcode(), VT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: { + EVT MemVT = LD->getMemoryVT(); unsigned AS = LD->getAddressSpace(); unsigned Align = LD->getAlignment(); + const DataLayout &DL = DAG.getDataLayout(); // If this is an unaligned load and the target doesn't support it, // expand it. 
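The store and load hunks here replace the open-coded pair of checks (does the target tolerate misalignment? is the actual alignment below the ABI alignment?) with a single TLI.allowsMemoryAccess query. The decision being centralized is roughly this predicate (a sketch of the logic only, not the actual hook signature):

// Expansion is needed only when the access is genuinely under-aligned AND
// the target cannot absorb a misaligned access by itself.
bool needsUnalignedExpansion(unsigned Align, unsigned ABIAlign,
                             bool TargetAllowsMisaligned) {
  bool NaturallyAligned = Align >= ABIAlign;
  return !NaturallyAligned && !TargetAllowsMisaligned;
}

Folding the ABI-alignment test into the hook keeps call sites from forgetting half of the condition, which is exactly the duplication the diff deletes.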
- if (!TLI.allowsMisalignedMemoryAccesses(LD->getMemoryVT(), AS, Align)) { - Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - if (Align < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain); - } - } + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain); break; } case TargetLowering::Custom: { @@ -1092,23 +1105,20 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Chain = Res.getValue(1); } } else { - // If this is an unaligned load and the target doesn't support - // it, expand it. + // If this is an unaligned load and the target doesn't support it, + // expand it. EVT MemVT = LD->getMemoryVT(); unsigned AS = LD->getAddressSpace(); unsigned Align = LD->getAlignment(); - if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) { - Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - if (Align < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain); - } - } + const DataLayout &DL = DAG.getDataLayout(); + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) + ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain); } break; } case TargetLowering::Expand: - if (!TLI.isLoadExtLegal(ISD::EXTLOAD, Node->getValueType(0), SrcVT)) { + EVT DestVT = Node->getValueType(0); + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) { // If the source type is not legal, see if there is a legal extload to // an intermediate type that we can then extend further. EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT()); @@ -1127,6 +1137,23 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Chain = Load.getValue(1); break; } + + // Handle the special case of fp16 extloads. EXTLOAD doesn't have the + // normal undefined upper bits behavior to allow using an in-reg extend + // with the illegal FP type, so load as an integer and do the + // from-integer conversion. 
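The f16 comment above ends here, and the code that follows implements exactly that: load the half as an i16 via ZEXTLOAD and convert from the integer form. For reference, a scalar version of such a from-integer conversion, decoding an IEEE-754 binary16 payload by hand (a sketch only; actual codegen just emits FP16_TO_FP):

#include <cstdint>
#include <cstring>

float halfBitsToFloat(uint16_t H) {
  uint32_t Sign = (uint32_t)(H & 0x8000u) << 16;
  uint32_t Exp  = (H >> 10) & 0x1F;
  uint32_t Man  = H & 0x3FFu;
  uint32_t Bits;
  if (Exp == 31) {                        // inf / NaN: widen the payload
    Bits = Sign | 0x7F800000u | (Man << 13);
  } else if (Exp != 0) {                  // normal: rebias the exponent
    Bits = Sign | ((Exp - 15 + 127) << 23) | (Man << 13);
  } else if (Man == 0) {                  // signed zero
    Bits = Sign;
  } else {                                // subnormal: renormalize
    int Shift = 0;
    while (!(Man & 0x400u)) { Man <<= 1; ++Shift; }
    Bits = Sign | ((uint32_t)(113 - Shift) << 23) | ((Man & 0x3FFu) << 13);
  }
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}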
+ if (SrcVT.getScalarType() == MVT::f16) { + EVT ISrcVT = SrcVT.changeTypeToInteger(); + EVT IDestVT = DestVT.changeTypeToInteger(); + EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT()); + + SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT, + Chain, Ptr, ISrcVT, + LD->getMemOperand()); + Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result); + Chain = Result.getValue(1); + break; + } } assert(!SrcVT.isVector() && @@ -1180,15 +1207,17 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { #ifndef NDEBUG for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) - assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == - TargetLowering::TypeLegal && + assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Node->getValueType(i))) && "Unexpected illegal type!"); for (const SDValue &Op : Node->op_values()) - assert((TLI.getTypeAction(*DAG.getContext(), - Op.getValueType()) == TargetLowering::TypeLegal || - Op.getOpcode() == ISD::TargetConstant) && - "Unexpected illegal type!"); + assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) == + TargetLowering::TypeLegal || + TLI.isTypeLegal(Op.getValueType()) || + Op.getOpcode() == ISD::TargetConstant) && + "Unexpected illegal type!"); #endif // Figure out the correct action; the way to query this varies by opcode @@ -1201,6 +1230,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::STACKSAVE: Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getValueType(0)); + break; case ISD::VAARG: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -1229,7 +1262,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::SETCC: case ISD::BR_CC: { unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 : - Node->getOpcode() == ISD::SETCC ? 2 : 1; + Node->getOpcode() == ISD::SETCC ? 2 : + Node->getOpcode() == ISD::SETCCE ? 3 : 1; unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0; MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType(); ISD::CondCode CCCode = @@ -1265,6 +1299,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::FRAME_TO_ARGS_OFFSET: case ISD::EH_SJLJ_SETJMP: case ISD::EH_SJLJ_LONGJMP: + case ISD::EH_SJLJ_SETUP_DISPATCH: // These operations lie about being legal: when they claim to be legal, // they should actually be expanded. Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -1281,6 +1316,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { if (Action == TargetLowering::Legal) Action = TargetLowering::Custom; break; + case ISD::READCYCLECOUNTER: + // READCYCLECOUNTER returns an i64, even if type legalization might have + // expanded that to several smaller types. 
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64); + break; case ISD::READ_REGISTER: case ISD::WRITE_REGISTER: // Named register is legal in the DAG, but blocked by register name @@ -1379,7 +1419,11 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } // FALL THROUGH case TargetLowering::Expand: - ExpandNode(Node); + if (ExpandNode(Node)) + return; + // FALL THROUGH + case TargetLowering::LibCall: + ConvertNodeToLibcall(Node); return; case TargetLowering::Promote: PromoteNode(Node); @@ -1419,6 +1463,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in // the vector. If all are expanded here, we don't want one store per vector // element. + + // Caches for hasPredecessorHelper + SmallPtrSet<const SDNode *, 32> Visited; + SmallVector<const SDNode *, 16> Worklist; + SDValue StackPtr, Ch; for (SDNode::use_iterator UI = Vec.getNode()->use_begin(), UE = Vec.getNode()->use_end(); UI != UE; ++UI) { @@ -1433,6 +1482,12 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode())) continue; + // If the index is dependent on the store we will introduce a cycle when + // creating the load (the load uses the index, and by replacing the chain + // we will make the index dependent on the load). + if (Idx.getNode()->hasPredecessorHelper(ST, Visited, Worklist)) + continue; + StackPtr = ST->getBasePtr(); Ch = SDValue(ST, 0); break; @@ -1490,7 +1545,8 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // First store the whole vector. SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, @@ -1528,7 +1584,8 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { SDLoc dl(Node); SDValue FIPtr = DAG.CreateStackTemporary(VT); int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store of each element to the stack slot. SmallVector<SDValue, 8> Stores; @@ -1568,69 +1625,143 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { false, false, false, 0); } -SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) { - SDLoc dl(Node); - SDValue Tmp1 = Node->getOperand(0); - SDValue Tmp2 = Node->getOperand(1); - - // Get the sign bit of the RHS. First obtain a value that has the same - // sign as the sign bit, i.e. negative if and only if the sign bit is 1. - SDValue SignBit; - EVT FloatVT = Tmp2.getValueType(); - EVT IVT = EVT::getIntegerVT(*DAG.getContext(), FloatVT.getSizeInBits()); +namespace { +/// Keeps track of state when getting the sign of a floating-point value as an +/// integer. +struct FloatSignAsInt { + EVT FloatVT; + SDValue Chain; + SDValue FloatPtr; + SDValue IntPtr; + MachinePointerInfo IntPointerInfo; + MachinePointerInfo FloatPointerInfo; + SDValue IntValue; + APInt SignMask; +}; +} + +/// Bitcast a floating-point value to an integer value. 
Only bitcast the part +/// containing the sign bit if the target has no integer value capable of +/// holding all bits of the floating-point value. +void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State, + SDLoc DL, SDValue Value) const { + EVT FloatVT = Value.getValueType(); + unsigned NumBits = FloatVT.getSizeInBits(); + State.FloatVT = FloatVT; + EVT IVT = EVT::getIntegerVT(*DAG.getContext(), NumBits); + // Convert to an integer of the same size. if (TLI.isTypeLegal(IVT)) { - // Convert to an integer with the same sign bit. - SignBit = DAG.getNode(ISD::BITCAST, dl, IVT, Tmp2); + State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value); + State.SignMask = APInt::getSignBit(NumBits); + return; + } + + auto &DataLayout = DAG.getDataLayout(); + // Store the float to memory, then load the sign part out as an integer. + MVT LoadTy = TLI.getRegisterType(*DAG.getContext(), MVT::i8); + // First create a temporary that is aligned for both the load and store. + SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy); + int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + // Then store the float to it. + State.FloatPtr = StackPtr; + MachineFunction &MF = DAG.getMachineFunction(); + State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FI); + State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr, + State.FloatPointerInfo, false, false, 0); + + SDValue IntPtr; + if (DataLayout.isBigEndian()) { + assert(FloatVT.isByteSized() && "Unsupported floating point type!"); + // Load out a legal integer with the same sign bit as the float. + IntPtr = StackPtr; + State.IntPointerInfo = State.FloatPointerInfo; } else { - auto &DL = DAG.getDataLayout(); - // Store the float to memory, then load the sign part out as an integer. - MVT LoadTy = TLI.getPointerTy(DL); - // First create a temporary that is aligned for both the load and store. - SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy); - // Then store the float to it. - SDValue Ch = - DAG.getStore(DAG.getEntryNode(), dl, Tmp2, StackPtr, MachinePointerInfo(), - false, false, 0); - if (DL.isBigEndian()) { - assert(FloatVT.isByteSized() && "Unsupported floating point type!"); - // Load out a legal integer with the same sign bit as the float. - SignBit = DAG.getLoad(LoadTy, dl, Ch, StackPtr, MachinePointerInfo(), - false, false, false, 0); - } else { // Little endian - SDValue LoadPtr = StackPtr; - // The float may be wider than the integer we are going to load. Advance - // the pointer so that the loaded integer will contain the sign bit. - unsigned Strides = (FloatVT.getSizeInBits()-1)/LoadTy.getSizeInBits(); - unsigned ByteOffset = (Strides * LoadTy.getSizeInBits()) / 8; - LoadPtr = DAG.getNode(ISD::ADD, dl, LoadPtr.getValueType(), LoadPtr, - DAG.getConstant(ByteOffset, dl, - LoadPtr.getValueType())); - // Load a legal integer containing the sign bit. - SignBit = DAG.getLoad(LoadTy, dl, Ch, LoadPtr, MachinePointerInfo(), - false, false, false, 0); - // Move the sign bit to the top bit of the loaded integer. - unsigned BitShift = LoadTy.getSizeInBits() - - (FloatVT.getSizeInBits() - 8 * ByteOffset); - assert(BitShift < LoadTy.getSizeInBits() && "Pointer advanced wrong?"); - if (BitShift) - SignBit = DAG.getNode( - ISD::SHL, dl, LoadTy, SignBit, - DAG.getConstant(BitShift, dl, - TLI.getShiftAmountTy(SignBit.getValueType(), DL))); - } + // Advance the pointer so that the loaded byte will contain the sign bit. 
+ unsigned ByteOffset = (FloatVT.getSizeInBits() / 8) - 1; + IntPtr = DAG.getNode(ISD::ADD, DL, StackPtr.getValueType(), StackPtr, + DAG.getConstant(ByteOffset, DL, StackPtr.getValueType())); + State.IntPointerInfo = MachinePointerInfo::getFixedStack(MF, FI, + ByteOffset); } - // Now get the sign bit proper, by seeing whether the value is negative. - SignBit = DAG.getSetCC(dl, getSetCCResultType(SignBit.getValueType()), - SignBit, - DAG.getConstant(0, dl, SignBit.getValueType()), - ISD::SETLT); - // Get the absolute value of the result. - SDValue AbsVal = DAG.getNode(ISD::FABS, dl, Tmp1.getValueType(), Tmp1); - // Select between the nabs and abs value based on the sign bit of - // the input. - return DAG.getSelect(dl, AbsVal.getValueType(), SignBit, - DAG.getNode(ISD::FNEG, dl, AbsVal.getValueType(), AbsVal), - AbsVal); + + State.IntPtr = IntPtr; + State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain, + IntPtr, State.IntPointerInfo, MVT::i8, + false, false, false, 0); + State.SignMask = APInt::getOneBitSet(LoadTy.getSizeInBits(), 7); +} + +/// Replace the integer value produced by getSignAsIntValue() with a new value +/// and cast the result back to a floating-point type. +SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State, + SDLoc DL, SDValue NewIntValue) const { + if (!State.Chain) + return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue); + + // Override the part containing the sign bit in the value stored on the stack. + SDValue Chain = DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr, + State.IntPointerInfo, MVT::i8, false, false, + 0); + return DAG.getLoad(State.FloatVT, DL, Chain, State.FloatPtr, + State.FloatPointerInfo, false, false, false, 0); +} + +SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const { + SDLoc DL(Node); + SDValue Mag = Node->getOperand(0); + SDValue Sign = Node->getOperand(1); + + // Get sign bit into an integer value. + FloatSignAsInt SignAsInt; + getSignAsIntValue(SignAsInt, DL, Sign); + + EVT IntVT = SignAsInt.IntValue.getValueType(); + SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT); + SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, SignAsInt.IntValue, + SignMask); + + // If FABS is legal transform FCOPYSIGN(x, y) => sign(x) ? -FABS(x) : FABS(X) + EVT FloatVT = Mag.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::FABS, FloatVT) && + TLI.isOperationLegalOrCustom(ISD::FNEG, FloatVT)) { + SDValue AbsValue = DAG.getNode(ISD::FABS, DL, FloatVT, Mag); + SDValue NegValue = DAG.getNode(ISD::FNEG, DL, FloatVT, AbsValue); + SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(IntVT), SignBit, + DAG.getConstant(0, DL, IntVT), ISD::SETNE); + return DAG.getSelect(DL, FloatVT, Cond, NegValue, AbsValue); + } + + // Transform values to integer, copy the sign bit and transform back. + FloatSignAsInt MagAsInt; + getSignAsIntValue(MagAsInt, DL, Mag); + assert(SignAsInt.SignMask == MagAsInt.SignMask); + SDValue ClearSignMask = DAG.getConstant(~SignAsInt.SignMask, DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, MagAsInt.IntValue, + ClearSignMask); + SDValue CopiedSign = DAG.getNode(ISD::OR, DL, IntVT, ClearedSign, SignBit); + + return modifySignAsInt(MagAsInt, DL, CopiedSign); +} + +SDValue SelectionDAGLegalize::ExpandFABS(SDNode *Node) const { + SDLoc DL(Node); + SDValue Value = Node->getOperand(0); + + // Transform FABS(x) => FCOPYSIGN(x, 0.0) if FCOPYSIGN is legal. 
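Once the sign lives in an integer (getSignAsIntValue), FCOPYSIGN reduces to two masks and an OR, and FABS to a single mask. The scalar analogue, for the easy case where the whole float fits in a legal integer type (a sketch, not the DAG code):

#include <cstdint>
#include <cstring>

static const uint32_t SignMask = 0x80000000u; // APInt::getSignBit(32)

float copysignViaBits(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof M);
  std::memcpy(&S, &Sign, sizeof S);
  uint32_t R = (M & ~SignMask) | (S & SignMask); // clear old sign, OR in new
  std::memcpy(&Mag, &R, sizeof R);
  return Mag;
}

float fabsViaBits(float V) {
  uint32_t B;
  std::memcpy(&B, &V, sizeof B);
  B &= ~SignMask;                                // ExpandFABS: just clear it
  std::memcpy(&V, &B, sizeof B);
  return V;
}

The store/load dance in getSignAsIntValue exists only for targets where no legal integer is as wide as the float; there the code touches the single byte containing the sign bit instead.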
+ EVT FloatVT = Value.getValueType(); + if (TLI.isOperationLegalOrCustom(ISD::FCOPYSIGN, FloatVT)) { + SDValue Zero = DAG.getConstantFP(0.0, DL, FloatVT); + return DAG.getNode(ISD::FCOPYSIGN, DL, FloatVT, Value, Zero); + } + + // Transform value to integer, clear the sign bit and transform back. + FloatSignAsInt ValueAsInt; + getSignAsIntValue(ValueAsInt, DL, Value); + EVT IntVT = ValueAsInt.IntValue.getValueType(); + SDValue ClearSignMask = DAG.getConstant(~ValueAsInt.SignMask, DL, IntVT); + SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, ValueAsInt.IntValue, + ClearSignMask); + return modifySignAsInt(ValueAsInt, DL, ClearedSign); } void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, @@ -1798,7 +1929,8 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr); int SPFI = StackPtrFI->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); unsigned SrcSize = SrcOp.getValueType().getSizeInBits(); unsigned SlotSize = SlotVT.getSizeInBits(); @@ -1838,14 +1970,14 @@ SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) { FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(StackPtr); int SPFI = StackPtrFI->getIndex(); - SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(0), - StackPtr, - MachinePointerInfo::getFixedStack(SPFI), - Node->getValueType(0).getVectorElementType(), - false, false, 0); - return DAG.getLoad(Node->getValueType(0), dl, Ch, StackPtr, - MachinePointerInfo::getFixedStack(SPFI), - false, false, false, 0); + SDValue Ch = DAG.getTruncStore( + DAG.getEntryNode(), dl, Node->getOperand(0), StackPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), + Node->getValueType(0).getVectorElementType(), false, false, 0); + return DAG.getLoad( + Node->getValueType(0), dl, Ch, StackPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI), false, + false, false, 0); } static bool @@ -2011,9 +2143,10 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { SDValue CPIdx = DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - return DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + return DAG.getLoad( + VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); } SmallSet<SDValue, 16> DefinedValues; @@ -2205,47 +2338,6 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned, return ExpandLibCall(LC, Node, isSigned); } -/// Return true if divmod libcall is available. -static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, - const TargetLowering &TLI) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; - case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; - case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; - case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; - case MVT::i128: LC= isSigned ? 
RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; - } - - return TLI.getLibcallName(LC) != nullptr; -} - -/// Only issue divrem libcall if both quotient and remainder are needed. -static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) { - // The other use might have been replaced with a divrem already. - unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; - unsigned OtherOpcode = 0; - if (isSigned) - OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV; - else - OtherOpcode = isDIV ? ISD::UREM : ISD::UDIV; - - SDValue Op0 = Node->getOperand(0); - SDValue Op1 = Node->getOperand(1); - for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), - UE = Op0.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User == Node) - continue; - if ((User->getOpcode() == OtherOpcode || User->getOpcode() == DivRemOpc) && - User->getOperand(0) == Op0 && - User->getOperand(1) == Op1) - return true; - } - return false; -} - /// Issue libcalls to __{u}divmod to compute div / rem pairs. void SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, @@ -2428,6 +2520,8 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, SDLoc dl) { + // TODO: Should any fast-math-flags be set for the created nodes? + if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) { // simple 32-bit [signed|unsigned] integer to float/double expansion @@ -2611,14 +2705,15 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, Alignment = std::min(Alignment, 4u); SDValue FudgeInReg; if (DestVT == MVT::f32) - FudgeInReg = DAG.getLoad(MVT::f32, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + FudgeInReg = DAG.getLoad( + MVT::f32, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); else { - SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, - DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, false, Alignment); + SDValue Load = DAG.getExtLoad( + ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, Alignment); HandleSDNode Handle(Load); LegalizeOp(Load.getNode()); FudgeInReg = Handle.getValue(); @@ -2713,6 +2808,31 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation); } +/// Open code the operations for BITREVERSE. +SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, SDLoc dl) { + EVT VT = Op.getValueType(); + EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + unsigned Sz = VT.getScalarSizeInBits(); + + SDValue Tmp, Tmp2; + Tmp = DAG.getConstant(0, dl, VT); + for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) { + if (I < J) + Tmp2 = + DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT)); + else + Tmp2 = + DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT)); + + APInt Shift(Sz, 1); + Shift = Shift.shl(J); + Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT)); + Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2); + } + + return Tmp; +} + /// Open code the operations for BSWAP of the specified operation. SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) { EVT VT = Op.getValueType(); @@ -2821,6 +2941,18 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // This trivially expands to CTLZ. 
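ExpandBITREVERSE above emits one shift, one single-bit AND, and one OR per bit. The same routine in plain C++ (a scalar sketch of the open-coded expansion):

#include <cstdint>

uint32_t bitreverse32(uint32_t X) {
  uint32_t R = 0;
  for (unsigned I = 0, J = 31; I < 32; ++I, --J) {
    // Move bit I of X into bit J of the result: SHL when the bit travels
    // left, SRL when it travels right, then isolate it with an AND.
    uint32_t Moved = (I < J) ? (X << (J - I)) : (X >> (I - J));
    R |= Moved & (1u << J);
  }
  return R;
}

This is linear in the bit width per value (Sz shift/and/or triples for an Sz-bit type), which is why targets with a native bit-reverse instruction should mark the node Legal or Custom instead.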
return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op); case ISD::CTLZ: { + EVT VT = Op.getValueType(); + unsigned len = VT.getSizeInBits(); + + if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { + EVT SetCCVT = getSetCCResultType(VT); + SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op); + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); + return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, + DAG.getConstant(len, dl, VT), CTLZ); + } + // for now, we do this: // x = x | (x >> 1); // x = x | (x >> 2); @@ -2830,9 +2962,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // return popcount(~x); // // Ref: "Hacker's Delight" by Henry Warren - EVT VT = Op.getValueType(); EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - unsigned len = VT.getSizeInBits(); for (unsigned i = 0; (1U << i) <= (len / 2); ++i) { SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT); Op = DAG.getNode(ISD::OR, dl, VT, Op, @@ -2865,16 +2995,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, } } -std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) { - unsigned Opc = Node->getOpcode(); - MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT(); - RTLIB::Libcall LC = RTLIB::getATOMIC(Opc, VT); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!"); - - return ExpandChainLibCall(LC, Node, false); -} - -void SelectionDAGLegalize::ExpandNode(SDNode *Node) { +bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { SmallVector<SDValue, 8> Results; SDLoc dl(Node); SDValue Tmp1, Tmp2, Tmp3, Tmp4; @@ -2888,6 +3009,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl); Results.push_back(Tmp1); break; + case ISD::BITREVERSE: + Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl)); + break; case ISD::BSWAP: Results.push_back(ExpandBSWAP(Node->getOperand(0), dl)); break; @@ -2908,30 +3032,19 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { // preserve the chain and be done. Results.push_back(Node->getOperand(0)); break; + case ISD::READCYCLECOUNTER: + // If the target didn't expand this, just return 'zero' and preserve the + // chain. + Results.append(Node->getNumValues() - 1, + DAG.getConstant(0, dl, Node->getValueType(0))); + Results.push_back(Node->getOperand(0)); + break; case ISD::EH_SJLJ_SETJMP: // If the target didn't expand this, just return 'zero' and preserve the // chain. Results.push_back(DAG.getConstant(0, dl, MVT::i32)); Results.push_back(Node->getOperand(0)); break; - case ISD::ATOMIC_FENCE: { - // If the target didn't lower this, lower it to '__sync_synchronize()' call - // FIXME: handle "fence singlethread" more efficiently. - TargetLowering::ArgListTy Args; - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__sync_synchronize", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args), 0); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - Results.push_back(CallResult.second); - break; - } case ISD::ATOMIC_LOAD: { // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP. 
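The new CTLZ path prefers CTLZ_ZERO_UNDEF wrapped in an explicit zero check over the full bit-smearing sequence. Both strategies in scalar form (a sketch; the fallback is the Hacker's Delight recipe the comment cites):

#include <cstdint>

unsigned ctlz32(uint32_t X) {
  // Preferred: guard a zero-undef count with the select the new code emits.
  if (X == 0)
    return 32;
#if defined(__GNUC__) || defined(__clang__)
  return (unsigned)__builtin_clz(X);   // stands in for CTLZ_ZERO_UNDEF
#else
  // Fallback: smear the top set bit all the way right, then count the
  // remaining leading zeros by popcounting the complement.
  X |= X >> 1;  X |= X >> 2;  X |= X >> 4;  X |= X >> 8;  X |= X >> 16;
  uint32_t V = ~X;
  V = V - ((V >> 1) & 0x55555555u);                 // pairwise popcount
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // nibble sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // byte sums
  return (V * 0x01010101u) >> 24;                   // fold into top byte
#endif
}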
SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0)); @@ -2959,26 +3072,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Swap.getValue(1)); break; } - // By default, atomic intrinsics are marked Legal and lowered. Targets - // which don't support them directly, however, may want libcalls, in which - // case they mark them Expand, and we get here. - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: - case ISD::ATOMIC_CMP_SWAP: { - std::pair<SDValue, SDValue> Tmp = ExpandAtomic(Node); - Results.push_back(Tmp.first); - Results.push_back(Tmp.second); - break; - } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and // splits out the success value as a comparison. Expanding the resulting @@ -3017,21 +3110,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { } break; } - case ISD::TRAP: { - // If this operation is not supported, lower it to 'abort()' call - TargetLowering::ArgListTy Args; - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(Node->getOperand(0)) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("abort", - TLI.getPointerTy(DAG.getDataLayout())), - std::move(Args), 0); - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - Results.push_back(CallResult.second); - break; - } case ISD::FP_ROUND: case ISD::BITCAST: Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0), @@ -3097,6 +3175,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Node->getOperand(0), Tmp1, ISD::SETLT); True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0)); + // TODO: Should any fast-math-flags be set for the FSUB? 
False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, DAG.getNode(ISD::FSUB, dl, VT, Node->getOperand(0), Tmp1)); @@ -3106,57 +3185,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; } - case ISD::VAARG: { - const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); - EVT VT = Node->getValueType(0); - Tmp1 = Node->getOperand(0); - Tmp2 = Node->getOperand(1); - unsigned Align = Node->getConstantOperandVal(3); - - SDValue VAListLoad = - DAG.getLoad(TLI.getPointerTy(DAG.getDataLayout()), dl, Tmp1, Tmp2, - MachinePointerInfo(V), false, false, false, 0); - SDValue VAList = VAListLoad; - - if (Align > TLI.getMinStackArgumentAlignment()) { - assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); - - VAList = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList, - DAG.getConstant(Align - 1, dl, - VAList.getValueType())); - - VAList = DAG.getNode(ISD::AND, dl, VAList.getValueType(), VAList, - DAG.getConstant(-(int64_t)Align, dl, - VAList.getValueType())); - } - - // Increment the pointer, VAList, to the next vaarg - Tmp3 = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList, - DAG.getConstant(DAG.getDataLayout().getTypeAllocSize( - VT.getTypeForEVT(*DAG.getContext())), - dl, VAList.getValueType())); - // Store the incremented VAList to the legalized pointer - Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2, - MachinePointerInfo(V), false, false, 0); - // Load the actual argument out of the pointer VAList - Results.push_back(DAG.getLoad(VT, dl, Tmp3, VAList, MachinePointerInfo(), - false, false, false, 0)); + case ISD::VAARG: + Results.push_back(DAG.expandVAArg(Node)); Results.push_back(Results[0].getValue(1)); break; - } - case ISD::VACOPY: { - // This defaults to loading a pointer from the input and storing it to the - // output, returning the chain. - const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue(); - const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue(); - Tmp1 = DAG.getLoad(TLI.getPointerTy(DAG.getDataLayout()), dl, - Node->getOperand(0), Node->getOperand(2), - MachinePointerInfo(VS), false, false, false, 0); - Tmp1 = DAG.getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), - MachinePointerInfo(VD), false, false, 0); - Results.push_back(Tmp1); + case ISD::VACOPY: + Results.push_back(DAG.expandVACopy(Node)); break; - } case ISD::EXTRACT_VECTOR_ELT: if (Node->getOperand(0).getValueType().getVectorNumElements() == 1) // This must be an access of the only element. Return it. @@ -3302,28 +3337,24 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Node->getOperand(0)); } break; + case ISD::GET_DYNAMIC_AREA_OFFSET: + Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0))); + Results.push_back(Results[0].getValue(0)); + break; case ISD::FCOPYSIGN: Results.push_back(ExpandFCOPYSIGN(Node)); break; case ISD::FNEG: // Expand Y = FNEG(X) -> Y = SUB -0.0, X Tmp1 = DAG.getConstantFP(-0.0, dl, Node->getValueType(0)); + // TODO: If FNEG has fast-math-flags, propagate them to the FSUB. Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1, Node->getOperand(0)); Results.push_back(Tmp1); break; - case ISD::FABS: { - // Expand Y = FABS(X) -> Y = (X >u 0.0) ? X : fneg(X). 
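The select/FSUB pair above is the classic unsigned conversion trick: values below 2^(N-1) convert directly as signed, while larger ones are biased down first and the bias is restored on the integer side. Scalar form (a sketch of the idea, assuming truncating conversion semantics and an in-range input):

#include <cstdint>

uint32_t fptoui32(float F) {
  const float Cut = 2147483648.0f;   // 2^31, exactly representable
  if (F < Cut)                       // the SETLT in the expansion
    return (uint32_t)(int32_t)F;     // plain FP_TO_SINT suffices
  // Bias into signed range, convert, then put the top bit back.
  return (uint32_t)(int32_t)(F - Cut) + 0x80000000u;
}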
- EVT VT = Node->getValueType(0); - Tmp1 = Node->getOperand(0); - Tmp2 = DAG.getConstantFP(0.0, dl, VT); - Tmp2 = DAG.getSetCC(dl, getSetCCResultType(Tmp1.getValueType()), - Tmp1, Tmp2, ISD::SETUGT); - Tmp3 = DAG.getNode(ISD::FNEG, dl, VT, Tmp1); - Tmp1 = DAG.getSelect(dl, VT, Tmp2, Tmp1, Tmp3); - Results.push_back(Tmp1); + case ISD::FABS: + Results.push_back(ExpandFABS(Node)); break; - } case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -3344,25 +3375,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; } - case ISD::FMINNUM: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64, - RTLIB::FMIN_F80, RTLIB::FMIN_F128, - RTLIB::FMIN_PPCF128)); - break; - case ISD::FMAXNUM: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64, - RTLIB::FMAX_F80, RTLIB::FMAX_F128, - RTLIB::FMAX_PPCF128)); - break; - case ISD::FSQRT: - Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, - RTLIB::SQRT_F80, RTLIB::SQRT_F128, - RTLIB::SQRT_PPCF128)); - break; case ISD::FSIN: case ISD::FCOS: { EVT VT = Node->getValueType(0); - bool isSIN = Node->getOpcode() == ISD::FSIN; // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin / // fcos which share the same operand and both are used. if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) || @@ -3370,137 +3385,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { && useSinCos(Node)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0)); - if (!isSIN) + if (Node->getOpcode() == ISD::FCOS) Tmp1 = Tmp1.getValue(1); Results.push_back(Tmp1); - } else if (isSIN) { - Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, - RTLIB::SIN_F80, RTLIB::SIN_F128, - RTLIB::SIN_PPCF128)); - } else { - Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, - RTLIB::COS_F80, RTLIB::COS_F128, - RTLIB::COS_PPCF128)); } break; } - case ISD::FSINCOS: - // Expand into sincos libcall. 
- ExpandSinCosLibCall(Node, Results); - break; - case ISD::FLOG: - Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, - RTLIB::LOG_F80, RTLIB::LOG_F128, - RTLIB::LOG_PPCF128)); - break; - case ISD::FLOG2: - Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, - RTLIB::LOG2_F80, RTLIB::LOG2_F128, - RTLIB::LOG2_PPCF128)); - break; - case ISD::FLOG10: - Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, - RTLIB::LOG10_F80, RTLIB::LOG10_F128, - RTLIB::LOG10_PPCF128)); - break; - case ISD::FEXP: - Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, - RTLIB::EXP_F80, RTLIB::EXP_F128, - RTLIB::EXP_PPCF128)); - break; - case ISD::FEXP2: - Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, - RTLIB::EXP2_F80, RTLIB::EXP2_F128, - RTLIB::EXP2_PPCF128)); - break; - case ISD::FTRUNC: - Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, - RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, - RTLIB::TRUNC_PPCF128)); - break; - case ISD::FFLOOR: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, - RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, - RTLIB::FLOOR_PPCF128)); - break; - case ISD::FCEIL: - Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64, - RTLIB::CEIL_F80, RTLIB::CEIL_F128, - RTLIB::CEIL_PPCF128)); - break; - case ISD::FRINT: - Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, - RTLIB::RINT_F80, RTLIB::RINT_F128, - RTLIB::RINT_PPCF128)); - break; - case ISD::FNEARBYINT: - Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, - RTLIB::NEARBYINT_F64, - RTLIB::NEARBYINT_F80, - RTLIB::NEARBYINT_F128, - RTLIB::NEARBYINT_PPCF128)); - break; - case ISD::FROUND: - Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32, - RTLIB::ROUND_F64, - RTLIB::ROUND_F80, - RTLIB::ROUND_F128, - RTLIB::ROUND_PPCF128)); - break; - case ISD::FPOWI: - Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, - RTLIB::POWI_F80, RTLIB::POWI_F128, - RTLIB::POWI_PPCF128)); - break; - case ISD::FPOW: - Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, - RTLIB::POW_F80, RTLIB::POW_F128, - RTLIB::POW_PPCF128)); - break; - case ISD::FDIV: - Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64, - RTLIB::DIV_F80, RTLIB::DIV_F128, - RTLIB::DIV_PPCF128)); - break; - case ISD::FREM: - Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64, - RTLIB::REM_F80, RTLIB::REM_F128, - RTLIB::REM_PPCF128)); - break; - case ISD::FMA: - Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64, - RTLIB::FMA_F80, RTLIB::FMA_F128, - RTLIB::FMA_PPCF128)); - break; case ISD::FMAD: llvm_unreachable("Illegal fmad should never be formed"); - case ISD::FADD: - Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64, - RTLIB::ADD_F80, RTLIB::ADD_F128, - RTLIB::ADD_PPCF128)); - break; - case ISD::FMUL: - Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64, - RTLIB::MUL_F80, RTLIB::MUL_F128, - RTLIB::MUL_PPCF128)); - break; - case ISD::FP16_TO_FP: { - if (Node->getValueType(0) == MVT::f32) { - Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); - break; + case ISD::FP16_TO_FP: + if (Node->getValueType(0) != MVT::f32) { + // We can extend to types bigger than f32 in two steps without changing + // the result. 
Since "f16 -> f32" is much more commonly available, give + // CodeGen the option of emitting that before resorting to a libcall. + SDValue Res = + DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0)); + Results.push_back( + DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res)); } - - // We can extend to types bigger than f32 in two steps without changing the - // result. Since "f16 -> f32" is much more commonly available, give CodeGen - // the option of emitting that before resorting to a libcall. - SDValue Res = - DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0)); - Results.push_back( - DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res)); break; - } - case ISD::FP_TO_FP16: { + case ISD::FP_TO_FP16: if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); @@ -3512,16 +3417,9 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { DAG.getIntPtrConstant(0, dl)); Results.push_back( DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, FloatVal)); - break; } } - - RTLIB::Libcall LC = - RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16"); - Results.push_back(ExpandLibCall(LC, Node, false)); break; - } case ISD::ConstantFP: { ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node); // Check to see if this FP immediate is already legal. @@ -3530,17 +3428,19 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(ExpandConstantFP(CFP, true)); break; } + case ISD::Constant: { + ConstantSDNode *CP = cast<ConstantSDNode>(Node); + Results.push_back(ExpandConstant(CP)); + break; + } case ISD::FSUB: { EVT VT = Node->getValueType(0); if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) && TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) { + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(Node)->Flags; Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); - Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1); + Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags); Results.push_back(Tmp1); - } else { - Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, - RTLIB::SUB_F80, RTLIB::SUB_F128, - RTLIB::SUB_PPCF128)); } break; } @@ -3564,29 +3464,17 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { unsigned DivRemOpc = isSigned ? 
ISD::SDIVREM : ISD::UDIVREM; Tmp2 = Node->getOperand(0); Tmp3 = Node->getOperand(1); - if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || - (isDivRemLibcallAvailable(Node, isSigned, TLI) && - // If div is legal, it's better to do the normal expansion - !TLI.isOperationLegalOrCustom(DivOpc, Node->getValueType(0)) && - useDivRem(Node, isSigned, false))) { + if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) { SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1); + Results.push_back(Tmp1); } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) { // X % Y -> X-X/Y*Y Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3); Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1); - } else if (isSigned) - Tmp1 = ExpandIntLibCall(Node, true, - RTLIB::SREM_I8, - RTLIB::SREM_I16, RTLIB::SREM_I32, - RTLIB::SREM_I64, RTLIB::SREM_I128); - else - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::UREM_I8, - RTLIB::UREM_I16, RTLIB::UREM_I32, - RTLIB::UREM_I64, RTLIB::UREM_I128); - Results.push_back(Tmp1); + Results.push_back(Tmp1); + } break; } case ISD::UDIV: @@ -3594,23 +3482,12 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { bool isSigned = Node->getOpcode() == ISD::SDIV; unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; EVT VT = Node->getValueType(0); - SDVTList VTs = DAG.getVTList(VT, VT); - if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) || - (isDivRemLibcallAvailable(Node, isSigned, TLI) && - useDivRem(Node, isSigned, true))) + if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) { + SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0), Node->getOperand(1)); - else if (isSigned) - Tmp1 = ExpandIntLibCall(Node, true, - RTLIB::SDIV_I8, - RTLIB::SDIV_I16, RTLIB::SDIV_I32, - RTLIB::SDIV_I64, RTLIB::SDIV_I128); - else - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::UDIV_I8, - RTLIB::UDIV_I16, RTLIB::UDIV_I32, - RTLIB::UDIV_I64, RTLIB::UDIV_I128); - Results.push_back(Tmp1); + Results.push_back(Tmp1); + } break; } case ISD::MULHU: @@ -3626,11 +3503,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1.getValue(1)); break; } - case ISD::SDIVREM: - case ISD::UDIVREM: - // Expand into divrem libcall - ExpandDivRemLibCall(Node, Results); - break; case ISD::MUL: { EVT VT = Node->getValueType(0); SDVTList VTs = DAG.getVTList(VT, VT); @@ -3673,14 +3545,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { TLI.getShiftAmountTy(HalfType, DAG.getDataLayout())); Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift); Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi)); - break; } - - Tmp1 = ExpandIntLibCall(Node, false, - RTLIB::MUL_I8, - RTLIB::MUL_I16, RTLIB::MUL_I32, - RTLIB::MUL_I64, RTLIB::MUL_I128); - Results.push_back(Tmp1); break; } case ISD::SADDO: @@ -3867,9 +3732,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); - SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr, - MachinePointerInfo::getJumpTable(), MemVT, - false, false, false, 0); + SDValue LD = DAG.getExtLoad( + ISD::SEXTLOAD, dl, PTy, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT, + false, false, false, 0); Addr = LD; if (TM.getRelocationModel() == Reloc::PIC_) { // For PIC, the sequence is: @@ -4092,16 +3958,276 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { } // Replace the original node with the legalized result. 
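When only division is legal, SREM/UREM are rebuilt from it exactly as the comment says: X % Y -> X - X/Y*Y. Scalar form (a sketch; Y is assumed nonzero, matching the IR's undefined-behavior rule for division by zero):

#include <cstdint>

uint32_t uremViaUdiv(uint32_t X, uint32_t Y) {
  uint32_t Q = X / Y;   // UDIV
  return X - Q * Y;     // MUL + SUB reconstruct the remainder
}

DIVREM is still preferred when legal, since one node yields both results; the integer libcall variants now live in ConvertNodeToLibcall below.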
+ if (Results.empty()) + return false; + + ReplaceNode(Node, Results.data()); + return true; +} + +void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { + SmallVector<SDValue, 8> Results; + SDLoc dl(Node); + SDValue Tmp1, Tmp2, Tmp3, Tmp4; + unsigned Opc = Node->getOpcode(); + switch (Opc) { + case ISD::ATOMIC_FENCE: { + // If the target didn't lower this, lower it to '__sync_synchronize()' call + // FIXME: handle "fence singlethread" more efficiently. + TargetLowering::ArgListTy Args; + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Node->getOperand(0)) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__sync_synchronize", + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args), 0); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + Results.push_back(CallResult.second); + break; + } + // By default, atomic intrinsics are marked Legal and lowered. Targets + // which don't support them directly, however, may want libcalls, in which + // case they mark them Expand, and we get here. + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_CMP_SWAP: { + MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT(); + RTLIB::Libcall LC = RTLIB::getATOMIC(Opc, VT); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!"); + + std::pair<SDValue, SDValue> Tmp = ExpandChainLibCall(LC, Node, false); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + break; + } + case ISD::TRAP: { + // If this operation is not supported, lower it to 'abort()' call + TargetLowering::ArgListTy Args; + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Node->getOperand(0)) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("abort", + TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args), 0); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + Results.push_back(CallResult.second); + break; + } + case ISD::FMINNUM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64, + RTLIB::FMIN_F80, RTLIB::FMIN_F128, + RTLIB::FMIN_PPCF128)); + break; + case ISD::FMAXNUM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64, + RTLIB::FMAX_F80, RTLIB::FMAX_F128, + RTLIB::FMAX_PPCF128)); + break; + case ISD::FSQRT: + Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, + RTLIB::SQRT_F80, RTLIB::SQRT_F128, + RTLIB::SQRT_PPCF128)); + break; + case ISD::FSIN: + Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, + RTLIB::SIN_F80, RTLIB::SIN_F128, + RTLIB::SIN_PPCF128)); + break; + case ISD::FCOS: + Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, + RTLIB::COS_F80, RTLIB::COS_F128, + RTLIB::COS_PPCF128)); + break; + case ISD::FSINCOS: + // Expand into sincos libcall. 
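ConvertNodeToLibcall turns nodes the target marked LibCall (or that ExpandNode declined) into plain runtime calls; the ATOMIC_FENCE and TRAP cases show the shape. At the C++ level the emitted code is equivalent to the following (a sketch; __sync_synchronize is the compiler-runtime symbol named in the hunk):

#include <cstdlib>

extern "C" void __sync_synchronize(); // full barrier from libgcc/compiler-rt

void loweredFence() { __sync_synchronize(); } // ATOMIC_FENCE -> libcall
void loweredTrap()  { std::abort(); }         // TRAP -> abort()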
+ ExpandSinCosLibCall(Node, Results); + break; + case ISD::FLOG: + Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, + RTLIB::LOG_F80, RTLIB::LOG_F128, + RTLIB::LOG_PPCF128)); + break; + case ISD::FLOG2: + Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, + RTLIB::LOG2_F80, RTLIB::LOG2_F128, + RTLIB::LOG2_PPCF128)); + break; + case ISD::FLOG10: + Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, + RTLIB::LOG10_F80, RTLIB::LOG10_F128, + RTLIB::LOG10_PPCF128)); + break; + case ISD::FEXP: + Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, + RTLIB::EXP_F80, RTLIB::EXP_F128, + RTLIB::EXP_PPCF128)); + break; + case ISD::FEXP2: + Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, + RTLIB::EXP2_F80, RTLIB::EXP2_F128, + RTLIB::EXP2_PPCF128)); + break; + case ISD::FTRUNC: + Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, + RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, + RTLIB::TRUNC_PPCF128)); + break; + case ISD::FFLOOR: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, + RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, + RTLIB::FLOOR_PPCF128)); + break; + case ISD::FCEIL: + Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64, + RTLIB::CEIL_F80, RTLIB::CEIL_F128, + RTLIB::CEIL_PPCF128)); + break; + case ISD::FRINT: + Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, + RTLIB::RINT_F80, RTLIB::RINT_F128, + RTLIB::RINT_PPCF128)); + break; + case ISD::FNEARBYINT: + Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, + RTLIB::NEARBYINT_F64, + RTLIB::NEARBYINT_F80, + RTLIB::NEARBYINT_F128, + RTLIB::NEARBYINT_PPCF128)); + break; + case ISD::FROUND: + Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32, + RTLIB::ROUND_F64, + RTLIB::ROUND_F80, + RTLIB::ROUND_F128, + RTLIB::ROUND_PPCF128)); + break; + case ISD::FPOWI: + Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, + RTLIB::POWI_F80, RTLIB::POWI_F128, + RTLIB::POWI_PPCF128)); + break; + case ISD::FPOW: + Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, + RTLIB::POW_F80, RTLIB::POW_F128, + RTLIB::POW_PPCF128)); + break; + case ISD::FDIV: + Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64, + RTLIB::DIV_F80, RTLIB::DIV_F128, + RTLIB::DIV_PPCF128)); + break; + case ISD::FREM: + Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64, + RTLIB::REM_F80, RTLIB::REM_F128, + RTLIB::REM_PPCF128)); + break; + case ISD::FMA: + Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64, + RTLIB::FMA_F80, RTLIB::FMA_F128, + RTLIB::FMA_PPCF128)); + break; + case ISD::FADD: + Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64, + RTLIB::ADD_F80, RTLIB::ADD_F128, + RTLIB::ADD_PPCF128)); + break; + case ISD::FMUL: + Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64, + RTLIB::MUL_F80, RTLIB::MUL_F128, + RTLIB::MUL_PPCF128)); + break; + case ISD::FP16_TO_FP: + if (Node->getValueType(0) == MVT::f32) { + Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); + } + break; + case ISD::FP_TO_FP16: { + RTLIB::Libcall LC = + RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16"); + Results.push_back(ExpandLibCall(LC, Node, false)); + break; + } + case ISD::FSUB: + Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, + 
RTLIB::SUB_F80, RTLIB::SUB_F128, + RTLIB::SUB_PPCF128)); + break; + case ISD::SREM: + Results.push_back(ExpandIntLibCall(Node, true, + RTLIB::SREM_I8, + RTLIB::SREM_I16, RTLIB::SREM_I32, + RTLIB::SREM_I64, RTLIB::SREM_I128)); + break; + case ISD::UREM: + Results.push_back(ExpandIntLibCall(Node, false, + RTLIB::UREM_I8, + RTLIB::UREM_I16, RTLIB::UREM_I32, + RTLIB::UREM_I64, RTLIB::UREM_I128)); + break; + case ISD::SDIV: + Results.push_back(ExpandIntLibCall(Node, true, + RTLIB::SDIV_I8, + RTLIB::SDIV_I16, RTLIB::SDIV_I32, + RTLIB::SDIV_I64, RTLIB::SDIV_I128)); + break; + case ISD::UDIV: + Results.push_back(ExpandIntLibCall(Node, false, + RTLIB::UDIV_I8, + RTLIB::UDIV_I16, RTLIB::UDIV_I32, + RTLIB::UDIV_I64, RTLIB::UDIV_I128)); + break; + case ISD::SDIVREM: + case ISD::UDIVREM: + // Expand into divrem libcall + ExpandDivRemLibCall(Node, Results); + break; + case ISD::MUL: + Results.push_back(ExpandIntLibCall(Node, false, + RTLIB::MUL_I8, + RTLIB::MUL_I16, RTLIB::MUL_I32, + RTLIB::MUL_I64, RTLIB::MUL_I128)); + break; + } + + // Replace the original node with the legalized result. if (!Results.empty()) ReplaceNode(Node, Results.data()); } +// Determine the vector type to use in place of an original scalar element when +// promoting equally sized vectors. +static MVT getPromotedVectorElementType(const TargetLowering &TLI, + MVT EltVT, MVT NewEltVT) { + unsigned OldEltsPerNewElt = EltVT.getSizeInBits() / NewEltVT.getSizeInBits(); + MVT MidVT = MVT::getVectorVT(NewEltVT, OldEltsPerNewElt); + assert(TLI.isTypeLegal(MidVT) && "unexpected"); + return MidVT; +} + void SelectionDAGLegalize::PromoteNode(SDNode *Node) { SmallVector<SDValue, 8> Results; MVT OVT = Node->getSimpleValueType(0); if (Node->getOpcode() == ISD::UINT_TO_FP || Node->getOpcode() == ISD::SINT_TO_FP || - Node->getOpcode() == ISD::SETCC) { + Node->getOpcode() == ISD::SETCC || + Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Node->getOpcode() == ISD::INSERT_VECTOR_ELT) { OVT = Node->getOperand(0).getSimpleValueType(); } if (Node->getOpcode() == ISD::BR_CC) @@ -4284,11 +4410,11 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FREM: case ISD::FMINNUM: case ISD::FMAXNUM: - case ISD::FCOPYSIGN: case ISD::FPOW: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); - Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2); + Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, + Node->getFlags()); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3, DAG.getIntPtrConstant(0, dl))); break; @@ -4303,12 +4429,20 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { DAG.getIntPtrConstant(0, dl))); break; } + case ISD::FCOPYSIGN: case ISD::FPOWI: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = Node->getOperand(1); Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2); + + // fcopysign doesn't change anything but the sign bit, so + // (fp_round (fcopysign (fpext a), b)) + // is as precise as + // (fp_round (fpext a)) + // which is a no-op. Mark it as a TRUNCating FP_ROUND. 
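+    // (An illustrative C-level check of that claim, assuming IEEE types
+    // with OVT = float promoted to NVT = double:
+    //   double e = (double)a;          // FP_EXTEND, exact
+    //   double c = copysign(e, b);     // only the sign bit of e changes
+    //   assert((double)(float)c == c); // rounding back to float is exact
+    // so the FP_ROUND emitted below may carry the truncating flag.)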
+ const bool isTrunc = (Node->getOpcode() == ISD::FCOPYSIGN); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, - Tmp3, DAG.getIntPtrConstant(0, dl))); + Tmp3, DAG.getIntPtrConstant(isTrunc, dl))); break; } case ISD::FFLOOR: @@ -4333,6 +4467,157 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp2, DAG.getIntPtrConstant(0, dl))); break; } + case ISD::BUILD_VECTOR: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size + // + // e.g. v2i64 = build_vector i64:x, i64:y => v4i32 + // => + // v4i32 = concat_vectors (v2i32 (bitcast i64:x)), (v2i32 (bitcast i64:y)) + + assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && + "Invalid promote type for build_vector"); + assert(NewEltVT.bitsLT(EltVT) && "not handled"); + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + + SmallVector<SDValue, 8> NewOps; + for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) { + SDValue Op = Node->getOperand(I); + NewOps.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), MidVT, Op)); + } + + SDLoc SL(Node); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps); + SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat); + Results.push_back(CvtVec); + break; + } + case ISD::EXTRACT_VECTOR_ELT: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size. + // + // e.g. v2i64 = extract_vector_elt x:v2i64, y:i32 + // => + // v4i32:castx = bitcast x:v2i64 + // + // i64 = bitcast + // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))), + // (i32 (extract_vector_elt castx, (2 * y + 1))) + // + + assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && + "Invalid promote type for extract_vector_elt"); + assert(NewEltVT.bitsLT(EltVT) && "not handled"); + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + unsigned NewEltsPerOldElt = MidVT.getVectorNumElements(); + + SDValue Idx = Node->getOperand(1); + EVT IdxVT = Idx.getValueType(); + SDLoc SL(Node); + SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SL, IdxVT); + SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor); + + SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0)); + + SmallVector<SDValue, 8> NewOps; + for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { + SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT); + SDValue TmpIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT, + CastVec, TmpIdx); + NewOps.push_back(Elt); + } + + SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps); + + Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec)); + break; + } + case ISD::INSERT_VECTOR_ELT: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size + // + // e.g. 
v2i64 = insert_vector_elt x:v2i64, y:i64, z:i32 + // => + // v4i32:castx = bitcast x:v2i64 + // v2i32:casty = bitcast y:i64 + // + // v2i64 = bitcast + // (v4i32 insert_vector_elt + // (v4i32 insert_vector_elt v4i32:castx, + // (extract_vector_elt casty, 0), 2 * z), + // (extract_vector_elt casty, 1), (2 * z + 1)) + + assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && + "Invalid promote type for insert_vector_elt"); + assert(NewEltVT.bitsLT(EltVT) && "not handled"); + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + unsigned NewEltsPerOldElt = MidVT.getVectorNumElements(); + + SDValue Val = Node->getOperand(1); + SDValue Idx = Node->getOperand(2); + EVT IdxVT = Idx.getValueType(); + SDLoc SL(Node); + + SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SDLoc(), IdxVT); + SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor); + + SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0)); + SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val); + + SDValue NewVec = CastVec; + for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { + SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT); + SDValue InEltIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT, + CastVal, IdxOffset); + + NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NVT, + NewVec, Elt, InEltIdx); + } + + Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewVec)); + break; + } + case ISD::SCALAR_TO_VECTOR: { + MVT EltVT = OVT.getVectorElementType(); + MVT NewEltVT = NVT.getVectorElementType(); + + // Handle bitcasts to a different vector type with the same total bit size. + // + // e.g. v2i64 = scalar_to_vector x:i64 + // => + // concat_vectors (v2i32 bitcast x:i64), (v2i32 undef) + // + + MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); + SDValue Val = Node->getOperand(0); + SDLoc SL(Node); + + SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val); + SDValue Undef = DAG.getUNDEF(MidVT); + + SmallVector<SDValue, 8> NewElts; + NewElts.push_back(CastVal); + for (unsigned I = 1, NElts = OVT.getVectorNumElements(); I != NElts; ++I) + NewElts.push_back(Undef); + + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewElts); + SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat); + Results.push_back(CvtVec); + break; + } } // Replace the original node with the legalized result. @@ -4356,7 +4641,7 @@ void SelectionDAG::Legalize() { for (auto NI = allnodes_end(); NI != allnodes_begin();) { --NI; - SDNode *N = NI; + SDNode *N = &*NI; if (N->use_empty() && N != getRoot().getNode()) { ++NI; DeleteNode(N); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 3c50a41..6c0193a 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -43,10 +43,10 @@ static RTLIB::Libcall GetFPLibCall(EVT VT, } //===----------------------------------------------------------------------===// -// Result Float to Integer Conversion. +// Convert Float Results to Integer for Non-HW-supported Operations.
//===----------------------------------------------------------------------===// -void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { +bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"); SDValue R = SDValue(); @@ -59,20 +59,26 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { #endif llvm_unreachable("Do not know how to soften the result of this operator!"); + case ISD::Register: + case ISD::CopyFromReg: + case ISD::CopyToReg: + assert(isLegalInHWReg(N->getValueType(ResNo)) && + "Unsupported SoftenFloatRes opcode!"); + // Only when isLegalInHWReg, we can skip check of the operands. + R = SDValue(N, ResNo); + break; case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break; - case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break; + case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break; case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break; - case ISD::ConstantFP: - R = SoftenFloatRes_ConstantFP(cast<ConstantFPSDNode>(N)); - break; + case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break; case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break; - case ISD::FABS: R = SoftenFloatRes_FABS(N); break; + case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break; case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; - case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break; + case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break; case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break; case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break; case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break; @@ -84,7 +90,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMA: R = SoftenFloatRes_FMA(N); break; case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break; case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break; - case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break; + case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break; case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break; case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; @@ -97,9 +103,9 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; - case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; - case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; - case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; + case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break; + case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break; + case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break; case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break; @@ -107,11 +113,19 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { } // If R is null, the sub-method took care of registering the result. 
- if (R.getNode()) + if (R.getNode()) { SetSoftenedFloat(SDValue(N, ResNo), R); + ReplaceSoftenFloatResult(N, ResNo, R); + } + // Return true only if the node is changed, + // assuming that the operands are also converted when necessary. + // Otherwise, return false to tell caller to scan operands. + return R.getNode() && R.getNode() != N; } -SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); return BitConvertToInteger(N->getOperand(0)); } @@ -130,10 +144,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) { BitConvertToInteger(N->getOperand(1))); } -SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) { - return DAG.getConstant(N->getValueAPF().bitcastToAPInt(), SDLoc(N), +SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, we can load better from the constant pool. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); + ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N); + return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN), TLI.getTypeToTransformTo(*DAG.getContext(), - N->getValueType(0))); + CN->getValueType(0))); } SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { @@ -143,7 +161,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) { NewOp, N->getOperand(1)); } -SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FABS can be implemented as native bitwise operations. + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned Size = NVT.getSizeInBits(); @@ -165,7 +186,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) { RTLIB::FMIN_F80, RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { @@ -178,7 +199,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) { RTLIB::FMAX_F80, RTLIB::FMAX_F128, RTLIB::FMAX_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { @@ -191,7 +212,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { RTLIB::ADD_F80, RTLIB::ADD_F128, RTLIB::ADD_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { @@ -203,10 +224,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { RTLIB::CEIL_F80, RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise operations. 
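+  // (Bit-level sketch of that claim for an f64 kept in a register,
+  // assuming IEEE-754 layout:
+  //   uint64_t SignMask = 1ull << 63;
+  //   bits(r) = (bits(x) & ~SignMask) | (bits(y) & SignMask);
+  // so no libcall is required on this path.)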
+ if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(0)); SDValue RHS = BitConvertToInteger(N->getOperand(1)); SDLoc dl(N); @@ -263,7 +287,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) { RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { @@ -276,7 +300,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { @@ -288,7 +312,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { @@ -300,7 +324,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { @@ -312,7 +336,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { @@ -324,7 +348,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { @@ -336,7 +360,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) { @@ -348,7 +372,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) { RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { @@ -362,7 +386,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), - NVT, Ops, 3, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) { @@ -375,7 +399,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) { RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { @@ -387,10 +411,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { RTLIB::NEARBYINT_F80, RTLIB::NEARBYINT_F128, RTLIB::NEARBYINT_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) { + // When LegalInHWReg, FNEG can be implemented as native bitwise operations. 
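+  // (Likewise, assuming IEEE-754 layout, FNEG on an f64 in a register is
+  // a single sign-bit flip:
+  //   bits(r) = bits(x) ^ (1ull << 63);
+  // which avoids the SUB(-0.0, x) libcall used on the softened path below.)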
+ if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); // Expand Y = FNEG(X) -> Y = SUB -0.0, X @@ -402,7 +429,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - NVT, Ops, 2, false, dl).first; + NVT, Ops, false, dl).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { @@ -418,11 +445,20 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { SoftenFloatResult(Op.getNode(), 0); } + if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) { + Op = GetPromotedFloat(Op); + // If the promotion did the FP_EXTEND to the destination type for us, + // there's nothing left to do here. + if (Op.getValueType() == N->getValueType(0)) { + return BitConvertToInteger(Op); + } + } + RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat) Op = GetSoftenedFloat(Op); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); - return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special @@ -430,7 +466,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32); SDValue Op = N->getOperand(0); - SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, &Op, 1, + SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op, false, SDLoc(N)).first; if (N->getValueType(0) == MVT::f32) return Res32; @@ -438,7 +474,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); - return TLI.makeLibCall(DAG, LC, NVT, &Res32, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, NVT, Res32, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { @@ -452,7 +488,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!"); - return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { @@ -465,7 +501,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { @@ -479,7 +515,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { @@ -492,7 +528,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { RTLIB::REM_F80, RTLIB::REM_F128, RTLIB::REM_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue 
DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) { @@ -504,7 +540,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) { RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) { @@ -516,7 +552,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) { RTLIB::ROUND_F80, RTLIB::ROUND_F128, RTLIB::ROUND_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) { @@ -528,7 +564,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) { RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { @@ -540,7 +576,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { @@ -553,7 +589,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - NVT, Ops, 2, false, SDLoc(N)).first; + NVT, Ops, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { @@ -568,10 +604,11 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, RTLIB::TRUNC_PPCF128), - NVT, &Op, 1, false, SDLoc(N)).first; + NVT, Op, false, SDLoc(N)).first; } -SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) { + bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo)); LoadSDNode *L = cast<LoadSDNode>(N); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); @@ -586,7 +623,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { L->getAAInfo()); // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); + if (N != NewL.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); return NewL; } @@ -600,17 +638,24 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1)); - return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL)); + auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL); + if (LegalInHWReg) + return ExtendNode; + return BitConvertToInteger(ExtendNode); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(1)); SDValue RHS = GetSoftenedFloat(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } -SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) { + if (isLegalInHWReg(N->getValueType(ResNo))) + return SDValue(N, ResNo); SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), @@ -636,7 +681,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) { // Legalized the chain result - switch anything that used the old chain to // use the new one. - ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); + if (N != NewVAARG.getValue(1).getNode()) + ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1)); return NewVAARG; } @@ -665,12 +711,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { NVT, N->getOperand(0)); return TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), - &Op, 1, Signed, dl).first; + Op, Signed, dl).first; } //===----------------------------------------------------------------------===// -// Operand Float to Integer Conversion.. +// Convert Float Operand to Integer for Non-HW-supported Operations. //===----------------------------------------------------------------------===// bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { @@ -680,6 +726,8 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { switch (N->getOpcode()) { default: + if (CanSkipSoftenFloatOperand(N, OpNo)) + return false; #ifndef NDEBUG dbgs() << "SoftenFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; @@ -691,18 +739,27 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break; case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; - case ISD::FP_TO_SINT: Res = SoftenFloatOp_FP_TO_SINT(N); break; - case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_UINT(N); break; + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break; - case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break; + case ISD::STORE: + Res = SoftenFloatOp_STORE(N, OpNo); + // Do not try to analyze or soften this node again if the value is + // or can be held in a register. In that case, Res.getNode() should + // be equal to N. + if (Res.getNode() == N && + isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // Otherwise, we need to reanalyze and lower the new Res nodes. + break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer - // core about this. 
+ // core about this to re-analyze. if (Res.getNode() == N) return true; @@ -713,6 +770,41 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { return false; } +bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) { + if (!isLegalInHWReg(N->getOperand(OpNo).getValueType())) + return false; + // When the operand type can be kept in registers, SoftenFloatResult + // will call ReplaceValueWith to replace all references and we can + // skip softening this operand. + switch (N->getOperand(OpNo).getOpcode()) { + case ISD::BITCAST: + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + case ISD::SELECT: + case ISD::SELECT_CC: + return true; + } + // For some opcodes, SoftenFloatResult handles all conversion of softening + // and replacing operands, so that there is no need to soften operands + // again, although such opcode could be scanned for other illegal operands. + switch (N->getOpcode()) { + case ISD::ConstantFP: + case ISD::CopyFromReg: + case ISD::CopyToReg: + case ISD::FABS: + case ISD::FCOPYSIGN: + case ISD::FNEG: + case ISD::Register: + return true; + } + return false; +} + SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), GetSoftenedFloat(N->getOperand(0))); @@ -730,7 +822,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall"); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; } @@ -747,7 +839,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first; } SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { @@ -773,20 +865,33 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { 0); } -SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) { + bool Signed = N->getOpcode() == ISD::FP_TO_SINT; + EVT SVT = N->getOperand(0).getValueType(); EVT RVT = N->getValueType(0); - RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; -} + EVT NVT = EVT(); + SDLoc dl(N); + + // If the result is not legal, eg: fp -> i1, then it needs to be promoted to + // a larger type, eg: fp -> i32. Even if it is legal, no libcall may exactly + // match, eg. we don't have fp -> i8 conversions. + // Look for an appropriate libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE; + IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; + ++IntVT) { + NVT = (MVT::SimpleValueType)IntVT; + // The type needs to be big enough to hold the result. + if (NVT.bitsGE(RVT)) + LC = Signed ?
RTLIB::getFPTOSINT(SVT, NVT):RTLIB::getFPTOUINT(SVT, NVT); + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_XINT!"); -SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) { - EVT RVT = N->getValueType(0); - RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first; + SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, false, dl).first; + + // Truncate the result if the libcall returns a larger type. + return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res); } SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { @@ -1028,7 +1133,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo, RTLIB::DIV_F80, RTLIB::DIV_F128, RTLIB::DIV_PPCF128), - N->getValueType(0), Ops, 2, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1102,7 +1207,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), - N->getValueType(0), Ops, 3, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1116,7 +1221,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo, RTLIB::MUL_F80, RTLIB::MUL_F128, RTLIB::MUL_PPCF128), - N->getValueType(0), Ops, 2, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1231,7 +1336,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, RTLIB::SUB_F80, RTLIB::SUB_F128, RTLIB::SUB_PPCF128), - N->getValueType(0), Ops, 2, false, + N->getValueType(0), Ops, false, SDLoc(N)).first; GetPairElements(Call, Lo, Hi); } @@ -1310,7 +1415,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, } assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!"); - Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl).first; + Hi = TLI.makeLibCall(DAG, LC, VT, Src, true, dl).first; GetPairElements(Hi, Lo, Hi); } @@ -1341,6 +1446,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, break; } + // TODO: Are there fast-math-flags to propagate to this FADD? Lo = DAG.getNode(ISD::FADD, dl, VT, Hi, DAG.getConstantFP(APFloat(APFloat::PPCDoubleDouble, APInt(128, Parts)), @@ -1494,7 +1600,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); - return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl).first; + return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), false, dl).first; } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { @@ -1511,6 +1617,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128); // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X // FIXME: generated code sucks. + // TODO: Are there fast-math-flags to propagate to this FSUB? 
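+  // (Why the select below is correct for X in [0, 2^32): when X >= 2^31,
+  // X - 2^31 fits in a signed i32, so the inner FP_TO_SINT is exact, and
+  // adding 0x80000000 restores the bias; when X < 2^31 the plain signed
+  // conversion already yields the correct unsigned value.)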
return DAG.getSelectCC(dl, N->getOperand(0), Tmp, DAG.getNode(ISD::ADD, dl, MVT::i32, DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, @@ -1527,7 +1634,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), &N->getOperand(0), 1, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0), false, dl).first; } @@ -1912,8 +2019,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_BinOp(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); SDValue Op1 = GetPromotedFloat(N->getOperand(1)); - - return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1); + return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, N->getFlags()); } SDValue DAGTypeLegalizer::PromoteFloatRes_FMAD(SDNode *N) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 9f060a09..74f80db 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -53,6 +53,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break; case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break; case ISD::BITCAST: Res = PromoteIntRes_BITCAST(N); break; + case ISD::BITREVERSE: Res = PromoteIntRes_BITREVERSE(N); break; case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break; case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break; case ISD::Constant: Res = PromoteIntRes_Constant(N); break; @@ -65,16 +66,20 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break; - case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break; - case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break; + case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break; + case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N)); + break; + case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N)); + break; case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break; case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break; case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break; case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break; case ISD::SMIN: - case ISD::SMAX: + case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UMIN: - case ISD::UMAX: Res = PromoteIntRes_SimpleIntBinOp(N); break; + case ISD::UMAX: Res = PromoteIntRes_ZExtIntBinOp(N); break; + case ISD::SHL: Res = PromoteIntRes_SHL(N); break; case ISD::SIGN_EXTEND_INREG: Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break; @@ -114,10 +119,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break; case ISD::SDIV: - case ISD::SREM: Res = PromoteIntRes_SDIV(N); break; + case ISD::SREM: Res = PromoteIntRes_SExtIntBinOp(N); break; case ISD::UDIV: - case ISD::UREM: Res = PromoteIntRes_UDIV(N); break; + case ISD::UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break; case ISD::SADDO: case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break; @@ -180,7 +185,7 @@ SDValue 
DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { N->getChain(), N->getBasePtr(), N->getMemOperand(), N->getOrdering(), N->getSynchScope()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -193,7 +198,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(), N->getOrdering(), N->getSynchScope()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -257,12 +262,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp)); case TargetLowering::TypePromoteFloat: { // Convert the promoted float by hand. - if (NOutVT.bitsEq(NInVT)) { - SDValue PromotedOp = GetPromotedFloat(InOp); - SDValue Trunc = DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, PromotedOp); - return DAG.getNode(ISD::AssertZext, dl, NOutVT, Trunc, - DAG.getValueType(OutVT)); - } + SDValue PromotedOp = GetPromotedFloat(InOp); + return DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, PromotedOp); break; } case TargetLowering::TypeExpandInteger: @@ -316,6 +317,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); } +SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { + SDValue Op = GetPromotedInteger(N->getOperand(0)); + EVT OVT = N->getValueType(0); + EVT NVT = Op.getValueType(); + SDLoc dl(N); + + unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); + return DAG.getNode( + ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), + DAG.getConstant(DiffBits, dl, + TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); +} + SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { // The pair element type may be legal, or may not promote to the same type as // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases. @@ -465,7 +479,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), N->getMemoryVT(), N->getMemOperand()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -475,20 +489,34 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0()); - SDValue Mask = N->getMask(); - EVT NewMaskVT = getSetCCResultType(NVT); - if (NewMaskVT != N->getMask().getValueType()) - Mask = PromoteTargetBoolean(Mask, NewMaskVT); SDLoc dl(N); - SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), - Mask, ExtSrc0, N->getMemoryVT(), + N->getMask(), ExtSrc0, N->getMemoryVT(), N->getMemOperand(), ISD::SEXTLOAD); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to + // use the new one. 
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + +SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue ExtSrc0 = GetPromotedInteger(N->getValue()); + assert(NVT == ExtSrc0.getValueType() && + "Gather result type and the passThru argument type should be the same"); + + SDLoc dl(N); + SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(), + N->getIndex()}; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } + /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. @@ -534,14 +562,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { return Res; } -SDValue DAGTypeLegalizer::PromoteIntRes_SDIV(SDNode *N) { - // Sign extend the input. - SDValue LHS = SExtPromotedInteger(N->getOperand(0)); - SDValue RHS = SExtPromotedInteger(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), SDLoc(N), - LHS.getValueType(), LHS, RHS); -} - SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); @@ -629,6 +649,22 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { LHS.getValueType(), LHS, RHS); } +SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { + // Sign extend the input. + SDValue LHS = SExtPromotedInteger(N->getOperand(0)); + SDValue RHS = SExtPromotedInteger(N->getOperand(1)); + return DAG.getNode(N->getOpcode(), SDLoc(N), + LHS.getValueType(), LHS, RHS); +} + +SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { + // Zero extend the input. + SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); + SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); + return DAG.getNode(N->getOpcode(), SDLoc(N), + LHS.getValueType(), LHS, RHS); +} + SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -770,14 +806,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { return Mul; } -SDValue DAGTypeLegalizer::PromoteIntRes_UDIV(SDNode *N) { - // Zero extend the input.
- SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); - SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), SDLoc(N), - LHS.getValueType(), LHS, RHS); -} - SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); @@ -875,6 +903,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N), OpNo); break; + case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N), + OpNo); break; + case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N), + OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; @@ -1143,56 +1175,49 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ N->getMemoryVT(), N->getMemOperand()); } -SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){ +SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, + unsigned OpNo) { SDValue DataOp = N->getValue(); EVT DataVT = DataOp.getValueType(); SDValue Mask = N->getMask(); - EVT MaskVT = Mask.getValueType(); SDLoc dl(N); bool TruncateStore = false; - if (!TLI.isTypeLegal(DataVT)) { - if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) { - DataOp = GetPromotedInteger(DataOp); - if (!TLI.isTypeLegal(MaskVT)) - Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); - TruncateStore = true; - } + if (OpNo == 2) { + // Mask comes before the data operand. If the data operand is legal, we just + // promote the mask. + // When the data operand has illegal type, we should legalize the data + // operand first. The mask will be promoted/split/widened according to + // the data operand type.
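+    // (Operand layout assumed by the OpNo checks in this function:
+    //   0 = chain, 1 = base pointer, 2 = mask, 3 = data value;
+    // hence the re-entry below with OpNo 3 once the data operand itself
+    // needs to be legalized.)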
+ if (TLI.isTypeLegal(DataVT)) + Mask = PromoteTargetBoolean(Mask, DataVT); else { - assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector && - "Unexpected data legalization in MSTORE"); - DataOp = GetWidenedVector(DataOp); - - if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector) - Mask = GetWidenedVector(Mask); - else { - EVT BoolVT = getSetCCResultType(DataOp.getValueType()); + if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) + return PromoteIntOp_MSTORE(N, 3); - // We can't use ModifyToType() because we should fill the mask with - // zeroes - unsigned WidenNumElts = BoolVT.getVectorNumElements(); - unsigned MaskNumElts = MaskVT.getVectorNumElements(); + else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector) + return WidenVecOp_MSTORE(N, 3); - unsigned NumConcat = WidenNumElts / MaskNumElts; - SmallVector<SDValue, 16> Ops(NumConcat); - SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT); - Ops[0] = Mask; - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = ZeroVal; - - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops); + else { + assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector); + return SplitVecOp_MSTORE(N, 3); } } + } else { // Data operand + assert(OpNo == 3 && "Unexpected operand for promotion"); + DataOp = GetPromotedInteger(DataOp); + Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); + TruncateStore = true; } - else - Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType()); + return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), TruncateStore); } -SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){ +SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, + unsigned OpNo) { assert(OpNo == 2 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); @@ -1201,6 +1226,31 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo) return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N, + unsigned OpNo) { + + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { + // The Mask + EVT DataVT = N->getValueType(0); + NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + } else + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, + unsigned OpNo) { + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + if (OpNo == 2) { + // The Mask + EVT DataVT = N->getValue().getValueType(); + NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); + } else + NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); @@ -1259,6 +1309,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break; case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break; case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break; + case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break; case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break; 
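+  // (Note on the ISD::BITREVERSE case above: reversing a 2N-bit value is
+  // reversing each N-bit half and swapping the halves, e.g.
+  //   rev64(x) == ((uint64_t)rev32(lo32(x)) << 32) | rev32(hi32(x));
+  // the expansion implemented further below relies on this identity.)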
case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break; case ISD::CTLZ_ZERO_UNDEF: @@ -1270,6 +1321,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break; case ISD::LOAD: ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break; case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break; + case ISD::READCYCLECOUNTER: ExpandIntRes_READCYCLECOUNTER(N, Lo, Hi); break; case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break; case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break; @@ -1763,12 +1815,6 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } -void DAGTypeLegalizer::ExpandIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo, - SDValue &Lo, SDValue &Hi) { - SDValue Res = DisintegrateMERGE_VALUES(N, ResNo); - SplitInteger(Res, Lo, Hi); -} - void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -1834,6 +1880,14 @@ void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N, } } +void DAGTypeLegalizer::ExpandIntRes_BITREVERSE(SDNode *N, + SDValue &Lo, SDValue &Hi) { + SDLoc dl(N); + GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands. + Lo = DAG.getNode(ISD::BITREVERSE, dl, Lo.getValueType(), Lo); + Hi = DAG.getNode(ISD::BITREVERSE, dl, Hi.getValueType(), Hi); +} + void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); @@ -1918,8 +1972,7 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, - dl).first, + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, true/*irrelevant*/, dl).first, Lo, Hi); } @@ -1934,8 +1987,7 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, - dl).first, + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, false/*irrelevant*/, dl).first, Lo, Hi); } @@ -2055,7 +2107,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, } } - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Ch); } @@ -2096,11 +2148,21 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, - dl).first, + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDVTList VTs = DAG.getVTList(NVT, NVT, MVT::Other); + SDValue R = DAG.getNode(N->getOpcode(), DL, VTs, N->getOperand(0)); + Lo = R.getValue(0); + Hi = R.getValue(1); + ReplaceValueWith(SDValue(N, 1), R.getValue(2)); +} + void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); @@ -2166,7 +2228,7 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, LC = RTLIB::SDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, @@ -2261,8 +2323,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl).first, Lo, - Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, isSigned, dl).first, Lo, Hi); return; } @@ -2352,7 +2413,7 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, LC = RTLIB::SREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, @@ -2499,7 +2560,7 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, LC = RTLIB::UDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, @@ -2525,7 +2586,7 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, LC = RTLIB::UREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, @@ -2605,6 +2666,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; + case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; @@ -2732,6 +2794,47 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, return; } + if (LHSHi == RHSHi) { + // Comparing the low bits is enough. 
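+    // (Rationale: when the two high halves are literally the same value,
+    // the wide comparison is decided by the unsigned comparison of the low
+    // halves, for signed and unsigned predicates alike - and Tmp1, computed
+    // above, is exactly that low-half compare.)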
+ NewLHS = Tmp1; + NewRHS = SDValue(); + return; + } + + // Lower with SETCCE if the target supports it. + // FIXME: Make all targets support this, then remove the other lowering. + if (TLI.getOperationAction( + ISD::SETCCE, + TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) == + TargetLowering::Custom) { + // SETCCE can detect < and >= directly. For > and <=, flip operands and + // condition code. + bool FlipOperands = false; + switch (CCCode) { + case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break; + case ISD::SETUGT: CCCode = ISD::SETULT; FlipOperands = true; break; + case ISD::SETLE: CCCode = ISD::SETGE; FlipOperands = true; break; + case ISD::SETULE: CCCode = ISD::SETUGE; FlipOperands = true; break; + default: break; + } + if (FlipOperands) { + std::swap(LHSLo, RHSLo); + std::swap(LHSHi, RHSHi); + } + // Perform a wide subtraction, feeding the carry from the low part into + // SETCCE. The SETCCE operation is essentially looking at the high part of + // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or + // positive iff LHS >= RHS. + SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); + SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo); + SDValue Res = + DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()), + LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode)); + NewLHS = Res; + NewRHS = SDValue(); + return; + } + NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETEQ, false, DagCombineInfo, dl); @@ -2796,6 +2899,24 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { DAG.getCondCode(CCCode)), 0); } +SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + SDLoc dl = SDLoc(N); + + SDValue LHSLo, LHSHi, RHSLo, RHSHi; + GetExpandedInteger(LHS, LHSLo, LHSHi); + GetExpandedInteger(RHS, RHSLo, RHSHi); + + // Expand to a SUBE for the low part and a smaller SETCCE for the high. + SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); + SDValue LowCmp = DAG.getNode(ISD::SUBE, dl, VTList, LHSLo, RHSLo, Carry); + return DAG.getNode(ISD::SETCCE, dl, N->getValueType(0), LHSHi, RHSHi, + LowCmp.getValue(1), Cond); +} + SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. // It follows that either the result of the shift is undefined, or the @@ -2820,7 +2941,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this SINT_TO_FP!"); - return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first; + return TLI.makeLibCall(DAG, LC, DstVT, Op, true, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -2980,11 +3101,10 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { // Load the value out, extending it from f32 to the destination float type. // FIXME: Avoid the extend by constructing the right constant pool? 
- SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), - FudgePtr, - MachinePointerInfo::getConstantPool(), - MVT::f32, - false, false, false, Alignment); + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, Alignment); return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge); } @@ -2992,7 +3112,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this UINT_TO_FP!"); - return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first; + return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first; } SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 54cfaf5..2a0b0aa 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -73,21 +73,20 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { // (for example because it was created but not used). In general, we cannot // distinguish between new nodes and deleted nodes. SmallVector<SDNode*, 16> NewNodes; - for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) { + for (SDNode &Node : DAG.allnodes()) { // Remember nodes marked NewNode - they are subject to extra checking below. - if (I->getNodeId() == NewNode) - NewNodes.push_back(I); + if (Node.getNodeId() == NewNode) + NewNodes.push_back(&Node); - for (unsigned i = 0, e = I->getNumValues(); i != e; ++i) { - SDValue Res(I, i); + for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) { + SDValue Res(&Node, i); bool Failed = false; unsigned Mapped = 0; if (ReplacedValues.find(Res) != ReplacedValues.end()) { Mapped |= 1; // Check that remapped values are only used by nodes marked NewNode. - for (SDNode::use_iterator UI = I->use_begin(), UE = I->use_end(); + for (SDNode::use_iterator UI = Node.use_begin(), UE = Node.use_end(); UI != UE; ++UI) if (UI.getUse().getResNo() == i) assert(UI->getNodeId() == NewNode && @@ -119,16 +118,16 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { if (WidenedVectors.find(Res) != WidenedVectors.end()) Mapped |= 128; - if (I->getNodeId() != Processed) { + if (Node.getNodeId() != Processed) { // Since we allow ReplacedValues to map deleted nodes, it may map nodes // marked NewNode too, since a deleted node may have been reallocated as // another node that has not been seen by the LegalizeTypes machinery. - if ((I->getNodeId() == NewNode && Mapped > 1) || - (I->getNodeId() != NewNode && Mapped != 0)) { + if ((Node.getNodeId() == NewNode && Mapped > 1) || + (Node.getNodeId() != NewNode && Mapped != 0)) { dbgs() << "Unprocessed value in a map!"; Failed = true; } - } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(I)) { + } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) { if (Mapped > 1) { dbgs() << "Value with legal type was transformed!"; Failed = true; @@ -194,13 +193,12 @@ bool DAGTypeLegalizer::run() { // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess' // (and remembering them) if they are leaves and assigning 'Unanalyzed' if // non-leaves. 
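// The walk just described follows a classic ready-list discipline; a
// self-contained model (plain C++; WLNode and its fields are illustrative,
// and the real pass additionally re-analyzes nodes whose operands are
// replaced along the way):
#include <vector>
struct WLNode {
  std::vector<WLNode *> Users;
  int Pending = 0; // operands not yet processed
};
static void ProcessAll(std::vector<WLNode *> Worklist /*the leaves*/) {
  while (!Worklist.empty()) {
    WLNode *N = Worklist.back();
    Worklist.pop_back();
    // ... legalize N's results and operands here ...
    for (WLNode *U : N->Users)
      if (--U->Pending == 0) // last operand processed: user becomes ready
        Worklist.push_back(U);
  }
}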
- for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) { - if (I->getNumOperands() == 0) { - I->setNodeId(ReadyToProcess); - Worklist.push_back(I); + for (SDNode &Node : DAG.allnodes()) { + if (Node.getNumOperands() == 0) { + Node.setNodeId(ReadyToProcess); + Worklist.push_back(&Node); } else { - I->setNodeId(Unanalyzed); + Node.setNodeId(Unanalyzed); } } @@ -240,9 +238,13 @@ bool DAGTypeLegalizer::run() { Changed = true; goto NodeDone; case TargetLowering::TypeSoftenFloat: - SoftenFloatResult(N, i); - Changed = true; - goto NodeDone; + Changed = SoftenFloatResult(N, i); + if (Changed) + goto NodeDone; + // If not changed, the result type should be legal in a register. + assert(isLegalInHWReg(ResultVT) && + "Unchanged SoftenFloatResult should be legal in register!"); + goto ScanOperands; case TargetLowering::TypeExpandFloat: ExpandFloatResult(N, i); Changed = true; @@ -409,40 +411,48 @@ NodeDone: // In a debug build, scan all the nodes to make sure we found them all. This // ensures that there are no cycles and that everything got processed. #ifndef NDEBUG - for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), - E = DAG.allnodes_end(); I != E; ++I) { + for (SDNode &Node : DAG.allnodes()) { bool Failed = false; // Check that all result types are legal. - if (!IgnoreNodeResults(I)) - for (unsigned i = 0, NumVals = I->getNumValues(); i < NumVals; ++i) - if (!isTypeLegal(I->getValueType(i))) { - dbgs() << "Result type " << i << " illegal!\n"; + // A value type is illegal if its TypeAction is not TypeLegal, + // and TLI.RegClassForVT does not have a register class for this type. + // For example, the x86_64 target marks f128 as not TypeLegal in order to + // soften some operations, but it also has an FR128 register class to + // pass and return f128 values. Hence a legalized node can have f128 type. + if (!IgnoreNodeResults(&Node)) + for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i) + if (!isTypeLegal(Node.getValueType(i)) && + !TLI.isTypeLegal(Node.getValueType(i))) { + dbgs() << "Result type " << i << " illegal: "; + Node.dump(); Failed = true; } // Check that all operand types are legal. 
- for (unsigned i = 0, NumOps = I->getNumOperands(); i < NumOps; ++i) - if (!IgnoreNodeResults(I->getOperand(i).getNode()) && - !isTypeLegal(I->getOperand(i).getValueType())) { - dbgs() << "Operand type " << i << " illegal!\n"; + for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i) + if (!IgnoreNodeResults(Node.getOperand(i).getNode()) && + !isTypeLegal(Node.getOperand(i).getValueType()) && + !TLI.isTypeLegal(Node.getOperand(i).getValueType())) { + dbgs() << "Operand type " << i << " illegal: "; + Node.getOperand(i).dump(); Failed = true; } - if (I->getNodeId() != Processed) { - if (I->getNodeId() == NewNode) + if (Node.getNodeId() != Processed) { + if (Node.getNodeId() == NewNode) dbgs() << "New node not analyzed?\n"; - else if (I->getNodeId() == Unanalyzed) + else if (Node.getNodeId() == Unanalyzed) dbgs() << "Unanalyzed node not noticed?\n"; - else if (I->getNodeId() > 0) + else if (Node.getNodeId() > 0) dbgs() << "Operand not processed?\n"; - else if (I->getNodeId() == ReadyToProcess) + else if (Node.getNodeId() == ReadyToProcess) dbgs() << "Not added to worklist?\n"; Failed = true; } if (Failed) { - I->dump(&DAG); dbgs() << "\n"; + Node.dump(&DAG); dbgs() << "\n"; llvm_unreachable(nullptr); } } @@ -751,13 +761,23 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { } void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) { - assert(Result.getValueType() == - TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) && + // On x86_64, f128 can be kept in SSE registers, + // but is sometimes softened to i128. + assert((Result.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) || + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && "Invalid type for softened float"); AnalyzeNewValue(Result); SDValue &OpEntry = SoftenedFloats[Op]; - assert(!OpEntry.getNode() && "Node is already converted to integer!"); + // Allow repeated calls for f128 nodes, or for any node whose type + // transforms to itself, since many operations on these types + // are not softened. + assert((!OpEntry.getNode() || + Op.getValueType() == + TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) && + "Node is already converted to integer!"); OpEntry = Result; } @@ -1042,23 +1062,22 @@ SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N, unsigned NumOps = N->getNumOperands(); SDLoc dl(N); if (NumOps == 0) { - return TLI.makeLibCall(DAG, LC, N->getValueType(0), nullptr, 0, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, isSigned, dl).first; } else if (NumOps == 1) { SDValue Op = N->getOperand(0); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, isSigned, dl).first; } else if (NumOps == 2) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first; } SmallVector<SDValue, 8> Ops(NumOps); for (unsigned i = 0; i < NumOps; ++i) Ops[i] = N->getOperand(i); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), - &Ops[0], NumOps, isSigned, dl).first; + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first; } // ExpandChainLibCall - Expand a node into a call to a libcall. 
Similar to @@ -1108,6 +1127,23 @@ SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) { return DAG.getNode(ExtendCode, dl, BoolVT, Bool); } +/// WidenTargetBoolean - Widen the given target boolean to a target boolean +/// of the given type. The boolean vector is widened and then promoted to match +/// the target boolean type of the given ValVT. +SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT, + bool WithZeroes) { + SDLoc dl(Bool); + EVT BoolVT = Bool.getValueType(); + + assert(ValVT.getVectorNumElements() > BoolVT.getVectorNumElements() && + TLI.isTypeLegal(ValVT) && + "Unexpected types in WidenTargetBoolean"); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), BoolVT.getScalarType(), + ValVT.getVectorNumElements()); + Bool = ModifyToType(Bool, WideVT, WithZeroes); + return PromoteTargetBoolean(Bool, ValVT); +} + /// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT /// bits in Hi. void DAGTypeLegalizer::SplitInteger(SDValue Op, diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d1131a7..8ba19f7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -72,6 +72,20 @@ private: return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal; } + /// isSimpleLegalType - Return true if this is a simple legal type. + bool isSimpleLegalType(EVT VT) const { + return VT.isSimple() && TLI.isTypeLegal(VT); + } + + /// isLegalInHWReg - Return true if this type can be passed in registers. + /// For example, x86_64's f128 should be legal in registers, with only + /// some operations converted to library calls or integer + /// bitwise operations. + bool isLegalInHWReg(EVT VT) const { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return VT == NVT && isSimpleLegalType(VT); + } + EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } @@ -173,6 +187,11 @@ private: std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node); SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT); + + /// Modify the bit vector to match the SetCC result type of ValVT. + /// The bit vector is widened with zeroes when WithZeroes is true. 
+ SDValue WidenTargetBoolean(SDValue Bool, EVT ValVT, bool WithZeroes = false); + void ReplaceValueWith(SDValue From, SDValue To); void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi); void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT, @@ -234,6 +253,7 @@ private: SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntRes_BITCAST(SDNode *N); SDValue PromoteIntRes_BSWAP(SDNode *N); + SDValue PromoteIntRes_BITREVERSE(SDNode *N); SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); SDValue PromoteIntRes_Constant(SDNode *N); SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N); @@ -246,21 +266,22 @@ private: SDValue PromoteIntRes_INT_EXTEND(SDNode *N); SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); + SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); - SDValue PromoteIntRes_SDIV(SDNode *N); SDValue PromoteIntRes_SELECT(SDNode *N); SDValue PromoteIntRes_VSELECT(SDNode *N); SDValue PromoteIntRes_SELECT_CC(SDNode *N); SDValue PromoteIntRes_SETCC(SDNode *N); SDValue PromoteIntRes_SHL(SDNode *N); SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N); + SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N); + SDValue PromoteIntRes_SExtIntBinOp(SDNode *N); SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N); SDValue PromoteIntRes_SRA(SDNode *N); SDValue PromoteIntRes_SRL(SDNode *N); SDValue PromoteIntRes_TRUNCATE(SDNode *N); SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo); - SDValue PromoteIntRes_UDIV(SDNode *N); SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); @@ -276,7 +297,6 @@ private: SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N); SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo); - SDValue PromoteIntOp_EXTRACT_ELEMENT(SDNode *N); SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); @@ -284,7 +304,6 @@ private: SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); - SDValue PromoteIntOp_VSETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_Shift(SDNode *N); SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N); SDValue PromoteIntOp_SINT_TO_FP(SDNode *N); @@ -294,6 +313,8 @@ private: SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -312,8 +333,6 @@ private: // Integer Result Expansion. 
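// The Lo/Hi pairs these ExpandIntRes_* routines produce can be modeled in
// plain C++; for instance, an expanded 128-bit add is the carry-chained
// pair that ExpandIntRes_ADDSUB emits as ADDC/ADDE (a sketch with
// illustrative names, not the legalizer code):
#include <cstdint>
struct U128 { uint64_t Lo, Hi; };
static U128 Add128(U128 A, U128 B) {
  U128 R;
  R.Lo = A.Lo + B.Lo;           // ADDC: low half, produces a carry
  uint64_t Carry = R.Lo < A.Lo; // carry-out of the low addition
  R.Hi = A.Hi + B.Hi + Carry;   // ADDE: high half consumes the carry
  return R;
}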
void ExpandIntegerResult(SDNode *N, unsigned ResNo); - void ExpandIntRes_MERGE_VALUES (SDNode *N, unsigned ResNo, - SDValue &Lo, SDValue &Hi); void ExpandIntRes_ANY_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_AssertSext (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -322,6 +341,7 @@ private: void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_READCYCLECOUNTER (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SIGN_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_TRUNCATE (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -333,6 +353,7 @@ private: void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -354,12 +375,10 @@ private: // Integer Operand Expansion. bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo); - SDValue ExpandIntOp_BITCAST(SDNode *N); SDValue ExpandIntOp_BR_CC(SDNode *N); - SDValue ExpandIntOp_BUILD_VECTOR(SDNode *N); - SDValue ExpandIntOp_EXTRACT_ELEMENT(SDNode *N); SDValue ExpandIntOp_SELECT_CC(SDNode *N); SDValue ExpandIntOp_SETCC(SDNode *N); + SDValue ExpandIntOp_SETCCE(SDNode *N); SDValue ExpandIntOp_Shift(SDNode *N); SDValue ExpandIntOp_SINT_TO_FP(SDNode *N); SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo); @@ -375,32 +394,48 @@ private: // Float to Integer Conversion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// - /// GetSoftenedFloat - Given a processed operand Op which was converted to an - /// integer of the same size, this returns the integer. The integer contains - /// exactly the same bits as Op - only the type changed. For example, if Op - /// is an f32 which was softened to an i32, then this method returns an i32, - /// the bits of which coincide with those of Op. + /// GetSoftenedFloat - Given an operand Op of float type, returns the integer + /// it was converted to if Op is not supported by the target HW. + /// The integer contains exactly the same bits as Op - only the type changed. + /// For example, if Op is an f32 which was softened to an i32, then this method + /// returns an i32, the bits of which coincide with those of Op. + /// If Op can be efficiently supported by the target HW, or must stay in + /// a register, it is not converted to an integer; in that case, the given + /// Op is returned unchanged. SDValue GetSoftenedFloat(SDValue Op) { SDValue &SoftenedOp = SoftenedFloats[Op]; + if (!SoftenedOp.getNode() && + isSimpleLegalType(Op.getValueType())) + return Op; RemapValue(SoftenedOp); assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?"); return SoftenedOp; } void SetSoftenedFloat(SDValue Op, SDValue Result); - // Result Float to Integer Conversion. - void SoftenFloatResult(SDNode *N, unsigned OpNo); + // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary. 
+ void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) { + // When the result type can be kept in HW registers, the converted + // NewRes node could have the same type. We can save the effort of + // cloning every user of N in SoftenFloatOperand or other legalization + // functions by calling ReplaceValueWith here to update all users. + if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo))) + ReplaceValueWith(SDValue(N, ResNo), NewRes); + } + + // Convert Float Results to Integer for Non-HW-supported Operations. + bool SoftenFloatResult(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue SoftenFloatRes_BITCAST(SDNode *N); + SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); - SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N); + SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); - SDValue SoftenFloatRes_FABS(SDNode *N); + SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); - SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N); + SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FCOS(SDNode *N); SDValue SoftenFloatRes_FDIV(SDNode *N); SDValue SoftenFloatRes_FEXP(SDNode *N); @@ -412,7 +447,7 @@ private: SDValue SoftenFloatRes_FMA(SDNode *N); SDValue SoftenFloatRes_FMUL(SDNode *N); SDValue SoftenFloatRes_FNEARBYINT(SDNode *N); - SDValue SoftenFloatRes_FNEG(SDNode *N); + SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); @@ -425,21 +460,25 @@ private: SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); - SDValue SoftenFloatRes_LOAD(SDNode *N); - SDValue SoftenFloatRes_SELECT(SDNode *N); - SDValue SoftenFloatRes_SELECT_CC(SDNode *N); + SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N); - // Operand Float to Integer Conversion. + // Return true if we can skip softening the given operand or SDNode because + // it was softened earlier by SoftenFloatResult and references to the operand + // were replaced by ReplaceValueWith. + bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo); + + // Convert Float Operand to Integer for Non-HW-supported Operations. 
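// A value-level sketch of what softening means (SoftenF32 is an illustrative
// helper; __addsf3 is the usual compiler-rt libcall name): a softened f32 is
// just its i32 bit pattern, and arithmetic on it becomes libcalls such as
// __addsf3 instead of FP instructions.
#include <cstdint>
#include <cstring>
static uint32_t SoftenF32(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // identical bits, integer type
  return Bits;
}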
bool SoftenFloatOperand(SDNode *N, unsigned OpNo); SDValue SoftenFloatOp_BITCAST(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); SDValue SoftenFloatOp_FP_EXTEND(SDNode *N); SDValue SoftenFloatOp_FP_ROUND(SDNode *N); - SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N); - SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N); + SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N); SDValue SoftenFloatOp_SELECT_CC(SDNode *N); SDValue SoftenFloatOp_SETCC(SDNode *N); SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo); @@ -575,7 +614,6 @@ private: SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); - SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT_CC(SDNode *N); @@ -617,20 +655,18 @@ private: void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_MGATHER(MaskedGatherSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi); @@ -650,6 +686,7 @@ private: SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); SDValue SplitVecOp_FP_ROUND(SDNode *N); + SDValue SplitVecOp_FCOPYSIGN(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Support: LegalizeVectorTypes.cpp @@ -680,8 +717,8 @@ private: SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); + SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N); - SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N); SDValue WidenVecRes_SELECT(SDNode* N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); @@ -693,6 +730,7 @@ private: SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_BinaryCanTrap(SDNode *N); SDValue WidenVecRes_Convert(SDNode *N); + SDValue WidenVecRes_FCOPYSIGN(SDNode *N); SDValue WidenVecRes_POWI(SDNode *N); SDValue WidenVecRes_Shift(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); @@ -707,9 +745,11 @@ private: SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo); + SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_SETCC(SDNode* N); SDValue WidenVecOp_Convert(SDNode *N); + SDValue 
WidenVecOp_FCOPYSIGN(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Utilities Support: LegalizeVectorTypes.cpp @@ -745,8 +785,10 @@ private: /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. - SDValue ModifyToType(SDValue InOp, EVT WidenVT); - + /// When FillWithZeroes is "on" the vector will be widened with + /// zeroes. + /// By default, the vector will be widened with undefined values. + SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false); //===--------------------------------------------------------------------===// // Generic Splitting: LegalizeTypesGeneric.cpp diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 14d8f77..593c346 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -53,12 +53,17 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { case TargetLowering::TypePromoteFloat: llvm_unreachable("Bitcast of a promotion-needing float should never need" "expansion"); - case TargetLowering::TypeSoftenFloat: - // Convert the integer operand instead. - SplitInteger(GetSoftenedFloat(InOp), Lo, Hi); + case TargetLowering::TypeSoftenFloat: { + // Expand the floating point operand only if it was converted to integers. + // Otherwise, it is a legal type like f128 that can be saved in a register. + auto SoftenedOp = GetSoftenedFloat(InOp); + if (SoftenedOp == InOp) + break; + SplitInteger(SoftenedOp, Lo, Hi); Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo); Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi); return; + } case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: { auto &DL = DAG.getDataLayout(); @@ -161,7 +166,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) { NOutVT.getTypeForEVT(*DAG.getContext())); SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment); int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); // Emit a store to the stack slot. SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo, diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 83d4ad5..f61f631 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,8 @@ class VectorLegalizer { SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandBITREVERSE(SDValue Op); + SDValue ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op); /// \brief Implements vector promotion. /// @@ -159,7 +161,7 @@ bool VectorLegalizer::Run() { DAG.AssignTopologicalOrder(); for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), E = std::prev(DAG.allnodes_end()); I != std::next(E); ++I) - LegalizeOp(SDValue(I, 0)); + LegalizeOp(SDValue(&*I, 0)); // Finally, it's possible the root changed. Get the new root. 
SDValue OldRoot = DAG.getRoot(); @@ -218,9 +220,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { assert(Result.getValue(1).use_empty() && "There are still live users of the old chain!"); return LegalizeOp(Lowered); - } else { - return TranslateLegalizeResults(Op, Lowered); } + return TranslateLegalizeResults(Op, Lowered); } case TargetLowering::Expand: Changed = true; @@ -231,7 +232,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { EVT StVT = ST->getMemoryVT(); MVT ValVT = ST->getValue().getSimpleValueType(); if (StVT.isVector() && ST->isTruncatingStore()) - switch (TLI.getTruncStoreAction(ValVT, StVT.getSimpleVT())) { + switch (TLI.getTruncStoreAction(ValVT, StVT)) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Legal: return TranslateLegalizeResults(Op, Result); @@ -244,7 +245,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Changed = true; return LegalizeOp(ExpandStore(Op)); } - } else if (Op.getOpcode() == ISD::MSCATTER) + } else if (Op.getOpcode() == ISD::MSCATTER || Op.getOpcode() == ISD::MSTORE) HasVectorValue = true; for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end(); @@ -265,6 +266,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::UDIV: case ISD::SREM: case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: case ISD::FADD: case ISD::FSUB: case ISD::FMUL: @@ -279,6 +282,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::ROTL: case ISD::ROTR: case ISD::BSWAP: + case ISD::BITREVERSE: case ISD::CTLZ: case ISD::CTTZ: case ISD::CTLZ_ZERO_UNDEF: @@ -298,6 +302,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FABS: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: case ISD::FCOPYSIGN: case ISD::FSQRT: case ISD::FSIN: @@ -338,9 +344,13 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::MSCATTER: QueryType = cast<MaskedScatterSDNode>(Node)->getValue().getValueType(); break; + case ISD::MSTORE: + QueryType = cast<MaskedStoreSDNode>(Node)->getValue().getValueType(); + break; } switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) { + default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Promote: Result = Promote(Op); Changed = true; @@ -411,7 +421,7 @@ SDValue VectorLegalizer::Promote(SDValue Op) { Operands[j] = Op.getOperand(j); } - Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands); + Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands, Op.getNode()->getFlags()); if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) || (VT.isVector() && VT.getVectorElementType().isFloatingPoint() && NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())) @@ -708,6 +718,11 @@ SDValue VectorLegalizer::Expand(SDValue Op) { return ExpandFNEG(Op); case ISD::SETCC: return UnrollVSETCC(Op); + case ISD::BITREVERSE: + return ExpandBITREVERSE(Op); + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: + return ExpandCTLZ_CTTZ_ZERO_UNDEF(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } @@ -893,6 +908,25 @@ SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) { return DAG.getNode(ISD::BITCAST, DL, VT, Op); } +SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) { + EVT VT = Op.getValueType(); + + // If we have the scalar operation, it's probably cheaper to unroll it. 
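// Whichever branch is taken below, each element ultimately computes the
// classic mask-and-shift reversal; a 32-bit scalar model of it (plain C++,
// illustrative only):
#include <cstdint>
static uint32_t BitReverse32(uint32_t V) {
  V = ((V >> 1) & 0x55555555u) | ((V & 0x55555555u) << 1); // single bits
  V = ((V >> 2) & 0x33333333u) | ((V & 0x33333333u) << 2); // bit pairs
  V = ((V >> 4) & 0x0F0F0F0Fu) | ((V & 0x0F0F0F0Fu) << 4); // nibbles
  V = ((V >> 8) & 0x00FF00FFu) | ((V & 0x00FF00FFu) << 8); // bytes
  V = (V >> 16) | (V << 16);                               // half-words
  return V;
}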
+ if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType())) + return DAG.UnrollVectorOp(Op.getNode()); + + // If we have the appropriate vector bit operations, it is better to use them + // than unrolling and expanding each component. + if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) || + !TLI.isOperationLegalOrCustom(ISD::SRL, VT) || + !TLI.isOperationLegalOrCustom(ISD::AND, VT) || + !TLI.isOperationLegalOrCustom(ISD::OR, VT)) + return DAG.UnrollVectorOp(Op.getNode()); + + // Let LegalizeDAG handle this later. + return Op; +} + SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) { // Implement VSELECT in terms of XOR, AND, OR // on platforms which do not support blend natively. @@ -971,6 +1005,7 @@ SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { // Convert hi and lo to floats // Convert the hi part back to the upper values + // TODO: Can any fast-math-flags be set on these nodes? SDValue fHI = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), HI); fHI = DAG.getNode(ISD::FMUL, DL, Op.getValueType(), fHI, TWOHW); SDValue fLO = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), LO); @@ -984,12 +1019,23 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) { if (TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) { SDLoc DL(Op); SDValue Zero = DAG.getConstantFP(-0.0, DL, Op.getValueType()); + // TODO: If FNEG had fast-math-flags, they'd get propagated to this FSUB. return DAG.getNode(ISD::FSUB, DL, Op.getValueType(), Zero, Op.getOperand(0)); } return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op) { + // If the non-ZERO_UNDEF version is supported we can let LegalizeDAG handle. + unsigned Opc = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ? ISD::CTLZ : ISD::CTTZ; + if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType())) + return Op; + + // Otherwise go ahead and unroll. 
+ return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) { EVT VT = Op.getValueType(); unsigned NumElems = VT.getVectorNumElements(); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 51cd661..d0187d3 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -67,6 +67,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; case ISD::ANY_EXTEND: + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: @@ -108,6 +109,12 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: case ISD::FPOW: case ISD::FREM: @@ -139,7 +146,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(0)); SDValue RHS = GetScalarizedVector(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), - LHS.getValueType(), LHS, RHS); + LHS.getValueType(), LHS, RHS, N->getFlags()); } SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) { @@ -228,7 +235,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { N->isInvariant(), N->getOriginalAlignment(), N->getAAInfo()); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Result.getValue(1)); return Result; @@ -594,6 +601,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; + case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; @@ -613,6 +621,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi); break; + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CONVERT_RNDSAT: case ISD::CTLZ: @@ -656,11 +665,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SUB: case ISD::MUL: case ISD::FADD: - case ISD::FCOPYSIGN: case ISD::FSUB: case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: case ISD::SDIV: case ISD::UDIV: case ISD::FDIV: @@ -698,8 +708,10 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, GetSplitVector(N->getOperand(1), RHSLo, RHSHi); SDLoc dl(N); - Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, RHSLo); - Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi); + const SDNodeFlags *Flags = N->getFlags(); + unsigned Opcode = N->getOpcode(); + Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags); + Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags); } void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, @@ -870,6 +882,25 @@ void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1)); } +void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue LHSLo, LHSHi; + GetSplitVector(N->getOperand(0), LHSLo, LHSHi); + SDLoc DL(N); + + SDValue RHSLo, RHSHi; + SDValue RHS = N->getOperand(1); + EVT RHSVT = RHS.getValueType(); + if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector) + GetSplitVector(RHS, RHSLo, RHSHi); + else + std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS)); + + + Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo); + Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi); +} + void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; @@ -989,7 +1020,7 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(LD, 1), Ch); } @@ -1003,6 +1034,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue Ch = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); SDValue Mask = MLD->getMask(); + SDValue Src0 = MLD->getSrc0(); unsigned Alignment = MLD->getOriginalAlignment(); ISD::LoadExtType ExtType = MLD->getExtensionType(); @@ -1012,16 +1044,22 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? 
Alignment/2 : Alignment; + // Split Mask operand SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MLD->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - SDValue Src0 = MLD->getSrc0(); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), @@ -1049,7 +1087,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(MLD, 1), Ch); @@ -1064,20 +1102,33 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue Ch = MGT->getChain(); SDValue Ptr = MGT->getBasePtr(); SDValue Mask = MGT->getMask(); + SDValue Src0 = MGT->getValue(); + SDValue Index = MGT->getIndex(); unsigned Alignment = MGT->getOriginalAlignment(); + // Split Mask operand SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; + // Split MemoryVT std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; - std::tie(IndexLo, IndexHi) = DAG.SplitVector(MGT->getIndex(), dl); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); + else + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), @@ -1097,7 +1148,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
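// The split halves above preserve simple per-lane semantics; a plain-C++
// model of a masked load with pass-through (the names are illustrative):
static void MaskedLoadModel(const float *Ptr, const bool *Mask,
                            const float *Src0, float *Out, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = Mask[i] ? Ptr[i] : Src0[i]; // disabled lanes keep Src0
}
// Splitting simply runs this over [0, N/2) and [N/2, N), with the pointer,
// mask, and pass-through operands offset by N/2 for the high half.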
ReplaceValueWith(SDValue(MGT, 1), Ch); } @@ -1357,6 +1408,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { Res = SplitVecOp_TruncateHelper(N); break; case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break; + case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break; case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo); break; @@ -1567,23 +1619,31 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue Ptr = MGT->getBasePtr(); SDValue Index = MGT->getIndex(); SDValue Mask = MGT->getMask(); + SDValue Src0 = MGT->getValue(); unsigned Alignment = MGT->getOriginalAlignment(); SDValue MaskLo, MaskHi; - std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); EVT MemoryVT = MGT->getMemoryVT(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Src0Lo, Src0Hi; - std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl); + if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Src0, Src0Lo, Src0Hi); + else + std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl); SDValue IndexHi, IndexLo; - if (Index.getNode()) - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); else - IndexLo = IndexHi = Index; + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), @@ -1609,7 +1669,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(MGT, 1), Ch); @@ -1633,9 +1693,21 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; - GetSplitVector(Data, DataLo, DataHi); + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + GetSplitVector(Data, DataLo, DataHi); + else + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); + SDValue MaskLo, MaskHi; - GetSplitVector(Mask, MaskLo, MaskHi); + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); + + MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType()); + MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType()); // if Alignment is equal to the vector size, // take the half of it for the second part @@ -1680,25 +1752,29 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); + // Split all operands EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; - GetSplitVector(Data, DataLo, DataHi); - SDValue MaskLo, MaskHi; - GetSplitVector(Mask, MaskLo, MaskHi); + if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) + // Split Data operand + GetSplitVector(Data, DataLo, DataHi); + else + std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); - SDValue PtrLo, PtrHi; - if (Ptr.getValueType().isVector()) // gather form vector of pointers - std::tie(PtrLo, PtrHi) = DAG.SplitVector(Ptr, DL); + SDValue MaskLo, MaskHi; + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + // Split Mask operand + GetSplitVector(Mask, MaskLo, MaskHi); else - PtrLo = PtrHi = Ptr; + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL); SDValue IndexHi, IndexLo; - if (Index.getNode()) - std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); + if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Index, IndexLo, IndexHi); else - IndexLo = IndexHi = Index; + std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); SDValue Lo, Hi; MachineMemOperand *MMO = DAG.getMachineFunction(). @@ -1706,7 +1782,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsLo[] = {Ch, DataLo, MaskLo, PtrLo, IndexLo}; + SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo}; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO); @@ -1715,7 +1791,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - SDValue OpsHi[] = {Ch, DataHi, MaskHi, PtrHi, IndexHi}; + SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi}; Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), DL, OpsHi, MMO); @@ -1891,6 +1967,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } +SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) { + // The result (and the first input) has a legal vector type, but the second + // input needs splitting. 
+ return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements()); +} //===----------------------------------------------------------------------===// @@ -1938,6 +2019,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N)); break; + case ISD::MGATHER: + Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N)); + break; case ISD::ADD: case ISD::AND: @@ -1949,11 +2033,16 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::XOR: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: Res = WidenVecRes_Binary(N); break; case ISD::FADD: - case ISD::FCOPYSIGN: case ISD::FMUL: case ISD::FPOW: case ISD::FSUB: @@ -1966,6 +2055,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_BinaryCanTrap(N); break; + case ISD::FCOPYSIGN: + Res = WidenVecRes_FCOPYSIGN(N); + break; + case ISD::FPOWI: Res = WidenVecRes_POWI(N); break; @@ -1989,6 +2082,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_Convert(N); break; + case ISD::BITREVERSE: case ISD::BSWAP: case ISD::CTLZ: case ISD::CTPOP: @@ -2037,7 +2131,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2); + return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags()); } SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { @@ -2048,6 +2142,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); + const SDNodeFlags *Flags = N->getFlags(); while (!TLI.isTypeLegal(VT) && NumElts != 1) { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); @@ -2057,7 +2152,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { // Operation doesn't trap so just widen as normal. SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); - return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2); + return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags); } // No legal vector version so unroll the vector operation and then widen. 
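// A sketch of the "unroll then widen" strategy for a trapping op such as
// FDIV (plain C++ model; the padding lanes are undef in the DAG, zero here):
static void UnrollThenWiden(const float *A, const float *B, float *Out,
                            int NumElts, int WidenElts) {
  for (int i = 0; i < NumElts; ++i)
    Out[i] = A[i] / B[i]; // the scalar op, applied per original element
  for (int i = NumElts; i < WidenElts; ++i)
    Out[i] = 0.0f;        // filler lanes, never observed by users
}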
@@ -2087,7 +2182,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { SDValue EOp2 = DAG.getNode( ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2); + ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags); Idx += NumElts; CurNumElts -= NumElts; } @@ -2105,7 +2200,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, - EOp1, EOp2); + EOp1, EOp2, Flags); } CurNumElts = 0; } @@ -2195,7 +2290,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { unsigned Opcode = N->getOpcode(); unsigned InVTNumElts = InVT.getVectorNumElements(); - + const SDNodeFlags *Flags = N->getFlags(); if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { InOp = GetWidenedVector(N->getOperand(0)); InVT = InOp.getValueType(); @@ -2203,7 +2298,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { if (InVTNumElts == WidenNumElts) { if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InOp); - return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1)); + return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags); } } @@ -2224,7 +2319,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops); if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVec); - return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1)); + return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags); } if (InVTNumElts % WidenNumElts == 0) { @@ -2234,7 +2329,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { // Extract the input and convert the shorten input vector. if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVal); - return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1)); + return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags); } } @@ -2250,7 +2345,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { if (N->getNumOperands() == 1) Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val); else - Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1)); + Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags); } SDValue UndefVal = DAG.getUNDEF(EltVT); @@ -2260,6 +2355,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); } +SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) { + // If this is an FCOPYSIGN with same input types, we can treat it as a + // normal (can trap) binary op. + if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType()) + return WidenVecRes_BinaryCanTrap(N); + + // If the types are different, fall back to unrolling. 
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); +} + SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); @@ -2669,7 +2775,35 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), Mask, Src0, N->getMemoryVT(), N->getMemOperand(), ExtType); - // Legalized the chain result - switch anything that used the old chain to + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + +SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { + + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue Src0 = GetWidenedVector(N->getValue()); + unsigned NumElts = WideVT.getVectorNumElements(); + SDLoc dl(N); + + // The mask should be widened as well + Mask = WidenTargetBoolean(Mask, WideVT, true); + + // Widen the Index operand + SDValue Index = N->getIndex(); + EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), + Index.getValueType().getScalarType(), + NumElts); + Index = ModifyToType(Index, WideIndexVT); + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + + // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; @@ -2831,7 +2965,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = WidenVecOp_STORE(N); break; case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break; + case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break; case ISD::SETCC: Res = WidenVecOp_SETCC(N); break; + case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -2928,6 +3064,13 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) { } } +SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) { + // The result (and first input) is legal, but the second input is illegal. + // We can't do much to fix that, so just unroll and let the extracts off of + // the second input be widened as needed later. 
+ return DAG.UnrollVectorOp(N); +} + SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { // Since the result is legal and the input is illegal, it is unlikely // that we can fix the input to a legal type so unroll the convert @@ -3070,6 +3213,34 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { false); } +SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Can widen only data operand of mscatter"); + MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N); + SDValue DataOp = MSC->getValue(); + SDValue Mask = MSC->getMask(); + + // Widen the value + SDValue WideVal = GetWidenedVector(DataOp); + EVT WideVT = WideVal.getValueType(); + unsigned NumElts = WideVal.getValueType().getVectorNumElements(); + SDLoc dl(N); + + // The mask should be widened as well + Mask = WidenTargetBoolean(Mask, WideVT, true); + + // Widen index + SDValue Index = MSC->getIndex(); + EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(), + Index.getValueType().getScalarType(), + NumElts); + Index = ModifyToType(Index, WideIndexVT); + + SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), + MSC->getMemoryVT(), dl, Ops, + MSC->getMemOperand()); +} + SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue InOp0 = GetWidenedVector(N->getOperand(0)); SDValue InOp1 = GetWidenedVector(N->getOperand(1)); @@ -3533,7 +3704,9 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain, /// Modifies a vector input (widen or narrows) to a vector of NVT. The /// input vector must have the same element type as NVT. -SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { +/// FillWithZeroes specifies that the vector should be widened with zeroes. +SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT, + bool FillWithZeroes) { // Note that InOp might have been widened so it might already have // the right width or it might need be narrowed. EVT InVT = InOp.getValueType(); @@ -3550,10 +3723,11 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) { unsigned NumConcat = WidenNumElts / InNumElts; SmallVector<SDValue, 16> Ops(NumConcat); - SDValue UndefVal = DAG.getUNDEF(InVT); + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) : + DAG.getUNDEF(InVT); Ops[0] = InOp; for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = UndefVal; + Ops[i] = FillVal; return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops); } @@ -3573,8 +3747,9 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - SDValue UndefVal = DAG.getUNDEF(EltVT); + SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) - Ops[Idx] = UndefVal; + Ops[Idx] = FillVal; return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 6303422..622e06f 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -49,7 +49,7 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) TII = STI.getInstrInfo(); ResourcesModel.reset(TII->CreateTargetScheduleState(STI)); // This hard requirement could be relaxed, but for now - // do not let it procede. + // do not let it proceed. assert(ResourcesModel && "Unimplemented CreateTargetScheduleState."); unsigned NumRC = TRI->getNumRegClasses(); @@ -269,12 +269,12 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) { } // Now see if there are no other dependencies - // to instructions alredy in the packet. + // to instructions already in the packet. for (unsigned i = 0, e = Packet.size(); i != e; ++i) for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(), E = Packet[i]->Succs.end(); I != E; ++I) { // Since we do not add pseudos to packets, might as well - // ignor order deps. + // ignore order deps. if (I->isCtrl()) continue; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 34e1a70..62e7733 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -440,7 +440,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -519,7 +519,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); } } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index e9bd520..91024e6 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -141,8 +141,8 @@ private: /// that are "live". These nodes must be scheduled before any other nodes that /// modifies the registers can be scheduled. unsigned NumLiveRegs; - std::vector<SUnit*> LiveRegDefs; - std::vector<SUnit*> LiveRegGens; + std::unique_ptr<SUnit*[]> LiveRegDefs; + std::unique_ptr<SUnit*[]> LiveRegGens; // Collect interferences between physical register use/defs. // Each interference is an SUnit and set of physical registers. @@ -328,8 +328,8 @@ void ScheduleDAGRRList::Schedule() { NumLiveRegs = 0; // Allocate slots for each physical register, plus one for a special register // to track the virtual resource of a calling sequence. 
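// Hedged aside on the replacement below: array new with a trailing '()'
// value-initializes every element, so each slot starts out as nullptr exactly
// as the old resize(N, nullptr) guaranteed. A self-contained illustration:
#include <cassert>
#include <memory>
struct SUnit; // opaque here, as in the scheduler
void demo(unsigned NumRegs) {
  std::unique_ptr<SUnit *[]> Defs(new SUnit *[NumRegs + 1]());
  assert(Defs[0] == nullptr && "'()' value-initializes the array to null");
}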
- LiveRegDefs.resize(TRI->getNumRegs() + 1, nullptr); - LiveRegGens.resize(TRI->getNumRegs() + 1, nullptr); + LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]()); + LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]()); CallSeqEndForStart.clear(); assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences"); @@ -1206,7 +1206,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); NumRes = MCID.getNumDefs(); - for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -1218,7 +1218,7 @@ static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, /// CheckForLiveRegDef - Return true and update live register vector if the /// specified register def of the specified SUnit clobbers any "live" registers. static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, - std::vector<SUnit*> &LiveRegDefs, + SUnit **LiveRegDefs, SmallSet<unsigned, 4> &RegAdded, SmallVectorImpl<unsigned> &LRegs, const TargetRegisterInfo *TRI) { @@ -1240,7 +1240,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, /// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered /// by RegMask, and add them to LRegs. static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask, - std::vector<SUnit*> &LiveRegDefs, + ArrayRef<SUnit*> LiveRegDefs, SmallSet<unsigned, 4> &RegAdded, SmallVectorImpl<unsigned> &LRegs) { // Look at all live registers. Skip Reg0 and the special CallResource. @@ -1278,7 +1278,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU) - CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs, + CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(), RegAdded, LRegs, TRI); } @@ -1302,7 +1302,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { for (; NumVals; --NumVals, ++i) { unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) - CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI); + CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } } else i += NumVals; @@ -1328,13 +1328,15 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) { } } if (const uint32_t *RegMask = getNodeRegMask(Node)) - CheckForLiveRegDefMasked(SU, RegMask, LiveRegDefs, RegAdded, LRegs); + CheckForLiveRegDefMasked(SU, RegMask, + makeArrayRef(LiveRegDefs.get(), TRI->getNumRegs()), + RegAdded, LRegs); const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) - CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); + for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) + CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); } return !LRegs.empty(); @@ -2718,7 +2720,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, ScheduleDAGRRList *scheduleDAG, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - const uint16_t *ImpDefs + const MCPhysReg *ImpDefs = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs(); const uint32_t 
*RegMask = getNodeRegMask(SU->getNode()); if(!ImpDefs && !RegMask) @@ -2737,7 +2739,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, return true; if (ImpDefs) - for (const uint16_t *ImpDef = ImpDefs; *ImpDef; ++ImpDef) + for (const MCPhysReg *ImpDef = ImpDefs; *ImpDef; ++ImpDef) // Return true if SU clobbers this physical register use and the // definition of the register reaches from DepSU. IsReachable queries // a topological forward sort of the DAG (following the successors). @@ -2756,13 +2758,13 @@ static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU, const TargetRegisterInfo *TRI) { SDNode *N = SuccSU->getNode(); unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); - const uint16_t *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); + const MCPhysReg *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); assert(ImpDefs && "Caller should check hasPhysRegDefs"); for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (!SUNode->isMachineOpcode()) continue; - const uint16_t *SUImpDefs = + const MCPhysReg *SUImpDefs = TII->get(SUNode->getMachineOpcode()).getImplicitDefs(); const uint32_t *SURegMask = getNodeRegMask(SUNode); if (!SUImpDefs && !SURegMask) diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 159c28c..5cc8066 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -86,12 +86,6 @@ namespace llvm { /// flagged together nodes with a single SUnit. void BuildSchedGraph(AliasAnalysis *AA); - /// InitVRegCycleFlag - Set isVRegCycle if this node's single use is - /// CopyToReg and its only active data operands are CopyFromReg within a - /// single block loop. - /// - void InitVRegCycleFlag(SUnit *SU); - /// InitNumRegDefsLeft - Determine the # of regs defined by this node. /// void InitNumRegDefsLeft(SUnit *SU); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 14f44cc..96bf914 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "SDNodeDbgValue.h" +#include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -210,28 +211,6 @@ bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) { return true; } -/// isScalarToVector - Return true if the specified node is a -/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low -/// element is not an undef. -bool ISD::isScalarToVector(const SDNode *N) { - if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) - return true; - - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - if (N->getOperand(0).getOpcode() == ISD::UNDEF) - return false; - unsigned NumElems = N->getNumOperands(); - if (NumElems == 1) - return false; - for (unsigned i = 1; i < NumElems; ++i) { - SDValue V = N->getOperand(i); - if (V.getOpcode() != ISD::UNDEF) - return false; - } - return true; -} - /// allOperandsUndef - Return true if the node has at least one operand /// and all operands of the specified node are ISD::UNDEF. 
bool ISD::allOperandsUndef(const SDNode *N) { @@ -397,24 +376,21 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID, ID.AddInteger(Op.getResNo()); } } + /// Add logical or fast math flag values to FoldingSetNodeID value. static void AddNodeIDFlags(FoldingSetNodeID &ID, unsigned Opcode, const SDNodeFlags *Flags) { - if (!Flags || !isBinOpWithFlags(Opcode)) + if (!isBinOpWithFlags(Opcode)) return; - unsigned RawFlags = Flags->getRawFlags(); - // If no flags are set, do not alter the ID. We must match the ID of nodes - // that were created without explicitly specifying flags. This also saves time - // and allows a gradual increase in API usage of the optional optimization - // flags. - if (RawFlags != 0) - ID.AddInteger(RawFlags); + unsigned RawFlags = 0; + if (Flags) + RawFlags = Flags->getRawFlags(); + ID.AddInteger(RawFlags); } static void AddNodeIDFlags(FoldingSetNodeID &ID, const SDNode *N) { - if (auto *Node = dyn_cast<BinaryWithFlagsSDNode>(N)) - AddNodeIDFlags(ID, Node->getOpcode(), &Node->Flags); + AddNodeIDFlags(ID, N->getOpcode(), N->getFlags()); } static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC, @@ -624,9 +600,9 @@ void SelectionDAG::RemoveDeadNodes() { SmallVector<SDNode*, 128> DeadNodes; // Add all obviously-dead nodes to the DeadNodes worklist. - for (allnodes_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I) - if (I->use_empty()) - DeadNodes.push_back(I); + for (SDNode &Node : allnodes()) + if (Node.use_empty()) + DeadNodes.push_back(&Node); RemoveDeadNodes(DeadNodes); @@ -766,6 +742,7 @@ static void VerifySDNode(SDNode *N) { void SelectionDAG::InsertNode(SDNode *N) { AllNodes.push_back(N); #ifndef NDEBUG + N->PersistentId = NextPersistentId++; VerifySDNode(N); #endif } @@ -929,7 +906,7 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), UpdateListeners(nullptr) { - AllNodes.push_back(&EntryNode); + InsertNode(&EntryNode); DbgInfo = new SDDbgInfo(); } @@ -950,7 +927,10 @@ void SelectionDAG::allnodes_clear() { assert(&*AllNodes.begin() == &EntryNode); AllNodes.remove(AllNodes.begin()); while (!AllNodes.empty()) - DeallocateNode(AllNodes.begin()); + DeallocateNode(&AllNodes.front()); +#ifndef NDEBUG + NextPersistentId = 0; +#endif } BinarySDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, SDLoc DL, @@ -1023,7 +1003,7 @@ void SelectionDAG::clear() { static_cast<SDNode*>(nullptr)); EntryNode.UseList = nullptr; - AllNodes.push_back(&EntryNode); + InsertNode(&EntryNode); Root = getEntryNode(); DbgInfo->clear(); } @@ -1429,8 +1409,8 @@ SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, if (SDNode *E = FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); - SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, - TargetFlags); + SDNode *N = + new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); @@ -1852,8 +1832,58 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout()); if (OpTy == ShTy || OpTy.isVector()) return Op; - ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ? 
ISD::TRUNCATE : ISD::ZERO_EXTEND; - return getNode(Opcode, SDLoc(Op), ShTy, Op); + return getZExtOrTrunc(Op, SDLoc(Op), ShTy); +} + +SDValue SelectionDAG::expandVAArg(SDNode *Node) { + SDLoc dl(Node); + const TargetLowering &TLI = getTargetLoweringInfo(); + const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); + EVT VT = Node->getValueType(0); + SDValue Tmp1 = Node->getOperand(0); + SDValue Tmp2 = Node->getOperand(1); + unsigned Align = Node->getConstantOperandVal(3); + + SDValue VAListLoad = + getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1, Tmp2, + MachinePointerInfo(V), false, false, false, 0); + SDValue VAList = VAListLoad; + + if (Align > TLI.getMinStackArgumentAlignment()) { + assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); + + VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, + getConstant(Align - 1, dl, VAList.getValueType())); + + VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList, + getConstant(-(int64_t)Align, dl, VAList.getValueType())); + } + + // Increment the pointer, VAList, to the next vaarg + Tmp1 = getNode(ISD::ADD, dl, VAList.getValueType(), VAList, + getConstant(getDataLayout().getTypeAllocSize( + VT.getTypeForEVT(*getContext())), + dl, VAList.getValueType())); + // Store the incremented VAList to the legalized pointer + Tmp1 = getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, + MachinePointerInfo(V), false, false, 0); + // Load the actual argument out of the pointer VAList + return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo(), + false, false, false, 0); +} + +SDValue SelectionDAG::expandVACopy(SDNode *Node) { + SDLoc dl(Node); + const TargetLowering &TLI = getTargetLoweringInfo(); + // This defaults to loading a pointer from the input and storing it to the + // output, returning the chain. + const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue(); + const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue(); + SDValue Tmp1 = getLoad(TLI.getPointerTy(getDataLayout()), dl, + Node->getOperand(0), Node->getOperand(2), + MachinePointerInfo(VS), false, false, false, 0); + return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1), + MachinePointerInfo(VD), false, false, 0); } /// CreateStackTemporary - Create a stack temporary, suitable for holding the @@ -1872,8 +1902,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) { /// CreateStackTemporary - Create a stack temporary suitable for holding /// either of the specified value types. SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) { - unsigned Bytes = std::max(VT1.getStoreSizeInBits(), - VT2.getStoreSizeInBits())/8; + unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize()); Type *Ty1 = VT1.getTypeForEVT(*getContext()); Type *Ty2 = VT2.getTypeForEVT(*getContext()); const DataLayout &DL = getDataLayout(); @@ -2255,7 +2284,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { - computeKnownBitsFromRangeMetadata(*Ranges, KnownZero); + if (LD->getExtensionType() == ISD::NON_EXTLOAD) + computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne); } break; } @@ -2564,6 +2594,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ if (Tmp == 1) return 1; // Early out. 
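// For reference, the pointer alignment in the expandVAArg hunk above is the
// standard power-of-two round-up; a worked instance with assumed values
// P = 0x1003 and Align = 8: 0x1003 + 7 = 0x100A, and 0x100A & ~7 = 0x1008,
// the next 8-byte boundary.
#include <cassert>
#include <cstdint>
uint64_t alignUp(uint64_t P, uint64_t Align) { // illustrative helper only
  assert((Align & (Align - 1)) == 0 && "Align must be a power of 2");
  return (P + Align - 1) & ~(Align - 1);
}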
Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1); return std::min(Tmp, Tmp2); + case ISD::SELECT_CC: + Tmp = ComputeNumSignBits(Op.getOperand(2), Depth+1); + if (Tmp == 1) return 1; // Early out. + Tmp2 = ComputeNumSignBits(Op.getOperand(3), Depth+1); + return std::min(Tmp, Tmp2); case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -2679,7 +2714,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ const int rIndex = Items - 1 - cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - // If the sign portion ends in our element the substraction gives correct + // If the sign portion ends in our element the subtraction gives correct // result. Otherwise it gives either negative or > bitwidth result return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0); } @@ -2798,6 +2833,53 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { return false; } +bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const { + assert(A.getValueType() == B.getValueType() && + "Values must have the same type"); + APInt AZero, AOne; + APInt BZero, BOne; + computeKnownBits(A, AZero, AOne); + computeKnownBits(B, BZero, BOne); + return (AZero | BZero).isAllOnesValue(); +} + +static SDValue FoldCONCAT_VECTORS(SDLoc DL, EVT VT, ArrayRef<SDValue> Ops, + llvm::SelectionDAG &DAG) { + if (Ops.size() == 1) + return Ops[0]; + + // Concat of UNDEFs is UNDEF. + if (std::all_of(Ops.begin(), Ops.end(), + [](SDValue Op) { return Op.isUndef(); })) + return DAG.getUNDEF(VT); + + // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified + // to one big BUILD_VECTOR. + // FIXME: Add support for UNDEF and SCALAR_TO_VECTOR as well. + if (!std::all_of(Ops.begin(), Ops.end(), [](SDValue Op) { + return Op.getOpcode() == ISD::BUILD_VECTOR; + })) + return SDValue(); + + EVT SVT = VT.getScalarType(); + SmallVector<SDValue, 16> Elts; + for (SDValue Op : Ops) + Elts.append(Op->op_begin(), Op->op_end()); + + // BUILD_VECTOR requires all inputs to be of the same type, find the + // maximum type and extend them all. + for (SDValue Op : Elts) + SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); + + if (SVT.bitsGT(VT.getScalarType())) + for (SDValue &Op : Elts) + Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT) + ? DAG.getZExtOrTrunc(Op, DL, SVT) + : DAG.getSExtOrTrunc(Op, DL, SVT); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts); +} + /// getNode - Gets or creates the specified node. 
/// SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) { @@ -2848,8 +2930,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, return getConstantFP(APFloat(APFloat::IEEEhalf, Val), DL, VT); if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) return getConstantFP(APFloat(APFloat::IEEEsingle, Val), DL, VT); - else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) + if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) return getConstantFP(APFloat(APFloat::IEEEdouble, Val), DL, VT); + if (VT == MVT::f128 && C->getValueType(0) == MVT::i128) + return getConstantFP(APFloat(APFloat::IEEEquad, Val), DL, VT); break; case ISD::BSWAP: return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), @@ -2954,44 +3038,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTPOP: { - EVT SVT = VT.getScalarType(); - EVT InVT = BV->getValueType(0); - EVT InSVT = InVT.getScalarType(); - - // Find legal integer scalar type for constant promotion and - // ensure that its scalar size is at least as large as source. - EVT LegalSVT = SVT; - if (SVT.isInteger()) { - LegalSVT = TLI->getTypeToTransformTo(*getContext(), SVT); - if (LegalSVT.bitsLT(SVT)) break; - } - - // Let the above scalar folding handle the folding of each element. - SmallVector<SDValue, 8> Ops; - for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - SDValue OpN = BV->getOperand(i); - EVT OpVT = OpN.getValueType(); - - // Build vector (integer) scalar operands may need implicit - // truncation - do this before constant folding. - if (OpVT.isInteger() && OpVT.bitsGT(InSVT)) - OpN = getNode(ISD::TRUNCATE, DL, InSVT, OpN); - - OpN = getNode(Opcode, DL, SVT, OpN); - - // Legalize the (integer) scalar constant if necessary. 
- if (LegalSVT != SVT) - OpN = getNode(ISD::ANY_EXTEND, DL, LegalSVT, OpN); - - if (OpN.getOpcode() != ISD::UNDEF && - OpN.getOpcode() != ISD::Constant && - OpN.getOpcode() != ISD::ConstantFP) - break; - Ops.push_back(OpN); - } - if (Ops.size() == VT.getVectorNumElements()) - return getNode(ISD::BUILD_VECTOR, DL, VT, Ops); - break; + SDValue Ops = { Operand }; + if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) + return Fold; } } } @@ -3012,6 +3061,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid fpext node, dst < src!"); if (Operand.getOpcode() == ISD::UNDEF) return getUNDEF(VT); break; @@ -3019,12 +3070,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid SIGN_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension - assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && - "Invalid sext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); else if (OpOpcode == ISD::UNDEF) @@ -3035,12 +3086,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ZERO_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension - assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && - "Invalid zext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getNode()->getOperand(0)); @@ -3052,12 +3103,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid ANY_EXTEND!"); if (Operand.getValueType() == VT) return Operand; // noop extension - assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) && - "Invalid anyext node, dst < src!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsLT(VT) && + "Invalid anyext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) @@ -3077,12 +3128,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, assert(VT.isInteger() && Operand.getValueType().isInteger() && "Invalid TRUNCATE!"); if (Operand.getValueType() == VT) return Operand; // noop truncate - assert(Operand.getValueType().getScalarType().bitsGT(VT.getScalarType()) && - "Invalid truncate node, src < dst!"); assert((!VT.isVector() || VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); + assert(Operand.getValueType().bitsGT(VT) && + "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) return 
getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || @@ -3135,8 +3186,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, case ISD::FNEG: // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) + // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), - Operand.getNode()->getOperand(0)); + Operand.getNode()->getOperand(0), + &cast<BinaryWithFlagsSDNode>(Operand.getNode())->Flags); if (OpOpcode == ISD::FNEG) // --X -> X return Operand.getNode()->getOperand(0); break; @@ -3182,6 +3235,10 @@ static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1, case ISD::SRA: return std::make_pair(C1.ashr(C2), true); case ISD::ROTL: return std::make_pair(C1.rotl(C2), true); case ISD::ROTR: return std::make_pair(C1.rotr(C2), true); + case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true); + case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true); + case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true); + case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true); case ISD::UDIV: if (!C2.getBoolValue()) break; @@ -3284,10 +3341,118 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, SDLoc DL, EVT VT, return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs); } +SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode, SDLoc DL, + EVT VT, + ArrayRef<SDValue> Ops, + const SDNodeFlags *Flags) { + // If the opcode is a target-specific ISD node, there's nothing we can + // do here and the operand rules may not line up with the below, so + // bail early. + if (Opcode >= ISD::BUILTIN_OP_END) + return SDValue(); + + // We can only fold vectors - maybe merge with FoldConstantArithmetic someday? + if (!VT.isVector()) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + + auto IsScalarOrSameVectorSize = [&](const SDValue &Op) { + return !Op.getValueType().isVector() || + Op.getValueType().getVectorNumElements() == NumElts; + }; + + auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op); + return (Op.getOpcode() == ISD::UNDEF) || + (Op.getOpcode() == ISD::CONDCODE) || (BV && BV->isConstant()); + }; + + // All operands must be vector types with the same number of elements as + // the result type and must be either UNDEF or a build vector of constant + // or UNDEF scalars. + if (!std::all_of(Ops.begin(), Ops.end(), IsConstantBuildVectorOrUndef) || + !std::all_of(Ops.begin(), Ops.end(), IsScalarOrSameVectorSize)) + return SDValue(); + + // If we are comparing vectors, then the result needs to be a i1 boolean + // that is then sign-extended back to the legal result type. + EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType()); + + // Find legal integer scalar type for constant promotion and + // ensure that its scalar size is at least as large as source. + EVT LegalSVT = VT.getScalarType(); + if (LegalSVT.isInteger()) { + LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT); + if (LegalSVT.bitsLT(SVT)) + return SDValue(); + } + + // Constant fold each scalar lane separately. 
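// (For instance, with assumed operands: folding ISD::ADD over
//  BUILD_VECTOR(1, 2) and BUILD_VECTOR(3, 4) visits lane 0 with the scalars
//  {1, 3} and lane 1 with {2, 4}, yielding BUILD_VECTOR(4, 6).)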
+ SmallVector<SDValue, 4> ScalarResults; + for (unsigned i = 0; i != NumElts; i++) { + SmallVector<SDValue, 4> ScalarOps; + for (SDValue Op : Ops) { + EVT InSVT = Op.getValueType().getScalarType(); + BuildVectorSDNode *InBV = dyn_cast<BuildVectorSDNode>(Op); + if (!InBV) { + // We've checked that this is UNDEF or a constant of some kind. + if (Op.isUndef()) + ScalarOps.push_back(getUNDEF(InSVT)); + else + ScalarOps.push_back(Op); + continue; + } + + SDValue ScalarOp = InBV->getOperand(i); + EVT ScalarVT = ScalarOp.getValueType(); + + // Build vector (integer) scalar operands may need implicit + // truncation - do this before constant folding. + if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT)) + ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp); + + ScalarOps.push_back(ScalarOp); + } + + // Constant fold the scalar operands. + SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags); + + // Legalize the (integer) scalar constant if necessary. + if (LegalSVT != SVT) + ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult); + + // Scalar folding only succeeded if the result is a constant or UNDEF. + if (ScalarResult.getOpcode() != ISD::UNDEF && + ScalarResult.getOpcode() != ISD::Constant && + ScalarResult.getOpcode() != ISD::ConstantFP) + return SDValue(); + ScalarResults.push_back(ScalarResult); + } + + assert(ScalarResults.size() == NumElts && + "Unexpected number of scalar results for BUILD_VECTOR"); + return getNode(ISD::BUILD_VECTOR, DL, VT, ScalarResults); +} + SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, const SDNodeFlags *Flags) { ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2); + ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); + ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); + + // Canonicalize constant to RHS if commutative. + if (isCommutativeBinOp(Opcode)) { + if (N1C && !N2C) { + std::swap(N1C, N2C); + std::swap(N1, N2); + } else if (N1CFP && !N2CFP) { + std::swap(N1CFP, N2CFP); + std::swap(N1, N2); + } + } + switch (Opcode) { default: break; case ISD::TokenFactor: @@ -3298,34 +3463,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, if (N2.getOpcode() == ISD::EntryToken) return N1; if (N1 == N2) return N1; break; - case ISD::CONCAT_VECTORS: - // Concat of UNDEFs is UNDEF. - if (N1.getOpcode() == ISD::UNDEF && - N2.getOpcode() == ISD::UNDEF) - return getUNDEF(VT); - - // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to - // one big BUILD_VECTOR. - if (N1.getOpcode() == ISD::BUILD_VECTOR && - N2.getOpcode() == ISD::BUILD_VECTOR) { - SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), - N1.getNode()->op_end()); - Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); - - // BUILD_VECTOR requires all inputs to be of the same type, find the - // maximum type and extend them all. - EVT SVT = VT.getScalarType(); - for (SDValue Op : Elts) - SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); - if (SVT.bitsGT(VT.getScalarType())) - for (SDValue &Op : Elts) - Op = TLI->isZExtFree(Op.getValueType(), SVT) - ? getZExtOrTrunc(Op, DL, SVT) - : getSExtOrTrunc(Op, DL, SVT); - - return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); - } + case ISD::CONCAT_VECTORS: { + // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. 
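// (Illustrative, operands assumed: a concat of two constant BUILD_VECTORs
//  becomes one wider BUILD_VECTOR; if the scalar types differ,
//  FoldCONCAT_VECTORS first extends every element to the widest type, with
//  zext when the target reports it free and sext otherwise, and a concat of
//  all-UNDEF operands folds straight to UNDEF.)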
+ SDValue Ops[] = {N1, N2}; + if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) + return V; break; + } case ISD::AND: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && @@ -3356,6 +3500,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, case ISD::MUL: case ISD::SDIV: case ISD::SREM: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: assert(VT.isInteger() && "This operator does not apply to FP types!"); assert(N1.getValueType() == N2.getValueType() && N1.getValueType() == VT && "Binary operator types must match!"); @@ -3367,37 +3515,20 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, case ISD::FREM: if (getTarget().Options.UnsafeFPMath) { if (Opcode == ISD::FADD) { - // 0+x --> x - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) - if (CFP->getValueAPF().isZero()) - return N2; // x+0 --> x - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2)) - if (CFP->getValueAPF().isZero()) - return N1; + if (N2CFP && N2CFP->getValueAPF().isZero()) + return N1; } else if (Opcode == ISD::FSUB) { // x-0 --> x - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2)) - if (CFP->getValueAPF().isZero()) - return N1; + if (N2CFP && N2CFP->getValueAPF().isZero()) + return N1; } else if (Opcode == ISD::FMUL) { - ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1); - SDValue V = N2; - - // If the first operand isn't the constant, try the second - if (!CFP) { - CFP = dyn_cast<ConstantFPSDNode>(N2); - V = N1; - } - - if (CFP) { - // 0*x --> 0 - if (CFP->isZero()) - return SDValue(CFP,0); - // 1*x --> x - if (CFP->isExactlyValue(1.0)) - return V; - } + // x*0 --> 0 + if (N2CFP && N2CFP->isZero()) + return N2; + // x*1 --> x + if (N2CFP && N2CFP->isExactlyValue(1.0)) + return N1; } } assert(VT.isFloatingPoint() && "This operator only applies to FP types!"); @@ -3457,7 +3588,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && VT.bitsLE(N1.getValueType()) && - isa<ConstantSDNode>(N2) && "Invalid FP_ROUND!"); + N2C && "Invalid FP_ROUND!"); if (N1.getValueType() == VT) return N1; // noop conversion. break; case ISD::AssertSext: @@ -3502,13 +3633,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SmallVector<SDValue, 8> Ops; for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { SDValue Op = N1.getOperand(i); - if (Op.getValueType() != VT.getScalarType()) break; if (Op.getOpcode() == ISD::UNDEF) { - Ops.push_back(Op); + Ops.push_back(getUNDEF(VT.getScalarType())); continue; } if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { APInt Val = C->getAPIntValue(); + Val = Val.zextOrTrunc(VT.getScalarSizeInBits()); Ops.push_back(SignExtendInReg(Val)); continue; } @@ -3590,15 +3721,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, return N1.getOperand(N2C->getZExtValue()); // EXTRACT_ELEMENT of a constant int is also very common. 
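// (A worked instance of the fold below, values assumed: extracting element 1,
//  the high i32, of the i64 constant 0x0000000200000001 gives Shift = 32 * 1,
//  ShiftedVal = C.lshr(32) = 0x2, and trunc(32) yields the i32 value 2.)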
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + if (N1C) { unsigned ElementSize = VT.getSizeInBits(); unsigned Shift = ElementSize * N2C->getZExtValue(); - APInt ShiftedVal = C->getAPIntValue().lshr(Shift); + APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift); return getConstant(ShiftedVal.trunc(ElementSize), DL, VT); } break; - case ISD::EXTRACT_SUBVECTOR: { - SDValue Index = N2; + case ISD::EXTRACT_SUBVECTOR: if (VT.isSimple() && N1.getValueType().isSimple()) { assert(VT.isVector() && N1.getValueType().isVector() && "Extract subvector VTs must be a vectors!"); @@ -3608,9 +3738,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, assert(VT.getSimpleVT() <= N1.getSimpleValueType() && "Extract subvector must be from larger vector to smaller vector!"); - if (isa<ConstantSDNode>(Index)) { - assert((VT.getVectorNumElements() + - cast<ConstantSDNode>(Index)->getZExtValue() + if (N2C) { + assert((VT.getVectorNumElements() + N2C->getZExtValue() <= N1.getValueType().getVectorNumElements()) && "Extract subvector overflow!"); } @@ -3621,29 +3750,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, } break; } - } // Perform trivial constant folding. if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode())) return SV; - // Canonicalize constant to RHS if commutative. - if (N1C && !N2C && isCommutativeBinOp(Opcode)) { - std::swap(N1C, N2C); - std::swap(N1, N2); - } - // Constant fold FP operations. bool HasFPExceptions = TLI->hasFloatingPointExceptions(); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); - ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); if (N1CFP) { - if (!N2CFP && isCommutativeBinOp(Opcode)) { - // Canonicalize constant to RHS if commutative. - std::swap(N1CFP, N2CFP); - std::swap(N1, N2); - } else if (N2CFP) { + if (N2CFP) { APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF(); APFloat::opStatus s; switch (Opcode) { @@ -3670,7 +3786,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, } break; case ISD::FREM : - s = V1.mod(V2, APFloat::rmNearestTiesToEven); + s = V1.mod(V2); if (!HasFPExceptions || (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)) { return getConstantFP(V1, DL, VT); @@ -3795,7 +3911,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3) { // Perform various simplifications. - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); switch (Opcode) { case ISD::FMA: { ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); @@ -3812,27 +3927,25 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, } break; } - case ISD::CONCAT_VECTORS: - // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to - // one big BUILD_VECTOR. - if (N1.getOpcode() == ISD::BUILD_VECTOR && - N2.getOpcode() == ISD::BUILD_VECTOR && - N3.getOpcode() == ISD::BUILD_VECTOR) { - SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(), - N1.getNode()->op_end()); - Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); - Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end()); - return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); - } + case ISD::CONCAT_VECTORS: { + // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. 
+ SDValue Ops[] = {N1, N2, N3}; + if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) + return V; break; + } case ISD::SETCC: { // Use FoldSetCC to simplify SETCC's. - SDValue Simp = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL); - if (Simp.getNode()) return Simp; + if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL)) + return V; + // Vector constant folding. + SDValue Ops[] = {N1, N2, N3}; + if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) + return V; break; } case ISD::SELECT: - if (N1C) { + if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) { if (N1C->getZExtValue()) return N2; // select true, X, Y -> X return N3; // select false, X, Y -> Y @@ -4153,6 +4266,14 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, return true; } +static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { + // On Darwin, -Os means optimize for size without hurting performance, so + // only really optimize for size when -Oz (MinSize) is used. + if (MF.getTarget().getTargetTriple().isOSDarwin()) + return MF.getFunction()->optForMinSize(); + return MF.getFunction()->optForSize(); +} + static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, @@ -4173,7 +4294,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -4286,7 +4407,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl, bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -4380,7 +4501,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, bool DstAlignCanChange = false; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptSize = shouldLowerMemFuncForSize(MF); FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -4446,6 +4567,16 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } +static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI, + unsigned AS) { + // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all + // pointer operands can be losslessly bitcasted to pointers of address space 0 + if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) { + report_fatal_error("cannot lower memory intrinsic in address space " + + Twine(AS)); + } +} + SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, bool AlwaysInline, @@ -4487,6 +4618,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, true, DstPtrInfo, SrcPtrInfo); } + 
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); + checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); + // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc // memcpy is not guaranteed to be safe. libc memcpys aren't required to // respect volatile, so they may do things like read or write memory @@ -4548,6 +4682,9 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst, return Result; } + checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); + checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace()); + // FIXME: If the memmove is volatile, lowering it to plain libc memmove may // not be safe. See memcpy above for more details. @@ -4605,6 +4742,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst, return Result; } + checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace()); + // Emit a library call. Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext()); TargetLowering::ArgListTy Args; @@ -4872,10 +5011,12 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". -static MachinePointerInfo InferPointerInfo(SDValue Ptr, int64_t Offset = 0) { +static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr, + int64_t Offset = 0) { // If this is FI+Offset, we can model it. if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) - return MachinePointerInfo::getFixedStack(FI->getIndex(), Offset); + return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + FI->getIndex(), Offset); // If this is (FI+Offset1)+Offset2, we can model it. if (Ptr.getOpcode() != ISD::ADD || @@ -4884,20 +5025,22 @@ static MachinePointerInfo InferPointerInfo(SDValue Ptr, int64_t Offset = 0) { return MachinePointerInfo(); int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); - return MachinePointerInfo::getFixedStack(FI, Offset+ - cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue()); + return MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI, + Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue()); } /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a /// MachinePointerInfo record from it. This is particularly useful because the /// code generator has many cases where it doesn't bother passing in a /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst". -static MachinePointerInfo InferPointerInfo(SDValue Ptr, SDValue OffsetOp) { +static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr, + SDValue OffsetOp) { // If the 'Offset' value isn't a constant, we can't handle this. if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp)) - return InferPointerInfo(Ptr, OffsetNode->getSExtValue()); + return InferPointerInfo(DAG, Ptr, OffsetNode->getSExtValue()); if (OffsetOp.getOpcode() == ISD::UNDEF) - return InferPointerInfo(Ptr); + return InferPointerInfo(DAG, Ptr); return MachinePointerInfo(); } @@ -4926,7 +5069,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, // If we don't have a PtrInfo, infer the trivial frame index case to simplify // clients. 
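// (Examples of what InferPointerInfo recognizes, operands assumed: a bare
//  FrameIndex<1> yields getFixedStack(MF, 1, 0), and
//  (add FrameIndex<1>, Constant<8>) yields getFixedStack(MF, 1, 8); any other
//  shape falls back to an empty MachinePointerInfo.)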
if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(Ptr, Offset); + PtrInfo = InferPointerInfo(*this, Ptr, Offset); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = @@ -5054,7 +5197,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(Ptr); + PtrInfo = InferPointerInfo(*this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = @@ -5109,7 +5252,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, Flags |= MachineMemOperand::MONonTemporal; if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(Ptr); + PtrInfo = InferPointerInfo(*this, Ptr); MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = @@ -5261,7 +5404,7 @@ SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, SDLoc dl, cast<MaskedGatherSDNode>(E)->refineAlignment(MMO); return SDValue(E, 0); } - MaskedGatherSDNode *N = + MaskedGatherSDNode *N = new (NodeAllocator) MaskedGatherSDNode(dl.getIROrder(), dl.getDebugLoc(), Ops, VTs, VT, MMO); CSEMap.InsertNode(N, IP); @@ -5317,18 +5460,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, - ArrayRef<SDValue> Ops) { + ArrayRef<SDValue> Ops, const SDNodeFlags *Flags) { unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, Ops[0]); - case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); + case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; } switch (Opcode) { default: break; + case ISD::CONCAT_VECTORS: { + // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF. + if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this)) + return V; + break; + } case ISD::SELECT_CC: { assert(NumOps == 5 && "SELECT_CC takes 5 operands!"); assert(Ops[0].getValueType() == Ops[1].getValueType() && @@ -5656,7 +5805,7 @@ UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) { "Update with wrong number of operands"); // If no operands changed just return the input node. - if (Ops.empty() || std::equal(Ops.begin(), Ops.end(), N->op_begin())) + if (std::equal(Ops.begin(), Ops.end(), N->op_begin())) return N; // See if the modified node already exists. @@ -6451,13 +6600,13 @@ unsigned SelectionDAG::AssignTopologicalOrder() { // Node Id fields for nodes At SortedPos and after will contain the // count of outstanding operands. for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) { - SDNode *N = I++; + SDNode *N = &*I++; checkForCycles(N, this); unsigned Degree = N->getNumOperands(); if (Degree == 0) { // A node with no uses, add it to the result array immediately. N->setNodeId(DAGSize++); - allnodes_iterator Q = N; + allnodes_iterator Q(N); if (Q != SortedPos) SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q)); assert(SortedPos != AllNodes.end() && "Overran node list"); @@ -6470,8 +6619,8 @@ unsigned SelectionDAG::AssignTopologicalOrder() { // Visit all the nodes. As we iterate, move nodes into sorted order, // such that by the time the end is reached all nodes will be sorted. - for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) { - SDNode *N = I; + for (SDNode &Node : allnodes()) { + SDNode *N = &Node; checkForCycles(N, this); // N is in sorted position, so all its uses have one less operand // that needs to be sorted. 
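// A hedged sketch of the iterator idiom the hunks around this point adopt:
// ilist iterators no longer convert implicitly to SDNode*, so the address of
// the dereferenced iterator is taken explicitly. DAG is assumed in scope.
for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
                                     E = DAG.allnodes_end(); I != E;) {
  SDNode *N = &*I++; // previously spelled: SDNode *N = I++;
  (void)N;           // ... visit N ...
}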
@@ -6493,9 +6642,10 @@ unsigned SelectionDAG::AssignTopologicalOrder() { P->setNodeId(Degree); } } - if (I == SortedPos) { + if (&Node == SortedPos) { #ifndef NDEBUG - SDNode *S = ++I; + allnodes_iterator I(N); + SDNode *S = &*++I; dbgs() << "Overran sorted position:\n"; S->dumprFull(this); dbgs() << "\n"; dbgs() << "Checking if this is due to cycles\n"; @@ -6559,6 +6709,26 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { // SDNode Class //===----------------------------------------------------------------------===// +bool llvm::isNullConstant(SDValue V) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isNullValue(); +} + +bool llvm::isNullFPConstant(SDValue V) { + ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(V); + return Const != nullptr && Const->isZero() && !Const->isNegative(); +} + +bool llvm::isAllOnesConstant(SDValue V) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isAllOnesValue(); +} + +bool llvm::isOneConstant(SDValue V) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); + return Const != nullptr && Const->isOne(); +} + HandleSDNode::~HandleSDNode() { DropOperands(); } @@ -6772,6 +6942,12 @@ uint64_t SDNode::getConstantOperandVal(unsigned Num) const { return cast<ConstantSDNode>(OperandList[Num])->getZExtValue(); } +const SDNodeFlags *SDNode::getFlags() const { + if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this)) + return &FlagsNode->Flags; + return nullptr; +} + SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { assert(N->getNumValues() == 1 && "Can't unroll a vector with multiple results!"); @@ -6808,9 +6984,11 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { } switch (N->getOpcode()) { - default: - Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands)); + default: { + Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands, + N->getFlags())); break; + } case ISD::VSELECT: Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands)); break; @@ -7101,6 +7279,24 @@ BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const { return dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements)); } +int32_t +BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, + uint32_t BitWidth) const { + if (ConstantFPSDNode *CN = + dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) { + bool IsExact; + APSInt IntVal(BitWidth); + APFloat APF = CN->getValueAPF(); + if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) != + APFloat::opOK || + !IsExact) + return -1; + + return IntVal.exactLogBase2(); + } + return -1; +} + bool BuildVectorSDNode::isConstant() const { for (const SDValue &Op : op_values()) { unsigned Opc = Op.getOpcode(); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2c3c0eb1..45ae39a 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" @@ -63,6 +64,7 @@ #include "llvm/Target/TargetSelectionDAGInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" 
#include <algorithm> +#include <utility> using namespace llvm; #define DEBUG_TYPE "isel" @@ -79,7 +81,7 @@ LimitFPPrecision("limit-float-precision", cl::init(0)); static cl::opt<bool> -EnableFMFInDAG("enable-fmf-dag", cl::init(false), cl::Hidden, +EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden, cl::desc("Enable fast-math-flags for DAG nodes")); // Limit the width of DAG chains. This is important in general to prevent @@ -196,6 +198,14 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL, if (PartEVT == ValueVT) return Val; + if (PartEVT.isInteger() && ValueVT.isFloatingPoint() && + ValueVT.bitsLT(PartEVT)) { + // For an FP value in an integer part, we need to truncate to the right + // width first. + PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); + Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val); + } + if (PartEVT.isInteger() && ValueVT.isInteger()) { if (ValueVT.bitsLT(PartEVT)) { // For a truncate, see if we have any information to @@ -319,9 +329,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL, assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() && "Cannot handle this kind of promotion"); // Promoted vector extract - bool Smaller = ValueVT.bitsLE(PartEVT); - return DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, ValueVT, Val); + return DAG.getAnyExtOrTrunc(Val, DL, ValueVT); } @@ -339,11 +347,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL, } if (ValueVT.getVectorNumElements() == 1 && - ValueVT.getVectorElementType() != PartEVT) { - bool Smaller = ValueVT.bitsLE(PartEVT); - Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, ValueVT.getScalarType(), Val); - } + ValueVT.getVectorElementType() != PartEVT) + Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType()); return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); } @@ -387,6 +392,12 @@ static void getCopyToParts(SelectionDAG &DAG, SDLoc DL, assert(NumParts == 1 && "Do not know what to promote to!"); Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val); } else { + if (ValueVT.isFloatingPoint()) { + // FP values need to be bitcast, then extended if they are being put + // into a larger container. + ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); + Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + } assert((PartVT.isInteger() || PartVT == MVT::x86mmx) && ValueVT.isInteger() && "Unknown mismatch!"); @@ -520,9 +531,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL, PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) { // Promoted vector extract - bool Smaller = PartEVT.bitsLE(ValueVT); - Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, PartVT, Val); + Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); } else{ // Vector -> scalar conversion. assert(ValueVT.getVectorNumElements() == 1 && @@ -531,9 +540,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL, ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); - bool Smaller = ValueVT.bitsLE(PartVT); - Val = DAG.getNode((Smaller ? 
ISD::TRUNCATE : ISD::ANY_EXTEND), - DL, PartVT, Val); + Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT); } Parts[0] = Val; @@ -595,8 +602,7 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, const DataLayout &DL, unsigned Reg, Type *Ty) { ComputeValueVTs(TLI, DL, Ty, ValueVTs); - for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT ValueVT = ValueVTs[Value]; + for (EVT ValueVT : ValueVTs) { unsigned NumRegs = TLI.getNumRegisters(Context, ValueVT); MVT RegisterVT = TLI.getRegisterType(Context, ValueVT); for (unsigned i = 0; i != NumRegs; ++i) @@ -907,7 +913,8 @@ void SelectionDAGBuilder::visit(const Instruction &I) { visit(I.getOpcode(), I); - if (!isa<TerminatorInst>(&I) && !HasTailCall) + if (!isa<TerminatorInst>(&I) && !HasTailCall && + !isStatepoint(&I)) // statepoints handle their exports internally CopyToExportRegsIfNeeded(&I); CurInst = nullptr; @@ -943,14 +950,12 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, assert(Variable->isValidLocationForIntrinsic(dl) && "Expected inlined-at fields to agree"); uint64_t Offset = DI->getOffset(); - // A dbg.value for an alloca is always indirect. - bool IsIndirect = isa<AllocaInst>(V) || Offset != 0; SDDbgValue *SDV; if (Val.getNode()) { - if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, Offset, IsIndirect, + if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, Offset, false, Val)) { SDV = DAG.getDbgValue(Variable, Expr, Val.getNode(), Val.getResNo(), - IsIndirect, Offset, dl, DbgSDNodeOrder); + false, Offset, dl, DbgSDNodeOrder); DAG.AddDbgValue(SDV, Val.getNode(), false); } } else @@ -1168,6 +1173,140 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { llvm_unreachable("Can't get register for value!"); } +void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) { + auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); + bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX; + bool IsCoreCLR = Pers == EHPersonality::CoreCLR; + MachineBasicBlock *CatchPadMBB = FuncInfo.MBB; + // In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues. + if (IsMSVCCXX || IsCoreCLR) + CatchPadMBB->setIsEHFuncletEntry(); + + DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other, getControlRoot())); +} + +void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) { + // Update machine-CFG edge. + MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()]; + FuncInfo.MBB->addSuccessor(TargetMBB); + + auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); + bool IsSEH = isAsynchronousEHPersonality(Pers); + if (IsSEH) { + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (TargetMBB != NextBlock(FuncInfo.MBB) || + TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, + getControlRoot(), DAG.getBasicBlock(TargetMBB))); + return; + } + + // Figure out the funclet membership for the catchret's successor. + // This will be used by the FuncletLayout pass to determine how to order the + // BB's. + // A 'catchret' returns to the outer scope's color. 
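// (Concrete cases for the color computation below, IR assumed: a catchpad
//  whose catchswitch has parent token 'none' unwinds back to the function
//  itself, so SuccessorColor is the entry block; for a catchswitch nested
//  inside a cleanuppad, the cleanuppad's parent block is the color.)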
+ Value *ParentPad = I.getParentPad(); + const BasicBlock *SuccessorColor; + if (isa<ConstantTokenNone>(ParentPad)) + SuccessorColor = &FuncInfo.Fn->getEntryBlock(); + else + SuccessorColor = cast<Instruction>(ParentPad)->getParent(); + assert(SuccessorColor && "No parent funclet for catchret!"); + MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor]; + assert(SuccessorColorMBB && "No MBB for SuccessorColor!"); + + // Create the terminator node. + SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other, + getControlRoot(), DAG.getBasicBlock(TargetMBB), + DAG.getBasicBlock(SuccessorColorMBB)); + DAG.setRoot(Ret); +} + +void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) { + // Don't emit any special code for the cleanuppad instruction. It just marks + // the start of a funclet. + FuncInfo.MBB->setIsEHFuncletEntry(); + FuncInfo.MBB->setIsCleanupFuncletEntry(); +} + +/// When an invoke or a cleanupret unwinds to the next EH pad, there are +/// many places it could ultimately go. In the IR, we have a single unwind +/// destination, but in the machine CFG, we enumerate all the possible blocks. +/// This function skips over imaginary basic blocks that hold catchswitch +/// instructions, and finds all the "real" machine +/// basic block destinations. As those destinations may not be successors of +/// EHPadBB, here we also calculate the edge probability to those destinations. +/// The passed-in Prob is the edge probability to EHPadBB. +static void findUnwindDestinations( + FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB, + BranchProbability Prob, + SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>> + &UnwindDests) { + EHPersonality Personality = + classifyEHPersonality(FuncInfo.Fn->getPersonalityFn()); + bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX; + bool IsCoreCLR = Personality == EHPersonality::CoreCLR; + + while (EHPadBB) { + const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock *NewEHPadBB = nullptr; + if (isa<LandingPadInst>(Pad)) { + // Stop on landingpads. They are not funclets. + UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob); + break; + } else if (isa<CleanupPadInst>(Pad)) { + // Stop on cleanup pads. Cleanups are always funclet entries for all known + // personalities. + UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob); + UnwindDests.back().first->setIsEHFuncletEntry(); + break; + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) { + // Add the catchpad handlers to the possible destinations. + for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) { + UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob); + // For MSVC++ and the CLR, catchblocks are funclets and need prologues. + if (IsMSVCCXX || IsCoreCLR) + UnwindDests.back().first->setIsEHFuncletEntry(); + } + NewEHPadBB = CatchSwitch->getUnwindDest(); + } else { + continue; + } + + BranchProbabilityInfo *BPI = FuncInfo.BPI; + if (BPI && NewEHPadBB) + Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB); + EHPadBB = NewEHPadBB; + } +} + +void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) { + // Update successor info. + SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests; + auto UnwindDest = I.getUnwindDest(); + BranchProbabilityInfo *BPI = FuncInfo.BPI; + BranchProbability UnwindDestProb = + (BPI && UnwindDest) + ? 
BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest) + : BranchProbability::getZero(); + findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests); + for (auto &UnwindDest : UnwindDests) { + UnwindDest.first->setIsEHPad(); + addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second); + } + FuncInfo.MBB->normalizeSuccProbs(); + + // Create the terminator node. + SDValue Ret = + DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot()); + DAG.setRoot(Ret); +} + +void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) { + report_fatal_error("visitCatchSwitch not yet implemented!"); +} + void SelectionDAGBuilder::visitRet(const ReturnInst &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); auto &DL = DAG.getDataLayout(); @@ -1186,7 +1325,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { ComputeValueVTs(TLI, DL, PointerType::getUnqual(F->getReturnType()), PtrValueVTs); - SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]); + SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), + DemoteReg, PtrValueVTs[0]); SDValue RetOp = getValue(I.getOperand(0)); SmallVector<EVT, 4> ValueVTs; @@ -1194,12 +1334,18 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets); unsigned NumValues = ValueVTs.size(); + // An aggregate return value cannot wrap around the address space, so + // offsets to its parts don't wrap either. + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + SmallVector<SDValue, 4> Chains(NumValues); for (unsigned i = 0; i != NumValues; ++i) { SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), RetPtr.getValueType(), RetPtr, DAG.getIntPtrConstant(Offsets[i], - getCurSDLoc())); + getCurSDLoc()), + &Flags); Chains[i] = DAG.getStore(Chain, getCurSDLoc(), SDValue(RetOp.getNode(), RetOp.getResNo() + i), @@ -1334,25 +1480,34 @@ bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V, } /// Return branch probability calculated by BranchProbabilityInfo for IR blocks. -uint32_t SelectionDAGBuilder::getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +BranchProbability +SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const { BranchProbabilityInfo *BPI = FuncInfo.BPI; - if (!BPI) - return 0; const BasicBlock *SrcBB = Src->getBasicBlock(); const BasicBlock *DstBB = Dst->getBasicBlock(); - return BPI->getEdgeWeight(SrcBB, DstBB); + if (!BPI) { + // If BPI is not available, set the default probability as 1 / N, where N is + // the number of successors. 
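The uniform fallback implemented just below is easy to illustrate in isolation: with N successors, every edge gets probability 1/N, stored as a numerator/denominator pair rather than a float. A minimal standalone sketch of that rule (plain C++, not the LLVM BranchProbability API):

```cpp
#include <algorithm>
#include <cstdint>

// Rational edge probability, mirroring the Num/Denom representation used
// by BranchProbability.
struct Probability {
  uint32_t Num, Denom; // value = Num / Denom
};

// Uniform fallback when no BranchProbabilityInfo is available: each of the
// N successors gets 1/N; N is clamped to at least 1 so a block with no
// recorded successors stays well-defined.
Probability defaultEdgeProbability(uint32_t NumSuccessors) {
  uint32_t N = std::max<uint32_t>(NumSuccessors, 1u);
  return {1, N}; // e.g. a two-way conditional branch yields 1/2 per edge
}
```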
+ auto SuccSize = std::max<uint32_t>( + std::distance(succ_begin(SrcBB), succ_end(SrcBB)), 1); + return BranchProbability(1, SuccSize); + } + return BPI->getEdgeProbability(SrcBB, DstBB); } -void SelectionDAGBuilder:: -addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst, - uint32_t Weight /* = 0 */) { - if (!Weight) - Weight = getEdgeWeight(Src, Dst); - Src->addSuccessor(Dst, Weight); +void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src, + MachineBasicBlock *Dst, + BranchProbability Prob) { + if (!FuncInfo.BPI) + Src->addSuccessorWithoutProb(Dst); + else { + if (Prob.isUnknown()) + Prob = getEdgeProbability(Src, Dst); + Src->addSuccessor(Dst, Prob); + } } - static bool InBlock(const Value *V, const BasicBlock *BB) { if (const Instruction *I = dyn_cast<Instruction>(V)) return I->getParent() == BB; @@ -1369,8 +1524,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - uint32_t TWeight, - uint32_t FWeight) { + BranchProbability TProb, + BranchProbability FProb) { const BasicBlock *BB = CurBB->getBasicBlock(); // If the leaf of the tree is a comparison, merge the condition into @@ -1385,17 +1540,15 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, ISD::CondCode Condition; if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) { Condition = getICmpCondCode(IC->getPredicate()); - } else if (const FCmpInst *FC = dyn_cast<FCmpInst>(Cond)) { + } else { + const FCmpInst *FC = cast<FCmpInst>(Cond); Condition = getFCmpCondCode(FC->getPredicate()); if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); - } else { - (void)Condition; // silence warning. - llvm_unreachable("Unknown compare instruction"); } CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr, - TBB, FBB, CurBB, TWeight, FWeight); + TBB, FBB, CurBB, TProb, FProb); SwitchCases.push_back(CB); return; } @@ -1403,26 +1556,19 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, // Create a CaseBlock record representing this branch. CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), - nullptr, TBB, FBB, CurBB, TWeight, FWeight); + nullptr, TBB, FBB, CurBB, TProb, FProb); SwitchCases.push_back(CB); } -/// Scale down both weights to fit into uint32_t. -static void ScaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { - uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse; - uint32_t Scale = (NewMax / UINT32_MAX) + 1; - NewTrue = NewTrue / Scale; - NewFalse = NewFalse / Scale; -} - /// FindMergedConditions - If Cond is an expression like void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - unsigned Opc, uint32_t TWeight, - uint32_t FWeight) { + Instruction::BinaryOps Opc, + BranchProbability TProb, + BranchProbability FProb) { // If this node is not part of the or/and tree, emit it as a branch. const Instruction *BOp = dyn_cast<Instruction>(Cond); if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) || @@ -1431,12 +1577,12 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) || !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) { EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, - TWeight, FWeight); + TProb, FProb); return; } // Create TmpBB after CurBB. 
- MachineFunction::iterator BBI = CurBB; + MachineFunction::iterator BBI(CurBB); MachineFunction &MF = DAG.getMachineFunction(); MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock()); CurBB->getParent()->insert(++BBI, TmpBB); @@ -1455,26 +1601,25 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, // The requirement is that // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB) // = TrueProb for original BB. - // Assuming the original weights are A and B, one choice is to set BB1's - // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice - // assumes that + // Assuming the original probabilities are A and B, one choice is to set + // BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to + // A/(1+B) and 2B/(1+B). This choice assumes that // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB. // Another choice is to assume TrueProb for BB1 equals to TrueProb for // TmpBB, but the math is more complicated. - uint64_t NewTrueWeight = TWeight; - uint64_t NewFalseWeight = (uint64_t)TWeight + 2 * (uint64_t)FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + auto NewTrueProb = TProb / 2; + auto NewFalseProb = TProb / 2 + FProb; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + NewTrueProb, NewFalseProb); - NewTrueWeight = TWeight; - NewFalseWeight = 2 * (uint64_t)FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + // Normalize A/2 and B to get A/(1+B) and 2B/(1+B). + SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb}; + BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + Probs[0], Probs[1]); } else { assert(Opc == Instruction::And && "Unknown merge op!"); // Codegen X & Y as: @@ -1491,24 +1636,23 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond, // The requirement is that // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB) // = FalseProb for original BB. - // Assuming the original weights are A and B, one choice is to set BB1's - // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice - // assumes that - // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB. - - uint64_t NewTrueWeight = 2 * (uint64_t)TWeight + (uint64_t)FWeight; - uint64_t NewFalseWeight = FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + // Assuming the original probabilities are A and B, one choice is to set + // BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to + // 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 == + // TrueProb for BB1 * FalseProb for TmpBB. + + auto NewTrueProb = TProb + FProb / 2; + auto NewFalseProb = FProb / 2; // Emit the LHS condition. FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + NewTrueProb, NewFalseProb); - NewTrueWeight = 2 * (uint64_t)TWeight; - NewFalseWeight = FWeight; - ScaleWeights(NewTrueWeight, NewFalseWeight); + // Normalize A and B/2 to get 2A/(1+A) and B/(1+A). + SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2}; + BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end()); // Emit the RHS condition into TmpBB. 
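Before the recursive call into TmpBB below, it is worth verifying the arithmetic these comments assert. Writing A and B for the original true/false probabilities (so A + B = 1), the Or case gives BB1 the pair (A/2, A/2 + B) and TmpBB the normalization of (A/2, B):

```latex
% TmpBB after normalizing (A/2, B), using A + B = 1:
\mathrm{TrueProb}(\mathrm{TmpBB}) = \frac{A/2}{A/2 + B} = \frac{A}{A + 2B} = \frac{A}{1 + B},
\qquad
\mathrm{FalseProb}(\mathrm{TmpBB}) = \frac{B}{A/2 + B} = \frac{2B}{1 + B}.

% Required: TrueProb(BB1) + FalseProb(BB1) * TrueProb(TmpBB) = A
\frac{A}{2} + \Bigl(\frac{A}{2} + B\Bigr)\cdot\frac{A/2}{A/2 + B}
  = \frac{A}{2} + \frac{A}{2} = A.
```

The And case is symmetric: BB1 gets (A + B/2, B/2), TmpBB gets (A, B/2) normalized to (2A/(1+A), B/(1+A)), and FalseProb(BB1) + TrueProb(BB1) * FalseProb(TmpBB) = B/2 + B/2 = B, as required.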
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc, - NewTrueWeight, NewFalseWeight); + Probs[0], Probs[1]); } } @@ -1585,12 +1729,14 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { // jle foo // if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) { - if (!DAG.getTargetLoweringInfo().isJumpExpensive() && - BOp->hasOneUse() && (BOp->getOpcode() == Instruction::And || - BOp->getOpcode() == Instruction::Or)) { + Instruction::BinaryOps Opcode = BOp->getOpcode(); + if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() && + !I.getMetadata(LLVMContext::MD_unpredictable) && + (Opcode == Instruction::And || Opcode == Instruction::Or)) { FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, - BOp->getOpcode(), getEdgeWeight(BrMBB, Succ0MBB), - getEdgeWeight(BrMBB, Succ1MBB)); + Opcode, + getEdgeProbability(BrMBB, Succ0MBB), + getEdgeProbability(BrMBB, Succ1MBB)); // If the compares in later blocks need to use values not currently // exported from this block, export them now. This block should always // be the first entry. @@ -1669,11 +1815,12 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, } // Update successor info - addSuccessorWithWeight(SwitchBB, CB.TrueBB, CB.TrueWeight); + addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb); // TrueBB and FalseBB are always different unless the incoming IR is // degenerate. This only happens when running llc on weird IR. if (CB.TrueBB != CB.FalseBB) - addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight); + addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb); + SwitchBB->normalizeSuccProbs(); // If the lhs block is the next block, invert the condition so that we can // fall through to the lhs instead of the rhs block. @@ -1797,10 +1944,10 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, GuardPtr, MachinePointerInfo(IRGuard, 0), true, false, false, Align); - SDValue StackSlot = DAG.getLoad(PtrTy, dl, DAG.getEntryNode(), - StackSlotPtr, - MachinePointerInfo::getFixedStack(FI), - true, false, false, Align); + SDValue StackSlot = DAG.getLoad( + PtrTy, dl, DAG.getEntryNode(), StackSlotPtr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), true, + false, false, Align); // Perform the comparison via a subtract/getsetcc. 
EVT VT = Guard.getValueType(); @@ -1837,7 +1984,7 @@ SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, - nullptr, 0, false, getCurSDLoc(), false, false).second; + None, false, getCurSDLoc(), false, false).second; DAG.setRoot(Chain); } @@ -1884,8 +2031,9 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock* MBB = B.Cases[0].ThisBB; - addSuccessorWithWeight(SwitchBB, B.Default); - addSuccessorWithWeight(SwitchBB, MBB); + addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb); + addSuccessorWithProb(SwitchBB, MBB, B.Prob); + SwitchBB->normalizeSuccProbs(); SDValue BrRange = DAG.getNode(ISD::BRCOND, dl, MVT::Other, CopyTo, RangeCmp, @@ -1902,7 +2050,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, /// visitBitTestCase - this function produces one "bit test" void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, MachineBasicBlock* NextMBB, - uint32_t BranchWeightToNext, + BranchProbability BranchProbToNext, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB) { @@ -1938,10 +2086,14 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, AndOp, DAG.getConstant(0, dl, VT), ISD::SETNE); } - // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight. - addSuccessorWithWeight(SwitchBB, B.TargetBB, B.ExtraWeight); - // The branch weight from SwitchBB to NextMBB is BranchWeightToNext. - addSuccessorWithWeight(SwitchBB, NextMBB, BranchWeightToNext); + // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb. + addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb); + // The branch probability from SwitchBB to NextMBB is BranchProbToNext. + addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext); + // It is not guaranteed that the sum of B.ExtraProb and BranchProbToNext is + // one as they are relative probabilities (and thus work more like weights), + // and hence we need to normalize them to let the sum of them become one. + SwitchBB->normalizeSuccProbs(); SDValue BrAnd = DAG.getNode(ISD::BRCOND, dl, MVT::Other, getControlRoot(), @@ -1958,9 +2110,10 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { MachineBasicBlock *InvokeMBB = FuncInfo.MBB; - // Retrieve successors. + // Retrieve successors. Look through artificial IR level blocks like + // catchswitch for successors. MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)]; - MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)]; + const BasicBlock *EHPadBB = I.getSuccessor(1); const Value *Callee(I.getCalledValue()); const Function *Fn = dyn_cast<Function>(Callee); @@ -1975,14 +2128,14 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { break; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: - visitPatchpoint(&I, LandingPad); + visitPatchpoint(&I, EHPadBB); break; case Intrinsic::experimental_gc_statepoint: - LowerStatepoint(ImmutableStatepoint(&I), LandingPad); + LowerStatepoint(ImmutableStatepoint(&I), EHPadBB); break; } } else - LowerCallTo(&I, getValue(Callee), false, LandingPad); + LowerCallTo(&I, getValue(Callee), false, EHPadBB); // If the value of the invoke is used outside of its defining block, make it // available as a virtual register. 
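The normalizeSuccProbs calls added throughout these hunks share one rationale, spelled out in the bit-test comment above: probabilities attached edge-by-edge are really relative weights and need not sum to one. A standalone sketch of the invariant being restored:

```cpp
#include <vector>

// Normalize a list of non-negative weights in place so they sum to 1.0.
// This mirrors what MachineBasicBlock::normalizeSuccProbs guarantees for
// successor probabilities after edges are added piecemeal.
void normalize(std::vector<double> &Weights) {
  double Sum = 0.0;
  for (double W : Weights)
    Sum += W;
  if (Sum == 0.0)
    return; // nothing sensible to do; leave the weights untouched
  for (double &W : Weights)
    W /= Sum; // e.g. {1.0, 3.0} becomes {0.25, 0.75}
}
```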
@@ -1992,9 +2145,20 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { CopyToExportRegsIfNeeded(&I); } - // Update successor info - addSuccessorWithWeight(InvokeMBB, Return); - addSuccessorWithWeight(InvokeMBB, LandingPad); + SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests; + BranchProbabilityInfo *BPI = FuncInfo.BPI; + BranchProbability EHPadBBProb = + BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB) + : BranchProbability::getZero(); + findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests); + + // Update successor info. + addSuccessorWithProb(InvokeMBB, Return); + for (auto &UnwindDest : UnwindDests) { + UnwindDest.first->setIsEHPad(); + addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second); + } + InvokeMBB->normalizeSuccProbs(); // Drop into normal successor. DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), @@ -2007,7 +2171,7 @@ void SelectionDAGBuilder::visitResume(const ResumeInst &RI) { } void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { - assert(FuncInfo.MBB->isLandingPad() && + assert(FuncInfo.MBB->isEHPad() && "Call to landingpad not in landing pad!"); MachineBasicBlock *MBB = FuncInfo.MBB; @@ -2017,8 +2181,16 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother to create these DAG nodes. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.getExceptionPointerRegister() == 0 && - TLI.getExceptionSelectorRegister() == 0) + const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn(); + if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 && + TLI.getExceptionSelectorRegister(PersonalityFn) == 0) + return; + + // If landingpad's return type is token type, we don't create DAG nodes + // for its exception pointer and selector value. The extraction of exception + // pointer or selector value from token type landingpads is not currently + // supported. + if (LP.getType()->isTokenTy()) return; SmallVector<EVT, 2> ValueVTs; @@ -2074,8 +2246,7 @@ void SelectionDAGBuilder::sortAndRangeify(CaseClusterVector &Clusters) { // If this case has the same successor and is a neighbour, merge it into // the previous cluster. 
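The two statements that follow implement the neighbour merge described in this comment, now accumulating a BranchProbability instead of a uint32_t weight (which is why the old overflow assert disappears). As a worked example, cases 1, 2, 3 that all target the same block collapse into one [1, 3] range whose probability is the sum of the three. A sketch with a hypothetical, simplified Cluster type:

```cpp
#include <cstdint>

// Hypothetical simplified cluster; the real CaseCluster carries
// ConstantInt bounds, an MBB pointer, and a BranchProbability.
struct Cluster {
  int64_t Low, High; // inclusive case-value range
  int Succ;          // successor block id
  double Prob;       // relative probability of selecting this cluster
};

// Merge Next into Prev when it is adjacent and shares the successor,
// mirroring the neighbour merge in sortAndRangeify.
bool tryMerge(Cluster &Prev, const Cluster &Next) {
  if (Next.Succ != Prev.Succ || Next.Low != Prev.High + 1)
    return false;
  Prev.High = Next.High;  // e.g. [1,2] + [3,3] -> [1,3]
  Prev.Prob += Next.Prob; // probabilities simply accumulate
  return true;
}
```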
Clusters[DstIndex - 1].High = CaseVal; - Clusters[DstIndex - 1].Weight += CC.Weight; - assert(Clusters[DstIndex - 1].Weight >= CC.Weight && "Weight overflow!"); + Clusters[DstIndex - 1].Prob += CC.Prob; } else { std::memmove(&Clusters[DstIndex++], &Clusters[SrcIndex], sizeof(Clusters[SrcIndex])); @@ -2109,8 +2280,9 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { continue; MachineBasicBlock *Succ = FuncInfo.MBBMap[BB]; - addSuccessorWithWeight(IndirectBrMBB, Succ); + addSuccessorWithProb(IndirectBrMBB, Succ); } + IndirectBrMBB->normalizeSuccProbs(); DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(), MVT::Other, getControlRoot(), @@ -2119,7 +2291,8 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { if (DAG.getTarget().Options.TrapUnreachable) - DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); + DAG.setRoot( + DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } void SelectionDAGBuilder::visitFSub(const User &I) { @@ -2260,6 +2433,10 @@ void SelectionDAGBuilder::visitFCmp(const User &I) { SDValue Op1 = getValue(I.getOperand(0)); SDValue Op2 = getValue(I.getOperand(1)); ISD::CondCode Condition = getFCmpCondCode(predicate); + + // FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them. + // FIXME: We should propagate the fast-math-flags to the DAG node itself for + // further optimization, but currently FMF is only applicable to binary nodes. if (TM.Options.NoNaNsFPMath) Condition = getFCmpCodeWithoutNaN(Condition); EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), @@ -2284,27 +2461,74 @@ void SelectionDAGBuilder::visitSelect(const User &I) { // Min/max matching is only viable if all output VTs are the same. if (std::equal(ValueVTs.begin(), ValueVTs.end(), ValueVTs.begin())) { - Value *LHS, *RHS; - SelectPatternFlavor SPF = matchSelectPattern(const_cast<User*>(&I), LHS, RHS); - ISD::NodeType Opc = ISD::DELETED_NODE; - switch (SPF) { - case SPF_UMAX: Opc = ISD::UMAX; break; - case SPF_UMIN: Opc = ISD::UMIN; break; - case SPF_SMAX: Opc = ISD::SMAX; break; - case SPF_SMIN: Opc = ISD::SMIN; break; - default: break; - } - EVT VT = ValueVTs[0]; LLVMContext &Ctx = *DAG.getContext(); auto &TLI = DAG.getTargetLoweringInfo(); - while (TLI.getTypeAction(Ctx, VT) == TargetLoweringBase::TypeSplitVector) + + // We care about the legality of the operation after it has been type + // legalized. + while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal && + VT != TLI.getTypeToTransformTo(Ctx, VT)) VT = TLI.getTypeToTransformTo(Ctx, VT); - if (Opc != ISD::DELETED_NODE && TLI.isOperationLegalOrCustom(Opc, VT) && - // If the underlying comparison instruction is used by any other instruction, - // the consumed instructions won't be destroyed, so it is not profitable - // to convert to a min/max. + // If the vselect is legal, assume we want to leave this as a vector setcc + + // vselect. Otherwise, if this is going to be scalarized, we want to see if + // min/max is legal on the scalar type. 
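For concreteness, the flavor switch that follows translates compare-plus-select idioms recognized by matchSelectPattern into min/max DAG nodes; the floating-point flavors additionally consult the matched NaN behavior. A minimal illustration with abridged, hypothetical enum names (the IR idiom is shown in the comment):

```cpp
// IR idiom recognized as a signed minimum (SPF_SMIN):
//   %c = icmp slt i32 %a, %b
//   %s = select i1 %c, i32 %a, i32 %b   ; picks the smaller operand
enum Flavor { SPF_UMIN, SPF_UMAX, SPF_SMIN, SPF_SMAX, SPF_OTHER };
enum Opcode { UMIN, UMAX, SMIN, SMAX, DELETED_NODE };

// Integer part of the flavor -> DAG-opcode mapping; anything unmatched
// falls back to ordinary select lowering.
Opcode flavorToOpcode(Flavor F) {
  switch (F) {
  case SPF_UMIN: return UMIN;
  case SPF_UMAX: return UMAX;
  case SPF_SMIN: return SMIN;
  case SPF_SMAX: return SMAX;
  default:       return DELETED_NODE;
  }
}
```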
+ bool UseScalarMinMax = VT.isVector() && + !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT); + + Value *LHS, *RHS; + auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS); + ISD::NodeType Opc = ISD::DELETED_NODE; + switch (SPR.Flavor) { + case SPF_UMAX: Opc = ISD::UMAX; break; + case SPF_UMIN: Opc = ISD::UMIN; break; + case SPF_SMAX: Opc = ISD::SMAX; break; + case SPF_SMIN: Opc = ISD::SMIN; break; + case SPF_FMINNUM: + switch (SPR.NaNBehavior) { + case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); + case SPNB_RETURNS_NAN: Opc = ISD::FMINNAN; break; + case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break; + case SPNB_RETURNS_ANY: { + if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT)) + Opc = ISD::FMINNUM; + else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)) + Opc = ISD::FMINNAN; + else if (UseScalarMinMax) + Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ? + ISD::FMINNUM : ISD::FMINNAN; + break; + } + } + break; + case SPF_FMAXNUM: + switch (SPR.NaNBehavior) { + case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); + case SPNB_RETURNS_NAN: Opc = ISD::FMAXNAN; break; + case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break; + case SPNB_RETURNS_ANY: + + if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT)) + Opc = ISD::FMAXNUM; + else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)) + Opc = ISD::FMAXNAN; + else if (UseScalarMinMax) + Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ? + ISD::FMAXNUM : ISD::FMAXNAN; + break; + } + break; + default: break; + } + + if (Opc != ISD::DELETED_NODE && + (TLI.isOperationLegalOrCustom(Opc, VT) || + (UseScalarMinMax && + TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) && + // If the underlying comparison instruction is used by any other + // instruction, the consumed instructions won't be destroyed, so it is + // not profitable to convert to a min/max. cast<SelectInst>(&I)->getCondition()->hasOneUse()) { OpCode = Opc; LHSVal = getValue(LHS); @@ -2781,8 +3005,15 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (Field) { // N = N + Offset uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); + + // In an inbounds GEP with an offset that is nonnegative even when + // interpreted as signed, assume there is no unsigned overflow. + SDNodeFlags Flags; + if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds()) + Flags.setNoUnsignedWrap(true); + N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, - DAG.getConstant(Offset, dl, N.getValueType())); + DAG.getConstant(Offset, dl, N.getValueType()), &Flags); } Ty = StTy->getElementType(Field); @@ -2807,7 +3038,14 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { SDValue OffsVal = VectorWidth ? DAG.getConstant(Offs, dl, MVT::getVectorVT(PtrTy, VectorWidth)) : DAG.getConstant(Offs, dl, PtrTy); + + // In an inbounds GEP with an offset that is nonnegative even when + // interpreted as signed, assume there is no unsigned overflow. + SDNodeFlags Flags; + if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds()) + Flags.setNoUnsignedWrap(true); + + N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags); continue; } @@ -2879,10 +3117,13 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { Align = 0; // Round the size of the allocation up to the stack alignment size - // by add SA-1 to the size. + // by adding SA-1 to the size. This doesn't overflow because we're computing + // an address inside an alloca.
+ SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, - DAG.getIntPtrConstant(StackAlign - 1, dl)); + DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags); // Mask out the low bits for alignment purposes. AllocSize = DAG.getNode(ISD::AND, dl, @@ -2920,7 +3161,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { // throughout the function's lifetime. bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr && - isDereferenceablePointer(SV, *DAG.getTarget().getDataLayout()); + isDereferenceablePointer(SV, DAG.getDataLayout()); unsigned Alignment = I.getAlignment(); AAMDNodes AAInfo; @@ -2940,8 +3181,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); - else if (AA->pointsToConstantMemory( - MemoryLocation(SV, AA->getTypeStoreSize(Ty), AAInfo))) { + else if (AA->pointsToConstantMemory(MemoryLocation( + SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; @@ -2955,6 +3196,11 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile) Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG); + // An aggregate load cannot wrap around the address space, so offsets to its + // parts don't wrap either. + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + SmallVector<SDValue, 4> Values(NumValues); SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues)); EVT PtrVT = Ptr.getValueType(); @@ -2975,7 +3221,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { } SDValue A = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, - DAG.getConstant(Offsets[i], dl, PtrVT)); + DAG.getConstant(Offsets[i], dl, PtrVT), + &Flags); SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A, MachinePointerInfo(SV, Offsets[i]), isVolatile, isNonTemporal, isInvariant, Alignment, AAInfo, @@ -3030,6 +3277,11 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { AAMDNodes AAInfo; I.getAAMetadata(AAInfo); + // An aggregate store cannot wrap around the address space, so offsets to its + // parts don't wrap either. + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + unsigned ChainI = 0; for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // See visitLoad comments. @@ -3040,7 +3292,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { ChainI = 0; } SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, - DAG.getConstant(Offsets[i], dl, PtrVT)); + DAG.getConstant(Offsets[i], dl, PtrVT), &Flags); SDValue St = DAG.getStore(Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add, MachinePointerInfo(PtrV, Offsets[i]), @@ -3056,7 +3308,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { SDLoc sdl = getCurSDLoc(); - // llvm.masked.store.*(Src0, Ptr, alignemt, Mask) + // llvm.masked.store.*(Src0, Ptr, alignment, Mask) Value *PtrOperand = I.getArgOperand(1); SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(I.getArgOperand(0)); @@ -3080,63 +3332,70 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) { setValue(&I, StoreNode); } -// Gather/scatter receive a vector of pointers.
-// This vector of pointers may be represented as a base pointer + vector of -// indices, it depends on GEP and instruction preceeding GEP -// that calculates indices -static bool getUniformBase(Value *& Ptr, SDValue& Base, SDValue& Index, +// Get a uniform base for the Gather/Scatter intrinsic. +// The first argument of the Gather/Scatter intrinsic is a vector of pointers. +// We try to represent it as a base pointer + vector of indices. +// Usually, the vector of pointers comes from a 'getelementptr' instruction. +// The first operand of the GEP may be a single pointer or a vector of pointers +// Example: +// %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind +// or +// %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind +// %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, .. +// +// When the first GEP operand is a single pointer - it is the uniform base we +// are looking for. If first operand of the GEP is a splat vector - we +// extract the splat value and use it as a uniform base. +// In all other cases the function returns 'false'. +// +static bool getUniformBase(const Value *& Ptr, SDValue& Base, SDValue& Index, SelectionDAGBuilder* SDB) { - assert (Ptr->getType()->isVectorTy() && "Uexpected pointer type"); - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - if (!Gep || Gep->getNumOperands() > 2) + SelectionDAG& DAG = SDB->DAG; + LLVMContext &Context = *DAG.getContext(); + + assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP || GEP->getNumOperands() > 2) return false; - ShuffleVectorInst *ShuffleInst = - dyn_cast<ShuffleVectorInst>(Gep->getPointerOperand()); - if (!ShuffleInst || !ShuffleInst->getMask()->isNullValue() || - cast<Instruction>(ShuffleInst->getOperand(0))->getOpcode() != - Instruction::InsertElement) + + const Value *GEPPtr = GEP->getPointerOperand(); + if (!GEPPtr->getType()->isVectorTy()) + Ptr = GEPPtr; + else if (!(Ptr = getSplatValue(GEPPtr))) return false; - Ptr = cast<InsertElementInst>(ShuffleInst->getOperand(0))->getOperand(1); + Value *IndexVal = GEP->getOperand(1); - SelectionDAG& DAG = SDB->DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // Check is the Ptr is inside current basic block - // If not, look for the shuffle instruction - if (SDB->findValue(Ptr)) - Base = SDB->getValue(Ptr); - else if (SDB->findValue(ShuffleInst)) { - SDValue ShuffleNode = SDB->getValue(ShuffleInst); - SDLoc sdl = ShuffleNode; - Base = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, sdl, - ShuffleNode.getValueType().getScalarType(), ShuffleNode, - DAG.getConstant(0, sdl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - SDB->setValue(Ptr, Base); - } - else + // The operands of the GEP may be defined in another basic block. + // In this case we'll not find nodes for the operands. + if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal)) return false; - Value *IndexVal = Gep->getOperand(1); - if (SDB->findValue(IndexVal)) { - Index = SDB->getValue(IndexVal); + Base = SDB->getValue(Ptr); + Index = SDB->getValue(IndexVal); - if (SExtInst* Sext = dyn_cast<SExtInst>(IndexVal)) { + // Suppress sign extension.
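The splat case described in the comment block above usually reaches this code in its canonical IR form, an insertelement broadcast by a zero-mask shufflevector; getSplatValue peels that back to the scalar pointer. An illustrative sketch (IR shown in comments; value names are hypothetical):

```cpp
// Canonical splat of a scalar pointer feeding the GEP:
//   %ins   = insertelement <8 x i32*> undef, i32* %ptr, i32 0
//   %splat = shufflevector <8 x i32*> %ins, <8 x i32*> undef,
//                          <8 x i32> zeroinitializer
//   %gep   = getelementptr i32, <8 x i32*> %splat, <8 x i32> %ind
//
// getSplatValue(%splat) returns %ptr, so getUniformBase reports
//   Base  = %ptr   (one scalar pointer)
//   Index = %ind   (the per-lane offsets)
// and the gather/scatter can be emitted against a uniform base rather
// than a full vector of pointers.
```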
+ if (SExtInst* Sext = dyn_cast<SExtInst>(IndexVal)) { + if (SDB->findValue(Sext->getOperand(0))) { IndexVal = Sext->getOperand(0); - if (SDB->findValue(IndexVal)) - Index = SDB->getValue(IndexVal); + Index = SDB->getValue(IndexVal); } - return true; } - return false; + if (!Index.getValueType().isVector()) { + unsigned GEPWidth = GEP->getType()->getVectorNumElements(); + EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth); + SmallVector<SDValue, 16> Ops(GEPWidth, Index); + Index = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Index), VT, Ops); + } + return true; } void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // llvm.masked.scatter.*(Src0, Ptrs, alignment, Mask) - Value *Ptr = I.getArgOperand(1); + const Value *Ptr = I.getArgOperand(1); SDValue Src0 = getValue(I.getArgOperand(0)); SDValue Mask = getValue(I.getArgOperand(3)); EVT VT = Src0.getValueType(); @@ -3150,10 +3409,10 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) { SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); - Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; + const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr; MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MachinePointerInfo(MemOpBasePtr), MachineMemOperand::MOStore, VT.getStoreSize(), @@ -3190,7 +3449,8 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) { SDValue InChain = DAG.getRoot(); if (AA->pointsToConstantMemory(MemoryLocation( - PtrOperand, AA->getTypeStoreSize(I.getType()), AAInfo))) { + PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), + AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. InChain = DAG.getEntryNode(); } @@ -3212,7 +3472,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) - Value *Ptr = I.getArgOperand(0); + const Value *Ptr = I.getArgOperand(0); SDValue Src0 = getValue(I.getArgOperand(3)); SDValue Mask = getValue(I.getArgOperand(2)); @@ -3229,12 +3489,13 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDValue Root = DAG.getRoot(); SDValue Base; SDValue Index; - Value *BasePtr = Ptr; + const Value *BasePtr = Ptr; bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && - AA->pointsToConstantMemory( - MemoryLocation(BasePtr, AA->getTypeStoreSize(I.getType()), AAInfo))) { + AA->pointsToConstantMemory(MemoryLocation( + BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()), + AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); ConstantMemory = true; @@ -3511,6 +3772,8 @@ getF32Constant(SelectionDAG &DAG, unsigned Flt, SDLoc dl) { static SDValue getLimitedPrecisionExp2(SDValue t0, SDLoc dl, SelectionDAG &DAG) { + // TODO: What fast-math-flags should be set on the floating-point nodes? + // IntegerPartOfX = (int32_t)t0; SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0); @@ -3609,6 +3872,8 @@ static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG, // // #define LOG2OFe 1.4426950f // t0 = Op * LOG2OFe + + // TODO: What fast-math-flags should be set here?
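The limited-precision expansions in this region (expandExp here, and the expandLog, expandLog2, expandLog10, and expandPow routines below) all funnel through a base-2 exponential or logarithm plus a small polynomial for the fractional part; the constant 0x3fb8aa3b used just below is log2(e) ≈ 1.4426950f. For reference, the identities being relied on:

```latex
e^{x} = 2^{\,x \log_2 e}, \qquad
\ln x = \frac{\log_2 x}{\log_2 e}, \qquad
\log_{10} x = \log_2 x \cdot \log_{10} 2, \qquad
x^{y} = 2^{\,y \log_2 x}.
```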
SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, getF32Constant(DAG, 0x3fb8aa3b, dl)); return getLimitedPrecisionExp2(t0, dl, DAG); @@ -3622,6 +3887,9 @@ static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG, /// limited-precision mode. static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { + + // TODO: What fast-math-flags should be set on the floating-point nodes? + if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); @@ -3718,6 +3986,9 @@ static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG, /// limited-precision mode. static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { + + // TODO: What fast-math-flags should be set on the floating-point nodes? + if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); @@ -3813,6 +4084,9 @@ static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG, /// limited-precision mode. static SDValue expandLog10(SDLoc dl, SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { + + // TODO: What fast-math-flags should be set on the floating-point nodes? + if (Op.getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); @@ -3922,6 +4196,7 @@ static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS, } } + // TODO: What fast-math-flags should be set on the FMUL node? if (IsExp10) { // Put the exponent in the right bit position for later addition to the // final result: @@ -3955,9 +4230,9 @@ static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, return DAG.getConstantFP(1.0, DL, LHS.getValueType()); const Function *F = DAG.getMachineFunction().getFunction(); - if (!F->hasFnAttribute(Attribute::OptimizeForSize) || - // If optimizing for size, don't insert too many multiplies. This - // inserts up to 5 multiplies. + if (!F->optForSize() || + // If optimizing for size, don't insert too many multiplies. + // This inserts up to 5 multiplies. countPopulation(Val) + Log2_32(Val) < 7) { // We use the simple binary decomposition method to generate the multiply // sequence. There are more optimal ways to do this (for example, @@ -3965,6 +4240,8 @@ static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, // the benefit of being both really simple and much better than a libcall. SDValue Res; // Logically starts equal to 1.0 SDValue CurSquare = LHS; + // TODO: Intrinsics should have fast-math-flags that propagate to these + // nodes. while (Val) { if (Val & 1) { if (Res.getNode()) @@ -3990,22 +4267,20 @@ static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS); } -// getTruncatedArgReg - Find underlying register used for an truncated -// argument. -static unsigned getTruncatedArgReg(const SDValue &N) { - if (N.getOpcode() != ISD::TRUNCATE) +// getUnderlyingArgReg - Find underlying register used for a truncated or +// bitcasted argument. 
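The multiply sequence ExpandPowI emits above is ordinary square-and-multiply over the bits of the exponent; since the first set bit only initializes the running result, the countPopulation(Val) + Log2_32(Val) < 7 test bounds the emitted multiplies at 5. A standalone sketch of the same decomposition:

```cpp
// Square-and-multiply expansion of X^N, mirroring ExpandPowI's binary
// decomposition: popcount(N) - 1 result multiplies plus floor(log2(N))
// squarings.
double powi(double X, unsigned N) {
  double Result = 1.0;  // LLVM elides this initial multiply by starting
                        // the result at the first needed square instead
  double CurSquare = X; // holds X^(2^i) on iteration i
  while (N) {
    if (N & 1)
      Result *= CurSquare;  // fold this exponent bit into the result
    CurSquare *= CurSquare; // square for the next bit position
    N >>= 1;
  }
  return Result;
}
```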
+static unsigned getUnderlyingArgReg(const SDValue &N) { + switch (N.getOpcode()) { + case ISD::CopyFromReg: + return cast<RegisterSDNode>(N.getOperand(1))->getReg(); + case ISD::BITCAST: + case ISD::AssertZext: + case ISD::AssertSext: + case ISD::TRUNCATE: + return getUnderlyingArgReg(N.getOperand(0)); + default: return 0; - - const SDValue &Ext = N.getOperand(0); - if (Ext.getOpcode() == ISD::AssertZext || - Ext.getOpcode() == ISD::AssertSext) { - const SDValue &CFR = Ext.getOperand(0); - if (CFR.getOpcode() == ISD::CopyFromReg) - return cast<RegisterSDNode>(CFR.getOperand(1))->getReg(); - if (CFR.getOpcode() == ISD::TRUNCATE) - return getTruncatedArgReg(CFR); } - return 0; } /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function @@ -4033,11 +4308,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( Op = MachineOperand::CreateFI(FI); if (!Op && N.getNode()) { - unsigned Reg; - if (N.getOpcode() == ISD::CopyFromReg) - Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg(); - else - Reg = getTruncatedArgReg(N); + unsigned Reg = getUnderlyingArgReg(N); if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) { MachineRegisterInfo &RegInfo = MF.getRegInfo(); unsigned PR = RegInfo.getLiveInPhysReg(Reg); @@ -4145,14 +4416,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::longjmp: return &"_longjmp"[!TLI.usesUnderscoreLongJmp()]; case Intrinsic::memcpy: { - // FIXME: this definition of "user defined address space" is x86-specific - // Assert for address < 256 since we support only user defined address - // spaces. - assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() - < 256 && - cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace() - < 256 && - "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); @@ -4169,12 +4432,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } case Intrinsic::memset: { - // FIXME: this definition of "user defined address space" is x86-specific - // Assert for address < 256 since we support only user defined address - // spaces. - assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() - < 256 && - "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); @@ -4189,14 +4446,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } case Intrinsic::memmove: { - // FIXME: this definition of "user defined address space" is x86-specific - // Assert for address < 256 since we support only user defined address - // spaces. - assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace() - < 256 && - cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace() - < 256 && - "Unknown address space"); SDValue Op1 = getValue(I.getArgOperand(0)); SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); @@ -4238,33 +4487,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address)) Address = BCI->getOperand(0); // Parameters are handled specially. 
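Returning to getUnderlyingArgReg defined above: it generalizes the old getTruncatedArgReg by recursing through any chain of value-preserving wrappers until it reaches the CopyFromReg that carries the argument. A self-contained sketch of the same unwrap over a hypothetical node type (not the SelectionDAG API):

```cpp
// Hypothetical, simplified node: a CopyFromReg leaf wrapped in any number
// of bitcasts, assertions, or truncations.
enum Kind { CopyFromReg, BitCast, AssertZext, AssertSext, Truncate, Other };
struct Node {
  Kind K;
  const Node *Operand; // wrapped value; null for leaves
  unsigned Reg;        // meaningful only when K == CopyFromReg
};

unsigned underlyingArgReg(const Node &N) {
  switch (N.K) {
  case CopyFromReg:
    return N.Reg; // found the register carrying the argument
  case BitCast:
  case AssertZext:
  case AssertSext:
  case Truncate:
    return underlyingArgReg(*N.Operand); // peel one wrapper and recurse
  default:
    return 0; // not a plainly wrapped argument
  }
}
```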
- bool isParameter = Variable->getTag() == dwarf::DW_TAG_arg_variable || - isa<Argument>(Address); - - const AllocaInst *AI = dyn_cast<AllocaInst>(Address); - - if (isParameter && !AI) { - FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N.getNode()); - if (FINode) - // Byval parameter. We have a frame index at this point. - SDV = DAG.getFrameIndexDbgValue( - Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder); - else { - // Address is an argument, so try to emit its dbg value using - // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, - N); - return nullptr; - } - } else if (AI) + bool isParameter = Variable->isParameter() || isa<Argument>(Address); + auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode()); + if (isParameter && FINode) { + // Byval parameter. We have a frame index at this point. + SDV = DAG.getFrameIndexDbgValue(Variable, Expression, + FINode->getIndex(), 0, dl, SDNodeOrder); + } else if (isa<Argument>(Address)) { + // Address is an argument, so try to emit its dbg value using + // virtual register info from the FuncInfo.ValueMap. + EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false, + N); + return nullptr; + } else { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), true, 0, dl, SDNodeOrder); - else { - // Can't do anything with other non-AI cases yet. - DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); - DEBUG(dbgs() << "non-AllocaInst issue for Address: \n\t"); - DEBUG(Address->dump()); - return nullptr; } DAG.AddDbgValue(SDV, N.getNode(), isParameter); } else { @@ -4315,12 +4552,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // Check unused arguments map. N = UnusedArgNodeMap[V]; if (N.getNode()) { - // A dbg.value for an alloca is always indirect. - bool IsIndirect = isa<AllocaInst>(V) || Offset != 0; if (!EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, - IsIndirect, N)) { + false, N)) { SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(), - IsIndirect, Offset, dl, SDNodeOrder); + false, Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, N.getNode(), false); } } else if (!V->use_empty() ) { @@ -4421,6 +4656,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getRoot(), getValue(I.getArgOperand(0)))); return nullptr; } + case Intrinsic::eh_sjlj_setup_dispatch: { + DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other, + getRoot())); + return nullptr; + } case Intrinsic::masked_gather: visitMaskedGather(I); @@ -4614,6 +4854,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); } else { + // TODO: Intrinsic calls should have fast-math-flags. 
SDValue Mul = DAG.getNode(ISD::FMUL, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), @@ -4652,6 +4893,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(Res.getValue(1)); return nullptr; } + case Intrinsic::bitreverse: + setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); + return nullptr; case Intrinsic::bswap: setValue(&I, DAG.getNode(ISD::BSWAP, sdl, getValue(I.getArgOperand(0)).getValueType(), @@ -4693,6 +4939,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); return nullptr; } + case Intrinsic::get_dynamic_area_offset: { + SDValue Op = getRoot(); + EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout()); + EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); + // Result type for @llvm.get.dynamic.area.offset should match PtrTy for + // target. + if (PtrTy != ResTy) + report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset" + " intrinsic!"); + Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy), + Op); + DAG.setRoot(Op); + setValue(&I, Res); + return nullptr; + } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); @@ -4743,8 +5004,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue FIN = DAG.getFrameIndex(FI, PtrTy); // Store the stack protector onto the stack. - Res = DAG.getStore(Chain, sdl, Src, FIN, - MachinePointerInfo::getFixedStack(FI), + Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI), true, false, 0); setValue(&I, Res); DAG.setRoot(Res); @@ -4946,9 +5207,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::clear_cache: return TLI.getClearCacheBuiltinName(); - case Intrinsic::eh_actions: - setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout()))); - return nullptr; case Intrinsic::donothing: // ignore return nullptr; @@ -4965,20 +5223,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { visitStatepoint(I); return nullptr; } - case Intrinsic::experimental_gc_result_int: - case Intrinsic::experimental_gc_result_float: - case Intrinsic::experimental_gc_result_ptr: case Intrinsic::experimental_gc_result: { visitGCResult(I); return nullptr; } case Intrinsic::experimental_gc_relocate: { - visitGCRelocate(I); + visitGCRelocate(cast<GCRelocateInst>(I)); return nullptr; } case Intrinsic::instrprof_increment: llvm_unreachable("instrprof failed to lower an increment"); - + case Intrinsic::instrprof_value_profile: + llvm_unreachable("instrprof failed to lower a value profiling call"); case Intrinsic::localescape: { MachineFunction &MF = DAG.getMachineFunction(); const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo(); @@ -5032,19 +5288,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } - case Intrinsic::eh_begincatch: - case Intrinsic::eh_endcatch: - llvm_unreachable("begin/end catch intrinsics not lowered in codegen"); + + case Intrinsic::eh_exceptionpointer: case Intrinsic::eh_exceptioncode: { - unsigned Reg = TLI.getExceptionPointerRegister(); - assert(Reg && "cannot get exception code on this platform"); + // Get the exception 
pointer vreg, copy from it, and resize it to fit. + const auto *CPI = cast<CatchPadInst>(I.getArgOperand(0)); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT); - assert(FuncInfo.MBB->isLandingPad() && "eh.exceptioncode in non-lpad"); - unsigned VReg = FuncInfo.MBB->addLiveIn(Reg, PtrRC); + unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC); SDValue N = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT); - N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32); + if (Intrinsic == Intrinsic::eh_exceptioncode) + N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32); setValue(&I, N); return nullptr; } @@ -5053,11 +5308,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, - MachineBasicBlock *LandingPad) { + const BasicBlock *EHPadBB) { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); MCSymbol *BeginLabel = nullptr; - if (LandingPad) { + if (EHPadBB) { // Insert a label before the invoke call to mark the try range. This can be // used to detect deletion of the invoke via the MachineModuleInfo. BeginLabel = MMI.getContext().createTempSymbol(); @@ -5067,7 +5322,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, unsigned CallSiteIndex = MMI.getCurrentCallSite(); if (CallSiteIndex) { MMI.setCallSiteBeginLabel(BeginLabel, CallSiteIndex); - LPadToCallSiteMap[LandingPad].push_back(CallSiteIndex); + LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex); // Now that the call site is handled, stop tracking it. MMI.setCurrentCallSite(0); @@ -5100,14 +5355,21 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, DAG.setRoot(Result.second); } - if (LandingPad) { + if (EHPadBB) { // Insert a label at the end of the invoke call to mark the try range. This // can be used to detect deletion of the invoke via the MachineModuleInfo. MCSymbol *EndLabel = MMI.getContext().createTempSymbol(); DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel)); // Inform MachineModuleInfo of range. 
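The begin/end labels managed here bracket the lowered call so the exception tables can map its PC range to a handler. Schematically (pseudo-assembly in comments; label names are illustrative):

```cpp
// Try-range bracketing emitted around an invoke's call:
//
//   Ltmp0:            ; EH_LABEL, BeginLabel
//     call foo        ; the lowered call itself
//   Ltmp1:            ; EH_LABEL, EndLabel
//
// DWARF-style EH then records (Ltmp0, Ltmp1) -> landing-pad block via
// MMI.addInvoke, while Windows funclet EH records (Ltmp0, Ltmp1) -> EH
// state via addIPToStateRange, keyed by the invoke instruction.
```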
- MMI.addInvoke(LandingPad, BeginLabel, EndLabel); + if (MMI.hasEHFunclets()) { + assert(CLI.CS); + WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo(); + EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS->getInstruction()), + BeginLabel, EndLabel); + } else { + MMI.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel); + } } return Result; @@ -5115,7 +5377,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool isTailCall, - MachineBasicBlock *LandingPad) { + const BasicBlock *EHPadBB) { PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); FunctionType *FTy = cast<FunctionType>(PT->getElementType()); Type *RetTy = FTy->getReturnType(); @@ -5154,7 +5416,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) .setCallee(RetTy, FTy, Callee, std::move(Args), CS) .setTailCall(isTailCall); - std::pair<SDValue,SDValue> Result = lowerInvokable(CLI, LandingPad); + std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB); if (Result.first.getNode()) setValue(CS.getInstruction(), Result.first); @@ -5978,7 +6240,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); std::pair<unsigned, const TargetRegisterClass *> MatchRC = TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode, OpInfo.ConstraintVT); @@ -6037,10 +6299,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy(DAG.getDataLayout())); - Chain = DAG.getStore(Chain, getCurSDLoc(), - OpInfo.CallOperand, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + Chain = DAG.getStore( + Chain, getCurSDLoc(), OpInfo.CallOperand, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, 0); OpInfo.CallOperand = StackSlot; } @@ -6460,12 +6722,9 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) { /// This is a helper for lowering intrinsics that follow a target calling /// convention or require stack pointer adjustment. Only a subset of the /// intrinsic's operands need to participate in the calling convention. 
-std::pair<SDValue, SDValue> -SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx, - unsigned NumArgs, SDValue Callee, - Type *ReturnTy, - MachineBasicBlock *LandingPad, - bool IsPatchPoint) { +std::pair<SDValue, SDValue> SelectionDAGBuilder::lowerCallOperands( + ImmutableCallSite CS, unsigned ArgIdx, unsigned NumArgs, SDValue Callee, + Type *ReturnTy, const BasicBlock *EHPadBB, bool IsPatchPoint) { TargetLowering::ArgListTy Args; Args.reserve(NumArgs); @@ -6489,7 +6748,7 @@ SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx, .setCallee(CS.getCallingConv(), ReturnTy, Callee, std::move(Args), NumArgs) .setDiscardResult(CS->use_empty()).setIsPatchPoint(IsPatchPoint); - return lowerInvokable(CLI, LandingPad); + return lowerInvokable(CLI, EHPadBB); } /// \brief Add a stack map intrinsic call's live variable operands to a stackmap @@ -6593,7 +6852,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { /// \brief Lower llvm.experimental.patchpoint directly to its target opcode. void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, - MachineBasicBlock *LandingPad) { + const BasicBlock *EHPadBB) { // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>, // i32 <numBytes>, // i8* <target>, @@ -6630,9 +6889,8 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs; Type *ReturnTy = IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType(); - std::pair<SDValue, SDValue> Result = - lowerCallOperands(CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy, - LandingPad, true); + std::pair<SDValue, SDValue> Result = lowerCallOperands( + CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy, EHPadBB, true); SDNode *CallEnd = Result.second.getNode(); if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg)) @@ -6926,8 +7184,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { i, j*Parts[j].getValueType().getStoreSize()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); - else if (j != 0) + else if (j != 0) { MyFlags.Flags.setOrigAlign(1); + if (j == NumParts - 1) + MyFlags.Flags.setSplitEnd(); + } CLI.Outs.push_back(MyFlags); CLI.OutVals.push_back(Parts[j]); @@ -6980,14 +7241,20 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { ReturnValues.resize(NumValues); SmallVector<SDValue, 4> Chains(NumValues); + // An aggregate return value cannot wrap around the address space, so + // offsets to its parts don't wrap either. 
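The loop that follows reloads a demoted (sret-style) aggregate return piece by piece: each part's address is the demote slot plus a known non-negative offset, which is why the ADD may carry the nuw flag. Schematically, for a call whose {i64, i64} result was demoted (pseudo-IR, names illustrative):

```cpp
// Demoted aggregate return, reloaded part by part:
//
//   %slot = alloca { i64, i64 }          ; DemoteStackSlot
//   call void @f(ptr sret %slot, ...)    ; callee stores the result
//   %a0 = add nuw %slot, 0               ; Offsets[0], cannot wrap
//   %v0 = load i64, %a0
//   %a1 = add nuw %slot, 8               ; Offsets[1]
//   %v1 = load i64, %a1
//
// The load chains are collected in Chains[i] so they can be merged before
// the parts are reassembled into the original return value.
```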
+ SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + for (unsigned i = 0; i < NumValues; ++i) { SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, CLI.DAG.getConstant(Offsets[i], CLI.DL, - PtrVT)); + PtrVT), &Flags); SDValue L = CLI.DAG.getLoad( RetTys[i], CLI.DL, CLI.Chain, Add, - MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]), false, - false, false, 1); + MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), + DemoteStackIdx, Offsets[i]), + false, false, false, 1); ReturnValues[i] = L; Chains[i] = L.getValue(1); } @@ -7069,9 +7336,9 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) { if (FastISel) return A->use_empty(); - const BasicBlock *Entry = A->getParent()->begin(); + const BasicBlock &Entry = A->getParent()->front(); for (const User *U : A->users()) - if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U)) + if (cast<Instruction>(U)->getParent() != &Entry || isa<SwitchInst>(U)) return false; // Use not in entry block. return true; @@ -7138,6 +7405,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // in the various CC lowering callbacks. Flags.setByVal(); } + if (F.getCallingConv() == CallingConv::X86_INTR) { + // IA Interrupt passes frame (1st parameter) by value in the stack. + if (Idx == 1) + Flags.setByVal(); + } if (Flags.isByVal() || Flags.isInAlloca()) { PointerType *Ty = cast<PointerType>(I->getType()); Type *ElementTy = Ty->getElementType(); @@ -7165,8 +7437,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) { if (NumRegs > 1 && i == 0) MyFlags.Flags.setSplit(); // if it isn't first piece, alignment must be 1 - else if (i > 0) + else if (i > 0) { MyFlags.Flags.setOrigAlign(1); + if (i == NumRegs - 1) + MyFlags.Flags.setSplitEnd(); + } Ins.push_back(MyFlags); } if (NeedsRegBlock && Value == NumValues - 1) @@ -7235,12 +7510,12 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // If this argument is unused then remember its value. It is used to generate // debugging information. if (I->use_empty() && NumValues) { - SDB->setUnusedArgValue(I, InVals[i]); + SDB->setUnusedArgValue(&*I, InVals[i]); // Also remember any frame index for use in FastISel. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(InVals[i].getNode())) - FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); } for (unsigned Val = 0; Val != NumValues; ++Val) { @@ -7270,18 +7545,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Note down frame index. if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) - FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); - SDB->setValue(I, Res); + SDB->setValue(&*I, Res); if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) { if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(Res.getOperand(0).getNode())) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) - FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); + FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex()); } // If this argument is live outside of the entry block, insert a copy from @@ -7293,13 +7568,13 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // uses with vregs. 
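// [Editorial aside on the setSplitEnd() hunks above, illustrative only:
// for a value lowered into NumRegs == 3 parts, the flags now come out as
//   part 0: Split
//   part 1: OrigAlign = 1
//   part 2: OrigAlign = 1, SplitEnd
// so calling-convention code can recognize the last fragment of a split
// argument as well as the first.]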
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { - FuncInfo->ValueMap[I] = Reg; + FuncInfo->ValueMap[&*I] = Reg; continue; } } - if (!isOnlyUsedInEntryBlock(I, TM.Options.EnableFastISel)) { - FuncInfo->InitializeRegForValue(I); - SDB->CopyToExportRegsIfNeeded(I); + if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) { + FuncInfo->InitializeRegForValue(&*I); + SDB->CopyToExportRegsIfNeeded(&*I); } } @@ -7401,21 +7676,21 @@ AddSuccessorMBB(const BasicBlock *BB, // If SuccBB has not been created yet, create it. if (!SuccMBB) { MachineFunction *MF = ParentMBB->getParent(); - MachineFunction::iterator BBI = ParentMBB; + MachineFunction::iterator BBI(ParentMBB); SuccMBB = MF->CreateMachineBasicBlock(BB); MF->insert(++BBI, SuccMBB); } // Add it as a successor of ParentMBB. ParentMBB->addSuccessor( - SuccMBB, BranchProbabilityInfo::getBranchWeightStackProtector(IsLikely)); + SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely)); return SuccMBB; } MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) { - MachineFunction::iterator I = MBB; + MachineFunction::iterator I(MBB); if (++I == FuncInfo.MF->end()) return nullptr; - return I; + return &*I; } /// During lowering new call nodes can be created (such as memset, etc.). @@ -7469,14 +7744,18 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, CaseCluster &JTCluster) { assert(First <= Last); - uint32_t Weight = 0; + auto Prob = BranchProbability::getZero(); unsigned NumCmps = 0; std::vector<MachineBasicBlock*> Table; - DenseMap<MachineBasicBlock*, uint32_t> JTWeights; + DenseMap<MachineBasicBlock*, BranchProbability> JTProbs; + + // Initialize probabilities in JTProbs. + for (unsigned I = First; I <= Last; ++I) + JTProbs[Clusters[I].MBB] = BranchProbability::getZero(); + for (unsigned I = First; I <= Last; ++I) { assert(Clusters[I].Kind == CC_Range); - Weight += Clusters[I].Weight; - assert(Weight >= Clusters[I].Weight && "Weight overflow!"); + Prob += Clusters[I].Prob; APInt Low = Clusters[I].Low->getValue(); APInt High = Clusters[I].High->getValue(); NumCmps += (Low == High) ? 
1 : 2; @@ -7491,10 +7770,10 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, uint64_t ClusterSize = (High - Low).getLimitedValue() + 1; for (uint64_t J = 0; J < ClusterSize; ++J) Table.push_back(Clusters[I].MBB); - JTWeights[Clusters[I].MBB] += Clusters[I].Weight; + JTProbs[Clusters[I].MBB] += Clusters[I].Prob; } - unsigned NumDests = JTWeights.size(); + unsigned NumDests = JTProbs.size(); if (isSuitableForBitTests(NumDests, NumCmps, Clusters[First].Low->getValue(), Clusters[Last].High->getValue())) { @@ -7513,9 +7792,10 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, for (MachineBasicBlock *Succ : Table) { if (Done.count(Succ)) continue; - addSuccessorWithWeight(JumpTableMBB, Succ, JTWeights[Succ]); + addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]); Done.insert(Succ); } + JumpTableMBB->normalizeSuccProbs(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding()) @@ -7529,7 +7809,7 @@ bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters, JTCases.emplace_back(std::move(JTH), std::move(JT)); JTCluster = CaseCluster::jumpTable(Clusters[First].Low, Clusters[Last].High, - JTCases.size() - 1, Weight); + JTCases.size() - 1, Prob); return true; } @@ -7707,19 +7987,29 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters, .getSizeInBits(); assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!"); - if (Low.isNonNegative() && High.slt(BitWidth)) { - // Optimize the case where all the case values fit in a - // word without having to subtract minValue. In this case, - // we can optimize away the subtraction. + // Check if the clusters cover a contiguous range such that no value in the + // range will jump to the default statement. + bool ContiguousRange = true; + for (int64_t I = First + 1; I <= Last; ++I) { + if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) { + ContiguousRange = false; + break; + } + } + + if (Low.isStrictlyPositive() && High.slt(BitWidth)) { + // Optimize the case where all the case values fit in a word without having + // to subtract minValue. In this case, we can optimize away the subtraction. LowBound = APInt::getNullValue(Low.getBitWidth()); CmpRange = High; + ContiguousRange = false; } else { LowBound = Low; CmpRange = High - Low; } CaseBitsVector CBV; - uint32_t TotalWeight = 0; + auto TotalProb = BranchProbability::getZero(); for (unsigned i = First; i <= Last; ++i) { // Find the CaseBits for this destination. unsigned j; @@ -7727,39 +8017,40 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters, if (CBV[j].BB == Clusters[i].MBB) break; if (j == CBV.size()) - CBV.push_back(CaseBits(0, Clusters[i].MBB, 0, 0)); + CBV.push_back( + CaseBits(0, Clusters[i].MBB, 0, BranchProbability::getZero())); CaseBits *CB = &CBV[j]; - // Update Mask, Bits and ExtraWeight. + // Update Mask, Bits and ExtraProb. 
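// [Editorial aside, a worked example of the mask math below with made-up
// values: suppose LowBound == 10 and this cluster covers cases 12..14.
// Then Lo == 2, Hi == 4, and
//   CB->Mask |= (-1ULL >> (63 - (4 - 2))) << 2;  // 0b111 << 2 == 0b11100
// sets one bit per case value relative to LowBound, and Bits grows by 3.]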
uint64_t Lo = (Clusters[i].Low->getValue() - LowBound).getZExtValue(); uint64_t Hi = (Clusters[i].High->getValue() - LowBound).getZExtValue(); assert(Hi >= Lo && Hi < 64 && "Invalid bit case!"); CB->Mask |= (-1ULL >> (63 - (Hi - Lo))) << Lo; CB->Bits += Hi - Lo + 1; - CB->ExtraWeight += Clusters[i].Weight; - TotalWeight += Clusters[i].Weight; - assert(TotalWeight >= Clusters[i].Weight && "Weight overflow!"); + CB->ExtraProb += Clusters[i].Prob; + TotalProb += Clusters[i].Prob; } BitTestInfo BTI; std::sort(CBV.begin(), CBV.end(), [](const CaseBits &a, const CaseBits &b) { - // Sort by weight first, number of bits second. - if (a.ExtraWeight != b.ExtraWeight) - return a.ExtraWeight > b.ExtraWeight; + // Sort by probability first, number of bits second. + if (a.ExtraProb != b.ExtraProb) + return a.ExtraProb > b.ExtraProb; return a.Bits > b.Bits; }); for (auto &CB : CBV) { MachineBasicBlock *BitTestBB = FuncInfo.MF->CreateMachineBasicBlock(SI->getParent()); - BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraWeight)); + BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraProb)); } BitTestCases.emplace_back(std::move(LowBound), std::move(CmpRange), - SI->getCondition(), -1U, MVT::Other, false, nullptr, - nullptr, std::move(BTI)); + SI->getCondition(), -1U, MVT::Other, false, + ContiguousRange, nullptr, nullptr, std::move(BTI), + TotalProb); BTCluster = CaseCluster::bitTests(Clusters[First].Low, Clusters[Last].High, - BitTestCases.size() - 1, TotalWeight); + BitTestCases.size() - 1, TotalProb); return true; } @@ -7868,9 +8159,9 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, MachineBasicBlock *DefaultMBB) { MachineFunction *CurMF = FuncInfo.MF; MachineBasicBlock *NextMBB = nullptr; - MachineFunction::iterator BBI = W.MBB; + MachineFunction::iterator BBI(W.MBB); if (++BBI != FuncInfo.MF->end()) - NextMBB = BBI; + NextMBB = &*BBI; unsigned Size = W.LastCluster - W.FirstCluster + 1; @@ -7906,13 +8197,16 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, ISD::SETEQ); // Update successor info. - // Both Small and Big will jump to Small.BB, so we sum up the weights. - addSuccessorWithWeight(SwitchMBB, Small.MBB, Small.Weight + Big.Weight); - addSuccessorWithWeight( - SwitchMBB, DefaultMBB, - // The default destination is the first successor in IR. - BPI ? BPI->getEdgeWeight(SwitchMBB->getBasicBlock(), (unsigned)0) - : 0); + // Both Small and Big will jump to Small.BB, so we sum up the + // probabilities. + addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob); + if (BPI) + addSuccessorWithProb( + SwitchMBB, DefaultMBB, + // The default destination is the first successor in IR. + BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0)); + else + addSuccessorWithProb(SwitchMBB, DefaultMBB); // Insert the true branch. SDValue BrCond = @@ -7929,17 +8223,17 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, } if (TM.getOptLevel() != CodeGenOpt::None) { - // Order cases by weight so the most likely case will be checked first. + // Order cases by probability so the most likely case will be checked first. std::sort(W.FirstCluster, W.LastCluster + 1, [](const CaseCluster &a, const CaseCluster &b) { - return a.Weight > b.Weight; + return a.Prob > b.Prob; }); // Rearrange the case blocks so that the last one falls through if possible - // without without changing the order of weights. + // without changing the order of probabilities.
for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) { --I; - if (I->Weight > W.LastCluster->Weight) + if (I->Prob > W.LastCluster->Prob) break; if (I->Kind == CC_Range && I->MBB == NextMBB) { std::swap(*I, *W.LastCluster); @@ -7948,12 +8242,11 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, } } - // Compute total weight. - uint32_t UnhandledWeights = 0; - for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I) { - UnhandledWeights += I->Weight; - assert(UnhandledWeights >= I->Weight && "Weight overflow!"); - } + // Compute total probability. + BranchProbability DefaultProb = W.DefaultProb; + BranchProbability UnhandledProbs = DefaultProb; + for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I) + UnhandledProbs += I->Prob; MachineBasicBlock *CurMBB = W.MBB; for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) { @@ -7967,6 +8260,7 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, // Put Cond in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(Cond); } + UnhandledProbs -= I->Prob; switch (I->Kind) { case CC_JumpTable: { @@ -7977,8 +8271,28 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, // The jump block hasn't been inserted yet; insert it here. MachineBasicBlock *JumpMBB = JT->MBB; CurMF->insert(BBI, JumpMBB); - addSuccessorWithWeight(CurMBB, Fallthrough); - addSuccessorWithWeight(CurMBB, JumpMBB); + + auto JumpProb = I->Prob; + auto FallthroughProb = UnhandledProbs; + + // If the default statement is a target of the jump table, we evenly + // distribute the default probability to successors of CurMBB. Also + // update the probability on the edge from JumpMBB to Fallthrough. + for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(), + SE = JumpMBB->succ_end(); + SI != SE; ++SI) { + if (*SI == DefaultMBB) { + JumpProb += DefaultProb / 2; + FallthroughProb -= DefaultProb / 2; + JumpMBB->setSuccProbability(SI, DefaultProb / 2); + JumpMBB->normalizeSuccProbs(); + break; + } + } + + addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb); + addSuccessorWithProb(CurMBB, JumpMBB, JumpProb); + CurMBB->normalizeSuccProbs(); // The jump table header will be inserted in our current block, do the // range check, and fall through to our fallthrough block. @@ -8004,8 +8318,17 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, BTB->Parent = CurMBB; BTB->Default = Fallthrough; - // If we're in the right place, emit the bit test header header right now. - if (CurMBB ==SwitchMBB) { + BTB->DefaultProb = UnhandledProbs; + // If the cases in bit test don't form a contiguous range, we evenly + // distribute the probability on the edge to Fallthrough to two + // successors of CurMBB. + if (!BTB->ContiguousRange) { + BTB->Prob += DefaultProb / 2; + BTB->DefaultProb -= DefaultProb / 2; + } + + // If we're in the right place, emit the bit test header right now. + if (CurMBB == SwitchMBB) { visitBitTestHeader(*BTB, SwitchMBB); BTB->Emitted = true; } @@ -8028,10 +8351,9 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, RHS = I->High; } - // The false weight is the sum of all unhandled cases. - UnhandledWeights -= I->Weight; - CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB, I->Weight, - UnhandledWeights); + // The false probability is the sum of all unhandled cases. 
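// [Editorial aside, illustrative numbers only: with three range clusters
// of probability 2/8, 1/8, 1/8 and DefaultProb == 4/8, UnhandledProbs
// starts at 4/8 + 2/8 + 1/8 + 1/8 == 1. It drops to 6/8 just before
// cluster 0 is lowered, so that cluster's CaseBlock gets TrueProb == 2/8
// and FalseProb == 6/8. The jump-table and bit-test paths above split
// DefaultProb in half the same way whenever the default is also reachable
// through them.]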
+ CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB, I->Prob, + UnhandledProbs); if (CurMBB == SwitchMBB) visitSwitchCase(CB, SwitchMBB); @@ -8049,8 +8371,8 @@ unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC, CaseClusterIt First, CaseClusterIt Last) { return std::count_if(First, Last + 1, [&](const CaseCluster &X) { - if (X.Weight != CC.Weight) - return X.Weight > CC.Weight; + if (X.Prob != CC.Prob) + return X.Prob > CC.Prob; // Ties are broken by comparing the case value. return X.Low->getValue().slt(CC.Low->getValue()); @@ -8066,24 +8388,24 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!"); - // Balance the tree based on branch weights to create a near-optimal (in terms - // of search time given key frequency) binary search tree. See e.g. Kurt + // Balance the tree based on branch probabilities to create a near-optimal (in + // terms of search time given key frequency) binary search tree. See e.g. Kurt // Mehlhorn "Nearly Optimal Binary Search Trees" (1975). CaseClusterIt LastLeft = W.FirstCluster; CaseClusterIt FirstRight = W.LastCluster; - uint32_t LeftWeight = LastLeft->Weight; - uint32_t RightWeight = FirstRight->Weight; + auto LeftProb = LastLeft->Prob + W.DefaultProb / 2; + auto RightProb = FirstRight->Prob + W.DefaultProb / 2; // Move LastLeft and FirstRight towards each other from opposite directions to - // find a partitioning of the clusters which balances the weight on both - // sides. If LeftWeight and RightWeight are equal, alternate which side is - // taken to ensure 0-weight nodes are distributed evenly. + // find a partitioning of the clusters which balances the probability on both + // sides. If LeftProb and RightProb are equal, alternate which side is + // taken to ensure 0-probability nodes are distributed evenly. unsigned I = 0; while (LastLeft + 1 < FirstRight) { - if (LeftWeight < RightWeight || (LeftWeight == RightWeight && (I & 1))) - LeftWeight += (++LastLeft)->Weight; + if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1))) + LeftProb += (++LastLeft)->Prob; else - RightWeight += (--FirstRight)->Weight; + RightProb += (--FirstRight)->Prob; I++; } @@ -8144,7 +8466,7 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, const ConstantInt *Pivot = PivotCluster->Low; // New blocks will be inserted immediately after the current one. - MachineFunction::iterator BBI = W.MBB; + MachineFunction::iterator BBI(W.MBB); ++BBI; // We will branch to the LHS if Value < Pivot. If LHS is a single cluster, @@ -8158,7 +8480,8 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, } else { LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock()); FuncInfo.MF->insert(BBI, LeftMBB); - WorkList.push_back({LeftMBB, FirstLeft, LastLeft, W.GE, Pivot}); + WorkList.push_back( + {LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2}); // Put Cond in a virtual register to make it available from the new blocks. ExportFromCurrentBlock(Cond); } @@ -8173,14 +8496,15 @@ void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList, } else { RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock()); FuncInfo.MF->insert(BBI, RightMBB); - WorkList.push_back({RightMBB, FirstRight, LastRight, Pivot, W.LT}); + WorkList.push_back( + {RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2}); // Put Cond in a virtual register to make it available from the new blocks. 
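// [Editorial aside, a worked pass over the balancing loop above with
// made-up numbers: clusters {1/8, 1/8, 4/8} and W.DefaultProb == 2/8.
// Initially LeftProb == 1/8 + 1/8 == 2/8 and RightProb == 4/8 + 1/8 == 5/8.
// LeftProb < RightProb, so LastLeft advances and absorbs cluster 1,
// giving LeftProb == 3/8; the iterators now meet, and the pivot falls
// between clusters 1 and 2, keeping the hot 4/8 cluster near the root of
// the search tree.]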
ExportFromCurrentBlock(Cond); } // Create the CaseBlock record that will be used to lower the branch. CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB, - LeftWeight, RightWeight); + LeftProb, RightProb); if (W.MBB == SwitchMBB) visitSwitchCase(CB, SwitchMBB); @@ -8196,9 +8520,10 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { for (auto I : SI.cases()) { MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()]; const ConstantInt *CaseVal = I.getCaseValue(); - uint32_t Weight = - BPI ? BPI->getEdgeWeight(SI.getParent(), I.getSuccessorIndex()) : 0; - Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Weight)); + BranchProbability Prob = + BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex()) + : BranchProbability(1, SI.getNumCases() + 1); + Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob)); } MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()]; @@ -8274,7 +8599,8 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { SwitchWorkList WorkList; CaseClusterIt First = Clusters.begin(); CaseClusterIt Last = Clusters.end() - 1; - WorkList.push_back({SwitchMBB, First, Last, nullptr, nullptr}); + auto DefaultProb = getEdgeProbability(SwitchMBB, DefaultMBB); + WorkList.push_back({SwitchMBB, First, Last, nullptr, nullptr, DefaultProb}); while (!WorkList.empty()) { SwitchWorkListItem W = WorkList.back(); diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 7006754..8fb85ff 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -17,6 +17,7 @@ #include "StatepointLowering.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -30,7 +31,6 @@ namespace llvm { class AddrSpaceCastInst; -class AliasAnalysis; class AllocaInst; class BasicBlock; class BitCastInst; @@ -154,39 +154,39 @@ private: unsigned JTCasesIndex; unsigned BTCasesIndex; }; - uint32_t Weight; + BranchProbability Prob; static CaseCluster range(const ConstantInt *Low, const ConstantInt *High, - MachineBasicBlock *MBB, uint32_t Weight) { + MachineBasicBlock *MBB, BranchProbability Prob) { CaseCluster C; C.Kind = CC_Range; C.Low = Low; C.High = High; C.MBB = MBB; - C.Weight = Weight; + C.Prob = Prob; return C; } static CaseCluster jumpTable(const ConstantInt *Low, const ConstantInt *High, unsigned JTCasesIndex, - uint32_t Weight) { + BranchProbability Prob) { CaseCluster C; C.Kind = CC_JumpTable; C.Low = Low; C.High = High; C.JTCasesIndex = JTCasesIndex; - C.Weight = Weight; + C.Prob = Prob; return C; } static CaseCluster bitTests(const ConstantInt *Low, const ConstantInt *High, - unsigned BTCasesIndex, uint32_t Weight) { + unsigned BTCasesIndex, BranchProbability Prob) { CaseCluster C; C.Kind = CC_BitTests; C.Low = Low; C.High = High; C.BTCasesIndex = BTCasesIndex; - C.Weight = Weight; + C.Prob = Prob; return C; } }; @@ -198,13 +198,13 @@ private: uint64_t Mask; MachineBasicBlock* BB; unsigned Bits; - uint32_t ExtraWeight; + BranchProbability ExtraProb; CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits, - uint32_t Weight): - Mask(mask), BB(bb), Bits(bits), ExtraWeight(Weight) { } + BranchProbability Prob): + Mask(mask), BB(bb), Bits(bits), ExtraProb(Prob) { } - CaseBits() : 
Mask(0), BB(nullptr), Bits(0), ExtraWeight(0) {} + CaseBits() : Mask(0), BB(nullptr), Bits(0) {} }; typedef std::vector<CaseBits> CaseBitsVector; @@ -217,13 +217,13 @@ private: /// blocks needed by multi-case switch statements. struct CaseBlock { CaseBlock(ISD::CondCode cc, const Value *cmplhs, const Value *cmprhs, - const Value *cmpmiddle, - MachineBasicBlock *truebb, MachineBasicBlock *falsebb, - MachineBasicBlock *me, - uint32_t trueweight = 0, uint32_t falseweight = 0) - : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs), - TrueBB(truebb), FalseBB(falsebb), ThisBB(me), - TrueWeight(trueweight), FalseWeight(falseweight) { } + const Value *cmpmiddle, MachineBasicBlock *truebb, + MachineBasicBlock *falsebb, MachineBasicBlock *me, + BranchProbability trueprob = BranchProbability::getUnknown(), + BranchProbability falseprob = BranchProbability::getUnknown()) + : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs), + TrueBB(truebb), FalseBB(falsebb), ThisBB(me), TrueProb(trueprob), + FalseProb(falseprob) {} // CC - the condition code to use for the case block's setcc node ISD::CondCode CC; @@ -239,8 +239,8 @@ private: // ThisBB - the block into which to emit the code for the setcc and branches MachineBasicBlock *ThisBB; - // TrueWeight/FalseWeight - branch weights. - uint32_t TrueWeight, FalseWeight; + // TrueProb/FalseProb - branch probabilities. + BranchProbability TrueProb, FalseProb; }; struct JumpTable { @@ -272,32 +272,35 @@ private: struct BitTestCase { BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr, - uint32_t Weight): - Mask(M), ThisBB(T), TargetBB(Tr), ExtraWeight(Weight) { } + BranchProbability Prob): - Mask(M), ThisBB(T), TargetBB(Tr), ExtraProb(Prob) { } uint64_t Mask; MachineBasicBlock *ThisBB; MachineBasicBlock *TargetBB; - uint32_t ExtraWeight; + BranchProbability ExtraProb; }; typedef SmallVector<BitTestCase, 3> BitTestInfo; struct BitTestBlock { - BitTestBlock(APInt F, APInt R, const Value* SV, - unsigned Rg, MVT RgVT, bool E, - MachineBasicBlock* P, MachineBasicBlock* D, - BitTestInfo C): - First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E), - Parent(P), Default(D), Cases(std::move(C)) { } + BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT, + bool E, bool CR, MachineBasicBlock *P, MachineBasicBlock *D, + BitTestInfo C, BranchProbability Pr) + : First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E), + ContiguousRange(CR), Parent(P), Default(D), Cases(std::move(C)), + Prob(Pr) {} APInt First; APInt Range; const Value *SValue; unsigned Reg; MVT RegVT; bool Emitted; + bool ContiguousRange; MachineBasicBlock *Parent; MachineBasicBlock *Default; BitTestInfo Cases; + BranchProbability Prob; + BranchProbability DefaultProb; }; /// Minimum jump table density, in percent. @@ -339,6 +342,7 @@ private: CaseClusterIt LastCluster; const ConstantInt *GE; const ConstantInt *LT; + BranchProbability DefaultProb; }; typedef SmallVector<SwitchWorkListItem, 4> SwitchWorkList; @@ -515,6 +519,7 @@ private: void resetPerFunctionState() { FailureMBB = nullptr; Guard = nullptr; + GuardReg = 0; } MachineBasicBlock *getParentMBB() { return ParentMBB; } @@ -592,10 +597,6 @@ public: /// FunctionLoweringInfo &FuncInfo; - /// OptLevel - What optimization level we're generating code for. - /// - CodeGenOpt::Level OptLevel; - /// GFI - Garbage collection metadata for the function.
GCFunctionInfo *GFI; @@ -613,7 +614,7 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, CodeGenOpt::Level ol) : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), - DAG(dag), FuncInfo(funcinfo), OptLevel(ol), + DAG(dag), FuncInfo(funcinfo), HasTailCall(false) { } @@ -692,19 +693,20 @@ public: void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, - MachineBasicBlock *SwitchBB, unsigned Opc, - uint32_t TW, uint32_t FW); + MachineBasicBlock *SwitchBB, + Instruction::BinaryOps Opc, BranchProbability TW, + BranchProbability FW); void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB, MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB, - uint32_t TW, uint32_t FW); + BranchProbability TW, BranchProbability FW); bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases); bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB); void CopyToExportRegsIfNeeded(const Value *V); void ExportFromCurrentBlock(const Value *V); void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall, - MachineBasicBlock *LandingPad = nullptr); + const BasicBlock *EHPadBB = nullptr); std::pair<SDValue, SDValue> lowerCallOperands( ImmutableCallSite CS, @@ -712,7 +714,7 @@ public: unsigned NumArgs, SDValue Callee, Type *ReturnTy, - MachineBasicBlock *LandingPad = nullptr, + const BasicBlock *EHPadBB = nullptr, bool IsPatchPoint = false); /// UpdateSplitBlock - When an MBB was split during scheduling, update the @@ -722,11 +724,11 @@ public: // This function is responsible for the whole statepoint lowering process. // It uniformly handles invoke and call statepoints. void LowerStatepoint(ImmutableStatepoint Statepoint, - MachineBasicBlock *LandingPad = nullptr); + const BasicBlock *EHPadBB = nullptr); private: - std::pair<SDValue, SDValue> lowerInvokable( - TargetLowering::CallLoweringInfo &CLI, - MachineBasicBlock *LandingPad); + std::pair<SDValue, SDValue> + lowerInvokable(TargetLowering::CallLoweringInfo &CLI, + const BasicBlock *EHPadBB = nullptr); // Terminator instructions. 
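// [Editorial note: the hunk below adds visitors for the new token-based
// EH instructions (cleanupret, catchswitch, catchret, catchpad,
// cleanuppad) and swaps the weight-based successor helpers for
// BranchProbability-based ones.]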
void visitRet(const ReturnInst &I); @@ -734,11 +736,18 @@ private: void visitSwitch(const SwitchInst &I); void visitIndirectBr(const IndirectBrInst &I); void visitUnreachable(const UnreachableInst &I); + void visitCleanupRet(const CleanupReturnInst &I); + void visitCatchSwitch(const CatchSwitchInst &I); + void visitCatchRet(const CatchReturnInst &I); + void visitCatchPad(const CatchPadInst &I); + void visitCleanupPad(const CleanupPadInst &CPI); + + BranchProbability getEdgeProbability(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const; + void addSuccessorWithProb( + MachineBasicBlock *Src, MachineBasicBlock *Dst, + BranchProbability Prob = BranchProbability::getUnknown()); - uint32_t getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const; - void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst, - uint32_t Weight = 0); public: void visitSwitchCase(CaseBlock &CB, MachineBasicBlock *SwitchBB); @@ -748,7 +757,7 @@ public: void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB); void visitBitTestCase(BitTestBlock &BB, MachineBasicBlock* NextMBB, - uint32_t BranchWeightToNext, + BranchProbability BranchProbToNext, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB); @@ -842,11 +851,11 @@ private: void visitVACopy(const CallInst &I); void visitStackmap(const CallInst &I); void visitPatchpoint(ImmutableCallSite CS, - MachineBasicBlock *LandingPad = nullptr); + const BasicBlock *EHPadBB = nullptr); // These three are implemented in StatepointLowering.cpp void visitStatepoint(const CallInst &I); - void visitGCRelocate(const CallInst &I); + void visitGCRelocate(const GCRelocateInst &I); void visitGCResult(const CallInst &I); void visitUserOp1(const Instruction &I) { diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 5b9b182..a1c6c4c 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -30,6 +31,11 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +static cl::opt<bool> +VerboseDAGDumping("dag-dump-verbose", cl::Hidden, + cl::desc("Display more information when dumping selection " + "DAG nodes.")); + std::string SDNode::getOperationName(const SelectionDAG *G) const { switch (getOpcode()) { default: @@ -102,6 +108,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::EH_RETURN: return "EH_RETURN"; case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP"; case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP"; + case ISD::EH_SJLJ_SETUP_DISPATCH: return "EH_SJLJ_SETUP_DISPATCH"; case ISD::ConstantPool: return "ConstantPool"; case ISD::TargetIndex: return "TargetIndex"; case ISD::ExternalSymbol: return "ExternalSymbol"; @@ -145,6 +152,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FABS: return "fabs"; case ISD::FMINNUM: return "fminnum"; case ISD::FMAXNUM: return "fmaxnum"; + case ISD::FMINNAN: return "fminnan"; + case ISD::FMAXNAN: return "fmaxnan"; case ISD::FNEG: return "fneg"; case ISD::FSQRT: return "fsqrt"; case ISD::FSIN: return "fsin"; @@ -201,6 +210,7 @@ std::string 
SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::SETCC: return "setcc"; + case ISD::SETCCE: return "setcce"; case ISD::SELECT: return "select"; case ISD::VSELECT: return "vselect"; case ISD::SELECT_CC: return "select_cc"; @@ -273,6 +283,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CALLSEQ_START: return "callseq_start"; case ISD::CALLSEQ_END: return "callseq_end"; + // EH instructions + case ISD::CATCHRET: return "catchret"; + case ISD::CLEANUPRET: return "cleanupret"; + // Other operators case ISD::LOAD: return "load"; case ISD::STORE: return "store"; @@ -295,15 +309,17 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::LIFETIME_END: return "lifetime.end"; case ISD::GC_TRANSITION_START: return "gc_transition.start"; case ISD::GC_TRANSITION_END: return "gc_transition.end"; + case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; // Bit manipulation + case ISD::BITREVERSE: return "bitreverse"; case ISD::BSWAP: return "bswap"; case ISD::CTPOP: return "ctpop"; case ISD::CTTZ: return "cttz"; case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; case ISD::CTLZ: return "ctlz"; case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; - + // Trampolines case ISD::INIT_TRAMPOLINE: return "init_trampoline"; case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; @@ -320,7 +336,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETO: return "seto"; case ISD::SETUO: return "setuo"; - case ISD::SETUEQ: return "setue"; + case ISD::SETUEQ: return "setueq"; case ISD::SETUGT: return "setugt"; case ISD::SETUGE: return "setuge"; case ISD::SETULT: return "setult"; @@ -352,6 +368,16 @@ const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { } } +static Printable PrintNodeId(const SDNode &Node) { + return Printable([&Node](raw_ostream &OS) { +#ifndef NDEBUG + OS << 't' << Node.PersistentId; +#else + OS << (const void*)&Node; +#endif + }); +} + void SDNode::dump() const { dump(nullptr); } void SDNode::dump(const SelectionDAG *G) const { print(dbgs(), G); @@ -359,8 +385,6 @@ void SDNode::dump(const SelectionDAG *G) const { } void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { - OS << (const void*)this << ": "; - for (unsigned i = 0, e = getNumValues(); i != e; ++i) { if (i) OS << ","; if (getValueType(i) == MVT::Other) @@ -368,7 +392,6 @@ void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { else OS << getValueType(i).getEVTString(); } - OS << " = " << getOperationName(G); } void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { @@ -523,48 +546,58 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { << ']'; } - if (unsigned Order = getIROrder()) - OS << " [ORD=" << Order << ']'; + if (VerboseDAGDumping) { + if (unsigned Order = getIROrder()) + OS << " [ORD=" << Order << ']'; - if (getNodeId() != -1) - OS << " [ID=" << getNodeId() << ']'; + if (getNodeId() != -1) + OS << " [ID=" << getNodeId() << ']'; - if (!G) - return; + if (!G) + return; - DILocation *L = getDebugLoc(); - if (!L) - return; + DILocation *L = getDebugLoc(); + if (!L) + return; + + if (auto *Scope = L->getScope()) + OS << Scope->getFilename(); + else + OS << "<unknown>"; + OS << ':' << L->getLine(); + if (unsigned C = L->getColumn()) + OS << ':' << C; + } +} - if (auto *Scope = L->getScope()) - OS << Scope->getFilename(); - else - OS << "<unknown>"; - OS << ':' << L->getLine(); - if (unsigned C = 
L->getColumn()) - OS << ':' << C; +/// Return true if this node is so simple that we should just print it inline +/// if it appears as an operand. +static bool shouldPrintInline(const SDNode &Node) { + if (Node.getOpcode() == ISD::EntryToken) + return false; + return Node.getNumOperands() == 0; } static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { - for (const SDValue &Op : N->op_values()) + for (const SDValue &Op : N->op_values()) { + if (shouldPrintInline(*Op.getNode())) + continue; if (Op.getNode()->hasOneUse()) DumpNodes(Op.getNode(), indent+2, G); - else - dbgs() << "\n" << std::string(indent+2, ' ') - << (void*)Op.getNode() << ": <multiple use>"; + } - dbgs() << '\n'; dbgs().indent(indent); N->dump(G); } void SelectionDAG::dump() const { - dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:"; + dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:\n"; for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I) { - const SDNode *N = I; - if (!N->hasOneUse() && N != getRoot().getNode()) + const SDNode *N = &*I; + if (!N->hasOneUse() && N != getRoot().getNode() && + (!shouldPrintInline(*N) || N->use_empty())) DumpNodes(N, 2, this); } @@ -573,10 +606,30 @@ void SelectionDAG::dump() const { } void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { + OS << PrintNodeId(*this) << ": "; print_types(OS, G); + OS << " = " << getOperationName(G); print_details(OS, G); } +static bool printOperand(raw_ostream &OS, const SelectionDAG *G, + const SDValue Value) { + if (!Value.getNode()) { + OS << "<null>"; + return false; + } else if (shouldPrintInline(*Value.getNode())) { + OS << Value->getOperationName(G) << ':'; + Value->print_types(OS, G); + Value->print_details(OS, G); + return true; + } else { + OS << PrintNodeId(*Value.getNode()); + if (unsigned RN = Value.getResNo()) + OS << ':' << RN; + return false; + } +} + typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet; static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, const SelectionDAG *G, VisitedSDNodeSet &once) { @@ -589,20 +642,13 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, // Having printed this SDNode, walk the children: for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - const SDNode *child = N->getOperand(i).getNode(); - if (i) OS << ","; OS << " "; - if (child->getNumOperands() == 0) { - // This child has no grandchildren; print it inline right here. - child->printr(OS, G); - once.insert(child); - } else { // Just the address. FIXME: also print the child's opcode. 
- OS << (const void*)child; - if (unsigned RN = N->getOperand(i).getResNo()) - OS << ":" << RN; - } + const SDValue Op = N->getOperand(i); + bool printedInline = printOperand(OS, G, Op); + if (printedInline) + once.insert(Op.getNode()); } OS << "\n"; @@ -664,12 +710,9 @@ void SDNode::dumprFull(const SelectionDAG *G) const { } void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const { - print_types(OS, G); + printr(OS, G); for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { if (i) OS << ", "; else OS << " "; - OS << (void*)getOperand(i).getNode(); - if (unsigned RN = getOperand(i).getResNo()) - OS << ":" << RN; + printOperand(OS, G, getOperand(i)); } - print_details(OS, G); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 97ece8b..9f8759d 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" @@ -263,13 +264,17 @@ namespace llvm { return; IS.OptLevel = NewOptLevel; IS.TM.setOptLevel(NewOptLevel); - SavedFastISel = IS.TM.Options.EnableFastISel; - if (NewOptLevel == CodeGenOpt::None) - IS.TM.setFastISel(true); DEBUG(dbgs() << "\nChanging optimization level for Function " << IS.MF->getFunction()->getName() << "\n"); DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O" << NewOptLevel << "\n"); + SavedFastISel = IS.TM.Options.EnableFastISel; + if (NewOptLevel == CodeGenOpt::None) { + IS.TM.setFastISel(IS.TM.getO0WantsFastISel()); + DEBUG(dbgs() << "\tFastISel is " + << (IS.TM.Options.EnableFastISel ? 
"enabled" : "disabled") + << "\n"); + } } ~OptLevelChanger() { @@ -293,6 +298,11 @@ namespace llvm { const TargetLowering *TLI = IS->TLI; const TargetSubtargetInfo &ST = IS->MF->getSubtarget(); + // Try first to see if the Target has its own way of selecting a scheduler + if (auto *SchedulerCtor = ST.getDAGScheduler(OptLevel)) { + return SchedulerCtor(IS, OptLevel); + } + if (OptLevel == CodeGenOpt::None || (ST.enableMachineScheduler() && ST.enableMachineSchedDefaultSched()) || TLI->getSchedulingPreference() == Sched::Source) @@ -350,8 +360,9 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, OptLevel(OL), DAGSize(0) { initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); - initializeAliasAnalysisAnalysisGroup(*PassRegistry::getPassRegistry()); - initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry()); + initializeBranchProbabilityInfoWrapperPassPass( + *PassRegistry::getPassRegistry()); + initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry()); initializeTargetLibraryInfoWrapperPassPass( *PassRegistry::getPassRegistry()); } @@ -363,13 +374,12 @@ SelectionDAGISel::~SelectionDAGISel() { } void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<GCModuleInfo>(); AU.addPreserved<GCModuleInfo>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); if (UseMBPI && OptLevel != CodeGenOpt::None) - AU.addRequired<BranchProbabilityInfo>(); + AU.addRequired<BranchProbabilityInfoWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -380,10 +390,10 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { /// /// This is required for correctness, so it must be done at -O0. /// -static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) { +static void SplitCriticalSideEffectEdges(Function &Fn) { // Loop for blocks with phi nodes. - for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - PHINode *PN = dyn_cast<PHINode>(BB->begin()); + for (BasicBlock &BB : Fn) { + PHINode *PN = dyn_cast<PHINode>(BB.begin()); if (!PN) continue; ReprocessBlock: @@ -391,7 +401,7 @@ static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) { // are potentially trapping constant expressions. Constant expressions are // the only potentially trapping value that can occur as the argument to a // PHI. - for (BasicBlock::iterator I = BB->begin(); (PN = dyn_cast<PHINode>(I)); ++I) + for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast<PHINode>(I)); ++I) for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { ConstantExpr *CE = dyn_cast<ConstantExpr>(PN->getIncomingValue(i)); if (!CE || !CE->canTrap()) continue; @@ -405,8 +415,8 @@ static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) { // Okay, we have to split this edge. 
SplitCriticalEdge( - Pred->getTerminator(), GetSuccessorNumber(Pred, BB), - CriticalEdgeSplittingOptions(AA).setMergeIdenticalEdges()); + Pred->getTerminator(), GetSuccessorNumber(Pred, &BB), + CriticalEdgeSplittingOptions().setMergeIdenticalEdges()); goto ReprocessBlock; } } @@ -437,19 +447,19 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr; DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); - SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), AA); + SplitCriticalSideEffectEdges(const_cast<Function &>(Fn)); CurDAG->init(*MF); FuncInfo->set(Fn, *MF, CurDAG); if (UseMBPI && OptLevel != CodeGenOpt::None) - FuncInfo->BPI = &getAnalysis<BranchProbabilityInfo>(); + FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); else FuncInfo->BPI = nullptr; @@ -457,15 +467,50 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MF->setHasInlineAsm(false); + FuncInfo->SplitCSR = false; + SmallVector<MachineBasicBlock*, 4> Returns; + + // We split CSR if the target supports it for the given function + // and the function has only return exits. + if (TLI->supportSplitCSR(MF)) { + FuncInfo->SplitCSR = true; + + // Collect all the return blocks. + for (const BasicBlock &BB : Fn) { + if (!succ_empty(&BB)) + continue; + + const TerminatorInst *Term = BB.getTerminator(); + if (isa<UnreachableInst>(Term)) + continue; + if (isa<ReturnInst>(Term)) { + Returns.push_back(FuncInfo->MBBMap[&BB]); + continue; + } + + // Bail out if the exit block is not Return nor Unreachable. + FuncInfo->SplitCSR = false; + break; + } + } + + MachineBasicBlock *EntryMBB = &MF->front(); + if (FuncInfo->SplitCSR) + // This performs initialization so lowering for SplitCSR will be correct. + TLI->initializeSplitCSR(EntryMBB); + SelectAllBasicBlocks(Fn); // If the first basic block in the function has live ins that need to be // copied into vregs, emit the copies into the top of the block before // emitting the code for the block. - MachineBasicBlock *EntryMBB = MF->begin(); const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII); + // Insert copies in the entry block and the return blocks. + if (FuncInfo->SplitCSR) + TLI->insertCopiesSplitCSR(EntryMBB, Returns); + DenseMap<unsigned, unsigned> LiveInMap; if (!FuncInfo->ArgDbgValues.empty()) for (MachineRegisterInfo::livein_iterator LI = RegInfo->livein_begin(), @@ -588,6 +633,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { MRI.replaceRegWith(From, To); } + if (TLI->hasCopyImplyingStackAdjustment(MF)) + MFI->setHasOpaqueSPAdjustment(true); + // Freeze the set of reserved registers now that MachineFrameInfo has been // set up. All the information required by getReservedRegs() should be // available now. @@ -882,7 +930,7 @@ void SelectionDAGISel::DoInstructionSelection() { // graph) and preceding back toward the beginning (the entry // node). while (ISelPosition != CurDAG->allnodes_begin()) { - SDNode *Node = --ISelPosition; + SDNode *Node = &*--ISelPosition; // Skip dead nodes. DAGCombiner is expected to eliminate all dead nodes, // but there are currently some corner cases that it misses. 
Also, this // makes it theoretically possible to disable the DAGCombiner. @@ -916,14 +964,47 @@ void SelectionDAGISel::DoInstructionSelection() { PostprocessISelDAG(); } +static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) { + for (const User *U : CPI->users()) { + if (const IntrinsicInst *EHPtrCall = dyn_cast<IntrinsicInst>(U)) { + Intrinsic::ID IID = EHPtrCall->getIntrinsicID(); + if (IID == Intrinsic::eh_exceptionpointer || + IID == Intrinsic::eh_exceptioncode) + return true; + } + } + return false; +} + /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and /// do other setup for EH landing-pad blocks. bool SelectionDAGISel::PrepareEHLandingPad() { MachineBasicBlock *MBB = FuncInfo->MBB; - + const Constant *PersonalityFn = FuncInfo->Fn->getPersonalityFn(); + const BasicBlock *LLVMBB = MBB->getBasicBlock(); const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout())); + // Catchpads have one live-in register, which typically holds the exception + // pointer or code. + if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) { + if (hasExceptionPointerOrCodeUser(CPI)) { + // Get or create the virtual register to hold the pointer or code. Mark + // the live in physreg and copy into the vreg. + MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn); + assert(EHPhysReg && "target lacks exception pointer register"); + MBB->addLiveIn(EHPhysReg); + unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC); + BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), + TII->get(TargetOpcode::COPY), VReg) + .addReg(EHPhysReg, RegState::Kill); + } + return true; + } + + if (!LLVMBB->isLandingPad()) + return true; + // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. MCSymbol *Label = MF->getMMI().addLandingPad(MBB); @@ -935,52 +1016,12 @@ bool SelectionDAGISel::PrepareEHLandingPad() { BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) .addSym(Label); - // If this is an MSVC-style personality function, we need to split the landing - // pad into several BBs. - const BasicBlock *LLVMBB = MBB->getBasicBlock(); - const LandingPadInst *LPadInst = LLVMBB->getLandingPadInst(); - MF->getMMI().addPersonality(MBB, cast<Function>(LPadInst->getParent() - ->getParent() - ->getPersonalityFn() - ->stripPointerCasts())); - EHPersonality Personality = MF->getMMI().getPersonalityType(); - - if (isMSVCEHPersonality(Personality)) { - SmallVector<MachineBasicBlock *, 4> ClauseBBs; - const IntrinsicInst *ActionsCall = - dyn_cast<IntrinsicInst>(LLVMBB->getFirstInsertionPt()); - // Get all invoke BBs that unwind to this landingpad. - SmallVector<MachineBasicBlock *, 4> InvokeBBs(MBB->pred_begin(), - MBB->pred_end()); - if (ActionsCall && ActionsCall->getIntrinsicID() == Intrinsic::eh_actions) { - // If this is a call to llvm.eh.actions followed by indirectbr, then we've - // run WinEHPrepare, and we should remove this block from the machine CFG. - // Mark the targets of the indirectbr as landingpads instead. - for (const BasicBlock *LLVMSucc : successors(LLVMBB)) { - MachineBasicBlock *ClauseBB = FuncInfo->MBBMap[LLVMSucc]; - // Add the edge from the invoke to the clause. - for (MachineBasicBlock *InvokeBB : InvokeBBs) - InvokeBB->addSuccessor(ClauseBB); - - // Mark the clause as a landing pad or MI passes will delete it. - ClauseBB->setIsLandingPad(); - } - } - - // Remove the edge from the invoke to the lpad. 
- for (MachineBasicBlock *InvokeBB : InvokeBBs) - InvokeBB->removeSuccessor(MBB); - - // Don't select instructions for the landingpad. - return false; - } - // Mark exception register as live in. - if (unsigned Reg = TLI->getExceptionPointerRegister()) + if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn)) FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC); // Mark exception selector register as live in. - if (unsigned Reg = TLI->getExceptionSelectorRegister()) + if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn)) FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC); return true; @@ -992,9 +1033,9 @@ bool SelectionDAGISel::PrepareEHLandingPad() { static bool isFoldedOrDeadInstruction(const Instruction *I, FunctionLoweringInfo *FuncInfo) { return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded. - !isa<TerminatorInst>(I) && // Terminators aren't folded. + !isa<TerminatorInst>(I) && // Terminators aren't folded. !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded. - !isa<LandingPadInst>(I) && // Landingpad instructions aren't folded. + !I->isEHPad() && // EH pad instructions aren't folded. !FuncInfo->isExportedInst(I); // Exported instrs must be computed. } @@ -1143,17 +1184,20 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FuncInfo->VisitedBBs.insert(LLVMBB); } - BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI(); + BasicBlock::const_iterator const Begin = + LLVMBB->getFirstNonPHI()->getIterator(); BasicBlock::const_iterator const End = LLVMBB->end(); BasicBlock::const_iterator BI = End; FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; + if (!FuncInfo->MBB) + continue; // Some blocks like catchpads have no code or MBB. FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); // Setup an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = 0; FuncInfo->ExceptionSelectorVirtReg = 0; - if (LLVMBB->isLandingPad()) + if (LLVMBB->isEHPad()) if (!PrepareEHLandingPad()) continue; @@ -1192,7 +1236,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { unsigned NumFastIselRemaining = std::distance(Begin, End); // Do FastISel on as many instructions as possible. for (; BI != Begin; --BI) { - const Instruction *Inst = std::prev(BI); + const Instruction *Inst = &*std::prev(BI); // If we no longer require this instruction, skip it. if (isFoldedOrDeadInstruction(Inst, FuncInfo)) { @@ -1212,8 +1256,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // then see if there is a load right before the selected instructions. // Try to fold the load if so. const Instruction *BeforeInst = Inst; - while (BeforeInst != Begin) { - BeforeInst = std::prev(BasicBlock::const_iterator(BeforeInst)); + while (BeforeInst != &*Begin) { + BeforeInst = &*std::prev(BasicBlock::const_iterator(BeforeInst)); if (!isFoldedOrDeadInstruction(BeforeInst, FuncInfo)) break; } @@ -1245,7 +1289,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // For the purpose of debugging, just abort. 
report_fatal_error("FastISel didn't select the entire block"); - if (!Inst->getType()->isVoidTy() && !Inst->use_empty()) { + if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() && + !Inst->use_empty()) { unsigned &R = FuncInfo->ValueMap[Inst]; if (!R) R = FuncInfo->CreateRegs(Inst->getType()); @@ -1253,7 +1298,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { bool HadTailCall = false; MachineBasicBlock::iterator SavedInsertPt = FuncInfo->InsertPt; - SelectBasicBlock(Inst, BI, HadTailCall); + SelectBasicBlock(Inst->getIterator(), BI, HadTailCall); // If the call was emitted as a tail call, we're done with the block. // We also need to delete any previously emitted instructions. @@ -1483,35 +1528,39 @@ SelectionDAGISel::FinishBasicBlock() { CodeGenAndEmitDAG(); } - uint32_t UnhandledWeight = 0; - for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) - UnhandledWeight += SDB->BitTestCases[i].Cases[j].ExtraWeight; - + BranchProbability UnhandledProb = SDB->BitTestCases[i].Prob; for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) { - UnhandledWeight -= SDB->BitTestCases[i].Cases[j].ExtraWeight; + UnhandledProb -= SDB->BitTestCases[i].Cases[j].ExtraProb; // Set the current basic block to the mbb we wish to insert the code into FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB; FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code - if (j+1 != ej) - SDB->visitBitTestCase(SDB->BitTestCases[i], - SDB->BitTestCases[i].Cases[j+1].ThisBB, - UnhandledWeight, - SDB->BitTestCases[i].Reg, - SDB->BitTestCases[i].Cases[j], - FuncInfo->MBB); + + // If all cases cover a contiguous range, it is not necessary to jump to + // the default block after the last bit test fails. This is because the + // range check during bit test header creation has guaranteed that every + // case here doesn't go outside the range. + MachineBasicBlock *NextMBB; + if (SDB->BitTestCases[i].ContiguousRange && j + 2 == ej) + NextMBB = SDB->BitTestCases[i].Cases[j + 1].TargetBB; + else if (j + 1 != ej) + NextMBB = SDB->BitTestCases[i].Cases[j + 1].ThisBB; else - SDB->visitBitTestCase(SDB->BitTestCases[i], - SDB->BitTestCases[i].Default, - UnhandledWeight, - SDB->BitTestCases[i].Reg, - SDB->BitTestCases[i].Cases[j], - FuncInfo->MBB); + NextMBB = SDB->BitTestCases[i].Default; + SDB->visitBitTestCase(SDB->BitTestCases[i], + NextMBB, + UnhandledProb, + SDB->BitTestCases[i].Reg, + SDB->BitTestCases[i].Cases[j], + FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); + + if (SDB->BitTestCases[i].ContiguousRange && j + 2 == ej) + break; } // Update PHI Nodes @@ -1642,14 +1691,7 @@ SelectionDAGISel::FinishBasicBlock() { /// one preferred by the target. /// ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() { - RegisterScheduler::FunctionPassCtor Ctor = RegisterScheduler::getDefault(); - - if (!Ctor) { - Ctor = ISHeuristic; - RegisterScheduler::setDefault(Ctor); - } - - return Ctor(this, OptLevel); + return ISHeuristic(this, OptLevel); } //===----------------------------------------------------------------------===// @@ -1961,7 +2003,7 @@ SDNode *SelectionDAGISel::Select_UNDEF(SDNode *N) { } /// GetVBR - decode a vbr encoding whose top bit is set. -LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline uint64_t GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) { assert(Val >= 128 && "Not a VBR"); Val &= 127; // Remove first vbr bit. 
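[Editorial aside: the matcher tables decoded by GetVBR above use a
variable-bit-rate encoding in which each byte carries 7 payload bits and
the high bit marks continuation. A self-contained sketch, modeled on the
full GetVBR definition (the hunk shows only its opening) and renamed
DecodeVBR to make clear it is an illustration, with a worked decode of
the value 300:

  #include <cassert>
  #include <cstdint>

  static uint64_t DecodeVBR(uint64_t Val, const unsigned char *Table,
                            unsigned &Idx) {
    assert(Val >= 128 && "Not a VBR");
    Val &= 127; // Remove first vbr bit.
    unsigned Shift = 7;
    uint64_t NextBits;
    do { // Each later byte adds 7 bits; bit 7 flags another byte follows.
      NextBits = Table[Idx++];
      Val |= (NextBits & 127) << Shift;
      Shift += 7;
    } while (NextBits & 128);
    return Val;
  }

  int main() {
    // 300 == 256 + 44: byte 0xAC carries payload 44 plus the continuation
    // bit; byte 0x02 contributes 2 << 7 == 256.
    const unsigned char Table[] = {0xAC, 0x02};
    unsigned Idx = 1; // The caller has already consumed Table[0].
    return DecodeVBR(Table[0], Table, Idx) == 300 ? 0 : 1;
  }
]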
@@ -2287,7 +2329,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, } /// CheckSame - Implements OP_CheckSame. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) { @@ -2298,7 +2340,7 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, } /// CheckChildSame - Implements OP_CheckChildXSame. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes, @@ -2310,20 +2352,20 @@ CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, } /// CheckPatternPredicate - Implements OP_CheckPatternPredicate. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, const SelectionDAGISel &SDISel) { return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]); } /// CheckNodePredicate - Implements OP_CheckNodePredicate. -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex, const SelectionDAGISel &SDISel, SDNode *N) { return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDNode *N) { uint16_t Opc = MatcherTable[MatcherIndex++]; @@ -2331,7 +2373,7 @@ CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex, return N->getOpcode() == Opc; } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const TargetLowering *TLI, const DataLayout &DL) { MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; @@ -2341,7 +2383,7 @@ CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy(DL); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const TargetLowering *TLI, const DataLayout &DL, unsigned ChildNo) { @@ -2351,14 +2393,14 @@ CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex, DL); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N) { return cast<CondCodeSDNode>(N)->get() == (ISD::CondCode)MatcherTable[MatcherIndex++]; } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const TargetLowering *TLI, const DataLayout &DL) { MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; @@ -2369,7 +2411,7 @@ CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex, return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI->getPointerTy(DL); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool 
CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N) { int64_t Val = MatcherTable[MatcherIndex++]; @@ -2380,7 +2422,7 @@ CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, return C && C->getSExtValue() == Val; } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckChildInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, unsigned ChildNo) { if (ChildNo >= N.getNumOperands()) @@ -2388,7 +2430,7 @@ CheckChildInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, return ::CheckInteger(MatcherTable, MatcherIndex, N.getOperand(ChildNo)); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; @@ -2401,7 +2443,7 @@ CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, return C && SDISel.CheckAndMask(N.getOperand(0), C, Val); } -LLVM_ATTRIBUTE_ALWAYS_INLINE static bool +LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N, const SelectionDAGISel &SDISel) { int64_t Val = MatcherTable[MatcherIndex++]; diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 4df5ede..2764688 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -80,9 +80,16 @@ namespace llvm { return true; } - static bool hasNodeAddressLabel(const SDNode *Node, - const SelectionDAG *Graph) { - return true; + static std::string getNodeIdentifierLabel(const SDNode *Node, + const SelectionDAG *Graph) { + std::string R; + raw_string_ostream OS(R); +#ifndef NDEBUG + OS << 't' << Node->PersistentId; +#else + OS << static_cast<const void *>(Node); +#endif + return R; } /// If you want to override the dot attributes printed for a particular diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 34688df..02545a7 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -95,6 +96,9 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType); const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); + auto *MFI = Builder.DAG.getMachineFunction().getFrameInfo(); + MFI->markAsStatepointSpillSlotObjectIndex(FI); + Builder.FuncInfo.StatepointStackSlots.push_back(FI); AllocatedStackSlots.push_back(true); return SpillSlot; @@ -105,8 +109,8 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, return Builder.DAG.getFrameIndex(FI, ValueType); } // Note: We deliberately choose to advance this only on the failing path. - // Doing so on the suceeding path involes a bit of complexity that caused a - // minor bug previously. 
Unless performance shows this matters, please + // Doing so on the succeeding path involves a bit of complexity that caused + // a minor bug previously. Unless performance shows this matters, please // keep this code as simple as possible. NextSlotToAllocate++; } @@ -119,18 +123,16 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType, static Optional<int> findPreviousSpillSlot(const Value *Val, SelectionDAGBuilder &Builder, int LookUpDepth) { - // Can not look any futher - give up now + // Can not look any further - give up now if (LookUpDepth <= 0) return Optional<int>(); // Spill location is known for gc relocates - if (isGCRelocate(Val)) { - GCRelocateOperands RelocOps(cast<Instruction>(Val)); - + if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) { FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap = - Builder.FuncInfo.StatepointRelocatedValues[RelocOps.getStatepoint()]; + Builder.FuncInfo.StatepointRelocatedValues[Relocate->getStatepoint()]; - auto It = SpillMap.find(RelocOps.getDerivedPtr()); + auto It = SpillMap.find(Relocate->getDerivedPtr()); if (It == SpillMap.end()) return Optional<int>(); @@ -196,7 +198,7 @@ static Optional<int> findPreviousSpillSlot(const Value *Val, /// Try to find existing copies of the incoming values in stack slots used for /// statepoint spilling. If we can find a spill slot for the incoming value, /// mark that slot as allocated, and reuse the same slot for this safepoint. -/// This helps to avoid series of loads and stores that only serve to resuffle +/// This helps to avoid series of loads and stores that only serve to reshuffle /// values on the stack between calls. static void reservePreviousStackSlotForValue(const Value *IncomingValue, SelectionDAGBuilder &Builder) { @@ -255,7 +257,7 @@ static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases, SmallVectorImpl<const Value *> &Relocs, SelectionDAGBuilder &Builder) { - // This is horribly ineffecient, but I don't care right now + // This is horribly inefficient, but I don't care right now SmallSet<SDValue, 64> Seen; SmallVector<const Value *, 64> NewBases, NewPtrs, NewRelocs; @@ -283,13 +285,29 @@ static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases, /// call node. Also update NodeMap so that getValue(statepoint) will /// reference lowered call result static SDNode * -lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, +lowerCallFromStatepoint(ImmutableStatepoint ISP, const BasicBlock *EHPadBB, SelectionDAGBuilder &Builder, SmallVectorImpl<SDValue> &PendingExports) { ImmutableCallSite CS(ISP.getCallSite()); - SDValue ActualCallee = Builder.getValue(ISP.getCalledValue()); + SDValue ActualCallee; + + if (ISP.getNumPatchBytes() > 0) { + // If we've been asked to emit a nop sequence instead of a call instruction + // for this statepoint then don't lower the call target, but use a constant + // `null` instead. Not lowering the call target lets statepoint clients get + // away without providing a physical address for the symbolic call target at + // link time. 
+ + const auto &TLI = Builder.DAG.getTargetLoweringInfo(); + const auto &DL = Builder.DAG.getDataLayout(); + + unsigned AS = ISP.getCalledValue()->getType()->getPointerAddressSpace(); + ActualCallee = Builder.DAG.getConstant(0, Builder.getCurSDLoc(), + TLI.getPointerTy(DL, AS)); + } else + ActualCallee = Builder.getValue(ISP.getCalledValue()); assert(CS.getCallingConv() != CallingConv::AnyReg && "anyregcc is not supported on statepoints!"); @@ -300,7 +318,7 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, SDValue ReturnValue, CallEndVal; std::tie(ReturnValue, CallEndVal) = Builder.lowerCallOperands( ISP.getCallSite(), ImmutableStatepoint::CallArgsBeginPos, - ISP.getNumCallArgs(), ActualCallee, DefTy, LandingPad, + ISP.getNumCallArgs(), ActualCallee, DefTy, EHPadBB, false /* IsPatchPoint */); SDNode *CallEnd = CallEndVal.getNode(); @@ -317,25 +335,33 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, // ch, glue = callseq_end ch, glue // get_return_value ch, glue // - // get_return_value can either be a CopyFromReg to grab the return value from - // %RAX, or it can be a LOAD to load a value returned by reference via a stack - // slot. + // get_return_value can either be a sequence of CopyFromReg instructions + // to grab the return value from the return register(s), or it can be a LOAD + // to load a value returned by reference via a stack slot. - if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg || - CallEnd->getOpcode() == ISD::LOAD)) - CallEnd = CallEnd->getOperand(0).getNode(); + if (HasDef) { + if (CallEnd->getOpcode() == ISD::LOAD) + CallEnd = CallEnd->getOperand(0).getNode(); + else + while (CallEnd->getOpcode() == ISD::CopyFromReg) + CallEnd = CallEnd->getOperand(0).getNode(); + } assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "expected!"); - if (HasDef) { - if (CS.isInvoke()) { - // Result value will be used in different basic block for invokes - // so we need to export it now. But statepoint call has a different type - // than the actuall call. It means that standart exporting mechanism will - // create register of the wrong type. So instead we need to create - // register with correct type and save value into it manually. + // Export the result value if needed + const Instruction *GCResult = ISP.getGCResult(); + if (HasDef && GCResult) { + if (GCResult->getParent() != CS.getParent()) { + // Result value will be used in a different basic block so we need to + // export it now. + // Default exporting mechanism will not work here because statepoint call + // has a different type than the actual call. It means that by default + // llvm will create export register of the wrong type (always i32 in our + // case). So instead we need to create export register with correct type + // manually. // TODO: To eliminate this problem we can remove gc.result intrinsics - // completelly and make statepoint call to return a tuple. + // completely and make statepoint call to return a tuple. unsigned Reg = Builder.FuncInfo.CreateRegs(ISP.getActualReturnType()); RegsForValue RFV( *Builder.DAG.getContext(), Builder.DAG.getTargetLoweringInfo(), @@ -347,8 +373,9 @@ lowerCallFromStatepoint(ImmutableStatepoint ISP, MachineBasicBlock *LandingPad, PendingExports.push_back(Chain); Builder.FuncInfo.ValueMap[CS.getInstruction()] = Reg; } else { - // The value of the statepoint itself will be the value of call itself. - // We'll replace the actually call node shortly. 
gc_result will grab + // Result value will be used in the same basic block. Don't export it or + // perform any explicit register copies. + // We'll replace the actual call node shortly. gc_result will grab // this value. Builder.setValue(CS.getInstruction(), ReturnValue); } @@ -372,10 +399,10 @@ static void getIncomingStatepointGCValues( SmallVectorImpl<const Value *> &Bases, SmallVectorImpl<const Value *> &Ptrs, SmallVectorImpl<const Value *> &Relocs, ImmutableStatepoint StatepointSite, SelectionDAGBuilder &Builder) { - for (GCRelocateOperands relocateOpers : StatepointSite.getRelocates()) { - Relocs.push_back(relocateOpers.getUnderlyingCallSite().getInstruction()); - Bases.push_back(relocateOpers.getBasePtr()); - Ptrs.push_back(relocateOpers.getDerivedPtr()); + for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) { + Relocs.push_back(Relocate); + Bases.push_back(Relocate->getBasePtr()); + Ptrs.push_back(Relocate->getDerivedPtr()); } // Remove any redundant llvm::Values which map to the same SDValue as another @@ -411,7 +438,8 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain, // chaining stores one after another, this may allow // a bit more optimal scheduling for them Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc, - MachinePointerInfo::getFixedStack(Index), + MachinePointerInfo::getFixedStack( + Builder.DAG.getMachineFunction(), Index), false, false, 0); Builder.StatepointLowering.setLocation(Incoming, Loc); @@ -433,7 +461,9 @@ static void lowerIncomingStatepointValue(SDValue Incoming, // If the original value was a constant, make sure it gets recorded as // such in the stackmap. This is required so that the consumer can // parse any internal format to the deopt state. It also handles null - // pointers and other constant pointers in GC states + // pointers and other constant pointers in GC states. Note the constant + // vectors do not appear to actually hit this path and that anything larger + // than an i64 value (not type!) will fail asserts here. pushStackMapConstant(Ops, Builder, C->getSExtValue()); } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) { // This handles allocas as arguments to the statepoint (this is only @@ -477,27 +507,27 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, #ifndef NDEBUG // Check that each of the gc pointer and bases we've gotten out of the - // safepoint is something the strategy thinks might be a pointer into the GC - // heap. This is basically just here to help catch errors during statepoint - // insertion. TODO: This should actually be in the Verifier, but we can't get - // to the GCStrategy from there (yet). + // safepoint is something the strategy thinks might be a pointer (or vector + // of pointers) into the GC heap. This is basically just here to help catch + // errors during statepoint insertion. TODO: This should actually be in the + // Verifier, but we can't get to the GCStrategy from there (yet).
GCStrategy &S = Builder.GFI->getStrategy(); for (const Value *V : Bases) { - auto Opt = S.isGCManagedPointer(V); + auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt.hasValue()) { assert(Opt.getValue() && "non gc managed base pointer found in statepoint"); } } for (const Value *V : Ptrs) { - auto Opt = S.isGCManagedPointer(V); + auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt.hasValue()) { assert(Opt.getValue() && "non gc managed derived pointer found in statepoint"); } } for (const Value *V : Relocations) { - auto Opt = S.isGCManagedPointer(V); + auto Opt = S.isGCManagedPointer(V->getType()->getScalarType()); if (Opt.hasValue()) { assert(Opt.getValue() && "non gc managed pointer relocated"); } @@ -572,8 +602,8 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops, FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap = Builder.FuncInfo.StatepointRelocatedValues[StatepointInstr]; - for (GCRelocateOperands RelocateOpers : StatepointSite.getRelocates()) { - const Value *V = RelocateOpers.getDerivedPtr(); + for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) { + const Value *V = Relocate->getDerivedPtr(); SDValue SDV = Builder.getValue(V); SDValue Loc = Builder.StatepointLowering.getLocation(SDV); @@ -581,19 +611,20 @@ if (Loc.getNode()) { SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex(); } else { // Record value as visited, but not spilled. This is case for allocas - // and constants. For this values we can avoid emiting spill load while + // and constants. For these values we can avoid emitting spill load while // visiting corresponding gc_relocate. // Actually we do not need to record them in this map at all. - // We do this only to check that we are not relocating any unvisited value. + // We do this only to check that we are not relocating any unvisited + // value. SpillMap[V] = None; // Default llvm mechanisms for exporting values which are used in // different basic blocks does not work for gc relocates. // Note that it would be incorrect to teach llvm that all relocates are - // uses of the corresponging values so that it would automatically + // uses of the corresponding values so that it would automatically // export them. Relocates of the spilled values does not use original // value. - if (StatepointSite.getCallSite().isInvoke()) + if (Relocate->getParent() != StatepointInstr->getParent()) Builder.ExportFromCurrentBlock(V); } } @@ -608,7 +639,7 @@ void SelectionDAGBuilder::visitStatepoint(const CallInst &CI) { } void SelectionDAGBuilder::LowerStatepoint( - ImmutableStatepoint ISP, MachineBasicBlock *LandingPad /*=nullptr*/) { + ImmutableStatepoint ISP, const BasicBlock *EHPadBB /*= nullptr*/) { // The basic scheme here is that information about both the original call and // the safepoint is encoded in the CallInst. We create a temporary call and // lower it, then reverse engineer the calling sequence. @@ -620,14 +651,12 @@ void SelectionDAGBuilder::LowerStatepoint( ImmutableCallSite CS(ISP.getCallSite()); #ifndef NDEBUG - // Consistency check. Don't do this for invokes. It would be too - // expensive to preserve this information across different basic blocks - if (!CS.isInvoke()) { - for (const User *U : CS->users()) { - const CallInst *Call = cast<CallInst>(U); - if (isGCRelocate(Call)) - StatepointLowering.scheduleRelocCall(*Call); - } + // Consistency check. Check only relocates in the same basic block as their + // statepoint.
+ for (const User *U : CS->users()) { + const CallInst *Call = cast<CallInst>(U); + if (isa<GCRelocateInst>(Call) && Call->getParent() == CS.getParent()) + StatepointLowering.scheduleRelocCall(*Call); } #endif @@ -648,7 +677,7 @@ void SelectionDAGBuilder::LowerStatepoint( // Get call node, we will replace it later with statepoint SDNode *CallNode = - lowerCallFromStatepoint(ISP, LandingPad, *this, PendingExports); + lowerCallFromStatepoint(ISP, EHPadBB, *this, PendingExports); // Construct the actual GC_TRANSITION_START, STATEPOINT, and GC_TRANSITION_END // nodes with all the appropriate arguments and return values. @@ -790,7 +819,7 @@ void SelectionDAGBuilder::LowerStatepoint( // Replace original call DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root - // Remove originall call node + // Remove original call node DAG.DeleteNode(CallNode); // DON'T set the root - under the assumption that it's already set past the @@ -809,8 +838,9 @@ void SelectionDAGBuilder::visitGCResult(const CallInst &CI) { Instruction *I = cast<Instruction>(CI.getArgOperand(0)); assert(isStatepoint(I) && "first argument must be a statepoint token"); - if (isa<InvokeInst>(I)) { - // For invokes we should have stored call result in a virtual register. + if (I->getParent() != CI.getParent()) { + // Statepoint is in a different basic block so we should have stored call + // result in a virtual register. // We can not use default getValue() functionality to copy value from this // register because statepoint and actuall call return types can be // different, and getValue() will use CopyFromReg of the wrong type, @@ -828,23 +858,22 @@ void SelectionDAGBuilder::visitGCResult(const CallInst &CI) { } } -void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) { - GCRelocateOperands RelocateOpers(&CI); - +void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { #ifndef NDEBUG // Consistency check - // We skip this check for invoke statepoints. It would be too expensive to - // preserve validation info through different basic blocks. - if (!RelocateOpers.isTiedToInvoke()) { - StatepointLowering.relocCallVisited(CI); + // We skip this check for relocates not in the same basic block as their + // statepoint. It would be too expensive to preserve validation info through + // different basic blocks. + if (Relocate.getStatepoint()->getParent() == Relocate.getParent()) { + StatepointLowering.relocCallVisited(Relocate); } #endif - const Value *DerivedPtr = RelocateOpers.getDerivedPtr(); + const Value *DerivedPtr = Relocate.getDerivedPtr(); SDValue SD = getValue(DerivedPtr); FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap = - FuncInfo.StatepointRelocatedValues[RelocateOpers.getStatepoint()]; + FuncInfo.StatepointRelocatedValues[Relocate.getStatepoint()]; // We should have recorded location for this pointer assert(SpillMap.count(DerivedPtr) && "Relocating not lowered gc value"); @@ -853,7 +882,7 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) { // We didn't need to spill these special cases (constants and allocas). // See the handling in spillIncomingValueForStatepoint for detail. if (!DerivedPtrLocation) { - setValue(&CI, SD); + setValue(&Relocate, SD); return; } @@ -862,17 +891,18 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) { // Be conservative: flush all pending loads // TODO: Probably we can be less restrictive on this, - // it may allow more scheduling opprtunities + // it may allow more scheduling opportunities.
SDValue Chain = getRoot(); SDValue SpillLoad = - DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot, - MachinePointerInfo::getFixedStack(*DerivedPtrLocation), - false, false, false, 0); + DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + *DerivedPtrLocation), + false, false, false, 0); // Again, be conservative, don't emit pending loads DAG.setRoot(SpillLoad.getValue(1)); assert(SpillLoad.getNode()); - setValue(&CI, SpillLoad); + setValue(&Relocate, SpillLoad); } diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index fbf6512..c64d882 100644 --- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -85,21 +85,22 @@ void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS, std::pair<SDValue, SDValue> TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, - const SDValue *Ops, unsigned NumOps, + ArrayRef<SDValue> Ops, bool isSigned, SDLoc dl, bool doesNotReturn, bool isReturnValueUsed) const { TargetLowering::ArgListTy Args; - Args.reserve(NumOps); + Args.reserve(Ops.size()); TargetLowering::ArgListEntry Entry; - for (unsigned i = 0; i != NumOps; ++i) { - Entry.Node = Ops[i]; + for (SDValue Op : Ops) { + Entry.Node = Op; Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = shouldSignExtendTypeInLibCall(Ops[i].getValueType(), isSigned); - Entry.isZExt = !shouldSignExtendTypeInLibCall(Ops[i].getValueType(), isSigned); + Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); + Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned); Args.push_back(Entry); } + if (LC == RTLIB::UNKNOWN_LIBCALL) report_fatal_error("Unsupported library call operation!"); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), @@ -115,9 +116,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, return LowerCallTo(CLI); } - -/// SoftenSetCCOperands - Soften the operands of a comparison. This code is -/// shared among BR_CC, SELECT_CC, and SETCC handlers. +/// Soften the operands of a comparison. This code is shared among BR_CC, +/// SELECT_CC, and SETCC handlers. void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, @@ -127,6 +127,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, // Expand into one or more soft-fp libcall(s). RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL; + bool ShouldInvertCC = false; switch (CCCode) { case ISD::SETEQ: case ISD::SETOEQ: @@ -166,34 +167,38 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, LC1 = (VT == MVT::f32) ? RTLIB::O_F32 : (VT == MVT::f64) ? RTLIB::O_F64 : RTLIB::O_F128; break; - default: + case ISD::SETONE: + // SETONE = SETOLT | SETOGT + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; + break; + case ISD::SETUEQ: LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128; + LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : + (VT == MVT::f64) ? 
RTLIB::OEQ_F64 : RTLIB::OEQ_F128; + break; + default: + // Invert CC for unordered comparisons + ShouldInvertCC = true; switch (CCCode) { - case ISD::SETONE: - // SETONE = SETOLT | SETOGT - LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : - (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; - // Fallthrough - case ISD::SETUGT: - LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : - (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; - break; - case ISD::SETUGE: - LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 : - (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128; - break; case ISD::SETULT: - LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 : - (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : + (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128; break; case ISD::SETULE: - LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 : + LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; + break; + case ISD::SETUGT: + LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128; break; - case ISD::SETUEQ: - LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : - (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128; + case ISD::SETUGE: + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; break; default: llvm_unreachable("Do not know how to soften this setcc!"); } @@ -201,17 +206,21 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, // Use the target specific return value for comparions lib calls. EVT RetVT = getCmpLibcallReturnType(); - SDValue Ops[2] = { NewLHS, NewRHS }; - NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/, + SDValue Ops[2] = {NewLHS, NewRHS}; + NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, false /*sign irrelevant*/, dl).first; NewRHS = DAG.getConstant(0, dl, RetVT); + CCCode = getCmpLibcallCC(LC1); + if (ShouldInvertCC) + CCCode = getSetCCInverse(CCCode, /*isInteger=*/true); + if (LC2 != RTLIB::UNKNOWN_LIBCALL) { SDValue Tmp = DAG.getNode( ISD::SETCC, dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT), NewLHS, NewRHS, DAG.getCondCode(CCCode)); - NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/, + NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, false/*sign irrelevant*/, dl).first; NewLHS = DAG.getNode( ISD::SETCC, dl, @@ -222,9 +231,8 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, } } -/// getJumpTableEncoding - Return the entry encoding for a jump table in the -/// current function. The returned value is a member of the -/// MachineJumpTableInfo::JTEntryKind enum. +/// Return the entry encoding for a jump table in the current function. The +/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum. unsigned TargetLowering::getJumpTableEncoding() const { // In non-pic modes, just use the address of a block. if (getTargetMachine().getRelocationModel() != Reloc::PIC_) @@ -250,9 +258,8 @@ SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table, return Table; } -/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the -/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an -/// MCExpr. +/// This returns the relocation base for the given PIC jumptable, the same as +/// getPICJumpTableRelocBase, but as an MCExpr. 
const MCExpr * TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,MCContext &Ctx) const{ @@ -279,10 +286,9 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // Optimization Methods //===----------------------------------------------------------------------===// -/// ShrinkDemandedConstant - Check to see if the specified operand of the -/// specified instruction is a constant integer. If so, check to see if there -/// are any bits set in the constant that are not demanded. If so, shrink the -/// constant and return true. +/// Check to see if the specified operand of the specified instruction is a +/// constant integer. If so, check to see if there are any bits set in the +/// constant that are not demanded. If so, shrink the constant and return true. bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded) { SDLoc dl(Op); @@ -317,10 +323,9 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op, return false; } -/// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the -/// casts are free. This uses isZExtFree and ZERO_EXTEND for the widening -/// cast, but it could be generalized for targets with other types of -/// implicit widening casts. +/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. +/// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be +/// generalized for targets with other types of implicit widening casts. bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, @@ -366,13 +371,13 @@ TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, return false; } -/// SimplifyDemandedBits - Look at Op. At this point, we know that only the -/// DemandedMask bits of the result of Op are ever used downstream. If we can -/// use this information to simplify Op, create a new simplified DAG node and -/// return true, returning the original and new nodes in Old and New. Otherwise, -/// analyze the expression and return a mask of KnownOne and KnownZero bits for -/// the expression (used to simplify the caller). The KnownZero/One bits may -/// only be accurate for those bits in the DemandedMask. +/// Look at Op. At this point, we know that only the DemandedMask bits of the +/// result of Op are ever used downstream. If we can use this information to +/// simplify Op, create a new simplified DAG node and return true, returning the +/// original and new nodes in Old and New. Otherwise, analyze the expression and +/// return a mask of KnownOne and KnownZero bits for the expression (used to +/// simplify the caller). The KnownZero/One bits may only be accurate for those +/// bits in the DemandedMask. bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, APInt &KnownZero, @@ -1061,7 +1066,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, Op.getOperand(0).getValueType().isFloatingPoint()) { bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType()); bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32); - if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) { + if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple() && + Op.getOperand(0).getValueType() != MVT::f128) { + // Cannot eliminate/lower SHL for f128 yet. EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32; // Make a FGETSIGN + SHL to move the sign bit into the appropriate // place. We expect the SHL to be eliminated by other optimizations. 
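The ShrinkDemandedConstant comment rewritten above captures a simple idea: when only some result bits of a bitwise operation are ever demanded downstream, constant bits outside that mask are dead weight and the immediate can be narrowed. A self-contained illustration of the masking arithmetic follows (hypothetical example values; the DAG-level implementation must additionally rebuild the node and report Old/New through TargetLoweringOpt):

#include <cstdint>
#include <cstdio>

int main() {
  // Imagine (x & C) where later code only ever inspects the low byte.
  uint64_t C        = 0xFFFF00FFULL; // original immediate
  uint64_t Demanded = 0x000000FFULL; // result bits actually used
  if (C & ~Demanded) {
    // Clearing undemanded constant bits only changes result bits nobody
    // reads, and the narrower immediate is often cheaper to materialize.
    uint64_t Shrunk = C & Demanded;  // 0xFF
    std::printf("0x%llx -> 0x%llx\n", (unsigned long long)C,
                (unsigned long long)Shrunk);
  }
  return 0;
}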
@@ -1120,9 +1127,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } -/// computeKnownBitsForTargetNode - Determine which of the bits specified -/// in Mask are known to be either zero or one and return them in the -/// KnownZero/KnownOne bitsets. +/// Determine which of the bits specified in Mask are known to be either zero or +/// one and return them in the KnownZero/KnownOne bitsets. void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, @@ -1137,9 +1143,8 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); } -/// ComputeNumSignBitsForTargetNode - This method can be implemented by -/// targets that want to expose additional information about sign bits to the -/// DAG Combiner. +/// This method can be implemented by targets that want to expose additional +/// information about sign bits to the DAG Combiner. unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &, unsigned Depth) const { @@ -1152,10 +1157,8 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, return 1; } -/// ValueHasExactlyOneBitSet - Test if the given value is known to have exactly -/// one bit set. This differs from computeKnownBits in that it doesn't need to -/// determine which bit is set. -/// +/// Test if the given value is known to have exactly one bit set. This differs +/// from computeKnownBits in that it doesn't need to determine which bit is set. static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) { // A left-shift of a constant one will have exactly one bit set, because // shifting the bit off the end is undefined. @@ -1239,8 +1242,8 @@ bool TargetLowering::isConstFalseVal(const SDNode *N) const { return CN->isNullValue(); } -/// SimplifySetCC - Try to simplify a setcc built with the specified operands -/// and cc. If it is unable to simplify it, return a null SDValue. +/// Try to simplify a setcc built with the specified operands and cc. If it is +/// unable to simplify it, return a null SDValue. 
SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, bool foldBooleans, @@ -1270,7 +1273,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, isCondCodeLegal(SwappedCC, N0.getSimpleValueType()))) return DAG.getSetCC(dl, VT, N1, N0, SwappedCC); - if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) { + if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) { const APInt &C1 = N1C->getAPIntValue(); // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an @@ -1335,7 +1338,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, PreExt = N0->getOperand(0); } else if (N0->getOpcode() == ISD::AND) { // DAGCombine turns costly ZExts into ANDs - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) + if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) if ((C->getAPIntValue()+1).isPowerOf2()) { MinBits = C->getAPIntValue().countTrailingOnes(); PreExt = N0->getOperand(0); @@ -1345,7 +1348,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, MinBits = N0->getOperand(0).getValueSizeInBits(); PreExt = N0->getOperand(0); Signed = true; - } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) { + } else if (auto *LN0 = dyn_cast<LoadSDNode>(N0)) { // ZEXTLOAD / SEXTLOAD if (LN0->getExtensionType() == ISD::ZEXTLOAD) { MinBits = LN0->getMemoryVT().getSizeInBits(); @@ -1697,8 +1700,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) && N0.getOpcode() == ISD::AND) { auto &DL = DAG.getDataLayout(); - if (ConstantSDNode *AndRHS = - dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { EVT ShiftTy = DCI.isBeforeLegalize() ? getPointerTy(DL) : getShiftAmountTy(N0.getValueType(), DL); @@ -1728,8 +1730,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // (X & -256) == 256 -> (X >> 8) == 1 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && N0.getOpcode() == ISD::AND && N0.hasOneUse()) { - if (ConstantSDNode *AndRHS = - dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { const APInt &AndRHSC = AndRHS->getAPIntValue(); if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) { unsigned ShiftBits = AndRHSC.countTrailingZeros(); @@ -1783,7 +1784,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // Constant fold or commute setcc. SDValue O = DAG.FoldSetCC(VT, N0, N1, Cond, dl); if (O.getNode()) return O; - } else if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) { + } else if (auto *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) { // If the RHS of an FP comparison is a constant, simplify it away in // some cases. if (CFP->getValueAPF().isNaN()) { @@ -1900,8 +1901,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // to be careful about increasing register pressure needlessly. 
bool LegalRHSImm = false; - if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(N1)) { - if (ConstantSDNode *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + if (auto *RHSC = dyn_cast<ConstantSDNode>(N1)) { + if (auto *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { // Turn (X+C1) == C2 --> X == C2-C1 if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) { return DAG.getSetCC(dl, VT, N0.getOperand(0), @@ -1924,7 +1925,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } // Turn (C1-X) == C2 --> X == C1-C2 - if (ConstantSDNode *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) { + if (auto *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) { if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) { return DAG.getSetCC(dl, VT, N0.getOperand(1), @@ -2075,12 +2076,11 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return SDValue(); } -/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the -/// node is a GlobalAddress + offset. +/// Returns true (and the GlobalValue and the offset) if the node is a +/// GlobalAddress + offset. bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA, int64_t &Offset) const { - if (isa<GlobalAddressSDNode>(N)) { - GlobalAddressSDNode *GASD = cast<GlobalAddressSDNode>(N); + if (auto *GASD = dyn_cast<GlobalAddressSDNode>(N)) { GA = GASD->getGlobal(); Offset += GASD->getOffset(); return true; @@ -2090,14 +2090,12 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA, SDValue N1 = N->getOperand(0); SDValue N2 = N->getOperand(1); if (isGAPlusOffset(N1.getNode(), GA, Offset)) { - ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2); - if (V) { + if (auto *V = dyn_cast<ConstantSDNode>(N2)) { Offset += V->getSExtValue(); return true; } } else if (isGAPlusOffset(N2.getNode(), GA, Offset)) { - ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1); - if (V) { + if (auto *V = dyn_cast<ConstantSDNode>(N1)) { Offset += V->getSExtValue(); return true; } @@ -2107,9 +2105,8 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA, return false; } - -SDValue TargetLowering:: -PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { +SDValue TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { // Default implementation: no optimization. return SDValue(); } @@ -2159,9 +2156,9 @@ TargetLowering::getConstraintType(StringRef Constraint) const { return C_Unknown; } -/// LowerXConstraint - try to replace an X constraint, which matches anything, -/// with another that has more specific requirements based on the type of the -/// corresponding operand. +/// Try to replace an X constraint, which matches anything, with another that +/// has more specific requirements based on the type of the corresponding +/// operand. const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{ if (ConstraintVT.isInteger()) return "r"; @@ -2170,8 +2167,8 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{ return nullptr; } -/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. +/// Lower the specified operand into the Ops vector. +/// If it is invalid, don't add anything to Ops. 
void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, @@ -2284,31 +2281,30 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, //===----------------------------------------------------------------------===// // Constraint Selection. -/// isMatchingInputConstraint - Return true of this is an input operand that is -/// a matching constraint like "4". +/// Return true if this is an input operand that is a matching constraint like +/// "4". bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const { assert(!ConstraintCode.empty() && "No known constraint!"); return isdigit(static_cast<unsigned char>(ConstraintCode[0])); } -/// getMatchedOperand - If this is an input matching constraint, this method -/// returns the output operand it matches. +/// If this is an input matching constraint, this method returns the output +/// operand it matches. unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const { assert(!ConstraintCode.empty() && "No known constraint!"); return atoi(ConstraintCode.c_str()); } - -/// ParseConstraints - Split up the constraint string from the inline -/// assembly value into the specific constraints and their prefixes, -/// and also tie in the associated operand values. +/// Split up the constraint string from the inline assembly value into the +/// specific constraints and their prefixes, and also tie in the associated +/// operand values. /// If this returns an empty vector, and if the constraint string itself /// isn't empty, there was an error parsing. TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, ImmutableCallSite CS) const { - /// ConstraintOperands - Information about all of the constraints. + /// Information about all of the constraints. AsmOperandInfoVector ConstraintOperands; const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue()); unsigned maCount = 0; // Largest number of multiple alternative constraints. @@ -2483,16 +2479,13 @@ TargetLowering::ParseConstraints(const DataLayout &DL, " incompatible type!"); } } - } } return ConstraintOperands; } - -/// getConstraintGenerality - Return an integer indicating how general CT -/// is. +/// Return an integer indicating how general CT is. static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) { switch (CT) { case TargetLowering::C_Other: @@ -2581,8 +2574,8 @@ TargetLowering::ConstraintWeight return weight; } -/// ChooseConstraint - If there are multiple different constraints that we -/// could pick for this operand (e.g. "imr") try to pick the 'best' one. +/// If there are multiple different constraints that we could pick for this +/// operand (e.g. "imr") try to pick the 'best' one. /// This is somewhat tricky: constraints fall into four classes: /// Other -> immediates and magic values /// Register -> one specific register @@ -2649,9 +2642,8 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, OpInfo.ConstraintType = BestType; } -/// ComputeConstraintToUse - Determines the constraint code and constraint -/// type to use for the specific AsmOperandInfo, setting -/// OpInfo.ConstraintCode and OpInfo.ConstraintType. +/// Determines the constraint code and constraint type to use for the specific +/// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG) const { @@ -2717,6 +2709,16 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d, return Mul; } +SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + std::vector<SDNode *> *Created) const { + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + return SDValue(); +} + /// \brief Given an ISD::SDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. @@ -3036,3 +3038,46 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, DAG.getConstant(0, dl, NVT), Ret, ISD::SETLT); return true; } + +//===----------------------------------------------------------------------===// +// Implementation of Emulated TLS Model +//===----------------------------------------------------------------------===// + +SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, + SelectionDAG &DAG) const { + // Access to address of TLS variable xyz is lowered to a function call: + // __emutls_get_address( address of global variable named "__emutls_v.xyz" ) + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + PointerType *VoidPtrType = Type::getInt8PtrTy(*DAG.getContext()); + SDLoc dl(GA); + + ArgListTy Args; + ArgListEntry Entry; + std::string NameString = ("__emutls_v." + GA->getGlobal()->getName()).str(); + Module *VariableModule = const_cast<Module*>(GA->getGlobal()->getParent()); + StringRef EmuTlsVarName(NameString); + GlobalVariable *EmuTlsVar = VariableModule->getNamedGlobal(EmuTlsVarName); + if (!EmuTlsVar) + EmuTlsVar = dyn_cast_or_null<GlobalVariable>( + VariableModule->getOrInsertGlobal(EmuTlsVarName, VoidPtrType)); + Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT); + Entry.Ty = VoidPtrType; + Args.push_back(Entry); + + SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()); + CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args), 0); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. + // At least for X86 targets, maybe good for other targets too? + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); // Is this only for X86 target? + MFI->setHasCalls(true); + + assert((GA->getOffset() == 0) && + "Emulated TLS must have zero offset in GlobalAddressSDNode"); + return CallResult.first; +} diff --git a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index e7b2a8e..878eeee 100644 --- a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -112,7 +112,7 @@ public: case 1: // Find all 'return', 'resume', and 'unwind' instructions. while (StateBB != StateE) { - BasicBlock *CurBB = StateBB++; + BasicBlock *CurBB = &*StateBB++; // Branches and invokes do not escape, only unwind, resume, and return // do.
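The LowerToTLSEmulatedModel function added to TargetLowering.cpp above has a straightforward source-level reading: under the emulated TLS model, every access to a thread-local xyz becomes a call to the runtime helper __emutls_get_address, passing the address of a per-variable control object named __emutls_v.xyz. A rough C++ sketch of what that corresponds to (the control-struct layout here is a simplified assumption, and the dot in the real symbol name cannot be spelled as a C++ identifier, so an underscore stands in):

extern "C" void *__emutls_get_address(void *control);

// Simplified stand-in for the control object the compiler emits for
// `thread_local int xyz;` under -femulated-tls (field layout assumed).
struct EmuTlsControl {
  unsigned long Size, Align, Index;
  void *Template; // initial-value template, or null for zero-init
};
static EmuTlsControl EmuTlsV_xyz = {sizeof(int), alignof(int), 0, nullptr};

int loadXyz() {
  // What a plain `return xyz;` lowers to: ask the runtime for this
  // thread's copy of the variable, then load through the pointer.
  return *static_cast<int *>(__emutls_get_address(&EmuTlsV_xyz));
}

Because the address is produced by an ordinary call, the hunk also has to mark the MachineFrameInfo as having calls and adjusting the stack, which is what the MFI lines after LowerCallTo do.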
@@ -120,7 +120,7 @@ public: if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI)) continue; - Builder.SetInsertPoint(TI->getParent(), TI); + Builder.SetInsertPoint(TI); return &Builder; } @@ -163,8 +163,8 @@ public: // Split the basic block containing the function call. BasicBlock *CallBB = CI->getParent(); - BasicBlock *NewBB = - CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont"); + BasicBlock *NewBB = CallBB->splitBasicBlock( + CI->getIterator(), CallBB->getName() + ".cont"); // Remove the unconditional branch inserted at the end of CallBB. CallBB->getInstList().pop_back(); @@ -184,7 +184,7 @@ public: delete CI; } - Builder.SetInsertPoint(RI->getParent(), RI); + Builder.SetInsertPoint(RI); return &Builder; } } diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp index 4463cc7..d361a6c 100644 --- a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -43,9 +43,12 @@ // points must be in the same loop. // Property #3 is ensured via the MachineBlockFrequencyInfo. // -// If this pass found points matching all this properties, then -// MachineFrameInfo is updated this that information. +// If this pass finds points matching all these properties, then +// MachineFrameInfo is updated with this information. //===----------------------------------------------------------------------===// +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" // To check for profitability. #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -61,11 +64,14 @@ #include "llvm/CodeGen/Passes.h" // To know about callee-saved. #include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" // To query the target about frame lowering. #include "llvm/Target/TargetFrameLowering.h" // To know about frame setup operation. #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" // To access TargetInstrInfo. #include "llvm/Target/TargetSubtargetInfo.h" @@ -78,6 +84,10 @@ STATISTIC(NumCandidates, "Number of shrink-wrapping candidates"); STATISTIC(NumCandidatesDropped, "Number of shrink-wrapping candidates dropped because of frequency"); +static cl::opt<cl::boolOrDefault> + EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden, + cl::desc("enable the shrink-wrapping pass")); + namespace { /// \brief Class to determine where the safe point to insert the /// prologue and epilogue are. @@ -113,18 +123,38 @@ class ShrinkWrap : public MachineFunctionPass { unsigned FrameDestroyOpcode; /// Entry block. const MachineBasicBlock *Entry; + typedef SmallSetVector<unsigned, 16> SetOfRegs; + /// Registers that need to be saved for the current function. + mutable SetOfRegs CurrentCSRs; + /// Current MachineFunction. + MachineFunction *MachineFunc; /// \brief Check if \p MI uses or defines a callee-saved register or /// a frame index. If this is the case, this means \p MI must happen /// after Save and before Restore.
- bool useOrDefCSROrFI(const MachineInstr &MI) const; + bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const; + + const SetOfRegs &getCurrentCSRs(RegScavenger *RS) const { + if (CurrentCSRs.empty()) { + BitVector SavedRegs; + const TargetFrameLowering *TFI = + MachineFunc->getSubtarget().getFrameLowering(); + + TFI->determineCalleeSaves(*MachineFunc, SavedRegs, RS); + + for (int Reg = SavedRegs.find_first(); Reg != -1; + Reg = SavedRegs.find_next(Reg)) + CurrentCSRs.insert((unsigned)Reg); + } + return CurrentCSRs; + } /// \brief Update the Save and Restore points such that \p MBB is in /// the region that is dominated by Save and post-dominated by Restore /// and Save and Restore still match the safe point definition. /// Such point may not exist and Save and/or Restore may be null after /// this call. - void updateSaveRestorePoints(MachineBasicBlock &MBB); + void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS); /// \brief Initialize the pass for \p MF. void init(MachineFunction &MF) { @@ -140,6 +170,8 @@ class ShrinkWrap : public MachineFunctionPass { FrameSetupOpcode = TII.getCallFrameSetupOpcode(); FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); Entry = &MF.front(); + CurrentCSRs.clear(); + MachineFunc = &MF; ++NumFunc; } @@ -148,6 +180,9 @@ class ShrinkWrap : public MachineFunctionPass { /// shrink-wrapping. bool ArePointsInteresting() const { return Save != Entry && Save && Restore; } + /// \brief Check if shrink wrapping is enabled for this target and function. + static bool isShrinkWrapEnabled(const MachineFunction &MF); + public: static char ID; @@ -185,27 +220,34 @@ INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false) -bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI) const { +bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, + RegScavenger *RS) const { if (MI.getOpcode() == FrameSetupOpcode || MI.getOpcode() == FrameDestroyOpcode) { DEBUG(dbgs() << "Frame instruction: " << MI << '\n'); return true; } for (const MachineOperand &MO : MI.operands()) { - bool UseCSR = false; + bool UseOrDefCSR = false; if (MO.isReg()) { unsigned PhysReg = MO.getReg(); if (!PhysReg) continue; assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Unallocated register?!"); - UseCSR = RCI.getLastCalleeSavedAlias(PhysReg); + UseOrDefCSR = RCI.getLastCalleeSavedAlias(PhysReg); + } else if (MO.isRegMask()) { + // Check if this regmask clobbers any of the CSRs. + for (unsigned Reg : getCurrentCSRs(RS)) { + if (MO.clobbersPhysReg(Reg)) { + UseOrDefCSR = true; + break; + } + } } - // TODO: Handle regmask more accurately. - // For now, be conservative about them. - if (UseCSR || MO.isFI() || MO.isRegMask()) { - DEBUG(dbgs() << "Use or define CSR(" << UseCSR << ") or FI(" << MO.isFI() - << "): " << MI << '\n'); + if (UseOrDefCSR || MO.isFI()) { + DEBUG(dbgs() << "Use or define CSR(" << UseOrDefCSR << ") or FI(" + << MO.isFI() << "): " << MI << '\n'); return true; } } @@ -222,10 +264,13 @@ MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs, if (!IDom) break; } + if (IDom == &Block) + return nullptr; return IDom; } -void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { +void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, + RegScavenger *RS) { // Get rid of the easy cases first. 
if (!Save) Save = &MBB; @@ -246,7 +291,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { // terminator. if (Restore == &MBB) { for (const MachineInstr &Terminator : MBB.terminators()) { - if (!useOrDefCSROrFI(Terminator)) + if (!useOrDefCSROrFI(Terminator, RS)) continue; // One of the terminator needs to happen before the restore point. if (MBB.succ_empty()) { @@ -277,7 +322,24 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { while (Save && Restore && (!(SaveDominatesRestore = MDT->dominates(Save, Restore)) || !(RestorePostDominatesSave = MPDT->dominates(Restore, Save)) || - MLI->getLoopFor(Save) != MLI->getLoopFor(Restore))) { + // Post-dominance is not enough in loops to ensure that all uses/defs + // are after the prologue and before the epilogue at runtime. + // E.g., + // while(1) { + // Save + // Restore + // if (...) + // break; + // use/def CSRs + // } + // All the uses/defs of CSRs are dominated by Save and post-dominated + // by Restore. However, the CSR uses are still reachable after + // Restore and before Save are executed. + // + // For now, just push the restore/save points outside of loops. + // FIXME: Refine the criteria to still find interesting cases + // for loops. + MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) { // Fix (A). if (!SaveDominatesRestore) { Save = MDT->findNearestCommonDominator(Save, Restore); @@ -288,35 +350,114 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB) { Restore = MPDT->findNearestCommonDominator(Restore, Save); // Fix (C). - if (Save && Restore && Save != Restore && - MLI->getLoopFor(Save) != MLI->getLoopFor(Restore)) { - if (MLI->getLoopDepth(Save) > MLI->getLoopDepth(Restore)) - // Push Save outside of this loop. + if (Save && Restore && + (MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) { + if (MLI->getLoopDepth(Save) > MLI->getLoopDepth(Restore)) { + // Push Save outside of this loop if immediate dominator is different + // from save block. If immediate dominator is not different, bail out. Save = FindIDom<>(*Save, Save->predecessors(), *MDT); - else + if (!Save) + break; + } else { + // If the loop does not exit, there is no point in looking + // for a post-dominator outside the loop. + SmallVector<MachineBasicBlock*, 4> ExitBlocks; + MLI->getLoopFor(Restore)->getExitingBlocks(ExitBlocks); // Push Restore outside of this loop. - Restore = FindIDom<>(*Restore, Restore->successors(), *MPDT); + // Look for the immediate post-dominator of the loop exits. + MachineBasicBlock *IPdom = Restore; + for (MachineBasicBlock *LoopExitBB: ExitBlocks) { + IPdom = FindIDom<>(*IPdom, LoopExitBB->successors(), *MPDT); + if (!IPdom) + break; + } + // If the immediate post-dominator is not in a less nested loop, + // then we are stuck in a program with an infinite loop. + // In that case, we will not find a safe point, hence, bail out. + if (IPdom && MLI->getLoopDepth(IPdom) < MLI->getLoopDepth(Restore)) + Restore = IPdom; + else { + Restore = nullptr; + break; + } + } + } + } +} + +/// Check whether the edge (\p SrcBB, \p DestBB) is a backedge according to MLI. +/// I.e., check if there exists a loop that contains SrcBB and where DestBB is the +/// loop header.
+static bool isProperBackedge(const MachineLoopInfo &MLI, + const MachineBasicBlock *SrcBB, + const MachineBasicBlock *DestBB) { + for (const MachineLoop *Loop = MLI.getLoopFor(SrcBB); Loop; + Loop = Loop->getParentLoop()) { + if (Loop->getHeader() == DestBB) + return true; + } + return false; +} + +/// Check if the CFG of \p MF is irreducible. +static bool isIrreducibleCFG(const MachineFunction &MF, + const MachineLoopInfo &MLI) { + const MachineBasicBlock *Entry = &*MF.begin(); + ReversePostOrderTraversal<const MachineBasicBlock *> RPOT(Entry); + BitVector VisitedBB(MF.getNumBlockIDs()); + for (const MachineBasicBlock *MBB : RPOT) { + VisitedBB.set(MBB->getNumber()); + for (const MachineBasicBlock *SuccBB : MBB->successors()) { + if (!VisitedBB.test(SuccBB->getNumber())) + continue; + // We already visited SuccBB, thus MBB->SuccBB must be a backedge. + // Check that the head matches what we have in the loop information. + // Otherwise, we have an irreducible graph. + if (!isProperBackedge(MLI, MBB, SuccBB)) + return true; } } + return false; } bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { - if (MF.empty()) + if (MF.empty() || !isShrinkWrapEnabled(MF)) return false; + DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); init(MF); + if (isIrreducibleCFG(MF, *MLI)) { + // If MF is irreducible, a block may be in a loop without + // MachineLoopInfo reporting it. I.e., we may use the + // post-dominance property in loops, which leads to incorrect + // results. Moreover, we may miss that the prologue and + // epilogue are not in the same loop, leading to unbalanced + // construction/deconstruction of the stack frame. + DEBUG(dbgs() << "Irreducible CFGs are not supported yet\n"); + return false; + } + + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + std::unique_ptr<RegScavenger> RS( + TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr); + for (MachineBasicBlock &MBB : MF) { DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' ' << MBB.getName() << '\n'); + if (MBB.isEHFuncletEntry()) { + DEBUG(dbgs() << "EH Funclets are not supported yet.\n"); + return false; + } + for (const MachineInstr &MI : MBB) { - if (!useOrDefCSROrFI(MI)) + if (!useOrDefCSROrFI(MI, RS.get())) continue; // Save (resp. restore) point must dominate (resp. post dominate) // MI. Look for the proper basic block for those. - updateSaveRestorePoints(MBB); + updateSaveRestorePoints(MBB, RS.get()); // If we are at a point where we cannot improve the placement of // save/restore instructions, just give up. if (!ArePointsInteresting()) { @@ -368,7 +509,7 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { break; NewBB = Restore; } - updateSaveRestorePoints(*NewBB); + updateSaveRestorePoints(*NewBB, RS.get()); } while (Save && Restore); if (!ArePointsInteresting()) { @@ -386,3 +527,30 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { ++NumCandidates; return false; } + +bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) { + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + + switch (EnableShrinkWrapOpt) { + case cl::BOU_UNSET: + return TFI->enableShrinkWrapping(MF) && + // Windows with CFI has some limitations that make it impossible + // to use shrink-wrapping. + !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && + // Sanitizers look at the value of the stack at the location + // of the crash.
Since a crash can happen anywhere, the + // frame must be lowered before anything else happens for the + // sanitizers to be able to get a correct stack frame. + !(MF.getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || + MF.getFunction()->hasFnAttribute(Attribute::SanitizeThread) || + MF.getFunction()->hasFnAttribute(Attribute::SanitizeMemory)); + // If EnableShrinkWrap is set, it takes precedence over whatever the + // target sets. The rationale is that we assume we want to test + // something related to shrink-wrapping. + case cl::BOU_TRUE: + return true; + case cl::BOU_FALSE: + return false; + } + llvm_unreachable("Invalid shrink-wrapping state"); +} diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp index d236e1f..e1f242a 100644 --- a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -50,7 +50,7 @@ class SjLjEHPrepare : public FunctionPass { Type *FunctionContextTy; Constant *RegisterFn; Constant *UnregisterFn; - Constant *BuiltinSetjmpFn; + Constant *BuiltinSetupDispatchFn; Constant *FrameAddrFn; Constant *StackAddrFn; Constant *StackRestoreFn; @@ -112,7 +112,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) { FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); - BuiltinSetjmpFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); + BuiltinSetupDispatchFn = + Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite); FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); @@ -178,8 +179,8 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, // values and replace the LPI with that aggregate. Type *LPadType = LPI->getType(); Value *LPadVal = UndefValue::get(LPadType); - IRBuilder<> Builder( - std::next(BasicBlock::iterator(cast<Instruction>(SelVal)))); + auto *SelI = cast<Instruction>(SelVal); + IRBuilder<> Builder(SelI->getParent(), std::next(SelI->getIterator())); LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val"); LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val"); @@ -190,7 +191,7 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, /// it with all of the data that we know at this point. Value *SjLjEHPrepare::setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads) { - BasicBlock *EntryBB = F.begin(); + BasicBlock *EntryBB = &F.front(); // Create an alloca for the incoming jump buffer ptr and the new jump buffer // that needs to be restored on all exits from the function. This is an alloca @@ -198,12 +199,13 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F, auto &DL = F.getParent()->getDataLayout(); unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy); FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", - EntryBB->begin()); + &EntryBB->front()); // Fill in the function context structure. for (unsigned I = 0, E = LPads.size(); I != E; ++I) { LandingPadInst *LPI = LPads[I]; - IRBuilder<> Builder(LPI->getParent()->getFirstInsertionPt()); + IRBuilder<> Builder(LPI->getParent(), + LPI->getParent()->getFirstInsertionPt()); // Reference the __data field.
Value *FCData = @@ -250,21 +252,20 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) { while (isa<AllocaInst>(AfterAllocaInsPt) && isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize())) ++AfterAllocaInsPt; + assert(AfterAllocaInsPt != F.front().end()); - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; - ++AI) { - Type *Ty = AI->getType(); + for (auto &AI : F.args()) { + Type *Ty = AI.getType(); // Use 'select i8 true, %arg, undef' to simulate a 'no-op' instruction. Value *TrueValue = ConstantInt::getTrue(F.getContext()); Value *UndefValue = UndefValue::get(Ty); - Instruction *SI = SelectInst::Create(TrueValue, AI, UndefValue, - AI->getName() + ".tmp", - AfterAllocaInsPt); - AI->replaceAllUsesWith(SI); + Instruction *SI = SelectInst::Create( + TrueValue, &AI, UndefValue, AI.getName() + ".tmp", &*AfterAllocaInsPt); + AI.replaceAllUsesWith(SI); // Reset the operand, because it was clobbered by the RAUW above. - SI->setOperand(1, AI); + SI->setOperand(1, &AI); } } @@ -279,7 +280,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F, // Ignore obvious cases we don't have to handle. In particular, most // instructions either have no uses or only have a single use inside the // current block. Ignore them quickly. - Instruction *Inst = II; + Instruction *Inst = &*II; if (Inst->use_empty()) continue; if (Inst->hasOneUse() && @@ -360,7 +361,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F, DemotePHIToStack(PN); // Move the landingpad instruction back to the top of the landing pad block. - LPI->moveBefore(UnwindBlock->begin()); + LPI->moveBefore(&UnwindBlock->front()); } } @@ -400,7 +401,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { Value *FuncCtx = setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end())); - BasicBlock *EntryBB = F.begin(); + BasicBlock *EntryBB = &F.front(); IRBuilder<> Builder(EntryBB->getTerminator()); // Get a reference to the jump buffer. @@ -421,9 +422,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { Val = Builder.CreateCall(StackAddrFn, {}, "sp"); Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true); - // Call the setjmp instrinsic. It fills in the rest of the jmpbuf. - Value *SetjmpArg = Builder.CreateBitCast(JBufPtr, Builder.getInt8PtrTy()); - Builder.CreateCall(BuiltinSetjmpFn, SetjmpArg); + // Call the setup_dispatch intrinsic. It fills in the rest of the jmpbuf. + Builder.CreateCall(BuiltinSetupDispatchFn, {}); // Store a pointer to the function context so that the back-end will know // where to look for it. @@ -475,7 +475,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { continue; } Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp"); - StackAddr->insertAfter(I); + StackAddr->insertAfter(&*I); Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true); StoreStackAddr->insertAfter(StackAddr); } diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp index 025ae70..c9d23f6 100644 --- a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp +++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp @@ -172,8 +172,8 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, // optionally includes an additional position prior to MBB->begin(), indicated // by the includeStart flag. This is done so that we can iterate MIs in a MBB // in parallel with SlotIndexes, but there should be a better way to do this.
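// The 'select i8 true, %arg, undef' no-op trick used in
// SjLjEHPrepare::lowerIncomingArguments above gives each incoming argument
// an instruction-level definition that later demotion to a stack slot can
// work with. A minimal stand-alone sketch of the idiom; makeArgNoOpCopy and
// the insertion point are illustrative names, not part of the patch:
static llvm::Instruction *makeArgNoOpCopy(llvm::Argument &A,
                                          llvm::Instruction *InsertBefore) {
  llvm::Value *TrueVal = llvm::ConstantInt::getTrue(A.getContext());
  llvm::Value *Undef = llvm::UndefValue::get(A.getType());
  // 'select true, %arg, undef' always evaluates to %arg, but it is a real
  // Instruction, so replaceAllUsesWith() can treat it as the argument's
  // definition (remember to reset operand 1 after the RAUW, as the hunk
  // above does).
  return llvm::SelectInst::Create(TrueVal, &A, Undef, A.getName() + ".tmp",
                                  InsertBefore);
}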
- IndexList::iterator ListB = startIdx.listEntry(); - IndexList::iterator ListI = endIdx.listEntry(); + IndexList::iterator ListB = startIdx.listEntry()->getIterator(); + IndexList::iterator ListI = endIdx.listEntry()->getIterator(); MachineBasicBlock::iterator MBBI = End; bool pastStart = false; while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) { diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp index 97a5424..d30cfc2 100644 --- a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp +++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp @@ -36,7 +36,6 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -188,9 +187,9 @@ bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) { BlockFrequencies.resize(mf.getNumBlockIDs()); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); setThreshold(MBFI->getEntryFreq()); - for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) { - unsigned Num = I->getNumber(); - BlockFrequencies[Num] = MBFI->getBlockFreq(I); + for (auto &I : mf) { + unsigned Num = I.getNumber(); + BlockFrequencies[Num] = MBFI->getBlockFreq(&I); } // We never change the function. diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp index dab1dfe..51dddab 100644 --- a/contrib/llvm/lib/CodeGen/SplitKit.cpp +++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp @@ -56,6 +56,7 @@ void SplitAnalysis::clear() { SlotIndex SplitAnalysis::computeLastSplitPoint(unsigned Num) { const MachineBasicBlock *MBB = MF.getBlockNumbered(Num); + // FIXME: Handle multiple EH pad successors. const MachineBasicBlock *LPad = MBB->getLandingPadSuccessor(); std::pair<SlotIndex, SlotIndex> &LSP = LastSplitPoint[Num]; SlotIndex MBBEnd = LIS.getMBBEndIdx(MBB); @@ -176,10 +177,11 @@ bool SplitAnalysis::calcLiveBlockInfo() { UseE = UseSlots.end(); // Loop over basic blocks where CurLI is live. - MachineFunction::iterator MFI = LIS.getMBBFromIndex(LVI->start); + MachineFunction::iterator MFI = + LIS.getMBBFromIndex(LVI->start)->getIterator(); for (;;) { BlockInfo BI; - BI.MBB = MFI; + BI.MBB = &*MFI; SlotIndex Start, Stop; std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB); @@ -259,7 +261,7 @@ bool SplitAnalysis::calcLiveBlockInfo() { if (LVI->start < Stop) ++MFI; else - MFI = LIS.getMBBFromIndex(LVI->start); + MFI = LIS.getMBBFromIndex(LVI->start)->getIterator(); } assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count"); @@ -275,8 +277,9 @@ unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const { unsigned Count = 0; // Loop over basic blocks where li is live. - MachineFunction::const_iterator MFI = LIS.getMBBFromIndex(LVI->start); - SlotIndex Stop = LIS.getMBBEndIdx(MFI); + MachineFunction::const_iterator MFI = + LIS.getMBBFromIndex(LVI->start)->getIterator(); + SlotIndex Stop = LIS.getMBBEndIdx(&*MFI); for (;;) { ++Count; LVI = li->advanceTo(LVI, Stop); @@ -284,7 +287,7 @@ unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const { return Count; do { ++MFI; - Stop = LIS.getMBBEndIdx(MFI); + Stop = LIS.getMBBEndIdx(&*MFI); } while (Stop <= LVI->start); } } @@ -864,9 +867,9 @@ bool SplitEditor::transferValues() { // This value has multiple defs in RegIdx, but it wasn't rematerialized, // so the live range is accurate. Add live-in blocks in [Start;End) to the // LiveInBlocks. 
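// The walk that the following SplitKit hunk performs, condensed into one
// helper: start at the block containing Start and advance block by block
// until the block range passes End. forEachBlockInRange is a hypothetical
// name and the usual includes are assumed; the LIS calls are the same ones
// the patch itself uses.
static void forEachBlockInRange(
    llvm::LiveIntervals &LIS, llvm::SlotIndex Start, llvm::SlotIndex End,
    llvm::function_ref<void(llvm::MachineBasicBlock &, llvm::SlotIndex,
                            llvm::SlotIndex)> Visit) {
  llvm::MachineFunction::iterator MBB =
      LIS.getMBBFromIndex(Start)->getIterator();
  llvm::SlotIndex BlockStart, BlockEnd;
  std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(&*MBB);
  for (;;) {
    // Hand the caller the part of [Start;End) that lies in this block.
    Visit(*MBB, std::max(Start, BlockStart), std::min(End, BlockEnd));
    if (End <= BlockEnd)
      return; // The range ends in this block.
    ++MBB;
    BlockStart = BlockEnd;
    BlockEnd = LIS.getMBBEndIdx(&*MBB);
  }
}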
- MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start); + MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator(); SlotIndex BlockStart, BlockEnd; - std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(MBB); + std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(&*MBB); // The first block may be live-in, or it may have its own def. if (Start != BlockStart) { @@ -875,7 +878,7 @@ bool SplitEditor::transferValues() { DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber()); // MBB has its own def. Is it also live-out? if (BlockEnd <= End) - LRC.setLiveOutValue(MBB, VNI); + LRC.setLiveOutValue(&*MBB, VNI); // Skip to the next block for live-in. ++MBB; @@ -886,23 +889,23 @@ bool SplitEditor::transferValues() { assert(Start <= BlockStart && "Expected live-in block"); while (BlockStart < End) { DEBUG(dbgs() << ">BB#" << MBB->getNumber()); - BlockEnd = LIS.getMBBEndIdx(MBB); + BlockEnd = LIS.getMBBEndIdx(&*MBB); if (BlockStart == ParentVNI->def) { // This block has the def of a parent PHI, so it isn't live-in. assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?"); VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End)); assert(VNI && "Missing def for complex mapped parent PHI"); if (End >= BlockEnd) - LRC.setLiveOutValue(MBB, VNI); // Live-out as well. + LRC.setLiveOutValue(&*MBB, VNI); // Live-out as well. } else { // This block needs a live-in value. The last block covered may not // be live-out. if (End < BlockEnd) - LRC.addLiveInBlock(LR, MDT[MBB], End); + LRC.addLiveInBlock(LR, MDT[&*MBB], End); else { // Live-through, and we don't know the value. - LRC.addLiveInBlock(LR, MDT[MBB]); - LRC.setLiveOutValue(MBB, nullptr); + LRC.addLiveInBlock(LR, MDT[&*MBB]); + LRC.setLiveOutValue(&*MBB, nullptr); } } BlockStart = BlockEnd; @@ -1081,16 +1084,14 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { ConnectedVNInfoEqClasses ConEQ(LIS); for (unsigned i = 0, e = Edit->size(); i != e; ++i) { // Don't use iterators, they are invalidated by create() below. - LiveInterval *li = &LIS.getInterval(Edit->get(i)); - unsigned NumComp = ConEQ.Classify(li); - if (NumComp <= 1) - continue; - DEBUG(dbgs() << " " << NumComp << " components: " << *li << '\n'); - SmallVector<LiveInterval*, 8> dups; - dups.push_back(li); - for (unsigned j = 1; j != NumComp; ++j) - dups.push_back(&Edit->createEmptyInterval()); - ConEQ.Distribute(&dups[0], MRI); + unsigned VReg = Edit->get(i); + LiveInterval &LI = LIS.getInterval(VReg); + SmallVector<LiveInterval*, 8> SplitLIs; + LIS.splitSeparateComponents(LI, SplitLIs); + unsigned Original = VRM.getOriginal(VReg); + for (LiveInterval *SplitLI : SplitLIs) + VRM.setIsSplitFromReg(SplitLI->reg, Original); + // The new intervals all map back to i. if (LRMap) LRMap->resize(Edit->size(), i); diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp index 3541b33..7b52038 100644 --- a/contrib/llvm/lib/CodeGen/StackColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp @@ -43,6 +43,7 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -570,6 +571,14 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { } } + // Update the location of C++ catch objects for the MSVC personality routine. 
+ if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo()) + for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap) + for (WinEHHandlerType &H : TBME.HandlerArray) + if (H.CatchObj.FrameIndex != INT_MAX && + SlotRemap.count(H.CatchObj.FrameIndex)) + H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex]; + DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n"); DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n"); DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n"); diff --git a/contrib/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm/lib/CodeGen/StackMaps.cpp index 116eef6..b3cd8b3 100644 --- a/contrib/llvm/lib/CodeGen/StackMaps.cpp +++ b/contrib/llvm/lib/CodeGen/StackMaps.cpp @@ -94,7 +94,9 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, default: llvm_unreachable("Unrecognized operand type."); case StackMaps::DirectMemRefOp: { - unsigned Size = AP.TM.getDataLayout()->getPointerSizeInBits(); + auto &DL = AP.MF->getDataLayout(); + + unsigned Size = DL.getPointerSizeInBits(); assert((Size % 8) == 0 && "Need pointer size in bytes."); Size /= 8; unsigned Reg = (++MOI)->getReg(); diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp index bcea37a..db3fef5 100644 --- a/contrib/llvm/lib/CodeGen/StackProtector.cpp +++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp @@ -373,7 +373,7 @@ bool StackProtector::InsertStackProtectors() { Value *StackGuardVar = nullptr; // The stack guard variable. for (Function::iterator I = F->begin(), E = F->end(); I != E;) { - BasicBlock *BB = I++; + BasicBlock *BB = &*I++; ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()); if (!RI) continue; @@ -433,7 +433,7 @@ bool StackProtector::InsertStackProtectors() { BasicBlock *FailBB = CreateFailBB(); // Split the basic block before the return instruction. - BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return"); + BasicBlock *NewBB = BB->splitBasicBlock(RI->getIterator(), "SP_return"); // Update the dominator tree if we need to. if (DT && DT->isReachableFromEntry(BB)) { @@ -453,22 +453,20 @@ bool StackProtector::InsertStackProtectors() { LoadInst *LI1 = B.CreateLoad(StackGuardVar); LoadInst *LI2 = B.CreateLoad(AI); Value *Cmp = B.CreateICmpEQ(LI1, LI2); - unsigned SuccessWeight = - BranchProbabilityInfo::getBranchWeightStackProtector(true); - unsigned FailureWeight = - BranchProbabilityInfo::getBranchWeightStackProtector(false); + auto SuccessProb = + BranchProbabilityInfo::getBranchProbStackProtector(true); + auto FailureProb = + BranchProbabilityInfo::getBranchProbStackProtector(false); MDNode *Weights = MDBuilder(F->getContext()) - .createBranchWeights(SuccessWeight, FailureWeight); + .createBranchWeights(SuccessProb.getNumerator(), + FailureProb.getNumerator()); B.CreateCondBr(Cmp, NewBB, FailBB, Weights); } } // Return if we didn't modify any basic blocks. i.e., there are no return // statements in the function. 
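// Condensed, the guard check built in the StackProtector hunk above loads
// the canonical guard and the function's guard slot, compares them, and
// branches with weights that mark the failure path as very unlikely. A
// stand-alone sketch; emitGuardCheck and the 1<<20 weight are illustrative
// (the real pass takes the probability from BranchProbabilityInfo):
static void emitGuardCheck(llvm::Function &F, llvm::IRBuilder<> &B,
                           llvm::Value *StackGuardVar, llvm::AllocaInst *AI,
                           llvm::BasicBlock *SPReturn,
                           llvm::BasicBlock *FailBB) {
  llvm::LoadInst *Guard = B.CreateLoad(StackGuardVar);
  llvm::LoadInst *Slot = B.CreateLoad(AI);
  llvm::Value *Cmp = B.CreateICmpEQ(Guard, Slot);
  llvm::MDNode *Weights =
      llvm::MDBuilder(F.getContext()).createBranchWeights(1u << 20, 1);
  B.CreateCondBr(Cmp, SPReturn, FailBB, Weights);
}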
- if (!HasPrologue) - return false; - - return true; + return HasPrologue; } /// CreateFailBB - Create a basic block to jump to when the stack protector diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp index a5a175f..51f4d0e 100644 --- a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -318,7 +318,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { if (NewFI == -1 || (NewFI == (int)SS)) continue; - const PseudoSourceValue *NewSV = PseudoSourceValue::getFixedStack(NewFI); + const PseudoSourceValue *NewSV = MF.getPSVManager().getFixedStack(NewFI); SmallVectorImpl<MachineMemOperand *> &RefMMOs = SSRefs[SS]; for (unsigned i = 0, e = RefMMOs.size(); i != e; ++i) RefMMOs[i]->setValue(NewSV); diff --git a/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp b/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp index 95dfd75..3f60e18 100644 --- a/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp +++ b/contrib/llvm/lib/CodeGen/StatepointExampleGC.cpp @@ -34,9 +34,9 @@ public: UsesMetadata = false; CustomRoots = false; } - Optional<bool> isGCManagedPointer(const Value *V) const override { + Optional<bool> isGCManagedPointer(const Type *Ty) const override { // Method is only valid on pointer typed values. - PointerType *PT = cast<PointerType>(V->getType()); + const PointerType *PT = cast<PointerType>(Ty); // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. Note that addrspace(1) is used diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp index 237460c..d2fbf53 100644 --- a/contrib/llvm/lib/CodeGen/TailDuplication.cpp +++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp @@ -59,7 +59,7 @@ TailDupLimit("tail-dup-limit", cl::init(~0U), cl::Hidden); typedef std::vector<std::pair<MachineBasicBlock*,unsigned> > AvailableValsTy; namespace { - /// TailDuplicatePass - Perform tail duplication. + /// Perform tail duplication. class TailDuplicatePass : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -69,11 +69,11 @@ namespace { std::unique_ptr<RegScavenger> RS; bool PreRegAlloc; - // SSAUpdateVRs - A list of virtual registers for which to update SSA form. + // A list of virtual registers for which to update SSA form. SmallVector<unsigned, 16> SSAUpdateVRs; - // SSAUpdateVals - For each virtual register in SSAUpdateVals keep a list of - // source virtual registers. + // For each virtual register in SSAUpdateVals keep a list of source virtual + // registers. DenseMap<unsigned, AvailableValsTy> SSAUpdateVals; public: @@ -161,7 +161,7 @@ void TailDuplicatePass::getAnalysisUsage(AnalysisUsage &AU) const { static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) { for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; + MachineBasicBlock *MBB = &*I; SmallSetVector<MachineBasicBlock*, 8> Preds(MBB->pred_begin(), MBB->pred_end()); MachineBasicBlock::iterator MI = MBB->begin(); @@ -207,7 +207,7 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) { } } -/// TailDuplicateAndUpdate - Tail duplicate the block and cleanup. +/// Tail duplicate the block and cleanup. 
bool TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB, bool IsSimple, @@ -310,9 +310,9 @@ TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB, return true; } -/// TailDuplicateBlocks - Look for small blocks that are unconditionally -/// branched to and do not fall through. Tail-duplicate their instructions -/// into their predecessors to eliminate (dynamic) branches. +/// Look for small blocks that are unconditionally branched to and do not fall +/// through. Tail-duplicate their instructions into their predecessors to +/// eliminate (dynamic) branches. bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { bool MadeChange = false; @@ -322,7 +322,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { } for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) { - MachineBasicBlock *MBB = I++; + MachineBasicBlock *MBB = &*I++; if (NumTails == TailDupLimit) break; @@ -375,8 +375,7 @@ static void getRegsUsedByPHIs(const MachineBasicBlock &BB, } } -/// AddSSAUpdateEntry - Add a definition and source virtual registers pair for -/// SSA update. +/// Add a definition and source virtual registers pair for SSA update. void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg, MachineBasicBlock *BB) { DenseMap<unsigned, AvailableValsTy>::iterator LI= SSAUpdateVals.find(OrigReg); @@ -390,9 +389,8 @@ void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg, } } -/// ProcessPHI - Process PHI node in TailBB by turning it into a copy in PredBB. -/// Remember the source register that's contributed by PredBB and update SSA -/// update map. +/// Process PHI node in TailBB by turning it into a copy in PredBB. Remember the +/// source register that's contributed by PredBB and update SSA update map. void TailDuplicatePass::ProcessPHI( MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB, DenseMap<unsigned, unsigned> &LocalVRMap, @@ -422,7 +420,7 @@ void TailDuplicatePass::ProcessPHI( MI->eraseFromParent(); } -/// DuplicateInstruction - Duplicate a TailBB instruction to PredBB and update +/// Duplicate a TailBB instruction to PredBB and update /// the source operands due to earlier PHI translation. void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, MachineBasicBlock *TailBB, @@ -459,9 +457,9 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI, PredBB->insert(PredBB->instr_end(), NewMI); } -/// UpdateSuccessorsPHIs - After FromBB is tail duplicated into its predecessor -/// blocks, the successors have gained new predecessors. Update the PHI -/// instructions in them accordingly. +/// After FromBB is tail duplicated into its predecessor blocks, the successors +/// have gained new predecessors. Update the PHI instructions in them +/// accordingly. void TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead, SmallVectorImpl<MachineBasicBlock *> &TDBBs, @@ -545,7 +543,7 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead, } } -/// shouldTailDuplicate - Determine if it is profitable to duplicate this block. +/// Determine if it is profitable to duplicate this block. bool TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, @@ -563,6 +561,7 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, // compensate for the duplication. unsigned MaxDuplicateCount; if (TailDuplicateSize.getNumOccurrences() == 0 && + // FIXME: Use Function::optForSize(). 
MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) MaxDuplicateCount = 1; else @@ -584,30 +583,51 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, // Check the instructions in the block to determine whether tail-duplication // is invalid or unlikely to be profitable. unsigned InstrCount = 0; - for (MachineBasicBlock::iterator I = TailBB.begin(); I != TailBB.end(); ++I) { + for (MachineInstr &MI : TailBB) { // Non-duplicable things shouldn't be tail-duplicated. - if (I->isNotDuplicable()) + if (MI.isNotDuplicable()) return false; // Do not duplicate 'return' instructions if this is a pre-regalloc run. // A return may expand into a lot more instructions (e.g. reload of callee // saved registers) after PEI. - if (PreRegAlloc && I->isReturn()) + if (PreRegAlloc && MI.isReturn()) return false; // Avoid duplicating calls before register allocation. Calls presents a // barrier to register allocation so duplicating them may end up increasing // spills. - if (PreRegAlloc && I->isCall()) + if (PreRegAlloc && MI.isCall()) return false; - if (!I->isPHI() && !I->isDebugValue()) + if (!MI.isPHI() && !MI.isDebugValue()) InstrCount += 1; if (InstrCount > MaxDuplicateCount) return false; } + // Check if any of the successors of TailBB has a PHI node in which the + // value corresponding to TailBB uses a subregister. + // If a phi node uses a register paired with a subregister, the actual + // "value type" of the phi may differ from the type of the register without + // any subregisters. Due to a bug, tail duplication may add a new operand + // without a necessary subregister, producing invalid code. This is + // demonstrated by test/CodeGen/Hexagon/tail-dup-subreg-abort.ll. + // Disable tail duplication for this case for now, until the problem is + // fixed. + for (auto SB : TailBB.successors()) { + for (auto &I : *SB) { + if (!I.isPHI()) + break; + unsigned Idx = getPHISrcRegOpIdx(&I, &TailBB); + assert(Idx != 0); + MachineOperand &PU = I.getOperand(Idx); + if (PU.getSubReg() != 0) + return false; + } + } + if (HasIndirectbr && PreRegAlloc) return true; @@ -620,7 +640,7 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF, return canCompletelyDuplicateBB(TailBB); } -/// isSimpleBB - True if this BB has only one unconditional jump. +/// True if this BB has only one unconditional jump.
bool TailDuplicatePass::isSimpleBB(MachineBasicBlock *TailBB) { if (TailBB->succ_size() != 1) @@ -636,22 +656,16 @@ TailDuplicatePass::isSimpleBB(MachineBasicBlock *TailBB) { static bool bothUsedInPHI(const MachineBasicBlock &A, SmallPtrSet<MachineBasicBlock*, 8> SuccsB) { - for (MachineBasicBlock::const_succ_iterator SI = A.succ_begin(), - SE = A.succ_end(); SI != SE; ++SI) { - MachineBasicBlock *BB = *SI; + for (MachineBasicBlock *BB : A.successors()) if (SuccsB.count(BB) && !BB->empty() && BB->begin()->isPHI()) return true; - } return false; } bool TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) { - for (MachineBasicBlock::pred_iterator PI = BB.pred_begin(), - PE = BB.pred_end(); PI != PE; ++PI) { - MachineBasicBlock *PredBB = *PI; - + for (MachineBasicBlock *PredBB : BB.predecessors()) { if (PredBB->succ_size() > 1) return false; @@ -680,7 +694,7 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, PE = Preds.end(); PI != PE; ++PI) { MachineBasicBlock *PredBB = *PI; - if (PredBB->getLandingPadSuccessor()) + if (PredBB->hasEHPadSuccessor()) continue; if (bothUsedInPHI(*PredBB, Succs)) @@ -696,7 +710,7 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, << "From simple Succ: " << *TailBB); MachineBasicBlock *NewTarget = *TailBB->succ_begin(); - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(PredBB)); + MachineBasicBlock *NextBB = &*std::next(PredBB->getIterator()); // Make PredFBB explicit. if (PredCond.empty()) @@ -731,19 +745,19 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB, if (PredTBB) TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc()); - uint32_t Weight = MBPI->getEdgeWeight(PredBB, TailBB); - PredBB->removeSuccessor(TailBB); - unsigned NumSuccessors = PredBB->succ_size(); - assert(NumSuccessors <= 1); - if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget) - PredBB->addSuccessor(NewTarget, Weight); + if (!PredBB->isSuccessor(NewTarget)) + PredBB->replaceSuccessor(TailBB, NewTarget); + else { + PredBB->removeSuccessor(TailBB, true); + assert(PredBB->succ_size() <= 1); + } TDBBs.push_back(PredBB); } return Changed; } -/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each +/// If it is profitable, duplicate TailBB's contents in each /// of its predecessors. bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, @@ -798,13 +812,12 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, RS->enterBasicBlock(PredBB); if (!PredBB->empty()) RS->forward(std::prev(PredBB->end())); - for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(), - E = TailBB->livein_end(); I != E; ++I) { - if (!RS->isRegUsed(*I, false)) + for (const auto &LI : TailBB->liveins()) { + if (!RS->isRegUsed(LI.PhysReg, false)) // If a register is previously livein to the tail but it's not live // at the end of predecessor BB, then it should be added to its // livein list. 
- PredBB->addLiveIn(*I); + PredBB->addLiveIn(LI); } } @@ -845,7 +858,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) - PredBB->addSuccessor(*I, MBPI->getEdgeWeight(TailBB, I)); + PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I)); Changed = true; ++NumTailDups; @@ -854,7 +867,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this // block into the prior block. - MachineBasicBlock *PrevBB = std::prev(MachineFunction::iterator(TailBB)); + MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector<MachineOperand, 4> PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by @@ -960,8 +973,8 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, return Changed; } -/// RemoveDeadBlock - Remove the specified dead machine basic block from the -/// function, updating the CFG. +/// Remove the specified dead machine basic block from the function, updating +/// the CFG. void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) { assert(MBB->pred_empty() && "MBB must be dead!"); DEBUG(dbgs() << "\nRemoving MBB: " << *MBB); diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index f3cccd8..679ade1 100644 --- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -32,25 +33,22 @@ bool TargetFrameLowering::noFramePointerElim(const MachineFunction &MF) const { return Attr.getValueAsString() == "true"; } -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. This is the default implementation -/// which is overridden for some targets. -int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getObjectOffset(FI) + MFI->getStackSize() - - getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); -} - +/// Returns the displacement from the frame register to the stack +/// frame of the specified index, along with the frame register used +/// (in output arg FrameReg). This is the default implementation which +/// is overridden for some targets. int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); // By default, assume all frame indices are referenced via whatever // getFrameRegister() says. The target can override this if it's doing // something different. 
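// The default displacement computed in the code that follows is simply
//   ObjectOffset + StackSize - OffsetOfLocalArea + OffsetAdjustment.
// A worked example with made-up numbers: an object at ObjectOffset -8 in a
// 64-byte frame, with a zero local-area offset and no adjustment, resolves
// to -8 + 64 - 0 + 0 = 56 bytes from the returned frame register.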
FrameReg = RI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); + + return MFI->getObjectOffset(FI) + MFI->getStackSize() - + getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); } bool TargetFrameLowering::needsFrameIndexResolution( @@ -84,3 +82,13 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(Reg); } } + +unsigned TargetFrameLowering::getStackAlignmentSkew( + const MachineFunction &MF) const { + // When HHVM function is called, the stack is skewed as the return address + // is removed from the stack before we enter the function. + if (LLVM_UNLIKELY(MF.getFunction()->getCallingConv() == CallingConv::HHVM)) + return MF.getTarget().getPointerSize(); + + return 0; +} diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp index 97ca025..6eaf991 100644 --- a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -118,23 +118,24 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, MBB->addSuccessor(NewDest); } -// commuteInstruction - The default implementation of this method just exchanges -// the two operands returned by findCommutedOpIndices. -MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { +MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned Idx1, + unsigned Idx2) const { const MCInstrDesc &MCID = MI->getDesc(); bool HasDef = MCID.getNumDefs(); if (HasDef && !MI->getOperand(0).isReg()) // No idea how to commute this instruction. Target should implement its own. return nullptr; - unsigned Idx1, Idx2; - if (!findCommutedOpIndices(MI, Idx1, Idx2)) { - assert(MI->isCommutable() && "Precondition violation: MI must be commutable."); - return nullptr; - } + unsigned CommutableOpIdx1 = Idx1; (void)CommutableOpIdx1; + unsigned CommutableOpIdx2 = Idx2; (void)CommutableOpIdx2; + assert(findCommutedOpIndices(MI, CommutableOpIdx1, CommutableOpIdx2) && + CommutableOpIdx1 == Idx1 && CommutableOpIdx2 == Idx2 && + "TargetInstrInfo::CommuteInstructionImpl(): not commutable operands."); assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() && "This only knows how to commute register operands so far"); + unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(Idx1).getReg(); unsigned Reg2 = MI->getOperand(Idx2).getReg(); @@ -184,9 +185,53 @@ MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, return MI; } -/// findCommutedOpIndices - If specified MI is commutable, return the two -/// operand indices that would swap value. Return true if the instruction -/// is not in a form which this routine understands. +MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + // If OpIdx1 or OpIdx2 is not specified, then this method is free to choose + // any commutable operand, which is done in findCommutedOpIndices() method + // called below. 
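// Caller's view of the sentinel (an illustrative fragment, not from the
// patch): passing CommuteAnyOperandIndex lets the target pick the operand
// pair, while explicit indices force a specific swap. TII is assumed to be
// a const TargetInstrInfo * and MI a MachineInstr * known to be commutable.
MachineInstr *Commuted = TII->commuteInstruction(
    MI, /*NewMI=*/false, TargetInstrInfo::CommuteAnyOperandIndex,
    TargetInstrInfo::CommuteAnyOperandIndex);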
+ if ((OpIdx1 == CommuteAnyOperandIndex || OpIdx2 == CommuteAnyOperandIndex) && + !findCommutedOpIndices(MI, OpIdx1, OpIdx2)) { + assert(MI->isCommutable() && + "Precondition violation: MI must be commutable."); + return nullptr; + } + return commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); +} + +bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1, + unsigned &ResultIdx2, + unsigned CommutableOpIdx1, + unsigned CommutableOpIdx2) { + if (ResultIdx1 == CommuteAnyOperandIndex && + ResultIdx2 == CommuteAnyOperandIndex) { + ResultIdx1 = CommutableOpIdx1; + ResultIdx2 = CommutableOpIdx2; + } else if (ResultIdx1 == CommuteAnyOperandIndex) { + if (ResultIdx2 == CommutableOpIdx1) + ResultIdx1 = CommutableOpIdx2; + else if (ResultIdx2 == CommutableOpIdx2) + ResultIdx1 = CommutableOpIdx1; + else + return false; + } else if (ResultIdx2 == CommuteAnyOperandIndex) { + if (ResultIdx1 == CommutableOpIdx1) + ResultIdx2 = CommutableOpIdx2; + else if (ResultIdx1 == CommutableOpIdx2) + ResultIdx2 = CommutableOpIdx1; + else + return false; + } else + // Check that the result operand indices match the given commutable + // operand indices. + return (ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2) || + (ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1); + + return true; +} + bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { @@ -196,10 +241,15 @@ bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI, const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; + // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this // is not true, then the target must implement this. - SrcOpIdx1 = MCID.getNumDefs(); - SrcOpIdx2 = SrcOpIdx1 + 1; + unsigned CommutableOpIdx1 = MCID.getNumDefs(); + unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1; + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + if (!MI->getOperand(SrcOpIdx1).isReg() || !MI->getOperand(SrcOpIdx2).isReg()) // No idea. 
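// A sketch of how a target override is expected to use the helper above
// (MyTargetInstrInfo is hypothetical): report the opcode's actually
// commutable operand pair and let fixCommutedOpIndices() reconcile it with
// whatever the caller requested.
bool MyTargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                              unsigned &SrcOpIdx1,
                                              unsigned &SrcOpIdx2) const {
  // Suppose operands 1 and 2 are the commutable sources of this opcode;
  // fixCommutedOpIndices() returns false if the caller's request cannot be
  // mapped onto that pair.
  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}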
@@ -207,7 +257,6 @@ bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI, return true; } - bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { if (!MI->isTerminator()) return false; @@ -315,7 +364,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, assert(RC->getSize() >= (Offset + Size) && "bad subregister range"); - if (!MF.getTarget().getDataLayout()->isLittleEndian()) { + if (!MF.getDataLayout().isLittleEndian()) { Offset = RC->getSize() - (Offset + Size); } return true; @@ -384,11 +433,6 @@ void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { llvm_unreachable("Not a MachO target"); } -bool TargetInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - return MI->isCopy() && Ops.size() == 1 && canFoldCopy(MI, Ops[0]); -} - static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, int FrameIndex, const TargetInstrInfo &TII) { @@ -489,10 +533,9 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, "Folded a use to a non-load!"); const MachineFrameInfo &MFI = *MF.getFrameInfo(); assert(MFI.getObjectOffset(FI) != -1); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - Flags, MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); NewMI->addMemOperand(MF, MMO); return NewMI; @@ -517,6 +560,217 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, return --Pos; } +bool TargetInstrInfo::hasReassociableOperands( + const MachineInstr &Inst, const MachineBasicBlock *MBB) const { + const MachineOperand &Op1 = Inst.getOperand(1); + const MachineOperand &Op2 = Inst.getOperand(2); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + // We need virtual register definitions for the operands that we will + // reassociate. + MachineInstr *MI1 = nullptr; + MachineInstr *MI2 = nullptr; + if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) + MI1 = MRI.getUniqueVRegDef(Op1.getReg()); + if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) + MI2 = MRI.getUniqueVRegDef(Op2.getReg()); + + // And they need to be in the trace (otherwise, they won't have a depth). + return MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB; +} + +bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst, + bool &Commuted) const { + const MachineBasicBlock *MBB = Inst.getParent(); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); + MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); + unsigned AssocOpcode = Inst.getOpcode(); + + // If only one operand has the same opcode and it's the second source operand, + // the operands must be commuted. + Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; + if (Commuted) + std::swap(MI1, MI2); + + // 1. The previous instruction must be the same type as Inst. + // 2. The previous instruction must have virtual register definitions for its + // operands in the same basic block as Inst. + // 3. The previous instruction's result must only be used by Inst. 
+ return MI1->getOpcode() == AssocOpcode && + hasReassociableOperands(*MI1, MBB) && + MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()); +} + +// 1. The operation must be associative and commutative. +// 2. The instruction must have virtual register definitions for its +// operands in the same basic block. +// 3. The instruction must have a reassociable sibling. +bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst, + bool &Commuted) const { + return isAssociativeAndCommutative(Inst) && + hasReassociableOperands(Inst, Inst.getParent()) && + hasReassociableSibling(Inst, Commuted); +} + +// The concept of the reassociation pass is that these operations can benefit +// from this kind of transformation: +// +// A = ? op ? +// B = A op X (Prev) +// C = B op Y (Root) +// --> +// A = ? op ? +// B = X op Y +// C = A op B +// +// breaking the dependency between A and B, allowing them to be executed in +// parallel (or back-to-back in a pipeline) instead of depending on each other. + +// FIXME: This has the potential to be expensive (compile time) while not +// improving the code at all. Some ways to limit the overhead: +// 1. Track successful transforms; bail out if hit rate gets too low. +// 2. Only enable at -O3 or some other non-default optimization level. +// 3. Pre-screen pattern candidates here: if an operand of the previous +// instruction is known to not increase the critical path, then don't match +// that pattern. +bool TargetInstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + + bool Commute; + if (isReassociationCandidate(Root, Commute)) { + // We found a sequence of instructions that may be suitable for a + // reassociation of operands to increase ILP. Specify each commutation + // possibility for the Prev instruction in the sequence and let the + // machine combiner decide if changing the operands is worthwhile. + if (Commute) { + Patterns.push_back(MachineCombinerPattern::REASSOC_AX_YB); + Patterns.push_back(MachineCombinerPattern::REASSOC_XA_YB); + } else { + Patterns.push_back(MachineCombinerPattern::REASSOC_AX_BY); + Patterns.push_back(MachineCombinerPattern::REASSOC_XA_BY); + } + return true; + } + + return false; +} + +/// Attempt the reassociation transformation to reduce critical path length. +/// See the above comments before getMachineCombinerPatterns(). +void TargetInstrInfo::reassociateOps( + MachineInstr &Root, MachineInstr &Prev, + MachineCombinerPattern Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { + MachineFunction *MF = Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + + // This array encodes the operand index for each parameter because the + // operands may be commuted. Each row corresponds to a pattern value, + // and each column specifies the index of A, B, X, Y. 
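// For reference, a concrete instance of the A/B/X/Y reassociation described
// earlier in this hunk, using adds over virtual registers (illustrative):
//   %b = add %a, %x      ; Prev
//   %c = add %b, %y      ; Root: two dependent adds once %a is ready
// becomes
//   %t = add %x, %y      ; independent of %a, can issue early
//   %c = add %a, %t      ; only one add remains on %a's critical path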
+ unsigned OpIdx[4][4] = { + { 1, 1, 2, 2 }, + { 1, 2, 2, 1 }, + { 2, 1, 1, 2 }, + { 2, 2, 1, 1 } + }; + + int Row; + switch (Pattern) { + case MachineCombinerPattern::REASSOC_AX_BY: Row = 0; break; + case MachineCombinerPattern::REASSOC_AX_YB: Row = 1; break; + case MachineCombinerPattern::REASSOC_XA_BY: Row = 2; break; + case MachineCombinerPattern::REASSOC_XA_YB: Row = 3; break; + default: llvm_unreachable("unexpected MachineCombinerPattern"); + } + + MachineOperand &OpA = Prev.getOperand(OpIdx[Row][0]); + MachineOperand &OpB = Root.getOperand(OpIdx[Row][1]); + MachineOperand &OpX = Prev.getOperand(OpIdx[Row][2]); + MachineOperand &OpY = Root.getOperand(OpIdx[Row][3]); + MachineOperand &OpC = Root.getOperand(0); + + unsigned RegA = OpA.getReg(); + unsigned RegB = OpB.getReg(); + unsigned RegX = OpX.getReg(); + unsigned RegY = OpY.getReg(); + unsigned RegC = OpC.getReg(); + + if (TargetRegisterInfo::isVirtualRegister(RegA)) + MRI.constrainRegClass(RegA, RC); + if (TargetRegisterInfo::isVirtualRegister(RegB)) + MRI.constrainRegClass(RegB, RC); + if (TargetRegisterInfo::isVirtualRegister(RegX)) + MRI.constrainRegClass(RegX, RC); + if (TargetRegisterInfo::isVirtualRegister(RegY)) + MRI.constrainRegClass(RegY, RC); + if (TargetRegisterInfo::isVirtualRegister(RegC)) + MRI.constrainRegClass(RegC, RC); + + // Create a new virtual register for the result of (X op Y) instead of + // recycling RegB because the MachineCombiner's computation of the critical + // path requires a new register definition rather than an existing one. + unsigned NewVR = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + + unsigned Opcode = Root.getOpcode(); + bool KillA = OpA.isKill(); + bool KillX = OpX.isKill(); + bool KillY = OpY.isKill(); + + // Create new instructions for insertion. + MachineInstrBuilder MIB1 = + BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) + .addReg(RegX, getKillRegState(KillX)) + .addReg(RegY, getKillRegState(KillY)); + MachineInstrBuilder MIB2 = + BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) + .addReg(RegA, getKillRegState(KillA)) + .addReg(NewVR, getKillRegState(true)); + + setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2); + + // Record new instructions for insertion and old instructions for deletion. + InsInstrs.push_back(MIB1); + InsInstrs.push_back(MIB2); + DelInstrs.push_back(&Prev); + DelInstrs.push_back(&Root); +} + +void TargetInstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { + MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); + + // Select the previous instruction in the sequence based on the input pattern. 
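// Mapping rationale for the switch that follows: in the _BY patterns Prev's
// result (B) is Root's first operand, in the _YB patterns it is the second,
// which is why genAlternativeCodeSequence() looks up the unique def of
// operand 1 for REASSOC_AX_BY/REASSOC_XA_BY and of operand 2 for the other
// two patterns.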
+ MachineInstr *Prev = nullptr; + switch (Pattern) { + case MachineCombinerPattern::REASSOC_AX_BY: + case MachineCombinerPattern::REASSOC_XA_BY: + Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + break; + case MachineCombinerPattern::REASSOC_AX_YB: + case MachineCombinerPattern::REASSOC_XA_YB: + Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); + break; + default: + break; + } + + assert(Prev && "Unknown pattern for machine combiner"); + + reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); + return; +} + /// foldMemoryOperand - Same as the previous version except it allows folding /// of any load and store from / to any address, not just from a specific /// stack slot. @@ -661,6 +915,7 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const { return 0; int SPAdj = MI->getOperand(0).getImm(); + SPAdj = TFI->alignSPAdjust(SPAdj); if ((!StackGrowsDown && MI->getOpcode() == FrameSetupOpcode) || (StackGrowsDown && MI->getOpcode() == FrameDestroyOpcode)) @@ -686,10 +941,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr *MI, // modification. const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI)) - return true; - - return false; + return MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI); } // Provide a global flag for disabling the PreRA hazard recognizer that targets diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp index ecfd659..36a31c9 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -247,13 +247,9 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2"; Names[RTLIB::FPROUND_F128_F64] = "__trunctfdf2"; Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2"; - Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfqi"; - Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfhi"; Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi"; Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi"; Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti"; - Names[RTLIB::FPTOSINT_F64_I8] = "__fixdfqi"; - Names[RTLIB::FPTOSINT_F64_I16] = "__fixdfhi"; Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi"; Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi"; Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti"; @@ -266,13 +262,9 @@ static void InitLibcallNames(const char **Names, const Triple &TT) { Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi"; Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi"; Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti"; - Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfqi"; - Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfhi"; Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi"; Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi"; Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti"; - Names[RTLIB::FPTOUINT_F64_I8] = "__fixunsdfqi"; - Names[RTLIB::FPTOUINT_F64_I16] = "__fixunsdfhi"; Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi"; Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi"; Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti"; @@ -501,10 +493,6 @@ RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { /// UNKNOWN_LIBCALL if there is none. 
RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { if (OpVT == MVT::f32) { - if (RetVT == MVT::i8) - return FPTOSINT_F32_I8; - if (RetVT == MVT::i16) - return FPTOSINT_F32_I16; if (RetVT == MVT::i32) return FPTOSINT_F32_I32; if (RetVT == MVT::i64) @@ -512,10 +500,6 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { if (RetVT == MVT::i128) return FPTOSINT_F32_I128; } else if (OpVT == MVT::f64) { - if (RetVT == MVT::i8) - return FPTOSINT_F64_I8; - if (RetVT == MVT::i16) - return FPTOSINT_F64_I16; if (RetVT == MVT::i32) return FPTOSINT_F64_I32; if (RetVT == MVT::i64) @@ -551,10 +535,6 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { /// UNKNOWN_LIBCALL if there is none. RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) { if (OpVT == MVT::f32) { - if (RetVT == MVT::i8) - return FPTOUINT_F32_I8; - if (RetVT == MVT::i16) - return FPTOUINT_F32_I16; if (RetVT == MVT::i32) return FPTOUINT_F32_I32; if (RetVT == MVT::i64) @@ -562,10 +542,6 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) { if (RetVT == MVT::i128) return FPTOUINT_F32_I128; } else if (OpVT == MVT::f64) { - if (RetVT == MVT::i8) - return FPTOUINT_F64_I8; - if (RetVT == MVT::i16) - return FPTOUINT_F64_I16; if (RetVT == MVT::i32) return FPTOUINT_F64_I32; if (RetVT == MVT::i64) @@ -758,17 +734,13 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { SelectIsExpensive = false; HasMultipleConditionRegisters = false; HasExtractBitsInsn = false; - IntDivIsCheap = false; FsqrtIsCheap = false; - Pow2SDivIsCheap = false; JumpIsExpensive = JumpIsExpensiveOverride; PredictableSelectIsExpensive = false; MaskAndBranchFoldingIsLegal = false; EnableExtLdPromotion = false; HasFloatingPointExceptions = true; StackPointerRegisterToSaveRestore = 0; - ExceptionPointerRegister = 0; - ExceptionSelectorRegister = 0; BooleanContents = UndefinedBooleanContent; BooleanFloatContents = UndefinedBooleanContent; BooleanVectorContents = UndefinedBooleanContent; @@ -778,6 +750,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { MinFunctionAlignment = 0; PrefFunctionAlignment = 0; PrefLoopAlignment = 0; + GatherAllAliasesMaxDepth = 6; MinStackArgumentAlignment = 1; InsertFencesForAtomic = false; MinimumJumpTableEntries = 4; @@ -814,6 +787,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::CONCAT_VECTORS, VT, Expand); setOperationAction(ISD::FMINNUM, VT, Expand); setOperationAction(ISD::FMAXNUM, VT, Expand); + setOperationAction(ISD::FMINNAN, VT, Expand); + setOperationAction(ISD::FMAXNAN, VT, Expand); setOperationAction(ISD::FMAD, VT, Expand); setOperationAction(ISD::SMIN, VT, Expand); setOperationAction(ISD::SMAX, VT, Expand); @@ -828,6 +803,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SMULO, VT, Expand); setOperationAction(ISD::UMULO, VT, Expand); + setOperationAction(ISD::BITREVERSE, VT, Expand); + // These library functions default to expand. setOperationAction(ISD::FROUND, VT, Expand); @@ -838,11 +815,17 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand); } + + // For most targets @llvm.get.dynamic.area.offset just returns 0. + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); } // Most targets ignore the @llvm.prefetch intrinsic. setOperationAction(ISD::PREFETCH, MVT::Other, Expand); + // Most targets also ignore the @llvm.readcyclecounter intrinsic.
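// The Expand default that follows is only a default: a target that does
// have a cycle counter keeps the node legal in its own TargetLowering
// constructor. A hypothetical opt-in (MyTargetLowering is not a real class):
MyTargetLowering::MyTargetLowering(const TargetMachine &TM)
    : TargetLowering(TM) {
  // Keep @llvm.readcyclecounter as a native node instead of letting
  // legalization replace it with a constant zero.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
}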
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Expand); + // ConstantFP nodes default to expand. Targets can either change this to // Legal, in which case all fp constants are legal, or use isFPImmLegal() // to optimize expansions for certain constants. @@ -1111,6 +1094,19 @@ MachineBasicBlock* TargetLoweringBase::emitPatchPoint(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineFunction &MF = *MI->getParent()->getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + // We're handling multiple types of operands here: + // PATCHPOINT MetaArgs - live-in, read only, direct + // STATEPOINT Deopt Spill - live-through, read only, indirect + // STATEPOINT Deopt Alloca - live-through, read only, direct + // (We're currently conservative and mark the deopt slots read/write in + // practice.) + // STATEPOINT GC Spill - live-through, read/write, indirect + // STATEPOINT GC Alloca - live-through, read/write, direct + // The live-in vs live-through is handled already (the live through ones are + // all stack slots), but we need to handle the different type of stackmap + // operands and memory effects here. // MI changes inside this loop as we grow operands. for(unsigned OperIdx = 0; OperIdx != MI->getNumOperands(); ++OperIdx) { @@ -1126,10 +1122,24 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, // Copy operands before the frame-index. for (unsigned i = 0; i < OperIdx; ++i) MIB.addOperand(MI->getOperand(i)); - // Add frame index operands: direct-mem-ref tag, #FI, offset. - MIB.addImm(StackMaps::DirectMemRefOp); - MIB.addOperand(MI->getOperand(OperIdx)); - MIB.addImm(0); + // Add frame index operands recognized by stackmaps.cpp + if (MFI.isStatepointSpillSlotObjectIndex(FI)) { + // indirect-mem-ref tag, size, #FI, offset. + // Used for spills inserted by StatepointLowering. This codepath is not + // used for patchpoints/stackmaps at all, for these spilling is done via + // foldMemoryOperand callback only. + assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity"); + MIB.addImm(StackMaps::IndirectMemRefOp); + MIB.addImm(MFI.getObjectSize(FI)); + MIB.addOperand(MI->getOperand(OperIdx)); + MIB.addImm(0); + } else { + // direct-mem-ref tag, #FI, offset. + // Used by patchpoint, and direct alloca arguments to statepoints + MIB.addImm(StackMaps::DirectMemRefOp); + MIB.addOperand(MI->getOperand(OperIdx)); + MIB.addImm(0); + } // Copy the operands after the frame index. for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i) MIB.addOperand(MI->getOperand(i)); @@ -1139,7 +1149,6 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, assert(MIB->mayLoad() && "Folded a stackmap use to a non-load!"); // Add a new memory operand for this FI. - const MachineFrameInfo &MFI = *MF.getFrameInfo(); assert(MFI.getObjectOffset(FI) != -1); unsigned Flags = MachineMemOperand::MOLoad; @@ -1148,8 +1157,8 @@ TargetLoweringBase::emitPatchPoint(MachineInstr *MI, Flags |= MachineMemOperand::MOVolatile; } MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), Flags, - TM.getDataLayout()->getPointerSize(), MFI.getObjectAlignment(FI)); + MachinePointerInfo::getFixedStack(MF, FI), Flags, + MF.getDataLayout().getPointerSize(), MFI.getObjectAlignment(FI)); MIB->addMemOperand(MF, MMO); // Replace the instruction and update the operand index. @@ -1274,20 +1283,14 @@ void TargetLoweringBase::computeRegisterProperties( ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat); } + // Decide how to handle f16. 
If the target does not have native f16 support, + // promote it to f32, because there are no f16 library calls (except for + // conversions). if (!isTypeLegal(MVT::f16)) { - // If the target has native f32 support, promote f16 operations to f32. If - // f32 is not supported, generate soft float library calls. - if (isTypeLegal(MVT::f32)) { - NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32]; - RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32]; - TransformToType[MVT::f16] = MVT::f32; - ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat); - } else { - NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16]; - RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16]; - TransformToType[MVT::f16] = MVT::i16; - ValueTypeActions.setTypeAction(MVT::f16, TypeSoftenFloat); - } + NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32]; + RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32]; + TransformToType[MVT::f16] = MVT::f32; + ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat); } // Loop over all of the vector value types to see which need transformations. @@ -1528,6 +1531,29 @@ unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty, return DL.getABITypeAlignment(Ty); } +bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context, + const DataLayout &DL, EVT VT, + unsigned AddrSpace, + unsigned Alignment, + bool *Fast) const { + // Check if the specified alignment is sufficient based on the data layout. + // TODO: While using the data layout works in practice, a better solution + // would be to implement this check directly (make this a virtual function). + // For example, the ABI alignment may change based on software platform while + // this function should only be affected by hardware implementation. + Type *Ty = VT.getTypeForEVT(Context); + if (Alignment >= DL.getABITypeAlignment(Ty)) { + // Assume that an access that meets the ABI-specified alignment is fast. + if (Fast != nullptr) + *Fast = true; + return true; + } + + // This is a misaligned access. + return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Fast); +} + + //===----------------------------------------------------------------------===// // TargetTransformInfo Helpers //===----------------------------------------------------------------------===// @@ -1546,6 +1572,11 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case Invoke: return 0; case Resume: return 0; case Unreachable: return 0; + case CleanupRet: return 0; + case CatchRet: return 0; + case CatchPad: return 0; + case CatchSwitch: return 0; + case CleanupPad: return 0; case Add: return ISD::ADD; case FAdd: return ISD::FADD; case Sub: return ISD::SUB; @@ -1603,13 +1634,13 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { llvm_unreachable("Unknown instruction type encountered!"); } -std::pair<unsigned, MVT> +std::pair<int, MVT> TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const { LLVMContext &C = Ty->getContext(); EVT MTy = getValueType(DL, Ty); - unsigned Cost = 1; + int Cost = 1; // We keep legalizing the type until we find a legal kind. We assume that // the only operation that costs anything is the split. After splitting // we need to handle two types. @@ -1622,11 +1653,28 @@ TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL, if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger) Cost *= 2; + // Do not loop with f128 type. 
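
The new allowsMemoryAccess() helper above answers the ABI-aligned case by itself and only falls back to the target hook for misaligned accesses. A target that tolerates slow unaligned scalar loads would override that hook roughly as follows (target name hypothetical, policy invented for illustration):

bool MyTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *Fast) const {
  // Unaligned integer accesses up to 64 bits are legal but microcoded:
  // permit them, but report them as slow so callers prefer aligned code.
  if (VT.isInteger() && VT.getSizeInBits() <= 64) {
    if (Fast)
      *Fast = false;
    return true;
  }
  // FP and vector accesses must be broken up by the legalizer.
  return false;
}
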
+ if (MTy == LK.second) + return std::make_pair(Cost, MTy.getSimpleVT()); + // Keep legalizing the type. MTy = LK.second; } } +Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!TM.getTargetTriple().isAndroid()) + return nullptr; + + // Android provides a libc function to retrieve the address of the current + // thread's unsafe stack pointer. + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Type *StackPtrTy = Type::getInt8PtrTy(M->getContext()); + Value *Fn = M->getOrInsertFunction("__safestack_pointer_address", + StackPtrTy->getPointerTo(0), nullptr); + return IRB.CreateCall(Fn); +} + //===----------------------------------------------------------------------===// // Loop Strength Reduction hooks //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 2f78763..58ae9cc 100644 --- a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" @@ -32,6 +33,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" @@ -58,9 +60,8 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol( report_fatal_error("We do not support this DWARF encoding yet!"); } -void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, - const TargetMachine &TM, - const MCSymbol *Sym) const { +void TargetLoweringObjectFileELF::emitPersonalityValue( + MCStreamer &Streamer, const DataLayout &DL, const MCSymbol *Sym) const { SmallString<64> NameData("DW.ref."); NameData += Sym->getName(); MCSymbolELF *Label = @@ -72,9 +73,9 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP; MCSection *Sec = getContext().getELFSection(NameData, ELF::SHT_PROGBITS, Flags, 0, Label->getName()); - unsigned Size = TM.getDataLayout()->getPointerSize(); + unsigned Size = DL.getPointerSize(); Streamer.SwitchSection(Sec); - Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment()); + Streamer.EmitValueToAlignment(DL.getPointerABIAlignment()); Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject); const MCExpr *E = MCConstantExpr::create(Size, getContext()); Streamer.emitELFSize(Label, E); @@ -232,14 +233,8 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".tdata"; if (Kind.isThreadBSS()) return ".tbss"; - if (Kind.isDataNoRel()) + if (Kind.isData()) return ".data"; - if (Kind.isDataRelLocal()) - return ".data.rel.local"; - if (Kind.isDataRel()) - return ".data.rel"; - if (Kind.isReadOnlyWithRelLocal()) - return ".data.rel.ro.local"; assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); return ".data.rel.ro"; } @@ -282,8 +277,8 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalValue *GV, // We also need alignment here. // FIXME: this is getting the alignment of the character, not the // alignment of the global! 
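
Stepping back to the getSafeStackPointerLocation() hook added above: on Android it materializes a call to a Bionic helper, so SafeStack-instrumented code addresses the unsafe stack through whatever location libc chooses. The C-level contract that call relies on is roughly the following sketch (declaration reconstructed from the IR built above, not quoted from Bionic):

extern "C" void **__safestack_pointer_address(void);

void example(void) {
  // Instrumented code loads and stores the current thread's unsafe stack
  // pointer through the address this helper returns.
  void **UnsafeStackPtrAddr = __safestack_pointer_address();
  void *UnsafeSP = *UnsafeStackPtrAddr;
  (void)UnsafeSP;
}
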
- unsigned Align = - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)); + unsigned Align = GV->getParent()->getDataLayout().getPreferredAlignment( + cast<GlobalVariable>(GV)); std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + "."; Name = SizeSpec + utostr(Align); @@ -350,9 +345,8 @@ bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection( /// Given a mergeable constant with the specified size and relocation /// information, return a section that it should be placed in. -MCSection * -TargetLoweringObjectFileELF::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *TargetLoweringObjectFileELF::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst4() && MergeableConst4Section) return MergeableConst4Section; if (Kind.isMergeableConst8() && MergeableConst8Section) @@ -362,7 +356,6 @@ TargetLoweringObjectFileELF::getSectionForConstant(SectionKind Kind, if (Kind.isReadOnly()) return ReadOnlySection; - if (Kind.isReadOnlyWithRelLocal()) - return DataRelROLocalSection; assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); return DataRelROSection; } @@ -507,7 +500,7 @@ emitModuleFlags(MCStreamer &Streamer, // Get the section. MCSectionMachO *S = getContext().getMachOSection( - Segment, Section, TAA, StubSize, SectionKind::getDataNoRel()); + Segment, Section, TAA, StubSize, SectionKind::getData()); Streamer.SwitchSection(S); Streamer.EmitLabel(getContext(). getOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO"))); @@ -589,14 +582,16 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal( // FIXME: Alignment check should be handled by section classifier. if (Kind.isMergeable1ByteCString() && - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32) + GV->getParent()->getDataLayout().getPreferredAlignment( + cast<GlobalVariable>(GV)) < 32) return CStringSection; // Do not put 16-bit arrays in the UString section if they have an // externally visible label; this runs into issues with certain linker // versions. if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() && - TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32) + GV->getParent()->getDataLayout().getPreferredAlignment( + cast<GlobalVariable>(GV)) < 32) return UStringSection; // With MachO only variables whose corresponding symbol starts with 'l' or @@ -634,12 +629,11 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal( return DataSection; } -MCSection * -TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *TargetLoweringObjectFileMachO::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { // If this constant requires a relocation, we have to put it in the data // segment, not in the text segment.
- if (Kind.isDataRel() || Kind.isReadOnlyWithRel()) + if (Kind.isData() || Kind.isReadOnlyWithRel()) return ConstDataSection; if (Kind.isMergeableConst4()) @@ -706,7 +700,7 @@ MCSymbol *TargetLoweringObjectFileMachO::getCFIPersonalitySymbol( const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { - // Although MachO 32-bit targets do not explictly have a GOTPCREL relocation + // Although MachO 32-bit targets do not explicitly have a GOTPCREL relocation // as 64-bit do, we replace the GOT equivalent by accessing the final symbol // through a non_lazy_ptr stub instead. One advantage is that it allows the // computation of deltas to final external symbols. Example: @@ -740,7 +734,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( // non_lazy_ptr stubs. SmallString<128> Name; StringRef Suffix = "$non_lazy_ptr"; - Name += DL->getPrivateGlobalPrefix(); + Name += MMI->getModule()->getDataLayout().getPrivateGlobalPrefix(); Name += Sym->getName(); Name += Suffix; MCSymbol *Stub = Ctx.getOrCreateSymbol(Name); @@ -763,6 +757,29 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel( return MCBinaryExpr::createSub(LHS, RHS, Ctx); } +static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo, + const MCSection &Section) { + if (!AsmInfo.isSectionAtomizableBySymbols(Section)) + return true; + + // If it is not dead stripped, it is safe to use private labels. + const MCSectionMachO &SMO = cast<MCSectionMachO>(Section); + if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP)) + return true; + + return false; +} + +void TargetLoweringObjectFileMachO::getNameWithPrefix( + SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM) const { + SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM); + const MCSection *TheSection = SectionForGlobal(GV, GVKind, Mang, TM); + bool CannotUsePrivateLabel = + !canUsePrivateLabel(*TM.getMCAsmInfo(), *TheSection); + Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel); +} + //===----------------------------------------------------------------------===// // COFF //===----------------------------------------------------------------------===// @@ -918,7 +935,7 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal( COMDATSymName, Selection); } else { SmallString<256> TmpData; - getNameWithPrefix(TmpData, GV, /*CannotUsePrivateLabel=*/true, Mang, TM); + Mang.getNameWithPrefix(TmpData, GV, /*CannotUsePrivateLabel=*/true); return getContext().getCOFFSection(Name, Characteristics, Kind, TmpData, Selection); } @@ -943,8 +960,9 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal( } void TargetLoweringObjectFileCOFF::getNameWithPrefix( - SmallVectorImpl<char> &OutName, const GlobalValue *GV, - bool CannotUsePrivateLabel, Mangler &Mang, const TargetMachine &TM) const { + SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM) const { + bool CannotUsePrivateLabel = false; if (GV->hasPrivateLinkage() && ((isa<Function>(GV) && TM.getFunctionSections()) || (isa<GlobalVariable>(GV) && TM.getDataSections()))) @@ -1043,7 +1061,7 @@ void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal( raw_string_ostream FlagOS(Flag); Mang.getNameWithPrefix(FlagOS, GV, false); FlagOS.flush(); - if (Flag[0] == DL->getGlobalPrefix()) + if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix()) OS << 
Flag.substr(1); else OS << Flag; diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 61a66b6..0a7042a 100644 --- a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -11,13 +11,19 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define DEBUG_TYPE "target-reg-info" using namespace llvm; @@ -34,54 +40,71 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID, TargetRegisterInfo::~TargetRegisterInfo() {} -void PrintReg::print(raw_ostream &OS) const { - if (!Reg) - OS << "%noreg"; - else if (TargetRegisterInfo::isStackSlot(Reg)) - OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); - else if (TargetRegisterInfo::isVirtualRegister(Reg)) - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); - else if (TRI && Reg < TRI->getNumRegs()) - OS << '%' << TRI->getName(Reg); - else - OS << "%physreg" << Reg; - if (SubIdx) { - if (TRI) - OS << ':' << TRI->getSubRegIndexName(SubIdx); +namespace llvm { + +Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI, + unsigned SubIdx) { + return Printable([Reg, TRI, SubIdx](raw_ostream &OS) { + if (!Reg) + OS << "%noreg"; + else if (TargetRegisterInfo::isStackSlot(Reg)) + OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg); + else if (TargetRegisterInfo::isVirtualRegister(Reg)) + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg); + else if (TRI && Reg < TRI->getNumRegs()) + OS << '%' << TRI->getName(Reg); else - OS << ":sub(" << SubIdx << ')'; - } + OS << "%physreg" << Reg; + if (SubIdx) { + if (TRI) + OS << ':' << TRI->getSubRegIndexName(SubIdx); + else + OS << ":sub(" << SubIdx << ')'; + } + }); } -void PrintRegUnit::print(raw_ostream &OS) const { - // Generic printout when TRI is missing. - if (!TRI) { - OS << "Unit~" << Unit; - return; - } +Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + // Generic printout when TRI is missing. + if (!TRI) { + OS << "Unit~" << Unit; + return; + } - // Check for invalid register units. - if (Unit >= TRI->getNumRegUnits()) { - OS << "BadUnit~" << Unit; - return; - } + // Check for invalid register units. + if (Unit >= TRI->getNumRegUnits()) { + OS << "BadUnit~" << Unit; + return; + } - // Normal units have at least one root. - MCRegUnitRootIterator Roots(Unit, TRI); - assert(Roots.isValid() && "Unit has no roots."); - OS << TRI->getName(*Roots); - for (++Roots; Roots.isValid(); ++Roots) - OS << '~' << TRI->getName(*Roots); + // Normal units have at least one root. 
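
The PrintReg rewrite above, continued just below for register units and lane masks, replaces one-shot printer classes with functions returning Printable, a value that captures a printing closure and can be streamed directly. A self-contained sketch of the idiom (simplified from llvm/Support/Printable.h; std::ostream stands in for raw_ostream):

#include <functional>
#include <iostream>
#include <utility>

struct Printable {
  std::function<void(std::ostream &)> Print;
  explicit Printable(std::function<void(std::ostream &)> P)
      : Print(std::move(P)) {}
};

inline std::ostream &operator<<(std::ostream &OS, const Printable &P) {
  P.Print(OS);
  return OS;
}

// Mirrors the shape of PrintReg(): build the closure, print nothing yet.
Printable printReg(unsigned Reg) {
  return Printable([Reg](std::ostream &OS) {
    if (!Reg)
      OS << "%noreg";
    else
      OS << "%physreg" << Reg;
  });
}

int main() {
  std::cout << printReg(0) << ' ' << printReg(42) << '\n'; // %noreg %physreg42
}
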
+ MCRegUnitRootIterator Roots(Unit, TRI); + assert(Roots.isValid() && "Unit has no roots."); + OS << TRI->getName(*Roots); + for (++Roots; Roots.isValid(); ++Roots) + OS << '~' << TRI->getName(*Roots); + }); } -void PrintVRegOrUnit::print(raw_ostream &OS) const { - if (TRI && TRI->isVirtualRegister(Unit)) { - OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); - return; - } - PrintRegUnit::print(OS); +Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) { + return Printable([Unit, TRI](raw_ostream &OS) { + if (TRI && TRI->isVirtualRegister(Unit)) { + OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit); + } else { + OS << PrintRegUnit(Unit, TRI); + } + }); +} + +Printable PrintLaneMask(LaneBitmask LaneMask) { + return Printable([LaneMask](raw_ostream &OS) { + OS << format("%08X", LaneMask); + }); } +} // End of llvm namespace + /// getAllocatableClass - Return the maximal subclass of the given register /// class that is allocatable, or NULL. const TargetRegisterClass * @@ -161,16 +184,24 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, static inline const TargetRegisterClass *firstCommonClass(const uint32_t *A, const uint32_t *B, - const TargetRegisterInfo *TRI) { + const TargetRegisterInfo *TRI, + const MVT::SimpleValueType SVT = + MVT::SimpleValueType::Any) { + const MVT VT(SVT); for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) - if (unsigned Common = *A++ & *B++) - return TRI->getRegClass(I + countTrailingZeros(Common)); + if (unsigned Common = *A++ & *B++) { + const TargetRegisterClass *RC = + TRI->getRegClass(I + countTrailingZeros(Common)); + if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT)) + return RC; + } return nullptr; } const TargetRegisterClass * TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, - const TargetRegisterClass *B) const { + const TargetRegisterClass *B, + const MVT::SimpleValueType SVT) const { // First take care of the trivial cases. if (A == B) return A; @@ -179,7 +210,7 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A, // Register classes are ordered topologically, so the largest common // sub-class is the common sub-class with the smallest ID. - return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this); + return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT); } const TargetRegisterClass * @@ -260,13 +291,55 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, return BestRC; } +/// \brief Check if the registers defined by the pair (RegisterClass, SubReg) +/// share the same register file. +static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, + const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) { + // Same register class. + if (DefRC == SrcRC) + return true; + + // Both operands are sub registers. Check if they share a register class. + unsigned SrcIdx, DefIdx; + if (SrcSubReg && DefSubReg) { + return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, + SrcIdx, DefIdx) != nullptr; + } + + // At most one of the registers is a sub register; make it Src to avoid + // duplicating the test. + if (!SrcSubReg) { + std::swap(DefSubReg, SrcSubReg); + std::swap(DefRC, SrcRC); + } + + // One of the registers is a sub register; check if we can get a superclass. + if (SrcSubReg) + return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; + + // Plain copy.
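
firstCommonClass() above depends on register classes being topologically ordered: the first set bit in the AND of two sub-class masks is therefore the largest common sub-class. The same word-at-a-time scan on plain integers, as a standalone illustration (uses the GCC/Clang __builtin_ctz intrinsic in place of LLVM's countTrailingZeros):

#include <cstdint>

// Return the index of the first class present in both bitmasks, or -1.
// Each 32-bit word covers 32 register classes, lowest index first.
int firstCommonIndex(const uint32_t *A, const uint32_t *B,
                     unsigned NumClasses) {
  for (unsigned I = 0; I < NumClasses; I += 32)
    if (uint32_t Common = *A++ & *B++)
      return static_cast<int>(I + __builtin_ctz(Common));
  return -1;
}
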
+ return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; +} + +bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const { + // If this source does not incur a cross register bank copy, use it. + return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg); +} + // Compute target-independent register allocator hints to help eliminate copies. void TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const { + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg); @@ -295,6 +368,26 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg, Hints.push_back(Phys); } +bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const { + return !MF.getFunction()->hasFnAttribute("no-realign-stack"); +} + +bool TargetRegisterInfo::needsStackRealignment( + const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const Function *F = MF.getFunction(); + unsigned StackAlign = TFI->getStackAlignment(); + bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || + F->hasFnAttribute(Attribute::StackAlignment)); + if (MF.getFunction()->hasFnAttribute("stackrealign") || requiresRealignment) { + if (canRealignStack(MF)) + return true; + DEBUG(dbgs() << "Can't realign function's stack: " << F->getName() << "\n"); + } + return false; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex, diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp index 299380d..1c4558c 100644 --- a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp @@ -211,11 +211,9 @@ unsigned TargetSchedModel::computeOperandLatency( if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit() && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef() && SchedModel.isComplete()) { - std::string Err; - raw_string_ostream ss(Err); - ss << "DefIdx " << DefIdx << " exceeds machine model writes for " - << *DefMI; - report_fatal_error(ss.str()); + errs() << "DefIdx " << DefIdx << " exceeds machine model writes for " + << *DefMI << " (Try with MCSchedModel.CompleteModel set to false)"; + llvm_unreachable("incomplete machine model"); } #endif // FIXME: Automatically giving all implicit defs defaultDefLatency is diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 1e30821..c6bae24 100644 --- a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -83,21 +83,20 @@ class TwoAddressInstructionPass : public MachineFunctionPass { // The current basic block being processed. MachineBasicBlock *MBB; - // DistanceMap - Keep track the distance of a MI from the start of the - // current basic block. + // Keep track of the distance of a MI from the start of the current basic block. DenseMap<MachineInstr*, unsigned> DistanceMap; // Set of already processed instructions in the current block.
SmallPtrSet<MachineInstr*, 8> Processed; - // SrcRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies from physical registers to - // virtual registers. e.g. v1024 = move r0. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies from physical registers to virtual + // registers. e.g. v1024 = move r0. DenseMap<unsigned, unsigned> SrcRegMap; - // DstRegMap - A map from virtual registers to physical registers which are - // likely targets to be coalesced to due to copies to physical registers from - // virtual registers. e.g. r1 = move v1024. + // A map from virtual registers to physical registers which are likely targets + // to be coalesced to due to copies to physical registers from virtual + // registers. e.g. r1 = move v1024. DenseMap<unsigned, unsigned> DstRegMap; bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg, @@ -110,8 +109,8 @@ class TwoAddressInstructionPass : public MachineFunctionPass { bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, MachineInstr *MI, unsigned Dist); - bool commuteInstruction(MachineBasicBlock::iterator &mi, - unsigned RegB, unsigned RegC, unsigned Dist); + bool commuteInstruction(MachineInstr *MI, + unsigned RegBIdx, unsigned RegCIdx, unsigned Dist); bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB); @@ -133,6 +132,11 @@ class TwoAddressInstructionPass : public MachineFunctionPass { unsigned SrcIdx, unsigned DstIdx, unsigned Dist, bool shouldOnlyCommute); + bool tryInstructionCommute(MachineInstr *MI, + unsigned DstOpIdx, + unsigned BaseOpIdx, + bool BaseOpKilled, + unsigned Dist); void scanUses(unsigned DstReg); void processCopy(MachineInstr *MI); @@ -151,7 +155,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<LiveVariables>(); AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); @@ -160,7 +164,7 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - /// runOnMachineFunction - Pass entry point. + /// Pass entry point. bool runOnMachineFunction(MachineFunction&) override; }; } // end anonymous namespace @@ -168,7 +172,7 @@ public: char TwoAddressInstructionPass::ID = 0; INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction", "Two-Address instruction pass", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction", "Two-Address instruction pass", false, false) @@ -176,10 +180,9 @@ char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS); -/// sink3AddrInstruction - A two-address instruction has been converted to a -/// three-address instruction to avoid clobbering a register. Try to sink it -/// past the instruction that would kill the above mentioned register to reduce -/// register pressure. +/// A two-address instruction has been converted to a three-address instruction +/// to avoid clobbering a register. Try to sink it past the instruction that +/// would kill the above mentioned register to reduce register pressure. 
bool TwoAddressInstructionPass:: sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, MachineBasicBlock::iterator OldPos) { @@ -195,8 +198,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, unsigned DefReg = 0; SmallSet<unsigned, 4> UseRegs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -231,10 +233,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, KillMI = LIS->getInstructionFromIndex(I->end); } if (!KillMI) { - for (MachineRegisterInfo::use_nodbg_iterator - UI = MRI->use_nodbg_begin(SavedReg), - UE = MRI->use_nodbg_end(); UI != UE; ++UI) { - MachineOperand &UseMO = *UI; + for (MachineOperand &UseMO : MRI->use_nodbg_operands(SavedReg)) { if (!UseMO.isKill()) continue; KillMI = UseMO.getParent(); @@ -312,8 +311,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, return true; } -/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg -/// in current BB. +/// Return the MachineInstr* if it is the single def of the Reg in current BB. static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB, const MachineRegisterInfo *MRI) { MachineInstr *Ret = nullptr; @@ -351,10 +349,10 @@ bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg, return false; } -/// noUseAfterLastDef - Return true if there are no intervening uses between the -/// last instruction in the MBB that defines the specified register and the -/// two-address instruction which is being processed. It also returns the last -/// def location by reference +/// Return true if there are no intervening uses between the last instruction +/// in the MBB that defines the specified register and the two-address +/// instruction which is being processed. It also returns the last def location +/// by reference. bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef) { LastDef = 0; @@ -375,9 +373,9 @@ bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist, return !(LastUse > LastDef && LastUse < Dist); } -/// isCopyToReg - Return true if the specified MI is a copy instruction or -/// a extract_subreg instruction. It also returns the source and destination -/// registers and whether they are physical registers by reference. +/// Return true if the specified MI is a copy instruction or an extract_subreg +/// instruction. It also returns the source and destination registers and +/// whether they are physical registers by reference. static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, unsigned &SrcReg, unsigned &DstReg, bool &IsSrcPhys, bool &IsDstPhys) { @@ -397,8 +395,8 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, return true; } -/// isPLainlyKilled - Test if the given register value, which is used by the -// given instruction, is killed by the given instruction. +/// Test if the given register value, which is used by the +/// given instruction, is killed by the given instruction. 
static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS) { if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) && @@ -424,7 +422,7 @@ static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, return MI->killsRegister(Reg); } -/// isKilled - Test if the given register value, which is used by the given +/// Test if the given register value, which is used by the given /// instruction, is killed by the given instruction. This looks through /// coalescable copies to see if the original value is potentially not killed. /// @@ -472,8 +470,8 @@ static bool isKilled(MachineInstr &MI, unsigned Reg, } } -/// isTwoAddrUse - Return true if the specified MI uses the specified register -/// as a two-address use. If so, return the destination register by reference. +/// Return true if the specified MI uses the specified register as a two-address +/// use. If so, return the destination register by reference. static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { for (unsigned i = 0, NumOps = MI.getNumOperands(); i != NumOps; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -488,8 +486,8 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } -/// findOnlyInterestingUse - Given a register, if has a single in-basic block -/// use, return the use instruction if it's a copy or a two-address use. +/// Given a register, if it has a single in-basic block use, return the use +/// instruction if it's a copy or a two-address use. static MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, MachineRegisterInfo *MRI, @@ -516,8 +514,8 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB, return nullptr; } -/// getMappedReg - Return the physical register the specified virtual register -/// might be mapped to. +/// Return the physical register the specified virtual register might be mapped +/// to. static unsigned getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) { while (TargetRegisterInfo::isVirtualRegister(Reg)) { @@ -531,8 +529,7 @@ getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) { return 0; } -/// regsAreCompatible - Return true if the two registers are equal or aliased. -/// +/// Return true if the two registers are equal or aliased. static bool regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { if (RegA == RegB) @@ -543,8 +540,8 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { } -/// isProfitableToCommute - Return true if it's potentially profitable to commute -/// the two-address instruction that's being processed. +/// Return true if it's potentially profitable to commute the two-address +/// instruction that's being processed. bool TwoAddressInstructionPass:: isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, @@ -642,15 +639,15 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, return LastDefB && LastDefC && LastDefC > LastDefB; } -/// commuteInstruction - Commute a two-address instruction and update the basic -/// block, distance map, and live variables if needed. Return true if it is -/// successful. -bool TwoAddressInstructionPass:: -commuteInstruction(MachineBasicBlock::iterator &mi, - unsigned RegB, unsigned RegC, unsigned Dist) { - MachineInstr *MI = mi; +/// Commute a two-address instruction and update the basic block, distance map, +/// and live variables if needed. Return true if it is successful.
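
The explicit-index commuteInstruction() overload used below changes the calling convention of the pass: findCommutedOpIndices(MI, Idx1, Idx2) merely verifies that a concrete operand pair is commutable, and the caller then commutes exactly that pair. A sketch of the contract in isolation (operand indices hypothetical):

unsigned Op1 = 1, Op2 = 2; // Candidate use operands chosen by the caller.
if (TII->findCommutedOpIndices(MI, Op1, Op2)) {
  // The pair is commutable; rewrite MI in place (NewMI = false means
  // do not allocate a fresh instruction). Returns null if the target
  // backs out at the last moment.
  MachineInstr *Res = TII->commuteInstruction(MI, /*NewMI=*/false, Op1, Op2);
  if (!Res) {
    // Commutation refused; MI is unchanged.
  }
}
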
+bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI, + unsigned RegBIdx, + unsigned RegCIdx, + unsigned Dist) { + unsigned RegC = MI->getOperand(RegCIdx).getReg(); DEBUG(dbgs() << "2addr: COMMUTING : " << *MI); - MachineInstr *NewMI = TII->commuteInstruction(MI); + MachineInstr *NewMI = TII->commuteInstruction(MI, false, RegBIdx, RegCIdx); if (NewMI == nullptr) { DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n"); @@ -672,8 +669,8 @@ commuteInstruction(MachineBasicBlock::iterator &mi, return true; } -/// isProfitableToConv3Addr - Return true if it is profitable to convert the -/// given 2-address instruction to a 3-address one. +/// Return true if it is profitable to convert the given 2-address instruction +/// to a 3-address one. bool TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ // Look for situations like this: @@ -689,17 +686,18 @@ TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){ return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI)); } -/// convertInstTo3Addr - Convert the specified two-address instruction into a -/// three address one. Return true if this transformation was successful. +/// Convert the specified two-address instruction into a three address one. +/// Return true if this transformation was successful. bool TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned RegA, unsigned RegB, unsigned Dist) { // FIXME: Why does convertToThreeAddress() need an iterator reference? - MachineFunction::iterator MFI = MBB; + MachineFunction::iterator MFI = MBB->getIterator(); MachineInstr *NewMI = TII->convertToThreeAddress(MFI, mi, LV); - assert(MBB == MFI && "convertToThreeAddress changed iterator reference"); + assert(MBB->getIterator() == MFI && + "convertToThreeAddress changed iterator reference"); if (!NewMI) return false; @@ -730,8 +728,8 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, return true; } -/// scanUses - Scan forward recursively for only uses, update maps if the use -/// is a copy or a two-address instruction. +/// Scan forward recursively for only uses, update maps if the use is a copy or +/// a two-address instruction. void TwoAddressInstructionPass::scanUses(unsigned DstReg) { SmallVector<unsigned, 4> VirtRegPairs; @@ -777,8 +775,8 @@ TwoAddressInstructionPass::scanUses(unsigned DstReg) { } } -/// processCopy - If the specified instruction is not yet processed, process it -/// if it's a copy. For a copy instruction, we find the physical registers the +/// If the specified instruction is not yet processed, process it if it's a +/// copy. For a copy instruction, we find the physical registers the /// source and destination registers might be mapped to. These are kept in /// point-to maps used to determine future optimizations. e.g. /// v1024 = mov r0 @@ -813,9 +811,9 @@ void TwoAddressInstructionPass::processCopy(MachineInstr *MI) { return; } -/// rescheduleMIBelowKill - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the instruction below the kill -/// instruction in order to eliminate the need for the copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the instruction below the kill instruction in order to +/// eliminate the need for the copy.
bool TwoAddressInstructionPass:: rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -871,8 +869,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, SmallSet<unsigned, 2> Uses; SmallSet<unsigned, 2> Kills; SmallSet<unsigned, 2> Defs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -914,8 +911,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, OtherMI->isBranch() || OtherMI->isTerminator()) // Don't move past calls, etc. return false; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -984,8 +980,8 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, return true; } -/// isDefTooClose - Return true if the re-scheduling will put the given -/// instruction too close to the defs of its register dependencies. +/// Return true if the re-scheduling will put the given instruction too close +/// to the defs of its register dependencies. bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) { @@ -1004,10 +1000,9 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, return false; } -/// rescheduleKillAboveMI - If there is one more local instruction that reads -/// 'Reg' and it kills 'Reg, consider moving the kill instruction above the -/// current two-address instruction in order to eliminate the need for the -/// copy. +/// If there is one more local instruction that reads 'Reg' and it kills 'Reg', +/// consider moving the kill instruction above the current two-address +/// instruction in order to eliminate the need for the copy. bool TwoAddressInstructionPass:: rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1055,8 +1050,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, SmallSet<unsigned, 2> Kills; SmallSet<unsigned, 2> Defs; SmallSet<unsigned, 2> LiveDefs; - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = KillMI->getOperand(i); + for (const MachineOperand &MO : KillMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1094,8 +1088,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, // Don't move past calls, etc. return false; SmallVector<unsigned, 2> OtherDefs; - for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = OtherMI->getOperand(i); + for (const MachineOperand &MO : OtherMI->operands()) { if (!MO.isReg()) continue; unsigned MOReg = MO.getReg(); @@ -1155,13 +1148,68 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, return true; } -/// tryInstructionTransform - For the case where an instruction has a single -/// pair of tied register operands, attempt some transformations that may -/// either eliminate the tied operands or improve the opportunities for -/// coalescing away the register copy. Returns true if no copy needs to be -/// inserted to untie mi's operands (either because they were untied, or -/// because mi was rescheduled, and will be visited again later). If the -/// shouldOnlyCommute flag is true, only instruction commutation is attempted.
+/// Tries to commute the operand 'BaseOpIdx' and some other operand in the +/// given machine instruction to improve opportunities for coalescing and +/// elimination of a register to register copy. +/// +/// 'DstOpIdx' specifies the index of the MI def operand. +/// 'BaseOpKilled' specifies if the register associated with 'BaseOpIdx' +/// operand is killed by the given instruction. +/// The 'Dist' argument provides the distance of MI from the start of the +/// current basic block and it is used to determine if it is profitable +/// to commute operands in the instruction. +/// +/// Returns true if the transformation happened. Otherwise, returns false. +bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, + unsigned DstOpIdx, + unsigned BaseOpIdx, + bool BaseOpKilled, + unsigned Dist) { + unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg(); + unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg(); + unsigned OpsNum = MI->getDesc().getNumOperands(); + unsigned OtherOpIdx = MI->getDesc().getNumDefs(); + for (; OtherOpIdx < OpsNum; OtherOpIdx++) { + // The call of findCommutedOpIndices below only checks if BaseOpIdx + // and OtherOpIdx are commutable; it does not really search for + // other commutable operands and does not change the values of passed + // variables. + if (OtherOpIdx == BaseOpIdx || + !TII->findCommutedOpIndices(MI, BaseOpIdx, OtherOpIdx)) + continue; + + unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg(); + bool AggressiveCommute = false; + + // If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp + // operands. This makes the live ranges of DstOp and OtherOp joinable. + bool DoCommute = + !BaseOpKilled && isKilled(*MI, OtherOpReg, MRI, TII, LIS, false); + + if (!DoCommute && + isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) { + DoCommute = true; + AggressiveCommute = true; + } + + // If it's profitable to commute, try to do so. + if (DoCommute && commuteInstruction(MI, BaseOpIdx, OtherOpIdx, Dist)) { + ++NumCommuted; + if (AggressiveCommute) + ++NumAggrCommuted; + return true; + } + } + return false; +} + +/// For the case where an instruction has a single pair of tied register +/// operands, attempt some transformations that may either eliminate the tied +/// operands or improve the opportunities for coalescing away the register copy. +/// Returns true if no copy needs to be inserted to untie mi's operands +/// (either because they were untied, or because mi was rescheduled, and will +/// be visited again later). If the shouldOnlyCommute flag is true, only +/// instruction commutation is attempted. bool TwoAddressInstructionPass:: tryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1181,51 +1229,18 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, if (TargetRegisterInfo::isVirtualRegister(regA)) scanUses(regA); - // Check if it is profitable to commute the operands. - unsigned SrcOp1, SrcOp2; - unsigned regC = 0; - unsigned regCIdx = ~0U; - bool TryCommute = false; - bool AggressiveCommute = false; - if (MI.isCommutable() && MI.getNumOperands() >= 3 && - TII->findCommutedOpIndices(&MI, SrcOp1, SrcOp2)) { - if (SrcIdx == SrcOp1) - regCIdx = SrcOp2; - else if (SrcIdx == SrcOp2) - regCIdx = SrcOp1; - - if (regCIdx != ~0U) { - regC = MI.getOperand(regCIdx).getReg(); - if (!regBKilled && isKilled(MI, regC, MRI, TII, LIS, false)) - // If C dies but B does not, swap the B and C operands. - // This makes the live ranges of A and C joinable.
- TryCommute = true; - else if (isProfitableToCommute(regA, regB, regC, &MI, Dist)) { - TryCommute = true; - AggressiveCommute = true; - } - } - } + bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist); // If the instruction is convertible to 3 Addr, instead // of returning, try the 3 Addr transformation aggressively and // use this variable to check later, because it might be better. // For example, we can just use `leal (%rsi,%rdi), %eax` and `ret` // instead of the following code. - // addl %esi, %edi - // movl %edi, %eax + // addl %esi, %edi + // movl %edi, %eax // ret - bool Commuted = false; - - // If it's profitable to commute, try to do so. - if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) { - Commuted = true; - ++NumCommuted; - if (AggressiveCommute) - ++NumAggrCommuted; - if (!MI.isConvertibleTo3Addr()) - return false; - } + if (Commuted && !MI.isConvertibleTo3Addr()) + return false; if (shouldOnlyCommute) return false; @@ -1237,6 +1252,13 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, return true; } + // If we commuted, regB may have changed so we should re-sample it to avoid + // confusing the three address conversion below. + if (Commuted) { + regB = MI.getOperand(SrcIdx).getReg(); + regBKilled = isKilled(MI, regB, MRI, TII, LIS, true); + } + if (MI.isConvertibleTo3Addr()) { // This instruction is potentially convertible to a true // three-address instruction. Check if it is profitable. @@ -1348,10 +1370,9 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, SmallVector<unsigned, 4> OrigRegs; if (LIS) { - for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(), - MOE = MI.operands_end(); MOI != MOE; ++MOI) { - if (MOI->isReg()) - OrigRegs.push_back(MOI->getReg()); + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg()) + OrigRegs.push_back(MO.getReg()); } } @@ -1536,12 +1557,10 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, SrcRegMap[RegA] = RegB; } - if (AllUsesCopied) { if (!IsEarlyClobber) { // Replace other (un-tied) uses of regB with LastCopiedReg. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.getSubReg() == SubRegB && MO.isUse()) { if (MO.isKill()) { @@ -1578,8 +1597,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, // regB is still used in this instruction, but a kill flag was // removed from a different tied use of regB, so now we need to add // a kill flag to one of the remaining uses of regB. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (MachineOperand &MO : MI->operands()) { if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { MO.setIsKill(true); break; @@ -1588,8 +1606,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } } -/// runOnMachineFunction - Reduce two-address instructions to two operands. -/// +/// Reduce two-address instructions to two operands.
bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const TargetMachine &TM = MF->getTarget(); @@ -1599,7 +1616,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { InstrItins = MF->getSubtarget().getInstrItineraryData(); LV = getAnalysisIfAvailable<LiveVariables>(); LIS = getAnalysisIfAvailable<LiveIntervals>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); OptLevel = TM.getOptLevel(); bool MadeChange = false; @@ -1614,7 +1631,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { TiedOperandMap TiedOperands; for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { - MBB = MBBI; + MBB = &*MBBI; unsigned Dist = 0; DistanceMap.clear(); SrcRegMap.clear(); @@ -1661,8 +1678,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { unsigned DstReg = mi->getOperand(DstIdx).getReg(); if (SrcReg != DstReg && tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) { - // The tied operands have been eliminated or shifted further down the - // block to ease elimination. Continue processing with 'nmi'. + // The tied operands have been eliminated or shifted further down + // the block to ease elimination. Continue processing with 'nmi'. TiedOperands.clear(); mi = nmi; continue; @@ -1671,9 +1688,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { } // Now iterate over the information collected above. - for (TiedOperandMap::iterator OI = TiedOperands.begin(), - OE = TiedOperands.end(); OI != OE; ++OI) { - processTiedPairs(mi, OI->second, Dist); + for (auto &TO : TiedOperands) { + processTiedPairs(mi, TO.second, Dist); DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); } diff --git a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp index d393e10..8c9631e 100644 --- a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -71,8 +71,8 @@ bool UnreachableBlockElim::runOnFunction(Function &F) { // in them. std::vector<BasicBlock*> DeadBlocks; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) - if (!Reachable.count(I)) { - BasicBlock *BB = I; + if (!Reachable.count(&*I)) { + BasicBlock *BB = &*I; DeadBlocks.push_back(BB); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { PN->replaceAllUsesWith(Constant::getNullValue(PN->getType())); @@ -131,7 +131,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // in them. std::vector<MachineBasicBlock*> DeadBlocks; for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) { - MachineBasicBlock *BB = I; + MachineBasicBlock *BB = &*I; // Test for deadness. if (!Reachable.count(BB)) { @@ -167,7 +167,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // Cleanup PHI nodes. for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) { - MachineBasicBlock *BB = I; + MachineBasicBlock *BB = &*I; // Prune unneeded PHI entries. 
SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(), BB->pred_end()); diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp index 2912bdd..bf1c0dc 100644 --- a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp @@ -163,11 +163,12 @@ class VirtRegRewriter : public MachineFunctionPass { SlotIndexes *Indexes; LiveIntervals *LIS; VirtRegMap *VRM; - SparseSet<unsigned> PhysRegs; void rewrite(); void addMBBLiveIns(); bool readsUndefSubreg(const MachineOperand &MO) const; + void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const; + public: static char ID; VirtRegRewriter() : MachineFunctionPass(ID) {} @@ -237,10 +238,52 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) { return true; } +void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI, + unsigned PhysReg) const { + assert(!LI.empty()); + assert(LI.hasSubRanges()); + + typedef std::pair<const LiveInterval::SubRange *, + LiveInterval::const_iterator> SubRangeIteratorPair; + SmallVector<SubRangeIteratorPair, 4> SubRanges; + SlotIndex First; + SlotIndex Last; + for (const LiveInterval::SubRange &SR : LI.subranges()) { + SubRanges.push_back(std::make_pair(&SR, SR.begin())); + if (!First.isValid() || SR.segments.front().start < First) + First = SR.segments.front().start; + if (!Last.isValid() || SR.segments.back().end > Last) + Last = SR.segments.back().end; + } + + // Check all MBB start positions between First and Last while + // simultaneously advancing an iterator for each subrange. + for (SlotIndexes::MBBIndexIterator MBBI = Indexes->findMBBIndex(First); + MBBI != Indexes->MBBIndexEnd() && MBBI->first <= Last; ++MBBI) { + SlotIndex MBBBegin = MBBI->first; + // Advance all subrange iterators so that their end position is just + // behind MBBBegin (or the iterator is at the end). + LaneBitmask LaneMask = 0; + for (auto &RangeIterPair : SubRanges) { + const LiveInterval::SubRange *SR = RangeIterPair.first; + LiveInterval::const_iterator &SRI = RangeIterPair.second; + while (SRI != SR->end() && SRI->end <= MBBBegin) + ++SRI; + if (SRI == SR->end()) + continue; + if (SRI->start <= MBBBegin) + LaneMask |= SR->LaneMask; + } + if (LaneMask == 0) + continue; + MachineBasicBlock *MBB = MBBI->second; + MBB->addLiveIn(PhysReg, LaneMask); + } +} + // Compute MBB live-in lists from virtual register live ranges and their // assignments. void VirtRegRewriter::addMBBLiveIns() { - SmallVector<MachineBasicBlock*, 16> LiveIn; for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx); if (MRI->reg_nodbg_empty(VirtReg)) @@ -254,31 +297,18 @@ void VirtRegRewriter::addMBBLiveIns() { assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register."); if (LI.hasSubRanges()) { - for (LiveInterval::SubRange &S : LI.subranges()) { - for (const auto &Seg : S.segments) { - if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn)) - continue; - for (MCSubRegIndexIterator SR(PhysReg, TRI); SR.isValid(); ++SR) { - unsigned SubReg = SR.getSubReg(); - unsigned SubRegIndex = SR.getSubRegIndex(); - unsigned SubRegLaneMask = TRI->getSubRegIndexLaneMask(SubRegIndex); - if ((SubRegLaneMask & S.LaneMask) == 0) - continue; - for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) { - LiveIn[i]->addLiveIn(SubReg); - } - } - LiveIn.clear(); - } - } + addLiveInsForSubRanges(LI, PhysReg); } else { - // Scan the segments of LI.
- for (const auto &Seg : LI.segments) { - if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn)) - continue; - for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) - LiveIn[i]->addLiveIn(PhysReg); - LiveIn.clear(); + // Go over MBB begin positions and see if we have segments covering them. + // The following works because segments and the MBBIndex list are both + // sorted by slot indexes. + SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin(); + for (const auto &Seg : LI) { + I = Indexes->advanceMBBIndex(I, Seg.start); + for (; I != Indexes->MBBIndexEnd() && I->first < Seg.end; ++I) { + MachineBasicBlock *MBB = I->second; + MBB->addLiveIn(PhysReg); + } } } } @@ -305,7 +335,7 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const { assert(LI.liveAt(BaseIndex) && "Reads of completely dead register should be marked undef already"); unsigned SubRegIdx = MO.getSubReg(); - unsigned UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); // See if any of the relevant subregister live ranges is defined at this point. for (const LiveInterval::SubRange &SR : LI.subranges()) { if ((SR.LaneMask & UseMask) != 0 && SR.liveAt(BaseIndex)) @@ -319,54 +349,15 @@ void VirtRegRewriter::rewrite() { SmallVector<unsigned, 8> SuperDeads; SmallVector<unsigned, 8> SuperDefs; SmallVector<unsigned, 8> SuperKills; - SmallPtrSet<const MachineInstr *, 4> NoReturnInsts; - - // Here we have a SparseSet to hold which PhysRegs are actually encountered - // in the MF we are about to iterate over so that later when we call - // setPhysRegUsed, we are only doing it for physRegs that were actually found - // in the program and not for all of the possible physRegs for the given - // target architecture. If the target has a lot of physRegs, then for a small - // program there will be a significant compile time reduction here. - PhysRegs.clear(); - PhysRegs.setUniverse(TRI->getNumRegs()); - - // The function with uwtable should guarantee that the stack unwinder - // can unwind the stack to the previous frame. Thus, we can't apply the - // noreturn optimization if the caller function has uwtable attribute. - bool HasUWTable = MF->getFunction()->hasFnAttribute(Attribute::UWTable); for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end(); MBBI != MBBE; ++MBBI) { DEBUG(MBBI->print(dbgs(), Indexes)); - bool IsExitBB = MBBI->succ_empty(); for (MachineBasicBlock::instr_iterator MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) { - MachineInstr *MI = MII; + MachineInstr *MI = &*MII; ++MII; - // Check if this instruction is a call to a noreturn function. If this - // is a call to noreturn function and we don't need the stack unwinding - // functionality (i.e. this function does not have uwtable attribute and - // the callee function has the nounwind attribute), then we can ignore - // the definitions set by this instruction. - if (!HasUWTable && IsExitBB && MI->isCall()) { - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); MOI != MOE; ++MOI) { - MachineOperand &MO = *MOI; - if (!MO.isGlobal()) - continue; - const Function *Func = dyn_cast<Function>(MO.getGlobal()); - if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) || - // We need to keep correct unwind information - // even if the function will not return, since the - // runtime may need it.
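
The rewritten addMBBLiveIns() walk above relies on both the live-interval segments and the MBBIndex list being sorted by slot index, so a single forward pass with one shared iterator suffices. The same two-pointer idea on plain integers, as a self-contained model (numbers stand in for SlotIndexes; interval bounds are half-open like live range segments):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Sorted, non-overlapping [start, end) segments of one live range.
  std::vector<std::pair<int, int>> Segs = {{3, 9}, {14, 16}};
  // Sorted block start positions, the analogue of the MBBIndex list.
  std::vector<int> BlockStarts = {0, 4, 8, 12, 15};

  std::size_t I = 0; // Shared across segments, never rewinds.
  for (const auto &Seg : Segs) {
    // advanceMBBIndex(I, Seg.start): skip blocks starting before the segment.
    while (I < BlockStarts.size() && BlockStarts[I] < Seg.first)
      ++I;
    // Every block start covered by the segment gets a live-in entry.
    for (; I < BlockStarts.size() && BlockStarts[I] < Seg.second; ++I)
      std::printf("block at %d is live-in\n", BlockStarts[I]);
  }
  return 0; // Prints blocks 4, 8 and 15.
}
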
- !Func->hasFnAttribute(Attribute::NoUnwind)) - continue; - NoReturnInsts.insert(MI); - break; - } - } - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), MOE = MI->operands_end(); MOI != MOE; ++MOI) { MachineOperand &MO = *MOI; @@ -375,15 +366,6 @@ void VirtRegRewriter::rewrite() { if (MO.isRegMask()) MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); - // If we encounter a VirtReg or PhysReg then get at the PhysReg and add - // it to the physreg bitset. Later we use only the PhysRegs that were - // actually encountered in the MF to populate the MRI's used physregs. - if (MO.isReg() && MO.getReg()) - PhysRegs.insert( - TargetRegisterInfo::isVirtualRegister(MO.getReg()) ? - VRM->getPhys(MO.getReg()) : - MO.getReg()); - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; unsigned VirtReg = MO.getReg(); @@ -418,14 +400,6 @@ void VirtRegRewriter::rewrite() { MO.setIsUndef(true); } else if (!MO.isDead()) { assert(MO.isDef()); - // Things get tricky when we ran out of lane mask bits and - // merged multiple lanes into the overflow bit: In this case - // our subregister liveness tracking isn't precise and we can't - // know what subregister parts are undefined, fall back to the - // implicit super-register def then. - unsigned LaneMask = TRI->getSubRegIndexLaneMask(SubReg); - if (TargetRegisterInfo::isImpreciseLaneMask(LaneMask)) - SuperDefs.push_back(PhysReg); } } @@ -470,29 +444,5 @@ void VirtRegRewriter::rewrite() { } } } - - // Tell MRI about physical registers in use. - if (NoReturnInsts.empty()) { - for (SparseSet<unsigned>::iterator - RegI = PhysRegs.begin(), E = PhysRegs.end(); RegI != E; ++RegI) - if (!MRI->reg_nodbg_empty(*RegI)) - MRI->setPhysRegUsed(*RegI); - } else { - for (SparseSet<unsigned>::iterator - I = PhysRegs.begin(), E = PhysRegs.end(); I != E; ++I) { - unsigned Reg = *I; - if (MRI->reg_nodbg_empty(Reg)) - continue; - // Check if this register has a use that will impact the rest of the - // code. Uses in debug and noreturn instructions do not impact the - // generated code. 
- for (MachineInstr &It : MRI->reg_nodbg_instructions(Reg)) { - if (!NoReturnInsts.count(&It)) { - MRI->setPhysRegUsed(Reg); - break; - } - } - } - } } diff --git a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp index 0d26ed3..886c5f6 100644 --- a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -17,67 +17,44 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Triple.h" -#include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Analysis/LibCallSemantics.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/WinEHFuncInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include <memory> +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "winehprepare" -namespace { - -// This map is used to model frame variable usage during outlining, to -// construct a structure type to hold the frame variables in a frame -// allocation block, and to remap the frame variable allocas (including -// spill locations as needed) to GEPs that get the variable from the -// frame allocation structure. -typedef MapVector<Value *, TinyPtrVector<AllocaInst *>> FrameVarInfoMap; - -// TinyPtrVector cannot hold nullptr, so we need our own sentinel that isn't -// quite null. -AllocaInst *getCatchObjectSentinel() { - return static_cast<AllocaInst *>(nullptr) + 1; -} - -typedef SmallSet<BasicBlock *, 4> VisitedBlockSet; +static cl::opt<bool> DisableDemotion( + "disable-demotion", cl::Hidden, + cl::desc( + "Clone multicolor basic blocks but do not demote cross funclet values"), + cl::init(false)); -class LandingPadActions; -class LandingPadMap; - -typedef DenseMap<const BasicBlock *, CatchHandler *> CatchHandlerMapTy; -typedef DenseMap<const BasicBlock *, CleanupHandler *> CleanupHandlerMapTy; +static cl::opt<bool> DisableCleanups( + "disable-cleanups", cl::Hidden, + cl::desc("Do not remove implausible terminators or other similar cleanups"), + cl::init(false)); +namespace { + class WinEHPrepare : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. 
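The DisableDemotion and DisableCleanups switches introduced above follow the usual pattern for pass-local debugging knobs: a hidden cl::opt global, off by default, consulted at the point the stage would run (and reachable from any LLVM tool's command line, e.g. opt, or -mllvm under the clang driver). A minimal sketch of the same pattern with a hypothetical flag name:

    #include "llvm/Support/CommandLine.h"

    using namespace llvm;

    // Hypothetical switch in the style of -disable-demotion above: hidden
    // from -help, off by default, checked where the stage would run.
    static cl::opt<bool> DisableVerify(
        "disable-wineh-verify", cl::Hidden,
        cl::desc("Skip verification of prepared funclets"), cl::init(false));

    static void runStage() {
      if (DisableVerify)
        return; // stage gated off from the command line
      // ... the verification work would go here ...
    }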
- WinEHPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID) { - if (TM) - TheTriple = TM->getTargetTriple(); - } + WinEHPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID) {} bool runOnFunction(Function &Fn) override; @@ -90,264 +67,27 @@ public: } private: - bool prepareExceptionHandlers(Function &F, - SmallVectorImpl<LandingPadInst *> &LPads); - void identifyEHBlocks(Function &F, SmallVectorImpl<LandingPadInst *> &LPads); - void promoteLandingPadValues(LandingPadInst *LPad); - void demoteValuesLiveAcrossHandlers(Function &F, - SmallVectorImpl<LandingPadInst *> &LPads); - void findSEHEHReturnPoints(Function &F, - SetVector<BasicBlock *> &EHReturnBlocks); - void findCXXEHReturnPoints(Function &F, - SetVector<BasicBlock *> &EHReturnBlocks); - void getPossibleReturnTargets(Function *ParentF, Function *HandlerF, - SetVector<BasicBlock*> &Targets); - void completeNestedLandingPad(Function *ParentFn, - LandingPadInst *OutlinedLPad, - const LandingPadInst *OriginalLPad, - FrameVarInfoMap &VarInfo); - Function *createHandlerFunc(Function *ParentFn, Type *RetTy, - const Twine &Name, Module *M, Value *&ParentFP); - bool outlineHandler(ActionHandler *Action, Function *SrcFn, - LandingPadInst *LPad, BasicBlock *StartBB, - FrameVarInfoMap &VarInfo); - void addStubInvokeToHandlerIfNeeded(Function *Handler); - - void mapLandingPadBlocks(LandingPadInst *LPad, LandingPadActions &Actions); - CatchHandler *findCatchHandler(BasicBlock *BB, BasicBlock *&NextBB, - VisitedBlockSet &VisitedBlocks); - void findCleanupHandlers(LandingPadActions &Actions, BasicBlock *StartBB, - BasicBlock *EndBB); - - void processSEHCatchHandler(CatchHandler *Handler, BasicBlock *StartBB); - - Triple TheTriple; + void insertPHIStores(PHINode *OriginalPHI, AllocaInst *SpillSlot); + void + insertPHIStore(BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot, + SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist); + AllocaInst *insertPHILoads(PHINode *PN, Function &F); + void replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot, + DenseMap<BasicBlock *, Value *> &Loads, Function &F); + bool prepareExplicitEH(Function &F); + void colorFunclets(Function &F); + + void demotePHIsOnFunclets(Function &F); + void cloneCommonBlocks(Function &F); + void removeImplausibleInstructions(Function &F); + void cleanupPreparedFunclets(Function &F); + void verifyPreparedFunclets(Function &F); // All fields are reset by runOnFunction. - DominatorTree *DT = nullptr; - const TargetLibraryInfo *LibInfo = nullptr; EHPersonality Personality = EHPersonality::Unknown; - CatchHandlerMapTy CatchHandlerMap; - CleanupHandlerMapTy CleanupHandlerMap; - DenseMap<const LandingPadInst *, LandingPadMap> LPadMaps; - SmallPtrSet<BasicBlock *, 4> NormalBlocks; - SmallPtrSet<BasicBlock *, 4> EHBlocks; - SetVector<BasicBlock *> EHReturnBlocks; - - // This maps landing pad instructions found in outlined handlers to - // the landing pad instruction in the parent function from which they - // were cloned. The cloned/nested landing pad is used as the key - // because the landing pad may be cloned into multiple handlers. - // This map will be used to add the llvm.eh.actions call to the nested - // landing pads after all handlers have been outlined. - DenseMap<LandingPadInst *, const LandingPadInst *> NestedLPtoOriginalLP; - - // This maps blocks in the parent function which are destinations of - // catch handlers to cloned blocks in (other) outlined handlers. 
This - // handles the case where a nested landing pad has a catch handler that - // returns to a handler function rather than the parent function. - // The original block is used as the key here because there should only - // ever be one handler function from which the cloned block is not pruned. - // The original block will be pruned from the parent function after all - // handlers have been outlined. This map will be used to adjust the - // return instructions of handlers which return to the block that was - // outlined into a handler. This is done after all handlers have been - // outlined but before the outlined code is pruned from the parent function. - DenseMap<const BasicBlock *, BasicBlock *> LPadTargetBlocks; - - // Map from outlined handler to call to parent local address. Only used for - // 32-bit EH. - DenseMap<Function *, Value *> HandlerToParentFP; - - AllocaInst *SEHExceptionCodeSlot = nullptr; -}; - -class WinEHFrameVariableMaterializer : public ValueMaterializer { -public: - WinEHFrameVariableMaterializer(Function *OutlinedFn, Value *ParentFP, - FrameVarInfoMap &FrameVarInfo); - ~WinEHFrameVariableMaterializer() override {} - - Value *materializeValueFor(Value *V) override; - - void escapeCatchObject(Value *V); - -private: - FrameVarInfoMap &FrameVarInfo; - IRBuilder<> Builder; -}; - -class LandingPadMap { -public: - LandingPadMap() : OriginLPad(nullptr) {} - void mapLandingPad(const LandingPadInst *LPad); - - bool isInitialized() { return OriginLPad != nullptr; } - - bool isOriginLandingPadBlock(const BasicBlock *BB) const; - bool isLandingPadSpecificInst(const Instruction *Inst) const; - - void remapEHValues(ValueToValueMapTy &VMap, Value *EHPtrValue, - Value *SelectorValue) const; - -private: - const LandingPadInst *OriginLPad; - // We will normally only see one of each of these instructions, but - // if more than one occurs for some reason we can handle that.
- TinyPtrVector<const ExtractValueInst *> ExtractedEHPtrs; - TinyPtrVector<const ExtractValueInst *> ExtractedSelectors; -}; - -class WinEHCloningDirectorBase : public CloningDirector { -public: - WinEHCloningDirectorBase(Function *HandlerFn, Value *ParentFP, - FrameVarInfoMap &VarInfo, LandingPadMap &LPadMap) - : Materializer(HandlerFn, ParentFP, VarInfo), - SelectorIDType(Type::getInt32Ty(HandlerFn->getContext())), - Int8PtrType(Type::getInt8PtrTy(HandlerFn->getContext())), - LPadMap(LPadMap), ParentFP(ParentFP) {} - - CloningAction handleInstruction(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - - virtual CloningAction handleBeginCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) = 0; - virtual CloningAction handleEndCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) = 0; - virtual CloningAction handleTypeIdFor(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) = 0; - virtual CloningAction handleIndirectBr(ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) = 0; - virtual CloningAction handleInvoke(ValueToValueMapTy &VMap, - const InvokeInst *Invoke, - BasicBlock *NewBB) = 0; - virtual CloningAction handleResume(ValueToValueMapTy &VMap, - const ResumeInst *Resume, - BasicBlock *NewBB) = 0; - virtual CloningAction handleCompare(ValueToValueMapTy &VMap, - const CmpInst *Compare, - BasicBlock *NewBB) = 0; - virtual CloningAction handleLandingPad(ValueToValueMapTy &VMap, - const LandingPadInst *LPad, - BasicBlock *NewBB) = 0; - - ValueMaterializer *getValueMaterializer() override { return &Materializer; } - -protected: - WinEHFrameVariableMaterializer Materializer; - Type *SelectorIDType; - Type *Int8PtrType; - LandingPadMap &LPadMap; - - /// The value representing the parent frame pointer. 
- Value *ParentFP; -}; - -class WinEHCatchDirector : public WinEHCloningDirectorBase { -public: - WinEHCatchDirector( - Function *CatchFn, Value *ParentFP, Value *Selector, - FrameVarInfoMap &VarInfo, LandingPadMap &LPadMap, - DenseMap<LandingPadInst *, const LandingPadInst *> &NestedLPads, - DominatorTree *DT, SmallPtrSetImpl<BasicBlock *> &EHBlocks) - : WinEHCloningDirectorBase(CatchFn, ParentFP, VarInfo, LPadMap), - CurrentSelector(Selector->stripPointerCasts()), - ExceptionObjectVar(nullptr), NestedLPtoOriginalLP(NestedLPads), - DT(DT), EHBlocks(EHBlocks) {} - - CloningAction handleBeginCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleEndCatch(ValueToValueMapTy &VMap, const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleTypeIdFor(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleIndirectBr(ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) override; - CloningAction handleInvoke(ValueToValueMapTy &VMap, const InvokeInst *Invoke, - BasicBlock *NewBB) override; - CloningAction handleResume(ValueToValueMapTy &VMap, const ResumeInst *Resume, - BasicBlock *NewBB) override; - CloningAction handleCompare(ValueToValueMapTy &VMap, const CmpInst *Compare, - BasicBlock *NewBB) override; - CloningAction handleLandingPad(ValueToValueMapTy &VMap, - const LandingPadInst *LPad, - BasicBlock *NewBB) override; - - Value *getExceptionVar() { return ExceptionObjectVar; } - TinyPtrVector<BasicBlock *> &getReturnTargets() { return ReturnTargets; } - -private: - Value *CurrentSelector; - - Value *ExceptionObjectVar; - TinyPtrVector<BasicBlock *> ReturnTargets; - // This will be a reference to the field of the same name in the WinEHPrepare - // object which instantiates this WinEHCatchDirector object. 
- DenseMap<LandingPadInst *, const LandingPadInst *> &NestedLPtoOriginalLP; - DominatorTree *DT; - SmallPtrSetImpl<BasicBlock *> &EHBlocks; -}; - -class WinEHCleanupDirector : public WinEHCloningDirectorBase { -public: - WinEHCleanupDirector(Function *CleanupFn, Value *ParentFP, - FrameVarInfoMap &VarInfo, LandingPadMap &LPadMap) - : WinEHCloningDirectorBase(CleanupFn, ParentFP, VarInfo, - LPadMap) {} - - CloningAction handleBeginCatch(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleEndCatch(ValueToValueMapTy &VMap, const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleTypeIdFor(ValueToValueMapTy &VMap, - const Instruction *Inst, - BasicBlock *NewBB) override; - CloningAction handleIndirectBr(ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) override; - CloningAction handleInvoke(ValueToValueMapTy &VMap, const InvokeInst *Invoke, - BasicBlock *NewBB) override; - CloningAction handleResume(ValueToValueMapTy &VMap, const ResumeInst *Resume, - BasicBlock *NewBB) override; - CloningAction handleCompare(ValueToValueMapTy &VMap, const CmpInst *Compare, - BasicBlock *NewBB) override; - CloningAction handleLandingPad(ValueToValueMapTy &VMap, - const LandingPadInst *LPad, - BasicBlock *NewBB) override; -}; - -class LandingPadActions { -public: - LandingPadActions() : HasCleanupHandlers(false) {} - - void insertCatchHandler(CatchHandler *Action) { Actions.push_back(Action); } - void insertCleanupHandler(CleanupHandler *Action) { - Actions.push_back(Action); - HasCleanupHandlers = true; - } - - bool includesCleanup() const { return HasCleanupHandlers; } - - SmallVectorImpl<ActionHandler *> &actions() { return Actions; } - SmallVectorImpl<ActionHandler *>::iterator begin() { return Actions.begin(); } - SmallVectorImpl<ActionHandler *>::iterator end() { return Actions.end(); } - -private: - // Note that this class does not own the ActionHandler objects in this vector. - // The ActionHandlers are owned by the CatchHandlerMap and CleanupHandlerMap - // in the WinEHPrepare class. - SmallVector<ActionHandler *, 4> Actions; - bool HasCleanupHandlers; + DenseMap<BasicBlock *, ColorVector> BlockColors; + MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks; }; } // end anonymous namespace @@ -361,2536 +101,1122 @@ FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) { } bool WinEHPrepare::runOnFunction(Function &Fn) { - // No need to prepare outlined handlers. - if (Fn.hasFnAttribute("wineh-parent")) - return false; - - SmallVector<LandingPadInst *, 4> LPads; - SmallVector<ResumeInst *, 4> Resumes; - for (BasicBlock &BB : Fn) { - if (auto *LP = BB.getLandingPadInst()) - LPads.push_back(LP); - if (auto *Resume = dyn_cast<ResumeInst>(BB.getTerminator())) - Resumes.push_back(Resume); - } - - // No need to prepare functions that lack landing pads. - if (LPads.empty()) + if (!Fn.hasPersonalityFn()) return false; // Classify the personality to see what kind of preparation we need. Personality = classifyEHPersonality(Fn.getPersonalityFn()); - // Do nothing if this is not an MSVC personality. - if (!isMSVCEHPersonality(Personality)) + // Do nothing if this is not a funclet-based personality. + if (!isFuncletEHPersonality(Personality)) return false; - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - - // If there were any landing pads, prepareExceptionHandlers will make changes. 
- prepareExceptionHandlers(Fn, LPads); - return true; + return prepareExplicitEH(Fn); } bool WinEHPrepare::doFinalization(Module &M) { return false; } -void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); -} - -static bool isSelectorDispatch(BasicBlock *BB, BasicBlock *&CatchHandler, - Constant *&Selector, BasicBlock *&NextBB); - -// Finds blocks reachable from the starting set Worklist. Does not follow unwind -// edges or blocks listed in StopPoints. -static void findReachableBlocks(SmallPtrSetImpl<BasicBlock *> &ReachableBBs, - SetVector<BasicBlock *> &Worklist, - const SetVector<BasicBlock *> *StopPoints) { - while (!Worklist.empty()) { - BasicBlock *BB = Worklist.pop_back_val(); - - // Don't cross blocks that we should stop at. - if (StopPoints && StopPoints->count(BB)) - continue; - - if (!ReachableBBs.insert(BB).second) - continue; // Already visited. - - // Don't follow unwind edges of invokes. - if (auto *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - Worklist.insert(II->getNormalDest()); - continue; - } - - // Otherwise, follow all successors. - Worklist.insert(succ_begin(BB), succ_end(BB)); - } -} - -// Attempt to find an instruction where a block can be split before -// a call to llvm.eh.begincatch and its operands. If the block -// begins with the begincatch call or one of its adjacent operands -// the block will not be split. -static Instruction *findBeginCatchSplitPoint(BasicBlock *BB, - IntrinsicInst *II) { - // If the begincatch call is already the first instruction in the block, - // don't split. - Instruction *FirstNonPHI = BB->getFirstNonPHI(); - if (II == FirstNonPHI) - return nullptr; - - // If either operand is in the same basic block as the instruction and - // isn't used by another instruction before the begincatch call, include it - // in the split block. - auto *Op0 = dyn_cast<Instruction>(II->getOperand(0)); - auto *Op1 = dyn_cast<Instruction>(II->getOperand(1)); - - Instruction *I = II->getPrevNode(); - Instruction *LastI = II; - - while (I == Op0 || I == Op1) { - // If the block begins with one of the operands and there are no other - // instructions between the operand and the begincatch call, don't split. - if (I == FirstNonPHI) - return nullptr; - - LastI = I; - I = I->getPrevNode(); - } - - // If there is at least one instruction in the block before the begincatch - // call and its operands, split the block at either the begincatch or - // its operand. - return LastI; -} +void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {} -/// Find all points where exceptional control rejoins normal control flow via -/// llvm.eh.endcatch. Add them to the normal bb reachability worklist. -void WinEHPrepare::findCXXEHReturnPoints( - Function &F, SetVector<BasicBlock *> &EHReturnBlocks) { - for (auto BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - for (Instruction &I : *BB) { - if (match(&I, m_Intrinsic<Intrinsic::eh_begincatch>())) { - Instruction *SplitPt = - findBeginCatchSplitPoint(BB, cast<IntrinsicInst>(&I)); - if (SplitPt) { - // Split the block before the llvm.eh.begincatch call to allow - // cleanup and catch code to be distinguished later. - // Do not update BBI because we still need to process the - // portion of the block that we are splitting off. 
- SplitBlock(BB, SplitPt, DT); - break; - } - } - if (match(&I, m_Intrinsic<Intrinsic::eh_endcatch>())) { - // Split the block after the call to llvm.eh.endcatch if there is - // anything other than an unconditional branch, or if the successor - // starts with a phi. - auto *Br = dyn_cast<BranchInst>(I.getNextNode()); - if (!Br || !Br->isUnconditional() || - isa<PHINode>(Br->getSuccessor(0)->begin())) { - DEBUG(dbgs() << "splitting block " << BB->getName() - << " with llvm.eh.endcatch\n"); - BBI = SplitBlock(BB, I.getNextNode(), DT); - } - // The next BB is normal control flow. - EHReturnBlocks.insert(BB->getTerminator()->getSuccessor(0)); - break; - } - } - } -} - -static bool isCatchAllLandingPad(const BasicBlock *BB) { - const LandingPadInst *LP = BB->getLandingPadInst(); - if (!LP) - return false; - unsigned N = LP->getNumClauses(); - return (N > 0 && LP->isCatch(N - 1) && - isa<ConstantPointerNull>(LP->getClause(N - 1))); +static int addUnwindMapEntry(WinEHFuncInfo &FuncInfo, int ToState, + const BasicBlock *BB) { + CxxUnwindMapEntry UME; + UME.ToState = ToState; + UME.Cleanup = BB; + FuncInfo.CxxUnwindMap.push_back(UME); + return FuncInfo.getLastStateNumber(); } -/// Find all points where exceptional control rejoins normal control flow via -/// selector dispatch. -void WinEHPrepare::findSEHEHReturnPoints( - Function &F, SetVector<BasicBlock *> &EHReturnBlocks) { - for (auto BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - // If the landingpad is a catch-all, treat the whole lpad as if it is - // reachable from normal control flow. - // FIXME: This is imprecise. We need a better way of identifying where a - // catch-all starts and cleanups stop. As far as LLVM is concerned, there - // is no difference. - if (isCatchAllLandingPad(BB)) { - EHReturnBlocks.insert(BB); - continue; - } - - BasicBlock *CatchHandler; - BasicBlock *NextBB; - Constant *Selector; - if (isSelectorDispatch(BB, CatchHandler, Selector, NextBB)) { - // Split the edge if there are multiple predecessors. This creates a place - // where we can insert EH recovery code.
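Each CxxUnwindMapEntry produced by the new addUnwindMapEntry records only a ToState parent, so the states form a forest: conceptually, unwinding from a state runs that state's cleanup and then moves to its parent until -1 (the caller) is reached. A toy model of that walk (illustrative only; the real runtime stops at the catching state rather than always reaching -1):

    #include <cstdio>
    #include <vector>

    // ToState[s] is the parent of state s; -1 means "unwound out to the caller".
    static void unwindFrom(const std::vector<int> &ToState, int S) {
      for (; S != -1; S = ToState[S])
        std::printf("run actions for state %d\n", S);
    }

    int main() {
      // 0 = outer try body, 1 = a cleanup nested in it, 2 = inner try body.
      std::vector<int> ToState = {-1, 0, 1};
      unwindFrom(ToState, 2); // visits 2, then 1, then 0
    }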
- if (!CatchHandler->getSinglePredecessor()) { - DEBUG(dbgs() << "splitting EH return edge from " << BB->getName() - << " to " << CatchHandler->getName() << '\n'); - BBI = CatchHandler = SplitCriticalEdge( - BB, std::find(succ_begin(BB), succ_end(BB), CatchHandler)); - } - EHReturnBlocks.insert(CatchHandler); - } +static void addTryBlockMapEntry(WinEHFuncInfo &FuncInfo, int TryLow, + int TryHigh, int CatchHigh, + ArrayRef<const CatchPadInst *> Handlers) { + WinEHTryBlockMapEntry TBME; + TBME.TryLow = TryLow; + TBME.TryHigh = TryHigh; + TBME.CatchHigh = CatchHigh; + assert(TBME.TryLow <= TBME.TryHigh); + for (const CatchPadInst *CPI : Handlers) { + WinEHHandlerType HT; + Constant *TypeInfo = cast<Constant>(CPI->getArgOperand(0)); + if (TypeInfo->isNullValue()) + HT.TypeDescriptor = nullptr; + else + HT.TypeDescriptor = cast<GlobalVariable>(TypeInfo->stripPointerCasts()); + HT.Adjectives = cast<ConstantInt>(CPI->getArgOperand(1))->getZExtValue(); + HT.Handler = CPI->getParent(); + if (auto *AI = + dyn_cast<AllocaInst>(CPI->getArgOperand(2)->stripPointerCasts())) + HT.CatchObj.Alloca = AI; + else + HT.CatchObj.Alloca = nullptr; + TBME.HandlerArray.push_back(HT); } + FuncInfo.TryBlockMap.push_back(TBME); } -void WinEHPrepare::identifyEHBlocks(Function &F, - SmallVectorImpl<LandingPadInst *> &LPads) { - DEBUG(dbgs() << "Demoting values live across exception handlers in function " - << F.getName() << '\n'); - - // Build a set of all non-exceptional blocks and exceptional blocks. - // - Non-exceptional blocks are blocks reachable from the entry block while - // not following invoke unwind edges. - // - Exceptional blocks are blocks reachable from landingpads. Analysis does - // not follow llvm.eh.endcatch blocks, which mark a transition from - // exceptional to normal control. - - if (Personality == EHPersonality::MSVC_CXX) - findCXXEHReturnPoints(F, EHReturnBlocks); - else - findSEHEHReturnPoints(F, EHReturnBlocks); - - DEBUG({ - dbgs() << "identified the following blocks as EH return points:\n"; - for (BasicBlock *BB : EHReturnBlocks) - dbgs() << " " << BB->getName() << '\n'; - }); - -// Join points should not have phis at this point, unless they are a -// landingpad, in which case we will demote their phis later. -#ifndef NDEBUG - for (BasicBlock *BB : EHReturnBlocks) - assert((BB->isLandingPad() || !isa<PHINode>(BB->begin())) && - "non-lpad EH return block has phi"); -#endif - - // Normal blocks are the blocks reachable from the entry block and all EH - // return points. - SetVector<BasicBlock *> Worklist; - Worklist = EHReturnBlocks; - Worklist.insert(&F.getEntryBlock()); - findReachableBlocks(NormalBlocks, Worklist, nullptr); - DEBUG({ - dbgs() << "marked the following blocks as normal:\n"; - for (BasicBlock *BB : NormalBlocks) - dbgs() << " " << BB->getName() << '\n'; - }); - - // Exceptional blocks are the blocks reachable from landingpads that don't - // cross EH return points. 
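addTryBlockMapEntry above captures the other half of the C++ scheme: a try is described by the state interval [TryLow, TryHigh] for its body, and dispatch asks which try intervals contain the faulting state. A toy lookup over such entries (a simplified model, not the MSVC runtime):

    #include <cstdio>
    #include <vector>

    struct TryBlock { int TryLow, TryHigh; const char *CatchName; };

    // Entries are pushed innermost-first (inner tries receive lower state
    // numbers), so the first interval containing S is the innermost try.
    static const char *findCatch(const std::vector<TryBlock> &Map, int S) {
      for (const TryBlock &TB : Map)
        if (TB.TryLow <= S && S <= TB.TryHigh)
          return TB.CatchName;
      return nullptr;
    }

    int main() {
      std::vector<TryBlock> Map = {{1, 1, "inner catch"}, {0, 2, "outer catch"}};
      std::printf("state 1 -> %s\n", findCatch(Map, 1)); // inner catch
      std::printf("state 0 -> %s\n", findCatch(Map, 0)); // outer catch
    }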
- Worklist.clear(); - for (auto *LPI : LPads) - Worklist.insert(LPI->getParent()); - findReachableBlocks(EHBlocks, Worklist, &EHReturnBlocks); - DEBUG({ - dbgs() << "marked the following blocks as exceptional:\n"; - for (BasicBlock *BB : EHBlocks) - dbgs() << " " << BB->getName() << '\n'; - }); - +static BasicBlock *getCleanupRetUnwindDest(const CleanupPadInst *CleanupPad) { + for (const User *U : CleanupPad->users()) + if (const auto *CRI = dyn_cast<CleanupReturnInst>(U)) + return CRI->getUnwindDest(); + return nullptr; } -/// Ensure that all values live into and out of exception handlers are stored -/// in memory. -/// FIXME: This falls down when values are defined in one handler and live into -/// another handler. For example, a cleanup defines a value used only by a -/// catch handler. -void WinEHPrepare::demoteValuesLiveAcrossHandlers( - Function &F, SmallVectorImpl<LandingPadInst *> &LPads) { - DEBUG(dbgs() << "Demoting values live across exception handlers in function " - << F.getName() << '\n'); - - // identifyEHBlocks() should have been called before this function. - assert(!NormalBlocks.empty()); - - // Try to avoid demoting EH pointer and selector values. They get in the way - // of our pattern matching. - SmallPtrSet<Instruction *, 10> EHVals; - for (BasicBlock &BB : F) { - LandingPadInst *LP = BB.getLandingPadInst(); - if (!LP) +static void calculateStateNumbersForInvokes(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + auto *F = const_cast<Function *>(Fn); + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(*F); + for (BasicBlock &BB : *F) { + auto *II = dyn_cast<InvokeInst>(BB.getTerminator()); + if (!II) continue; - EHVals.insert(LP); - for (User *U : LP->users()) { - auto *EI = dyn_cast<ExtractValueInst>(U); - if (!EI) - continue; - EHVals.insert(EI); - for (User *U2 : EI->users()) { - if (auto *PN = dyn_cast<PHINode>(U2)) - EHVals.insert(PN); - } - } - } - SetVector<Argument *> ArgsToDemote; - SetVector<Instruction *> InstrsToDemote; - for (BasicBlock &BB : F) { - bool IsNormalBB = NormalBlocks.count(&BB); - bool IsEHBB = EHBlocks.count(&BB); - if (!IsNormalBB && !IsEHBB) - continue; // Blocks that are neither normal nor EH are unreachable. - for (Instruction &I : BB) { - for (Value *Op : I.operands()) { - // Don't demote static allocas, constants, and labels. - if (isa<Constant>(Op) || isa<BasicBlock>(Op) || isa<InlineAsm>(Op)) - continue; - auto *AI = dyn_cast<AllocaInst>(Op); - if (AI && AI->isStaticAlloca()) - continue; - - if (auto *Arg = dyn_cast<Argument>(Op)) { - if (IsEHBB) { - DEBUG(dbgs() << "Demoting argument " << *Arg - << " used by EH instr: " << I << "\n"); - ArgsToDemote.insert(Arg); - } - continue; - } - - // Don't demote EH values. - auto *OpI = cast<Instruction>(Op); - if (EHVals.count(OpI)) - continue; - - BasicBlock *OpBB = OpI->getParent(); - // If a value is produced and consumed in the same BB, we don't need to - // demote it. 
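Demotion itself is the classic reg-to-stack transform the removed code performed with DemoteRegToStack: the SSA value is stored to a dedicated alloca at its definition and reloaded at each use, so no value has to stay live in a register across a handler boundary. A sketch of applying it wholesale, using the same utility the code above calls against the LLVM API of this era (the helper name and the "demote everything cross-block" policy are ours, for illustration):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"
    #include <vector>

    using namespace llvm;

    // Hypothetical helper: force every value with a use in another basic
    // block through a stack slot. Collect first, then mutate, since
    // DemoteRegToStack rewrites the use lists we would be iterating.
    static void demoteCrossBlockValues(Function &F) {
      Instruction *AllocaInsertPt = &*F.getEntryBlock().getFirstInsertionPt();
      std::vector<Instruction *> Worklist;
      for (BasicBlock &BB : F)
        for (Instruction &I : BB) {
          if (isa<PHINode>(&I))
            continue; // PHIs would go through DemotePHIToStack instead
          for (User *U : I.users())
            if (auto *UI = dyn_cast<Instruction>(U))
              if (UI->getParent() != &BB) {
                Worklist.push_back(&I);
                break;
              }
        }
      for (Instruction *I : Worklist)
        DemoteRegToStack(*I, /*VolatileLoads=*/false, AllocaInsertPt);
    }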
- if (OpBB == &BB) - continue; - bool IsOpNormalBB = NormalBlocks.count(OpBB); - bool IsOpEHBB = EHBlocks.count(OpBB); - if (IsNormalBB != IsOpNormalBB || IsEHBB != IsOpEHBB) { - DEBUG({ - dbgs() << "Demoting instruction live in-out from EH:\n"; - dbgs() << "Instr: " << *OpI << '\n'; - dbgs() << "User: " << I << '\n'; - }); - InstrsToDemote.insert(OpI); - } - } + auto &BBColors = BlockColors[&BB]; + assert(BBColors.size() == 1 && "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + + BasicBlock *FuncletUnwindDest; + auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI()); + assert(FuncletPad || FuncletEntryBB == &Fn->getEntryBlock()); + if (!FuncletPad) + FuncletUnwindDest = nullptr; + else if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad)) + FuncletUnwindDest = CatchPad->getCatchSwitch()->getUnwindDest(); + else if (auto *CleanupPad = dyn_cast<CleanupPadInst>(FuncletPad)) + FuncletUnwindDest = getCleanupRetUnwindDest(CleanupPad); + else + llvm_unreachable("unexpected funclet pad!"); + + BasicBlock *InvokeUnwindDest = II->getUnwindDest(); + int BaseState = -1; + if (FuncletUnwindDest == InvokeUnwindDest) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; } - } - // Demote values live into and out of handlers. - // FIXME: This demotion is inefficient. We should insert spills at the point - // of definition, insert one reload in each handler that uses the value, and - // insert reloads in the BB used to rejoin normal control flow. - Instruction *AllocaInsertPt = F.getEntryBlock().getFirstInsertionPt(); - for (Instruction *I : InstrsToDemote) - DemoteRegToStack(*I, false, AllocaInsertPt); - - // Demote arguments separately, and only for uses in EH blocks. - for (Argument *Arg : ArgsToDemote) { - auto *Slot = new AllocaInst(Arg->getType(), nullptr, - Arg->getName() + ".reg2mem", AllocaInsertPt); - SmallVector<User *, 4> Users(Arg->user_begin(), Arg->user_end()); - for (User *U : Users) { - auto *I = dyn_cast<Instruction>(U); - if (I && EHBlocks.count(I->getParent())) { - auto *Reload = new LoadInst(Slot, Arg->getName() + ".reload", false, I); - U->replaceUsesOfWith(Arg, Reload); - } + if (BaseState != -1) { + FuncInfo.InvokeStateMap[II] = BaseState; + } else { + Instruction *PadInst = InvokeUnwindDest->getFirstNonPHI(); + assert(FuncInfo.EHPadStateMap.count(PadInst) && "EH Pad has no state!"); + FuncInfo.InvokeStateMap[II] = FuncInfo.EHPadStateMap[PadInst]; } - new StoreInst(Arg, Slot, AllocaInsertPt); - } - - // Demote landingpad phis, as the landingpad will be removed from the machine - // CFG. - for (LandingPadInst *LPI : LPads) { - BasicBlock *BB = LPI->getParent(); - while (auto *Phi = dyn_cast<PHINode>(BB->begin())) - DemotePHIToStack(Phi, AllocaInsertPt); } - - DEBUG(dbgs() << "Demoted " << InstrsToDemote.size() << " instructions and " - << ArgsToDemote.size() << " arguments for WinEHPrepare\n\n"); } -bool WinEHPrepare::prepareExceptionHandlers( - Function &F, SmallVectorImpl<LandingPadInst *> &LPads) { - // Don't run on functions that are already prepared. 
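The invoke-state logic above leans on colorEHFunclets: every block is tagged with the funclet entries that can reach it without crossing into another funclet, and preparation must first clone any block that ends up with more than one color (hence the single-color assert). A standalone toy of the coloring idea on a string-keyed CFG (purely illustrative):

    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      // Toy CFG: "shared" is reachable from the main entry and from a pad.
      std::map<std::string, std::vector<std::string>> Succs = {
          {"entry", {"shared"}}, {"pad", {"shared"}}, {"shared", {}}};
      std::set<std::string> FuncletEntries = {"entry", "pad"};

      // Propagate each funclet entry's color along edges that do not enter
      // another funclet; a block with 2+ colors must be cloned per funclet.
      std::map<std::string, std::set<std::string>> Colors;
      std::vector<std::pair<std::string, std::string>> WL;
      for (const std::string &E : FuncletEntries)
        WL.push_back(std::make_pair(E, E));
      while (!WL.empty()) {
        std::pair<std::string, std::string> Item = WL.back();
        WL.pop_back();
        if (!Colors[Item.first].insert(Item.second).second)
          continue; // this block already had this color
        for (const std::string &S : Succs[Item.first])
          if (!FuncletEntries.count(S))
            WL.push_back(std::make_pair(S, Item.second));
      }
      for (const auto &BC : Colors)
        std::printf("%s: %zu color(s)%s\n", BC.first.c_str(), BC.second.size(),
                    BC.second.size() > 1 ? " -> clone" : "");
    }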
- for (LandingPadInst *LPad : LPads) { - BasicBlock *LPadBB = LPad->getParent(); - for (Instruction &Inst : *LPadBB) - if (match(&Inst, m_Intrinsic<Intrinsic::eh_actions>())) - return false; - } - - identifyEHBlocks(F, LPads); - demoteValuesLiveAcrossHandlers(F, LPads); - - // These containers are used to re-map frame variables that are used in - // outlined catch and cleanup handlers. They will be populated as the - // handlers are outlined. - FrameVarInfoMap FrameVarInfo; - - bool HandlersOutlined = false; - - Module *M = F.getParent(); - LLVMContext &Context = M->getContext(); - - // Create a new function to receive the handler contents. - PointerType *Int8PtrType = Type::getInt8PtrTy(Context); - Type *Int32Type = Type::getInt32Ty(Context); - Function *ActionIntrin = Intrinsic::getDeclaration(M, Intrinsic::eh_actions); - - if (isAsynchronousEHPersonality(Personality)) { - // FIXME: Switch the ehptr type to i32 and then switch this. - SEHExceptionCodeSlot = - new AllocaInst(Int8PtrType, nullptr, "seh_exception_code", - F.getEntryBlock().getFirstInsertionPt()); +// Given BB which ends in an unwind edge, return the EHPad that this BB belongs +// to. If the unwind edge came from an invoke, return null. +static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB, + Value *ParentPad) { + const TerminatorInst *TI = BB->getTerminator(); + if (isa<InvokeInst>(TI)) + return nullptr; + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) { + if (CatchSwitch->getParentPad() != ParentPad) + return nullptr; + return BB; } + assert(!TI->isEHPad() && "unexpected EHPad!"); + auto *CleanupPad = cast<CleanupReturnInst>(TI)->getCleanupPad(); + if (CleanupPad->getParentPad() != ParentPad) + return nullptr; + return CleanupPad->getParent(); +} - // In order to handle the case where one outlined catch handler returns - // to a block within another outlined catch handler that would otherwise - // be unreachable, we need to outline the nested landing pad before we - // outline the landing pad which encloses it. - if (!isAsynchronousEHPersonality(Personality)) - std::sort(LPads.begin(), LPads.end(), - [this](LandingPadInst *const &L, LandingPadInst *const &R) { - return DT->properlyDominates(R->getParent(), L->getParent()); - }); - - // This container stores the llvm.eh.recover and IndirectBr instructions - // that make up the body of each landing pad after it has been outlined. - // We need to defer the population of the target list for the indirectbr - // until all landing pads have been outlined so that we can handle the - // case of blocks in the target that are reached only from nested - // landing pads. - SmallVector<std::pair<CallInst*, IndirectBrInst *>, 4> LPadImpls; - - for (LandingPadInst *LPad : LPads) { - // Look for evidence that this landingpad has already been processed. - bool LPadHasActionList = false; - BasicBlock *LPadBB = LPad->getParent(); - for (Instruction &Inst : *LPadBB) { - if (match(&Inst, m_Intrinsic<Intrinsic::eh_actions>())) { - LPadHasActionList = true; - break; - } - } - - // If we've already outlined the handlers for this landingpad, - // there's nothing more to do here. - if (LPadHasActionList) - continue; - - // If either of the values in the aggregate returned by the landing pad is - // extracted and stored to memory, promote the stored value to a register. 
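Promotion back out of memory is the mirror image, and is what the removed promoteLandingPadValues did with PromoteMemToReg: pick the allocas whose values should live in registers again and let mem2reg rebuild SSA form. A sketch with the same utilities (the helper and its "everything in the entry block" policy are illustrative):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/PromoteMemToReg.h"

    using namespace llvm;

    // Hypothetical helper: promote every promotable static alloca in the
    // entry block back into SSA values.
    static void promoteEntryAllocas(Function &F, DominatorTree &DT) {
      SmallVector<AllocaInst *, 8> Allocas;
      for (Instruction &I : F.getEntryBlock())
        if (auto *AI = dyn_cast<AllocaInst>(&I))
          if (isAllocaPromotable(AI))
            Allocas.push_back(AI);
      if (!Allocas.empty())
        PromoteMemToReg(Allocas, DT); // mem2reg: loads/stores become SSA defs/uses
    }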
- promoteLandingPadValues(LPad); - - LandingPadActions Actions; - mapLandingPadBlocks(LPad, Actions); - - HandlersOutlined |= !Actions.actions().empty(); - for (ActionHandler *Action : Actions) { - if (Action->hasBeenProcessed()) - continue; - BasicBlock *StartBB = Action->getStartBlock(); - - // SEH doesn't do any outlining for catches. Instead, pass the handler - // basic block addr to llvm.eh.actions and list the block as a return - // target. - if (isAsynchronousEHPersonality(Personality)) { - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - processSEHCatchHandler(CatchAction, StartBB); - continue; - } - } - - outlineHandler(Action, &F, LPad, StartBB, FrameVarInfo); - } - - // Split the block after the landingpad instruction so that it is just a - // call to llvm.eh.actions followed by indirectbr. - assert(!isa<PHINode>(LPadBB->begin()) && "lpad phi not removed"); - SplitBlock(LPadBB, LPad->getNextNode(), DT); - // Erase the branch inserted by the split so we can insert indirectbr. - LPadBB->getTerminator()->eraseFromParent(); - - // Replace all extracted values with undef and ultimately replace the - // landingpad with undef. - SmallVector<Instruction *, 4> SEHCodeUses; - SmallVector<Instruction *, 4> EHUndefs; - for (User *U : LPad->users()) { - auto *E = dyn_cast<ExtractValueInst>(U); - if (!E) - continue; - assert(E->getNumIndices() == 1 && - "Unexpected operation: extracting both landing pad values"); - unsigned Idx = *E->idx_begin(); - assert((Idx == 0 || Idx == 1) && "unexpected index"); - if (Idx == 0 && isAsynchronousEHPersonality(Personality)) - SEHCodeUses.push_back(E); - else - EHUndefs.push_back(E); - } - for (Instruction *E : EHUndefs) { - E->replaceAllUsesWith(UndefValue::get(E->getType())); - E->eraseFromParent(); - } - LPad->replaceAllUsesWith(UndefValue::get(LPad->getType())); - - // Rewrite uses of the exception pointer to loads of an alloca. - while (!SEHCodeUses.empty()) { - Instruction *E = SEHCodeUses.pop_back_val(); - SmallVector<Use *, 4> Uses; - for (Use &U : E->uses()) - Uses.push_back(&U); - for (Use *U : Uses) { - auto *I = cast<Instruction>(U->getUser()); - if (isa<ResumeInst>(I)) - continue; - if (auto *Phi = dyn_cast<PHINode>(I)) - SEHCodeUses.push_back(Phi); - else - U->set(new LoadInst(SEHExceptionCodeSlot, "sehcode", false, I)); - } - E->replaceAllUsesWith(UndefValue::get(E->getType())); - E->eraseFromParent(); - } - - // Add a call to describe the actions for this landing pad. - std::vector<Value *> ActionArgs; - for (ActionHandler *Action : Actions) { - // Action codes from docs are: 0 cleanup, 1 catch. - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - ActionArgs.push_back(ConstantInt::get(Int32Type, 1)); - ActionArgs.push_back(CatchAction->getSelector()); - // Find the frame escape index of the exception object alloca in the - // parent. 
- int FrameEscapeIdx = -1; - Value *EHObj = const_cast<Value *>(CatchAction->getExceptionVar()); - if (EHObj && !isa<ConstantPointerNull>(EHObj)) { - auto I = FrameVarInfo.find(EHObj); - assert(I != FrameVarInfo.end() && - "failed to map llvm.eh.begincatch var"); - FrameEscapeIdx = std::distance(FrameVarInfo.begin(), I); - } - ActionArgs.push_back(ConstantInt::get(Int32Type, FrameEscapeIdx)); - } else { - ActionArgs.push_back(ConstantInt::get(Int32Type, 0)); - } - ActionArgs.push_back(Action->getHandlerBlockOrFunc()); - } - CallInst *Recover = - CallInst::Create(ActionIntrin, ActionArgs, "recover", LPadBB); - - SetVector<BasicBlock *> ReturnTargets; - for (ActionHandler *Action : Actions) { - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - const auto &CatchTargets = CatchAction->getReturnTargets(); - ReturnTargets.insert(CatchTargets.begin(), CatchTargets.end()); - } - } - IndirectBrInst *Branch = - IndirectBrInst::Create(Recover, ReturnTargets.size(), LPadBB); - for (BasicBlock *Target : ReturnTargets) - Branch->addDestination(Target); - - if (!isAsynchronousEHPersonality(Personality)) { - // C++ EH must repopulate the targets later to handle the case of - // targets that are reached indirectly through nested landing pads. - LPadImpls.push_back(std::make_pair(Recover, Branch)); - } - - } // End for each landingpad +static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, + const Instruction *FirstNonPHI, + int ParentState) { + const BasicBlock *BB = FirstNonPHI->getParent(); + assert(BB->isEHPad() && "not a funclet!"); - // If nothing got outlined, there is no more processing to be done. - if (!HandlersOutlined) - return false; + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) { + assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 && + "shouldn't revist catch funclets!"); - // Replace any nested landing pad stubs with the correct action handler. - // This must be done before we remove unreachable blocks because it - // cleans up references to outlined blocks that will be deleted. - for (auto &LPadPair : NestedLPtoOriginalLP) - completeNestedLandingPad(&F, LPadPair.first, LPadPair.second, FrameVarInfo); - NestedLPtoOriginalLP.clear(); - - // Update the indirectbr instructions' target lists if necessary. - SetVector<BasicBlock*> CheckedTargets; - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - for (auto &LPadImplPair : LPadImpls) { - IntrinsicInst *Recover = cast<IntrinsicInst>(LPadImplPair.first); - IndirectBrInst *Branch = LPadImplPair.second; - - // Get a list of handlers called by - parseEHActions(Recover, ActionList); - - // Add an indirect branch listing possible successors of the catch handlers. - SetVector<BasicBlock *> ReturnTargets; - for (const auto &Action : ActionList) { - if (auto *CA = dyn_cast<CatchHandler>(Action.get())) { - Function *Handler = cast<Function>(CA->getHandlerBlockOrFunc()); - getPossibleReturnTargets(&F, Handler, ReturnTargets); - } + SmallVector<const CatchPadInst *, 2> Handlers; + for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) { + auto *CatchPad = cast<CatchPadInst>(CatchPadBB->getFirstNonPHI()); + Handlers.push_back(CatchPad); } - ActionList.clear(); - // Clear any targets we already knew about. 
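For reference, the operand list assembled into llvm.eh.actions by the removed code is a flat encoding: a catch contributes (1, selector, catch-object frame-escape index, handler), with -1 when there is no catch object, and a cleanup contributes (0, handler). A toy decoder over that layout, with ints standing in for the selector and handler values:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Decode the flattened action list: 1 = catch (3 payload slots),
    // 0 = cleanup (1 payload slot).
    static void parseActions(const std::vector<int> &A) {
      for (std::size_t I = 0; I < A.size();) {
        if (A[I] == 1) {
          std::printf("catch: selector=%d frameIdx=%d handler=%d\n",
                      A[I + 1], A[I + 2], A[I + 3]);
          I += 4;
        } else {
          std::printf("cleanup: handler=%d\n", A[I + 1]);
          I += 2;
        }
      }
    }

    int main() {
      // One catch with no catch object (frameIdx -1), then one cleanup.
      parseActions({1, 100, -1, 7, 0, 8});
    }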
- for (unsigned int I = 0, E = Branch->getNumDestinations(); I < E; ++I) { - BasicBlock *KnownTarget = Branch->getDestination(I); - if (ReturnTargets.count(KnownTarget)) - ReturnTargets.remove(KnownTarget); - } - for (BasicBlock *Target : ReturnTargets) { - Branch->addDestination(Target); - // The target may be a block that we excepted to get pruned. - // If it is, it may contain a call to llvm.eh.endcatch. - if (CheckedTargets.insert(Target)) { - // Earlier preparations guarantee that all calls to llvm.eh.endcatch - // will be followed by an unconditional branch. - auto *Br = dyn_cast<BranchInst>(Target->getTerminator()); - if (Br && Br->isUnconditional() && - Br != Target->getFirstNonPHIOrDbgOrLifetime()) { - Instruction *Prev = Br->getPrevNode(); - if (match(cast<Value>(Prev), m_Intrinsic<Intrinsic::eh_endcatch>())) - Prev->eraseFromParent(); - } + int TryLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); + FuncInfo.EHPadStateMap[CatchSwitch] = TryLow; + for (const BasicBlock *PredBlock : predecessors(BB)) + if ((PredBlock = getEHPadFromPredecessor(PredBlock, + CatchSwitch->getParentPad()))) + calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + TryLow); + int CatchLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); + + // catchpads are separate funclets in C++ EH due to the way rethrow works. + int TryHigh = CatchLow - 1; + for (const auto *CatchPad : Handlers) { + FuncInfo.FuncletBaseStateMap[CatchPad] = CatchLow; + for (const User *U : CatchPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) + if (InnerCatchSwitch->getUnwindDest() == CatchSwitch->getUnwindDest()) + calculateCXXStateNumbers(FuncInfo, UserI, CatchLow); + if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) + if (getCleanupRetUnwindDest(InnerCleanupPad) == + CatchSwitch->getUnwindDest()) + calculateCXXStateNumbers(FuncInfo, UserI, CatchLow); } } - } - LPadImpls.clear(); - - F.addFnAttr("wineh-parent", F.getName()); - - // Delete any blocks that were only used by handlers that were outlined above. - removeUnreachableBlocks(F); + int CatchHigh = FuncInfo.getLastStateNumber(); + addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchHigh, Handlers); + DEBUG(dbgs() << "TryLow[" << BB->getName() << "]: " << TryLow << '\n'); + DEBUG(dbgs() << "TryHigh[" << BB->getName() << "]: " << TryHigh << '\n'); + DEBUG(dbgs() << "CatchHigh[" << BB->getName() << "]: " << CatchHigh + << '\n'); + } else { + auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI); - BasicBlock *Entry = &F.getEntryBlock(); - IRBuilder<> Builder(F.getParent()->getContext()); - Builder.SetInsertPoint(Entry->getFirstInsertionPt()); - - Function *FrameEscapeFn = - Intrinsic::getDeclaration(M, Intrinsic::localescape); - Function *RecoverFrameFn = - Intrinsic::getDeclaration(M, Intrinsic::localrecover); - SmallVector<Value *, 8> AllocasToEscape; - - // Scan the entry block for an existing call to llvm.localescape. We need to - // keep escaping those objects. - for (Instruction &I : F.front()) { - auto *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::localescape) { - auto Args = II->arg_operands(); - AllocasToEscape.append(Args.begin(), Args.end()); - II->eraseFromParent(); - break; - } - } + // It's possible for a cleanup to be visited twice: it might have multiple + // cleanupret instructions. 
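The interval bookkeeping in calculateCXXStateNumbers can be replayed on a toy scope tree: a try takes TryLow, then states for pads nested in its body, then CatchLow for the catch funclet (so TryHigh = CatchLow - 1), then states nested under the handlers up to CatchHigh. A small model of that numbering order (ours, for illustration):

    #include <cstdio>
    #include <vector>

    struct Try { std::vector<Try> InBody, InHandler; };

    static int NextState;

    static void number(const Try &T) {
      int TryLow = NextState++;      // state for the try body itself
      for (const Try &Inner : T.InBody)
        number(Inner);               // pads that unwind into this try
      int CatchLow = NextState++;    // state for the catch funclet
      int TryHigh = CatchLow - 1;
      for (const Try &Inner : T.InHandler)
        number(Inner);               // pads nested inside the handler
      int CatchHigh = NextState - 1;
      std::printf("TryLow=%d TryHigh=%d CatchLow=%d CatchHigh=%d\n",
                  TryLow, TryHigh, CatchLow, CatchHigh);
    }

    int main() {
      Try Inner{{}, {}};
      Try Outer{{Inner}, {}};        // one try nested inside the outer body
      NextState = 0;
      number(Outer);                 // reports the inner try first, matching
                                     // the order TryBlockMap entries are pushed
    }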
+ if (FuncInfo.EHPadStateMap.count(CleanupPad)) + return; - // Finally, replace all of the temporary allocas for frame variables used in - // the outlined handlers with calls to llvm.localrecover. - for (auto &VarInfoEntry : FrameVarInfo) { - Value *ParentVal = VarInfoEntry.first; - TinyPtrVector<AllocaInst *> &Allocas = VarInfoEntry.second; - AllocaInst *ParentAlloca = cast<AllocaInst>(ParentVal); - - // FIXME: We should try to sink unescaped allocas from the parent frame into - // the child frame. If the alloca is escaped, we have to use the lifetime - // markers to ensure that the alloca is only live within the child frame. - - // Add this alloca to the list of things to escape. - AllocasToEscape.push_back(ParentAlloca); - - // Next replace all outlined allocas that are mapped to it. - for (AllocaInst *TempAlloca : Allocas) { - if (TempAlloca == getCatchObjectSentinel()) - continue; // Skip catch parameter sentinels. - Function *HandlerFn = TempAlloca->getParent()->getParent(); - llvm::Value *FP = HandlerToParentFP[HandlerFn]; - assert(FP); - - // FIXME: Sink this localrecover into the blocks where it is used. - Builder.SetInsertPoint(TempAlloca); - Builder.SetCurrentDebugLocation(TempAlloca->getDebugLoc()); - Value *RecoverArgs[] = { - Builder.CreateBitCast(&F, Int8PtrType, ""), FP, - llvm::ConstantInt::get(Int32Type, AllocasToEscape.size() - 1)}; - Instruction *RecoveredAlloca = - Builder.CreateCall(RecoverFrameFn, RecoverArgs); - - // Add a pointer bitcast if the alloca wasn't an i8. - if (RecoveredAlloca->getType() != TempAlloca->getType()) { - RecoveredAlloca->setName(Twine(TempAlloca->getName()) + ".i8"); - RecoveredAlloca = cast<Instruction>( - Builder.CreateBitCast(RecoveredAlloca, TempAlloca->getType())); + int CleanupState = addUnwindMapEntry(FuncInfo, ParentState, BB); + FuncInfo.EHPadStateMap[CleanupPad] = CleanupState; + DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB " + << BB->getName() << '\n'); + for (const BasicBlock *PredBlock : predecessors(BB)) { + if ((PredBlock = getEHPadFromPredecessor(PredBlock, + CleanupPad->getParentPad()))) { + calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + CleanupState); } - TempAlloca->replaceAllUsesWith(RecoveredAlloca); - TempAlloca->removeFromParent(); - RecoveredAlloca->takeName(TempAlloca); - delete TempAlloca; } - } // End for each FrameVarInfo entry. - - // Insert 'call void (...)* @llvm.localescape(...)' at the end of the entry - // block. - Builder.SetInsertPoint(&F.getEntryBlock().back()); - Builder.CreateCall(FrameEscapeFn, AllocasToEscape); - - if (SEHExceptionCodeSlot) { - if (isAllocaPromotable(SEHExceptionCodeSlot)) { - SmallPtrSet<BasicBlock *, 4> UserBlocks; - for (User *U : SEHExceptionCodeSlot->users()) { - if (auto *Inst = dyn_cast<Instruction>(U)) - UserBlocks.insert(Inst->getParent()); - } - PromoteMemToReg(SEHExceptionCodeSlot, *DT); - // After the promotion, kill off dead instructions. 
- for (BasicBlock *BB : UserBlocks) - SimplifyInstructionsInBlock(BB, LibInfo); + for (const User *U : CleanupPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (UserI->isEHPad()) + report_fatal_error("Cleanup funclets for the MSVC++ personality cannot " + "contain exceptional actions"); } } +} - // Clean up the handler action maps we created for this function - DeleteContainerSeconds(CatchHandlerMap); - CatchHandlerMap.clear(); - DeleteContainerSeconds(CleanupHandlerMap); - CleanupHandlerMap.clear(); - HandlerToParentFP.clear(); - DT = nullptr; - LibInfo = nullptr; - SEHExceptionCodeSlot = nullptr; - EHBlocks.clear(); - NormalBlocks.clear(); - EHReturnBlocks.clear(); - - return HandlersOutlined; +static int addSEHExcept(WinEHFuncInfo &FuncInfo, int ParentState, + const Function *Filter, const BasicBlock *Handler) { + SEHUnwindMapEntry Entry; + Entry.ToState = ParentState; + Entry.IsFinally = false; + Entry.Filter = Filter; + Entry.Handler = Handler; + FuncInfo.SEHUnwindMap.push_back(Entry); + return FuncInfo.SEHUnwindMap.size() - 1; } -void WinEHPrepare::promoteLandingPadValues(LandingPadInst *LPad) { - // If the return values of the landing pad instruction are extracted and - // stored to memory, we want to promote the store locations to reg values. - SmallVector<AllocaInst *, 2> EHAllocas; - - // The landingpad instruction returns an aggregate value. Typically, its - // value will be passed to a pair of extract value instructions and the - // results of those extracts are often passed to store instructions. - // In unoptimized code the stored value will often be loaded and then stored - // again. - for (auto *U : LPad->users()) { - ExtractValueInst *Extract = dyn_cast<ExtractValueInst>(U); - if (!Extract) - continue; +static int addSEHFinally(WinEHFuncInfo &FuncInfo, int ParentState, + const BasicBlock *Handler) { + SEHUnwindMapEntry Entry; + Entry.ToState = ParentState; + Entry.IsFinally = true; + Entry.Filter = nullptr; + Entry.Handler = Handler; + FuncInfo.SEHUnwindMap.push_back(Entry); + return FuncInfo.SEHUnwindMap.size() - 1; +} - for (auto *EU : Extract->users()) { - if (auto *Store = dyn_cast<StoreInst>(EU)) { - auto *AV = cast<AllocaInst>(Store->getPointerOperand()); - EHAllocas.push_back(AV); - } +static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, + const Instruction *FirstNonPHI, + int ParentState) { + const BasicBlock *BB = FirstNonPHI->getParent(); + assert(BB->isEHPad() && "no a funclet!"); + + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) { + assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 && + "shouldn't revist catch funclets!"); + + // Extract the filter function and the __except basic block and create a + // state for them. + assert(CatchSwitch->getNumHandlers() == 1 && + "SEH doesn't have multiple handlers per __try"); + const auto *CatchPad = + cast<CatchPadInst>((*CatchSwitch->handler_begin())->getFirstNonPHI()); + const BasicBlock *CatchPadBB = CatchPad->getParent(); + const Constant *FilterOrNull = + cast<Constant>(CatchPad->getArgOperand(0)->stripPointerCasts()); + const Function *Filter = dyn_cast<Function>(FilterOrNull); + assert((Filter || FilterOrNull->isNullValue()) && + "unexpected filter value"); + int TryState = addSEHExcept(FuncInfo, ParentState, Filter, CatchPadBB); + + // Everything in the __try block uses TryState as its parent state. 
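Concretely, the pieces being recorded map onto the source-level construct like this (an MSVC-only example, compiled with cl; the filter expression becomes Entry.Filter and the __except block becomes Entry.Handler):

    #include <windows.h>
    #include <cstdio>

    static int Filter(unsigned Code) {
      // Runs during the search phase; decides whether the handler executes.
      return Code == EXCEPTION_INT_DIVIDE_BY_ZERO ? EXCEPTION_EXECUTE_HANDLER
                                                  : EXCEPTION_CONTINUE_SEARCH;
    }

    int main() {
      int Zero = 0, R = 0;
      __try {
        R = 1 / Zero;          // body executes in the __try's state
      } __except (Filter(GetExceptionCode())) {
        std::puts("handled");  // the __except block: the map entry's Handler
      }
      return R;
    }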
+ FuncInfo.EHPadStateMap[CatchSwitch] = TryState; + DEBUG(dbgs() << "Assigning state #" << TryState << " to BB " + << CatchPadBB->getName() << '\n'); + for (const BasicBlock *PredBlock : predecessors(BB)) + if ((PredBlock = getEHPadFromPredecessor(PredBlock, + CatchSwitch->getParentPad()))) + calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + TryState); + + // Everything in the __except block unwinds to ParentState, just like code + // outside the __try. + for (const User *U : CatchPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) + if (InnerCatchSwitch->getUnwindDest() == CatchSwitch->getUnwindDest()) + calculateSEHStateNumbers(FuncInfo, UserI, ParentState); + if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) + if (getCleanupRetUnwindDest(InnerCleanupPad) == + CatchSwitch->getUnwindDest()) + calculateSEHStateNumbers(FuncInfo, UserI, ParentState); } - } + } else { + auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI); - // We can't do this without a dominator tree. - assert(DT); + // It's possible for a cleanup to be visited twice: it might have multiple + // cleanupret instructions. + if (FuncInfo.EHPadStateMap.count(CleanupPad)) + return; - if (!EHAllocas.empty()) { - PromoteMemToReg(EHAllocas, *DT); - EHAllocas.clear(); + int CleanupState = addSEHFinally(FuncInfo, ParentState, BB); + FuncInfo.EHPadStateMap[CleanupPad] = CleanupState; + DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB " + << BB->getName() << '\n'); + for (const BasicBlock *PredBlock : predecessors(BB)) + if ((PredBlock = + getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad()))) + calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + CleanupState); + for (const User *U : CleanupPad->users()) { + const auto *UserI = cast<Instruction>(U); + if (UserI->isEHPad()) + report_fatal_error("Cleanup funclets for the SEH personality cannot " + "contain exceptional actions"); + } } +} - // After promotion, some extracts may be trivially dead. Remove them. - SmallVector<Value *, 4> Users(LPad->user_begin(), LPad->user_end()); - for (auto *U : Users) - RecursivelyDeleteTriviallyDeadInstructions(U); +static bool isTopLevelPadForMSVC(const Instruction *EHPad) { + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(EHPad)) + return isa<ConstantTokenNone>(CatchSwitch->getParentPad()) && + CatchSwitch->unwindsToCaller(); + if (auto *CleanupPad = dyn_cast<CleanupPadInst>(EHPad)) + return isa<ConstantTokenNone>(CleanupPad->getParentPad()) && + getCleanupRetUnwindDest(CleanupPad) == nullptr; + if (isa<CatchPadInst>(EHPad)) + return false; + llvm_unreachable("unexpected EHPad!"); } -void WinEHPrepare::getPossibleReturnTargets(Function *ParentF, - Function *HandlerF, - SetVector<BasicBlock*> &Targets) { - for (BasicBlock &BB : *HandlerF) { - // If the handler contains landing pads, check for any - // handlers that may return directly to a block in the - // parent function. 
- if (auto *LPI = BB.getLandingPadInst()) { - IntrinsicInst *Recover = cast<IntrinsicInst>(LPI->getNextNode()); - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(Recover, ActionList); - for (const auto &Action : ActionList) { - if (auto *CH = dyn_cast<CatchHandler>(Action.get())) { - Function *NestedF = cast<Function>(CH->getHandlerBlockOrFunc()); - getPossibleReturnTargets(ParentF, NestedF, Targets); - } - } - } +void llvm::calculateSEHStateNumbers(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + // Don't compute state numbers twice. + if (!FuncInfo.SEHUnwindMap.empty()) + return; - auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); - if (!Ret) + for (const BasicBlock &BB : *Fn) { + if (!BB.isEHPad()) continue; - - // Handler functions must always return a block address. - BlockAddress *BA = cast<BlockAddress>(Ret->getReturnValue()); - - // If this is the handler for a nested landing pad, the - // return address may have been remapped to a block in the - // parent handler. We're not interested in those. - if (BA->getFunction() != ParentF) + const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; - - Targets.insert(BA->getBasicBlock()); + ::calculateSEHStateNumbers(FuncInfo, FirstNonPHI, -1); } + + calculateStateNumbersForInvokes(Fn, FuncInfo); } -void WinEHPrepare::completeNestedLandingPad(Function *ParentFn, - LandingPadInst *OutlinedLPad, - const LandingPadInst *OriginalLPad, - FrameVarInfoMap &FrameVarInfo) { - // Get the nested block and erase the unreachable instruction that was - // temporarily inserted as its terminator. - LLVMContext &Context = ParentFn->getContext(); - BasicBlock *OutlinedBB = OutlinedLPad->getParent(); - // If the nested landing pad was outlined before the landing pad that enclosed - // it, it will already be in outlined form. In that case, we just need to see - // if the returns and the enclosing branch instruction need to be updated. - IndirectBrInst *Branch = - dyn_cast<IndirectBrInst>(OutlinedBB->getTerminator()); - if (!Branch) { - // If the landing pad wasn't in outlined form, it should be a stub with - // an unreachable terminator. - assert(isa<UnreachableInst>(OutlinedBB->getTerminator())); - OutlinedBB->getTerminator()->eraseFromParent(); - // That should leave OutlinedLPad as the last instruction in its block. - assert(&OutlinedBB->back() == OutlinedLPad); - } +void llvm::calculateWinCXXEHStateNumbers(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + // Return if it's already been done. + if (!FuncInfo.EHPadStateMap.empty()) + return; - // The original landing pad will have already had its action intrinsic - // built by the outlining loop. We need to clone that into the outlined - // location. It may also be necessary to add references to the exception - // variables to the outlined handler in which this landing pad is nested - // and remap return instructions in the nested handlers that should return - // to an address in the outlined handler. - Function *OutlinedHandlerFn = OutlinedBB->getParent(); - BasicBlock::const_iterator II = OriginalLPad; - ++II; - // The instruction after the landing pad should now be a call to eh.actions. - const Instruction *Recover = II; - const IntrinsicInst *EHActions = cast<IntrinsicInst>(Recover); - - // Remap the return target in the nested handler. 
- SmallVector<BlockAddress *, 4> ActionTargets; - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(EHActions, ActionList); - for (const auto &Action : ActionList) { - auto *Catch = dyn_cast<CatchHandler>(Action.get()); - if (!Catch) + for (const BasicBlock &BB : *Fn) { + if (!BB.isEHPad()) continue; - // The dyn_cast to function here selects C++ catch handlers and skips - // SEH catch handlers. - auto *Handler = dyn_cast<Function>(Catch->getHandlerBlockOrFunc()); - if (!Handler) + const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; - // Visit all the return instructions, looking for places that return - // to a location within OutlinedHandlerFn. - for (BasicBlock &NestedHandlerBB : *Handler) { - auto *Ret = dyn_cast<ReturnInst>(NestedHandlerBB.getTerminator()); - if (!Ret) - continue; - - // Handler functions must always return a block address. - BlockAddress *BA = cast<BlockAddress>(Ret->getReturnValue()); - // The original target will have been in the main parent function, - // but if it is the address of a block that has been outlined, it - // should be a block that was outlined into OutlinedHandlerFn. - assert(BA->getFunction() == ParentFn); - - // Ignore targets that aren't part of an outlined handler function. - if (!LPadTargetBlocks.count(BA->getBasicBlock())) - continue; - - // If the return value is the address ofF a block that we - // previously outlined into the parent handler function, replace - // the return instruction and add the mapped target to the list - // of possible return addresses. - BasicBlock *MappedBB = LPadTargetBlocks[BA->getBasicBlock()]; - assert(MappedBB->getParent() == OutlinedHandlerFn); - BlockAddress *NewBA = BlockAddress::get(OutlinedHandlerFn, MappedBB); - Ret->eraseFromParent(); - ReturnInst::Create(Context, NewBA, &NestedHandlerBB); - ActionTargets.push_back(NewBA); - } - } - ActionList.clear(); - - if (Branch) { - // If the landing pad was already in outlined form, just update its targets. - for (unsigned int I = Branch->getNumDestinations(); I > 0; --I) - Branch->removeDestination(I); - // Add the previously collected action targets. - for (auto *Target : ActionTargets) - Branch->addDestination(Target->getBasicBlock()); - } else { - // If the landing pad was previously stubbed out, fill in its outlined form. - IntrinsicInst *NewEHActions = cast<IntrinsicInst>(EHActions->clone()); - OutlinedBB->getInstList().push_back(NewEHActions); - - // Insert an indirect branch into the outlined landing pad BB. - IndirectBrInst *IBr = IndirectBrInst::Create(NewEHActions, 0, OutlinedBB); - // Add the previously collected action targets. - for (auto *Target : ActionTargets) - IBr->addDestination(Target->getBasicBlock()); + calculateCXXStateNumbers(FuncInfo, FirstNonPHI, -1); } -} - -// This function examines a block to determine whether the block ends with a -// conditional branch to a catch handler based on a selector comparison. -// This function is used both by the WinEHPrepare::findSelectorComparison() and -// WinEHCleanupDirector::handleTypeIdFor(). 
-static bool isSelectorDispatch(BasicBlock *BB, BasicBlock *&CatchHandler, - Constant *&Selector, BasicBlock *&NextBB) { - ICmpInst::Predicate Pred; - BasicBlock *TBB, *FBB; - Value *LHS, *RHS; - - if (!match(BB->getTerminator(), - m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TBB, FBB))) - return false; - if (!match(LHS, - m_Intrinsic<Intrinsic::eh_typeid_for>(m_Constant(Selector))) && - !match(RHS, m_Intrinsic<Intrinsic::eh_typeid_for>(m_Constant(Selector)))) - return false; - - if (Pred == CmpInst::ICMP_EQ) { - CatchHandler = TBB; - NextBB = FBB; - return true; - } - - if (Pred == CmpInst::ICMP_NE) { - CatchHandler = FBB; - NextBB = TBB; - return true; - } - - return false; + calculateStateNumbersForInvokes(Fn, FuncInfo); } -static bool isCatchBlock(BasicBlock *BB) { - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - if (match(cast<Value>(II), m_Intrinsic<Intrinsic::eh_begincatch>())) - return true; - } - return false; +static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int HandlerParentState, + int TryParentState, ClrHandlerType HandlerType, + uint32_t TypeToken, const BasicBlock *Handler) { + ClrEHUnwindMapEntry Entry; + Entry.HandlerParentState = HandlerParentState; + Entry.TryParentState = TryParentState; + Entry.Handler = Handler; + Entry.HandlerType = HandlerType; + Entry.TypeToken = TypeToken; + FuncInfo.ClrEHUnwindMap.push_back(Entry); + return FuncInfo.ClrEHUnwindMap.size() - 1; } -static BasicBlock *createStubLandingPad(Function *Handler) { - // FIXME: Finish this! - LLVMContext &Context = Handler->getContext(); - BasicBlock *StubBB = BasicBlock::Create(Context, "stub"); - Handler->getBasicBlockList().push_back(StubBB); - IRBuilder<> Builder(StubBB); - LandingPadInst *LPad = Builder.CreateLandingPad( - llvm::StructType::get(Type::getInt8PtrTy(Context), - Type::getInt32Ty(Context), nullptr), - 0); - // Insert a call to llvm.eh.actions so that we don't try to outline this lpad. - Function *ActionIntrin = - Intrinsic::getDeclaration(Handler->getParent(), Intrinsic::eh_actions); - Builder.CreateCall(ActionIntrin, {}, "recover"); - LPad->setCleanup(true); - Builder.CreateUnreachable(); - return StubBB; -} - -// Cycles through the blocks in an outlined handler function looking for an -// invoke instruction and inserts an invoke of llvm.donothing with an empty -// landing pad if none is found. The code that generates the .xdata tables for -// the handler needs at least one landing pad to identify the parent function's -// personality. -void WinEHPrepare::addStubInvokeToHandlerIfNeeded(Function *Handler) { - ReturnInst *Ret = nullptr; - UnreachableInst *Unreached = nullptr; - for (BasicBlock &BB : *Handler) { - TerminatorInst *Terminator = BB.getTerminator(); - // If we find an invoke, there is nothing to be done. - auto *II = dyn_cast<InvokeInst>(Terminator); - if (II) - return; - // If we've already recorded a return instruction, keep looking for invokes. - if (!Ret) - Ret = dyn_cast<ReturnInst>(Terminator); - // If we haven't recorded an unreachable instruction, try this terminator. - if (!Unreached) - Unreached = dyn_cast<UnreachableInst>(Terminator); - } - - // If we got this far, the handler contains no invokes. We should have seen - // at least one return or unreachable instruction. We'll insert an invoke of - // llvm.donothing ahead of that instruction. 
- assert(Ret || Unreached); - TerminatorInst *Term; - if (Ret) - Term = Ret; - else - Term = Unreached; - BasicBlock *OldRetBB = Term->getParent(); - BasicBlock *NewRetBB = SplitBlock(OldRetBB, Term, DT); - // SplitBlock adds an unconditional branch instruction at the end of the - // parent block. We want to replace that with an invoke call, so we can - // erase it now. - OldRetBB->getTerminator()->eraseFromParent(); - BasicBlock *StubLandingPad = createStubLandingPad(Handler); - Function *F = - Intrinsic::getDeclaration(Handler->getParent(), Intrinsic::donothing); - InvokeInst::Create(F, NewRetBB, StubLandingPad, None, "", OldRetBB); -} - -// FIXME: Consider sinking this into lib/Target/X86 somehow. TargetLowering -// usually doesn't build LLVM IR, so that's probably the wrong place. -Function *WinEHPrepare::createHandlerFunc(Function *ParentFn, Type *RetTy, - const Twine &Name, Module *M, - Value *&ParentFP) { - // x64 uses a two-argument prototype where the parent FP is the second - // argument. x86 uses no arguments, just the incoming EBP value. - LLVMContext &Context = M->getContext(); - Type *Int8PtrType = Type::getInt8PtrTy(Context); - FunctionType *FnType; - if (TheTriple.getArch() == Triple::x86_64) { - Type *ArgTys[2] = {Int8PtrType, Int8PtrType}; - FnType = FunctionType::get(RetTy, ArgTys, false); - } else { - FnType = FunctionType::get(RetTy, None, false); - } - - Function *Handler = - Function::Create(FnType, GlobalVariable::InternalLinkage, Name, M); - BasicBlock *Entry = BasicBlock::Create(Context, "entry"); - Handler->getBasicBlockList().push_front(Entry); - if (TheTriple.getArch() == Triple::x86_64) { - ParentFP = &(Handler->getArgumentList().back()); - } else { - assert(M); - Function *FrameAddressFn = - Intrinsic::getDeclaration(M, Intrinsic::frameaddress); - Function *RecoverFPFn = - Intrinsic::getDeclaration(M, Intrinsic::x86_seh_recoverfp); - IRBuilder<> Builder(&Handler->getEntryBlock()); - Value *EBP = - Builder.CreateCall(FrameAddressFn, {Builder.getInt32(1)}, "ebp"); - Value *ParentI8Fn = Builder.CreateBitCast(ParentFn, Int8PtrType); - ParentFP = Builder.CreateCall(RecoverFPFn, {ParentI8Fn, EBP}); - } - return Handler; -} +void llvm::calculateClrEHStateNumbers(const Function *Fn, + WinEHFuncInfo &FuncInfo) { + // Return if it's already been done. + if (!FuncInfo.EHPadStateMap.empty()) + return; -bool WinEHPrepare::outlineHandler(ActionHandler *Action, Function *SrcFn, - LandingPadInst *LPad, BasicBlock *StartBB, - FrameVarInfoMap &VarInfo) { - Module *M = SrcFn->getParent(); - LLVMContext &Context = M->getContext(); - Type *Int8PtrType = Type::getInt8PtrTy(Context); - - // Create a new function to receive the handler contents. - Value *ParentFP; - Function *Handler; - if (Action->getType() == Catch) { - Handler = createHandlerFunc(SrcFn, Int8PtrType, SrcFn->getName() + ".catch", M, - ParentFP); - } else { - Handler = createHandlerFunc(SrcFn, Type::getVoidTy(Context), - SrcFn->getName() + ".cleanup", M, ParentFP); - } - Handler->setPersonalityFn(SrcFn->getPersonalityFn()); - HandlerToParentFP[Handler] = ParentFP; - Handler->addFnAttr("wineh-parent", SrcFn->getName()); - BasicBlock *Entry = &Handler->getEntryBlock(); - - // Generate a standard prolog to setup the frame recovery structure. 
- IRBuilder<> Builder(Context);
- Builder.SetInsertPoint(Entry);
- Builder.SetCurrentDebugLocation(LPad->getDebugLoc());
-
- std::unique_ptr<WinEHCloningDirectorBase> Director;
-
- ValueToValueMapTy VMap;
-
- LandingPadMap &LPadMap = LPadMaps[LPad];
- if (!LPadMap.isInitialized())
- LPadMap.mapLandingPad(LPad);
- if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) {
- Constant *Sel = CatchAction->getSelector();
- Director.reset(new WinEHCatchDirector(Handler, ParentFP, Sel, VarInfo,
- LPadMap, NestedLPtoOriginalLP, DT,
- EHBlocks));
- LPadMap.remapEHValues(VMap, UndefValue::get(Int8PtrType),
- ConstantInt::get(Type::getInt32Ty(Context), 1));
- } else {
- Director.reset(
- new WinEHCleanupDirector(Handler, ParentFP, VarInfo, LPadMap));
- LPadMap.remapEHValues(VMap, UndefValue::get(Int8PtrType),
- UndefValue::get(Type::getInt32Ty(Context)));
+ // This numbering assigns one state number to each catchpad and cleanuppad.
+ // It also computes two tree-like relations over states:
+ // 1) Each state has a "HandlerParentState", which is the state of the next
+ // outer handler enclosing this state's handler (same as nearest ancestor
+ // per the ParentPad linkage on EH pads, but skipping over catchswitches).
+ // 2) Each state has a "TryParentState", which:
+ // a) for a catchpad that's not the last handler on its catchswitch, is
+ // the state of the next catchpad on that catchswitch
+ // b) for all other pads, is the state of the pad whose try region is the
+ // next outer try region enclosing this state's try region. The "try
+ // regions" are not present as such in the IR, but will be inferred
+ // based on the placement of invokes and pads which reach each other
+ // by exceptional exits
+ // Catchswitches do not get their own states, but each gets mapped to the
+ // state of its first catchpad.
+
+ // Step one: walk down from outermost to innermost funclets, assigning each
+ // catchpad and cleanuppad a state number. Add an entry to the
+ // ClrEHUnwindMap for each state, recording its HandlerParentState and
+ // handler attributes. Record the TryParentState as well for each catchpad
+ // that's not the last on its catchswitch, but initialize all other entries'
+ // TryParentStates to a sentinel -1 value that the next pass will update.
+
+ // Seed a worklist with pads that have no parent.
+ SmallVector<std::pair<const Instruction *, int>, 8> Worklist;
+ for (const BasicBlock &BB : *Fn) {
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ const Value *ParentPad;
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(FirstNonPHI))
+ ParentPad = CPI->getParentPad();
+ else if (const auto *CSI = dyn_cast<CatchSwitchInst>(FirstNonPHI))
+ ParentPad = CSI->getParentPad();
+ else
+ continue;
+ if (isa<ConstantTokenNone>(ParentPad))
+ Worklist.emplace_back(FirstNonPHI, -1);
}
- SmallVector<ReturnInst *, 8> Returns;
- ClonedCodeInfo OutlinedFunctionInfo;
-
- // If the start block contains PHI nodes, we need to map them.
- BasicBlock::iterator II = StartBB->begin();
- while (auto *PN = dyn_cast<PHINode>(II)) {
- bool Mapped = false;
- // Look for PHI values that we have already mapped (such as the selector).
- for (Value *Val : PN->incoming_values()) {
- if (VMap.count(Val)) {
- VMap[PN] = VMap[Val];
- Mapped = true;
+ // Use the worklist to visit all pads, from outer to inner. Record
+ // HandlerParentState for all pads. Record TryParentState only for catchpads
+ // that aren't the last on their catchswitch (setting all other entries'
+ // TryParentStates to an initial value of -1).
This loop is also responsible
+ // for setting the EHPadStateMap entry for all catchpads, cleanuppads, and
+ // catchswitches.
+ while (!Worklist.empty()) {
+ const Instruction *Pad;
+ int HandlerParentState;
+ std::tie(Pad, HandlerParentState) = Worklist.pop_back_val();
+
+ if (const auto *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
+ // Create the entry for this cleanup with the appropriate handler
+ // properties. Finally and fault handlers are distinguished by arity.
+ ClrHandlerType HandlerType =
+ (Cleanup->getNumArgOperands() ? ClrHandlerType::Fault
+ : ClrHandlerType::Finally);
+ int CleanupState = addClrEHHandler(FuncInfo, HandlerParentState, -1,
+ HandlerType, 0, Pad->getParent());
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Cleanup->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CleanupState);
+ // Remember this pad's state.
+ FuncInfo.EHPadStateMap[Cleanup] = CleanupState;
+ } else {
+ // Walk the handlers of this catchswitch in reverse order since all but
+ // the last need to set the following one as its TryParentState.
+ const auto *CatchSwitch = cast<CatchSwitchInst>(Pad);
+ int CatchState = -1, FollowerState = -1;
+ SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers());
+ for (auto CBI = CatchBlocks.rbegin(), CBE = CatchBlocks.rend();
+ CBI != CBE; ++CBI, FollowerState = CatchState) {
+ const BasicBlock *CatchBlock = *CBI;
+ // Create the entry for this catch with the appropriate handler
+ // properties.
+ const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI());
+ uint32_t TypeToken = static_cast<uint32_t>(
+ cast<ConstantInt>(Catch->getArgOperand(0))->getZExtValue());
+ CatchState =
+ addClrEHHandler(FuncInfo, HandlerParentState, FollowerState,
+ ClrHandlerType::Catch, TypeToken, CatchBlock);
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Catch->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CatchState);
+ // Remember this catch's state.
+ FuncInfo.EHPadStateMap[Catch] = CatchState;
}
+ // Associate the catchswitch with the state of its first catch.
+ assert(CatchSwitch->getNumHandlers());
+ FuncInfo.EHPadStateMap[CatchSwitch] = CatchState;
}
- // If we didn't find a match for this value, map it as an undef.
- if (!Mapped) {
- VMap[PN] = UndefValue::get(PN->getType());
- }
- ++II;
}
- // The landing pad value may be used by PHI nodes. It will ultimately be
- // eliminated, but we need it in the map for intermediate handling.
- VMap[LPad] = UndefValue::get(LPad->getType());
-
- // Skip over PHIs and, if applicable, landingpad instructions.
- II = StartBB->getFirstInsertionPt();
-
- CloneAndPruneIntoFromInst(Handler, SrcFn, II, VMap,
- /*ModuleLevelChanges=*/false, Returns, "",
- &OutlinedFunctionInfo, Director.get());
-
- // Move all the instructions in the cloned "entry" block into our entry block.
- // Depending on how the parent function was laid out, the block that will
- // correspond to the outlined entry block may not be the first block in the
- // list. We can recognize it, however, as the cloned block which has no
- // predecessors. Any other block wouldn't have been cloned if it didn't
- // have a predecessor which was also cloned.
- Function::iterator ClonedIt = std::next(Function::iterator(Entry)); - while (!pred_empty(ClonedIt)) - ++ClonedIt; - BasicBlock *ClonedEntryBB = ClonedIt; - assert(ClonedEntryBB); - Entry->getInstList().splice(Entry->end(), ClonedEntryBB->getInstList()); - ClonedEntryBB->eraseFromParent(); - - // Make sure we can identify the handler's personality later. - addStubInvokeToHandlerIfNeeded(Handler); - - if (auto *CatchAction = dyn_cast<CatchHandler>(Action)) { - WinEHCatchDirector *CatchDirector = - reinterpret_cast<WinEHCatchDirector *>(Director.get()); - CatchAction->setExceptionVar(CatchDirector->getExceptionVar()); - CatchAction->setReturnTargets(CatchDirector->getReturnTargets()); - - // Look for blocks that are not part of the landing pad that we just - // outlined but terminate with a call to llvm.eh.endcatch and a - // branch to a block that is in the handler we just outlined. - // These blocks will be part of a nested landing pad that intends to - // return to an address in this handler. This case is best handled - // after both landing pads have been outlined, so for now we'll just - // save the association of the blocks in LPadTargetBlocks. The - // return instructions which are created from these branches will be - // replaced after all landing pads have been outlined. - for (const auto MapEntry : VMap) { - // VMap maps all values and blocks that were just cloned, but dead - // blocks which were pruned will map to nullptr. - if (!isa<BasicBlock>(MapEntry.first) || MapEntry.second == nullptr) + // Step two: record the TryParentState of each state. For cleanuppads that + // don't have cleanuprets, we may need to infer this from their child pads, + // so visit pads in descendant-most to ancestor-most order. + for (auto Entry = FuncInfo.ClrEHUnwindMap.rbegin(), + End = FuncInfo.ClrEHUnwindMap.rend(); + Entry != End; ++Entry) { + const Instruction *Pad = + Entry->Handler.get<const BasicBlock *>()->getFirstNonPHI(); + // For most pads, the TryParentState is the state associated with the + // unwind dest of exceptional exits from it. + const BasicBlock *UnwindDest; + if (const auto *Catch = dyn_cast<CatchPadInst>(Pad)) { + // If a catch is not the last in its catchswitch, its TryParentState is + // the state associated with the next catch in the switch, even though + // that's not the unwind dest of exceptions escaping the catch. Those + // cases were already assigned a TryParentState in the first pass, so + // skip them. + if (Entry->TryParentState != -1) continue; - const BasicBlock *MappedBB = cast<BasicBlock>(MapEntry.first); - for (auto *Pred : predecessors(const_cast<BasicBlock *>(MappedBB))) { - auto *Branch = dyn_cast<BranchInst>(Pred->getTerminator()); - if (!Branch || !Branch->isUnconditional() || Pred->size() <= 1) - continue; - BasicBlock::iterator II = const_cast<BranchInst *>(Branch); - --II; - if (match(cast<Value>(II), m_Intrinsic<Intrinsic::eh_endcatch>())) { - // This would indicate that a nested landing pad wants to return - // to a block that is outlined into two different handlers. - assert(!LPadTargetBlocks.count(MappedBB)); - LPadTargetBlocks[MappedBB] = cast<BasicBlock>(MapEntry.second); + // Otherwise, get the unwind dest from the catchswitch. 
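+ // (A catchpad carries no unwind edge of its own; exceptions escaping a
+ // catch handler follow the unwind dest recorded on its owning catchswitch.)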
+ UnwindDest = Catch->getCatchSwitch()->getUnwindDest(); + } else { + const auto *Cleanup = cast<CleanupPadInst>(Pad); + UnwindDest = nullptr; + for (const User *U : Cleanup->users()) { + if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) { + // Common and unambiguous case -- cleanupret indicates cleanup's + // unwind dest. + UnwindDest = CleanupRet->getUnwindDest(); + break; } - } - } - } // End if (CatchAction) - - Action->setHandlerBlockOrFunc(Handler); - - return true; -} -/// This BB must end in a selector dispatch. All we need to do is pass the -/// handler block to llvm.eh.actions and list it as a possible indirectbr -/// target. -void WinEHPrepare::processSEHCatchHandler(CatchHandler *CatchAction, - BasicBlock *StartBB) { - BasicBlock *HandlerBB; - BasicBlock *NextBB; - Constant *Selector; - bool Res = isSelectorDispatch(StartBB, HandlerBB, Selector, NextBB); - if (Res) { - // If this was EH dispatch, this must be a conditional branch to the handler - // block. - // FIXME: Handle instructions in the dispatch block. Currently we drop them, - // leading to crashes if some optimization hoists stuff here. - assert(CatchAction->getSelector() && HandlerBB && - "expected catch EH dispatch"); - } else { - // This must be a catch-all. Split the block after the landingpad. - assert(CatchAction->getSelector()->isNullValue() && "expected catch-all"); - HandlerBB = SplitBlock(StartBB, StartBB->getFirstInsertionPt(), DT); - } - IRBuilder<> Builder(HandlerBB->getFirstInsertionPt()); - Function *EHCodeFn = Intrinsic::getDeclaration( - StartBB->getParent()->getParent(), Intrinsic::eh_exceptioncode); - Value *Code = Builder.CreateCall(EHCodeFn, {}, "sehcode"); - Code = Builder.CreateIntToPtr(Code, SEHExceptionCodeSlot->getAllocatedType()); - Builder.CreateStore(Code, SEHExceptionCodeSlot); - CatchAction->setHandlerBlockOrFunc(BlockAddress::get(HandlerBB)); - TinyPtrVector<BasicBlock *> Targets(HandlerBB); - CatchAction->setReturnTargets(Targets); -} + // Get an unwind dest for the user + const BasicBlock *UserUnwindDest = nullptr; + if (auto *Invoke = dyn_cast<InvokeInst>(U)) { + UserUnwindDest = Invoke->getUnwindDest(); + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(U)) { + UserUnwindDest = CatchSwitch->getUnwindDest(); + } else if (auto *ChildCleanup = dyn_cast<CleanupPadInst>(U)) { + int UserState = FuncInfo.EHPadStateMap[ChildCleanup]; + int UserUnwindState = + FuncInfo.ClrEHUnwindMap[UserState].TryParentState; + if (UserUnwindState != -1) + UserUnwindDest = FuncInfo.ClrEHUnwindMap[UserUnwindState] + .Handler.get<const BasicBlock *>(); + } -void LandingPadMap::mapLandingPad(const LandingPadInst *LPad) { - // Each instance of this class should only ever be used to map a single - // landing pad. - assert(OriginLPad == nullptr || OriginLPad == LPad); + // Not having an unwind dest for this user might indicate that it + // doesn't unwind, so can't be taken as proof that the cleanup itself + // may unwind to caller (see e.g. SimplifyUnreachable and + // RemoveUnwindEdge). + if (!UserUnwindDest) + continue; - // If the landing pad has already been mapped, there's nothing more to do. - if (OriginLPad == LPad) - return; + // Now we have an unwind dest for the user, but we need to see if it + // unwinds all the way out of the cleanup or if it stays within it. 
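+ // (This is determined below from the parent-pad linkage of the EH pad at
+ // the head of the unwind dest block.)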
+ const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI(); + const Value *UserUnwindParent; + if (auto *CSI = dyn_cast<CatchSwitchInst>(UserUnwindPad)) + UserUnwindParent = CSI->getParentPad(); + else + UserUnwindParent = + cast<CleanupPadInst>(UserUnwindPad)->getParentPad(); - OriginLPad = LPad; + // The unwind stays within the cleanup iff it targets a child of the + // cleanup. + if (UserUnwindParent == Cleanup) + continue; - // The landingpad instruction returns an aggregate value. Typically, its - // value will be passed to a pair of extract value instructions and the - // results of those extracts will have been promoted to reg values before - // this routine is called. - for (auto *U : LPad->users()) { - const ExtractValueInst *Extract = dyn_cast<ExtractValueInst>(U); - if (!Extract) - continue; - assert(Extract->getNumIndices() == 1 && - "Unexpected operation: extracting both landing pad values"); - unsigned int Idx = *(Extract->idx_begin()); - assert((Idx == 0 || Idx == 1) && - "Unexpected operation: extracting an unknown landing pad element"); - if (Idx == 0) { - ExtractedEHPtrs.push_back(Extract); - } else if (Idx == 1) { - ExtractedSelectors.push_back(Extract); + // This unwind exits the cleanup, so its dest is the cleanup's dest. + UnwindDest = UserUnwindDest; + break; + } } - } -} - -bool LandingPadMap::isOriginLandingPadBlock(const BasicBlock *BB) const { - return BB->getLandingPadInst() == OriginLPad; -} -bool LandingPadMap::isLandingPadSpecificInst(const Instruction *Inst) const { - if (Inst == OriginLPad) - return true; - for (auto *Extract : ExtractedEHPtrs) { - if (Inst == Extract) - return true; - } - for (auto *Extract : ExtractedSelectors) { - if (Inst == Extract) - return true; - } - return false; -} - -void LandingPadMap::remapEHValues(ValueToValueMapTy &VMap, Value *EHPtrValue, - Value *SelectorValue) const { - // Remap all landing pad extract instructions to the specified values. - for (auto *Extract : ExtractedEHPtrs) - VMap[Extract] = EHPtrValue; - for (auto *Extract : ExtractedSelectors) - VMap[Extract] = SelectorValue; -} - -static bool isLocalAddressCall(const Value *V) { - return match(const_cast<Value *>(V), m_Intrinsic<Intrinsic::localaddress>()); -} - -CloningDirector::CloningAction WinEHCloningDirectorBase::handleInstruction( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // If this is one of the boilerplate landing pad instructions, skip it. - // The instruction will have already been remapped in VMap. - if (LPadMap.isLandingPadSpecificInst(Inst)) - return CloningDirector::SkipInstruction; - - // Nested landing pads that have not already been outlined will be cloned as - // stubs, with just the landingpad instruction and an unreachable instruction. - // When all landingpads have been outlined, we'll replace this with the - // llvm.eh.actions call and indirect branch created when the landing pad was - // outlined. - if (auto *LPad = dyn_cast<LandingPadInst>(Inst)) { - return handleLandingPad(VMap, LPad, NewBB); - } - - // Nested landing pads that have already been outlined will be cloned in their - // outlined form, but we need to intercept the ibr instruction to filter out - // targets that do not return to the handler we are outlining. 
- if (auto *IBr = dyn_cast<IndirectBrInst>(Inst)) { - return handleIndirectBr(VMap, IBr, NewBB); - } - - if (auto *Invoke = dyn_cast<InvokeInst>(Inst)) - return handleInvoke(VMap, Invoke, NewBB); - - if (auto *Resume = dyn_cast<ResumeInst>(Inst)) - return handleResume(VMap, Resume, NewBB); - - if (auto *Cmp = dyn_cast<CmpInst>(Inst)) - return handleCompare(VMap, Cmp, NewBB); - - if (match(Inst, m_Intrinsic<Intrinsic::eh_begincatch>())) - return handleBeginCatch(VMap, Inst, NewBB); - if (match(Inst, m_Intrinsic<Intrinsic::eh_endcatch>())) - return handleEndCatch(VMap, Inst, NewBB); - if (match(Inst, m_Intrinsic<Intrinsic::eh_typeid_for>())) - return handleTypeIdFor(VMap, Inst, NewBB); - - // When outlining llvm.localaddress(), remap that to the second argument, - // which is the FP of the parent. - if (isLocalAddressCall(Inst)) { - VMap[Inst] = ParentFP; - return CloningDirector::SkipInstruction; - } - - // Continue with the default cloning behavior. - return CloningDirector::CloneInstruction; -} - -CloningDirector::CloningAction WinEHCatchDirector::handleLandingPad( - ValueToValueMapTy &VMap, const LandingPadInst *LPad, BasicBlock *NewBB) { - // If the instruction after the landing pad is a call to llvm.eh.actions - // the landing pad has already been outlined. In this case, we should - // clone it because it may return to a block in the handler we are - // outlining now that would otherwise be unreachable. The landing pads - // are sorted before outlining begins to enable this case to work - // properly. - const Instruction *NextI = LPad->getNextNode(); - if (match(NextI, m_Intrinsic<Intrinsic::eh_actions>())) - return CloningDirector::CloneInstruction; - - // If the landing pad hasn't been outlined yet, the landing pad we are - // outlining now does not dominate it and so it cannot return to a block - // in this handler. In that case, we can just insert a stub landing - // pad now and patch it up later. - Instruction *NewInst = LPad->clone(); - if (LPad->hasName()) - NewInst->setName(LPad->getName()); - // Save this correlation for later processing. - NestedLPtoOriginalLP[cast<LandingPadInst>(NewInst)] = LPad; - VMap[LPad] = NewInst; - BasicBlock::InstListType &InstList = NewBB->getInstList(); - InstList.push_back(NewInst); - InstList.push_back(new UnreachableInst(NewBB->getContext())); - return CloningDirector::StopCloningBB; -} - -CloningDirector::CloningAction WinEHCatchDirector::handleBeginCatch( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // The argument to the call is some form of the first element of the - // landingpad aggregate value, but that doesn't matter. It isn't used - // here. - // The second argument is an outparameter where the exception object will be - // stored. Typically the exception object is a scalar, but it can be an - // aggregate when catching by value. - // FIXME: Leave something behind to indicate where the exception object lives - // for this handler. Should it be part of llvm.eh.actions? 
- assert(ExceptionObjectVar == nullptr && "Multiple calls to "
- "llvm.eh.begincatch found while "
- "outlining catch handler.");
- ExceptionObjectVar = Inst->getOperand(1)->stripPointerCasts();
- if (isa<ConstantPointerNull>(ExceptionObjectVar))
- return CloningDirector::SkipInstruction;
- assert(cast<AllocaInst>(ExceptionObjectVar)->isStaticAlloca() &&
- "catch parameter is not static alloca");
- Materializer.escapeCatchObject(ExceptionObjectVar);
- return CloningDirector::SkipInstruction;
-}
+ // Record the state of the unwind dest as the TryParentState.
+ int UnwindDestState;
+
+ // If UnwindDest is null at this point, either the pad in question can
+ // be exited by unwind to caller, or it cannot be exited by unwind. In
+ // either case, reporting such cases as unwinding to caller is correct.
+ // This can lead to EH tables that "look strange" -- if this pad is in
+ // a parent funclet which has other children that do unwind to an enclosing
+ // pad, the try region for this pad will be missing the "duplicate" EH
+ // clause entries that you'd expect to see covering the whole parent. That
+ // should be benign, since the unwind never actually happens. If it were
+ // an issue, we could add a subsequent pass that pushes unwind dests down
+ // from parents that have them to children that appear to unwind to caller.
+ if (!UnwindDest) {
+ UnwindDestState = -1;
+ } else {
+ UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()];
+ }

-CloningDirector::CloningAction
-WinEHCatchDirector::handleEndCatch(ValueToValueMapTy &VMap,
- const Instruction *Inst, BasicBlock *NewBB) {
- auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst);
- // It might be interesting to track whether or not we are inside a catch
- // function, but that might make the algorithm more brittle than it needs
- // to be.
-
- // The end catch call can occur in one of two places: either in a
- // landingpad block that is part of the catch handlers exception mechanism,
- // or at the end of the catch block. However, a catch-all handler may call
- // end catch from the original landing pad. If the call occurs in a nested
- // landing pad block, we must skip it and continue so that the landing pad
- // gets cloned.
- auto *ParentBB = IntrinCall->getParent();
- if (ParentBB->isLandingPad() && !LPadMap.isOriginLandingPadBlock(ParentBB))
- return CloningDirector::SkipInstruction;
-
- // If an end catch occurs anywhere else we want to terminate the handler
- // with a return to the code that follows the endcatch call. If the
- // next instruction is not an unconditional branch, we need to split the
- // block to provide a clear target for the return instruction.
- BasicBlock *ContinueBB;
- auto Next = std::next(BasicBlock::const_iterator(IntrinCall));
- const BranchInst *Branch = dyn_cast<BranchInst>(Next);
- if (!Branch || !Branch->isUnconditional()) {
- // We're interrupting the cloning process at this location, so the
- // const_cast we're doing here will not cause a problem.
- ContinueBB = SplitBlock(const_cast<BasicBlock *>(ParentBB),
- const_cast<Instruction *>(cast<Instruction>(Next)));
- } else {
- ContinueBB = Branch->getSuccessor(0);
+ Entry->TryParentState = UnwindDestState;
}
- ReturnInst::Create(NewBB->getContext(), BlockAddress::get(ContinueBB), NewBB);
- ReturnTargets.push_back(ContinueBB);
-
- // We just added a terminator to the cloned block.
- // Tell the caller to stop processing the current basic block so that
- // the branch instruction will be skipped.
- return CloningDirector::StopCloningBB; + // Step three: transfer information from pads to invokes. + calculateStateNumbersForInvokes(Fn, FuncInfo); } -CloningDirector::CloningAction WinEHCatchDirector::handleTypeIdFor( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst); - Value *Selector = IntrinCall->getArgOperand(0)->stripPointerCasts(); - // This causes a replacement that will collapse the landing pad CFG based - // on the filter function we intend to match. - if (Selector == CurrentSelector) - VMap[Inst] = ConstantInt::get(SelectorIDType, 1); - else - VMap[Inst] = ConstantInt::get(SelectorIDType, 0); - // Tell the caller not to clone this instruction. - return CloningDirector::SkipInstruction; -} +void WinEHPrepare::colorFunclets(Function &F) { + BlockColors = colorEHFunclets(F); -CloningDirector::CloningAction WinEHCatchDirector::handleIndirectBr( - ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) { - // If this indirect branch is not part of a landing pad block, just clone it. - const BasicBlock *ParentBB = IBr->getParent(); - if (!ParentBB->isLandingPad()) - return CloningDirector::CloneInstruction; - - // If it is part of a landing pad, we want to filter out target blocks - // that are not part of the handler we are outlining. - const LandingPadInst *LPad = ParentBB->getLandingPadInst(); - - // Save this correlation for later processing. - NestedLPtoOriginalLP[cast<LandingPadInst>(VMap[LPad])] = LPad; - - // We should only get here for landing pads that have already been outlined. - assert(match(LPad->getNextNode(), m_Intrinsic<Intrinsic::eh_actions>())); - - // Copy the indirectbr, but only include targets that were previously - // identified as EH blocks and are dominated by the nested landing pad. - SetVector<const BasicBlock *> ReturnTargets; - for (int I = 0, E = IBr->getNumDestinations(); I < E; ++I) { - auto *TargetBB = IBr->getDestination(I); - if (EHBlocks.count(const_cast<BasicBlock*>(TargetBB)) && - DT->dominates(ParentBB, TargetBB)) { - DEBUG(dbgs() << " Adding destination " << TargetBB->getName() << "\n"); - ReturnTargets.insert(TargetBB); - } + // Invert the map from BB to colors to color to BBs. + for (BasicBlock &BB : F) { + ColorVector &Colors = BlockColors[&BB]; + for (BasicBlock *Color : Colors) + FuncletBlocks[Color].push_back(&BB); } - IndirectBrInst *NewBranch = - IndirectBrInst::Create(const_cast<Value *>(IBr->getAddress()), - ReturnTargets.size(), NewBB); - for (auto *Target : ReturnTargets) - NewBranch->addDestination(const_cast<BasicBlock*>(Target)); - - // The operands and targets of the branch instruction are remapped later - // because it is a terminator. Tell the cloning code to clone the - // blocks we just added to the target list. - return CloningDirector::CloneSuccessors; } -CloningDirector::CloningAction -WinEHCatchDirector::handleInvoke(ValueToValueMapTy &VMap, - const InvokeInst *Invoke, BasicBlock *NewBB) { - return CloningDirector::CloneInstruction; -} +void WinEHPrepare::demotePHIsOnFunclets(Function &F) { + // Strip PHI nodes off of EH pads. + SmallVector<PHINode *, 16> PHINodes; + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { + BasicBlock *BB = &*FI++; + if (!BB->isEHPad()) + continue; + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + Instruction *I = &*BI++; + auto *PN = dyn_cast<PHINode>(I); + // Stop at the first non-PHI. 
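+ // (PHI nodes must be grouped at the head of a basic block, so the first
+ // non-PHI marks the end of the nodes this pass needs to demote.)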
+ if (!PN) + break; -CloningDirector::CloningAction -WinEHCatchDirector::handleResume(ValueToValueMapTy &VMap, - const ResumeInst *Resume, BasicBlock *NewBB) { - // Resume instructions shouldn't be reachable from catch handlers. - // We still need to handle it, but it will be pruned. - BasicBlock::InstListType &InstList = NewBB->getInstList(); - InstList.push_back(new UnreachableInst(NewBB->getContext())); - return CloningDirector::StopCloningBB; -} + AllocaInst *SpillSlot = insertPHILoads(PN, F); + if (SpillSlot) + insertPHIStores(PN, SpillSlot); -CloningDirector::CloningAction -WinEHCatchDirector::handleCompare(ValueToValueMapTy &VMap, - const CmpInst *Compare, BasicBlock *NewBB) { - const IntrinsicInst *IntrinCall = nullptr; - if (match(Compare->getOperand(0), m_Intrinsic<Intrinsic::eh_typeid_for>())) { - IntrinCall = dyn_cast<IntrinsicInst>(Compare->getOperand(0)); - } else if (match(Compare->getOperand(1), - m_Intrinsic<Intrinsic::eh_typeid_for>())) { - IntrinCall = dyn_cast<IntrinsicInst>(Compare->getOperand(1)); - } - if (IntrinCall) { - Value *Selector = IntrinCall->getArgOperand(0)->stripPointerCasts(); - // This causes a replacement that will collapse the landing pad CFG based - // on the filter function we intend to match. - if (Selector == CurrentSelector->stripPointerCasts()) { - VMap[Compare] = ConstantInt::get(SelectorIDType, 1); - } else { - VMap[Compare] = ConstantInt::get(SelectorIDType, 0); + PHINodes.push_back(PN); } - return CloningDirector::SkipInstruction; - } - return CloningDirector::CloneInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleLandingPad( - ValueToValueMapTy &VMap, const LandingPadInst *LPad, BasicBlock *NewBB) { - // The MS runtime will terminate the process if an exception occurs in a - // cleanup handler, so we shouldn't encounter landing pads in the actual - // cleanup code, but they may appear in catch blocks. Depending on where - // we started cloning we may see one, but it will get dropped during dead - // block pruning. - Instruction *NewInst = new UnreachableInst(NewBB->getContext()); - VMap[LPad] = NewInst; - BasicBlock::InstListType &InstList = NewBB->getInstList(); - InstList.push_back(NewInst); - return CloningDirector::StopCloningBB; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleBeginCatch( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // Cleanup code may flow into catch blocks or the catch block may be part - // of a branch that will be optimized away. We'll insert a return - // instruction now, but it may be pruned before the cloning process is - // complete. - ReturnInst::Create(NewBB->getContext(), nullptr, NewBB); - return CloningDirector::StopCloningBB; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleEndCatch( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // Cleanup handlers nested within catch handlers may begin with a call to - // eh.endcatch. We can just ignore that instruction. - return CloningDirector::SkipInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleTypeIdFor( - ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) { - // If we encounter a selector comparison while cloning a cleanup handler, - // we want to stop cloning immediately. Anything after the dispatch - // will be outlined into a different handler. 
- BasicBlock *CatchHandler; - Constant *Selector; - BasicBlock *NextBB; - if (isSelectorDispatch(const_cast<BasicBlock *>(Inst->getParent()), - CatchHandler, Selector, NextBB)) { - ReturnInst::Create(NewBB->getContext(), nullptr, NewBB); - return CloningDirector::StopCloningBB; } - // If eg.typeid.for is called for any other reason, it can be ignored. - VMap[Inst] = ConstantInt::get(SelectorIDType, 0); - return CloningDirector::SkipInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleIndirectBr( - ValueToValueMapTy &VMap, - const IndirectBrInst *IBr, - BasicBlock *NewBB) { - // No special handling is required for cleanup cloning. - return CloningDirector::CloneInstruction; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleInvoke( - ValueToValueMapTy &VMap, const InvokeInst *Invoke, BasicBlock *NewBB) { - // All invokes in cleanup handlers can be replaced with calls. - SmallVector<Value *, 16> CallArgs(Invoke->op_begin(), Invoke->op_end() - 3); - // Insert a normal call instruction... - CallInst *NewCall = - CallInst::Create(const_cast<Value *>(Invoke->getCalledValue()), CallArgs, - Invoke->getName(), NewBB); - NewCall->setCallingConv(Invoke->getCallingConv()); - NewCall->setAttributes(Invoke->getAttributes()); - NewCall->setDebugLoc(Invoke->getDebugLoc()); - VMap[Invoke] = NewCall; - - // Remap the operands. - llvm::RemapInstruction(NewCall, VMap, RF_None, nullptr, &Materializer); - - // Insert an unconditional branch to the normal destination. - BranchInst::Create(Invoke->getNormalDest(), NewBB); - - // The unwind destination won't be cloned into the new function, so - // we don't need to clean up its phi nodes. - - // We just added a terminator to the cloned block. - // Tell the caller to stop processing the current basic block. - return CloningDirector::CloneSuccessors; -} - -CloningDirector::CloningAction WinEHCleanupDirector::handleResume( - ValueToValueMapTy &VMap, const ResumeInst *Resume, BasicBlock *NewBB) { - ReturnInst::Create(NewBB->getContext(), nullptr, NewBB); - - // We just added a terminator to the cloned block. - // Tell the caller to stop processing the current basic block so that - // the branch instruction will be skipped. - return CloningDirector::StopCloningBB; -} -CloningDirector::CloningAction -WinEHCleanupDirector::handleCompare(ValueToValueMapTy &VMap, - const CmpInst *Compare, BasicBlock *NewBB) { - if (match(Compare->getOperand(0), m_Intrinsic<Intrinsic::eh_typeid_for>()) || - match(Compare->getOperand(1), m_Intrinsic<Intrinsic::eh_typeid_for>())) { - VMap[Compare] = ConstantInt::get(SelectorIDType, 1); - return CloningDirector::SkipInstruction; + for (auto *PN : PHINodes) { + // There may be lingering uses on other EH PHIs being removed + PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->eraseFromParent(); } - return CloningDirector::CloneInstruction; } -WinEHFrameVariableMaterializer::WinEHFrameVariableMaterializer( - Function *OutlinedFn, Value *ParentFP, FrameVarInfoMap &FrameVarInfo) - : FrameVarInfo(FrameVarInfo), Builder(OutlinedFn->getContext()) { - BasicBlock *EntryBB = &OutlinedFn->getEntryBlock(); - - // New allocas should be inserted in the entry block, but after the parent FP - // is established if it is an instruction. 
- Instruction *InsertPoint = EntryBB->getFirstInsertionPt();
- if (auto *FPInst = dyn_cast<Instruction>(ParentFP))
- InsertPoint = FPInst->getNextNode();
- Builder.SetInsertPoint(EntryBB, InsertPoint);
-}
+void WinEHPrepare::cloneCommonBlocks(Function &F) {
+ // We need to clone all blocks which belong to multiple funclets. Values are
+ // remapped throughout the funclet to propagate both the new instructions
+ // *and* the new basic blocks themselves.
+ for (auto &Funclets : FuncletBlocks) {
+ BasicBlock *FuncletPadBB = Funclets.first;
+ std::vector<BasicBlock *> &BlocksInFunclet = Funclets.second;
+ Value *FuncletToken;
+ if (FuncletPadBB == &F.getEntryBlock())
+ FuncletToken = ConstantTokenNone::get(F.getContext());
+ else
+ FuncletToken = FuncletPadBB->getFirstNonPHI();
+
+ std::vector<std::pair<BasicBlock *, BasicBlock *>> Orig2Clone;
+ ValueToValueMapTy VMap;
+ for (BasicBlock *BB : BlocksInFunclet) {
+ ColorVector &ColorsForBB = BlockColors[BB];
+ // We don't need to do anything if the block is monochromatic.
+ size_t NumColorsForBB = ColorsForBB.size();
+ if (NumColorsForBB == 1)
+ continue;

-Value *WinEHFrameVariableMaterializer::materializeValueFor(Value *V) {
- // If we're asked to materialize a static alloca, we temporarily create an
- // alloca in the outlined function and add this to the FrameVarInfo map. When
- // all the outlining is complete, we'll replace these temporary allocas with
- // calls to llvm.localrecover.
- if (auto *AV = dyn_cast<AllocaInst>(V)) {
- assert(AV->isStaticAlloca() &&
- "cannot materialize un-demoted dynamic alloca");
- AllocaInst *NewAlloca = dyn_cast<AllocaInst>(AV->clone());
- Builder.Insert(NewAlloca, AV->getName());
- FrameVarInfo[AV].push_back(NewAlloca);
- return NewAlloca;
- }
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Cloning block \'" << BB->getName()
+ << "\' for funclet \'" << FuncletPadBB->getName()
+ << "\'.\n");

- if (isa<Instruction>(V) || isa<Argument>(V)) {
- Function *Parent = isa<Instruction>(V)
- ? cast<Instruction>(V)->getParent()->getParent()
- : cast<Argument>(V)->getParent();
- errs()
- << "Failed to demote instruction used in exception handler of function "
- << GlobalValue::getRealLinkageName(Parent->getName()) << ":\n";
- errs() << " " << *V << '\n';
- report_fatal_error("WinEHPrepare failed to demote instruction");
- }
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB =
+ CloneBasicBlock(BB, VMap, Twine(".for.", FuncletPadBB->getName()));
+ // Insert the clone immediately after the original to ensure determinism
+ // and to keep the same relative ordering of any funclet's blocks.
+ CBB->insertInto(&F, BB->getNextNode());

- // Don't materialize other values.
- return nullptr;
-}
+ // Add basic block mapping.
+ VMap[BB] = CBB;

-void WinEHFrameVariableMaterializer::escapeCatchObject(Value *V) {
- // Catch parameter objects have to live in the parent frame. When we see a use
- // of a catch parameter, add a sentinel to the multimap to indicate that it's
- // used from another handler. This will prevent us from trying to sink the
- // alloca into the handler and ensure that the catch parameter is present in
- // the call to llvm.localescape.
- FrameVarInfo[V].push_back(getCatchObjectSentinel());
-}
-
-// This function maps the catch and cleanup handlers that are reachable from the
-// specified landing pad.
The landing pad sequence will have this basic shape: -// -// <cleanup handler> -// <selector comparison> -// <catch handler> -// <cleanup handler> -// <selector comparison> -// <catch handler> -// <cleanup handler> -// ... -// -// Any of the cleanup slots may be absent. The cleanup slots may be occupied by -// any arbitrary control flow, but all paths through the cleanup code must -// eventually reach the next selector comparison and no path can skip to a -// different selector comparisons, though some paths may terminate abnormally. -// Therefore, we will use a depth first search from the start of any given -// cleanup block and stop searching when we find the next selector comparison. -// -// If the landingpad instruction does not have a catch clause, we will assume -// that any instructions other than selector comparisons and catch handlers can -// be ignored. In practice, these will only be the boilerplate instructions. -// -// The catch handlers may also have any control structure, but we are only -// interested in the start of the catch handlers, so we don't need to actually -// follow the flow of the catch handlers. The start of the catch handlers can -// be located from the compare instructions, but they can be skipped in the -// flow by following the contrary branch. -void WinEHPrepare::mapLandingPadBlocks(LandingPadInst *LPad, - LandingPadActions &Actions) { - unsigned int NumClauses = LPad->getNumClauses(); - unsigned int HandlersFound = 0; - BasicBlock *BB = LPad->getParent(); - - DEBUG(dbgs() << "Mapping landing pad: " << BB->getName() << "\n"); - - if (NumClauses == 0) { - findCleanupHandlers(Actions, BB, nullptr); - return; - } - - VisitedBlockSet VisitedBlocks; - - while (HandlersFound != NumClauses) { - BasicBlock *NextBB = nullptr; + // Record delta operations that we need to perform to our color mappings. + Orig2Clone.emplace_back(BB, CBB); + } - // Skip over filter clauses. - if (LPad->isFilter(HandlersFound)) { - ++HandlersFound; + // If nothing was cloned, we're done cloning in this funclet. + if (Orig2Clone.empty()) continue; + + // Update our color mappings to reflect that one block has lost a color and + // another has gained a color. + for (auto &BBMapping : Orig2Clone) { + BasicBlock *OldBlock = BBMapping.first; + BasicBlock *NewBlock = BBMapping.second; + + BlocksInFunclet.push_back(NewBlock); + ColorVector &NewColors = BlockColors[NewBlock]; + assert(NewColors.empty() && "A new block should only have one color!"); + NewColors.push_back(FuncletPadBB); + + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << " Assigned color \'" << FuncletPadBB->getName() + << "\' to block \'" << NewBlock->getName() + << "\'.\n"); + + BlocksInFunclet.erase( + std::remove(BlocksInFunclet.begin(), BlocksInFunclet.end(), OldBlock), + BlocksInFunclet.end()); + ColorVector &OldColors = BlockColors[OldBlock]; + OldColors.erase( + std::remove(OldColors.begin(), OldColors.end(), FuncletPadBB), + OldColors.end()); + + DEBUG_WITH_TYPE("winehprepare-coloring", + dbgs() << " Removed color \'" << FuncletPadBB->getName() + << "\' from block \'" << OldBlock->getName() + << "\'.\n"); } - // See if the clause we're looking for is a catch-all. - // If so, the catch begins immediately. - Constant *ExpectedSelector = - LPad->getClause(HandlersFound)->stripPointerCasts(); - if (isa<ConstantPointerNull>(ExpectedSelector)) { - // The catch all must occur last. - assert(HandlersFound == NumClauses - 1); - - // There can be additional selector dispatches in the call chain that we - // need to ignore. 
- BasicBlock *CatchBlock = nullptr; - Constant *Selector; - while (BB && isSelectorDispatch(BB, CatchBlock, Selector, NextBB)) { - DEBUG(dbgs() << " Found extra catch dispatch in block " - << CatchBlock->getName() << "\n"); - BB = NextBB; - } + // Loop over all of the instructions in this funclet, fixing up operand + // references as we go. This uses VMap to do all the hard work. + for (BasicBlock *BB : BlocksInFunclet) + // Loop over all instructions, fixing each one as we find it... + for (Instruction &I : *BB) + RemapInstruction(&I, VMap, + RF_IgnoreMissingEntries | RF_NoModuleLevelChanges); + + // Catchrets targeting cloned blocks need to be updated separately from + // the loop above because they are not in the current funclet. + SmallVector<CatchReturnInst *, 2> FixupCatchrets; + for (auto &BBMapping : Orig2Clone) { + BasicBlock *OldBlock = BBMapping.first; + BasicBlock *NewBlock = BBMapping.second; + + FixupCatchrets.clear(); + for (BasicBlock *Pred : predecessors(OldBlock)) + if (auto *CatchRet = dyn_cast<CatchReturnInst>(Pred->getTerminator())) + if (CatchRet->getParentPad() == FuncletToken) + FixupCatchrets.push_back(CatchRet); + + for (CatchReturnInst *CatchRet : FixupCatchrets) + CatchRet->setSuccessor(NewBlock); + } - // Add the catch handler to the action list. - CatchHandler *Action = nullptr; - if (CatchHandlerMap.count(BB) && CatchHandlerMap[BB] != nullptr) { - // If the CatchHandlerMap already has an entry for this BB, re-use it. - Action = CatchHandlerMap[BB]; - assert(Action->getSelector() == ExpectedSelector); - } else { - // We don't expect a selector dispatch, but there may be a call to - // llvm.eh.begincatch, which separates catch handling code from - // cleanup code in the same control flow. This call looks for the - // begincatch intrinsic. - Action = findCatchHandler(BB, NextBB, VisitedBlocks); - if (Action) { - // For C++ EH, check if there is any interesting cleanup code before - // we begin the catch. This is important because cleanups cannot - // rethrow exceptions but code called from catches can. For SEH, it - // isn't important if some finally code before a catch-all is executed - // out of line or after recovering from the exception. - if (Personality == EHPersonality::MSVC_CXX) - findCleanupHandlers(Actions, BB, BB); + auto UpdatePHIOnClonedBlock = [&](PHINode *PN, bool IsForOldBlock) { + unsigned NumPreds = PN->getNumIncomingValues(); + for (unsigned PredIdx = 0, PredEnd = NumPreds; PredIdx != PredEnd; + ++PredIdx) { + BasicBlock *IncomingBlock = PN->getIncomingBlock(PredIdx); + bool EdgeTargetsFunclet; + if (auto *CRI = + dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) { + EdgeTargetsFunclet = (CRI->getParentPad() == FuncletToken); } else { - // If an action was not found, it means that the control flows - // directly into the catch-all handler and there is no cleanup code. - // That's an expected situation and we must create a catch action. - // Since this is a catch-all handler, the selector won't actually - // appear in the code anywhere. ExpectedSelector here is the constant - // null ptr that we got from the landing pad instruction. 
- Action = new CatchHandler(BB, ExpectedSelector, nullptr); - CatchHandlerMap[BB] = Action; + ColorVector &IncomingColors = BlockColors[IncomingBlock]; + assert(!IncomingColors.empty() && "Block not colored!"); + assert((IncomingColors.size() == 1 || + llvm::all_of(IncomingColors, + [&](BasicBlock *Color) { + return Color != FuncletPadBB; + })) && + "Cloning should leave this funclet's blocks monochromatic"); + EdgeTargetsFunclet = (IncomingColors.front() == FuncletPadBB); } + if (IsForOldBlock != EdgeTargetsFunclet) + continue; + PN->removeIncomingValue(IncomingBlock, /*DeletePHIIfEmpty=*/false); + // Revisit the next entry. + --PredIdx; + --PredEnd; } - Actions.insertCatchHandler(Action); - DEBUG(dbgs() << " Catch all handler at block " << BB->getName() << "\n"); - ++HandlersFound; - - // Once we reach a catch-all, don't expect to hit a resume instruction. - BB = nullptr; - break; - } - - CatchHandler *CatchAction = findCatchHandler(BB, NextBB, VisitedBlocks); - assert(CatchAction); - - // See if there is any interesting code executed before the dispatch. - findCleanupHandlers(Actions, BB, CatchAction->getStartBlock()); - - // When the source program contains multiple nested try blocks the catch - // handlers can get strung together in such a way that we can encounter - // a dispatch for a selector that we've already had a handler for. - if (CatchAction->getSelector()->stripPointerCasts() == ExpectedSelector) { - ++HandlersFound; - - // Add the catch handler to the action list. - DEBUG(dbgs() << " Found catch dispatch in block " - << CatchAction->getStartBlock()->getName() << "\n"); - Actions.insertCatchHandler(CatchAction); - } else { - // Under some circumstances optimized IR will flow unconditionally into a - // handler block without checking the selector. This can only happen if - // the landing pad has a catch-all handler and the handler for the - // preceeding catch clause is identical to the catch-call handler - // (typically an empty catch). In this case, the handler must be shared - // by all remaining clauses. - if (isa<ConstantPointerNull>( - CatchAction->getSelector()->stripPointerCasts())) { - DEBUG(dbgs() << " Applying early catch-all handler in block " - << CatchAction->getStartBlock()->getName() - << " to all remaining clauses.\n"); - Actions.insertCatchHandler(CatchAction); - return; + }; + + for (auto &BBMapping : Orig2Clone) { + BasicBlock *OldBlock = BBMapping.first; + BasicBlock *NewBlock = BBMapping.second; + for (Instruction &OldI : *OldBlock) { + auto *OldPN = dyn_cast<PHINode>(&OldI); + if (!OldPN) + break; + UpdatePHIOnClonedBlock(OldPN, /*IsForOldBlock=*/true); + } + for (Instruction &NewI : *NewBlock) { + auto *NewPN = dyn_cast<PHINode>(&NewI); + if (!NewPN) + break; + UpdatePHIOnClonedBlock(NewPN, /*IsForOldBlock=*/false); } - - DEBUG(dbgs() << " Found extra catch dispatch in block " - << CatchAction->getStartBlock()->getName() << "\n"); } - // Move on to the block after the catch handler. - BB = NextBB; - } - - // If we didn't wind up in a catch-all, see if there is any interesting code - // executed before the resume. - findCleanupHandlers(Actions, BB, BB); - - // It's possible that some optimization moved code into a landingpad that - // wasn't - // previously being used for cleanup. If that happens, we need to execute - // that - // extra code from a cleanup handler. 
- if (Actions.includesCleanup() && !LPad->isCleanup())
- LPad->setCleanup(true);
-}
-
-// This function searches starting with the input block for the next
-// block that terminates with a branch whose condition is based on a selector
-// comparison. This may be the input block. See the mapLandingPadBlocks
-// comments for a discussion of control flow assumptions.
-//
-CatchHandler *WinEHPrepare::findCatchHandler(BasicBlock *BB,
- BasicBlock *&NextBB,
- VisitedBlockSet &VisitedBlocks) {
- // See if we've already found a catch handler use it.
- // Call count() first to avoid creating a null entry for blocks
- // we haven't seen before.
- if (CatchHandlerMap.count(BB) && CatchHandlerMap[BB] != nullptr) {
- CatchHandler *Action = cast<CatchHandler>(CatchHandlerMap[BB]);
- NextBB = Action->getNextBB();
- return Action;
- }
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to
+ // the PHI nodes for NewBB now.
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+ for (BasicBlock *SuccBB : successors(NewBlock)) {
+ for (Instruction &SuccI : *SuccBB) {
+ auto *SuccPN = dyn_cast<PHINode>(&SuccI);
+ if (!SuccPN)
+ break;
+
+ // Ok, we have a PHI node. Figure out what the incoming value was for
+ // the OldBlock.
+ int OldBlockIdx = SuccPN->getBasicBlockIndex(OldBlock);
+ if (OldBlockIdx == -1)
+ break;
+ Value *IV = SuccPN->getIncomingValue(OldBlockIdx);
+
+ // Remap the value if necessary.
+ if (auto *Inst = dyn_cast<Instruction>(IV)) {
+ ValueToValueMapTy::iterator I = VMap.find(Inst);
+ if (I != VMap.end())
+ IV = I->second;
+ }

- // VisitedBlocks applies only to the current search. We still
- // need to consider blocks that we've visited while mapping other
- // landing pads.
- VisitedBlocks.insert(BB);
-
- BasicBlock *CatchBlock = nullptr;
- Constant *Selector = nullptr;
-
- // If this is the first time we've visited this block from any landing pad
- // look to see if it is a selector dispatch block.
- if (!CatchHandlerMap.count(BB)) {
- if (isSelectorDispatch(BB, CatchBlock, Selector, NextBB)) {
- CatchHandler *Action = new CatchHandler(BB, Selector, NextBB);
- CatchHandlerMap[BB] = Action;
- return Action;
- }
- // If we encounter a block containing an llvm.eh.begincatch before we
- // find a selector dispatch block, the handler is assumed to be
- // reached unconditionally. This happens for catch-all blocks, but
- // it can also happen for other catch handlers that have been combined
- // with the catch-all handler during optimization.
- if (isCatchBlock(BB)) {
- PointerType *Int8PtrTy = Type::getInt8PtrTy(BB->getContext());
- Constant *NullSelector = ConstantPointerNull::get(Int8PtrTy);
- CatchHandler *Action = new CatchHandler(BB, NullSelector, nullptr);
- CatchHandlerMap[BB] = Action;
- return Action;
- }
- }
+ SuccPN->addIncoming(IV, NewBlock);
+ }
+ }
}

- // Visit each successor, looking for the dispatch.
- // FIXME: We expect to find the dispatch quickly, so this will probably
- // work better as a breadth first search.
- for (BasicBlock *Succ : successors(BB)) {
- if (VisitedBlocks.count(Succ))
- continue;
+ for (ValueToValueMapTy::value_type VT : VMap) {
+ // If there were values defined in BB that are used outside the funclet,
+ // then we now have to update all uses of the value to use either the
+ // original value, the cloned value, or some PHI derived value. This can
+ // require arbitrary PHI insertion, which we are prepared to do; clean
+ // these up now.
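+ // (SSAUpdater, seeded below with both the original and the cloned
+ // definitions, synthesizes any PHIs required while rewriting each use.)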
+ SmallVector<Use *, 16> UsesToRename; - CatchHandler *Action = findCatchHandler(Succ, NextBB, VisitedBlocks); - if (Action) - return Action; - } - return nullptr; -} - -// These are helper functions to combine repeated code from findCleanupHandlers. -static void createCleanupHandler(LandingPadActions &Actions, - CleanupHandlerMapTy &CleanupHandlerMap, - BasicBlock *BB) { - CleanupHandler *Action = new CleanupHandler(BB); - CleanupHandlerMap[BB] = Action; - Actions.insertCleanupHandler(Action); - DEBUG(dbgs() << " Found cleanup code in block " - << Action->getStartBlock()->getName() << "\n"); -} - -static CallSite matchOutlinedFinallyCall(BasicBlock *BB, - Instruction *MaybeCall) { - // Look for finally blocks that Clang has already outlined for us. - // %fp = call i8* @llvm.localaddress() - // call void @"fin$parent"(iN 1, i8* %fp) - if (isLocalAddressCall(MaybeCall) && MaybeCall != BB->getTerminator()) - MaybeCall = MaybeCall->getNextNode(); - CallSite FinallyCall(MaybeCall); - if (!FinallyCall || FinallyCall.arg_size() != 2) - return CallSite(); - if (!match(FinallyCall.getArgument(0), m_SpecificInt(1))) - return CallSite(); - if (!isLocalAddressCall(FinallyCall.getArgument(1))) - return CallSite(); - return FinallyCall; -} - -static BasicBlock *followSingleUnconditionalBranches(BasicBlock *BB) { - // Skip single ubr blocks. - while (BB->getFirstNonPHIOrDbg() == BB->getTerminator()) { - auto *Br = dyn_cast<BranchInst>(BB->getTerminator()); - if (Br && Br->isUnconditional()) - BB = Br->getSuccessor(0); - else - return BB; - } - return BB; -} - -// This function searches starting with the input block for the next block that -// contains code that is not part of a catch handler and would not be eliminated -// during handler outlining. -// -void WinEHPrepare::findCleanupHandlers(LandingPadActions &Actions, - BasicBlock *StartBB, BasicBlock *EndBB) { - // Here we will skip over the following: - // - // landing pad prolog: - // - // Unconditional branches - // - // Selector dispatch - // - // Resume pattern - // - // Anything else marks the start of an interesting block - - BasicBlock *BB = StartBB; - // Anything other than an unconditional branch will kick us out of this loop - // one way or another. - while (BB) { - BB = followSingleUnconditionalBranches(BB); - // If we've already scanned this block, don't scan it again. If it is - // a cleanup block, there will be an action in the CleanupHandlerMap. - // If we've scanned it and it is not a cleanup block, there will be a - // nullptr in the CleanupHandlerMap. If we have not scanned it, there will - // be no entry in the CleanupHandlerMap. We must call count() first to - // avoid creating a null entry for blocks we haven't scanned. - if (CleanupHandlerMap.count(BB)) { - if (auto *Action = CleanupHandlerMap[BB]) { - Actions.insertCleanupHandler(Action); - DEBUG(dbgs() << " Found cleanup code in block " - << Action->getStartBlock()->getName() << "\n"); - // FIXME: This cleanup might chain into another, and we need to discover - // that. - return; - } else { - // Here we handle the case where the cleanup handler map contains a - // value for this block but the value is a nullptr. This means that - // we have previously analyzed the block and determined that it did - // not contain any cleanup code. Based on the earlier analysis, we - // know the block must end in either an unconditional branch, a - // resume or a conditional branch that is predicated on a comparison - // with a selector. 
Either the resume or the selector dispatch - // would terminate the search for cleanup code, so the unconditional - // branch is the only case for which we might need to continue - // searching. - BasicBlock *SuccBB = followSingleUnconditionalBranches(BB); - if (SuccBB == BB || SuccBB == EndBB) - return; - BB = SuccBB; + auto *OldI = dyn_cast<Instruction>(const_cast<Value *>(VT.first)); + if (!OldI) continue; + auto *NewI = cast<Instruction>(VT.second); + // Scan all uses of this instruction to see if it is used outside of its + // funclet, and if so, record them in UsesToRename. + for (Use &U : OldI->uses()) { + Instruction *UserI = cast<Instruction>(U.getUser()); + BasicBlock *UserBB = UserI->getParent(); + ColorVector &ColorsForUserBB = BlockColors[UserBB]; + assert(!ColorsForUserBB.empty()); + if (ColorsForUserBB.size() > 1 || + *ColorsForUserBB.begin() != FuncletPadBB) + UsesToRename.push_back(&U); } - } - // Create an entry in the cleanup handler map for this block. Initially - // we create an entry that says this isn't a cleanup block. If we find - // cleanup code, the caller will replace this entry. - CleanupHandlerMap[BB] = nullptr; + // If there are no uses outside the block, we're done with this + // instruction. + if (UsesToRename.empty()) + continue; - TerminatorInst *Terminator = BB->getTerminator(); + // We found a use of OldI outside of the funclet. Rename all uses of OldI + // that are outside its funclet to be uses of the appropriate PHI node + // etc. + SSAUpdater SSAUpdate; + SSAUpdate.Initialize(OldI->getType(), OldI->getName()); + SSAUpdate.AddAvailableValue(OldI->getParent(), OldI); + SSAUpdate.AddAvailableValue(NewI->getParent(), NewI); - // Landing pad blocks have extra instructions we need to accept. - LandingPadMap *LPadMap = nullptr; - if (BB->isLandingPad()) { - LandingPadInst *LPad = BB->getLandingPadInst(); - LPadMap = &LPadMaps[LPad]; - if (!LPadMap->isInitialized()) - LPadMap->mapLandingPad(LPad); + while (!UsesToRename.empty()) + SSAUpdate.RewriteUseAfterInsertions(*UsesToRename.pop_back_val()); } + } +} - // Look for the bare resume pattern: - // %lpad.val1 = insertvalue { i8*, i32 } undef, i8* %exn, 0 - // %lpad.val2 = insertvalue { i8*, i32 } %lpad.val1, i32 %sel, 1 - // resume { i8*, i32 } %lpad.val2 - if (auto *Resume = dyn_cast<ResumeInst>(Terminator)) { - InsertValueInst *Insert1 = nullptr; - InsertValueInst *Insert2 = nullptr; - Value *ResumeVal = Resume->getOperand(0); - // If the resume value isn't a phi or landingpad value, it should be a - // series of insertions. Identify them so we can avoid them when scanning - // for cleanups. - if (!isa<PHINode>(ResumeVal) && !isa<LandingPadInst>(ResumeVal)) { - Insert2 = dyn_cast<InsertValueInst>(ResumeVal); - if (!Insert2) - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - Insert1 = dyn_cast<InsertValueInst>(Insert2->getAggregateOperand()); - if (!Insert1) - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - Instruction *Inst = II; - if (LPadMap && LPadMap->isLandingPadSpecificInst(Inst)) +void WinEHPrepare::removeImplausibleInstructions(Function &F) { + // Remove implausible terminators and replace them with UnreachableInst. 
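// Stray call sites are vetted first: a call or invoke whose "funclet" operand
// bundle does not name this funclet's pad cannot execute here, so it is
// rewritten to unreachable as well (after dropping the unwind edge when the
// call site is an invoke).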
+ for (auto &Funclet : FuncletBlocks) { + BasicBlock *FuncletPadBB = Funclet.first; + std::vector<BasicBlock *> &BlocksInFunclet = Funclet.second; + Instruction *FirstNonPHI = FuncletPadBB->getFirstNonPHI(); + auto *FuncletPad = dyn_cast<FuncletPadInst>(FirstNonPHI); + auto *CatchPad = dyn_cast_or_null<CatchPadInst>(FuncletPad); + auto *CleanupPad = dyn_cast_or_null<CleanupPadInst>(FuncletPad); + + for (BasicBlock *BB : BlocksInFunclet) { + for (Instruction &I : *BB) { + CallSite CS(&I); + if (!CS) continue; - if (Inst == Insert1 || Inst == Insert2 || Inst == Resume) - continue; - if (!Inst->hasOneUse() || - (Inst->user_back() != Insert1 && Inst->user_back() != Insert2)) { - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - } - return; - } - BranchInst *Branch = dyn_cast<BranchInst>(Terminator); - if (Branch && Branch->isConditional()) { - // Look for the selector dispatch. - // %2 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*)) - // %matches = icmp eq i32 %sel, %2 - // br i1 %matches, label %catch14, label %eh.resume - CmpInst *Compare = dyn_cast<CmpInst>(Branch->getCondition()); - if (!Compare || !Compare->isEquality()) - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - Instruction *Inst = II; - if (LPadMap && LPadMap->isLandingPadSpecificInst(Inst)) - continue; - if (Inst == Compare || Inst == Branch) + Value *FuncletBundleOperand = nullptr; + if (auto BU = CS.getOperandBundle(LLVMContext::OB_funclet)) + FuncletBundleOperand = BU->Inputs.front(); + + if (FuncletBundleOperand == FuncletPad) continue; - if (match(Inst, m_Intrinsic<Intrinsic::eh_typeid_for>())) + + // Skip call sites which are nounwind intrinsics. + auto *CalledFn = + dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) continue; - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - // The selector dispatch block should always terminate our search. - assert(BB == EndBB); - return; - } - if (isAsynchronousEHPersonality(Personality)) { - // If this is a landingpad block, split the block at the first non-landing - // pad instruction. - Instruction *MaybeCall = BB->getFirstNonPHIOrDbg(); - if (LPadMap) { - while (MaybeCall != BB->getTerminator() && - LPadMap->isLandingPadSpecificInst(MaybeCall)) - MaybeCall = MaybeCall->getNextNode(); + // This call site was not part of this funclet, remove it. + if (CS.isInvoke()) { + // Remove the unwind edge if it was an invoke. + removeUnwindEdge(BB); + // Get a pointer to the new call. + BasicBlock::iterator CallI = + std::prev(BB->getTerminator()->getIterator()); + auto *CI = cast<CallInst>(&*CallI); + changeToUnreachable(CI, /*UseLLVMTrap=*/false); + } else { + changeToUnreachable(&I, /*UseLLVMTrap=*/false); + } + + // There are no more instructions in the block (except for unreachable), + // we are done. + break; } - // Look for outlined finally calls on x64, since those happen to match the - // prototype provided by the runtime. 
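// A minimal model of the operand-bundle test above, with illustrative types
// rather than LLVM's: a call inside a funclet is plausible only if its
// "funclet" bundle carries the enclosing pad's token; nounwind intrinsics are
// exempt because they may appear in any funclet.
struct Token {};
struct Call {
  const Token *FuncletBundle = nullptr; // nullptr: no "funclet" bundle
  bool IsNoUnwindIntrinsic = false;
};

bool isImplausibleHere(const Call &C, const Token *EnclosingPad) {
  if (C.FuncletBundle == EnclosingPad)
    return false; // belongs to this funclet
  if (C.IsNoUnwindIntrinsic)
    return false; // cannot throw; allowed anywhere
  return true;    // candidate for changeToUnreachable
}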
- if (TheTriple.getArch() == Triple::x86_64) { - if (CallSite FinallyCall = matchOutlinedFinallyCall(BB, MaybeCall)) { - Function *Fin = FinallyCall.getCalledFunction(); - assert(Fin && "outlined finally call should be direct"); - auto *Action = new CleanupHandler(BB); - Action->setHandlerBlockOrFunc(Fin); - Actions.insertCleanupHandler(Action); - CleanupHandlerMap[BB] = Action; - DEBUG(dbgs() << " Found frontend-outlined finally call to " - << Fin->getName() << " in block " - << Action->getStartBlock()->getName() << "\n"); - - // Split the block if there were more interesting instructions and - // look for finally calls in the normal successor block. - BasicBlock *SuccBB = BB; - if (FinallyCall.getInstruction() != BB->getTerminator() && - FinallyCall.getInstruction()->getNextNode() != - BB->getTerminator()) { - SuccBB = - SplitBlock(BB, FinallyCall.getInstruction()->getNextNode(), DT); - } else { - if (FinallyCall.isInvoke()) { - SuccBB = cast<InvokeInst>(FinallyCall.getInstruction()) - ->getNormalDest(); - } else { - SuccBB = BB->getUniqueSuccessor(); - assert(SuccBB && - "splitOutlinedFinallyCalls didn't insert a branch"); - } - } - BB = SuccBB; - if (BB == EndBB) - return; - continue; + TerminatorInst *TI = BB->getTerminator(); + // CatchPadInst and CleanupPadInst can't transfer control to a ReturnInst. + bool IsUnreachableRet = isa<ReturnInst>(TI) && FuncletPad; + // The token consumed by a CatchReturnInst must match the funclet token. + bool IsUnreachableCatchret = false; + if (auto *CRI = dyn_cast<CatchReturnInst>(TI)) + IsUnreachableCatchret = CRI->getCatchPad() != CatchPad; + // The token consumed by a CleanupReturnInst must match the funclet token. + bool IsUnreachableCleanupret = false; + if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) + IsUnreachableCleanupret = CRI->getCleanupPad() != CleanupPad; + if (IsUnreachableRet || IsUnreachableCatchret || + IsUnreachableCleanupret) { + changeToUnreachable(TI, /*UseLLVMTrap=*/false); + } else if (isa<InvokeInst>(TI)) { + if (Personality == EHPersonality::MSVC_CXX && CleanupPad) { + // Invokes within a cleanuppad for the MSVC++ personality never + // transfer control to their unwind edge: the personality will + // terminate the program. + removeUnwindEdge(BB); } } } - - // Anything else is either a catch block or interesting cleanup code. - for (BasicBlock::iterator II = BB->getFirstNonPHIOrDbg(), IE = BB->end(); - II != IE; ++II) { - Instruction *Inst = II; - if (LPadMap && LPadMap->isLandingPadSpecificInst(Inst)) - continue; - // Unconditional branches fall through to this loop. - if (Inst == Branch) - continue; - // If this is a catch block, there is no cleanup code to be found. - if (match(Inst, m_Intrinsic<Intrinsic::eh_begincatch>())) - return; - // If this a nested landing pad, it may contain an endcatch call. - if (match(Inst, m_Intrinsic<Intrinsic::eh_endcatch>())) - return; - // Anything else makes this interesting cleanup code. - return createCleanupHandler(Actions, CleanupHandlerMap, BB); - } - - // Only unconditional branches in empty blocks should get this far. - assert(Branch && Branch->isUnconditional()); - if (BB == EndBB) - return; - BB = Branch->getSuccessor(0); } } -// This is a public function, declared in WinEHFuncInfo.h and is also -// referenced by WinEHNumbering in FunctionLoweringInfo.cpp. 
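// The terminator rules above, restated over a toy data model (illustrative,
// not LLVM's API): a funclet pad cannot return normally, and a catchret or
// cleanupret must consume the token of the very pad it exits.
enum class TermKind { Ret, CatchRet, CleanupRet, Branch };
struct Terminator {
  TermKind Kind;
  const void *PadToken = nullptr; // token consumed by catchret/cleanupret
};

bool isImplausible(const Terminator &T, const void *EnclosingPadToken) {
  switch (T.Kind) {
  case TermKind::Ret:
    return EnclosingPadToken != nullptr; // ret inside a funclet pad
  case TermKind::CatchRet:
  case TermKind::CleanupRet:
    return T.PadToken != EnclosingPadToken; // token mismatch
  case TermKind::Branch:
    return false;
  }
  return false;
}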
-void llvm::parseEHActions(
- const IntrinsicInst *II,
- SmallVectorImpl<std::unique_ptr<ActionHandler>> &Actions) {
- assert(II->getIntrinsicID() == Intrinsic::eh_actions &&
- "attempted to parse non eh.actions intrinsic");
- for (unsigned I = 0, E = II->getNumArgOperands(); I != E;) {
- uint64_t ActionKind =
- cast<ConstantInt>(II->getArgOperand(I))->getZExtValue();
- if (ActionKind == /*catch=*/1) {
- auto *Selector = cast<Constant>(II->getArgOperand(I + 1));
- ConstantInt *EHObjIndex = cast<ConstantInt>(II->getArgOperand(I + 2));
- int64_t EHObjIndexVal = EHObjIndex->getSExtValue();
- Constant *Handler = cast<Constant>(II->getArgOperand(I + 3));
- I += 4;
- auto CH = make_unique<CatchHandler>(/*BB=*/nullptr, Selector,
- /*NextBB=*/nullptr);
- CH->setHandlerBlockOrFunc(Handler);
- CH->setExceptionVarIndex(EHObjIndexVal);
- Actions.push_back(std::move(CH));
- } else if (ActionKind == 0) {
- Constant *Handler = cast<Constant>(II->getArgOperand(I + 1));
- I += 2;
- auto CH = make_unique<CleanupHandler>(/*BB=*/nullptr);
- CH->setHandlerBlockOrFunc(Handler);
- Actions.push_back(std::move(CH));
- } else {
- llvm_unreachable("Expected either a catch or cleanup handler!");
- }
+void WinEHPrepare::cleanupPreparedFunclets(Function &F) {
+ // Clean up some of the mess we made by removing useless PHI nodes, trivial
+ // branches, etc.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+ BasicBlock *BB = &*FI++;
+ SimplifyInstructionsInBlock(BB);
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true);
+ MergeBlockIntoPredecessor(BB);
}
- std::reverse(Actions.begin(), Actions.end());
+
+ // We might have some unreachable blocks after cleaning up some impossible
+ // control flow.
+ removeUnreachableBlocks(F);
}
-namespace {
-struct WinEHNumbering {
- WinEHNumbering(WinEHFuncInfo &FuncInfo) : FuncInfo(FuncInfo),
- CurrentBaseState(-1), NextState(0) {}
+void WinEHPrepare::verifyPreparedFunclets(Function &F) {
+ for (BasicBlock &BB : F) {
+ size_t NumColors = BlockColors[&BB].size();
+ assert(NumColors == 1 && "Expected monochromatic BB!");
+ if (NumColors == 0)
+ report_fatal_error("Uncolored BB!");
+ if (NumColors > 1)
+ report_fatal_error("Multicolor BB!");
+ assert((DisableDemotion || !(BB.isEHPad() && isa<PHINode>(BB.begin()))) &&
+ "EH Pad still has a PHI!");
+ }
+}

- WinEHFuncInfo &FuncInfo;
- int CurrentBaseState;
- int NextState;
+bool WinEHPrepare::prepareExplicitEH(Function &F) {
+ // Remove unreachable blocks. It is not valuable to assign them a color and
+ // their existence can trick us into thinking values are alive when they are
+ // not.
+ removeUnreachableBlocks(F);

- SmallVector<std::unique_ptr<ActionHandler>, 4> HandlerStack;
- SmallPtrSet<const Function *, 4> VisitedHandlers;
+ // Determine which blocks are reachable from which funclet entries.
+ colorFunclets(F);

- int currentEHNumber() const {
- return HandlerStack.empty() ?
CurrentBaseState : HandlerStack.back()->getEHState(); - } + cloneCommonBlocks(F); - void createUnwindMapEntry(int ToState, ActionHandler *AH); - void createTryBlockMapEntry(int TryLow, int TryHigh, - ArrayRef<CatchHandler *> Handlers); - void processCallSite(MutableArrayRef<std::unique_ptr<ActionHandler>> Actions, - ImmutableCallSite CS); - void popUnmatchedActions(int FirstMismatch); - void calculateStateNumbers(const Function &F); - void findActionRootLPads(const Function &F); -}; -} + if (!DisableDemotion) + demotePHIsOnFunclets(F); -void WinEHNumbering::createUnwindMapEntry(int ToState, ActionHandler *AH) { - WinEHUnwindMapEntry UME; - UME.ToState = ToState; - if (auto *CH = dyn_cast_or_null<CleanupHandler>(AH)) - UME.Cleanup = cast<Function>(CH->getHandlerBlockOrFunc()); - else - UME.Cleanup = nullptr; - FuncInfo.UnwindMap.push_back(UME); -} + if (!DisableCleanups) { + DEBUG(verifyFunction(F)); + removeImplausibleInstructions(F); -void WinEHNumbering::createTryBlockMapEntry(int TryLow, int TryHigh, - ArrayRef<CatchHandler *> Handlers) { - // See if we already have an entry for this set of handlers. - // This is using iterators rather than a range-based for loop because - // if we find the entry we're looking for we'll need the iterator to erase it. - int NumHandlers = Handlers.size(); - auto I = FuncInfo.TryBlockMap.begin(); - auto E = FuncInfo.TryBlockMap.end(); - for ( ; I != E; ++I) { - auto &Entry = *I; - if (Entry.HandlerArray.size() != (size_t)NumHandlers) - continue; - int N; - for (N = 0; N < NumHandlers; ++N) { - if (Entry.HandlerArray[N].Handler != Handlers[N]->getHandlerBlockOrFunc()) - break; // breaks out of inner loop - } - // If all the handlers match, this is what we were looking for. - if (N == NumHandlers) { - break; - } + DEBUG(verifyFunction(F)); + cleanupPreparedFunclets(F); } - // If we found an existing entry for this set of handlers, extend the range - // but move the entry to the end of the map vector. The order of entries - // in the map is critical to the way that the runtime finds handlers. - // FIXME: Depending on what has happened with block ordering, this may - // incorrectly combine entries that should remain separate. - if (I != E) { - // Copy the existing entry. - WinEHTryBlockMapEntry Entry = *I; - Entry.TryLow = std::min(TryLow, Entry.TryLow); - Entry.TryHigh = std::max(TryHigh, Entry.TryHigh); - assert(Entry.TryLow <= Entry.TryHigh); - // Erase the old entry and add this one to the back. - FuncInfo.TryBlockMap.erase(I); - FuncInfo.TryBlockMap.push_back(Entry); - return; - } + DEBUG(verifyPreparedFunclets(F)); + // Recolor the CFG to verify that all is well. + DEBUG(colorFunclets(F)); + DEBUG(verifyPreparedFunclets(F)); - // If we didn't find an entry, create a new one. - WinEHTryBlockMapEntry TBME; - TBME.TryLow = TryLow; - TBME.TryHigh = TryHigh; - assert(TBME.TryLow <= TBME.TryHigh); - for (CatchHandler *CH : Handlers) { - WinEHHandlerType HT; - if (CH->getSelector()->isNullValue()) { - HT.Adjectives = 0x40; - HT.TypeDescriptor = nullptr; - } else { - auto *GV = cast<GlobalVariable>(CH->getSelector()->stripPointerCasts()); - // Selectors are always pointers to GlobalVariables with 'struct' type. - // The struct has two fields, adjectives and a type descriptor. 
- auto *CS = cast<ConstantStruct>(GV->getInitializer()); - HT.Adjectives = - cast<ConstantInt>(CS->getAggregateElement(0U))->getZExtValue(); - HT.TypeDescriptor = - cast<GlobalVariable>(CS->getAggregateElement(1)->stripPointerCasts()); - } - HT.Handler = cast<Function>(CH->getHandlerBlockOrFunc()); - HT.CatchObjRecoverIdx = CH->getExceptionVarIndex(); - TBME.HandlerArray.push_back(HT); - } - FuncInfo.TryBlockMap.push_back(TBME); -} + BlockColors.clear(); + FuncletBlocks.clear(); -static void print_name(const Value *V) { -#ifndef NDEBUG - if (!V) { - DEBUG(dbgs() << "null"); - return; - } - - if (const auto *F = dyn_cast<Function>(V)) - DEBUG(dbgs() << F->getName()); - else - DEBUG(V->dump()); -#endif + return true; } -void WinEHNumbering::processCallSite( - MutableArrayRef<std::unique_ptr<ActionHandler>> Actions, - ImmutableCallSite CS) { - DEBUG(dbgs() << "processCallSite (EH state = " << currentEHNumber() - << ") for: "); - print_name(CS ? CS.getCalledValue() : nullptr); - DEBUG(dbgs() << '\n'); - - DEBUG(dbgs() << "HandlerStack: \n"); - for (int I = 0, E = HandlerStack.size(); I < E; ++I) { - DEBUG(dbgs() << " "); - print_name(HandlerStack[I]->getHandlerBlockOrFunc()); - DEBUG(dbgs() << '\n'); - } - DEBUG(dbgs() << "Actions: \n"); - for (int I = 0, E = Actions.size(); I < E; ++I) { - DEBUG(dbgs() << " "); - print_name(Actions[I]->getHandlerBlockOrFunc()); - DEBUG(dbgs() << '\n'); - } - int FirstMismatch = 0; - for (int E = std::min(HandlerStack.size(), Actions.size()); FirstMismatch < E; - ++FirstMismatch) { - if (HandlerStack[FirstMismatch]->getHandlerBlockOrFunc() != - Actions[FirstMismatch]->getHandlerBlockOrFunc()) - break; - } - - // Remove unmatched actions from the stack and process their EH states. - popUnmatchedActions(FirstMismatch); - - DEBUG(dbgs() << "Pushing actions for CallSite: "); - print_name(CS ? CS.getCalledValue() : nullptr); - DEBUG(dbgs() << '\n'); - - bool LastActionWasCatch = false; - const LandingPadInst *LastRootLPad = nullptr; - for (size_t I = FirstMismatch; I != Actions.size(); ++I) { - // We can reuse eh states when pushing two catches for the same invoke. - bool CurrActionIsCatch = isa<CatchHandler>(Actions[I].get()); - auto *Handler = cast<Function>(Actions[I]->getHandlerBlockOrFunc()); - // Various conditions can lead to a handler being popped from the - // stack and re-pushed later. That shouldn't create a new state. - // FIXME: Can code optimization lead to re-used handlers? - if (FuncInfo.HandlerEnclosedState.count(Handler)) { - // If we already assigned the state enclosed by this handler re-use it. - Actions[I]->setEHState(FuncInfo.HandlerEnclosedState[Handler]); +// TODO: Share loads when one use dominates another, or when a catchpad exit +// dominates uses (needs dominators). +AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) { + BasicBlock *PHIBlock = PN->getParent(); + AllocaInst *SpillSlot = nullptr; + Instruction *EHPad = PHIBlock->getFirstNonPHI(); + + if (!isa<TerminatorInst>(EHPad)) { + // If the EHPad isn't a terminator, then we can insert a load in this block + // that will dominate all uses. + SpillSlot = new AllocaInst(PN->getType(), nullptr, + Twine(PN->getName(), ".wineh.spillslot"), + &F.getEntryBlock().front()); + Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"), + &*PHIBlock->getFirstInsertionPt()); + PN->replaceAllUsesWith(V); + return SpillSlot; + } + + // Otherwise, we have a PHI on a terminator EHPad, and we give up and insert + // loads of the slot before every use. 
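// (The only EH pad that is also a terminator is a catchswitch, whose block
// cannot hold any other instruction; that is what forces the per-use reloads
// handled by replaceUseWithLoad further down.) A small sketch of the
// memoization the Loads map provides, over plain containers with illustrative
// names: at most one reload is materialized per incoming block, so a PHI
// never receives two different loads along edges from the same predecessor,
// which would be invalid SSA.
#include <map>
#include <string>

struct Reload { int Id; };

Reload *getOrCreateReload(std::map<std::string, Reload *> &Loads,
                          const std::string &IncomingBlock) {
  Reload *&Slot = Loads[IncomingBlock]; // default-initialized to nullptr
  if (!Slot)
    Slot = new Reload{static_cast<int>(Loads.size())}; // owned elsewhere in
                                                       // the real pass
  return Slot;
}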
+ DenseMap<BasicBlock *, Value *> Loads; + for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); + UI != UE;) { + Use &U = *UI++; + auto *UsingInst = cast<Instruction>(U.getUser()); + if (isa<PHINode>(UsingInst) && UsingInst->getParent()->isEHPad()) { + // Use is on an EH pad phi. Leave it alone; we'll insert loads and + // stores for it separately. continue; } - const LandingPadInst* RootLPad = FuncInfo.RootLPad[Handler]; - if (CurrActionIsCatch && LastActionWasCatch && RootLPad == LastRootLPad) { - DEBUG(dbgs() << "setEHState for handler to " << currentEHNumber() << "\n"); - Actions[I]->setEHState(currentEHNumber()); - } else { - DEBUG(dbgs() << "createUnwindMapEntry(" << currentEHNumber() << ", "); - print_name(Actions[I]->getHandlerBlockOrFunc()); - DEBUG(dbgs() << ") with EH state " << NextState << "\n"); - createUnwindMapEntry(currentEHNumber(), Actions[I].get()); - DEBUG(dbgs() << "setEHState for handler to " << NextState << "\n"); - Actions[I]->setEHState(NextState); - NextState++; - } - HandlerStack.push_back(std::move(Actions[I])); - LastActionWasCatch = CurrActionIsCatch; - LastRootLPad = RootLPad; + replaceUseWithLoad(PN, U, SpillSlot, Loads, F); } - - // This is used to defer numbering states for a handler until after the - // last time it appears in an invoke action list. - if (CS.isInvoke()) { - for (int I = 0, E = HandlerStack.size(); I < E; ++I) { - auto *Handler = cast<Function>(HandlerStack[I]->getHandlerBlockOrFunc()); - if (FuncInfo.LastInvoke[Handler] != cast<InvokeInst>(CS.getInstruction())) - continue; - FuncInfo.LastInvokeVisited[Handler] = true; - DEBUG(dbgs() << "Last invoke of "); - print_name(Handler); - DEBUG(dbgs() << " has been visited.\n"); - } - } - - DEBUG(dbgs() << "In EHState " << currentEHNumber() << " for CallSite: "); - print_name(CS ? CS.getCalledValue() : nullptr); - DEBUG(dbgs() << '\n'); + return SpillSlot; } -void WinEHNumbering::popUnmatchedActions(int FirstMismatch) { - // Don't recurse while we are looping over the handler stack. Instead, defer - // the numbering of the catch handlers until we are done popping. - SmallVector<CatchHandler *, 4> PoppedCatches; - for (int I = HandlerStack.size() - 1; I >= FirstMismatch; --I) { - std::unique_ptr<ActionHandler> Handler = HandlerStack.pop_back_val(); - if (isa<CatchHandler>(Handler.get())) - PoppedCatches.push_back(cast<CatchHandler>(Handler.release())); - } +// TODO: improve store placement. Inserting at def is probably good, but need +// to be careful not to introduce interfering stores (needs liveness analysis). +// TODO: identify related phi nodes that can share spill slots, and share them +// (also needs liveness). +void WinEHPrepare::insertPHIStores(PHINode *OriginalPHI, + AllocaInst *SpillSlot) { + // Use a worklist of (Block, Value) pairs -- the given Value needs to be + // stored to the spill slot by the end of the given Block. 
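// For example, when OriginalPHI lives in a catchswitch block and one of its
// incoming values is itself a PHI being demoted, neither block can hold a
// store, so the obligation keeps moving to predecessors until it reaches a
// block with an ordinary terminator.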
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Worklist; - int TryHigh = NextState - 1; - int LastTryLowIdx = 0; - for (int I = 0, E = PoppedCatches.size(); I != E; ++I) { - CatchHandler *CH = PoppedCatches[I]; - DEBUG(dbgs() << "Popped handler with state " << CH->getEHState() << "\n"); - if (I + 1 == E || CH->getEHState() != PoppedCatches[I + 1]->getEHState()) { - int TryLow = CH->getEHState(); - auto Handlers = - makeArrayRef(&PoppedCatches[LastTryLowIdx], I - LastTryLowIdx + 1); - DEBUG(dbgs() << "createTryBlockMapEntry(" << TryLow << ", " << TryHigh); - for (size_t J = 0; J < Handlers.size(); ++J) { - DEBUG(dbgs() << ", "); - print_name(Handlers[J]->getHandlerBlockOrFunc()); - } - DEBUG(dbgs() << ")\n"); - createTryBlockMapEntry(TryLow, TryHigh, Handlers); - LastTryLowIdx = I + 1; - } - } + Worklist.push_back({OriginalPHI->getParent(), OriginalPHI}); - for (CatchHandler *CH : PoppedCatches) { - if (auto *F = dyn_cast<Function>(CH->getHandlerBlockOrFunc())) { - if (FuncInfo.LastInvokeVisited[F]) { - DEBUG(dbgs() << "Assigning base state " << NextState << " to "); - print_name(F); - DEBUG(dbgs() << '\n'); - FuncInfo.HandlerBaseState[F] = NextState; - DEBUG(dbgs() << "createUnwindMapEntry(" << currentEHNumber() - << ", null)\n"); - createUnwindMapEntry(currentEHNumber(), nullptr); - ++NextState; - calculateStateNumbers(*F); + while (!Worklist.empty()) { + BasicBlock *EHBlock; + Value *InVal; + std::tie(EHBlock, InVal) = Worklist.pop_back_val(); + + PHINode *PN = dyn_cast<PHINode>(InVal); + if (PN && PN->getParent() == EHBlock) { + // The value is defined by another PHI we need to remove, with no room to + // insert a store after the PHI, so each predecessor needs to store its + // incoming value. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) { + Value *PredVal = PN->getIncomingValue(i); + + // Undef can safely be skipped. + if (isa<UndefValue>(PredVal)) + continue; + + insertPHIStore(PN->getIncomingBlock(i), PredVal, SpillSlot, Worklist); } - else { - DEBUG(dbgs() << "Deferring handling of "); - print_name(F); - DEBUG(dbgs() << " until last invoke visited.\n"); + } else { + // We need to store InVal, which dominates EHBlock, but can't put a store + // in EHBlock, so need to put stores in each predecessor. + for (BasicBlock *PredBlock : predecessors(EHBlock)) { + insertPHIStore(PredBlock, InVal, SpillSlot, Worklist); } } - delete CH; } } -void WinEHNumbering::calculateStateNumbers(const Function &F) { - auto I = VisitedHandlers.insert(&F); - if (!I.second) - return; // We've already visited this handler, don't renumber it. 
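// A compilable sketch of the worklist scheme implemented by insertPHIStores
// and insertPHIStore above, over plain containers with illustrative names; it
// ignores the nested-PHI chasing. An entry (Block, Value) means "Value must be
// stored to the spill slot by the end of Block"; blocks that cannot hold a
// store (unsplittable EH pads) push the obligation to their predecessors.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

struct CFG {
  std::map<std::string, std::vector<std::string>> Preds;
  std::set<std::string> UnsplittablePads; // e.g. catchswitch blocks
};

void insertStores(const CFG &G, const std::string &DefBlock,
                  const std::string &Val) {
  std::vector<std::pair<std::string, std::string>> Worklist{{DefBlock, Val}};
  while (!Worklist.empty()) {
    auto [Block, V] = Worklist.back();
    Worklist.pop_back();
    for (const std::string &Pred : G.Preds.at(Block)) {
      if (G.UnsplittablePads.count(Pred))
        Worklist.push_back({Pred, V}); // defer, as insertPHIStore does
      else
        std::cout << "store " << V << " at end of " << Pred << "\n";
    }
  }
}

int main() {
  CFG G;
  G.Preds = {{"catchswitch", {"left", "pad2"}},
             {"pad2", {"entry"}},
             {"left", {}},
             {"entry", {}}};
  G.UnsplittablePads = {"catchswitch", "pad2"};
  insertStores(G, "catchswitch", "%x"); // stores land in "left" and "entry"
}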
+void WinEHPrepare::insertPHIStore(
+ BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot,
+ SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist) {

- int OldBaseState = CurrentBaseState;
- if (FuncInfo.HandlerBaseState.count(&F)) {
- CurrentBaseState = FuncInfo.HandlerBaseState[&F];
- }
-
- size_t SavedHandlerStackSize = HandlerStack.size();
-
- DEBUG(dbgs() << "Calculating state numbers for: " << F.getName() << '\n');
- SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList;
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
- const auto *CI = dyn_cast<CallInst>(&I);
- if (!CI || CI->doesNotThrow())
- continue;
- processCallSite(None, CI);
- }
- const auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
- if (!II)
- continue;
- const LandingPadInst *LPI = II->getLandingPadInst();
- auto *ActionsCall = dyn_cast<IntrinsicInst>(LPI->getNextNode());
- if (!ActionsCall)
- continue;
- parseEHActions(ActionsCall, ActionList);
- if (ActionList.empty())
- continue;
- processCallSite(ActionList, II);
- ActionList.clear();
- FuncInfo.LandingPadStateMap[LPI] = currentEHNumber();
- DEBUG(dbgs() << "Assigning state " << currentEHNumber()
- << " to landing pad at " << LPI->getParent()->getName()
- << '\n');
+ if (PredBlock->isEHPad() &&
+ isa<TerminatorInst>(PredBlock->getFirstNonPHI())) {
+ // Pred is unsplittable, so we need to queue it on the worklist.
+ Worklist.push_back({PredBlock, PredVal});
+ return;
}

- // Pop any actions that were pushed on the stack for this function.
- popUnmatchedActions(SavedHandlerStackSize);
-
- DEBUG(dbgs() << "Assigning max state " << NextState - 1
- << " to " << F.getName() << '\n');
- FuncInfo.CatchHandlerMaxState[&F] = NextState - 1;
-
- CurrentBaseState = OldBaseState;
+ // Otherwise, insert the store at the end of the basic block.
+ new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator());
}

-// This function follows the same basic traversal as calculateStateNumbers
-// but it is necessary to identify the root landing pad associated
-// with each action before we start assigning state numbers.
-void WinEHNumbering::findActionRootLPads(const Function &F) {
- auto I = VisitedHandlers.insert(&F);
- if (!I.second)
- return; // We've already visited this handler, don't revisit it.
-
- SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList;
- for (const BasicBlock &BB : F) {
- const auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
- if (!II)
- continue;
- const LandingPadInst *LPI = II->getLandingPadInst();
- auto *ActionsCall = dyn_cast<IntrinsicInst>(LPI->getNextNode());
- if (!ActionsCall)
- continue;
-
- assert(ActionsCall->getIntrinsicID() == Intrinsic::eh_actions);
- parseEHActions(ActionsCall, ActionList);
- if (ActionList.empty())
- continue;
- for (int I = 0, E = ActionList.size(); I < E; ++I) {
- if (auto *Handler
- = dyn_cast<Function>(ActionList[I]->getHandlerBlockOrFunc())) {
- FuncInfo.LastInvoke[Handler] = II;
- // Don't replace the root landing pad if we previously saw this
- // handler in a different function.
- if (FuncInfo.RootLPad.count(Handler) &&
- FuncInfo.RootLPad[Handler]->getParent()->getParent() != &F)
- continue;
- DEBUG(dbgs() << "Setting root lpad for ");
- print_name(Handler);
- DEBUG(dbgs() << " to " << LPI->getParent()->getName() << '\n');
- FuncInfo.RootLPad[Handler] = LPI;
- }
+void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
+ DenseMap<BasicBlock *, Value *> &Loads,
+ Function &F) {
+ // Lazily create the spill slot.
+ if (!SpillSlot)
+ SpillSlot = new AllocaInst(V->getType(), nullptr,
+ Twine(V->getName(), ".wineh.spillslot"),
+ &F.getEntryBlock().front());
+
+ auto *UsingInst = cast<Instruction>(U.getUser());
+ if (auto *UsingPHI = dyn_cast<PHINode>(UsingInst)) {
+ // If this is a PHI node, we can't insert a load of the value before
+ // the use. Instead insert the load in the predecessor block
+ // corresponding to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this
+ // PHI node, we cannot have multiple loads. The problem is that
+ // the resulting PHI node will have multiple values (from each load)
+ // coming in from the same block, which is illegal SSA form.
+ // For this reason, we keep track of and reuse loads we insert.
+ BasicBlock *IncomingBlock = UsingPHI->getIncomingBlock(U);
+ if (auto *CatchRet =
+ dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+ // Putting a load above a catchret and a use on the phi would still leave
+ // a cross-funclet def/use. We need to split the edge, change the
+ // catchret to target the new block, and put the load there.
+ BasicBlock *PHIBlock = UsingInst->getParent();
+ BasicBlock *NewBlock = SplitEdge(IncomingBlock, PHIBlock);
+ // SplitEdge gives us:
+ // IncomingBlock:
+ // ...
+ // br label %NewBlock
+ // NewBlock:
+ // catchret label %PHIBlock
+ // But we need:
+ // IncomingBlock:
+ // ...
+ // catchret label %NewBlock
+ // NewBlock:
+ // br label %PHIBlock
+ // So move the terminators to each others' blocks and swap their
+ // successors.
+ BranchInst *Goto = cast<BranchInst>(IncomingBlock->getTerminator());
+ Goto->removeFromParent();
+ CatchRet->removeFromParent();
+ IncomingBlock->getInstList().push_back(CatchRet);
+ NewBlock->getInstList().push_back(Goto);
+ Goto->setSuccessor(0, PHIBlock);
+ CatchRet->setSuccessor(NewBlock);
+ // Update the color mapping for the newly split edge.
+ ColorVector &ColorsForPHIBlock = BlockColors[PHIBlock];
+ BlockColors[NewBlock] = ColorsForPHIBlock;
+ for (BasicBlock *FuncletPad : ColorsForPHIBlock)
+ FuncletBlocks[FuncletPad].push_back(NewBlock);
+ // Treat the new block as incoming for load insertion.
+ IncomingBlock = NewBlock;
}
- // Walk the actions again and look for nested handlers. This has to
- // happen after all of the actions have been processed in the current
- // function.
- for (int I = 0, E = ActionList.size(); I < E; ++I)
- if (auto *Handler
- = dyn_cast<Function>(ActionList[I]->getHandlerBlockOrFunc()))
- findActionRootLPads(*Handler);
- ActionList.clear();
}
+ Value *&Load = Loads[IncomingBlock];
+ // Insert the load into the predecessor block.
+ if (!Load)
+ Load = new LoadInst(SpillSlot, Twine(V->getName(), ".wineh.reload"),
+ /*Volatile=*/false, IncomingBlock->getTerminator());
+
+ U.set(Load);
+ } else {
+ // Reload right before the old use.
+ auto *Load = new LoadInst(SpillSlot, Twine(V->getName(), ".wineh.reload"),
+ /*Volatile=*/false, UsingInst);
+ U.set(Load);
+ }
}

-void llvm::calculateWinCXXEHStateNumbers(const Function *ParentFn,
- WinEHFuncInfo &FuncInfo) {
- // Return if it's already been done.
- if (!FuncInfo.LandingPadStateMap.empty())
- return;
-
- WinEHNumbering Num(FuncInfo);
- Num.findActionRootLPads(*ParentFn);
- // The VisitedHandlers list is used by both findActionRootLPads and
- // calculateStateNumbers, but both functions need to visit all handlers.
- Num.VisitedHandlers.clear();
- Num.calculateStateNumbers(*ParentFn);
- // Pop everything on the handler stack.
- // It may be necessary to call this more than once because a handler can - // be pushed on the stack as a result of clearing the stack. - while (!Num.HandlerStack.empty()) - Num.processCallSite(None, ImmutableCallSite()); +void WinEHFuncInfo::addIPToStateRange(const InvokeInst *II, + MCSymbol *InvokeBegin, + MCSymbol *InvokeEnd) { + assert(InvokeStateMap.count(II) && + "should get invoke with precomputed state"); + LabelToStateMap[InvokeBegin] = std::make_pair(InvokeStateMap[II], InvokeEnd); } + +WinEHFuncInfo::WinEHFuncInfo() {} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp new file mode 100644 index 0000000..91b71cc --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/FieldListRecordBuilder.cpp @@ -0,0 +1,165 @@ +//===-- FieldListRecordBuilder.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +FieldListRecordBuilder::FieldListRecordBuilder() + : ListRecordBuilder(TypeRecordKind::FieldList) {} + +void FieldListRecordBuilder::writeBaseClass(MemberAccess Access, TypeIndex Type, + uint64_t Offset) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::BaseClass); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeEncodedUnsignedInteger(Offset); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeEnumerate(MemberAccess Access, uint64_t Value, + StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::Enumerate); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeEncodedUnsignedInteger(Value); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeMember(MemberAccess Access, TypeIndex Type, + uint64_t Offset, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::Member); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeEncodedUnsignedInteger(Offset); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeMethod(uint16_t OverloadCount, + TypeIndex MethodList, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::Method); + Builder.writeUInt16(OverloadCount); + Builder.writeTypeIndex(MethodList); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeOneMethod( + MemberAccess Access, MethodKind Kind, MethodOptions Options, TypeIndex Type, + int32_t VTableSlotOffset, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + uint16_t Flags = static_cast<uint16_t>(Access); + Flags |= static_cast<uint16_t>(Kind) << MethodKindShift; + Flags |= static_cast<uint16_t>(Options); + + Builder.writeTypeRecordKind(TypeRecordKind::OneMethod); + Builder.writeUInt16(Flags); + Builder.writeTypeIndex(Type); + switch (Kind) { + case MethodKind::IntroducingVirtual: + case MethodKind::PureIntroducingVirtual: + assert(VTableSlotOffset >= 0); + 
Builder.writeInt32(VTableSlotOffset); + break; + + default: + assert(VTableSlotOffset == -1); + break; + } + + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeOneMethod(const MethodInfo &Method, + StringRef Name) { + writeOneMethod(Method.getAccess(), Method.getKind(), Method.getOptions(), + Method.getType(), Method.getVTableSlotOffset(), Name); +} + +void FieldListRecordBuilder::writeNestedType(TypeIndex Type, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::NestedType); + Builder.writeUInt16(0); + Builder.writeTypeIndex(Type); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeStaticMember(MemberAccess Access, + TypeIndex Type, StringRef Name) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::StaticMember); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeNullTerminatedString(Name); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeIndirectVirtualBaseClass( + MemberAccess Access, TypeIndex Type, TypeIndex VirtualBasePointerType, + int64_t VirtualBasePointerOffset, uint64_t SlotIndex) { + writeVirtualBaseClass(TypeRecordKind::IndirectVirtualBaseClass, Access, Type, + VirtualBasePointerType, VirtualBasePointerOffset, + SlotIndex); +} + +void FieldListRecordBuilder::writeVirtualBaseClass( + MemberAccess Access, TypeIndex Type, TypeIndex VirtualBasePointerType, + int64_t VirtualBasePointerOffset, uint64_t SlotIndex) { + writeVirtualBaseClass(TypeRecordKind::VirtualBaseClass, Access, Type, + VirtualBasePointerType, VirtualBasePointerOffset, + SlotIndex); +} + +void FieldListRecordBuilder::writeVirtualBaseClass( + TypeRecordKind Kind, MemberAccess Access, TypeIndex Type, + TypeIndex VirtualBasePointerType, int64_t VirtualBasePointerOffset, + uint64_t SlotIndex) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(Kind); + Builder.writeUInt16(static_cast<uint16_t>(Access)); + Builder.writeTypeIndex(Type); + Builder.writeTypeIndex(VirtualBasePointerType); + Builder.writeEncodedInteger(VirtualBasePointerOffset); + Builder.writeEncodedUnsignedInteger(SlotIndex); + + finishSubRecord(); +} + +void FieldListRecordBuilder::writeVirtualFunctionTablePointer(TypeIndex Type) { + TypeRecordBuilder &Builder = getBuilder(); + + Builder.writeTypeRecordKind(TypeRecordKind::VirtualFunctionTablePointer); + Builder.writeUInt16(0); + Builder.writeTypeIndex(Type); + + finishSubRecord(); +}
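// A standalone sketch of the attribute packing used by writeOneMethod above.
// The shift and enum values mirror the CodeView method-attribute layout the
// builder assumes (two access bits, then three method-kind bits); they are
// spelled out here so the example is self-contained.
#include <cassert>
#include <cstdint>
#include <cstdio>

enum class MemberAccess : uint16_t { Private = 1, Protected = 2, Public = 3 };
enum class MethodKind : uint16_t {
  Vanilla = 0,
  Virtual = 1,
  Static = 2,
  Friend = 3,
  IntroducingVirtual = 4,
  PureVirtual = 5,
  PureIntroducingVirtual = 6
};
constexpr unsigned MethodKindShift = 2;

uint16_t packMethodAttributes(MemberAccess Access, MethodKind Kind,
                              uint16_t Options) {
  uint16_t Flags = static_cast<uint16_t>(Access);
  Flags |= static_cast<uint16_t>(Kind) << MethodKindShift;
  Flags |= Options;
  return Flags;
}

int main() {
  // Public (3) | IntroducingVirtual (4 << 2) == 0x13.
  assert(packMethodAttributes(MemberAccess::Public,
                              MethodKind::IntroducingVirtual, 0) == 0x13);
  std::puts("ok");
}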
\ No newline at end of file diff --git a/contrib/llvm/lib/DebugInfo/CodeView/Line.cpp b/contrib/llvm/lib/DebugInfo/CodeView/Line.cpp new file mode 100644 index 0000000..4cb766b --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/Line.cpp @@ -0,0 +1,22 @@ +//===-- Line.cpp ----------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/Line.h" + +using namespace llvm; +using namespace codeview; + +LineInfo::LineInfo(uint32_t StartLine, uint32_t EndLine, bool IsStatement) { + LineData = StartLine & StartLineMask; + uint32_t LineDelta = EndLine - StartLine; + LineData |= (LineDelta << EndLineDeltaShift) & EndLineDeltaMask; + if (IsStatement) { + LineData |= StatementFlag; + } +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/ListRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/ListRecordBuilder.cpp new file mode 100644 index 0000000..69c7e87 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/ListRecordBuilder.cpp @@ -0,0 +1,31 @@ +//===-- ListRecordBuilder.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/ListRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +ListRecordBuilder::ListRecordBuilder(TypeRecordKind Kind) : Builder(Kind) {} + +void ListRecordBuilder::finishSubRecord() { + // The builder starts at offset 2 in the actual CodeView buffer, so add an + // additional offset of 2 before computing the alignment. + uint32_t Remainder = (Builder.size() + 2) % 4; + if (Remainder != 0) { + for (int32_t PaddingBytesLeft = 4 - Remainder; PaddingBytesLeft > 0; + --PaddingBytesLeft) { + Builder.writeUInt8(0xf0 + PaddingBytesLeft); + } + } + + // TODO: Split the list into multiple records if it's longer than 64KB, using + // a subrecord of TypeRecordKind::Index to chain the records together. + assert(Builder.size() < 65536); +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp new file mode 100644 index 0000000..9afce92 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/MemoryTypeTableBuilder.cpp @@ -0,0 +1,35 @@ +//===-- MemoryTypeTableBuilder.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/MemoryTypeTableBuilder.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" + +using namespace llvm; +using namespace codeview; + +MemoryTypeTableBuilder::Record::Record(StringRef RData) + : Size(RData.size()), Data(new char[RData.size()]) { + memcpy(Data.get(), RData.data(), RData.size()); +} + +TypeIndex MemoryTypeTableBuilder::writeRecord(StringRef Data) { + auto I = HashedRecords.find(Data); + if (I != HashedRecords.end()) { + return I->second; + } + + std::unique_ptr<Record> R(new Record(Data)); + + TypeIndex TI(static_cast<uint32_t>(Records.size()) + + TypeIndex::FirstNonSimpleIndex); + HashedRecords.insert(std::make_pair(StringRef(R->data(), R->size()), TI)); + Records.push_back(std::move(R)); + + return TI; +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp new file mode 100644 index 0000000..8893025 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/MethodListRecordBuilder.cpp @@ -0,0 +1,49 @@ +//===-- MethodListRecordBuilder.cpp ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/MethodListRecordBuilder.h" +#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +MethodListRecordBuilder::MethodListRecordBuilder() + : ListRecordBuilder(TypeRecordKind::MethodList) {} + +void MethodListRecordBuilder::writeMethod(MemberAccess Access, MethodKind Kind, + MethodOptions Options, TypeIndex Type, + int32_t VTableSlotOffset) { + TypeRecordBuilder &Builder = getBuilder(); + + uint16_t Flags = static_cast<uint16_t>(Access); + Flags |= static_cast<uint16_t>(Kind) << MethodKindShift; + Flags |= static_cast<uint16_t>(Options); + + Builder.writeUInt16(Flags); + Builder.writeUInt16(0); + Builder.writeTypeIndex(Type); + switch (Kind) { + case MethodKind::IntroducingVirtual: + case MethodKind::PureIntroducingVirtual: + assert(VTableSlotOffset >= 0); + Builder.writeInt32(VTableSlotOffset); + break; + + default: + assert(VTableSlotOffset == -1); + break; + } + + // TODO: Fail if too big? +} + +void MethodListRecordBuilder::writeMethod(const MethodInfo &Method) { + writeMethod(Method.getAccess(), Method.getKind(), Method.getOptions(), + Method.getType(), Method.getVTableSlotOffset()); +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp new file mode 100644 index 0000000..cbf464f --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordBuilder.cpp @@ -0,0 +1,113 @@ +//===-- TypeRecordBuilder.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeRecordBuilder.h" + +using namespace llvm; +using namespace codeview; + +TypeRecordBuilder::TypeRecordBuilder(TypeRecordKind Kind) : Stream(Buffer), + Writer(Stream) { + writeTypeRecordKind(Kind); +} + +StringRef TypeRecordBuilder::str() { + return StringRef(Buffer.data(), Buffer.size()); +} + +void TypeRecordBuilder::writeUInt8(uint8_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeInt16(int16_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeUInt16(uint16_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeInt32(int32_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeUInt32(uint32_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeInt64(int64_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeUInt64(uint64_t Value) { + Writer.write(Value); +} + +void TypeRecordBuilder::writeEncodedInteger(int64_t Value) { + if (Value >= 0) { + writeEncodedUnsignedInteger(static_cast<uint64_t>(Value)); + } else { + writeEncodedSignedInteger(Value); + } +} + +void TypeRecordBuilder::writeEncodedSignedInteger(int64_t Value) { + if (Value >= std::numeric_limits<int8_t>::min() && + Value <= std::numeric_limits<int8_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::SByte)); + writeInt16(static_cast<int8_t>(Value)); + } else if (Value >= std::numeric_limits<int16_t>::min() && + Value <= std::numeric_limits<int16_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::Int16)); + writeInt16(static_cast<int16_t>(Value)); + } else if (Value >= std::numeric_limits<int32_t>::min() && + Value <= std::numeric_limits<int32_t>::max()) { + writeUInt16(static_cast<uint32_t>(TypeRecordKind::Int32)); + writeInt32(static_cast<int32_t>(Value)); + } else { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::Int64)); + writeInt64(Value); + } +} + +void TypeRecordBuilder::writeEncodedUnsignedInteger(uint64_t Value) { + if (Value < static_cast<uint16_t>(TypeRecordKind::SByte)) { + writeUInt16(static_cast<uint16_t>(Value)); + } else if (Value <= std::numeric_limits<uint16_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt16)); + writeUInt16(static_cast<uint16_t>(Value)); + } else if (Value <= std::numeric_limits<uint32_t>::max()) { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt32)); + writeUInt32(static_cast<uint32_t>(Value)); + } else { + writeUInt16(static_cast<uint16_t>(TypeRecordKind::UInt64)); + writeUInt64(Value); + } +} + +void TypeRecordBuilder::writeNullTerminatedString(const char *Value) { + assert(Value != nullptr); + + size_t Length = strlen(Value); + Stream.write(Value, Length); + writeUInt8(0); +} + +void TypeRecordBuilder::writeNullTerminatedString(StringRef Value) { + Stream.write(Value.data(), Value.size()); + writeUInt8(0); +} + +void TypeRecordBuilder::writeTypeIndex(TypeIndex TypeInd) { + writeUInt32(TypeInd.getIndex()); +} + +void TypeRecordBuilder::writeTypeRecordKind(TypeRecordKind Kind) { + writeUInt16(static_cast<uint16_t>(Kind)); +} diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeTableBuilder.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeTableBuilder.cpp new file mode 100644 index 0000000..4af5dca --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeTableBuilder.cpp @@ -0,0 +1,217 @@ +//===-- TypeTableBuilder.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// 
+// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h" +#include "llvm/DebugInfo/CodeView/MethodListRecordBuilder.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeRecordBuilder.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace codeview; + +namespace { + +const int PointerKindShift = 0; +const int PointerModeShift = 5; +const int PointerSizeShift = 13; + +const int ClassHfaKindShift = 11; +const int ClassWindowsRTClassKindShift = 14; + +void writePointerBase(TypeRecordBuilder &Builder, + const PointerRecordBase &Record) { + Builder.writeTypeIndex(Record.getReferentType()); + uint32_t flags = + static_cast<uint32_t>(Record.getOptions()) | + (Record.getSize() << PointerSizeShift) | + (static_cast<uint32_t>(Record.getMode()) << PointerModeShift) | + (static_cast<uint32_t>(Record.getPointerKind()) << PointerKindShift); + Builder.writeUInt32(flags); +} +} + +TypeTableBuilder::TypeTableBuilder() {} + +TypeTableBuilder::~TypeTableBuilder() {} + +TypeIndex TypeTableBuilder::writeModifier(const ModifierRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Modifier); + + Builder.writeTypeIndex(Record.getModifiedType()); + Builder.writeUInt16(static_cast<uint16_t>(Record.getOptions())); + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeProcedure(const ProcedureRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Procedure); + + Builder.writeTypeIndex(Record.getReturnType()); + Builder.writeUInt8(static_cast<uint8_t>(Record.getCallConv())); + Builder.writeUInt8(static_cast<uint8_t>(Record.getOptions())); + Builder.writeUInt16(Record.getParameterCount()); + Builder.writeTypeIndex(Record.getArgumentList()); + + return writeRecord(Builder); +} + +TypeIndex +TypeTableBuilder::writeMemberFunction(const MemberFunctionRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::MemberFunction); + + Builder.writeTypeIndex(Record.getReturnType()); + Builder.writeTypeIndex(Record.getClassType()); + Builder.writeTypeIndex(Record.getThisType()); + Builder.writeUInt8(static_cast<uint8_t>(Record.getCallConv())); + Builder.writeUInt8(static_cast<uint8_t>(Record.getOptions())); + Builder.writeUInt16(Record.getParameterCount()); + Builder.writeTypeIndex(Record.getArgumentList()); + Builder.writeInt32(Record.getThisPointerAdjustment()); + + return writeRecord(Builder); +} + +TypeIndex +TypeTableBuilder::writeArgumentList(const ArgumentListRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::ArgumentList); + + Builder.writeUInt32(Record.getArgumentTypes().size()); + for (TypeIndex TI : Record.getArgumentTypes()) { + Builder.writeTypeIndex(TI); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writePointer(const PointerRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Pointer); + + writePointerBase(Builder, Record); + + return writeRecord(Builder); +} + +TypeIndex +TypeTableBuilder::writePointerToMember(const PointerToMemberRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Pointer); + + writePointerBase(Builder, Record); + + Builder.writeTypeIndex(Record.getContainingType()); + Builder.writeUInt16(static_cast<uint16_t>(Record.getRepresentation())); + + return 
writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeArray(const ArrayRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Array); + + Builder.writeTypeIndex(Record.getElementType()); + Builder.writeTypeIndex(Record.getIndexType()); + Builder.writeEncodedUnsignedInteger(Record.getSize()); + Builder.writeNullTerminatedString(Record.getName()); + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeAggregate(const AggregateRecord &Record) { + assert((Record.getKind() == TypeRecordKind::Structure) || + (Record.getKind() == TypeRecordKind::Class) || + (Record.getKind() == TypeRecordKind::Union)); + + TypeRecordBuilder Builder(Record.getKind()); + + Builder.writeUInt16(Record.getMemberCount()); + uint16_t Flags = + static_cast<uint16_t>(Record.getOptions()) | + (static_cast<uint16_t>(Record.getHfa()) << ClassHfaKindShift) | + (static_cast<uint16_t>(Record.getWinRTKind()) + << ClassWindowsRTClassKindShift); + Builder.writeUInt16(Flags); + Builder.writeTypeIndex(Record.getFieldList()); + if (Record.getKind() != TypeRecordKind::Union) { + Builder.writeTypeIndex(Record.getDerivationList()); + Builder.writeTypeIndex(Record.getVTableShape()); + } else { + assert(Record.getDerivationList() == TypeIndex()); + assert(Record.getVTableShape() == TypeIndex()); + } + Builder.writeEncodedUnsignedInteger(Record.getSize()); + Builder.writeNullTerminatedString(Record.getName()); + if ((Record.getOptions() & ClassOptions::HasUniqueName) != + ClassOptions::None) { + Builder.writeNullTerminatedString(Record.getUniqueName()); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeEnum(const EnumRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::Enum); + + Builder.writeUInt16(Record.getMemberCount()); + Builder.writeUInt16(static_cast<uint16_t>(Record.getOptions())); + Builder.writeTypeIndex(Record.getUnderlyingType()); + Builder.writeTypeIndex(Record.getFieldList()); + Builder.writeNullTerminatedString(Record.getName()); + if ((Record.getOptions() & ClassOptions::HasUniqueName) != + ClassOptions::None) { + Builder.writeNullTerminatedString(Record.getUniqueName()); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeBitField(const BitFieldRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::BitField); + + Builder.writeTypeIndex(Record.getType()); + Builder.writeUInt8(Record.getBitSize()); + Builder.writeUInt8(Record.getBitOffset()); + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeVirtualTableShape( + const VirtualTableShapeRecord &Record) { + TypeRecordBuilder Builder(TypeRecordKind::VirtualTableShape); + + ArrayRef<VirtualTableSlotKind> Slots = Record.getSlots(); + + Builder.writeUInt16(Slots.size()); + for (size_t SlotIndex = 0; SlotIndex < Slots.size(); SlotIndex += 2) { + uint8_t Byte = static_cast<uint8_t>(Slots[SlotIndex]) << 4; + if ((SlotIndex + 1) < Slots.size()) { + Byte |= static_cast<uint8_t>(Slots[SlotIndex + 1]); + } + Builder.writeUInt8(Byte); + } + + return writeRecord(Builder); +} + +TypeIndex TypeTableBuilder::writeRecord(TypeRecordBuilder &Builder) { + return writeRecord(Builder.str()); +} + +TypeIndex TypeTableBuilder::writeFieldList(FieldListRecordBuilder &FieldList) { + // TODO: Split the list into multiple records if it's longer than 64KB, using + // a subrecord of TypeRecordKind::Index to chain the records together. 
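// (In the on-disk CodeView format, the continuation member referred to by
// this TODO is LF_INDEX, whose payload is simply the TypeIndex of the next
// record in the chain.)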
+ return writeRecord(FieldList.str()); +} + +TypeIndex +TypeTableBuilder::writeMethodList(MethodListRecordBuilder &MethodList) { + // TODO: Split the list into multiple records if it's longer than 64KB, using + // a subrecord of TypeRecordKind::Index to chain the records together. + return writeRecord(MethodList.str()); +} diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index 96bcf15..a4195b7 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" +#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include "llvm/Support/Compression.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/Format.h" @@ -126,6 +127,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { getDebugFrame()->dump(OS); } + if (DumpType == DIDT_All || DumpType == DIDT_Macro) { + OS << "\n.debug_macinfo contents:\n"; + getDebugMacro()->dump(OS); + } + uint32_t offset = 0; if (DumpType == DIDT_All || DumpType == DIDT_Aranges) { OS << "\n.debug_aranges contents:\n"; @@ -155,6 +161,16 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { } } + if (DumpType == DIDT_All || DumpType == DIDT_CUIndex) { + OS << "\n.debug_cu_index contents:\n"; + getCUIndex().dump(OS); + } + + if (DumpType == DIDT_All || DumpType == DIDT_TUIndex) { + OS << "\n.debug_tu_index contents:\n"; + getTUIndex().dump(OS); + } + if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) { OS << "\n.debug_line.dwo contents:\n"; unsigned stmtOffset = 0; @@ -250,6 +266,28 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { getStringSection(), isLittleEndian()); } +const DWARFUnitIndex &DWARFContext::getCUIndex() { + if (CUIndex) + return *CUIndex; + + DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), 0); + + CUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_INFO); + CUIndex->parse(CUIndexData); + return *CUIndex; +} + +const DWARFUnitIndex &DWARFContext::getTUIndex() { + if (TUIndex) + return *TUIndex; + + DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), 0); + + TUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_TYPES); + TUIndex->parse(TUIndexData); + return *TUIndex; +} + const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() { if (Abbrev) return Abbrev.get(); @@ -322,24 +360,37 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() { return DebugFrame.get(); } +const DWARFDebugMacro *DWARFContext::getDebugMacro() { + if (Macro) + return Macro.get(); + + DataExtractor MacinfoData(getMacinfoSection(), isLittleEndian(), 0); + Macro.reset(new DWARFDebugMacro()); + Macro->parse(MacinfoData); + return Macro.get(); +} + const DWARFLineTable * DWARFContext::getLineTableForUnit(DWARFUnit *U) { if (!Line) Line.reset(new DWARFDebugLine(&getLineSection().Relocs)); + const auto *UnitDIE = U->getUnitDIE(); if (UnitDIE == nullptr) return nullptr; + unsigned stmtOffset = UnitDIE->getAttributeValueAsSectionOffset(U, DW_AT_stmt_list, -1U); if (stmtOffset == -1U) return nullptr; // No line table for this compile unit. + stmtOffset += U->getLineTableOffset(); // See if the line table is cached. if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset)) return lt; // We have to parse it first. 
- DataExtractor lineData(getLineSection().Data, isLittleEndian(), + DataExtractor lineData(U->getLineSection(), isLittleEndian(), U->getAddressByteSize()); return Line->getOrParseLineTable(lineData, stmtOffset); } @@ -556,10 +607,11 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, continue; StringRef data; + section_iterator RelocatedSection = Section.getRelocatedSection(); // Try to obtain an already relocated version of this section. // Else use the unrelocated section from the object file. We'll have to // apply relocations ourselves later. - if (!L || !L->getLoadedSectionContents(name,data)) + if (!L || !L->getLoadedSectionContents(*RelocatedSection,data)) Section.getContents(data); name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes. @@ -591,6 +643,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, .Case("debug_frame", &DebugFrameSection) .Case("debug_str", &StringSection) .Case("debug_ranges", &RangeSection) + .Case("debug_macinfo", &MacinfoSection) .Case("debug_pubnames", &PubNamesSection) .Case("debug_pubtypes", &PubTypesSection) .Case("debug_gnu_pubnames", &GnuPubNamesSection) @@ -607,6 +660,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, .Case("apple_namespaces", &AppleNamespacesSection.Data) .Case("apple_namespac", &AppleNamespacesSection.Data) .Case("apple_objc", &AppleObjCSection.Data) + .Case("debug_cu_index", &CUIndexSection) + .Case("debug_tu_index", &TUIndexSection) // Any more debug info sections go here. .Default(nullptr); if (SectionData) { @@ -623,7 +678,6 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, TypesDWOSections[Section].Data = data; } - section_iterator RelocatedSection = Section.getRelocatedSection(); if (RelocatedSection == Obj.section_end()) continue; @@ -634,7 +688,15 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // If the section we're relocating was relocated already by the JIT, // then we used the relocated version above, so we do not need to process // relocations for it now. - if (L && L->getLoadedSectionContents(RelSecName,RelSecData)) + if (L && L->getLoadedSectionContents(*RelocatedSection,RelSecData)) + continue; + + // In Mach-o files, the relocations do not need to be applied if + // there is no load offset to apply. The value read at the + // relocation point already factors in the section address + // (actually applying the relocations will produce wrong results + // as the section address will be added twice). + if (!L && isa<MachOObjectFile>(&Obj)) continue; RelSecName = RelSecName.substr( @@ -685,13 +747,19 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, } SymAddr = *SymAddrOrErr; // Also remember what section this symbol is in for later - Sym->getSection(RSec); + RSec = *Sym->getSection(); } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) { // MachO also has relocations that point to sections and // scattered relocations. - // FIXME: We are not handling scattered relocations, do we have to? - RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl()); - SymAddr = RSec->getAddress(); + auto RelocInfo = MObj->getRelocation(Reloc.getRawDataRefImpl()); + if (MObj->isRelocationScattered(RelocInfo)) { + // FIXME: it's not clear how to correctly handle scattered + // relocations. 
+ continue; + } else { + RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl()); + SymAddr = RSec->getAddress(); + } } // If we are given load addresses for the sections, we need to adjust: @@ -699,12 +767,15 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // (Address of Section in File) + // (Load Address of Section) if (L != nullptr && RSec != Obj.section_end()) { - // RSec is now either the section being targetted or the section - // containing the symbol being targetted. In either case, + // RSec is now either the section being targeted or the section + // containing the symbol being targeted. In either case, // we need to perform the same computation. StringRef SecName; RSec->getName(SecName); - SectionLoadAddress = L->getSectionLoadAddress(SecName); +// llvm::dbgs() << "Name: '" << SecName +// << "', RSec: " << RSec->getRawDataRefImpl() +// << ", Section: " << Section.getRawDataRefImpl() << "\n"; + SectionLoadAddress = L->getSectionLoadAddress(*RSec); if (SectionLoadAddress != 0) SymAddr += SectionLoadAddress - RSec->getAddress(); } diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index 5abbde4..62d5e66 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -139,7 +139,7 @@ void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS, std::string File; auto Color = syntax::Enumerator; if (attr == DW_AT_decl_file || attr == DW_AT_call_file) { - Color = syntax::String; + Color = syntax::String; if (const auto *LT = u->getContext().getLineTableForUnit(u)) if (LT->getFileNameByIndex( formValue.getAsUnsignedConstant().getValue(), diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp new file mode 100644 index 0000000..b6555fa --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp @@ -0,0 +1,103 @@ +//===-- DWARFDebugMacro.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SyntaxHighlighting.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace dwarf;
+using namespace syntax;
+
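+// The ".debug_macinfo" section is a flat series of entries: each entry
+// starts with a ULEB128 macinfo type code followed by operands determined
+// by that code, and the series is terminated by an entry whose type code
+// is zero (see parse() below).
+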
+void DWARFDebugMacro::dump(raw_ostream &OS) const {
+ unsigned IndLevel = 0;
+ for (const Entry &E : Macros) {
+ // There should not be a DW_MACINFO_end_file entry when IndLevel is zero;
+ // this check handles the case of a corrupted ".debug_macinfo" section.
+ if (IndLevel > 0)
+ IndLevel -= (E.Type == DW_MACINFO_end_file);
+ // Print indentation.
+ for (unsigned I = 0; I < IndLevel; I++)
+ OS << " ";
+ IndLevel += (E.Type == DW_MACINFO_start_file);
+
+ WithColor(OS, syntax::Macro).get() << MacinfoString(E.Type);
+ switch (E.Type) {
+ default:
+ // Got a corrupted ".debug_macinfo" section (invalid macinfo type).
+ break;
+ case DW_MACINFO_define:
+ case DW_MACINFO_undef:
+ OS << " - lineno: " << E.Line;
+ OS << " macro: " << E.MacroStr;
+ break;
+ case DW_MACINFO_start_file:
+ OS << " - lineno: " << E.Line;
+ OS << " filenum: " << E.File;
+ break;
+ case DW_MACINFO_end_file:
+ break;
+ case DW_MACINFO_vendor_ext:
+ OS << " - constant: " << E.ExtConstant;
+ OS << " string: " << E.ExtStr;
+ break;
+ }
+ OS << "\n";
+ }
+}
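+
+// For illustration, a sequence of start_file (filenum 1), define ("X 3" at
+// line 7), and end_file entries dumps roughly as:
+//
+//   DW_MACINFO_start_file - lineno: 0 filenum: 1
+//     DW_MACINFO_define - lineno: 7 macro: X 3
+//   DW_MACINFO_end_file
+//
+// since IndLevel grows after each start_file is printed and shrinks before
+// the matching end_file is printed.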
+
+void DWARFDebugMacro::parse(DataExtractor data) {
+ uint32_t Offset = 0;
+ while (data.isValidOffset(Offset)) {
+ // A macro list entry consists of:
+ Entry E;
+ // 1. Macinfo type
+ E.Type = data.getULEB128(&Offset);
+
+ if (E.Type == 0) {
+ // Reached end of ".debug_macinfo" section.
+ return;
+ }
+
+ switch (E.Type) {
+ default:
+ // Got a corrupted ".debug_macinfo" section (invalid macinfo type).
+ // Push the corrupted entry to the list and halt parsing.
+ E.Type = DW_MACINFO_invalid;
+ Macros.push_back(E);
+ return;
+ case DW_MACINFO_define:
+ case DW_MACINFO_undef:
+ // 2. Source line
+ E.Line = data.getULEB128(&Offset);
+ // 3. Macro string
+ E.MacroStr = data.getCStr(&Offset);
+ break;
+ case DW_MACINFO_start_file:
+ // 2. Source line
+ E.Line = data.getULEB128(&Offset);
+ // 3. Source file id
+ E.File = data.getULEB128(&Offset);
+ break;
+ case DW_MACINFO_end_file:
+ break;
+ case DW_MACINFO_vendor_ext:
+ // 2. Vendor extension constant
+ E.ExtConstant = data.getULEB128(&Offset);
+ // 3. Vendor extension string
+ E.ExtStr = data.getCStr(&Offset);
+ break;
+ }
+
+ Macros.push_back(E);
+ }
+}
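As a minimal sketch of the byte stream that parse() above consumes: a DW_MACINFO_define entry is a ULEB128 type code (0x01), a ULEB128 line number, and a null-terminated macro string, and the whole list ends with a zero type code. The buffer and driver below are hand-built for illustration only and assume nothing beyond LLVM's DataExtractor API:

    #include "llvm/Support/DataExtractor.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      // DW_MACINFO_define (0x01), line 7, macro "X 3", then the 0 terminator.
      const char Bytes[] = {0x01, 0x07, 'X', ' ', '3', '\0', 0x00};
      DataExtractor Data(StringRef(Bytes, sizeof(Bytes)),
                         /*IsLittleEndian=*/true, /*AddressSize=*/0);
      uint32_t Offset = 0;
      uint64_t Type = Data.getULEB128(&Offset);  // 0x01 == DW_MACINFO_define
      uint64_t Line = Data.getULEB128(&Offset);  // 7
      const char *Macro = Data.getCStr(&Offset); // "X 3"
      outs() << Type << " " << Line << " " << Macro << "\n";
    }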
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 53a676e..3dc5842 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -18,7 +18,7 @@ #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <climits> +#include <limits> using namespace llvm; using namespace dwarf; using namespace syntax; @@ -110,7 +110,7 @@ static const DWARFFormValue::FormClass DWARF4FormClasses[] = { bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { // First, check DWARF4 form classes. - if (Form < ArrayRef<FormClass>(DWARF4FormClasses).size() && + if (Form < makeArrayRef(DWARF4FormClasses).size() && DWARF4FormClasses[Form] == FC) return true; // Check more forms from DWARF4 and DWARF5 proposals. @@ -261,6 +261,12 @@ DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr, bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, uint32_t *offset_ptr, const DWARFUnit *cu) { + return skipValue(form, debug_info_data, offset_ptr, cu->getVersion(), + cu->getAddressByteSize()); +} +bool DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, + uint32_t *offset_ptr, uint16_t Version, + uint8_t AddrSize) { bool indirect = false; do { switch (form) { @@ -295,10 +301,10 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, // Compile unit address sized values case DW_FORM_addr: - *offset_ptr += cu->getAddressByteSize(); + *offset_ptr += AddrSize; return true; case DW_FORM_ref_addr: - *offset_ptr += getRefAddrSize(cu->getAddressByteSize(), cu->getVersion()); + *offset_ptr += getRefAddrSize(AddrSize, Version); return true; // 0 byte values - implied from the form. 
@@ -565,7 +571,7 @@ Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const { Optional<int64_t> DWARFFormValue::getAsSignedConstant() const { if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag)) || - (Form == DW_FORM_udata && uint64_t(LLONG_MAX) < Value.uval)) + (Form == DW_FORM_udata && uint64_t(std::numeric_limits<int64_t>::max()) < Value.uval)) return None; switch (Form) { case DW_FORM_data4: @@ -584,6 +590,6 @@ Optional<int64_t> DWARFFormValue::getAsSignedConstant() const { Optional<ArrayRef<uint8_t>> DWARFFormValue::getAsBlock() const { if (!isFormClass(FC_Block) && !isFormClass(FC_Exprloc)) return None; - return ArrayRef<uint8_t>(Value.data, Value.uval); + return makeArrayRef(Value.data, Value.uval); } diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index 348476d..92ca2d4 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -14,29 +14,37 @@ #include "llvm/Support/Path.h" #include <cstdio> -using namespace llvm; +namespace llvm { using namespace dwarf; void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) { parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(), C.getStringSection(), StringRef(), C.getAddrSection(), - C.isLittleEndian()); + C.getLineSection().Data, C.isLittleEndian()); } void DWARFUnitSectionBase::parseDWO(DWARFContext &C, - const DWARFSection &DWOSection) { + const DWARFSection &DWOSection, + DWARFUnitIndex *Index) { parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(), C.getStringDWOSection(), C.getStringOffsetDWOSection(), - C.getAddrSection(), C.isLittleEndian()); + C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian()); } DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section, const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, - StringRef SOS, StringRef AOS, bool LE, - const DWARFUnitSectionBase &UnitSection) + StringRef SOS, StringRef AOS, StringRef LS, bool LE, + const DWARFUnitSectionBase &UnitSection, + const DWARFUnitIndex::Entry *IndexEntry) : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS), - StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS), - isLittleEndian(LE), UnitSection(UnitSection) { + LineSection(LS), StringSection(SS), StringOffsetSection([&]() { + if (IndexEntry) + if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS)) + return SOS.slice(C->Offset, C->Offset + C->Length); + return SOS; + }()), + AddrOffsetSection(AOS), isLittleEndian(LE), UnitSection(UnitSection), + IndexEntry(IndexEntry) { clear(); } @@ -69,6 +77,17 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) { Length = debug_info.getU32(offset_ptr); Version = debug_info.getU16(offset_ptr); uint64_t AbbrOffset = debug_info.getU32(offset_ptr); + if (IndexEntry) { + if (AbbrOffset) + return false; + auto *UnitContrib = IndexEntry->getOffset(); + if (!UnitContrib || UnitContrib->Length != (Length + 4)) + return false; + auto *AbbrEntry = IndexEntry->getOffset(DW_SECT_ABBREV); + if (!AbbrEntry) + return false; + AbbrOffset = AbbrEntry->Offset; + } AddrSize = debug_info.getU8(offset_ptr); bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1); @@ -375,3 +394,12 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address) { return DWARFDebugInfoEntryInlinedChain(); return SubprogramDIE->getInlinedChainForAddress(ChainCU, Address); } + +const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context, + 
DWARFSectionKind Kind) { + if (Kind == DW_SECT_INFO) + return Context.getCUIndex(); + assert(Kind == DW_SECT_TYPES); + return Context.getTUIndex(); +} +} diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp new file mode 100644 index 0000000..96b3169 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -0,0 +1,168 @@ +//===-- DWARFUnitIndex.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +bool DWARFUnitIndex::Header::parse(DataExtractor IndexData, + uint32_t *OffsetPtr) { + if (!IndexData.isValidOffsetForDataOfSize(*OffsetPtr, 16)) + return false; + Version = IndexData.getU32(OffsetPtr); + NumColumns = IndexData.getU32(OffsetPtr); + NumUnits = IndexData.getU32(OffsetPtr); + NumBuckets = IndexData.getU32(OffsetPtr); + return Version <= 2; +} + +void DWARFUnitIndex::Header::dump(raw_ostream &OS) const { + OS << format("version = %u slots = %u\n\n", Version, NumBuckets); +} + +bool DWARFUnitIndex::parse(DataExtractor IndexData) { + bool b = parseImpl(IndexData); + if (!b) { + // Make sure we don't try to dump anything + Header.NumBuckets = 0; + // Release any partially initialized data. + ColumnKinds.reset(); + Rows.reset(); + } + return b; +} + +bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) { + uint32_t Offset = 0; + if (!Header.parse(IndexData, &Offset)) + return false; + + if (!IndexData.isValidOffsetForDataOfSize( + Offset, Header.NumBuckets * (8 + 4) + + (2 * Header.NumUnits + 1) * 4 * Header.NumColumns)) + return false; + + Rows = llvm::make_unique<Entry[]>(Header.NumBuckets); + auto Contribs = + llvm::make_unique<Entry::SectionContribution *[]>(Header.NumUnits); + ColumnKinds = llvm::make_unique<DWARFSectionKind[]>(Header.NumColumns); + + // Read Hash Table of Signatures + for (unsigned i = 0; i != Header.NumBuckets; ++i) + Rows[i].Signature = IndexData.getU64(&Offset); + + // Read Parallel Table of Indexes + for (unsigned i = 0; i != Header.NumBuckets; ++i) { + auto Index = IndexData.getU32(&Offset); + if (!Index) + continue; + Rows[i].Index = this; + Rows[i].Contributions = + llvm::make_unique<Entry::SectionContribution[]>(Header.NumColumns); + Contribs[Index - 1] = Rows[i].Contributions.get(); + } + + // Read the Column Headers + for (unsigned i = 0; i != Header.NumColumns; ++i) { + ColumnKinds[i] = static_cast<DWARFSectionKind>(IndexData.getU32(&Offset)); + if (ColumnKinds[i] == InfoColumnKind) { + if (InfoColumn != -1) + return false; + InfoColumn = i; + } + } + + if (InfoColumn == -1) + return false; + + // Read Table of Section Offsets + for (unsigned i = 0; i != Header.NumUnits; ++i) { + auto *Contrib = Contribs[i]; + for (unsigned i = 0; i != Header.NumColumns; ++i) + Contrib[i].Offset = IndexData.getU32(&Offset); + } + + // Read Table of Section Sizes + for (unsigned i = 0; i != Header.NumUnits; ++i) { + auto *Contrib = Contribs[i]; + for (unsigned i = 0; i != Header.NumColumns; ++i) + Contrib[i].Length = IndexData.getU32(&Offset); + } + + return true; +} + +StringRef DWARFUnitIndex::getColumnHeader(DWARFSectionKind DS) { +#define CASE(DS) \ + case DW_SECT_##DS: \ + return #DS; + 
switch (DS) { + CASE(INFO); + CASE(TYPES); + CASE(ABBREV); + CASE(LINE); + CASE(LOC); + CASE(STR_OFFSETS); + CASE(MACINFO); + CASE(MACRO); + } + llvm_unreachable("unknown DWARFSectionKind"); +} + +void DWARFUnitIndex::dump(raw_ostream &OS) const { + if (!Header.NumBuckets) + return; + + Header.dump(OS); + OS << "Index Signature "; + for (unsigned i = 0; i != Header.NumColumns; ++i) + OS << ' ' << left_justify(getColumnHeader(ColumnKinds[i]), 24); + OS << "\n----- ------------------"; + for (unsigned i = 0; i != Header.NumColumns; ++i) + OS << " ------------------------"; + OS << '\n'; + for (unsigned i = 0; i != Header.NumBuckets; ++i) { + auto &Row = Rows[i]; + if (auto *Contribs = Row.Contributions.get()) { + OS << format("%5u 0x%016" PRIx64 " ", i + 1, Row.Signature); + for (unsigned i = 0; i != Header.NumColumns; ++i) { + auto &Contrib = Contribs[i]; + OS << format("[0x%08x, 0x%08x) ", Contrib.Offset, + Contrib.Offset + Contrib.Length); + } + OS << '\n'; + } + } +} + +const DWARFUnitIndex::Entry::SectionContribution * +DWARFUnitIndex::Entry::getOffset(DWARFSectionKind Sec) const { + uint32_t i = 0; + for (; i != Index->Header.NumColumns; ++i) + if (Index->ColumnKinds[i] == Sec) + return &Contributions[i]; + return nullptr; +} +const DWARFUnitIndex::Entry::SectionContribution * +DWARFUnitIndex::Entry::getOffset() const { + return &Contributions[Index->InfoColumn]; +} + +const DWARFUnitIndex::Entry * +DWARFUnitIndex::getFromOffset(uint32_t Offset) const { + for (uint32_t i = 0; i != Header.NumBuckets; ++i) + if (const auto &Contribs = Rows[i].Contributions) + if (Contribs[InfoColumn].Offset == Offset) + return &Rows[i]; + return nullptr; +} +} diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp index a6b4c65..4f561d0 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp +++ b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp @@ -27,6 +27,7 @@ WithColor::WithColor(llvm::raw_ostream &OS, enum HighlightColor Type) : OS(OS) { case Tag: OS.changeColor(llvm::raw_ostream::BLUE); break; case Attribute: OS.changeColor(llvm::raw_ostream::CYAN); break; case Enumerator: OS.changeColor(llvm::raw_ostream::MAGENTA); break; + case Macro: OS.changeColor(llvm::raw_ostream::RED); break; } } } diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h index 946a313..16e6835 100644 --- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h +++ b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h @@ -17,7 +17,7 @@ namespace dwarf { namespace syntax { // Symbolic names for various syntax elements. -enum HighlightColor { Address, String, Tag, Attribute, Enumerator }; +enum HighlightColor { Address, String, Tag, Attribute, Enumerator, Macro }; /// An RAII object that temporarily switches an output stream to a /// specific color. diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp index 13201bb..613407e 100644 --- a/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp +++ b/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp @@ -31,7 +31,7 @@ PDB_ErrorCode llvm::loadDataForPDB(PDB_ReaderType Type, StringRef Path, PDB_ErrorCode llvm::loadDataForEXE(PDB_ReaderType Type, StringRef Path, std::unique_ptr<IPDBSession> &Session) { -// Create the correct concrete instance type based on the value of Type. + // Create the correct concrete instance type based on the value of Type. 
#if HAVE_DIA_SDK
return DIASession::createFromExe(Path, Session);
#endif
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
index 83f27c7..ca2ae66 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
@@ -21,24 +21,11 @@ using namespace llvm;
using namespace llvm::object;
PDBContext::PDBContext(const COFFObjectFile &Object,
- std::unique_ptr<IPDBSession> PDBSession,
- bool RelativeAddress)
+ std::unique_ptr<IPDBSession> PDBSession)
: DIContext(CK_PDB), Session(std::move(PDBSession)) {
- if (!RelativeAddress) {
- uint64_t ImageBase = 0;
- if (Object.is64()) {
- const pe32plus_header *Header = nullptr;
- Object.getPE32PlusHeader(Header);
- if (Header)
- ImageBase = Header->ImageBase;
- } else {
- const pe32_header *Header = nullptr;
- Object.getPE32Header(Header);
- if (Header)
- ImageBase = static_cast<uint64_t>(Header->ImageBase);
- }
- Session->setLoadAddress(ImageBase);
- }
+ ErrorOr<uint64_t> ImageBase = Object.getImageBase();
+ if (ImageBase)
+ Session->setLoadAddress(ImageBase.get());
}
void PDBContext::dump(raw_ostream &OS, DIDumpType DumpType) {}
diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
new file mode 100644
index 0000000..a9dee7a
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -0,0 +1,101 @@
+//===- lib/DebugInfo/Symbolize/DIPrinter.cpp ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DIPrinter class, which is responsible for printing
+// structures defined in DebugInfo/DIContext.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/DIPrinter.h"
+
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/Support/LineIterator.h"
+
+namespace llvm {
+namespace symbolize {
+
+// By default, DILineInfo contains "<invalid>" for the function/filename it
+// cannot fetch. We replace it with "??" to make our output closer to addr2line.
+static const char kDILineInfoBadString[] = "<invalid>";
+static const char kBadString[] = "??";
+
+// Prints the source code in FileName around the given Line.
+void DIPrinter::printContext(std::string FileName, int64_t Line) {
+ if (PrintSourceContext <= 0)
+ return;
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
+ MemoryBuffer::getFile(FileName);
+ if (!BufOrErr)
+ return;
+
+ std::unique_ptr<MemoryBuffer> Buf = std::move(BufOrErr.get());
+ int64_t FirstLine =
+ std::max(static_cast<int64_t>(1), Line - PrintSourceContext / 2);
+ int64_t LastLine = FirstLine + PrintSourceContext;
+ size_t MaxLineNumberWidth = std::ceil(std::log10(LastLine));
+
+ for (line_iterator I = line_iterator(*Buf, false);
+ !I.is_at_eof() && I.line_number() <= LastLine; ++I) {
+ int64_t L = I.line_number();
+ if (L >= FirstLine && L <= LastLine) {
+ OS << format_decimal(L, MaxLineNumberWidth);
+ if (L == Line)
+ OS << " >: ";
+ else
+ OS << " : ";
+ OS << *I << "\n";
+ }
+ }
+}
+
+void DIPrinter::print(const DILineInfo &Info, bool Inlined) {
+ if (PrintFunctionNames) {
+ std::string FunctionName = Info.FunctionName;
+ if (FunctionName == kDILineInfoBadString)
+ FunctionName = kBadString;
+
+ StringRef Delimiter = (PrintPretty == true) ?
" at " : "\n"; + StringRef Prefix = (PrintPretty && Inlined) ? " (inlined by) " : ""; + OS << Prefix << FunctionName << Delimiter; + } + std::string Filename = Info.FileName; + if (Filename == kDILineInfoBadString) + Filename = kBadString; + OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n"; + printContext(Filename, Info.Line); +} + +DIPrinter &DIPrinter::operator<<(const DILineInfo &Info) { + print(Info, false); + return *this; +} + +DIPrinter &DIPrinter::operator<<(const DIInliningInfo &Info) { + uint32_t FramesNum = Info.getNumberOfFrames(); + if (FramesNum == 0) { + print(DILineInfo(), false); + return *this; + } + for (uint32_t i = 0; i < FramesNum; i++) + print(Info.getFrame(i), i > 0); + return *this; +} + +DIPrinter &DIPrinter::operator<<(const DIGlobal &Global) { + std::string Name = Global.Name; + if (Name == kDILineInfoBadString) + Name = kBadString; + OS << Name << "\n"; + OS << Global.Start << " " << Global.Size << "\n"; + return *this; +} + +} +} diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp new file mode 100644 index 0000000..e314624 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp @@ -0,0 +1,254 @@ +//===-- SymbolizableObjectFile.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of SymbolizableObjectFile class. +// +//===----------------------------------------------------------------------===// + +#include "SymbolizableObjectFile.h" +#include "llvm/Object/SymbolSize.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" + +namespace llvm { +namespace symbolize { + +using namespace object; + +static DILineInfoSpecifier +getDILineInfoSpecifier(FunctionNameKind FNKind) { + return DILineInfoSpecifier( + DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, FNKind); +} + +ErrorOr<std::unique_ptr<SymbolizableObjectFile>> +SymbolizableObjectFile::create(object::ObjectFile *Obj, + std::unique_ptr<DIContext> DICtx) { + std::unique_ptr<SymbolizableObjectFile> res( + new SymbolizableObjectFile(Obj, std::move(DICtx))); + std::unique_ptr<DataExtractor> OpdExtractor; + uint64_t OpdAddress = 0; + // Find the .opd (function descriptor) section if any, for big-endian + // PowerPC64 ELF. + if (Obj->getArch() == Triple::ppc64) { + for (section_iterator Section : Obj->sections()) { + StringRef Name; + StringRef Data; + if (auto EC = Section->getName(Name)) + return EC; + if (Name == ".opd") { + if (auto EC = Section->getContents(Data)) + return EC; + OpdExtractor.reset(new DataExtractor(Data, Obj->isLittleEndian(), + Obj->getBytesInAddress())); + OpdAddress = Section->getAddress(); + break; + } + } + } + std::vector<std::pair<SymbolRef, uint64_t>> Symbols = + computeSymbolSizes(*Obj); + for (auto &P : Symbols) + res->addSymbol(P.first, P.second, OpdExtractor.get(), OpdAddress); + + // If this is a COFF object and we didn't find any symbols, try the export + // table. 
+ if (Symbols.empty()) { + if (auto *CoffObj = dyn_cast<COFFObjectFile>(Obj)) + if (auto EC = res->addCoffExportSymbols(CoffObj)) + return EC; + } + return std::move(res); +} + +SymbolizableObjectFile::SymbolizableObjectFile(ObjectFile *Obj, + std::unique_ptr<DIContext> DICtx) + : Module(Obj), DebugInfoContext(std::move(DICtx)) {} + +namespace { +struct OffsetNamePair { + uint32_t Offset; + StringRef Name; + bool operator<(const OffsetNamePair &R) const { + return Offset < R.Offset; + } +}; +} + +std::error_code SymbolizableObjectFile::addCoffExportSymbols( + const COFFObjectFile *CoffObj) { + // Get all export names and offsets. + std::vector<OffsetNamePair> ExportSyms; + for (const ExportDirectoryEntryRef &Ref : CoffObj->export_directories()) { + StringRef Name; + uint32_t Offset; + if (auto EC = Ref.getSymbolName(Name)) + return EC; + if (auto EC = Ref.getExportRVA(Offset)) + return EC; + ExportSyms.push_back(OffsetNamePair{Offset, Name}); + } + if (ExportSyms.empty()) + return std::error_code(); + + // Sort by ascending offset. + array_pod_sort(ExportSyms.begin(), ExportSyms.end()); + + // Approximate the symbol sizes by assuming they run to the next symbol. + // FIXME: This assumes all exports are functions. + uint64_t ImageBase = CoffObj->getImageBase(); + for (auto I = ExportSyms.begin(), E = ExportSyms.end(); I != E; ++I) { + OffsetNamePair &Export = *I; + // FIXME: The last export has a one byte size now. + uint32_t NextOffset = I != E ? I->Offset : Export.Offset + 1; + uint64_t SymbolStart = ImageBase + Export.Offset; + uint64_t SymbolSize = NextOffset - Export.Offset; + SymbolDesc SD = {SymbolStart, SymbolSize}; + Functions.insert(std::make_pair(SD, Export.Name)); + } + return std::error_code(); +} + +std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol, + uint64_t SymbolSize, + DataExtractor *OpdExtractor, + uint64_t OpdAddress) { + SymbolRef::Type SymbolType = Symbol.getType(); + if (SymbolType != SymbolRef::ST_Function && SymbolType != SymbolRef::ST_Data) + return std::error_code(); + ErrorOr<uint64_t> SymbolAddressOrErr = Symbol.getAddress(); + if (auto EC = SymbolAddressOrErr.getError()) + return EC; + uint64_t SymbolAddress = *SymbolAddressOrErr; + if (OpdExtractor) { + // For big-endian PowerPC64 ELF, symbols in the .opd section refer to + // function descriptors. The first word of the descriptor is a pointer to + // the function's code. + // For the purposes of symbolization, pretend the symbol's address is that + // of the function's code, not the descriptor. + uint64_t OpdOffset = SymbolAddress - OpdAddress; + uint32_t OpdOffset32 = OpdOffset; + if (OpdOffset == OpdOffset32 && + OpdExtractor->isValidOffsetForAddress(OpdOffset32)) + SymbolAddress = OpdExtractor->getAddress(&OpdOffset32); + } + ErrorOr<StringRef> SymbolNameOrErr = Symbol.getName(); + if (auto EC = SymbolNameOrErr.getError()) + return EC; + StringRef SymbolName = *SymbolNameOrErr; + // Mach-O symbol table names have leading underscore, skip it. + if (Module->isMachO() && SymbolName.size() > 0 && SymbolName[0] == '_') + SymbolName = SymbolName.drop_front(); + // FIXME: If a function has alias, there are two entries in symbol table + // with same address size. Make sure we choose the correct one. + auto &M = SymbolType == SymbolRef::ST_Function ? Functions : Objects; + SymbolDesc SD = { SymbolAddress, SymbolSize }; + M.insert(std::make_pair(SD, SymbolName)); + return std::error_code(); +} + +// Return true if this is a 32-bit x86 PE COFF module. 
+bool SymbolizableObjectFile::isWin32Module() const { + auto *CoffObject = dyn_cast<COFFObjectFile>(Module); + return CoffObject && CoffObject->getMachine() == COFF::IMAGE_FILE_MACHINE_I386; +} + +uint64_t SymbolizableObjectFile::getModulePreferredBase() const { + if (auto *CoffObject = dyn_cast<COFFObjectFile>(Module)) + return CoffObject->getImageBase(); + return 0; +} + +bool SymbolizableObjectFile::getNameFromSymbolTable(SymbolRef::Type Type, + uint64_t Address, + std::string &Name, + uint64_t &Addr, + uint64_t &Size) const { + const auto &SymbolMap = Type == SymbolRef::ST_Function ? Functions : Objects; + if (SymbolMap.empty()) + return false; + SymbolDesc SD = { Address, Address }; + auto SymbolIterator = SymbolMap.upper_bound(SD); + if (SymbolIterator == SymbolMap.begin()) + return false; + --SymbolIterator; + if (SymbolIterator->first.Size != 0 && + SymbolIterator->first.Addr + SymbolIterator->first.Size <= Address) + return false; + Name = SymbolIterator->second.str(); + Addr = SymbolIterator->first.Addr; + Size = SymbolIterator->first.Size; + return true; +} + +bool SymbolizableObjectFile::shouldOverrideWithSymbolTable( + FunctionNameKind FNKind, bool UseSymbolTable) const { + // When DWARF is used with -gline-tables-only / -gmlt, the symbol table gives + // better answers for linkage names than the DIContext. Otherwise, we are + // probably using PEs and PDBs, and we shouldn't do the override. PE files + // generally only contain the names of exported symbols. + return FNKind == FunctionNameKind::LinkageName && UseSymbolTable && + isa<DWARFContext>(DebugInfoContext.get()); +} + +DILineInfo SymbolizableObjectFile::symbolizeCode(uint64_t ModuleOffset, + FunctionNameKind FNKind, + bool UseSymbolTable) const { + DILineInfo LineInfo; + if (DebugInfoContext) { + LineInfo = DebugInfoContext->getLineInfoForAddress( + ModuleOffset, getDILineInfoSpecifier(FNKind)); + } + // Override function name from symbol table if necessary. + if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) { + std::string FunctionName; + uint64_t Start, Size; + if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset, + FunctionName, Start, Size)) { + LineInfo.FunctionName = FunctionName; + } + } + return LineInfo; +} + +DIInliningInfo SymbolizableObjectFile::symbolizeInlinedCode( + uint64_t ModuleOffset, FunctionNameKind FNKind, bool UseSymbolTable) const { + DIInliningInfo InlinedContext; + + if (DebugInfoContext) + InlinedContext = DebugInfoContext->getInliningInfoForAddress( + ModuleOffset, getDILineInfoSpecifier(FNKind)); + // Make sure there is at least one frame in context. + if (InlinedContext.getNumberOfFrames() == 0) + InlinedContext.addFrame(DILineInfo()); + + // Override the function name in lower frame with name from symbol table. 
+ if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) { + std::string FunctionName; + uint64_t Start, Size; + if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset, + FunctionName, Start, Size)) { + InlinedContext.getMutableFrame(InlinedContext.getNumberOfFrames() - 1) + ->FunctionName = FunctionName; + } + } + + return InlinedContext; +} + +DIGlobal SymbolizableObjectFile::symbolizeData(uint64_t ModuleOffset) const { + DIGlobal Res; + getNameFromSymbolTable(SymbolRef::ST_Data, ModuleOffset, Res.Name, Res.Start, + Res.Size); + return Res; +} + +} // namespace symbolize +} // namespace llvm + diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h new file mode 100644 index 0000000..8583b6a --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h @@ -0,0 +1,82 @@ +//===-- SymbolizableObjectFile.h -------------------------------- C++ -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the SymbolizableObjectFile class. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H +#define LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H + +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include <map> + +namespace llvm { +class DataExtractor; +} + +namespace llvm { +namespace symbolize { + +class SymbolizableObjectFile : public SymbolizableModule { +public: + static ErrorOr<std::unique_ptr<SymbolizableObjectFile>> + create(object::ObjectFile *Obj, std::unique_ptr<DIContext> DICtx); + + DILineInfo symbolizeCode(uint64_t ModuleOffset, FunctionNameKind FNKind, + bool UseSymbolTable) const override; + DIInliningInfo symbolizeInlinedCode(uint64_t ModuleOffset, + FunctionNameKind FNKind, + bool UseSymbolTable) const override; + DIGlobal symbolizeData(uint64_t ModuleOffset) const override; + + // Return true if this is a 32-bit x86 PE COFF module. + bool isWin32Module() const override; + + // Returns the preferred base of the module, i.e. where the loader would place + // it in memory assuming there were no conflicts. + uint64_t getModulePreferredBase() const override; + +private: + bool shouldOverrideWithSymbolTable(FunctionNameKind FNKind, + bool UseSymbolTable) const; + + bool getNameFromSymbolTable(object::SymbolRef::Type Type, uint64_t Address, + std::string &Name, uint64_t &Addr, + uint64_t &Size) const; + // For big-endian PowerPC64 ELF, OpdAddress is the address of the .opd + // (function descriptor) section and OpdExtractor refers to its contents. + std::error_code addSymbol(const object::SymbolRef &Symbol, + uint64_t SymbolSize, + DataExtractor *OpdExtractor = nullptr, + uint64_t OpdAddress = 0); + std::error_code addCoffExportSymbols(const object::COFFObjectFile *CoffObj); + + object::ObjectFile *Module; + std::unique_ptr<DIContext> DebugInfoContext; + + struct SymbolDesc { + uint64_t Addr; + // If size is 0, assume that symbol occupies the whole memory range up to + // the following symbol. 
+ uint64_t Size; + friend bool operator<(const SymbolDesc &s1, const SymbolDesc &s2) { + return s1.Addr < s2.Addr; + } + }; + std::map<SymbolDesc, StringRef> Functions; + std::map<SymbolDesc, StringRef> Objects; + + SymbolizableObjectFile(object::ObjectFile *Obj, + std::unique_ptr<DIContext> DICtx); +}; + +} // namespace symbolize +} // namespace llvm + +#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp new file mode 100644 index 0000000..3da1963 --- /dev/null +++ b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -0,0 +1,456 @@ +//===-- LLVMSymbolize.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation for LLVM symbolization library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/Symbolize/Symbolize.h" + +#include "SymbolizableObjectFile.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Config/config.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/PDB/PDB.h" +#include "llvm/DebugInfo/PDB/PDBContext.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include <stdlib.h> + +#if defined(_MSC_VER) +#include <Windows.h> +#include <DbgHelp.h> +#pragma comment(lib, "dbghelp.lib") + +// Windows.h conflicts with our COFF header definitions. +#ifdef IMAGE_FILE_MACHINE_I386 +#undef IMAGE_FILE_MACHINE_I386 +#endif +#endif + +namespace llvm { +namespace symbolize { + +ErrorOr<DILineInfo> LLVMSymbolizer::symbolizeCode(const std::string &ModuleName, + uint64_t ModuleOffset) { + auto InfoOrErr = getOrCreateModuleInfo(ModuleName); + if (auto EC = InfoOrErr.getError()) + return EC; + SymbolizableModule *Info = InfoOrErr.get(); + + // If the user is giving us relative addresses, add the preferred base of the + // object to the offset before we do the query. It's what DIContext expects. + if (Opts.RelativeAddresses) + ModuleOffset += Info->getModulePreferredBase(); + + DILineInfo LineInfo = Info->symbolizeCode(ModuleOffset, Opts.PrintFunctions, + Opts.UseSymbolTable); + if (Opts.Demangle) + LineInfo.FunctionName = DemangleName(LineInfo.FunctionName, Info); + return LineInfo; +} + +ErrorOr<DIInliningInfo> +LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName, + uint64_t ModuleOffset) { + auto InfoOrErr = getOrCreateModuleInfo(ModuleName); + if (auto EC = InfoOrErr.getError()) + return EC; + SymbolizableModule *Info = InfoOrErr.get(); + + // If the user is giving us relative addresses, add the preferred base of the + // object to the offset before we do the query. It's what DIContext expects. 
+ if (Opts.RelativeAddresses) + ModuleOffset += Info->getModulePreferredBase(); + + DIInliningInfo InlinedContext = Info->symbolizeInlinedCode( + ModuleOffset, Opts.PrintFunctions, Opts.UseSymbolTable); + if (Opts.Demangle) { + for (int i = 0, n = InlinedContext.getNumberOfFrames(); i < n; i++) { + auto *Frame = InlinedContext.getMutableFrame(i); + Frame->FunctionName = DemangleName(Frame->FunctionName, Info); + } + } + return InlinedContext; +} + +ErrorOr<DIGlobal> LLVMSymbolizer::symbolizeData(const std::string &ModuleName, + uint64_t ModuleOffset) { + auto InfoOrErr = getOrCreateModuleInfo(ModuleName); + if (auto EC = InfoOrErr.getError()) + return EC; + SymbolizableModule *Info = InfoOrErr.get(); + + // If the user is giving us relative addresses, add the preferred base of + // the object to the offset before we do the query. It's what DIContext + // expects. + if (Opts.RelativeAddresses) + ModuleOffset += Info->getModulePreferredBase(); + + DIGlobal Global = Info->symbolizeData(ModuleOffset); + if (Opts.Demangle) + Global.Name = DemangleName(Global.Name, Info); + return Global; +} + +void LLVMSymbolizer::flush() { + ObjectForUBPathAndArch.clear(); + BinaryForPath.clear(); + ObjectPairForPathArch.clear(); + Modules.clear(); +} + +// For Path="/path/to/foo" and Basename="foo" assume that debug info is in +// /path/to/foo.dSYM/Contents/Resources/DWARF/foo. +// For Path="/path/to/bar.dSYM" and Basename="foo" assume that debug info is in +// /path/to/bar.dSYM/Contents/Resources/DWARF/foo. +static +std::string getDarwinDWARFResourceForPath( + const std::string &Path, const std::string &Basename) { + SmallString<16> ResourceName = StringRef(Path); + if (sys::path::extension(Path) != ".dSYM") { + ResourceName += ".dSYM"; + } + sys::path::append(ResourceName, "Contents", "Resources", "DWARF"); + sys::path::append(ResourceName, Basename); + return ResourceName.str(); +} + +static bool checkFileCRC(StringRef Path, uint32_t CRCHash) { + ErrorOr<std::unique_ptr<MemoryBuffer>> MB = + MemoryBuffer::getFileOrSTDIN(Path); + if (!MB) + return false; + return !zlib::isAvailable() || CRCHash == zlib::crc32(MB.get()->getBuffer()); +} + +static bool findDebugBinary(const std::string &OrigPath, + const std::string &DebuglinkName, uint32_t CRCHash, + std::string &Result) { + std::string OrigRealPath = OrigPath; +#if defined(HAVE_REALPATH) + if (char *RP = realpath(OrigPath.c_str(), nullptr)) { + OrigRealPath = RP; + free(RP); + } +#endif + SmallString<16> OrigDir(OrigRealPath); + llvm::sys::path::remove_filename(OrigDir); + SmallString<16> DebugPath = OrigDir; + // Try /path/to/original_binary/debuglink_name + llvm::sys::path::append(DebugPath, DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = DebugPath.str(); + return true; + } + // Try /path/to/original_binary/.debug/debuglink_name + DebugPath = OrigRealPath; + llvm::sys::path::append(DebugPath, ".debug", DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = DebugPath.str(); + return true; + } + // Try /usr/lib/debug/path/to/original_binary/debuglink_name + DebugPath = "/usr/lib/debug"; + llvm::sys::path::append(DebugPath, llvm::sys::path::relative_path(OrigDir), + DebuglinkName); + if (checkFileCRC(DebugPath, CRCHash)) { + Result = DebugPath.str(); + return true; + } + return false; +} + +static bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName, + uint32_t &CRCHash) { + if (!Obj) + return false; + for (const SectionRef &Section : Obj->sections()) { + StringRef Name; + Section.getName(Name); + Name 
= Name.substr(Name.find_first_not_of("._")); + if (Name == "gnu_debuglink") { + StringRef Data; + Section.getContents(Data); + DataExtractor DE(Data, Obj->isLittleEndian(), 0); + uint32_t Offset = 0; + if (const char *DebugNameStr = DE.getCStr(&Offset)) { + // 4-byte align the offset. + Offset = (Offset + 3) & ~0x3; + if (DE.isValidOffsetForDataOfSize(Offset, 4)) { + DebugName = DebugNameStr; + CRCHash = DE.getU32(&Offset); + return true; + } + } + break; + } + } + return false; +} + +static +bool darwinDsymMatchesBinary(const MachOObjectFile *DbgObj, + const MachOObjectFile *Obj) { + ArrayRef<uint8_t> dbg_uuid = DbgObj->getUuid(); + ArrayRef<uint8_t> bin_uuid = Obj->getUuid(); + if (dbg_uuid.empty() || bin_uuid.empty()) + return false; + return !memcmp(dbg_uuid.data(), bin_uuid.data(), dbg_uuid.size()); +} + +ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath, + const MachOObjectFile *MachExeObj, const std::string &ArchName) { + // On Darwin we may find DWARF in separate object file in + // resource directory. + std::vector<std::string> DsymPaths; + StringRef Filename = sys::path::filename(ExePath); + DsymPaths.push_back(getDarwinDWARFResourceForPath(ExePath, Filename)); + for (const auto &Path : Opts.DsymHints) { + DsymPaths.push_back(getDarwinDWARFResourceForPath(Path, Filename)); + } + for (const auto &Path : DsymPaths) { + auto DbgObjOrErr = getOrCreateObject(Path, ArchName); + if (!DbgObjOrErr) + continue; + ObjectFile *DbgObj = DbgObjOrErr.get(); + const MachOObjectFile *MachDbgObj = dyn_cast<const MachOObjectFile>(DbgObj); + if (!MachDbgObj) + continue; + if (darwinDsymMatchesBinary(MachDbgObj, MachExeObj)) + return DbgObj; + } + return nullptr; +} + +ObjectFile *LLVMSymbolizer::lookUpDebuglinkObject(const std::string &Path, + const ObjectFile *Obj, + const std::string &ArchName) { + std::string DebuglinkName; + uint32_t CRCHash; + std::string DebugBinaryPath; + if (!getGNUDebuglinkContents(Obj, DebuglinkName, CRCHash)) + return nullptr; + if (!findDebugBinary(Path, DebuglinkName, CRCHash, DebugBinaryPath)) + return nullptr; + auto DbgObjOrErr = getOrCreateObject(DebugBinaryPath, ArchName); + if (!DbgObjOrErr) + return nullptr; + return DbgObjOrErr.get(); +} + +ErrorOr<LLVMSymbolizer::ObjectPair> +LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path, + const std::string &ArchName) { + const auto &I = ObjectPairForPathArch.find(std::make_pair(Path, ArchName)); + if (I != ObjectPairForPathArch.end()) + return I->second; + + auto ObjOrErr = getOrCreateObject(Path, ArchName); + if (auto EC = ObjOrErr.getError()) { + ObjectPairForPathArch.insert( + std::make_pair(std::make_pair(Path, ArchName), EC)); + return EC; + } + + ObjectFile *Obj = ObjOrErr.get(); + assert(Obj != nullptr); + ObjectFile *DbgObj = nullptr; + + if (auto MachObj = dyn_cast<const MachOObjectFile>(Obj)) + DbgObj = lookUpDsymFile(Path, MachObj, ArchName); + if (!DbgObj) + DbgObj = lookUpDebuglinkObject(Path, Obj, ArchName); + if (!DbgObj) + DbgObj = Obj; + ObjectPair Res = std::make_pair(Obj, DbgObj); + ObjectPairForPathArch.insert( + std::make_pair(std::make_pair(Path, ArchName), Res)); + return Res; +} + +ErrorOr<ObjectFile *> +LLVMSymbolizer::getOrCreateObject(const std::string &Path, + const std::string &ArchName) { + const auto &I = BinaryForPath.find(Path); + Binary *Bin = nullptr; + if (I == BinaryForPath.end()) { + ErrorOr<OwningBinary<Binary>> BinOrErr = createBinary(Path); + if (auto EC = BinOrErr.getError()) { + BinaryForPath.insert(std::make_pair(Path, EC)); + return EC; + } + Bin 
= BinOrErr->getBinary(); + BinaryForPath.insert(std::make_pair(Path, std::move(BinOrErr.get()))); + } else if (auto EC = I->second.getError()) { + return EC; + } else { + Bin = I->second->getBinary(); + } + + assert(Bin != nullptr); + + if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(Bin)) { + const auto &I = ObjectForUBPathAndArch.find(std::make_pair(Path, ArchName)); + if (I != ObjectForUBPathAndArch.end()) { + if (auto EC = I->second.getError()) + return EC; + return I->second->get(); + } + ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = + UB->getObjectForArch(ArchName); + if (auto EC = ObjOrErr.getError()) { + ObjectForUBPathAndArch.insert( + std::make_pair(std::make_pair(Path, ArchName), EC)); + return EC; + } + ObjectFile *Res = ObjOrErr->get(); + ObjectForUBPathAndArch.insert(std::make_pair(std::make_pair(Path, ArchName), + std::move(ObjOrErr.get()))); + return Res; + } + if (Bin->isObject()) { + return cast<ObjectFile>(Bin); + } + return object_error::arch_not_found; +} + +ErrorOr<SymbolizableModule *> +LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { + const auto &I = Modules.find(ModuleName); + if (I != Modules.end()) { + auto &InfoOrErr = I->second; + if (auto EC = InfoOrErr.getError()) + return EC; + return InfoOrErr->get(); + } + std::string BinaryName = ModuleName; + std::string ArchName = Opts.DefaultArch; + size_t ColonPos = ModuleName.find_last_of(':'); + // Verify that substring after colon form a valid arch name. + if (ColonPos != std::string::npos) { + std::string ArchStr = ModuleName.substr(ColonPos + 1); + if (Triple(ArchStr).getArch() != Triple::UnknownArch) { + BinaryName = ModuleName.substr(0, ColonPos); + ArchName = ArchStr; + } + } + auto ObjectsOrErr = getOrCreateObjectPair(BinaryName, ArchName); + if (auto EC = ObjectsOrErr.getError()) { + // Failed to find valid object file. + Modules.insert(std::make_pair(ModuleName, EC)); + return EC; + } + ObjectPair Objects = ObjectsOrErr.get(); + + std::unique_ptr<DIContext> Context; + if (auto CoffObject = dyn_cast<COFFObjectFile>(Objects.first)) { + // If this is a COFF object, assume it contains PDB debug information. If + // we don't find any we will fall back to the DWARF case. + std::unique_ptr<IPDBSession> Session; + PDB_ErrorCode Error = loadDataForEXE(PDB_ReaderType::DIA, + Objects.first->getFileName(), Session); + if (Error == PDB_ErrorCode::Success) { + Context.reset(new PDBContext(*CoffObject, std::move(Session))); + } + } + if (!Context) + Context.reset(new DWARFContextInMemory(*Objects.second)); + assert(Context); + auto InfoOrErr = + SymbolizableObjectFile::create(Objects.first, std::move(Context)); + auto InsertResult = + Modules.insert(std::make_pair(ModuleName, std::move(InfoOrErr))); + assert(InsertResult.second); + if (auto EC = InsertResult.first->second.getError()) + return EC; + return InsertResult.first->second->get(); +} + +// Undo these various manglings for Win32 extern "C" functions: +// cdecl - _foo +// stdcall - _foo@12 +// fastcall - @foo@12 +// vectorcall - foo@@12 +// These are all different linkage names for 'foo'. +static StringRef demanglePE32ExternCFunc(StringRef SymbolName) { + // Remove any '_' or '@' prefix. + char Front = SymbolName.empty() ? '\0' : SymbolName[0]; + if (Front == '_' || Front == '@') + SymbolName = SymbolName.drop_front(); + + // Remove any '@[0-9]+' suffix. 
+ if (Front != '?') { + size_t AtPos = SymbolName.rfind('@'); + if (AtPos != StringRef::npos && + std::all_of(SymbolName.begin() + AtPos + 1, SymbolName.end(), + [](char C) { return C >= '0' && C <= '9'; })) { + SymbolName = SymbolName.substr(0, AtPos); + } + } + + // Remove any ending '@' for vectorcall. + if (SymbolName.endswith("@")) + SymbolName = SymbolName.drop_back(); + + return SymbolName; +} + +#if !defined(_MSC_VER) +// Assume that __cxa_demangle is provided by libcxxabi (except for Windows). +extern "C" char *__cxa_demangle(const char *mangled_name, char *output_buffer, + size_t *length, int *status); +#endif + +std::string LLVMSymbolizer::DemangleName(const std::string &Name, + const SymbolizableModule *ModInfo) { +#if !defined(_MSC_VER) + // We can spoil names of symbols with C linkage, so use an heuristic + // approach to check if the name should be demangled. + if (Name.substr(0, 2) == "_Z") { + int status = 0; + char *DemangledName = __cxa_demangle(Name.c_str(), nullptr, nullptr, &status); + if (status != 0) + return Name; + std::string Result = DemangledName; + free(DemangledName); + return Result; + } +#else + if (!Name.empty() && Name.front() == '?') { + // Only do MSVC C++ demangling on symbols starting with '?'. + char DemangledName[1024] = {0}; + DWORD result = ::UnDecorateSymbolName( + Name.c_str(), DemangledName, 1023, + UNDNAME_NO_ACCESS_SPECIFIERS | // Strip public, private, protected + UNDNAME_NO_ALLOCATION_LANGUAGE | // Strip __thiscall, __stdcall, etc + UNDNAME_NO_THROW_SIGNATURES | // Strip throw() specifications + UNDNAME_NO_MEMBER_TYPE | // Strip virtual, static, etc specifiers + UNDNAME_NO_MS_KEYWORDS | // Strip all MS extension keywords + UNDNAME_NO_FUNCTION_RETURNS); // Strip function return types + return (result == 0) ? Name : std::string(DemangledName); + } +#endif + if (ModInfo && ModInfo->isWin32Module()) + return std::string(demanglePE32ExternCFunc(Name)); + return Name; +} + +} // namespace symbolize +} // namespace llvm diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp index 67a1ca6..41c8da4 100644 --- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp @@ -61,8 +61,7 @@ ExecutionEngine *(*ExecutionEngine::InterpCtor)(std::unique_ptr<Module> M, void JITEventListener::anchor() {} -ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M) - : LazyFunctionCreator(nullptr) { +void ExecutionEngine::Init(std::unique_ptr<Module> M) { CompilingLazily = false; GVCompilationDisabled = false; SymbolSearchingDisabled = false; @@ -79,6 +78,16 @@ ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M) Modules.push_back(std::move(M)); } +ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M) + : DL(M->getDataLayout()), LazyFunctionCreator(nullptr) { + Init(std::move(M)); +} + +ExecutionEngine::ExecutionEngine(DataLayout DL, std::unique_ptr<Module> M) + : DL(std::move(DL)), LazyFunctionCreator(nullptr) { + Init(std::move(M)); +} + ExecutionEngine::~ExecutionEngine() { clearAllGlobalMappings(); } @@ -86,7 +95,7 @@ ExecutionEngine::~ExecutionEngine() { namespace { /// \brief Helper class which uses a value handler to automatically deletes the /// memory block when the GlobalVariable is destroyed. 
-class GVMemoryBlock : public CallbackVH { +class GVMemoryBlock final : public CallbackVH { GVMemoryBlock(const GlobalVariable *GV) : CallbackVH(const_cast<GlobalVariable*>(GV)) {} @@ -115,7 +124,7 @@ public: } // anonymous namespace char *ExecutionEngine::getMemoryForGV(const GlobalVariable *GV) { - return GVMemoryBlock::Create(GV, *getDataLayout()); + return GVMemoryBlock::Create(GV, getDataLayout()); } void ExecutionEngine::addObjectFile(std::unique_ptr<object::ObjectFile> O) { @@ -187,7 +196,7 @@ std::string ExecutionEngine::getMangledName(const GlobalValue *GV) { const DataLayout &DL = GV->getParent()->getDataLayout().isDefault() - ? *getDataLayout() + ? getDataLayout() : GV->getParent()->getDataLayout(); Mangler::getNameWithPrefix(FullName, GV->getName(), DL); @@ -228,11 +237,10 @@ void ExecutionEngine::clearAllGlobalMappings() { void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) { MutexGuard locked(lock); - for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI) - EEState.RemoveMapping(getMangledName(FI)); - for (Module::global_iterator GI = M->global_begin(), GE = M->global_end(); - GI != GE; ++GI) - EEState.RemoveMapping(getMangledName(GI)); + for (Function &FI : *M) + EEState.RemoveMapping(getMangledName(&FI)); + for (GlobalVariable &GI : M->globals()) + EEState.RemoveMapping(getMangledName(&GI)); } uint64_t ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, @@ -333,7 +341,7 @@ void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE, const std::vector<std::string> &InputArgv) { Values.clear(); // Free the old contents. Values.reserve(InputArgv.size()); - unsigned PtrSize = EE->getDataLayout()->getPointerSize(); + unsigned PtrSize = EE->getDataLayout().getPointerSize(); Array = make_unique<char[]>((InputArgv.size()+1)*PtrSize); DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array.get() << "\n"); @@ -408,7 +416,7 @@ void ExecutionEngine::runStaticConstructorsDestructors(bool isDtors) { #ifndef NDEBUG /// isTargetNullPtr - Return whether the target pointer stored at Loc is null. static bool isTargetNullPtr(ExecutionEngine *EE, void *Loc) { - unsigned PtrSize = EE->getDataLayout()->getPointerSize(); + unsigned PtrSize = EE->getDataLayout().getPointerSize(); for (unsigned i = 0; i < PtrSize; ++i) if (*(i + (uint8_t*)Loc)) return false; @@ -621,8 +629,8 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { break; case Type::VectorTyID: // if the whole vector is 'undef' just reserve memory for the value. 
- const VectorType* VTy = dyn_cast<VectorType>(C->getType()); - const Type *ElemTy = VTy->getElementType(); + auto* VTy = dyn_cast<VectorType>(C->getType()); + Type *ElemTy = VTy->getElementType(); unsigned int elemNum = VTy->getNumElements(); Result.AggregateVal.resize(elemNum); if (ElemTy->isIntegerTy()) @@ -641,8 +649,8 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { case Instruction::GetElementPtr: { // Compute the index GenericValue Result = getConstantValue(Op0); - APInt Offset(DL->getPointerSizeInBits(), 0); - cast<GEPOperator>(CE)->accumulateConstantOffset(*DL, Offset); + APInt Offset(DL.getPointerSizeInBits(), 0); + cast<GEPOperator>(CE)->accumulateConstantOffset(DL, Offset); char* tmp = (char*) Result.PointerVal; Result = PTOGV(tmp + Offset.getSExtValue()); @@ -729,16 +737,16 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { } case Instruction::PtrToInt: { GenericValue GV = getConstantValue(Op0); - uint32_t PtrWidth = DL->getTypeSizeInBits(Op0->getType()); + uint32_t PtrWidth = DL.getTypeSizeInBits(Op0->getType()); assert(PtrWidth <= 64 && "Bad pointer width"); GV.IntVal = APInt(PtrWidth, uintptr_t(GV.PointerVal)); - uint32_t IntWidth = DL->getTypeSizeInBits(CE->getType()); + uint32_t IntWidth = DL.getTypeSizeInBits(CE->getType()); GV.IntVal = GV.IntVal.zextOrTrunc(IntWidth); return GV; } case Instruction::IntToPtr: { GenericValue GV = getConstantValue(Op0); - uint32_t PtrWidth = DL->getTypeSizeInBits(CE->getType()); + uint32_t PtrWidth = DL.getTypeSizeInBits(CE->getType()); GV.IntVal = GV.IntVal.zextOrTrunc(PtrWidth); assert(GV.IntVal.getBitWidth() <= 64 && "Bad pointer width"); GV.PointerVal = PointerTy(uintptr_t(GV.IntVal.getZExtValue())); @@ -860,8 +868,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { GV.IntVal = apfLHS.bitcastToAPInt(); break; case Instruction::FRem: - apfLHS.mod(APFloat(Sem, RHS.IntVal), - APFloat::rmNearestTiesToEven); + apfLHS.mod(APFloat(Sem, RHS.IntVal)); GV.IntVal = apfLHS.bitcastToAPInt(); break; } @@ -1040,7 +1047,7 @@ static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst, void ExecutionEngine::StoreValueToMemory(const GenericValue &Val, GenericValue *Ptr, Type *Ty) { - const unsigned StoreBytes = getDataLayout()->getTypeStoreSize(Ty); + const unsigned StoreBytes = getDataLayout().getTypeStoreSize(Ty); switch (Ty->getTypeID()) { default: @@ -1080,7 +1087,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val, break; } - if (sys::IsLittleEndianHost != getDataLayout()->isLittleEndian()) + if (sys::IsLittleEndianHost != getDataLayout().isLittleEndian()) // Host and target are different endian - reverse the stored bytes. 
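// (Illustration, not part of the patch: storing an i32 0x11223344 from a little-endian host for a big-endian target first writes the bytes 44 33 22 11; the reverse below turns them into 11 22 33 44.)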
std::reverse((uint8_t*)Ptr, StoreBytes + (uint8_t*)Ptr); } @@ -1117,7 +1124,7 @@ static void LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) { void ExecutionEngine::LoadValueFromMemory(GenericValue &Result, GenericValue *Ptr, Type *Ty) { - const unsigned LoadBytes = getDataLayout()->getTypeStoreSize(Ty); + const unsigned LoadBytes = getDataLayout().getTypeStoreSize(Ty); switch (Ty->getTypeID()) { case Type::IntegerTyID: @@ -1143,8 +1150,8 @@ void ExecutionEngine::LoadValueFromMemory(GenericValue &Result, break; } case Type::VectorTyID: { - const VectorType *VT = cast<VectorType>(Ty); - const Type *ElemT = VT->getElementType(); + auto *VT = cast<VectorType>(Ty); + Type *ElemT = VT->getElementType(); const unsigned numElems = VT->getNumElements(); if (ElemT->isFloatTy()) { Result.AggregateVal.resize(numElems); @@ -1183,20 +1190,20 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) { if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) { unsigned ElementSize = - getDataLayout()->getTypeAllocSize(CP->getType()->getElementType()); + getDataLayout().getTypeAllocSize(CP->getType()->getElementType()); for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i) InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize); return; } if (isa<ConstantAggregateZero>(Init)) { - memset(Addr, 0, (size_t)getDataLayout()->getTypeAllocSize(Init->getType())); + memset(Addr, 0, (size_t)getDataLayout().getTypeAllocSize(Init->getType())); return; } if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) { unsigned ElementSize = - getDataLayout()->getTypeAllocSize(CPA->getType()->getElementType()); + getDataLayout().getTypeAllocSize(CPA->getType()->getElementType()); for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i) InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize); return; @@ -1204,7 +1211,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) { if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) { const StructLayout *SL = - getDataLayout()->getStructLayout(cast<StructType>(CPS->getType())); + getDataLayout().getStructLayout(cast<StructType>(CPS->getType())); for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i) InitializeMemory(CPS->getOperand(i), (char*)Addr+SL->getElementOffset(i)); return; @@ -1349,7 +1356,7 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) { InitializeMemory(GV->getInitializer(), GA); Type *ElTy = GV->getType()->getElementType(); - size_t GVSize = (size_t)getDataLayout()->getTypeAllocSize(ElTy); + size_t GVSize = (size_t)getDataLayout().getTypeAllocSize(ElTy); NumInitBytes += (unsigned)GVSize; ++NumGlobals; } diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp index 55ab5af..ff7c4dc 100644 --- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp +++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp @@ -29,7 +29,7 @@ using namespace llvm; DEFINE_SIMPLE_CONVERSION_FUNCTIONS(GenericValue, LLVMGenericValueRef) -inline LLVMTargetMachineRef wrap(const TargetMachine *P) { +static LLVMTargetMachineRef wrap(const TargetMachine *P) { return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine*>(P)); } @@ -210,35 +210,6 @@ LLVMBool LLVMCreateMCJITCompilerForModule( return 1; } -LLVMBool LLVMCreateExecutionEngine(LLVMExecutionEngineRef *OutEE, - LLVMModuleProviderRef MP, - char **OutError) { - /* The module provider is now actually a 
module. */ - return LLVMCreateExecutionEngineForModule(OutEE, - reinterpret_cast<LLVMModuleRef>(MP), - OutError); -} - -LLVMBool LLVMCreateInterpreter(LLVMExecutionEngineRef *OutInterp, - LLVMModuleProviderRef MP, - char **OutError) { - /* The module provider is now actually a module. */ - return LLVMCreateInterpreterForModule(OutInterp, - reinterpret_cast<LLVMModuleRef>(MP), - OutError); -} - -LLVMBool LLVMCreateJITCompiler(LLVMExecutionEngineRef *OutJIT, - LLVMModuleProviderRef MP, - unsigned OptLevel, - char **OutError) { - /* The module provider is now actually a module. */ - return LLVMCreateJITCompilerForModule(OutJIT, - reinterpret_cast<LLVMModuleRef>(MP), - OptLevel, OutError); -} - - void LLVMDisposeExecutionEngine(LLVMExecutionEngineRef EE) { delete unwrap(EE); } @@ -282,11 +253,6 @@ void LLVMAddModule(LLVMExecutionEngineRef EE, LLVMModuleRef M){ unwrap(EE)->addModule(std::unique_ptr<Module>(unwrap(M))); } -void LLVMAddModuleProvider(LLVMExecutionEngineRef EE, LLVMModuleProviderRef MP){ - /* The module provider is now actually a module. */ - LLVMAddModule(EE, reinterpret_cast<LLVMModuleRef>(MP)); -} - LLVMBool LLVMRemoveModule(LLVMExecutionEngineRef EE, LLVMModuleRef M, LLVMModuleRef *OutMod, char **OutError) { Module *Mod = unwrap(M); @@ -295,14 +261,6 @@ LLVMBool LLVMRemoveModule(LLVMExecutionEngineRef EE, LLVMModuleRef M, return 0; } -LLVMBool LLVMRemoveModuleProvider(LLVMExecutionEngineRef EE, - LLVMModuleProviderRef MP, - LLVMModuleRef *OutMod, char **OutError) { - /* The module provider is now actually a module. */ - return LLVMRemoveModule(EE, reinterpret_cast<LLVMModuleRef>(MP), OutMod, - OutError); -} - LLVMBool LLVMFindFunction(LLVMExecutionEngineRef EE, const char *Name, LLVMValueRef *OutFn) { if (Function *F = unwrap(EE)->FindFunctionNamed(Name)) { @@ -318,7 +276,7 @@ void *LLVMRecompileAndRelinkFunction(LLVMExecutionEngineRef EE, } LLVMTargetDataRef LLVMGetExecutionEngineTargetData(LLVMExecutionEngineRef EE) { - return wrap(unwrap(EE)->getDataLayout()); + return wrap(&unwrap(EE)->getDataLayout()); } LLVMTargetMachineRef diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp index dbfa37e..1eb4f7d 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -593,7 +593,7 @@ static GenericValue executeFCMP_UNO(GenericValue Src1, GenericValue Src2, } static GenericValue executeFCMP_BOOL(GenericValue Src1, GenericValue Src2, - const Type *Ty, const bool val) { + Type *Ty, const bool val) { GenericValue Dest; if(Ty->isVectorTy()) { assert(Src1.AggregateVal.size() == Src2.AggregateVal.size()); @@ -788,7 +788,7 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) { } static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2, - GenericValue Src3, const Type *Ty) { + GenericValue Src3, Type *Ty) { GenericValue Dest; if(Ty->isVectorTy()) { assert(Src1.AggregateVal.size() == Src2.AggregateVal.size()); @@ -805,7 +805,7 @@ static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2, void Interpreter::visitSelectInst(SelectInst &I) { ExecutionContext &SF = ECStack.back(); - const Type * Ty = I.getOperand(0)->getType(); + Type * Ty = I.getOperand(0)->getType(); GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Src3 = getOperandValue(I.getOperand(2), SF); @@ -968,7 +968,7 @@ void 
Interpreter::visitAllocaInst(AllocaInst &I) { unsigned NumElements = getOperandValue(I.getOperand(0), SF).IntVal.getZExtValue(); - unsigned TypeSize = (size_t)TD.getTypeAllocSize(Ty); + unsigned TypeSize = (size_t)getDataLayout().getTypeAllocSize(Ty); // Avoid malloc-ing zero bytes, use max()... unsigned MemToAlloc = std::max(1U, NumElements * TypeSize); @@ -1000,7 +1000,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I, for (; I != E; ++I) { if (StructType *STy = dyn_cast<StructType>(*I)) { - const StructLayout *SLO = TD.getStructLayout(STy); + const StructLayout *SLO = getDataLayout().getStructLayout(STy); const ConstantInt *CPU = cast<ConstantInt>(I.getOperand()); unsigned Index = unsigned(CPU->getZExtValue()); @@ -1020,7 +1020,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I, assert(BitWidth == 64 && "Invalid index type for getelementptr"); Idx = (int64_t)IdxGV.IntVal.getZExtValue(); } - Total += TD.getTypeAllocSize(ST->getElementType())*Idx; + Total += getDataLayout().getTypeAllocSize(ST->getElementType()) * Idx; } } @@ -1139,7 +1139,7 @@ void Interpreter::visitShl(BinaryOperator &I) { GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Dest; - const Type *Ty = I.getType(); + Type *Ty = I.getType(); if (Ty->isVectorTy()) { uint32_t src1Size = uint32_t(Src1.AggregateVal.size()); @@ -1166,7 +1166,7 @@ void Interpreter::visitLShr(BinaryOperator &I) { GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Dest; - const Type *Ty = I.getType(); + Type *Ty = I.getType(); if (Ty->isVectorTy()) { uint32_t src1Size = uint32_t(Src1.AggregateVal.size()); @@ -1193,7 +1193,7 @@ void Interpreter::visitAShr(BinaryOperator &I) { GenericValue Src1 = getOperandValue(I.getOperand(0), SF); GenericValue Src2 = getOperandValue(I.getOperand(1), SF); GenericValue Dest; - const Type *Ty = I.getType(); + Type *Ty = I.getType(); if (Ty->isVectorTy()) { size_t src1Size = Src1.AggregateVal.size(); @@ -1237,10 +1237,10 @@ GenericValue Interpreter::executeTruncInst(Value *SrcVal, Type *DstTy, GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy, ExecutionContext &SF) { - const Type *SrcTy = SrcVal->getType(); + Type *SrcTy = SrcVal->getType(); GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->isVectorTy()) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal. 
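// (Illustrative aside, not part of the patch: each lane below reduces to APInt::sext, e.g. APInt(8, 0xFF).sext(32) gives 0xFFFFFFFF, so the i8 value -1 stays -1 as an i32.)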
@@ -1248,7 +1248,7 @@ GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy, for (unsigned i = 0; i < size; i++) Dest.AggregateVal[i].IntVal = Src.AggregateVal[i].IntVal.sext(DBitWidth); } else { - const IntegerType *DITy = cast<IntegerType>(DstTy); + auto *DITy = cast<IntegerType>(DstTy); unsigned DBitWidth = DITy->getBitWidth(); Dest.IntVal = Src.IntVal.sext(DBitWidth); } @@ -1257,10 +1257,10 @@ GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy, GenericValue Interpreter::executeZExtInst(Value *SrcVal, Type *DstTy, ExecutionContext &SF) { - const Type *SrcTy = SrcVal->getType(); + Type *SrcTy = SrcVal->getType(); GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->isVectorTy()) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); @@ -1269,7 +1269,7 @@ GenericValue Interpreter::executeZExtInst(Value *SrcVal, Type *DstTy, for (unsigned i = 0; i < size; i++) Dest.AggregateVal[i].IntVal = Src.AggregateVal[i].IntVal.zext(DBitWidth); } else { - const IntegerType *DITy = cast<IntegerType>(DstTy); + auto *DITy = cast<IntegerType>(DstTy); unsigned DBitWidth = DITy->getBitWidth(); Dest.IntVal = Src.IntVal.zext(DBitWidth); } @@ -1327,8 +1327,8 @@ GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); - const Type *SrcVecTy = SrcTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); + Type *SrcVecTy = SrcTy->getScalarType(); uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal. 
@@ -1365,8 +1365,8 @@ GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcTy->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); - const Type *SrcVecTy = SrcTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); + Type *SrcVecTy = SrcTy->getScalarType(); uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal @@ -1401,7 +1401,7 @@ GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcVal->getType()->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal Dest.AggregateVal.resize(size); @@ -1433,7 +1433,7 @@ GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); if (SrcVal->getType()->getTypeID() == Type::VectorTyID) { - const Type *DstVecTy = DstTy->getScalarType(); + Type *DstVecTy = DstTy->getScalarType(); unsigned size = Src.AggregateVal.size(); // the sizes of src and dst vectors must be equal Dest.AggregateVal.resize(size); @@ -1477,7 +1477,7 @@ GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, Type *DstTy, GenericValue Dest, Src = getOperandValue(SrcVal, SF); assert(DstTy->isPointerTy() && "Invalid IntToPtr instruction"); - uint32_t PtrSize = TD.getPointerSizeInBits(); + uint32_t PtrSize = getDataLayout().getPointerSizeInBits(); if (PtrSize != Src.IntVal.getBitWidth()) Src.IntVal = Src.IntVal.zextOrTrunc(PtrSize); @@ -1497,10 +1497,10 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy, (DstTy->getTypeID() == Type::VectorTyID)) { // vector src bitcast to vector dst or vector src bitcast to scalar dst or // scalar src bitcast to vector dst - bool isLittleEndian = TD.isLittleEndian(); + bool isLittleEndian = getDataLayout().isLittleEndian(); GenericValue TempDst, TempSrc, SrcVec; - const Type *SrcElemTy; - const Type *DstElemTy; + Type *SrcElemTy; + Type *DstElemTy; unsigned SrcBitSize; unsigned DstBitSize; unsigned SrcNum; @@ -2091,7 +2091,7 @@ void Interpreter::callFunction(Function *F, ArrayRef<GenericValue> ArgVals) { } // Get pointers to first LLVM BB & Instruction in function. - StackFrame.CurBB = F->begin(); + StackFrame.CurBB = &F->front(); StackFrame.CurInst = StackFrame.CurBB->begin(); // Run through the function arguments and initialize their values... @@ -2103,7 +2103,7 @@ void Interpreter::callFunction(Function *F, ArrayRef<GenericValue> ArgVals) { unsigned i = 0; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI, ++i) - SetValue(AI, ArgVals[i], StackFrame); + SetValue(&*AI, ArgVals[i], StackFrame); // Handle varargs arguments... StackFrame.VarArgs.assign(ArgVals.begin()+i, ArgVals.end()); @@ -2121,27 +2121,5 @@ void Interpreter::run() { DEBUG(dbgs() << "About to interpret: " << I); visit(I); // Dispatch to one of the visit* methods... -#if 0 - // This is not safe, as visiting the instruction could lower it and free I.
-DEBUG( - if (!isa<CallInst>(I) && !isa<InvokeInst>(I) && - I.getType() != Type::VoidTy) { - dbgs() << " --> "; - const GenericValue &Val = SF.Values[&I]; - switch (I.getType()->getTypeID()) { - default: llvm_unreachable("Invalid GenericValue Type"); - case Type::VoidTyID: dbgs() << "void"; break; - case Type::FloatTyID: dbgs() << "float " << Val.FloatVal; break; - case Type::DoubleTyID: dbgs() << "double " << Val.DoubleVal; break; - case Type::PointerTyID: dbgs() << "void* " << intptr_t(Val.PointerVal); - break; - case Type::IntegerTyID: - dbgs() << "i" << Val.IntVal.getBitWidth() << " " - << Val.IntVal.toStringUnsigned(10) - << " (0x" << Val.IntVal.toStringUnsigned(16) << ")\n"; - break; - } - }); -#endif } } diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 9b44042..441f0eb 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -178,7 +178,7 @@ static void *ffiValueFor(Type *Ty, const GenericValue &AV, } static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, - const DataLayout *TD, GenericValue &Result) { + const DataLayout &TD, GenericValue &Result) { ffi_cif cif; FunctionType *FTy = F->getFunctionType(); const unsigned NumArgs = F->arg_size(); @@ -198,7 +198,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, const unsigned ArgNo = A->getArgNo(); Type *ArgTy = FTy->getParamType(ArgNo); args[ArgNo] = ffiTypeFor(ArgTy); - ArgBytes += TD->getTypeStoreSize(ArgTy); + ArgBytes += TD.getTypeStoreSize(ArgTy); } SmallVector<uint8_t, 128> ArgData; @@ -210,7 +210,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, const unsigned ArgNo = A->getArgNo(); Type *ArgTy = FTy->getParamType(ArgNo); values[ArgNo] = ffiValueFor(ArgTy, ArgVals[ArgNo], ArgDataPtr); - ArgDataPtr += TD->getTypeStoreSize(ArgTy); + ArgDataPtr += TD.getTypeStoreSize(ArgTy); } Type *RetTy = FTy->getReturnType(); @@ -219,7 +219,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F, ArrayRef<GenericValue> ArgVals, if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, &args[0]) == FFI_OK) { SmallVector<uint8_t, 128> ret; if (RetTy->getTypeID() != Type::VoidTyID) - ret.resize(TD->getTypeStoreSize(RetTy)); + ret.resize(TD.getTypeStoreSize(RetTy)); ffi_call(&cif, Fn, ret.data(), values.data()); switch (RetTy->getTypeID()) { case Type::IntegerTyID: @@ -368,7 +368,7 @@ static GenericValue lle_X_sprintf(FunctionType *FT, case 'x': case 'X': if (HowLong >= 1) { if (HowLong == 1 && - TheInterpreter->getDataLayout()->getPointerSizeInBits() == 64 && + TheInterpreter->getDataLayout().getPointerSizeInBits() == 64 && sizeof(long) < sizeof(int64_t)) { // Make sure we use %lld with a 64 bit argument because we might be // compiling LLI on a 32 bit compiler. diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp index f103c09..bc7da2e 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp @@ -35,7 +35,7 @@ extern "C" void LLVMLinkInInterpreter() { } ExecutionEngine *Interpreter::create(std::unique_ptr<Module> M, std::string *ErrStr) { // Tell this Module to materialize everything and release the GVMaterializer. 
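// (Background note, not part of the patch: the interpreter visits IR in memory, so every function body must be pulled out of the bitcode reader up front; that is what the renamed materializeAll() call below does.)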
- if (std::error_code EC = M->materializeAllPermanently()) { + if (std::error_code EC = M->materializeAll()) { if (ErrStr) *ErrStr = EC.message(); // We got an error, just return 0 @@ -49,16 +49,15 @@ ExecutionEngine *Interpreter::create(std::unique_ptr<Module> M, // Interpreter ctor - Initialize stuff // Interpreter::Interpreter(std::unique_ptr<Module> M) - : ExecutionEngine(std::move(M)), TD(Modules.back().get()) { + : ExecutionEngine(std::move(M)) { memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped)); - setDataLayout(&TD); // Initialize the "backend" initializeExecutionEngine(); initializeExternalFunctions(); emitGlobals(); - IL = new IntrinsicLowering(TD); + IL = new IntrinsicLowering(getDataLayout()); } Interpreter::~Interpreter() { diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h index f976641..2e5a867 100644 --- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h +++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h @@ -26,7 +26,6 @@ namespace llvm { class IntrinsicLowering; -struct FunctionInfo; template<typename T> class generic_gep_type_iterator; class ConstantExpr; typedef generic_gep_type_iterator<User::const_op_iterator> gep_type_iterator; @@ -95,7 +94,6 @@ struct ExecutionContext { // class Interpreter : public ExecutionEngine, public InstVisitor<Interpreter> { GenericValue ExitValue; // The return value of the called function - DataLayout TD; IntrinsicLowering *IL; // The runtime stack of executing code. The top of the stack is the current diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index f6944ee..6cbebe9 100644 --- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -65,12 +65,13 @@ MCJIT::createJIT(std::unique_ptr<Module> M, std::move(Resolver)); } -MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm, +MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> TM, std::shared_ptr<MCJITMemoryManager> MemMgr, std::shared_ptr<RuntimeDyld::SymbolResolver> Resolver) - : ExecutionEngine(std::move(M)), TM(std::move(tm)), Ctx(nullptr), - MemMgr(std::move(MemMgr)), Resolver(*this, std::move(Resolver)), - Dyld(*this->MemMgr, this->Resolver), ObjCache(nullptr) { + : ExecutionEngine(TM->createDataLayout(), std::move(M)), TM(std::move(TM)), + Ctx(nullptr), MemMgr(std::move(MemMgr)), + Resolver(*this, std::move(Resolver)), Dyld(*this->MemMgr, this->Resolver), + ObjCache(nullptr) { // FIXME: We are managing our modules, so we do not want the base class // ExecutionEngine to manage them as well. To avoid double destruction // of the first (and only) module added in ExecutionEngine constructor @@ -85,7 +86,6 @@ MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm, Modules.clear(); OwnedModules.addModule(std::move(First)); - setDataLayout(TM->getDataLayout()); RegisterJITEventListener(JITEventListener::createGDBRegistrationListener()); } @@ -159,7 +159,6 @@ std::unique_ptr<MemoryBuffer> MCJIT::emitObject(Module *M) { // Initialize passes. 
PM.run(*M); // Flush the output buffer to get the generated code into memory - ObjStream.flush(); std::unique_ptr<MemoryBuffer> CompiledObjBuffer( new ObjectMemoryBuffer(std::move(ObjBufferSV))); @@ -193,7 +192,11 @@ void MCJIT::generateCodeForModule(Module *M) { if (ObjCache) ObjectToLoad = ObjCache->getObject(M); - M->setDataLayout(*TM->getDataLayout()); + if (M->getDataLayout().isDefault()) { + M->setDataLayout(getDataLayout()); + } else { + assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch"); + } // If the cache did not contain a suitable object, compile the object if (!ObjectToLoad) { @@ -265,7 +268,7 @@ void MCJIT::finalizeModule(Module *M) { RuntimeDyld::SymbolInfo MCJIT::findExistingSymbol(const std::string &Name) { SmallString<128> FullName; - Mangler::getNameWithPrefix(FullName, Name, *TM->getDataLayout()); + Mangler::getNameWithPrefix(FullName, Name, getDataLayout()); if (void *Addr = getPointerToGlobalIfAvailable(FullName)) return RuntimeDyld::SymbolInfo(static_cast<uint64_t>( @@ -315,10 +318,12 @@ RuntimeDyld::SymbolInfo MCJIT::findSymbol(const std::string &Name, object::Archive *A = OB.getBinary(); // Look for our symbols in each Archive object::Archive::child_iterator ChildIt = A->findSym(Name); + if (std::error_code EC = ChildIt->getError()) + report_fatal_error(EC.message()); if (ChildIt != A->child_end()) { // FIXME: Support nested archives? ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr = - ChildIt->getAsBinary(); + (*ChildIt)->getAsBinary(); if (ChildBinOrErr.getError()) continue; std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get(); diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h index a45173c..3c9d2fd 100644 --- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -86,7 +86,7 @@ class MCJIT : public ExecutionEngine { ModulePtrSet::iterator begin_added() { return AddedModules.begin(); } ModulePtrSet::iterator end_added() { return AddedModules.end(); } iterator_range<ModulePtrSet::iterator> added() { - return iterator_range<ModulePtrSet::iterator>(begin_added(), end_added()); + return make_range(begin_added(), end_added()); } ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); } @@ -223,12 +223,13 @@ public: /// FindFunctionNamed - Search all of the active modules to find the function that /// defines FnName. This is a very slow operation and shouldn't be used for /// general code. - virtual Function *FindFunctionNamed(const char *FnName) override; + Function *FindFunctionNamed(const char *FnName) override; - /// FindGlobalVariableNamed - Search all of the active modules to find the global variable - /// that defines Name. This is a very slow operation and shouldn't be used for - /// general code. - virtual GlobalVariable *FindGlobalVariableNamed(const char *Name, bool AllowInternal = false) override; + /// FindGlobalVariableNamed - Search all of the active modules to find the + /// global variable that defines Name. This is a very slow operation and + /// shouldn't be used for general code. + GlobalVariable *FindGlobalVariableNamed(const char *Name, + bool AllowInternal = false) override; /// Sets the object manager that MCJIT should use to avoid compilation.
void setObjectCache(ObjectCache *manager) override; @@ -335,6 +336,6 @@ protected: bool CheckFunctionsOnly); }; -} // End llvm namespace +} // end llvm namespace -#endif +#endif // LLVM_LIB_EXECUTIONENGINE_MCJIT_MCJIT_H diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index b439810..34564e4 100644 --- a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -19,6 +19,9 @@ namespace llvm { namespace orc { +void JITCompileCallbackManager::anchor() {} +void IndirectStubsManager::anchor() {} + Constant* createIRTypedAddress(FunctionType &FT, TargetAddress Addr) { Constant *AddrIntVal = ConstantInt::get(Type::getInt64Ty(FT.getContext()), Addr); @@ -37,7 +40,7 @@ GlobalVariable* createImplPointer(PointerType &PT, Module &M, return IP; } -void makeStub(Function &F, GlobalVariable &ImplPointer) { +void makeStub(Function &F, Value &ImplPointer) { assert(F.isDeclaration() && "Can't turn a definition into a stub."); assert(F.getParent() && "Function isn't in a module."); Module &M = *F.getParent(); @@ -61,9 +64,7 @@ class GlobalRenamer { public: static bool needsRenaming(const Value &New) { - if (!New.hasName() || New.getName().startswith("\01L")) - return true; - return false; + return !New.hasName() || New.getName().startswith("\01L"); } const std::string& getRename(const Value &Orig) { @@ -106,6 +107,9 @@ void makeAllSymbolsExternallyAccessible(Module &M) { for (auto &GV : M.globals()) raiseVisibilityOnValue(GV, Renamer); + + for (auto &A : M.aliases()) + raiseVisibilityOnValue(A, Renamer); } Function* cloneFunctionDecl(Module &Dst, const Function &F, @@ -121,7 +125,7 @@ Function* cloneFunctionDecl(Module &Dst, const Function &F, auto NewArgI = NewF->arg_begin(); for (auto ArgI = F.arg_begin(), ArgE = F.arg_end(); ArgI != ArgE; ++ArgI, ++NewArgI) - (*VMap)[ArgI] = NewArgI; + (*VMap)[&*ArgI] = &*NewArgI; } return NewF; @@ -177,5 +181,16 @@ void moveGlobalVariableInitializer(GlobalVariable &OrigGV, nullptr, Materializer)); } +GlobalAlias* cloneGlobalAliasDecl(Module &Dst, const GlobalAlias &OrigA, + ValueToValueMapTy &VMap) { + assert(OrigA.getAliasee() && "Original alias doesn't have an aliasee?"); + auto *NewA = GlobalAlias::create(OrigA.getValueType(), + OrigA.getType()->getPointerAddressSpace(), + OrigA.getLinkage(), OrigA.getName(), &Dst); + NewA->copyAttributesFrom(&OrigA); + VMap[&OrigA] = NewA; + return NewA; +} + } // End namespace orc. } // End namespace llvm. diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp new file mode 100644 index 0000000..01e829f --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcArchitectureSupport.cpp @@ -0,0 +1,171 @@ +//===- OrcArchitectureSupport.cpp - Architecture specific support code ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/ExecutionEngine/Orc/OrcArchitectureSupport.h" +#include "llvm/Support/Process.h" +#include <array> + +namespace llvm { +namespace orc { + +void OrcX86_64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn, + void *CallbackMgr) { + + const uint8_t ResolverCode[] = { + // resolver_entry: + 0x55, // 0x00: pushq %rbp + 0x48, 0x89, 0xe5, // 0x01: movq %rsp, %rbp + 0x50, // 0x04: pushq %rax + 0x53, // 0x05: pushq %rbx + 0x51, // 0x06: pushq %rcx + 0x52, // 0x07: pushq %rdx + 0x56, // 0x08: pushq %rsi + 0x57, // 0x09: pushq %rdi + 0x41, 0x50, // 0x0a: pushq %r8 + 0x41, 0x51, // 0x0c: pushq %r9 + 0x41, 0x52, // 0x0e: pushq %r10 + 0x41, 0x53, // 0x10: pushq %r11 + 0x41, 0x54, // 0x12: pushq %r12 + 0x41, 0x55, // 0x14: pushq %r13 + 0x41, 0x56, // 0x16: pushq %r14 + 0x41, 0x57, // 0x18: pushq %r15 + 0x48, 0x81, 0xec, 0x08, 0x02, 0x00, 0x00, // 0x1a: subq $0x208, %rsp + 0x48, 0x0f, 0xae, 0x04, 0x24, // 0x21: fxsave64 (%rsp) + 0x48, 0x8d, 0x3d, 0x43, 0x00, 0x00, 0x00, // 0x26: leaq 67(%rip), %rdi + 0x48, 0x8b, 0x3f, // 0x2d: movq (%rdi), %rdi + 0x48, 0x8b, 0x75, 0x08, // 0x30: movq 8(%rbp), %rsi + 0x48, 0x83, 0xee, 0x06, // 0x34: subq $6, %rsi + 0x48, 0xb8, // 0x38: movabsq $0, %rax + + // 0x3a: JIT re-entry fn addr: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + + 0xff, 0xd0, // 0x42: callq *%rax + 0x48, 0x89, 0x45, 0x08, // 0x44: movq %rax, 8(%rbp) + 0x48, 0x0f, 0xae, 0x0c, 0x24, // 0x48: fxrstor64 (%rsp) + 0x48, 0x81, 0xc4, 0x08, 0x02, 0x00, 0x00, // 0x4d: addq $0x208, %rsp + 0x41, 0x5f, // 0x54: popq %r15 + 0x41, 0x5e, // 0x56: popq %r14 + 0x41, 0x5d, // 0x58: popq %r13 + 0x41, 0x5c, // 0x5a: popq %r12 + 0x41, 0x5b, // 0x5c: popq %r11 + 0x41, 0x5a, // 0x5e: popq %r10 + 0x41, 0x59, // 0x60: popq %r9 + 0x41, 0x58, // 0x62: popq %r8 + 0x5f, // 0x64: popq %rdi + 0x5e, // 0x65: popq %rsi + 0x5a, // 0x66: popq %rdx + 0x59, // 0x67: popq %rcx + 0x5b, // 0x68: popq %rbx + 0x58, // 0x69: popq %rax + 0x5d, // 0x6a: popq %rbp + 0xc3, // 0x6b: retq + 0x00, 0x00, 0x00, 0x00, // 0x6c: <padding> + + // 0x70: Callback mgr address. + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + const unsigned ReentryFnAddrOffset = 0x3a; + const unsigned CallbackMgrAddrOffset = 0x70; + + memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode)); + memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn)); + memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr, + sizeof(CallbackMgr)); +} + +void OrcX86_64::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr, + unsigned NumTrampolines) { + + unsigned OffsetToPtr = NumTrampolines * TrampolineSize; + + memcpy(TrampolineMem + OffsetToPtr, &ResolverAddr, sizeof(void*)); + + uint64_t *Trampolines = reinterpret_cast<uint64_t*>(TrampolineMem); + uint64_t CallIndirPCRel = 0xf1c40000000015ff; + + for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize) + Trampolines[I] = CallIndirPCRel | ((OffsetToPtr - 6) << 16); +} + +std::error_code OrcX86_64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo, + unsigned MinStubs, + void *InitialPtrVal) { + // Stub format is: + // + // .section __orc_stubs + // stub1: + // jmpq *ptr1(%rip) + // .byte 0xC4 ; <- Invalid opcode padding. + // .byte 0xF1 + // stub2: + // jmpq *ptr2(%rip) + // + // ... + // + // .section __orc_ptrs + // ptr1: + // .quad 0x0 + // ptr2: + // .quad 0x0 + // + // ...
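+ // (Illustrative note, not from the original comment: the constant 0xF1C40000000025ff used below is little-endian for the byte sequence FF 25 <disp32=0> C4 F1, i.e. "jmpq *disp32(%rip)" followed by the two invalid-opcode padding bytes shown above; PtrOffsetField ORs the distance from the end of each 6-byte jmp to its pointer, NumPages * PageSize - 6, into the disp32 field at bit 16.)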
+ const unsigned StubSize = IndirectStubsInfo::StubSize; + + // Emit at least MinStubs, rounded up to fill the pages allocated. + unsigned PageSize = sys::Process::getPageSize(); + unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize; + unsigned NumStubs = (NumPages * PageSize) / StubSize; + + // Allocate memory for stubs and pointers in one call. + std::error_code EC; + auto StubsMem = + sys::OwningMemoryBlock( + sys::Memory::allocateMappedMemory(2 * NumPages * PageSize, nullptr, + sys::Memory::MF_READ | + sys::Memory::MF_WRITE, + EC)); + + if (EC) + return EC; + + // Create separate MemoryBlocks representing the stubs and pointers. + sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize); + sys::MemoryBlock PtrsBlock(static_cast<char*>(StubsMem.base()) + + NumPages * PageSize, + NumPages * PageSize); + + // Populate the stubs page and mark it executable. + uint64_t *Stub = reinterpret_cast<uint64_t*>(StubsBlock.base()); + uint64_t PtrOffsetField = + static_cast<uint64_t>(NumPages * PageSize - 6) << 16; + for (unsigned I = 0; I < NumStubs; ++I) + Stub[I] = 0xF1C40000000025ff | PtrOffsetField; + + if (auto EC = sys::Memory::protectMappedMemory(StubsBlock, + sys::Memory::MF_READ | + sys::Memory::MF_EXEC)) + return EC; + + // Initialize all pointers to point at InitialPtrVal. + void **Ptr = reinterpret_cast<void**>(PtrsBlock.base()); + for (unsigned I = 0; I < NumStubs; ++I) + Ptr[I] = InitialPtrVal; + + StubsInfo.NumStubs = NumStubs; + StubsInfo.StubsMem = std::move(StubsMem); + + return std::error_code(); +} + +} // End namespace orc. +} // End namespace llvm. diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp new file mode 100644 index 0000000..d2379cd --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp @@ -0,0 +1,97 @@ +//===----------- OrcCBindings.cpp - C bindings for the Orc APIs -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "OrcCBindingsStack.h" +#include "llvm-c/OrcBindings.h" + +using namespace llvm; + +LLVMOrcJITStackRef LLVMOrcCreateInstance(LLVMTargetMachineRef TM) { + TargetMachine *TM2(unwrap(TM)); + + Triple T(TM2->getTargetTriple()); + + auto CompileCallbackMgr = OrcCBindingsStack::createCompileCallbackMgr(T); + auto IndirectStubsMgrBuilder = + OrcCBindingsStack::createIndirectStubsMgrBuilder(T); + + OrcCBindingsStack *JITStack = + new OrcCBindingsStack(*TM2, std::move(CompileCallbackMgr), + IndirectStubsMgrBuilder); + + return wrap(JITStack); +} + +void LLVMOrcGetMangledSymbol(LLVMOrcJITStackRef JITStack, char **MangledName, + const char *SymbolName) { + OrcCBindingsStack &J = *unwrap(JITStack); + std::string Mangled = J.mangle(SymbolName); + *MangledName = new char[Mangled.size() + 1]; + strcpy(*MangledName, Mangled.c_str()); +} + +void LLVMOrcDisposeMangledSymbol(char *MangledName) { + delete[] MangledName; +} + +LLVMOrcTargetAddress +LLVMOrcCreateLazyCompileCallback(LLVMOrcJITStackRef JITStack, + LLVMOrcLazyCompileCallbackFn Callback, + void *CallbackCtx) { + OrcCBindingsStack &J = *unwrap(JITStack); + return J.createLazyCompileCallback(Callback, CallbackCtx); +} + +void LLVMOrcCreateIndirectStub(LLVMOrcJITStackRef JITStack, + const char *StubName, + LLVMOrcTargetAddress InitAddr) { + OrcCBindingsStack &J = *unwrap(JITStack); + J.createIndirectStub(StubName, InitAddr); +} + +void LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack, + const char *StubName, + LLVMOrcTargetAddress NewAddr) { + OrcCBindingsStack &J = *unwrap(JITStack); + J.setIndirectStubPointer(StubName, NewAddr); +} + +LLVMOrcModuleHandle +LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod, + LLVMOrcSymbolResolverFn SymbolResolver, + void *SymbolResolverCtx) { + OrcCBindingsStack &J = *unwrap(JITStack); + Module *M(unwrap(Mod)); + return J.addIRModuleEager(M, SymbolResolver, SymbolResolverCtx); +} + +LLVMOrcModuleHandle +LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod, + LLVMOrcSymbolResolverFn SymbolResolver, + void *SymbolResolverCtx) { + OrcCBindingsStack &J = *unwrap(JITStack); + Module *M(unwrap(Mod)); + return J.addIRModuleLazy(M, SymbolResolver, SymbolResolverCtx); +} + +void LLVMOrcRemoveModule(LLVMOrcJITStackRef JITStack, LLVMOrcModuleHandle H) { + OrcCBindingsStack &J = *unwrap(JITStack); + J.removeModule(H); +} + +LLVMOrcTargetAddress LLVMOrcGetSymbolAddress(LLVMOrcJITStackRef JITStack, + const char *SymbolName) { + OrcCBindingsStack &J = *unwrap(JITStack); + auto Sym = J.findSymbol(SymbolName, true); + return Sym.getAddress(); +} + +void LLVMOrcDisposeInstance(LLVMOrcJITStackRef JITStack) { + delete unwrap(JITStack); +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp new file mode 100644 index 0000000..956daae --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.cpp @@ -0,0 +1,43 @@ +//===-------- OrcCBindingsStack.cpp - Orc JIT stack for C bindings --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "OrcCBindingsStack.h" + +#include "llvm/ExecutionEngine/Orc/OrcArchitectureSupport.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DynamicLibrary.h" +#include <cstdio> +#include <system_error> + +using namespace llvm; + +std::unique_ptr<OrcCBindingsStack::CompileCallbackMgr> +OrcCBindingsStack::createCompileCallbackMgr(Triple T) { + switch (T.getArch()) { + default: return nullptr; + + case Triple::x86_64: { + typedef orc::LocalJITCompileCallbackManager<orc::OrcX86_64> CCMgrT; + return llvm::make_unique<CCMgrT>(0); + } + } +} + +OrcCBindingsStack::IndirectStubsManagerBuilder +OrcCBindingsStack::createIndirectStubsMgrBuilder(Triple T) { + switch (T.getArch()) { + default: return nullptr; + + case Triple::x86_64: + return [](){ + return llvm::make_unique< + orc::LocalIndirectStubsManager<orc::OrcX86_64>>(); + }; + } +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h new file mode 100644 index 0000000..aae6a99 --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -0,0 +1,283 @@ +//===--- OrcCBindingsStack.h - Orc JIT stack for C bindings ---*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H +#define LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H + +#include "llvm/ADT/Triple.h" +#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm-c/OrcBindings.h" + +namespace llvm { + +class OrcCBindingsStack; + +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(OrcCBindingsStack, LLVMOrcJITStackRef) +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef) + +class OrcCBindingsStack { +public: + + typedef orc::JITCompileCallbackManager CompileCallbackMgr; + typedef orc::ObjectLinkingLayer<> ObjLayerT; + typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT; + typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr> CODLayerT; + + typedef std::function<std::unique_ptr<CompileCallbackMgr>()> + CallbackManagerBuilder; + + typedef CODLayerT::IndirectStubsManagerBuilderT IndirectStubsManagerBuilder; + +private: + + class GenericHandle { + public: + virtual ~GenericHandle() {} + virtual orc::JITSymbol findSymbolIn(const std::string &Name, + bool ExportedSymbolsOnly) = 0; + virtual void removeModule() = 0; + }; + + template <typename LayerT> + class GenericHandleImpl : public GenericHandle { + public: + GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleSetHandleT Handle) + : Layer(Layer), Handle(std::move(Handle)) {} + + orc::JITSymbol findSymbolIn(const std::string &Name, + bool ExportedSymbolsOnly) override { + return Layer.findSymbolIn(Handle, Name, ExportedSymbolsOnly); + } + + void removeModule() override { + return Layer.removeModuleSet(Handle); + } + + private: + LayerT &Layer; + typename LayerT::ModuleSetHandleT Handle; + }; + + template <typename LayerT> + std::unique_ptr<GenericHandleImpl<LayerT>> + createGenericHandle(LayerT &Layer, 
typename LayerT::ModuleSetHandleT Handle) { + return llvm::make_unique<GenericHandleImpl<LayerT>>(Layer, + std::move(Handle)); + } + +public: + + // We need a 'ModuleSetHandleT' to conform to the layer concept. + typedef unsigned ModuleSetHandleT; + + typedef unsigned ModuleHandleT; + + static std::unique_ptr<CompileCallbackMgr> createCompileCallbackMgr(Triple T); + static IndirectStubsManagerBuilder createIndirectStubsMgrBuilder(Triple T); + + OrcCBindingsStack(TargetMachine &TM, + std::unique_ptr<CompileCallbackMgr> CCMgr, + IndirectStubsManagerBuilder IndirectStubsMgrBuilder) + : DL(TM.createDataLayout()), CCMgr(std::move(CCMgr)), + ObjectLayer(), + CompileLayer(ObjectLayer, orc::SimpleCompiler(TM)), + CODLayer(CompileLayer, + [](Function &F) { std::set<Function*> S; S.insert(&F); return S; }, + *this->CCMgr, std::move(IndirectStubsMgrBuilder), false), + IndirectStubsMgr(IndirectStubsMgrBuilder()), + CXXRuntimeOverrides([this](const std::string &S) { return mangle(S); }) {} + + ~OrcCBindingsStack() { + // Run any destructors registered with __cxa_atexit. + CXXRuntimeOverrides.runDestructors(); + // Run any IR destructors. + for (auto &DtorRunner : IRStaticDestructorRunners) + DtorRunner.runViaLayer(*this); + } + + std::string mangle(StringRef Name) { + std::string MangledName; + { + raw_string_ostream MangledNameStream(MangledName); + Mangler::getNameWithPrefix(MangledNameStream, Name, DL); + } + return MangledName; + } + + template <typename PtrTy> + static PtrTy fromTargetAddress(orc::TargetAddress Addr) { + return reinterpret_cast<PtrTy>(static_cast<uintptr_t>(Addr)); + } + + orc::TargetAddress + createLazyCompileCallback(LLVMOrcLazyCompileCallbackFn Callback, + void *CallbackCtx) { + auto CCInfo = CCMgr->getCompileCallback(); + CCInfo.setCompileAction( + [=]() -> orc::TargetAddress { + return Callback(wrap(this), CallbackCtx); + }); + return CCInfo.getAddress(); + } + + void createIndirectStub(StringRef StubName, orc::TargetAddress Addr) { + IndirectStubsMgr->createStub(StubName, Addr, JITSymbolFlags::Exported); + } + + void setIndirectStubPointer(StringRef Name, orc::TargetAddress Addr) { + IndirectStubsMgr->updatePointer(Name, Addr); + } + + std::shared_ptr<RuntimeDyld::SymbolResolver> + createResolver(LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + auto Resolver = orc::createLambdaResolver( + [this, ExternalResolver, ExternalResolverCtx](const std::string &Name) { + // Search order: + // 1. JIT'd symbols. + // 2. Runtime overrides. + // 3. External resolver (if present). + + if (auto Sym = CODLayer.findSymbol(Name, true)) + return RuntimeDyld::SymbolInfo(Sym.getAddress(), + Sym.getFlags()); + if (auto Sym = CXXRuntimeOverrides.searchOverrides(Name)) + return Sym; + + if (ExternalResolver) + return RuntimeDyld::SymbolInfo(ExternalResolver(Name.c_str(), + ExternalResolverCtx), + llvm::JITSymbolFlags::Exported); + + return RuntimeDyld::SymbolInfo(nullptr); + }, + [](const std::string &Name) { + return RuntimeDyld::SymbolInfo(nullptr); + } + ); + + return std::shared_ptr<RuntimeDyld::SymbolResolver>(std::move(Resolver)); + } + + template <typename LayerT> + ModuleHandleT addIRModule(LayerT &Layer, + Module *M, + std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr, + LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + + // Attach a data-layout if one isn't already present. + if (M->getDataLayout().isDefault()) + M->setDataLayout(DL); + + // Record the static constructors and destructors. 
We have to do this before + // we hand over ownership of the module to the JIT. + std::vector<std::string> CtorNames, DtorNames; + for (auto Ctor : orc::getConstructors(*M)) + CtorNames.push_back(mangle(Ctor.Func->getName())); + for (auto Dtor : orc::getDestructors(*M)) + DtorNames.push_back(mangle(Dtor.Func->getName())); + + // Create the resolver. + auto Resolver = createResolver(ExternalResolver, ExternalResolverCtx); + + // Add the module to the JIT. + std::vector<Module*> S; + S.push_back(std::move(M)); + + auto LH = Layer.addModuleSet(std::move(S), std::move(MemMgr), + std::move(Resolver)); + ModuleHandleT H = createHandle(Layer, LH); + + // Run the static constructors, and save the static destructor runner for + // execution when the JIT is torn down. + orc::CtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), H); + CtorRunner.runViaLayer(*this); + + IRStaticDestructorRunners.emplace_back(std::move(DtorNames), H); + + return H; + } + + ModuleHandleT addIRModuleEager(Module* M, + LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + return addIRModule(CompileLayer, std::move(M), + llvm::make_unique<SectionMemoryManager>(), + std::move(ExternalResolver), ExternalResolverCtx); + } + + ModuleHandleT addIRModuleLazy(Module* M, + LLVMOrcSymbolResolverFn ExternalResolver, + void *ExternalResolverCtx) { + return addIRModule(CODLayer, std::move(M), + llvm::make_unique<SectionMemoryManager>(), + std::move(ExternalResolver), ExternalResolverCtx); + } + + void removeModule(ModuleHandleT H) { + GenericHandles[H]->removeModule(); + GenericHandles[H] = nullptr; + FreeHandleIndexes.push_back(H); + } + + orc::JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) { + if (auto Sym = IndirectStubsMgr->findStub(Name, ExportedSymbolsOnly)) + return Sym; + return CODLayer.findSymbol(mangle(Name), ExportedSymbolsOnly); + } + + orc::JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name, + bool ExportedSymbolsOnly) { + return GenericHandles[H]->findSymbolIn(Name, ExportedSymbolsOnly); + } + +private: + + template <typename LayerT> + unsigned createHandle(LayerT &Layer, + typename LayerT::ModuleSetHandleT Handle) { + unsigned NewHandle; + if (!FreeHandleIndexes.empty()) { + NewHandle = FreeHandleIndexes.back(); + FreeHandleIndexes.pop_back(); + GenericHandles[NewHandle] = createGenericHandle(Layer, std::move(Handle)); + return NewHandle; + } else { + NewHandle = GenericHandles.size(); + GenericHandles.push_back(createGenericHandle(Layer, std::move(Handle))); + } + return NewHandle; + } + + DataLayout DL; + SectionMemoryManager CCMgrMemMgr; + + std::unique_ptr<CompileCallbackMgr> CCMgr; + ObjLayerT ObjectLayer; + CompileLayerT CompileLayer; + CODLayerT CODLayer; + + std::unique_ptr<orc::IndirectStubsManager> IndirectStubsMgr; + + std::vector<std::unique_ptr<GenericHandle>> GenericHandles; + std::vector<unsigned> FreeHandleIndexes; + + orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides; + std::vector<orc::CtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp new file mode 100644 index 0000000..e95115e --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp @@ -0,0 +1,57 @@ +//===---------------- OrcError.cpp - Error codes for ORC ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is 
distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Error codes for ORC. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/OrcError.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ManagedStatic.h" + +using namespace llvm; +using namespace llvm::orc; + +namespace { + +class OrcErrorCategory : public std::error_category { +public: + const char *name() const LLVM_NOEXCEPT override { return "orc"; } + + std::string message(int condition) const override { + switch (static_cast<OrcErrorCode>(condition)) { + case OrcErrorCode::RemoteAllocatorDoesNotExist: + return "Remote allocator does not exist"; + case OrcErrorCode::RemoteAllocatorIdAlreadyInUse: + return "Remote allocator Id already in use"; + case OrcErrorCode::RemoteMProtectAddrUnrecognized: + return "Remote mprotect call references unallocated memory"; + case OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist: + return "Remote indirect stubs owner does not exist"; + case OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse: + return "Remote indirect stubs owner Id already in use"; + case OrcErrorCode::UnexpectedRPCCall: + return "Unexpected RPC call"; + } + llvm_unreachable("Unhandled error code"); + } +}; + +static ManagedStatic<OrcErrorCategory> OrcErrCat; +} + +namespace llvm { +namespace orc { + +std::error_code orcError(OrcErrorCode ErrCode) { + typedef std::underlying_type<OrcErrorCode>::type UT; + return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat); +} +} +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h index 7dc5164..2ab70a9 100644 --- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h @@ -54,10 +54,13 @@ class OrcMCJITReplacement : public ExecutionEngine { return Addr; } - void reserveAllocationSpace(uintptr_t CodeSize, uintptr_t DataSizeRO, - uintptr_t DataSizeRW) override { - return ClientMM->reserveAllocationSpace(CodeSize, DataSizeRO, - DataSizeRW); + void reserveAllocationSpace(uintptr_t CodeSize, uint32_t CodeAlign, + uintptr_t RODataSize, uint32_t RODataAlign, + uintptr_t RWDataSize, + uint32_t RWDataAlign) override { + return ClientMM->reserveAllocationSpace(CodeSize, CodeAlign, + RODataSize, RODataAlign, + RWDataSize, RWDataAlign); } bool needsToReserveAllocationSpace() override { @@ -74,6 +77,11 @@ class OrcMCJITReplacement : public ExecutionEngine { return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size); } + void notifyObjectLoaded(RuntimeDyld &RTDyld, + const object::ObjectFile &O) override { + return ClientMM->notifyObjectLoaded(RTDyld, O); + } + void notifyObjectLoaded(ExecutionEngine *EE, const object::ObjectFile &O) override { return ClientMM->notifyObjectLoaded(EE, O); @@ -137,25 +145,26 @@ public: } OrcMCJITReplacement( - std::shared_ptr<MCJITMemoryManager> MemMgr, - std::shared_ptr<RuntimeDyld::SymbolResolver> ClientResolver, - std::unique_ptr<TargetMachine> TM) - : TM(std::move(TM)), MemMgr(*this, std::move(MemMgr)), - Resolver(*this), ClientResolver(std::move(ClientResolver)), - NotifyObjectLoaded(*this), NotifyFinalized(*this), + std::shared_ptr<MCJITMemoryManager> MemMgr, + std::shared_ptr<RuntimeDyld::SymbolResolver> ClientResolver, + std::unique_ptr<TargetMachine> TM) + : 
ExecutionEngine(TM->createDataLayout()), TM(std::move(TM)), + MemMgr(*this, std::move(MemMgr)), Resolver(*this), + ClientResolver(std::move(ClientResolver)), NotifyObjectLoaded(*this), + NotifyFinalized(*this), ObjectLayer(NotifyObjectLoaded, NotifyFinalized), CompileLayer(ObjectLayer, SimpleCompiler(*this->TM)), - LazyEmitLayer(CompileLayer) { - setDataLayout(this->TM->getDataLayout()); - } + LazyEmitLayer(CompileLayer) {} void addModule(std::unique_ptr<Module> M) override { // If this module doesn't have a DataLayout attached then attach the // default. - if (M->getDataLayout().isDefault()) - M->setDataLayout(*getDataLayout()); - + if (M->getDataLayout().isDefault()) { + M->setDataLayout(getDataLayout()); + } else { + assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch"); + } Modules.push_back(std::move(M)); std::vector<Module *> Ms; Ms.push_back(&*Modules.back()); @@ -174,12 +183,7 @@ public: std::tie(Obj, Buf) = O.takeBinary(); std::vector<std::unique_ptr<object::ObjectFile>> Objs; Objs.push_back(std::move(Obj)); - auto H = - ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver); - - std::vector<std::unique_ptr<MemoryBuffer>> Bufs; - Bufs.push_back(std::move(Buf)); - ObjectLayer.takeOwnershipOfBuffers(H, std::move(Bufs)); + ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver); } void addArchive(object::OwningBinary<object::Archive> A) override { @@ -234,6 +238,10 @@ public: CompileLayer.setObjectCache(NewCache); } + void setProcessAllSections(bool ProcessAllSections) override { + ObjectLayer.setProcessAllSections(ProcessAllSections); + } + private: RuntimeDyld::SymbolInfo findMangledSymbol(StringRef Name) { @@ -252,10 +260,12 @@ private: object::Archive *A = OB.getBinary(); // Look for our symbols in each Archive object::Archive::child_iterator ChildIt = A->findSym(Name); + if (std::error_code EC = ChildIt->getError()) + report_fatal_error(EC.message()); if (ChildIt != A->child_end()) { // FIXME: Support nested archives? ErrorOr<std::unique_ptr<object::Binary>> ChildBinOrErr = - ChildIt->getAsBinary(); + (*ChildIt)->getAsBinary(); if (ChildBinOrErr.getError()) continue; std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get(); @@ -289,7 +299,7 @@ private: "Incorrect number of Infos for Objects."); for (unsigned I = 0; I < Objects.size(); ++I) M.MemMgr.notifyObjectLoaded(&M, *Objects[I]); - }; + } private: OrcMCJITReplacement &M; @@ -310,7 +320,7 @@ private: std::string MangledName; { raw_string_ostream MangledNameStream(MangledName); - Mang.getNameWithPrefix(MangledNameStream, Name, *TM->getDataLayout()); + Mang.getNameWithPrefix(MangledNameStream, Name, getDataLayout()); } return MangledName; } diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp new file mode 100644 index 0000000..064633b --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.cpp @@ -0,0 +1,83 @@ +//===------- OrcRemoteTargetRPCAPI.cpp - ORC Remote API utilities ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h" + +namespace llvm { +namespace orc { +namespace remote { + +const char *OrcRemoteTargetRPCAPI::getJITProcIdName(JITProcId Id) { + switch (Id) { + case InvalidId: + return "*** Invalid JITProcId ***"; + case CallIntVoidId: + return "CallIntVoid"; + case CallIntVoidResponseId: + return "CallIntVoidResponse"; + case CallMainId: + return "CallMain"; + case CallMainResponseId: + return "CallMainResponse"; + case CallVoidVoidId: + return "CallVoidVoid"; + case CallVoidVoidResponseId: + return "CallVoidVoidResponse"; + case CreateRemoteAllocatorId: + return "CreateRemoteAllocator"; + case CreateIndirectStubsOwnerId: + return "CreateIndirectStubsOwner"; + case DestroyRemoteAllocatorId: + return "DestroyRemoteAllocator"; + case DestroyIndirectStubsOwnerId: + return "DestroyIndirectStubsOwner"; + case EmitIndirectStubsId: + return "EmitIndirectStubs"; + case EmitIndirectStubsResponseId: + return "EmitIndirectStubsResponse"; + case EmitResolverBlockId: + return "EmitResolverBlock"; + case EmitTrampolineBlockId: + return "EmitTrampolineBlock"; + case EmitTrampolineBlockResponseId: + return "EmitTrampolineBlockResponse"; + case GetSymbolAddressId: + return "GetSymbolAddress"; + case GetSymbolAddressResponseId: + return "GetSymbolAddressResponse"; + case GetRemoteInfoId: + return "GetRemoteInfo"; + case GetRemoteInfoResponseId: + return "GetRemoteInfoResponse"; + case ReadMemId: + return "ReadMem"; + case ReadMemResponseId: + return "ReadMemResponse"; + case ReserveMemId: + return "ReserveMem"; + case ReserveMemResponseId: + return "ReserveMemResponse"; + case RequestCompileId: + return "RequestCompile"; + case RequestCompileResponseId: + return "RequestCompileResponse"; + case SetProtectionsId: + return "SetProtections"; + case TerminateSessionId: + return "TerminateSession"; + case WriteMemId: + return "WriteMem"; + case WritePtrId: + return "WritePtr"; + }; + return nullptr; +} +} +} +} diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp deleted file mode 100644 index 258868a..0000000 --- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcTargetSupport.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "llvm/ADT/Triple.h" -#include "llvm/ExecutionEngine/Orc/OrcTargetSupport.h" -#include <array> - -using namespace llvm::orc; - -namespace { - -uint64_t executeCompileCallback(JITCompileCallbackManagerBase *JCBM, - TargetAddress CallbackID) { - return JCBM->executeCompileCallback(CallbackID); -} - -} - -namespace llvm { -namespace orc { - -const char* OrcX86_64::ResolverBlockName = "orc_resolver_block"; - -void OrcX86_64::insertResolverBlock( - Module &M, JITCompileCallbackManagerBase &JCBM) { - - // Trampoline code-sequence length, used to get trampoline address from return - // address. - const unsigned X86_64_TrampolineLength = 6; - - // List of x86-64 GPRs to save. Note - RBP saved separately below. - std::array<const char *, 14> GPRs = {{ - "rax", "rbx", "rcx", "rdx", - "rsi", "rdi", "r8", "r9", - "r10", "r11", "r12", "r13", - "r14", "r15" - }}; - - // Address of the executeCompileCallback function. - uint64_t CallbackAddr = - static_cast<uint64_t>( - reinterpret_cast<uintptr_t>(executeCompileCallback)); - - std::ostringstream AsmStream; - Triple TT(M.getTargetTriple()); - - // Switch to text section. 
- if (TT.getOS() == Triple::Darwin) - AsmStream << ".section __TEXT,__text,regular,pure_instructions\n" - << ".align 4, 0x90\n"; - else - AsmStream << ".text\n" - << ".align 16, 0x90\n"; - - // Bake in a pointer to the callback manager immediately before the - // start of the resolver function. - AsmStream << "jit_callback_manager_addr:\n" - << " .quad " << &JCBM << "\n"; - - // Start the resolver function. - AsmStream << ResolverBlockName << ":\n" - << " pushq %rbp\n" - << " movq %rsp, %rbp\n"; - - // Store the GPRs. - for (const auto &GPR : GPRs) - AsmStream << " pushq %" << GPR << "\n"; - - // Store floating-point state with FXSAVE. - // Note: We need to keep the stack 16-byte aligned, so if we've emitted an odd - // number of 64-bit pushes so far (GPRs.size() plus 1 for RBP) then add - // an extra 64 bits of padding to the FXSave area. - unsigned Padding = (GPRs.size() + 1) % 2 ? 8 : 0; - unsigned FXSaveSize = 512 + Padding; - AsmStream << " subq $" << FXSaveSize << ", %rsp\n" - << " fxsave64 (%rsp)\n" - - // Load callback manager address, compute trampoline address, call JIT. - << " lea jit_callback_manager_addr(%rip), %rdi\n" - << " movq (%rdi), %rdi\n" - << " movq 0x8(%rbp), %rsi\n" - << " subq $" << X86_64_TrampolineLength << ", %rsi\n" - << " movabsq $" << CallbackAddr << ", %rax\n" - << " callq *%rax\n" - - // Replace the return to the trampoline with the return address of the - // compiled function body. - << " movq %rax, 0x8(%rbp)\n" - - // Restore the floating point state. - << " fxrstor64 (%rsp)\n" - << " addq $" << FXSaveSize << ", %rsp\n"; - - for (const auto &GPR : make_range(GPRs.rbegin(), GPRs.rend())) - AsmStream << " popq %" << GPR << "\n"; - - // Restore original RBP and return to compiled function body. - AsmStream << " popq %rbp\n" - << " retq\n"; - - M.appendModuleInlineAsm(AsmStream.str()); -} - -OrcX86_64::LabelNameFtor -OrcX86_64::insertCompileCallbackTrampolines(Module &M, - TargetAddress ResolverBlockAddr, - unsigned NumCalls, - unsigned StartIndex) { - const char *ResolverBlockPtrName = "Lorc_resolve_block_addr"; - - std::ostringstream AsmStream; - Triple TT(M.getTargetTriple()); - - if (TT.getOS() == Triple::Darwin) - AsmStream << ".section __TEXT,__text,regular,pure_instructions\n" - << ".align 4, 0x90\n"; - else - AsmStream << ".text\n" - << ".align 16, 0x90\n"; - - AsmStream << ResolverBlockPtrName << ":\n" - << " .quad " << ResolverBlockAddr << "\n"; - - auto GetLabelName = - [=](unsigned I) { - std::ostringstream LabelStream; - LabelStream << "orc_jcc_" << (StartIndex + I); - return LabelStream.str(); - }; - - for (unsigned I = 0; I < NumCalls; ++I) - AsmStream << GetLabelName(I) << ":\n" - << " callq *" << ResolverBlockPtrName << "(%rip)\n"; - - M.appendModuleInlineAsm(AsmStream.str()); - - return GetLabelName; -} - -} // End namespace orc. -} // End namespace llvm. 
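
The file deleted above was the old inline-asm implementation of Orc's lazy-compilation plumbing: each trampoline is a call through a PC-relative slot into a shared resolver block, which spills register state, recovers the trampoline address from the return address, and re-enters the JIT through executeCompileCallback so the real target can be compiled on first use. A minimal C++ sketch of that indirection (CallbackManager and its map are illustrative stand-ins, not the LLVM API; the deleted assembly does the same job at the machine level):

#include <cstdint>
#include <functional>
#include <map>

using TargetAddress = uint64_t;

// Illustrative stand-in for JITCompileCallbackManagerBase: maps a
// trampoline's address to the action that compiles the real target
// and returns the compiled body's address.
class CallbackManager {
  std::map<TargetAddress, std::function<TargetAddress()>> Callbacks;

public:
  void setCallback(TargetAddress TrampolineAddr,
                   std::function<TargetAddress()> Compile) {
    Callbacks[TrampolineAddr] = std::move(Compile);
  }

  // Analogue of executeCompileCallback: the resolver block calls this
  // with the trampoline address recovered from the return address, then
  // jumps to the freshly compiled code instead of the trampoline.
  TargetAddress execute(TargetAddress TrampolineAddr) {
    auto I = Callbacks.find(TrampolineAddr);
    return I == Callbacks.end() ? 0 : I->second();
  }
};

The sketch only shows the control flow the assembly implements; its replacement lives elsewhere in this import.
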
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 93287a3..d16b2db 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -41,20 +41,21 @@ void RuntimeDyldImpl::deregisterEHFrames() {} #ifndef NDEBUG static void dumpSectionMemory(const SectionEntry &S, StringRef State) { - dbgs() << "----- Contents of section " << S.Name << " " << State << " -----"; + dbgs() << "----- Contents of section " << S.getName() << " " << State + << " -----"; - if (S.Address == nullptr) { + if (S.getAddress() == nullptr) { dbgs() << "\n <section not emitted>\n"; return; } const unsigned ColsPerRow = 16; - uint8_t *DataAddr = S.Address; - uint64_t LoadAddr = S.LoadAddress; + uint8_t *DataAddr = S.getAddress(); + uint64_t LoadAddr = S.getLoadAddress(); unsigned StartPadding = LoadAddr & (ColsPerRow - 1); - unsigned BytesRemaining = S.Size; + unsigned BytesRemaining = S.getSize(); if (StartPadding) { dbgs() << "\n" << format("0x%016" PRIx64, @@ -82,30 +83,41 @@ static void dumpSectionMemory(const SectionEntry &S, StringRef State) { void RuntimeDyldImpl::resolveRelocations() { MutexGuard locked(lock); + // Print out the sections prior to relocation. + DEBUG( + for (int i = 0, e = Sections.size(); i != e; ++i) + dumpSectionMemory(Sections[i], "before relocations"); + ); + // First, resolve relocations associated with external symbols. resolveExternalSymbols(); - // Just iterate over the sections we have and resolve all the relocations - // in them. Gross overkill, but it gets the job done. - for (int i = 0, e = Sections.size(); i != e; ++i) { + // Iterate over all outstanding relocations + for (auto it = Relocations.begin(), e = Relocations.end(); it != e; ++it) { // The Section here (Sections[i]) refers to the section in which the // symbol for the relocation is located. The SectionID in the relocation // entry provides the section to which the relocation will be applied. - uint64_t Addr = Sections[i].LoadAddress; - DEBUG(dbgs() << "Resolving relocations Section #" << i << "\t" + int Idx = it->first; + uint64_t Addr = Sections[Idx].getLoadAddress(); + DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t" << format("%p", (uintptr_t)Addr) << "\n"); - DEBUG(dumpSectionMemory(Sections[i], "before relocations")); - resolveRelocationList(Relocations[i], Addr); - DEBUG(dumpSectionMemory(Sections[i], "after relocations")); - Relocations.erase(i); + resolveRelocationList(it->second, Addr); } + Relocations.clear(); + + // Print out sections after relocation. + DEBUG( + for (int i = 0, e = Sections.size(); i != e; ++i) + dumpSectionMemory(Sections[i], "after relocations"); + ); + } void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress) { MutexGuard locked(lock); for (unsigned i = 0, e = Sections.size(); i != e; ++i) { - if (Sections[i].Address == LocalAddress) { + if (Sections[i].getAddress() == LocalAddress) { reassignSectionAddress(i, TargetAddress); return; } @@ -122,14 +134,10 @@ static std::error_code getOffset(const SymbolRef &Sym, SectionRef Sec, return std::error_code(); } -std::pair<unsigned, unsigned> +RuntimeDyldImpl::ObjSectionToIDMap RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { MutexGuard locked(lock); - // Grab the first Section ID. We'll use this later to construct the underlying - // range for the returned LoadedObjectInfo. 
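
The hunk that begins here swaps loadObjectImpl's old return value, a (begin, end) pair of section indices, for the ObjSectionToIDMap it already builds, so a LoadedObjectInfo can resolve an object::SectionRef directly to its SectionEntry instead of scanning an index range by name. A rough sketch of the resulting lookup shape (the key type is simplified; only the map-then-index pattern matches the code below):

#include <cstdint>
#include <map>
#include <vector>

struct SectionEntry { uint64_t LoadAddress = 0; };

// Simplified stand-ins: RuntimeDyld keys the real map on object::SectionRef.
using SectionKey = uintptr_t;
using ObjSectionToIDMap = std::map<SectionKey, unsigned>;

uint64_t getSectionLoadAddress(const ObjSectionToIDMap &SecToID,
                               const std::vector<SectionEntry> &Sections,
                               SectionKey Sec) {
  auto I = SecToID.find(Sec);
  return I == SecToID.end() ? 0 : Sections[I->second].LoadAddress;
}

This mirrors the new LoadedObjectInfo::getSectionLoadAddress further down, which likewise returns 0 for sections that were never loaded.
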
- unsigned SectionsAddedBeginIdx = Sections.size(); - // Save information about our target Arch = (Triple::ArchType)Obj.getArch(); IsTargetLittleEndian = Obj.isLittleEndian(); @@ -138,9 +146,12 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { // Compute the memory size required to load all sections to be loaded // and pass this information to the memory manager if (MemMgr.needsToReserveAllocationSpace()) { - uint64_t CodeSize = 0, DataSizeRO = 0, DataSizeRW = 0; - computeTotalAllocSize(Obj, CodeSize, DataSizeRO, DataSizeRW); - MemMgr.reserveAllocationSpace(CodeSize, DataSizeRO, DataSizeRW); + uint64_t CodeSize = 0, RODataSize = 0, RWDataSize = 0; + uint32_t CodeAlign = 1, RODataAlign = 1, RWDataAlign = 1; + computeTotalAllocSize(Obj, CodeSize, CodeAlign, RODataSize, RODataAlign, + RWDataSize, RWDataAlign); + MemMgr.reserveAllocationSpace(CodeSize, CodeAlign, RODataSize, RODataAlign, + RWDataSize, RWDataAlign); } // Used sections from the object file @@ -155,39 +166,56 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { ++I) { uint32_t Flags = I->getFlags(); - bool IsCommon = Flags & SymbolRef::SF_Common; - if (IsCommon) + if (Flags & SymbolRef::SF_Common) CommonSymbols.push_back(*I); else { object::SymbolRef::Type SymType = I->getType(); - if (SymType == object::SymbolRef::ST_Function || - SymType == object::SymbolRef::ST_Data || - SymType == object::SymbolRef::ST_Unknown) { - - ErrorOr<StringRef> NameOrErr = I->getName(); - Check(NameOrErr.getError()); - StringRef Name = *NameOrErr; - section_iterator SI = Obj.section_end(); - Check(I->getSection(SI)); + // Get symbol name. + ErrorOr<StringRef> NameOrErr = I->getName(); + Check(NameOrErr.getError()); + StringRef Name = *NameOrErr; + + // Compute JIT symbol flags. + JITSymbolFlags RTDyldSymFlags = JITSymbolFlags::None; + if (Flags & SymbolRef::SF_Weak) + RTDyldSymFlags |= JITSymbolFlags::Weak; + if (Flags & SymbolRef::SF_Exported) + RTDyldSymFlags |= JITSymbolFlags::Exported; + + if (Flags & SymbolRef::SF_Absolute && + SymType != object::SymbolRef::ST_File) { + auto Addr = I->getAddress(); + Check(Addr.getError()); + uint64_t SectOffset = *Addr; + unsigned SectionID = AbsoluteSymbolSection; + + DEBUG(dbgs() << "\tType: " << SymType << " (absolute) Name: " << Name + << " SID: " << SectionID << " Offset: " + << format("%p", (uintptr_t)SectOffset) + << " flags: " << Flags << "\n"); + GlobalSymbolTable[Name] = + SymbolTableEntry(SectionID, SectOffset, RTDyldSymFlags); + } else if (SymType == object::SymbolRef::ST_Function || + SymType == object::SymbolRef::ST_Data || + SymType == object::SymbolRef::ST_Unknown || + SymType == object::SymbolRef::ST_Other) { + + ErrorOr<section_iterator> SIOrErr = I->getSection(); + Check(SIOrErr.getError()); + section_iterator SI = *SIOrErr; if (SI == Obj.section_end()) continue; + // Get symbol offset. 
uint64_t SectOffset; Check(getOffset(*I, *SI, SectOffset)); - StringRef SectionData; - Check(SI->getContents(SectionData)); bool IsCode = SI->isText(); - unsigned SectionID = - findOrEmitSection(Obj, *SI, IsCode, LocalSections); + unsigned SectionID = findOrEmitSection(Obj, *SI, IsCode, LocalSections); + DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << " SID: " << SectionID << " Offset: " << format("%p", (uintptr_t)SectOffset) << " flags: " << Flags << "\n"); - JITSymbolFlags RTDyldSymFlags = JITSymbolFlags::None; - if (Flags & SymbolRef::SF_Weak) - RTDyldSymFlags |= JITSymbolFlags::Weak; - if (Flags & SymbolRef::SF_Exported) - RTDyldSymFlags |= JITSymbolFlags::Exported; GlobalSymbolTable[Name] = SymbolTableEntry(SectionID, SectOffset, RTDyldSymFlags); } @@ -231,9 +259,10 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { // Give the subclasses a chance to tie-up any loose ends. finalizeLoad(Obj, LocalSections); - unsigned SectionsAddedEndIdx = Sections.size(); +// for (auto E : LocalSections) +// llvm::dbgs() << "Added: " << E.first.getRawDataRefImpl() << " -> " << E.second << "\n"; - return std::make_pair(SectionsAddedBeginIdx, SectionsAddedEndIdx); + return LocalSections; } // A helper method for computeTotalAllocSize. @@ -309,13 +338,15 @@ static bool isZeroInit(const SectionRef Section) { // sections void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj, uint64_t &CodeSize, - uint64_t &DataSizeRO, - uint64_t &DataSizeRW) { + uint32_t &CodeAlign, + uint64_t &RODataSize, + uint32_t &RODataAlign, + uint64_t &RWDataSize, + uint32_t &RWDataAlign) { // Compute the size of all sections required for execution std::vector<uint64_t> CodeSectionSizes; std::vector<uint64_t> ROSectionSizes; std::vector<uint64_t> RWSectionSizes; - uint64_t MaxAlignment = sizeof(void *); // Collect sizes of all sections to be loaded; // also determine the max alignment of all sections @@ -350,17 +381,15 @@ void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj, SectionSize = 1; if (IsCode) { + CodeAlign = std::max(CodeAlign, Alignment); CodeSectionSizes.push_back(SectionSize); } else if (IsReadOnly) { + RODataAlign = std::max(RODataAlign, Alignment); ROSectionSizes.push_back(SectionSize); } else { + RWDataAlign = std::max(RWDataAlign, Alignment); RWSectionSizes.push_back(SectionSize); } - - // update the max alignment - if (Alignment > MaxAlignment) { - MaxAlignment = Alignment; - } } } @@ -384,9 +413,9 @@ void RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj, // allocated with the max alignment. Note that we cannot compute with the // individual alignments of the sections, because then the required size // depends on the order, in which the sections are allocated. 
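
The replacement lines below drop the shared MaxAlignment in favour of per-kind alignments (CodeAlign, RODataAlign, RWDataAlign), so each of the three reservations is padded only to the strictest alignment that actually occurs in that kind of section. Padding every section to the kind's alignment yields a bound that holds regardless of allocation order, which is the property the comment above asks for. A small sketch of that arithmetic (the helper name is illustrative, not the LLVM function):

#include <cstdint>
#include <vector>

// Worst-case reservation: pad every section to Alignment (assumed a
// power of two), so the total suffices no matter how the memory
// manager orders the sections.
static uint64_t allocSizeForSections(const std::vector<uint64_t> &Sizes,
                                     uint64_t Alignment) {
  uint64_t Total = 0;
  for (uint64_t Size : Sizes)
    Total += (Size + Alignment - 1) & ~(Alignment - 1); // align up
  return Total;
}

// Example: sizes {10, 100} at alignment 16 reserve 16 + 112 = 128 bytes;
// at the old global MaxAlignment of, say, 64 they would reserve 64 + 128.
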
- CodeSize = computeAllocationSizeForSections(CodeSectionSizes, MaxAlignment); - DataSizeRO = computeAllocationSizeForSections(ROSectionSizes, MaxAlignment); - DataSizeRW = computeAllocationSizeForSections(RWSectionSizes, MaxAlignment); + CodeSize = computeAllocationSizeForSections(CodeSectionSizes, CodeAlign); + RODataSize = computeAllocationSizeForSections(ROSectionSizes, RODataAlign); + RWDataSize = computeAllocationSizeForSections(RWSectionSizes, RWDataAlign); } // compute stub buffer size for the given section @@ -406,10 +435,9 @@ unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj, if (!(RelSecI == Section)) continue; - for (const RelocationRef &Reloc : SI->relocations()) { - (void)Reloc; - StubBufSize += StubSize; - } + for (const RelocationRef &Reloc : SI->relocations()) + if (relocationNeedsStub(Reloc)) + StubBufSize += StubSize; } // Get section data size and alignment @@ -492,7 +520,8 @@ void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj, if (!Addr) report_fatal_error("Unable to allocate memory for common symbols!"); uint64_t Offset = 0; - Sections.push_back(SectionEntry("<common symbols>", Addr, CommonSize, 0)); + Sections.push_back( + SectionEntry("<common symbols>", Addr, CommonSize, CommonSize, 0)); memset(Addr, 0, CommonSize); DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID << " new addr: " @@ -524,6 +553,9 @@ void RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj, Offset += Size; Addr += Size; } + + if (Checker) + Checker->registerSection(Obj.getFileName(), SectionID); } unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj, @@ -556,12 +588,20 @@ unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj, uint8_t *Addr; const char *pData = nullptr; - // In either case, set the location of the unrelocated section in memory, - // since we still process relocations for it even if we're not applying them. - Check(Section.getContents(data)); - // Virtual sections have no data in the object image, so leave pData = 0 - if (!IsVirtual) + // If this section contains any bits (i.e. isn't a virtual or bss section), + // grab a reference to them. + if (!IsVirtual && !IsZeroInit) { + // In either case, set the location of the unrelocated section in memory, + // since we still process relocations for it even if we're not applying them. + Check(Section.getContents(data)); pData = data.data(); + } + + // Code section alignment needs to be at least as high as stub alignment or + // padding calculations may be incorrect when the section is remapped to a + // higher alignment. + if (IsCode) + Alignment = std::max(Alignment, getStubAlignment()); // Some sections, such as debug info, don't need to be loaded for execution. // Leave those where they are. @@ -606,7 +646,8 @@ unsigned RuntimeDyldImpl::emitSection(const ObjectFile &Obj, << " Allocate: " << Allocate << "\n"); } - Sections.push_back(SectionEntry(Name, Addr, DataSize, (uintptr_t)pData)); + Sections.push_back( + SectionEntry(Name, Addr, DataSize, Allocate, (uintptr_t)pData)); if (Checker) Checker->registerSection(Obj.getFileName(), SectionID); @@ -742,11 +783,11 @@ void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID, // Addr is a uint64_t because we can't assume the pointer width // of the target is the same as that of the host. Just use a generic // "big enough" type. 
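
Throughout the hunks that follow, the raw SectionEntry fields (Address, LoadAddress, Size) give way to accessors, and the pair getAddressWithOffset / getLoadAddressWithOffset makes the two address spaces explicit: the host buffer the JIT writes into versus the target address the code will run at, which reassignSectionAddress may move before relocations are resolved. An illustrative reduction of that pairing (this struct is a sketch, not the real SectionEntry):

#include <cstdint>

struct Section {
  uint8_t *Address;     // host-local buffer holding the section contents
  uint64_t LoadAddress; // address the target will execute them at

  uint8_t *getAddressWithOffset(uint64_t Offset) const {
    return Address + Offset;
  }
  uint64_t getLoadAddressWithOffset(uint64_t Offset) const {
    return LoadAddress + Offset;
  }
};

// A PC-relative fixup computes its delta in the target's address space,
//   Delta = Value + Addend - S.getLoadAddressWithOffset(Offset),
// but performs the actual write through S.getAddressWithOffset(Offset).
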
- DEBUG(dbgs() << "Reassigning address for section " - << SectionID << " (" << Sections[SectionID].Name << "): " - << format("0x%016" PRIx64, Sections[SectionID].LoadAddress) << " -> " - << format("0x%016" PRIx64, Addr) << "\n"); - Sections[SectionID].LoadAddress = Addr; + DEBUG(dbgs() << "Reassigning address for section " << SectionID << " (" + << Sections[SectionID].getName() << "): " + << format("0x%016" PRIx64, Sections[SectionID].getLoadAddress()) + << " -> " << format("0x%016" PRIx64, Addr) << "\n"); + Sections[SectionID].setLoadAddress(Addr); } void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, @@ -754,7 +795,7 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { const RelocationEntry &RE = Relocs[i]; // Ignore relocations for sections that were not loaded - if (Sections[RE.SectionID].Address == nullptr) + if (Sections[RE.SectionID].getAddress() == nullptr) continue; resolveRelocation(RE, Value); } @@ -818,10 +859,11 @@ void RuntimeDyldImpl::resolveExternalSymbols() { // RuntimeDyld class implementation uint64_t RuntimeDyld::LoadedObjectInfo::getSectionLoadAddress( - StringRef SectionName) const { - for (unsigned I = BeginIdx; I != EndIdx; ++I) - if (RTDyld.Sections[I].Name == SectionName) - return RTDyld.Sections[I].LoadAddress; + const object::SectionRef &Sec) const { + + auto I = ObjSecToIDMap.find(Sec); + if (I != ObjSecToIDMap.end()) + return RTDyld.Sections[I->second].getLoadAddress(); return 0; } @@ -898,7 +940,9 @@ RuntimeDyld::loadObject(const ObjectFile &Obj) { if (!Dyld->isCompatibleFile(Obj)) report_fatal_error("Incompatible object format!"); - return Dyld->loadObject(Obj); + auto LoadedObjInfo = Dyld->loadObject(Obj); + MemMgr.notifyObjectLoaded(*this, Obj); + return LoadedObjInfo; } void *RuntimeDyld::getSymbolLocalAddress(StringRef Name) const { @@ -928,6 +972,17 @@ bool RuntimeDyld::hasError() { return Dyld->hasError(); } StringRef RuntimeDyld::getErrorString() { return Dyld->getErrorString(); } +void RuntimeDyld::finalizeWithMemoryManagerLocking() { + bool MemoryFinalizationLocked = MemMgr.FinalizationLocked; + MemMgr.FinalizationLocked = true; + resolveRelocations(); + registerEHFrames(); + if (!MemoryFinalizationLocked) { + MemMgr.finalizeMemory(); + MemMgr.FinalizationLocked = false; + } +} + void RuntimeDyld::registerEHFrames() { if (Dyld) Dyld->registerEHFrames(); diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp index 1dacc13..e5fab92 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "RuntimeDyldCOFF.h" +#include "Targets/RuntimeDyldCOFFI386.h" #include "Targets/RuntimeDyldCOFFX86_64.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" @@ -24,12 +25,11 @@ using namespace llvm::object; namespace { -class LoadedCOFFObjectInfo +class LoadedCOFFObjectInfo final : public RuntimeDyld::LoadedObjectInfoHelper<LoadedCOFFObjectInfo> { public: - LoadedCOFFObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx, - unsigned EndIdx) - : LoadedObjectInfoHelper(RTDyld, BeginIdx, EndIdx) {} + LoadedCOFFObjectInfo(RuntimeDyldImpl &RTDyld, ObjSectionToIDMap ObjSecToIDMap) + : LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {} OwningBinary<ObjectFile> 
getObjectForDebug(const ObjectFile &Obj) const override { @@ -48,6 +48,8 @@ llvm::RuntimeDyldCOFF::create(Triple::ArchType Arch, default: llvm_unreachable("Unsupported target for RuntimeDyldCOFF."); break; + case Triple::x86: + return make_unique<RuntimeDyldCOFFI386>(MemMgr, Resolver); case Triple::x86_64: return make_unique<RuntimeDyldCOFFX86_64>(MemMgr, Resolver); } @@ -55,10 +57,7 @@ llvm::RuntimeDyldCOFF::create(Triple::ArchType Arch, std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyldCOFF::loadObject(const object::ObjectFile &O) { - unsigned SectionStartIdx, SectionEndIdx; - std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O); - return llvm::make_unique<LoadedCOFFObjectInfo>(*this, SectionStartIdx, - SectionEndIdx); + return llvm::make_unique<LoadedCOFFObjectInfo>(*this, loadObjectImpl(O)); } uint64_t RuntimeDyldCOFF::getSymbolOffset(const SymbolRef &Sym) { diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp index ae199b7..58ce88a 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -727,7 +727,7 @@ bool RuntimeDyldCheckerImpl::checkAllRulesInBuffer(StringRef RulePrefix, } bool RuntimeDyldCheckerImpl::isSymbolValid(StringRef Symbol) const { - if (getRTDyld().getSymbolLocalAddress(Symbol)) + if (getRTDyld().getSymbol(Symbol)) return true; return !!getRTDyld().Resolver.findSymbol(Symbol); } @@ -799,11 +799,10 @@ std::pair<uint64_t, std::string> RuntimeDyldCheckerImpl::getSectionAddr( unsigned SectionID = SectionInfo->SectionID; uint64_t Addr; if (IsInsideLoad) - Addr = - static_cast<uint64_t>( - reinterpret_cast<uintptr_t>(getRTDyld().Sections[SectionID].Address)); + Addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>( + getRTDyld().Sections[SectionID].getAddress())); else - Addr = getRTDyld().Sections[SectionID].LoadAddress; + Addr = getRTDyld().Sections[SectionID].getLoadAddress(); return std::make_pair(Addr, std::string("")); } @@ -835,11 +834,11 @@ std::pair<uint64_t, std::string> RuntimeDyldCheckerImpl::getStubAddrFor( uint64_t Addr; if (IsInsideLoad) { - uintptr_t SectionBase = - reinterpret_cast<uintptr_t>(getRTDyld().Sections[SectionID].Address); + uintptr_t SectionBase = reinterpret_cast<uintptr_t>( + getRTDyld().Sections[SectionID].getAddress()); Addr = static_cast<uint64_t>(SectionBase) + StubOffset; } else { - uint64_t SectionBase = getRTDyld().Sections[SectionID].LoadAddress; + uint64_t SectionBase = getRTDyld().Sections[SectionID].getLoadAddress(); Addr = SectionBase + StubOffset; } @@ -855,16 +854,16 @@ RuntimeDyldCheckerImpl::getSubsectionStartingAt(StringRef Name) const { const auto &SymInfo = pos->second; uint8_t *SectionAddr = getRTDyld().getSectionAddress(SymInfo.getSectionID()); return StringRef(reinterpret_cast<const char *>(SectionAddr) + - SymInfo.getOffset(), - getRTDyld().Sections[SymInfo.getSectionID()].Size - - SymInfo.getOffset()); + SymInfo.getOffset(), + getRTDyld().Sections[SymInfo.getSectionID()].getSize() - + SymInfo.getOffset()); } void RuntimeDyldCheckerImpl::registerSection( StringRef FilePath, unsigned SectionID) { StringRef FileName = sys::path::filename(FilePath); const SectionEntry &Section = getRTDyld().Sections[SectionID]; - StringRef SectionName = Section.Name; + StringRef SectionName = Section.getName(); Stubs[FileName][SectionName].SectionID = SectionID; } @@ -874,7 +873,7 @@ void RuntimeDyldCheckerImpl::registerStubMap( 
const RuntimeDyldImpl::StubMap &RTDyldStubs) { StringRef FileName = sys::path::filename(FilePath); const SectionEntry &Section = getRTDyld().Sections[SectionID]; - StringRef SectionName = Section.Name; + StringRef SectionName = Section.getName(); Stubs[FileName][SectionName].SectionID = SectionID; diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 3787950..e09b71a 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -66,7 +66,6 @@ public: static inline bool classof(const ELFObjectFile<ELFT> *v) { return v->isDyldType(); } - }; @@ -104,12 +103,11 @@ void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef, sym->st_value = static_cast<addr_type>(Addr); } -class LoadedELFObjectInfo +class LoadedELFObjectInfo final : public RuntimeDyld::LoadedObjectInfoHelper<LoadedELFObjectInfo> { public: - LoadedELFObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx, - unsigned EndIdx) - : LoadedObjectInfoHelper(RTDyld, BeginIdx, EndIdx) {} + LoadedELFObjectInfo(RuntimeDyldImpl &RTDyld, ObjSectionToIDMap ObjSecToIDMap) + : LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {} OwningBinary<ObjectFile> getObjectForDebug(const ObjectFile &Obj) const override; @@ -118,6 +116,7 @@ public: template <typename ELFT> std::unique_ptr<DyldELFObject<ELFT>> createRTDyldELFObject(MemoryBufferRef Buffer, + const ObjectFile &SourceObject, const LoadedELFObjectInfo &L, std::error_code &ec) { typedef typename ELFFile<ELFT>::Elf_Shdr Elf_Shdr; @@ -127,6 +126,7 @@ createRTDyldELFObject(MemoryBufferRef Buffer, llvm::make_unique<DyldELFObject<ELFT>>(Buffer, ec); // Iterate over all sections in the object. + auto SI = SourceObject.section_begin(); for (const auto &Sec : Obj->sections()) { StringRef SectionName; Sec.getName(SectionName); @@ -135,12 +135,13 @@ createRTDyldELFObject(MemoryBufferRef Buffer, Elf_Shdr *shdr = const_cast<Elf_Shdr *>( reinterpret_cast<const Elf_Shdr *>(ShdrRef.p)); - if (uint64_t SecLoadAddr = L.getSectionLoadAddress(SectionName)) { + if (uint64_t SecLoadAddr = L.getSectionLoadAddress(*SI)) { // This assumes that the address passed in matches the target address // bitness. The template-based type cast handles everything else. 
shdr->sh_addr = static_cast<addr_type>(SecLoadAddr); } } + ++SI; } return Obj; @@ -158,16 +159,20 @@ OwningBinary<ObjectFile> createELFDebugObject(const ObjectFile &Obj, std::unique_ptr<ObjectFile> DebugObj; if (Obj.getBytesInAddress() == 4 && Obj.isLittleEndian()) { typedef ELFType<support::little, false> ELF32LE; - DebugObj = createRTDyldELFObject<ELF32LE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF32LE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else if (Obj.getBytesInAddress() == 4 && !Obj.isLittleEndian()) { typedef ELFType<support::big, false> ELF32BE; - DebugObj = createRTDyldELFObject<ELF32BE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF32BE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else if (Obj.getBytesInAddress() == 8 && !Obj.isLittleEndian()) { typedef ELFType<support::big, true> ELF64BE; - DebugObj = createRTDyldELFObject<ELF64BE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF64BE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else if (Obj.getBytesInAddress() == 8 && Obj.isLittleEndian()) { typedef ELFType<support::little, true> ELF64LE; - DebugObj = createRTDyldELFObject<ELF64LE>(Buffer->getMemBufferRef(), L, ec); + DebugObj = createRTDyldELFObject<ELF64LE>(Buffer->getMemBufferRef(), Obj, L, + ec); } else llvm_unreachable("Unexpected ELF format"); @@ -181,7 +186,7 @@ LoadedELFObjectInfo::getObjectForDebug(const ObjectFile &Obj) const { return createELFDebugObject(Obj, *this); } -} // namespace +} // anonymous namespace namespace llvm { @@ -193,9 +198,9 @@ RuntimeDyldELF::~RuntimeDyldELF() {} void RuntimeDyldELF::registerEHFrames() { for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) { SID EHFrameSID = UnregisteredEHFrameSections[i]; - uint8_t *EHFrameAddr = Sections[EHFrameSID].Address; - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress; - size_t EHFrameSize = Sections[EHFrameSID].Size; + uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); + uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); + size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); RegisteredEHFrameSections.push_back(EHFrameSID); } @@ -205,9 +210,9 @@ void RuntimeDyldELF::registerEHFrames() { void RuntimeDyldELF::deregisterEHFrames() { for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) { SID EHFrameSID = RegisteredEHFrameSections[i]; - uint8_t *EHFrameAddr = Sections[EHFrameSID].Address; - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress; - size_t EHFrameSize = Sections[EHFrameSID].Size; + uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); + uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); + size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); } RegisteredEHFrameSections.clear(); @@ -215,10 +220,7 @@ void RuntimeDyldELF::deregisterEHFrames() { std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyldELF::loadObject(const object::ObjectFile &O) { - unsigned SectionStartIdx, SectionEndIdx; - std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O); - return llvm::make_unique<LoadedELFObjectInfo>(*this, SectionStartIdx, - SectionEndIdx); + return llvm::make_unique<LoadedELFObjectInfo>(*this, loadObjectImpl(O)); } void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, @@ -230,9 +232,10 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, 
llvm_unreachable("Relocation type not implemented yet!"); break; case ELF::R_X86_64_64: { - support::ulittle64_t::ref(Section.Address + Offset) = Value + Addend; + support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = + Value + Addend; DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at " - << format("%p\n", Section.Address + Offset)); + << format("%p\n", Section.getAddressWithOffset(Offset))); break; } case ELF::R_X86_64_32: @@ -242,23 +245,34 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, (Type == ELF::R_X86_64_32S && ((int64_t)Value <= INT32_MAX && (int64_t)Value >= INT32_MIN))); uint32_t TruncatedAddr = (Value & 0xFFFFFFFF); - support::ulittle32_t::ref(Section.Address + Offset) = TruncatedAddr; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + TruncatedAddr; DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr) << " at " - << format("%p\n", Section.Address + Offset)); + << format("%p\n", Section.getAddressWithOffset(Offset))); + break; + } + case ELF::R_X86_64_PC8: { + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); + int64_t RealOffset = Value + Addend - FinalAddress; + assert(isInt<8>(RealOffset)); + int8_t TruncOffset = (RealOffset & 0xFF); + Section.getAddress()[Offset] = TruncOffset; break; } case ELF::R_X86_64_PC32: { - uint64_t FinalAddress = Section.LoadAddress + Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int64_t RealOffset = Value + Addend - FinalAddress; assert(isInt<32>(RealOffset)); int32_t TruncOffset = (RealOffset & 0xFFFFFFFF); - support::ulittle32_t::ref(Section.Address + Offset) = TruncOffset; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + TruncOffset; break; } case ELF::R_X86_64_PC64: { - uint64_t FinalAddress = Section.LoadAddress + Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int64_t RealOffset = Value + Addend - FinalAddress; - support::ulittle64_t::ref(Section.Address + Offset) = RealOffset; + support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) = + RealOffset; break; } } @@ -269,13 +283,16 @@ void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section, uint32_t Type, int32_t Addend) { switch (Type) { case ELF::R_386_32: { - support::ulittle32_t::ref(Section.Address + Offset) = Value + Addend; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + Value + Addend; break; } case ELF::R_386_PC32: { - uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF); + uint32_t FinalAddress = + Section.getLoadAddressWithOffset(Offset) & 0xFFFFFFFF; uint32_t RealOffset = Value + Addend - FinalAddress; - support::ulittle32_t::ref(Section.Address + Offset) = RealOffset; + support::ulittle32_t::ref(Section.getAddressWithOffset(Offset)) = + RealOffset; break; } default: @@ -289,11 +306,12 @@ void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section, void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend) { - uint32_t *TargetPtr = reinterpret_cast<uint32_t *>(Section.Address + Offset); - uint64_t FinalAddress = Section.LoadAddress + Offset; + uint32_t *TargetPtr = + reinterpret_cast<uint32_t *>(Section.getAddressWithOffset(Offset)); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); DEBUG(dbgs() << "resolveAArch64Relocation, LocalAddress: 0x" - << format("%llx", Section.Address + Offset) + << format("%llx", 
Section.getAddressWithOffset(Offset)) << " FinalAddress: 0x" << format("%llx", FinalAddress) << " Value: 0x" << format("%llx", Value) << " Type: 0x" << format("%x", Type) << " Addend: 0x" << format("%llx", Addend) @@ -305,7 +323,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, break; case ELF::R_AARCH64_ABS64: { uint64_t *TargetPtr = - reinterpret_cast<uint64_t *>(Section.Address + Offset); + reinterpret_cast<uint64_t *>(Section.getAddressWithOffset(Offset)); *TargetPtr = Value + Addend; break; } @@ -428,12 +446,13 @@ void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend) { // TODO: Add Thumb relocations. - uint32_t *TargetPtr = (uint32_t *)(Section.Address + Offset); - uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF); + uint32_t *TargetPtr = + reinterpret_cast<uint32_t *>(Section.getAddressWithOffset(Offset)); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset) & 0xFFFFFFFF; Value += Addend; DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: " - << Section.Address + Offset + << Section.getAddressWithOffset(Offset) << " FinalAddress: " << format("%p", FinalAddress) << " Value: " << format("%x", Value) << " Type: " << format("%x", Type) << " Addend: " << format("%x", Addend) << "\n"); @@ -477,13 +496,14 @@ void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section, void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend) { - uint8_t *TargetPtr = Section.Address + Offset; + uint8_t *TargetPtr = Section.getAddressWithOffset(Offset); Value += Addend; DEBUG(dbgs() << "resolveMIPSRelocation, LocalAddress: " - << Section.Address + Offset << " FinalAddress: " - << format("%p", Section.LoadAddress + Offset) << " Value: " - << format("%x", Value) << " Type: " << format("%x", Type) + << Section.getAddressWithOffset(Offset) << " FinalAddress: " + << format("%p", Section.getLoadAddressWithOffset(Offset)) + << " Value: " << format("%x", Value) + << " Type: " << format("%x", Type) << " Addend: " << format("%x", Addend) << "\n"); uint32_t Insn = readBytesUnaligned(TargetPtr, 4); @@ -512,47 +532,47 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section, writeBytesUnaligned(Insn, TargetPtr, 4); break; case ELF::R_MIPS_PC32: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); writeBytesUnaligned(Value - FinalAddress, (uint8_t *)TargetPtr, 4); break; } case ELF::R_MIPS_PC16: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffff0000; Insn |= ((Value - FinalAddress) >> 2) & 0xffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PC19_S2: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xfff80000; Insn |= ((Value - (FinalAddress & ~0x3)) >> 2) & 0x7ffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PC21_S2: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffe00000; Insn |= ((Value - FinalAddress) >> 2) & 0x1fffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PC26_S2: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = 
Section.getLoadAddressWithOffset(Offset); Insn &= 0xfc000000; Insn |= ((Value - FinalAddress) >> 2) & 0x3ffffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PCHI16: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffff0000; Insn |= ((Value - FinalAddress + 0x8000) >> 16) & 0xffff; writeBytesUnaligned(Insn, TargetPtr, 4); break; } case ELF::R_MIPS_PCLO16: { - uint32_t FinalAddress = (Section.LoadAddress + Offset); + uint32_t FinalAddress = Section.getLoadAddressWithOffset(Offset); Insn &= 0xffff0000; Insn |= (Value - FinalAddress) & 0xffff; writeBytesUnaligned(Insn, TargetPtr, 4); @@ -603,7 +623,8 @@ void RuntimeDyldELF::resolveMIPS64Relocation(const SectionEntry &Section, CalculatedValue, SymOffset, SectionID); } - applyMIPS64Relocation(Section.Address + Offset, CalculatedValue, RelType); + applyMIPS64Relocation(Section.getAddressWithOffset(Offset), CalculatedValue, + RelType); } int64_t @@ -613,13 +634,12 @@ RuntimeDyldELF::evaluateMIPS64Relocation(const SectionEntry &Section, uint64_t SymOffset, SID SectionID) { DEBUG(dbgs() << "evaluateMIPS64Relocation, LocalAddress: 0x" - << format("%llx", Section.Address + Offset) + << format("%llx", Section.getAddressWithOffset(Offset)) << " FinalAddress: 0x" - << format("%llx", Section.LoadAddress + Offset) + << format("%llx", Section.getLoadAddressWithOffset(Offset)) << " Value: 0x" << format("%llx", Value) << " Type: 0x" << format("%x", Type) << " Addend: 0x" << format("%llx", Addend) - << " SymOffset: " << format("%x", SymOffset) - << "\n"); + << " SymOffset: " << format("%x", SymOffset) << "\n"); switch (Type) { default: @@ -672,35 +692,35 @@ RuntimeDyldELF::evaluateMIPS64Relocation(const SectionEntry &Section, return Value + Addend - (GOTAddr + 0x7ff0); } case ELF::R_MIPS_PC16: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress) >> 2) & 0xffff; } case ELF::R_MIPS_PC32: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return Value + Addend - FinalAddress; } case ELF::R_MIPS_PC18_S3: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); - return ((Value + Addend - ((FinalAddress | 7) ^ 7)) >> 3) & 0x3ffff; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); + return ((Value + Addend - (FinalAddress & ~0x7)) >> 3) & 0x3ffff; } case ELF::R_MIPS_PC19_S2: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); - return ((Value + Addend - FinalAddress) >> 2) & 0x7ffff; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); + return ((Value + Addend - (FinalAddress & ~0x3)) >> 2) & 0x7ffff; } case ELF::R_MIPS_PC21_S2: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress) >> 2) & 0x1fffff; } case ELF::R_MIPS_PC26_S2: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress) >> 2) & 0x3ffffff; } case ELF::R_MIPS_PCHI16: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return ((Value + Addend - FinalAddress + 0x8000) >> 16) & 0xffff; } case ELF::R_MIPS_PCLO16: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); 
+ uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); return (Value + Addend - FinalAddress) & 0xffff; } } @@ -769,7 +789,7 @@ void RuntimeDyldELF::findPPC64TOCSection(const ELFObjectFileBase &Obj, // relocation) without a .toc directive. In this case just use the // first section (which is usually the .odp) since the code won't // reference the .toc base directly. - Rel.SymbolName = NULL; + Rel.SymbolName = nullptr; Rel.SectionID = 0; // The TOC consists of sections .got, .toc, .tocbss, .plt in that @@ -842,8 +862,9 @@ void RuntimeDyldELF::findOPDEntrySection(const ELFObjectFileBase &Obj, if (Rel.Addend != (int64_t)TargetSymbolOffset) continue; - section_iterator tsi(Obj.section_end()); - check(TargetSymbol->getSection(tsi)); + ErrorOr<section_iterator> TSIOrErr = TargetSymbol->getSection(); + check(TSIOrErr.getError()); + section_iterator tsi = *TSIOrErr; bool IsCode = tsi->isText(); Rel.SectionID = findOrEmitSection(Obj, (*tsi), IsCode, LocalSections); Rel.Addend = (intptr_t)Addend; @@ -884,10 +905,30 @@ static inline uint16_t applyPPChighesta (uint64_t value) { return ((value + 0x8000) >> 48) & 0xffff; } +void RuntimeDyldELF::resolvePPC32Relocation(const SectionEntry &Section, + uint64_t Offset, uint64_t Value, + uint32_t Type, int64_t Addend) { + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); + switch (Type) { + default: + llvm_unreachable("Relocation type not implemented yet!"); + break; + case ELF::R_PPC_ADDR16_LO: + writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); + break; + case ELF::R_PPC_ADDR16_HI: + writeInt16BE(LocalAddress, applyPPChi(Value + Addend)); + break; + case ELF::R_PPC_ADDR16_HA: + writeInt16BE(LocalAddress, applyPPCha(Value + Addend)); + break; + } +} + void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend) { - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: llvm_unreachable("Relocation type not implemented yet!"); @@ -929,17 +970,17 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, writeInt16BE(LocalAddress + 2, (aalk & 3) | ((Value + Addend) & 0xfffc)); } break; case ELF::R_PPC64_REL16_LO: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt16BE(LocalAddress, applyPPClo(Delta)); } break; case ELF::R_PPC64_REL16_HI: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt16BE(LocalAddress, applyPPChi(Delta)); } break; case ELF::R_PPC64_REL16_HA: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt16BE(LocalAddress, applyPPCha(Delta)); } break; @@ -950,22 +991,22 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, writeInt32BE(LocalAddress, Result); } break; case ELF::R_PPC64_REL24: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend); - if (SignExtend32<24>(delta) != delta) + if (SignExtend32<26>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL24 overflow"); // Generates a 'bl 
<address>' instruction writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC)); } break; case ELF::R_PPC64_REL32: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend); if (SignExtend32<32>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL32 overflow"); writeInt32BE(LocalAddress, delta); } break; case ELF::R_PPC64_REL64: { - uint64_t FinalAddress = (Section.LoadAddress + Offset); + uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); uint64_t Delta = Value - FinalAddress + Addend; writeInt64BE(LocalAddress, Delta); } break; @@ -978,27 +1019,27 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend) { - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: llvm_unreachable("Relocation type not implemented yet!"); break; case ELF::R_390_PC16DBL: case ELF::R_390_PLT16DBL: { - int64_t Delta = (Value + Addend) - (Section.LoadAddress + Offset); + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int16_t(Delta / 2) * 2 == Delta && "R_390_PC16DBL overflow"); writeInt16BE(LocalAddress, Delta / 2); break; } case ELF::R_390_PC32DBL: case ELF::R_390_PLT32DBL: { - int64_t Delta = (Value + Addend) - (Section.LoadAddress + Offset); + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int32_t(Delta / 2) * 2 == Delta && "R_390_PC32DBL overflow"); writeInt32BE(LocalAddress, Delta / 2); break; } case ELF::R_390_PC32: { - int64_t Delta = (Value + Addend) - (Section.LoadAddress + Offset); + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int32_t(Delta) == Delta && "R_390_PC32 overflow"); writeInt32BE(LocalAddress, Delta); break; @@ -1072,6 +1113,9 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section, else llvm_unreachable("Mips ABI not handled"); break; + case Triple::ppc: + resolvePPC32Relocation(Section, Offset, Value, Type, Addend); + break; case Triple::ppc64: // Fall through. 
case Triple::ppc64le: resolvePPC64Relocation(Section, Offset, Value, Type, Addend); @@ -1085,7 +1129,7 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section, } void *RuntimeDyldELF::computePlaceholderAddress(unsigned SectionID, uint64_t Offset) const { - return (void*)(Sections[SectionID].ObjAddress + Offset); + return (void *)(Sections[SectionID].getObjAddress() + Offset); } void RuntimeDyldELF::processSimpleRelocation(unsigned SectionID, uint64_t Offset, unsigned RelType, RelocationValueRef Value) { @@ -1096,6 +1140,29 @@ void RuntimeDyldELF::processSimpleRelocation(unsigned SectionID, uint64_t Offset addRelocationForSection(RE, Value.SectionID); } +uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType, + bool IsLocal) const { + switch (RelType) { + case ELF::R_MICROMIPS_GOT16: + if (IsLocal) + return ELF::R_MICROMIPS_LO16; + break; + case ELF::R_MICROMIPS_HI16: + return ELF::R_MICROMIPS_LO16; + case ELF::R_MIPS_GOT16: + if (IsLocal) + return ELF::R_MIPS_LO16; + break; + case ELF::R_MIPS_HI16: + return ELF::R_MIPS_LO16; + case ELF::R_MIPS_PCHI16: + return ELF::R_MIPS_PCLO16; + default: + break; + } + return ELF::R_MIPS_NONE; +} + relocation_iterator RuntimeDyldELF::processRelocationRef( unsigned SectionID, relocation_iterator RelI, const ObjectFile &O, ObjSectionToIDMap &ObjSectionToID, StubMap &Stubs) { @@ -1136,8 +1203,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously // and can be changed by another developers. Maybe best way is add // a new symbol type ST_Section to SymbolRef and use it. - section_iterator si(Obj.section_end()); - Symbol->getSection(si); + section_iterator si = *Symbol->getSection(); if (si == Obj.section_end()) llvm_unreachable("Symbol section not found, bad object file format!"); DEBUG(dbgs() << "\t\tThis is section symbol\n"); @@ -1178,24 +1244,28 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // Look for an existing stub. StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { - resolveRelocation(Section, Offset, (uint64_t)Section.Address + i->second, + resolveRelocation(Section, Offset, + (uint64_t)Section.getAddressWithOffset(i->second), RelType, 0); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. 
DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); - RelocationEntry REmovz_g3(SectionID, StubTargetAddr - Section.Address, + RelocationEntry REmovz_g3(SectionID, + StubTargetAddr - Section.getAddress(), ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend); - RelocationEntry REmovk_g2(SectionID, StubTargetAddr - Section.Address + 4, + RelocationEntry REmovk_g2(SectionID, StubTargetAddr - + Section.getAddress() + 4, ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend); - RelocationEntry REmovk_g1(SectionID, StubTargetAddr - Section.Address + 8, + RelocationEntry REmovk_g1(SectionID, StubTargetAddr - + Section.getAddress() + 8, ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend); - RelocationEntry REmovk_g0(SectionID, - StubTargetAddr - Section.Address + 12, + RelocationEntry REmovk_g0(SectionID, StubTargetAddr - + Section.getAddress() + 12, ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend); if (Value.SymbolName) { @@ -1210,9 +1280,10 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSection(REmovk_g0, Value.SectionID); } resolveRelocation(Section, Offset, - (uint64_t)Section.Address + Section.StubOffset, RelType, - 0); - Section.StubOffset += getMaxStubSize(); + reinterpret_cast<uint64_t>(Section.getAddressWithOffset( + Section.getStubOffset())), + RelType, 0); + Section.advanceStubOffset(getMaxStubSize()); } } else if (Arch == Triple::arm) { if (RelType == ELF::R_ARM_PC24 || RelType == ELF::R_ARM_CALL || @@ -1224,26 +1295,29 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // Look for an existing stub. StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { - resolveRelocation(Section, Offset, (uint64_t)Section.Address + i->second, - RelType, 0); + resolveRelocation( + Section, Offset, + reinterpret_cast<uint64_t>(Section.getAddressWithOffset(i->second)), + RelType, 0); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); - RelocationEntry RE(SectionID, StubTargetAddr - Section.Address, - ELF::R_ARM_ABS32, Value.Addend); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); + RelocationEntry RE(SectionID, StubTargetAddr - Section.getAddress(), + ELF::R_ARM_ABS32, Value.Addend); if (Value.SymbolName) addRelocationForSymbol(RE, Value.SymbolName); else addRelocationForSection(RE, Value.SectionID); - resolveRelocation(Section, Offset, - (uint64_t)Section.Address + Section.StubOffset, RelType, - 0); - Section.StubOffset += getMaxStubSize(); + resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( + Section.getAddressWithOffset( + Section.getStubOffset())), + RelType, 0); + Section.advanceStubOffset(getMaxStubSize()); } } else { uint32_t *Placeholder = @@ -1282,15 +1356,16 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( } else { // Create a new stub function. 
DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); // Creating Hi and Lo relocations for the filled stub instructions. - RelocationEntry REHi(SectionID, StubTargetAddr - Section.Address, - ELF::R_MIPS_HI16, Value.Addend); - RelocationEntry RELo(SectionID, StubTargetAddr - Section.Address + 4, - ELF::R_MIPS_LO16, Value.Addend); + RelocationEntry REHi(SectionID, StubTargetAddr - Section.getAddress(), + ELF::R_MIPS_HI16, Value.Addend); + RelocationEntry RELo(SectionID, + StubTargetAddr - Section.getAddress() + 4, + ELF::R_MIPS_LO16, Value.Addend); if (Value.SymbolName) { addRelocationForSymbol(REHi, Value.SymbolName); @@ -1301,21 +1376,39 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSection(RELo, Value.SectionID); } - RelocationEntry RE(SectionID, Offset, RelType, Section.StubOffset); + RelocationEntry RE(SectionID, Offset, RelType, Section.getStubOffset()); addRelocationForSection(RE, SectionID); - Section.StubOffset += getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); } + } else if (RelType == ELF::R_MIPS_HI16 || RelType == ELF::R_MIPS_PCHI16) { + int64_t Addend = (Opcode & 0x0000ffff) << 16; + RelocationEntry RE(SectionID, Offset, RelType, Addend); + PendingRelocs.push_back(std::make_pair(Value, RE)); + } else if (RelType == ELF::R_MIPS_LO16 || RelType == ELF::R_MIPS_PCLO16) { + int64_t Addend = Value.Addend + SignExtend32<16>(Opcode & 0x0000ffff); + for (auto I = PendingRelocs.begin(); I != PendingRelocs.end();) { + const RelocationValueRef &MatchingValue = I->first; + RelocationEntry &Reloc = I->second; + if (MatchingValue == Value && + RelType == getMatchingLoRelocation(Reloc.RelType) && + SectionID == Reloc.SectionID) { + Reloc.Addend += Addend; + if (Value.SymbolName) + addRelocationForSymbol(Reloc, Value.SymbolName); + else + addRelocationForSection(Reloc, Value.SectionID); + I = PendingRelocs.erase(I); + } else + ++I; + } + RelocationEntry RE(SectionID, Offset, RelType, Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); } else { - // FIXME: Calculate correct addends for R_MIPS_HI16, R_MIPS_LO16, - // R_MIPS_PCHI16 and R_MIPS_PCLO16 relocations. - if (RelType == ELF::R_MIPS_HI16 || RelType == ELF::R_MIPS_PCHI16) - Value.Addend += (Opcode & 0x0000ffff) << 16; - else if (RelType == ELF::R_MIPS_LO16) - Value.Addend += (Opcode & 0x0000ffff); - else if (RelType == ELF::R_MIPS_32) + if (RelType == ELF::R_MIPS_32) Value.Addend += Opcode; - else if (RelType == ELF::R_MIPS_PCLO16) - Value.Addend += SignExtend32<16>((Opcode & 0x0000ffff)); else if (RelType == ELF::R_MIPS_PC16) Value.Addend += SignExtend32<18>((Opcode & 0x0000ffff) << 2); else if (RelType == ELF::R_MIPS_PC19_S2) @@ -1353,7 +1446,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // an external symbol (Symbol::ST_Unknown) or if the target address // is not within the signed 24-bits branch address. 
SectionEntry &Section = Sections[SectionID]; - uint8_t *Target = Section.Address + Offset; + uint8_t *Target = Section.getAddressWithOffset(Offset); bool RangeOverflow = false; if (SymType != SymbolRef::ST_Unknown) { if (AbiVariant != 2) { @@ -1367,10 +1460,11 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( uint8_t SymOther = Symbol->getOther(); Value.Addend += ELF::decodePPC64LocalEntryOffset(SymOther); } - uint8_t *RelocTarget = Sections[Value.SectionID].Address + Value.Addend; + uint8_t *RelocTarget = + Sections[Value.SectionID].getAddressWithOffset(Value.Addend); int32_t delta = static_cast<int32_t>(Target - RelocTarget); - // If it is within 24-bits branch range, just set the branch target - if (SignExtend32<24>(delta) == delta) { + // If it is within 26-bits branch range, just set the branch target + if (SignExtend32<26>(delta) == delta) { RelocationEntry RE(SectionID, Offset, RelType, Value.Addend); if (Value.SymbolName) addRelocationForSymbol(RE, Value.SymbolName); @@ -1387,23 +1481,25 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( if (i != Stubs.end()) { // Symbol function stub already created, just relocate to it resolveRelocation(Section, Offset, - (uint64_t)Section.Address + i->second, RelType, 0); + reinterpret_cast<uint64_t>( + Section.getAddressWithOffset(i->second)), + RelType, 0); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. DEBUG(dbgs() << " Create a new stub function\n"); - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset, - AbiVariant); - RelocationEntry RE(SectionID, StubTargetAddr - Section.Address, + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset()), + AbiVariant); + RelocationEntry RE(SectionID, StubTargetAddr - Section.getAddress(), ELF::R_PPC64_ADDR64, Value.Addend); // Generates the 64-bits address loads as exemplified in section // 4.5.1 in PPC64 ELF ABI. Note that the relocations need to // apply to the low part of the instructions, so we have to update // the offset according to the target endianness. - uint64_t StubRelocOffset = StubTargetAddr - Section.Address; + uint64_t StubRelocOffset = StubTargetAddr - Section.getAddress(); if (!IsTargetLittleEndian) StubRelocOffset += 2; @@ -1428,10 +1524,11 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSection(REl, Value.SectionID); } - resolveRelocation(Section, Offset, - (uint64_t)Section.Address + Section.StubOffset, + resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( + Section.getAddressWithOffset( + Section.getStubOffset())), RelType, 0); - Section.StubOffset += getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); } if (SymType == SymbolRef::ST_Unknown) { // Restore the TOC for external calls @@ -1450,11 +1547,11 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // These relocations are supposed to subtract the TOC address from // the final value. This does not fit cleanly into the RuntimeDyld // scheme, since there may be *two* sections involved in determining - // the relocation value (the section of the symbol refered to by the + // the relocation value (the section of the symbol referred to by the // relocation, and the TOC section associated with the current module). 
// // Fortunately, these relocations are currently only ever generated - // refering to symbols that themselves reside in the TOC, which means + // referring to symbols that themselves reside in the TOC, which means // that the two sections are actually the same. Thus they cancel out // and we can immediately resolve the relocation right now. switch (RelType) { @@ -1511,16 +1608,17 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( StubMap::const_iterator i = Stubs.find(Value); uintptr_t StubAddress; if (i != Stubs.end()) { - StubAddress = uintptr_t(Section.Address) + i->second; + StubAddress = uintptr_t(Section.getAddressWithOffset(i->second)); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. DEBUG(dbgs() << " Create a new stub function\n"); - uintptr_t BaseAddress = uintptr_t(Section.Address); + uintptr_t BaseAddress = uintptr_t(Section.getAddress()); uintptr_t StubAlignment = getStubAlignment(); - StubAddress = (BaseAddress + Section.StubOffset + StubAlignment - 1) & - -StubAlignment; + StubAddress = + (BaseAddress + Section.getStubOffset() + StubAlignment - 1) & + -StubAlignment; unsigned StubOffset = StubAddress - BaseAddress; Stubs[Value] = StubOffset; @@ -1531,7 +1629,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( addRelocationForSymbol(RE, Value.SymbolName); else addRelocationForSection(RE, Value.SectionID); - Section.StubOffset = StubOffset + getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); } if (RelType == ELF::R_390_GOTENT) @@ -1564,37 +1662,39 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( StubMap::const_iterator i = Stubs.find(Value); uintptr_t StubAddress; if (i != Stubs.end()) { - StubAddress = uintptr_t(Section.Address) + i->second; - DEBUG(dbgs() << " Stub function found\n"); + StubAddress = uintptr_t(Section.getAddress()) + i->second; + DEBUG(dbgs() << " Stub function found\n"); } else { - // Create a new stub function (equivalent to a PLT entry). - DEBUG(dbgs() << " Create a new stub function\n"); + // Create a new stub function (equivalent to a PLT entry). 
+ DEBUG(dbgs() << " Create a new stub function\n"); - uintptr_t BaseAddress = uintptr_t(Section.Address); - uintptr_t StubAlignment = getStubAlignment(); - StubAddress = (BaseAddress + Section.StubOffset + StubAlignment - 1) & - -StubAlignment; - unsigned StubOffset = StubAddress - BaseAddress; - Stubs[Value] = StubOffset; - createStubFunction((uint8_t *)StubAddress); + uintptr_t BaseAddress = uintptr_t(Section.getAddress()); + uintptr_t StubAlignment = getStubAlignment(); + StubAddress = + (BaseAddress + Section.getStubOffset() + StubAlignment - 1) & + -StubAlignment; + unsigned StubOffset = StubAddress - BaseAddress; + Stubs[Value] = StubOffset; + createStubFunction((uint8_t *)StubAddress); - // Bump our stub offset counter - Section.StubOffset = StubOffset + getMaxStubSize(); + // Bump our stub offset counter + Section.advanceStubOffset(getMaxStubSize()); - // Allocate a GOT Entry - uint64_t GOTOffset = allocateGOTEntries(SectionID, 1); + // Allocate a GOT Entry + uint64_t GOTOffset = allocateGOTEntries(SectionID, 1); - // The load of the GOT address has an addend of -4 - resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4); + // The load of the GOT address has an addend of -4 + resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4); - // Fill in the value of the symbol we're targeting into the GOT - addRelocationForSymbol(computeGOTOffsetRE(SectionID,GOTOffset,0,ELF::R_X86_64_64), - Value.SymbolName); + // Fill in the value of the symbol we're targeting into the GOT + addRelocationForSymbol( + computeGOTOffsetRE(SectionID, GOTOffset, 0, ELF::R_X86_64_64), + Value.SymbolName); } // Make the target call a call into the stub table. resolveRelocation(Section, Offset, StubAddress, ELF::R_X86_64_PC32, - Addend); + Addend); } else { RelocationEntry RE(SectionID, Offset, ELF::R_X86_64_PC32, Value.Addend, Value.Offset); @@ -1670,7 +1770,7 @@ uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no) GOTSectionID = Sections.size(); // Reserve a section id. 
We'll allocate the section later // once we know the total size - Sections.push_back(SectionEntry(".got", 0, 0, 0)); + Sections.push_back(SectionEntry(".got", nullptr, 0, 0, 0)); } uint64_t StartOffset = CurrentGOTIndex * getGOTEntrySize(); CurrentGOTIndex += no; return StartOffset; } @@ -1693,6 +1793,10 @@ RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(unsigned SectionID, uint64_t void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj, ObjSectionToIDMap &SectionMap) { + if (IsMipsO32ABI) + if (!PendingRelocs.empty()) + report_fatal_error("Can't find matching LO16 reloc"); + // If necessary, allocate the global offset table if (GOTSectionID != 0) { // Allocate memory for the section @@ -1702,7 +1806,8 @@ void RuntimeDyldELF::finalizeLoad(const ObjectFile &Obj, if (!Addr) report_fatal_error("Unable to allocate memory for GOT!"); - Sections[GOTSectionID] = SectionEntry(".got", Addr, TotalSize, 0); + Sections[GOTSectionID] = + SectionEntry(".got", Addr, TotalSize, TotalSize, 0); if (Checker) Checker->registerSection(Obj.getFileName(), GOTSectionID); @@ -1746,4 +1851,23 @@ bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile &Obj) const { return Obj.isELF(); } +bool RuntimeDyldELF::relocationNeedsStub(const RelocationRef &R) const { + if (Arch != Triple::x86_64) + return true; // Conservative answer + + switch (R.getType()) { + default: + return true; // Conservative answer + + case ELF::R_X86_64_GOTPCREL: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_PC64: + case ELF::R_X86_64_64: + // We know that these relocation types won't need a stub function. This list + // can be extended as needed. + return false; + } +} + } // namespace llvm diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 1a2552d..041811d 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -43,6 +43,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl { void resolveMIPSRelocation(const SectionEntry &Section, uint64_t Offset, uint32_t Value, uint32_t Type, int32_t Addend); + void resolvePPC32Relocation(const SectionEntry &Section, uint64_t Offset, + uint64_t Value, uint32_t Type, int64_t Addend); + void resolvePPC64Relocation(const SectionEntry &Section, uint64_t Offset, uint64_t Value, uint32_t Type, int64_t Addend); @@ -120,6 +123,10 @@ class RuntimeDyldELF : public RuntimeDyldImpl { // no particular advanced processing. void processSimpleRelocation(unsigned SectionID, uint64_t Offset, unsigned RelType, RelocationValueRef Value); + // Return matching *LO16 relocation (Mips specific) + uint32_t getMatchingLoRelocation(uint32_t RelType, + bool IsLocal = false) const; + // The tentative ID for the GOT section unsigned GOTSectionID; @@ -135,12 +142,18 @@ class RuntimeDyldELF : public RuntimeDyldImpl { // A map to avoid duplicate got entries (Mips64 specific) StringMap<uint64_t> GOTSymbolOffsets; + // *HI16 relocations will be added for resolving when we find the matching + // *LO16 part. (Mips specific) + SmallVector<std::pair<RelocationValueRef, RelocationEntry>, 8> PendingRelocs; + // When a module is loaded we save the SectionID of the EH frame section // in a table until we receive a request to register all unregistered // EH frame sections with the memory manager.
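Stepping back to the x86-64 stub created in the RuntimeDyldELF.cpp hunk above: the stub is a single RIP-relative indirect jump through the freshly allocated GOT slot. Its 4-byte displacement field starts 2 bytes into the instruction (hence StubOffset + 2) and is measured from the instruction's end, which is what the GOTOffset - 4 addend compensates for. A rough sketch, illustrative only (emitX86_64GOTStub is hypothetical, not part of the patch):

#include <cstdint>

static void emitX86_64GOTStub(uint8_t *Addr) {
  Addr[0] = 0xff; // jmpq *disp32(%rip)
  Addr[1] = 0x25;
  // Bytes 2..5 hold disp32, counted from the end of this 6-byte
  // instruction; a GOT slot D bytes past the field is stored as D - 4.
  Addr[2] = Addr[3] = Addr[4] = Addr[5] = 0;
}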
SmallVector<SID, 2> UnregisteredEHFrameSections; SmallVector<SID, 2> RegisteredEHFrameSections; + bool relocationNeedsStub(const RelocationRef &R) const override; + public: RuntimeDyldELF(RuntimeDyld::MemoryManager &MemMgr, RuntimeDyld::SymbolResolver &Resolver); diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index e085a92..ab732c6 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -30,6 +30,7 @@ #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include <map> +#include <unordered_map> #include <system_error> using namespace llvm; @@ -50,7 +51,6 @@ class Twine; /// SectionEntry - represents a section emitted into memory by the dynamic /// linker. class SectionEntry { -public: /// Name - section name. std::string Name; @@ -70,15 +70,54 @@ public: /// relocations (like ARM). uintptr_t StubOffset; + /// The total amount of space allocated for this section. This includes the + /// section size and the maximum amount of space that the stubs can occupy. + size_t AllocationSize; + /// ObjAddress - address of the section in the in-memory object file. Used /// for calculating relocations in some object formats (like MachO). uintptr_t ObjAddress; +public: SectionEntry(StringRef name, uint8_t *address, size_t size, - uintptr_t objAddress) + size_t allocationSize, uintptr_t objAddress) : Name(name), Address(address), Size(size), LoadAddress(reinterpret_cast<uintptr_t>(address)), StubOffset(size), - ObjAddress(objAddress) {} + AllocationSize(allocationSize), ObjAddress(objAddress) { + // AllocationSize is used only in asserts, prevent an "unused private field" + // warning: + (void)AllocationSize; + } + + StringRef getName() const { return Name; } + + uint8_t *getAddress() const { return Address; } + + /// \brief Return the address of this section with an offset. + uint8_t *getAddressWithOffset(unsigned OffsetBytes) const { + assert(OffsetBytes <= AllocationSize && "Offset out of bounds!"); + return Address + OffsetBytes; + } + + size_t getSize() const { return Size; } + + uint64_t getLoadAddress() const { return LoadAddress; } + void setLoadAddress(uint64_t LA) { LoadAddress = LA; } + + /// \brief Return the load address of this section with an offset. + uint64_t getLoadAddressWithOffset(unsigned OffsetBytes) const { + assert(OffsetBytes <= AllocationSize && "Offset out of bounds!"); + return LoadAddress + OffsetBytes; + } + + uintptr_t getStubOffset() const { return StubOffset; } + + void advanceStubOffset(unsigned StubSize) { + StubOffset += StubSize; + assert(StubOffset <= AllocationSize && "Not enough space allocated!"); + } + + uintptr_t getObjAddress() const { return ObjAddress; } }; /// RelocationEntry - used to represent relocations internally in the dynamic @@ -188,6 +227,8 @@ class RuntimeDyldImpl { friend class RuntimeDyld::LoadedObjectInfo; friend class RuntimeDyldCheckerImpl; protected: + static const unsigned AbsoluteSymbolSection = ~0U; + // The MemoryManager to load objects into. RuntimeDyld::MemoryManager &MemMgr; @@ -224,7 +265,7 @@ protected: // Relocations to sections already loaded. Indexed by SectionID which is the // source of the address. The target where the address will be written is // SectionID/Offset in the relocation itself. 
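The SectionEntry rework above funnels every address computation through accessors so that the new AllocationSize field can bounds-check offsets that previously went unchecked. The pattern, compressed into a self-contained sketch (names loosely mirror the class; simplified, not the real declaration):

#include <cassert>
#include <cstddef>
#include <cstdint>

struct SectionSketch {
  uint8_t *Address;
  size_t AllocationSize; // section size plus worst-case stub space
  uintptr_t StubOffset;

  uint8_t *addressWithOffset(unsigned Off) const {
    assert(Off <= AllocationSize && "Offset out of bounds!");
    return Address + Off;
  }
  void advanceStub(unsigned StubSize) {
    StubOffset += StubSize;
    assert(StubOffset <= AllocationSize && "Not enough space allocated!");
  }
};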
- DenseMap<unsigned, RelocationList> Relocations; + std::unordered_map<unsigned, RelocationList> Relocations; // Relocations to external symbols that are not yet resolved. Symbols are // external when they aren't found in the global symbol table of all loaded @@ -269,11 +310,11 @@ protected: } uint64_t getSectionLoadAddress(unsigned SectionID) const { - return Sections[SectionID].LoadAddress; + return Sections[SectionID].getLoadAddress(); } uint8_t *getSectionAddress(unsigned SectionID) const { - return (uint8_t *)Sections[SectionID].Address; + return Sections[SectionID].getAddress(); } void writeInt16BE(uint8_t *Addr, uint16_t Value) { @@ -370,15 +411,22 @@ protected: // \brief Compute an upper bound of the memory that is required to load all // sections - void computeTotalAllocSize(const ObjectFile &Obj, uint64_t &CodeSize, - uint64_t &DataSizeRO, uint64_t &DataSizeRW); + void computeTotalAllocSize(const ObjectFile &Obj, + uint64_t &CodeSize, uint32_t &CodeAlign, + uint64_t &RODataSize, uint32_t &RODataAlign, + uint64_t &RWDataSize, uint32_t &RWDataAlign); // \brief Compute the stub buffer size required for a section unsigned computeSectionStubBufSize(const ObjectFile &Obj, const SectionRef &Section); // \brief Implementation of the generic part of the loadObject algorithm. - std::pair<unsigned, unsigned> loadObjectImpl(const object::ObjectFile &Obj); + ObjSectionToIDMap loadObjectImpl(const object::ObjectFile &Obj); + + // \brief Return true if the relocation R may require allocating a stub. + virtual bool relocationNeedsStub(const RelocationRef &R) const { + return true; // Conservative answer + } public: RuntimeDyldImpl(RuntimeDyld::MemoryManager &MemMgr, @@ -407,6 +455,9 @@ public: if (pos == GlobalSymbolTable.end()) return nullptr; const auto &SymInfo = pos->second; + // Absolute symbols do not have a local address. 
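The rule being introduced in this hunk: absolute symbols are tagged with the ~0U sentinel section ID, contribute no section base, and resolve to their offset alone. As a sketch (hypothetical helper, not part of the patch):

#include <cstdint>

static const unsigned AbsoluteSymbolSentinel = ~0U; // mirrors AbsoluteSymbolSection

static uint64_t symbolTargetAddress(unsigned SectionID, uint64_t Offset,
                                    uint64_t SectionLoadAddress) {
  // Absolute symbols have no owning section, so no base is added.
  return (SectionID == AbsoluteSymbolSentinel ? 0 : SectionLoadAddress) + Offset;
}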
+ if (SymInfo.getSectionID() == AbsoluteSymbolSection) + return nullptr; return getSectionAddress(SymInfo.getSectionID()) + SymInfo.getOffset(); } @@ -417,8 +468,10 @@ public: if (pos == GlobalSymbolTable.end()) return nullptr; const auto &SymEntry = pos->second; - uint64_t TargetAddr = - getSectionLoadAddress(SymEntry.getSectionID()) + SymEntry.getOffset(); + uint64_t SectionAddr = 0; + if (SymEntry.getSectionID() != AbsoluteSymbolSection) + SectionAddr = getSectionLoadAddress(SymEntry.getSectionID()); + uint64_t TargetAddr = SectionAddr + SymEntry.getOffset(); return RuntimeDyld::SymbolInfo(TargetAddr, SymEntry.getFlags()); } diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index c074114..739e8d6 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -26,12 +26,12 @@ using namespace llvm::object; namespace { -class LoadedMachOObjectInfo +class LoadedMachOObjectInfo final : public RuntimeDyld::LoadedObjectInfoHelper<LoadedMachOObjectInfo> { public: - LoadedMachOObjectInfo(RuntimeDyldImpl &RTDyld, unsigned BeginIdx, - unsigned EndIdx) - : LoadedObjectInfoHelper(RTDyld, BeginIdx, EndIdx) {} + LoadedMachOObjectInfo(RuntimeDyldImpl &RTDyld, + ObjSectionToIDMap ObjSecToIDMap) + : LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {} OwningBinary<ObjectFile> getObjectForDebug(const ObjectFile &Obj) const override { @@ -45,11 +45,47 @@ namespace llvm { int64_t RuntimeDyldMachO::memcpyAddend(const RelocationEntry &RE) const { unsigned NumBytes = 1 << RE.Size; - uint8_t *Src = Sections[RE.SectionID].Address + RE.Offset; + uint8_t *Src = Sections[RE.SectionID].getAddress() + RE.Offset; return static_cast<int64_t>(readBytesUnaligned(Src, NumBytes)); } +relocation_iterator RuntimeDyldMachO::processScatteredVANILLA( + unsigned SectionID, relocation_iterator RelI, + const ObjectFile &BaseObjT, + RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile &Obj = + static_cast<const MachOObjectFile&>(BaseObjT); + MachO::any_relocation_info RE = + Obj.getRelocation(RelI->getRawDataRefImpl()); + + SectionEntry &Section = Sections[SectionID]; + uint32_t RelocType = Obj.getAnyRelocationType(RE); + bool IsPCRel = Obj.getAnyRelocationPCRel(RE); + unsigned Size = Obj.getAnyRelocationLength(RE); + uint64_t Offset = RelI->getOffset(); + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); + unsigned NumBytes = 1 << Size; + int64_t Addend = readBytesUnaligned(LocalAddress, NumBytes); + + unsigned SymbolBaseAddr = Obj.getScatteredRelocationValue(RE); + section_iterator TargetSI = getSectionByAddress(Obj, SymbolBaseAddr); + assert(TargetSI != Obj.section_end() && "Can't find section for symbol"); + uint64_t SectionBaseAddr = TargetSI->getAddress(); + SectionRef TargetSection = *TargetSI; + bool IsCode = TargetSection.isText(); + uint32_t TargetSectionID = + findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID); + + Addend -= SectionBaseAddr; + RelocationEntry R(SectionID, Offset, RelocType, Addend, IsPCRel, Size); + + addRelocationForSection(R, TargetSectionID); + + return ++RelI; +} + + RelocationValueRef RuntimeDyldMachO::getRelocationValueRef( const ObjectFile &BaseTObj, const relocation_iterator &RI, const RelocationEntry &RE, ObjSectionToIDMap &ObjSectionToID) { @@ -99,8 +135,8 @@ void RuntimeDyldMachO::makeValueAddendPCRel(RelocationValueRef &Value, void 
RuntimeDyldMachO::dumpRelocationToResolve(const RelocationEntry &RE, uint64_t Value) const { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint8_t *LocalAddress = Section.getAddress() + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddress() + RE.Offset; dbgs() << "resolveRelocation Section: " << RE.SectionID << " LocalAddress: " << format("%p", LocalAddress) @@ -147,10 +183,9 @@ void RuntimeDyldMachO::populateIndirectSymbolPointersSection( "Pointers section does not contain a whole number of stubs?"); DEBUG(dbgs() << "Populating pointer table section " - << Sections[PTSectionID].Name - << ", Section ID " << PTSectionID << ", " - << NumPTEntries << " entries, " << PTEntrySize - << " bytes each:\n"); + << Sections[PTSectionID].getName() << ", Section ID " + << PTSectionID << ", " << NumPTEntries << " entries, " + << PTEntrySize << " bytes each:\n"); for (unsigned i = 0; i < NumPTEntries; ++i) { unsigned SymbolIndex = @@ -204,7 +239,7 @@ void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &Obj, } template <typename Impl> -unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P, +unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(uint8_t *P, int64_t DeltaForText, int64_t DeltaForEH) { typedef typename Impl::TargetPtrT TargetPtrT; @@ -213,7 +248,7 @@ unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P, << ", Delta for EH: " << DeltaForEH << "\n"); uint32_t Length = readBytesUnaligned(P, 4); P += 4; - unsigned char *Ret = P + Length; + uint8_t *Ret = P + Length; uint32_t Offset = readBytesUnaligned(P, 4); if (Offset == 0) // is a CIE return Ret; @@ -240,9 +275,9 @@ unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P, } static int64_t computeDelta(SectionEntry *A, SectionEntry *B) { - int64_t ObjDistance = - static_cast<int64_t>(A->ObjAddress) - static_cast<int64_t>(B->ObjAddress); - int64_t MemDistance = A->LoadAddress - B->LoadAddress; + int64_t ObjDistance = static_cast<int64_t>(A->getObjAddress()) - + static_cast<int64_t>(B->getObjAddress()); + int64_t MemDistance = A->getLoadAddress() - B->getLoadAddress(); return ObjDistance - MemDistance; } @@ -265,14 +300,14 @@ void RuntimeDyldMachOCRTPBase<Impl>::registerEHFrames() { if (ExceptTab) DeltaForEH = computeDelta(ExceptTab, EHFrame); - unsigned char *P = EHFrame->Address; - unsigned char *End = P + EHFrame->Size; + uint8_t *P = EHFrame->getAddress(); + uint8_t *End = P + EHFrame->getSize(); do { P = processFDE(P, DeltaForText, DeltaForEH); } while (P != End); - MemMgr.registerEHFrames(EHFrame->Address, EHFrame->LoadAddress, - EHFrame->Size); + MemMgr.registerEHFrames(EHFrame->getAddress(), EHFrame->getLoadAddress(), + EHFrame->getSize()); } UnregisteredEHFrameSections.clear(); } @@ -298,10 +333,7 @@ RuntimeDyldMachO::create(Triple::ArchType Arch, std::unique_ptr<RuntimeDyld::LoadedObjectInfo> RuntimeDyldMachO::loadObject(const object::ObjectFile &O) { - unsigned SectionStartIdx, SectionEndIdx; - std::tie(SectionStartIdx, SectionEndIdx) = loadObjectImpl(O); - return llvm::make_unique<LoadedMachOObjectInfo>(*this, SectionStartIdx, - SectionEndIdx); + return llvm::make_unique<LoadedMachOObjectInfo>(*this, loadObjectImpl(O)); } } // end namespace llvm diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 0d7364f..c8ae47b 100644 --- 
a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -79,6 +79,12 @@ protected: return RelocationEntry(SectionID, Offset, RelType, 0, IsPCRel, Size); } + /// Process a scattered vanilla relocation. + relocation_iterator processScatteredVANILLA( + unsigned SectionID, relocation_iterator RelI, + const ObjectFile &BaseObjT, + RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID); + /// Construct a RelocationValueRef representing the relocation target. /// For Symbols in known sections, this will return a RelocationValueRef /// representing a (SectionID, Offset) pair. @@ -140,7 +146,7 @@ private: Impl &impl() { return static_cast<Impl &>(*this); } const Impl &impl() const { return static_cast<const Impl &>(*this); } - unsigned char *processFDE(unsigned char *P, int64_t DeltaForText, + unsigned char *processFDE(uint8_t *P, int64_t DeltaForText, int64_t DeltaForEH); public: diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h new file mode 100644 index 0000000..fbfbb32 --- /dev/null +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h @@ -0,0 +1,201 @@ +//===--- RuntimeDyldCOFFI386.h --- COFF/i386 specific code -----*- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// COFF x86 support for MC-JIT runtime dynamic linker. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFI386_H +#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFI386_H + +#include "llvm/Object/COFF.h" +#include "llvm/Support/COFF.h" +#include "../RuntimeDyldCOFF.h" + +#define DEBUG_TYPE "dyld" + +namespace llvm { + +class RuntimeDyldCOFFI386 : public RuntimeDyldCOFF { +public: + RuntimeDyldCOFFI386(RuntimeDyld::MemoryManager &MM, + RuntimeDyld::SymbolResolver &Resolver) + : RuntimeDyldCOFF(MM, Resolver) {} + + unsigned getMaxStubSize() override { + return 8; // 2-byte jmp instruction + 32-bit relative address + 2-byte pad + } + + unsigned getStubAlignment() override { return 1; } + + relocation_iterator processRelocationRef(unsigned SectionID, + relocation_iterator RelI, + const ObjectFile &Obj, + ObjSectionToIDMap &ObjSectionToID, + StubMap &Stubs) override { + auto Symbol = RelI->getSymbol(); + if (Symbol == Obj.symbol_end()) + report_fatal_error("Unknown symbol in relocation"); + + ErrorOr<StringRef> TargetNameOrErr = Symbol->getName(); + if (auto EC = TargetNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef TargetName = *TargetNameOrErr; + + auto Section = *Symbol->getSection(); + + uint64_t RelType = RelI->getType(); + uint64_t Offset = RelI->getOffset(); + +#if !defined(NDEBUG) + SmallString<32> RelTypeName; + RelI->getTypeName(RelTypeName); +#endif + DEBUG(dbgs() << "\t\tIn Section " << SectionID << " Offset " << Offset + << " RelType: " << RelTypeName << " TargetName: " << TargetName + << "\n"); + + unsigned TargetSectionID = -1; + if (Section == Obj.section_end()) { + RelocationEntry RE(SectionID, Offset, RelType, 0, -1, 0, 0, 0, false, 0); + addRelocationForSymbol(RE, TargetName); + } else { + TargetSectionID = + findOrEmitSection(Obj, *Section,
Section->isText(), ObjSectionToID); + + switch (RelType) { + case COFF::IMAGE_REL_I386_ABSOLUTE: + // This relocation is ignored. + break; + case COFF::IMAGE_REL_I386_DIR32: + case COFF::IMAGE_REL_I386_DIR32NB: + case COFF::IMAGE_REL_I386_REL32: { + RelocationEntry RE = + RelocationEntry(SectionID, Offset, RelType, 0, TargetSectionID, + getSymbolOffset(*Symbol), 0, 0, false, 0); + addRelocationForSection(RE, TargetSectionID); + break; + } + case COFF::IMAGE_REL_I386_SECTION: { + RelocationEntry RE = + RelocationEntry(TargetSectionID, Offset, RelType, 0); + addRelocationForSection(RE, TargetSectionID); + break; + } + case COFF::IMAGE_REL_I386_SECREL: { + RelocationEntry RE = RelocationEntry(SectionID, Offset, RelType, + getSymbolOffset(*Symbol)); + addRelocationForSection(RE, TargetSectionID); + break; + } + default: + llvm_unreachable("unsupported relocation type"); + } + + } + + return ++RelI; + } + + void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { + const auto Section = Sections[RE.SectionID]; + uint8_t *Target = Section.getAddressWithOffset(RE.Offset); + + switch (RE.RelType) { + case COFF::IMAGE_REL_I386_ABSOLUTE: + // This relocation is ignored. + break; + case COFF::IMAGE_REL_I386_DIR32: { + // The target's 32-bit VA. + uint64_t Result = + RE.Sections.SectionA == static_cast<uint32_t>(-1) + ? Value + : Sections[RE.Sections.SectionA].getLoadAddressWithOffset( + RE.Addend); + assert(static_cast<int32_t>(Result) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(Result) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_DIR32" + << " TargetSection: " << RE.Sections.SectionA + << " Value: " << format("0x%08" PRIx32, Result) << '\n'); + writeBytesUnaligned(Result, Target, 4); + break; + } + case COFF::IMAGE_REL_I386_DIR32NB: { + // The target's 32-bit RVA. + // NOTE: use Section[0].getLoadAddress() as an approximation of ImageBase + uint64_t Result = + Sections[RE.Sections.SectionA].getLoadAddressWithOffset(RE.Addend) - + Sections[0].getLoadAddress(); + assert(static_cast<int32_t>(Result) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(Result) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_DIR32NB" + << " TargetSection: " << RE.Sections.SectionA + << " Value: " << format("0x%08" PRIx32, Result) << '\n'); + writeBytesUnaligned(Result, Target, 4); + break; + } + case COFF::IMAGE_REL_I386_REL32: { + // 32-bit relative displacement to the target. + uint64_t Result = Sections[RE.Sections.SectionA].getLoadAddress() - + Section.getLoadAddress() + RE.Addend - 4 - RE.Offset; + assert(static_cast<int32_t>(Result) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(Result) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_REL32" + << " TargetSection: " << RE.Sections.SectionA + << " Value: " << format("0x%08" PRIx32, Result) << '\n'); + writeBytesUnaligned(Result, Target, 4); + break; + } + case COFF::IMAGE_REL_I386_SECTION: + // 16-bit section index of the section that contains the target. 
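(Backing up to the DIR32NB case above: an image-relative RVA needs the image base, which RuntimeDyld does not model, so Sections[0]'s load address stands in for it, as the NOTE says. The arithmetic as a sketch, reflecting that approximation -- coffDir32NB is hypothetical, not from the patch:)

#include <cstdint>

static uint32_t coffDir32NB(uint64_t TargetVA, uint64_t PseudoImageBase) {
  // PseudoImageBase is the first section's load address, an approximation
  // of the real image base per the NOTE in the case above.
  return static_cast<uint32_t>(TargetVA - PseudoImageBase);
}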
+ assert(static_cast<int32_t>(RE.SectionID) <= INT16_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(RE.SectionID) >= INT16_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_SECTION Value: " << RE.SectionID + << '\n'); + writeBytesUnaligned(RE.SectionID, Target, 2); + break; + case COFF::IMAGE_REL_I386_SECREL: + // 32-bit offset of the target from the beginning of its section. + assert(static_cast<int32_t>(RE.Addend) <= INT32_MAX && + "relocation overflow"); + assert(static_cast<int32_t>(RE.Addend) >= INT32_MIN && + "relocation underflow"); + DEBUG(dbgs() << "\t\tOffset: " << RE.Offset + << " RelType: IMAGE_REL_I386_SECREL Value: " << RE.Addend + << '\n'); + writeBytesUnaligned(RE.Addend, Target, 4); + break; + default: + llvm_unreachable("unsupported relocation type"); + } + } + + void registerEHFrames() override {} + void deregisterEHFrames() override {} + + void finalizeLoad(const ObjectFile &Obj, + ObjSectionToIDMap &SectionMap) override {} +}; + +} + +#endif + diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h index 408227e..25f538d 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h @@ -62,7 +62,7 @@ public: // symbol in the target address space. void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *Target = Section.Address + RE.Offset; + uint8_t *Target = Section.getAddressWithOffset(RE.Offset); switch (RE.RelType) { @@ -72,8 +72,7 @@ public: case COFF::IMAGE_REL_AMD64_REL32_3: case COFF::IMAGE_REL_AMD64_REL32_4: case COFF::IMAGE_REL_AMD64_REL32_5: { - uint32_t *TargetAddress = (uint32_t *)Target; - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); // Delta is the distance from the start of the reloc to the end of the // instruction with the reloc. uint64_t Delta = 4 + (RE.RelType - COFF::IMAGE_REL_AMD64_REL32); @@ -81,7 +80,7 @@ public: uint64_t Result = Value + RE.Addend; assert(((int64_t)Result <= INT32_MAX) && "Relocation overflow"); assert(((int64_t)Result >= INT32_MIN) && "Relocation underflow"); - *TargetAddress = Result; + writeBytesUnaligned(Result, Target, 4); break; } @@ -92,14 +91,12 @@ public: // within a 32-bit offset from the base. // // For now we just set these to zero. - uint32_t *TargetAddress = (uint32_t *)Target; - *TargetAddress = 0; + writeBytesUnaligned(0, Target, 4); break; } case COFF::IMAGE_REL_AMD64_ADDR64: { - uint64_t *TargetAddress = (uint64_t *)Target; - *TargetAddress = Value + RE.Addend; + writeBytesUnaligned(Value + RE.Addend, Target, 8); break; } @@ -119,8 +116,7 @@ public: symbol_iterator Symbol = RelI->getSymbol(); if (Symbol == Obj.symbol_end()) report_fatal_error("Unknown symbol in relocation"); - section_iterator SecI(Obj.section_end()); - Symbol->getSection(SecI); + section_iterator SecI = *Symbol->getSection(); // If there is no section, this must be an external reference.
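All six IMAGE_REL_AMD64_REL32_N cases above share one formula: the stored displacement is measured from the end of the instruction, and the _N suffix gives how many instruction bytes trail the 4-byte field (the Delta = 4 + N above). Isolated as a sketch (hypothetical helper, not from the patch):

#include <cstdint>

static uint32_t amd64Rel32(uint64_t Target, uint64_t FieldAddr, unsigned N) {
  // N = RelType - IMAGE_REL_AMD64_REL32, i.e. trailing instruction bytes.
  return static_cast<uint32_t>(Target - (FieldAddr + 4 + N));
}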
const bool IsExtern = SecI == Obj.section_end(); @@ -129,7 +125,7 @@ public: uint64_t Offset = RelI->getOffset(); uint64_t Addend = 0; SectionEntry &Section = Sections[SectionID]; - uintptr_t ObjTarget = Section.ObjAddress + Offset; + uintptr_t ObjTarget = Section.getObjAddress() + Offset; switch (RelType) { @@ -140,14 +136,14 @@ public: case COFF::IMAGE_REL_AMD64_REL32_4: case COFF::IMAGE_REL_AMD64_REL32_5: case COFF::IMAGE_REL_AMD64_ADDR32NB: { - uint32_t *Displacement = (uint32_t *)ObjTarget; - Addend = *Displacement; + uint8_t *Displacement = (uint8_t *)ObjTarget; + Addend = readBytesUnaligned(Displacement, 4); break; } case COFF::IMAGE_REL_AMD64_ADDR64: { - uint64_t *Displacement = (uint64_t *)ObjTarget; - Addend = *Displacement; + uint8_t *Displacement = (uint8_t *)ObjTarget; + Addend = readBytesUnaligned(Displacement, 8); break; } @@ -182,9 +178,9 @@ public: unsigned getStubAlignment() override { return 1; } void registerEHFrames() override { for (auto const &EHFrameSID : UnregisteredEHFrameSections) { - uint8_t *EHFrameAddr = Sections[EHFrameSID].Address; - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress; - size_t EHFrameSize = Sections[EHFrameSID].Size; + uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); + uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); + size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); RegisteredEHFrameSections.push_back(EHFrameSID); } diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h index 7bf7641..dbca377 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h @@ -34,7 +34,7 @@ public: /// Extract the addend encoded in the instruction / memory location. int64_t decodeAddend(const RelocationEntry &RE) const { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); unsigned NumBytes = 1 << RE.Size; int64_t Addend = 0; // Verify that the relocation has the correct size and alignment. @@ -272,15 +272,14 @@ public: RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI)); RE.Addend = decodeAddend(RE); - RelocationValueRef Value( - getRelocationValueRef(Obj, RelI, RE, ObjSectionToID)); assert((ExplicitAddend == 0 || RE.Addend == 0) && "Relocation has "\ "ARM64_RELOC_ADDEND and embedded addend in the instruction."); - if (ExplicitAddend) { + if (ExplicitAddend) RE.Addend = ExplicitAddend; - Value.Offset = ExplicitAddend; - } + + RelocationValueRef Value( + getRelocationValueRef(Obj, RelI, RE, ObjSectionToID)); bool IsExtern = Obj.getPlainRelocationExternal(RelInfo); if (!IsExtern && RE.IsPCRel) @@ -305,7 +304,7 @@ public: DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); MachO::RelocationInfoType RelType = static_cast<MachO::RelocationInfoType>(RE.RelType); @@ -325,7 +324,7 @@ public: case MachO::ARM64_RELOC_BRANCH26: { assert(RE.IsPCRel && "not PCRel and ARM64_RELOC_BRANCH26 not supported"); // Check if branch is in range. 
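The two PC-relative AArch64 cases resolved just below follow fixed hardware rules: a B/BL displacement is a signed 26-bit count of 4-byte words (so +/-128MiB reach), and adrp materializes a 4KiB page, so both operands are rounded down to a page boundary before differencing. Both rules as sketches (hypothetical helpers, not from the patch):

#include <cstdint>

static bool branch26InRange(int64_t ByteDelta) {
  return (ByteDelta & 3) == 0 && ByteDelta >= -(int64_t(1) << 27) &&
         ByteDelta < (int64_t(1) << 27);
}

static int64_t page21Delta(uint64_t Target, int64_t Addend, uint64_t PC) {
  const uint64_t PageMask = ~uint64_t(0xfff); // 4KiB pages
  return int64_t(((Target + Addend) & PageMask) - (PC & PageMask));
}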
- uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); int64_t PCRelVal = Value - FinalAddress + RE.Addend; encodeAddend(LocalAddress, /*Size=*/4, RelType, PCRelVal); break; @@ -334,7 +333,7 @@ case MachO::ARM64_RELOC_PAGE21: { assert(RE.IsPCRel && "not PCRel and ARM64_RELOC_PAGE21 not supported"); // Adjust for PC-relative relocation and offset. - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); int64_t PCRelVal = ((Value + RE.Addend) & (-4096)) - (FinalAddress & (-4096)); encodeAddend(LocalAddress, /*Size=*/4, RelType, PCRelVal); @@ -376,10 +375,10 @@ private: else { // FIXME: There must be a better way to do this than to check and fix the // alignment every time!!! - uintptr_t BaseAddress = uintptr_t(Section.Address); + uintptr_t BaseAddress = uintptr_t(Section.getAddress()); uintptr_t StubAlignment = getStubAlignment(); uintptr_t StubAddress = - (BaseAddress + Section.StubOffset + StubAlignment - 1) & + (BaseAddress + Section.getStubOffset() + StubAlignment - 1) & -StubAlignment; unsigned StubOffset = StubAddress - BaseAddress; Stubs[Value] = StubOffset; @@ -392,7 +391,7 @@ addRelocationForSymbol(GOTRE, Value.SymbolName); else addRelocationForSection(GOTRE, Value.SectionID); - Section.StubOffset = StubOffset + getMaxStubSize(); + Section.advanceStubOffset(getMaxStubSize()); Offset = static_cast<int64_t>(StubOffset); } RelocationEntry TargetRE(RE.SectionID, RE.Offset, RE.RelType, Offset, diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h index 0a24bb2..7731df0 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h @@ -35,7 +35,7 @@ public: int64_t decodeAddend(const RelocationEntry &RE) const { const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); switch (RE.RelType) { default: @@ -64,8 +64,10 @@ public: if (RelType == MachO::ARM_RELOC_HALF_SECTDIFF) return processHALFSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID); + else if (RelType == MachO::GENERIC_RELOC_VANILLA) + return processScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID); else - return ++++RelI; + return ++RelI; } RelocationEntry RE(getRelocationEntry(SectionID, Obj, RelI)); @@ -92,12 +94,12 @@ public: void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (RE.IsPCRel) { - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); Value -= FinalAddress; // ARM PCRel relocations have an effective-PC offset of two instructions // (four bytes in Thumb mode, 8 bytes in ARM mode).
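On the effective-PC comment that closes the hunk above: ARM reads PC as two instructions ahead of the executing instruction, so the bias subtracted from a PC-relative operand is 4 bytes in Thumb state and 8 in ARM state. As a sketch (hypothetical helper, not from the patch):

#include <cstdint>

static int64_t armPCRelOperand(uint64_t Target, uint64_t FixupAddr,
                               bool IsThumb) {
  // The hardware PC is two instructions ahead of the fixup address.
  return static_cast<int64_t>(Target - (FixupAddr + (IsThumb ? 4 : 8)));
}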
@@ -130,8 +132,8 @@ public: break; } case MachO::ARM_RELOC_HALF_SECTDIFF: { - uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress; - uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress; + uint64_t SectionABase = Sections[RE.Sections.SectionA].getLoadAddress(); + uint64_t SectionBBase = Sections[RE.Sections.SectionB].getLoadAddress(); assert((Value == SectionABase || Value == SectionBBase) && "Unexpected HALFSECTDIFF relocation value."); Value = SectionABase - SectionBBase + RE.Addend; @@ -178,21 +180,21 @@ private: RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value); uint8_t *Addr; if (i != Stubs.end()) { - Addr = Section.Address + i->second; + Addr = Section.getAddressWithOffset(i->second); } else { // Create a new stub function. - Stubs[Value] = Section.StubOffset; - uint8_t *StubTargetAddr = - createStubFunction(Section.Address + Section.StubOffset); - RelocationEntry StubRE(RE.SectionID, StubTargetAddr - Section.Address, - MachO::GENERIC_RELOC_VANILLA, Value.Offset, false, - 2); + Stubs[Value] = Section.getStubOffset(); + uint8_t *StubTargetAddr = createStubFunction( + Section.getAddressWithOffset(Section.getStubOffset())); + RelocationEntry StubRE( + RE.SectionID, StubTargetAddr - Section.getAddress(), + MachO::GENERIC_RELOC_VANILLA, Value.Offset, false, 2); if (Value.SymbolName) addRelocationForSymbol(StubRE, Value.SymbolName); else addRelocationForSection(StubRE, Value.SectionID); - Addr = Section.Address + Section.StubOffset; - Section.StubOffset += getMaxStubSize(); + Addr = Section.getAddressWithOffset(Section.getStubOffset()); + Section.advanceStubOffset(getMaxStubSize()); } RelocationEntry TargetRE(RE.SectionID, RE.Offset, RE.RelType, 0, RE.IsPCRel, RE.Size); @@ -221,7 +223,7 @@ private: uint32_t RelocType = MachO.getAnyRelocationType(RE); bool IsPCRel = MachO.getAnyRelocationPCRel(RE); uint64_t Offset = RelI->getOffset(); - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); int64_t Immediate = readBytesUnaligned(LocalAddress, 4); // Copy the whole instruction out. Immediate = ((Immediate >> 4) & 0xf000) | (Immediate & 0xfff); diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h index 569a078..85059d7 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h @@ -47,8 +47,7 @@ public: return processSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID); else if (RelType == MachO::GENERIC_RELOC_VANILLA) - return processI386ScatteredVANILLA(SectionID, RelI, Obj, - ObjSectionToID); + return processScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID); llvm_unreachable("Unhandled scattered relocation."); } @@ -84,10 +83,10 @@ public: DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); if (RE.IsPCRel) { - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation. 
} @@ -99,8 +98,8 @@ public: break; case MachO::GENERIC_RELOC_SECTDIFF: case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: { - uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress; - uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress; + uint64_t SectionABase = Sections[RE.Sections.SectionA].getLoadAddress(); + uint64_t SectionBBase = Sections[RE.Sections.SectionB].getLoadAddress(); assert((Value == SectionABase || Value == SectionBBase) && "Unexpected SECTDIFF relocation value."); Value = SectionABase - SectionBBase + RE.Addend; @@ -139,7 +138,7 @@ private: bool IsPCRel = Obj.getAnyRelocationPCRel(RE); unsigned Size = Obj.getAnyRelocationLength(RE); uint64_t Offset = RelI->getOffset(); - uint8_t *LocalAddress = Section.Address + Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); unsigned NumBytes = 1 << Size; uint64_t Addend = readBytesUnaligned(LocalAddress, NumBytes); @@ -183,41 +182,6 @@ private: return ++RelI; } - relocation_iterator processI386ScatteredVANILLA( - unsigned SectionID, relocation_iterator RelI, - const ObjectFile &BaseObjT, - RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) { - const MachOObjectFile &Obj = - static_cast<const MachOObjectFile&>(BaseObjT); - MachO::any_relocation_info RE = - Obj.getRelocation(RelI->getRawDataRefImpl()); - - SectionEntry &Section = Sections[SectionID]; - uint32_t RelocType = Obj.getAnyRelocationType(RE); - bool IsPCRel = Obj.getAnyRelocationPCRel(RE); - unsigned Size = Obj.getAnyRelocationLength(RE); - uint64_t Offset = RelI->getOffset(); - uint8_t *LocalAddress = Section.Address + Offset; - unsigned NumBytes = 1 << Size; - int64_t Addend = readBytesUnaligned(LocalAddress, NumBytes); - - unsigned SymbolBaseAddr = Obj.getScatteredRelocationValue(RE); - section_iterator TargetSI = getSectionByAddress(Obj, SymbolBaseAddr); - assert(TargetSI != Obj.section_end() && "Can't find section for symbol"); - uint64_t SectionBaseAddr = TargetSI->getAddress(); - SectionRef TargetSection = *TargetSI; - bool IsCode = TargetSection.isText(); - uint32_t TargetSectionID = - findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID); - - Addend -= SectionBaseAddr; - RelocationEntry R(SectionID, Offset, RelocType, Addend, IsPCRel, Size); - - addRelocationForSection(R, TargetSectionID); - - return ++RelI; - } - // Populate stubs in __jump_table section. 
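The GENERIC_RELOC_SECTDIFF/LOCAL_SECTDIFF cases above (and their ARM HALF_SECTDIFF and x86-64 SUBTRACTOR cousins elsewhere in this patch) all reduce to the same arithmetic once both section bases are known; the incoming Value only identifies which of the two bases the callback delivered. As a sketch (hypothetical helper, not from the patch):

#include <cstdint>

static uint64_t sectDiff(uint64_t SectionABase, uint64_t SectionBBase,
                         int64_t Addend) {
  // The object file encoded A - B + addend at link time.
  return SectionABase - SectionBBase + static_cast<uint64_t>(Addend);
}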
void populateJumpTable(const MachOObjectFile &Obj, const SectionRef &JTSection, unsigned JTSectionID) { diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h index dd56e72..2242295 100644 --- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h +++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h @@ -39,6 +39,10 @@ public: static_cast<const MachOObjectFile &>(BaseObjT); MachO::any_relocation_info RelInfo = Obj.getRelocation(RelI->getRawDataRefImpl()); + uint32_t RelType = Obj.getAnyRelocationType(RelInfo); + + if (RelType == MachO::X86_64_RELOC_SUBTRACTOR) + return processSubtractRelocation(SectionID, RelI, Obj, ObjSectionToID); assert(!Obj.isRelocationScattered(RelInfo) && "Scattered relocations not supported on X86_64"); @@ -69,14 +73,14 @@ public: void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override { DEBUG(dumpRelocationToResolve(RE, Value)); const SectionEntry &Section = Sections[RE.SectionID]; - uint8_t *LocalAddress = Section.Address + RE.Offset; + uint8_t *LocalAddress = Section.getAddressWithOffset(RE.Offset); // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (RE.IsPCRel) { // FIXME: It seems this value needs to be adjusted by 4 for an effective // PC address. Is that expected? Only for branches, perhaps? - uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + uint64_t FinalAddress = Section.getLoadAddressWithOffset(RE.Offset); Value -= FinalAddress + 4; } @@ -91,9 +95,17 @@ public: case MachO::X86_64_RELOC_BRANCH: writeBytesUnaligned(Value + RE.Addend, LocalAddress, 1 << RE.Size); break; + case MachO::X86_64_RELOC_SUBTRACTOR: { + uint64_t SectionABase = Sections[RE.Sections.SectionA].getLoadAddress(); + uint64_t SectionBBase = Sections[RE.Sections.SectionB].getLoadAddress(); + assert((Value == SectionABase || Value == SectionBBase) && + "Unexpected SUBTRACTOR relocation value."); + Value = SectionABase - SectionBBase + RE.Addend; + writeBytesUnaligned(Value, LocalAddress, 1 << RE.Size); + break; + } case MachO::X86_64_RELOC_GOT_LOAD: case MachO::X86_64_RELOC_GOT: - case MachO::X86_64_RELOC_SUBTRACTOR: case MachO::X86_64_RELOC_TLV: Error("Relocation type not implemented yet!"); } @@ -112,24 +124,65 @@ private: RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value); uint8_t *Addr; if (i != Stubs.end()) { - Addr = Section.Address + i->second; + Addr = Section.getAddressWithOffset(i->second); } else { - Stubs[Value] = Section.StubOffset; - uint8_t *GOTEntry = Section.Address + Section.StubOffset; - RelocationEntry GOTRE(RE.SectionID, Section.StubOffset, + Stubs[Value] = Section.getStubOffset(); + uint8_t *GOTEntry = Section.getAddressWithOffset(Section.getStubOffset()); + RelocationEntry GOTRE(RE.SectionID, Section.getStubOffset(), MachO::X86_64_RELOC_UNSIGNED, Value.Offset, false, 3); if (Value.SymbolName) addRelocationForSymbol(GOTRE, Value.SymbolName); else addRelocationForSection(GOTRE, Value.SectionID); - Section.StubOffset += 8; + Section.advanceStubOffset(8); Addr = GOTEntry; } RelocationEntry TargetRE(RE.SectionID, RE.Offset, MachO::X86_64_RELOC_UNSIGNED, RE.Addend, true, 2); resolveRelocation(TargetRE, (uint64_t)Addr); } + + relocation_iterator + processSubtractRelocation(unsigned SectionID, relocation_iterator RelI, + const ObjectFile &BaseObjT, + ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile &Obj = + 
static_cast<const MachOObjectFile&>(BaseObjT); + MachO::any_relocation_info RE = + Obj.getRelocation(RelI->getRawDataRefImpl()); + + unsigned Size = Obj.getAnyRelocationLength(RE); + uint64_t Offset = RelI->getOffset(); + uint8_t *LocalAddress = Sections[SectionID].getAddressWithOffset(Offset); + unsigned NumBytes = 1 << Size; + + ErrorOr<StringRef> SubtrahendNameOrErr = RelI->getSymbol()->getName(); + if (auto EC = SubtrahendNameOrErr.getError()) + report_fatal_error(EC.message()); + auto SubtrahendI = GlobalSymbolTable.find(*SubtrahendNameOrErr); + unsigned SectionBID = SubtrahendI->second.getSectionID(); + uint64_t SectionBOffset = SubtrahendI->second.getOffset(); + int64_t Addend = + SignExtend64(readBytesUnaligned(LocalAddress, NumBytes), NumBytes * 8); + + ++RelI; + ErrorOr<StringRef> MinuendNameOrErr = RelI->getSymbol()->getName(); + if (auto EC = MinuendNameOrErr.getError()) + report_fatal_error(EC.message()); + auto MinuendI = GlobalSymbolTable.find(*MinuendNameOrErr); + unsigned SectionAID = MinuendI->second.getSectionID(); + uint64_t SectionAOffset = MinuendI->second.getOffset(); + + RelocationEntry R(SectionID, Offset, MachO::X86_64_RELOC_SUBTRACTOR, (uint64_t)Addend, + SectionAID, SectionAOffset, SectionBID, SectionBOffset, + false, Size); + + addRelocationForSection(R, SectionAID); + + return ++RelI; + } + }; } diff --git a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp index 5986084..1ad5f17 100644 --- a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp +++ b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp @@ -15,6 +15,7 @@ #include "llvm/Config/config.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/Process.h" namespace llvm { @@ -48,16 +49,27 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, // Look in the list of free memory regions and use a block there if one // is available. - for (int i = 0, e = MemGroup.FreeMem.size(); i != e; ++i) { - sys::MemoryBlock &MB = MemGroup.FreeMem[i]; - if (MB.size() >= RequiredSize) { - Addr = (uintptr_t)MB.base(); - uintptr_t EndOfBlock = Addr + MB.size(); + for (FreeMemBlock &FreeMB : MemGroup.FreeMem) { + if (FreeMB.Free.size() >= RequiredSize) { + Addr = (uintptr_t)FreeMB.Free.base(); + uintptr_t EndOfBlock = Addr + FreeMB.Free.size(); // Align the address. Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1); - // Store cutted free memory block. 
- MemGroup.FreeMem[i] = sys::MemoryBlock((void*)(Addr + Size), - EndOfBlock - Addr - Size); + + if (FreeMB.PendingPrefixIndex == (unsigned)-1) { + // The part of the block we're giving out to the user is now pending + MemGroup.PendingMem.push_back(sys::MemoryBlock((void *)Addr, Size)); + + // Remember this pending block, such that future allocations can just + // modify it rather than creating a new one + FreeMB.PendingPrefixIndex = MemGroup.PendingMem.size() - 1; + } else { + sys::MemoryBlock &PendingMB = MemGroup.PendingMem[FreeMB.PendingPrefixIndex]; + PendingMB = sys::MemoryBlock(PendingMB.base(), Addr + Size - (uintptr_t)PendingMB.base()); + } + + // Remember how much free space is now left in this block + FreeMB.Free = sys::MemoryBlock((void *)(Addr + Size), EndOfBlock - Addr - Size); return (uint8_t*)Addr; } } @@ -85,6 +97,7 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, // Save this address as the basis for our next request MemGroup.Near = MB; + // Remember that we allocated this memory MemGroup.AllocatedMem.push_back(MB); Addr = (uintptr_t)MB.base(); uintptr_t EndOfBlock = Addr + MB.size(); @@ -92,11 +105,18 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, // Align the address. Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1); + // The part of the block we're giving out to the user is now pending + MemGroup.PendingMem.push_back(sys::MemoryBlock((void *)Addr, Size)); + // The allocateMappedMemory may allocate much more memory than we need. In // this case, we store the unused memory as a free memory block. unsigned FreeSize = EndOfBlock-Addr-Size; - if (FreeSize > 16) - MemGroup.FreeMem.push_back(sys::MemoryBlock((void*)(Addr + Size), FreeSize)); + if (FreeSize > 16) { + FreeMemBlock FreeMB; + FreeMB.Free = sys::MemoryBlock((void*)(Addr + Size), FreeSize); + FreeMB.PendingPrefixIndex = (unsigned)-1; + MemGroup.FreeMem.push_back(FreeMB); + } // Return aligned address return (uint8_t*)Addr; @@ -107,9 +127,6 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) // FIXME: Should in-progress permissions be reverted if an error occurs? std::error_code ec; - // Don't allow free memory blocks to be used after setting protection flags. - CodeMem.FreeMem.clear(); - // Make code memory executable. ec = applyMemoryGroupPermissions(CodeMem, sys::Memory::MF_READ | sys::Memory::MF_EXEC); @@ -120,9 +137,6 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) return true; } - // Don't allow free memory blocks to be used after setting protection flags. - RODataMem.FreeMem.clear(); - // Make read-only data memory read-only. 
ec = applyMemoryGroupPermissions(RODataMem, sys::Memory::MF_READ | sys::Memory::MF_EXEC); @@ -143,36 +157,62 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg) return false; } +static sys::MemoryBlock trimBlockToPageSize(sys::MemoryBlock M) { + static const size_t PageSize = sys::Process::getPageSize(); + + size_t StartOverlap = + (PageSize - ((uintptr_t)M.base() % PageSize)) % PageSize; + + size_t TrimmedSize = M.size(); + TrimmedSize -= StartOverlap; + TrimmedSize -= TrimmedSize % PageSize; + + sys::MemoryBlock Trimmed((void *)((uintptr_t)M.base() + StartOverlap), TrimmedSize); + + assert(((uintptr_t)Trimmed.base() % PageSize) == 0); + assert((Trimmed.size() % PageSize) == 0); + assert(M.base() <= Trimmed.base() && Trimmed.size() <= M.size()); + + return Trimmed; +} + + std::error_code SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup, unsigned Permissions) { - - for (int i = 0, e = MemGroup.AllocatedMem.size(); i != e; ++i) { - std::error_code ec; - ec = - sys::Memory::protectMappedMemory(MemGroup.AllocatedMem[i], Permissions); - if (ec) { - return ec; - } + for (sys::MemoryBlock &MB : MemGroup.PendingMem) + if (std::error_code EC = sys::Memory::protectMappedMemory(MB, Permissions)) + return EC; + + MemGroup.PendingMem.clear(); + + // Now go through free blocks and trim any of them that don't span the entire + // page because one of the pending blocks may have overlapped it. + for (FreeMemBlock &FreeMB : MemGroup.FreeMem) { + FreeMB.Free = trimBlockToPageSize(FreeMB.Free); + // We cleared the PendingMem list, so all these pointers are now invalid + FreeMB.PendingPrefixIndex = (unsigned)-1; } + // Remove all blocks which are now empty + MemGroup.FreeMem.erase( + std::remove_if(MemGroup.FreeMem.begin(), MemGroup.FreeMem.end(), + [](FreeMemBlock &FreeMB) { return FreeMB.Free.size() == 0; }), + MemGroup.FreeMem.end()); + return std::error_code(); } void SectionMemoryManager::invalidateInstructionCache() { - for (int i = 0, e = CodeMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::InvalidateInstructionCache(CodeMem.AllocatedMem[i].base(), - CodeMem.AllocatedMem[i].size()); + for (sys::MemoryBlock &Block : CodeMem.PendingMem) + sys::Memory::InvalidateInstructionCache(Block.base(), Block.size()); } SectionMemoryManager::~SectionMemoryManager() { - for (unsigned i = 0, e = CodeMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::releaseMappedMemory(CodeMem.AllocatedMem[i]); - for (unsigned i = 0, e = RWDataMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::releaseMappedMemory(RWDataMem.AllocatedMem[i]); - for (unsigned i = 0, e = RODataMem.AllocatedMem.size(); i != e; ++i) - sys::Memory::releaseMappedMemory(RODataMem.AllocatedMem[i]); + for (MemoryGroup *Group : {&CodeMem, &RWDataMem, &RODataMem}) { + for (sys::MemoryBlock &Block : Group->AllocatedMem) + sys::Memory::releaseMappedMemory(Block); + } } } // namespace llvm - diff --git a/contrib/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm/lib/IR/AsmWriter.cpp index b553f11..0ce44e1 100644 --- a/contrib/llvm/lib/IR/AsmWriter.cpp +++ b/contrib/llvm/lib/IR/AsmWriter.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -102,17 +103,9 @@ static OrderMap orderModule(const Module *M) { orderValue(&A, OM); } for (const Function &F : *M) { - if (F.hasPrefixData()) - if 
diff --git a/contrib/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm/lib/IR/AsmWriter.cpp
index b553f11..0ce44e1 100644
--- a/contrib/llvm/lib/IR/AsmWriter.cpp
+++ b/contrib/llvm/lib/IR/AsmWriter.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
@@ -102,17 +103,9 @@ static OrderMap orderModule(const Module *M) {
       orderValue(&A, OM);
   }
   for (const Function &F : *M) {
-    if (F.hasPrefixData())
-      if (!isa<GlobalValue>(F.getPrefixData()))
-        orderValue(F.getPrefixData(), OM);
-
-    if (F.hasPrologueData())
-      if (!isa<GlobalValue>(F.getPrologueData()))
-        orderValue(F.getPrologueData(), OM);
-
-    if (F.hasPersonalityFn())
-      if (!isa<GlobalValue>(F.getPersonalityFn()))
-        orderValue(F.getPersonalityFn(), OM);
+    for (const Use &U : F.operands())
+      if (!isa<GlobalValue>(U.get()))
+        orderValue(U.get(), OM);
 
     orderValue(&F, OM);
 
@@ -232,8 +225,7 @@ static UseListOrderStack predictUseListOrder(const Module *M) {
   // We want to visit the functions backward now so we can list function-local
  // constants in the last Function they're used in.  Module-level constants
   // have already been visited above.
-  for (auto I = M->rbegin(), E = M->rend(); I != E; ++I) {
-    const Function &F = *I;
+  for (const Function &F : make_range(M->rbegin(), M->rend())) {
     if (F.isDeclaration())
       continue;
     for (const BasicBlock &BB : F)
@@ -263,8 +255,8 @@ static UseListOrderStack predictUseListOrder(const Module *M) {
   for (const GlobalAlias &A : M->aliases())
     predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
   for (const Function &F : *M)
-    if (F.hasPrefixData())
-      predictValueUseListOrder(F.getPrefixData(), nullptr, OM, Stack);
+    for (const Use &U : F.operands())
+      predictValueUseListOrder(U.get(), nullptr, OM, Stack);
 
   return Stack;
 }
@@ -304,6 +296,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::AnyReg:        Out << "anyregcc"; break;
   case CallingConv::PreserveMost:  Out << "preserve_mostcc"; break;
   case CallingConv::PreserveAll:   Out << "preserve_allcc"; break;
+  case CallingConv::CXX_FAST_TLS:  Out << "cxx_fast_tlscc"; break;
   case CallingConv::GHC:           Out << "ghccc"; break;
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
@@ -320,6 +313,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::X86_64_Win64:  Out << "x86_64_win64cc"; break;
   case CallingConv::SPIR_FUNC:     Out << "spir_func"; break;
   case CallingConv::SPIR_KERNEL:   Out << "spir_kernel"; break;
+  case CallingConv::X86_INTR:      Out << "x86_intrcc"; break;
+  case CallingConv::HHVM:          Out << "hhvmcc"; break;
+  case CallingConv::HHVM_C:        Out << "hhvm_ccc"; break;
   }
 }
 
@@ -343,18 +339,8 @@ enum PrefixType {
   NoPrefix
 };
 
-/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
-/// prefixed with % (if the string only contains simple characters) or is
-/// surrounded with ""'s (if it has special chars in it). Print it out.
-static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
+void llvm::printLLVMNameWithoutPrefix(raw_ostream &OS, StringRef Name) {
   assert(!Name.empty() && "Cannot get empty name!");
-  switch (Prefix) {
-  case NoPrefix: break;
-  case GlobalPrefix: OS << '@'; break;
-  case ComdatPrefix: OS << '$'; break;
-  case LabelPrefix: break;
-  case LocalPrefix: OS << '%'; break;
-  }
 
   // Scan the name to see if it needs quotes first.
   bool NeedsQuotes = isdigit(static_cast<unsigned char>(Name[0]));
@@ -386,9 +372,31 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
   OS << '"';
 }
 
-/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
-/// prefixed with % (if the string only contains simple characters) or is
-/// surrounded with ""'s (if it has special chars in it). Print it out.
+/// Turn the specified name into an 'LLVM name', which is either prefixed with %
+/// (if the string only contains simple characters) or is surrounded with ""'s
+/// (if it has special chars in it). Print it out.
+static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
+  switch (Prefix) {
+  case NoPrefix:
+    break;
+  case GlobalPrefix:
+    OS << '@';
+    break;
+  case ComdatPrefix:
+    OS << '$';
+    break;
+  case LabelPrefix:
+    break;
+  case LocalPrefix:
+    OS << '%';
+    break;
+  }
+  printLLVMNameWithoutPrefix(OS, Name);
+}
+
+/// Turn the specified name into an 'LLVM name', which is either prefixed with %
+/// (if the string only contains simple characters) or is surrounded with ""'s
+/// (if it has special chars in it). Print it out.
 static void PrintLLVMName(raw_ostream &OS, const Value *V) {
   PrintLLVMName(OS, V->getName(),
                 isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
@@ -456,6 +464,7 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
   case Type::LabelTyID:     OS << "label"; return;
   case Type::MetadataTyID:  OS << "metadata"; return;
   case Type::X86_MMXTyID:   OS << "x86_mmx"; return;
+  case Type::TokenTyID:     OS << "token"; return;
   case Type::IntegerTyID:
     OS << 'i' << cast<IntegerType>(Ty)->getBitWidth();
     return;
@@ -691,8 +700,9 @@ void ModuleSlotTracker::incorporateFunction(const Function &F) {
   this->F = &F;
 }
 
-static SlotTracker *createSlotTracker(const Module *M) {
-  return new SlotTracker(M);
+int ModuleSlotTracker::getLocalSlot(const Value *V) {
+  assert(F && "No function incorporated");
+  return Machine->getLocalSlot(V);
 }
 
 static SlotTracker *createSlotTracker(const Value *V) {
@@ -802,7 +812,7 @@ void SlotTracker::processFunction() {
   for(Function::const_arg_iterator AI = TheFunction->arg_begin(),
       AE = TheFunction->arg_end(); AI != AE; ++AI)
     if (!AI->hasName())
-      CreateFunctionSlot(AI);
+      CreateFunctionSlot(&*AI);
 
   ST_DEBUG("Inserting Instructions:\n");
 
@@ -1093,11 +1103,10 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     // the value back and get the same value.
     //
     bool ignored;
-    bool isHalf = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEhalf;
     bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble;
     bool isInf = CFP->getValueAPF().isInfinity();
     bool isNaN = CFP->getValueAPF().isNaN();
-    if (!isHalf && !isInf && !isNaN) {
+    if (!isInf && !isNaN) {
       double Val = isDouble ? CFP->getValueAPF().convertToDouble() :
                               CFP->getValueAPF().convertToFloat();
       SmallString<128> StrVal;
@@ -1123,15 +1132,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     // x86, so we must not use these types.
     static_assert(sizeof(double) == sizeof(uint64_t),
                   "assuming that double is 64 bits!");
-    char Buffer[40];
     APFloat apf = CFP->getValueAPF();
-    // Halves and floats are represented in ASCII IR as double, convert.
+    // Floats are represented in ASCII IR as double, convert.
     if (!isDouble)
      apf.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
                   &ignored);
-    Out << "0x" <<
-            utohex_buffer(uint64_t(apf.bitcastToAPInt().getZExtValue()),
-                          Buffer+40);
+    Out << format_hex(apf.bitcastToAPInt().getZExtValue(), 0, /*Upper=*/true);
     return;
   }
 
@@ -1139,60 +1145,32 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
   // These appear as a magic letter identifying the type, then a
   // fixed number of hex digits.
   Out << "0x";
-  // Bit position, in the current word, of the next nibble to print.
-  int shiftcount;
-
+  APInt API = CFP->getValueAPF().bitcastToAPInt();
   if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) {
     Out << 'K';
-    // api needed to prevent premature destruction
-    APInt api = CFP->getValueAPF().bitcastToAPInt();
-    const uint64_t* p = api.getRawData();
-    uint64_t word = p[1];
-    shiftcount = 12;
-    int width = api.getBitWidth();
-    for (int j=0; j<width; j+=4, shiftcount-=4) {
-      unsigned int nibble = (word>>shiftcount) & 15;
-      if (nibble < 10)
-        Out << (unsigned char)(nibble + '0');
-      else
-        Out << (unsigned char)(nibble - 10 + 'A');
-      if (shiftcount == 0 && j+4 < width) {
-        word = *p;
-        shiftcount = 64;
-        if (width-j-4 < 64)
-          shiftcount = width-j-4;
-      }
-    }
+    Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
+                                /*Upper=*/true);
+    Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
     return;
   } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) {
-    shiftcount = 60;
     Out << 'L';
+    Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
+    Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
   } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) {
-    shiftcount = 60;
     Out << 'M';
+    Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
+    Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
+                                /*Upper=*/true);
   } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) {
-    shiftcount = 12;
     Out << 'H';
+    Out << format_hex_no_prefix(API.getZExtValue(), 4,
+                                /*Upper=*/true);
   } else
     llvm_unreachable("Unsupported floating point type");
-  // api needed to prevent premature destruction
-  APInt api = CFP->getValueAPF().bitcastToAPInt();
-  const uint64_t* p = api.getRawData();
-  uint64_t word = *p;
-  int width = api.getBitWidth();
-  for (int j=0; j<width; j+=4, shiftcount-=4) {
-    unsigned int nibble = (word>>shiftcount) & 15;
-    if (nibble < 10)
-      Out << (unsigned char)(nibble + '0');
-    else
-      Out << (unsigned char)(nibble - 10 + 'A');
-    if (shiftcount == 0 && j+4 < width) {
-      word = *(++p);
-      shiftcount = 64;
-      if (width-j-4 < 64)
-        shiftcount = width-j-4;
-    }
-  }
   return;
 }
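The rewritten printer above splits the raw APInt into fixed-width words and lets format_hex_no_prefix do the padding, instead of walking nibbles by hand. A standalone sketch of the resulting layout for the x87 80-bit case, with plain printf standing in for format_hex_no_prefix and a made-up sample value:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Print an x87 extended value the way the IR writer does: a leading 'K',
// the high 16 bits as 4 hex digits, then the low 64 bits as 16 digits.
static void printX87Hex(uint16_t HiBits, uint64_t LoBits) {
  std::printf("0xK%04" PRIX16 "%016" PRIX64 "\n", HiBits, LoBits);
}

int main() {
  // 1.0 as x87 extended: biased exponent 0x3FFF, explicit integer bit set.
  printX87Hex(0x3FFF, 0x8000000000000000ULL);
  // prints: 0xK3FFF8000000000000000
  return 0;
}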
@@ -1313,6 +1291,11 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     return;
   }
 
+  if (isa<ConstantTokenNone>(CV)) {
+    Out << "none";
+    return;
+  }
+
   if (isa<UndefValue>(CV)) {
     Out << "undef";
     return;
@@ -1326,10 +1309,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     Out << " (";
 
     if (const GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
-      TypePrinter.print(
-          cast<PointerType>(GEP->getPointerOperandType()->getScalarType())
-              ->getElementType(),
-          Out);
+      TypePrinter.print(GEP->getSourceElementType(), Out);
       Out << ", ";
     }
 
@@ -1409,6 +1389,7 @@ struct MDFieldPrinter {
       : Out(Out), TypePrinter(TypePrinter), Machine(Machine), Context(Context) {
   }
   void printTag(const DINode *N);
+  void printMacinfoType(const DIMacroNode *N);
   void printString(StringRef Name, StringRef Value,
                    bool ShouldSkipEmpty = true);
   void printMetadata(StringRef Name, const Metadata *MD,
@@ -1431,6 +1412,14 @@ void MDFieldPrinter::printTag(const DINode *N) {
     Out << N->getTag();
 }
 
+void MDFieldPrinter::printMacinfoType(const DIMacroNode *N) {
+  Out << FS << "type: ";
+  if (const char *Type = dwarf::MacinfoString(N->getMacinfoType()))
+    Out << Type;
+  else
+    Out << N->getMacinfoType();
+}
+
 void MDFieldPrinter::printString(StringRef Name, StringRef Value,
                                  bool ShouldSkipEmpty) {
   if (ShouldSkipEmpty && Value.empty())
@@ -1656,6 +1645,7 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
   Printer.printMetadata("subprograms", N->getRawSubprograms());
   Printer.printMetadata("globals", N->getRawGlobalVariables());
   Printer.printMetadata("imports", N->getRawImportedEntities());
+  Printer.printMetadata("macros", N->getRawMacros());
   Printer.printInt("dwoId", N->getDWOId());
   Out << ")";
 }
@@ -1680,7 +1670,6 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
   Printer.printInt("virtualIndex", N->getVirtualIndex());
   Printer.printDIFlags("flags", N->getFlags());
   Printer.printBool("isOptimized", N->isOptimized());
-  Printer.printMetadata("function", N->getRawFunction());
   Printer.printMetadata("templateParams", N->getRawTemplateParams());
   Printer.printMetadata("declaration", N->getRawDeclaration());
   Printer.printMetadata("variables", N->getRawVariables());
@@ -1725,6 +1714,29 @@ static void writeDINamespace(raw_ostream &Out, const DINamespace *N,
   Out << ")";
 }
 
+static void writeDIMacro(raw_ostream &Out, const DIMacro *N,
+                         TypePrinting *TypePrinter, SlotTracker *Machine,
+                         const Module *Context) {
+  Out << "!DIMacro(";
+  MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+  Printer.printMacinfoType(N);
+  Printer.printInt("line", N->getLine());
+  Printer.printString("name", N->getName());
+  Printer.printString("value", N->getValue());
+  Out << ")";
+}
+
+static void writeDIMacroFile(raw_ostream &Out, const DIMacroFile *N,
+                             TypePrinting *TypePrinter, SlotTracker *Machine,
+                             const Module *Context) {
+  Out << "!DIMacroFile(";
+  MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
+  Printer.printInt("line", N->getLine());
+  Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
+  Printer.printMetadata("nodes", N->getRawElements());
+  Out << ")";
+}
+
 static void writeDIModule(raw_ostream &Out, const DIModule *N,
                           TypePrinting *TypePrinter, SlotTracker *Machine,
                           const Module *Context) {
@@ -1789,11 +1801,8 @@ static void writeDILocalVariable(raw_ostream &Out, const DILocalVariable *N,
                                  SlotTracker *Machine, const Module *Context) {
   Out << "!DILocalVariable(";
   MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
-  Printer.printTag(N);
   Printer.printString("name", N->getName());
-  Printer.printInt("arg", N->getArg(),
-                   /* ShouldSkipZero */
-                   N->getTag() == dwarf::DW_TAG_auto_variable);
+  Printer.printInt("arg", N->getArg());
   Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
   Printer.printMetadata("file", N->getRawFile());
   Printer.printInt("line", N->getLine());
@@ -1998,6 +2007,7 @@ class AssemblyWriter {
   TypePrinting TypePrinter;
   AssemblyAnnotationWriter *AnnotationWriter;
   SetVector<const Comdat *> Comdats;
+  bool IsForDebug;
   bool ShouldPreserveUseListOrder;
   UseListOrderStack UseListOrders;
   SmallVector<StringRef, 8> MDNames;
@@ -2005,12 +2015,7 @@ class AssemblyWriter {
 public:
   /// Construct an AssemblyWriter with an external SlotTracker
   AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac, const Module *M,
-                 AssemblyAnnotationWriter *AAW,
-                 bool ShouldPreserveUseListOrder = false);
-
-  /// Construct an AssemblyWriter with an internally allocated SlotTracker
-  AssemblyWriter(formatted_raw_ostream &o, const Module *M,
-                 AssemblyAnnotationWriter *AAW,
+                 AssemblyAnnotationWriter *AAW, bool IsForDebug,
                  bool ShouldPreserveUseListOrder = false);
 
   void printMDNodeBody(const MDNode *MD);
@@ -2020,6 +2025,7 @@ public:
 
   void writeOperand(const Value *Op, bool PrintType);
   void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
+  void writeOperandBundles(ImmutableCallSite CS);
   void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
   void writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
                           AtomicOrdering FailureOrdering,
@@ -2043,8 +2049,6 @@ public:
   void printUseLists(const Function *F);
 
 private:
-  void init();
-
   /// \brief Print out metadata attachments.
   void printMetadataAttachments(
      const SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs,
@@ -2056,11 +2060,16 @@ private:
 
   // printGCRelocateComment - print comment after call to the gc.relocate
   // intrinsic indicating base and derived pointer names.
-  void printGCRelocateComment(const Value &V);
+  void printGCRelocateComment(const GCRelocateInst &Relocate);
 };
 } // namespace
 
-void AssemblyWriter::init() {
+AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
+                               const Module *M, AssemblyAnnotationWriter *AAW,
+                               bool IsForDebug, bool ShouldPreserveUseListOrder)
+    : Out(o), TheModule(M), Machine(Mac), AnnotationWriter(AAW),
+      IsForDebug(IsForDebug),
+      ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
   if (!TheModule)
     return;
   TypePrinter.incorporateTypes(*TheModule);
@@ -2072,23 +2081,6 @@ void AssemblyWriter::init() {
     Comdats.insert(C);
 }
 
-AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
-                               const Module *M, AssemblyAnnotationWriter *AAW,
-                               bool ShouldPreserveUseListOrder)
-    : Out(o), TheModule(M), Machine(Mac), AnnotationWriter(AAW),
-      ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
-  init();
-}
-
-AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, const Module *M,
-                               AssemblyAnnotationWriter *AAW,
-                               bool ShouldPreserveUseListOrder)
-    : Out(o), TheModule(M), SlotTrackerStorage(createSlotTracker(M)),
-      Machine(*SlotTrackerStorage), AnnotationWriter(AAW),
-      ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
-  init();
-}
-
 void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
   if (!Operand) {
     Out << "<null operand!>";
@@ -2170,6 +2162,43 @@ void AssemblyWriter::writeParamOperand(const Value *Operand,
   WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
 }
 
+void AssemblyWriter::writeOperandBundles(ImmutableCallSite CS) {
+  if (!CS.hasOperandBundles())
+    return;
+
+  Out << " [ ";
+
+  bool FirstBundle = true;
+  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
+    OperandBundleUse BU = CS.getOperandBundleAt(i);
+
+    if (!FirstBundle)
+      Out << ", ";
+    FirstBundle = false;
+
+    Out << '"';
+    PrintEscapedString(BU.getTagName(), Out);
+    Out << '"';
+
+    Out << '(';
+
+    bool FirstInput = true;
+    for (const auto &Input : BU.Inputs) {
+      if (!FirstInput)
+        Out << ", ";
+      FirstInput = false;
+
+      TypePrinter.print(Input->getType(), Out);
+      Out << " ";
+      WriteAsOperandInternal(Out, Input, &TypePrinter, &Machine, TheModule);
+    }
+
+    Out << ')';
+  }
+
+  Out << " ]";
+}
+
 void AssemblyWriter::printModule(const Module *M) {
   Machine.initialize();
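The new writeOperandBundles hunk renders each bundle as a quoted tag followed by a parenthesized, comma-separated input list appended after the call, e.g. call void @f() [ "deopt"(i32 0, i32 1) ]. A minimal standalone sketch of the same separator discipline, with a hypothetical Bundle type and plain strings standing in for typed IR operands:

#include <iostream>
#include <string>
#include <vector>

struct Bundle {
  std::string Tag;
  std::vector<std::string> Inputs; // stands in for typed IR operands
};

// Mirror writeOperandBundles: print nothing when there are no bundles,
// otherwise append ' [ "tag"(in, in), ... ]' after the call text.
static void printBundles(std::ostream &Out, const std::vector<Bundle> &Bundles) {
  if (Bundles.empty())
    return;
  Out << " [ ";
  bool FirstBundle = true;
  for (const Bundle &B : Bundles) {
    if (!FirstBundle)
      Out << ", ";
    FirstBundle = false;
    Out << '"' << B.Tag << "\"(";
    bool FirstInput = true;
    for (const std::string &In : B.Inputs) {
      if (!FirstInput)
        Out << ", ";
      FirstInput = false;
      Out << In;
    }
    Out << ')';
  }
  Out << " ]";
}

int main() {
  std::cout << "call void @f()";
  printBundles(std::cout, {{"deopt", {"i32 0", "i32 1"}}});
  std::cout << '\n'; // call void @f() [ "deopt"(i32 0, i32 1) ]
  return 0;
}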
@@ -2422,6 +2451,10 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) {
 
   Out << "alias ";
 
+  TypePrinter.print(GA->getValueType(), Out);
+
+  Out << ", ";
+
   const Constant *Aliasee = GA->getAliasee();
 
   if (!Aliasee) {
@@ -2536,28 +2569,26 @@ void AssemblyWriter::printFunction(const Function *F) {
   Machine.incorporateFunction(F);
 
   // Loop over the arguments, printing them...
-
-  unsigned Idx = 1;
-  if (!F->isDeclaration()) {
-    // If this isn't a declaration, print the argument names as well.
-    for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
-         I != E; ++I) {
+  if (F->isDeclaration() && !IsForDebug) {
+    // We're only interested in the type here - don't print argument names.
+    for (unsigned I = 0, E = FT->getNumParams(); I != E; ++I) {
       // Insert commas as we go... the first arg doesn't get a comma
-      if (I != F->arg_begin()) Out << ", ";
-      printArgument(I, Attrs, Idx);
-      Idx++;
+      if (I)
+        Out << ", ";
+      // Output type...
+      TypePrinter.print(FT->getParamType(I), Out);
+
+      if (Attrs.hasAttributes(I + 1))
+        Out << ' ' << Attrs.getAsString(I + 1);
     }
   } else {
-    // Otherwise, print the types from the function type.
-    for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+    // The arguments are meaningful here, print them in detail.
+    unsigned Idx = 1;
+    for (const Argument &Arg : F->args()) {
       // Insert commas as we go... the first arg doesn't get a comma
-      if (i) Out << ", ";
-
-      // Output type...
-      TypePrinter.print(FT->getParamType(i), Out);
-
-      if (Attrs.hasAttributes(i+1))
-        Out << ' ' << Attrs.getAsString(i+1);
+      if (Idx != 1)
+        Out << ", ";
+      printArgument(&Arg, Attrs, Idx++);
     }
   }
 
@@ -2604,7 +2635,7 @@ void AssemblyWriter::printFunction(const Function *F) {
     Out << " {";
     // Output all of the function's basic blocks.
     for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I)
-      printBasicBlock(I);
+      printBasicBlock(&*I);
 
     // Output the function's use-lists.
     printUseLists(F);
@@ -2691,14 +2722,11 @@ void AssemblyWriter::printInstructionLine(const Instruction &I) {
 
 /// printGCRelocateComment - print comment after call to the gc.relocate
 /// intrinsic indicating base and derived pointer names.
-void AssemblyWriter::printGCRelocateComment(const Value &V) {
-  assert(isGCRelocate(&V));
-  GCRelocateOperands GCOps(cast<Instruction>(&V));
-
+void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) {
   Out << " ; (";
-  writeOperand(GCOps.getBasePtr(), false);
+  writeOperand(Relocate.getBasePtr(), false);
   Out << ", ";
-  writeOperand(GCOps.getDerivedPtr(), false);
+  writeOperand(Relocate.getDerivedPtr(), false);
   Out << ")";
 }
 
@@ -2706,8 +2734,8 @@ void AssemblyWriter::printGCRelocateComment(const Value &V) {
 /// which slot it occupies.
 ///
 void AssemblyWriter::printInfoComment(const Value &V) {
-  if (isGCRelocate(&V))
-    printGCRelocateComment(V);
+  if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V))
    printGCRelocateComment(*Relocate);
 
   if (AnnotationWriter)
     AnnotationWriter->printInfoComment(V, Out);
@@ -2738,6 +2766,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       Out << "musttail ";
     else if (CI->isTailCall())
      Out << "tail ";
+    else if (CI->isNoTailCall())
+      Out << "notail ";
  }
 
   // Print out the opcode...
@@ -2850,8 +2880,50 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       writeOperand(LPI->getClause(i), true);
     }
+  } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(&I)) {
+    Out << " within ";
+    writeOperand(CatchSwitch->getParentPad(), /*PrintType=*/false);
+    Out << " [";
+    unsigned Op = 0;
+    for (const BasicBlock *PadBB : CatchSwitch->handlers()) {
+      if (Op > 0)
+        Out << ", ";
+      writeOperand(PadBB, /*PrintType=*/true);
+      ++Op;
+    }
+    Out << "] unwind ";
+    if (const BasicBlock *UnwindDest = CatchSwitch->getUnwindDest())
+      writeOperand(UnwindDest, /*PrintType=*/true);
+    else
+      Out << "to caller";
+  } else if (const auto *FPI = dyn_cast<FuncletPadInst>(&I)) {
+    Out << " within ";
+    writeOperand(FPI->getParentPad(), /*PrintType=*/false);
+    Out << " [";
+    for (unsigned Op = 0, NumOps = FPI->getNumArgOperands(); Op < NumOps;
+         ++Op) {
+      if (Op > 0)
+        Out << ", ";
+      writeOperand(FPI->getArgOperand(Op), /*PrintType=*/true);
+    }
+    Out << ']';
   } else if (isa<ReturnInst>(I) && !Operand) {
     Out << " void";
+  } else if (const auto *CRI = dyn_cast<CatchReturnInst>(&I)) {
+    Out << " from ";
+    writeOperand(CRI->getOperand(0), /*PrintType=*/false);
+
+    Out << " to ";
+    writeOperand(CRI->getOperand(1), /*PrintType=*/true);
+  } else if (const auto *CRI = dyn_cast<CleanupReturnInst>(&I)) {
+    Out << " from ";
+    writeOperand(CRI->getOperand(0), /*PrintType=*/false);
+
+    Out << " unwind ";
+    if (CRI->hasUnwindDest())
+      writeOperand(CRI->getOperand(1), /*PrintType=*/true);
+    else
+      Out << "to caller";
   } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
     // Print the calling convention being used.
     if (CI->getCallingConv() != CallingConv::C) {
@@ -2892,6 +2964,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       Out << ')';
     if (PAL.hasAttributes(AttributeSet::FunctionIndex))
       Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
+
+    writeOperandBundles(CI);
+
   } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
     Operand = II->getCalledValue();
     FunctionType *FTy = cast<FunctionType>(II->getFunctionType());
@@ -2926,6 +3001,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     if (PAL.hasAttributes(AttributeSet::FunctionIndex))
       Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
 
+    writeOperandBundles(II);
+
     Out << "\n        to ";
     writeOperand(II->getNormalDest(), true);
     Out << " unwind ";
@@ -3044,7 +3121,7 @@ void AssemblyWriter::printMetadataAttachments(
     return;
 
   if (MDNames.empty())
-    TheModule->getMDKindNames(MDNames);
+    MDs[0].second->getContext().getMDKindNames(MDNames);
 
   for (const auto &I : MDs) {
     unsigned Kind = I.first;
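The new exception-handling cases above follow a common textual convention: a parent pad after "within", a bracketed operand list, and either an unwind label or the literal "to caller" when there is no unwind destination. A standalone sketch of that shape with a toy struct (the real CatchSwitchInst carries basic blocks, not strings):

#include <iostream>
#include <string>
#include <vector>

struct ToyCatchSwitch {
  std::string ParentPad;
  std::vector<std::string> Handlers;
  const std::string *UnwindDest; // null means "unwind to caller"
};

static void printCatchSwitch(std::ostream &Out, const ToyCatchSwitch &CS) {
  Out << "catchswitch within " << CS.ParentPad << " [";
  unsigned Op = 0;
  for (const std::string &H : CS.Handlers) {
    if (Op > 0)
      Out << ", ";
    Out << "label " << H;
    ++Op;
  }
  Out << "] unwind ";
  if (CS.UnwindDest)
    Out << "label " << *CS.UnwindDest;
  else
    Out << "to caller";
  Out << '\n';
}

int main() {
  ToyCatchSwitch CS{"%ps", {"%handler1", "%handler2"}, nullptr};
  printCatchSwitch(std::cout, CS);
  // catchswitch within %ps [label %handler1, label %handler2] unwind to caller
  return 0;
}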
@@ -3138,29 +3215,23 @@ void AssemblyWriter::printUseLists(const Function *F) {
 // External Interface declarations
 //===----------------------------------------------------------------------===//
 
-void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
-  SlotTracker SlotTable(this->getParent());
-  formatted_raw_ostream OS(ROS);
-  AssemblyWriter W(OS, SlotTable, this->getParent(), AAW);
-  W.printFunction(this);
-}
-
 void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
-                   bool ShouldPreserveUseListOrder) const {
+                   bool ShouldPreserveUseListOrder, bool IsForDebug) const {
   SlotTracker SlotTable(this);
   formatted_raw_ostream OS(ROS);
-  AssemblyWriter W(OS, SlotTable, this, AAW, ShouldPreserveUseListOrder);
+  AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug,
+                   ShouldPreserveUseListOrder);
   W.printModule(this);
 }
 
-void NamedMDNode::print(raw_ostream &ROS) const {
+void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const {
   SlotTracker SlotTable(getParent());
   formatted_raw_ostream OS(ROS);
-  AssemblyWriter W(OS, SlotTable, getParent(), nullptr);
+  AssemblyWriter W(OS, SlotTable, getParent(), nullptr, IsForDebug);
   W.printNamedMDNode(this);
 }
 
-void Comdat::print(raw_ostream &ROS) const {
+void Comdat::print(raw_ostream &ROS, bool /*IsForDebug*/) const {
   PrintLLVMName(ROS, getName(), ComdatPrefix);
   ROS << " = comdat ";
 
@@ -3185,7 +3256,7 @@ void Comdat::print(raw_ostream &ROS) const {
   ROS << '\n';
 }
 
-void Type::print(raw_ostream &OS) const {
+void Type::print(raw_ostream &OS, bool /*IsForDebug*/) const {
   TypePrinting TP;
   TP.print(const_cast<Type*>(this), OS);
 
@@ -3208,7 +3279,7 @@ static bool isReferencingMDNode(const Instruction &I) {
   return false;
 }
 
-void Value::print(raw_ostream &ROS) const {
+void Value::print(raw_ostream &ROS, bool IsForDebug) const {
   bool ShouldInitializeAllMetadata = false;
   if (auto *I = dyn_cast<Instruction>(this))
     ShouldInitializeAllMetadata = isReferencingMDNode(*I);
@@ -3216,10 +3287,11 @@ void Value::print(raw_ostream &ROS) const {
     ShouldInitializeAllMetadata = true;
 
   ModuleSlotTracker MST(getModuleFromVal(this), ShouldInitializeAllMetadata);
-  print(ROS, MST);
+  print(ROS, MST, IsForDebug);
 }
 
-void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST) const {
+void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST,
+                  bool IsForDebug) const {
   formatted_raw_ostream OS(ROS);
   SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
   SlotTracker &SlotTable =
@@ -3231,14 +3303,14 @@ void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST) const {
   if (const Instruction *I = dyn_cast<Instruction>(this)) {
     incorporateFunction(I->getParent() ? I->getParent()->getParent() : nullptr);
-    AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), nullptr);
+    AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), nullptr, IsForDebug);
     W.printInstruction(*I);
   } else if (const BasicBlock *BB = dyn_cast<BasicBlock>(this)) {
     incorporateFunction(BB->getParent());
-    AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), nullptr);
+    AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), nullptr, IsForDebug);
     W.printBasicBlock(BB);
   } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(this)) {
-    AssemblyWriter W(OS, SlotTable, GV->getParent(), nullptr);
+    AssemblyWriter W(OS, SlotTable, GV->getParent(), nullptr, IsForDebug);
     if (const GlobalVariable *V = dyn_cast<GlobalVariable>(GV))
       W.printGlobal(V);
     else if (const Function *F = dyn_cast<Function>(GV))
@@ -3261,7 +3333,7 @@ void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST) const {
 
 /// Print without a type, skipping the TypePrinting object.
 ///
-/// \return \c true iff printing was succesful.
+/// \return \c true iff printing was successful.
 static bool printWithoutType(const Value &V, raw_ostream &O,
                              SlotTracker *Machine, const Module *M) {
   if (V.hasName() || isa<GlobalValue>(V) ||
@@ -3340,41 +3412,45 @@ void Metadata::printAsOperand(raw_ostream &OS, ModuleSlotTracker &MST,
   printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ true);
 }
 
-void Metadata::print(raw_ostream &OS, const Module *M) const {
+void Metadata::print(raw_ostream &OS, const Module *M,
+                     bool /*IsForDebug*/) const {
   ModuleSlotTracker MST(M, isa<MDNode>(this));
   printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
 }
 
 void Metadata::print(raw_ostream &OS, ModuleSlotTracker &MST,
-                     const Module *M) const {
+                     const Module *M, bool /*IsForDebug*/) const {
   printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
 }
 
 // Value::dump - allow easy printing of Values from the debugger.
 LLVM_DUMP_METHOD
-void Value::dump() const { print(dbgs()); dbgs() << '\n'; }
+void Value::dump() const { print(dbgs(), /*IsForDebug=*/true); dbgs() << '\n'; }
 
 // Type::dump - allow easy printing of Types from the debugger.
 LLVM_DUMP_METHOD
-void Type::dump() const { print(dbgs()); dbgs() << '\n'; }
+void Type::dump() const { print(dbgs(), /*IsForDebug=*/true); dbgs() << '\n'; }
 
 // Module::dump() - Allow printing of Modules from the debugger.
 LLVM_DUMP_METHOD
-void Module::dump() const { print(dbgs(), nullptr); }
+void Module::dump() const {
+  print(dbgs(), nullptr,
+        /*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true);
+}
 
 // \brief Allow printing of Comdats from the debugger.
 LLVM_DUMP_METHOD
-void Comdat::dump() const { print(dbgs()); }
+void Comdat::dump() const { print(dbgs(), /*IsForDebug=*/true); }
 
 // NamedMDNode::dump() - Allow printing of NamedMDNodes from the debugger.
 LLVM_DUMP_METHOD
-void NamedMDNode::dump() const { print(dbgs()); }
+void NamedMDNode::dump() const { print(dbgs(), /*IsForDebug=*/true); }
 
 LLVM_DUMP_METHOD
 void Metadata::dump() const { dump(nullptr); }
 
 LLVM_DUMP_METHOD
 void Metadata::dump(const Module *M) const {
-  print(dbgs(), M);
+  print(dbgs(), M, /*IsForDebug=*/true);
   dbgs() << '\n';
 }
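The dump() changes above all thread one new flag through the print entry points. A minimal sketch of the shape of that pattern, with a hypothetical Node class (defaulted flag for normal printing, dump() forcing the debug form onto stderr):

#include <iostream>

class Node {
public:
  void print(std::ostream &OS, bool IsForDebug = false) const {
    OS << "node";
    if (IsForDebug)
      OS << " <debug view>"; // stand-in for extra detail shown when dumping
    OS << '\n';
  }
  // dump() is just print() with the flag set, as in the diff above.
  void dump() const { print(std::cerr, /*IsForDebug=*/true); }
};

int main() {
  Node N;
  N.print(std::cout); // plain form
  N.dump();           // debugger-friendly form on stderr
  return 0;
}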
diff --git a/contrib/llvm/lib/IR/AttributeImpl.h b/contrib/llvm/lib/IR/AttributeImpl.h
index 6f338ae..659f956 100644
--- a/contrib/llvm/lib/IR/AttributeImpl.h
+++ b/contrib/llvm/lib/IR/AttributeImpl.h
@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/Support/TrailingObjects.h"
 #include <string>
 
 namespace llvm {
@@ -141,13 +142,16 @@ public:
 /// \class
 /// \brief This class represents a group of attributes that apply to one
 /// element: function, return type, or parameter.
-class AttributeSetNode : public FoldingSetNode {
+class AttributeSetNode final
+    : public FoldingSetNode,
+      private TrailingObjects<AttributeSetNode, Attribute> {
+  friend TrailingObjects;
+
   unsigned NumAttrs; ///< Number of attributes in this node.
 
   AttributeSetNode(ArrayRef<Attribute> Attrs) : NumAttrs(Attrs.size()) {
     // There's memory after the node where we can store the entries in.
-    std::copy(Attrs.begin(), Attrs.end(),
-              reinterpret_cast<Attribute *>(this + 1));
+    std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
   }
 
   // AttributesSetNode is uniqued, these should not be publicly available.
@@ -170,7 +174,7 @@ public:
   std::string getAsString(bool InAttrGrp) const;
 
   typedef const Attribute *iterator;
-  iterator begin() const { return reinterpret_cast<iterator>(this + 1); }
+  iterator begin() const { return getTrailingObjects<Attribute>(); }
   iterator end() const { return begin() + NumAttrs; }
 
   void Profile(FoldingSetNodeID &ID) const {
@@ -181,27 +185,29 @@ public:
       AttrList[I].Profile(ID);
   }
 };
-static_assert(
-    AlignOf<AttributeSetNode>::Alignment >= AlignOf<Attribute>::Alignment,
-    "Alignment is insufficient for objects appended to AttributeSetNode");
+
+typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair;
 
 //===----------------------------------------------------------------------===//
 /// \class
 /// \brief This class represents a set of attributes that apply to the function,
 /// return type, and parameters.
-class AttributeSetImpl : public FoldingSetNode {
+class AttributeSetImpl final
+    : public FoldingSetNode,
+      private TrailingObjects<AttributeSetImpl, IndexAttrPair> {
   friend class AttributeSet;
-
-public:
-  typedef std::pair<unsigned, AttributeSetNode*> IndexAttrPair;
+  friend TrailingObjects;
 
 private:
   LLVMContext &Context;
   unsigned NumAttrs; ///< Number of entries in this set.
 
+  // Helper fn for TrailingObjects class.
+  size_t numTrailingObjects(OverloadToken<IndexAttrPair>) { return NumAttrs; }
+
   /// \brief Return a pointer to the IndexAttrPair for the specified slot.
   const IndexAttrPair *getNode(unsigned Slot) const {
-    return reinterpret_cast<const IndexAttrPair *>(this + 1) + Slot;
+    return getTrailingObjects<IndexAttrPair>() + Slot;
   }
 
   // AttributesSet is uniqued, these should not be publicly available.
@@ -222,8 +228,7 @@ public:
     }
#endif
     // There's memory after the node where we can store the entries in.
-    std::copy(Attrs.begin(), Attrs.end(),
-              reinterpret_cast<IndexAttrPair *>(this + 1));
+    std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<IndexAttrPair>());
   }
 
   /// \brief Get the context that created this AttributeSetImpl.
@@ -273,10 +278,6 @@ public:
   void dump() const;
 };
-static_assert(
-    AlignOf<AttributeSetImpl>::Alignment >=
-        AlignOf<AttributeSetImpl::IndexAttrPair>::Alignment,
-    "Alignment is insufficient for objects appended to AttributeSetImpl");
 
 } // end llvm namespace
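The migration above replaces hand-rolled `this + 1` pointer arithmetic with llvm::TrailingObjects, which computes offsets and alignment for co-allocated arrays. A minimal sketch of the same pattern with a hypothetical IntListNode that tail-allocates N ints (assumes LLVM's Support/TrailingObjects.h; with a single trailing type, numTrailingObjects is not required):

#include "llvm/Support/TrailingObjects.h"
#include <new>

class IntListNode final : private llvm::TrailingObjects<IntListNode, int> {
  friend TrailingObjects;

  unsigned NumInts;
  IntListNode(unsigned N) : NumInts(N) {
    for (unsigned I = 0; I != N; ++I)
      getTrailingObjects<int>()[I] = 0; // storage lives right after *this
  }

public:
  static IntListNode *create(unsigned N) {
    // One allocation sized for the node plus its trailing ints, mirroring
    // the totalSizeToAlloc<Attribute>(...) call in AttributeSetNode::get.
    void *Mem = ::operator new(totalSizeToAlloc<int>(N));
    return new (Mem) IntListNode(N);
  }
  int *begin() { return getTrailingObjects<int>(); }
  int *end() { return begin() + NumInts; }
};

Compared with reinterpret_cast<int *>(this + 1), this keeps the layout assumptions in one audited helper, which is why the old static_asserts about alignment could be deleted.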
diff --git a/contrib/llvm/lib/IR/Attributes.cpp b/contrib/llvm/lib/IR/Attributes.cpp
index 546a986..6c01bb6 100644
--- a/contrib/llvm/lib/IR/Attributes.cpp
+++ b/contrib/llvm/lib/IR/Attributes.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
 #include "AttributeImpl.h"
 #include "LLVMContextImpl.h"
 #include "llvm/ADT/STLExtras.h"
@@ -120,28 +121,28 @@ Attribute::AttrKind Attribute::getKindAsEnum() const {
   if (!pImpl) return None;
   assert((isEnumAttribute() || isIntAttribute()) &&
          "Invalid attribute type to get the kind as an enum!");
-  return pImpl ? pImpl->getKindAsEnum() : None;
+  return pImpl->getKindAsEnum();
 }
 
 uint64_t Attribute::getValueAsInt() const {
   if (!pImpl) return 0;
   assert(isIntAttribute() &&
          "Expected the attribute to be an integer attribute!");
-  return pImpl ? pImpl->getValueAsInt() : 0;
+  return pImpl->getValueAsInt();
 }
 
 StringRef Attribute::getKindAsString() const {
   if (!pImpl) return StringRef();
   assert(isStringAttribute() &&
          "Invalid attribute type to get the kind as a string!");
-  return pImpl ? pImpl->getKindAsString() : StringRef();
+  return pImpl->getKindAsString();
 }
 
 StringRef Attribute::getValueAsString() const {
   if (!pImpl) return StringRef();
   assert(isStringAttribute() &&
          "Invalid attribute type to get the value as a string!");
-  return pImpl ? pImpl->getValueAsString() : StringRef();
+  return pImpl->getValueAsString();
 }
 
 bool Attribute::hasAttribute(AttrKind Kind) const {
@@ -198,6 +199,10 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "byval";
   if (hasAttribute(Attribute::Convergent))
     return "convergent";
+  if (hasAttribute(Attribute::InaccessibleMemOnly))
+    return "inaccessiblememonly";
+  if (hasAttribute(Attribute::InaccessibleMemOrArgMemOnly))
+    return "inaccessiblemem_or_argmemonly";
   if (hasAttribute(Attribute::InAlloca))
     return "inalloca";
   if (hasAttribute(Attribute::InlineHint))
@@ -232,6 +237,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "noredzone";
   if (hasAttribute(Attribute::NoReturn))
     return "noreturn";
+  if (hasAttribute(Attribute::NoRecurse))
+    return "norecurse";
   if (hasAttribute(Attribute::NoUnwind))
     return "nounwind";
   if (hasAttribute(Attribute::OptimizeNone))
@@ -442,6 +449,9 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
   case Attribute::JumpTable:       return 1ULL << 45;
   case Attribute::Convergent:      return 1ULL << 46;
   case Attribute::SafeStack:       return 1ULL << 47;
+  case Attribute::NoRecurse:       return 1ULL << 48;
+  case Attribute::InaccessibleMemOnly:         return 1ULL << 49;
+  case Attribute::InaccessibleMemOrArgMemOnly: return 1ULL << 50;
   case Attribute::Dereferenceable:
     llvm_unreachable("dereferenceable attribute not supported in raw format");
     break;
@@ -472,9 +482,8 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
   SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end());
   array_pod_sort(SortedAttrs.begin(), SortedAttrs.end());
 
-  for (SmallVectorImpl<Attribute>::iterator I = SortedAttrs.begin(),
-         E = SortedAttrs.end(); I != E; ++I)
-    I->Profile(ID);
+  for (Attribute Attr : SortedAttrs)
+    Attr.Profile(ID);
 
   void *InsertPoint;
   AttributeSetNode *PA =
@@ -484,8 +493,7 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
   // new one and insert it.
   if (!PA) {
     // Coallocate entries after the AttributeSetNode itself.
-    void *Mem = ::operator new(sizeof(AttributeSetNode) +
-                               sizeof(Attribute) * SortedAttrs.size());
+    void *Mem = ::operator new(totalSizeToAlloc<Attribute>(SortedAttrs.size()));
     PA = new (Mem) AttributeSetNode(SortedAttrs);
     pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint);
   }
@@ -617,9 +625,8 @@ AttributeSet::getImpl(LLVMContext &C,
   // create a new one and insert it.
   if (!PA) {
     // Coallocate entries after the AttributeSetImpl itself.
-    void *Mem = ::operator new(sizeof(AttributeSetImpl) +
-                               sizeof(std::pair<unsigned, AttributeSetNode *>) *
-                                   Attrs.size());
+    void *Mem = ::operator new(
+        AttributeSetImpl::totalSizeToAlloc<IndexAttrPair>(Attrs.size()));
     PA = new (Mem) AttributeSetImpl(C, Attrs);
     pImpl->AttrsLists.InsertNode(PA, InsertPoint);
   }
@@ -634,14 +641,15 @@ AttributeSet AttributeSet::get(LLVMContext &C,
   if (Attrs.empty())
     return AttributeSet();
 
-#ifndef NDEBUG
-  for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
-    assert((!i || Attrs[i-1].first <= Attrs[i].first) &&
-           "Misordered Attributes list!");
-    assert(!Attrs[i].second.hasAttribute(Attribute::None) &&
-           "Pointless attribute!");
-  }
-#endif
+  assert(std::is_sorted(Attrs.begin(), Attrs.end(),
+                        [](const std::pair<unsigned, Attribute> &LHS,
+                           const std::pair<unsigned, Attribute> &RHS) {
+                          return LHS.first < RHS.first;
+                        }) && "Misordered Attributes list!");
+  assert(std::none_of(Attrs.begin(), Attrs.end(),
+                      [](const std::pair<unsigned, Attribute> &Pair) {
+                        return Pair.second.hasAttribute(Attribute::None);
+                      }) && "Pointless attribute!");
 
   // Create a vector if (unsigned, AttributeSetNode*) pairs from the attributes
   // list.
@@ -684,22 +692,26 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
     if (!B.contains(Kind))
       continue;
 
-    if (Kind == Attribute::Alignment)
-      Attrs.push_back(std::make_pair(Index, Attribute::
-                                     getWithAlignment(C, B.getAlignment())));
-    else if (Kind == Attribute::StackAlignment)
-      Attrs.push_back(std::make_pair(Index, Attribute::
-                              getWithStackAlignment(C, B.getStackAlignment())));
-    else if (Kind == Attribute::Dereferenceable)
-      Attrs.push_back(std::make_pair(Index,
-                                     Attribute::getWithDereferenceableBytes(C,
-                                       B.getDereferenceableBytes())));
-    else if (Kind == Attribute::DereferenceableOrNull)
-      Attrs.push_back(
-          std::make_pair(Index, Attribute::getWithDereferenceableOrNullBytes(
-                                    C, B.getDereferenceableOrNullBytes())));
-    else
-      Attrs.push_back(std::make_pair(Index, Attribute::get(C, Kind)));
+    Attribute Attr;
+    switch (Kind) {
+    case Attribute::Alignment:
+      Attr = Attribute::getWithAlignment(C, B.getAlignment());
+      break;
+    case Attribute::StackAlignment:
+      Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
+      break;
+    case Attribute::Dereferenceable:
+      Attr = Attribute::getWithDereferenceableBytes(
+          C, B.getDereferenceableBytes());
+      break;
+    case Attribute::DereferenceableOrNull:
+      Attr = Attribute::getWithDereferenceableOrNullBytes(
+          C, B.getDereferenceableOrNullBytes());
+      break;
+    default:
+      Attr = Attribute::get(C, Kind);
+    }
+    Attrs.push_back(std::make_pair(Index, Attr));
   }
 
   // Add target-dependent (string) attributes.
@@ -713,9 +725,8 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
 AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
                                ArrayRef<Attribute::AttrKind> Kind) {
   SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
-  for (ArrayRef<Attribute::AttrKind>::iterator I = Kind.begin(),
-         E = Kind.end(); I != E; ++I)
-    Attrs.push_back(std::make_pair(Index, Attribute::get(C, *I)));
+  for (Attribute::AttrKind K : Kind)
+    Attrs.push_back(std::make_pair(Index, Attribute::get(C, K)));
   return get(C, Attrs);
 }
 
@@ -736,9 +747,8 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
     if (!AS) continue;
     SmallVector<std::pair<unsigned, AttributeSetNode *>, 8>::iterator
       ANVI = AttrNodeVec.begin(), ANVE;
-    for (const AttributeSetImpl::IndexAttrPair
-             *AI = AS->getNode(0),
-             *AE = AS->getNode(AS->getNumAttributes());
+    for (const IndexAttrPair *AI = AS->getNode(0),
+                             *AE = AS->getNode(AS->getNumAttributes());
         AI != AE; ++AI) {
       ANVE = AttrNodeVec.end();
       while (ANVI != ANVE && ANVI->first <= AI->first)
@@ -770,6 +780,36 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
   return addAttributes(C, Index, AttributeSet::get(C, Index, B));
 }
 
+AttributeSet AttributeSet::addAttribute(LLVMContext &C,
+                                        ArrayRef<unsigned> Indices,
+                                        Attribute A) const {
+  unsigned I = 0, E = pImpl ? pImpl->getNumAttributes() : 0;
+  auto IdxI = Indices.begin(), IdxE = Indices.end();
+  SmallVector<AttributeSet, 4> AttrSet;
+
+  while (I != E && IdxI != IdxE) {
+    if (getSlotIndex(I) < *IdxI)
+      AttrSet.emplace_back(getSlotAttributes(I++));
+    else if (getSlotIndex(I) > *IdxI)
+      AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
+    else {
+      AttrBuilder B(getSlotAttributes(I), *IdxI);
+      B.addAttribute(A);
+      AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B));
+      ++I;
+      ++IdxI;
+    }
+  }
+
+  while (I != E)
+    AttrSet.emplace_back(getSlotAttributes(I++));
+
+  while (IdxI != IdxE)
+    AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
+
+  return get(C, AttrSet);
+}
+
 AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
                                          AttributeSet Attrs) const {
   if (!pImpl) return Attrs;
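The addAttribute overload just added walks the existing slots and the sorted Indices list in lockstep, a classic two-pointer merge over sorted sequences. A generic standalone sketch of that control flow (hypothetical int values standing in for Attribute):

#include <cassert>
#include <utility>
#include <vector>

// Merge sorted (index, value) slots with a sorted index list: untouched slots
// are copied, new indices get value V, and a collision folds V into the slot.
static std::vector<std::pair<unsigned, int>>
mergeSlots(const std::vector<std::pair<unsigned, int>> &Slots,
           const std::vector<unsigned> &Indices, int V) {
  std::vector<std::pair<unsigned, int>> Out;
  size_t I = 0, J = 0;
  while (I != Slots.size() && J != Indices.size()) {
    if (Slots[I].first < Indices[J])
      Out.push_back(Slots[I++]);
    else if (Slots[I].first > Indices[J])
      Out.push_back({Indices[J++], V});
    else { // same index: update the existing slot in place
      Out.push_back({Slots[I].first, V});
      ++I;
      ++J;
    }
  }
  while (I != Slots.size())
    Out.push_back(Slots[I++]);
  while (J != Indices.size())
    Out.push_back({Indices[J++], V});
  return Out;
}

int main() {
  auto R = mergeSlots({{1, 10}, {3, 30}}, {2, 3}, 99);
  assert((R == std::vector<std::pair<unsigned, int>>{{1, 10}, {2, 99}, {3, 99}}));
  return 0;
}

A single pass works only because both inputs are kept sorted by index, which the std::is_sorted assertion earlier in the file now enforces explicitly.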
@@ -955,17 +995,17 @@ AttributeSet AttributeSet::getFnAttributes() const {
 bool AttributeSet::hasAttribute(unsigned Index, Attribute::AttrKind Kind) const{
   AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->hasAttribute(Kind) : false;
+  return ASN && ASN->hasAttribute(Kind);
 }
 
 bool AttributeSet::hasAttribute(unsigned Index, StringRef Kind) const {
   AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->hasAttribute(Kind) : false;
+  return ASN && ASN->hasAttribute(Kind);
 }
 
 bool AttributeSet::hasAttributes(unsigned Index) const {
   AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->hasAttributes() : false;
+  return ASN && ASN->hasAttributes();
 }
 
 /// \brief Return true if the specified attribute is set for at least one
@@ -1111,6 +1151,7 @@ AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
 
 void AttrBuilder::clear() {
   Attrs.reset();
+  TargetDepAttrs.clear();
   Alignment = StackAlignment = DerefBytes = DerefOrNullBytes = 0;
 }
 
@@ -1177,23 +1218,10 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
   for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
     Attribute Attr = *I;
     if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
-      Attribute::AttrKind Kind = I->getKindAsEnum();
-      Attrs[Kind] = false;
-
-      if (Kind == Attribute::Alignment)
-        Alignment = 0;
-      else if (Kind == Attribute::StackAlignment)
-        StackAlignment = 0;
-      else if (Kind == Attribute::Dereferenceable)
-        DerefBytes = 0;
-      else if (Kind == Attribute::DereferenceableOrNull)
-        DerefOrNullBytes = 0;
+      removeAttribute(Attr.getKindAsEnum());
     } else {
       assert(Attr.isStringAttribute() && "Invalid attribute type!");
-      std::map<std::string, std::string>::iterator
-        Iter = TargetDepAttrs.find(Attr.getKindAsString());
-      if (Iter != TargetDepAttrs.end())
-        TargetDepAttrs.erase(Iter);
+      removeAttribute(Attr.getKindAsString());
     }
   }
 
@@ -1322,8 +1350,7 @@ bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
 
   assert(Slot != ~0U && "Couldn't find the index!");
 
-  for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot);
-       I != E; ++I) {
+  for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
     Attribute Attr = *I;
     if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
       if (Attrs[I->getKindAsEnum()])
@@ -1382,7 +1409,7 @@ AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) {
 //===----------------------------------------------------------------------===//
 
 /// \brief Which attributes cannot be applied to a type.
-AttrBuilder AttributeFuncs::typeIncompatible(const Type *Ty) {
+AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
   AttrBuilder Incompatible;
 
   if (!Ty->isIntegerTy())
@@ -1406,3 +1433,80 @@ AttrBuilder AttributeFuncs::typeIncompatible(const Type *Ty) {
 
   return Incompatible;
 }
+
+template<typename AttrClass>
+static bool isEqual(const Function &Caller, const Function &Callee) {
+  return Caller.getFnAttribute(AttrClass::getKind()) ==
+         Callee.getFnAttribute(AttrClass::getKind());
+}
+
+/// \brief Compute the logical AND of the attributes of the caller and the
+/// callee.
+///
+/// This function sets the caller's attribute to false if the callee's attribute
+/// is false.
+template<typename AttrClass>
+static void setAND(Function &Caller, const Function &Callee) {
+  if (AttrClass::isSet(Caller, AttrClass::getKind()) &&
+      !AttrClass::isSet(Callee, AttrClass::getKind()))
+    AttrClass::set(Caller, AttrClass::getKind(), false);
+}
+
+/// \brief Compute the logical OR of the attributes of the caller and the
+/// callee.
+///
+/// This function sets the caller's attribute to true if the callee's attribute
+/// is true.
+template<typename AttrClass>
+static void setOR(Function &Caller, const Function &Callee) {
+  if (!AttrClass::isSet(Caller, AttrClass::getKind()) &&
+      AttrClass::isSet(Callee, AttrClass::getKind()))
+    AttrClass::set(Caller, AttrClass::getKind(), true);
+}
+
+/// \brief If the inlined function had a higher stack protection level than the
+/// calling function, then bump up the caller's stack protection level.
+static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
+  // If upgrading the SSP attribute, clear out the old SSP Attributes first.
+  // Having multiple SSP attributes doesn't actually hurt, but it adds useless
+  // clutter to the IR.
+  AttrBuilder B;
+  B.addAttribute(Attribute::StackProtect)
+      .addAttribute(Attribute::StackProtectStrong)
+      .addAttribute(Attribute::StackProtectReq);
+  AttributeSet OldSSPAttr = AttributeSet::get(Caller.getContext(),
+                                              AttributeSet::FunctionIndex,
+                                              B);
+
+  if (Callee.hasFnAttribute(Attribute::SafeStack)) {
+    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.addFnAttr(Attribute::SafeStack);
+  } else if (Callee.hasFnAttribute(Attribute::StackProtectReq) &&
+             !Caller.hasFnAttribute(Attribute::SafeStack)) {
+    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.addFnAttr(Attribute::StackProtectReq);
+  } else if (Callee.hasFnAttribute(Attribute::StackProtectStrong) &&
+             !Caller.hasFnAttribute(Attribute::SafeStack) &&
+             !Caller.hasFnAttribute(Attribute::StackProtectReq)) {
+    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.addFnAttr(Attribute::StackProtectStrong);
+  } else if (Callee.hasFnAttribute(Attribute::StackProtect) &&
+             !Caller.hasFnAttribute(Attribute::SafeStack) &&
+             !Caller.hasFnAttribute(Attribute::StackProtectReq) &&
+             !Caller.hasFnAttribute(Attribute::StackProtectStrong))
+    Caller.addFnAttr(Attribute::StackProtect);
+}
+
+#define GET_ATTR_COMPAT_FUNC
+#include "AttributesCompatFunc.inc"
+
+bool AttributeFuncs::areInlineCompatible(const Function &Caller,
+                                         const Function &Callee) {
+  return hasCompatibleFnAttrs(Caller, Callee);
+}
+
+
+void AttributeFuncs::mergeAttributesForInlining(Function &Caller,
+                                                const Function &Callee) {
+  mergeFnAttrs(Caller, Callee);
+}
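The setAND/setOR templates above encode the two merge directions the generated AttributesCompatFunc.inc applies when a callee is inlined: an AND-style property survives only if both functions have it, while an OR-style property is sticky. A toy sketch with plain booleans (the specific attribute names here are illustrative, not taken from the generated table):

#include <cassert>

struct Fn {
  bool NoNansFpMath;    // example of an AND-merged property
  bool NoImplicitFloat; // example of an OR-merged property
};

static void mergeForInlining(Fn &Caller, const Fn &Callee) {
  Caller.NoNansFpMath &= Callee.NoNansFpMath;       // setAND-style
  Caller.NoImplicitFloat |= Callee.NoImplicitFloat; // setOR-style
}

int main() {
  Fn Caller{true, false}, Callee{false, true};
  mergeForInlining(Caller, Callee);
  assert(!Caller.NoNansFpMath && Caller.NoImplicitFloat);
  return 0;
}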
+ Name + ".p0i8", F->getParent()); + return true; + } + Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$"); + if (vstRegex.match(Name)) { + static const Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1, + Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; + + static const Intrinsic::ID StoreLaneInts[] = { + Intrinsic::arm_neon_vst2lane, Intrinsic::arm_neon_vst3lane, + Intrinsic::arm_neon_vst4lane + }; + + auto fArgs = F->getFunctionType()->params(); + Type *Tys[] = {fArgs[0], fArgs[1]}; + if (Name.find("lane") == StringRef::npos) + NewFn = Intrinsic::getDeclaration(F->getParent(), + StoreInts[fArgs.size() - 3], Tys); + else + NewFn = Intrinsic::getDeclaration(F->getParent(), + StoreLaneInts[fArgs.size() - 5], Tys); + return true; + } break; } + case 'c': { if (Name.startswith("ctlz.") && F->arg_size() == 1) { F->setName(Name + ".old"); @@ -129,7 +164,10 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || Name.startswith("x86.avx2.pcmpgt.") || + Name.startswith("x86.avx2.vbroadcast") || + Name.startswith("x86.avx2.pbroadcast") || Name.startswith("x86.avx.vpermil.") || + Name.startswith("x86.sse41.pmovsx") || Name == "x86.avx.vinsertf128.pd.256" || Name == "x86.avx.vinsertf128.ps.256" || Name == "x86.avx.vinsertf128.si.256" || @@ -162,6 +200,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "x86.avx2.pblendd.128" || Name == "x86.avx2.pblendd.256" || Name == "x86.avx2.vbroadcasti128" || + Name == "x86.xop.vpcmov" || (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) { NewFn = nullptr; return true; @@ -325,7 +364,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Function *F = CI->getCalledFunction(); LLVMContext &C = CI->getContext(); IRBuilder<> Builder(C); - Builder.SetInsertPoint(CI->getParent(), CI); + Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); assert(F && "Intrinsic call is not direct?"); @@ -351,7 +390,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Name == "llvm.x86.avx.movnt.ps.256" || Name == "llvm.x86.avx.movnt.pd.256") { IRBuilder<> Builder(C); - Builder.SetInsertPoint(CI->getParent(), CI); + Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); Module *M = F->getParent(); SmallVector<Metadata *, 1> Elts; @@ -368,7 +407,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { "cast"); StoreInst *SI = Builder.CreateStore(Arg1, BC); SI->setMetadata(M->getMDKindID("nontemporal"), Node); - SI->setAlignment(16); + SI->setAlignment(32); // Remove intrinsic. 
CI->eraseFromParent(); @@ -419,6 +458,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1), Builder.getInt8(Imm)}); + } else if (Name == "llvm.x86.xop.vpcmov") { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + Value *Sel = CI->getArgOperand(2); + unsigned NumElts = CI->getType()->getVectorNumElements(); + Constant *MinusOne = ConstantVector::getSplat(NumElts, Builder.getInt64(-1)); + Value *NotSel = Builder.CreateXor(Sel, MinusOne); + Value *Sel0 = Builder.CreateAnd(Arg0, Sel); + Value *Sel1 = Builder.CreateAnd(Arg1, NotSel); + Rep = Builder.CreateOr(Sel0, Sel1); } else if (Name == "llvm.x86.sse42.crc32.64.8") { Function *CRC32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::x86_sse42_crc32_32_8); @@ -438,6 +487,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { for (unsigned I = 0; I < EltNum; ++I) Rep = Builder.CreateInsertElement(Rep, Load, ConstantInt::get(I32Ty, I)); + } else if (Name.startswith("llvm.x86.sse41.pmovsx")) { + VectorType *SrcTy = cast<VectorType>(CI->getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(CI->getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector( + CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask); + Rep = Builder.CreateSExt(SV, DstTy); } else if (Name == "llvm.x86.avx2.vbroadcasti128") { // Replace vbroadcasts with a vector shuffle. Type *VT = VectorType::get(Type::getInt64Ty(C), 2); @@ -447,6 +509,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { const int Idxs[4] = { 0, 1, 0, 1 }; Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), Idxs); + } else if (Name.startswith("llvm.x86.avx2.pbroadcast") || + Name.startswith("llvm.x86.avx2.vbroadcast")) { + // Replace vp?broadcasts with a vector shuffle. + Value *Op = CI->getArgOperand(0); + unsigned NumElts = CI->getType()->getVectorNumElements(); + Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts); + Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()), + Constant::getNullValue(MaskTy)); } else if (Name == "llvm.x86.sse2.psll.dq") { // 128-bit shift left specified in bits. unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); @@ -517,10 +587,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue(); VectorType *VecTy = cast<VectorType>(CI->getType()); unsigned NumElts = VecTy->getNumElements(); - + // Mask off the high bits of the immediate value; hardware ignores those. Imm = Imm & 1; - + // Extend the second operand into a vector that is twice as big. Value *UndefV = UndefValue::get(Op1->getType()); SmallVector<Constant*, 8> Idxs; @@ -562,7 +632,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); VectorType *VecTy = cast<VectorType>(CI->getType()); unsigned NumElts = VecTy->getNumElements(); - + // Mask off the high bits of the immediate value; hardware ignores those. 
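The vpcmov upgrade above lowers the XOP conditional move to three plain bitwise ops via the select identity (a & sel) | (b & ~sel): each result bit comes from the first operand where the selector bit is set and from the second where it is clear. A scalar standalone sketch of the same identity:

#include <cassert>
#include <cstdint>

// The bitwise-select identity the upgrader emits in place of vpcmov.
static uint64_t bitwiseSelect(uint64_t A, uint64_t B, uint64_t Sel) {
  return (A & Sel) | (B & ~Sel);
}

int main() {
  uint64_t A = 0xAAAAAAAAAAAAAAAAULL, B = 0x5555555555555555ULL;
  assert(bitwiseSelect(A, B, ~0ULL) == A); // all-ones selector picks A
  assert(bitwiseSelect(A, B, 0) == B);     // all-zeros selector picks B
  assert(bitwiseSelect(A, B, 0xFF) == ((A & 0xFF) | (B & ~0xFFULL)));
  return 0;
}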
Imm = Imm & 1; @@ -627,6 +697,27 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { default: llvm_unreachable("Unknown function for CallInst upgrade."); + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + SmallVector<Value *, 4> Args(CI->arg_operands().begin(), + CI->arg_operands().end()); + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args)); + CI->eraseFromParent(); + return; + } + case Intrinsic::ctlz: case Intrinsic::cttz: assert(CI->getNumArgOperands() == 1 && diff --git a/contrib/llvm/lib/IR/BasicBlock.cpp b/contrib/llvm/lib/IR/BasicBlock.cpp index 0a04494..f61276f 100644 --- a/contrib/llvm/lib/IR/BasicBlock.cpp +++ b/contrib/llvm/lib/IR/BasicBlock.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include <algorithm> + using namespace llvm; ValueSymbolTable *BasicBlock::getValueSymbolTable() { @@ -35,8 +36,7 @@ LLVMContext &BasicBlock::getContext() const { // Explicit instantiation of SymbolTableListTraits since some of the methods // are not in the public header file... -template class llvm::SymbolTableListTraits<Instruction, BasicBlock>; - +template class llvm::SymbolTableListTraits<Instruction>; BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent, BasicBlock *InsertBefore) @@ -56,7 +56,7 @@ void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) { assert(!Parent && "Already has a parent"); if (InsertBefore) - NewParent->getBasicBlockList().insert(InsertBefore, this); + NewParent->getBasicBlockList().insert(InsertBefore->getIterator(), this); else NewParent->getBasicBlockList().push_back(this); } @@ -91,26 +91,26 @@ void BasicBlock::setParent(Function *parent) { } void BasicBlock::removeFromParent() { - getParent()->getBasicBlockList().remove(this); + getParent()->getBasicBlockList().remove(getIterator()); } iplist<BasicBlock>::iterator BasicBlock::eraseFromParent() { - return getParent()->getBasicBlockList().erase(this); + return getParent()->getBasicBlockList().erase(getIterator()); } /// Unlink this basic block from its current function and /// insert it into the function that MovePos lives in, right before MovePos. void BasicBlock::moveBefore(BasicBlock *MovePos) { - MovePos->getParent()->getBasicBlockList().splice(MovePos, - getParent()->getBasicBlockList(), this); + MovePos->getParent()->getBasicBlockList().splice( + MovePos->getIterator(), getParent()->getBasicBlockList(), getIterator()); } /// Unlink this basic block from its current function and /// insert it into the function that MovePos lives in, right after MovePos. 
void BasicBlock::moveAfter(BasicBlock *MovePos) { - Function::iterator I = MovePos; - MovePos->getParent()->getBasicBlockList().splice(++I, - getParent()->getBasicBlockList(), this); + MovePos->getParent()->getBasicBlockList().splice( + ++MovePos->getIterator(), getParent()->getBasicBlockList(), + getIterator()); } const Module *BasicBlock::getModule() const { @@ -196,8 +196,8 @@ BasicBlock::iterator BasicBlock::getFirstInsertionPt() { if (!FirstNonPHI) return end(); - iterator InsertPt = FirstNonPHI; - if (isa<LandingPadInst>(InsertPt)) ++InsertPt; + iterator InsertPt = FirstNonPHI->getIterator(); + if (InsertPt->isEHPad()) ++InsertPt; return InsertPt; } @@ -245,12 +245,12 @@ BasicBlock *BasicBlock::getSingleSuccessor() { BasicBlock *BasicBlock::getUniqueSuccessor() { succ_iterator SI = succ_begin(this), E = succ_end(this); - if (SI == E) return NULL; // No successors + if (SI == E) return nullptr; // No successors BasicBlock *SuccBB = *SI; ++SI; for (;SI != E; ++SI) { if (*SI != SuccBB) - return NULL; + return nullptr; // The same successor appears multiple times in the successor list. // This is OK. } @@ -333,6 +333,17 @@ void BasicBlock::removePredecessor(BasicBlock *Pred, } } +bool BasicBlock::canSplitPredecessors() const { + const Instruction *FirstNonPHI = getFirstNonPHI(); + if (isa<LandingPadInst>(FirstNonPHI)) + return true; + // This is perhaps a little conservative because constructs like + // CleanupBlockInst are pretty easy to split. However, SplitBlockPredecessors + // cannot handle such things just yet. + if (FirstNonPHI->isEHPad()) + return false; + return true; +} /// This splits a basic block into two at the specified /// instruction. Note that all instructions BEFORE the specified iterator stay @@ -393,8 +404,7 @@ void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) { // Cope with being called on a BasicBlock that doesn't have a terminator // yet. Clang's CodeGenFunction::EmitReturnBlock() likes to do this. return; - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = TI->getSuccessor(i); + for (BasicBlock *Succ : TI->successors()) { // N.B. Succ might not be a complete BasicBlock, so don't assume // that it ends with a non-phi instruction. 
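The BasicBlock.cpp hunks above track the ilist transition in this import: node pointers no longer convert implicitly to iterators, so positions are passed explicitly via getIterator(). A sketch of the resulting pattern, mirroring the rewritten moveBefore() (assumes LLVM-3.8-era headers; not part of this patch):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Unlink BB from its current function and splice it in directly
    // before Before, obtaining both positions with getIterator().
    static void moveBlockBefore(BasicBlock *BB, BasicBlock *Before) {
      Before->getParent()->getBasicBlockList().splice(
          Before->getIterator(), BB->getParent()->getBasicBlockList(),
          BB->getIterator());
    }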
for (iterator II = Succ->begin(), IE = Succ->end(); II != IE; ++II) { diff --git a/contrib/llvm/lib/IR/ConstantFold.cpp b/contrib/llvm/lib/IR/ConstantFold.cpp index 46bb20e..ce3fe03 100644 --- a/contrib/llvm/lib/IR/ConstantFold.cpp +++ b/contrib/llvm/lib/IR/ConstantFold.cpp @@ -83,7 +83,7 @@ foldConstantCastPair( assert(DstTy && DstTy->isFirstClassType() && "Invalid cast destination type"); assert(CastInst::isCast(opc) && "Invalid cast opcode"); - // The the types and opcodes for the two Cast constant expressions + // The types and opcodes for the two Cast constant expressions Type *SrcTy = Op->getOperand(0)->getType(); Type *MidTy = Op->getType(); Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode()); @@ -109,7 +109,7 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) { if (PointerType *PTy = dyn_cast<PointerType>(V->getType())) if (PointerType *DPTy = dyn_cast<PointerType>(DestTy)) if (PTy->getAddressSpace() == DPTy->getAddressSpace() - && DPTy->getElementType()->isSized()) { + && PTy->getElementType()->isSized()) { SmallVector<Value*, 8> IdxList; Value *Zero = Constant::getNullValue(Type::getInt32Ty(DPTy->getContext())); @@ -1187,7 +1187,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, (void)C3V.divide(C2V, APFloat::rmNearestTiesToEven); return ConstantFP::get(C1->getContext(), C3V); case Instruction::FRem: - (void)C3V.mod(C2V, APFloat::rmNearestTiesToEven); + (void)C3V.mod(C2V); return ConstantFP::get(C1->getContext(), C3V); } } @@ -1277,9 +1277,9 @@ static bool isMaybeZeroSizedType(Type *Ty) { } /// IdxCompare - Compare the two constants as though they were getelementptr -/// indices. This allows coersion of the types to be the same thing. +/// indices. This allows coercion of the types to be the same thing. /// -/// If the two constants are the "same" (after coersion), return 0. If the +/// If the two constants are the "same" (after coercion), return 0. If the /// first is less than the second, return -1, if the second is less than the /// first, return 1. If the constants are not integral, return -2. /// @@ -1685,7 +1685,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // Otherwise, for integer compare, pick the same value as the non-undef // operand, and fold it to true or false. if (isIntegerPredicate) - return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(pred)); + return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(Predicate)); // Choosing NaN for the undef will always make unordered comparison succeed // and ordered comparison fails. @@ -1869,7 +1869,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, } else { // Evaluate the relation between the two constants, per the predicate. int Result = -1; // -1 = unknown, 0 = known false, 1 = known true. - switch (evaluateICmpRelation(C1, C2, CmpInst::isSigned(pred))) { + switch (evaluateICmpRelation(C1, C2, + CmpInst::isSigned((CmpInst::Predicate)pred))) { default: llvm_unreachable("Unknown relational!"); case ICmpInst::BAD_ICMP_PREDICATE: break; // Couldn't determine anything about these constants. @@ -1950,8 +1951,10 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, // If the left hand side is an extension, try eliminating it. 
 if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
-    if ((CE1->getOpcode() == Instruction::SExt && ICmpInst::isSigned(pred)) ||
-        (CE1->getOpcode() == Instruction::ZExt && !ICmpInst::isSigned(pred))){
+    if ((CE1->getOpcode() == Instruction::SExt &&
+         ICmpInst::isSigned((ICmpInst::Predicate)pred)) ||
+        (CE1->getOpcode() == Instruction::ZExt &&
+         !ICmpInst::isSigned((ICmpInst::Predicate)pred))){
       Constant *CE1Op0 = CE1->getOperand(0);
       Constant *CE1Inverse = ConstantExpr::getTrunc(CE1, CE1Op0->getType());
       if (CE1Inverse == CE1Op0) {
@@ -1997,17 +2000,17 @@ static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
 }

 /// \brief Test whether a given ConstantInt is in-range for a SequentialType.
-static bool isIndexInRangeOfSequentialType(const SequentialType *STy,
+static bool isIndexInRangeOfSequentialType(SequentialType *STy,
                                            const ConstantInt *CI) {
-  if (const PointerType *PTy = dyn_cast<PointerType>(STy))
-    // Only handle pointers to sized types, not pointers to functions.
-    return PTy->getElementType()->isSized();
+  // And indices are valid when indexing along a pointer
+  if (isa<PointerType>(STy))
+    return true;

   uint64_t NumElements = 0;
   // Determine the number of elements in our sequential type.
-  if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+  if (auto *ATy = dyn_cast<ArrayType>(STy))
     NumElements = ATy->getNumElements();
-  else if (const VectorType *VTy = dyn_cast<VectorType>(STy))
+  else if (auto *VTy = dyn_cast<VectorType>(STy))
     NumElements = VTy->getNumElements();

   assert((isa<ArrayType>(STy) || NumElements > 0) &&
@@ -2178,7 +2181,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Type *PointeeTy, Constant *C,
           // dimension.
           NewIdxs.resize(Idxs.size());
           uint64_t NumElements = 0;
-          if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+          if (auto *ATy = dyn_cast<ArrayType>(Ty))
             NumElements = ATy->getNumElements();
           else
             NumElements = cast<VectorType>(Ty)->getNumElements();
diff --git a/contrib/llvm/lib/IR/ConstantRange.cpp b/contrib/llvm/lib/IR/ConstantRange.cpp
index 91095cf..48f9b27 100644
--- a/contrib/llvm/lib/IR/ConstantRange.cpp
+++ b/contrib/llvm/lib/IR/ConstantRange.cpp
@@ -21,7 +21,9 @@
 //
 //===----------------------------------------------------------------------===//

+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -125,6 +127,57 @@ ConstantRange ConstantRange::makeSatisfyingICmpRegion(CmpInst::Predicate Pred,
       .inverse();
 }

+ConstantRange ConstantRange::makeNoWrapRegion(Instruction::BinaryOps BinOp,
+                                              const APInt &C,
+                                              unsigned NoWrapKind) {
+  typedef OverflowingBinaryOperator OBO;
+
+  // Computes the intersection of CR0 and CR1. It is different from
+  // intersectWith in that the ConstantRange returned will only contain elements
+  // in both CR0 and CR1 (i.e. SubsetIntersect(X, Y) is a *subset*, proper or
+  // not, of both X and Y).
+  auto SubsetIntersect =
+      [](const ConstantRange &CR0, const ConstantRange &CR1) {
+    return CR0.inverse().unionWith(CR1.inverse()).inverse();
+  };
+
+  assert(BinOp >= Instruction::BinaryOpsBegin &&
+         BinOp < Instruction::BinaryOpsEnd && "Binary operators only!");
+
+  assert((NoWrapKind == OBO::NoSignedWrap ||
+          NoWrapKind == OBO::NoUnsignedWrap ||
+          NoWrapKind == (OBO::NoUnsignedWrap | OBO::NoSignedWrap)) &&
+         "NoWrapKind invalid!");
+
+  unsigned BitWidth = C.getBitWidth();
+  if (BinOp != Instruction::Add)
+    // Conservative answer: empty set
+    return ConstantRange(BitWidth, false);
+
+  if (C.isMinValue())
+    // Full set: nothing signed / unsigned wraps when added to 0.
+    return ConstantRange(BitWidth);
+
+  ConstantRange Result(BitWidth);
+
+  if (NoWrapKind & OBO::NoUnsignedWrap)
+    Result = SubsetIntersect(Result,
+                             ConstantRange(APInt::getNullValue(BitWidth), -C));
+
+  if (NoWrapKind & OBO::NoSignedWrap) {
+    if (C.isStrictlyPositive())
+      Result = SubsetIntersect(
+          Result, ConstantRange(APInt::getSignedMinValue(BitWidth),
+                                APInt::getSignedMinValue(BitWidth) - C));
+    else
+      Result = SubsetIntersect(
+          Result, ConstantRange(APInt::getSignedMinValue(BitWidth) - C,
+                                APInt::getSignedMinValue(BitWidth)));
+  }
+
+  return Result;
+}
+
 /// isFullSet - Return true if this set contains all of the elements possible
 /// for this data-type
 bool ConstantRange::isFullSet() const {
diff --git a/contrib/llvm/lib/IR/Constants.cpp b/contrib/llvm/lib/IR/Constants.cpp
index 308e6bd..0898bf6 100644
--- a/contrib/llvm/lib/IR/Constants.cpp
+++ b/contrib/llvm/lib/IR/Constants.cpp
@@ -53,6 +53,11 @@ bool Constant::isNegativeZeroValue() const {
     if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative())
       return true;

+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
+      if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative())
+        return true;
+
   // We've already handled true FP case; any other FP vectors can't represent -0.0.
   if (getType()->isFPOrFPVectorTy())
     return false;
@@ -68,6 +73,17 @@ bool Constant::isZeroValue() const {
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
     return CFP->isZero();

+  // Equivalent for a vector of -0.0's.
+  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+    if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
+      if (SplatCFP && SplatCFP->isZero())
+        return true;
+
+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
+      if (SplatCFP && SplatCFP->isZero())
+        return true;
+
   // Otherwise, just use +0.0.
   return isNullValue();
 }
@@ -81,8 +97,10 @@
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
     return CFP->isZero() && !CFP->isNegative();

-  // constant zero is zero for aggregates and cpnull is null for pointers.
-  return isa<ConstantAggregateZero>(this) || isa<ConstantPointerNull>(this);
+  // constant zero is zero for aggregates, cpnull is null for pointers, none for
+  // tokens.
+  return isa<ConstantAggregateZero>(this) || isa<ConstantPointerNull>(this) ||
+         isa<ConstantTokenNone>(this);
 }

 bool Constant::isAllOnesValue() const {
@@ -204,6 +222,8 @@ Constant *Constant::getNullValue(Type *Ty) {
   case Type::ArrayTyID:
   case Type::VectorTyID:
     return ConstantAggregateZero::get(Ty);
+  case Type::TokenTyID:
+    return ConstantTokenNone::get(Ty->getContext());
   default:
     // Function, Label, or Opaque type?
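For Add, the no-wrap region computed above has a simple closed form: with a nonzero constant C, "add nuw x, C" stays in range exactly for x in [0, -C), and the signed cases shift that window around the signed minimum. A scalar check of the unsigned case at 8 bits (standalone C++; a sketch, not part of this patch):

    #include <cassert>
    #include <cstdint>

    // "add x, C" does not wrap unsigned iff the truncated sum is >= x,
    // which for nonzero C is exactly x < 2^8 - C, i.e. x in [0, -C).
    static bool addNoUnsignedWrap(uint8_t X, uint8_t C) {
      return static_cast<uint8_t>(X + C) >= X;
    }

    int main() {
      const uint8_t C = 10;
      for (unsigned X = 0; X != 256; ++X) {
        bool InRegion = X < static_cast<uint8_t>(-C); // [0, 256 - C)
        assert(addNoUnsignedWrap(static_cast<uint8_t>(X), C) == InRegion);
      }
    }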
llvm_unreachable("Cannot create a null constant of that type!"); @@ -410,32 +430,13 @@ bool Constant::isConstantUsed() const { return false; } +bool Constant::needsRelocation() const { + if (isa<GlobalValue>(this)) + return true; // Global reference. - -/// getRelocationInfo - This method classifies the entry according to -/// whether or not it may generate a relocation entry. This must be -/// conservative, so if it might codegen to a relocatable entry, it should say -/// so. The return values are: -/// -/// NoRelocation: This constant pool entry is guaranteed to never have a -/// relocation applied to it (because it holds a simple constant like -/// '4'). -/// LocalRelocation: This entry has relocations, but the entries are -/// guaranteed to be resolvable by the static linker, so the dynamic -/// linker will never see them. -/// GlobalRelocations: This entry may have arbitrary relocations. -/// -/// FIXME: This really should not be in IR. -Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { - if (const GlobalValue *GV = dyn_cast<GlobalValue>(this)) { - if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) - return LocalRelocation; // Local to this file/library. - return GlobalRelocations; // Global reference. - } - if (const BlockAddress *BA = dyn_cast<BlockAddress>(this)) - return BA->getFunction()->getRelocationInfo(); - + return BA->getFunction()->needsRelocation(); + // While raw uses of blockaddress need to be relocated, differences between // two of them don't when they are for labels in the same function. This is a // common idiom when creating a table for the indirect goto extension, so we @@ -444,20 +445,18 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const { if (CE->getOpcode() == Instruction::Sub) { ConstantExpr *LHS = dyn_cast<ConstantExpr>(CE->getOperand(0)); ConstantExpr *RHS = dyn_cast<ConstantExpr>(CE->getOperand(1)); - if (LHS && RHS && - LHS->getOpcode() == Instruction::PtrToInt && + if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt && RHS->getOpcode() == Instruction::PtrToInt && isa<BlockAddress>(LHS->getOperand(0)) && isa<BlockAddress>(RHS->getOperand(0)) && cast<BlockAddress>(LHS->getOperand(0))->getFunction() == - cast<BlockAddress>(RHS->getOperand(0))->getFunction()) - return NoRelocation; + cast<BlockAddress>(RHS->getOperand(0))->getFunction()) + return false; } - PossibleRelocationsTy Result = NoRelocation; + bool Result = false; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) - Result = std::max(Result, - cast<Constant>(getOperand(i))->getRelocationInfo()); + Result |= cast<Constant>(getOperand(i))->needsRelocation(); return Result; } @@ -797,10 +796,10 @@ Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const { } unsigned ConstantAggregateZero::getNumElements() const { - const Type *Ty = getType(); - if (const auto *AT = dyn_cast<ArrayType>(Ty)) + Type *Ty = getType(); + if (auto *AT = dyn_cast<ArrayType>(Ty)) return AT->getNumElements(); - if (const auto *VT = dyn_cast<VectorType>(Ty)) + if (auto *VT = dyn_cast<VectorType>(Ty)) return VT->getNumElements(); return Ty->getStructNumElements(); } @@ -838,10 +837,10 @@ UndefValue *UndefValue::getElementValue(unsigned Idx) const { } unsigned UndefValue::getNumElements() const { - const Type *Ty = getType(); - if (const auto *AT = dyn_cast<ArrayType>(Ty)) + Type *Ty = getType(); + if (auto *AT = dyn_cast<ArrayType>(Ty)) return AT->getNumElements(); - if (const auto *VT = dyn_cast<VectorType>(Ty)) + if (auto *VT = dyn_cast<VectorType>(Ty)) return 
VT->getNumElements(); return Ty->getStructNumElements(); } @@ -858,6 +857,59 @@ static bool rangeOnlyContains(ItTy Start, ItTy End, EltTy Elt) { return true; } +template <typename SequentialTy, typename ElementTy> +static Constant *getIntSequenceIfElementsMatch(ArrayRef<Constant *> V) { + assert(!V.empty() && "Cannot get empty int sequence."); + + SmallVector<ElementTy, 16> Elts; + for (Constant *C : V) + if (auto *CI = dyn_cast<ConstantInt>(C)) + Elts.push_back(CI->getZExtValue()); + else + return nullptr; + return SequentialTy::get(V[0]->getContext(), Elts); +} + +template <typename SequentialTy, typename ElementTy> +static Constant *getFPSequenceIfElementsMatch(ArrayRef<Constant *> V) { + assert(!V.empty() && "Cannot get empty FP sequence."); + + SmallVector<ElementTy, 16> Elts; + for (Constant *C : V) + if (auto *CFP = dyn_cast<ConstantFP>(C)) + Elts.push_back(CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + else + return nullptr; + return SequentialTy::getFP(V[0]->getContext(), Elts); +} + +template <typename SequenceTy> +static Constant *getSequenceIfElementsMatch(Constant *C, + ArrayRef<Constant *> V) { + // We speculatively build the elements here even if it turns out that there is + // a constantexpr or something else weird, since it is so uncommon for that to + // happen. + if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { + if (CI->getType()->isIntegerTy(8)) + return getIntSequenceIfElementsMatch<SequenceTy, uint8_t>(V); + else if (CI->getType()->isIntegerTy(16)) + return getIntSequenceIfElementsMatch<SequenceTy, uint16_t>(V); + else if (CI->getType()->isIntegerTy(32)) + return getIntSequenceIfElementsMatch<SequenceTy, uint32_t>(V); + else if (CI->getType()->isIntegerTy(64)) + return getIntSequenceIfElementsMatch<SequenceTy, uint64_t>(V); + } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { + if (CFP->getType()->isHalfTy()) + return getFPSequenceIfElementsMatch<SequenceTy, uint16_t>(V); + else if (CFP->getType()->isFloatTy()) + return getFPSequenceIfElementsMatch<SequenceTy, uint32_t>(V); + else if (CFP->getType()->isDoubleTy()) + return getFPSequenceIfElementsMatch<SequenceTy, uint64_t>(V); + } + + return nullptr; +} + ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V) : Constant(T, ConstantArrayVal, OperandTraits<ConstantArray>::op_end(this) - V.size(), @@ -875,6 +927,7 @@ Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) { return C; return Ty->getContext().pImpl->ArrayConstants.getOrCreate(Ty, V); } + Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) { // Empty arrays are canonicalized to ConstantAggregateZero. if (V.empty()) @@ -897,74 +950,8 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
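The needsRelocation() rewrite above keeps the special case for a difference of two blockaddresses within one function: such a difference is a link-time constant even in position-independent code, unlike a raw blockaddress. That is the idiom the GNU labels-as-values extension produces for compact jump tables; a hedged sketch (GCC/Clang extension, not standard C++, not part of this patch):

    #include <cstdint>

    // A table of label differences (the "&&L1 - &&L0" pattern matched in
    // the hunk above) needs no load-time relocations; a table of raw
    // label addresses would.
    int dispatch(unsigned Op) {
      static const intptr_t Offsets[] = {&&Add - &&Add, &&Sub - &&Add};
      goto *(&&Add + Offsets[Op & 1]);
    Add:
      return 1;
    Sub:
      return -1;
    }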
- if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector<uint8_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector<uint16_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataArray::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch<ConstantDataArray>(C, V); // Otherwise, we really do want to create a ConstantArray. return nullptr; @@ -1060,6 +1047,7 @@ Constant *ConstantVector::get(ArrayRef<Constant*> V) { VectorType *Ty = VectorType::get(V.front()->getType(), V.size()); return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V); } + Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) { assert(!V.empty() && "Vectors can't be empty"); VectorType *T = VectorType::get(V.front()->getType(), V.size()); @@ -1085,74 +1073,8 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) { // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. - if (ConstantDataSequential::isElementTypeCompatible(C->getType())) { - // We speculatively build the elements here even if it turns out that there - // is a constantexpr or something else weird in the array, since it is so - // uncommon for that to happen. 
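The block removed above (and its twin for vectors in the next hunk) is what the new getSequenceIfElementsMatch() helpers replace: eight near-identical element-collection loops collapse into two templates. The observable behaviour is unchanged, so a homogeneous list of small integer constants still packs into a ConstantDataArray. A hedged usage sketch against LLVM-3.8-era headers (hypothetical helper name; not part of this patch):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    static Constant *makeByteArray(LLVMContext &Ctx) {
      Type *I8 = Type::getInt8Ty(Ctx);
      Constant *Elts[] = {ConstantInt::get(I8, 1), ConstantInt::get(I8, 2),
                          ConstantInt::get(I8, 3)};
      // All elements are i8 ConstantInts, so getImpl() takes the
      // getIntSequenceIfElementsMatch<ConstantDataArray, uint8_t> path
      // and returns a densely packed ConstantDataArray.
      return ConstantArray::get(ArrayType::get(I8, 3), Elts);
    }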
- if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { - if (CI->getType()->isIntegerTy(8)) { - SmallVector<uint8_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(16)) { - SmallVector<uint16_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(32)) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } else if (CI->getType()->isIntegerTy(64)) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i])) - Elts.push_back(CI->getZExtValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::get(C->getContext(), Elts); - } - } - - if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { - if (CFP->getType()->isFloatTy()) { - SmallVector<uint32_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } else if (CFP->getType()->isDoubleTy()) { - SmallVector<uint64_t, 16> Elts; - for (unsigned i = 0, e = V.size(); i != e; ++i) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i])) - Elts.push_back( - CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); - else - break; - if (Elts.size() == V.size()) - return ConstantDataVector::getFP(C->getContext(), Elts); - } - } - } + if (ConstantDataSequential::isElementTypeCompatible(C->getType())) + return getSequenceIfElementsMatch<ConstantDataVector>(C, V); // Otherwise, the element type isn't compatible with ConstantDataVector, or // the operand list constants a ConstantExpr or something else strange. @@ -1170,6 +1092,17 @@ Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) { return get(Elts); } +ConstantTokenNone *ConstantTokenNone::get(LLVMContext &Context) { + LLVMContextImpl *pImpl = Context.pImpl; + if (!pImpl->TheNoneToken) + pImpl->TheNoneToken.reset(new ConstantTokenNone(Context)); + return pImpl->TheNoneToken.get(); +} + +/// Remove the constant from the constant table. +void ConstantTokenNone::destroyConstantImpl() { + llvm_unreachable("You can't ConstantTokenNone->destroyConstantImpl()!"); +} // Utility function for determining if a ConstantExpr is a CastOp or not. 
This // can't be inline because we don't want to #include Instruction.h into @@ -1221,8 +1154,7 @@ ArrayRef<unsigned> ConstantExpr::getIndices() const { } unsigned ConstantExpr::getPredicate() const { - assert(isCompare()); - return ((const CompareConstantExpr*)this)->predicate; + return cast<CompareConstantExpr>(this)->predicate; } /// getWithOperandReplaced - Return a constant expression identical to this @@ -1245,7 +1177,7 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const { /// operands replaced with the specified values. The specified array must /// have the same number of operands as our current one. Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, - bool OnlyIfReduced) const { + bool OnlyIfReduced, Type *SrcTy) const { assert(Ops.size() == getNumOperands() && "Operand count mismatch!"); // If no operands changed return self. @@ -1283,10 +1215,13 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty, case Instruction::ShuffleVector: return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2], OnlyIfReducedTy); - case Instruction::GetElementPtr: - return ConstantExpr::getGetElementPtr(nullptr, Ops[0], Ops.slice(1), - cast<GEPOperator>(this)->isInBounds(), - OnlyIfReducedTy); + case Instruction::GetElementPtr: { + auto *GEPO = cast<GEPOperator>(this); + assert(SrcTy || (Ops[0]->getType() == getOperand(0)->getType())); + return ConstantExpr::getGetElementPtr( + SrcTy ? SrcTy : GEPO->getSourceElementType(), Ops[0], Ops.slice(1), + GEPO->isInBounds(), OnlyIfReducedTy); + } case Instruction::ICmp: case Instruction::FCmp: return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1], @@ -2430,9 +2365,9 @@ StringRef ConstantDataSequential::getRawDataValues() const { /// formed with a vector or array of the specified element type. /// ConstantDataArray only works with normal float and int types that are /// stored densely in memory, not with things like i42 or x86_f80. -bool ConstantDataSequential::isElementTypeCompatible(const Type *Ty) { - if (Ty->isFloatTy() || Ty->isDoubleTy()) return true; - if (const IntegerType *IT = dyn_cast<IntegerType>(Ty)) { +bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) { + if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true; + if (auto *IT = dyn_cast<IntegerType>(Ty)) { switch (IT->getBitWidth()) { case 8: case 16: @@ -2587,7 +2522,7 @@ Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) { /// object. 
Constant *ConstantDataArray::getFP(LLVMContext &Context, ArrayRef<uint16_t> Elts) { - Type *Ty = VectorType::get(Type::getHalfTy(Context), Elts.size()); + Type *Ty = ArrayType::get(Type::getHalfTy(Context), Elts.size()); const char *Data = reinterpret_cast<const char *>(Elts.data()); return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 2), Ty); } @@ -2703,6 +2638,11 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) { } if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) { + if (CFP->getType()->isHalfTy()) { + SmallVector<uint16_t, 16> Elts( + NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); + return getFP(V->getContext(), Elts); + } if (CFP->getType()->isFloatTy()) { SmallVector<uint32_t, 16> Elts( NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue()); @@ -2748,6 +2688,10 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { switch (getElementType()->getTypeID()) { default: llvm_unreachable("Accessor can only be used when element is float/double!"); + case Type::HalfTyID: { + auto EltVal = *reinterpret_cast<const uint16_t *>(EltPtr); + return APFloat(APFloat::IEEEhalf, APInt(16, EltVal)); + } case Type::FloatTyID: { auto EltVal = *reinterpret_cast<const uint32_t *>(EltPtr); return APFloat(APFloat::IEEEsingle, APInt(32, EltVal)); @@ -2782,7 +2726,8 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const { /// Note that this has to compute a new constant to return, so it isn't as /// efficient as getElementAsInteger/Float/Double. Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const { - if (getElementType()->isFloatTy() || getElementType()->isDoubleTy()) + if (getElementType()->isHalfTy() || getElementType()->isFloatTy() || + getElementType()->isDoubleTy()) return ConstantFP::get(getContext(), getElementAsAPFloat(Elt)); return ConstantInt::get(getElementType(), getElementAsInteger(Elt)); @@ -2872,6 +2817,11 @@ Value *ConstantFP::handleOperandChangeImpl(Value *From, Value *To, Use *U) { llvm_unreachable("Unsupported class for handleOperandChange()!"); } +Value *ConstantTokenNone::handleOperandChangeImpl(Value *From, Value *To, + Use *U) { + llvm_unreachable("Unsupported class for handleOperandChange()!"); +} + Value *UndefValue::handleOperandChangeImpl(Value *From, Value *To, Use *U) { llvm_unreachable("Unsupported class for handleOperandChange()!"); } @@ -3070,7 +3020,7 @@ Instruction *ConstantExpr::getAsInstruction() { case Instruction::ICmp: case Instruction::FCmp: return CmpInst::Create((Instruction::OtherOps)getOpcode(), - getPredicate(), Ops[0], Ops[1]); + (CmpInst::Predicate)getPredicate(), Ops[0], Ops[1]); default: assert(getNumOperands() == 2 && "Must be binary operator?"); diff --git a/contrib/llvm/lib/IR/ConstantsContext.h b/contrib/llvm/lib/IR/ConstantsContext.h index f3ddcd7..13fcbd2 100644 --- a/contrib/llvm/lib/IR/ConstantsContext.h +++ b/contrib/llvm/lib/IR/ConstantsContext.h @@ -179,6 +179,13 @@ public: /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::ExtractValue; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; /// InsertValueConstantExpr - This class is private to @@ -205,6 +212,13 @@ public: /// Transparently provide more efficient getOperand methods. 
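The getFP() change above is a genuine bug fix folded into the half-float support: the 16-bit overload built a VectorType even though this is ConstantDataArray, so half arrays came back mistyped as <N x half>. With the fix, the usual bit-pattern interface works for half too. A hedged sketch (LLVM-3.8-era headers assumed; not part of this patch):

    #include <cstdint>
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    static Constant *makeHalfArray(LLVMContext &Ctx) {
      // IEEE-754 binary16 bit patterns for 1.0 and 2.0.
      uint16_t Bits[] = {0x3C00, 0x4000};
      // Now yields a [2 x half] ConstantDataArray; before the fix the
      // result was mistyped as <2 x half>.
      return ConstantDataArray::getFP(Ctx, Bits);
    }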
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::InsertValue; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; /// GetElementPtrConstantExpr - This class is private to Constants.cpp, and is @@ -235,6 +249,13 @@ public: Type *getSourceElementType() const; /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::GetElementPtr; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; // CompareConstantExpr - This class is private to Constants.cpp, and is used @@ -257,6 +278,14 @@ public: } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + static bool classof(const ConstantExpr *CE) { + return CE->getOpcode() == Instruction::ICmp || + CE->getOpcode() == Instruction::FCmp; + } + static bool classof(const Value *V) { + return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)); + } }; template <> @@ -373,41 +402,45 @@ template <class ConstantClass> struct ConstantAggrKeyType { struct InlineAsmKeyType { StringRef AsmString; StringRef Constraints; + FunctionType *FTy; bool HasSideEffects; bool IsAlignStack; InlineAsm::AsmDialect AsmDialect; InlineAsmKeyType(StringRef AsmString, StringRef Constraints, - bool HasSideEffects, bool IsAlignStack, + FunctionType *FTy, bool HasSideEffects, bool IsAlignStack, InlineAsm::AsmDialect AsmDialect) - : AsmString(AsmString), Constraints(Constraints), + : AsmString(AsmString), Constraints(Constraints), FTy(FTy), HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack), AsmDialect(AsmDialect) {} InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl<Constant *> &) : AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()), - HasSideEffects(Asm->hasSideEffects()), + FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()), IsAlignStack(Asm->isAlignStack()), AsmDialect(Asm->getDialect()) {} bool operator==(const InlineAsmKeyType &X) const { return HasSideEffects == X.HasSideEffects && IsAlignStack == X.IsAlignStack && AsmDialect == X.AsmDialect && - AsmString == X.AsmString && Constraints == X.Constraints; + AsmString == X.AsmString && Constraints == X.Constraints && + FTy == X.FTy; } bool operator==(const InlineAsm *Asm) const { return HasSideEffects == Asm->hasSideEffects() && IsAlignStack == Asm->isAlignStack() && AsmDialect == Asm->getDialect() && AsmString == Asm->getAsmString() && - Constraints == Asm->getConstraintString(); + Constraints == Asm->getConstraintString() && + FTy == Asm->getFunctionType(); } unsigned getHash() const { return hash_combine(AsmString, Constraints, HasSideEffects, IsAlignStack, - AsmDialect); + AsmDialect, FTy); } typedef ConstantInfo<InlineAsm>::TypeClass TypeClass; InlineAsm *create(TypeClass *Ty) const { - return new InlineAsm(Ty, AsmString, Constraints, HasSideEffects, + assert(PointerType::getUnqual(FTy) == Ty); + return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects, IsAlignStack, AsmDialect); } }; diff --git a/contrib/llvm/lib/IR/Core.cpp b/contrib/llvm/lib/IR/Core.cpp index 0eb88a9..591dafa 100644 --- a/contrib/llvm/lib/IR/Core.cpp +++ b/contrib/llvm/lib/IR/Core.cpp @@ -262,6 +262,8 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { return LLVMVectorTypeKind; case 
Type::X86_MMXTyID: return LLVMX86_MMXTypeKind; + case Type::TokenTyID: + return LLVMTokenTypeKind; } llvm_unreachable("Unhandled TypeID."); } @@ -366,6 +368,9 @@ LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) { LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C)); } +LLVMTypeRef LLVMTokenTypeInContext(LLVMContextRef C) { + return (LLVMTypeRef) Type::getTokenTy(*unwrap(C)); +} LLVMTypeRef LLVMHalfType(void) { return LLVMHalfTypeInContext(LLVMGetGlobalContext()); @@ -1528,7 +1533,7 @@ LLVMValueRef LLVMGetFirstGlobal(LLVMModuleRef M) { Module::global_iterator I = Mod->global_begin(); if (I == Mod->global_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) { @@ -1536,23 +1541,23 @@ LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) { Module::global_iterator I = Mod->global_end(); if (I == Mod->global_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef LLVMGetNextGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar); - Module::global_iterator I = GV; + Module::global_iterator I(GV); if (++I == GV->getParent()->global_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar); - Module::global_iterator I = GV; + Module::global_iterator I(GV); if (I == GV->getParent()->global_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMDeleteGlobal(LLVMValueRef GlobalVar) { @@ -1639,7 +1644,8 @@ void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit) { LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, const char *Name) { auto *PTy = cast<PointerType>(unwrap(Ty)); - return wrap(GlobalAlias::create(PTy, GlobalValue::ExternalLinkage, Name, + return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + GlobalValue::ExternalLinkage, Name, unwrap<Constant>(Aliasee), unwrap(M))); } @@ -1660,7 +1666,7 @@ LLVMValueRef LLVMGetFirstFunction(LLVMModuleRef M) { Module::iterator I = Mod->begin(); if (I == Mod->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) { @@ -1668,23 +1674,23 @@ LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) { Module::iterator I = Mod->end(); if (I == Mod->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn) { Function *Func = unwrap<Function>(Fn); - Module::iterator I = Func; + Module::iterator I(Func); if (++I == Func->getParent()->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn) { Function *Func = unwrap<Function>(Fn); - Module::iterator I = Func; + Module::iterator I(Func); if (I == Func->getParent()->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMDeleteFunction(LLVMValueRef Fn) { @@ -1716,7 +1722,7 @@ void LLVMSetFunctionCallConv(LLVMValueRef Fn, unsigned CC) { const char *LLVMGetGC(LLVMValueRef Fn) { Function *F = unwrap<Function>(Fn); - return F->hasGC()? F->getGC() : nullptr; + return F->hasGC()? 
F->getGC().c_str() : nullptr; } void LLVMSetGC(LLVMValueRef Fn, const char *GC) { @@ -1779,14 +1785,14 @@ void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) { Function *Fn = unwrap<Function>(FnRef); for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; I++) - *ParamRefs++ = wrap(I); + *ParamRefs++ = wrap(&*I); } LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) { Function::arg_iterator AI = unwrap<Function>(FnRef)->arg_begin(); while (index --> 0) AI++; - return wrap(AI); + return wrap(&*AI); } LLVMValueRef LLVMGetParamParent(LLVMValueRef V) { @@ -1798,7 +1804,7 @@ LLVMValueRef LLVMGetFirstParam(LLVMValueRef Fn) { Function::arg_iterator I = Func->arg_begin(); if (I == Func->arg_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) { @@ -1806,23 +1812,23 @@ LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) { Function::arg_iterator I = Func->arg_end(); if (I == Func->arg_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) { Argument *A = unwrap<Argument>(Arg); - Function::arg_iterator I = A; + Function::arg_iterator I(A); if (++I == A->getParent()->arg_end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) { Argument *A = unwrap<Argument>(Arg); - Function::arg_iterator I = A; + Function::arg_iterator I(A); if (I == A->getParent()->arg_begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMAddAttribute(LLVMValueRef Arg, LLVMAttribute PA) { @@ -1880,7 +1886,7 @@ unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) { void LLVMGetBasicBlocks(LLVMValueRef FnRef, LLVMBasicBlockRef *BasicBlocksRefs){ Function *Fn = unwrap<Function>(FnRef); for (Function::iterator I = Fn->begin(), E = Fn->end(); I != E; I++) - *BasicBlocksRefs++ = wrap(I); + *BasicBlocksRefs++ = wrap(&*I); } LLVMBasicBlockRef LLVMGetEntryBasicBlock(LLVMValueRef Fn) { @@ -1892,7 +1898,7 @@ LLVMBasicBlockRef LLVMGetFirstBasicBlock(LLVMValueRef Fn) { Function::iterator I = Func->begin(); if (I == Func->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) { @@ -1900,23 +1906,23 @@ LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) { Function::iterator I = Func->end(); if (I == Func->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMBasicBlockRef LLVMGetNextBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); - Function::iterator I = Block; + Function::iterator I(Block); if (++I == Block->getParent()->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMBasicBlockRef LLVMGetPreviousBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); - Function::iterator I = Block; + Function::iterator I(Block); if (I == Block->getParent()->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMBasicBlockRef LLVMAppendBasicBlockInContext(LLVMContextRef C, @@ -1968,7 +1974,7 @@ LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB) { BasicBlock::iterator I = Block->begin(); if (I == Block->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) { @@ -1976,23 +1982,23 @@ LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) { BasicBlock::iterator I = Block->end(); if (I == Block->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } LLVMValueRef 
LLVMGetNextInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); - BasicBlock::iterator I = Instr; + BasicBlock::iterator I(Instr); if (++I == Instr->getParent()->end()) return nullptr; - return wrap(I); + return wrap(&*I); } LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap<Instruction>(Inst); - BasicBlock::iterator I = Instr; + BasicBlock::iterator I(Instr); if (I == Instr->getParent()->begin()) return nullptr; - return wrap(--I); + return wrap(&*--I); } void LLVMInstructionEraseFromParent(LLVMValueRef Inst) { @@ -2160,12 +2166,12 @@ void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block, LLVMValueRef Instr) { BasicBlock *BB = unwrap(Block); Instruction *I = Instr? unwrap<Instruction>(Instr) : (Instruction*) BB->end(); - unwrap(Builder)->SetInsertPoint(BB, I); + unwrap(Builder)->SetInsertPoint(BB, I->getIterator()); } void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr) { Instruction *I = unwrap<Instruction>(Instr); - unwrap(Builder)->SetInsertPoint(I->getParent(), I); + unwrap(Builder)->SetInsertPoint(I->getParent(), I->getIterator()); } void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block) { @@ -2489,7 +2495,6 @@ LLVMValueRef LLVMBuildFree(LLVMBuilderRef B, LLVMValueRef PointerVal) { CallInst::CreateFree(unwrap(PointerVal), unwrap(B)->GetInsertBlock()))); } - LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal, const char *Name) { return wrap(unwrap(B)->CreateLoad(unwrap(PointerVal), Name)); @@ -2515,6 +2520,21 @@ static AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) { llvm_unreachable("Invalid LLVMAtomicOrdering value!"); } +static LLVMAtomicOrdering mapToLLVMOrdering(AtomicOrdering Ordering) { + switch (Ordering) { + case NotAtomic: return LLVMAtomicOrderingNotAtomic; + case Unordered: return LLVMAtomicOrderingUnordered; + case Monotonic: return LLVMAtomicOrderingMonotonic; + case Acquire: return LLVMAtomicOrderingAcquire; + case Release: return LLVMAtomicOrderingRelease; + case AcquireRelease: return LLVMAtomicOrderingAcquireRelease; + case SequentiallyConsistent: + return LLVMAtomicOrderingSequentiallyConsistent; + } + + llvm_unreachable("Invalid AtomicOrdering value!"); +} + LLVMValueRef LLVMBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering, LLVMBool isSingleThread, const char *Name) { return wrap( @@ -2567,6 +2587,25 @@ void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) { return cast<StoreInst>(P)->setVolatile(isVolatile); } +LLVMAtomicOrdering LLVMGetOrdering(LLVMValueRef MemAccessInst) { + Value *P = unwrap<Value>(MemAccessInst); + AtomicOrdering O; + if (LoadInst *LI = dyn_cast<LoadInst>(P)) + O = LI->getOrdering(); + else + O = cast<StoreInst>(P)->getOrdering(); + return mapToLLVMOrdering(O); +} + +void LLVMSetOrdering(LLVMValueRef MemAccessInst, LLVMAtomicOrdering Ordering) { + Value *P = unwrap<Value>(MemAccessInst); + AtomicOrdering O = mapFromLLVMOrdering(Ordering); + + if (LoadInst *LI = dyn_cast<LoadInst>(P)) + return LI->setOrdering(O); + return cast<StoreInst>(P)->setOrdering(O); +} + /*--.. 
Casts ...............................................................--*/ LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val, diff --git a/contrib/llvm/lib/IR/DIBuilder.cpp b/contrib/llvm/lib/IR/DIBuilder.cpp index 2a90e70..b7841fe 100644 --- a/contrib/llvm/lib/IR/DIBuilder.cpp +++ b/contrib/llvm/lib/IR/DIBuilder.cpp @@ -148,7 +148,7 @@ DICompileUnit *DIBuilder::createCompileUnit( CUNode = DICompileUnit::getDistinct( VMContext, Lang, DIFile::get(VMContext, Filename, Directory), Producer, isOptimized, Flags, RunTimeVer, SplitName, Kind, nullptr, - nullptr, nullptr, nullptr, nullptr, DWOId); + nullptr, nullptr, nullptr, nullptr, nullptr, DWOId); // Create a named metadata so that it is easier to find cu in a module. // Note that we only generate this when the caller wants to actually @@ -255,10 +255,12 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy, DITypeRef::get(Base)); } -DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy) { +DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy, + uint64_t SizeInBits, + uint64_t AlignInBits) { assert(RTy && "Unable to create reference type"); return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, - DITypeRef::get(RTy), 0, 0, 0, 0); + DITypeRef::get(RTy), SizeInBits, AlignInBits, 0, 0); } DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name, @@ -429,12 +431,23 @@ DICompositeType *DIBuilder::createUnionType( return R; } -DISubroutineType *DIBuilder::createSubroutineType(DIFile *File, - DITypeRefArray ParameterTypes, +DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes, unsigned Flags) { return DISubroutineType::get(VMContext, Flags, ParameterTypes); } +DICompositeType *DIBuilder::createExternalTypeRef(unsigned Tag, DIFile *File, + StringRef UniqueIdentifier) { + assert(!UniqueIdentifier.empty() && "external type ref without uid"); + auto *CTy = + DICompositeType::get(VMContext, Tag, "", nullptr, 0, nullptr, nullptr, 0, + 0, 0, DINode::FlagExternalTypeRef, nullptr, 0, + nullptr, nullptr, UniqueIdentifier); + // Types with unique IDs need to be in the type map. + retainType(CTy); + return CTy; +} + DICompositeType *DIBuilder::createEnumerationType( DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber, uint64_t SizeInBits, uint64_t AlignInBits, DINodeArray Elements, @@ -590,18 +603,20 @@ DIGlobalVariable *DIBuilder::createTempGlobalVariableFwdDecl( .release(); } -DILocalVariable *DIBuilder::createLocalVariable( - unsigned Tag, DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNo, - DIType *Ty, bool AlwaysPreserve, unsigned Flags, unsigned ArgNo) { +static DILocalVariable *createLocalVariable( + LLVMContext &VMContext, + DenseMap<MDNode *, std::vector<TrackingMDNodeRef>> &PreservedVariables, + DIScope *Scope, StringRef Name, unsigned ArgNo, DIFile *File, + unsigned LineNo, DIType *Ty, bool AlwaysPreserve, unsigned Flags) { // FIXME: Why getNonCompileUnitScope()? // FIXME: Why is "!Context" okay here? // FIXME: Why doesn't this check for a subprogram or lexical block (AFAICT // the only valid scopes)? DIScope *Context = getNonCompileUnitScope(Scope); - auto *Node = DILocalVariable::get( - VMContext, Tag, cast_or_null<DILocalScope>(Context), Name, File, LineNo, - DITypeRef::get(Ty), ArgNo, Flags); + auto *Node = + DILocalVariable::get(VMContext, cast_or_null<DILocalScope>(Context), Name, + File, LineNo, DITypeRef::get(Ty), ArgNo, Flags); if (AlwaysPreserve) { // The optimizer may remove local variables. 
If there is an interest // to preserve variable info in such situation then stash it in a @@ -613,6 +628,23 @@ DILocalVariable *DIBuilder::createLocalVariable( return Node; } +DILocalVariable *DIBuilder::createAutoVariable(DIScope *Scope, StringRef Name, + DIFile *File, unsigned LineNo, + DIType *Ty, bool AlwaysPreserve, + unsigned Flags) { + return createLocalVariable(VMContext, PreservedVariables, Scope, Name, + /* ArgNo */ 0, File, LineNo, Ty, AlwaysPreserve, + Flags); +} + +DILocalVariable *DIBuilder::createParameterVariable( + DIScope *Scope, StringRef Name, unsigned ArgNo, DIFile *File, + unsigned LineNo, DIType *Ty, bool AlwaysPreserve, unsigned Flags) { + assert(ArgNo && "Expected non-zero argument number for parameter"); + return createLocalVariable(VMContext, PreservedVariables, Scope, Name, ArgNo, + File, LineNo, Ty, AlwaysPreserve, Flags); +} + DIExpression *DIBuilder::createExpression(ArrayRef<uint64_t> Addr) { return DIExpression::get(VMContext, Addr); } @@ -629,36 +661,37 @@ DIExpression *DIBuilder::createBitPieceExpression(unsigned OffsetInBytes, return DIExpression::get(VMContext, Addr); } -DISubprogram *DIBuilder::createFunction(DIScopeRef Context, StringRef Name, - StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, - bool isLocalToUnit, bool isDefinition, - unsigned ScopeLine, unsigned Flags, - bool isOptimized, Function *Fn, - MDNode *TParams, MDNode *Decl) { +DISubprogram *DIBuilder::createFunction( + DIScopeRef Context, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, + bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized, + DITemplateParameterArray TParams, DISubprogram *Decl) { // dragonegg does not generate identifier for types, so using an empty map // to resolve the context should be fine. DITypeIdentifierMap EmptyMap; return createFunction(Context.resolve(EmptyMap), Name, LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, - Flags, isOptimized, Fn, TParams, Decl); -} - -DISubprogram *DIBuilder::createFunction(DIScope *Context, StringRef Name, - StringRef LinkageName, DIFile *File, - unsigned LineNo, DISubroutineType *Ty, - bool isLocalToUnit, bool isDefinition, - unsigned ScopeLine, unsigned Flags, - bool isOptimized, Function *Fn, - MDNode *TParams, MDNode *Decl) { - assert(Ty->getTag() == dwarf::DW_TAG_subroutine_type && - "function types should be subroutines"); - auto *Node = DISubprogram::get( - VMContext, DIScopeRef::get(getNonCompileUnitScope(Context)), Name, - LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, - nullptr, 0, 0, Flags, isOptimized, Fn, cast_or_null<MDTuple>(TParams), - cast_or_null<DISubprogram>(Decl), - MDTuple::getTemporary(VMContext, None).release()); + Flags, isOptimized, TParams, Decl); +} + +template <class... Ts> +static DISubprogram *getSubprogram(bool IsDistinct, Ts &&... 
Args) { + if (IsDistinct) + return DISubprogram::getDistinct(std::forward<Ts>(Args)...); + return DISubprogram::get(std::forward<Ts>(Args)...); +} + +DISubprogram *DIBuilder::createFunction( + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, + bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized, + DITemplateParameterArray TParams, DISubprogram *Decl) { + auto *Node = + getSubprogram(/* IsDistinct = */ isDefinition, VMContext, + DIScopeRef::get(getNonCompileUnitScope(Context)), Name, + LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, + ScopeLine, nullptr, 0, 0, Flags, isOptimized, TParams, Decl, + MDTuple::getTemporary(VMContext, None).release()); if (isDefinition) AllSubprograms.push_back(Node); @@ -670,12 +703,11 @@ DISubprogram *DIBuilder::createTempFunctionFwdDecl( DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, bool isDefinition, unsigned ScopeLine, unsigned Flags, bool isOptimized, - Function *Fn, MDNode *TParams, MDNode *Decl) { + DITemplateParameterArray TParams, DISubprogram *Decl) { return DISubprogram::getTemporary( VMContext, DIScopeRef::get(getNonCompileUnitScope(Context)), Name, LinkageName, File, LineNo, Ty, isLocalToUnit, isDefinition, - ScopeLine, nullptr, 0, 0, Flags, isOptimized, Fn, - cast_or_null<MDTuple>(TParams), cast_or_null<DISubprogram>(Decl), + ScopeLine, nullptr, 0, 0, Flags, isOptimized, TParams, Decl, nullptr) .release(); } @@ -685,18 +717,16 @@ DIBuilder::createMethod(DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F, unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit, bool isDefinition, unsigned VK, unsigned VIndex, DIType *VTableHolder, unsigned Flags, - bool isOptimized, Function *Fn, MDNode *TParam) { - assert(Ty->getTag() == dwarf::DW_TAG_subroutine_type && - "function types should be subroutines"); + bool isOptimized, DITemplateParameterArray TParams) { assert(getNonCompileUnitScope(Context) && "Methods should have both a Context and a context that isn't " "the compile unit."); // FIXME: Do we want to use different scope/lines? - auto *SP = DISubprogram::get( - VMContext, DIScopeRef::get(cast<DIScope>(Context)), Name, LinkageName, F, - LineNo, Ty, isLocalToUnit, isDefinition, LineNo, - DITypeRef::get(VTableHolder), VK, VIndex, Flags, isOptimized, Fn, - cast_or_null<MDTuple>(TParam), nullptr, nullptr); + auto *SP = getSubprogram( + /* IsDistinct = */ isDefinition, VMContext, + DIScopeRef::get(cast<DIScope>(Context)), Name, LinkageName, F, LineNo, Ty, + isLocalToUnit, isDefinition, LineNo, DITypeRef::get(VTableHolder), VK, + VIndex, Flags, isOptimized, TParams, nullptr, nullptr); if (isDefinition) AllSubprograms.push_back(SP); diff --git a/contrib/llvm/lib/IR/DataLayout.cpp b/contrib/llvm/lib/IR/DataLayout.cpp index 4d867ef..5468f47 100644 --- a/contrib/llvm/lib/IR/DataLayout.cpp +++ b/contrib/llvm/lib/IR/DataLayout.cpp @@ -41,6 +41,7 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { assert(!ST->isOpaque() && "Cannot get layout of opaque structs"); StructAlignment = 0; StructSize = 0; + IsPadded = false; NumElements = ST->getNumElements(); // Loop over each of the elements, placing them in memory. @@ -49,8 +50,10 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { unsigned TyAlign = ST->isPacked() ? 1 : DL.getABITypeAlignment(Ty); // Add padding if necessary to align the data element properly. 
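The DataLayout.cpp hunk continuing just below adds an IsPadded flag to StructLayout, set whenever the running offset must be rounded up for a member or for the struct tail. A scalar model of that computation for a hypothetical { i8, i32 } struct (standalone C++; a sketch, not part of this patch; see the hunk below for the real code):

    #include <cassert>
    #include <cstdint>

    static uint64_t roundUp(uint64_t V, uint64_t Align) {
      return (V + Align - 1) & ~(Align - 1);
    }

    int main() {
      // Model { i8, i32 } with natural alignment.
      const uint64_t Sizes[] = {1, 4}, Aligns[] = {1, 4};
      uint64_t StructSize = 0;
      bool IsPadded = false;
      for (int I = 0; I != 2; ++I) {
        if (StructSize & (Aligns[I] - 1)) { // member padding
          IsPadded = true;
          StructSize = roundUp(StructSize, Aligns[I]);
        }
        StructSize += Sizes[I];
      }
      if (StructSize & (4 - 1)) {           // tail padding
        IsPadded = true;
        StructSize = roundUp(StructSize, 4);
      }
      assert(StructSize == 8 && IsPadded);  // 3 bytes of padding
    }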
- if ((StructSize & (TyAlign-1)) != 0) + if ((StructSize & (TyAlign-1)) != 0) { + IsPadded = true; StructSize = RoundUpToAlignment(StructSize, TyAlign); + } // Keep track of maximum alignment constraint. StructAlignment = std::max(TyAlign, StructAlignment); @@ -64,8 +67,10 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) { // Add padding to the end of the struct so that it could be put in an array // and all array elements would be aligned correctly. - if ((StructSize & (StructAlignment-1)) != 0) + if ((StructSize & (StructAlignment-1)) != 0) { + IsPadded = true; StructSize = RoundUpToAlignment(StructSize, StructAlignment); + } } @@ -461,8 +466,8 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign; // The best match so far depends on what we're looking for. - if (AlignType == INTEGER_ALIGN && - Alignments[i].AlignType == INTEGER_ALIGN) { + if (AlignType == INTEGER_ALIGN && + Alignments[i].AlignType == INTEGER_ALIGN) { // The "best match" for integers is the smallest size that is larger than // the BitWidth requested. if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 || diff --git a/contrib/llvm/lib/IR/DebugInfo.cpp b/contrib/llvm/lib/IR/DebugInfo.cpp index 9646d1a..a2443be 100644 --- a/contrib/llvm/lib/IR/DebugInfo.cpp +++ b/contrib/llvm/lib/IR/DebugInfo.cpp @@ -56,21 +56,6 @@ DISubprogram *llvm::getDISubprogram(const Function *F) { return nullptr; } -DICompositeTypeBase *llvm::getDICompositeType(DIType *T) { - if (auto *C = dyn_cast_or_null<DICompositeTypeBase>(T)) - return C; - - if (auto *D = dyn_cast_or_null<DIDerivedTypeBase>(T)) { - // This function is currently used by dragonegg and dragonegg does - // not generate identifier for types, so using an empty map to resolve - // DerivedFrom should be fine. 
- DITypeIdentifierMap EmptyMap; - return getDICompositeType(D->getBaseType().resolve(EmptyMap)); - } - - return nullptr; -} - DITypeIdentifierMap llvm::generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes) { DITypeIdentifierMap Map; @@ -164,20 +149,22 @@ void DebugInfoFinder::processType(DIType *DT) { if (!addType(DT)) return; processScope(DT->getScope().resolve(TypeIdentifierMap)); - if (auto *DCT = dyn_cast<DICompositeTypeBase>(DT)) { + if (auto *ST = dyn_cast<DISubroutineType>(DT)) { + for (DITypeRef Ref : ST->getTypeArray()) + processType(Ref.resolve(TypeIdentifierMap)); + return; + } + if (auto *DCT = dyn_cast<DICompositeType>(DT)) { processType(DCT->getBaseType().resolve(TypeIdentifierMap)); - if (auto *ST = dyn_cast<DISubroutineType>(DCT)) { - for (DITypeRef Ref : ST->getTypeArray()) - processType(Ref.resolve(TypeIdentifierMap)); - return; - } for (Metadata *D : DCT->getElements()) { if (auto *T = dyn_cast<DIType>(D)) processType(T); else if (auto *SP = dyn_cast<DISubprogram>(D)) processSubprogram(SP); } - } else if (auto *DDT = dyn_cast<DIDerivedTypeBase>(DT)) { + return; + } + if (auto *DDT = dyn_cast<DIDerivedType>(DT)) { processType(DDT->getBaseType().resolve(TypeIdentifierMap)); } } @@ -313,6 +300,10 @@ bool DebugInfoFinder::addScope(DIScope *Scope) { bool llvm::stripDebugInfo(Function &F) { bool Changed = false; + if (F.getSubprogram()) { + Changed = true; + F.setSubprogram(nullptr); + } for (BasicBlock &BB : F) { for (Instruction &I : BB) { if (I.getDebugLoc()) { @@ -349,7 +340,7 @@ bool llvm::StripDebugInfo(Module &M) { for (Module::named_metadata_iterator NMI = M.named_metadata_begin(), NME = M.named_metadata_end(); NMI != NME;) { - NamedMDNode *NMD = NMI; + NamedMDNode *NMD = &*NMI; ++NMI; if (NMD->getName().startswith("llvm.dbg.")) { NMD->eraseFromParent(); @@ -372,21 +363,3 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) { return Val->getZExtValue(); return 0; } - -DenseMap<const llvm::Function *, DISubprogram *> -llvm::makeSubprogramMap(const Module &M) { - DenseMap<const Function *, DISubprogram *> R; - - NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu"); - if (!CU_Nodes) - return R; - - for (MDNode *N : CU_Nodes->operands()) { - auto *CUNode = cast<DICompileUnit>(N); - for (auto *SP : CUNode->getSubprograms()) { - if (Function *F = SP->getFunction()) - R.insert(std::make_pair(F, SP)); - } - } - return R; -} diff --git a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp index 5e01748..58e0abd 100644 --- a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp @@ -295,8 +295,7 @@ DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, StorageType Storage, bool ShouldCreate) { DEFINE_GETIMPL_LOOKUP(DISubroutineType, (Flags, TypeArray)); - Metadata *Ops[] = {nullptr, nullptr, nullptr, nullptr, - TypeArray, nullptr, nullptr, nullptr}; + Metadata *Ops[] = {nullptr, nullptr, nullptr, TypeArray}; DEFINE_GETIMPL_STORE(DISubroutineType, (Flags), Ops); } @@ -316,22 +315,20 @@ DICompileUnit *DICompileUnit::getImpl( unsigned RuntimeVersion, MDString *SplitDebugFilename, unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes, Metadata *Subprograms, Metadata *GlobalVariables, - Metadata *ImportedEntities, uint64_t DWOId, + Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId, StorageType Storage, bool ShouldCreate) { + assert(Storage != Uniqued && "Cannot unique DICompileUnit"); assert(isCanonical(Producer) && "Expected canonical MDString"); 
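The DebugInfo.cpp hunks above remove makeSubprogramMap() and teach stripDebugInfo() to clear the new function-attached subprogram: with this import, the Function-to-DISubprogram link lives on the function itself rather than in a side table. A hedged sketch of the replacement pattern (LLVM-3.8-era headers assumed; hypothetical helper name; not part of this patch):

    #include "llvm/IR/DebugInfoMetadata.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Former users of makeSubprogramMap() can query the attachment
    // directly instead of building a map over the whole module.
    static DISubprogram *subprogramFor(const Function &F) {
      return F.getSubprogram();
    }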
assert(isCanonical(Flags) && "Expected canonical MDString"); assert(isCanonical(SplitDebugFilename) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP( - DICompileUnit, - (SourceLanguage, File, getString(Producer), IsOptimized, getString(Flags), - RuntimeVersion, getString(SplitDebugFilename), EmissionKind, EnumTypes, - RetainedTypes, Subprograms, GlobalVariables, ImportedEntities, DWOId)); + Metadata *Ops[] = {File, Producer, Flags, SplitDebugFilename, EnumTypes, RetainedTypes, Subprograms, GlobalVariables, - ImportedEntities}; - DEFINE_GETIMPL_STORE( - DICompileUnit, - (SourceLanguage, IsOptimized, RuntimeVersion, EmissionKind, DWOId), Ops); + ImportedEntities, Macros}; + return storeImpl(new (ArrayRef<Metadata *>(Ops).size()) DICompileUnit( + Context, Storage, SourceLanguage, IsOptimized, + RuntimeVersion, EmissionKind, DWOId, Ops), + Storage); } DISubprogram *DILocalScope::getSubprogram() const { @@ -345,34 +342,28 @@ DISubprogram *DISubprogram::getImpl( MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex, - unsigned Flags, bool IsOptimized, Metadata *Function, - Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables, - StorageType Storage, bool ShouldCreate) { + unsigned Flags, bool IsOptimized, Metadata *TemplateParams, + Metadata *Declaration, Metadata *Variables, StorageType Storage, + bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); assert(isCanonical(LinkageName) && "Expected canonical MDString"); DEFINE_GETIMPL_LOOKUP(DISubprogram, (Scope, getString(Name), getString(LinkageName), File, Line, Type, IsLocalToUnit, IsDefinition, ScopeLine, ContainingType, Virtuality, VirtualIndex, Flags, - IsOptimized, Function, TemplateParams, Declaration, - Variables)); - Metadata *Ops[] = {File, Scope, Name, Name, - LinkageName, Type, ContainingType, Function, - TemplateParams, Declaration, Variables}; + IsOptimized, TemplateParams, Declaration, Variables)); + Metadata *Ops[] = {File, Scope, Name, Name, + LinkageName, Type, ContainingType, TemplateParams, + Declaration, Variables}; DEFINE_GETIMPL_STORE(DISubprogram, (Line, ScopeLine, Virtuality, VirtualIndex, Flags, IsLocalToUnit, IsDefinition, IsOptimized), Ops); } -Function *DISubprogram::getFunction() const { - // FIXME: Should this be looking through bitcasts? - return dyn_cast_or_null<Function>(getFunctionConstant()); -} - bool DISubprogram::describes(const Function *F) const { assert(F && "Invalid function"); - if (F == getFunction()) + if (F->getSubprogram() == this) return true; StringRef Name = getLinkageName(); if (Name.empty()) @@ -380,15 +371,13 @@ bool DISubprogram::describes(const Function *F) const { return F->getName() == Name; } -void DISubprogram::replaceFunction(Function *F) { - replaceFunction(F ? ConstantAsMetadata::get(F) - : static_cast<ConstantAsMetadata *>(nullptr)); -} - DILexicalBlock *DILexicalBlock::getImpl(LLVMContext &Context, Metadata *Scope, Metadata *File, unsigned Line, unsigned Column, StorageType Storage, bool ShouldCreate) { + // Fixup column. 
+ adjustColumn(Column); + assert(Scope && "Expected scope"); DEFINE_GETIMPL_LOOKUP(DILexicalBlock, (Scope, File, Line, Column)); Metadata *Ops[] = {File, Scope}; @@ -467,21 +456,21 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, Ops); } -DILocalVariable *DILocalVariable::getImpl(LLVMContext &Context, unsigned Tag, - Metadata *Scope, MDString *Name, - Metadata *File, unsigned Line, - Metadata *Type, unsigned Arg, - unsigned Flags, StorageType Storage, +DILocalVariable *DILocalVariable::getImpl(LLVMContext &Context, Metadata *Scope, + MDString *Name, Metadata *File, + unsigned Line, Metadata *Type, + unsigned Arg, unsigned Flags, + StorageType Storage, bool ShouldCreate) { // 64K ought to be enough for any frontend. assert(Arg <= UINT16_MAX && "Expected argument number to fit in 16-bits"); assert(Scope && "Expected scope"); assert(isCanonical(Name) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DILocalVariable, (Tag, Scope, getString(Name), File, - Line, Type, Arg, Flags)); + DEFINE_GETIMPL_LOOKUP(DILocalVariable, + (Scope, getString(Name), File, Line, Type, Arg, Flags)); Metadata *Ops[] = {Scope, Name, File, Type}; - DEFINE_GETIMPL_STORE(DILocalVariable, (Tag, Line, Arg, Flags), Ops); + DEFINE_GETIMPL_STORE(DILocalVariable, (Line, Arg, Flags), Ops); } DIExpression *DIExpression::getImpl(LLVMContext &Context, @@ -496,6 +485,7 @@ unsigned DIExpression::ExprOperand::getSize() const { case dwarf::DW_OP_bit_piece: return 3; case dwarf::DW_OP_plus: + case dwarf::DW_OP_minus: return 2; default: return 1; @@ -516,6 +506,7 @@ bool DIExpression::isValid() const { // Piece expressions must be at the end. return I->get() + I->getSize() == E->get(); case dwarf::DW_OP_plus: + case dwarf::DW_OP_minus: case dwarf::DW_OP_deref: break; } @@ -566,3 +557,24 @@ DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag, Metadata *Ops[] = {Scope, Entity, Name}; DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops); } + +DIMacro *DIMacro::getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, MDString *Name, MDString *Value, + StorageType Storage, bool ShouldCreate) { + assert(isCanonical(Name) && "Expected canonical MDString"); + DEFINE_GETIMPL_LOOKUP(DIMacro, + (MIType, Line, getString(Name), getString(Value))); + Metadata *Ops[] = { Name, Value }; + DEFINE_GETIMPL_STORE(DIMacro, (MIType, Line), Ops); +} + +DIMacroFile *DIMacroFile::getImpl(LLVMContext &Context, unsigned MIType, + unsigned Line, Metadata *File, + Metadata *Elements, StorageType Storage, + bool ShouldCreate) { + DEFINE_GETIMPL_LOOKUP(DIMacroFile, + (MIType, Line, File, Elements)); + Metadata *Ops[] = { File, Elements }; + DEFINE_GETIMPL_STORE(DIMacroFile, (MIType, Line), Ops); +} + diff --git a/contrib/llvm/lib/IR/DiagnosticInfo.cpp b/contrib/llvm/lib/IR/DiagnosticInfo.cpp index b8f77ed..6426f76 100644 --- a/contrib/llvm/lib/IR/DiagnosticInfo.cpp +++ b/contrib/llvm/lib/IR/DiagnosticInfo.cpp @@ -49,7 +49,7 @@ struct PassRemarksOpt { "' in -pass-remarks: " + RegexError, false); } - }; + } }; static PassRemarksOpt PassRemarksOptLoc; @@ -91,6 +91,8 @@ int llvm::getNextAvailablePluginDiagnosticKind() { return ++PluginKindID; } +const char *DiagnosticInfo::AlwaysPrint = ""; + DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I, const Twine &MsgStr, DiagnosticSeverity Severity) @@ -121,9 +123,17 @@ void DiagnosticInfoDebugMetadataVersion::print(DiagnosticPrinter &DP) const { } void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const { - if 
(getFileName() && getLineNum() > 0) - DP << getFileName() << ":" << getLineNum() << ": "; - else if (getFileName()) + if (!FileName.empty()) { + DP << getFileName(); + if (LineNum > 0) + DP << ":" << getLineNum(); + DP << ": "; + } + DP << getMsg(); +} + +void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const { + if (getFileName()) DP << getFileName() << ": "; DP << getMsg(); } @@ -166,8 +176,9 @@ bool DiagnosticInfoOptimizationRemarkMissed::isEnabled() const { } bool DiagnosticInfoOptimizationRemarkAnalysis::isEnabled() const { - return PassRemarksAnalysisOptLoc.Pattern && - PassRemarksAnalysisOptLoc.Pattern->match(getPassName()); + return getPassName() == DiagnosticInfo::AlwaysPrint || + (PassRemarksAnalysisOptLoc.Pattern && + PassRemarksAnalysisOptLoc.Pattern->match(getPassName())); } void DiagnosticInfoMIRParser::print(DiagnosticPrinter &DP) const { @@ -196,6 +207,24 @@ void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx, DiagnosticInfoOptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg)); } +void llvm::emitOptimizationRemarkAnalysisFPCommute(LLVMContext &Ctx, + const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemarkAnalysisFPCommute(PassName, Fn, + DLoc, Msg)); +} + +void llvm::emitOptimizationRemarkAnalysisAliasing(LLVMContext &Ctx, + const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemarkAnalysisAliasing(PassName, Fn, + DLoc, Msg)); +} + bool DiagnosticInfoOptimizationFailure::isEnabled() const { // Only print warnings. return getSeverity() == DS_Warning; diff --git a/contrib/llvm/lib/IR/Dominators.cpp b/contrib/llvm/lib/IR/Dominators.cpp index b6a8bbc..b9d4fb7 100644 --- a/contrib/llvm/lib/IR/Dominators.cpp +++ b/contrib/llvm/lib/IR/Dominators.cpp @@ -91,10 +91,10 @@ bool DominatorTree::dominates(const Instruction *Def, if (Def == User) return false; - // The value defined by an invoke dominates an instruction only if - // it dominates every instruction in UseBB. - // A PHI is dominated only if the instruction dominates every possible use - // in the UseBB. + // The value defined by an invoke dominates an instruction only if it + // dominates every instruction in UseBB. + // A PHI is dominated only if the instruction dominates every possible use in + // the UseBB. if (isa<InvokeInst>(Def) || isa<PHINode>(User)) return dominates(Def, UseBB); @@ -126,15 +126,15 @@ bool DominatorTree::dominates(const Instruction *Def, if (DefBB == UseBB) return false; - const InvokeInst *II = dyn_cast<InvokeInst>(Def); - if (!II) - return dominates(DefBB, UseBB); - // Invoke results are only usable in the normal destination, not in the // exceptional destination. - BasicBlock *NormalDest = II->getNormalDest(); - BasicBlockEdge E(DefBB, NormalDest); - return dominates(E, UseBB); + if (const auto *II = dyn_cast<InvokeInst>(Def)) { + BasicBlock *NormalDest = II->getNormalDest(); + BasicBlockEdge E(DefBB, NormalDest); + return dominates(E, UseBB); + } + + return dominates(DefBB, UseBB); } bool DominatorTree::dominates(const BasicBlockEdge &BBE, @@ -142,7 +142,8 @@ bool DominatorTree::dominates(const BasicBlockEdge &BBE, // Assert that we have a single edge. We could handle them by simply // returning false, but since isSingleEdge is linear on the number of // edges, the callers can normally handle them more efficiently. 
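The invoke case rewritten above has a consequence worth spelling out: an invoke's result is only available along the edge to its normal destination. A small sketch against the DominatorTree API used in the hunk (invokeResultUsableAt is an illustrative helper):

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// True if the value computed by II may be used inside I's block. Dominance
// is tested on the normal-destination edge, never on the invoke's own
// block, matching DominatorTree::dominates above.
static bool invokeResultUsableAt(DominatorTree &DT, const InvokeInst *II,
                                 const Instruction *I) {
  BasicBlockEdge E(II->getParent(), II->getNormalDest());
  return DT.dominates(E, I->getParent());
}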
- assert(BBE.isSingleEdge()); + assert(BBE.isSingleEdge() && + "This function is not efficient in handling multiple edges"); // If the BB the edge ends in doesn't dominate the use BB, then the // edge also doesn't. @@ -192,7 +193,8 @@ bool DominatorTree::dominates(const BasicBlockEdge &BBE, const Use &U) const { // Assert that we have a single edge. We could handle them by simply // returning false, but since isSingleEdge is linear on the number of // edges, the callers can normally handle them more efficiently. - assert(BBE.isSingleEdge()); + assert(BBE.isSingleEdge() && + "This function is not efficient in handling multiple edges"); Instruction *UserInst = cast<Instruction>(U.getUser()); // A PHI in the end of the edge is dominated by it. @@ -232,8 +234,8 @@ bool DominatorTree::dominates(const Instruction *Def, const Use &U) const { if (!isReachableFromEntry(DefBB)) return false; - // Invoke instructions define their return values on the edges - // to their normal successors, so we have to handle them specially. + // Invoke instructions define their return values on the edges to their normal + // successors, so we have to handle them specially. // Among other things, this means they don't dominate anything in // their own block, except possibly a phi, so we don't need to // walk the block in any case. diff --git a/contrib/llvm/lib/IR/Function.cpp b/contrib/llvm/lib/IR/Function.cpp index b50ad12..cfdfc40 100644 --- a/contrib/llvm/lib/IR/Function.cpp +++ b/contrib/llvm/lib/IR/Function.cpp @@ -35,8 +35,8 @@ using namespace llvm; // Explicit instantiations of SymbolTableListTraits since some of the methods // are not in the public header file... -template class llvm::SymbolTableListTraits<Argument, Function>; -template class llvm::SymbolTableListTraits<BasicBlock, Function>; +template class llvm::SymbolTableListTraits<Argument>; +template class llvm::SymbolTableListTraits<BasicBlock>; //===----------------------------------------------------------------------===// // Argument Implementation @@ -235,11 +235,11 @@ Type *Function::getReturnType() const { } void Function::removeFromParent() { - getParent()->getFunctionList().remove(this); + getParent()->getFunctionList().remove(getIterator()); } void Function::eraseFromParent() { - getParent()->getFunctionList().erase(this); + getParent()->getFunctionList().erase(getIterator()); } //===----------------------------------------------------------------------===// @@ -248,7 +248,7 @@ void Function::eraseFromParent() { Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name, Module *ParentModule) - : GlobalObject(PointerType::getUnqual(Ty), Value::FunctionVal, + : GlobalObject(Ty, Value::FunctionVal, OperandTraits<Function>::op_begin(this), 0, Linkage, name), Ty(Ty) { assert(FunctionType::isValidReturnType(getReturnType()) && @@ -279,9 +279,6 @@ Function::~Function() { // Remove the function from the on-the-side GC table. clearGC(); - - // FIXME: needed by operator delete - setFunctionNumOperands(1); } void Function::BuildLazyArguments() const { @@ -328,14 +325,15 @@ void Function::dropAllReferences() { while (!BasicBlocks.empty()) BasicBlocks.begin()->eraseFromParent(); - // Prefix and prologue data are stored in a side table. - setPrefixData(nullptr); - setPrologueData(nullptr); + // Drop uses of any optional data (real or placeholder). + if (getNumOperands()) { + User::dropAllReferences(); + setNumHungOffUseOperands(0); + setValueSubclassData(getSubclassDataFromValue() & ~0xe); + } // Metadata is stored in a side-table. 
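The `& ~0xe` in dropAllReferences() above clears exactly the three "has optional data" flags that this file's rewritten accessors set one at a time (bit 1: prefix data, bit 2: prologue data, bit 3: personality function; bit 14 marks GC). A sketch with illustrative names to make the mask readable:

// Illustrative constants only; the real code writes the 16-bit value
// subclass-data word directly through setValueSubclassDataBit().
enum : unsigned {
  HasPrefixDataBit    = 1u << 1, // maintained by setPrefixData()
  HasPrologueDataBit  = 1u << 2, // maintained by setPrologueData()
  HasPersonalityBit   = 1u << 3, // maintained by setPersonalityFn()
};
static_assert((HasPrefixDataBit | HasPrologueDataBit | HasPersonalityBit) ==
                  0xe,
              "dropAllReferences() clears exactly these bits with ~0xe");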
clearMetadata(); - - setPersonalityFn(nullptr); } void Function::addAttribute(unsigned i, Attribute::AttrKind attr) { @@ -368,73 +366,43 @@ void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { setAttributes(PAL); } -// Maintain the GC name for each function in an on-the-side table. This saves -// allocating an additional word in Function for programs which do not use GC -// (i.e., most programs) at the cost of increased overhead for clients which do -// use GC. -static DenseMap<const Function*,PooledStringPtr> *GCNames; -static StringPool *GCNamePool; -static ManagedStatic<sys::SmartRWMutex<true> > GCLock; - -bool Function::hasGC() const { - sys::SmartScopedReader<true> Reader(*GCLock); - return GCNames && GCNames->count(this); -} - -const char *Function::getGC() const { +const std::string &Function::getGC() const { assert(hasGC() && "Function has no collector"); - sys::SmartScopedReader<true> Reader(*GCLock); - return *(*GCNames)[this]; + return getContext().getGC(*this); } -void Function::setGC(const char *Str) { - sys::SmartScopedWriter<true> Writer(*GCLock); - if (!GCNamePool) - GCNamePool = new StringPool(); - if (!GCNames) - GCNames = new DenseMap<const Function*,PooledStringPtr>(); - (*GCNames)[this] = GCNamePool->intern(Str); +void Function::setGC(const std::string Str) { + setValueSubclassDataBit(14, !Str.empty()); + getContext().setGC(*this, std::move(Str)); } void Function::clearGC() { - sys::SmartScopedWriter<true> Writer(*GCLock); - if (GCNames) { - GCNames->erase(this); - if (GCNames->empty()) { - delete GCNames; - GCNames = nullptr; - if (GCNamePool->empty()) { - delete GCNamePool; - GCNamePool = nullptr; - } - } - } + if (!hasGC()) + return; + getContext().deleteGC(*this); + setValueSubclassDataBit(14, false); } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a Function) from the Function Src to this one. +/// Copy all additional attributes (those not needed to create a Function) from +/// the Function Src to this one. void Function::copyAttributesFrom(const GlobalValue *Src) { - assert(isa<Function>(Src) && "Expected a Function!"); GlobalObject::copyAttributesFrom(Src); - const Function *SrcF = cast<Function>(Src); + const Function *SrcF = dyn_cast<Function>(Src); + if (!SrcF) + return; + setCallingConv(SrcF->getCallingConv()); setAttributes(SrcF->getAttributes()); if (SrcF->hasGC()) setGC(SrcF->getGC()); else clearGC(); + if (SrcF->hasPersonalityFn()) + setPersonalityFn(SrcF->getPersonalityFn()); if (SrcF->hasPrefixData()) setPrefixData(SrcF->getPrefixData()); - else - setPrefixData(nullptr); if (SrcF->hasPrologueData()) setPrologueData(SrcF->getPrologueData()); - else - setPrologueData(nullptr); - if (SrcF->hasPersonalityFn()) - setPersonalityFn(SrcF->getPersonalityFn()); - else - setPersonalityFn(nullptr); } /// \brief This does the actual lookup of an intrinsic ID which @@ -492,7 +460,10 @@ static std::string getMangledTypeStr(Type* Ty) { Result += "vararg"; // Ensure nested function types are distinguishable. Result += "f"; - } else if (Ty) + } else if (isa<VectorType>(Ty)) + Result += "v" + utostr(Ty->getVectorNumElements()) + + getMangledTypeStr(Ty->getVectorElementType()); + else if (Ty) Result += EVT::getEVT(Ty).getEVTString(); return Result; } @@ -541,22 +512,25 @@ enum IIT_Info { // Values from 16+ are only encodable with the inefficient encoding. 
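The vector branch added to getMangledTypeStr() above is what yields suffixes such as "v4i32" in overloaded intrinsic names. A sketch of just that branch (mangleVector is an illustrative wrapper; utostr is from llvm/ADT/StringExtras.h):

#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// EltMangling stands in for the recursive getMangledTypeStr() call on the
// element type; scalars fall back to EVT::getEVT(Ty).getEVTString().
static std::string mangleVector(VectorType *VTy, std::string EltMangling) {
  return "v" + utostr(VTy->getNumElements()) + EltMangling;
}
// e.g. for <4 x i32>: "v" + "4" + "i32" == "v4i32"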
IIT_V64 = 16, IIT_MMX = 17, - IIT_METADATA = 18, - IIT_EMPTYSTRUCT = 19, - IIT_STRUCT2 = 20, - IIT_STRUCT3 = 21, - IIT_STRUCT4 = 22, - IIT_STRUCT5 = 23, - IIT_EXTEND_ARG = 24, - IIT_TRUNC_ARG = 25, - IIT_ANYPTR = 26, - IIT_V1 = 27, - IIT_VARARG = 28, - IIT_HALF_VEC_ARG = 29, - IIT_SAME_VEC_WIDTH_ARG = 30, - IIT_PTR_TO_ARG = 31, - IIT_VEC_OF_PTRS_TO_ELT = 32, - IIT_I128 = 33 + IIT_TOKEN = 18, + IIT_METADATA = 19, + IIT_EMPTYSTRUCT = 20, + IIT_STRUCT2 = 21, + IIT_STRUCT3 = 22, + IIT_STRUCT4 = 23, + IIT_STRUCT5 = 24, + IIT_EXTEND_ARG = 25, + IIT_TRUNC_ARG = 26, + IIT_ANYPTR = 27, + IIT_V1 = 28, + IIT_VARARG = 29, + IIT_HALF_VEC_ARG = 30, + IIT_SAME_VEC_WIDTH_ARG = 31, + IIT_PTR_TO_ARG = 32, + IIT_VEC_OF_PTRS_TO_ELT = 33, + IIT_I128 = 34, + IIT_V512 = 35, + IIT_V1024 = 36 }; @@ -576,6 +550,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, case IIT_MMX: OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0)); return; + case IIT_TOKEN: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Token, 0)); + return; case IIT_METADATA: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Metadata, 0)); return; @@ -634,6 +611,14 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 64)); DecodeIITType(NextElt, Infos, OutputTable); return; + case IIT_V512: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 512)); + DecodeIITType(NextElt, Infos, OutputTable); + return; + case IIT_V1024: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 1024)); + DecodeIITType(NextElt, Infos, OutputTable); + return; case IIT_PTR: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0)); DecodeIITType(NextElt, Infos, OutputTable); @@ -751,6 +736,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos, case IITDescriptor::Void: return Type::getVoidTy(Context); case IITDescriptor::VarArg: return Type::getVoidTy(Context); case IITDescriptor::MMX: return Type::getX86_MMXTy(Context); + case IITDescriptor::Token: return Type::getTokenTy(Context); case IITDescriptor::Metadata: return Type::getMetadataTy(Context); case IITDescriptor::Half: return Type::getHalfTy(Context); case IITDescriptor::Float: return Type::getFloatTy(Context); @@ -924,62 +910,68 @@ bool Function::callsFunctionThatReturnsTwice() const { return false; } +Constant *Function::getPersonalityFn() const { + assert(hasPersonalityFn() && getNumOperands()); + return cast<Constant>(Op<0>()); +} + +void Function::setPersonalityFn(Constant *Fn) { + setHungoffOperand<0>(Fn); + setValueSubclassDataBit(3, Fn != nullptr); +} + Constant *Function::getPrefixData() const { - assert(hasPrefixData()); - const LLVMContextImpl::PrefixDataMapTy &PDMap = - getContext().pImpl->PrefixDataMap; - assert(PDMap.find(this) != PDMap.end()); - return cast<Constant>(PDMap.find(this)->second->getReturnValue()); + assert(hasPrefixData() && getNumOperands()); + return cast<Constant>(Op<1>()); } void Function::setPrefixData(Constant *PrefixData) { - if (!PrefixData && !hasPrefixData()) - return; - - unsigned SCData = getSubclassDataFromValue(); - LLVMContextImpl::PrefixDataMapTy &PDMap = getContext().pImpl->PrefixDataMap; - ReturnInst *&PDHolder = PDMap[this]; - if (PrefixData) { - if (PDHolder) - PDHolder->setOperand(0, PrefixData); - else - PDHolder = ReturnInst::Create(getContext(), PrefixData); - SCData |= (1<<1); - } else { - delete PDHolder; - PDMap.erase(this); - SCData &= ~(1<<1); - } - 
setValueSubclassData(SCData); + setHungoffOperand<1>(PrefixData); + setValueSubclassDataBit(1, PrefixData != nullptr); } Constant *Function::getPrologueData() const { - assert(hasPrologueData()); - const LLVMContextImpl::PrologueDataMapTy &SOMap = - getContext().pImpl->PrologueDataMap; - assert(SOMap.find(this) != SOMap.end()); - return cast<Constant>(SOMap.find(this)->second->getReturnValue()); + assert(hasPrologueData() && getNumOperands()); + return cast<Constant>(Op<2>()); } void Function::setPrologueData(Constant *PrologueData) { - if (!PrologueData && !hasPrologueData()) - return; - - unsigned PDData = getSubclassDataFromValue(); - LLVMContextImpl::PrologueDataMapTy &PDMap = getContext().pImpl->PrologueDataMap; - ReturnInst *&PDHolder = PDMap[this]; - if (PrologueData) { - if (PDHolder) - PDHolder->setOperand(0, PrologueData); - else - PDHolder = ReturnInst::Create(getContext(), PrologueData); - PDData |= (1<<2); - } else { - delete PDHolder; - PDMap.erase(this); - PDData &= ~(1<<2); + setHungoffOperand<2>(PrologueData); + setValueSubclassDataBit(2, PrologueData != nullptr); +} + +void Function::allocHungoffUselist() { + // If we've already allocated a uselist, stop here. + if (getNumOperands()) + return; + + allocHungoffUses(3, /*IsPhi=*/ false); + setNumHungOffUseOperands(3); + + // Initialize the uselist with placeholder operands to allow traversal. + auto *CPN = ConstantPointerNull::get(Type::getInt1PtrTy(getContext(), 0)); + Op<0>().set(CPN); + Op<1>().set(CPN); + Op<2>().set(CPN); +} + +template <int Idx> +void Function::setHungoffOperand(Constant *C) { + if (C) { + allocHungoffUselist(); + Op<Idx>().set(C); + } else if (getNumOperands()) { + Op<Idx>().set( + ConstantPointerNull::get(Type::getInt1PtrTy(getContext(), 0))); } - setValueSubclassData(PDData); +} + +void Function::setValueSubclassDataBit(unsigned Bit, bool On) { + assert(Bit < 16 && "SubclassData contains only 16 bits"); + if (On) + setValueSubclassData(getSubclassDataFromValue() | (1 << Bit)); + else + setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit)); } void Function::setEntryCount(uint64_t Count) { @@ -997,22 +989,3 @@ Optional<uint64_t> Function::getEntryCount() const { } return None; } - -void Function::setPersonalityFn(Constant *C) { - if (!C) { - if (hasPersonalityFn()) { - // Note, the num operands is used to compute the offset of the operand, so - // the order here matters. Clearing the operand then clearing the num - // operands ensures we have the correct offset to the operand. - Op<0>().set(nullptr); - setFunctionNumOperands(0); - } - } else { - // Note, the num operands is used to compute the offset of the operand, so - // the order here matters. We need to set num operands to 1 first so that - // we get the correct offset to the first operand when we set it. - if (!hasPersonalityFn()) - setFunctionNumOperands(1); - Op<0>().set(C); - } -} diff --git a/contrib/llvm/lib/IR/FunctionInfo.cpp b/contrib/llvm/lib/IR/FunctionInfo.cpp new file mode 100644 index 0000000..17a67bc --- /dev/null +++ b/contrib/llvm/lib/IR/FunctionInfo.cpp @@ -0,0 +1,67 @@ +//===-- FunctionInfo.cpp - Function Info Index ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the function info index and summary classes for the +// IR library. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/FunctionInfo.h" +#include "llvm/ADT/StringMap.h" +using namespace llvm; + +// Create the combined function index/summary from multiple +// per-module instances. +void FunctionInfoIndex::mergeFrom(std::unique_ptr<FunctionInfoIndex> Other, + uint64_t NextModuleId) { + + StringRef ModPath; + for (auto &OtherFuncInfoLists : *Other) { + std::string FuncName = OtherFuncInfoLists.getKey(); + FunctionInfoList &List = OtherFuncInfoLists.second; + + // Assert that the func info list only has one entry, since we shouldn't + // have duplicate names within a single per-module index. + assert(List.size() == 1); + std::unique_ptr<FunctionInfo> Info = std::move(List.front()); + + // Skip if there was no function summary section. + if (!Info->functionSummary()) + continue; + + // Add the module path string ref for this module if we haven't already + // saved a reference to it. + if (ModPath.empty()) + ModPath = + addModulePath(Info->functionSummary()->modulePath(), NextModuleId); + else + assert(ModPath == Info->functionSummary()->modulePath() && + "Each module in the combined map should have a unique ID"); + + // Note the module path string ref was copied above and is still owned by + // the original per-module index. Reset it to the new module path + // string reference owned by the combined index. + Info->functionSummary()->setModulePath(ModPath); + + // If it is a local function, rename it. + if (Info->functionSummary()->isLocalFunction()) { + // Any local functions are virtually renamed when being added to the + // combined index map, to disambiguate from other functions with + // the same name. The symbol table created for the combined index + // file should contain the renamed symbols. + FuncName = + FunctionInfoIndex::getGlobalNameForLocal(FuncName, NextModuleId); + } + + // Add new function info to existing list. There may be duplicates when + // combining FunctionMap entries, due to COMDAT functions. Any local + // functions were virtually renamed above. 
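// For reference, this merge loop is driven roughly as follows when linking
// several per-module indexes (the container name and ID sequence are
// illustrative; the renaming itself stays behind getGlobalNameForLocal()):
//
//   FunctionInfoIndex Combined;
//   uint64_t NextModuleId = 0;
//   for (std::unique_ptr<FunctionInfoIndex> &Index : PerModuleIndexes)
//     Combined.mergeFrom(std::move(Index), NextModuleId++);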
+ addFunctionInfo(FuncName, std::move(Info)); + } +} diff --git a/contrib/llvm/lib/IR/GCOV.cpp b/contrib/llvm/lib/IR/GCOV.cpp index 6ed58913..35b8157 100644 --- a/contrib/llvm/lib/IR/GCOV.cpp +++ b/contrib/llvm/lib/IR/GCOV.cpp @@ -448,7 +448,7 @@ static uint32_t branchDiv(uint64_t Numerator, uint64_t Divisor) { namespace { struct formatBranchInfo { - formatBranchInfo(const GCOVOptions &Options, uint64_t Count, uint64_t Total) + formatBranchInfo(const GCOV::Options &Options, uint64_t Count, uint64_t Total) : Options(Options), Count(Count), Total(Total) {} void print(raw_ostream &OS) const { @@ -460,7 +460,7 @@ struct formatBranchInfo { OS << "taken " << branchDiv(Count, Total) << "%"; } - const GCOVOptions &Options; + const GCOV::Options &Options; uint64_t Count; uint64_t Total; }; diff --git a/contrib/llvm/lib/IR/Globals.cpp b/contrib/llvm/lib/IR/Globals.cpp index 1d02826..6159f93 100644 --- a/contrib/llvm/lib/IR/Globals.cpp +++ b/contrib/llvm/lib/IR/Globals.cpp @@ -32,15 +32,9 @@ bool GlobalValue::isMaterializable() const { return F->isMaterializable(); return false; } -bool GlobalValue::isDematerializable() const { - return getParent() && getParent()->isDematerializable(this); -} std::error_code GlobalValue::materialize() { return getParent()->materialize(this); } -void GlobalValue::dematerialize() { - getParent()->dematerialize(this); -} /// Override destroyConstantImpl to make sure it doesn't get called on /// GlobalValue's because they shouldn't be treated like other constants. @@ -97,10 +91,11 @@ void GlobalObject::setGlobalObjectSubClassData(unsigned Val) { } void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { - const auto *GV = cast<GlobalObject>(Src); - GlobalValue::copyAttributesFrom(GV); - setAlignment(GV->getAlignment()); - setSection(GV->getSection()); + GlobalValue::copyAttributesFrom(Src); + if (const auto *GV = dyn_cast<GlobalObject>(Src)) { + setAlignment(GV->getAlignment()); + setSection(GV->getSection()); + } } const char *GlobalValue::getSection() const { @@ -147,9 +142,9 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, + : GlobalObject(Ty, Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), - InitVal != nullptr, Link, Name), + InitVal != nullptr, Link, Name, AddressSpace), isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { setThreadLocalMode(TLMode); @@ -165,9 +160,9 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, const Twine &Name, GlobalVariable *Before, ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, + : GlobalObject(Ty, Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), - InitVal != nullptr, Link, Name), + InitVal != nullptr, Link, Name, AddressSpace), isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { setThreadLocalMode(TLMode); @@ -178,7 +173,7 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, } if (Before) - Before->getParent()->getGlobalList().insert(Before, this); + Before->getParent()->getGlobalList().insert(Before->getIterator(), this); else M.getGlobalList().push_back(this); } @@ -188,11 +183,11 @@ void GlobalVariable::setParent(Module *parent) { } void 
GlobalVariable::removeFromParent() { - getParent()->getGlobalList().remove(this); + getParent()->getGlobalList().remove(getIterator()); } void GlobalVariable::eraseFromParent() { - getParent()->getGlobalList().erase(this); + getParent()->getGlobalList().erase(getIterator()); } void GlobalVariable::setInitializer(Constant *InitVal) { @@ -216,14 +211,14 @@ void GlobalVariable::setInitializer(Constant *InitVal) { } } -/// copyAttributesFrom - copy all additional attributes (those not needed to -/// create a GlobalVariable) from the GlobalVariable Src to this one. +/// Copy all additional attributes (those not needed to create a GlobalVariable) +/// from the GlobalVariable Src to this one. void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { - assert(isa<GlobalVariable>(Src) && "Expected a GlobalVariable!"); GlobalObject::copyAttributesFrom(Src); - const GlobalVariable *SrcVar = cast<GlobalVariable>(Src); - setThreadLocalMode(SrcVar->getThreadLocalMode()); - setExternallyInitialized(SrcVar->isExternallyInitialized()); + if (const GlobalVariable *SrcVar = dyn_cast<GlobalVariable>(Src)) { + setThreadLocalMode(SrcVar->getThreadLocalMode()); + setExternallyInitialized(SrcVar->isExternallyInitialized()); + } } @@ -231,35 +226,40 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { // GlobalAlias Implementation //===----------------------------------------------------------------------===// -GlobalAlias::GlobalAlias(PointerType *Ty, LinkageTypes Link, const Twine &Name, - Constant *Aliasee, Module *ParentModule) - : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name) { +GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link, + const Twine &Name, Constant *Aliasee, + Module *ParentModule) + : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name, + AddressSpace) { Op<0>() = Aliasee; if (ParentModule) ParentModule->getAliasList().push_back(this); } -GlobalAlias *GlobalAlias::create(PointerType *Ty, LinkageTypes Link, - const Twine &Name, Constant *Aliasee, - Module *ParentModule) { - return new GlobalAlias(Ty, Link, Name, Aliasee, ParentModule); +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Link, const Twine &Name, + Constant *Aliasee, Module *ParentModule) { + return new GlobalAlias(Ty, AddressSpace, Link, Name, Aliasee, ParentModule); } -GlobalAlias *GlobalAlias::create(PointerType *Ty, LinkageTypes Linkage, - const Twine &Name, Module *Parent) { - return create(Ty, Linkage, Name, nullptr, Parent); +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + Module *Parent) { + return create(Ty, AddressSpace, Linkage, Name, nullptr, Parent); } -GlobalAlias *GlobalAlias::create(PointerType *Ty, LinkageTypes Linkage, - const Twine &Name, GlobalValue *Aliasee) { - return create(Ty, Linkage, Name, Aliasee, Aliasee->getParent()); +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + GlobalValue *Aliasee) { + return create(Ty, AddressSpace, Linkage, Name, Aliasee, Aliasee->getParent()); } GlobalAlias *GlobalAlias::create(LinkageTypes Link, const Twine &Name, GlobalValue *Aliasee) { PointerType *PTy = Aliasee->getType(); - return create(PTy, Link, Name, Aliasee); + return create(PTy->getElementType(), PTy->getAddressSpace(), Link, Name, + Aliasee); } GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalValue *Aliasee) { @@ -271,11 +271,11 @@ void GlobalAlias::setParent(Module *parent) { } 
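With the signature change above, creating an alias now passes the value type and address space instead of a pointer type; a minimal sketch (the alias name is illustrative):

#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Alias a function under a second name; the old two-argument convenience
// overload now derives exactly these pieces from Aliasee's pointer type.
static GlobalAlias *makeAlias(Function &F) {
  return GlobalAlias::create(F.getFunctionType(), /*AddressSpace=*/0,
                             GlobalValue::ExternalLinkage, "my_alias", &F,
                             F.getParent());
}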
void GlobalAlias::removeFromParent() { - getParent()->getAliasList().remove(this); + getParent()->getAliasList().remove(getIterator()); } void GlobalAlias::eraseFromParent() { - getParent()->getAliasList().erase(this); + getParent()->getAliasList().erase(getIterator()); } void GlobalAlias::setAliasee(Constant *Aliasee) { diff --git a/contrib/llvm/lib/IR/IRBuilder.cpp b/contrib/llvm/lib/IR/IRBuilder.cpp index bddb278..4474129 100644 --- a/contrib/llvm/lib/IR/IRBuilder.cpp +++ b/contrib/llvm/lib/IR/IRBuilder.cpp @@ -247,18 +247,21 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, return createCallHelper(TheFn, Ops, this, Name); } +template <typename T0, typename T1, typename T2, typename T3> static std::vector<Value *> getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes, - Value *ActualCallee, ArrayRef<Value *> CallArgs, - ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs) { + Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs, + ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs, + ArrayRef<T3> GCArgs) { std::vector<Value *> Args; Args.push_back(B.getInt64(ID)); Args.push_back(B.getInt32(NumPatchBytes)); Args.push_back(ActualCallee); Args.push_back(B.getInt32(CallArgs.size())); - Args.push_back(B.getInt32((unsigned)StatepointFlags::None)); + Args.push_back(B.getInt32(Flags)); Args.insert(Args.end(), CallArgs.begin(), CallArgs.end()); - Args.push_back(B.getInt32(0 /* no transition args */)); + Args.push_back(B.getInt32(TransitionArgs.size())); + Args.insert(Args.end(), TransitionArgs.begin(), TransitionArgs.end()); Args.push_back(B.getInt32(DeoptArgs.size())); Args.insert(Args.end(), DeoptArgs.begin(), DeoptArgs.end()); Args.insert(Args.end(), GCArgs.begin(), GCArgs.end()); @@ -266,69 +269,109 @@ getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes, return Args; } -CallInst *IRBuilderBase::CreateGCStatepointCall( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, - ArrayRef<Value *> CallArgs, ArrayRef<Value *> DeoptArgs, - ArrayRef<Value *> GCArgs, const Twine &Name) { +template <typename T0, typename T1, typename T2, typename T3> +static CallInst *CreateGCStatepointCallCommon( + IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes, + Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs, + ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs, + const Twine &Name) { // Extract out the type of the callee. 
PointerType *FuncPtrType = cast<PointerType>(ActualCallee->getType()); assert(isa<FunctionType>(FuncPtrType->getElementType()) && "actual callee must be a callable value"); - Module *M = BB->getParent()->getParent(); + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) Type *ArgTypes[] = { FuncPtrType }; Function *FnStatepoint = Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, ArgTypes); - std::vector<llvm::Value *> Args = getStatepointArgs( - *this, ID, NumPatchBytes, ActualCallee, CallArgs, DeoptArgs, GCArgs); - return createCallHelper(FnStatepoint, Args, this, Name); + std::vector<llvm::Value *> Args = + getStatepointArgs(*Builder, ID, NumPatchBytes, ActualCallee, Flags, + CallArgs, TransitionArgs, DeoptArgs, GCArgs); + return createCallHelper(FnStatepoint, Args, Builder, Name); } CallInst *IRBuilderBase::CreateGCStatepointCall( uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, - ArrayRef<Use> CallArgs, ArrayRef<Value *> DeoptArgs, + ArrayRef<Value *> CallArgs, ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { - std::vector<Value *> VCallArgs; - for (auto &U : CallArgs) - VCallArgs.push_back(U.get()); - return CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, VCallArgs, - DeoptArgs, GCArgs, Name); + return CreateGCStatepointCallCommon<Value *, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None), + CallArgs, None /* No Transition Args */, DeoptArgs, GCArgs, Name); } -InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( - uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, - BasicBlock *NormalDest, BasicBlock *UnwindDest, - ArrayRef<Value *> InvokeArgs, ArrayRef<Value *> DeoptArgs, +CallInst *IRBuilderBase::CreateGCStatepointCall( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, uint32_t Flags, + ArrayRef<Use> CallArgs, ArrayRef<Use> TransitionArgs, + ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointCallCommon<Use, Use, Use, Value *>( + this, ID, NumPatchBytes, ActualCallee, Flags, CallArgs, TransitionArgs, + DeoptArgs, GCArgs, Name); +} + +CallInst *IRBuilderBase::CreateGCStatepointCall( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, + ArrayRef<Use> CallArgs, ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointCallCommon<Use, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None), + CallArgs, None, DeoptArgs, GCArgs, Name); +} + +template <typename T0, typename T1, typename T2, typename T3> +static InvokeInst *CreateGCStatepointInvokeCommon( + IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes, + Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, + uint32_t Flags, ArrayRef<T0> InvokeArgs, ArrayRef<T1> TransitionArgs, + ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs, const Twine &Name) { // Extract out the type of the callee. 
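// For reference, both Common helpers funnel into getStatepointArgs(), whose
// operand layout (reconstructed from the code earlier in this hunk) is:
//   i64 ID
//   i32 NumPatchBytes
//       ActualCallee / ActualInvokee
//   i32 NumCallArgs
//   i32 Flags              // StatepointFlags::None for the old overloads
//   ... CallArgs
//   i32 NumTransitionArgs  // previously hard-coded to 0
//   ... TransitionArgs
//   i32 NumDeoptArgs
//   ... DeoptArgs
//   ... GCArgs             // appended with no leading count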
PointerType *FuncPtrType = cast<PointerType>(ActualInvokee->getType()); assert(isa<FunctionType>(FuncPtrType->getElementType()) && "actual callee must be a callable value"); - Module *M = BB->getParent()->getParent(); + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) Function *FnStatepoint = Intrinsic::getDeclaration( M, Intrinsic::experimental_gc_statepoint, {FuncPtrType}); - std::vector<llvm::Value *> Args = getStatepointArgs( - *this, ID, NumPatchBytes, ActualInvokee, InvokeArgs, DeoptArgs, GCArgs); - return createInvokeHelper(FnStatepoint, NormalDest, UnwindDest, Args, this, + std::vector<llvm::Value *> Args = + getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee, Flags, + InvokeArgs, TransitionArgs, DeoptArgs, GCArgs); + return createInvokeHelper(FnStatepoint, NormalDest, UnwindDest, Args, Builder, Name); } InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + BasicBlock *NormalDest, BasicBlock *UnwindDest, + ArrayRef<Value *> InvokeArgs, ArrayRef<Value *> DeoptArgs, + ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointInvokeCommon<Value *, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, + uint32_t(StatepointFlags::None), InvokeArgs, None /* No Transition Args*/, + DeoptArgs, GCArgs, Name); +} + +InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, + BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags, + ArrayRef<Use> InvokeArgs, ArrayRef<Use> TransitionArgs, + ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { + return CreateGCStatepointInvokeCommon<Use, Use, Use, Value *>( + this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, Flags, + InvokeArgs, TransitionArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst *IRBuilderBase::CreateGCStatepointInvoke( + uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef<Use> InvokeArgs, ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) { - std::vector<Value *> VCallArgs; - for (auto &U : InvokeArgs) - VCallArgs.push_back(U.get()); - return CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, - UnwindDest, VCallArgs, DeoptArgs, GCArgs, - Name); + return CreateGCStatepointInvokeCommon<Use, Value *, Value *, Value *>( + this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, + uint32_t(StatepointFlags::None), InvokeArgs, None, DeoptArgs, GCArgs, + Name); } CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint, diff --git a/contrib/llvm/lib/IR/IRPrintingPasses.cpp b/contrib/llvm/lib/IR/IRPrintingPasses.cpp index c1ac336..822dbeb 100644 --- a/contrib/llvm/lib/IR/IRPrintingPasses.cpp +++ b/contrib/llvm/lib/IR/IRPrintingPasses.cpp @@ -28,7 +28,13 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, PreservedAnalyses PrintModulePass::run(Module &M) { OS << Banner; - M.print(OS, nullptr, ShouldPreserveUseListOrder); + if (llvm::isFunctionInPrintList("*")) + M.print(OS, nullptr, ShouldPreserveUseListOrder); + else { + for(const auto &F : M.functions()) + if (llvm::isFunctionInPrintList(F.getName())) + F.print(OS); + } return PreservedAnalyses::all(); } @@ -37,7 +43,8 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) : OS(OS), Banner(Banner) {} 
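A sketch of using the filtered printer above in a new-pass-manager pipeline (the banner string is illustrative; "*" in the print-function list means the whole module is printed):

#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void addIRDump(ModulePassManager &MPM) {
  // Only functions named in the print list are emitted, per run() above.
  MPM.addPass(PrintModulePass(errs(), "; IR dump\n",
                              /*ShouldPreserveUseListOrder=*/false));
}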
PreservedAnalyses PrintFunctionPass::run(Function &F) { - OS << Banner << static_cast<Value &>(F); + if (isFunctionInPrintList(F.getName())) + OS << Banner << static_cast<Value &>(F); return PreservedAnalyses::all(); } diff --git a/contrib/llvm/lib/IR/InlineAsm.cpp b/contrib/llvm/lib/IR/InlineAsm.cpp index aa9e027..15d3b83 100644 --- a/contrib/llvm/lib/IR/InlineAsm.cpp +++ b/contrib/llvm/lib/IR/InlineAsm.cpp @@ -24,23 +24,22 @@ using namespace llvm; InlineAsm::~InlineAsm() { } - -InlineAsm *InlineAsm::get(FunctionType *Ty, StringRef AsmString, +InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString, StringRef Constraints, bool hasSideEffects, bool isAlignStack, AsmDialect asmDialect) { - InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack, - asmDialect); - LLVMContextImpl *pImpl = Ty->getContext().pImpl; - return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(Ty), Key); + InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects, + isAlignStack, asmDialect); + LLVMContextImpl *pImpl = FTy->getContext().pImpl; + return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key); } -InlineAsm::InlineAsm(PointerType *Ty, const std::string &asmString, +InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString, const std::string &constraints, bool hasSideEffects, bool isAlignStack, AsmDialect asmDialect) - : Value(Ty, Value::InlineAsmVal), - AsmString(asmString), Constraints(constraints), - HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack), - Dialect(asmDialect) { + : Value(PointerType::getUnqual(FTy), Value::InlineAsmVal), + AsmString(asmString), Constraints(constraints), FTy(FTy), + HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack), + Dialect(asmDialect) { // Do various checks on the constraint string and type. assert(Verify(getFunctionType(), constraints) && @@ -53,7 +52,7 @@ void InlineAsm::destroyConstant() { } FunctionType *InlineAsm::getFunctionType() const { - return cast<FunctionType>(getType()->getElementType()); + return FTy; } ///Default constructor. @@ -160,6 +159,9 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str, // If Operand N already has a matching input, reject this. An output // can't be constrained to the same value as multiple inputs. 
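Callers are unaffected by the FTy plumbing in the InlineAsm hunk above; for reference, a minimal use (asm and constraint strings are illustrative):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
using namespace llvm;

static InlineAsm *makeNop(LLVMContext &Ctx) {
  FunctionType *FTy =
      FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  // getFunctionType() on the result now returns FTy directly rather than
  // digging it out of the value's pointer type.
  return InlineAsm::get(FTy, "nop", "", /*hasSideEffects=*/true);
}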
if (isMultipleAlternative) { + if (multipleAlternativeIndex >= + ConstraintsSoFar[N].multipleAlternatives.size()) + return true; InlineAsm::SubConstraintInfo &scInfo = ConstraintsSoFar[N].multipleAlternatives[multipleAlternativeIndex]; if (scInfo.MatchingInput != -1) @@ -291,4 +293,3 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { if (Ty->getNumParams() != NumInputs) return false; return true; } - diff --git a/contrib/llvm/lib/IR/Instruction.cpp b/contrib/llvm/lib/IR/Instruction.cpp index c57ba16..4b33d2e 100644 --- a/contrib/llvm/lib/IR/Instruction.cpp +++ b/contrib/llvm/lib/IR/Instruction.cpp @@ -28,7 +28,7 @@ Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps, if (InsertBefore) { BasicBlock *BB = InsertBefore->getParent(); assert(BB && "Instruction to insert before is not in a basic block!"); - BB->getInstList().insert(InsertBefore, this); + BB->getInstList().insert(InsertBefore->getIterator(), this); } } @@ -62,33 +62,38 @@ Module *Instruction::getModule() { return getParent()->getModule(); } +Function *Instruction::getFunction() { return getParent()->getParent(); } + +const Function *Instruction::getFunction() const { + return getParent()->getParent(); +} void Instruction::removeFromParent() { - getParent()->getInstList().remove(this); + getParent()->getInstList().remove(getIterator()); } iplist<Instruction>::iterator Instruction::eraseFromParent() { - return getParent()->getInstList().erase(this); + return getParent()->getInstList().erase(getIterator()); } -/// insertBefore - Insert an unlinked instructions into a basic block -/// immediately before the specified instruction. +/// Insert an unlinked instruction into a basic block immediately before the +/// specified instruction. void Instruction::insertBefore(Instruction *InsertPos) { - InsertPos->getParent()->getInstList().insert(InsertPos, this); + InsertPos->getParent()->getInstList().insert(InsertPos->getIterator(), this); } -/// insertAfter - Insert an unlinked instructions into a basic block -/// immediately after the specified instruction. +/// Insert an unlinked instruction into a basic block immediately after the +/// specified instruction. void Instruction::insertAfter(Instruction *InsertPos) { - InsertPos->getParent()->getInstList().insertAfter(InsertPos, this); + InsertPos->getParent()->getInstList().insertAfter(InsertPos->getIterator(), + this); } -/// moveBefore - Unlink this instruction from its current basic block and -/// insert it into the basic block that MovePos lives in, right before -/// MovePos. +/// Unlink this instruction from its current basic block and insert it into the +/// basic block that MovePos lives in, right before MovePos. void Instruction::moveBefore(Instruction *MovePos) { - MovePos->getParent()->getInstList().splice(MovePos,getParent()->getInstList(), - this); + MovePos->getParent()->getInstList().splice( + MovePos->getIterator(), getParent()->getInstList(), getIterator()); } /// Set or clear the unsafe-algebra flag on this instruction, which must be an @@ -196,6 +201,10 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case Invoke: return "invoke"; case Resume: return "resume"; case Unreachable: return "unreachable"; + case CleanupRet: return "cleanupret"; + case CatchRet: return "catchret"; + case CatchPad: return "catchpad"; + case CatchSwitch: return "catchswitch"; // Standard binary operators... 
case Add: return "add"; @@ -256,6 +265,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case ExtractValue: return "extractvalue"; case InsertValue: return "insertvalue"; case LandingPad: return "landingpad"; + case CleanupPad: return "cleanuppad"; default: return "<Invalid operator> "; } @@ -285,11 +295,12 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2, if (const CallInst *CI = dyn_cast<CallInst>(I1)) return CI->isTailCall() == cast<CallInst>(I2)->isTailCall() && CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() && - CI->getAttributes() == cast<CallInst>(I2)->getAttributes(); + CI->getAttributes() == cast<CallInst>(I2)->getAttributes() && + CI->hasIdenticalOperandBundleSchema(*cast<CallInst>(I2)); if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1)) return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() && - CI->getAttributes() == - cast<InvokeInst>(I2)->getAttributes(); + CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes() && + CI->hasIdenticalOperandBundleSchema(*cast<InvokeInst>(I2)); if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) return IVI->getIndices() == cast<InsertValueInst>(I2)->getIndices(); if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) @@ -407,6 +418,8 @@ bool Instruction::mayReadFromMemory() const { case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: + case Instruction::CatchPad: + case Instruction::CatchRet: return true; case Instruction::Call: return !cast<CallInst>(this)->doesNotAccessMemory(); @@ -427,6 +440,8 @@ bool Instruction::mayWriteToMemory() const { case Instruction::VAArg: case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: + case Instruction::CatchPad: + case Instruction::CatchRet: return true; case Instruction::Call: return !cast<CallInst>(this)->onlyReadsMemory(); @@ -455,6 +470,10 @@ bool Instruction::isAtomic() const { bool Instruction::mayThrow() const { if (const CallInst *CI = dyn_cast<CallInst>(this)) return !CI->doesNotThrow(); + if (const auto *CRI = dyn_cast<CleanupReturnInst>(this)) + return CRI->unwindsToCaller(); + if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(this)) + return CatchSwitch->unwindsToCaller(); return isa<ResumeInst>(this); } diff --git a/contrib/llvm/lib/IR/Instructions.cpp b/contrib/llvm/lib/IR/Instructions.cpp index 86c921a..7c64ca7 100644 --- a/contrib/llvm/lib/IR/Instructions.cpp +++ b/contrib/llvm/lib/IR/Instructions.cpp @@ -62,7 +62,10 @@ UnaryInstruction::~UnaryInstruction() { const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { if (Op1->getType() != Op2->getType()) return "both values to select must have same type"; - + + if (Op1->getType()->isTokenTy()) + return "select values cannot have token type"; + if (VectorType *VT = dyn_cast<VectorType>(Op0->getType())) { // Vector select. 
if (VT->getElementType() != Type::getInt1Ty(Op0->getContext())) @@ -84,6 +87,8 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { // PHINode Class //===----------------------------------------------------------------------===// +void PHINode::anchor() {} + PHINode::PHINode(const PHINode &PN) : Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()), ReservedSpace(PN.getNumOperands()) { @@ -223,9 +228,10 @@ CallInst::~CallInst() { } void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args, - const Twine &NameStr) { + ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) { this->FTy = FTy; - assert(getNumOperands() == Args.size() + 1 && "NumOperands not set up?"); + assert(getNumOperands() == Args.size() + CountBundleInputs(Bundles) + 1 && + "NumOperands not set up?"); Op<-1>() = Func; #ifndef NDEBUG @@ -240,6 +246,11 @@ void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args, #endif std::copy(Args.begin(), Args.end(), op_begin()); + + auto It = populateBundleOperandInfos(Bundles, Args.size()); + (void)It; + assert(It + 1 == op_end() && "Should add up!"); + setName(NameStr); } @@ -281,11 +292,26 @@ CallInst::CallInst(const CallInst &CI) AttributeList(CI.AttributeList), FTy(CI.FTy) { setTailCallKind(CI.getTailCallKind()); setCallingConv(CI.getCallingConv()); - + std::copy(CI.op_begin(), CI.op_end(), op_begin()); + std::copy(CI.bundle_op_info_begin(), CI.bundle_op_info_end(), + bundle_op_info_begin()); SubclassOptionalData = CI.SubclassOptionalData; } +CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB, + Instruction *InsertPt) { + std::vector<Value *> Args(CI->arg_begin(), CI->arg_end()); + + auto *NewCI = CallInst::Create(CI->getCalledValue(), Args, OpB, CI->getName(), + InsertPt); + NewCI->setTailCallKind(CI->getTailCallKind()); + NewCI->setCallingConv(CI->getCallingConv()); + NewCI->SubclassOptionalData = CI->SubclassOptionalData; + NewCI->setAttributes(CI->getAttributes()); + return NewCI; +} + void CallInst::addAttribute(unsigned i, Attribute::AttrKind attr) { AttributeSet PAL = getAttributes(); PAL = PAL.addAttribute(getContext(), i, attr); @@ -320,6 +346,8 @@ void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { } bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { + assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!"); + if (AttributeList.hasAttribute(i, A)) return true; if (const Function *F = getCalledFunction()) @@ -327,6 +355,25 @@ bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { return false; } +bool CallInst::dataOperandHasImpliedAttr(unsigned i, + Attribute::AttrKind A) const { + + // There are getNumOperands() - 1 data operands. The last operand is the + // callee. + assert(i < getNumOperands() && "Data operand index out of bounds!"); + + // The attribute A can either be directly specified, if the operand in + // question is a call argument; or be indirectly implied by the kind of its + // containing operand bundle, if the operand is a bundle operand. 
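// For reference, the data-operand layout this indexing assumes, with i
// being the 1-based attribute index the caller passes in:
//   operands: [ arg_1 ... arg_N | bundle operands ... | callee ]
//   i <= N  -> plain call argument: defer to paramHasAttr(i).
//   i >  N  -> bundle operand: defer to bundleOperandHasAttr(i - 1).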
+ + if (i < (getNumArgOperands() + 1)) + return paramHasAttr(i, A); + + assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && + "Must be either a call argument or an operand bundle!"); + return bundleOperandHasAttr(i - 1, A); +} + /// IsConstantOne - Return true only if val is constant int 1 static bool IsConstantOne(Value *val) { assert(val && "IsConstantOne does not work with nullptr val"); @@ -496,10 +543,12 @@ Instruction* CallInst::CreateFree(Value* Source, BasicBlock *InsertAtEnd) { void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef<Value *> Args, + ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) { this->FTy = FTy; - assert(getNumOperands() == 3 + Args.size() && "NumOperands not set up?"); + assert(getNumOperands() == 3 + Args.size() + CountBundleInputs(Bundles) && + "NumOperands not set up?"); Op<-3>() = Fn; Op<-2>() = IfNormal; Op<-1>() = IfException; @@ -516,6 +565,11 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, #endif std::copy(Args.begin(), Args.end(), op_begin()); + + auto It = populateBundleOperandInfos(Bundles, Args.size()); + (void)It; + assert(It + 3 == op_end() && "Should add up!"); + setName(NameStr); } @@ -527,9 +581,24 @@ InvokeInst::InvokeInst(const InvokeInst &II) AttributeList(II.AttributeList), FTy(II.FTy) { setCallingConv(II.getCallingConv()); std::copy(II.op_begin(), II.op_end(), op_begin()); + std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(), + bundle_op_info_begin()); SubclassOptionalData = II.SubclassOptionalData; } +InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB, + Instruction *InsertPt) { + std::vector<Value *> Args(II->arg_begin(), II->arg_end()); + + auto *NewII = InvokeInst::Create(II->getCalledValue(), II->getNormalDest(), + II->getUnwindDest(), Args, OpB, + II->getName(), InsertPt); + NewII->setCallingConv(II->getCallingConv()); + NewII->SubclassOptionalData = II->SubclassOptionalData; + NewII->setAttributes(II->getAttributes()); + return NewII; +} + BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } @@ -540,15 +609,9 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) { return setSuccessor(idx, B); } -bool InvokeInst::hasFnAttrImpl(Attribute::AttrKind A) const { - if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A)) - return true; - if (const Function *F = getCalledFunction()) - return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A); - return false; -} - bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { + assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!"); + if (AttributeList.hasAttribute(i, A)) return true; if (const Function *F = getCalledFunction()) @@ -556,6 +619,24 @@ bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { return false; } +bool InvokeInst::dataOperandHasImpliedAttr(unsigned i, + Attribute::AttrKind A) const { + // There are getNumOperands() - 3 data operands. The last three operands are + // the callee and the two successor basic blocks. + assert(i < (getNumOperands() - 2) && "Data operand index out of bounds!"); + + // The attribute A can either be directly specified, if the operand in + // question is an invoke argument; or be indirectly implied by the kind of its + // containing operand bundle, if the operand is a bundle operand. 
+ + if (i < (getNumArgOperands() + 1)) + return paramHasAttr(i, A); + + assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) && + "Must be either an invoke argument or an operand bundle!"); + return bundleOperandHasAttr(i - 1, A); +} + void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind attr) { AttributeSet PAL = getAttributes(); PAL = PAL.addAttribute(getContext(), i, attr); @@ -671,6 +752,234 @@ BasicBlock *ResumeInst::getSuccessorV(unsigned idx) const { } //===----------------------------------------------------------------------===// +// CleanupReturnInst Implementation +//===----------------------------------------------------------------------===// + +CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI) + : TerminatorInst(CRI.getType(), Instruction::CleanupRet, + OperandTraits<CleanupReturnInst>::op_end(this) - + CRI.getNumOperands(), + CRI.getNumOperands()) { + setInstructionSubclassData(CRI.getSubclassDataFromInstruction()); + Op<0>() = CRI.Op<0>(); + if (CRI.hasUnwindDest()) + Op<1>() = CRI.Op<1>(); +} + +void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) { + if (UnwindBB) + setInstructionSubclassData(getSubclassDataFromInstruction() | 1); + + Op<0>() = CleanupPad; + if (UnwindBB) + Op<1>() = UnwindBB; +} + +CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, + unsigned Values, Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()), + Instruction::CleanupRet, + OperandTraits<CleanupReturnInst>::op_end(this) - Values, + Values, InsertBefore) { + init(CleanupPad, UnwindBB); +} + +CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, + unsigned Values, BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()), + Instruction::CleanupRet, + OperandTraits<CleanupReturnInst>::op_end(this) - Values, + Values, InsertAtEnd) { + init(CleanupPad, UnwindBB); +} + +BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const { + assert(Idx == 0); + return getUnwindDest(); +} +unsigned CleanupReturnInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { + assert(Idx == 0); + setUnwindDest(B); +} + +//===----------------------------------------------------------------------===// +// CatchReturnInst Implementation +//===----------------------------------------------------------------------===// +void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) { + Op<0>() = CatchPad; + Op<1>() = BB; +} + +CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI) + : TerminatorInst(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet, + OperandTraits<CatchReturnInst>::op_begin(this), 2) { + Op<0>() = CRI.Op<0>(); + Op<1>() = CRI.Op<1>(); +} + +CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet, + OperandTraits<CatchReturnInst>::op_begin(this), 2, + InsertBefore) { + init(CatchPad, BB); +} + +CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet, + OperandTraits<CatchReturnInst>::op_begin(this), 2, + InsertAtEnd) { + init(CatchPad, BB); +} + +BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const { + assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); + return getSuccessor(); +} 
+unsigned CatchReturnInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { + assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); + setSuccessor(B); +} + +//===----------------------------------------------------------------------===// +// CatchSwitchInst Implementation +//===----------------------------------------------------------------------===// + +CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumReservedValues, + const Twine &NameStr, + Instruction *InsertBefore) + : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0, + InsertBefore) { + if (UnwindDest) + ++NumReservedValues; + init(ParentPad, UnwindDest, NumReservedValues + 1); + setName(NameStr); +} + +CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumReservedValues, + const Twine &NameStr, BasicBlock *InsertAtEnd) + : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0, + InsertAtEnd) { + if (UnwindDest) + ++NumReservedValues; + init(ParentPad, UnwindDest, NumReservedValues + 1); + setName(NameStr); +} + +CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI) + : TerminatorInst(CSI.getType(), Instruction::CatchSwitch, nullptr, + CSI.getNumOperands()) { + init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands()); + setNumHungOffUseOperands(ReservedSpace); + Use *OL = getOperandList(); + const Use *InOL = CSI.getOperandList(); + for (unsigned I = 1, E = ReservedSpace; I != E; ++I) + OL[I] = InOL[I]; +} + +void CatchSwitchInst::init(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumReservedValues) { + assert(ParentPad && NumReservedValues); + + ReservedSpace = NumReservedValues; + setNumHungOffUseOperands(UnwindDest ? 2 : 1); + allocHungoffUses(ReservedSpace); + + Op<0>() = ParentPad; + if (UnwindDest) { + setInstructionSubclassData(getSubclassDataFromInstruction() | 1); + setUnwindDest(UnwindDest); + } +} + +/// growOperands - grow operands - This grows the operand list in response to a +/// push_back style of operation. This grows the number of ops by 2 times. +void CatchSwitchInst::growOperands(unsigned Size) { + unsigned NumOperands = getNumOperands(); + assert(NumOperands >= 1); + if (ReservedSpace >= NumOperands + Size) + return; + ReservedSpace = (NumOperands + Size / 2) * 2; + growHungoffUses(ReservedSpace); +} + +void CatchSwitchInst::addHandler(BasicBlock *Handler) { + unsigned OpNo = getNumOperands(); + growOperands(1); + assert(OpNo < ReservedSpace && "Growing didn't work!"); + setNumHungOffUseOperands(getNumOperands() + 1); + getOperandList()[OpNo] = Handler; +} + +void CatchSwitchInst::removeHandler(handler_iterator HI) { + // Move all subsequent handlers up one. + Use *EndDst = op_end() - 1; + for (Use *CurDst = HI.getCurrent(); CurDst != EndDst; ++CurDst) + *CurDst = *(CurDst + 1); + // Null out the last handler use. 
+ *EndDst = nullptr; + + setNumHungOffUseOperands(getNumOperands() - 1); +} + +BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned CatchSwitchInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// FuncletPadInst Implementation +//===----------------------------------------------------------------------===// +void FuncletPadInst::init(Value *ParentPad, ArrayRef<Value *> Args, + const Twine &NameStr) { + assert(getNumOperands() == 1 + Args.size() && "NumOperands not set up?"); + std::copy(Args.begin(), Args.end(), op_begin()); + setParentPad(ParentPad); + setName(NameStr); +} + +FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI) + : Instruction(FPI.getType(), FPI.getOpcode(), + OperandTraits<FuncletPadInst>::op_end(this) - + FPI.getNumOperands(), + FPI.getNumOperands()) { + std::copy(FPI.op_begin(), FPI.op_end(), op_begin()); + setParentPad(FPI.getParentPad()); +} + +FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef<Value *> Args, unsigned Values, + const Twine &NameStr, Instruction *InsertBefore) + : Instruction(ParentPad->getType(), Op, + OperandTraits<FuncletPadInst>::op_end(this) - Values, Values, + InsertBefore) { + init(ParentPad, Args, NameStr); +} + +FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef<Value *> Args, unsigned Values, + const Twine &NameStr, BasicBlock *InsertAtEnd) + : Instruction(ParentPad->getType(), Op, + OperandTraits<FuncletPadInst>::op_end(this) - Values, Values, + InsertAtEnd) { + init(ParentPad, Args, NameStr); +} + +//===----------------------------------------------------------------------===// // UnreachableInst Implementation //===----------------------------------------------------------------------===// @@ -1193,6 +1502,8 @@ FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, // GetElementPtrInst Implementation //===----------------------------------------------------------------------===// +void GetElementPtrInst::anchor() {} + void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name) { assert(getNumOperands() == 1 + IdxList.size() && @@ -2029,7 +2340,7 @@ bool CastInst::isNoopCast(const DataLayout &DL) const { /// * %S = secondOpcode MidTy %F to DstTy /// The function returns a resultOpcode so these two casts can be replaced with: /// * %Replacement = resultOpcode %SrcTy %x to DstTy -/// If no such cast is permited, the function returns 0. +/// If no such cast is permitted, the function returns 0. unsigned CastInst::isEliminableCastPair( Instruction::CastOps firstOp, Instruction::CastOps secondOp, Type *SrcTy, Type *MidTy, Type *DstTy, Type *SrcIntPtrTy, Type *MidIntPtrTy, @@ -2037,7 +2348,7 @@ unsigned CastInst::isEliminableCastPair( // Define the 144 possibilities for these two cast instructions. The values // in this matrix determine what to do in a given situation and select the // case in the switch below. The rows correspond to firstOp, the columns - // correspond to secondOp. In looking at the table below, keep in mind + // correspond to secondOp. 
In looking at the table below, keep in mind // the following cast properties: // // Size Compare Source Destination @@ -2087,17 +2398,19 @@ unsigned CastInst::isEliminableCastPair( { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; + // TODO: This logic could be encoded into the table above and handled in the + // switch below. // If either of the casts are a bitcast from scalar to vector, disallow the - // merging. However, bitcast of A->B->A are allowed. - bool isFirstBitcast = (firstOp == Instruction::BitCast); - bool isSecondBitcast = (secondOp == Instruction::BitCast); - bool chainedBitcast = (SrcTy == DstTy && isFirstBitcast && isSecondBitcast); - - // Check if any of the bitcasts convert scalars<->vectors. - if ((isFirstBitcast && isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) || - (isSecondBitcast && isa<VectorType>(MidTy) != isa<VectorType>(DstTy))) - // Unless we are bitcasing to the original type, disallow optimizations. - if (!chainedBitcast) return 0; + // merging. However, any pair of bitcasts are allowed. + bool IsFirstBitcast = (firstOp == Instruction::BitCast); + bool IsSecondBitcast = (secondOp == Instruction::BitCast); + bool AreBothBitcasts = IsFirstBitcast && IsSecondBitcast; + + // Check if any of the casts convert scalars <-> vectors. + if ((IsFirstBitcast && isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) || + (IsSecondBitcast && isa<VectorType>(MidTy) != isa<VectorType>(DstTy))) + if (!AreBothBitcasts) + return 0; int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin] [secondOp-Instruction::CastOpsBegin]; @@ -2966,9 +3279,8 @@ AddrSpaceCastInst::AddrSpaceCastInst( void CmpInst::anchor() {} -CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, - Value *LHS, Value *RHS, const Twine &Name, - Instruction *InsertBefore) +CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS, + Value *RHS, const Twine &Name, Instruction *InsertBefore) : Instruction(ty, op, OperandTraits<CmpInst>::op_begin(this), OperandTraits<CmpInst>::operands(this), @@ -2979,9 +3291,8 @@ CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, setName(Name); } -CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, - Value *LHS, Value *RHS, const Twine &Name, - BasicBlock *InsertAtEnd) +CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS, + Value *RHS, const Twine &Name, BasicBlock *InsertAtEnd) : Instruction(ty, op, OperandTraits<CmpInst>::op_begin(this), OperandTraits<CmpInst>::operands(this), @@ -2993,8 +3304,7 @@ CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate, } CmpInst * -CmpInst::Create(OtherOps Op, unsigned short predicate, - Value *S1, Value *S2, +CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name, Instruction *InsertBefore) { if (Op == Instruction::ICmp) { if (InsertBefore) @@ -3014,7 +3324,7 @@ CmpInst::Create(OtherOps Op, unsigned short predicate, } CmpInst * -CmpInst::Create(OtherOps Op, unsigned short predicate, Value *S1, Value *S2, +CmpInst::Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, const Twine &Name, BasicBlock *InsertAtEnd) { if (Op == Instruction::ICmp) { return new ICmpInst(*InsertAtEnd, CmpInst::Predicate(predicate), @@ -3077,6 +3387,8 @@ CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) { } } +void ICmpInst::anchor() {} + ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) { switch (pred) { default: llvm_unreachable("Unknown icmp predicate!"); @@ -3196,7 +3508,24 @@ 
CmpInst::Predicate CmpInst::getSwappedPredicate(Predicate pred) { } } -bool CmpInst::isUnsigned(unsigned short predicate) { +CmpInst::Predicate CmpInst::getSignedPredicate(Predicate pred) { + assert(CmpInst::isUnsigned(pred) && "Call only with signed predicates!"); + + switch (pred) { + default: + llvm_unreachable("Unknown predicate!"); + case CmpInst::ICMP_ULT: + return CmpInst::ICMP_SLT; + case CmpInst::ICMP_ULE: + return CmpInst::ICMP_SLE; + case CmpInst::ICMP_UGT: + return CmpInst::ICMP_SGT; + case CmpInst::ICMP_UGE: + return CmpInst::ICMP_SGE; + } +} + +bool CmpInst::isUnsigned(Predicate predicate) { switch (predicate) { default: return false; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT: @@ -3204,7 +3533,7 @@ bool CmpInst::isUnsigned(unsigned short predicate) { } } -bool CmpInst::isSigned(unsigned short predicate) { +bool CmpInst::isSigned(Predicate predicate) { switch (predicate) { default: return false; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT: @@ -3212,7 +3541,7 @@ bool CmpInst::isSigned(unsigned short predicate) { } } -bool CmpInst::isOrdered(unsigned short predicate) { +bool CmpInst::isOrdered(Predicate predicate) { switch (predicate) { default: return false; case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT: @@ -3221,7 +3550,7 @@ bool CmpInst::isOrdered(unsigned short predicate) { } } -bool CmpInst::isUnordered(unsigned short predicate) { +bool CmpInst::isUnordered(Predicate predicate) { switch (predicate) { default: return false; case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT: @@ -3230,7 +3559,7 @@ bool CmpInst::isUnordered(unsigned short predicate) { } } -bool CmpInst::isTrueWhenEqual(unsigned short predicate) { +bool CmpInst::isTrueWhenEqual(Predicate predicate) { switch(predicate) { default: return false; case ICMP_EQ: case ICMP_UGE: case ICMP_ULE: case ICMP_SGE: case ICMP_SLE: @@ -3238,7 +3567,7 @@ bool CmpInst::isTrueWhenEqual(unsigned short predicate) { } } -bool CmpInst::isFalseWhenEqual(unsigned short predicate) { +bool CmpInst::isFalseWhenEqual(Predicate predicate) { switch(predicate) { case ICMP_NE: case ICMP_UGT: case ICMP_ULT: case ICMP_SGT: case ICMP_SLT: case FCMP_FALSE: case FCMP_ONE: case FCMP_OGT: case FCMP_OLT: return true; @@ -3569,6 +3898,10 @@ AddrSpaceCastInst *AddrSpaceCastInst::cloneImpl() const { } CallInst *CallInst::cloneImpl() const { + if (hasOperandBundles()) { + unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo); + return new(getNumOperands(), DescriptorBytes) CallInst(*this); + } return new(getNumOperands()) CallInst(*this); } @@ -3613,11 +3946,31 @@ IndirectBrInst *IndirectBrInst::cloneImpl() const { } InvokeInst *InvokeInst::cloneImpl() const { + if (hasOperandBundles()) { + unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo); + return new(getNumOperands(), DescriptorBytes) InvokeInst(*this); + } return new(getNumOperands()) InvokeInst(*this); } ResumeInst *ResumeInst::cloneImpl() const { return new (1) ResumeInst(*this); } +CleanupReturnInst *CleanupReturnInst::cloneImpl() const { + return new (getNumOperands()) CleanupReturnInst(*this); +} + +CatchReturnInst *CatchReturnInst::cloneImpl() const { + return new (getNumOperands()) CatchReturnInst(*this); +} + +CatchSwitchInst *CatchSwitchInst::cloneImpl() const { + return new CatchSwitchInst(*this); +} + +FuncletPadInst *FuncletPadInst::cloneImpl() const { + return new (getNumOperands()) FuncletPadInst(*this); +} + UnreachableInst 
*UnreachableInst::cloneImpl() const { LLVMContext &Context = getContext(); return new UnreachableInst(Context); diff --git a/contrib/llvm/lib/IR/LLVMContext.cpp b/contrib/llvm/lib/IR/LLVMContext.cpp index 6d799e4..48b53b0 100644 --- a/contrib/llvm/lib/IR/LLVMContext.cpp +++ b/contrib/llvm/lib/IR/LLVMContext.cpp @@ -104,6 +104,39 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { assert(DereferenceableOrNullID == MD_dereferenceable_or_null && "dereferenceable_or_null kind id drifted"); (void)DereferenceableOrNullID; + + // Create the 'make.implicit' metadata kind. + unsigned MakeImplicitID = getMDKindID("make.implicit"); + assert(MakeImplicitID == MD_make_implicit && + "make.implicit kind id drifted"); + (void)MakeImplicitID; + + // Create the 'unpredictable' metadata kind. + unsigned UnpredictableID = getMDKindID("unpredictable"); + assert(UnpredictableID == MD_unpredictable && + "unpredictable kind id drifted"); + (void)UnpredictableID; + + // Create the 'invariant.group' metadata kind. + unsigned InvariantGroupId = getMDKindID("invariant.group"); + assert(InvariantGroupId == MD_invariant_group && + "invariant.group kind id drifted"); + (void)InvariantGroupId; + + // Create the 'align' metadata kind. + unsigned AlignID = getMDKindID("align"); + assert(AlignID == MD_align && "align kind id drifted"); + (void)AlignID; + + auto *DeoptEntry = pImpl->getOrInsertBundleTag("deopt"); + assert(DeoptEntry->second == LLVMContext::OB_deopt && + "deopt operand bundle id drifted!"); + (void)DeoptEntry; + + auto *FuncletEntry = pImpl->getOrInsertBundleTag("funclet"); + assert(FuncletEntry->second == LLVMContext::OB_funclet && + "funclet operand bundle id drifted!"); + (void)FuncletEntry; } LLVMContext::~LLVMContext() { delete pImpl; } @@ -193,6 +226,11 @@ static bool isDiagnosticEnabled(const DiagnosticInfo &DI) { if (!cast<DiagnosticInfoOptimizationRemarkAnalysis>(DI).isEnabled()) return false; break; + case llvm::DK_OptimizationRemarkAnalysisFPCommute: + if (!cast<DiagnosticInfoOptimizationRemarkAnalysisFPCommute>(DI) + .isEnabled()) + return false; + break; default: break; } @@ -250,7 +288,7 @@ unsigned LLVMContext::getMDKindID(StringRef Name) const { .first->second; } -/// getHandlerNames - Populate client supplied smallvector using custome +/// getHandlerNames - Populate client-supplied smallvector using custom /// metadata name and ID. 
void LLVMContext::getMDKindNames(SmallVectorImpl<StringRef> &Names) const { Names.resize(pImpl->CustomMDKindNames.size()); @@ -258,3 +296,27 @@ void LLVMContext::getMDKindNames(SmallVectorImpl<StringRef> &Names) const { E = pImpl->CustomMDKindNames.end(); I != E; ++I) Names[I->second] = I->first(); } + +void LLVMContext::getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const { + pImpl->getOperandBundleTags(Tags); +} + +uint32_t LLVMContext::getOperandBundleTagID(StringRef Tag) const { + return pImpl->getOperandBundleTagID(Tag); +} + +void LLVMContext::setGC(const Function &Fn, std::string GCName) { + auto It = pImpl->GCNames.find(&Fn); + + if (It == pImpl->GCNames.end()) { + pImpl->GCNames.insert(std::make_pair(&Fn, std::move(GCName))); + return; + } + It->second = std::move(GCName); +} +const std::string &LLVMContext::getGC(const Function &Fn) { + return pImpl->GCNames[&Fn]; +} +void LLVMContext::deleteGC(const Function &Fn) { + pImpl->GCNames.erase(&Fn); +} diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.cpp b/contrib/llvm/lib/IR/LLVMContextImpl.cpp index 1e20807..5239b4f 100644 --- a/contrib/llvm/lib/IR/LLVMContextImpl.cpp +++ b/contrib/llvm/lib/IR/LLVMContextImpl.cpp @@ -27,6 +27,7 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID), MetadataTy(C, Type::MetadataTyID), + TokenTy(C, Type::TokenTyID), X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID), PPC_FP128Ty(C, Type::PPC_FP128TyID), @@ -78,7 +79,7 @@ LLVMContextImpl::~LLVMContextImpl() { // unnecessary RAUW when nodes are still unresolved. for (auto *I : DistinctMDNodes) I->dropAllReferences(); -#define HANDLE_MDNODE_LEAF(CLASS) \ +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ for (auto *I : CLASS##s) \ I->dropAllReferences(); #include "llvm/IR/Metadata.def" @@ -92,8 +93,8 @@ LLVMContextImpl::~LLVMContextImpl() { // Destroy MDNodes. 
for (MDNode *I : DistinctMDNodes) I->deleteAsSubclass(); -#define HANDLE_MDNODE_LEAF(CLASS) \ - for (CLASS *I : CLASS##s) \ +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ + for (CLASS * I : CLASS##s) \ delete I; #include "llvm/IR/Metadata.def" @@ -218,6 +219,23 @@ unsigned MDNodeOpsKey::calculateHash(ArrayRef<Metadata *> Ops) { return hash_combine_range(Ops.begin(), Ops.end()); } +StringMapEntry<uint32_t> *LLVMContextImpl::getOrInsertBundleTag(StringRef Tag) { + uint32_t NewIdx = BundleTagCache.size(); + return &*(BundleTagCache.insert(std::make_pair(Tag, NewIdx)).first); +} + +void LLVMContextImpl::getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const { + Tags.resize(BundleTagCache.size()); + for (const auto &T : BundleTagCache) + Tags[T.second] = T.first(); +} + +uint32_t LLVMContextImpl::getOperandBundleTagID(StringRef Tag) const { + auto I = BundleTagCache.find(Tag); + assert(I != BundleTagCache.end() && "Unknown tag!"); + return I->second; +} + // ConstantsContext anchors void UnaryConstantExpr::anchor() { } diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.h b/contrib/llvm/lib/IR/LLVMContextImpl.h index cbbf11e..d42047d 100644 --- a/contrib/llvm/lib/IR/LLVMContextImpl.h +++ b/contrib/llvm/lib/IR/LLVMContextImpl.h @@ -458,67 +458,6 @@ template <> struct MDNodeKeyImpl<DIFile> { unsigned getHashValue() const { return hash_combine(Filename, Directory); } }; -template <> struct MDNodeKeyImpl<DICompileUnit> { - unsigned SourceLanguage; - Metadata *File; - StringRef Producer; - bool IsOptimized; - StringRef Flags; - unsigned RuntimeVersion; - StringRef SplitDebugFilename; - unsigned EmissionKind; - Metadata *EnumTypes; - Metadata *RetainedTypes; - Metadata *Subprograms; - Metadata *GlobalVariables; - Metadata *ImportedEntities; - uint64_t DWOId; - - MDNodeKeyImpl(unsigned SourceLanguage, Metadata *File, StringRef Producer, - bool IsOptimized, StringRef Flags, unsigned RuntimeVersion, - StringRef SplitDebugFilename, unsigned EmissionKind, - Metadata *EnumTypes, Metadata *RetainedTypes, - Metadata *Subprograms, Metadata *GlobalVariables, - Metadata *ImportedEntities, uint64_t DWOId) - : SourceLanguage(SourceLanguage), File(File), Producer(Producer), - IsOptimized(IsOptimized), Flags(Flags), RuntimeVersion(RuntimeVersion), - SplitDebugFilename(SplitDebugFilename), EmissionKind(EmissionKind), - EnumTypes(EnumTypes), RetainedTypes(RetainedTypes), - Subprograms(Subprograms), GlobalVariables(GlobalVariables), - ImportedEntities(ImportedEntities), DWOId(DWOId) {} - MDNodeKeyImpl(const DICompileUnit *N) - : SourceLanguage(N->getSourceLanguage()), File(N->getRawFile()), - Producer(N->getProducer()), IsOptimized(N->isOptimized()), - Flags(N->getFlags()), RuntimeVersion(N->getRuntimeVersion()), - SplitDebugFilename(N->getSplitDebugFilename()), - EmissionKind(N->getEmissionKind()), EnumTypes(N->getRawEnumTypes()), - RetainedTypes(N->getRawRetainedTypes()), - Subprograms(N->getRawSubprograms()), - GlobalVariables(N->getRawGlobalVariables()), - ImportedEntities(N->getRawImportedEntities()), DWOId(N->getDWOId()) {} - - bool isKeyOf(const DICompileUnit *RHS) const { - return SourceLanguage == RHS->getSourceLanguage() && - File == RHS->getRawFile() && Producer == RHS->getProducer() && - IsOptimized == RHS->isOptimized() && Flags == RHS->getFlags() && - RuntimeVersion == RHS->getRuntimeVersion() && - SplitDebugFilename == RHS->getSplitDebugFilename() && - EmissionKind == RHS->getEmissionKind() && - EnumTypes == RHS->getRawEnumTypes() && - RetainedTypes == RHS->getRawRetainedTypes() && - Subprograms == 
RHS->getRawSubprograms() && - GlobalVariables == RHS->getRawGlobalVariables() && - ImportedEntities == RHS->getRawImportedEntities() && - DWOId == RHS->getDWOId(); - } - unsigned getHashValue() const { - return hash_combine(SourceLanguage, File, Producer, IsOptimized, Flags, - RuntimeVersion, SplitDebugFilename, EmissionKind, - EnumTypes, RetainedTypes, Subprograms, GlobalVariables, - ImportedEntities, DWOId); - } -}; - template <> struct MDNodeKeyImpl<DISubprogram> { Metadata *Scope; StringRef Name; @@ -534,7 +473,6 @@ template <> struct MDNodeKeyImpl<DISubprogram> { unsigned VirtualIndex; unsigned Flags; bool IsOptimized; - Metadata *Function; Metadata *TemplateParams; Metadata *Declaration; Metadata *Variables; @@ -544,15 +482,15 @@ template <> struct MDNodeKeyImpl<DISubprogram> { bool IsLocalToUnit, bool IsDefinition, unsigned ScopeLine, Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex, unsigned Flags, bool IsOptimized, - Metadata *Function, Metadata *TemplateParams, - Metadata *Declaration, Metadata *Variables) + Metadata *TemplateParams, Metadata *Declaration, + Metadata *Variables) : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File), Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition), ScopeLine(ScopeLine), ContainingType(ContainingType), Virtuality(Virtuality), VirtualIndex(VirtualIndex), Flags(Flags), IsOptimized(IsOptimized), - Function(Function), TemplateParams(TemplateParams), - Declaration(Declaration), Variables(Variables) {} + TemplateParams(TemplateParams), Declaration(Declaration), + Variables(Variables) {} MDNodeKeyImpl(const DISubprogram *N) : Scope(N->getRawScope()), Name(N->getName()), LinkageName(N->getLinkageName()), File(N->getRawFile()), @@ -561,7 +499,6 @@ template <> struct MDNodeKeyImpl<DISubprogram> { ScopeLine(N->getScopeLine()), ContainingType(N->getRawContainingType()), Virtuality(N->getVirtuality()), VirtualIndex(N->getVirtualIndex()), Flags(N->getFlags()), IsOptimized(N->isOptimized()), - Function(N->getRawFunction()), TemplateParams(N->getRawTemplateParams()), Declaration(N->getRawDeclaration()), Variables(N->getRawVariables()) {} @@ -576,7 +513,6 @@ template <> struct MDNodeKeyImpl<DISubprogram> { Virtuality == RHS->getVirtuality() && VirtualIndex == RHS->getVirtualIndex() && Flags == RHS->getFlags() && IsOptimized == RHS->isOptimized() && - Function == RHS->getRawFunction() && TemplateParams == RHS->getRawTemplateParams() && Declaration == RHS->getRawDeclaration() && Variables == RHS->getRawVariables(); @@ -584,7 +520,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> { unsigned getHashValue() const { return hash_combine(Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, ScopeLine, ContainingType, - Virtuality, VirtualIndex, Flags, IsOptimized, Function, + Virtuality, VirtualIndex, Flags, IsOptimized, TemplateParams, Declaration, Variables); } }; @@ -759,7 +695,6 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> { }; template <> struct MDNodeKeyImpl<DILocalVariable> { - unsigned Tag; Metadata *Scope; StringRef Name; Metadata *File; @@ -768,23 +703,23 @@ template <> struct MDNodeKeyImpl<DILocalVariable> { unsigned Arg; unsigned Flags; - MDNodeKeyImpl(unsigned Tag, Metadata *Scope, StringRef Name, Metadata *File, - unsigned Line, Metadata *Type, unsigned Arg, unsigned Flags) - : Tag(Tag), Scope(Scope), Name(Name), File(File), Line(Line), Type(Type), - Arg(Arg), Flags(Flags) {} + MDNodeKeyImpl(Metadata *Scope, StringRef Name, Metadata *File, unsigned Line, + Metadata 
*Type, unsigned Arg, unsigned Flags) + : Scope(Scope), Name(Name), File(File), Line(Line), Type(Type), Arg(Arg), + Flags(Flags) {} MDNodeKeyImpl(const DILocalVariable *N) - : Tag(N->getTag()), Scope(N->getRawScope()), Name(N->getName()), - File(N->getRawFile()), Line(N->getLine()), Type(N->getRawType()), - Arg(N->getArg()), Flags(N->getFlags()) {} + : Scope(N->getRawScope()), Name(N->getName()), File(N->getRawFile()), + Line(N->getLine()), Type(N->getRawType()), Arg(N->getArg()), + Flags(N->getFlags()) {} bool isKeyOf(const DILocalVariable *RHS) const { - return Tag == RHS->getTag() && Scope == RHS->getRawScope() && - Name == RHS->getName() && File == RHS->getRawFile() && - Line == RHS->getLine() && Type == RHS->getRawType() && - Arg == RHS->getArg() && Flags == RHS->getFlags(); + return Scope == RHS->getRawScope() && Name == RHS->getName() && + File == RHS->getRawFile() && Line == RHS->getLine() && + Type == RHS->getRawType() && Arg == RHS->getArg() && + Flags == RHS->getFlags(); } unsigned getHashValue() const { - return hash_combine(Tag, Scope, Name, File, Line, Type, Arg, Flags); + return hash_combine(Scope, Name, File, Line, Type, Arg, Flags); } }; @@ -857,6 +792,49 @@ template <> struct MDNodeKeyImpl<DIImportedEntity> { } }; +template <> struct MDNodeKeyImpl<DIMacro> { + unsigned MIType; + unsigned Line; + StringRef Name; + StringRef Value; + + MDNodeKeyImpl(unsigned MIType, unsigned Line, StringRef Name, StringRef Value) + : MIType(MIType), Line(Line), Name(Name), Value(Value) {} + MDNodeKeyImpl(const DIMacro *N) + : MIType(N->getMacinfoType()), Line(N->getLine()), Name(N->getName()), + Value(N->getValue()) {} + + bool isKeyOf(const DIMacro *RHS) const { + return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && + Name == RHS->getName() && Value == RHS->getValue(); + } + unsigned getHashValue() const { + return hash_combine(MIType, Line, Name, Value); + } +}; + +template <> struct MDNodeKeyImpl<DIMacroFile> { + unsigned MIType; + unsigned Line; + Metadata *File; + Metadata *Elements; + + MDNodeKeyImpl(unsigned MIType, unsigned Line, Metadata *File, + Metadata *Elements) + : MIType(MIType), Line(Line), File(File), Elements(Elements) {} + MDNodeKeyImpl(const DIMacroFile *N) + : MIType(N->getMacinfoType()), Line(N->getLine()), File(N->getRawFile()), + Elements(N->getRawElements()) {} + + bool isKeyOf(const DIMacroFile *RHS) const { + return MIType == RHS->getMacinfoType() && Line == RHS->getLine() && + File == RHS->getRawFile() && File == RHS->getRawElements(); + } + unsigned getHashValue() const { + return hash_combine(MIType, Line, File, Elements); + } +}; + /// \brief DenseMapInfo for MDNode subclasses. template <class NodeTy> struct MDNodeInfo { typedef MDNodeKeyImpl<NodeTy> KeyTy; @@ -953,7 +931,8 @@ public: DenseMap<const Value*, ValueName*> ValueNames; -#define HANDLE_MDNODE_LEAF(CLASS) DenseSet<CLASS *, CLASS##Info> CLASS##s; +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ + DenseSet<CLASS *, CLASS##Info> CLASS##s; #include "llvm/IR/Metadata.def" // MDNodes may be uniqued or not uniqued. When they're not uniqued, they @@ -988,8 +967,10 @@ public: ConstantInt *TheTrueVal; ConstantInt *TheFalseVal; + std::unique_ptr<ConstantTokenNone> TheNoneToken; + // Basic type instances. 
- Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy; + Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy, TokenTy; Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; @@ -1033,20 +1014,26 @@ public: /// instructions in different blocks at the same location. DenseMap<std::pair<const char *, unsigned>, unsigned> DiscriminatorTable; - /// \brief Mapping from a function to its prefix data, which is stored as the - /// operand of an unparented ReturnInst so that the prefix data has a Use. - typedef DenseMap<const Function *, ReturnInst *> PrefixDataMapTy; - PrefixDataMapTy PrefixDataMap; - - /// \brief Mapping from a function to its prologue data, which is stored as - /// the operand of an unparented ReturnInst so that the prologue data has a - /// Use. - typedef DenseMap<const Function *, ReturnInst *> PrologueDataMapTy; - PrologueDataMapTy PrologueDataMap; - int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx); int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx); + /// \brief A set of interned tags for operand bundles. The StringMap maps + /// bundle tags to their IDs. + /// + /// \see LLVMContext::getOperandBundleTagID + StringMap<uint32_t> BundleTagCache; + + StringMapEntry<uint32_t> *getOrInsertBundleTag(StringRef Tag); + void getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const; + uint32_t getOperandBundleTagID(StringRef Tag) const; + + /// Maintain the GC name for each function. + /// + /// This saves allocating an additional word in Function for programs which + /// do not use GC (i.e., most programs) at the cost of increased overhead for + /// clients which do use GC. + DenseMap<const Function*, std::string> GCNames; + LLVMContextImpl(LLVMContext &C); ~LLVMContextImpl(); diff --git a/contrib/llvm/lib/IR/LegacyPassManager.cpp b/contrib/llvm/lib/IR/LegacyPassManager.cpp index 27d98a2..63d89f2 100644 --- a/contrib/llvm/lib/IR/LegacyPassManager.cpp +++ b/contrib/llvm/lib/IR/LegacyPassManager.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <map> +#include <unordered_set> using namespace llvm; using namespace llvm::legacy; @@ -83,6 +84,13 @@ PrintAfterAll("print-after-all", llvm::cl::desc("Print IR after each pass"), cl::init(false)); +static cl::list<std::string> + PrintFuncsList("filter-print-funcs", cl::value_desc("function names"), + cl::desc("Only print IR for functions whose name " + "match this for all print-[before|after][-all] " + "options"), + cl::CommaSeparated); + /// This is a helper to determine whether to print IR before or /// after a pass. @@ -109,6 +117,11 @@ static bool ShouldPrintAfterPass(const PassInfo *PI) { return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter); } +bool llvm::isFunctionInPrintList(StringRef FunctionName) { + static std::unordered_set<std::string> PrintFuncNames(PrintFuncsList.begin(), + PrintFuncsList.end()); + return PrintFuncNames.empty() || PrintFuncNames.count(FunctionName); +} /// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions /// or higher is specified. 
bool PMDataManager::isPassDebuggingExecutionsOrMore() const { @@ -569,13 +582,33 @@ void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses, AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) { AnalysisUsage *AnUsage = nullptr; - DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P); + auto DMI = AnUsageMap.find(P); if (DMI != AnUsageMap.end()) AnUsage = DMI->second; else { - AnUsage = new AnalysisUsage(); - P->getAnalysisUsage(*AnUsage); - AnUsageMap[P] = AnUsage; + // Look up the analysis usage from the pass instance (different instances + // of the same pass can produce different results), but unique the + // resulting object to reduce memory usage. This helps to greatly reduce + // memory usage when we have many instances of only a few pass types + // (e.g. instcombine, simplifycfg, etc...) which tend to share a fixed set + // of dependencies. + AnalysisUsage AU; + P->getAnalysisUsage(AU); + + AUFoldingSetNode* Node = nullptr; + FoldingSetNodeID ID; + AUFoldingSetNode::Profile(ID, AU); + void *IP = nullptr; + if (auto *N = UniqueAnalysisUsages.FindNodeOrInsertPos(ID, IP)) + Node = N; + else { + Node = new (AUFoldingSetNodeAllocator.Allocate()) AUFoldingSetNode(AU); + UniqueAnalysisUsages.InsertNode(Node, IP); + } + assert(Node && "cached analysis usage must be non null"); + + AnUsageMap[P] = &Node->AU; + AnUsage = &Node->AU;; } return AnUsage; } @@ -686,6 +719,10 @@ void PMTopLevelManager::schedulePass(Pass *P) { /// passes and all pass managers. If desired pass is not found /// then return NULL. Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) { + // For immutable passes we have a direct mapping from ID to pass, so check + // that first. + if (Pass *P = ImmutablePassMap.lookup(AID)) + return P; // Check pass managers for (PMDataManager *PassManager : PassManagers) @@ -697,24 +734,6 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) { if (Pass *P = IndirectPassManager->findAnalysisPass(AID, false)) return P; - // Check the immutable passes. Iterate in reverse order so that we find - // the most recently registered passes first. - for (auto I = ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; - ++I) { - AnalysisID PI = (*I)->getPassID(); - if (PI == AID) - return *I; - - // If Pass not found then check the interfaces implemented by Immutable Pass - const PassInfo *PassInf = findAnalysisPassInfo(PI); - assert(PassInf && "Expected all immutable passes to be initialized"); - const std::vector<const PassInfo*> &ImmPI = - PassInf->getInterfacesImplemented(); - for (const PassInfo *PI : ImmPI) - if (PI->getTypeInfo() == AID) - return *I; - } - return nullptr; } @@ -729,6 +748,24 @@ const PassInfo *PMTopLevelManager::findAnalysisPassInfo(AnalysisID AID) const { return PI; } +void PMTopLevelManager::addImmutablePass(ImmutablePass *P) { + P->initializePass(); + ImmutablePasses.push_back(P); + + // Add this pass to the map from its analysis ID. We clobber any prior runs + // of the pass in the map so that the last one added is the one found when + // doing lookups. + AnalysisID AID = P->getPassID(); + ImmutablePassMap[AID] = P; + + // Also add any interfaces implemented by the immutable pass to the map for + // fast lookup. + const PassInfo *PassInf = findAnalysisPassInfo(AID); + assert(PassInf && "Expected all immutable passes to be initialized"); + for (const PassInfo *ImmPI : PassInf->getInterfacesImplemented()) + ImmutablePassMap[ImmPI->getTypeInfo()] = P; +} + // Print passes managed by this top level manager. 
void PMTopLevelManager::dumpPasses() const { @@ -780,15 +817,8 @@ void PMTopLevelManager::initializeAllAnalysisInfo() { for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(), DME = LastUser.end(); DMI != DME; ++DMI) { - DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI = - InversedLastUser.find(DMI->second); - if (InvDMI != InversedLastUser.end()) { - SmallPtrSet<Pass *, 8> &L = InvDMI->second; - L.insert(DMI->first); - } else { - SmallPtrSet<Pass *, 8> L; L.insert(DMI->first); - InversedLastUser[DMI->second] = L; - } + SmallPtrSet<Pass *, 8> &L = InversedLastUser[DMI->second]; + L.insert(DMI->first); } } @@ -801,10 +831,6 @@ PMTopLevelManager::~PMTopLevelManager() { for (SmallVectorImpl<ImmutablePass *>::iterator I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I) delete *I; - - for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(), - DME = AnUsageMap.end(); DMI != DME; ++DMI) - delete DMI->second; } //===----------------------------------------------------------------------===// @@ -989,31 +1015,28 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) { // At the moment, this pass is the last user of all required passes. SmallVector<Pass *, 12> LastUses; - SmallVector<Pass *, 8> RequiredPasses; + SmallVector<Pass *, 8> UsedPasses; SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable; unsigned PDepth = this->getDepth(); - collectRequiredAnalysis(RequiredPasses, - ReqAnalysisNotAvailable, P); - for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(), - E = RequiredPasses.end(); I != E; ++I) { - Pass *PRequired = *I; + collectRequiredAndUsedAnalyses(UsedPasses, ReqAnalysisNotAvailable, P); + for (Pass *PUsed : UsedPasses) { unsigned RDepth = 0; - assert(PRequired->getResolver() && "Analysis Resolver is not set"); - PMDataManager &DM = PRequired->getResolver()->getPMDataManager(); + assert(PUsed->getResolver() && "Analysis Resolver is not set"); + PMDataManager &DM = PUsed->getResolver()->getPMDataManager(); RDepth = DM.getDepth(); if (PDepth == RDepth) - LastUses.push_back(PRequired); + LastUses.push_back(PUsed); else if (PDepth > RDepth) { // Let the parent claim responsibility of last use - TransferLastUses.push_back(PRequired); + TransferLastUses.push_back(PUsed); // Keep track of higher level analysis used by this manager. - HigherLevelAnalysis.push_back(PRequired); + HigherLevelAnalysis.push_back(PUsed); } else - llvm_unreachable("Unable to accommodate Required Pass"); + llvm_unreachable("Unable to accommodate Used Pass"); } // Set P as P's last user until someone starts using P. @@ -1030,10 +1053,8 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) { } // Now, take care of required analyses that are not available. - for (SmallVectorImpl<AnalysisID>::iterator - I = ReqAnalysisNotAvailable.begin(), - E = ReqAnalysisNotAvailable.end() ;I != E; ++I) { - const PassInfo *PI = TPM->findAnalysisPassInfo(*I); + for (AnalysisID ID : ReqAnalysisNotAvailable) { + const PassInfo *PI = TPM->findAnalysisPassInfo(ID); Pass *AnalysisPass = PI->createPass(); this->addLowerLevelRequiredPass(P, AnalysisPass); } @@ -1048,30 +1069,29 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) { } -/// Populate RP with analysis pass that are required by +/// Populate UP with analysis pass that are used or required by /// pass P and are available. Populate RP_NotAvail with analysis /// pass that are required by pass P but are not available. 
-void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP, - SmallVectorImpl<AnalysisID> &RP_NotAvail, - Pass *P) { +void PMDataManager::collectRequiredAndUsedAnalyses( + SmallVectorImpl<Pass *> &UP, SmallVectorImpl<AnalysisID> &RP_NotAvail, + Pass *P) { AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P); - const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet(); - for (AnalysisUsage::VectorType::const_iterator - I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) { - if (Pass *AnalysisPass = findAnalysisPass(*I, true)) - RP.push_back(AnalysisPass); + + for (const auto &UsedID : AnUsage->getUsedSet()) + if (Pass *AnalysisPass = findAnalysisPass(UsedID, true)) + UP.push_back(AnalysisPass); + + for (const auto &RequiredID : AnUsage->getRequiredSet()) + if (Pass *AnalysisPass = findAnalysisPass(RequiredID, true)) + UP.push_back(AnalysisPass); else - RP_NotAvail.push_back(*I); - } + RP_NotAvail.push_back(RequiredID); - const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet(); - for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(), - E = IDs.end(); I != E; ++I) { - if (Pass *AnalysisPass = findAnalysisPass(*I, true)) - RP.push_back(AnalysisPass); + for (const auto &RequiredID : AnUsage->getRequiredTransitiveSet()) + if (Pass *AnalysisPass = findAnalysisPass(RequiredID, true)) + UP.push_back(AnalysisPass); else - RP_NotAvail.push_back(*I); - } + RP_NotAvail.push_back(RequiredID); } // All Required analyses should be available to the pass as it runs! Here @@ -1206,6 +1226,15 @@ void PMDataManager::dumpPreservedSet(const Pass *P) const { dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet()); } +void PMDataManager::dumpUsedSet(const Pass *P) const { + if (PassDebugging < Details) + return; + + AnalysisUsage analysisUsage; + P->getAnalysisUsage(analysisUsage); + dumpAnalysisUsage("Used", P, analysisUsage.getUsedSet()); +} + void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P, const AnalysisUsage::VectorType &Set) const { assert(PassDebugging >= Details); @@ -1310,6 +1339,7 @@ bool BBPassManager::runOnFunction(Function &F) { dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG, I->getName()); dumpPreservedSet(BP); + dumpUsedSet(BP); verifyPreservedAnalysis(BP); removeNotPreservedAnalysis(BP); @@ -1524,6 +1554,7 @@ bool FPPassManager::runOnFunction(Function &F) { if (LocalChanged) dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName()); dumpPreservedSet(FP); + dumpUsedSet(FP); verifyPreservedAnalysis(FP); removeNotPreservedAnalysis(FP); @@ -1601,6 +1632,7 @@ MPPassManager::runOnModule(Module &M) { dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG, M.getModuleIdentifier()); dumpPreservedSet(MP); + dumpUsedSet(MP); verifyPreservedAnalysis(MP); removeNotPreservedAnalysis(MP); diff --git a/contrib/llvm/lib/IR/MDBuilder.cpp b/contrib/llvm/lib/IR/MDBuilder.cpp index b4c5ca7..4ce3ea2 100644 --- a/contrib/llvm/lib/IR/MDBuilder.cpp +++ b/contrib/llvm/lib/IR/MDBuilder.cpp @@ -36,8 +36,7 @@ MDNode *MDBuilder::createFPMath(float Accuracy) { MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight) { - uint32_t Weights[] = {TrueWeight, FalseWeight}; - return createBranchWeights(Weights); + return createBranchWeights({TrueWeight, FalseWeight}); } MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) { @@ -53,14 +52,15 @@ MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) { return MDNode::get(Context, Vals); } -MDNode 
*MDBuilder::createFunctionEntryCount(uint64_t Count) { - SmallVector<Metadata *, 2> Vals(2); - Vals[0] = createString("function_entry_count"); +MDNode *MDBuilder::createUnpredictable() { + return MDNode::get(Context, None); +} +MDNode *MDBuilder::createFunctionEntryCount(uint64_t Count) { Type *Int64Ty = Type::getInt64Ty(Context); - Vals[1] = createConstant(ConstantInt::get(Int64Ty, Count)); - - return MDNode::get(Context, Vals); + return MDNode::get(Context, + {createString("function_entry_count"), + createConstant(ConstantInt::get(Int64Ty, Count))}); } MDNode *MDBuilder::createRange(const APInt &Lo, const APInt &Hi) { @@ -76,8 +76,7 @@ MDNode *MDBuilder::createRange(Constant *Lo, Constant *Hi) { return nullptr; // Return the range [Lo, Hi). - Metadata *Range[2] = {createConstant(Lo), createConstant(Hi)}; - return MDNode::get(Context, Range); + return MDNode::get(Context, {createConstant(Lo), createConstant(Hi)}); } MDNode *MDBuilder::createAnonymousAARoot(StringRef Name, MDNode *Extra) { @@ -112,12 +111,10 @@ MDNode *MDBuilder::createTBAANode(StringRef Name, MDNode *Parent, bool isConstant) { if (isConstant) { Constant *Flags = ConstantInt::get(Type::getInt64Ty(Context), 1); - Metadata *Ops[3] = {createString(Name), Parent, createConstant(Flags)}; - return MDNode::get(Context, Ops); - } else { - Metadata *Ops[2] = {createString(Name), Parent}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, + {createString(Name), Parent, createConstant(Flags)}); } + return MDNode::get(Context, {createString(Name), Parent}); } MDNode *MDBuilder::createAliasScopeDomain(StringRef Name) { @@ -125,8 +122,7 @@ MDNode *MDBuilder::createAliasScopeDomain(StringRef Name) { } MDNode *MDBuilder::createAliasScope(StringRef Name, MDNode *Domain) { - Metadata *Ops[2] = {createString(Name), Domain}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, {createString(Name), Domain}); } /// \brief Return metadata for a tbaa.struct node with the given @@ -161,23 +157,19 @@ MDNode *MDBuilder::createTBAAStructTypeNode( MDNode *MDBuilder::createTBAAScalarTypeNode(StringRef Name, MDNode *Parent, uint64_t Offset) { ConstantInt *Off = ConstantInt::get(Type::getInt64Ty(Context), Offset); - Metadata *Ops[3] = {createString(Name), Parent, createConstant(Off)}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, + {createString(Name), Parent, createConstant(Off)}); } /// \brief Return metadata for a TBAA tag node with the given /// base type, access type and offset relative to the base type. 
MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, uint64_t Offset, bool IsConstant) { - Type *Int64 = Type::getInt64Ty(Context); + IntegerType *Int64 = Type::getInt64Ty(Context); + ConstantInt *Off = ConstantInt::get(Int64, Offset); if (IsConstant) { - Metadata *Ops[4] = {BaseType, AccessType, - createConstant(ConstantInt::get(Int64, Offset)), - createConstant(ConstantInt::get(Int64, 1))}; - return MDNode::get(Context, Ops); - } else { - Metadata *Ops[3] = {BaseType, AccessType, - createConstant(ConstantInt::get(Int64, Offset))}; - return MDNode::get(Context, Ops); + return MDNode::get(Context, {BaseType, AccessType, createConstant(Off), + createConstant(ConstantInt::get(Int64, 1))}); } + return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)}); } diff --git a/contrib/llvm/lib/IR/Metadata.cpp b/contrib/llvm/lib/IR/Metadata.cpp index 1abcf0d..9a9a501 100644 --- a/contrib/llvm/lib/IR/Metadata.cpp +++ b/contrib/llvm/lib/IR/Metadata.cpp @@ -120,6 +120,38 @@ void MetadataAsValue::untrack() { MetadataTracking::untrack(MD); } +bool MetadataTracking::track(void *Ref, Metadata &MD, OwnerTy Owner) { + assert(Ref && "Expected live reference"); + assert((Owner || *static_cast<Metadata **>(Ref) == &MD) && + "Reference without owner must be direct"); + if (auto *R = ReplaceableMetadataImpl::get(MD)) { + R->addRef(Ref, Owner); + return true; + } + return false; +} + +void MetadataTracking::untrack(void *Ref, Metadata &MD) { + assert(Ref && "Expected live reference"); + if (auto *R = ReplaceableMetadataImpl::get(MD)) + R->dropRef(Ref); +} + +bool MetadataTracking::retrack(void *Ref, Metadata &MD, void *New) { + assert(Ref && "Expected live reference"); + assert(New && "Expected live reference"); + assert(Ref != New && "Expected change"); + if (auto *R = ReplaceableMetadataImpl::get(MD)) { + R->moveRef(Ref, New, MD); + return true; + } + return false; +} + +bool MetadataTracking::isReplaceable(const Metadata &MD) { + return ReplaceableMetadataImpl::get(const_cast<Metadata &>(MD)); +} + void ReplaceableMetadataImpl::addRef(void *Ref, OwnerTy Owner) { bool WasInserted = UseMap.insert(std::make_pair(Ref, std::make_pair(Owner, NextIndex))) @@ -158,6 +190,8 @@ void ReplaceableMetadataImpl::moveRef(void *Ref, void *New, void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) { assert(!(MD && isa<MDNode>(MD) && cast<MDNode>(MD)->isTemporary()) && "Expected non-temp node"); + assert(CanReplace && + "Attempted to replace Metadata marked for no replacement"); if (UseMap.empty()) return; @@ -239,6 +273,12 @@ void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) { } } +ReplaceableMetadataImpl *ReplaceableMetadataImpl::get(Metadata &MD) { + if (auto *N = dyn_cast<MDNode>(&MD)) + return N->Context.getReplaceableUses(); + return dyn_cast<ValueAsMetadata>(&MD); +} + static Function *getLocalFunction(Value *V) { assert(V && "Expected value"); if (auto *A = dyn_cast<Argument>(V)) @@ -517,7 +557,7 @@ void MDNode::decrementUnresolvedOperandCount() { resolve(); } -void MDNode::resolveCycles() { +void MDNode::resolveRecursivelyImpl(bool AllowTemps) { if (isResolved()) return; @@ -530,6 +570,8 @@ void MDNode::resolveCycles() { if (!N) continue; + if (N->isTemporary() && AllowTemps) + continue; assert(!N->isTemporary() && "Expected all forward declarations to be resolved"); if (!N->isResolved()) @@ -545,6 +587,18 @@ static bool hasSelfReference(MDNode *N) { } MDNode *MDNode::replaceWithPermanentImpl() { + switch (getMetadataID()) { + default: + // If this type isn't 
uniquable, replace with a distinct node. + return replaceWithDistinctImpl(); + +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ + case CLASS##Kind: \ + break; +#include "llvm/IR/Metadata.def" + } + + // Even if this type is uniquable, self-references have to be distinct. if (hasSelfReference(this)) return replaceWithDistinctImpl(); return replaceWithUniquedImpl(); @@ -671,8 +725,8 @@ MDNode *MDNode::uniquify() { // Try to insert into uniquing store. switch (getMetadataID()) { default: - llvm_unreachable("Invalid subclass of MDNode"); -#define HANDLE_MDNODE_LEAF(CLASS) \ + llvm_unreachable("Invalid or non-uniquable subclass of MDNode"); +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ case CLASS##Kind: { \ CLASS *SubclassThis = cast<CLASS>(this); \ std::integral_constant<bool, HasCachedHash<CLASS>::value> \ @@ -687,8 +741,8 @@ MDNode *MDNode::uniquify() { void MDNode::eraseFromStore() { switch (getMetadataID()) { default: - llvm_unreachable("Invalid subclass of MDNode"); -#define HANDLE_MDNODE_LEAF(CLASS) \ + llvm_unreachable("Invalid or non-uniquable subclass of MDNode"); +#define HANDLE_MDNODE_LEAF_UNIQUABLE(CLASS) \ case CLASS##Kind: \ getContext().pImpl->CLASS##s.erase(cast<CLASS>(this)); \ break; @@ -941,6 +995,17 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) { return MDNode::get(A->getContext(), MDs); } +MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) { + if (!A || !B) + return nullptr; + + ConstantInt *AVal = mdconst::extract<ConstantInt>(A->getOperand(0)); + ConstantInt *BVal = mdconst::extract<ConstantInt>(B->getOperand(0)); + if (AVal->getZExtValue() < BVal->getZExtValue()) + return A; + return B; +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. // @@ -1045,14 +1110,10 @@ MDNode *Instruction::getMetadataImpl(StringRef Kind) const { return getMetadataImpl(getContext().getMDKindID(Kind)); } -void Instruction::dropUnknownMetadata(ArrayRef<unsigned> KnownIDs) { +void Instruction::dropUnknownNonDebugMetadata(ArrayRef<unsigned> KnownIDs) { SmallSet<unsigned, 5> KnownSet; KnownSet.insert(KnownIDs.begin(), KnownIDs.end()); - // Drop debug if needed - if (KnownSet.erase(LLVMContext::MD_dbg)) - DbgLoc = DebugLoc(); - if (!hasMetadataHashEntry()) return; // Nothing to remove! @@ -1077,7 +1138,7 @@ void Instruction::dropUnknownMetadata(ArrayRef<unsigned> KnownIDs) { } } -/// setMetadata - Set the metadata of of the specified kind to the specified +/// setMetadata - Set the metadata of the specified kind to the specified /// node. This updates/replaces metadata if already present, or removes it if /// Node is null. void Instruction::setMetadata(unsigned KindID, MDNode *Node) { @@ -1251,3 +1312,11 @@ void Function::clearMetadata() { getContext().pImpl->FunctionMetadata.erase(this); setHasMetadataHashEntry(false); } + +void Function::setSubprogram(DISubprogram *SP) { + setMetadata(LLVMContext::MD_dbg, SP); +} + +DISubprogram *Function::getSubprogram() const { + return cast_or_null<DISubprogram>(getMetadata(LLVMContext::MD_dbg)); +} diff --git a/contrib/llvm/lib/IR/MetadataImpl.h b/contrib/llvm/lib/IR/MetadataImpl.h index 662a50e..b913746 100644 --- a/contrib/llvm/lib/IR/MetadataImpl.h +++ b/contrib/llvm/lib/IR/MetadataImpl.h @@ -26,6 +26,19 @@ static T *getUniqued(DenseSet<T *, InfoT> &Store, return I == Store.end() ? 
nullptr : *I; } +template <class T> T *MDNode::storeImpl(T *N, StorageType Storage) { + switch (Storage) { + case Uniqued: + llvm_unreachable("Cannot unique without a uniquing-store"); + case Distinct: + N->storeDistinctInContext(); + break; + case Temporary: + break; + } + return N; +} + template <class T, class StoreT> T *MDNode::storeImpl(T *N, StorageType Storage, StoreT &Store) { switch (Storage) { diff --git a/contrib/llvm/lib/IR/MetadataTracking.cpp b/contrib/llvm/lib/IR/MetadataTracking.cpp deleted file mode 100644 index 47f0b93..0000000 --- a/contrib/llvm/lib/IR/MetadataTracking.cpp +++ /dev/null @@ -1,55 +0,0 @@ -//===- MetadataTracking.cpp - Implement metadata tracking -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements Metadata tracking. -// -//===----------------------------------------------------------------------===// - -#include "llvm/IR/MetadataTracking.h" -#include "llvm/IR/Metadata.h" - -using namespace llvm; - -ReplaceableMetadataImpl *ReplaceableMetadataImpl::get(Metadata &MD) { - if (auto *N = dyn_cast<MDNode>(&MD)) - return N->Context.getReplaceableUses(); - return dyn_cast<ValueAsMetadata>(&MD); -} - -bool MetadataTracking::track(void *Ref, Metadata &MD, OwnerTy Owner) { - assert(Ref && "Expected live reference"); - assert((Owner || *static_cast<Metadata **>(Ref) == &MD) && - "Reference without owner must be direct"); - if (auto *R = ReplaceableMetadataImpl::get(MD)) { - R->addRef(Ref, Owner); - return true; - } - return false; -} - -void MetadataTracking::untrack(void *Ref, Metadata &MD) { - assert(Ref && "Expected live reference"); - if (auto *R = ReplaceableMetadataImpl::get(MD)) - R->dropRef(Ref); -} - -bool MetadataTracking::retrack(void *Ref, Metadata &MD, void *New) { - assert(Ref && "Expected live reference"); - assert(New && "Expected live reference"); - assert(Ref != New && "Expected change"); - if (auto *R = ReplaceableMetadataImpl::get(MD)) { - R->moveRef(Ref, New, MD); - return true; - } - return false; -} - -bool MetadataTracking::isReplaceable(const Metadata &MD) { - return ReplaceableMetadataImpl::get(const_cast<Metadata &>(MD)); -} diff --git a/contrib/llvm/lib/IR/Module.cpp b/contrib/llvm/lib/IR/Module.cpp index 043f74e..ac578d6 100644 --- a/contrib/llvm/lib/IR/Module.cpp +++ b/contrib/llvm/lib/IR/Module.cpp @@ -29,6 +29,7 @@ #include <algorithm> #include <cstdarg> #include <cstdlib> + using namespace llvm; //===----------------------------------------------------------------------===// @@ -37,9 +38,9 @@ using namespace llvm; // Explicit instantiations of SymbolTableListTraits since some of the methods // are not in the public header file. -template class llvm::SymbolTableListTraits<Function, Module>; -template class llvm::SymbolTableListTraits<GlobalVariable, Module>; -template class llvm::SymbolTableListTraits<GlobalAlias, Module>; +template class llvm::SymbolTableListTraits<Function>; +template class llvm::SymbolTableListTraits<GlobalVariable>; +template class llvm::SymbolTableListTraits<GlobalAlias>; //===----------------------------------------------------------------------===// // Primitive Module methods. 
@@ -81,7 +82,6 @@ RandomNumberGenerator *Module::createRNG(const Pass* P) const { return new RandomNumberGenerator(Salt); } - /// getNamedValue - Return the first global value in the module with /// the specified name, of arbitrary type. This method returns null /// if a global with the specified name is not found. @@ -102,6 +102,9 @@ void Module::getMDKindNames(SmallVectorImpl<StringRef> &Result) const { return Context.getMDKindNames(Result); } +void Module::getOperandBundleTags(SmallVectorImpl<StringRef> &Result) const { + return Context.getOperandBundleTags(Result); +} //===----------------------------------------------------------------------===// // Methods for easy access to the functions in the module. @@ -274,7 +277,7 @@ NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) { /// delete it. void Module::eraseNamedMetadata(NamedMDNode *NMD) { static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab)->erase(NMD->getName()); - NamedMDList.erase(NMD); + NamedMDList.erase(NMD->getIterator()); } bool Module::isValidModFlagBehavior(Metadata *MD, ModFlagBehavior &MFB) { @@ -376,17 +379,11 @@ const DataLayout &Module::getDataLayout() const { return DL; } // void Module::setMaterializer(GVMaterializer *GVM) { assert(!Materializer && - "Module already has a GVMaterializer. Call MaterializeAllPermanently" + "Module already has a GVMaterializer. Call materializeAll" " to clear it out before setting another one."); Materializer.reset(GVM); } -bool Module::isDematerializable(const GlobalValue *GV) const { - if (Materializer) - return Materializer->isDematerializable(GV); - return false; -} - std::error_code Module::materialize(GlobalValue *GV) { if (!Materializer) return std::error_code(); @@ -394,23 +391,11 @@ std::error_code Module::materialize(GlobalValue *GV) { return Materializer->materialize(GV); } -void Module::dematerialize(GlobalValue *GV) { - if (Materializer) - return Materializer->dematerialize(GV); -} - std::error_code Module::materializeAll() { if (!Materializer) return std::error_code(); - return Materializer->materializeModule(this); -} - -std::error_code Module::materializeAllPermanently() { - if (std::error_code EC = materializeAll()) - return EC; - - Materializer.reset(); - return std::error_code(); + std::unique_ptr<GVMaterializer> M = std::move(Materializer); + return M->materializeModule(); } std::error_code Module::materializeMetadata() { @@ -458,7 +443,14 @@ void Module::dropAllReferences() { unsigned Module::getDwarfVersion() const { auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("Dwarf Version")); if (!Val) - return dwarf::DWARF_VERSION; + return 0; + return cast<ConstantInt>(Val->getValue())->getZExtValue(); +} + +unsigned Module::getCodeViewFlag() const { + auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("CodeView")); + if (!Val) + return 0; return cast<ConstantInt>(Val->getValue())->getZExtValue(); } @@ -471,7 +463,7 @@ Comdat *Module::getOrInsertComdat(StringRef Name) { PICLevel::Level Module::getPICLevel() const { auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("PIC Level")); - if (Val == NULL) + if (!Val) return PICLevel::Default; return static_cast<PICLevel::Level>( @@ -481,3 +473,15 @@ PICLevel::Level Module::getPICLevel() const { void Module::setPICLevel(PICLevel::Level PL) { addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL); } + +void Module::setMaximumFunctionCount(uint64_t Count) { + addModuleFlag(ModFlagBehavior::Error, "MaxFunctionCount", Count); +} + +Optional<uint64_t> Module::getMaximumFunctionCount() { + auto 
*Val = + cast_or_null<ConstantAsMetadata>(getModuleFlag("MaxFunctionCount")); + if (!Val) + return None; + return cast<ConstantInt>(Val->getValue())->getZExtValue(); +} diff --git a/contrib/llvm/lib/IR/Statepoint.cpp b/contrib/llvm/lib/IR/Statepoint.cpp index 83ee611..27a990e 100644 --- a/contrib/llvm/lib/IR/Statepoint.cpp +++ b/contrib/llvm/lib/IR/Statepoint.cpp @@ -40,20 +40,7 @@ bool llvm::isStatepoint(const Value &inst) { } bool llvm::isGCRelocate(const ImmutableCallSite &CS) { - if (!CS.getInstruction()) { - // This is not a call site - return false; - } - - return isGCRelocate(CS.getInstruction()); -} -bool llvm::isGCRelocate(const Value *inst) { - if (const CallInst *call = dyn_cast<CallInst>(inst)) { - if (const Function *F = call->getCalledFunction()) { - return F->getIntrinsicID() == Intrinsic::experimental_gc_relocate; - } - } - return false; + return CS.getInstruction() && isa<GCRelocateInst>(CS.getInstruction()); } bool llvm::isGCResult(const ImmutableCallSite &CS) { @@ -67,10 +54,7 @@ bool llvm::isGCResult(const ImmutableCallSite &CS) { bool llvm::isGCResult(const Value *inst) { if (const CallInst *call = dyn_cast<CallInst>(inst)) { if (Function *F = call->getCalledFunction()) { - return (F->getIntrinsicID() == Intrinsic::experimental_gc_result_int || - F->getIntrinsicID() == Intrinsic::experimental_gc_result_float || - F->getIntrinsicID() == Intrinsic::experimental_gc_result_ptr || - F->getIntrinsicID() == Intrinsic::experimental_gc_result); + return F->getIntrinsicID() == Intrinsic::experimental_gc_result; } } return false; diff --git a/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h b/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h index a18f982..50573d8 100644 --- a/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h +++ b/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h @@ -24,77 +24,73 @@ namespace llvm { /// setSymTabObject - This is called when (f.e.) the parent of a basic block /// changes. This requires us to remove all the instruction symtab entries from /// the current function and reinsert them into the new function. -template<typename ValueSubClass, typename ItemParentClass> -template<typename TPtr> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::setSymTabObject(TPtr *Dest, TPtr Src) { +template <typename ValueSubClass> +template <typename TPtr> +void SymbolTableListTraits<ValueSubClass>::setSymTabObject(TPtr *Dest, + TPtr Src) { // Get the old symtab and value list before doing the assignment. - ValueSymbolTable *OldST = TraitsClass::getSymTab(getListOwner()); + ValueSymbolTable *OldST = getSymTab(getListOwner()); // Do it. *Dest = Src; // Get the new SymTab object. - ValueSymbolTable *NewST = TraitsClass::getSymTab(getListOwner()); + ValueSymbolTable *NewST = getSymTab(getListOwner()); // If there is nothing to do, quick exit. if (OldST == NewST) return; // Move all the elements from the old symtab to the new one. - iplist<ValueSubClass> &ItemList = TraitsClass::getList(getListOwner()); + ListTy &ItemList = getList(getListOwner()); if (ItemList.empty()) return; if (OldST) { // Remove all entries from the previous symtab. - for (typename iplist<ValueSubClass>::iterator I = ItemList.begin(); - I != ItemList.end(); ++I) + for (auto I = ItemList.begin(); I != ItemList.end(); ++I) if (I->hasName()) OldST->removeValueName(I->getValueName()); } if (NewST) { // Add all of the items to the new symtab. 
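The setSymTabObject logic in this hunk moves every named element from the old owner's symbol table into the new owner's when a list changes parents; the loop that follows re-adds the names to the new table. A self-contained sketch of the overall pattern, with a plain std::map standing in for ValueSymbolTable (all names here are illustrative):

#include <map>
#include <string>
#include <vector>

struct Item { std::string Name; };
using Registry = std::map<std::string, Item *>;

void moveNames(std::vector<Item *> &List, Registry *OldST, Registry *NewST) {
  if (OldST == NewST || List.empty())
    return; // nothing to transfer
  for (Item *I : List) {
    if (I->Name.empty())
      continue; // unnamed items never live in a symbol table
    if (OldST)
      OldST->erase(I->Name);
    if (NewST)
      (*NewST)[I->Name] = I; // the real code also re-uniquifies on collision
  }
}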
- for (typename iplist<ValueSubClass>::iterator I = ItemList.begin(); - I != ItemList.end(); ++I) + for (auto I = ItemList.begin(); I != ItemList.end(); ++I) if (I->hasName()) - NewST->reinsertValue(I); + NewST->reinsertValue(&*I); } } -template<typename ValueSubClass, typename ItemParentClass> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::addNodeToList(ValueSubClass *V) { +template <typename ValueSubClass> +void SymbolTableListTraits<ValueSubClass>::addNodeToList(ValueSubClass *V) { assert(!V->getParent() && "Value already in a container!!"); ItemParentClass *Owner = getListOwner(); V->setParent(Owner); if (V->hasName()) - if (ValueSymbolTable *ST = TraitsClass::getSymTab(Owner)) + if (ValueSymbolTable *ST = getSymTab(Owner)) ST->reinsertValue(V); } -template<typename ValueSubClass, typename ItemParentClass> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::removeNodeFromList(ValueSubClass *V) { +template <typename ValueSubClass> +void SymbolTableListTraits<ValueSubClass>::removeNodeFromList( + ValueSubClass *V) { V->setParent(nullptr); if (V->hasName()) - if (ValueSymbolTable *ST = TraitsClass::getSymTab(getListOwner())) + if (ValueSymbolTable *ST = getSymTab(getListOwner())) ST->removeValueName(V->getValueName()); } -template<typename ValueSubClass, typename ItemParentClass> -void SymbolTableListTraits<ValueSubClass,ItemParentClass> -::transferNodesFromList(ilist_traits<ValueSubClass> &L2, - ilist_iterator<ValueSubClass> first, - ilist_iterator<ValueSubClass> last) { +template <typename ValueSubClass> +void SymbolTableListTraits<ValueSubClass>::transferNodesFromList( + SymbolTableListTraits &L2, ilist_iterator<ValueSubClass> first, + ilist_iterator<ValueSubClass> last) { // We only have to do work here if transferring instructions between BBs ItemParentClass *NewIP = getListOwner(), *OldIP = L2.getListOwner(); if (NewIP == OldIP) return; // No work to do at all... // We only have to update symbol table entries if we are transferring the // instructions to a different symtab object... - ValueSymbolTable *NewST = TraitsClass::getSymTab(NewIP); - ValueSymbolTable *OldST = TraitsClass::getSymTab(OldIP); + ValueSymbolTable *NewST = getSymTab(NewIP); + ValueSymbolTable *OldST = getSymTab(OldIP); if (NewST != OldST) { for (; first != last; ++first) { ValueSubClass &V = *first; diff --git a/contrib/llvm/lib/IR/Type.cpp b/contrib/llvm/lib/IR/Type.cpp index a9ca800..4c1baf5 100644 --- a/contrib/llvm/lib/IR/Type.cpp +++ b/contrib/llvm/lib/IR/Type.cpp @@ -35,6 +35,7 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { case LabelTyID : return getLabelTy(C); case MetadataTyID : return getMetadataTy(C); case X86_MMXTyID : return getX86_MMXTy(C); + case TokenTyID : return getTokenTy(C); default: return nullptr; } @@ -42,16 +43,10 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { /// getScalarType - If this is a vector type, return the element type, /// otherwise return this. -Type *Type::getScalarType() { - if (VectorType *VTy = dyn_cast<VectorType>(this)) +Type *Type::getScalarType() const { + if (auto *VTy = dyn_cast<VectorType>(this)) return VTy->getElementType(); - return this; -} - -const Type *Type::getScalarType() const { - if (const VectorType *VTy = dyn_cast<VectorType>(this)) - return VTy->getElementType(); - return this; + return const_cast<Type*>(this); } /// isIntegerTy - Return true if this is an IntegerType of the specified width. 
@@ -74,8 +69,8 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { // Vector -> Vector conversions are always lossless if the two vector types // have the same size, otherwise not. Also, 64-bit vector types can be // converted to x86mmx. - if (const VectorType *thisPTy = dyn_cast<VectorType>(this)) { - if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty)) + if (auto *thisPTy = dyn_cast<VectorType>(this)) { + if (auto *thatPTy = dyn_cast<VectorType>(Ty)) return thisPTy->getBitWidth() == thatPTy->getBitWidth(); if (Ty->getTypeID() == Type::X86_MMXTyID && thisPTy->getBitWidth() == 64) @@ -83,7 +78,7 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { } if (this->getTypeID() == Type::X86_MMXTyID) - if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty)) + if (auto *thatPTy = dyn_cast<VectorType>(Ty)) if (thatPTy->getBitWidth() == 64) return true; @@ -91,8 +86,8 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { // remaining and ptr->ptr. Just select the lossless conversions. Everything // else is not lossless. Conservatively assume we can't losslessly convert // between pointers with different address spaces. - if (const PointerType *PTy = dyn_cast<PointerType>(this)) { - if (const PointerType *OtherPTy = dyn_cast<PointerType>(Ty)) + if (auto *PTy = dyn_cast<PointerType>(this)) { + if (auto *OtherPTy = dyn_cast<PointerType>(Ty)) return PTy->getAddressSpace() == OtherPTy->getAddressSpace(); return false; } @@ -100,14 +95,12 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { } bool Type::isEmptyTy() const { - const ArrayType *ATy = dyn_cast<ArrayType>(this); - if (ATy) { + if (auto *ATy = dyn_cast<ArrayType>(this)) { unsigned NumElements = ATy->getNumElements(); return NumElements == 0 || ATy->getElementType()->isEmptyTy(); } - const StructType *STy = dyn_cast<StructType>(this); - if (STy) { + if (auto *STy = dyn_cast<StructType>(this)) { unsigned NumElements = STy->getNumElements(); for (unsigned i = 0; i < NumElements; ++i) if (!STy->getElementType(i)->isEmptyTy()) @@ -144,7 +137,7 @@ unsigned Type::getScalarSizeInBits() const { /// is only valid on floating point types. If the FP type does not /// have a stable mantissa (e.g. ppc long double), this method returns -1. int Type::getFPMantissaWidth() const { - if (const VectorType *VTy = dyn_cast<VectorType>(this)) + if (auto *VTy = dyn_cast<VectorType>(this)) return VTy->getElementType()->getFPMantissaWidth(); assert(isFloatingPointTy() && "Not a floating point type!"); if (getTypeID() == HalfTyID) return 11; @@ -159,66 +152,17 @@ int Type::getFPMantissaWidth() const { /// isSizedDerivedType - Derived types like structures and arrays are sized /// iff all of the members of the type are sized as well. Since asking for /// their size is relatively uncommon, move this operation out of line. 
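The isSizedDerivedType hunk that follows threads a Visited set through the query precisely so that self-referential aggregates cannot recurse forever; a type found already in the set is conservatively reported unsized, matching the insert-failure check in StructType::isSized further down. A self-contained sketch of the guard, with a toy type in place of llvm::Type:

#include <set>
#include <vector>

struct Ty {
  bool Opaque = false;
  std::vector<Ty *> Elems; // element types of an aggregate
};

bool isSized(Ty *T, std::set<Ty *> &Visited) {
  if (T->Opaque)
    return false;
  // A type already on the current query path would recurse forever; report
  // it unsized, as the Visited check in the diff does. (The real code also
  // caches a known-sized bit per struct, so a shared but genuinely sized
  // member does not trip this guard; that cache is omitted here.)
  if (!Visited.insert(T).second)
    return false;
  for (Ty *E : T->Elems)
    if (!isSized(E, Visited))
      return false;
  return true;
}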
-bool Type::isSizedDerivedType(SmallPtrSetImpl<const Type*> *Visited) const { - if (const ArrayType *ATy = dyn_cast<ArrayType>(this)) +bool Type::isSizedDerivedType(SmallPtrSetImpl<Type*> *Visited) const { + if (auto *ATy = dyn_cast<ArrayType>(this)) return ATy->getElementType()->isSized(Visited); - if (const VectorType *VTy = dyn_cast<VectorType>(this)) + if (auto *VTy = dyn_cast<VectorType>(this)) return VTy->getElementType()->isSized(Visited); return cast<StructType>(this)->isSized(Visited); } //===----------------------------------------------------------------------===// -// Subclass Helper Methods -//===----------------------------------------------------------------------===// - -unsigned Type::getIntegerBitWidth() const { - return cast<IntegerType>(this)->getBitWidth(); -} - -bool Type::isFunctionVarArg() const { - return cast<FunctionType>(this)->isVarArg(); -} - -Type *Type::getFunctionParamType(unsigned i) const { - return cast<FunctionType>(this)->getParamType(i); -} - -unsigned Type::getFunctionNumParams() const { - return cast<FunctionType>(this)->getNumParams(); -} - -StringRef Type::getStructName() const { - return cast<StructType>(this)->getName(); -} - -unsigned Type::getStructNumElements() const { - return cast<StructType>(this)->getNumElements(); -} - -Type *Type::getStructElementType(unsigned N) const { - return cast<StructType>(this)->getElementType(N); -} - -Type *Type::getSequentialElementType() const { - return cast<SequentialType>(this)->getElementType(); -} - -uint64_t Type::getArrayNumElements() const { - return cast<ArrayType>(this)->getNumElements(); -} - -unsigned Type::getVectorNumElements() const { - return cast<VectorType>(this)->getNumElements(); -} - -unsigned Type::getPointerAddressSpace() const { - return cast<PointerType>(getScalarType())->getAddressSpace(); -} - - -//===----------------------------------------------------------------------===// // Primitive 'Type' data //===----------------------------------------------------------------------===// @@ -228,6 +172,7 @@ Type *Type::getHalfTy(LLVMContext &C) { return &C.pImpl->HalfTy; } Type *Type::getFloatTy(LLVMContext &C) { return &C.pImpl->FloatTy; } Type *Type::getDoubleTy(LLVMContext &C) { return &C.pImpl->DoubleTy; } Type *Type::getMetadataTy(LLVMContext &C) { return &C.pImpl->MetadataTy; } +Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; } Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; } Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; } Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; } @@ -345,7 +290,7 @@ FunctionType::FunctionType(Type *Result, ArrayRef<Type*> Params, assert(isValidReturnType(Result) && "invalid return type for function"); setSubclassData(IsVarArgs); - SubTys[0] = const_cast<Type*>(Result); + SubTys[0] = Result; for (unsigned i = 0, e = Params.size(); i != e; ++i) { assert(isValidArgumentType(Params[i]) && @@ -428,12 +373,14 @@ void StructType::setBody(ArrayRef<Type*> Elements, bool isPacked) { if (isPacked) setSubclassData(getSubclassData() | SCDB_Packed); - unsigned NumElements = Elements.size(); - Type **Elts = getContext().pImpl->TypeAllocator.Allocate<Type*>(NumElements); - memcpy(Elts, Elements.data(), sizeof(Elements[0]) * NumElements); - - ContainedTys = Elts; - NumContainedTys = NumElements; + NumContainedTys = Elements.size(); + + if (Elements.empty()) { + ContainedTys = nullptr; + return; + } + + ContainedTys = Elements.copy(getContext().pImpl->TypeAllocator).data(); } void 
StructType::setName(StringRef Name) { @@ -470,7 +417,6 @@ void StructType::setName(StringRef Name) { do { TempStr.resize(NameSize + 1); - TmpStream.resync(); TmpStream << getContext().pImpl->NamedStructTypesUniqueID++; IterBool = getContext().pImpl->NamedStructTypes.insert( @@ -556,13 +502,13 @@ StructType *StructType::create(StringRef Name, Type *type, ...) { return Ret; } -bool StructType::isSized(SmallPtrSetImpl<const Type*> *Visited) const { +bool StructType::isSized(SmallPtrSetImpl<Type*> *Visited) const { if ((getSubclassData() & SCDB_IsSized) != 0) return true; if (isOpaque()) return false; - if (Visited && !Visited->insert(this).second) + if (Visited && !Visited->insert(const_cast<StructType*>(this)).second) return false; // Okay, our struct is sized if all of the elements are, but if one of the @@ -602,22 +548,19 @@ void StructType::setBody(Type *type, ...) { bool StructType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy(); + !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() && + !ElemTy->isTokenTy(); } /// isLayoutIdentical - Return true if this is layout identical to the /// specified struct. bool StructType::isLayoutIdentical(StructType *Other) const { if (this == Other) return true; - - if (isPacked() != Other->isPacked() || - getNumElements() != Other->getNumElements()) + + if (isPacked() != Other->isPacked()) return false; - if (!getNumElements()) - return true; - - return std::equal(element_begin(), element_end(), Other->element_begin()); + return elements() == Other->elements(); } /// getTypeByName - Return the type with the specified name, or null if there @@ -631,8 +574,8 @@ StructType *Module::getTypeByName(StringRef Name) const { // CompositeType Implementation //===----------------------------------------------------------------------===// -Type *CompositeType::getTypeAtIndex(const Value *V) { - if (StructType *STy = dyn_cast<StructType>(this)) { +Type *CompositeType::getTypeAtIndex(const Value *V) const { + if (auto *STy = dyn_cast<StructType>(this)) { unsigned Idx = (unsigned)cast<Constant>(V)->getUniqueInteger().getZExtValue(); assert(indexValid(Idx) && "Invalid structure index!"); @@ -641,16 +584,18 @@ Type *CompositeType::getTypeAtIndex(const Value *V) { return cast<SequentialType>(this)->getElementType(); } -Type *CompositeType::getTypeAtIndex(unsigned Idx) { - if (StructType *STy = dyn_cast<StructType>(this)) { + +Type *CompositeType::getTypeAtIndex(unsigned Idx) const{ + if (auto *STy = dyn_cast<StructType>(this)) { assert(indexValid(Idx) && "Invalid structure index!"); return STy->getElementType(Idx); } - + return cast<SequentialType>(this)->getElementType(); } + bool CompositeType::indexValid(const Value *V) const { - if (const StructType *STy = dyn_cast<StructType>(this)) { + if (auto *STy = dyn_cast<StructType>(this)) { // Structure indexes require (vectors of) 32-bit integer constants. In the // vector case all of the indices must be equal. if (!V->getType()->getScalarType()->isIntegerTy(32)) @@ -667,7 +612,7 @@ bool CompositeType::indexValid(const Value *V) const { } bool CompositeType::indexValid(unsigned Idx) const { - if (const StructType *STy = dyn_cast<StructType>(this)) + if (auto *STy = dyn_cast<StructType>(this)) return Idx < STy->getNumElements(); // Sequential types can be indexed by any integer. 
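Stepping back to the isLayoutIdentical hunk above: the manual element-count check plus std::equal collapses into a single comparison of elements() because range equality compares lengths before contents. The same semantics can be demonstrated with any standard sequence:

#include <cassert>
#include <vector>

int main() {
  std::vector<int> A{1, 2, 3}, B{1, 2, 3}, C{1, 2};
  assert(A == B);    // same length, same contents
  assert(!(A == C)); // different lengths compare unequal, with no overrun
  return 0;
}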
return true; @@ -683,10 +628,9 @@ ArrayType::ArrayType(Type *ElType, uint64_t NumEl) NumElements = NumEl; } -ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) { - Type *ElementType = const_cast<Type*>(elementType); +ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) { assert(isValidElementType(ElementType) && "Invalid type for array element!"); - + LLVMContextImpl *pImpl = ElementType->getContext().pImpl; ArrayType *&Entry = pImpl->ArrayTypes[std::make_pair(ElementType, NumElements)]; @@ -698,7 +642,8 @@ ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) { bool ArrayType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy(); + !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() && + !ElemTy->isTokenTy(); } //===----------------------------------------------------------------------===// @@ -710,8 +655,7 @@ VectorType::VectorType(Type *ElType, unsigned NumEl) NumElements = NumEl; } -VectorType *VectorType::get(Type *elementType, unsigned NumElements) { - Type *ElementType = const_cast<Type*>(elementType); +VectorType *VectorType::get(Type *ElementType, unsigned NumElements) { assert(NumElements > 0 && "#Elements of a VectorType must be greater than 0"); assert(isValidElementType(ElementType) && "Element type of a VectorType must " "be an integer, floating point, or " @@ -761,13 +705,13 @@ PointerType::PointerType(Type *E, unsigned AddrSpace) assert(oldNCT == NumContainedTys && "bitfield written out of bounds?"); } -PointerType *Type::getPointerTo(unsigned addrs) { - return PointerType::get(this, addrs); +PointerType *Type::getPointerTo(unsigned addrs) const { + return PointerType::get(const_cast<Type*>(this), addrs); } bool PointerType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy(); + !ElemTy->isMetadataTy() && !ElemTy->isTokenTy(); } bool PointerType::isLoadableOrStorableType(Type *ElemTy) { diff --git a/contrib/llvm/lib/IR/TypeFinder.cpp b/contrib/llvm/lib/IR/TypeFinder.cpp index 7accc5b..b5bdab0 100644 --- a/contrib/llvm/lib/IR/TypeFinder.cpp +++ b/contrib/llvm/lib/IR/TypeFinder.cpp @@ -44,19 +44,13 @@ void TypeFinder::run(const Module &M, bool onlyNamed) { for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) { incorporateType(FI->getType()); - if (FI->hasPrefixData()) - incorporateValue(FI->getPrefixData()); - - if (FI->hasPrologueData()) - incorporateValue(FI->getPrologueData()); - - if (FI->hasPersonalityFn()) - incorporateValue(FI->getPersonalityFn()); + for (const Use &U : FI->operands()) + incorporateValue(U.get()); // First incorporate the arguments. 
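ArrayType::get in the hunk above shows the uniquing pattern used throughout the type system: the context owns one canonical object per (element type, count) key, so structural equality degenerates to pointer equality. A self-contained sketch (the registry and names are illustrative, not LLVM's):

#include <cstdint>
#include <map>
#include <utility>

struct ArrTy {
  const void *Elem;
  uint64_t N;
};

using TypeCache = std::map<std::pair<const void *, uint64_t>, ArrTy *>;

ArrTy *getArrayTy(TypeCache &Ctx, const void *Elem, uint64_t N) {
  ArrTy *&Entry = Ctx[{Elem, N}]; // reference into the map, as in the diff
  if (!Entry)
    Entry = new ArrTy{Elem, N};   // the real code allocates from the context
  return Entry;                   // same key always yields the same pointer
}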
for (Function::const_arg_iterator AI = FI->arg_begin(), AE = FI->arg_end(); AI != AE; ++AI) - incorporateValue(AI); + incorporateValue(&*AI); for (Function::const_iterator BB = FI->begin(), E = FI->end(); BB != E;++BB) @@ -85,7 +79,7 @@ void TypeFinder::run(const Module &M, bool onlyNamed) { for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), E = M.named_metadata_end(); I != E; ++I) { - const NamedMDNode *NMD = I; + const NamedMDNode *NMD = &*I; for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) incorporateMDNode(NMD->getOperand(i)); } diff --git a/contrib/llvm/lib/IR/User.cpp b/contrib/llvm/lib/IR/User.cpp index 522722d..a75abe6 100644 --- a/contrib/llvm/lib/IR/User.cpp +++ b/contrib/llvm/lib/IR/User.cpp @@ -87,22 +87,70 @@ void User::growHungoffUses(unsigned NewNumUses, bool IsPhi) { Use::zap(OldOps, OldOps + OldNumUses, true); } + +// This is a private struct used by `User` to track the co-allocated descriptor +// section. +struct DescriptorInfo { + intptr_t SizeInBytes; +}; + +ArrayRef<const uint8_t> User::getDescriptor() const { + auto MutableARef = const_cast<User *>(this)->getDescriptor(); + return {MutableARef.begin(), MutableARef.end()}; +} + +MutableArrayRef<uint8_t> User::getDescriptor() { + assert(HasDescriptor && "Don't call otherwise!"); + assert(!HasHungOffUses && "Invariant!"); + + auto *DI = reinterpret_cast<DescriptorInfo *>(getIntrusiveOperands()) - 1; + assert(DI->SizeInBytes != 0 && "Should not have had a descriptor otherwise!"); + + return MutableArrayRef<uint8_t>( + reinterpret_cast<uint8_t *>(DI) - DI->SizeInBytes, DI->SizeInBytes); +} + //===----------------------------------------------------------------------===// // User operator new Implementations //===----------------------------------------------------------------------===// -void *User::operator new(size_t Size, unsigned Us) { +void *User::allocateFixedOperandUser(size_t Size, unsigned Us, + unsigned DescBytes) { assert(Us < (1u << NumUserOperandsBits) && "Too many operands"); - void *Storage = ::operator new(Size + sizeof(Use) * Us); - Use *Start = static_cast<Use*>(Storage); + + static_assert(sizeof(DescriptorInfo) % sizeof(void *) == 0, "Required below"); + + unsigned DescBytesToAllocate = + DescBytes == 0 ? 
0 : (DescBytes + sizeof(DescriptorInfo)); + assert(DescBytesToAllocate % sizeof(void *) == 0 && + "We need this to satisfy alignment constraints for Uses"); + + uint8_t *Storage = static_cast<uint8_t *>( + ::operator new(Size + sizeof(Use) * Us + DescBytesToAllocate)); + Use *Start = reinterpret_cast<Use *>(Storage + DescBytesToAllocate); Use *End = Start + Us; User *Obj = reinterpret_cast<User*>(End); Obj->NumUserOperands = Us; Obj->HasHungOffUses = false; + Obj->HasDescriptor = DescBytes != 0; Use::initTags(Start, End); + + if (DescBytes != 0) { + auto *DescInfo = reinterpret_cast<DescriptorInfo *>(Storage + DescBytes); + DescInfo->SizeInBytes = DescBytes; + } + return Obj; } +void *User::operator new(size_t Size, unsigned Us) { + return allocateFixedOperandUser(Size, Us, 0); +} + +void *User::operator new(size_t Size, unsigned Us, unsigned DescBytes) { + return allocateFixedOperandUser(Size, Us, DescBytes); +} + void *User::operator new(size_t Size) { // Allocate space for a single Use* void *Storage = ::operator new(Size + sizeof(Use *)); @@ -110,6 +158,7 @@ void *User::operator new(size_t Size) { User *Obj = reinterpret_cast<User *>(HungOffOperandList + 1); Obj->NumUserOperands = 0; Obj->HasHungOffUses = true; + Obj->HasDescriptor = false; *HungOffOperandList = nullptr; return Obj; } @@ -123,11 +172,20 @@ void User::operator delete(void *Usr) { // use a Use[] allocated prior to the user. User *Obj = static_cast<User *>(Usr); if (Obj->HasHungOffUses) { + assert(!Obj->HasDescriptor && "not supported!"); + Use **HungOffOperandList = static_cast<Use **>(Usr) - 1; // drop the hung off uses. Use::zap(*HungOffOperandList, *HungOffOperandList + Obj->NumUserOperands, /* Delete */ true); ::operator delete(HungOffOperandList); + } else if (Obj->HasDescriptor) { + Use *UseBegin = static_cast<Use *>(Usr) - Obj->NumUserOperands; + Use::zap(UseBegin, UseBegin + Obj->NumUserOperands, /* Delete */ false); + + auto *DI = reinterpret_cast<DescriptorInfo *>(UseBegin) - 1; + uint8_t *Storage = reinterpret_cast<uint8_t *>(DI) - DI->SizeInBytes; + ::operator delete(Storage); } else { Use *Storage = static_cast<Use *>(Usr) - Obj->NumUserOperands; Use::zap(Storage, Storage + Obj->NumUserOperands, diff --git a/contrib/llvm/lib/IR/Value.cpp b/contrib/llvm/lib/IR/Value.cpp index f554d59..eb9deb6 100644 --- a/contrib/llvm/lib/IR/Value.cpp +++ b/contrib/llvm/lib/IR/Value.cpp @@ -314,6 +314,16 @@ void Value::takeName(Value *V) { } #ifndef NDEBUG +void Value::assertModuleIsMaterialized() const { + const GlobalValue *GV = dyn_cast<GlobalValue>(this); + if (!GV) + return; + const Module *M = GV->getParent(); + if (!M) + return; + assert(M->isMaterialized()); +} + static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr, Constant *C) { if (!Cache.insert(Expr).second) @@ -490,8 +500,7 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, return V; Offset = GEPOffset; V = GEP->getPointerOperand(); - } else if (Operator::getOpcode(V) == Instruction::BitCast || - Operator::getOpcode(V) == Instruction::AddrSpaceCast) { + } else if (Operator::getOpcode(V) == Instruction::BitCast) { V = cast<Operator>(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { V = GA->getAliasee(); diff --git a/contrib/llvm/lib/IR/ValueSymbolTable.cpp b/contrib/llvm/lib/IR/ValueSymbolTable.cpp index e10142d..deb6e75 100644 --- a/contrib/llvm/lib/IR/ValueSymbolTable.cpp +++ b/contrib/llvm/lib/IR/ValueSymbolTable.cpp @@ -32,6 +32,24 @@ ValueSymbolTable::~ValueSymbolTable() { 
#endif } +ValueName *ValueSymbolTable::makeUniqueName(Value *V, + SmallString<256> &UniqueName) { + unsigned BaseSize = UniqueName.size(); + while (1) { + // Trim any suffix off and append the next number. + UniqueName.resize(BaseSize); + raw_svector_ostream S(UniqueName); + if (isa<GlobalValue>(V)) + S << "."; + S << ++LastUnique; + + // Try insert the vmap entry with this suffix. + auto IterBool = vmap.insert(std::make_pair(UniqueName, V)); + if (IterBool.second) + return &*IterBool.first; + } +} + // Insert a value into the symbol table with the specified name... // void ValueSymbolTable::reinsertValue(Value* V) { @@ -49,21 +67,8 @@ void ValueSymbolTable::reinsertValue(Value* V) { // The name is already used, just free it so we can allocate a new name. V->getValueName()->Destroy(); - unsigned BaseSize = UniqueName.size(); - while (1) { - // Trim any suffix off and append the next number. - UniqueName.resize(BaseSize); - raw_svector_ostream(UniqueName) << "." << ++LastUnique; - - // Try insert the vmap entry with this suffix. 
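makeUniqueName above factors out the retry loop that both callers previously duplicated: trim back to the base name, append the next counter value, and keep going until the map insert succeeds. A self-contained sketch with a std::map standing in for the vmap (names here are illustrative; the diff additionally adds the "." separator only for GlobalValues, which the sketch does unconditionally):

#include <map>
#include <string>

std::string makeUnique(std::map<std::string, int> &Table,
                       const std::string &Base, int &LastUnique, int V) {
  std::string Name;
  do {
    // Trim any previous suffix and try the next number.
    Name = Base + "." + std::to_string(++LastUnique);
  } while (!Table.insert({Name, V}).second);
  return Name; // first suffix that did not collide
}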
- auto IterBool = vmap.insert(std::make_pair(UniqueName, V)); - if (IterBool.second) { - // DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << - // "\n"); - return &*IterBool.first; - } - } + return makeUniqueName(V, UniqueName); } diff --git a/contrib/llvm/lib/IR/ValueTypes.cpp b/contrib/llvm/lib/IR/ValueTypes.cpp index d95de39..f293230 100644 --- a/contrib/llvm/lib/IR/ValueTypes.cpp +++ b/contrib/llvm/lib/IR/ValueTypes.cpp @@ -19,6 +19,11 @@ #include "llvm/Support/ErrorHandling.h" using namespace llvm; +EVT EVT::changeExtendedTypeToInteger() const { + LLVMContext &Context = LLVMTy->getContext(); + return getIntegerVT(Context, getSizeInBits()); +} + EVT EVT::changeExtendedVectorElementTypeToInteger() const { LLVMContext &Context = LLVMTy->getContext(); EVT IntTy = getIntegerVT(Context, getVectorElementType().getSizeInBits()); @@ -83,6 +88,10 @@ bool EVT::isExtended1024BitVector() const { return isExtendedVector() && getExtendedSizeInBits() == 1024; } +bool EVT::isExtended2048BitVector() const { + return isExtendedVector() && getExtendedSizeInBits() == 2048; +} + EVT EVT::getExtendedVectorElementType() const { assert(isExtended() && "Type is not extended!"); return EVT::getEVT(cast<VectorType>(LLVMTy)->getElementType()); @@ -134,6 +143,8 @@ std::string EVT::getEVTString() const { case MVT::v16i1: return "v16i1"; case MVT::v32i1: return "v32i1"; case MVT::v64i1: return "v64i1"; + case MVT::v512i1: return "v512i1"; + case MVT::v1024i1: return "v1024i1"; case MVT::v1i8: return "v1i8"; case MVT::v2i8: return "v2i8"; case MVT::v4i8: return "v4i8"; @@ -141,22 +152,29 @@ std::string EVT::getEVTString() const { case MVT::v16i8: return "v16i8"; case MVT::v32i8: return "v32i8"; case MVT::v64i8: return "v64i8"; + case MVT::v128i8: return "v128i8"; + case MVT::v256i8: return "v256i8"; case MVT::v1i16: return "v1i16"; case MVT::v2i16: return "v2i16"; case MVT::v4i16: return "v4i16"; case MVT::v8i16: return "v8i16"; case MVT::v16i16: return "v16i16"; case MVT::v32i16: return "v32i16"; + case MVT::v64i16: return "v64i16"; + case MVT::v128i16: return "v128i16"; case MVT::v1i32: return "v1i32"; case MVT::v2i32: return "v2i32"; case MVT::v4i32: return "v4i32"; case MVT::v8i32: return "v8i32"; case MVT::v16i32: return "v16i32"; + case MVT::v32i32: return "v32i32"; + case MVT::v64i32: return "v64i32"; case MVT::v1i64: return "v1i64"; case MVT::v2i64: return "v2i64"; case MVT::v4i64: return "v4i64"; case MVT::v8i64: return "v8i64"; case MVT::v16i64: return "v16i64"; + case MVT::v32i64: return "v32i64"; case MVT::v1i128: return "v1i128"; case MVT::v1f32: return "v1f32"; case MVT::v2f32: return "v2f32"; @@ -203,6 +221,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v16i1: return VectorType::get(Type::getInt1Ty(Context), 16); case MVT::v32i1: return VectorType::get(Type::getInt1Ty(Context), 32); case MVT::v64i1: return VectorType::get(Type::getInt1Ty(Context), 64); + case MVT::v512i1: return VectorType::get(Type::getInt1Ty(Context), 512); + case MVT::v1024i1: return VectorType::get(Type::getInt1Ty(Context), 1024); case MVT::v1i8: return VectorType::get(Type::getInt8Ty(Context), 1); case MVT::v2i8: return VectorType::get(Type::getInt8Ty(Context), 2); case MVT::v4i8: return VectorType::get(Type::getInt8Ty(Context), 4); @@ -210,22 +230,29 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::v16i8: return VectorType::get(Type::getInt8Ty(Context), 16); case MVT::v32i8: return VectorType::get(Type::getInt8Ty(Context), 32); case MVT::v64i8: return 
VectorType::get(Type::getInt8Ty(Context), 64); + case MVT::v128i8: return VectorType::get(Type::getInt8Ty(Context), 128); + case MVT::v256i8: return VectorType::get(Type::getInt8Ty(Context), 256); case MVT::v1i16: return VectorType::get(Type::getInt16Ty(Context), 1); case MVT::v2i16: return VectorType::get(Type::getInt16Ty(Context), 2); case MVT::v4i16: return VectorType::get(Type::getInt16Ty(Context), 4); case MVT::v8i16: return VectorType::get(Type::getInt16Ty(Context), 8); case MVT::v16i16: return VectorType::get(Type::getInt16Ty(Context), 16); case MVT::v32i16: return VectorType::get(Type::getInt16Ty(Context), 32); + case MVT::v64i16: return VectorType::get(Type::getInt16Ty(Context), 64); + case MVT::v128i16: return VectorType::get(Type::getInt16Ty(Context), 128); case MVT::v1i32: return VectorType::get(Type::getInt32Ty(Context), 1); case MVT::v2i32: return VectorType::get(Type::getInt32Ty(Context), 2); case MVT::v4i32: return VectorType::get(Type::getInt32Ty(Context), 4); case MVT::v8i32: return VectorType::get(Type::getInt32Ty(Context), 8); case MVT::v16i32: return VectorType::get(Type::getInt32Ty(Context), 16); + case MVT::v32i32: return VectorType::get(Type::getInt32Ty(Context), 32); + case MVT::v64i32: return VectorType::get(Type::getInt32Ty(Context), 64); case MVT::v1i64: return VectorType::get(Type::getInt64Ty(Context), 1); case MVT::v2i64: return VectorType::get(Type::getInt64Ty(Context), 2); case MVT::v4i64: return VectorType::get(Type::getInt64Ty(Context), 4); case MVT::v8i64: return VectorType::get(Type::getInt64Ty(Context), 8); case MVT::v16i64: return VectorType::get(Type::getInt64Ty(Context), 16); + case MVT::v32i64: return VectorType::get(Type::getInt64Ty(Context), 32); case MVT::v1i128: return VectorType::get(Type::getInt128Ty(Context), 1); case MVT::v2f16: return VectorType::get(Type::getHalfTy(Context), 2); case MVT::v4f16: return VectorType::get(Type::getHalfTy(Context), 4); diff --git a/contrib/llvm/lib/IR/Verifier.cpp b/contrib/llvm/lib/IR/Verifier.cpp index 2a0a4ff..9198b0e 100644 --- a/contrib/llvm/lib/IR/Verifier.cpp +++ b/contrib/llvm/lib/IR/Verifier.cpp @@ -39,13 +39,13 @@ // only by the unwind edge of an invoke instruction. // * A landingpad instruction must be the first non-PHI instruction in the // block. -// * All landingpad instructions must use the same personality function with -// the same function. +// * Landingpad instructions must be in a function with a personality function. // * All other things that are tested by asserts spread about the code... // //===----------------------------------------------------------------------===// #include "llvm/IR/Verifier.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -92,6 +92,16 @@ struct VerifierSupport { : OS(OS), M(nullptr), Broken(false) {} private: + template <class NodeTy> void Write(const ilist_iterator<NodeTy> &I) { + Write(&*I); + } + + void Write(const Module *M) { + if (!M) + return; + OS << "; ModuleID = '" << M->getModuleIdentifier() << "'\n"; + } + void Write(const Value *V) { if (!V) return; @@ -136,6 +146,11 @@ private: OS << *C; } + template <typename T> void Write(ArrayRef<T> Vs) { + for (const T &V : Vs) + Write(V); + } + template <typename T1, typename... Ts> void WriteTs(const T1 &V1, const Ts &... Vs) { Write(V1); @@ -184,6 +199,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport { /// \brief Track unresolved string-based type references. 
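The WriteTs overloads added to VerifierSupport above use the standard C++11 pack-recursion pattern: the variadic overload peels off the first argument and recurses on the remainder, and an empty overload terminates the chain. A self-contained sketch:

#include <iostream>

void writeAll() {} // base case: nothing left to print

template <typename T1, typename... Ts>
void writeAll(const T1 &V1, const Ts &... Vs) {
  std::cout << V1 << '\n'; // handle the head of the pack...
  writeAll(Vs...);         // ...then recurse on the tail
}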
SmallDenseMap<const MDString *, const MDNode *, 32> UnresolvedTypeRefs; + /// \brief The result type for a landingpad. + Type *LandingPadResultTy; + /// \brief Whether we've seen a call to @llvm.localescape in this function /// already. bool SawFrameEscape; @@ -192,9 +210,19 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport { /// given function and the largest index passed to llvm.localrecover. DenseMap<Function *, std::pair<unsigned, unsigned>> FrameEscapeInfo; + // Maps catchswitches and cleanuppads that unwind to siblings to the + // terminators that indicate the unwind, used to detect cycles therein. + MapVector<Instruction *, TerminatorInst *> SiblingFuncletInfo; + + /// Cache of constants visited in search of ConstantExprs. + SmallPtrSet<const Constant *, 32> ConstantExprVisited; + + void checkAtomicMemAccessSize(const Module *M, Type *Ty, + const Instruction *I); public: explicit Verifier(raw_ostream &OS) - : VerifierSupport(OS), Context(nullptr), SawFrameEscape(false) {} + : VerifierSupport(OS), Context(nullptr), LandingPadResultTy(nullptr), + SawFrameEscape(false) {} bool verify(const Function &F) { M = F.getParent(); @@ -227,8 +255,11 @@ public: Broken = false; // FIXME: We strip const here because the inst visitor strips const. visit(const_cast<Function &>(F)); + verifySiblingFuncletUnwinds(); InstsInThisBlock.clear(); + LandingPadResultTy = nullptr; SawFrameEscape = false; + SiblingFuncletInfo.clear(); return !Broken; } @@ -297,12 +328,12 @@ private: void visitFunction(const Function &F); void visitBasicBlock(BasicBlock &BB); void visitRangeMetadata(Instruction& I, MDNode* Range, Type* Ty); + void visitDereferenceableMetadata(Instruction& I, MDNode* MD); template <class Ty> bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); #include "llvm/IR/Metadata.def" void visitDIScope(const DIScope &N); - void visitDIDerivedTypeBase(const DIDerivedTypeBase &N); void visitDIVariable(const DIVariable &N); void visitDILexicalBlockBase(const DILexicalBlockBase &N); void visitDITemplateParameter(const DITemplateParameter &N); @@ -379,7 +410,14 @@ private: void visitAllocaInst(AllocaInst &AI); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitEHPadPredecessors(Instruction &I); void visitLandingPadInst(LandingPadInst &LPI); + void visitCatchPadInst(CatchPadInst &CPI); + void visitCatchReturnInst(CatchReturnInst &CatchReturn); + void visitCleanupPadInst(CleanupPadInst &CPI); + void visitFuncletPadInst(FuncletPadInst &FPI); + void visitCatchSwitchInst(CatchSwitchInst &CatchSwitch); + void visitCleanupReturnInst(CleanupReturnInst &CRI); void VerifyCallSite(CallSite CS); void verifyMustTailCall(CallInst &CI); @@ -399,9 +437,11 @@ private: void VerifyFunctionMetadata( const SmallVector<std::pair<unsigned, MDNode *>, 4> MDs); - void VerifyConstantExprBitcastType(const ConstantExpr *CE); + void visitConstantExprsRecursively(const Constant *EntryC); + void visitConstantExpr(const ConstantExpr *CE); void VerifyStatepoint(ImmutableCallSite CS); void verifyFrameRecoverIndices(); + void verifySiblingFuncletUnwinds(); // Module-level debug info verification... 
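SiblingFuncletInfo above is a MapVector, presumably chosen because it iterates in insertion order, keeping the verifier's walk and diagnostics deterministic where a pointer-keyed DenseMap would not be. The underlying idea pairs a map for lookup with a vector that remembers insertion order; a minimal sketch (illustrative, not LLVM's MapVector):

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

template <typename K, typename V> struct OrderedMap {
  std::map<K, std::size_t> Index;    // key -> position in Data
  std::vector<std::pair<K, V>> Data; // preserves insertion order

  V &operator[](const K &Key) {
    auto It = Index.find(Key);
    if (It != Index.end())
      return Data[It->second].second;
    Index.emplace(Key, Data.size());
    Data.emplace_back(Key, V());
    return Data.back().second;
  }
};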
void verifyTypeRefs(); @@ -524,25 +564,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { } // Walk any aggregate initializers looking for bitcasts between address spaces - SmallPtrSet<const Value *, 4> Visited; - SmallVector<const Value *, 4> WorkStack; - WorkStack.push_back(cast<Value>(GV.getInitializer())); - - while (!WorkStack.empty()) { - const Value *V = WorkStack.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - if (const User *U = dyn_cast<User>(V)) { - WorkStack.append(U->op_begin(), U->op_end()); - } - - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { - VerifyConstantExprBitcastType(CE); - if (Broken) - return; - } - } + visitConstantExprsRecursively(GV.getInitializer()); visitGlobalValue(GV); } @@ -556,7 +578,8 @@ void Verifier::visitAliaseeSubExpr(const GlobalAlias &GA, const Constant &C) { void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias*> &Visited, const GlobalAlias &GA, const Constant &C) { if (const auto *GV = dyn_cast<GlobalValue>(&C)) { - Assert(!GV->isDeclaration(), "Alias must point to a definition", &GA); + Assert(!GV->isDeclarationForLinker(), "Alias must point to a definition", + &GA); if (const auto *GA2 = dyn_cast<GlobalAlias>(GV)) { Assert(Visited.insert(GA2).second, "Aliases cannot form a cycle", &GA); @@ -571,7 +594,7 @@ void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias*> &Visited, } if (const auto *CE = dyn_cast<ConstantExpr>(&C)) - VerifyConstantExprBitcastType(CE); + visitConstantExprsRecursively(CE); for (const Use &U : C.operands()) { Value *V = &*U; @@ -779,39 +802,10 @@ void Verifier::visitDIBasicType(const DIBasicType &N) { "invalid tag", &N); } -void Verifier::visitDIDerivedTypeBase(const DIDerivedTypeBase &N) { +void Verifier::visitDIDerivedType(const DIDerivedType &N) { // Common scope checks. visitDIScope(N); - Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope()); - Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N, - N.getBaseType()); - - // FIXME: Sink this into the subclass verifies. - if (!N.getFile() || N.getFile()->getFilename().empty()) { - // Check whether the filename is allowed to be empty. - uint16_t Tag = N.getTag(); - Assert( - Tag == dwarf::DW_TAG_const_type || Tag == dwarf::DW_TAG_volatile_type || - Tag == dwarf::DW_TAG_pointer_type || - Tag == dwarf::DW_TAG_ptr_to_member_type || - Tag == dwarf::DW_TAG_reference_type || - Tag == dwarf::DW_TAG_rvalue_reference_type || - Tag == dwarf::DW_TAG_restrict_type || - Tag == dwarf::DW_TAG_array_type || - Tag == dwarf::DW_TAG_enumeration_type || - Tag == dwarf::DW_TAG_subroutine_type || - Tag == dwarf::DW_TAG_inheritance || Tag == dwarf::DW_TAG_friend || - Tag == dwarf::DW_TAG_structure_type || - Tag == dwarf::DW_TAG_member || Tag == dwarf::DW_TAG_typedef, - "derived/composite type requires a filename", &N, N.getFile()); - } -} - -void Verifier::visitDIDerivedType(const DIDerivedType &N) { - // Common derived type checks. 
- visitDIDerivedTypeBase(N); - Assert(N.getTag() == dwarf::DW_TAG_typedef || N.getTag() == dwarf::DW_TAG_pointer_type || N.getTag() == dwarf::DW_TAG_ptr_to_member_type || @@ -828,6 +822,10 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) { Assert(isTypeRef(N, N.getExtraData()), "invalid pointer to member type", &N, N.getExtraData()); } + + Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope()); + Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N, + N.getBaseType()); } static bool hasConflictingReferenceFlags(unsigned Flags) { @@ -845,27 +843,34 @@ void Verifier::visitTemplateParams(const MDNode &N, const Metadata &RawParams) { } void Verifier::visitDICompositeType(const DICompositeType &N) { - // Common derived type checks. - visitDIDerivedTypeBase(N); + // Common scope checks. + visitDIScope(N); Assert(N.getTag() == dwarf::DW_TAG_array_type || N.getTag() == dwarf::DW_TAG_structure_type || N.getTag() == dwarf::DW_TAG_union_type || N.getTag() == dwarf::DW_TAG_enumeration_type || - N.getTag() == dwarf::DW_TAG_subroutine_type || N.getTag() == dwarf::DW_TAG_class_type, "invalid tag", &N); + Assert(isScopeRef(N, N.getScope()), "invalid scope", &N, N.getScope()); + Assert(isTypeRef(N, N.getBaseType()), "invalid base type", &N, + N.getBaseType()); + Assert(!N.getRawElements() || isa<MDTuple>(N.getRawElements()), "invalid composite elements", &N, N.getRawElements()); Assert(isTypeRef(N, N.getRawVTableHolder()), "invalid vtable holder", &N, N.getRawVTableHolder()); - Assert(!N.getRawElements() || isa<MDTuple>(N.getRawElements()), - "invalid composite elements", &N, N.getRawElements()); Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags", &N); if (auto *Params = N.getRawTemplateParams()) visitTemplateParams(N, *Params); + + if (N.getTag() == dwarf::DW_TAG_class_type || + N.getTag() == dwarf::DW_TAG_union_type) { + Assert(N.getFile() && !N.getFile()->getFilename().empty(), + "class/union requires a filename", &N, N.getFile()); + } } void Verifier::visitDISubroutineType(const DISubroutineType &N) { @@ -885,6 +890,7 @@ void Verifier::visitDIFile(const DIFile &N) { } void Verifier::visitDICompileUnit(const DICompileUnit &N) { + Assert(N.isDistinct(), "compile units must be distinct", &N); Assert(N.getTag() == dwarf::DW_TAG_compile_unit, "invalid tag", &N); // Don't bother verifying the compilation directory or producer string @@ -928,6 +934,12 @@ void Verifier::visitDICompileUnit(const DICompileUnit &N) { Op); } } + if (auto *Array = N.getRawMacros()) { + Assert(isa<MDTuple>(Array), "invalid macro list", &N, Array); + for (Metadata *Op : N.getMacros()->operands()) { + Assert(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op); + } + } } void Verifier::visitDISubprogram(const DISubprogram &N) { @@ -937,13 +949,6 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { Assert(isa<DISubroutineType>(T), "invalid subroutine type", &N, T); Assert(isTypeRef(N, N.getRawContainingType()), "invalid containing type", &N, N.getRawContainingType()); - if (auto *RawF = N.getRawFunction()) { - auto *FMD = dyn_cast<ConstantAsMetadata>(RawF); - auto *F = FMD ? FMD->getValue() : nullptr; - auto *FT = F ? 
dyn_cast<PointerType>(F->getType()) : nullptr; - Assert(F && FT && isa<FunctionType>(FT->getElementType()), - "invalid function", &N, F, FT); - } if (auto *Params = N.getRawTemplateParams()) visitTemplateParams(N, *Params); if (auto *S = N.getRawDeclaration()) { @@ -961,40 +966,8 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { Assert(!hasConflictingReferenceFlags(N.getFlags()), "invalid reference flags", &N); - auto *F = N.getFunction(); - if (!F) - return; - - // Check that all !dbg attachments lead to back to N (or, at least, another - // subprogram that describes the same function). - // - // FIXME: Check this incrementally while visiting !dbg attachments. - // FIXME: Only check when N is the canonical subprogram for F. - SmallPtrSet<const MDNode *, 32> Seen; - for (auto &BB : *F) - for (auto &I : BB) { - // Be careful about using DILocation here since we might be dealing with - // broken code (this is the Verifier after all). - DILocation *DL = - dyn_cast_or_null<DILocation>(I.getDebugLoc().getAsMDNode()); - if (!DL) - continue; - if (!Seen.insert(DL).second) - continue; - - DILocalScope *Scope = DL->getInlinedAtScope(); - if (Scope && !Seen.insert(Scope).second) - continue; - - DISubprogram *SP = Scope ? Scope->getSubprogram() : nullptr; - if (SP && !Seen.insert(SP).second) - continue; - - // FIXME: Once N is canonical, check "SP == &N". - Assert(SP->describes(F), - "!dbg attachment points at wrong subprogram for function", &N, F, - &I, DL, Scope, SP); - } + if (N.isDefinition()) + Assert(N.isDistinct(), "subprogram definitions must be distinct", &N); } void Verifier::visitDILexicalBlockBase(const DILexicalBlockBase &N) { @@ -1020,6 +993,30 @@ void Verifier::visitDINamespace(const DINamespace &N) { Assert(isa<DIScope>(S), "invalid scope ref", &N, S); } +void Verifier::visitDIMacro(const DIMacro &N) { + Assert(N.getMacinfoType() == dwarf::DW_MACINFO_define || + N.getMacinfoType() == dwarf::DW_MACINFO_undef, + "invalid macinfo type", &N); + Assert(!N.getName().empty(), "anonymous macro", &N); + if (!N.getValue().empty()) { + assert(N.getValue().data()[0] != ' ' && "Macro value has a space prefix"); + } +} + +void Verifier::visitDIMacroFile(const DIMacroFile &N) { + Assert(N.getMacinfoType() == dwarf::DW_MACINFO_start_file, + "invalid macinfo type", &N); + if (auto *F = N.getRawFile()) + Assert(isa<DIFile>(F), "invalid file", &N, F); + + if (auto *Array = N.getRawElements()) { + Assert(isa<MDTuple>(Array), "invalid macro list", &N, Array); + for (Metadata *Op : N.getElements()->operands()) { + Assert(Op && isa<DIMacroNode>(Op), "invalid macro ref", &N, Op); + } + } +} + void Verifier::visitDIModule(const DIModule &N) { Assert(N.getTag() == dwarf::DW_TAG_module, "invalid tag", &N); Assert(!N.getName().empty(), "anonymous module", &N); @@ -1075,9 +1072,7 @@ void Verifier::visitDILocalVariable(const DILocalVariable &N) { // Checks common to all variables. 
visitDIVariable(N); - Assert(N.getTag() == dwarf::DW_TAG_auto_variable || - N.getTag() == dwarf::DW_TAG_arg_variable, - "invalid tag", &N); + Assert(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N); Assert(N.getRawScope() && isa<DILocalScope>(N.getRawScope()), "local variable requires a valid scope", &N, N.getRawScope()); } @@ -1274,7 +1269,10 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx, I->getKindAsEnum() == Attribute::OptimizeNone || I->getKindAsEnum() == Attribute::JumpTable || I->getKindAsEnum() == Attribute::Convergent || - I->getKindAsEnum() == Attribute::ArgMemOnly) { + I->getKindAsEnum() == Attribute::ArgMemOnly || + I->getKindAsEnum() == Attribute::NoRecurse || + I->getKindAsEnum() == Attribute::InaccessibleMemOnly || + I->getKindAsEnum() == Attribute::InaccessibleMemOrArgMemOnly) { if (!isFunction) { CheckFailed("Attribute '" + I->getAsString() + "' only applies to functions!", V); @@ -1365,7 +1363,7 @@ void Verifier::VerifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty, V); if (PointerType *PTy = dyn_cast<PointerType>(Ty)) { - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; if (!PTy->getElementType()->isSized(&Visited)) { Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) && !Attrs.hasAttribute(Idx, Attribute::InAlloca), @@ -1445,6 +1443,18 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs, "Attributes 'readnone and readonly' are incompatible!", V); Assert( + !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::InaccessibleMemOrArgMemOnly)), + "Attributes 'readnone and inaccessiblemem_or_argmemonly' are incompatible!", V); + + Assert( + !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::InaccessibleMemOnly)), + "Attributes 'readnone and inaccessiblememonly' are incompatible!", V); + + Assert( !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline) && Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::AlwaysInline)), @@ -1501,7 +1511,35 @@ void Verifier::VerifyFunctionMetadata( } } -void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) { +void Verifier::visitConstantExprsRecursively(const Constant *EntryC) { + if (!ConstantExprVisited.insert(EntryC).second) + return; + + SmallVector<const Constant *, 16> Stack; + Stack.push_back(EntryC); + + while (!Stack.empty()) { + const Constant *C = Stack.pop_back_val(); + + // Check this constant expression. + if (const auto *CE = dyn_cast<ConstantExpr>(C)) + visitConstantExpr(CE); + + // Visit all sub-expressions. + for (const Use &U : C->operands()) { + const auto *OpC = dyn_cast<Constant>(U); + if (!OpC) + continue; + if (isa<GlobalValue>(OpC)) + continue; // Global values get visited separately. 
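The traversal in progress here, visitConstantExprsRecursively, replaces the per-initializer recursive walk with one explicit stack plus a verifier-wide visited set, so a constant expression shared by many initializers is checked once rather than once per path through the DAG. The generic shape of the pattern, self-contained:

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Ops;
};

void visitAll(Node *Entry, std::set<Node *> &Visited) {
  if (!Visited.insert(Entry).second)
    return; // already fully processed by an earlier call
  std::vector<Node *> Stack{Entry};
  while (!Stack.empty()) {
    Node *N = Stack.back();
    Stack.pop_back();
    // ... check N here ...
    for (Node *Op : N->Ops)
      if (Visited.insert(Op).second) // push each node at most once
        Stack.push_back(Op);
  }
}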
+ if (!ConstantExprVisited.insert(OpC).second) + continue; + Stack.push_back(OpC); + } + } +} + +void Verifier::visitConstantExpr(const ConstantExpr *CE) { if (CE->getOpcode() != Instruction::BitCast) return; @@ -1554,17 +1592,11 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) { &CI); const Value *Target = CS.getArgument(2); - const PointerType *PT = dyn_cast<PointerType>(Target->getType()); + auto *PT = dyn_cast<PointerType>(Target->getType()); Assert(PT && PT->getElementType()->isFunctionTy(), "gc.statepoint callee must be of function pointer type", &CI, Target); FunctionType *TargetFuncType = cast<FunctionType>(PT->getElementType()); - if (NumPatchBytes) - Assert(isa<ConstantPointerNull>(Target->stripPointerCasts()), - "gc.statepoint must have null as call target if number of patchable " - "bytes is non zero", - &CI); - const Value *NumCallArgsV = CS.getArgument(3); Assert(isa<ConstantInt>(NumCallArgsV), "gc.statepoint number of arguments to underlying call " @@ -1642,14 +1674,14 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) { const CallInst *Call = dyn_cast<const CallInst>(U); Assert(Call, "illegal use of statepoint token", &CI, U); if (!Call) continue; - Assert(isGCRelocate(Call) || isGCResult(Call), + Assert(isa<GCRelocateInst>(Call) || isGCResult(Call), "gc.result or gc.relocate are the only value uses " "of a gc.statepoint", &CI, U); if (isGCResult(Call)) { Assert(Call->getArgOperand(0) == &CI, "gc.result connected to wrong gc.statepoint", &CI, Call); - } else if (isGCRelocate(Call)) { + } else if (isa<GCRelocateInst>(Call)) { Assert(Call->getArgOperand(0) == &CI, "gc.relocate connected to wrong gc.statepoint", &CI, Call); } @@ -1678,6 +1710,59 @@ void Verifier::verifyFrameRecoverIndices() { } } +static Instruction *getSuccPad(TerminatorInst *Terminator) { + BasicBlock *UnwindDest; + if (auto *II = dyn_cast<InvokeInst>(Terminator)) + UnwindDest = II->getUnwindDest(); + else if (auto *CSI = dyn_cast<CatchSwitchInst>(Terminator)) + UnwindDest = CSI->getUnwindDest(); + else + UnwindDest = cast<CleanupReturnInst>(Terminator)->getUnwindDest(); + return UnwindDest->getFirstNonPHI(); +} + +void Verifier::verifySiblingFuncletUnwinds() { + SmallPtrSet<Instruction *, 8> Visited; + SmallPtrSet<Instruction *, 8> Active; + for (const auto &Pair : SiblingFuncletInfo) { + Instruction *PredPad = Pair.first; + if (Visited.count(PredPad)) + continue; + Active.insert(PredPad); + TerminatorInst *Terminator = Pair.second; + do { + Instruction *SuccPad = getSuccPad(Terminator); + if (Active.count(SuccPad)) { + // Found a cycle; report error + Instruction *CyclePad = SuccPad; + SmallVector<Instruction *, 8> CycleNodes; + do { + CycleNodes.push_back(CyclePad); + TerminatorInst *CycleTerminator = SiblingFuncletInfo[CyclePad]; + if (CycleTerminator != CyclePad) + CycleNodes.push_back(CycleTerminator); + CyclePad = getSuccPad(CycleTerminator); + } while (CyclePad != SuccPad); + Assert(false, "EH pads can't handle each other's exceptions", + ArrayRef<Instruction *>(CycleNodes)); + } + // Don't re-walk a node we've already checked + if (!Visited.insert(SuccPad).second) + break; + // Walk to this successor if it has a map entry. + PredPad = SuccPad; + auto TermI = SiblingFuncletInfo.find(PredPad); + if (TermI == SiblingFuncletInfo.end()) + break; + Terminator = TermI->second; + Active.insert(PredPad); + } while (true); + // Each node only has one successor, so we've walked all the active + // nodes' successors. + Active.clear(); + } +} + // visitFunction - Verify that a function is ok. 
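verifySiblingFuncletUnwinds above is cycle detection in a graph where every pad has at most one unwind successor: Active holds the chain currently being walked (reaching it again means a cycle), while Visited holds everything already cleared, so chains that merge are not re-walked. A self-contained sketch over integer node ids (illustrative, not LLVM types):

#include <map>
#include <set>

bool hasCycle(const std::map<int, int> &Succ) {
  std::set<int> Visited;
  for (const auto &P : Succ) {
    if (Visited.count(P.first))
      continue;
    std::set<int> Active{P.first}; // nodes on the chain being walked now
    int N = P.first;
    while (true) {
      auto It = Succ.find(N);
      if (It == Succ.end())
        break;           // chain leaves the graph: no cycle here
      N = It->second;
      if (Active.count(N))
        return true;     // walked back onto the current chain
      if (!Visited.insert(N).second)
        break;           // joined an already-cleared chain
      Active.insert(N);
    }
  }
  return false;
}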
// void Verifier::visitFunction(const Function &F) { @@ -1743,17 +1828,33 @@ void Verifier::visitFunction(const Function &F) { FT->getParamType(i)); Assert(I->getType()->isFirstClassType(), "Function arguments must have first-class types!", I); - if (!isLLVMdotName) + if (!isLLVMdotName) { Assert(!I->getType()->isMetadataTy(), "Function takes metadata but isn't an intrinsic", I, &F); + Assert(!I->getType()->isTokenTy(), + "Function takes token but isn't an intrinsic", I, &F); + } } + if (!isLLVMdotName) + Assert(!F.getReturnType()->isTokenTy(), + "Function returns a token but isn't an intrinsic", &F); + // Get the function metadata attachments. SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; F.getAllMetadata(MDs); assert(F.hasMetadata() != MDs.empty() && "Bit out-of-sync"); VerifyFunctionMetadata(MDs); + // Check validity of the personality function + if (F.hasPersonalityFn()) { + auto *Per = dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts()); + if (Per) + Assert(Per->getParent() == F.getParent(), + "Referencing personality function in another module!", + &F, F.getParent(), Per, Per->getParent()); + } + if (F.isMaterializable()) { // Function has a body somewhere we can't see. Assert(MDs.empty(), "unmaterialized function cannot have metadata", &F, @@ -1782,13 +1883,27 @@ void Verifier::visitFunction(const Function &F) { } // Visit metadata attachments. - for (const auto &I : MDs) + for (const auto &I : MDs) { + // Verify that the attachment is legal. + switch (I.first) { + default: + break; + case LLVMContext::MD_dbg: + Assert(isa<DISubprogram>(I.second), + "function !dbg attachment must be a subprogram", &F, I.second); + break; + } + + // Verify the metadata itself. visitMDNode(*I.second); + } } // If this function is actually an intrinsic, verify that it is only used in // direct call/invokes, never having its "address taken". - if (F.getIntrinsicID()) { + // Only do this if the module is materialized, otherwise we don't have all the + // uses. + if (F.getIntrinsicID() && F.getParent()->isMaterialized()) { const User *U; if (F.hasAddressTaken(&U)) Assert(0, "Invalid user of intrinsic instruction!", U); @@ -1798,6 +1913,44 @@ void Verifier::visitFunction(const Function &F) { (F.isDeclaration() && F.hasExternalLinkage()) || F.hasAvailableExternallyLinkage(), "Function is marked as dllimport, but not external.", &F); + + auto *N = F.getSubprogram(); + if (!N) + return; + + // Check that all !dbg attachments lead back to N (or, at least, another + // subprogram that describes the same function). + // + // FIXME: Check this incrementally while visiting !dbg attachments. + // FIXME: Only check when N is the canonical subprogram for F. + SmallPtrSet<const MDNode *, 32> Seen; + for (auto &BB : F) + for (auto &I : BB) { + // Be careful about using DILocation here since we might be dealing with + // broken code (this is the Verifier after all). + DILocation *DL = + dyn_cast_or_null<DILocation>(I.getDebugLoc().getAsMDNode()); + if (!DL) + continue; + if (!Seen.insert(DL).second) + continue; + + DILocalScope *Scope = DL->getInlinedAtScope(); + if (Scope && !Seen.insert(Scope).second) + continue; + + DISubprogram *SP = Scope ? Scope->getSubprogram() : nullptr; + + // Scope and SP could be the same MDNode and we don't want to skip + // validation in that case + if (SP && ((Scope != SP) && !Seen.insert(SP).second)) + continue; + + // FIXME: Once N is canonical, check "SP == &N". 
+ Assert(SP->describes(&F), + "!dbg attachment points at wrong subprogram for function", N, &F, + &I, DL, Scope, SP); + } } // verifyBasicBlock - Verify that a basic block is well formed... @@ -2194,6 +2347,9 @@ void Verifier::visitPHINode(PHINode &PN) { isa<PHINode>(--BasicBlock::iterator(&PN)), "PHI nodes not grouped at top of basic block!", &PN, PN.getParent()); + // Check that a PHI doesn't yield a Token. + Assert(!PN.getType()->isTokenTy(), "PHI nodes cannot have token type!"); + // Check that all of the values of the PHI node have the same type as the // result, and that the incoming blocks are really basic blocks. for (Value *IncValue : PN.incoming_values()) { @@ -2296,16 +2452,44 @@ void Verifier::VerifyCallSite(CallSite CS) { // Verify that there's no metadata unless it's a direct call to an intrinsic. if (CS.getCalledFunction() == nullptr || !CS.getCalledFunction()->getName().startswith("llvm.")) { - for (FunctionType::param_iterator PI = FTy->param_begin(), - PE = FTy->param_end(); PI != PE; ++PI) - Assert(!(*PI)->isMetadataTy(), + for (Type *ParamTy : FTy->params()) { + Assert(!ParamTy->isMetadataTy(), "Function has metadata parameter but isn't an intrinsic", I); + Assert(!ParamTy->isTokenTy(), + "Function has token parameter but isn't an intrinsic", I); + } } + // Verify that indirect calls don't return tokens. + if (CS.getCalledFunction() == nullptr) + Assert(!FTy->getReturnType()->isTokenTy(), + "Return type cannot be token for indirect call!"); + if (Function *F = CS.getCalledFunction()) if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) visitIntrinsicCallSite(ID, CS); + // Verify that a callsite has at most one "deopt" and one "funclet" operand + // bundle. + bool FoundDeoptBundle = false, FoundFuncletBundle = false; + for (unsigned i = 0, e = CS.getNumOperandBundles(); i < e; ++i) { + OperandBundleUse BU = CS.getOperandBundleAt(i); + uint32_t Tag = BU.getTagID(); + if (Tag == LLVMContext::OB_deopt) { + Assert(!FoundDeoptBundle, "Multiple deopt operand bundles", I); + FoundDeoptBundle = true; + } + if (Tag == LLVMContext::OB_funclet) { + Assert(!FoundFuncletBundle, "Multiple funclet operand bundles", I); + FoundFuncletBundle = true; + Assert(BU.Inputs.size() == 1, + "Expected exactly one funclet bundle operand", I); + Assert(isa<FuncletPadInst>(BU.Inputs.front()), + "Funclet bundle operands should correspond to a FuncletPadInst", + I); + } + } + visitInstruction(*I); } @@ -2406,10 +2590,12 @@ void Verifier::visitCallInst(CallInst &CI) { void Verifier::visitInvokeInst(InvokeInst &II) { VerifyCallSite(&II); - // Verify that there is a landingpad instruction as the first non-PHI - // instruction of the 'unwind' destination. - Assert(II.getUnwindDest()->isLandingPad(), - "The unwind destination does not have a landingpad instruction!", &II); + // Verify that the first non-PHI instruction of the unwind destination is an + // exception handling instruction. 
+ Assert( + II.getUnwindDest()->isEHPad(), + "The unwind destination does not have an exception handling instruction!", + &II); visitTerminatorInst(II); } @@ -2622,6 +2808,14 @@ void Verifier::visitRangeMetadata(Instruction& I, } } +void Verifier::checkAtomicMemAccessSize(const Module *M, Type *Ty, + const Instruction *I) { + unsigned Size = M->getDataLayout().getTypeSizeInBits(Ty); + Assert(Size >= 8, "atomic memory access' size must be byte-sized", Ty, I); + Assert(!(Size & (Size - 1)), + "atomic memory access' operand must have a power-of-two size", Ty, I); +} + void Verifier::visitLoadInst(LoadInst &LI) { PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType()); Assert(PTy, "Load operand must be a pointer.", &LI); @@ -2633,14 +2827,12 @@ void Verifier::visitLoadInst(LoadInst &LI) { "Load cannot have Release ordering", &LI); Assert(LI.getAlignment() != 0, "Atomic load must specify explicit alignment", &LI); - if (!ElTy->isPointerTy()) { - Assert(ElTy->isIntegerTy(), "atomic load operand must have integer type!", - &LI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 && !(Size & (Size - 1)), - "atomic load operand must be power-of-two byte-sized integer", &LI, - ElTy); - } + Assert(ElTy->isIntegerTy() || ElTy->isPointerTy() || + ElTy->isFloatingPointTy(), + "atomic load operand must have integer, pointer, or floating point " + "type!", + ElTy, &LI); + checkAtomicMemAccessSize(M, ElTy, &LI); } else { Assert(LI.getSynchScope() == CrossThread, "Non-atomic load cannot have SynchronizationScope specified", &LI); @@ -2662,14 +2854,12 @@ void Verifier::visitStoreInst(StoreInst &SI) { "Store cannot have Acquire ordering", &SI); Assert(SI.getAlignment() != 0, "Atomic store must specify explicit alignment", &SI); - if (!ElTy->isPointerTy()) { - Assert(ElTy->isIntegerTy(), - "atomic store operand must have integer type!", &SI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 && !(Size & (Size - 1)), - "atomic store operand must be power-of-two byte-sized integer", - &SI, ElTy); - } + Assert(ElTy->isIntegerTy() || ElTy->isPointerTy() || + ElTy->isFloatingPointTy(), + "atomic store operand must have integer, pointer, or floating point " + "type!", + ElTy, &SI); + checkAtomicMemAccessSize(M, ElTy, &SI); } else { Assert(SI.getSynchScope() == CrossThread, "Non-atomic store cannot have SynchronizationScope specified", &SI); @@ -2678,7 +2868,7 @@ void Verifier::visitStoreInst(StoreInst &SI) { } void Verifier::visitAllocaInst(AllocaInst &AI) { - SmallPtrSet<const Type*, 4> Visited; + SmallPtrSet<Type*, 4> Visited; PointerType *PTy = AI.getType(); Assert(PTy->getAddressSpace() == 0, "Allocation instruction pointer not in the generic address space!", @@ -2716,9 +2906,7 @@ void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) { Type *ElTy = PTy->getElementType(); Assert(ElTy->isIntegerTy(), "cmpxchg operand must have integer type!", &CXI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 && !(Size & (Size - 1)), - "cmpxchg operand must be power-of-two byte-sized integer", &CXI, ElTy); + checkAtomicMemAccessSize(M, ElTy, &CXI); Assert(ElTy == CXI.getOperand(1)->getType(), "Expected value type does not match pointer operand type!", &CXI, ElTy); @@ -2737,10 +2925,7 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) { Type *ElTy = PTy->getElementType(); Assert(ElTy->isIntegerTy(), "atomicrmw operand must have integer type!", &RMWI, ElTy); - unsigned Size = ElTy->getPrimitiveSizeInBits(); - Assert(Size >= 8 
&& !(Size & (Size - 1)), - "atomicrmw operand must be power-of-two byte-sized integer", &RMWI, - ElTy); + checkAtomicMemAccessSize(M, ElTy, &RMWI); Assert(ElTy == RMWI.getOperand(1)->getType(), "Argument value type does not match pointer operand type!", &RMWI, ElTy); @@ -2777,23 +2962,98 @@ void Verifier::visitInsertValueInst(InsertValueInst &IVI) { visitInstruction(IVI); } -void Verifier::visitLandingPadInst(LandingPadInst &LPI) { - BasicBlock *BB = LPI.getParent(); +static Value *getParentPad(Value *EHPad) { + if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad)) + return FPI->getParentPad(); + + return cast<CatchSwitchInst>(EHPad)->getParentPad(); +} + +void Verifier::visitEHPadPredecessors(Instruction &I) { + assert(I.isEHPad()); + + BasicBlock *BB = I.getParent(); + Function *F = BB->getParent(); + + Assert(BB != &F->getEntryBlock(), "EH pad cannot be in entry block.", &I); + + if (auto *LPI = dyn_cast<LandingPadInst>(&I)) { + // The landingpad instruction defines its parent as a landing pad block. The + // landing pad block may be branched to only by the unwind edge of an + // invoke. + for (BasicBlock *PredBB : predecessors(BB)) { + const auto *II = dyn_cast<InvokeInst>(PredBB->getTerminator()); + Assert(II && II->getUnwindDest() == BB && II->getNormalDest() != BB, + "Block containing LandingPadInst must be jumped to " + "only by the unwind edge of an invoke.", + LPI); + } + return; + } + if (auto *CPI = dyn_cast<CatchPadInst>(&I)) { + if (!pred_empty(BB)) + Assert(BB->getUniquePredecessor() == CPI->getCatchSwitch()->getParent(), + "Block containing CatchPadInst must be jumped to " + "only by its catchswitch.", + CPI); + Assert(BB != CPI->getCatchSwitch()->getUnwindDest(), + "Catchswitch cannot unwind to one of its catchpads", + CPI->getCatchSwitch(), CPI); + return; + } + + // Verify that each pred has a legal terminator with a legal to/from EH + // pad relationship. + Instruction *ToPad = &I; + Value *ToPadParent = getParentPad(ToPad); + for (BasicBlock *PredBB : predecessors(BB)) { + TerminatorInst *TI = PredBB->getTerminator(); + Value *FromPad; + if (auto *II = dyn_cast<InvokeInst>(TI)) { + Assert(II->getUnwindDest() == BB && II->getNormalDest() != BB, + "EH pad must be jumped to via an unwind edge", ToPad, II); + if (auto Bundle = II->getOperandBundle(LLVMContext::OB_funclet)) + FromPad = Bundle->Inputs[0]; + else + FromPad = ConstantTokenNone::get(II->getContext()); + } else if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) { + FromPad = CRI->getCleanupPad(); + Assert(FromPad != ToPadParent, "A cleanupret must exit its cleanup", CRI); + } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) { + FromPad = CSI; + } else { + Assert(false, "EH pad must be jumped to via an unwind edge", ToPad, TI); + } + + // The edge may exit from zero or more nested pads. + for (;; FromPad = getParentPad(FromPad)) { + Assert(FromPad != ToPad, + "EH pad cannot handle exceptions raised within it", FromPad, TI); + if (FromPad == ToPadParent) { + // This is a legal unwind edge. + break; + } + Assert(!isa<ConstantTokenNone>(FromPad), + "A single unwind edge may only enter one EH pad", TI); + } + } +} +void Verifier::visitLandingPadInst(LandingPadInst &LPI) { // The landingpad instruction is ill-formed if it doesn't have any clauses and // isn't a cleanup. Assert(LPI.getNumClauses() > 0 || LPI.isCleanup(), "LandingPadInst needs at least one clause or to be a cleanup.", &LPI); - // The landingpad instruction defines its parent as a landing pad block.
The - // landing pad block may be branched to only by the unwind edge of an invoke. - for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { - const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator()); - Assert(II && II->getUnwindDest() == BB && II->getNormalDest() != BB, - "Block containing LandingPadInst must be jumped to " - "only by the unwind edge of an invoke.", + visitEHPadPredecessors(LPI); + + if (!LandingPadResultTy) + LandingPadResultTy = LPI.getType(); + else + Assert(LandingPadResultTy == LPI.getType(), + "The landingpad instruction should have a consistent result type " + "inside a function.", &LPI); - } Function *F = LPI.getParent()->getParent(); Assert(F->hasPersonalityFn(), @@ -2820,6 +3080,269 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) { visitInstruction(LPI); } +void Verifier::visitCatchPadInst(CatchPadInst &CPI) { + visitEHPadPredecessors(CPI); + + BasicBlock *BB = CPI.getParent(); + + Function *F = BB->getParent(); + Assert(F->hasPersonalityFn(), + "CatchPadInst needs to be in a function with a personality.", &CPI); + + Assert(isa<CatchSwitchInst>(CPI.getParentPad()), + "CatchPadInst needs to be directly nested in a CatchSwitchInst.", + CPI.getParentPad()); + + // The catchpad instruction must be the first non-PHI instruction in the + // block. + Assert(BB->getFirstNonPHI() == &CPI, + "CatchPadInst not the first non-PHI instruction in the block.", &CPI); + + visitFuncletPadInst(CPI); +} + +void Verifier::visitCatchReturnInst(CatchReturnInst &CatchReturn) { + Assert(isa<CatchPadInst>(CatchReturn.getOperand(0)), + "CatchReturnInst needs to be provided a CatchPad", &CatchReturn, + CatchReturn.getOperand(0)); + + visitTerminatorInst(CatchReturn); +} + +void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) { + visitEHPadPredecessors(CPI); + + BasicBlock *BB = CPI.getParent(); + + Function *F = BB->getParent(); + Assert(F->hasPersonalityFn(), + "CleanupPadInst needs to be in a function with a personality.", &CPI); + + // The cleanuppad instruction must be the first non-PHI instruction in the + // block. + Assert(BB->getFirstNonPHI() == &CPI, + "CleanupPadInst not the first non-PHI instruction in the block.", + &CPI); + + auto *ParentPad = CPI.getParentPad(); + Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad), + "CleanupPadInst has an invalid parent.", &CPI); + + visitFuncletPadInst(CPI); +} + +void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { + User *FirstUser = nullptr; + Value *FirstUnwindPad = nullptr; + SmallVector<FuncletPadInst *, 8> Worklist({&FPI}); + while (!Worklist.empty()) { + FuncletPadInst *CurrentPad = Worklist.pop_back_val(); + Value *UnresolvedAncestorPad = nullptr; + for (User *U : CurrentPad->users()) { + BasicBlock *UnwindDest; + if (auto *CRI = dyn_cast<CleanupReturnInst>(U)) { + UnwindDest = CRI->getUnwindDest(); + } else if (auto *CSI = dyn_cast<CatchSwitchInst>(U)) { + // We allow catchswitch unwind to caller to nest + // within an outer pad that unwinds somewhere else, + // because catchswitch doesn't have a nounwind variant. + // See e.g. SimplifyCFGOpt::SimplifyUnreachable. + if (CSI->unwindsToCaller()) + continue; + UnwindDest = CSI->getUnwindDest(); + } else if (auto *II = dyn_cast<InvokeInst>(U)) { + UnwindDest = II->getUnwindDest(); + } else if (isa<CallInst>(U)) { + // Calls which don't unwind may be found inside funclet + // pads that unwind somewhere else. We don't *require* + // such calls to be annotated nounwind. 
+ continue; + } else if (auto *CPI = dyn_cast<CleanupPadInst>(U)) { + // The unwind dest for a cleanup can only be found by + // recursive search. Add it to the worklist, and we'll + // search for its first use that determines where it unwinds. + Worklist.push_back(CPI); + continue; + } else { + Assert(isa<CatchReturnInst>(U), "Bogus funclet pad use", U); + continue; + } + + Value *UnwindPad; + bool ExitsFPI; + if (UnwindDest) { + UnwindPad = UnwindDest->getFirstNonPHI(); + Value *UnwindParent = getParentPad(UnwindPad); + // Ignore unwind edges that don't exit CurrentPad. + if (UnwindParent == CurrentPad) + continue; + // Determine whether the original funclet pad is exited, + // and if we are scanning nested pads determine how many + // of them are exited so we can stop searching their + // children. + Value *ExitedPad = CurrentPad; + ExitsFPI = false; + do { + if (ExitedPad == &FPI) { + ExitsFPI = true; + // Now we can resolve any ancestors of CurrentPad up to + // FPI, but not including FPI since we need to make sure + // to check all direct users of FPI for consistency. + UnresolvedAncestorPad = &FPI; + break; + } + Value *ExitedParent = getParentPad(ExitedPad); + if (ExitedParent == UnwindParent) { + // ExitedPad is the ancestor-most pad which this unwind + // edge exits, so we can resolve up to it, meaning that + // ExitedParent is the first ancestor still unresolved. + UnresolvedAncestorPad = ExitedParent; + break; + } + ExitedPad = ExitedParent; + } while (!isa<ConstantTokenNone>(ExitedPad)); + } else { + // Unwinding to caller exits all pads. + UnwindPad = ConstantTokenNone::get(FPI.getContext()); + ExitsFPI = true; + UnresolvedAncestorPad = &FPI; + } + + if (ExitsFPI) { + // This unwind edge exits FPI. Make sure it agrees with other + // such edges. + if (FirstUser) { + Assert(UnwindPad == FirstUnwindPad, "Unwind edges out of a funclet " + "pad must have the same unwind " + "dest", + &FPI, U, FirstUser); + } else { + FirstUser = U; + FirstUnwindPad = UnwindPad; + // Record cleanup sibling unwinds for verifySiblingFuncletUnwinds + if (isa<CleanupPadInst>(&FPI) && !isa<ConstantTokenNone>(UnwindPad) && + getParentPad(UnwindPad) == getParentPad(&FPI)) + SiblingFuncletInfo[&FPI] = cast<TerminatorInst>(U); + } + } + // Make sure we visit all uses of FPI, but for nested pads stop as + // soon as we know where they unwind to. + if (CurrentPad != &FPI) + break; + } + if (UnresolvedAncestorPad) { + if (CurrentPad == UnresolvedAncestorPad) { + // When CurrentPad is FPI itself, we don't mark it as resolved even if + // we've found an unwind edge that exits it, because we need to verify + // all direct uses of FPI. + assert(CurrentPad == &FPI); + continue; + } + // Pop off the worklist any nested pads that we've found an unwind + // destination for. The pads on the worklist are the uncles, + // great-uncles, etc. of CurrentPad. We've found an unwind destination + // for all ancestors of CurrentPad up to but not including + // UnresolvedAncestorPad. + Value *ResolvedPad = CurrentPad; + while (!Worklist.empty()) { + Value *UnclePad = Worklist.back(); + Value *AncestorPad = getParentPad(UnclePad); + // Walk ResolvedPad up the ancestor list until we either find the + // uncle's parent or the last resolved ancestor. 
+ while (ResolvedPad != AncestorPad) { + Value *ResolvedParent = getParentPad(ResolvedPad); + if (ResolvedParent == UnresolvedAncestorPad) { + break; + } + ResolvedPad = ResolvedParent; + } + // If the resolved ancestor search didn't find the uncle's parent, + // then the uncle is not yet resolved. + if (ResolvedPad != AncestorPad) + break; + // This uncle is resolved, so pop it from the worklist. + Worklist.pop_back(); + } + } + } + + if (FirstUnwindPad) { + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FPI.getParentPad())) { + BasicBlock *SwitchUnwindDest = CatchSwitch->getUnwindDest(); + Value *SwitchUnwindPad; + if (SwitchUnwindDest) + SwitchUnwindPad = SwitchUnwindDest->getFirstNonPHI(); + else + SwitchUnwindPad = ConstantTokenNone::get(FPI.getContext()); + Assert(SwitchUnwindPad == FirstUnwindPad, + "Unwind edges out of a catch must have the same unwind dest as " + "the parent catchswitch", + &FPI, FirstUser, CatchSwitch); + } + } + + visitInstruction(FPI); +} + +void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { + visitEHPadPredecessors(CatchSwitch); + + BasicBlock *BB = CatchSwitch.getParent(); + + Function *F = BB->getParent(); + Assert(F->hasPersonalityFn(), + "CatchSwitchInst needs to be in a function with a personality.", + &CatchSwitch); + + // The catchswitch instruction must be the first non-PHI instruction in the + // block. + Assert(BB->getFirstNonPHI() == &CatchSwitch, + "CatchSwitchInst not the first non-PHI instruction in the block.", + &CatchSwitch); + + auto *ParentPad = CatchSwitch.getParentPad(); + Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad), + "CatchSwitchInst has an invalid parent.", ParentPad); + + if (BasicBlock *UnwindDest = CatchSwitch.getUnwindDest()) { + Instruction *I = UnwindDest->getFirstNonPHI(); + Assert(I->isEHPad() && !isa<LandingPadInst>(I), + "CatchSwitchInst must unwind to an EH block which is not a " + "landingpad.", + &CatchSwitch); + + // Record catchswitch sibling unwinds for verifySiblingFuncletUnwinds + if (getParentPad(I) == ParentPad) + SiblingFuncletInfo[&CatchSwitch] = &CatchSwitch; + } + + Assert(CatchSwitch.getNumHandlers() != 0, + "CatchSwitchInst cannot have empty handler list", &CatchSwitch); + + for (BasicBlock *Handler : CatchSwitch.handlers()) { + Assert(isa<CatchPadInst>(Handler->getFirstNonPHI()), + "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler); + } + + visitTerminatorInst(CatchSwitch); +} + +void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) { + Assert(isa<CleanupPadInst>(CRI.getOperand(0)), + "CleanupReturnInst needs to be provided a CleanupPad", &CRI, + CRI.getOperand(0)); + + if (BasicBlock *UnwindDest = CRI.getUnwindDest()) { + Instruction *I = UnwindDest->getFirstNonPHI(); + Assert(I->isEHPad() && !isa<LandingPadInst>(I), + "CleanupReturnInst must unwind to an EH block which is not a " + "landingpad.", + &CRI); + } + + visitTerminatorInst(CRI); +} + void Verifier::verifyDominatesUse(Instruction &I, unsigned i) { Instruction *Op = cast<Instruction>(I.getOperand(i)); // If we have an invalid invoke, don't try to compute the dominance.
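The checkAtomicMemAccessSize() helper added a few hunks above collapses the previously duplicated open-coded checks into one byte-size-and-power-of-two test; clearing the lowest set bit with Size & (Size - 1) yields zero exactly for powers of two. A self-contained sketch of the idiom (hypothetical helper name, not part of this patch):

#include <cassert>
#include <cstdint>

// A nonzero value has a single set bit exactly when it is a power of two;
// subtracting one turns that bit off and sets every bit below it, so the
// AND of the two values is zero only in that case.
bool isByteSizedPowerOfTwo(uint64_t SizeInBits) {
  return SizeInBits >= 8 && (SizeInBits & (SizeInBits - 1)) == 0;
}

int main() {
  assert(isByteSizedPowerOfTwo(8));   // i8
  assert(isByteSizedPowerOfTwo(64));  // i64, double
  assert(!isByteSizedPowerOfTwo(24)); // byte-sized, but not a power of two
  assert(!isByteSizedPowerOfTwo(1));  // i1 is smaller than a byte
  return 0;
}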
@@ -2835,6 +3358,19 @@ void Verifier::verifyDominatesUse(Instruction &I, unsigned i) { "Instruction does not dominate all uses!", Op, &I); } +void Verifier::visitDereferenceableMetadata(Instruction& I, MDNode* MD) { + Assert(I.getType()->isPointerTy(), "dereferenceable, dereferenceable_or_null " + "apply only to pointer types", &I); + Assert(isa<LoadInst>(I), + "dereferenceable, dereferenceable_or_null apply only to load" + " instructions, use attributes for calls or invokes", &I); + Assert(MD->getNumOperands() == 1, "dereferenceable, dereferenceable_or_null " + "take one operand!", &I); + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(MD->getOperand(0)); + Assert(CI && CI->getType()->isIntegerTy(64), "dereferenceable, " + "dereferenceable_or_null metadata value must be an i64!", &I); +} + /// verifyInstruction - Verify that an instruction is well formed. /// void Verifier::visitInstruction(Instruction &I) { @@ -2903,7 +3439,7 @@ void Verifier::visitInstruction(Instruction &I) { " donothing or patchpoint", &I); Assert(F->getParent() == M, "Referencing function in another module!", - &I); + &I, M, F, F->getParent()); } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) { Assert(OpBB->getParent() == BB->getParent(), "Referring to a basic block in another function!", &I); @@ -2911,7 +3447,7 @@ void Verifier::visitInstruction(Instruction &I) { Assert(OpArg->getParent() == BB->getParent(), "Referring to an argument in another function!", &I); } else if (GlobalValue *GV = dyn_cast<GlobalValue>(I.getOperand(i))) { - Assert(GV->getParent() == M, "Referencing global in another module!", &I); + Assert(GV->getParent() == M, "Referencing global in another module!", &I, M, GV, GV->getParent()); } else if (isa<Instruction>(I.getOperand(i))) { verifyDominatesUse(I, i); } else if (isa<InlineAsm>(I.getOperand(i))) { @@ -2922,22 +3458,7 @@ void Verifier::visitInstruction(Instruction &I) { if (CE->getType()->isPtrOrPtrVectorTy()) { // If we have a ConstantExpr pointer, we need to see if it came from an // illegal bitcast (inttoptr <constant int> ) - SmallVector<const ConstantExpr *, 4> Stack; - SmallPtrSet<const ConstantExpr *, 4> Visited; - Stack.push_back(CE); - - while (!Stack.empty()) { - const ConstantExpr *V = Stack.pop_back_val(); - if (!Visited.insert(V).second) - continue; - - VerifyConstantExprBitcastType(V); - - for (unsigned I = 0, N = V->getNumOperands(); I != N; ++I) { - if (ConstantExpr *Op = dyn_cast<ConstantExpr>(V->getOperand(I))) - Stack.push_back(Op); - } - } + visitConstantExprsRecursively(CE); } } } @@ -2971,6 +3492,28 @@ void Verifier::visitInstruction(Instruction &I) { &I); } + if (MDNode *MD = I.getMetadata(LLVMContext::MD_dereferenceable)) + visitDereferenceableMetadata(I, MD); + + if (MDNode *MD = I.getMetadata(LLVMContext::MD_dereferenceable_or_null)) + visitDereferenceableMetadata(I, MD); + + if (MDNode *AlignMD = I.getMetadata(LLVMContext::MD_align)) { + Assert(I.getType()->isPointerTy(), "align applies only to pointer types", + &I); + Assert(isa<LoadInst>(I), "align applies only to load instructions, " + "use attributes for calls or invokes", &I); + Assert(AlignMD->getNumOperands() == 1, "align takes one operand!", &I); + ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(AlignMD->getOperand(0)); + Assert(CI && CI->getType()->isIntegerTy(64), + "align metadata value must be an i64!", &I); + uint64_t Align = CI->getZExtValue(); + Assert(isPowerOf2_64(Align), + "align metadata value must be a power of 2!", &I); + Assert(Align <= Value::MaximumAlignment, + 
"alignment is larger that implementation defined limit", &I); + } + if (MDNode *N = I.getDebugLoc().getAsMDNode()) { Assert(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N); @@ -2998,6 +3541,7 @@ bool Verifier::VerifyIntrinsicType(Type *Ty, case IITDescriptor::Void: return !Ty->isVoidTy(); case IITDescriptor::VarArg: return true; case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); + case IITDescriptor::Token: return !Ty->isTokenTy(); case IITDescriptor::Metadata: return !Ty->isMetadataTy(); case IITDescriptor::Half: return !Ty->isHalfTy(); case IITDescriptor::Float: return !Ty->isFloatTy(); @@ -3321,9 +3865,6 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { VerifyStatepoint(CS); break; - case Intrinsic::experimental_gc_result_int: - case Intrinsic::experimental_gc_result_float: - case Intrinsic::experimental_gc_result_ptr: case Intrinsic::experimental_gc_result: { Assert(CS.getParent()->getParent()->hasGC(), "Enclosing function does not use GC.", CS); @@ -3339,9 +3880,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { // Assert that result type matches wrapped callee. const Value *Target = StatepointCS.getArgument(2); - const PointerType *PT = cast<PointerType>(Target->getType()); - const FunctionType *TargetFuncType = - cast<FunctionType>(PT->getElementType()); + auto *PT = cast<PointerType>(Target->getType()); + auto *TargetFuncType = cast<FunctionType>(PT->getElementType()); Assert(CS.getType() == TargetFuncType->getReturnType(), "gc.result result type does not match wrapped callee", CS); break; @@ -3349,22 +3889,22 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { case Intrinsic::experimental_gc_relocate: { Assert(CS.getNumArgOperands() == 3, "wrong number of arguments", CS); + Assert(isa<PointerType>(CS.getType()->getScalarType()), + "gc.relocate must return a pointer or a vector of pointers", CS); + // Check that this relocate is correctly tied to the statepoint // This is case for relocate on the unwinding path of an invoke statepoint - if (ExtractValueInst *ExtractValue = - dyn_cast<ExtractValueInst>(CS.getArgOperand(0))) { - Assert(isa<LandingPadInst>(ExtractValue->getAggregateOperand()), - "gc relocate on unwind path incorrectly linked to the statepoint", - CS); + if (LandingPadInst *LandingPad = + dyn_cast<LandingPadInst>(CS.getArgOperand(0))) { const BasicBlock *InvokeBB = - ExtractValue->getParent()->getUniquePredecessor(); + LandingPad->getParent()->getUniquePredecessor(); // Landingpad relocates should have only one predecessor with invoke // statepoint terminator Assert(InvokeBB, "safepoints should have unique landingpads", - ExtractValue->getParent()); + LandingPad->getParent()); Assert(InvokeBB->getTerminator(), "safepoint block should be well formed", InvokeBB); Assert(isStatepoint(InvokeBB->getTerminator()), @@ -3381,8 +3921,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { // Verify rest of the relocate arguments - GCRelocateOperands Ops(CS); - ImmutableCallSite StatepointCS(Ops.getStatepoint()); + ImmutableCallSite StatepointCS( + cast<GCRelocateInst>(*CS.getInstruction()).getStatepoint()); // Both the base and derived must be piped through the safepoint Value* Base = CS.getArgOperand(1); @@ -3434,20 +3974,29 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { "'gc parameters' section of the statepoint call", CS); - // Relocated value must be a pointer type, but gc_relocate does not need to return the - // same 
pointer type as the relocated pointer. It can be casted to the correct type later - // if it's desired. However, they must have the same address space. - GCRelocateOperands Operands(CS); - Assert(Operands.getDerivedPtr()->getType()->isPointerTy(), + // Relocated value must be either a pointer type or vector-of-pointer type, + // but gc_relocate does not need to return the same pointer type as the + // relocated pointer. It can be casted to the correct type later if it's + // desired. However, they must have the same address space and 'vectorness' + GCRelocateInst &Relocate = cast<GCRelocateInst>(*CS.getInstruction()); + Assert(Relocate.getDerivedPtr()->getType()->getScalarType()->isPointerTy(), "gc.relocate: relocated value must be a gc pointer", CS); - // gc_relocate return type must be a pointer type, and is verified earlier in - // VerifyIntrinsicType(). - Assert(cast<PointerType>(CS.getType())->getAddressSpace() == - cast<PointerType>(Operands.getDerivedPtr()->getType())->getAddressSpace(), + auto ResultType = CS.getType(); + auto DerivedType = Relocate.getDerivedPtr()->getType(); + Assert(ResultType->isVectorTy() == DerivedType->isVectorTy(), + "gc.relocate: vector relocates to vector and pointer to pointer", CS); + Assert(ResultType->getPointerAddressSpace() == + DerivedType->getPointerAddressSpace(), "gc.relocate: relocating a pointer shouldn't change its address space", CS); break; } + case Intrinsic::eh_exceptioncode: + case Intrinsic::eh_exceptionpointer: { + Assert(isa<CatchPadInst>(CS.getArgOperand(0)), + "eh.exceptionpointer argument must be a catchpad", CS); + break; + } }; } @@ -3598,7 +4147,7 @@ void Verifier::verifyTypeRefs() { for (auto *CU : CUs->operands()) if (auto Ts = cast<DICompileUnit>(CU)->getRetainedTypes()) for (DIType *Op : Ts) - if (auto *T = dyn_cast<DICompositeType>(Op)) + if (auto *T = dyn_cast_or_null<DICompositeType>(Op)) if (auto *S = T->getRawIdentifier()) { UnresolvedTypeRefs.erase(S); TypeRefs.insert(std::make_pair(S, T)); diff --git a/contrib/llvm/lib/IRReader/IRReader.cpp b/contrib/llvm/lib/IRReader/IRReader.cpp index 43fee65..9b243fc 100644 --- a/contrib/llvm/lib/IRReader/IRReader.cpp +++ b/contrib/llvm/lib/IRReader/IRReader.cpp @@ -31,11 +31,11 @@ static const char *const TimeIRParsingName = "Parse IR"; static std::unique_ptr<Module> getLazyIRModule(std::unique_ptr<MemoryBuffer> Buffer, SMDiagnostic &Err, - LLVMContext &Context) { + LLVMContext &Context, bool ShouldLazyLoadMetadata) { if (isBitcode((const unsigned char *)Buffer->getBufferStart(), (const unsigned char *)Buffer->getBufferEnd())) { - ErrorOr<std::unique_ptr<Module>> ModuleOrErr = - getLazyBitcodeModule(std::move(Buffer), Context); + ErrorOr<std::unique_ptr<Module>> ModuleOrErr = getLazyBitcodeModule( + std::move(Buffer), Context, ShouldLazyLoadMetadata); if (std::error_code EC = ModuleOrErr.getError()) { Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error, EC.message()); @@ -49,7 +49,8 @@ getLazyIRModule(std::unique_ptr<MemoryBuffer> Buffer, SMDiagnostic &Err, std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename, SMDiagnostic &Err, - LLVMContext &Context) { + LLVMContext &Context, + bool ShouldLazyLoadMetadata) { ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename); if (std::error_code EC = FileOrErr.getError()) { @@ -58,7 +59,8 @@ std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename, return nullptr; } - return getLazyIRModule(std::move(FileOrErr.get()), Err, Context); + return 
getLazyIRModule(std::move(FileOrErr.get()), Err, Context, + ShouldLazyLoadMetadata); } std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err, diff --git a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp index 25ae4ac..66df23b 100644 --- a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/CodeGen/ParallelCG.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/Config/config.h" #include "llvm/IR/Constants.h" @@ -63,47 +64,15 @@ const char* LTOCodeGenerator::getVersionString() { #endif } -static void handleLTODiagnostic(const DiagnosticInfo &DI) { - DiagnosticPrinterRawOStream DP(errs()); - DI.print(DP); - errs() << "\n"; -} - -LTOCodeGenerator::LTOCodeGenerator() - : Context(getGlobalContext()), IRLinker(new Module("ld-temp.o", Context), - handleLTODiagnostic) { - initializeLTOPasses(); -} - -LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr<LLVMContext> Context) - : OwnedContext(std::move(Context)), Context(*OwnedContext), - IRLinker(new Module("ld-temp.o", *OwnedContext), handleLTODiagnostic) { +LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) + : Context(Context), MergedModule(new Module("ld-temp.o", Context)), + TheLinker(new Linker(*MergedModule)) { initializeLTOPasses(); } -void LTOCodeGenerator::destroyMergedModule() { - if (OwnedModule) { - assert(IRLinker.getModule() == &OwnedModule->getModule() && - "The linker's module should be the same as the owned module"); - delete OwnedModule; - OwnedModule = nullptr; - } else if (IRLinker.getModule()) - IRLinker.deleteModule(); -} - -LTOCodeGenerator::~LTOCodeGenerator() { - destroyMergedModule(); +LTOCodeGenerator::~LTOCodeGenerator() {} - delete TargetMach; - TargetMach = nullptr; - - for (std::vector<char *>::iterator I = CodegenOptions.begin(), - E = CodegenOptions.end(); - I != E; ++I) - free(*I); -} - -// Initialize LTO passes. Please keep this funciton in sync with +// Initialize LTO passes. Please keep this function in sync with // PassManagerBuilder::populateLTOPassManager(), and make sure all LTO // passes are initialized. 
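The LTOCodeGenerator rework in this file hands the LLVMContext in from the caller, keeps the merged module and Linker inside the codegen object, reports failures through emitError() and the diagnostic handler instead of std::string out-parameters, and moves the opt-level mapping into setOptLevel(). A hypothetical driver built only from the signatures visible in this patch (error handling abbreviated; a sketch, not the actual llvm-lto tool):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/LTO/LTOCodeGenerator.h"
#include "llvm/LTO/LTOModule.h"
#include "llvm/Support/MemoryBuffer.h"

using namespace llvm;

std::unique_ptr<MemoryBuffer> linkAndCompile(LLVMContext &Ctx,
                                             ArrayRef<const char *> Paths,
                                             TargetOptions Opts) {
  LTOCodeGenerator CG(Ctx); // merged module and Linker now live inside CG
  for (const char *Path : Paths) {
    ErrorOr<std::unique_ptr<LTOModule>> M =
        LTOModule::createFromFile(Ctx, Path, Opts);
    if (!M)
      return nullptr; // createFromFile reports via ErrorOr, not errMsg
    if (!CG.addModule(M->get())) // linkInModule takes the module from Mod
      return nullptr;
  }
  CG.setOptLevel(2); // mapped to CodeGenOpt::Default by setOptLevel()
  return CG.compile(/*DisableVerify=*/false, /*DisableInline=*/false,
                    /*DisableGVNLoadPRE=*/false,
                    /*DisableVectorization=*/false);
}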
void LTOCodeGenerator::initializeLTOPasses() { @@ -120,11 +89,12 @@ void LTOCodeGenerator::initializeLTOPasses() { initializeGlobalDCEPass(R); initializeArgPromotionPass(R); initializeJumpThreadingPass(R); - initializeSROAPass(R); + initializeSROALegacyPassPass(R); initializeSROA_DTPass(R); initializeSROA_SSAUpPass(R); - initializeFunctionAttrsPass(R); - initializeGlobalsModRefPass(R); + initializePostOrderFunctionAttrsPass(R); + initializeReversePostOrderFunctionAttrsPass(R); + initializeGlobalsAAWrapperPassPass(R); initializeLICMPass(R); initializeMergedLoadStoreMotionPass(R); initializeGVNPass(R); @@ -133,41 +103,39 @@ void LTOCodeGenerator::initializeLTOPasses() { initializeCFGSimplifyPassPass(R); } -bool LTOCodeGenerator::addModule(LTOModule *mod) { - assert(&mod->getModule().getContext() == &Context && +bool LTOCodeGenerator::addModule(LTOModule *Mod) { + assert(&Mod->getModule().getContext() == &Context && "Expected module in same context"); - bool ret = IRLinker.linkInModule(&mod->getModule()); + bool ret = TheLinker->linkInModule(Mod->takeModule()); - const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs(); + const std::vector<const char *> &undefs = Mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) AsmUndefinedRefs[undefs[i]] = 1; return !ret; } -void LTOCodeGenerator::setModule(LTOModule *Mod) { +void LTOCodeGenerator::setModule(std::unique_ptr<LTOModule> Mod) { assert(&Mod->getModule().getContext() == &Context && "Expected module in same context"); - // Delete the old merged module. - destroyMergedModule(); AsmUndefinedRefs.clear(); - OwnedModule = Mod; - IRLinker.setModule(&Mod->getModule()); + MergedModule = Mod->takeModule(); + TheLinker = make_unique<Linker>(*MergedModule); const std::vector<const char*> &Undefs = Mod->getAsmUndefinedRefs(); for (int I = 0, E = Undefs.size(); I != E; ++I) AsmUndefinedRefs[Undefs[I]] = 1; } -void LTOCodeGenerator::setTargetOptions(TargetOptions options) { - Options = options; +void LTOCodeGenerator::setTargetOptions(TargetOptions Options) { + this->Options = Options; } -void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) { - switch (debug) { +void LTOCodeGenerator::setDebugInfo(lto_debug_model Debug) { + switch (Debug) { case LTO_DEBUG_MODEL_NONE: EmitDwarfDebugInfo = false; return; @@ -179,21 +147,26 @@ void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) { llvm_unreachable("Unknown debug format!"); } -void LTOCodeGenerator::setCodePICModel(lto_codegen_model model) { - switch (model) { - case LTO_CODEGEN_PIC_MODEL_STATIC: - case LTO_CODEGEN_PIC_MODEL_DYNAMIC: - case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC: - case LTO_CODEGEN_PIC_MODEL_DEFAULT: - CodeModel = model; - return; +void LTOCodeGenerator::setOptLevel(unsigned Level) { + OptLevel = Level; + switch (OptLevel) { + case 0: + CGOptLevel = CodeGenOpt::None; + break; + case 1: + CGOptLevel = CodeGenOpt::Less; + break; + case 2: + CGOptLevel = CodeGenOpt::Default; + break; + case 3: + CGOptLevel = CodeGenOpt::Aggressive; + break; } - llvm_unreachable("Unknown PIC model!"); } -bool LTOCodeGenerator::writeMergedModules(const char *path, - std::string &errMsg) { - if (!determineTarget(errMsg)) +bool LTOCodeGenerator::writeMergedModules(const char *Path) { + if (!determineTarget()) return false; // mark which symbols can not be internalized @@ -201,20 +174,22 @@ bool LTOCodeGenerator::writeMergedModules(const char *path, // create output file std::error_code EC; - tool_output_file Out(path, EC, sys::fs::F_None); + tool_output_file Out(Path, EC, 
sys::fs::F_None); if (EC) { - errMsg = "could not open bitcode file for writing: "; - errMsg += path; + std::string ErrMsg = "could not open bitcode file for writing: "; + ErrMsg += Path; + emitError(ErrMsg); return false; } // write bitcode to it - WriteBitcodeToFile(IRLinker.getModule(), Out.os(), ShouldEmbedUselists); + WriteBitcodeToFile(MergedModule.get(), Out.os(), ShouldEmbedUselists); Out.os().close(); if (Out.os().has_error()) { - errMsg = "could not write bitcode file: "; - errMsg += path; + std::string ErrMsg = "could not write bitcode file: "; + ErrMsg += Path; + emitError(ErrMsg); Out.os().clear_error(); return false; } @@ -223,22 +198,25 @@ bool LTOCodeGenerator::writeMergedModules(const char *path, return true; } -bool LTOCodeGenerator::compileOptimizedToFile(const char **name, - std::string &errMsg) { - // make unique temp .o file to put generated object file +bool LTOCodeGenerator::compileOptimizedToFile(const char **Name) { + // make unique temp output file to put generated code SmallString<128> Filename; int FD; + + const char *Extension = + (FileType == TargetMachine::CGFT_AssemblyFile ? "s" : "o"); + std::error_code EC = - sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename); + sys::fs::createTemporaryFile("lto-llvm", Extension, FD, Filename); if (EC) { - errMsg = EC.message(); + emitError(EC.message()); return false; } // generate object file tool_output_file objFile(Filename.c_str(), FD); - bool genResult = compileOptimized(objFile.os(), errMsg); + bool genResult = compileOptimized(&objFile.os()); objFile.os().close(); if (objFile.os().has_error()) { objFile.os().clear_error(); @@ -253,21 +231,21 @@ bool LTOCodeGenerator::compileOptimizedToFile(const char **name, } NativeObjectPath = Filename.c_str(); - *name = NativeObjectPath.c_str(); + *Name = NativeObjectPath.c_str(); return true; } std::unique_ptr<MemoryBuffer> -LTOCodeGenerator::compileOptimized(std::string &errMsg) { +LTOCodeGenerator::compileOptimized() { const char *name; - if (!compileOptimizedToFile(&name, errMsg)) + if (!compileOptimizedToFile(&name)) return nullptr; // read .o file into memory buffer ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = MemoryBuffer::getFile(name, -1, false); if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); + emitError(EC.message()); sys::fs::remove(NativeObjectPath); return nullptr; } @@ -278,66 +256,51 @@ LTOCodeGenerator::compileOptimized(std::string &errMsg) { return std::move(*BufferOrErr); } - -bool LTOCodeGenerator::compile_to_file(const char **name, - bool disableInline, - bool disableGVNLoadPRE, - bool disableVectorization, - std::string &errMsg) { - if (!optimize(disableInline, disableGVNLoadPRE, - disableVectorization, errMsg)) +bool LTOCodeGenerator::compile_to_file(const char **Name, bool DisableVerify, + bool DisableInline, + bool DisableGVNLoadPRE, + bool DisableVectorization) { + if (!optimize(DisableVerify, DisableInline, DisableGVNLoadPRE, + DisableVectorization)) return false; - return compileOptimizedToFile(name, errMsg); + return compileOptimizedToFile(Name); } std::unique_ptr<MemoryBuffer> -LTOCodeGenerator::compile(bool disableInline, bool disableGVNLoadPRE, - bool disableVectorization, std::string &errMsg) { - if (!optimize(disableInline, disableGVNLoadPRE, - disableVectorization, errMsg)) +LTOCodeGenerator::compile(bool DisableVerify, bool DisableInline, + bool DisableGVNLoadPRE, bool DisableVectorization) { + if (!optimize(DisableVerify, DisableInline, DisableGVNLoadPRE, + DisableVectorization)) return nullptr; - 
return compileOptimized(errMsg); + return compileOptimized(); } -bool LTOCodeGenerator::determineTarget(std::string &errMsg) { +bool LTOCodeGenerator::determineTarget() { if (TargetMach) return true; - std::string TripleStr = IRLinker.getModule()->getTargetTriple(); - if (TripleStr.empty()) + std::string TripleStr = MergedModule->getTargetTriple(); + if (TripleStr.empty()) { TripleStr = sys::getDefaultTargetTriple(); + MergedModule->setTargetTriple(TripleStr); + } llvm::Triple Triple(TripleStr); // create target machine from info for merged modules - const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); - if (!march) + std::string ErrMsg; + const Target *march = TargetRegistry::lookupTarget(TripleStr, ErrMsg); + if (!march) { + emitError(ErrMsg); return false; - - // The relocation model is actually a static member of TargetMachine and - // needs to be set before the TargetMachine is instantiated. - Reloc::Model RelocModel = Reloc::Default; - switch (CodeModel) { - case LTO_CODEGEN_PIC_MODEL_STATIC: - RelocModel = Reloc::Static; - break; - case LTO_CODEGEN_PIC_MODEL_DYNAMIC: - RelocModel = Reloc::PIC_; - break; - case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC: - RelocModel = Reloc::DynamicNoPIC; - break; - case LTO_CODEGEN_PIC_MODEL_DEFAULT: - // RelocModel is already the default, so leave it that way. - break; } // Construct LTOModule, hand over ownership of module and target. Use MAttr as // the default set of features. SubtargetFeatures Features(MAttr); Features.getDefaultSubtargetFeatures(Triple); - std::string FeatureStr = Features.getString(); + FeatureStr = Features.getString(); // Set a default CPU for Darwin triples. if (MCpu.empty() && Triple.isOSDarwin()) { if (Triple.getArch() == llvm::Triple::x86_64) @@ -348,25 +311,9 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) { MCpu = "cyclone"; } - CodeGenOpt::Level CGOptLevel; - switch (OptLevel) { - case 0: - CGOptLevel = CodeGenOpt::None; - break; - case 1: - CGOptLevel = CodeGenOpt::Less; - break; - case 2: - CGOptLevel = CodeGenOpt::Default; - break; - case 3: - CGOptLevel = CodeGenOpt::Aggressive; - break; - } - - TargetMach = march->createTargetMachine(TripleStr, MCpu, FeatureStr, Options, - RelocModel, CodeModel::Default, - CGOptLevel); + TargetMach.reset(march->createTargetMachine(TripleStr, MCpu, FeatureStr, + Options, RelocModel, + CodeModel::Default, CGOptLevel)); return true; } @@ -453,7 +400,6 @@ static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls, void LTOCodeGenerator::applyScopeRestrictions() { if (ScopeRestrictionsDone || !ShouldInternalize) return; - Module *mergedModule = IRLinker.getModule(); // Start off with a verification pass. 
legacy::PassManager passes; @@ -467,20 +413,17 @@ void LTOCodeGenerator::applyScopeRestrictions() { TargetLibraryInfoImpl TLII(Triple(TargetMach->getTargetTriple())); TargetLibraryInfo TLI(TLII); - accumulateAndSortLibcalls(Libcalls, TLI, *mergedModule, *TargetMach); + accumulateAndSortLibcalls(Libcalls, TLI, *MergedModule, *TargetMach); - for (Module::iterator f = mergedModule->begin(), - e = mergedModule->end(); f != e; ++f) - applyRestriction(*f, Libcalls, MustPreserveList, AsmUsed, Mangler); - for (Module::global_iterator v = mergedModule->global_begin(), - e = mergedModule->global_end(); v != e; ++v) - applyRestriction(*v, Libcalls, MustPreserveList, AsmUsed, Mangler); - for (Module::alias_iterator a = mergedModule->alias_begin(), - e = mergedModule->alias_end(); a != e; ++a) - applyRestriction(*a, Libcalls, MustPreserveList, AsmUsed, Mangler); + for (Function &f : *MergedModule) + applyRestriction(f, Libcalls, MustPreserveList, AsmUsed, Mangler); + for (GlobalVariable &v : MergedModule->globals()) + applyRestriction(v, Libcalls, MustPreserveList, AsmUsed, Mangler); + for (GlobalAlias &a : MergedModule->aliases()) + applyRestriction(a, Libcalls, MustPreserveList, AsmUsed, Mangler); GlobalVariable *LLVMCompilerUsed = - mergedModule->getGlobalVariable("llvm.compiler.used"); + MergedModule->getGlobalVariable("llvm.compiler.used"); findUsedValues(LLVMCompilerUsed, AsmUsed); if (LLVMCompilerUsed) LLVMCompilerUsed->eraseFromParent(); @@ -495,7 +438,7 @@ void LTOCodeGenerator::applyScopeRestrictions() { llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, asmUsed2.size()); LLVMCompilerUsed = - new llvm::GlobalVariable(*mergedModule, ATy, false, + new llvm::GlobalVariable(*MergedModule, ATy, false, llvm::GlobalValue::AppendingLinkage, llvm::ConstantArray::get(ATy, asmUsed2), "llvm.compiler.used"); @@ -506,21 +449,18 @@ void LTOCodeGenerator::applyScopeRestrictions() { passes.add(createInternalizePass(MustPreserveList)); // apply scope restrictions - passes.run(*mergedModule); + passes.run(*MergedModule); ScopeRestrictionsDone = true; } /// Optimize merged modules using various IPO passes -bool LTOCodeGenerator::optimize(bool DisableInline, +bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline, bool DisableGVNLoadPRE, - bool DisableVectorization, - std::string &errMsg) { - if (!this->determineTarget(errMsg)) + bool DisableVectorization) { + if (!this->determineTarget()) return false; - Module *mergedModule = IRLinker.getModule(); - // Mark which symbols can not be internalized this->applyScopeRestrictions(); @@ -528,7 +468,7 @@ bool LTOCodeGenerator::optimize(bool DisableInline, legacy::PassManager passes; // Add an appropriate DataLayout instance for this module... - mergedModule->setDataLayout(*TargetMach->getDataLayout()); + MergedModule->setDataLayout(TargetMach->createDataLayout()); passes.add( createTargetTransformInfoWrapperPass(TargetMach->getTargetIRAnalysis())); @@ -542,60 +482,57 @@ bool LTOCodeGenerator::optimize(bool DisableInline, PMB.Inliner = createFunctionInliningPass(); PMB.LibraryInfo = new TargetLibraryInfoImpl(TargetTriple); PMB.OptLevel = OptLevel; - PMB.VerifyInput = true; - PMB.VerifyOutput = true; + PMB.VerifyInput = !DisableVerify; + PMB.VerifyOutput = !DisableVerify; PMB.populateLTOPassManager(passes); // Run our queue of passes all at once now, efficiently. 
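For reference, the PassManagerBuilder configuration that optimize() assembles here can be restated as a standalone sketch; the DataLayout, TargetLibraryInfo, and TargetTransformInfo wiring of the real code is omitted, and the Verify flag plays the role of !DisableVerify from the patch:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

using namespace llvm;

// Build and run a minimal LTO pipeline the way optimize() does.
void runLTOPipeline(Module &M, unsigned OptLevel, bool Verify) {
  legacy::PassManager PM;
  PassManagerBuilder PMB;
  PMB.Inliner = createFunctionInliningPass();
  PMB.OptLevel = OptLevel;
  PMB.VerifyInput = Verify;  // the patch sets these to !DisableVerify
  PMB.VerifyOutput = Verify;
  PMB.populateLTOPassManager(PM);
  PM.run(M);
}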
- passes.run(*mergedModule); + passes.run(*MergedModule); return true; } -bool LTOCodeGenerator::compileOptimized(raw_pwrite_stream &out, - std::string &errMsg) { - if (!this->determineTarget(errMsg)) +bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) { + if (!this->determineTarget()) return false; - Module *mergedModule = IRLinker.getModule(); - - legacy::PassManager codeGenPasses; + legacy::PassManager preCodeGenPasses; // If the bitcode files contain ARC code and were compiled with optimization, // the ObjCARCContractPass must be run, so do it unconditionally here. - codeGenPasses.add(createObjCARCContractPass()); - - if (TargetMach->addPassesToEmitFile(codeGenPasses, out, - TargetMachine::CGFT_ObjectFile)) { - errMsg = "target file type not supported"; - return false; - } - - // Run the code generator, and write assembly file - codeGenPasses.run(*mergedModule); + preCodeGenPasses.add(createObjCARCContractPass()); + preCodeGenPasses.run(*MergedModule); + + // Do code generation. We need to preserve the module in case the client calls + // writeMergedModules() after compilation, but we only need to allow this at + // parallelism level 1. This is achieved by having splitCodeGen return the + // original module at parallelism level 1 which we then assign back to + // MergedModule. + MergedModule = + splitCodeGen(std::move(MergedModule), Out, MCpu, FeatureStr, Options, + RelocModel, CodeModel::Default, CGOptLevel, FileType); return true; } /// setCodeGenDebugOptions - Set codegen debugging options to aid in debugging /// LTO problems. -void LTOCodeGenerator::setCodeGenDebugOptions(const char *options) { - for (std::pair<StringRef, StringRef> o = getToken(options); - !o.first.empty(); o = getToken(o.second)) { - // ParseCommandLineOptions() expects argv[0] to be program name. Lazily add - // that. - if (CodegenOptions.empty()) - CodegenOptions.push_back(strdup("libLLVMLTO")); - CodegenOptions.push_back(strdup(o.first.str().c_str())); - } +void LTOCodeGenerator::setCodeGenDebugOptions(const char *Options) { + for (std::pair<StringRef, StringRef> o = getToken(Options); !o.first.empty(); + o = getToken(o.second)) + CodegenOptions.push_back(o.first); } void LTOCodeGenerator::parseCodeGenDebugOptions() { // if options were requested, set them - if (!CodegenOptions.empty()) - cl::ParseCommandLineOptions(CodegenOptions.size(), - const_cast<char **>(&CodegenOptions[0])); + if (!CodegenOptions.empty()) { + // ParseCommandLineOptions() expects argv[0] to be program name. 
+ std::vector<const char *> CodegenArgv(1, "libLLVMLTO"); + for (std::string &Arg : CodegenOptions) + CodegenArgv.push_back(Arg.c_str()); + cl::ParseCommandLineOptions(CodegenArgv.size(), CodegenArgv.data()); + } } void LTOCodeGenerator::DiagnosticHandler(const DiagnosticInfo &DI, @@ -645,3 +582,20 @@ LTOCodeGenerator::setDiagnosticHandler(lto_diagnostic_handler_t DiagHandler, Context.setDiagnosticHandler(LTOCodeGenerator::DiagnosticHandler, this, /* RespectFilters */ true); } + +namespace { +class LTODiagnosticInfo : public DiagnosticInfo { + const Twine &Msg; +public: + LTODiagnosticInfo(const Twine &DiagMsg, DiagnosticSeverity Severity=DS_Error) + : DiagnosticInfo(DK_Linker, Severity), Msg(DiagMsg) {} + void print(DiagnosticPrinter &DP) const override { DP << Msg; } +}; +} + +void LTOCodeGenerator::emitError(const std::string &ErrMsg) { + if (DiagHandler) + (*DiagHandler)(LTO_DS_ERROR, ErrMsg.c_str(), DiagContext); + else + Context.diagnose(LTODiagnosticInfo(ErrMsg)); +} diff --git a/contrib/llvm/lib/LTO/LTOModule.cpp b/contrib/llvm/lib/LTO/LTOModule.cpp index 53ed417..409b949 100644 --- a/contrib/llvm/lib/LTO/LTOModule.cpp +++ b/contrib/llvm/lib/LTO/LTOModule.cpp @@ -91,106 +91,97 @@ bool LTOModule::isBitcodeForTarget(MemoryBuffer *Buffer, return StringRef(Triple).startswith(TriplePrefix); } -LTOModule *LTOModule::createFromFile(const char *path, TargetOptions options, - std::string &errMsg) { +std::string LTOModule::getProducerString(MemoryBuffer *Buffer) { + ErrorOr<MemoryBufferRef> BCOrErr = + IRObjectFile::findBitcodeInMemBuffer(Buffer->getMemBufferRef()); + if (!BCOrErr) + return ""; + LLVMContext Context; + return getBitcodeProducerString(*BCOrErr, Context); +} + +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromFile(LLVMContext &Context, const char *path, + TargetOptions options) { ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = MemoryBuffer::getFile(path); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromOpenFile(int fd, const char *path, size_t size, - TargetOptions options, - std::string &errMsg) { - return createFromOpenFileSlice(fd, path, size, 0, options, errMsg); +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromOpenFile(LLVMContext &Context, int fd, const char *path, + size_t size, TargetOptions options) { + return createFromOpenFileSlice(Context, fd, path, size, 0, options); } -LTOModule *LTOModule::createFromOpenFileSlice(int fd, const char *path, - size_t map_size, off_t offset, - TargetOptions options, - std::string &errMsg) { +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromOpenFileSlice(LLVMContext &Context, int fd, + const char *path, size_t map_size, + off_t offset, TargetOptions options) { ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = MemoryBuffer::getOpenFileSlice(fd, path, map_size, offset); - if (std::error_code EC = BufferOrErr.getError()) { - errMsg = EC.message(); - return nullptr; - } + if (std::error_code EC = BufferOrErr.getError()) + return EC; std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get()); - return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg, - &getGlobalContext()); + return 
makeLTOModule(Buffer->getMemBufferRef(), options, &Context); } -LTOModule *LTOModule::createFromBuffer(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path) { - return createInContext(mem, length, options, errMsg, path, - &getGlobalContext()); +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createFromBuffer(LLVMContext &Context, const void *mem, + size_t length, TargetOptions options, + StringRef path) { + return createInContext(mem, length, options, path, &Context); } -LTOModule *LTOModule::createInLocalContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, - StringRef path) { - return createInContext(mem, length, options, errMsg, path, nullptr); +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createInLocalContext(const void *mem, size_t length, + TargetOptions options, StringRef path) { + return createInContext(mem, length, options, path, nullptr); } -LTOModule *LTOModule::createInContext(const void *mem, size_t length, - TargetOptions options, - std::string &errMsg, StringRef path, - LLVMContext *Context) { +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::createInContext(const void *mem, size_t length, + TargetOptions options, StringRef path, + LLVMContext *Context) { StringRef Data((const char *)mem, length); MemoryBufferRef Buffer(Data, path); - return makeLTOModule(Buffer, options, errMsg, Context); + return makeLTOModule(Buffer, options, Context); } -static std::unique_ptr<Module> parseBitcodeFileImpl(MemoryBufferRef Buffer, - LLVMContext &Context, - bool ShouldBeLazy, - std::string &ErrMsg) { +static ErrorOr<std::unique_ptr<Module>> +parseBitcodeFileImpl(MemoryBufferRef Buffer, LLVMContext &Context, + bool ShouldBeLazy) { // Find the buffer. ErrorOr<MemoryBufferRef> MBOrErr = IRObjectFile::findBitcodeInMemBuffer(Buffer); - if (std::error_code EC = MBOrErr.getError()) { - ErrMsg = EC.message(); - return nullptr; - } - - std::function<void(const DiagnosticInfo &)> DiagnosticHandler = - [&ErrMsg](const DiagnosticInfo &DI) { - raw_string_ostream Stream(ErrMsg); - DiagnosticPrinterRawOStream DP(Stream); - DI.print(DP); - }; + if (std::error_code EC = MBOrErr.getError()) + return EC; if (!ShouldBeLazy) { // Parse the full file. - ErrorOr<std::unique_ptr<Module>> M = - parseBitcodeFile(*MBOrErr, Context, DiagnosticHandler); - if (!M) - return nullptr; + ErrorOr<std::unique_ptr<Module>> M = parseBitcodeFile(*MBOrErr, Context); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } // Parse lazily. 
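parseBitcodeFileImpl() above parses lazily exactly when the module is only being used for symbol extraction; combined with the ShouldLazyLoadMetadata flag threaded through IRReader earlier in this diff, such a client defers both function bodies and metadata. A small usage sketch of the new flag (assumed typical use, not taken from the patch):

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"

using namespace llvm;

// Open a module for symbol scanning only: bodies stay unmaterialized and,
// with the new flag, metadata parsing is deferred as well.
std::unique_ptr<Module> loadForSymbols(StringRef Path, LLVMContext &Ctx) {
  SMDiagnostic Err;
  return getLazyIRFileModule(Path, Err, Ctx,
                             /*ShouldLazyLoadMetadata=*/true);
}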
std::unique_ptr<MemoryBuffer> LightweightBuf = MemoryBuffer::getMemBuffer(*MBOrErr, false); - ErrorOr<std::unique_ptr<Module>> M = - getLazyBitcodeModule(std::move(LightweightBuf), Context, - DiagnosticHandler, true /*ShouldLazyLoadMetadata*/); - if (!M) - return nullptr; + ErrorOr<std::unique_ptr<Module>> M = getLazyBitcodeModule( + std::move(LightweightBuf), Context, true /*ShouldLazyLoadMetadata*/); + if (std::error_code EC = M.getError()) + return EC; return std::move(*M); } -LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, - TargetOptions options, std::string &errMsg, - LLVMContext *Context) { +ErrorOr<std::unique_ptr<LTOModule>> +LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetOptions options, + LLVMContext *Context) { std::unique_ptr<LLVMContext> OwnedContext; if (!Context) { OwnedContext = llvm::make_unique<LLVMContext>(); @@ -199,11 +190,12 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, // If we own a context, we know this is being used only for symbol // extraction, not linking. Be lazy in that case. - std::unique_ptr<Module> M = parseBitcodeFileImpl( - Buffer, *Context, - /* ShouldBeLazy */ static_cast<bool>(OwnedContext), errMsg); - if (!M) - return nullptr; + ErrorOr<std::unique_ptr<Module>> MOrErr = + parseBitcodeFileImpl(Buffer, *Context, + /* ShouldBeLazy */ static_cast<bool>(OwnedContext)); + if (std::error_code EC = MOrErr.getError()) + return EC; + std::unique_ptr<Module> &M = *MOrErr; std::string TripleStr = M->getTargetTriple(); if (TripleStr.empty()) @@ -211,9 +203,10 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, llvm::Triple Triple(TripleStr); // find machine architecture for this module + std::string errMsg; const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); if (!march) - return nullptr; + return std::unique_ptr<LTOModule>(nullptr); // construct LTOModule, hand over ownership of module and target SubtargetFeatures Features; @@ -232,25 +225,21 @@ LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer, TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr, options); - M->setDataLayout(*target->getDataLayout()); + M->setDataLayout(target->createDataLayout()); std::unique_ptr<object::IRObjectFile> IRObj( new object::IRObjectFile(Buffer, std::move(M))); - LTOModule *Ret; + std::unique_ptr<LTOModule> Ret; if (OwnedContext) - Ret = new LTOModule(std::move(IRObj), target, std::move(OwnedContext)); + Ret.reset(new LTOModule(std::move(IRObj), target, std::move(OwnedContext))); else - Ret = new LTOModule(std::move(IRObj), target); - - if (Ret->parseSymbols(errMsg)) { - delete Ret; - return nullptr; - } + Ret.reset(new LTOModule(std::move(IRObj), target)); + Ret->parseSymbols(); Ret->parseMetadata(); - return Ret; + return std::move(Ret); } /// Create a MemoryBuffer from a memory range with an optional name. @@ -583,9 +572,7 @@ void LTOModule::addPotentialUndefinedSymbol(const object::BasicSymbolRef &Sym, info.symbol = decl; } -/// parseSymbols - Parse the symbols from the module and model-level ASM and add -/// them to either the defined or undefined lists. 
-bool LTOModule::parseSymbols(std::string &errMsg) { +void LTOModule::parseSymbols() { for (auto &Sym : IRFile->symbols()) { const GlobalValue *GV = IRFile->getSymbolGV(Sym.getRawDataRefImpl()); uint32_t Flags = Sym.getFlags(); @@ -640,8 +627,6 @@ bool LTOModule::parseSymbols(std::string &errMsg) { NameAndAttributes info = u->getValue(); _symbols.push_back(info); } - - return false; } /// parseMetadata - Parse metadata from the module diff --git a/contrib/llvm/lib/LibDriver/LibDriver.cpp b/contrib/llvm/lib/LibDriver/LibDriver.cpp index b33a22f..3ae5434 100644 --- a/contrib/llvm/lib/LibDriver/LibDriver.cpp +++ b/contrib/llvm/lib/LibDriver/LibDriver.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // Defines an interface to a lib.exe-compatible driver that also understands -// bitcode files. Used by llvm-lib and lld-link2 /lib. +// bitcode files. Used by llvm-lib and lld-link /lib. // //===----------------------------------------------------------------------===// @@ -51,7 +51,7 @@ static const llvm::opt::OptTable::Info infoTable[] = { class LibOptTable : public llvm::opt::OptTable { public: - LibOptTable() : OptTable(infoTable, llvm::array_lengthof(infoTable), true) {} + LibOptTable() : OptTable(infoTable, true) {} }; } @@ -102,7 +102,7 @@ static Optional<std::string> findInputFile(StringRef File, int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) { SmallVector<const char *, 20> NewArgs(ArgsArr.begin(), ArgsArr.end()); BumpPtrAllocator Alloc; - BumpPtrStringSaver Saver(Alloc); + StringSaver Saver(Alloc); cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs); ArgsArr = NewArgs; @@ -135,14 +135,13 @@ int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) { llvm::errs() << Arg->getValue() << ": no such file or directory\n"; return 1; } - Members.emplace_back(Saver.save(*Path), - llvm::sys::path::filename(Arg->getValue())); + Members.emplace_back(Saver.save(*Path)); } std::pair<StringRef, std::error_code> Result = llvm::writeArchive(getOutputPath(&Args, Members[0]), Members, /*WriteSymtab=*/true, object::Archive::K_GNU, - /*Deterministic*/ true); + /*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin)); if (Result.second) { if (Result.first.empty()) diff --git a/contrib/llvm/lib/LibDriver/Options.td b/contrib/llvm/lib/LibDriver/Options.td index 0aa1aff..5a56ef7 100644 --- a/contrib/llvm/lib/LibDriver/Options.td +++ b/contrib/llvm/lib/LibDriver/Options.td @@ -12,6 +12,8 @@ class P<string name, string help> : def libpath: P<"libpath", "Object file search path">; def out : P<"out", "Path to file to write output">; +def llvmlibthin : F<"llvmlibthin">; + //============================================================================== // The flags below do nothing. They are defined only for lib.exe compatibility. //============================================================================== diff --git a/contrib/llvm/lib/Linker/IRMover.cpp b/contrib/llvm/lib/Linker/IRMover.cpp new file mode 100644 index 0000000..8dd59f9 --- /dev/null +++ b/contrib/llvm/lib/Linker/IRMover.cpp @@ -0,0 +1,1698 @@ +//===- lib/Linker/IRMover.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker/IRMover.h"
+#include "LinkDiagnosticInfo.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/GVMaterializer.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TypeMap implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class TypeMapTy : public ValueMapTypeRemapper {
+  /// This is a mapping from a source type to a destination type to use.
+  DenseMap<Type *, Type *> MappedTypes;
+
+  /// When checking to see if two subgraphs are isomorphic, we speculatively
+  /// add types to MappedTypes, but keep track of them here in case we need to
+  /// roll back.
+  SmallVector<Type *, 16> SpeculativeTypes;
+
+  SmallVector<StructType *, 16> SpeculativeDstOpaqueTypes;
+
+  /// This is a list of non-opaque structs in the source module that are mapped
+  /// to an opaque struct in the destination module.
+  SmallVector<StructType *, 16> SrcDefinitionsToResolve;
+
+  /// This is the set of opaque types in the destination module that are
+  /// getting a body from the source module.
+  SmallPtrSet<StructType *, 16> DstResolvedOpaqueTypes;
+
+public:
+  TypeMapTy(IRMover::IdentifiedStructTypeSet &DstStructTypesSet)
+      : DstStructTypesSet(DstStructTypesSet) {}
+
+  IRMover::IdentifiedStructTypeSet &DstStructTypesSet;
+
+  /// Indicate that the specified type in the destination module is
+  /// conceptually equivalent to the specified type in the source module.
+  void addTypeMapping(Type *DstTy, Type *SrcTy);
+
+  /// Produce a body for an opaque type in the dest module from a type
+  /// definition in the source module.
+  void linkDefinedTypeBodies();
+
+  /// Return the mapped type to use for the specified input type from the
+  /// source module.
+  Type *get(Type *SrcTy);
+  Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited);
+
+  void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes);
+
+  FunctionType *get(FunctionType *T) {
+    return cast<FunctionType>(get((Type *)T));
+  }
+
+private:
+  Type *remapType(Type *SrcTy) override { return get(SrcTy); }
+
+  bool areTypesIsomorphic(Type *DstTy, Type *SrcTy);
+};
+}
+
+void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) {
+  assert(SpeculativeTypes.empty());
+  assert(SpeculativeDstOpaqueTypes.empty());
+
+  // Check to see if these types are recursively isomorphic and establish a
+  // mapping between them if so.
+  if (!areTypesIsomorphic(DstTy, SrcTy)) {
+    // Oops, they aren't isomorphic. Just discard this request by rolling back
+    // any speculative mappings we've established.
+    for (Type *Ty : SpeculativeTypes)
+      MappedTypes.erase(Ty);
+
+    SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() -
+                                   SpeculativeDstOpaqueTypes.size());
+    for (StructType *Ty : SpeculativeDstOpaqueTypes)
+      DstResolvedOpaqueTypes.erase(Ty);
+  } else {
+    for (Type *Ty : SpeculativeTypes)
+      if (auto *STy = dyn_cast<StructType>(Ty))
+        if (STy->hasName())
+          STy->setName("");
+  }
+  SpeculativeTypes.clear();
+  SpeculativeDstOpaqueTypes.clear();
+}
+
+/// Recursively walk this pair of types, returning true if they are isomorphic,
+/// false if they are not.
+bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { + // Two types with differing kinds are clearly not isomorphic. + if (DstTy->getTypeID() != SrcTy->getTypeID()) + return false; + + // If we have an entry in the MappedTypes table, then we have our answer. + Type *&Entry = MappedTypes[SrcTy]; + if (Entry) + return Entry == DstTy; + + // Two identical types are clearly isomorphic. Remember this + // non-speculatively. + if (DstTy == SrcTy) { + Entry = DstTy; + return true; + } + + // Okay, we have two types with identical kinds that we haven't seen before. + + // If this is an opaque struct type, special case it. + if (StructType *SSTy = dyn_cast<StructType>(SrcTy)) { + // Mapping an opaque type to any struct, just keep the dest struct. + if (SSTy->isOpaque()) { + Entry = DstTy; + SpeculativeTypes.push_back(SrcTy); + return true; + } + + // Mapping a non-opaque source type to an opaque dest. If this is the first + // type that we're mapping onto this destination type then we succeed. Keep + // the dest, but fill it in later. If this is the second (different) type + // that we're trying to map onto the same opaque type then we fail. + if (cast<StructType>(DstTy)->isOpaque()) { + // We can only map one source type onto the opaque destination type. + if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)).second) + return false; + SrcDefinitionsToResolve.push_back(SSTy); + SpeculativeTypes.push_back(SrcTy); + SpeculativeDstOpaqueTypes.push_back(cast<StructType>(DstTy)); + Entry = DstTy; + return true; + } + } + + // If the number of subtypes disagree between the two types, then we fail. + if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes()) + return false; + + // Fail if any of the extra properties (e.g. array size) of the type disagree. + if (isa<IntegerType>(DstTy)) + return false; // bitwidth disagrees. + if (PointerType *PT = dyn_cast<PointerType>(DstTy)) { + if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace()) + return false; + + } else if (FunctionType *FT = dyn_cast<FunctionType>(DstTy)) { + if (FT->isVarArg() != cast<FunctionType>(SrcTy)->isVarArg()) + return false; + } else if (StructType *DSTy = dyn_cast<StructType>(DstTy)) { + StructType *SSTy = cast<StructType>(SrcTy); + if (DSTy->isLiteral() != SSTy->isLiteral() || + DSTy->isPacked() != SSTy->isPacked()) + return false; + } else if (ArrayType *DATy = dyn_cast<ArrayType>(DstTy)) { + if (DATy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements()) + return false; + } else if (VectorType *DVTy = dyn_cast<VectorType>(DstTy)) { + if (DVTy->getNumElements() != cast<VectorType>(SrcTy)->getNumElements()) + return false; + } + + // Otherwise, we speculate that these two types will line up and recursively + // check the subelements. + Entry = DstTy; + SpeculativeTypes.push_back(SrcTy); + + for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I) + if (!areTypesIsomorphic(DstTy->getContainedType(I), + SrcTy->getContainedType(I))) + return false; + + // If everything seems to have lined up, then everything is great. + return true; +} + +void TypeMapTy::linkDefinedTypeBodies() { + SmallVector<Type *, 16> Elements; + for (StructType *SrcSTy : SrcDefinitionsToResolve) { + StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]); + assert(DstSTy->isOpaque()); + + // Map the body of the source type over to a new body for the dest type. 
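+    // An illustrative pair of types (hypothetical names, not from any real
+    // module): if the source defines
+    //   %T = type { i32, %U* }
+    // while the destination only has the opaque declaration
+    //   %T = type opaque
+    // then each source element is remapped through get() below and the
+    // destination %T finally receives the translated body.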
+    Elements.resize(SrcSTy->getNumElements());
+    for (unsigned I = 0, E = Elements.size(); I != E; ++I)
+      Elements[I] = get(SrcSTy->getElementType(I));
+
+    DstSTy->setBody(Elements, SrcSTy->isPacked());
+    DstStructTypesSet.switchToNonOpaque(DstSTy);
+  }
+  SrcDefinitionsToResolve.clear();
+  DstResolvedOpaqueTypes.clear();
+}
+
+void TypeMapTy::finishType(StructType *DTy, StructType *STy,
+                           ArrayRef<Type *> ETypes) {
+  DTy->setBody(ETypes, STy->isPacked());
+
+  // Steal STy's name.
+  if (STy->hasName()) {
+    SmallString<16> TmpName = STy->getName();
+    STy->setName("");
+    DTy->setName(TmpName);
+  }
+
+  DstStructTypesSet.addNonOpaque(DTy);
+}
+
+Type *TypeMapTy::get(Type *Ty) {
+  SmallPtrSet<StructType *, 8> Visited;
+  return get(Ty, Visited);
+}
+
+Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
+  // If we already have an entry for this type, return it.
+  Type **Entry = &MappedTypes[Ty];
+  if (*Entry)
+    return *Entry;
+
+  // These are types that LLVM itself will unique.
+  bool IsUniqued = !isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral();
+
+#ifndef NDEBUG
+  if (!IsUniqued) {
+    for (auto &Pair : MappedTypes) {
+      assert(!(Pair.first != Ty && Pair.second == Ty) &&
+             "mapping to a source type");
+    }
+  }
+#endif
+
+  if (!IsUniqued && !Visited.insert(cast<StructType>(Ty)).second) {
+    StructType *DTy = StructType::create(Ty->getContext());
+    return *Entry = DTy;
+  }
+
+  // If this is not a recursive type, then just map all of the elements and
+  // then rebuild the type from inside out.
+  SmallVector<Type *, 4> ElementTypes;
+
+  // If there are no element types to map, then the type is itself. This is
+  // true for the anonymous {} struct, things like 'float', integers, etc.
+  if (Ty->getNumContainedTypes() == 0 && IsUniqued)
+    return *Entry = Ty;
+
+  // Remap all of the elements, keeping track of whether any of them change.
+  bool AnyChange = false;
+  ElementTypes.resize(Ty->getNumContainedTypes());
+  for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) {
+    ElementTypes[I] = get(Ty->getContainedType(I), Visited);
+    AnyChange |= ElementTypes[I] != Ty->getContainedType(I);
+  }
+
+  // If we found our type while recursively processing stuff, just use it.
+  Entry = &MappedTypes[Ty];
+  if (*Entry) {
+    if (auto *DTy = dyn_cast<StructType>(*Entry)) {
+      if (DTy->isOpaque()) {
+        auto *STy = cast<StructType>(Ty);
+        finishType(DTy, STy, ElementTypes);
+      }
+    }
+    return *Entry;
+  }
+
+  // If all of the element types mapped directly over and the type is not
+  // a named struct, then the type is usable as-is.
+  if (!AnyChange && IsUniqued)
+    return *Entry = Ty;
+
+  // Otherwise, rebuild a modified type.
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("unknown derived type to remap");
+  case Type::ArrayTyID:
+    return *Entry = ArrayType::get(ElementTypes[0],
+                                   cast<ArrayType>(Ty)->getNumElements());
+  case Type::VectorTyID:
+    return *Entry = VectorType::get(ElementTypes[0],
+                                    cast<VectorType>(Ty)->getNumElements());
+  case Type::PointerTyID:
+    return *Entry = PointerType::get(ElementTypes[0],
+                                     cast<PointerType>(Ty)->getAddressSpace());
+  case Type::FunctionTyID:
+    return *Entry = FunctionType::get(ElementTypes[0],
+                                      makeArrayRef(ElementTypes).slice(1),
+                                      cast<FunctionType>(Ty)->isVarArg());
+  case Type::StructTyID: {
+    auto *STy = cast<StructType>(Ty);
+    bool IsPacked = STy->isPacked();
+    if (IsUniqued)
+      return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);
+
+    // If the type is opaque, we can just use it directly.
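+    // For example (illustrative): an opaque source declaration such as
+    //   %handle = type opaque
+    // carries no body to translate, so the source type itself is reused as
+    // the destination type and recorded in the opaque set.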
+    if (STy->isOpaque()) {
+      DstStructTypesSet.addOpaque(STy);
+      return *Entry = Ty;
+    }
+
+    if (StructType *OldT =
+            DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) {
+      STy->setName("");
+      return *Entry = OldT;
+    }
+
+    if (!AnyChange) {
+      DstStructTypesSet.addNonOpaque(STy);
+      return *Entry = Ty;
+    }
+
+    StructType *DTy = StructType::create(Ty->getContext());
+    finishType(DTy, STy, ElementTypes);
+    return *Entry = DTy;
+  }
+  }
+}
+
+LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity,
+                                       const Twine &Msg)
+    : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {}
+void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
+
+//===----------------------------------------------------------------------===//
+// IRLinker implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class IRLinker;
+
+/// Creates prototypes for functions that are lazily linked on the fly. This
+/// speeds up linking for modules with many lazily linked functions of which
+/// few get used.
+class GlobalValueMaterializer final : public ValueMaterializer {
+  IRLinker *TheIRLinker;
+
+public:
+  GlobalValueMaterializer(IRLinker *TheIRLinker) : TheIRLinker(TheIRLinker) {}
+  Value *materializeDeclFor(Value *V) override;
+  void materializeInitFor(GlobalValue *New, GlobalValue *Old) override;
+  Metadata *mapTemporaryMetadata(Metadata *MD) override;
+  void replaceTemporaryMetadata(const Metadata *OrigMD,
+                                Metadata *NewMD) override;
+  bool isMetadataNeeded(Metadata *MD) override;
+};
+
+class LocalValueMaterializer final : public ValueMaterializer {
+  IRLinker *TheIRLinker;
+
+public:
+  LocalValueMaterializer(IRLinker *TheIRLinker) : TheIRLinker(TheIRLinker) {}
+  Value *materializeDeclFor(Value *V) override;
+  void materializeInitFor(GlobalValue *New, GlobalValue *Old) override;
+  Metadata *mapTemporaryMetadata(Metadata *MD) override;
+  void replaceTemporaryMetadata(const Metadata *OrigMD,
+                                Metadata *NewMD) override;
+  bool isMetadataNeeded(Metadata *MD) override;
+};
+
+/// This is responsible for keeping track of the state used for moving data
+/// from SrcM to DstM.
+class IRLinker {
+  Module &DstM;
+  Module &SrcM;
+
+  std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor;
+
+  TypeMapTy TypeMap;
+  GlobalValueMaterializer GValMaterializer;
+  LocalValueMaterializer LValMaterializer;
+
+  /// Mapping of values from what they used to be in Src, to what they are now
+  /// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead
+  /// due to the use of Value handles which the Linker doesn't actually need,
+  /// but this allows us to reuse the ValueMapper code.
+  ValueToValueMapTy ValueMap;
+  ValueToValueMapTy AliasValueMap;
+
+  DenseSet<GlobalValue *> ValuesToLink;
+  std::vector<GlobalValue *> Worklist;
+
+  void maybeAdd(GlobalValue *GV) {
+    if (ValuesToLink.insert(GV).second)
+      Worklist.push_back(GV);
+  }
+
+  /// Set to true when all global value body linking is complete (including
+  /// lazy linking). Used to prevent metadata linking from creating new
+  /// references.
+  bool DoneLinkingBodies = false;
+
+  bool HasError = false;
+
+  /// Flag indicating that we are just linking metadata (after function
+  /// importing).
+  bool IsMetadataLinkingPostpass;
+
+  /// Flags to pass to value mapper invocations.
+  RemapFlags ValueMapperFlags = RF_MoveDistinctMDs;
+
+  /// Association between metadata values created during bitcode parsing and
+  /// the value id.
+  /// Used to correlate temporary metadata created during
+  /// function importing with the final metadata parsed during the subsequent
+  /// metadata linking postpass.
+  DenseMap<const Metadata *, unsigned> MetadataToIDs;
+
+  /// Association between metadata value id and temporary metadata that
+  /// remains unmapped after function importing. Saved during function
+  /// importing and consumed during the metadata linking postpass.
+  DenseMap<unsigned, MDNode *> *ValIDToTempMDMap;
+
+  /// Set of subprogram metadata that does not need to be linked into the
+  /// destination module, because the functions were not imported directly
+  /// or via an inlined body in an imported function.
+  SmallPtrSet<const Metadata *, 16> UnneededSubprograms;
+
+  /// Handles cloning of a global value from the source module into
+  /// the destination module, including setting the attributes and visibility.
+  GlobalValue *copyGlobalValueProto(const GlobalValue *SGV, bool ForDefinition);
+
+  /// Helper method for setting a message and returning an error code.
+  bool emitError(const Twine &Message) {
+    SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, Message));
+    HasError = true;
+    return true;
+  }
+
+  void emitWarning(const Twine &Message) {
+    SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Warning, Message));
+  }
+
+  /// Check whether we should be linking metadata from the source module.
+  bool shouldLinkMetadata() {
+    // ValIDToTempMDMap is non-null both when we are lazily linking metadata
+    // (e.g. during function importing) and during the later metadata linking
+    // postpass; only the postpass should actually link the metadata.
+    return ValIDToTempMDMap == nullptr || IsMetadataLinkingPostpass;
+  }
+
+  /// Given a global in the source module, return the global in the
+  /// destination module that is being linked to, if any.
+  GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) {
+    // If the source has no name it can't link. If it has local linkage,
+    // there is no name match-up going on.
+    if (!SrcGV->hasName() || SrcGV->hasLocalLinkage())
+      return nullptr;
+
+    // Otherwise see if we have a match in the destination module's symtab.
+    GlobalValue *DGV = DstM.getNamedValue(SrcGV->getName());
+    if (!DGV)
+      return nullptr;
+
+    // If we found a global with the same name in the dest module, but it has
+    // internal linkage, we are really not doing any linkage here.
+    if (DGV->hasLocalLinkage())
+      return nullptr;
+
+    // Otherwise, we do in fact link to the destination global.
+    return DGV;
+  }
+
+  void computeTypeMapping();
+
+  Constant *linkAppendingVarProto(GlobalVariable *DstGV,
+                                  const GlobalVariable *SrcGV);
+
+  bool shouldLink(GlobalValue *DGV, GlobalValue &SGV);
+  Constant *linkGlobalValueProto(GlobalValue *GV, bool ForAlias);
+
+  bool linkModuleFlagsMetadata();
+
+  void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src);
+  bool linkFunctionBody(Function &Dst, Function &Src);
+  void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src);
+  bool linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src);
+
+  /// Functions that take care of cloning a specific global value type
+  /// into the destination module.
+ GlobalVariable *copyGlobalVariableProto(const GlobalVariable *SGVar); + Function *copyFunctionProto(const Function *SF); + GlobalValue *copyGlobalAliasProto(const GlobalAlias *SGA); + + void linkNamedMDNodes(); + + /// Populate the UnneededSubprograms set with the DISubprogram metadata + /// from the source module that we don't need to link into the dest module, + /// because the functions were not imported directly or via an inlined body + /// in an imported function. + void findNeededSubprograms(ValueToValueMapTy &ValueMap); + + /// The value mapper leaves nulls in the list of subprograms for any + /// in the UnneededSubprograms map. Strip those out after metadata linking. + void stripNullSubprograms(); + +public: + IRLinker(Module &DstM, IRMover::IdentifiedStructTypeSet &Set, Module &SrcM, + ArrayRef<GlobalValue *> ValuesToLink, + std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap = nullptr, + bool IsMetadataLinkingPostpass = false) + : DstM(DstM), SrcM(SrcM), AddLazyFor(AddLazyFor), TypeMap(Set), + GValMaterializer(this), LValMaterializer(this), + IsMetadataLinkingPostpass(IsMetadataLinkingPostpass), + ValIDToTempMDMap(ValIDToTempMDMap) { + for (GlobalValue *GV : ValuesToLink) + maybeAdd(GV); + + // If appropriate, tell the value mapper that it can expect to see + // temporary metadata. + if (!shouldLinkMetadata()) + ValueMapperFlags = ValueMapperFlags | RF_HaveUnmaterializedMetadata; + } + + ~IRLinker() { + // In the case where we are not linking metadata, we unset the CanReplace + // flag on all temporary metadata in the MetadataToIDs map to ensure + // none was replaced while being a map key. Now that we are destructing + // the map, set the flag back to true, so that it is replaceable during + // metadata linking. + if (!shouldLinkMetadata()) { + for (auto MDI : MetadataToIDs) { + Metadata *MD = const_cast<Metadata *>(MDI.first); + MDNode *Node = dyn_cast<MDNode>(MD); + assert((Node && Node->isTemporary()) && + "Found non-temp metadata in map when not linking metadata"); + Node->setCanReplace(true); + } + } + } + + bool run(); + Value *materializeDeclFor(Value *V, bool ForAlias); + void materializeInitFor(GlobalValue *New, GlobalValue *Old, bool ForAlias); + + /// Save the mapping between the given temporary metadata and its metadata + /// value id. Used to support metadata linking as a postpass for function + /// importing. + Metadata *mapTemporaryMetadata(Metadata *MD); + + /// Replace any temporary metadata saved for the source metadata's id with + /// the new non-temporary metadata. Used when metadata linking as a postpass + /// for function importing. + void replaceTemporaryMetadata(const Metadata *OrigMD, Metadata *NewMD); + + /// Indicates whether we need to map the given metadata into the destination + /// module. Used to prevent linking of metadata only needed by functions not + /// linked into the dest module. + bool isMetadataNeeded(Metadata *MD); +}; +} + +/// The LLVM SymbolTable class autorenames globals that conflict in the symbol +/// table. This is good for all clients except for us. Go through the trouble +/// to force this back. +static void forceRenaming(GlobalValue *GV, StringRef Name) { + // If the global doesn't force its name or if it already has the right name, + // there is nothing for us to do. + if (GV->hasLocalLinkage() || GV->getName() == Name) + return; + + Module *M = GV->getParent(); + + // If there is a conflict, rename the conflict. 
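+  // For example (hypothetical names): if GV must be named "foo" but an
+  // unrelated global already owns "foo", GV takes that name and the old
+  // global is renamed by the symbol table to an autorenamed variant such as
+  // "foo.1".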
+  if (GlobalValue *ConflictGV = M->getNamedValue(Name)) {
+    GV->takeName(ConflictGV);
+    ConflictGV->setName(Name); // This will cause ConflictGV to get renamed
+    assert(ConflictGV->getName() != Name && "forceRenaming didn't work");
+  } else {
+    GV->setName(Name); // Force the name back
+  }
+}
+
+Value *GlobalValueMaterializer::materializeDeclFor(Value *V) {
+  return TheIRLinker->materializeDeclFor(V, false);
+}
+
+void GlobalValueMaterializer::materializeInitFor(GlobalValue *New,
+                                                 GlobalValue *Old) {
+  TheIRLinker->materializeInitFor(New, Old, false);
+}
+
+Metadata *GlobalValueMaterializer::mapTemporaryMetadata(Metadata *MD) {
+  return TheIRLinker->mapTemporaryMetadata(MD);
+}
+
+void GlobalValueMaterializer::replaceTemporaryMetadata(const Metadata *OrigMD,
+                                                       Metadata *NewMD) {
+  TheIRLinker->replaceTemporaryMetadata(OrigMD, NewMD);
+}
+
+bool GlobalValueMaterializer::isMetadataNeeded(Metadata *MD) {
+  return TheIRLinker->isMetadataNeeded(MD);
+}
+
+Value *LocalValueMaterializer::materializeDeclFor(Value *V) {
+  return TheIRLinker->materializeDeclFor(V, true);
+}
+
+void LocalValueMaterializer::materializeInitFor(GlobalValue *New,
+                                                GlobalValue *Old) {
+  TheIRLinker->materializeInitFor(New, Old, true);
+}
+
+Metadata *LocalValueMaterializer::mapTemporaryMetadata(Metadata *MD) {
+  return TheIRLinker->mapTemporaryMetadata(MD);
+}
+
+void LocalValueMaterializer::replaceTemporaryMetadata(const Metadata *OrigMD,
+                                                      Metadata *NewMD) {
+  TheIRLinker->replaceTemporaryMetadata(OrigMD, NewMD);
+}
+
+bool LocalValueMaterializer::isMetadataNeeded(Metadata *MD) {
+  return TheIRLinker->isMetadataNeeded(MD);
+}
+
+Value *IRLinker::materializeDeclFor(Value *V, bool ForAlias) {
+  auto *SGV = dyn_cast<GlobalValue>(V);
+  if (!SGV)
+    return nullptr;
+
+  return linkGlobalValueProto(SGV, ForAlias);
+}
+
+void IRLinker::materializeInitFor(GlobalValue *New, GlobalValue *Old,
+                                  bool ForAlias) {
+  // If we already created the body, just return.
+  if (auto *F = dyn_cast<Function>(New)) {
+    if (!F->isDeclaration())
+      return;
+  } else if (auto *V = dyn_cast<GlobalVariable>(New)) {
+    if (V->hasInitializer())
+      return;
+  } else {
+    auto *A = cast<GlobalAlias>(New);
+    if (A->getAliasee())
+      return;
+  }
+
+  if (ForAlias || shouldLink(New, *Old))
+    linkGlobalValueBody(*New, *Old);
+}
+
+Metadata *IRLinker::mapTemporaryMetadata(Metadata *MD) {
+  if (!ValIDToTempMDMap)
+    return nullptr;
+  // If this temporary metadata has a value id recorded during function
+  // parsing, record that in the ValIDToTempMDMap if one was provided.
+  if (MetadataToIDs.count(MD)) {
+    unsigned Idx = MetadataToIDs[MD];
+    // Check if we created a temp MD when importing a different function from
+    // this module. If so, reuse the same temporary metadata; otherwise add
+    // this temporary metadata to the map.
+    if (!ValIDToTempMDMap->count(Idx)) {
+      MDNode *Node = cast<MDNode>(MD);
+      assert(Node->isTemporary());
+      (*ValIDToTempMDMap)[Idx] = Node;
+    }
+    return (*ValIDToTempMDMap)[Idx];
+  }
+  return nullptr;
+}
+
+void IRLinker::replaceTemporaryMetadata(const Metadata *OrigMD,
+                                        Metadata *NewMD) {
+  if (!ValIDToTempMDMap)
+    return;
+#ifndef NDEBUG
+  auto *N = dyn_cast_or_null<MDNode>(NewMD);
+  assert(!N || !N->isTemporary());
+#endif
+  // If a mapping between metadata value ids and temporary metadata
+  // created during function importing was provided, and the source
+  // metadata has a value id recorded during metadata parsing, replace
+  // the temporary metadata with the final mapped metadata now.
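+  // An illustrative walk-through (hypothetical ids): if source node !42 was
+  // recorded under value id 7, and a temporary node was handed out for id 7
+  // while importing a function body, then mapping the real !42 replaces all
+  // uses of that temporary, deletes it, and drops id 7 from the map.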
+  if (MetadataToIDs.count(OrigMD)) {
+    unsigned Idx = MetadataToIDs[OrigMD];
+    // Nothing to do if we didn't need to create a temporary metadata during
+    // function importing.
+    if (!ValIDToTempMDMap->count(Idx))
+      return;
+    MDNode *TempMD = (*ValIDToTempMDMap)[Idx];
+    TempMD->replaceAllUsesWith(NewMD);
+    MDNode::deleteTemporary(TempMD);
+    ValIDToTempMDMap->erase(Idx);
+  }
+}
+
+bool IRLinker::isMetadataNeeded(Metadata *MD) {
+  // Currently only DISubprogram metadata is marked as being unneeded.
+  if (UnneededSubprograms.empty())
+    return true;
+  MDNode *Node = dyn_cast<MDNode>(MD);
+  if (!Node)
+    return true;
+  DISubprogram *SP = getDISubprogram(Node);
+  if (!SP)
+    return true;
+  return !UnneededSubprograms.count(SP);
+}
+
+/// Copy the prototype of a global variable from the source module into the
+/// dest module.
+GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
+  // No linking to be performed or linking from the source: simply create an
+  // identical version of the symbol over in the dest module... the
+  // initializer will be filled in later by LinkGlobalInits.
+  GlobalVariable *NewDGV =
+      new GlobalVariable(DstM, TypeMap.get(SGVar->getType()->getElementType()),
+                         SGVar->isConstant(), GlobalValue::ExternalLinkage,
+                         /*init*/ nullptr, SGVar->getName(),
+                         /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
+                         SGVar->getType()->getAddressSpace());
+  NewDGV->setAlignment(SGVar->getAlignment());
+  return NewDGV;
+}
+
+/// Link the function in the source module into the destination module if
+/// needed, setting up mapping information.
+Function *IRLinker::copyFunctionProto(const Function *SF) {
+  // If there is no linkage to be performed or we are linking from the source,
+  // bring SF over.
+  return Function::Create(TypeMap.get(SF->getFunctionType()),
+                          GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+}
+
+/// Set up prototypes for any aliases that come over from the source module.
+GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) {
+  // If there is no linkage to be performed or we're linking from the source,
+  // bring over SGA.
+  auto *Ty = TypeMap.get(SGA->getValueType());
+  return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
+                             GlobalValue::ExternalLinkage, SGA->getName(),
+                             &DstM);
+}
+
+GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
+                                            bool ForDefinition) {
+  GlobalValue *NewGV;
+  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV)) {
+    NewGV = copyGlobalVariableProto(SGVar);
+  } else if (auto *SF = dyn_cast<Function>(SGV)) {
+    NewGV = copyFunctionProto(SF);
+  } else {
+    if (ForDefinition)
+      NewGV = copyGlobalAliasProto(cast<GlobalAlias>(SGV));
+    else
+      NewGV = new GlobalVariable(
+          DstM, TypeMap.get(SGV->getType()->getElementType()),
+          /*isConstant*/ false, GlobalValue::ExternalLinkage,
+          /*init*/ nullptr, SGV->getName(),
+          /*insertbefore*/ nullptr, SGV->getThreadLocalMode(),
+          SGV->getType()->getAddressSpace());
+  }
+
+  if (ForDefinition)
+    NewGV->setLinkage(SGV->getLinkage());
+  else if (SGV->hasExternalWeakLinkage() || SGV->hasWeakLinkage() ||
+           SGV->hasLinkOnceLinkage())
+    NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+  NewGV->copyAttributesFrom(SGV);
+
+  // Remove these copied constants in case this stays a declaration, since
+  // they point to the source module. If the def is linked the values will
+  // be mapped in during linkFunctionBody.
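+  // For example (illustrative): a prototype copied from a source function
+  // declared with a personality such as @__gxx_personality_v0 must not keep
+  // that operand, since the referenced global still belongs to the source
+  // module; it is re-mapped if the body is actually linked.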
+ if (auto *NewF = dyn_cast<Function>(NewGV)) { + NewF->setPersonalityFn(nullptr); + NewF->setPrefixData(nullptr); + NewF->setPrologueData(nullptr); + } + + return NewGV; +} + +/// Loop over all of the linked values to compute type mappings. For example, +/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct +/// types 'Foo' but one got renamed when the module was loaded into the same +/// LLVMContext. +void IRLinker::computeTypeMapping() { + for (GlobalValue &SGV : SrcM.globals()) { + GlobalValue *DGV = getLinkedToGlobal(&SGV); + if (!DGV) + continue; + + if (!DGV->hasAppendingLinkage() || !SGV.hasAppendingLinkage()) { + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + continue; + } + + // Unify the element type of appending arrays. + ArrayType *DAT = cast<ArrayType>(DGV->getType()->getElementType()); + ArrayType *SAT = cast<ArrayType>(SGV.getType()->getElementType()); + TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); + } + + for (GlobalValue &SGV : SrcM) + if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + + for (GlobalValue &SGV : SrcM.aliases()) + if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) + TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); + + // Incorporate types by name, scanning all the types in the source module. + // At this point, the destination module may have a type "%foo = { i32 }" for + // example. When the source module got loaded into the same LLVMContext, if + // it had the same type, it would have been renamed to "%foo.42 = { i32 }". + std::vector<StructType *> Types = SrcM.getIdentifiedStructTypes(); + for (StructType *ST : Types) { + if (!ST->hasName()) + continue; + + // Check to see if there is a dot in the name followed by a digit. + size_t DotPos = ST->getName().rfind('.'); + if (DotPos == 0 || DotPos == StringRef::npos || + ST->getName().back() == '.' || + !isdigit(static_cast<unsigned char>(ST->getName()[DotPos + 1]))) + continue; + + // Check to see if the destination module has a struct with the prefix name. + StructType *DST = DstM.getTypeByName(ST->getName().substr(0, DotPos)); + if (!DST) + continue; + + // Don't use it if this actually came from the source module. They're in + // the same LLVMContext after all. Also don't use it unless the type is + // actually used in the destination module. This can happen in situations + // like this: + // + // Module A Module B + // -------- -------- + // %Z = type { %A } %B = type { %C.1 } + // %A = type { %B.1, [7 x i8] } %C.1 = type { i8* } + // %B.1 = type { %C } %A.2 = type { %B.3, [5 x i8] } + // %C = type { i8* } %B.3 = type { %C.1 } + // + // When we link Module B with Module A, the '%B' in Module B is + // used. However, that would then use '%C.1'. But when we process '%C.1', + // we prefer to take the '%C' version. So we are then left with both + // '%C.1' and '%C' being used for the same types. This leads to some + // variables using one type and some using the other. + if (TypeMap.DstStructTypesSet.hasType(DST)) + TypeMap.addTypeMapping(DST, ST); + } + + // Now that we have discovered all of the type equivalences, get a body for + // any 'opaque' types in the dest module that are now resolved. 
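+  // For example (illustrative): if the destination's "%foo = type opaque" was
+  // mapped to the source's defined "%foo.42 = type { i32 }", this is where
+  // the destination %foo finally receives the body { i32 }.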
+  TypeMap.linkDefinedTypeBodies();
+}
+
+static void getArrayElements(const Constant *C,
+                             SmallVectorImpl<Constant *> &Dest) {
+  unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements();
+
+  for (unsigned i = 0; i != NumElements; ++i)
+    Dest.push_back(C->getAggregateElement(i));
+}
+
+/// If there were any appending global variables, link them together now.
+/// Return nullptr on error.
+Constant *IRLinker::linkAppendingVarProto(GlobalVariable *DstGV,
+                                          const GlobalVariable *SrcGV) {
+  Type *EltTy = cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType()))
+                    ->getElementType();
+
+  StringRef Name = SrcGV->getName();
+  bool IsNewStructor = false;
+  bool IsOldStructor = false;
+  if (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") {
+    if (cast<StructType>(EltTy)->getNumElements() == 3)
+      IsNewStructor = true;
+    else
+      IsOldStructor = true;
+  }
+
+  PointerType *VoidPtrTy = Type::getInt8Ty(SrcGV->getContext())->getPointerTo();
+  if (IsOldStructor) {
+    auto &ST = *cast<StructType>(EltTy);
+    Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
+    EltTy = StructType::get(SrcGV->getContext(), Tys, false);
+  }
+
+  if (DstGV) {
+    ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType());
+
+    if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) {
+      emitError(
+          "Linking globals named '" + SrcGV->getName() +
+          "': can only link appending global with another appending global!");
+      return nullptr;
+    }
+
+    // Check to see that the two arrays agree on type.
+    if (EltTy != DstTy->getElementType()) {
+      emitError("Appending variables with different element types!");
+      return nullptr;
+    }
+    if (DstGV->isConstant() != SrcGV->isConstant()) {
+      emitError("Appending variables linked with different const'ness!");
+      return nullptr;
+    }
+
+    if (DstGV->getAlignment() != SrcGV->getAlignment()) {
+      emitError(
+          "Appending variables with different alignment need to be linked!");
+      return nullptr;
+    }
+
+    if (DstGV->getVisibility() != SrcGV->getVisibility()) {
+      emitError(
+          "Appending variables with different visibility need to be linked!");
+      return nullptr;
+    }
+
+    if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) {
+      emitError(
+          "Appending variables with different unnamed_addr need to be linked!");
+      return nullptr;
+    }
+
+    if (StringRef(DstGV->getSection()) != SrcGV->getSection()) {
+      emitError(
+          "Appending variables with different section name need to be linked!");
+      return nullptr;
+    }
+  }
+
+  SmallVector<Constant *, 16> DstElements;
+  if (DstGV)
+    getArrayElements(DstGV->getInitializer(), DstElements);
+
+  SmallVector<Constant *, 16> SrcElements;
+  getArrayElements(SrcGV->getInitializer(), SrcElements);
+
+  if (IsNewStructor)
+    SrcElements.erase(
+        std::remove_if(SrcElements.begin(), SrcElements.end(),
+                       [this](Constant *E) {
+                         auto *Key = dyn_cast<GlobalValue>(
+                             E->getAggregateElement(2)->stripPointerCasts());
+                         if (!Key)
+                           return false;
+                         GlobalValue *DGV = getLinkedToGlobal(Key);
+                         return !shouldLink(DGV, *Key);
+                       }),
+        SrcElements.end());
+  uint64_t NewSize = DstElements.size() + SrcElements.size();
+  ArrayType *NewType = ArrayType::get(EltTy, NewSize);
+
+  // Create the new global variable.
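+  // For example (illustrative): appending a 2-element llvm.global_ctors
+  // array to a 3-element one yields NewSize == 5, so a fresh [5 x EltTy]
+  // global is created here and filled in below, after which the old
+  // destination array is RAUW'd and erased.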
+ GlobalVariable *NG = new GlobalVariable( + DstM, NewType, SrcGV->isConstant(), SrcGV->getLinkage(), + /*init*/ nullptr, /*name*/ "", DstGV, SrcGV->getThreadLocalMode(), + SrcGV->getType()->getAddressSpace()); + + NG->copyAttributesFrom(SrcGV); + forceRenaming(NG, SrcGV->getName()); + + Constant *Ret = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); + + // Stop recursion. + ValueMap[SrcGV] = Ret; + + for (auto *V : SrcElements) { + Constant *NewV; + if (IsOldStructor) { + auto *S = cast<ConstantStruct>(V); + auto *E1 = MapValue(S->getOperand(0), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer); + auto *E2 = MapValue(S->getOperand(1), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer); + Value *Null = Constant::getNullValue(VoidPtrTy); + NewV = + ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr); + } else { + NewV = + MapValue(V, ValueMap, ValueMapperFlags, &TypeMap, &GValMaterializer); + } + DstElements.push_back(NewV); + } + + NG->setInitializer(ConstantArray::get(NewType, DstElements)); + + // Replace any uses of the two global variables with uses of the new + // global. + if (DstGV) { + DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); + DstGV->eraseFromParent(); + } + + return Ret; +} + +static bool useExistingDest(GlobalValue &SGV, GlobalValue *DGV, + bool ShouldLink) { + if (!DGV) + return false; + + if (SGV.isDeclaration()) + return true; + + if (DGV->isDeclarationForLinker() && !SGV.isDeclarationForLinker()) + return false; + + if (ShouldLink) + return false; + + return true; +} + +bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) { + // Already imported all the values. Just map to the Dest value + // in case it is referenced in the metadata. + if (IsMetadataLinkingPostpass) { + assert(!ValuesToLink.count(&SGV) && + "Source value unexpectedly requested for link during metadata link"); + return false; + } + + if (ValuesToLink.count(&SGV)) + return true; + + if (SGV.hasLocalLinkage()) + return true; + + if (DGV && !DGV->isDeclaration()) + return false; + + if (SGV.hasAvailableExternallyLinkage()) + return true; + + if (DoneLinkingBodies) + return false; + + AddLazyFor(SGV, [this](GlobalValue &GV) { maybeAdd(&GV); }); + return ValuesToLink.count(&SGV); +} + +Constant *IRLinker::linkGlobalValueProto(GlobalValue *SGV, bool ForAlias) { + GlobalValue *DGV = getLinkedToGlobal(SGV); + + bool ShouldLink = shouldLink(DGV, *SGV); + + // just missing from map + if (ShouldLink) { + auto I = ValueMap.find(SGV); + if (I != ValueMap.end()) + return cast<Constant>(I->second); + + I = AliasValueMap.find(SGV); + if (I != AliasValueMap.end()) + return cast<Constant>(I->second); + } + + DGV = nullptr; + if (ShouldLink || !ForAlias) + DGV = getLinkedToGlobal(SGV); + + // Handle the ultra special appending linkage case first. + assert(!DGV || SGV->hasAppendingLinkage() == DGV->hasAppendingLinkage()); + if (SGV->hasAppendingLinkage()) + return linkAppendingVarProto(cast_or_null<GlobalVariable>(DGV), + cast<GlobalVariable>(SGV)); + + GlobalValue *NewGV; + if (useExistingDest(*SGV, DGV, ShouldLink)) { + NewGV = DGV; + } else { + // If we are done linking global value bodies (i.e. we are performing + // metadata linking), don't link in the global value due to this + // reference, simply map it to null. 
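+    // (Illustrative: during the metadata-only phase, a debug info node that
+    // still names a never-imported global resolves to a null operand here
+    // rather than pulling a fresh declaration into DstM.)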
+ if (DoneLinkingBodies) + return nullptr; + + NewGV = copyGlobalValueProto(SGV, ShouldLink); + if (!ForAlias) + forceRenaming(NewGV, SGV->getName()); + } + if (ShouldLink || ForAlias) { + if (const Comdat *SC = SGV->getComdat()) { + if (auto *GO = dyn_cast<GlobalObject>(NewGV)) { + Comdat *DC = DstM.getOrInsertComdat(SC->getName()); + DC->setSelectionKind(SC->getSelectionKind()); + GO->setComdat(DC); + } + } + } + + if (!ShouldLink && ForAlias) + NewGV->setLinkage(GlobalValue::InternalLinkage); + + Constant *C = NewGV; + if (DGV) + C = ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType())); + + if (DGV && NewGV != DGV) { + DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); + DGV->eraseFromParent(); + } + + return C; +} + +/// Update the initializers in the Dest module now that all globals that may be +/// referenced are in Dest. +void IRLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { + // Figure out what the initializer looks like in the dest module. + Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer)); +} + +/// Copy the source function over into the dest function and fix up references +/// to values. At this point we know that Dest is an external function, and +/// that Src is not. +bool IRLinker::linkFunctionBody(Function &Dst, Function &Src) { + assert(Dst.isDeclaration() && !Src.isDeclaration()); + + // Materialize if needed. + if (std::error_code EC = Src.materialize()) + return emitError(EC.message()); + + if (!shouldLinkMetadata()) + // This is only supported for lazy links. Do after materialization of + // a function and before remapping metadata on instructions below + // in RemapInstruction, as the saved mapping is used to handle + // the temporary metadata hanging off instructions. + SrcM.getMaterializer()->saveMetadataList(MetadataToIDs, + /* OnlyTempMD = */ true); + + // Link in the prefix data. + if (Src.hasPrefixData()) + Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer)); + + // Link in the prologue data. + if (Src.hasPrologueData()) + Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap, + ValueMapperFlags, &TypeMap, + &GValMaterializer)); + + // Link in the personality function. + if (Src.hasPersonalityFn()) + Dst.setPersonalityFn(MapValue(Src.getPersonalityFn(), ValueMap, + ValueMapperFlags, &TypeMap, + &GValMaterializer)); + + // Go through and convert function arguments over, remembering the mapping. + Function::arg_iterator DI = Dst.arg_begin(); + for (Argument &Arg : Src.args()) { + DI->setName(Arg.getName()); // Copy the name over. + + // Add a mapping to our mapping. + ValueMap[&Arg] = &*DI; + ++DI; + } + + // Copy over the metadata attachments. + SmallVector<std::pair<unsigned, MDNode *>, 8> MDs; + Src.getAllMetadata(MDs); + for (const auto &I : MDs) + Dst.setMetadata(I.first, MapMetadata(I.second, ValueMap, ValueMapperFlags, + &TypeMap, &GValMaterializer)); + + // Splice the body of the source function into the dest function. + Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList()); + + // At this point, all of the instructions and values of the function are now + // copied over. The only problem is that they are still referencing values in + // the Source function as operands. Loop through all of the operands of the + // functions and patch them up to point to the local versions. 
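+  // For example (illustrative): a spliced "call void @helper()" still
+  // references the source module's @helper; remapping rewrites it to the
+  // destination module's copy, materializing a declaration on demand.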
+  for (BasicBlock &BB : Dst)
+    for (Instruction &I : BB)
+      RemapInstruction(&I, ValueMap, RF_IgnoreMissingEntries | ValueMapperFlags,
+                       &TypeMap, &GValMaterializer);
+
+  // There is no need to map the arguments anymore.
+  for (Argument &Arg : Src.args())
+    ValueMap.erase(&Arg);
+
+  return false;
+}
+
+void IRLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) {
+  Constant *Aliasee = Src.getAliasee();
+  Constant *Val = MapValue(Aliasee, AliasValueMap, ValueMapperFlags, &TypeMap,
+                           &LValMaterializer);
+  Dst.setAliasee(Val);
+}
+
+bool IRLinker::linkGlobalValueBody(GlobalValue &Dst, GlobalValue &Src) {
+  if (auto *F = dyn_cast<Function>(&Src))
+    return linkFunctionBody(cast<Function>(Dst), *F);
+  if (auto *GVar = dyn_cast<GlobalVariable>(&Src)) {
+    linkGlobalInit(cast<GlobalVariable>(Dst), *GVar);
+    return false;
+  }
+  linkAliasBody(cast<GlobalAlias>(Dst), cast<GlobalAlias>(Src));
+  return false;
+}
+
+void IRLinker::findNeededSubprograms(ValueToValueMapTy &ValueMap) {
+  // Track unneeded nodes to make it simpler to handle the case
+  // where we are checking if an already-mapped SP is needed.
+  NamedMDNode *CompileUnits = SrcM.getNamedMetadata("llvm.dbg.cu");
+  if (!CompileUnits)
+    return;
+  for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) {
+    auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I));
+    assert(CU && "Expected valid compile unit");
+    // Ensure that we don't remove subprograms referenced by DIImportedEntity.
+    // It is not legal to have a DIImportedEntity with a null entity or scope.
+    // FIXME: The DISubprogram for functions not linked in but kept due to
+    // being referenced by a DIImportedEntity should also have their
+    // IsDefinition flag unset.
+    SmallPtrSet<DISubprogram *, 8> ImportedEntitySPs;
+    for (auto *IE : CU->getImportedEntities()) {
+      if (auto *SP = dyn_cast<DISubprogram>(IE->getEntity()))
+        ImportedEntitySPs.insert(SP);
+      if (auto *SP = dyn_cast<DISubprogram>(IE->getScope()))
+        ImportedEntitySPs.insert(SP);
+    }
+    for (auto *Op : CU->getSubprograms()) {
+      // Unless we were doing function importing and deferred metadata linking,
+      // any needed SPs should have been mapped as they would be reached
+      // from the function linked in (either on the function itself for linked
+      // function bodies, or from DILocation on inlined instructions).
+      assert(!(ValueMap.MD()[Op] && IsMetadataLinkingPostpass) &&
+             "DISubprogram shouldn't be mapped yet");
+      if (!ValueMap.MD()[Op] && !ImportedEntitySPs.count(Op))
+        UnneededSubprograms.insert(Op);
+    }
+  }
+  if (!IsMetadataLinkingPostpass)
+    return;
+  // In the case of metadata linking as a postpass (e.g. for function
+  // importing), see which DISubprogram MD from the source has an associated
+  // temporary metadata node, which means the SP was needed by an imported
+  // function.
+  for (auto MDI : MetadataToIDs) {
+    const MDNode *Node = dyn_cast<MDNode>(MDI.first);
+    if (!Node)
+      continue;
+    DISubprogram *SP = getDISubprogram(Node);
+    if (!SP || !ValIDToTempMDMap->count(MDI.second))
+      continue;
+    UnneededSubprograms.erase(SP);
+  }
+}
+
+// Squash null subprograms from compile unit subprogram lists.
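+// (Illustrative: a compile unit whose subprogram list was !{!1, !2, !3} may
+// come out of mapping as !{!1, null, !3} when !2 was unneeded; the list is
+// rebuilt below as !{!1, !3}.)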
+void IRLinker::stripNullSubprograms() { + NamedMDNode *CompileUnits = DstM.getNamedMetadata("llvm.dbg.cu"); + if (!CompileUnits) + return; + for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) { + auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I)); + assert(CU && "Expected valid compile unit"); + + SmallVector<Metadata *, 16> NewSPs; + NewSPs.reserve(CU->getSubprograms().size()); + bool FoundNull = false; + for (DISubprogram *SP : CU->getSubprograms()) { + if (!SP) { + FoundNull = true; + continue; + } + NewSPs.push_back(SP); + } + if (FoundNull) + CU->replaceSubprograms(MDTuple::get(CU->getContext(), NewSPs)); + } +} + +/// Insert all of the named MDNodes in Src into the Dest module. +void IRLinker::linkNamedMDNodes() { + findNeededSubprograms(ValueMap); + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + for (const NamedMDNode &NMD : SrcM.named_metadata()) { + // Don't link module flags here. Do them separately. + if (&NMD == SrcModFlags) + continue; + NamedMDNode *DestNMD = DstM.getOrInsertNamedMetadata(NMD.getName()); + // Add Src elements into Dest node. + for (const MDNode *op : NMD.operands()) + DestNMD->addOperand(MapMetadata( + op, ValueMap, ValueMapperFlags | RF_NullMapMissingGlobalValues, + &TypeMap, &GValMaterializer)); + } + stripNullSubprograms(); +} + +/// Merge the linker flags in Src into the Dest module. +bool IRLinker::linkModuleFlagsMetadata() { + // If the source module has no module flags, we are done. + const NamedMDNode *SrcModFlags = SrcM.getModuleFlagsMetadata(); + if (!SrcModFlags) + return false; + + // If the destination module doesn't have module flags yet, then just copy + // over the source module's flags. + NamedMDNode *DstModFlags = DstM.getOrInsertModuleFlagsMetadata(); + if (DstModFlags->getNumOperands() == 0) { + for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) + DstModFlags->addOperand(SrcModFlags->getOperand(I)); + + return false; + } + + // First build a map of the existing module flags and requirements. + DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags; + SmallSetVector<MDNode *, 16> Requirements; + for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { + MDNode *Op = DstModFlags->getOperand(I); + ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0)); + MDString *ID = cast<MDString>(Op->getOperand(1)); + + if (Behavior->getZExtValue() == Module::Require) { + Requirements.insert(cast<MDNode>(Op->getOperand(2))); + } else { + Flags[ID] = std::make_pair(Op, I); + } + } + + // Merge in the flags from the source module, and also collect its set of + // requirements. + for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { + MDNode *SrcOp = SrcModFlags->getOperand(I); + ConstantInt *SrcBehavior = + mdconst::extract<ConstantInt>(SrcOp->getOperand(0)); + MDString *ID = cast<MDString>(SrcOp->getOperand(1)); + MDNode *DstOp; + unsigned DstIndex; + std::tie(DstOp, DstIndex) = Flags.lookup(ID); + unsigned SrcBehaviorValue = SrcBehavior->getZExtValue(); + + // If this is a requirement, add it and continue. + if (SrcBehaviorValue == Module::Require) { + // If the destination module does not already have this requirement, add + // it. + if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) { + DstModFlags->addOperand(SrcOp); + } + continue; + } + + // If there is no existing flag with this ID, just add it. 
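+    // For example (illustrative): a source-only flag such as
+    //   !{i32 1, !"PIC Level", i32 2}
+    // is appended to DstModFlags verbatim and indexed under its ID so that
+    // later occurrences merge against it.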
+ if (!DstOp) { + Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands()); + DstModFlags->addOperand(SrcOp); + continue; + } + + // Otherwise, perform a merge. + ConstantInt *DstBehavior = + mdconst::extract<ConstantInt>(DstOp->getOperand(0)); + unsigned DstBehaviorValue = DstBehavior->getZExtValue(); + + // If either flag has override behavior, handle it first. + if (DstBehaviorValue == Module::Override) { + // Diagnose inconsistent flags which both have override behavior. + if (SrcBehaviorValue == Module::Override && + SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting override values"); + } + continue; + } else if (SrcBehaviorValue == Module::Override) { + // Update the destination flag to that of the source. + DstModFlags->setOperand(DstIndex, SrcOp); + Flags[ID].first = SrcOp; + continue; + } + + // Diagnose inconsistent merge behavior types. + if (SrcBehaviorValue != DstBehaviorValue) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting behaviors"); + continue; + } + + auto replaceDstValue = [&](MDNode *New) { + Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; + MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps); + DstModFlags->setOperand(DstIndex, Flag); + Flags[ID].first = Flag; + }; + + // Perform the merge for standard behavior types. + switch (SrcBehaviorValue) { + case Module::Require: + case Module::Override: + llvm_unreachable("not possible"); + case Module::Error: { + // Emit an error if the values differ. + if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); + } + continue; + } + case Module::Warning: { + // Emit a warning if the values differ. + if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + emitWarning("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); + } + continue; + } + case Module::Append: { + MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); + SmallVector<Metadata *, 8> MDs; + MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands()); + MDs.append(DstValue->op_begin(), DstValue->op_end()); + MDs.append(SrcValue->op_begin(), SrcValue->op_end()); + + replaceDstValue(MDNode::get(DstM.getContext(), MDs)); + break; + } + case Module::AppendUnique: { + SmallSetVector<Metadata *, 16> Elts; + MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); + Elts.insert(DstValue->op_begin(), DstValue->op_end()); + Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); + + replaceDstValue(MDNode::get(DstM.getContext(), + makeArrayRef(Elts.begin(), Elts.end()))); + break; + } + } + } + + // Check all of the requirements. + for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { + MDNode *Requirement = Requirements[I]; + MDString *Flag = cast<MDString>(Requirement->getOperand(0)); + Metadata *ReqValue = Requirement->getOperand(1); + + MDNode *Op = Flags[Flag].first; + if (!Op || Op->getOperand(2) != ReqValue) { + emitError("linking module flags '" + Flag->getString() + + "': does not have the required value"); + continue; + } + } + + return HasError; +} + +// This function returns true if the triples match. +static bool triplesMatch(const Triple &T0, const Triple &T1) { + // If vendor is apple, ignore the version number. 
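+  // For example (illustrative): "x86_64-apple-macosx10.10.0" and
+  // "x86_64-apple-macosx10.11.0" are treated as matching here; mergeTriples()
+  // below then prefers the triple carrying the higher OS version.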
+  if (T0.getVendor() == Triple::Apple)
+    return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() &&
+           T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS();
+
+  return T0 == T1;
+}
+
+// This function returns the merged triple.
+static std::string mergeTriples(const Triple &SrcTriple,
+                                const Triple &DstTriple) {
+  // If vendor is apple, pick the triple with the larger version number.
+  if (SrcTriple.getVendor() == Triple::Apple)
+    if (DstTriple.isOSVersionLT(SrcTriple))
+      return SrcTriple.str();
+
+  return DstTriple.str();
+}
+
+bool IRLinker::run() {
+  // Inherit the target data from the source module if the destination module
+  // doesn't have one already.
+  if (DstM.getDataLayout().isDefault())
+    DstM.setDataLayout(SrcM.getDataLayout());
+
+  if (SrcM.getDataLayout() != DstM.getDataLayout()) {
+    emitWarning("Linking two modules of different data layouts: '" +
+                SrcM.getModuleIdentifier() + "' is '" +
+                SrcM.getDataLayoutStr() + "' whereas '" +
+                DstM.getModuleIdentifier() + "' is '" +
+                DstM.getDataLayoutStr() + "'\n");
+  }
+
+  // Copy the target triple from the source to dest if the dest's is empty.
+  if (DstM.getTargetTriple().empty() && !SrcM.getTargetTriple().empty())
+    DstM.setTargetTriple(SrcM.getTargetTriple());
+
+  Triple SrcTriple(SrcM.getTargetTriple()), DstTriple(DstM.getTargetTriple());
+
+  if (!SrcM.getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
+    emitWarning("Linking two modules of different target triples: '" +
+                SrcM.getModuleIdentifier() + "' is '" + SrcM.getTargetTriple() +
+                "' whereas '" + DstM.getModuleIdentifier() + "' is '" +
+                DstM.getTargetTriple() + "'\n");
+
+  DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
+
+  // Append the module inline asm string.
+  if (!SrcM.getModuleInlineAsm().empty()) {
+    if (DstM.getModuleInlineAsm().empty())
+      DstM.setModuleInlineAsm(SrcM.getModuleInlineAsm());
+    else
+      DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" +
+                              SrcM.getModuleInlineAsm());
+  }
+
+  // Loop over all of the linked values to compute type mappings.
+  computeTypeMapping();
+
+  std::reverse(Worklist.begin(), Worklist.end());
+  while (!Worklist.empty()) {
+    GlobalValue *GV = Worklist.back();
+    Worklist.pop_back();
+
+    // Already mapped.
+    if (ValueMap.find(GV) != ValueMap.end() ||
+        AliasValueMap.find(GV) != AliasValueMap.end())
+      continue;
+
+    assert(!GV->isDeclaration());
+    MapValue(GV, ValueMap, ValueMapperFlags, &TypeMap, &GValMaterializer);
+    if (HasError)
+      return true;
+  }
+
+  // Note that we are done linking global value bodies. This prevents
+  // metadata linking from creating new references.
+  DoneLinkingBodies = true;
+
+  // Remap all of the named MDNodes in Src into the DstM module. We do this
+  // after linking GlobalValues so that MDNodes that reference GlobalValues
+  // are properly remapped.
+  if (shouldLinkMetadata()) {
+    // Even if just linking metadata we should link decls above in case
+    // any are referenced by metadata. IRLinker::shouldLink ensures that
+    // we don't actually link anything from source.
+    if (IsMetadataLinkingPostpass) {
+      // Ensure metadata materialized
+      if (SrcM.getMaterializer()->materializeMetadata())
+        return true;
+      SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
+                                               /* OnlyTempMD = */ false);
+    }
+
+    linkNamedMDNodes();
+
+    if (IsMetadataLinkingPostpass) {
+      // Handle anything left in the ValIDToTempMDMap, such as metadata nodes
+      // not reached by the dbg.cu NamedMD (i.e. only reached from
+      // instructions).
+ // Walk the MetadataToIDs once to find the set of new (imported) MD + // that still has corresponding temporary metadata, and invoke metadata + // mapping on each one. + for (auto MDI : MetadataToIDs) { + if (!ValIDToTempMDMap->count(MDI.second)) + continue; + MapMetadata(MDI.first, ValueMap, ValueMapperFlags, &TypeMap, + &GValMaterializer); + } + assert(ValIDToTempMDMap->empty()); + } + + // Merge the module flags into the DstM module. + if (linkModuleFlagsMetadata()) + return true; + } + + return false; +} + +IRMover::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef<Type *> E, bool P) + : ETypes(E), IsPacked(P) {} + +IRMover::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST) + : ETypes(ST->elements()), IsPacked(ST->isPacked()) {} + +bool IRMover::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const { + if (IsPacked != That.IsPacked) + return false; + if (ETypes != That.ETypes) + return false; + return true; +} + +bool IRMover::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const { + return !this->operator==(That); +} + +StructType *IRMover::StructTypeKeyInfo::getEmptyKey() { + return DenseMapInfo<StructType *>::getEmptyKey(); +} + +StructType *IRMover::StructTypeKeyInfo::getTombstoneKey() { + return DenseMapInfo<StructType *>::getTombstoneKey(); +} + +unsigned IRMover::StructTypeKeyInfo::getHashValue(const KeyTy &Key) { + return hash_combine(hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()), + Key.IsPacked); +} + +unsigned IRMover::StructTypeKeyInfo::getHashValue(const StructType *ST) { + return getHashValue(KeyTy(ST)); +} + +bool IRMover::StructTypeKeyInfo::isEqual(const KeyTy &LHS, + const StructType *RHS) { + if (RHS == getEmptyKey() || RHS == getTombstoneKey()) + return false; + return LHS == KeyTy(RHS); +} + +bool IRMover::StructTypeKeyInfo::isEqual(const StructType *LHS, + const StructType *RHS) { + if (RHS == getEmptyKey()) + return LHS == getEmptyKey(); + + if (RHS == getTombstoneKey()) + return LHS == getTombstoneKey(); + + return KeyTy(LHS) == KeyTy(RHS); +} + +void IRMover::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) { + assert(!Ty->isOpaque()); + NonOpaqueStructTypes.insert(Ty); +} + +void IRMover::IdentifiedStructTypeSet::switchToNonOpaque(StructType *Ty) { + assert(!Ty->isOpaque()); + NonOpaqueStructTypes.insert(Ty); + bool Removed = OpaqueStructTypes.erase(Ty); + (void)Removed; + assert(Removed); +} + +void IRMover::IdentifiedStructTypeSet::addOpaque(StructType *Ty) { + assert(Ty->isOpaque()); + OpaqueStructTypes.insert(Ty); +} + +StructType * +IRMover::IdentifiedStructTypeSet::findNonOpaque(ArrayRef<Type *> ETypes, + bool IsPacked) { + IRMover::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked); + auto I = NonOpaqueStructTypes.find_as(Key); + if (I == NonOpaqueStructTypes.end()) + return nullptr; + return *I; +} + +bool IRMover::IdentifiedStructTypeSet::hasType(StructType *Ty) { + if (Ty->isOpaque()) + return OpaqueStructTypes.count(Ty); + auto I = NonOpaqueStructTypes.find(Ty); + if (I == NonOpaqueStructTypes.end()) + return false; + return *I == Ty; +} + +IRMover::IRMover(Module &M) : Composite(M) { + TypeFinder StructTypes; + StructTypes.run(M, true); + for (StructType *Ty : StructTypes) { + if (Ty->isOpaque()) + IdentifiedStructTypes.addOpaque(Ty); + else + IdentifiedStructTypes.addNonOpaque(Ty); + } +} + +bool IRMover::move( + Module &Src, ArrayRef<GlobalValue *> ValuesToLink, + std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap, + bool IsMetadataLinkingPostpass) { + IRLinker 
TheIRLinker(Composite, IdentifiedStructTypes, Src, ValuesToLink, + AddLazyFor, ValIDToTempMDMap, IsMetadataLinkingPostpass); + bool RetCode = TheIRLinker.run(); + Composite.dropTriviallyDeadConstantArrays(); + return RetCode; +} diff --git a/contrib/llvm/lib/Linker/LinkDiagnosticInfo.h b/contrib/llvm/lib/Linker/LinkDiagnosticInfo.h new file mode 100644 index 0000000..d91f19c --- /dev/null +++ b/contrib/llvm/lib/Linker/LinkDiagnosticInfo.h @@ -0,0 +1,25 @@ +//===- LinkDiagnosticInfo.h -------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_LINKER_LINK_DIAGNOSTIC_INFO_H +#define LLVM_LIB_LINKER_LINK_DIAGNOSTIC_INFO_H + +#include "llvm/IR/DiagnosticInfo.h" + +namespace llvm { +class LinkDiagnosticInfo : public DiagnosticInfo { + const Twine &Msg; + +public: + LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg); + void print(DiagnosticPrinter &DP) const override; +}; +} + +#endif diff --git a/contrib/llvm/lib/Linker/LinkModules.cpp b/contrib/llvm/lib/Linker/LinkModules.cpp index f090680..6ffa71e 100644 --- a/contrib/llvm/lib/Linker/LinkModules.cpp +++ b/contrib/llvm/lib/Linker/LinkModules.cpp @@ -12,447 +12,69 @@ //===----------------------------------------------------------------------===// #include "llvm/Linker/Linker.h" +#include "LinkDiagnosticInfo.h" #include "llvm-c/Linker.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/Triple.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/ADT/StringSet.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/TypeFinder.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include <cctype> -#include <tuple> using namespace llvm; - -//===----------------------------------------------------------------------===// -// TypeMap implementation. -//===----------------------------------------------------------------------===// - namespace { -class TypeMapTy : public ValueMapTypeRemapper { - /// This is a mapping from a source type to a destination type to use. - DenseMap<Type*, Type*> MappedTypes; - - /// When checking to see if two subgraphs are isomorphic, we speculatively - /// add types to MappedTypes, but keep track of them here in case we need to - /// roll back. - SmallVector<Type*, 16> SpeculativeTypes; - - SmallVector<StructType*, 16> SpeculativeDstOpaqueTypes; - - /// This is a list of non-opaque structs in the source module that are mapped - /// to an opaque struct in the destination module. - SmallVector<StructType*, 16> SrcDefinitionsToResolve; - - /// This is the set of opaque types in the destination modules who are - /// getting a body from the source module. 
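// A standalone model (hypothetical names, plain C++) of the pattern the new
// header enables: a severity-tagged diagnostic object is handed to a handler
// owned by the context, mirroring how emitError()/emitWarning() below call
// LLVMContext::diagnose() with a LinkDiagnosticInfo.
#include <functional>
#include <iostream>
#include <string>

enum Severity { SevError, SevWarning };

struct Diagnostic {
  Severity Sev;
  std::string Msg;
};

struct DiagContext {
  std::function<void(const Diagnostic &)> Handler = [](const Diagnostic &D) {
    std::cerr << (D.Sev == SevError ? "error: " : "warning: ") << D.Msg << '\n';
  };
  void diagnose(const Diagnostic &D) { Handler(D); }
};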
- SmallPtrSet<StructType*, 16> DstResolvedOpaqueTypes; - -public: - TypeMapTy(Linker::IdentifiedStructTypeSet &DstStructTypesSet) - : DstStructTypesSet(DstStructTypesSet) {} - - Linker::IdentifiedStructTypeSet &DstStructTypesSet; - /// Indicate that the specified type in the destination module is conceptually - /// equivalent to the specified type in the source module. - void addTypeMapping(Type *DstTy, Type *SrcTy); - - /// Produce a body for an opaque type in the dest module from a type - /// definition in the source module. - void linkDefinedTypeBodies(); - - /// Return the mapped type to use for the specified input type from the - /// source module. - Type *get(Type *SrcTy); - Type *get(Type *SrcTy, SmallPtrSet<StructType *, 8> &Visited); - - void finishType(StructType *DTy, StructType *STy, ArrayRef<Type *> ETypes); - - FunctionType *get(FunctionType *T) { - return cast<FunctionType>(get((Type *)T)); - } - - /// Dump out the type map for debugging purposes. - void dump() const { - for (auto &Pair : MappedTypes) { - dbgs() << "TypeMap: "; - Pair.first->print(dbgs()); - dbgs() << " => "; - Pair.second->print(dbgs()); - dbgs() << '\n'; - } - } - -private: - Type *remapType(Type *SrcTy) override { return get(SrcTy); } - - bool areTypesIsomorphic(Type *DstTy, Type *SrcTy); -}; -} - -void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) { - assert(SpeculativeTypes.empty()); - assert(SpeculativeDstOpaqueTypes.empty()); - - // Check to see if these types are recursively isomorphic and establish a - // mapping between them if so. - if (!areTypesIsomorphic(DstTy, SrcTy)) { - // Oops, they aren't isomorphic. Just discard this request by rolling out - // any speculative mappings we've established. - for (Type *Ty : SpeculativeTypes) - MappedTypes.erase(Ty); - - SrcDefinitionsToResolve.resize(SrcDefinitionsToResolve.size() - - SpeculativeDstOpaqueTypes.size()); - for (StructType *Ty : SpeculativeDstOpaqueTypes) - DstResolvedOpaqueTypes.erase(Ty); - } else { - for (Type *Ty : SpeculativeTypes) - if (auto *STy = dyn_cast<StructType>(Ty)) - if (STy->hasName()) - STy->setName(""); - } - SpeculativeTypes.clear(); - SpeculativeDstOpaqueTypes.clear(); -} - -/// Recursively walk this pair of types, returning true if they are isomorphic, -/// false if they are not. -bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { - // Two types with differing kinds are clearly not isomorphic. - if (DstTy->getTypeID() != SrcTy->getTypeID()) - return false; - - // If we have an entry in the MappedTypes table, then we have our answer. - Type *&Entry = MappedTypes[SrcTy]; - if (Entry) - return Entry == DstTy; - - // Two identical types are clearly isomorphic. Remember this - // non-speculatively. - if (DstTy == SrcTy) { - Entry = DstTy; - return true; - } - - // Okay, we have two types with identical kinds that we haven't seen before. - - // If this is an opaque struct type, special case it. - if (StructType *SSTy = dyn_cast<StructType>(SrcTy)) { - // Mapping an opaque type to any struct, just keep the dest struct. - if (SSTy->isOpaque()) { - Entry = DstTy; - SpeculativeTypes.push_back(SrcTy); - return true; - } - - // Mapping a non-opaque source type to an opaque dest. If this is the first - // type that we're mapping onto this destination type then we succeed. Keep - // the dest, but fill it in later. If this is the second (different) type - // that we're trying to map onto the same opaque type then we fail. 
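// A standalone sketch (hypothetical names) of the speculate-then-roll-back
// pattern in addTypeMapping above: every trial mapping is journaled, so a
// failed isomorphism check can erase exactly the entries it added, while a
// successful one just clears the journal.
#include <map>
#include <vector>

struct SpeculativeTypeMap {
  std::map<int, int> Mapped;   // stands in for MappedTypes
  std::vector<int> Journal;    // stands in for SpeculativeTypes

  bool tryMap(int Src, int Dst) {
    auto It = Mapped.find(Src);
    if (It != Mapped.end())
      return It->second == Dst; // already decided; must agree
    Mapped[Src] = Dst;
    Journal.push_back(Src);     // remember in case we must undo
    return true;
  }
  void rollback() {             // the types were not isomorphic
    for (int Src : Journal)
      Mapped.erase(Src);
    Journal.clear();
  }
  void commit() { Journal.clear(); }
};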
- if (cast<StructType>(DstTy)->isOpaque()) { - // We can only map one source type onto the opaque destination type. - if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)).second) - return false; - SrcDefinitionsToResolve.push_back(SSTy); - SpeculativeTypes.push_back(SrcTy); - SpeculativeDstOpaqueTypes.push_back(cast<StructType>(DstTy)); - Entry = DstTy; - return true; - } - } - - // If the number of subtypes disagree between the two types, then we fail. - if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes()) - return false; - - // Fail if any of the extra properties (e.g. array size) of the type disagree. - if (isa<IntegerType>(DstTy)) - return false; // bitwidth disagrees. - if (PointerType *PT = dyn_cast<PointerType>(DstTy)) { - if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace()) - return false; - - } else if (FunctionType *FT = dyn_cast<FunctionType>(DstTy)) { - if (FT->isVarArg() != cast<FunctionType>(SrcTy)->isVarArg()) - return false; - } else if (StructType *DSTy = dyn_cast<StructType>(DstTy)) { - StructType *SSTy = cast<StructType>(SrcTy); - if (DSTy->isLiteral() != SSTy->isLiteral() || - DSTy->isPacked() != SSTy->isPacked()) - return false; - } else if (ArrayType *DATy = dyn_cast<ArrayType>(DstTy)) { - if (DATy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements()) - return false; - } else if (VectorType *DVTy = dyn_cast<VectorType>(DstTy)) { - if (DVTy->getNumElements() != cast<VectorType>(SrcTy)->getNumElements()) - return false; - } - - // Otherwise, we speculate that these two types will line up and recursively - // check the subelements. - Entry = DstTy; - SpeculativeTypes.push_back(SrcTy); - - for (unsigned I = 0, E = SrcTy->getNumContainedTypes(); I != E; ++I) - if (!areTypesIsomorphic(DstTy->getContainedType(I), - SrcTy->getContainedType(I))) - return false; - - // If everything seems to have lined up, then everything is great. - return true; -} - -void TypeMapTy::linkDefinedTypeBodies() { - SmallVector<Type*, 16> Elements; - for (StructType *SrcSTy : SrcDefinitionsToResolve) { - StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]); - assert(DstSTy->isOpaque()); - - // Map the body of the source type over to a new body for the dest type. - Elements.resize(SrcSTy->getNumElements()); - for (unsigned I = 0, E = Elements.size(); I != E; ++I) - Elements[I] = get(SrcSTy->getElementType(I)); - - DstSTy->setBody(Elements, SrcSTy->isPacked()); - DstStructTypesSet.switchToNonOpaque(DstSTy); - } - SrcDefinitionsToResolve.clear(); - DstResolvedOpaqueTypes.clear(); -} - -void TypeMapTy::finishType(StructType *DTy, StructType *STy, - ArrayRef<Type *> ETypes) { - DTy->setBody(ETypes, STy->isPacked()); - - // Steal STy's name. - if (STy->hasName()) { - SmallString<16> TmpName = STy->getName(); - STy->setName(""); - DTy->setName(TmpName); - } - - DstStructTypesSet.addNonOpaque(DTy); -} - -Type *TypeMapTy::get(Type *Ty) { - SmallPtrSet<StructType *, 8> Visited; - return get(Ty, Visited); -} - -Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) { - // If we already have an entry for this type, return it. - Type **Entry = &MappedTypes[Ty]; - if (*Entry) - return *Entry; - - // These are types that LLVM itself will unique. 
-  bool IsUniqued = !isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral();
-
-#ifndef NDEBUG
-  if (!IsUniqued) {
-    for (auto &Pair : MappedTypes) {
-      assert(!(Pair.first != Ty && Pair.second == Ty) &&
-             "mapping to a source type");
-    }
-  }
-#endif
-
-  if (!IsUniqued && !Visited.insert(cast<StructType>(Ty)).second) {
-    StructType *DTy = StructType::create(Ty->getContext());
-    return *Entry = DTy;
-  }
-
-  // If this is not a recursive type, then just map all of the elements and
-  // then rebuild the type from inside out.
-  SmallVector<Type *, 4> ElementTypes;
-
-  // If there are no element types to map, then the type is itself. This is
-  // true for the anonymous {} struct, things like 'float', integers, etc.
-  if (Ty->getNumContainedTypes() == 0 && IsUniqued)
-    return *Entry = Ty;
-
-  // Remap all of the elements, keeping track of whether any of them change.
-  bool AnyChange = false;
-  ElementTypes.resize(Ty->getNumContainedTypes());
-  for (unsigned I = 0, E = Ty->getNumContainedTypes(); I != E; ++I) {
-    ElementTypes[I] = get(Ty->getContainedType(I), Visited);
-    AnyChange |= ElementTypes[I] != Ty->getContainedType(I);
-  }
-
-  // If we found our type while recursively processing stuff, just use it.
-  Entry = &MappedTypes[Ty];
-  if (*Entry) {
-    if (auto *DTy = dyn_cast<StructType>(*Entry)) {
-      if (DTy->isOpaque()) {
-        auto *STy = cast<StructType>(Ty);
-        finishType(DTy, STy, ElementTypes);
-      }
-    }
-    return *Entry;
-  }
-
-  // If all of the element types mapped directly over and the type is not
-  // a named struct, then the type is usable as-is.
-  if (!AnyChange && IsUniqued)
-    return *Entry = Ty;
-
-  // Otherwise, rebuild a modified type.
-  switch (Ty->getTypeID()) {
-  default:
-    llvm_unreachable("unknown derived type to remap");
-  case Type::ArrayTyID:
-    return *Entry = ArrayType::get(ElementTypes[0],
-                                   cast<ArrayType>(Ty)->getNumElements());
-  case Type::VectorTyID:
-    return *Entry = VectorType::get(ElementTypes[0],
-                                    cast<VectorType>(Ty)->getNumElements());
-  case Type::PointerTyID:
-    return *Entry = PointerType::get(ElementTypes[0],
-                                     cast<PointerType>(Ty)->getAddressSpace());
-  case Type::FunctionTyID:
-    return *Entry = FunctionType::get(ElementTypes[0],
-                                      makeArrayRef(ElementTypes).slice(1),
-                                      cast<FunctionType>(Ty)->isVarArg());
-  case Type::StructTyID: {
-    auto *STy = cast<StructType>(Ty);
-    bool IsPacked = STy->isPacked();
-    if (IsUniqued)
-      return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked);
-
-    // If the type is opaque, we can just use it directly.
-    if (STy->isOpaque()) {
-      DstStructTypesSet.addOpaque(STy);
-      return *Entry = Ty;
-    }
-
-    if (StructType *OldT =
-            DstStructTypesSet.findNonOpaque(ElementTypes, IsPacked)) {
-      STy->setName("");
-      return *Entry = OldT;
-    }
-
-    if (!AnyChange) {
-      DstStructTypesSet.addNonOpaque(STy);
-      return *Entry = Ty;
-    }
-
-    StructType *DTy = StructType::create(Ty->getContext());
-    finishType(DTy, STy, ElementTypes);
-    return *Entry = DTy;
-  }
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// ModuleLinker implementation.
-//===----------------------------------------------------------------------===//
-
-namespace {
-class ModuleLinker;
-
-/// Creates prototypes for functions that are lazily linked on the fly. This
-/// speeds up linking for modules with many lazily linked functions of which
-/// few get used.
-class ValueMaterializerTy : public ValueMaterializer { - TypeMapTy &TypeMap; - Module *DstM; - std::vector<GlobalValue *> &LazilyLinkGlobalValues; - -public: - ValueMaterializerTy(TypeMapTy &TypeMap, Module *DstM, - std::vector<GlobalValue *> &LazilyLinkGlobalValues) - : ValueMaterializer(), TypeMap(TypeMap), DstM(DstM), - LazilyLinkGlobalValues(LazilyLinkGlobalValues) {} - - Value *materializeValueFor(Value *V) override; -}; - -class LinkDiagnosticInfo : public DiagnosticInfo { - const Twine &Msg; - -public: - LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg); - void print(DiagnosticPrinter &DP) const override; -}; -LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity, - const Twine &Msg) - : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {} -void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; } /// This is an implementation class for the LinkModules function, which is the /// entrypoint for this file. class ModuleLinker { - Module *DstM, *SrcM; - - TypeMapTy TypeMap; - ValueMaterializerTy ValMaterializer; + IRMover &Mover; + Module &SrcM; - /// Mapping of values from what they used to be in Src, to what they are now - /// in DstM. ValueToValueMapTy is a ValueMap, which involves some overhead - /// due to the use of Value handles which the Linker doesn't actually need, - /// but this allows us to reuse the ValueMapper code. - ValueToValueMapTy ValueMap; + SetVector<GlobalValue *> ValuesToLink; + StringSet<> Internalize; - struct AppendingVarInfo { - GlobalVariable *NewGV; // New aggregate global in dest module. - const Constant *DstInit; // Old initializer from dest module. - const Constant *SrcInit; // Old initializer from src module. - }; - - std::vector<AppendingVarInfo> AppendingVars; + /// For symbol clashes, prefer those from Src. + unsigned Flags; - // Set of items not to link in from source. - SmallPtrSet<const Value *, 16> DoNotLinkFromSource; + /// Function index passed into ModuleLinker for using in function + /// importing/exporting handling. + const FunctionInfoIndex *ImportIndex; - // Vector of GlobalValues to lazily link in. - std::vector<GlobalValue *> LazilyLinkGlobalValues; + /// Functions to import from source module, all other functions are + /// imported as declarations instead of definitions. + DenseSet<const GlobalValue *> *FunctionsToImport; - /// Functions that have replaced other functions. - SmallPtrSet<const Function *, 16> OverridingFunctions; + /// Set to true if the given FunctionInfoIndex contains any functions + /// from this source module, in which case we must conservatively assume + /// that any of its functions may be imported into another module + /// as part of a different backend compilation process. + bool HasExportedFunctions = false; - DiagnosticHandlerFunction DiagnosticHandler; + /// Association between metadata value id and temporary metadata that + /// remains unmapped after function importing. Saved during function + /// importing and consumed during the metadata linking postpass. + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap; - /// For symbol clashes, prefer those from Src. - bool OverrideFromSrc; + /// Used as the callback for lazy linking. + /// The mover has just hit GV and we have to decide if it, and other members + /// of the same comdat, should be linked. Every member to be linked is passed + /// to Add. 
+ void addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add); -public: - ModuleLinker(Module *dstM, Linker::IdentifiedStructTypeSet &Set, Module *srcM, - DiagnosticHandlerFunction DiagnosticHandler, - bool OverrideFromSrc) - : DstM(dstM), SrcM(srcM), TypeMap(Set), - ValMaterializer(TypeMap, DstM, LazilyLinkGlobalValues), - DiagnosticHandler(DiagnosticHandler), OverrideFromSrc(OverrideFromSrc) { + bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; } + bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; } + bool shouldInternalizeLinkedSymbols() { + return Flags & Linker::InternalizeLinkedSymbols; } - bool run(); - -private: bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest, const GlobalValue &Src); - /// Helper method for setting a message and returning an error code. + /// Should we have mover and linker error diag info? bool emitError(const Twine &Message) { - DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message)); + SrcM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, Message)); return true; } - void emitWarning(const Twine &Message) { - DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message)); - } - - bool getComdatLeader(Module *M, StringRef ComdatName, + bool getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar); bool computeResultingSelectionKind(StringRef ComdatName, Comdat::SelectionKind Src, @@ -463,17 +85,20 @@ private: ComdatsChosen; bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK, bool &LinkFromSrc); + // Keep track of the global value members of each comdat in source. + DenseMap<const Comdat *, std::vector<GlobalValue *>> ComdatMembers; /// Given a global in the source module, return the global in the /// destination module that is being linked to, if any. GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) { + Module &DstM = Mover.getModule(); // If the source has no name it can't link. If it has local linkage, // there is no name match-up going on. - if (!SrcGV->hasName() || SrcGV->hasLocalLinkage()) + if (!SrcGV->hasName() || GlobalValue::isLocalLinkage(SrcGV->getLinkage())) return nullptr; // Otherwise see if we have a match in the destination module's symtab. - GlobalValue *DGV = DstM->getNamedValue(SrcGV->getName()); + GlobalValue *DGV = DstM.getNamedValue(SrcGV->getName()); if (!DGV) return nullptr; @@ -486,139 +111,305 @@ private: return DGV; } - void computeTypeMapping(); - - void upgradeMismatchedGlobalArray(StringRef Name); - void upgradeMismatchedGlobals(); + bool linkIfNeeded(GlobalValue &GV); - bool linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV); + /// Helper method to check if we are importing from the current source + /// module. + bool isPerformingImport() const { return FunctionsToImport != nullptr; } - bool linkGlobalValueProto(GlobalValue *GV); - bool linkModuleFlagsMetadata(); + /// If we are importing from the source module, checks if we should + /// import SGV as a definition, otherwise import as a declaration. 
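// A minimal sketch of the flag tests above: the linker options travel as a
// single bitmask, so each policy query is one mask-and-test. The enumerator
// values here are illustrative; only the access pattern mirrors the code above.
#include <cstdint>

namespace flagsketch {
enum : uint32_t {
  None = 0,
  OverrideFromSrc = 1u << 0,
  LinkOnlyNeeded = 1u << 1,
  InternalizeLinkedSymbols = 1u << 2,
};

inline bool shouldOverrideFromSrc(uint32_t Flags) {
  return (Flags & OverrideFromSrc) != 0;
}
} // namespace flagsketch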
+ bool doImportAsDefinition(const GlobalValue *SGV); - void linkAppendingVarInit(const AppendingVarInfo &AVI); - - void linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src); - bool linkFunctionBody(Function &Dst, Function &Src); - void linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src); - bool linkGlobalValueBody(GlobalValue &Src); +public: + ModuleLinker(IRMover &Mover, Module &SrcM, unsigned Flags, + const FunctionInfoIndex *Index = nullptr, + DenseSet<const GlobalValue *> *FunctionsToImport = nullptr, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap = nullptr) + : Mover(Mover), SrcM(SrcM), Flags(Flags), ImportIndex(Index), + FunctionsToImport(FunctionsToImport), + ValIDToTempMDMap(ValIDToTempMDMap) { + assert((ImportIndex || !FunctionsToImport) && + "Expect a FunctionInfoIndex when importing"); + // If we have a FunctionInfoIndex but no function to import, + // then this is the primary module being compiled in a ThinLTO + // backend compilation, and we need to see if it has functions that + // may be exported to another backend compilation. + if (ImportIndex && !FunctionsToImport) + HasExportedFunctions = ImportIndex->hasExportedFunctions(SrcM); + assert((ValIDToTempMDMap || !FunctionsToImport) && + "Function importing must provide a ValIDToTempMDMap"); + } - void linkNamedMDNodes(); - void stripReplacedSubprograms(); + bool run(); }; -} - -/// The LLVM SymbolTable class autorenames globals that conflict in the symbol -/// table. This is good for all clients except for us. Go through the trouble -/// to force this back. -static void forceRenaming(GlobalValue *GV, StringRef Name) { - // If the global doesn't force its name or if it already has the right name, - // there is nothing for us to do. - if (GV->hasLocalLinkage() || GV->getName() == Name) - return; - Module *M = GV->getParent(); +/// Class to handle necessary GlobalValue changes required by ThinLTO including +/// linkage changes and any necessary renaming. +class ThinLTOGlobalProcessing { + /// The Module which we are exporting or importing functions from. + Module &M; + + /// Function index passed in for function importing/exporting handling. + const FunctionInfoIndex *ImportIndex; + + /// Functions to import from this module, all other functions will be + /// imported as declarations instead of definitions. + DenseSet<const GlobalValue *> *FunctionsToImport; + + /// Set to true if the given FunctionInfoIndex contains any functions + /// from this source module, in which case we must conservatively assume + /// that any of its functions may be imported into another module + /// as part of a different backend compilation process. + bool HasExportedFunctions = false; + + /// Populated during ThinLTO global processing with locals promoted + /// to global scope in an exporting module, which now need to be linked + /// in if calling from the ModuleLinker. + SetVector<GlobalValue *> NewExportedValues; + + /// Check if we should promote the given local value to global scope. + bool doPromoteLocalToGlobal(const GlobalValue *SGV); + + /// Helper methods to check if we are importing from or potentially + /// exporting from the current source module. + bool isPerformingImport() const { return FunctionsToImport != nullptr; } + bool isModuleExporting() const { return HasExportedFunctions; } + + /// If we are importing from the source module, checks if we should + /// import SGV as a definition, otherwise import as a declaration. 
+  bool doImportAsDefinition(const GlobalValue *SGV);
+
+  /// Get the name for SGV that should be used in the linked destination
+  /// module. Specifically, this handles the case where we need to rename
+  /// a local that is being promoted to global scope.
+  std::string getName(const GlobalValue *SGV);
+
+  /// Process globals so that they can be used in ThinLTO. This includes
+  /// promoting local variables so that they can be referenced externally by
+  /// thin lto imported globals and converting strong external globals to
+  /// available_externally.
+  void processGlobalsForThinLTO();
+  void processGlobalForThinLTO(GlobalValue &GV);
+
+  /// Get the new linkage for SGV that should be used in the linked destination
+  /// module. Specifically, for ThinLTO importing or exporting it may need
+  /// to be adjusted.
+  GlobalValue::LinkageTypes getLinkage(const GlobalValue *SGV);

-  // If there is a conflict, rename the conflict.
-  if (GlobalValue *ConflictGV = M->getNamedValue(Name)) {
-    GV->takeName(ConflictGV);
-    ConflictGV->setName(Name); // This will cause ConflictGV to get renamed
-    assert(ConflictGV->getName() != Name && "forceRenaming didn't work");
-  } else {
-    GV->setName(Name); // Force the name back
+public:
+  ThinLTOGlobalProcessing(
+      Module &M, const FunctionInfoIndex *Index,
+      DenseSet<const GlobalValue *> *FunctionsToImport = nullptr)
+      : M(M), ImportIndex(Index), FunctionsToImport(FunctionsToImport) {
+    // If we have a FunctionInfoIndex but no function to import,
+    // then this is the primary module being compiled in a ThinLTO
+    // backend compilation, and we need to see if it has functions that
+    // may be exported to another backend compilation.
+    if (!FunctionsToImport)
+      HasExportedFunctions = ImportIndex->hasExportedFunctions(M);
   }
-}

-/// copy additional attributes (those not needed to construct a GlobalValue)
-/// from the SrcGV to the DestGV.
-static void copyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) {
-  DestGV->copyAttributesFrom(SrcGV);
-  forceRenaming(DestGV, SrcGV->getName());
+  bool run();
+
+  /// Access the promoted globals that are now exported and need to be linked.
+  SetVector<GlobalValue *> &getNewExportedValues() { return NewExportedValues; }
+};
 }

-static bool isLessConstraining(GlobalValue::VisibilityTypes a,
-                               GlobalValue::VisibilityTypes b) {
-  if (a == GlobalValue::HiddenVisibility)
-    return false;
-  if (b == GlobalValue::HiddenVisibility)
+/// Checks if we should import SGV as a definition, otherwise import as a
+/// declaration.
+static bool
+doImportAsDefinitionImpl(const GlobalValue *SGV,
+                         DenseSet<const GlobalValue *> *FunctionsToImport) {
+  auto *GA = dyn_cast<GlobalAlias>(SGV);
+  if (GA) {
+    if (GA->hasWeakAnyLinkage())
+      return false;
+    const GlobalObject *GO = GA->getBaseObject();
+    if (!GO->hasLinkOnceODRLinkage())
+      return false;
+    return doImportAsDefinitionImpl(GO, FunctionsToImport);
+  }
+  // Always import GlobalVariable definitions, except for the special
+  // case of WeakAny which are imported as ExternalWeak declarations
+  // (see comments in ModuleLinker::getLinkage). The linkage changes
+  // described in ModuleLinker::getLinkage ensure the correct behavior (e.g.
+  // global variables with external linkage are transformed to
+  // available_externally definitions, which are ultimately turned into
+  // declarations after the EliminateAvailableExternally pass).
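// A standalone sketch (hypothetical toy types) of the decision implemented
// by doImportAsDefinitionImpl above: aliases defer to their base object,
// weak_any is never imported as a definition, variable definitions are, and
// functions only when explicitly requested for import.
#include <set>
#include <string>

namespace importsketch {
enum Kind { Var, Func, Alias };

struct GV {
  Kind K;
  std::string Name;
  bool IsDeclaration;
  bool WeakAny;
  bool BaseIsLinkOnceODR;  // meaningful for aliases only
  const GV *Base = nullptr;
};

inline bool importAsDefinition(const GV &G,
                               const std::set<std::string> &ToImport) {
  if (G.K == Alias)
    return !G.WeakAny && G.BaseIsLinkOnceODR && G.Base &&
           importAsDefinition(*G.Base, ToImport);
  if (G.K == Var)
    return !G.IsDeclaration && !G.WeakAny;
  return ToImport.count(G.Name) != 0; // functions: only if requested
}
} // namespace importsketch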
+ if (isa<GlobalVariable>(SGV) && !SGV->isDeclaration() && + !SGV->hasWeakAnyLinkage()) return true; - if (a == GlobalValue::ProtectedVisibility) - return false; - if (b == GlobalValue::ProtectedVisibility) + // Only import the function requested for importing. + auto *SF = dyn_cast<Function>(SGV); + if (SF && FunctionsToImport->count(SF)) return true; + // Otherwise no. return false; } -/// Loop through the global variables in the src module and merge them into the -/// dest module. -static GlobalVariable *copyGlobalVariableProto(TypeMapTy &TypeMap, Module &DstM, - const GlobalVariable *SGVar) { - // No linking to be performed or linking from the source: simply create an - // identical version of the symbol over in the dest module... the - // initializer will be filled in later by LinkGlobalInits. - GlobalVariable *NewDGV = new GlobalVariable( - DstM, TypeMap.get(SGVar->getType()->getElementType()), - SGVar->isConstant(), SGVar->getLinkage(), /*init*/ nullptr, - SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), - SGVar->getType()->getAddressSpace()); - - return NewDGV; +bool ThinLTOGlobalProcessing::doImportAsDefinition(const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; + return doImportAsDefinitionImpl(SGV, FunctionsToImport); } -/// Link the function in the source module into the destination module if -/// needed, setting up mapping information. -static Function *copyFunctionProto(TypeMapTy &TypeMap, Module &DstM, - const Function *SF) { - // If there is no linkage to be performed or we are linking from the source, - // bring SF over. - return Function::Create(TypeMap.get(SF->getFunctionType()), SF->getLinkage(), - SF->getName(), &DstM); +bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; + return doImportAsDefinitionImpl(SGV, FunctionsToImport); } -/// Set up prototypes for any aliases that come over from the source module. -static GlobalAlias *copyGlobalAliasProto(TypeMapTy &TypeMap, Module &DstM, - const GlobalAlias *SGA) { - // If there is no linkage to be performed or we're linking from the source, - // bring over SGA. - auto *PTy = cast<PointerType>(TypeMap.get(SGA->getType())); - return GlobalAlias::create(PTy, SGA->getLinkage(), SGA->getName(), &DstM); -} +bool ThinLTOGlobalProcessing::doPromoteLocalToGlobal(const GlobalValue *SGV) { + assert(SGV->hasLocalLinkage()); + // Both the imported references and the original local variable must + // be promoted. + if (!isPerformingImport() && !isModuleExporting()) + return false; + + // Local const variables never need to be promoted unless they are address + // taken. The imported uses can simply use the clone created in this module. + // For now we are conservative in determining which variables are not + // address taken by checking the unnamed addr flag. To be more aggressive, + // the address taken information must be checked earlier during parsing + // of the module and recorded in the function index for use when importing + // from that module. 
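// A sketch of the promotion rename described above. The exact promoted-name
// format is owned by FunctionInfoIndex::getGlobalNameForLocal; assuming it is
// of the form "<name>.llvm.<moduleId>", the point is that the suffix comes
// from a per-module id, so locals named "f" in two different source modules
// cannot collide after promotion.
#include <cstdint>
#include <string>

static std::string promotedName(const std::string &LocalName,
                                uint64_t ModuleId) {
  // Hypothetical format; see getGlobalNameForLocal for the real one.
  return LocalName + ".llvm." + std::to_string(ModuleId);
}
// e.g. promotedName("f", 3) == "f.llvm.3"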
+  auto *GVar = dyn_cast<GlobalVariable>(SGV);
+  if (GVar && GVar->isConstant() && GVar->hasUnnamedAddr())
+    return false;
-static GlobalValue *copyGlobalValueProto(TypeMapTy &TypeMap, Module &DstM,
-                                         const GlobalValue *SGV) {
-  GlobalValue *NewGV;
-  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV))
-    NewGV = copyGlobalVariableProto(TypeMap, DstM, SGVar);
-  else if (auto *SF = dyn_cast<Function>(SGV))
-    NewGV = copyFunctionProto(TypeMap, DstM, SF);
-  else
-    NewGV = copyGlobalAliasProto(TypeMap, DstM, cast<GlobalAlias>(SGV));
-  copyGVAttributes(NewGV, SGV);
-  return NewGV;
+  // Eventually we only need to promote functions in the exporting module that
+  // are referenced by a potentially exported function (i.e. one that is in the
+  // function index).
+  return true;
 }

-Value *ValueMaterializerTy::materializeValueFor(Value *V) {
-  auto *SGV = dyn_cast<GlobalValue>(V);
-  if (!SGV)
-    return nullptr;
+std::string ThinLTOGlobalProcessing::getName(const GlobalValue *SGV) {
+  // For locals that must be promoted to global scope, ensure that
+  // the promoted name uniquely identifies the copy in the original module,
+  // using the ID assigned during combined index creation. When importing,
+  // we rename all locals (not just those that are promoted) in order to
+  // avoid naming conflicts between locals imported from different modules.
+  if (SGV->hasLocalLinkage() &&
+      (doPromoteLocalToGlobal(SGV) || isPerformingImport()))
+    return FunctionInfoIndex::getGlobalNameForLocal(
+        SGV->getName(),
+        ImportIndex->getModuleId(SGV->getParent()->getModuleIdentifier()));
+  return SGV->getName();
+}
+
+GlobalValue::LinkageTypes
+ThinLTOGlobalProcessing::getLinkage(const GlobalValue *SGV) {
+  // Any local variable that is referenced by an exported function needs
+  // to be promoted to global scope. Since we don't currently know which
+  // functions reference which local variables/functions, we must treat
+  // all as potentially exported if this module is exporting anything.
+  if (isModuleExporting()) {
+    if (SGV->hasLocalLinkage() && doPromoteLocalToGlobal(SGV))
+      return GlobalValue::ExternalLinkage;
+    return SGV->getLinkage();
+  }
+
+  // Otherwise, if we aren't importing, no linkage change is needed.
+  if (!isPerformingImport())
+    return SGV->getLinkage();
+
+  switch (SGV->getLinkage()) {
+  case GlobalValue::ExternalLinkage:
+    // External definitions are converted to available_externally
+    // definitions upon import, so that they are available for inlining
+    // and/or optimization, but are turned into declarations later
+    // during the EliminateAvailableExternally pass.
+    if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV))
+      return GlobalValue::AvailableExternallyLinkage;
+    // An imported external declaration stays external.
+    return SGV->getLinkage();
+
+  case GlobalValue::AvailableExternallyLinkage:
+    // An imported available_externally definition converts
+    // to external if imported as a declaration.
+    if (!doImportAsDefinition(SGV))
+      return GlobalValue::ExternalLinkage;
+    // An imported available_externally declaration stays that way.
+    return SGV->getLinkage();
+
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+    // These both stay the same when importing the definition.
+    // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage(); + + case GlobalValue::WeakAnyLinkage: + // Can't import weak_any definitions correctly, or we might change the + // program semantics, since the linker will pick the first weak_any + // definition and importing would change the order they are seen by the + // linker. The module linking caller needs to enforce this. + assert(!doImportAsDefinition(SGV)); + // If imported as a declaration, it becomes external_weak. + return GlobalValue::ExternalWeakLinkage; + + case GlobalValue::WeakODRLinkage: + // For weak_odr linkage, there is a guarantee that all copies will be + // equivalent, so the issue described above for weak_any does not exist, + // and the definition can be imported. It can be treated similarly + // to an imported externally visible global value. + if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV)) + return GlobalValue::AvailableExternallyLinkage; + else + return GlobalValue::ExternalLinkage; + + case GlobalValue::AppendingLinkage: + // It would be incorrect to import an appending linkage variable, + // since it would cause global constructors/destructors to be + // executed multiple times. This should have already been handled + // by linkIfNeeded, and we will assert in shouldLinkFromSource + // if we try to import, so we simply return AppendingLinkage. + return GlobalValue::AppendingLinkage; + + case GlobalValue::InternalLinkage: + case GlobalValue::PrivateLinkage: + // If we are promoting the local to global scope, it is handled + // similarly to a normal externally visible global. + if (doPromoteLocalToGlobal(SGV)) { + if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV)) + return GlobalValue::AvailableExternallyLinkage; + else + return GlobalValue::ExternalLinkage; + } + // A non-promoted imported local definition stays local. + // The ThinLTO pass will eventually force-import their definitions. + return SGV->getLinkage(); - GlobalValue *DGV = copyGlobalValueProto(TypeMap, *DstM, SGV); + case GlobalValue::ExternalWeakLinkage: + // External weak doesn't apply to definitions, must be a declaration. + assert(!doImportAsDefinition(SGV)); + // Linkage stays external_weak. + return SGV->getLinkage(); - if (Comdat *SC = SGV->getComdat()) { - if (auto *DGO = dyn_cast<GlobalObject>(DGV)) { - Comdat *DC = DstM->getOrInsertComdat(SC->getName()); - DGO->setComdat(DC); - } + case GlobalValue::CommonLinkage: + // Linkage stays common on definitions. + // The ThinLTO pass will eventually force-import their definitions. 
+ return SGV->getLinkage(); } - LazilyLinkGlobalValues.push_back(SGV); - return DGV; + llvm_unreachable("unknown linkage type"); +} + +static GlobalValue::VisibilityTypes +getMinVisibility(GlobalValue::VisibilityTypes A, + GlobalValue::VisibilityTypes B) { + if (A == GlobalValue::HiddenVisibility || B == GlobalValue::HiddenVisibility) + return GlobalValue::HiddenVisibility; + if (A == GlobalValue::ProtectedVisibility || + B == GlobalValue::ProtectedVisibility) + return GlobalValue::ProtectedVisibility; + return GlobalValue::DefaultVisibility; } -bool ModuleLinker::getComdatLeader(Module *M, StringRef ComdatName, +bool ModuleLinker::getComdatLeader(Module &M, StringRef ComdatName, const GlobalVariable *&GVar) { - const GlobalValue *GVal = M->getNamedValue(ComdatName); + const GlobalValue *GVal = M.getNamedValue(ComdatName); if (const auto *GA = dyn_cast_or_null<GlobalAlias>(GVal)) { GVal = GA->getBaseObject(); if (!GVal) @@ -641,6 +432,7 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, Comdat::SelectionKind Dst, Comdat::SelectionKind &Result, bool &LinkFromSrc) { + Module &DstM = Mover.getModule(); // The ability to mix Comdat::SelectionKind::Any with // Comdat::SelectionKind::Largest is a behavior that comes from COFF. bool DstAnyOrLargest = Dst == Comdat::SelectionKind::Any || @@ -677,8 +469,8 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, getComdatLeader(SrcM, ComdatName, SrcGV)) return true; - const DataLayout &DstDL = DstM->getDataLayout(); - const DataLayout &SrcDL = SrcM->getDataLayout(); + const DataLayout &DstDL = DstM.getDataLayout(); + const DataLayout &SrcDL = SrcM.getDataLayout(); uint64_t DstSize = DstDL.getTypeAllocSize(DstGV->getType()->getPointerElementType()); uint64_t SrcSize = @@ -708,9 +500,10 @@ bool ModuleLinker::computeResultingSelectionKind(StringRef ComdatName, bool ModuleLinker::getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &Result, bool &LinkFromSrc) { + Module &DstM = Mover.getModule(); Comdat::SelectionKind SSK = SrcC->getSelectionKind(); StringRef ComdatName = SrcC->getName(); - Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable(); + Module::ComdatSymTabType &ComdatSymTab = DstM.getComdatSymbolTable(); Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName); if (DstCI == ComdatSymTab.end()) { @@ -729,14 +522,17 @@ bool ModuleLinker::getComdatResult(const Comdat *SrcC, bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest, const GlobalValue &Src) { + // Should we unconditionally use the Src? - if (OverrideFromSrc) { + if (shouldOverrideFromSrc()) { LinkFromSrc = true; return false; } // We always have to add Src if it has appending linkage. if (Src.hasAppendingLinkage()) { + // Should have prevented importing for appending linkage in linkIfNeeded. + assert(!isPerformingImport()); LinkFromSrc = true; return false; } @@ -744,6 +540,28 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, bool SrcIsDeclaration = Src.isDeclarationForLinker(); bool DestIsDeclaration = Dest.isDeclarationForLinker(); + if (isPerformingImport()) { + if (isa<Function>(&Src)) { + // For functions, LinkFromSrc iff this is a function requested + // for importing. For variables, decide below normally. + LinkFromSrc = FunctionsToImport->count(&Src); + return false; + } + + // Check if this is an alias with an already existing definition + // in Dest, which must have come from a prior importing pass from + // the same Src module. 
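// The getMinVisibility() helper above picks the most constraining of two
// visibilities (hidden beats protected beats default). A standalone
// restatement with illustrative enumerator values ordered by strictness:
#include <algorithm>

namespace vissketch {
enum Visibility { Default = 0, Protected = 1, Hidden = 2 };

inline Visibility minVisibility(Visibility A, Visibility B) {
  // Larger value = more constraining, so the "minimum" visibility is max().
  return std::max(A, B);
}
} // namespace vissketch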
Unlike imported function and variable + // definitions, which are imported as available_externally and are + // not definitions for the linker, that is not a valid linkage for + // imported aliases which must be definitions. Simply use the existing + // Dest copy. + if (isa<GlobalAlias>(&Src) && !DestIsDeclaration) { + assert(isa<GlobalAlias>(&Dest)); + LinkFromSrc = false; + return false; + } + } + if (SrcIsDeclaration) { // If Src is external or if both Src & Dest are external.. Just link the // external globals, we aren't adding anything. @@ -753,7 +571,12 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, return false; } // If the Dest is weak, use the source linkage. - LinkFromSrc = Dest.hasExternalWeakLinkage(); + if (Dest.hasExternalWeakLinkage()) { + LinkFromSrc = true; + return false; + } + // Link an available_externally over a declaration. + LinkFromSrc = !Src.isDeclaration() && Dest.isDeclaration(); return false; } @@ -808,730 +631,122 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc, "': symbol multiply defined!"); } -/// Loop over all of the linked values to compute type mappings. For example, -/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct -/// types 'Foo' but one got renamed when the module was loaded into the same -/// LLVMContext. -void ModuleLinker::computeTypeMapping() { - for (GlobalValue &SGV : SrcM->globals()) { - GlobalValue *DGV = getLinkedToGlobal(&SGV); - if (!DGV) - continue; - - if (!DGV->hasAppendingLinkage() || !SGV.hasAppendingLinkage()) { - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - continue; - } - - // Unify the element type of appending arrays. - ArrayType *DAT = cast<ArrayType>(DGV->getType()->getElementType()); - ArrayType *SAT = cast<ArrayType>(SGV.getType()->getElementType()); - TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); - } - - for (GlobalValue &SGV : *SrcM) { - if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - } - - for (GlobalValue &SGV : SrcM->aliases()) { - if (GlobalValue *DGV = getLinkedToGlobal(&SGV)) - TypeMap.addTypeMapping(DGV->getType(), SGV.getType()); - } - - // Incorporate types by name, scanning all the types in the source module. - // At this point, the destination module may have a type "%foo = { i32 }" for - // example. When the source module got loaded into the same LLVMContext, if - // it had the same type, it would have been renamed to "%foo.42 = { i32 }". - std::vector<StructType *> Types = SrcM->getIdentifiedStructTypes(); - for (StructType *ST : Types) { - if (!ST->hasName()) - continue; - - // Check to see if there is a dot in the name followed by a digit. - size_t DotPos = ST->getName().rfind('.'); - if (DotPos == 0 || DotPos == StringRef::npos || - ST->getName().back() == '.' || - !isdigit(static_cast<unsigned char>(ST->getName()[DotPos + 1]))) - continue; +bool ModuleLinker::linkIfNeeded(GlobalValue &GV) { + GlobalValue *DGV = getLinkedToGlobal(&GV); - // Check to see if the destination module has a struct with the prefix name. - StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos)); - if (!DST) - continue; - - // Don't use it if this actually came from the source module. They're in - // the same LLVMContext after all. Also don't use it unless the type is - // actually used in the destination module. 
This can happen in situations - // like this: - // - // Module A Module B - // -------- -------- - // %Z = type { %A } %B = type { %C.1 } - // %A = type { %B.1, [7 x i8] } %C.1 = type { i8* } - // %B.1 = type { %C } %A.2 = type { %B.3, [5 x i8] } - // %C = type { i8* } %B.3 = type { %C.1 } - // - // When we link Module B with Module A, the '%B' in Module B is - // used. However, that would then use '%C.1'. But when we process '%C.1', - // we prefer to take the '%C' version. So we are then left with both - // '%C.1' and '%C' being used for the same types. This leads to some - // variables using one type and some using the other. - if (TypeMap.DstStructTypesSet.hasType(DST)) - TypeMap.addTypeMapping(DST, ST); - } - - // Now that we have discovered all of the type equivalences, get a body for - // any 'opaque' types in the dest module that are now resolved. - TypeMap.linkDefinedTypeBodies(); -} + if (shouldLinkOnlyNeeded() && !(DGV && DGV->isDeclaration())) + return false; -static void upgradeGlobalArray(GlobalVariable *GV) { - ArrayType *ATy = cast<ArrayType>(GV->getType()->getElementType()); - StructType *OldTy = cast<StructType>(ATy->getElementType()); - assert(OldTy->getNumElements() == 2 && "Expected to upgrade from 2 elements"); - - // Get the upgraded 3 element type. - PointerType *VoidPtrTy = Type::getInt8Ty(GV->getContext())->getPointerTo(); - Type *Tys[3] = {OldTy->getElementType(0), OldTy->getElementType(1), - VoidPtrTy}; - StructType *NewTy = StructType::get(GV->getContext(), Tys, false); - - // Build new constants with a null third field filled in. - Constant *OldInitC = GV->getInitializer(); - ConstantArray *OldInit = dyn_cast<ConstantArray>(OldInitC); - if (!OldInit && !isa<ConstantAggregateZero>(OldInitC)) - // Invalid initializer; give up. - return; - std::vector<Constant *> Initializers; - if (OldInit && OldInit->getNumOperands()) { - Value *Null = Constant::getNullValue(VoidPtrTy); - for (Use &U : OldInit->operands()) { - ConstantStruct *Init = cast<ConstantStruct>(U.get()); - Initializers.push_back(ConstantStruct::get( - NewTy, Init->getOperand(0), Init->getOperand(1), Null, nullptr)); + if (DGV && !GV.hasLocalLinkage() && !GV.hasAppendingLinkage()) { + auto *DGVar = dyn_cast<GlobalVariable>(DGV); + auto *SGVar = dyn_cast<GlobalVariable>(&GV); + if (DGVar && SGVar) { + if (DGVar->isDeclaration() && SGVar->isDeclaration() && + (!DGVar->isConstant() || !SGVar->isConstant())) { + DGVar->setConstant(false); + SGVar->setConstant(false); + } + if (DGVar->hasCommonLinkage() && SGVar->hasCommonLinkage()) { + unsigned Align = std::max(DGVar->getAlignment(), SGVar->getAlignment()); + SGVar->setAlignment(Align); + DGVar->setAlignment(Align); + } } - } - assert(Initializers.size() == ATy->getNumElements() && - "Failed to copy all array elements"); - - // Replace the old GV with a new one. - ATy = ArrayType::get(NewTy, Initializers.size()); - Constant *NewInit = ConstantArray::get(ATy, Initializers); - GlobalVariable *NewGV = new GlobalVariable( - *GV->getParent(), ATy, GV->isConstant(), GV->getLinkage(), NewInit, "", - GV, GV->getThreadLocalMode(), GV->getType()->getAddressSpace(), - GV->isExternallyInitialized()); - NewGV->copyAttributesFrom(GV); - NewGV->takeName(GV); - assert(GV->use_empty() && "program cannot use initializer list"); - GV->eraseFromParent(); -} -void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) { - // Look for the global arrays. 
- auto *DstGV = dyn_cast_or_null<GlobalVariable>(DstM->getNamedValue(Name)); - if (!DstGV) - return; - auto *SrcGV = dyn_cast_or_null<GlobalVariable>(SrcM->getNamedValue(Name)); - if (!SrcGV) - return; + GlobalValue::VisibilityTypes Visibility = + getMinVisibility(DGV->getVisibility(), GV.getVisibility()); + DGV->setVisibility(Visibility); + GV.setVisibility(Visibility); - // Check if the types already match. - auto *DstTy = cast<ArrayType>(DstGV->getType()->getElementType()); - auto *SrcTy = - cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType())); - if (DstTy == SrcTy) - return; - - // Grab the element types. We can only upgrade an array of a two-field - // struct. Only bother if the other one has three-fields. - auto *DstEltTy = cast<StructType>(DstTy->getElementType()); - auto *SrcEltTy = cast<StructType>(SrcTy->getElementType()); - if (DstEltTy->getNumElements() == 2 && SrcEltTy->getNumElements() == 3) { - upgradeGlobalArray(DstGV); - return; + bool HasUnnamedAddr = GV.hasUnnamedAddr() && DGV->hasUnnamedAddr(); + DGV->setUnnamedAddr(HasUnnamedAddr); + GV.setUnnamedAddr(HasUnnamedAddr); } - if (DstEltTy->getNumElements() == 3 && SrcEltTy->getNumElements() == 2) - upgradeGlobalArray(SrcGV); - - // We can't upgrade any other differences. -} - -void ModuleLinker::upgradeMismatchedGlobals() { - upgradeMismatchedGlobalArray("llvm.global_ctors"); - upgradeMismatchedGlobalArray("llvm.global_dtors"); -} - -/// If there were any appending global variables, link them together now. -/// Return true on error. -bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, - const GlobalVariable *SrcGV) { - - if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage()) - return emitError("Linking globals named '" + SrcGV->getName() + - "': can only link appending global with another appending global!"); - - ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType()); - ArrayType *SrcTy = - cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType())); - Type *EltTy = DstTy->getElementType(); - - // Check to see that they two arrays agree on type. - if (EltTy != SrcTy->getElementType()) - return emitError("Appending variables with different element types!"); - if (DstGV->isConstant() != SrcGV->isConstant()) - return emitError("Appending variables linked with different const'ness!"); - - if (DstGV->getAlignment() != SrcGV->getAlignment()) - return emitError( - "Appending variables with different alignment need to be linked!"); - - if (DstGV->getVisibility() != SrcGV->getVisibility()) - return emitError( - "Appending variables with different visibility need to be linked!"); - - if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr()) - return emitError( - "Appending variables with different unnamed_addr need to be linked!"); - - if (StringRef(DstGV->getSection()) != SrcGV->getSection()) - return emitError( - "Appending variables with different section name need to be linked!"); - - uint64_t NewSize = DstTy->getNumElements() + SrcTy->getNumElements(); - ArrayType *NewType = ArrayType::get(EltTy, NewSize); - - // Create the new global variable. - GlobalVariable *NG = - new GlobalVariable(*DstGV->getParent(), NewType, SrcGV->isConstant(), - DstGV->getLinkage(), /*init*/nullptr, /*name*/"", DstGV, - DstGV->getThreadLocalMode(), - DstGV->getType()->getAddressSpace()); - // Propagate alignment, visibility and section info. 
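// What the appending-linkage merge above boils down to, as a standalone
// sketch: the destination array is rebuilt with |Dst| + |Src| elements and
// the source entries concatenated behind the destination's (think of two
// llvm.global_ctors arrays becoming one).
#include <vector>

template <typename T>
std::vector<T> appendArrays(const std::vector<T> &Dst,
                            const std::vector<T> &Src) {
  std::vector<T> Merged;
  Merged.reserve(Dst.size() + Src.size());
  Merged.insert(Merged.end(), Dst.begin(), Dst.end());
  Merged.insert(Merged.end(), Src.begin(), Src.end());
  return Merged;
}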
- copyGVAttributes(NG, DstGV); - - AppendingVarInfo AVI; - AVI.NewGV = NG; - AVI.DstInit = DstGV->getInitializer(); - AVI.SrcInit = SrcGV->getInitializer(); - AppendingVars.push_back(AVI); - - // Replace any uses of the two global variables with uses of the new - // global. - ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType())); - - DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType())); - DstGV->eraseFromParent(); - - // Track the source variable so we don't try to link it. - DoNotLinkFromSource.insert(SrcGV); - - return false; -} + // Don't want to append to global_ctors list, for example, when we + // are importing for ThinLTO, otherwise the global ctors and dtors + // get executed multiple times for local variables (the latter causing + // double frees). + if (GV.hasAppendingLinkage() && isPerformingImport()) + return false; -bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { - GlobalValue *DGV = getLinkedToGlobal(SGV); + if (isPerformingImport() && !doImportAsDefinition(&GV)) + return false; - // Handle the ultra special appending linkage case first. - if (DGV && DGV->hasAppendingLinkage()) - return linkAppendingVarProto(cast<GlobalVariable>(DGV), - cast<GlobalVariable>(SGV)); + if (!DGV && !shouldOverrideFromSrc() && + (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() || + GV.hasAvailableExternallyLinkage())) + return false; - bool LinkFromSrc = true; - Comdat *C = nullptr; - GlobalValue::VisibilityTypes Visibility = SGV->getVisibility(); - bool HasUnnamedAddr = SGV->hasUnnamedAddr(); + if (GV.isDeclaration()) + return false; - if (const Comdat *SC = SGV->getComdat()) { + if (const Comdat *SC = GV.getComdat()) { + bool LinkFromSrc; Comdat::SelectionKind SK; std::tie(SK, LinkFromSrc) = ComdatsChosen[SC]; - C = DstM->getOrInsertComdat(SC->getName()); - C->setSelectionKind(SK); - } else if (DGV) { - if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV)) - return true; - } - - if (!LinkFromSrc) { - // Track the source global so that we don't attempt to copy it over when - // processing global initializers. - DoNotLinkFromSource.insert(SGV); - - if (DGV) - // Make sure to remember this mapping. - ValueMap[SGV] = - ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType())); - } - - if (DGV) { - Visibility = isLessConstraining(Visibility, DGV->getVisibility()) - ? DGV->getVisibility() - : Visibility; - HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr(); - } - - if (!LinkFromSrc && !DGV) + if (LinkFromSrc) + ValuesToLink.insert(&GV); return false; - - GlobalValue *NewGV; - if (!LinkFromSrc) { - NewGV = DGV; - } else { - // If the GV is to be lazily linked, don't create it just yet. - // The ValueMaterializerTy will deal with creating it if it's used. 
- if (!DGV && !OverrideFromSrc && - (SGV->hasLocalLinkage() || SGV->hasLinkOnceLinkage() || - SGV->hasAvailableExternallyLinkage())) { - DoNotLinkFromSource.insert(SGV); - return false; - } - - NewGV = copyGlobalValueProto(TypeMap, *DstM, SGV); - - if (DGV && isa<Function>(DGV)) - if (auto *NewF = dyn_cast<Function>(NewGV)) - OverridingFunctions.insert(NewF); - } - - NewGV->setUnnamedAddr(HasUnnamedAddr); - NewGV->setVisibility(Visibility); - - if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) { - if (C) - NewGO->setComdat(C); - - if (DGV && DGV->hasCommonLinkage() && SGV->hasCommonLinkage()) - NewGO->setAlignment(std::max(DGV->getAlignment(), SGV->getAlignment())); - } - - if (auto *NewGVar = dyn_cast<GlobalVariable>(NewGV)) { - auto *DGVar = dyn_cast_or_null<GlobalVariable>(DGV); - auto *SGVar = dyn_cast<GlobalVariable>(SGV); - if (DGVar && SGVar && DGVar->isDeclaration() && SGVar->isDeclaration() && - (!DGVar->isConstant() || !SGVar->isConstant())) - NewGVar->setConstant(false); - } - - // Make sure to remember this mapping. - if (NewGV != DGV) { - if (DGV) { - DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewGV, DGV->getType())); - DGV->eraseFromParent(); - } - ValueMap[SGV] = NewGV; - } - - return false; -} - -static void getArrayElements(const Constant *C, - SmallVectorImpl<Constant *> &Dest) { - unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements(); - - for (unsigned i = 0; i != NumElements; ++i) - Dest.push_back(C->getAggregateElement(i)); -} - -void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) { - // Merge the initializer. - SmallVector<Constant *, 16> DstElements; - getArrayElements(AVI.DstInit, DstElements); - - SmallVector<Constant *, 16> SrcElements; - getArrayElements(AVI.SrcInit, SrcElements); - - ArrayType *NewType = cast<ArrayType>(AVI.NewGV->getType()->getElementType()); - - StringRef Name = AVI.NewGV->getName(); - bool IsNewStructor = - (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") && - cast<StructType>(NewType->getElementType())->getNumElements() == 3; - - for (auto *V : SrcElements) { - if (IsNewStructor) { - Constant *Key = V->getAggregateElement(2); - if (DoNotLinkFromSource.count(Key)) - continue; - } - DstElements.push_back( - MapValue(V, ValueMap, RF_None, &TypeMap, &ValMaterializer)); } - if (IsNewStructor) { - NewType = ArrayType::get(NewType->getElementType(), DstElements.size()); - AVI.NewGV->mutateType(PointerType::get(NewType, 0)); - } - - AVI.NewGV->setInitializer(ConstantArray::get(NewType, DstElements)); -} -/// Update the initializers in the Dest module now that all globals that may be -/// referenced are in Dest. -void ModuleLinker::linkGlobalInit(GlobalVariable &Dst, GlobalVariable &Src) { - // Figure out what the initializer looks like in the dest module. - Dst.setInitializer(MapValue(Src.getInitializer(), ValueMap, RF_None, &TypeMap, - &ValMaterializer)); -} - -/// Copy the source function over into the dest function and fix up references -/// to values. At this point we know that Dest is an external function, and -/// that Src is not. -bool ModuleLinker::linkFunctionBody(Function &Dst, Function &Src) { - assert(Dst.isDeclaration() && !Src.isDeclaration()); - - // Materialize if needed. - if (std::error_code EC = Src.materialize()) - return emitError(EC.message()); - - // Link in the prefix data. - if (Src.hasPrefixData()) - Dst.setPrefixData(MapValue(Src.getPrefixData(), ValueMap, RF_None, &TypeMap, - &ValMaterializer)); - - // Link in the prologue data. 
- if (Src.hasPrologueData()) - Dst.setPrologueData(MapValue(Src.getPrologueData(), ValueMap, RF_None, - &TypeMap, &ValMaterializer)); - - // Link in the personality function. - if (Src.hasPersonalityFn()) - Dst.setPersonalityFn(MapValue(Src.getPersonalityFn(), ValueMap, RF_None, - &TypeMap, &ValMaterializer)); - - // Go through and convert function arguments over, remembering the mapping. - Function::arg_iterator DI = Dst.arg_begin(); - for (Argument &Arg : Src.args()) { - DI->setName(Arg.getName()); // Copy the name over. - - // Add a mapping to our mapping. - ValueMap[&Arg] = DI; - ++DI; - } - - // Copy over the metadata attachments. - SmallVector<std::pair<unsigned, MDNode *>, 8> MDs; - Src.getAllMetadata(MDs); - for (const auto &I : MDs) - Dst.setMetadata(I.first, MapMetadata(I.second, ValueMap, RF_None, &TypeMap, - &ValMaterializer)); - - // Splice the body of the source function into the dest function. - Dst.getBasicBlockList().splice(Dst.end(), Src.getBasicBlockList()); - - // At this point, all of the instructions and values of the function are now - // copied over. The only problem is that they are still referencing values in - // the Source function as operands. Loop through all of the operands of the - // functions and patch them up to point to the local versions. - for (BasicBlock &BB : Dst) - for (Instruction &I : BB) - RemapInstruction(&I, ValueMap, RF_IgnoreMissingEntries, &TypeMap, - &ValMaterializer); - - // There is no need to map the arguments anymore. - for (Argument &Arg : Src.args()) - ValueMap.erase(&Arg); - - Src.dematerialize(); - return false; -} - -void ModuleLinker::linkAliasBody(GlobalAlias &Dst, GlobalAlias &Src) { - Constant *Aliasee = Src.getAliasee(); - Constant *Val = - MapValue(Aliasee, ValueMap, RF_None, &TypeMap, &ValMaterializer); - Dst.setAliasee(Val); -} - -bool ModuleLinker::linkGlobalValueBody(GlobalValue &Src) { - Value *Dst = ValueMap[&Src]; - assert(Dst); - if (auto *F = dyn_cast<Function>(&Src)) - return linkFunctionBody(cast<Function>(*Dst), *F); - if (auto *GVar = dyn_cast<GlobalVariable>(&Src)) { - linkGlobalInit(cast<GlobalVariable>(*Dst), *GVar); - return false; - } - linkAliasBody(cast<GlobalAlias>(*Dst), cast<GlobalAlias>(Src)); + bool LinkFromSrc = true; + if (DGV && shouldLinkFromSource(LinkFromSrc, *DGV, GV)) + return true; + if (LinkFromSrc) + ValuesToLink.insert(&GV); return false; } -/// Insert all of the named MDNodes in Src into the Dest module. -void ModuleLinker::linkNamedMDNodes() { - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - for (const NamedMDNode &NMD : SrcM->named_metadata()) { - // Don't link module flags here. Do them separately. - if (&NMD == SrcModFlags) - continue; - NamedMDNode *DestNMD = DstM->getOrInsertNamedMetadata(NMD.getName()); - // Add Src elements into Dest node. - for (const MDNode *op : NMD.operands()) - DestNMD->addOperand( - MapMetadata(op, ValueMap, RF_None, &TypeMap, &ValMaterializer)); - } -} - -/// Drop DISubprograms that have been superseded. -/// -/// FIXME: this creates an asymmetric result: we strip functions from losing -/// subprograms in DstM, but leave losing subprograms in SrcM. -/// TODO: Remove this logic once the backend can correctly determine canonical -/// subprograms. -void ModuleLinker::stripReplacedSubprograms() { - // Avoid quadratic runtime by returning early when there's nothing to do. 
- if (OverridingFunctions.empty()) +void ModuleLinker::addLazyFor(GlobalValue &GV, IRMover::ValueAdder Add) { + // Add these to the internalize list + if (!GV.hasLinkOnceLinkage()) return; - // Move the functions now, so the set gets cleared even on early returns. - auto Functions = std::move(OverridingFunctions); - OverridingFunctions.clear(); + if (shouldInternalizeLinkedSymbols()) + Internalize.insert(GV.getName()); + Add(GV); - // Drop functions from subprograms if they've been overridden by the new - // compile unit. - NamedMDNode *CompileUnits = DstM->getNamedMetadata("llvm.dbg.cu"); - if (!CompileUnits) + const Comdat *SC = GV.getComdat(); + if (!SC) + return; + for (GlobalValue *GV2 : ComdatMembers[SC]) { + if (!GV2->hasLocalLinkage() && shouldInternalizeLinkedSymbols()) + Internalize.insert(GV2->getName()); + Add(*GV2); + } +} + +void ThinLTOGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { + if (GV.hasLocalLinkage() && + (doPromoteLocalToGlobal(&GV) || isPerformingImport())) { + GV.setName(getName(&GV)); + GV.setLinkage(getLinkage(&GV)); + if (!GV.hasLocalLinkage()) + GV.setVisibility(GlobalValue::HiddenVisibility); + if (isModuleExporting()) + NewExportedValues.insert(&GV); return; - for (unsigned I = 0, E = CompileUnits->getNumOperands(); I != E; ++I) { - auto *CU = cast<DICompileUnit>(CompileUnits->getOperand(I)); - assert(CU && "Expected valid compile unit"); - - for (DISubprogram *SP : CU->getSubprograms()) { - if (!SP || !SP->getFunction() || !Functions.count(SP->getFunction())) - continue; - - // Prevent DebugInfoFinder from tagging this as the canonical subprogram, - // since the canonical one is in the incoming module. - SP->replaceFunction(nullptr); - } - } -} - -/// Merge the linker flags in Src into the Dest module. -bool ModuleLinker::linkModuleFlagsMetadata() { - // If the source module has no module flags, we are done. - const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); - if (!SrcModFlags) return false; - - // If the destination module doesn't have module flags yet, then just copy - // over the source module's flags. - NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata(); - if (DstModFlags->getNumOperands() == 0) { - for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) - DstModFlags->addOperand(SrcModFlags->getOperand(I)); - - return false; - } - - // First build a map of the existing module flags and requirements. - DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags; - SmallSetVector<MDNode*, 16> Requirements; - for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { - MDNode *Op = DstModFlags->getOperand(I); - ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0)); - MDString *ID = cast<MDString>(Op->getOperand(1)); - - if (Behavior->getZExtValue() == Module::Require) { - Requirements.insert(cast<MDNode>(Op->getOperand(2))); - } else { - Flags[ID] = std::make_pair(Op, I); - } - } - - // Merge in the flags from the source module, and also collect its set of - // requirements. - bool HasErr = false; - for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { - MDNode *SrcOp = SrcModFlags->getOperand(I); - ConstantInt *SrcBehavior = - mdconst::extract<ConstantInt>(SrcOp->getOperand(0)); - MDString *ID = cast<MDString>(SrcOp->getOperand(1)); - MDNode *DstOp; - unsigned DstIndex; - std::tie(DstOp, DstIndex) = Flags.lookup(ID); - unsigned SrcBehaviorValue = SrcBehavior->getZExtValue(); - - // If this is a requirement, add it and continue. 
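processGlobalForThinLTO above promotes a local that must become visible across module boundaries: it renames the value, switches it to a non-local linkage, and gives it hidden visibility so the promoted name stays out of the dynamic symbol table. A sketch of that shape; promoteLocalName and the .llvm.<module-id> suffix are illustrative assumptions, not the actual getName()/getLinkage() logic:

#include <iostream>
#include <string>

// Hypothetical rename scheme for promoting a local (internal) symbol so it
// can be referenced across modules; the ".llvm.<module-id>" suffix is an
// illustration, not necessarily the exact format getName() produces.
static std::string promoteLocalName(const std::string &Name,
                                    const std::string &ModuleId) {
  return Name + ".llvm." + ModuleId;
}

struct ToyGlobal {
  std::string Name;
  bool IsLocal = true;   // internal linkage
  bool IsHidden = false; // hidden visibility
};

// Mirrors the shape of processGlobalForThinLTO: rename, make the symbol
// external, and restrict visibility to hidden so the promoted name does
// not leak into the dynamic symbol table.
static void promote(ToyGlobal &GV, const std::string &ModuleId) {
  GV.Name = promoteLocalName(GV.Name, ModuleId);
  GV.IsLocal = false;
  GV.IsHidden = true;
}

int main() {
  ToyGlobal G{"counter"};
  promote(G, "a1b2c3");
  std::cout << G.Name << " local=" << G.IsLocal
            << " hidden=" << G.IsHidden << "\n"; // counter.llvm.a1b2c3 local=0 hidden=1
}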
- if (SrcBehaviorValue == Module::Require) { - // If the destination module does not already have this requirement, add - // it. - if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) { - DstModFlags->addOperand(SrcOp); - } - continue; - } - - // If there is no existing flag with this ID, just add it. - if (!DstOp) { - Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands()); - DstModFlags->addOperand(SrcOp); - continue; - } - - // Otherwise, perform a merge. - ConstantInt *DstBehavior = - mdconst::extract<ConstantInt>(DstOp->getOperand(0)); - unsigned DstBehaviorValue = DstBehavior->getZExtValue(); - - // If either flag has override behavior, handle it first. - if (DstBehaviorValue == Module::Override) { - // Diagnose inconsistent flags which both have override behavior. - if (SrcBehaviorValue == Module::Override && - SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting override values"); - } - continue; - } else if (SrcBehaviorValue == Module::Override) { - // Update the destination flag to that of the source. - DstModFlags->setOperand(DstIndex, SrcOp); - Flags[ID].first = SrcOp; - continue; - } - - // Diagnose inconsistent merge behavior types. - if (SrcBehaviorValue != DstBehaviorValue) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting behaviors"); - continue; - } - - auto replaceDstValue = [&](MDNode *New) { - Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New}; - MDNode *Flag = MDNode::get(DstM->getContext(), FlagOps); - DstModFlags->setOperand(DstIndex, Flag); - Flags[ID].first = Flag; - }; - - // Perform the merge for standard behavior types. - switch (SrcBehaviorValue) { - case Module::Require: - case Module::Override: llvm_unreachable("not possible"); - case Module::Error: { - // Emit an error if the values differ. - if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { - HasErr |= emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); - } - continue; - } - case Module::Warning: { - // Emit a warning if the values differ. - if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { - emitWarning("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); - } - continue; - } - case Module::Append: { - MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); - MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); - SmallVector<Metadata *, 8> MDs; - MDs.reserve(DstValue->getNumOperands() + SrcValue->getNumOperands()); - MDs.append(DstValue->op_begin(), DstValue->op_end()); - MDs.append(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM->getContext(), MDs)); - break; - } - case Module::AppendUnique: { - SmallSetVector<Metadata *, 16> Elts; - MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); - MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); - Elts.insert(DstValue->op_begin(), DstValue->op_end()); - Elts.insert(SrcValue->op_begin(), SrcValue->op_end()); - - replaceDstValue(MDNode::get(DstM->getContext(), - makeArrayRef(Elts.begin(), Elts.end()))); - break; - } - } - } - - // Check all of the requirements. 
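The switch above merges one flag at a time across a small behavior lattice. A self-contained restatement, with flag values modeled as string lists; Require is omitted because it is verified as a constraint after merging rather than merged itself, and the conflicting-double-Override diagnostic is elided:

#include <algorithm>
#include <stdexcept>
#include <string>
#include <vector>

// Toy reimplementation of the per-flag merge above. Flag values are modeled
// as string lists (a single-element list stands in for a scalar value).
enum Behavior { Error, Warning, Override, Append, AppendUnique };

using Value = std::vector<std::string>;

static Value mergeFlag(Behavior B, const Value &Dst, const Value &Src) {
  switch (B) {
  case Error:
    if (Dst != Src)
      throw std::runtime_error("IDs have conflicting values");
    return Dst;
  case Warning:
    // The real code only warns on a mismatch; the destination value stays.
    return Dst;
  case Override:
    // Simplified: the source wins (the Dst-has-Override and conflicting
    // double-Override cases are handled before the switch in the original).
    return Src;
  case Append: {
    Value V = Dst;
    V.insert(V.end(), Src.begin(), Src.end());
    return V;
  }
  case AppendUnique: {
    Value V = Dst;
    for (const auto &E : Src)
      if (std::find(V.begin(), V.end(), E) == V.end())
        V.push_back(E);
    return V;
  }
  }
  return Dst;
}

int main() {
  Value A{"x", "y"}, B{"y", "z"};
  Value U = mergeFlag(AppendUnique, A, B); // {"x", "y", "z"}
  return U.size() == 3 ? 0 : 1;
}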
- for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { - MDNode *Requirement = Requirements[I]; - MDString *Flag = cast<MDString>(Requirement->getOperand(0)); - Metadata *ReqValue = Requirement->getOperand(1); - - MDNode *Op = Flags[Flag].first; - if (!Op || Op->getOperand(2) != ReqValue) { - HasErr |= emitError("linking module flags '" + Flag->getString() + - "': does not have the required value"); - continue; - } } - - return HasErr; + GV.setLinkage(getLinkage(&GV)); } -// This function returns true if the triples match. -static bool triplesMatch(const Triple &T0, const Triple &T1) { - // If vendor is apple, ignore the version number. - if (T0.getVendor() == Triple::Apple) - return T0.getArch() == T1.getArch() && - T0.getSubArch() == T1.getSubArch() && - T0.getVendor() == T1.getVendor() && - T0.getOS() == T1.getOS(); - - return T0 == T1; +void ThinLTOGlobalProcessing::processGlobalsForThinLTO() { + for (GlobalVariable &GV : M.globals()) + processGlobalForThinLTO(GV); + for (Function &SF : M) + processGlobalForThinLTO(SF); + for (GlobalAlias &GA : M.aliases()) + processGlobalForThinLTO(GA); } -// This function returns the merged triple. -static std::string mergeTriples(const Triple &SrcTriple, const Triple &DstTriple) { - // If vendor is apple, pick the triple with the larger version number. - if (SrcTriple.getVendor() == Triple::Apple) - if (DstTriple.isOSVersionLT(SrcTriple)) - return SrcTriple.str(); - - return DstTriple.str(); +bool ThinLTOGlobalProcessing::run() { + processGlobalsForThinLTO(); + return false; } bool ModuleLinker::run() { - assert(DstM && "Null destination module"); - assert(SrcM && "Null source module"); - - // Inherit the target data from the source module if the destination module - // doesn't have one already. - if (DstM->getDataLayout().isDefault()) - DstM->setDataLayout(SrcM->getDataLayout()); - - if (SrcM->getDataLayout() != DstM->getDataLayout()) { - emitWarning("Linking two modules of different data layouts: '" + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getDataLayoutStr() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getDataLayoutStr() + "'\n"); - } - - // Copy the target triple from the source to dest if the dest's is empty. - if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty()) - DstM->setTargetTriple(SrcM->getTargetTriple()); - - Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM->getTargetTriple()); - - if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple)) - emitWarning("Linking two modules of different target triples: " + - SrcM->getModuleIdentifier() + "' is '" + - SrcM->getTargetTriple() + "' whereas '" + - DstM->getModuleIdentifier() + "' is '" + - DstM->getTargetTriple() + "'\n"); - - DstM->setTargetTriple(mergeTriples(SrcTriple, DstTriple)); - - // Append the module inline asm string. - if (!SrcM->getModuleInlineAsm().empty()) { - if (DstM->getModuleInlineAsm().empty()) - DstM->setModuleInlineAsm(SrcM->getModuleInlineAsm()); - else - DstM->setModuleInlineAsm(DstM->getModuleInlineAsm()+"\n"+ - SrcM->getModuleInlineAsm()); - } - - // Loop over all of the linked values to compute type mappings. 
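mergeTriples, removed above (presumably re-homed with the rest of the moved linker logic), has a single special case: for the Apple vendor the triple carrying the larger OS version wins, and otherwise the destination triple is kept. A toy equivalent over pre-parsed version tuples; ToyTriple is an assumption standing in for llvm::Triple:

#include <string>
#include <tuple>

struct ToyTriple {
  std::string Str;                     // e.g. "x86_64-apple-macosx10.11"
  bool IsApple;                        // vendor == Apple
  std::tuple<int, int, int> OSVersion; // major, minor, micro
};

// Same decision rule as mergeTriples above: Apple triples compete on OS
// version, everything else keeps the destination.
static const ToyTriple &mergeTriples(const ToyTriple &Src,
                                     const ToyTriple &Dst) {
  if (Src.IsApple && Dst.OSVersion < Src.OSVersion)
    return Src;
  return Dst;
}

int main() {
  ToyTriple Src{"x86_64-apple-macosx10.11", true, {10, 11, 0}};
  ToyTriple Dst{"x86_64-apple-macosx10.9", true, {10, 9, 0}};
  return mergeTriples(Src, Dst).Str == Src.Str ? 0 : 1; // Src wins
}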
- computeTypeMapping(); - - ComdatsChosen.clear(); - for (const auto &SMEC : SrcM->getComdatSymbolTable()) { + for (const auto &SMEC : SrcM.getComdatSymbolTable()) { const Comdat &C = SMEC.getValue(); if (ComdatsChosen.count(&C)) continue; @@ -1542,233 +757,95 @@ bool ModuleLinker::run() { ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc); } - // Upgrade mismatched global arrays. - upgradeMismatchedGlobals(); + for (GlobalVariable &GV : SrcM.globals()) + if (const Comdat *SC = GV.getComdat()) + ComdatMembers[SC].push_back(&GV); + + for (Function &SF : SrcM) + if (const Comdat *SC = SF.getComdat()) + ComdatMembers[SC].push_back(&SF); + + for (GlobalAlias &GA : SrcM.aliases()) + if (const Comdat *SC = GA.getComdat()) + ComdatMembers[SC].push_back(&GA); // Insert all of the globals in src into the DstM module... without linking // initializers (which could refer to functions not yet mapped over). - for (GlobalVariable &GV : SrcM->globals()) - if (linkGlobalValueProto(&GV)) + for (GlobalVariable &GV : SrcM.globals()) + if (linkIfNeeded(GV)) return true; - // Link the functions together between the two modules, without doing function - // bodies... this just adds external function prototypes to the DstM - // function... We do this so that when we begin processing function bodies, - // all of the global values that may be referenced are available in our - // ValueMap. - for (Function &F :*SrcM) - if (linkGlobalValueProto(&F)) + for (Function &SF : SrcM) + if (linkIfNeeded(SF)) return true; - // If there were any aliases, link them now. - for (GlobalAlias &GA : SrcM->aliases()) - if (linkGlobalValueProto(&GA)) + for (GlobalAlias &GA : SrcM.aliases()) + if (linkIfNeeded(GA)) return true; - for (const AppendingVarInfo &AppendingVar : AppendingVars) - linkAppendingVarInit(AppendingVar); - - for (const auto &Entry : DstM->getComdatSymbolTable()) { - const Comdat &C = Entry.getValue(); - if (C.getSelectionKind() == Comdat::Any) - continue; - const GlobalValue *GV = SrcM->getNamedValue(C.getName()); - if (GV) - MapValue(GV, ValueMap, RF_None, &TypeMap, &ValMaterializer); - } - - // Strip replaced subprograms before mapping any metadata -- so that we're - // not changing metadata from the source module (note that - // linkGlobalValueBody() eventually calls RemapInstruction() and therefore - // MapMetadata()) -- but after linking global value protocols -- so that - // OverridingFunctions has been built. - stripReplacedSubprograms(); - - // Link in the function bodies that are defined in the source module into - // DstM. - for (Function &SF : *SrcM) { - // Skip if no body (function is external). - if (SF.isDeclaration()) - continue; - - // Skip if not linking from source. - if (DoNotLinkFromSource.count(&SF)) - continue; - - if (linkGlobalValueBody(SF)) + if (ImportIndex) { + ThinLTOGlobalProcessing ThinLTOProcessing(SrcM, ImportIndex, + FunctionsToImport); + if (ThinLTOProcessing.run()) return true; + for (auto *GV : ThinLTOProcessing.getNewExportedValues()) + ValuesToLink.insert(GV); } - // Resolve all uses of aliases with aliasees. - for (GlobalAlias &Src : SrcM->aliases()) { - if (DoNotLinkFromSource.count(&Src)) + for (unsigned I = 0; I < ValuesToLink.size(); ++I) { + GlobalValue *GV = ValuesToLink[I]; + const Comdat *SC = GV->getComdat(); + if (!SC) continue; - linkGlobalValueBody(Src); + for (GlobalValue *GV2 : ComdatMembers[SC]) + ValuesToLink.insert(GV2); } - // Remap all of the named MDNodes in Src into the DstM module. 
We do this - // after linking GlobalValues so that MDNodes that reference GlobalValues - // are properly remapped. - linkNamedMDNodes(); - - // Merge the module flags into the DstM module. - if (linkModuleFlagsMetadata()) - return true; - - // Update the initializers in the DstM module now that all globals that may - // be referenced are in DstM. - for (GlobalVariable &Src : SrcM->globals()) { - // Only process initialized GV's or ones not already in dest. - if (!Src.hasInitializer() || DoNotLinkFromSource.count(&Src)) - continue; - linkGlobalValueBody(Src); + if (shouldInternalizeLinkedSymbols()) { + for (GlobalValue *GV : ValuesToLink) + Internalize.insert(GV->getName()); } - // Process vector of lazily linked in functions. - while (!LazilyLinkGlobalValues.empty()) { - GlobalValue *SGV = LazilyLinkGlobalValues.back(); - LazilyLinkGlobalValues.pop_back(); - - assert(!SGV->isDeclaration() && "users should not pass down decls"); - if (linkGlobalValueBody(*SGV)) - return true; + if (Mover.move(SrcM, ValuesToLink.getArrayRef(), + [this](GlobalValue &GV, IRMover::ValueAdder Add) { + addLazyFor(GV, Add); + }, + ValIDToTempMDMap, false)) + return true; + Module &DstM = Mover.getModule(); + for (auto &P : Internalize) { + GlobalValue *GV = DstM.getNamedValue(P.first()); + GV->setLinkage(GlobalValue::InternalLinkage); } return false; } -Linker::StructTypeKeyInfo::KeyTy::KeyTy(ArrayRef<Type *> E, bool P) - : ETypes(E), IsPacked(P) {} - -Linker::StructTypeKeyInfo::KeyTy::KeyTy(const StructType *ST) - : ETypes(ST->elements()), IsPacked(ST->isPacked()) {} - -bool Linker::StructTypeKeyInfo::KeyTy::operator==(const KeyTy &That) const { - if (IsPacked != That.IsPacked) - return false; - if (ETypes != That.ETypes) - return false; - return true; -} - -bool Linker::StructTypeKeyInfo::KeyTy::operator!=(const KeyTy &That) const { - return !this->operator==(That); -} - -StructType *Linker::StructTypeKeyInfo::getEmptyKey() { - return DenseMapInfo<StructType *>::getEmptyKey(); -} - -StructType *Linker::StructTypeKeyInfo::getTombstoneKey() { - return DenseMapInfo<StructType *>::getTombstoneKey(); -} - -unsigned Linker::StructTypeKeyInfo::getHashValue(const KeyTy &Key) { - return hash_combine(hash_combine_range(Key.ETypes.begin(), Key.ETypes.end()), - Key.IsPacked); -} - -unsigned Linker::StructTypeKeyInfo::getHashValue(const StructType *ST) { - return getHashValue(KeyTy(ST)); -} - -bool Linker::StructTypeKeyInfo::isEqual(const KeyTy &LHS, - const StructType *RHS) { - if (RHS == getEmptyKey() || RHS == getTombstoneKey()) - return false; - return LHS == KeyTy(RHS); -} - -bool Linker::StructTypeKeyInfo::isEqual(const StructType *LHS, - const StructType *RHS) { - if (RHS == getEmptyKey()) - return LHS == getEmptyKey(); - - if (RHS == getTombstoneKey()) - return LHS == getTombstoneKey(); - - return KeyTy(LHS) == KeyTy(RHS); -} - -void Linker::IdentifiedStructTypeSet::addNonOpaque(StructType *Ty) { - assert(!Ty->isOpaque()); - NonOpaqueStructTypes.insert(Ty); -} +Linker::Linker(Module &M) : Mover(M) {} -void Linker::IdentifiedStructTypeSet::switchToNonOpaque(StructType *Ty) { - assert(!Ty->isOpaque()); - NonOpaqueStructTypes.insert(Ty); - bool Removed = OpaqueStructTypes.erase(Ty); - (void)Removed; - assert(Removed); +bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags, + const FunctionInfoIndex *Index, + DenseSet<const GlobalValue *> *FunctionsToImport, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap) { + ModuleLinker ModLinker(Mover, *Src, Flags, Index, FunctionsToImport, + ValIDToTempMDMap); + 
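The comdat-closure loop in ModuleLinker::run above walks ValuesToLink by index on purpose: inserting a comdat member can grow the SetVector during the walk, and the newly appended members must themselves be visited. A minimal illustration of the idiom; ToySetVector and the string-keyed comdat map are stand-ins, not LLVM types:

#include <cstdio>
#include <map>
#include <string>
#include <unordered_set>
#include <vector>

// Minimal stand-in for llvm::SetVector: stable order plus O(1) dedup.
struct ToySetVector {
  std::vector<std::string> Order;
  std::unordered_set<std::string> Seen;
  void insert(const std::string &V) {
    if (Seen.insert(V).second)
      Order.push_back(V);
  }
};

int main() {
  // Comdat members that must be linked together with a given value.
  std::map<std::string, std::vector<std::string>> ComdatMembers = {
      {"f", {"f", "g"}}, {"g", {"f", "g"}}};

  ToySetVector ValuesToLink;
  ValuesToLink.insert("f");

  // Index-based loop, exactly because insert() may grow Order while we
  // iterate; iterators into a growing std::vector would be invalidated.
  for (size_t I = 0; I < ValuesToLink.Order.size(); ++I) {
    std::string GV = ValuesToLink.Order[I]; // copy: push_back may reallocate
    auto It = ComdatMembers.find(GV);
    if (It == ComdatMembers.end())
      continue;
    for (const auto &Member : It->second)
      ValuesToLink.insert(Member);
  }

  for (const auto &V : ValuesToLink.Order)
    std::puts(V.c_str()); // prints f, then g
}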
return ModLinker.run(); } -void Linker::IdentifiedStructTypeSet::addOpaque(StructType *Ty) { - assert(Ty->isOpaque()); - OpaqueStructTypes.insert(Ty); +bool Linker::linkInModuleForCAPI(Module &Src) { + ModuleLinker ModLinker(Mover, Src, 0, nullptr, nullptr); + return ModLinker.run(); } -StructType * -Linker::IdentifiedStructTypeSet::findNonOpaque(ArrayRef<Type *> ETypes, - bool IsPacked) { - Linker::StructTypeKeyInfo::KeyTy Key(ETypes, IsPacked); - auto I = NonOpaqueStructTypes.find_as(Key); - if (I == NonOpaqueStructTypes.end()) - return nullptr; - return *I; -} - -bool Linker::IdentifiedStructTypeSet::hasType(StructType *Ty) { - if (Ty->isOpaque()) - return OpaqueStructTypes.count(Ty); - auto I = NonOpaqueStructTypes.find(Ty); - if (I == NonOpaqueStructTypes.end()) - return false; - return *I == Ty; -} - -void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - this->Composite = M; - this->DiagnosticHandler = DiagnosticHandler; - - TypeFinder StructTypes; - StructTypes.run(*M, true); - for (StructType *Ty : StructTypes) { - if (Ty->isOpaque()) - IdentifiedStructTypes.addOpaque(Ty); - else - IdentifiedStructTypes.addNonOpaque(Ty); - } -} - -Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) { - init(M, DiagnosticHandler); -} - -Linker::Linker(Module *M) { - init(M, [this](const DiagnosticInfo &DI) { - Composite->getContext().diagnose(DI); - }); -} - -Linker::~Linker() { -} - -void Linker::deleteModule() { - delete Composite; - Composite = nullptr; -} - -bool Linker::linkInModule(Module *Src, bool OverrideSymbols) { - ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, - DiagnosticHandler, OverrideSymbols); - bool RetCode = TheLinker.run(); - Composite->dropTriviallyDeadConstantArrays(); - return RetCode; -} - -void Linker::setModule(Module *Dst) { - init(Dst, DiagnosticHandler); +bool Linker::linkInMetadata(Module &Src, + DenseMap<unsigned, MDNode *> *ValIDToTempMDMap) { + SetVector<GlobalValue *> ValuesToLink; + if (Mover.move( + Src, ValuesToLink.getArrayRef(), + [this](GlobalValue &GV, IRMover::ValueAdder Add) { assert(false); }, + ValIDToTempMDMap, true)) + return true; + return false; } //===----------------------------------------------------------------------===// @@ -1780,34 +857,52 @@ void Linker::setModule(Module *Dst) { /// true is returned and ErrorMsg (if not null) is set to indicate the problem. /// Upon failure, the Dest module could be in a modified state, and shouldn't be /// relied on to be consistent. -bool Linker::LinkModules(Module *Dest, Module *Src, - DiagnosticHandlerFunction DiagnosticHandler) { - Linker L(Dest, DiagnosticHandler); - return L.linkInModule(Src); +bool Linker::linkModules(Module &Dest, std::unique_ptr<Module> Src, + unsigned Flags) { + Linker L(Dest); + return L.linkInModule(std::move(Src), Flags); } -bool Linker::LinkModules(Module *Dest, Module *Src) { - Linker L(Dest); - return L.linkInModule(Src); +bool llvm::renameModuleForThinLTO(Module &M, const FunctionInfoIndex *Index) { + ThinLTOGlobalProcessing ThinLTOProcessing(M, Index); + return ThinLTOProcessing.run(); } //===----------------------------------------------------------------------===// // C API. 
//===----------------------------------------------------------------------===// +static void diagnosticHandler(const DiagnosticInfo &DI, void *C) { + auto *Message = reinterpret_cast<std::string *>(C); + raw_string_ostream Stream(*Message); + DiagnosticPrinterRawOStream DP(Stream); + DI.print(DP); +} + LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src, LLVMLinkerMode Unused, char **OutMessages) { Module *D = unwrap(Dest); + LLVMContext &Ctx = D->getContext(); + + LLVMContext::DiagnosticHandlerTy OldDiagnosticHandler = + Ctx.getDiagnosticHandler(); + void *OldDiagnosticContext = Ctx.getDiagnosticContext(); std::string Message; - raw_string_ostream Stream(Message); - DiagnosticPrinterRawOStream DP(Stream); + Ctx.setDiagnosticHandler(diagnosticHandler, &Message, true); + + Linker L(*D); + Module *M = unwrap(Src); + LLVMBool Result = L.linkInModuleForCAPI(*M); - LLVMBool Result = Linker::LinkModules( - D, unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); }); + Ctx.setDiagnosticHandler(OldDiagnosticHandler, OldDiagnosticContext, true); - if (OutMessages && Result) { - Stream.flush(); + if (OutMessages && Result) *OutMessages = strdup(Message.c_str()); - } return Result; } + +LLVMBool LLVMLinkModules2(LLVMModuleRef Dest, LLVMModuleRef Src) { + Module *D = unwrap(Dest); + std::unique_ptr<Module> M(unwrap(Src)); + return Linker::linkModules(*D, std::move(M)); +} diff --git a/contrib/llvm/lib/MC/ConstantPools.cpp b/contrib/llvm/lib/MC/ConstantPools.cpp index f7649fb..9643b75 100644 --- a/contrib/llvm/lib/MC/ConstantPools.cpp +++ b/contrib/llvm/lib/MC/ConstantPools.cpp @@ -29,17 +29,17 @@ void ConstantPool::emitEntries(MCStreamer &Streamer) { I != E; ++I) { Streamer.EmitCodeAlignment(I->Size); // align naturally Streamer.EmitLabel(I->Label); - Streamer.EmitValue(I->Value, I->Size); + Streamer.EmitValue(I->Value, I->Size, I->Loc); } Streamer.EmitDataRegion(MCDR_DataRegionEnd); Entries.clear(); } const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context, - unsigned Size) { + unsigned Size, SMLoc Loc) { MCSymbol *CPEntryLabel = Context.createTempSymbol(); - Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size)); + Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size, Loc)); return MCSymbolRefExpr::create(CPEntryLabel, Context); } @@ -90,8 +90,8 @@ void AssemblerConstantPools::emitForCurrentSection(MCStreamer &Streamer) { const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer, const MCExpr *Expr, - unsigned Size) { + unsigned Size, SMLoc Loc) { MCSection *Section = Streamer.getCurrentSection().first; return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext(), - Size); + Size, Loc); } diff --git a/contrib/llvm/lib/MC/ELFObjectWriter.cpp b/contrib/llvm/lib/MC/ELFObjectWriter.cpp index e925bc2..e6552be 100644 --- a/contrib/llvm/lib/MC/ELFObjectWriter.cpp +++ b/contrib/llvm/lib/MC/ELFObjectWriter.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/ELF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/StringSaver.h" #include <vector> using namespace llvm; @@ -106,7 +107,9 @@ class ELFObjectWriter : public MCObjectWriter { /// @name Symbol Table Data /// @{ - StringTableBuilder StrTabBuilder; + BumpPtrAllocator Alloc; + StringSaver VersionSymSaver{Alloc}; + StringTableBuilder StrTabBuilder{StringTableBuilder::ELF}; /// @} @@ -157,9 +160,9 @@ class ELFObjectWriter : public MCObjectWriter { template <typename T> void write(T Val) { if (IsLittleEndian) - 
support::endian::Writer<support::little>(OS).write(Val); + support::endian::Writer<support::little>(getStream()).write(Val); else - support::endian::Writer<support::big>(OS).write(Val); + support::endian::Writer<support::big>(getStream()).write(Val); } void writeHeader(const MCAssembler &Asm); @@ -232,7 +235,7 @@ class ELFObjectWriter : public MCObjectWriter { } void ELFObjectWriter::align(unsigned Alignment) { - uint64_t Padding = OffsetToAlignment(OS.tell(), Alignment); + uint64_t Padding = OffsetToAlignment(getStream().tell(), Alignment); WriteZeros(Padding); } @@ -447,9 +450,6 @@ void ELFObjectWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex, ELFSymbolData &MSD, const MCAsmLayout &Layout) { const auto &Symbol = cast<MCSymbolELF>(*MSD.Symbol); - assert((!Symbol.getFragment() || - (Symbol.getFragment()->getParent() == &Symbol.getSection())) && - "The symbol's section doesn't match the fragment's symbol"); const MCSymbolELF *Base = cast_or_null<MCSymbolELF>(Layout.getBaseSymbol(Symbol)); @@ -630,28 +630,36 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, // In general, ELF has no relocations for -B. It can only represent (A + C) // or (A + C - R). If B = R + K and the relocation is not pcrel, we can // replace B to implement it: (A - R - K + C) - if (IsPCRel) - Asm.getContext().reportFatalError( + if (IsPCRel) { + Asm.getContext().reportError( Fixup.getLoc(), "No relocation available to represent this relative expression"); + return; + } const auto &SymB = cast<MCSymbolELF>(RefB->getSymbol()); - if (SymB.isUndefined()) - Asm.getContext().reportFatalError( + if (SymB.isUndefined()) { + Asm.getContext().reportError( Fixup.getLoc(), Twine("symbol '") + SymB.getName() + "' can not be undefined in a subtraction expression"); + return; + } assert(!SymB.isAbsolute() && "Should have been folded"); const MCSection &SecB = SymB.getSection(); - if (&SecB != &FixupSection) - Asm.getContext().reportFatalError( + if (&SecB != &FixupSection) { + Asm.getContext().reportError( Fixup.getLoc(), "Cannot represent a difference across sections"); + return; + } - if (::isWeak(SymB)) - Asm.getContext().reportFatalError( + if (::isWeak(SymB)) { + Asm.getContext().reportError( Fixup.getLoc(), "Cannot represent a subtraction with a weak symbol"); + return; + } uint64_t SymBOffset = Layout.getSymbolOffset(SymB); uint64_t K = SymBOffset - FixupOffset; @@ -764,7 +772,7 @@ void ELFObjectWriter::computeSymbolTable( SymbolTableIndex = addToSectionTable(SymtabSection); align(SymtabSection->getAlignment()); - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); // The first entry is the undefined symbol entry. Writer.writeSymbol(0, 0, 0, 0, 0, 0, false); @@ -784,8 +792,10 @@ void ELFObjectWriter::computeSymbolTable( Renames.count(&Symbol))) continue; - if (Symbol.isTemporary() && Symbol.isUndefined()) - Ctx.reportFatalError(SMLoc(), "Undefined temporary"); + if (Symbol.isTemporary() && Symbol.isUndefined()) { + Ctx.reportError(SMLoc(), "Undefined temporary symbol"); + continue; + } ELFSymbolData MSD; MSD.Symbol = cast<MCSymbolELF>(&Symbol); @@ -850,13 +860,15 @@ void ELFObjectWriter::computeSymbolTable( Buf += Name.substr(0, Pos); unsigned Skip = MSD.SectionIndex == ELF::SHN_UNDEF ? 
2 : 1; Buf += Name.substr(Pos + Skip); - Name = Buf; + Name = VersionSymSaver.save(Buf.c_str()); } } // Sections have their own string table - if (Symbol.getType() != ELF::STT_SECTION) - MSD.Name = StrTabBuilder.add(Name); + if (Symbol.getType() != ELF::STT_SECTION) { + MSD.Name = Name; + StrTabBuilder.add(Name); + } if (Local) LocalSymbolData.push_back(MSD); @@ -878,7 +890,7 @@ void ELFObjectWriter::computeSymbolTable( for (const std::string &Name : FileNames) StrTabBuilder.add(Name); - StrTabBuilder.finalize(StringTableBuilder::ELF); + StrTabBuilder.finalize(); for (const std::string &Name : FileNames) Writer.writeSymbol(StrTabBuilder.getOffset(Name), @@ -911,7 +923,7 @@ void ELFObjectWriter::computeSymbolTable( assert(MSD.Symbol->getBinding() != ELF::STB_LOCAL); } - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[SymtabSection] = std::make_pair(SecStart, SecEnd); ArrayRef<uint32_t> ShndxIndexes = Writer.getShndxIndexes(); @@ -921,12 +933,12 @@ void ELFObjectWriter::computeSymbolTable( } assert(SymtabShndxSectionIndex != 0); - SecStart = OS.tell(); + SecStart = getStream().tell(); const MCSectionELF *SymtabShndxSection = SectionTable[SymtabShndxSectionIndex - 1]; for (uint32_t Index : ShndxIndexes) write(Index); - SecEnd = OS.tell(); + SecEnd = getStream().tell(); SectionOffsets[SymtabShndxSection] = std::make_pair(SecStart, SecEnd); } @@ -957,31 +969,6 @@ ELFObjectWriter::createRelocationSection(MCContext &Ctx, return RelaSection; } -static SmallVector<char, 128> -getUncompressedData(const MCAsmLayout &Layout, - const MCSection::FragmentListType &Fragments) { - SmallVector<char, 128> UncompressedData; - for (const MCFragment &F : Fragments) { - const SmallVectorImpl<char> *Contents; - switch (F.getKind()) { - case MCFragment::FT_Data: - Contents = &cast<MCDataFragment>(F).getContents(); - break; - case MCFragment::FT_Dwarf: - Contents = &cast<MCDwarfLineAddrFragment>(F).getContents(); - break; - case MCFragment::FT_DwarfFrame: - Contents = &cast<MCDwarfCallFrameFragment>(F).getContents(); - break; - default: - llvm_unreachable( - "Not expecting any other fragment types in a debug_* section"); - } - UncompressedData.append(Contents->begin(), Contents->end()); - } - return UncompressedData; -} - // Include the debug info compression header: // "ZLIB" followed by 8 bytes representing the uncompressed size of the section, // useful for consumers to preallocate a buffer to decompress into. @@ -1016,27 +1003,29 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec, return; } - // Gather the uncompressed data from all the fragments. 
- const MCSection::FragmentListType &Fragments = Section.getFragmentList(); - SmallVector<char, 128> UncompressedData = - getUncompressedData(Layout, Fragments); + SmallVector<char, 128> UncompressedData; + raw_svector_ostream VecOS(UncompressedData); + raw_pwrite_stream &OldStream = getStream(); + setStream(VecOS); + Asm.writeSectionData(&Section, Layout); + setStream(OldStream); SmallVector<char, 128> CompressedContents; zlib::Status Success = zlib::compress( StringRef(UncompressedData.data(), UncompressedData.size()), CompressedContents); if (Success != zlib::StatusOK) { - Asm.writeSectionData(&Section, Layout); + getStream() << UncompressedData; return; } if (!prependCompressionHeader(UncompressedData.size(), CompressedContents)) { - Asm.writeSectionData(&Section, Layout); + getStream() << UncompressedData; return; } Asm.getContext().renameELFSection(&Section, (".z" + SectionName.drop_front(1)).str()); - OS << CompressedContents; + getStream() << CompressedContents; } void ELFObjectWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type, @@ -1061,8 +1050,13 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec) { std::vector<ELFRelocationEntry> &Relocs = Relocations[&Sec]; - // Sort the relocation entries. Most targets just sort by Offset, but some - // (e.g., MIPS) have additional constraints. + // We record relocations by pushing to the end of a vector. Reverse the vector + // to get the relocations in the order they were created. + // In most cases that is not important, but it can be for special sections + // (.eh_frame) or specific relocations (TLS optimizations on SystemZ). + std::reverse(Relocs.begin(), Relocs.end()); + + // Sort the relocation entries. MIPS needs this. TargetObjectWriter->sortRelocs(Asm, Relocs); for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { @@ -1100,7 +1094,7 @@ void ELFObjectWriter::writeRelocations(const MCAssembler &Asm, const MCSectionELF *ELFObjectWriter::createStringTable(MCContext &Ctx) { const MCSectionELF *StrtabSection = SectionTable[StringTableIndex - 1]; - OS << StrTabBuilder.data(); + getStream() << StrTabBuilder.data(); return StrtabSection; } @@ -1209,12 +1203,12 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, align(Section.getAlignment()); // Remember the offset into the file for this section. - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); const MCSymbolELF *SignatureSymbol = Section.getGroup(); writeSectionData(Asm, Section, Layout); - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[&Section] = std::make_pair(SecStart, SecEnd); MCSectionELF *RelSection = createRelocationSection(Ctx, Section); @@ -1246,7 +1240,7 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, align(Group->getAlignment()); // Remember the offset into the file for this section. - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); const MCSymbol *SignatureSymbol = Group->getGroup(); assert(SignatureSymbol); @@ -1256,7 +1250,7 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, write(SecIndex); } - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[Group] = std::make_pair(SecStart, SecEnd); } @@ -1267,25 +1261,25 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, align(RelSection->getAlignment()); // Remember the offset into the file for this section. 
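The rewritten path above redirects the object writer's stream into a buffer, compresses it, and falls back to writing the uncompressed bytes when zlib fails or the header cannot be usefully prepended. The header itself was described earlier: the magic ZLIB followed by the uncompressed size as an 8-byte integer so consumers can preallocate the output buffer. A sketch of building it; the most-significant-byte-first layout follows the usual GNU .zdebug convention and is an assumption here:

#include <cstdint>
#include <cstring>
#include <vector>

// Build the compressed-debug-section header described above: the 4-byte
// magic "ZLIB" followed by the uncompressed size as a big-endian uint64.
static std::vector<char> makeCompressionHeader(uint64_t UncompressedSize) {
  std::vector<char> Header(4 + 8);
  std::memcpy(Header.data(), "ZLIB", 4);
  for (int I = 0; I < 8; ++I) // most significant byte first
    Header[4 + I] = char((UncompressedSize >> (8 * (7 - I))) & 0xff);
  return Header;
}

int main() {
  std::vector<char> H = makeCompressionHeader(0x0102);
  // "ZLIB" + 00 00 00 00 00 00 01 02
  return (H[3] == 'B' && H[10] == 0x01 && H[11] == 0x02) ? 0 : 1;
}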
- uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); writeRelocations(Asm, *RelSection->getAssociatedSection()); - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd); } { - uint64_t SecStart = OS.tell(); + uint64_t SecStart = getStream().tell(); const MCSectionELF *Sec = createStringTable(Ctx); - uint64_t SecEnd = OS.tell(); + uint64_t SecEnd = getStream().tell(); SectionOffsets[Sec] = std::make_pair(SecStart, SecEnd); } uint64_t NaturalAlignment = is64Bit() ? 8 : 4; align(NaturalAlignment); - const unsigned SectionHeaderOffset = OS.tell(); + const unsigned SectionHeaderOffset = getStream().tell(); // ... then the section header table ... writeSectionHeader(Layout, SectionIndexMap, SectionOffsets); @@ -1301,19 +1295,19 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm, uint64_t Val = SectionHeaderOffset; if (sys::IsLittleEndianHost != IsLittleEndian) sys::swapByteOrder(Val); - OS.pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), - offsetof(ELF::Elf64_Ehdr, e_shoff)); + getStream().pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), + offsetof(ELF::Elf64_Ehdr, e_shoff)); NumSectionsOffset = offsetof(ELF::Elf64_Ehdr, e_shnum); } else { uint32_t Val = SectionHeaderOffset; if (sys::IsLittleEndianHost != IsLittleEndian) sys::swapByteOrder(Val); - OS.pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), - offsetof(ELF::Elf32_Ehdr, e_shoff)); + getStream().pwrite(reinterpret_cast<char *>(&Val), sizeof(Val), + offsetof(ELF::Elf32_Ehdr, e_shoff)); NumSectionsOffset = offsetof(ELF::Elf32_Ehdr, e_shnum); } - OS.pwrite(reinterpret_cast<char *>(&NumSections), sizeof(NumSections), - NumSectionsOffset); + getStream().pwrite(reinterpret_cast<char *>(&NumSections), + sizeof(NumSections), NumSectionsOffset); } bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( diff --git a/contrib/llvm/lib/MC/MCAsmBackend.cpp b/contrib/llvm/lib/MC/MCAsmBackend.cpp index 36c65b7..fcf139b 100644 --- a/contrib/llvm/lib/MC/MCAsmBackend.cpp +++ b/contrib/llvm/lib/MC/MCAsmBackend.cpp @@ -16,6 +16,10 @@ MCAsmBackend::MCAsmBackend() : HasDataInCodeSupport(false) {} MCAsmBackend::~MCAsmBackend() {} +bool MCAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const { + return false; +} + const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { static const MCFixupKindInfo Builtins[] = { {"FK_Data_1", 0, 8, 0}, diff --git a/contrib/llvm/lib/MC/MCAsmInfo.cpp b/contrib/llvm/lib/MC/MCAsmInfo.cpp index 100dc7c..36e10b3 100644 --- a/contrib/llvm/lib/MC/MCAsmInfo.cpp +++ b/contrib/llvm/lib/MC/MCAsmInfo.cpp @@ -157,3 +157,9 @@ bool MCAsmInfo::isValidUnquotedName(StringRef Name) const { return true; } + +bool MCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + // FIXME: Does .section .bss/.data/.text work everywhere?? + return SectionName == ".text" || SectionName == ".data" || + (SectionName == ".bss" && !usesELFSectionDirectiveForBSS()); +} diff --git a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp index 97fc76a..5b9dd20 100644 --- a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp +++ b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp @@ -37,8 +37,7 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() { UseIntegratedAssembler = true; - // FIXME: For now keep the previous behavior, AShr. Need to double-check - // other COFF-targeting assemblers and change this if necessary. + // At least MSVC inline-asm does AShr. 
UseLogicalShr = false; } diff --git a/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp index bb90ff2..ae9486d 100644 --- a/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp +++ b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp @@ -93,9 +93,4 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() { UseIntegratedAssembler = true; SetDirectiveSuppressesReloc = true; - - // FIXME: For now keep the previous behavior, AShr, matching the previous - // behavior of as(1) (both -q and -Q: resp. LLVM and gas v1.38). - // If/when this changes, the AArch64 Darwin special case can go away. - UseLogicalShr = false; } diff --git a/contrib/llvm/lib/MC/MCAsmStreamer.cpp b/contrib/llvm/lib/MC/MCAsmStreamer.cpp index 227c937..c99ce77 100644 --- a/contrib/llvm/lib/MC/MCAsmStreamer.cpp +++ b/contrib/llvm/lib/MC/MCAsmStreamer.cpp @@ -1,4 +1,4 @@ -//===- lib/MC/MCAsmStreamer.cpp - Text Assembly Output --------------------===// +//===- lib/MC/MCAsmStreamer.cpp - Text Assembly Output ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -29,9 +29,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include <cctype> + using namespace llvm; namespace { @@ -78,6 +80,9 @@ public: } EmitCommentsAndEOL(); } + + void EmitSyntaxDirective() override; + void EmitCommentsAndEOL(); /// isVerboseAsm - Return true if this streamer supports verbose assembly at @@ -160,7 +165,7 @@ public: void EmitBytes(StringRef Data) override; void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc = SMLoc()) override; + SMLoc Loc = SMLoc()) override; void EmitIntValue(uint64_t Value, unsigned Size) override; void EmitULEB128Value(const MCExpr *Value) override; @@ -181,7 +186,7 @@ public: void EmitCodeAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit = 0) override; - bool EmitValueToOffset(const MCExpr *Offset, + void emitValueToOffset(const MCExpr *Offset, unsigned char Value = 0) override; void EmitFileDirective(StringRef Filename) override; @@ -207,6 +212,8 @@ public: void EmitCFISameValue(int64_t Register) override; void EmitCFIRelOffset(int64_t Register, int64_t Offset) override; void EmitCFIAdjustCfaOffset(int64_t Adjustment) override; + void EmitCFIEscape(StringRef Values) override; + void EmitCFIGnuArgsSize(int64_t Size) override; void EmitCFISignalFrame() override; void EmitCFIUndefined(int64_t Register) override; void EmitCFIRegister(int64_t Register1, int64_t Register2) override; @@ -233,6 +240,9 @@ public: void EmitBundleLock(bool AlignToEnd) override; void EmitBundleUnlock() override; + bool EmitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) override; + /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is /// indicated by the hasRawTextSupport() predicate. @@ -250,15 +260,9 @@ public: void MCAsmStreamer::AddComment(const Twine &T) { if (!IsVerboseAsm) return; - // Make sure that CommentStream is flushed. - CommentStream.flush(); - T.toVector(CommentToEmit); // Each comment goes on its own line. CommentToEmit.push_back('\n'); - - // Tell the comment stream that the vector changed underneath it. 
- CommentStream.resync(); } void MCAsmStreamer::EmitCommentsAndEOL() { @@ -267,7 +271,6 @@ void MCAsmStreamer::EmitCommentsAndEOL() { return; } - CommentStream.flush(); StringRef Comments = CommentToEmit; assert(Comments.back() == '\n' && @@ -282,8 +285,6 @@ void MCAsmStreamer::EmitCommentsAndEOL() { } while (!Comments.empty()); CommentToEmit.clear(); - // Tell the comment stream that the vector changed underneath it. - CommentStream.resync(); } static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) { @@ -372,6 +373,8 @@ void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) { void MCAsmStreamer::EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor, unsigned Update) { switch (Kind) { + case MCVM_WatchOSVersionMin: OS << "\t.watchos_version_min"; break; + case MCVM_TvOSVersionMin: OS << "\t.tvos_version_min"; break; case MCVM_IOSVersionMin: OS << "\t.ios_version_min"; break; case MCVM_OSXVersionMin: OS << "\t.macosx_version_min"; break; } @@ -480,6 +483,14 @@ void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { EmitEOL(); } +void MCAsmStreamer::EmitSyntaxDirective() { + if (MAI->getAssemblerDialect() == 1) + OS << "\t.intel_syntax noprefix\n"; + // FIXME: Currently emit unprefix'ed registers. + // The intel_syntax directive has one optional argument + // which may have a value of prefix or noprefix. +} + void MCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) { OS << "\t.def\t "; Symbol->print(OS, MAI); @@ -531,9 +542,6 @@ void MCAsmStreamer::emitELFSize(MCSymbolELF *Symbol, const MCExpr *Value) { void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { - // Common symbols do not belong to any actual section. - AssignSection(Symbol, nullptr); - OS << "\t.comm\t"; Symbol->print(OS, MAI); OS << ',' << Size; @@ -553,9 +561,6 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, /// @param Size - The size of the common symbol. void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlign) { - // Common symbols do not belong to any actual section. - AssignSection(Symbol, nullptr); - OS << "\t.lcomm\t"; Symbol->print(OS, MAI); OS << ',' << Size; @@ -579,7 +584,7 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { if (Symbol) - AssignSection(Symbol, Section); + AssignFragment(Symbol, &Section->getDummyFragment()); // Note: a .zerofill directive does not switch sections. OS << ".zerofill "; @@ -603,7 +608,7 @@ void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, // e.g. _a. void MCAsmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { - AssignSection(Symbol, Section); + AssignFragment(Symbol, &Section->getDummyFragment()); assert(Symbol && "Symbol shouldn't be NULL!"); // Instead of using the Section we'll just use the shortcut.
@@ -654,7 +659,6 @@ static void PrintQuotedString(StringRef Data, raw_ostream &OS) { OS << '"'; } - void MCAsmStreamer::EmitBytes(StringRef Data) { assert(getCurrentSection().first && "Cannot emit contents before setting section!"); @@ -685,7 +689,7 @@ void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size) { } void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { assert(Size <= 8 && "Invalid size"); assert(getCurrentSection().first && "Cannot emit contents before setting section!"); @@ -776,7 +780,6 @@ void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) { EmitEOL(); } - /// EmitFill - Emit NumBytes bytes worth of the value specified by /// FillValue. This implements directives such as '.space'. void MCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) { @@ -856,17 +859,15 @@ void MCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment, 1, MaxBytesToEmit); } -bool MCAsmStreamer::EmitValueToOffset(const MCExpr *Offset, +void MCAsmStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value) { // FIXME: Verify that Offset is associated with the current section. OS << ".org "; Offset->print(OS, MAI); OS << ", " << (unsigned)Value; EmitEOL(); - return false; } - void MCAsmStreamer::EmitFileDirective(StringRef Filename) { assert(MAI->hasSingleParameterDotFile()); OS << "\t.file\t"; @@ -1014,6 +1015,32 @@ void MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) { EmitEOL(); } +static void PrintCFIEscape(llvm::formatted_raw_ostream &OS, StringRef Values) { + OS << "\t.cfi_escape "; + if (!Values.empty()) { + size_t e = Values.size() - 1; + for (size_t i = 0; i < e; ++i) + OS << format("0x%02x", uint8_t(Values[i])) << ", "; + OS << format("0x%02x", uint8_t(Values[e])); + } +} + +void MCAsmStreamer::EmitCFIEscape(StringRef Values) { + MCStreamer::EmitCFIEscape(Values); + PrintCFIEscape(OS, Values); + EmitEOL(); +} + +void MCAsmStreamer::EmitCFIGnuArgsSize(int64_t Size) { + MCStreamer::EmitCFIGnuArgsSize(Size); + + uint8_t Buffer[16] = { dwarf::DW_CFA_GNU_args_size }; + unsigned Len = encodeULEB128(Size, Buffer + 1) + 1; + + PrintCFIEscape(OS, StringRef((const char *)&Buffer[0], Len)); + EmitEOL(); +} + void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) { MCStreamer::EmitCFIDefCfaRegister(Register); OS << "\t.cfi_def_cfa_register "; @@ -1203,7 +1230,7 @@ void MCAsmStreamer::EmitWinCFIPushFrame(bool Code) { EmitEOL(); } -void MCAsmStreamer::EmitWinCFIEndProlog(void) { +void MCAsmStreamer::EmitWinCFIEndProlog() { MCStreamer::EmitWinCFIEndProlog(); OS << "\t.seh_endprologue"; @@ -1217,7 +1244,6 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, SmallVector<MCFixup, 4> Fixups; raw_svector_ostream VecOS(Code); Emitter->encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); // If we are showing fixups, create symbolic markers in the encoded // representation. We do this by making a per-bit map to the fixup item index, @@ -1334,6 +1360,19 @@ void MCAsmStreamer::EmitBundleUnlock() { EmitEOL(); } +bool MCAsmStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc) { + OS << "\t.reloc "; + Offset.print(OS, MAI); + OS << ", " << Name; + if (Expr) { + OS << ", "; + Expr->print(OS, MAI); + } + EmitEOL(); + return false; +} + /// EmitRawText - If this file is backed by an assembly streamer, this dumps /// the specified string in the output .s file. This capability is /// indicated by the hasRawTextSupport() predicate. 
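EmitCFIGnuArgsSize above materializes DW_CFA_GNU_args_size plus a ULEB128-encoded operand and routes the raw bytes through .cfi_escape. The encoding is the standard base-128 scheme, seven payload bits per byte, low bits first, with the high bit flagging continuation; a standalone sketch (0x2e for DW_CFA_GNU_args_size comes from the DWARF GNU extensions):

#include <cassert>
#include <cstdint>
#include <vector>

// Standard ULEB128: 7 bits per byte, least significant group first, high
// bit set on every byte except the last.
static unsigned encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  unsigned Count = 0;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;
    Out.push_back(Byte);
    ++Count;
  } while (Value != 0);
  return Count;
}

int main() {
  // DW_CFA_GNU_args_size (0x2e) followed by its ULEB128 operand: the same
  // byte sequence the streamer above prints as ".cfi_escape 0x2e, ...".
  std::vector<uint8_t> Buf{0x2e};
  encodeULEB128(144, Buf);
  assert(Buf.size() == 3 && Buf[1] == 0x90 && Buf[2] == 0x01);
  return 0;
}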
diff --git a/contrib/llvm/lib/MC/MCAssembler.cpp b/contrib/llvm/lib/MC/MCAssembler.cpp index f53b589..15e82fa 100644 --- a/contrib/llvm/lib/MC/MCAssembler.cpp +++ b/contrib/llvm/lib/MC/MCAssembler.cpp @@ -64,272 +64,11 @@ STATISTIC(RelaxedInstructions, "Number of relaxed instructions"); /* *** */ -MCAsmLayout::MCAsmLayout(MCAssembler &Asm) - : Assembler(Asm), LastValidFragment() - { - // Compute the section layout order. Virtual sections must go last. - for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) - if (!it->isVirtualSection()) - SectionOrder.push_back(&*it); - for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) - if (it->isVirtualSection()) - SectionOrder.push_back(&*it); -} - -bool MCAsmLayout::isFragmentValid(const MCFragment *F) const { - const MCSection *Sec = F->getParent(); - const MCFragment *LastValid = LastValidFragment.lookup(Sec); - if (!LastValid) - return false; - assert(LastValid->getParent() == Sec); - return F->getLayoutOrder() <= LastValid->getLayoutOrder(); -} - -void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) { - // If this fragment wasn't already valid, we don't need to do anything. - if (!isFragmentValid(F)) - return; - - // Otherwise, reset the last valid fragment to the previous fragment - // (if this is the first fragment, it will be NULL). - LastValidFragment[F->getParent()] = F->getPrevNode(); -} - -void MCAsmLayout::ensureValid(const MCFragment *F) const { - MCSection *Sec = F->getParent(); - MCFragment *Cur = LastValidFragment[Sec]; - if (!Cur) - Cur = Sec->begin(); - else - Cur = Cur->getNextNode(); - - // Advance the layout position until the fragment is valid. - while (!isFragmentValid(F)) { - assert(Cur && "Layout bookkeeping error"); - const_cast<MCAsmLayout*>(this)->layoutFragment(Cur); - Cur = Cur->getNextNode(); - } -} - -uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const { - ensureValid(F); - assert(F->Offset != ~UINT64_C(0) && "Address not set!"); - return F->Offset; -} - -// Simple getSymbolOffset helper for the non-varibale case. -static bool getLabelOffset(const MCAsmLayout &Layout, const MCSymbol &S, - bool ReportError, uint64_t &Val) { - if (!S.getFragment()) { - if (ReportError) - report_fatal_error("unable to evaluate offset to undefined symbol '" + - S.getName() + "'"); - return false; - } - Val = Layout.getFragmentOffset(S.getFragment()) + S.getOffset(); - return true; -} - -static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, const MCSymbol &S, - bool ReportError, uint64_t &Val) { - if (!S.isVariable()) - return getLabelOffset(Layout, S, ReportError, Val); - - // If SD is a variable, evaluate it. 
- MCValue Target; - if (!S.getVariableValue()->evaluateAsRelocatable(Target, &Layout, nullptr)) - report_fatal_error("unable to evaluate offset for variable '" + - S.getName() + "'"); - - uint64_t Offset = Target.getConstant(); - - const MCSymbolRefExpr *A = Target.getSymA(); - if (A) { - uint64_t ValA; - if (!getLabelOffset(Layout, A->getSymbol(), ReportError, ValA)) - return false; - Offset += ValA; - } - - const MCSymbolRefExpr *B = Target.getSymB(); - if (B) { - uint64_t ValB; - if (!getLabelOffset(Layout, B->getSymbol(), ReportError, ValB)) - return false; - Offset -= ValB; - } - - Val = Offset; - return true; -} - -bool MCAsmLayout::getSymbolOffset(const MCSymbol &S, uint64_t &Val) const { - return getSymbolOffsetImpl(*this, S, false, Val); -} - -uint64_t MCAsmLayout::getSymbolOffset(const MCSymbol &S) const { - uint64_t Val; - getSymbolOffsetImpl(*this, S, true, Val); - return Val; -} - -const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const { - if (!Symbol.isVariable()) - return &Symbol; - - const MCExpr *Expr = Symbol.getVariableValue(); - MCValue Value; - if (!Expr->evaluateAsValue(Value, *this)) - llvm_unreachable("Invalid Expression"); - - const MCSymbolRefExpr *RefB = Value.getSymB(); - if (RefB) - Assembler.getContext().reportFatalError( - SMLoc(), Twine("symbol '") + RefB->getSymbol().getName() + - "' could not be evaluated in a subtraction expression"); - - const MCSymbolRefExpr *A = Value.getSymA(); - if (!A) - return nullptr; - - const MCSymbol &ASym = A->getSymbol(); - const MCAssembler &Asm = getAssembler(); - if (ASym.isCommon()) { - // FIXME: we should probably add a SMLoc to MCExpr. - Asm.getContext().reportFatalError(SMLoc(), - "Common symbol " + ASym.getName() + - " cannot be used in assignment expr"); - } - - return &ASym; -} - -uint64_t MCAsmLayout::getSectionAddressSize(const MCSection *Sec) const { - // The size is the last fragment's end offset. - const MCFragment &F = Sec->getFragmentList().back(); - return getFragmentOffset(&F) + getAssembler().computeFragmentSize(*this, F); -} - -uint64_t MCAsmLayout::getSectionFileSize(const MCSection *Sec) const { - // Virtual sections have no file size. - if (Sec->isVirtualSection()) - return 0; - - // Otherwise, the file size is the same as the address space size. - return getSectionAddressSize(Sec); -} - -uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler, - const MCFragment *F, - uint64_t FOffset, uint64_t FSize) { - uint64_t BundleSize = Assembler.getBundleAlignSize(); - assert(BundleSize > 0 && - "computeBundlePadding should only be called if bundling is enabled"); - uint64_t BundleMask = BundleSize - 1; - uint64_t OffsetInBundle = FOffset & BundleMask; - uint64_t EndOfFragment = OffsetInBundle + FSize; - - // There are two kinds of bundling restrictions: - // - // 1) For alignToBundleEnd(), add padding to ensure that the fragment will - // *end* on a bundle boundary. - // 2) Otherwise, check if the fragment would cross a bundle boundary. If it - // would, add padding until the end of the bundle so that the fragment - // will start in a new one. - if (F->alignToBundleEnd()) { - // Three possibilities here: - // - // A) The fragment just happens to end at a bundle boundary, so we're good. - // B) The fragment ends before the current bundle boundary: pad it just - // enough to reach the boundary. - // C) The fragment ends after the current bundle boundary: pad it until it - // reaches the end of the next bundle boundary. 
- // - // Note: this code could be made shorter with some modulo trickery, but it's - // intentionally kept in its more explicit form for simplicity. - if (EndOfFragment == BundleSize) - return 0; - else if (EndOfFragment < BundleSize) - return BundleSize - EndOfFragment; - else { // EndOfFragment > BundleSize - return 2 * BundleSize - EndOfFragment; - } - } else if (OffsetInBundle > 0 && EndOfFragment > BundleSize) - return BundleSize - OffsetInBundle; - else - return 0; -} - -/* *** */ - -void ilist_node_traits<MCFragment>::deleteNode(MCFragment *V) { - V->destroy(); -} - -MCFragment::MCFragment() : Kind(FragmentType(~0)), HasInstructions(false), - AlignToBundleEnd(false), BundlePadding(0) { -} - -MCFragment::~MCFragment() { } - -MCFragment::MCFragment(FragmentType Kind, bool HasInstructions, - uint8_t BundlePadding, MCSection *Parent) - : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false), - BundlePadding(BundlePadding), Parent(Parent), Atom(nullptr), - Offset(~UINT64_C(0)) { - if (Parent) - Parent->getFragmentList().push_back(this); -} - -void MCFragment::destroy() { - // First check if we are the sentinal. - if (Kind == FragmentType(~0)) { - delete this; - return; - } - - switch (Kind) { - case FT_Align: - delete cast<MCAlignFragment>(this); - return; - case FT_Data: - delete cast<MCDataFragment>(this); - return; - case FT_CompactEncodedInst: - delete cast<MCCompactEncodedInstFragment>(this); - return; - case FT_Fill: - delete cast<MCFillFragment>(this); - return; - case FT_Relaxable: - delete cast<MCRelaxableFragment>(this); - return; - case FT_Org: - delete cast<MCOrgFragment>(this); - return; - case FT_Dwarf: - delete cast<MCDwarfLineAddrFragment>(this); - return; - case FT_DwarfFrame: - delete cast<MCDwarfCallFrameFragment>(this); - return; - case FT_LEB: - delete cast<MCLEBFragment>(this); - return; - case FT_SafeSEH: - delete cast<MCSafeSEHFragment>(this); - return; - } -} - -/* *** */ - MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_, - MCCodeEmitter &Emitter_, MCObjectWriter &Writer_, - raw_ostream &OS_) + MCCodeEmitter &Emitter_, MCObjectWriter &Writer_) : Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_), - OS(OS_), BundleAlignSize(0), RelaxAll(false), - SubsectionsViaSymbols(false), ELFHeaderEFlags(0) { + BundleAlignSize(0), RelaxAll(false), SubsectionsViaSymbols(false), + IncrementalLinkerCompatible(false), ELFHeaderEFlags(0) { VersionMinInfo.Major = 0; // Major version == 0 for "none specified" } @@ -347,6 +86,7 @@ void MCAssembler::reset() { BundleAlignSize = 0; RelaxAll = false; SubsectionsViaSymbols = false; + IncrementalLinkerCompatible = false; ELFHeaderEFlags = 0; LOHContainer.reset(); VersionMinInfo.Major = 0; @@ -358,6 +98,14 @@ void MCAssembler::reset() { getLOHContainer().reset(); } +bool MCAssembler::registerSection(MCSection &Section) { + if (Section.isRegistered()) + return false; + Sections.push_back(&Section); + Section.setIsRegistered(true); + return true; +} + bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const { if (ThumbFuncs.count(Symbol)) return true; @@ -404,7 +152,7 @@ const MCSymbol *MCAssembler::getAtom(const MCSymbol &S) const { return &S; // Absolute and undefined symbols have no defining atom. 
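computeBundlePadding, deleted from MCAssembler.cpp above (the fragment machinery appears to move elsewhere in this commit), spells out its two bundling rules in its comment. A standalone restatement under the usual assumptions, a power-of-two bundle size and a fragment no larger than one bundle:

#include <cassert>
#include <cstdint>

// Restatement of the bundle-padding rules described in the removed comment.
static uint64_t computePadding(uint64_t BundleSize, bool AlignToBundleEnd,
                               uint64_t FOffset, uint64_t FSize) {
  uint64_t OffsetInBundle = FOffset & (BundleSize - 1);
  uint64_t EndOfFragment = OffsetInBundle + FSize;

  if (AlignToBundleEnd) {
    // Rule 1: pad so the fragment *ends* on a bundle boundary (cases A/B/C).
    if (EndOfFragment == BundleSize)
      return 0;                            // A: already ends on a boundary
    if (EndOfFragment < BundleSize)
      return BundleSize - EndOfFragment;   // B: pad up to this boundary
    return 2 * BundleSize - EndOfFragment; // C: pad into the next bundle
  }
  // Rule 2: only pad when the fragment would straddle a boundary, pushing
  // it to the start of the next bundle.
  if (OffsetInBundle > 0 && EndOfFragment > BundleSize)
    return BundleSize - OffsetInBundle;
  return 0;
}

int main() {
  assert(computePadding(16, false, 14, 4) == 2); // would straddle: pad to 16
  assert(computePadding(16, false, 0, 16) == 0); // exactly one bundle
  assert(computePadding(16, true, 4, 8) == 4);   // end at offset 16, not 12
  return 0;
}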
- if (!S.getFragment()) + if (!S.isInSection()) return nullptr; // Non-linker visible symbols in sections which can't be atomized have no @@ -426,8 +174,13 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, // probably merge the two into a single callback that tries to evaluate a // fixup and records a relocation if one is needed. const MCExpr *Expr = Fixup.getValue(); - if (!Expr->evaluateAsRelocatable(Target, &Layout, &Fixup)) - getContext().reportFatalError(Fixup.getLoc(), "expected relocatable expression"); + if (!Expr->evaluateAsRelocatable(Target, &Layout, &Fixup)) { + getContext().reportError(Fixup.getLoc(), "expected relocatable expression"); + // Claim to have completely evaluated the fixup, to prevent any further + // processing from being done. + Value = 0; + return true; + } bool IsPCRel = Backend.getFixupKindInfo( Fixup.getKind()).Flags & MCFixupKindInfo::FKF_IsPCRel; @@ -523,12 +276,19 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, case MCFragment::FT_Org: { const MCOrgFragment &OF = cast<MCOrgFragment>(F); - int64_t TargetLocation; - if (!OF.getOffset().evaluateAsAbsolute(TargetLocation, Layout)) + MCValue Value; + if (!OF.getOffset().evaluateAsValue(Value, Layout)) report_fatal_error("expected assembly-time absolute expression"); // FIXME: We need a way to communicate this error. uint64_t FragmentOffset = Layout.getFragmentOffset(&OF); + int64_t TargetLocation = Value.getConstant(); + if (const MCSymbolRefExpr *A = Value.getSymA()) { + uint64_t Val; + if (!Layout.getSymbolOffset(A->getSymbol(), Val)) + report_fatal_error("expected absolute expression"); + TargetLocation += Val; + } int64_t Size = TargetLocation - FragmentOffset; if (Size < 0 || Size >= 0x40000000) report_fatal_error("invalid .org offset '" + Twine(TargetLocation) + @@ -540,6 +300,8 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, return cast<MCDwarfLineAddrFragment>(F).getContents().size(); case MCFragment::FT_DwarfFrame: return cast<MCDwarfCallFrameFragment>(F).getContents().size(); + case MCFragment::FT_Dummy: + llvm_unreachable("Should not have been added"); } llvm_unreachable("invalid fragment kind"); @@ -773,6 +535,8 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, OW->writeBytes(CF.getContents()); break; } + case MCFragment::FT_Dummy: + llvm_unreachable("Should not have been added"); } assert(OW->getStream().tell() - Start == FragmentSize && @@ -786,15 +550,14 @@ void MCAssembler::writeSectionData(const MCSection *Sec, assert(Layout.getSectionFileSize(Sec) == 0 && "Invalid size for section!"); // Check that contents are only things legal inside a virtual section. - for (MCSection::const_iterator it = Sec->begin(), ie = Sec->end(); it != ie; - ++it) { - switch (it->getKind()) { + for (const MCFragment &F : *Sec) { + switch (F.getKind()) { default: llvm_unreachable("Invalid fragment in virtual section!"); case MCFragment::FT_Data: { // Check that we aren't trying to write a non-zero contents (or fixups) // into a virtual section. This is to support clients which use standard // directives to fill the contents of virtual sections. 
- const MCDataFragment &DF = cast<MCDataFragment>(*it); + const MCDataFragment &DF = cast<MCDataFragment>(F); assert(DF.fixup_begin() == DF.fixup_end() && "Cannot have fixups in virtual section!"); for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i) @@ -810,13 +573,13 @@ void MCAssembler::writeSectionData(const MCSection *Sec, case MCFragment::FT_Align: // Check that we aren't trying to write a non-zero value into a virtual // section. - assert((cast<MCAlignFragment>(it)->getValueSize() == 0 || - cast<MCAlignFragment>(it)->getValue() == 0) && + assert((cast<MCAlignFragment>(F).getValueSize() == 0 || + cast<MCAlignFragment>(F).getValue() == 0) && "Invalid align in virtual section!"); break; case MCFragment::FT_Fill: - assert((cast<MCFillFragment>(it)->getValueSize() == 0 || - cast<MCFillFragment>(it)->getValue() == 0) && + assert((cast<MCFillFragment>(F).getValueSize() == 0 || + cast<MCFillFragment>(F).getValue() == 0) && "Invalid fill in virtual section!"); break; } @@ -828,9 +591,8 @@ void MCAssembler::writeSectionData(const MCSection *Sec, uint64_t Start = getWriter().getStream().tell(); (void)Start; - for (MCSection::const_iterator it = Sec->begin(), ie = Sec->end(); it != ie; - ++it) - writeFragment(*this, Layout, *it); + for (const MCFragment &F : *Sec) + writeFragment(*this, Layout, F); assert(getWriter().getStream().tell() - Start == Layout.getSectionAddressSize(Sec)); @@ -854,23 +616,20 @@ std::pair<uint64_t, bool> MCAssembler::handleFixup(const MCAsmLayout &Layout, return std::make_pair(FixedValue, IsPCRel); } -void MCAssembler::Finish() { +void MCAssembler::layout(MCAsmLayout &Layout) { DEBUG_WITH_TYPE("mc-dump", { llvm::errs() << "assembler backend - pre-layout\n--\n"; dump(); }); - // Create the layout object. - MCAsmLayout Layout(*this); - // Create dummy fragments and assign section ordinals. unsigned SectionIndex = 0; - for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { + for (MCSection &Sec : *this) { // Create dummy fragments to eliminate any empty sections, this simplifies // layout. - if (it->getFragmentList().empty()) - new MCDataFragment(&*it); + if (Sec.getFragmentList().empty()) + new MCDataFragment(&Sec); - it->setOrdinal(SectionIndex++); + Sec.setOrdinal(SectionIndex++); } // Assign layout order indices to sections and fragments. @@ -879,9 +638,8 @@ void MCAssembler::Finish() { Sec->setLayoutOrder(i); unsigned FragmentIndex = 0; - for (MCSection::iterator iFrag = Sec->begin(), iFragEnd = Sec->end(); - iFrag != iFragEnd; ++iFrag) - iFrag->setLayoutOrder(FragmentIndex++); + for (MCFragment &Frag : *Sec) + Frag.setLayoutOrder(FragmentIndex++); } // Layout until everything fits. @@ -899,17 +657,14 @@ void MCAssembler::Finish() { llvm::errs() << "assembler backend - final-layout\n--\n"; dump(); }); - uint64_t StartOffset = OS.tell(); - // Allow the object writer a chance to perform post-layout binding (for // example, to set the index fields in the symbol data). getWriter().executePostLayoutBinding(*this, Layout); // Evaluate and apply the fixups, generating relocation entries as necessary. - for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { - for (MCSection::iterator it2 = it->begin(), ie2 = it->end(); it2 != ie2; - ++it2) { - MCEncodedFragment *F = dyn_cast<MCEncodedFragment>(it2); + for (MCSection &Sec : *this) { + for (MCFragment &Frag : Sec) { + MCEncodedFragment *F = dyn_cast<MCEncodedFragment>(&Frag); // Data and relaxable fragments both have fixups. So only process // those here. 
// FIXME: Is there a better way to do this? MCEncodedFragmentWithFixups @@ -935,6 +690,15 @@ void MCAssembler::Finish() { } } } +} + +void MCAssembler::Finish() { + // Create the layout object. + MCAsmLayout Layout(*this); + layout(Layout); + + raw_ostream &OS = getWriter().getStream(); + uint64_t StartOffset = OS.tell(); // Write the object file. getWriter().writeObject(*this, Layout); @@ -960,9 +724,8 @@ bool MCAssembler::fragmentNeedsRelaxation(const MCRelaxableFragment *F, if (!getBackend().mayNeedRelaxation(F->getInst())) return false; - for (MCRelaxableFragment::const_fixup_iterator it = F->fixup_begin(), - ie = F->fixup_end(); it != ie; ++it) - if (fixupNeedsRelaxation(*it, F, Layout)) + for (const MCFixup &Fixup : F->getFixups()) + if (fixupNeedsRelaxation(Fixup, F, Layout)) return true; return false; @@ -991,7 +754,6 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout, SmallString<256> Code; raw_svector_ostream VecOS(Code); getEmitter().encodeInstruction(Relaxed, VecOS, Fixups, F.getSubtargetInfo()); - VecOS.flush(); // Update the fragment. F.setInst(Relaxed); @@ -1014,7 +776,6 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { encodeSLEB128(Value, OSE); else encodeULEB128(Value, OSE); - OSE.flush(); return OldSize != LF.getContents().size(); } @@ -1031,8 +792,8 @@ bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, SmallString<8> &Data = DF.getContents(); Data.clear(); raw_svector_ostream OSE(Data); - MCDwarfLineAddr::Encode(Context, LineDelta, AddrDelta, OSE); - OSE.flush(); + MCDwarfLineAddr::Encode(Context, getDWARFLinetableParams(), LineDelta, + AddrDelta, OSE); return OldSize != Data.size(); } @@ -1048,7 +809,6 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout, Data.clear(); raw_svector_ostream OSE(Data); MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OSE); - OSE.flush(); return OldSize != Data.size(); } @@ -1085,7 +845,7 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) { break; } if (RelaxedFrag && !FirstRelaxedFragment) - FirstRelaxedFragment = I; + FirstRelaxedFragment = &*I; } if (FirstRelaxedFragment) { Layout.invalidateFragmentsFrom(FirstRelaxedFragment); @@ -1113,158 +873,3 @@ void MCAssembler::finishLayout(MCAsmLayout &Layout) { Layout.getFragmentOffset(&*Layout.getSectionOrder()[i]->rbegin()); } } - -// Debugging methods - -namespace llvm { - -raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) { - OS << "<MCFixup" << " Offset:" << AF.getOffset() - << " Value:" << *AF.getValue() - << " Kind:" << AF.getKind() << ">"; - return OS; -} - -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void MCFragment::dump() { - raw_ostream &OS = llvm::errs(); - - OS << "<"; - switch (getKind()) { - case MCFragment::FT_Align: OS << "MCAlignFragment"; break; - case MCFragment::FT_Data: OS << "MCDataFragment"; break; - case MCFragment::FT_CompactEncodedInst: - OS << "MCCompactEncodedInstFragment"; break; - case MCFragment::FT_Fill: OS << "MCFillFragment"; break; - case MCFragment::FT_Relaxable: OS << "MCRelaxableFragment"; break; - case MCFragment::FT_Org: OS << "MCOrgFragment"; break; - case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break; - case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break; - case MCFragment::FT_LEB: OS << "MCLEBFragment"; break; - case MCFragment::FT_SafeSEH: OS << "MCSafeSEHFragment"; break; - } - - OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder - << " Offset:" << Offset - << " HasInstructions:" << 
hasInstructions() - << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">"; - - switch (getKind()) { - case MCFragment::FT_Align: { - const MCAlignFragment *AF = cast<MCAlignFragment>(this); - if (AF->hasEmitNops()) - OS << " (emit nops)"; - OS << "\n "; - OS << " Alignment:" << AF->getAlignment() - << " Value:" << AF->getValue() << " ValueSize:" << AF->getValueSize() - << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; - break; - } - case MCFragment::FT_Data: { - const MCDataFragment *DF = cast<MCDataFragment>(this); - OS << "\n "; - OS << " Contents:["; - const SmallVectorImpl<char> &Contents = DF->getContents(); - for (unsigned i = 0, e = Contents.size(); i != e; ++i) { - if (i) OS << ","; - OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); - } - OS << "] (" << Contents.size() << " bytes)"; - - if (DF->fixup_begin() != DF->fixup_end()) { - OS << ",\n "; - OS << " Fixups:["; - for (MCDataFragment::const_fixup_iterator it = DF->fixup_begin(), - ie = DF->fixup_end(); it != ie; ++it) { - if (it != DF->fixup_begin()) OS << ",\n "; - OS << *it; - } - OS << "]"; - } - break; - } - case MCFragment::FT_CompactEncodedInst: { - const MCCompactEncodedInstFragment *CEIF = - cast<MCCompactEncodedInstFragment>(this); - OS << "\n "; - OS << " Contents:["; - const SmallVectorImpl<char> &Contents = CEIF->getContents(); - for (unsigned i = 0, e = Contents.size(); i != e; ++i) { - if (i) OS << ","; - OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); - } - OS << "] (" << Contents.size() << " bytes)"; - break; - } - case MCFragment::FT_Fill: { - const MCFillFragment *FF = cast<MCFillFragment>(this); - OS << " Value:" << FF->getValue() << " ValueSize:" << FF->getValueSize() - << " Size:" << FF->getSize(); - break; - } - case MCFragment::FT_Relaxable: { - const MCRelaxableFragment *F = cast<MCRelaxableFragment>(this); - OS << "\n "; - OS << " Inst:"; - F->getInst().dump_pretty(OS); - break; - } - case MCFragment::FT_Org: { - const MCOrgFragment *OF = cast<MCOrgFragment>(this); - OS << "\n "; - OS << " Offset:" << OF->getOffset() << " Value:" << OF->getValue(); - break; - } - case MCFragment::FT_Dwarf: { - const MCDwarfLineAddrFragment *OF = cast<MCDwarfLineAddrFragment>(this); - OS << "\n "; - OS << " AddrDelta:" << OF->getAddrDelta() - << " LineDelta:" << OF->getLineDelta(); - break; - } - case MCFragment::FT_DwarfFrame: { - const MCDwarfCallFrameFragment *CF = cast<MCDwarfCallFrameFragment>(this); - OS << "\n "; - OS << " AddrDelta:" << CF->getAddrDelta(); - break; - } - case MCFragment::FT_LEB: { - const MCLEBFragment *LF = cast<MCLEBFragment>(this); - OS << "\n "; - OS << " Value:" << LF->getValue() << " Signed:" << LF->isSigned(); - break; - } - case MCFragment::FT_SafeSEH: { - const MCSafeSEHFragment *F = cast<MCSafeSEHFragment>(this); - OS << "\n "; - OS << " Sym:" << F->getSymbol(); - break; - } - } - OS << ">"; -} - -void MCAssembler::dump() { - raw_ostream &OS = llvm::errs(); - - OS << "<MCAssembler\n"; - OS << " Sections:[\n "; - for (iterator it = begin(), ie = end(); it != ie; ++it) { - if (it != begin()) OS << ",\n "; - it->dump(); - } - OS << "],\n"; - OS << " Symbols:["; - - for (symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { - if (it != symbol_begin()) OS << ",\n "; - OS << "("; - it->dump(); - OS << ", Index:" << it->getIndex() << ", "; - OS << ")"; - } - OS << "]>\n"; -} -#endif diff --git a/contrib/llvm/lib/MC/MCContext.cpp b/contrib/llvm/lib/MC/MCContext.cpp index a85796c..b5ad518 100644 --- 
a/contrib/llvm/lib/MC/MCContext.cpp +++ b/contrib/llvm/lib/MC/MCContext.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCSymbolMachO.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" @@ -41,7 +42,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), DwarfLocSeen(false), GenDwarfForAssembly(false), GenDwarfFileNumber(0), DwarfVersion(4), AllowTemporaryLabels(true), DwarfCompileUnitID(0), - AutoReset(DoAutoReset) { + AutoReset(DoAutoReset), HadError(false) { std::error_code EC = llvm::sys::fs::current_path(CompilationDir); if (EC) @@ -62,9 +63,6 @@ MCContext::~MCContext() { // NOTE: The symbols are all allocated out of a bump pointer allocator, // we don't need to free them here. - - // If the stream for the .secure_log_unique directive was created free it. - delete (raw_ostream *)SecureLog; } //===----------------------------------------------------------------------===// @@ -73,13 +71,11 @@ MCContext::~MCContext() { void MCContext::reset() { // Call the destructors so the fragments are freed - for (auto &I : ELFUniquingMap) - I.second->~MCSectionELF(); - for (auto &I : COFFUniquingMap) - I.second->~MCSectionCOFF(); - for (auto &I : MachOUniquingMap) - I.second->~MCSectionMachO(); + COFFAllocator.DestroyAll(); + ELFAllocator.DestroyAll(); + MachOAllocator.DestroyAll(); + MCSubtargetAllocator.DestroyAll(); UsedNames.clear(); Symbols.clear(); SectionSymbols.clear(); @@ -103,6 +99,8 @@ void MCContext::reset() { DwarfLocSeen = false; GenDwarfForAssembly = false; GenDwarfFileNumber = 0; + + HadError = false; } //===----------------------------------------------------------------------===// @@ -294,8 +292,8 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section, Begin = createTempSymbol(BeginSymName, false); // Otherwise, return a new section. 
- return Entry = new (*this) MCSectionMachO(Segment, Section, TypeAndAttributes, - Reserved2, Kind, Begin); + return Entry = new (MachOAllocator.Allocate()) MCSectionMachO( + Segment, Section, TypeAndAttributes, Reserved2, Kind, Begin); } void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) { @@ -322,7 +320,7 @@ MCSectionELF *MCContext::createELFRelSection(StringRef Name, unsigned Type, bool Inserted; std::tie(I, Inserted) = ELFRelSecNames.insert(std::make_pair(Name, true)); - return new (*this) + return new (ELFAllocator.Allocate()) MCSectionELF(I->getKey(), Type, Flags, SectionKind::getReadOnly(), EntrySize, Group, true, nullptr, Associated); } @@ -367,15 +365,15 @@ MCSectionELF *MCContext::getELFSection(StringRef Section, unsigned Type, if (BeginSymName) Begin = createTempSymbol(BeginSymName, false); - MCSectionELF *Result = - new (*this) MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, - GroupSym, UniqueID, Begin, Associated); + MCSectionELF *Result = new (ELFAllocator.Allocate()) + MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID, + Begin, Associated); Entry.second = Result; return Result; } MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) { - MCSectionELF *Result = new (*this) + MCSectionELF *Result = new (ELFAllocator.Allocate()) MCSectionELF(".group", ELF::SHT_GROUP, 0, SectionKind::getReadOnly(), 4, Group, ~0, nullptr, nullptr); return Result; @@ -404,7 +402,7 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, Begin = createTempSymbol(BeginSymName, false); StringRef CachedName = Iter->first.SectionName; - MCSectionCOFF *Result = new (*this) MCSectionCOFF( + MCSectionCOFF *Result = new (COFFAllocator.Allocate()) MCSectionCOFF( CachedName, Characteristics, COMDATSymbol, Selection, Kind, Begin); Iter->second = Result; @@ -441,6 +439,10 @@ MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec, COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE); } +MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) { + return *new (MCSubtargetAllocator.Allocate()) MCSubtargetInfo(STI); +} + //===----------------------------------------------------------------------===// // Dwarf Management //===----------------------------------------------------------------------===// @@ -472,14 +474,24 @@ void MCContext::finalizeDwarfSections(MCStreamer &MCOS) { [&](MCSection *Sec) { return !MCOS.mayHaveInstructions(*Sec); }); } -void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) const { - // If we have a source manager and a location, use it. Otherwise just - // use the generic report_fatal_error(). - if (!SrcMgr || Loc == SMLoc()) +//===----------------------------------------------------------------------===// +// Error Reporting +//===----------------------------------------------------------------------===// + +void MCContext::reportError(SMLoc Loc, const Twine &Msg) { + HadError = true; + + // If we have a source manager use it. Otherwise just use the generic + // report_fatal_error(). + if (!SrcMgr) report_fatal_error(Msg, false); // Use the source manager to print the message. SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg); +} + +void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) { + reportError(Loc, Msg); // If we reached here, we are failing ungracefully. 
Run the interrupt handlers // to make sure any special cleanups get done, in particular that we remove diff --git a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp index 716d76a..82063fb 100644 --- a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp +++ b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp @@ -125,7 +125,6 @@ void LLVMDisasmDispose(LLVMDisasmContextRef DCR){ static void emitComments(LLVMDisasmContext *DC, formatted_raw_ostream &FormattedOS) { // Flush the stream before taking its content. - DC->CommentStream.flush(); StringRef Comments = DC->CommentsToEmit.str(); // Get the default information for printing a comment. const MCAsmInfo *MAI = DC->getAsmInfo(); @@ -147,7 +146,6 @@ static void emitComments(LLVMDisasmContext *DC, // Tell the comment stream that the vector changed underneath it. DC->CommentsToEmit.clear(); - DC->CommentStream.resync(); } /// \brief Gets latency information for \p Inst from the itinerary @@ -261,7 +259,6 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes, return 0; case MCDisassembler::Success: { - Annotations.flush(); StringRef AnnotationsStr = Annotations.str(); SmallVector<char, 64> InsnStr; @@ -273,7 +270,6 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes, emitLatency(DC, Inst); emitComments(DC, FormattedOS); - OS.flush(); assert(OutStringSize != 0 && "Output buffer cannot be zero size"); size_t OutputSize = std::min(OutStringSize-1, InsnStr.size()); diff --git a/contrib/llvm/lib/MC/MCDwarf.cpp b/contrib/llvm/lib/MC/MCDwarf.cpp index c84c486..dafa768 100644 --- a/contrib/llvm/lib/MC/MCDwarf.cpp +++ b/contrib/llvm/lib/MC/MCDwarf.cpp @@ -27,26 +27,8 @@ #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -using namespace llvm; - -// Given a special op, return the address skip amount (in units of -// DWARF2_LINE_MIN_INSN_LENGTH. -#define SPECIAL_ADDR(op) (((op) - DWARF2_LINE_OPCODE_BASE)/DWARF2_LINE_RANGE) - -// The maximum address skip amount that can be encoded with a special op. -#define MAX_SPECIAL_ADDR_DELTA SPECIAL_ADDR(255) - -// First special line opcode - leave room for the standard opcodes. -// Note: If you want to change this, you'll have to update the -// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit(). -#define DWARF2_LINE_OPCODE_BASE 13 - -// Minimum line offset in a special line info. opcode. This value -// was chosen to give a reasonable range of values. -#define DWARF2_LINE_BASE -5 -// Range of line offsets in a special line info. opcode. -#define DWARF2_LINE_RANGE 14 +using namespace llvm; static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) { unsigned MinInsnLength = Context.getAsmInfo()->getMinInstAlignment(); @@ -197,7 +179,8 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section, // // This emits the Dwarf file and the line tables. // -void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS) { +void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS, + MCDwarfLineTableParams Params) { MCContext &context = MCOS->getContext(); auto &LineTables = context.getMCDwarfLineTables(); @@ -212,14 +195,17 @@ void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS) { // Handle the rest of the Compile Units. 
for (const auto &CUIDTablePair : LineTables) - CUIDTablePair.second.EmitCU(MCOS); + CUIDTablePair.second.EmitCU(MCOS, Params); } -void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS) const { - MCOS.EmitLabel(Header.Emit(&MCOS, None).second); +void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS, + MCDwarfLineTableParams Params) const { + MCOS.EmitLabel(Header.Emit(&MCOS, Params, None).second); } -std::pair<MCSymbol *, MCSymbol *> MCDwarfLineTableHeader::Emit(MCStreamer *MCOS) const { +std::pair<MCSymbol *, MCSymbol *> +MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, + MCDwarfLineTableParams Params) const { static const char StandardOpcodeLengths[] = { 0, // length of DW_LNS_copy 1, // length of DW_LNS_advance_pc @@ -234,9 +220,10 @@ std::pair<MCSymbol *, MCSymbol *> MCDwarfLineTableHeader::Emit(MCStreamer *MCOS) 0, // length of DW_LNS_set_epilogue_begin 1 // DW_LNS_set_isa }; - assert(array_lengthof(StandardOpcodeLengths) == - (DWARF2_LINE_OPCODE_BASE - 1)); - return Emit(MCOS, StandardOpcodeLengths); + assert(array_lengthof(StandardOpcodeLengths) >= + (Params.DWARF2LineOpcodeBase - 1U)); + return Emit(MCOS, Params, makeArrayRef(StandardOpcodeLengths, + Params.DWARF2LineOpcodeBase - 1)); } static const MCExpr *forceExpAbs(MCStreamer &OS, const MCExpr* Expr) { @@ -256,9 +243,8 @@ static void emitAbsValue(MCStreamer &OS, const MCExpr *Value, unsigned Size) { } std::pair<MCSymbol *, MCSymbol *> -MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, +MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, ArrayRef<char> StandardOpcodeLengths) const { - MCContext &context = MCOS->getContext(); // Create a symbol at the beginning of the line table. @@ -293,8 +279,8 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, // Parameters of the state machine, are next. MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1); MCOS->EmitIntValue(DWARF2_LINE_DEFAULT_IS_STMT, 1); - MCOS->EmitIntValue(DWARF2_LINE_BASE, 1); - MCOS->EmitIntValue(DWARF2_LINE_RANGE, 1); + MCOS->EmitIntValue(Params.DWARF2LineBase, 1); + MCOS->EmitIntValue(Params.DWARF2LineRange, 1); MCOS->EmitIntValue(StandardOpcodeLengths.size() + 1, 1); // Standard opcode lengths @@ -329,8 +315,9 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, return std::make_pair(LineStartSym, LineEndSym); } -void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS) const { - MCSymbol *LineEndSym = Header.Emit(MCOS).second; +void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS, + MCDwarfLineTableParams Params) const { + MCSymbol *LineEndSym = Header.Emit(MCOS, Params).second; // Put out the line tables. for (const auto &LineSec : MCLineSections.getMCLineEntries()) @@ -416,21 +403,31 @@ unsigned MCDwarfLineTableHeader::getFile(StringRef &Directory, } /// Utility function to emit the encoding to a streamer. -void MCDwarfLineAddr::Emit(MCStreamer *MCOS, int64_t LineDelta, - uint64_t AddrDelta) { +void MCDwarfLineAddr::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params, + int64_t LineDelta, uint64_t AddrDelta) { MCContext &Context = MCOS->getContext(); SmallString<256> Tmp; raw_svector_ostream OS(Tmp); - MCDwarfLineAddr::Encode(Context, LineDelta, AddrDelta, OS); + MCDwarfLineAddr::Encode(Context, Params, LineDelta, AddrDelta, OS); MCOS->EmitBytes(OS.str()); } +/// Given a special op, return the address skip amount (in units of +/// DWARF2_LINE_MIN_INSN_LENGTH). 
+static uint64_t SpecialAddr(MCDwarfLineTableParams Params, uint64_t op) { + return (op - Params.DWARF2LineOpcodeBase) / Params.DWARF2LineRange; +} + /// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas. -void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, - uint64_t AddrDelta, raw_ostream &OS) { +void MCDwarfLineAddr::Encode(MCContext &Context, MCDwarfLineTableParams Params, + int64_t LineDelta, uint64_t AddrDelta, + raw_ostream &OS) { uint64_t Temp, Opcode; bool NeedCopy = false; + // The maximum address skip amount that can be encoded with a special op. + uint64_t MaxSpecialAddrDelta = SpecialAddr(Params, 255); + // Scale the address delta by the minimum instruction length. AddrDelta = ScaleAddrDelta(Context, AddrDelta); @@ -438,7 +435,7 @@ void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the // end_sequence to emit the matrix entry. if (LineDelta == INT64_MAX) { - if (AddrDelta == MAX_SPECIAL_ADDR_DELTA) + if (AddrDelta == MaxSpecialAddrDelta) OS << char(dwarf::DW_LNS_const_add_pc); else if (AddrDelta) { OS << char(dwarf::DW_LNS_advance_pc); @@ -451,16 +448,16 @@ void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, } // Bias the line delta by the base. - Temp = LineDelta - DWARF2_LINE_BASE; + Temp = LineDelta - Params.DWARF2LineBase; // If the line increment is out of range of a special opcode, we must encode // it with DW_LNS_advance_line. - if (Temp >= DWARF2_LINE_RANGE) { + if (Temp >= Params.DWARF2LineRange) { OS << char(dwarf::DW_LNS_advance_line); encodeSLEB128(LineDelta, OS); LineDelta = 0; - Temp = 0 - DWARF2_LINE_BASE; + Temp = 0 - Params.DWARF2LineBase; NeedCopy = true; } @@ -471,19 +468,19 @@ void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta, } // Bias the opcode by the special opcode base. - Temp += DWARF2_LINE_OPCODE_BASE; + Temp += Params.DWARF2LineOpcodeBase; // Avoid overflow when addr_delta is large. - if (AddrDelta < 256 + MAX_SPECIAL_ADDR_DELTA) { + if (AddrDelta < 256 + MaxSpecialAddrDelta) { // Try using a special opcode. - Opcode = Temp + AddrDelta * DWARF2_LINE_RANGE; + Opcode = Temp + AddrDelta * Params.DWARF2LineRange; if (Opcode <= 255) { OS << char(Opcode); return; } // Try using DW_LNS_const_add_pc followed by special op. - Opcode = Temp + (AddrDelta - MAX_SPECIAL_ADDR_DELTA) * DWARF2_LINE_RANGE; + Opcode = Temp + (AddrDelta - MaxSpecialAddrDelta) * Params.DWARF2LineRange; if (Opcode <= 255) { OS << char(dwarf::DW_LNS_const_add_pc); OS << char(Opcode); @@ -517,10 +514,14 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) { MCOS->EmitULEB128IntValue(1); MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit); MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1); - EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4); - if (MCOS->getContext().getGenDwarfSectionSyms().size() > 1 && - MCOS->getContext().getDwarfVersion() >= 3) { - EmitAbbrev(MCOS, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4); + EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, context.getDwarfVersion() >= 4 + ? dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4); + if (context.getGenDwarfSectionSyms().size() > 1 && + context.getDwarfVersion() >= 3) { + EmitAbbrev(MCOS, dwarf::DW_AT_ranges, context.getDwarfVersion() >= 4 + ? 
dwarf::DW_FORM_sec_offset + : dwarf::DW_FORM_data4); } else { EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr); EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr); @@ -845,7 +846,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { LineSectionSymbol = MCOS->getDwarfLineTableSymbol(0); MCSymbol *AbbrevSectionSymbol = nullptr; MCSymbol *InfoSectionSymbol = nullptr; - MCSymbol *RangesSectionSymbol = NULL; + MCSymbol *RangesSectionSymbol = nullptr; // Create end symbols for each section, and remove empty sections MCOS->getContext().finalizeDwarfSections(*MCOS); @@ -998,38 +999,29 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol, } namespace { - class FrameEmitterImpl { - int CFAOffset; - int InitialCFAOffset; - bool IsEH; - const MCSymbol *SectionStart; - public: - FrameEmitterImpl(bool isEH) - : CFAOffset(0), InitialCFAOffset(0), IsEH(isEH), SectionStart(nullptr) { - } - - void setSectionStart(const MCSymbol *Label) { SectionStart = Label; } - - /// Emit the unwind information in a compact way. - void EmitCompactUnwind(MCObjectStreamer &streamer, - const MCDwarfFrameInfo &frame); - - const MCSymbol &EmitCIE(MCObjectStreamer &streamer, - const MCSymbol *personality, - unsigned personalityEncoding, - const MCSymbol *lsda, - bool IsSignalFrame, - unsigned lsdaEncoding, - bool IsSimple); - MCSymbol *EmitFDE(MCObjectStreamer &streamer, - const MCSymbol &cieStart, - const MCDwarfFrameInfo &frame); - void EmitCFIInstructions(MCObjectStreamer &streamer, - ArrayRef<MCCFIInstruction> Instrs, - MCSymbol *BaseLabel); - void EmitCFIInstruction(MCObjectStreamer &Streamer, - const MCCFIInstruction &Instr); - }; +class FrameEmitterImpl { + int CFAOffset = 0; + int InitialCFAOffset = 0; + bool IsEH; + MCObjectStreamer &Streamer; + +public: + FrameEmitterImpl(bool IsEH, MCObjectStreamer &Streamer) + : IsEH(IsEH), Streamer(Streamer) {} + + /// Emit the unwind information in a compact way. + void EmitCompactUnwind(const MCDwarfFrameInfo &frame); + + const MCSymbol &EmitCIE(const MCSymbol *personality, + unsigned personalityEncoding, const MCSymbol *lsda, + bool IsSignalFrame, unsigned lsdaEncoding, + bool IsSimple); + void EmitFDE(const MCSymbol &cieStart, const MCDwarfFrameInfo &frame, + bool LastInSection, const MCSymbol &SectionStart); + void EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs, + MCSymbol *BaseLabel); + void EmitCFIInstruction(const MCCFIInstruction &Instr); +}; } // end anonymous namespace @@ -1037,8 +1029,7 @@ static void emitEncodingByte(MCObjectStreamer &Streamer, unsigned Encoding) { Streamer.EmitIntValue(Encoding, 1); } -void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer, - const MCCFIInstruction &Instr) { +void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) { int dataAlignmentFactor = getDataAlignmentFactor(Streamer); auto *MRI = Streamer.getContext().getRegisterInfo(); @@ -1150,6 +1141,11 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer, Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1); return; } + case MCCFIInstruction::OpGnuArgsSize: { + Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1); + Streamer.EmitULEB128IntValue(Instr.getOffset()); + return; + } case MCCFIInstruction::OpEscape: Streamer.EmitBytes(Instr.getValues()); return; @@ -1158,8 +1154,7 @@ void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer, } /// Emit frame instructions to describe the layout of the frame. 
-void FrameEmitterImpl::EmitCFIInstructions(MCObjectStreamer &streamer, - ArrayRef<MCCFIInstruction> Instrs, +void FrameEmitterImpl::EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs, MCSymbol *BaseLabel) { for (unsigned i = 0, N = Instrs.size(); i < N; ++i) { const MCCFIInstruction &Instr = Instrs[i]; @@ -1171,18 +1166,17 @@ void FrameEmitterImpl::EmitCFIInstructions(MCObjectStreamer &streamer, if (BaseLabel && Label) { MCSymbol *ThisSym = Label; if (ThisSym != BaseLabel) { - streamer.EmitDwarfAdvanceFrameAddr(BaseLabel, ThisSym); + Streamer.EmitDwarfAdvanceFrameAddr(BaseLabel, ThisSym); BaseLabel = ThisSym; } } - EmitCFIInstruction(streamer, Instr); + EmitCFIInstruction(Instr); } } /// Emit the unwind information in a compact way. -void FrameEmitterImpl::EmitCompactUnwind(MCObjectStreamer &Streamer, - const MCDwarfFrameInfo &Frame) { +void FrameEmitterImpl::EmitCompactUnwind(const MCDwarfFrameInfo &Frame) { MCContext &Context = Streamer.getContext(); const MCObjectFileInfo *MOFI = Context.getObjectFileInfo(); @@ -1254,39 +1248,39 @@ static unsigned getCIEVersion(bool IsEH, unsigned DwarfVersion) { case 3: return 3; case 4: + case 5: return 4; } llvm_unreachable("Unknown version"); } -const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, - const MCSymbol *personality, +const MCSymbol &FrameEmitterImpl::EmitCIE(const MCSymbol *personality, unsigned personalityEncoding, const MCSymbol *lsda, bool IsSignalFrame, unsigned lsdaEncoding, bool IsSimple) { - MCContext &context = streamer.getContext(); + MCContext &context = Streamer.getContext(); const MCRegisterInfo *MRI = context.getRegisterInfo(); const MCObjectFileInfo *MOFI = context.getObjectFileInfo(); MCSymbol *sectionStart = context.createTempSymbol(); - streamer.EmitLabel(sectionStart); + Streamer.EmitLabel(sectionStart); MCSymbol *sectionEnd = context.createTempSymbol(); // Length - const MCExpr *Length = MakeStartMinusEndExpr(streamer, *sectionStart, - *sectionEnd, 4); - emitAbsValue(streamer, Length, 4); + const MCExpr *Length = + MakeStartMinusEndExpr(Streamer, *sectionStart, *sectionEnd, 4); + emitAbsValue(Streamer, Length, 4); // CIE ID unsigned CIE_ID = IsEH ? 
0 : -1; - streamer.EmitIntValue(CIE_ID, 4); + Streamer.EmitIntValue(CIE_ID, 4); // Version uint8_t CIEVersion = getCIEVersion(IsEH, context.getDwarfVersion()); - streamer.EmitIntValue(CIEVersion, 1); + Streamer.EmitIntValue(CIEVersion, 1); // Augmentation String SmallString<8> Augmentation; @@ -1299,31 +1293,31 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, Augmentation += "R"; if (IsSignalFrame) Augmentation += "S"; - streamer.EmitBytes(Augmentation); + Streamer.EmitBytes(Augmentation); } - streamer.EmitIntValue(0, 1); + Streamer.EmitIntValue(0, 1); if (CIEVersion >= 4) { // Address Size - streamer.EmitIntValue(context.getAsmInfo()->getPointerSize(), 1); + Streamer.EmitIntValue(context.getAsmInfo()->getPointerSize(), 1); // Segment Descriptor Size - streamer.EmitIntValue(0, 1); + Streamer.EmitIntValue(0, 1); } // Code Alignment Factor - streamer.EmitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment()); + Streamer.EmitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment()); // Data Alignment Factor - streamer.EmitSLEB128IntValue(getDataAlignmentFactor(streamer)); + Streamer.EmitSLEB128IntValue(getDataAlignmentFactor(Streamer)); // Return Address Register if (CIEVersion == 1) { assert(MRI->getRARegister() <= 255 && "DWARF 2 encodes return_address_register in one byte"); - streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), IsEH), 1); + Streamer.EmitIntValue(MRI->getDwarfRegNum(MRI->getRARegister(), IsEH), 1); } else { - streamer.EmitULEB128IntValue( + Streamer.EmitULEB128IntValue( MRI->getDwarfRegNum(MRI->getRARegister(), IsEH)); } @@ -1335,28 +1329,28 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, // Personality Encoding augmentationLength += 1; // Personality - augmentationLength += getSizeForEncoding(streamer, personalityEncoding); + augmentationLength += getSizeForEncoding(Streamer, personalityEncoding); } if (lsda) augmentationLength += 1; // Encoding of the FDE pointers augmentationLength += 1; - streamer.EmitULEB128IntValue(augmentationLength); + Streamer.EmitULEB128IntValue(augmentationLength); // Augmentation Data (optional) if (personality) { // Personality Encoding - emitEncodingByte(streamer, personalityEncoding); + emitEncodingByte(Streamer, personalityEncoding); // Personality - EmitPersonality(streamer, *personality, personalityEncoding); + EmitPersonality(Streamer, *personality, personalityEncoding); } if (lsda) - emitEncodingByte(streamer, lsdaEncoding); + emitEncodingByte(Streamer, lsdaEncoding); // Encoding of the FDE pointers - emitEncodingByte(streamer, MOFI->getFDEEncoding()); + emitEncodingByte(Streamer, MOFI->getFDEEncoding()); } // Initial Instructions @@ -1365,22 +1359,23 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, if (!IsSimple) { const std::vector<MCCFIInstruction> &Instructions = MAI->getInitialFrameState(); - EmitCFIInstructions(streamer, Instructions, nullptr); + EmitCFIInstructions(Instructions, nullptr); } InitialCFAOffset = CFAOffset; // Padding - streamer.EmitValueToAlignment(IsEH ? 4 : MAI->getPointerSize()); + Streamer.EmitValueToAlignment(IsEH ? 
4 : MAI->getPointerSize()); - streamer.EmitLabel(sectionEnd); + Streamer.EmitLabel(sectionEnd); return *sectionStart; } -MCSymbol *FrameEmitterImpl::EmitFDE(MCObjectStreamer &streamer, - const MCSymbol &cieStart, - const MCDwarfFrameInfo &frame) { - MCContext &context = streamer.getContext(); +void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart, + const MCDwarfFrameInfo &frame, + bool LastInSection, + const MCSymbol &SectionStart) { + MCContext &context = Streamer.getContext(); MCSymbol *fdeStart = context.createTempSymbol(); MCSymbol *fdeEnd = context.createTempSymbol(); const MCObjectFileInfo *MOFI = context.getObjectFileInfo(); @@ -1388,107 +1383,103 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCObjectStreamer &streamer, CFAOffset = InitialCFAOffset; // Length - const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0); - emitAbsValue(streamer, Length, 4); + const MCExpr *Length = MakeStartMinusEndExpr(Streamer, *fdeStart, *fdeEnd, 0); + emitAbsValue(Streamer, Length, 4); - streamer.EmitLabel(fdeStart); + Streamer.EmitLabel(fdeStart); // CIE Pointer const MCAsmInfo *asmInfo = context.getAsmInfo(); if (IsEH) { - const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart, - 0); - emitAbsValue(streamer, offset, 4); + const MCExpr *offset = + MakeStartMinusEndExpr(Streamer, cieStart, *fdeStart, 0); + emitAbsValue(Streamer, offset, 4); } else if (!asmInfo->doesDwarfUseRelocationsAcrossSections()) { - const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart, - cieStart, 0); - emitAbsValue(streamer, offset, 4); + const MCExpr *offset = + MakeStartMinusEndExpr(Streamer, SectionStart, cieStart, 0); + emitAbsValue(Streamer, offset, 4); } else { - streamer.EmitSymbolValue(&cieStart, 4); + Streamer.EmitSymbolValue(&cieStart, 4); } // PC Begin unsigned PCEncoding = IsEH ? MOFI->getFDEEncoding() : (unsigned)dwarf::DW_EH_PE_absptr; - unsigned PCSize = getSizeForEncoding(streamer, PCEncoding); - emitFDESymbol(streamer, *frame.Begin, PCEncoding, IsEH); + unsigned PCSize = getSizeForEncoding(Streamer, PCEncoding); + emitFDESymbol(Streamer, *frame.Begin, PCEncoding, IsEH); // PC Range - const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin, - *frame.End, 0); - emitAbsValue(streamer, Range, PCSize); + const MCExpr *Range = + MakeStartMinusEndExpr(Streamer, *frame.Begin, *frame.End, 0); + emitAbsValue(Streamer, Range, PCSize); if (IsEH) { // Augmentation Data Length unsigned augmentationLength = 0; if (frame.Lsda) - augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding); + augmentationLength += getSizeForEncoding(Streamer, frame.LsdaEncoding); - streamer.EmitULEB128IntValue(augmentationLength); + Streamer.EmitULEB128IntValue(augmentationLength); // Augmentation Data if (frame.Lsda) - emitFDESymbol(streamer, *frame.Lsda, frame.LsdaEncoding, true); + emitFDESymbol(Streamer, *frame.Lsda, frame.LsdaEncoding, true); } // Call Frame Instructions - EmitCFIInstructions(streamer, frame.Instructions, frame.Begin); + EmitCFIInstructions(frame.Instructions, frame.Begin); // Padding - streamer.EmitValueToAlignment(PCSize); + // The size of a .eh_frame section has to be a multiple of the alignment + // since a null CIE is interpreted as the end. Old systems overaligned + // .eh_frame, so we do too and account for it in the last FDE. + unsigned Align = LastInSection ? 
asmInfo->getPointerSize() : PCSize; + Streamer.EmitValueToAlignment(Align); - return fdeEnd; + Streamer.EmitLabel(fdeEnd); } namespace { - struct CIEKey { - static const CIEKey getEmptyKey() { - return CIEKey(nullptr, 0, -1, false, false); - } - static const CIEKey getTombstoneKey() { - return CIEKey(nullptr, -1, 0, false, false); - } +struct CIEKey { + static const CIEKey getEmptyKey() { + return CIEKey(nullptr, 0, -1, false, false); + } + static const CIEKey getTombstoneKey() { + return CIEKey(nullptr, -1, 0, false, false); + } - CIEKey(const MCSymbol *Personality_, unsigned PersonalityEncoding_, - unsigned LsdaEncoding_, bool IsSignalFrame_, bool IsSimple_) - : Personality(Personality_), PersonalityEncoding(PersonalityEncoding_), - LsdaEncoding(LsdaEncoding_), IsSignalFrame(IsSignalFrame_), - IsSimple(IsSimple_) {} - const MCSymbol *Personality; - unsigned PersonalityEncoding; - unsigned LsdaEncoding; - bool IsSignalFrame; - bool IsSimple; - }; -} + CIEKey(const MCSymbol *Personality, unsigned PersonalityEncoding, + unsigned LsdaEncoding, bool IsSignalFrame, bool IsSimple) + : Personality(Personality), PersonalityEncoding(PersonalityEncoding), + LsdaEncoding(LsdaEncoding), IsSignalFrame(IsSignalFrame), + IsSimple(IsSimple) {} + const MCSymbol *Personality; + unsigned PersonalityEncoding; + unsigned LsdaEncoding; + bool IsSignalFrame; + bool IsSimple; +}; +} // anonymous namespace namespace llvm { - template <> - struct DenseMapInfo<CIEKey> { - static CIEKey getEmptyKey() { - return CIEKey::getEmptyKey(); - } - static CIEKey getTombstoneKey() { - return CIEKey::getTombstoneKey(); - } - static unsigned getHashValue(const CIEKey &Key) { - return static_cast<unsigned>(hash_combine(Key.Personality, - Key.PersonalityEncoding, - Key.LsdaEncoding, - Key.IsSignalFrame, - Key.IsSimple)); - } - static bool isEqual(const CIEKey &LHS, - const CIEKey &RHS) { - return LHS.Personality == RHS.Personality && - LHS.PersonalityEncoding == RHS.PersonalityEncoding && - LHS.LsdaEncoding == RHS.LsdaEncoding && - LHS.IsSignalFrame == RHS.IsSignalFrame && - LHS.IsSimple == RHS.IsSimple; - } - }; -} +template <> struct DenseMapInfo<CIEKey> { + static CIEKey getEmptyKey() { return CIEKey::getEmptyKey(); } + static CIEKey getTombstoneKey() { return CIEKey::getTombstoneKey(); } + static unsigned getHashValue(const CIEKey &Key) { + return static_cast<unsigned>( + hash_combine(Key.Personality, Key.PersonalityEncoding, Key.LsdaEncoding, + Key.IsSignalFrame, Key.IsSimple)); + } + static bool isEqual(const CIEKey &LHS, const CIEKey &RHS) { + return LHS.Personality == RHS.Personality && + LHS.PersonalityEncoding == RHS.PersonalityEncoding && + LHS.LsdaEncoding == RHS.LsdaEncoding && + LHS.IsSignalFrame == RHS.IsSignalFrame && + LHS.IsSimple == RHS.IsSimple; + } +}; +} // namespace llvm void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, bool IsEH) { @@ -1496,7 +1487,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, MCContext &Context = Streamer.getContext(); const MCObjectFileInfo *MOFI = Context.getObjectFileInfo(); - FrameEmitterImpl Emitter(IsEH); + FrameEmitterImpl Emitter(IsEH, Streamer); ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getDwarfFrameInfos(); // Emit the compact unwind info if available. 
@@ -1514,7 +1505,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, NeedsEHFrameSection |= Frame.CompactUnwindEncoding == MOFI->getCompactUnwindDwarfEHFrameOnly(); - Emitter.EmitCompactUnwind(Streamer, Frame); + Emitter.EmitCompactUnwind(Frame); } } @@ -1527,23 +1518,15 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, Streamer.SwitchSection(&Section); MCSymbol *SectionStart = Context.createTempSymbol(); Streamer.EmitLabel(SectionStart); - Emitter.setSectionStart(SectionStart); - MCSymbol *FDEEnd = nullptr; DenseMap<CIEKey, const MCSymbol *> CIEStarts; const MCSymbol *DummyDebugKey = nullptr; - NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame(); - for (unsigned i = 0, n = FrameArray.size(); i < n; ++i) { - const MCDwarfFrameInfo &Frame = FrameArray[i]; - - // Emit the label from the previous iteration - if (FDEEnd) { - Streamer.EmitLabel(FDEEnd); - FDEEnd = nullptr; - } - - if (!NeedsEHFrameSection && Frame.CompactUnwindEncoding != + bool CanOmitDwarf = MOFI->getOmitDwarfIfHaveCompactUnwind(); + for (auto I = FrameArray.begin(), E = FrameArray.end(); I != E;) { + const MCDwarfFrameInfo &Frame = *I; + ++I; + if (CanOmitDwarf && Frame.CompactUnwindEncoding != MOFI->getCompactUnwindDwarfEHFrameOnly()) // Don't generate an EH frame if we don't need one. I.e., it's taken care // of by the compact unwind encoding. @@ -1553,18 +1536,12 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, Frame.LsdaEncoding, Frame.IsSignalFrame, Frame.IsSimple); const MCSymbol *&CIEStart = IsEH ? CIEStarts[Key] : DummyDebugKey; if (!CIEStart) - CIEStart = &Emitter.EmitCIE(Streamer, Frame.Personality, - Frame.PersonalityEncoding, Frame.Lsda, - Frame.IsSignalFrame, - Frame.LsdaEncoding, - Frame.IsSimple); + CIEStart = &Emitter.EmitCIE(Frame.Personality, Frame.PersonalityEncoding, + Frame.Lsda, Frame.IsSignalFrame, + Frame.LsdaEncoding, Frame.IsSimple); - FDEEnd = Emitter.EmitFDE(Streamer, *CIEStart, Frame); + Emitter.EmitFDE(*CIEStart, Frame, I == E, *SectionStart); } - - Streamer.EmitValueToAlignment(Context.getAsmInfo()->getPointerSize()); - if (FDEEnd) - Streamer.EmitLabel(FDEEnd); } void MCDwarfFrameEmitter::EmitAdvanceLoc(MCObjectStreamer &Streamer, diff --git a/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp index bc0ba85..de645ca 100644 --- a/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp +++ b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp @@ -29,23 +29,7 @@ bool MCELFObjectTargetWriter::needsRelocateWithSymbol(const MCSymbol &Sym, return false; } -// ELF doesn't require relocations to be in any order. We sort by the Offset, -// just to match gnu as for easier comparison. The use type is an arbitrary way -// of making the sort deterministic. 
-static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) { - const ELFRelocationEntry &A = *AP; - const ELFRelocationEntry &B = *BP; - if (A.Offset != B.Offset) - return B.Offset - A.Offset; - if (B.Type != A.Type) - return A.Type - B.Type; - //llvm_unreachable("ELFRelocs might be unstable!"); - return 0; -} - - void MCELFObjectTargetWriter::sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) { - array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel); } diff --git a/contrib/llvm/lib/MC/MCELFStreamer.cpp b/contrib/llvm/lib/MC/MCELFStreamer.cpp index fe9ac21..06d161b 100644 --- a/contrib/llvm/lib/MC/MCELFStreamer.cpp +++ b/contrib/llvm/lib/MC/MCELFStreamer.cpp @@ -68,7 +68,6 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF, EF->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding)); Assembler.writeFragmentPadding(*EF, FSize, OW); - VecOS.flush(); delete OW; DF->getContents().append(Code.begin(), Code.end()); @@ -87,20 +86,10 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF, } void MCELFStreamer::InitSections(bool NoExecStack) { - // This emulates the same behavior of GNU as. This makes it easier - // to compare the output as the major sections are in the same order. MCContext &Ctx = getContext(); SwitchSection(Ctx.getObjectFileInfo()->getTextSection()); EmitCodeAlignment(4); - SwitchSection(Ctx.getObjectFileInfo()->getDataSection()); - EmitCodeAlignment(4); - - SwitchSection(Ctx.getObjectFileInfo()->getBSSSection()); - EmitCodeAlignment(4); - - SwitchSection(Ctx.getObjectFileInfo()->getTextSection()); - if (NoExecStack) SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx)); } @@ -112,7 +101,7 @@ void MCELFStreamer::EmitLabel(MCSymbol *S) { MCObjectStreamer::EmitLabel(Symbol); const MCSectionELF &Section = - static_cast<const MCSectionELF&>(Symbol->getSection()); + static_cast<const MCSectionELF &>(*getCurrentSectionOnly()); if (Section.getFlags() & ELF::SHF_TLS) Symbol->setType(ELF::STT_TLS); } @@ -134,7 +123,7 @@ void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { llvm_unreachable("invalid assembler flag!"); } -// If bundle aligment is used and there are any instructions in the section, it +// If bundle alignment is used and there are any instructions in the section, it // needs to be aligned to at least the bundle size. static void setSectionAlignmentForBundling(const MCAssembler &Assembler, MCSection *Section) { @@ -312,13 +301,20 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size, Symbol->setType(ELF::STT_OBJECT); if (Symbol->getBinding() == ELF::STB_LOCAL) { - MCSection *Section = getAssembler().getContext().getELFSection( + MCSection &Section = *getAssembler().getContext().getELFSection( ".bss", ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + MCSectionSubPair P = getCurrentSection(); + SwitchSection(&Section); - AssignSection(Symbol, Section); + EmitValueToAlignment(ByteAlignment, 0, 1, 0); + EmitLabel(Symbol); + EmitZeros(Size); - struct LocalCommon L = {Symbol, Size, ByteAlignment}; - LocalCommons.push_back(L); + // Update the maximum alignment of the section if necessary. 
+ if (ByteAlignment > Section.getAlignment()) + Section.setAlignment(ByteAlignment); + + SwitchSection(P.first, P.second); } else { if(Symbol->declareCommon(Size, ByteAlignment)) report_fatal_error("Symbol: " + Symbol->getName() + @@ -344,7 +340,7 @@ void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size, } void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { if (isBundleLocked()) report_fatal_error("Emitting values inside a locked bundle is forbidden"); fixSymbolsInTLSFixups(Value); @@ -480,7 +476,6 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst, SmallString<256> Code; raw_svector_ostream VecOS(Code); Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); for (unsigned i = 0, e = Fixups.size(); i != e; ++i) fixSymbolsInTLSFixups(Fixups[i].getValue()); @@ -603,7 +598,7 @@ void MCELFStreamer::EmitBundleUnlock() { report_fatal_error("Empty bundle-locked group is forbidden"); // When the -mc-relax-all flag is used, we emit instructions to fragments - // stored on a stack. When the bundle unlock is emited, we pop a fragment + // stored on a stack. When the bundle unlock is emitted, we pop a fragment // from the stack a merge it to the one below. if (getAssembler().getRelaxAll()) { assert(!BundleGroups.empty() && "There are no bundle groups"); @@ -625,29 +620,6 @@ void MCELFStreamer::EmitBundleUnlock() { Sec.setBundleLockState(MCSection::NotBundleLocked); } -void MCELFStreamer::Flush() { - for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(), - e = LocalCommons.end(); - i != e; ++i) { - const MCSymbol &Symbol = *i->Symbol; - uint64_t Size = i->Size; - unsigned ByteAlignment = i->ByteAlignment; - MCSection &Section = Symbol.getSection(); - - getAssembler().registerSection(Section); - new MCAlignFragment(ByteAlignment, 0, 1, ByteAlignment, &Section); - - MCFragment *F = new MCFillFragment(0, 0, Size, &Section); - Symbol.setFragment(F); - - // Update the maximum alignment of the section if necessary. - if (ByteAlignment > Section.getAlignment()) - Section.setAlignment(ByteAlignment); - } - - LocalCommons.clear(); -} - void MCELFStreamer::FinishImpl() { // Ensure the last section gets aligned if necessary. MCSection *CurSection = getCurrentSectionOnly(); @@ -655,8 +627,6 @@ void MCELFStreamer::FinishImpl() { EmitFrames(nullptr); - Flush(); - this->MCObjectStreamer::FinishImpl(); } diff --git a/contrib/llvm/lib/MC/MCExpr.cpp b/contrib/llvm/lib/MC/MCExpr.cpp index a30ceec..748644b 100644 --- a/contrib/llvm/lib/MC/MCExpr.cpp +++ b/contrib/llvm/lib/MC/MCExpr.cpp @@ -43,7 +43,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const { const MCSymbol &Sym = SRE.getSymbol(); // Parenthesize names that start with $ so that they don't look like // absolute names. 
- bool UseParens = Sym.getName()[0] == '$'; + bool UseParens = Sym.getName().size() && Sym.getName()[0] == '$'; if (UseParens) { OS << '('; Sym.print(OS, MAI); @@ -202,6 +202,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_SIZE: return "SIZE"; case VK_WEAKREF: return "WEAKREF"; case VK_ARM_NONE: return "none"; + case VK_ARM_GOT_PREL: return "GOT_PREL"; case VK_ARM_TARGET1: return "target1"; case VK_ARM_TARGET2: return "target2"; case VK_ARM_PREL31: return "prel31"; @@ -299,6 +300,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_Hexagon_LD_PLT: return "LDPLT"; case VK_Hexagon_IE: return "IE"; case VK_Hexagon_IE_GOT: return "IEGOT"; + case VK_WebAssembly_FUNCTION: return "FUNCTION"; case VK_TPREL: return "tprel"; case VK_DTPREL: return "dtprel"; } @@ -311,7 +313,6 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("got", VK_GOT) .Case("gotoff", VK_GOTOFF) .Case("gotpcrel", VK_GOTPCREL) - .Case("got_prel", VK_GOTPCREL) .Case("gottpoff", VK_GOTTPOFF) .Case("indntpoff", VK_INDNTPOFF) .Case("ntpoff", VK_NTPOFF) @@ -382,7 +383,15 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO) .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI) .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA) + .Case("gdgot", VK_Hexagon_GD_GOT) + .Case("gdplt", VK_Hexagon_GD_PLT) + .Case("iegot", VK_Hexagon_IE_GOT) + .Case("ie", VK_Hexagon_IE) + .Case("ldgot", VK_Hexagon_LD_GOT) + .Case("ldplt", VK_Hexagon_LD_PLT) + .Case("pcrel", VK_Hexagon_PCREL) .Case("none", VK_ARM_NONE) + .Case("got_prel", VK_ARM_GOT_PREL) .Case("target1", VK_ARM_TARGET1) .Case("target2", VK_ARM_TARGET2) .Case("prel31", VK_ARM_PREL31) @@ -477,7 +486,8 @@ static void AttemptToFoldSymbolOffsetDifference( if (!Asm->getWriter().isSymbolRefDifferenceFullyResolved(*Asm, A, B, InSet)) return; - if (SA.getFragment() == SB.getFragment()) { + if (SA.getFragment() == SB.getFragment() && !SA.isVariable() && + !SB.isVariable()) { Addend += (SA.getOffset() - SB.getOffset()); // Pointers to Thumb symbols need to have their low-bit set to allow @@ -583,11 +593,6 @@ EvaluateSymbolicAdd(const MCAssembler *Asm, const MCAsmLayout *Layout, const MCSymbolRefExpr *A = LHS_A ? LHS_A : RHS_A; const MCSymbolRefExpr *B = LHS_B ? LHS_B : RHS_B; - // If we have a negated symbol, then we must have also have a non-negated - // symbol in order to encode the expression. - if (B && !A) - return false; - Res = MCValue::get(A, B, Result_Cst); return true; } @@ -606,7 +611,7 @@ bool MCExpr::evaluateAsValue(MCValue &Res, const MCAsmLayout &Layout) const { true); } -static bool canExpand(const MCSymbol &Sym, const MCAssembler *Asm, bool InSet) { +static bool canExpand(const MCSymbol &Sym, bool InSet) { const MCExpr *Expr = Sym.getVariableValue(); const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr); if (Inner) { @@ -616,9 +621,7 @@ static bool canExpand(const MCSymbol &Sym, const MCAssembler *Asm, bool InSet) { if (InSet) return true; - if (!Asm) - return false; - return !Asm->getWriter().isWeak(Sym); + return !Sym.isInSection(); } bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, @@ -643,7 +646,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, // Evaluate recursively if this is a variable. 
if (Sym.isVariable() && SRE->getKind() == MCSymbolRefExpr::VK_None && - canExpand(Sym, Asm, InSet)) { + canExpand(Sym, InSet)) { bool IsMachO = SRE->hasSubsectionsViaSymbols(); if (Sym.getVariableValue()->evaluateAsRelocatableImpl( Res, Asm, Layout, Fixup, Addrs, InSet || IsMachO)) { @@ -739,7 +742,17 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, case MCBinaryExpr::AShr: Result = LHS >> RHS; break; case MCBinaryExpr::Add: Result = LHS + RHS; break; case MCBinaryExpr::And: Result = LHS & RHS; break; - case MCBinaryExpr::Div: Result = LHS / RHS; break; + case MCBinaryExpr::Div: + // Handle division by zero. gas just emits a warning and keeps going, + // we try to be stricter. + // FIXME: Currently the caller of this function has no way to understand + // we're bailing out because of 'division by zero'. Therefore, it will + // emit a 'expected relocatable expression' error. It would be nice to + // change this code to emit a better diagnostic. + if (RHS == 0) + return false; + Result = LHS / RHS; + break; case MCBinaryExpr::EQ: Result = LHS == RHS; break; case MCBinaryExpr::GT: Result = LHS > RHS; break; case MCBinaryExpr::GTE: Result = LHS >= RHS; break; @@ -765,45 +778,41 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, llvm_unreachable("Invalid assembly expression kind!"); } -MCSection *MCExpr::findAssociatedSection() const { +MCFragment *MCExpr::findAssociatedFragment() const { switch (getKind()) { case Target: // We never look through target specific expressions. - return cast<MCTargetExpr>(this)->findAssociatedSection(); + return cast<MCTargetExpr>(this)->findAssociatedFragment(); case Constant: - return MCSymbol::AbsolutePseudoSection; + return MCSymbol::AbsolutePseudoFragment; case SymbolRef: { const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(this); const MCSymbol &Sym = SRE->getSymbol(); - - if (Sym.isDefined()) - return &Sym.getSection(); - - return nullptr; + return Sym.getFragment(); } case Unary: - return cast<MCUnaryExpr>(this)->getSubExpr()->findAssociatedSection(); + return cast<MCUnaryExpr>(this)->getSubExpr()->findAssociatedFragment(); case Binary: { const MCBinaryExpr *BE = cast<MCBinaryExpr>(this); - MCSection *LHS_S = BE->getLHS()->findAssociatedSection(); - MCSection *RHS_S = BE->getRHS()->findAssociatedSection(); + MCFragment *LHS_F = BE->getLHS()->findAssociatedFragment(); + MCFragment *RHS_F = BE->getRHS()->findAssociatedFragment(); - // If either section is absolute, return the other. - if (LHS_S == MCSymbol::AbsolutePseudoSection) - return RHS_S; - if (RHS_S == MCSymbol::AbsolutePseudoSection) - return LHS_S; + // If either is absolute, return the other. + if (LHS_F == MCSymbol::AbsolutePseudoFragment) + return RHS_F; + if (RHS_F == MCSymbol::AbsolutePseudoFragment) + return LHS_F; // Not always correct, but probably the best we can do without more context. if (BE->getOpcode() == MCBinaryExpr::Sub) - return MCSymbol::AbsolutePseudoSection; + return MCSymbol::AbsolutePseudoFragment; - // Otherwise, return the first non-null section. - return LHS_S ? LHS_S : RHS_S; + // Otherwise, return the first non-null fragment. + return LHS_F ? 
LHS_F : RHS_F;
   }
   }
diff --git a/contrib/llvm/lib/MC/MCFragment.cpp b/contrib/llvm/lib/MC/MCFragment.cpp
new file mode 100644
index 0000000..efdb704
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCFragment.cpp
@@ -0,0 +1,458 @@
+//===- lib/MC/MCFragment.cpp - Assembler Fragment Implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCFragment.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <tuple>
+using namespace llvm;
+
+MCAsmLayout::MCAsmLayout(MCAssembler &Asm)
+    : Assembler(Asm), LastValidFragment()
+ {
+  // Compute the section layout order. Virtual sections must go last.
+  for (MCSection &Sec : Asm)
+    if (!Sec.isVirtualSection())
+      SectionOrder.push_back(&Sec);
+  for (MCSection &Sec : Asm)
+    if (Sec.isVirtualSection())
+      SectionOrder.push_back(&Sec);
+}
+
+bool MCAsmLayout::isFragmentValid(const MCFragment *F) const {
+  const MCSection *Sec = F->getParent();
+  const MCFragment *LastValid = LastValidFragment.lookup(Sec);
+  if (!LastValid)
+    return false;
+  assert(LastValid->getParent() == Sec);
+  return F->getLayoutOrder() <= LastValid->getLayoutOrder();
+}
+
+void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) {
+  // If this fragment wasn't already valid, we don't need to do anything.
+  if (!isFragmentValid(F))
+    return;
+
+  // Otherwise, reset the last valid fragment to the previous fragment
+  // (if this is the first fragment, it will be NULL).
+  LastValidFragment[F->getParent()] = F->getPrevNode();
+}
+
+void MCAsmLayout::ensureValid(const MCFragment *F) const {
+  MCSection *Sec = F->getParent();
+  MCSection::iterator I;
+  if (MCFragment *Cur = LastValidFragment[Sec])
+    I = ++MCSection::iterator(Cur);
+  else
+    I = Sec->begin();
+
+  // Advance the layout position until the fragment is valid.
+  while (!isFragmentValid(F)) {
+    assert(I != Sec->end() && "Layout bookkeeping error");
+    const_cast<MCAsmLayout *>(this)->layoutFragment(&*I);
+    ++I;
+  }
+}
+
+uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const {
+  ensureValid(F);
+  assert(F->Offset != ~UINT64_C(0) && "Address not set!");
+  return F->Offset;
+}
+
+// Simple getSymbolOffset helper for the non-variable case.
+static bool getLabelOffset(const MCAsmLayout &Layout, const MCSymbol &S,
+                           bool ReportError, uint64_t &Val) {
+  if (!S.getFragment()) {
+    if (ReportError)
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         S.getName() + "'");
+    return false;
+  }
+  Val = Layout.getFragmentOffset(S.getFragment()) + S.getOffset();
+  return true;
+}
+
+static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, const MCSymbol &S,
+                                bool ReportError, uint64_t &Val) {
+  if (!S.isVariable())
+    return getLabelOffset(Layout, S, ReportError, Val);
+
+  // If SD is a variable, evaluate it.
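// Illustrative sketch, not part of the patch: for a plain label,
// getLabelOffset above computes "fragment offset + offset within the
// fragment", and the variable case that follows folds the evaluated value as
// "SymA - SymB + Constant". A reduced standalone model, with hypothetical
// types:
#include <cstdint>
namespace sketch {
struct Frag  { uint64_t Offset; };
struct Label { const Frag *F; uint64_t OffsetInFrag; };
inline uint64_t labelOffset(const Label &L) {
  return L.F->Offset + L.OffsetInFrag;          // cf. getLabelOffset
}
// Variable symbol S = A - B + C: offset(S) = offset(A) - offset(B) + C.
inline uint64_t variableOffset(const Label &A, const Label &B, int64_t C) {
  return labelOffset(A) - labelOffset(B) + C;   // cf. getSymbolOffsetImpl
}
} // namespace sketch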
+ MCValue Target; + if (!S.getVariableValue()->evaluateAsValue(Target, Layout)) + report_fatal_error("unable to evaluate offset for variable '" + + S.getName() + "'"); + + uint64_t Offset = Target.getConstant(); + + const MCSymbolRefExpr *A = Target.getSymA(); + if (A) { + uint64_t ValA; + if (!getLabelOffset(Layout, A->getSymbol(), ReportError, ValA)) + return false; + Offset += ValA; + } + + const MCSymbolRefExpr *B = Target.getSymB(); + if (B) { + uint64_t ValB; + if (!getLabelOffset(Layout, B->getSymbol(), ReportError, ValB)) + return false; + Offset -= ValB; + } + + Val = Offset; + return true; +} + +bool MCAsmLayout::getSymbolOffset(const MCSymbol &S, uint64_t &Val) const { + return getSymbolOffsetImpl(*this, S, false, Val); +} + +uint64_t MCAsmLayout::getSymbolOffset(const MCSymbol &S) const { + uint64_t Val; + getSymbolOffsetImpl(*this, S, true, Val); + return Val; +} + +const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const { + if (!Symbol.isVariable()) + return &Symbol; + + const MCExpr *Expr = Symbol.getVariableValue(); + MCValue Value; + if (!Expr->evaluateAsValue(Value, *this)) { + Assembler.getContext().reportError( + SMLoc(), "expression could not be evaluated"); + return nullptr; + } + + const MCSymbolRefExpr *RefB = Value.getSymB(); + if (RefB) { + Assembler.getContext().reportError( + SMLoc(), Twine("symbol '") + RefB->getSymbol().getName() + + "' could not be evaluated in a subtraction expression"); + return nullptr; + } + + const MCSymbolRefExpr *A = Value.getSymA(); + if (!A) + return nullptr; + + const MCSymbol &ASym = A->getSymbol(); + const MCAssembler &Asm = getAssembler(); + if (ASym.isCommon()) { + // FIXME: we should probably add a SMLoc to MCExpr. + Asm.getContext().reportError(SMLoc(), + "Common symbol '" + ASym.getName() + + "' cannot be used in assignment expr"); + return nullptr; + } + + return &ASym; +} + +uint64_t MCAsmLayout::getSectionAddressSize(const MCSection *Sec) const { + // The size is the last fragment's end offset. + const MCFragment &F = Sec->getFragmentList().back(); + return getFragmentOffset(&F) + getAssembler().computeFragmentSize(*this, F); +} + +uint64_t MCAsmLayout::getSectionFileSize(const MCSection *Sec) const { + // Virtual sections have no file size. + if (Sec->isVirtualSection()) + return 0; + + // Otherwise, the file size is the same as the address space size. + return getSectionAddressSize(Sec); +} + +uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler, + const MCFragment *F, + uint64_t FOffset, uint64_t FSize) { + uint64_t BundleSize = Assembler.getBundleAlignSize(); + assert(BundleSize > 0 && + "computeBundlePadding should only be called if bundling is enabled"); + uint64_t BundleMask = BundleSize - 1; + uint64_t OffsetInBundle = FOffset & BundleMask; + uint64_t EndOfFragment = OffsetInBundle + FSize; + + // There are two kinds of bundling restrictions: + // + // 1) For alignToBundleEnd(), add padding to ensure that the fragment will + // *end* on a bundle boundary. + // 2) Otherwise, check if the fragment would cross a bundle boundary. If it + // would, add padding until the end of the bundle so that the fragment + // will start in a new one. + if (F->alignToBundleEnd()) { + // Three possibilities here: + // + // A) The fragment just happens to end at a bundle boundary, so we're good. + // B) The fragment ends before the current bundle boundary: pad it just + // enough to reach the boundary. 
+  // C) The fragment ends after the current bundle boundary: pad it until it
+  //    reaches the end of the next bundle boundary.
+  //
+  // Note: this code could be made shorter with some modulo trickery, but it's
+  // intentionally kept in its more explicit form for simplicity.
+    if (EndOfFragment == BundleSize)
+      return 0;
+    else if (EndOfFragment < BundleSize)
+      return BundleSize - EndOfFragment;
+    else { // EndOfFragment > BundleSize
+      return 2 * BundleSize - EndOfFragment;
+    }
+  } else if (OffsetInBundle > 0 && EndOfFragment > BundleSize)
+    return BundleSize - OffsetInBundle;
+  else
+    return 0;
+}
+
+/* *** */
+
+void ilist_node_traits<MCFragment>::deleteNode(MCFragment *V) {
+  V->destroy();
+}
+
+MCFragment::MCFragment() : Kind(FragmentType(~0)), HasInstructions(false),
+                           AlignToBundleEnd(false), BundlePadding(0) {
+}
+
+MCFragment::~MCFragment() { }
+
+MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
+                       uint8_t BundlePadding, MCSection *Parent)
+    : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false),
+      BundlePadding(BundlePadding), Parent(Parent), Atom(nullptr),
+      Offset(~UINT64_C(0)) {
+  if (Parent && !isDummy())
+    Parent->getFragmentList().push_back(this);
+}
+
+void MCFragment::destroy() {
+  // First check if we are the sentinel.
+  if (Kind == FragmentType(~0)) {
+    delete this;
+    return;
+  }
+
+  switch (Kind) {
+    case FT_Align:
+      delete cast<MCAlignFragment>(this);
+      return;
+    case FT_Data:
+      delete cast<MCDataFragment>(this);
+      return;
+    case FT_CompactEncodedInst:
+      delete cast<MCCompactEncodedInstFragment>(this);
+      return;
+    case FT_Fill:
+      delete cast<MCFillFragment>(this);
+      return;
+    case FT_Relaxable:
+      delete cast<MCRelaxableFragment>(this);
+      return;
+    case FT_Org:
+      delete cast<MCOrgFragment>(this);
+      return;
+    case FT_Dwarf:
+      delete cast<MCDwarfLineAddrFragment>(this);
+      return;
+    case FT_DwarfFrame:
+      delete cast<MCDwarfCallFrameFragment>(this);
+      return;
+    case FT_LEB:
+      delete cast<MCLEBFragment>(this);
+      return;
+    case FT_SafeSEH:
+      delete cast<MCSafeSEHFragment>(this);
+      return;
+    case FT_Dummy:
+      delete cast<MCDummyFragment>(this);
+      return;
+  }
+}
+
+/* *** */
+
+// Debugging methods
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
+  OS << "<MCFixup" << " Offset:" << AF.getOffset()
+     << " Value:" << *AF.getValue()
+     << " Kind:" << AF.getKind() << ">";
+  return OS;
+}
+
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCFragment::dump() {
+  raw_ostream &OS = llvm::errs();
+
+  OS << "<";
+  switch (getKind()) {
+  case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
+  case MCFragment::FT_Data:  OS << "MCDataFragment"; break;
+  case MCFragment::FT_CompactEncodedInst:
+    OS << "MCCompactEncodedInstFragment"; break;
+  case MCFragment::FT_Fill:  OS << "MCFillFragment"; break;
+  case MCFragment::FT_Relaxable:  OS << "MCRelaxableFragment"; break;
+  case MCFragment::FT_Org:   OS << "MCOrgFragment"; break;
+  case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break;
+  case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break;
+  case MCFragment::FT_LEB:   OS << "MCLEBFragment"; break;
+  case MCFragment::FT_SafeSEH:    OS << "MCSafeSEHFragment"; break;
+  case MCFragment::FT_Dummy:
+    OS << "MCDummyFragment";
+    break;
+  }
+
+  OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder
+     << " Offset:" << Offset
+     << " HasInstructions:" << hasInstructions()
+     << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">";
+
+  switch (getKind()) {
+  case
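// Worked example, not part of the patch: the padding rules implemented in
// computeBundlePadding above, assuming a 16-byte bundle:
//
//   Plain fragment, FOffset = 10, FSize = 8:
//     OffsetInBundle = 10, EndOfFragment = 18 > 16, so the fragment would
//     straddle a boundary; pad BundleSize - OffsetInBundle = 6 bytes so it
//     starts in a fresh bundle.
//
//   alignToBundleEnd fragment, FOffset = 4, FSize = 8:
//     EndOfFragment = 12 < 16; pad BundleSize - EndOfFragment = 4 bytes so
//     the fragment ends exactly on the boundary.
//
//   alignToBundleEnd fragment, FOffset = 4, FSize = 20:
//     EndOfFragment = 24 > 16; pad 2 * BundleSize - EndOfFragment = 8 bytes
//     so the fragment ends on the *next* boundary (4 + 8 + 20 == 32).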
MCFragment::FT_Align: { + const MCAlignFragment *AF = cast<MCAlignFragment>(this); + if (AF->hasEmitNops()) + OS << " (emit nops)"; + OS << "\n "; + OS << " Alignment:" << AF->getAlignment() + << " Value:" << AF->getValue() << " ValueSize:" << AF->getValueSize() + << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; + break; + } + case MCFragment::FT_Data: { + const MCDataFragment *DF = cast<MCDataFragment>(this); + OS << "\n "; + OS << " Contents:["; + const SmallVectorImpl<char> &Contents = DF->getContents(); + for (unsigned i = 0, e = Contents.size(); i != e; ++i) { + if (i) OS << ","; + OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); + } + OS << "] (" << Contents.size() << " bytes)"; + + if (DF->fixup_begin() != DF->fixup_end()) { + OS << ",\n "; + OS << " Fixups:["; + for (MCDataFragment::const_fixup_iterator it = DF->fixup_begin(), + ie = DF->fixup_end(); it != ie; ++it) { + if (it != DF->fixup_begin()) OS << ",\n "; + OS << *it; + } + OS << "]"; + } + break; + } + case MCFragment::FT_CompactEncodedInst: { + const MCCompactEncodedInstFragment *CEIF = + cast<MCCompactEncodedInstFragment>(this); + OS << "\n "; + OS << " Contents:["; + const SmallVectorImpl<char> &Contents = CEIF->getContents(); + for (unsigned i = 0, e = Contents.size(); i != e; ++i) { + if (i) OS << ","; + OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); + } + OS << "] (" << Contents.size() << " bytes)"; + break; + } + case MCFragment::FT_Fill: { + const MCFillFragment *FF = cast<MCFillFragment>(this); + OS << " Value:" << FF->getValue() << " ValueSize:" << FF->getValueSize() + << " Size:" << FF->getSize(); + break; + } + case MCFragment::FT_Relaxable: { + const MCRelaxableFragment *F = cast<MCRelaxableFragment>(this); + OS << "\n "; + OS << " Inst:"; + F->getInst().dump_pretty(OS); + break; + } + case MCFragment::FT_Org: { + const MCOrgFragment *OF = cast<MCOrgFragment>(this); + OS << "\n "; + OS << " Offset:" << OF->getOffset() << " Value:" << OF->getValue(); + break; + } + case MCFragment::FT_Dwarf: { + const MCDwarfLineAddrFragment *OF = cast<MCDwarfLineAddrFragment>(this); + OS << "\n "; + OS << " AddrDelta:" << OF->getAddrDelta() + << " LineDelta:" << OF->getLineDelta(); + break; + } + case MCFragment::FT_DwarfFrame: { + const MCDwarfCallFrameFragment *CF = cast<MCDwarfCallFrameFragment>(this); + OS << "\n "; + OS << " AddrDelta:" << CF->getAddrDelta(); + break; + } + case MCFragment::FT_LEB: { + const MCLEBFragment *LF = cast<MCLEBFragment>(this); + OS << "\n "; + OS << " Value:" << LF->getValue() << " Signed:" << LF->isSigned(); + break; + } + case MCFragment::FT_SafeSEH: { + const MCSafeSEHFragment *F = cast<MCSafeSEHFragment>(this); + OS << "\n "; + OS << " Sym:" << F->getSymbol(); + break; + } + case MCFragment::FT_Dummy: + break; + } + OS << ">"; +} + +void MCAssembler::dump() { + raw_ostream &OS = llvm::errs(); + + OS << "<MCAssembler\n"; + OS << " Sections:[\n "; + for (iterator it = begin(), ie = end(); it != ie; ++it) { + if (it != begin()) OS << ",\n "; + it->dump(); + } + OS << "],\n"; + OS << " Symbols:["; + + for (symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { + if (it != symbol_begin()) OS << ",\n "; + OS << "("; + it->dump(); + OS << ", Index:" << it->getIndex() << ", "; + OS << ")"; + } + OS << "]>\n"; +} +#endif diff --git a/contrib/llvm/lib/MC/MCInst.cpp b/contrib/llvm/lib/MC/MCInst.cpp index 7ef69be..5f829ae 100644 --- a/contrib/llvm/lib/MC/MCInst.cpp +++ b/contrib/llvm/lib/MC/MCInst.cpp @@ -23,6 +23,8 @@ void 
MCOperand::print(raw_ostream &OS) const { OS << "Reg:" << getReg(); else if (isImm()) OS << "Imm:" << getImm(); + else if (isFPImm()) + OS << "FPImm:" << getFPImm(); else if (isExpr()) { OS << "Expr:(" << *getExpr() << ")"; } else if (isInst()) { diff --git a/contrib/llvm/lib/MC/MCInstrDesc.cpp b/contrib/llvm/lib/MC/MCInstrDesc.cpp index 5be2fa1..ee55f3e 100644 --- a/contrib/llvm/lib/MC/MCInstrDesc.cpp +++ b/contrib/llvm/lib/MC/MCInstrDesc.cpp @@ -53,7 +53,7 @@ bool MCInstrDesc::mayAffectControlFlow(const MCInst &MI, bool MCInstrDesc::hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI) const { - if (const uint16_t *ImpDefs = ImplicitDefs) + if (const MCPhysReg *ImpDefs = ImplicitDefs) for (; *ImpDefs; ++ImpDefs) if (*ImpDefs == Reg || (MRI && MRI->isSubRegister(Reg, *ImpDefs))) return true; diff --git a/contrib/llvm/lib/MC/MCMachOStreamer.cpp b/contrib/llvm/lib/MC/MCMachOStreamer.cpp index 53cd131..21f7571 100644 --- a/contrib/llvm/lib/MC/MCMachOStreamer.cpp +++ b/contrib/llvm/lib/MC/MCMachOStreamer.cpp @@ -60,6 +60,7 @@ public: /// state management void reset() override { + CreatedADWARFSection = false; HasSectionLabel.clear(); MCObjectStreamer::reset(); } @@ -180,8 +181,6 @@ void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - // isSymbolLinkerVisible uses the section. - AssignSection(Symbol, getCurrentSection().first); // We have to create a new fragment if this is an atom defining symbol, // fragments cannot span atoms. if (getAssembler().isSymbolLinkerVisible(*Symbol)) @@ -384,8 +383,6 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, // FIXME: Darwin 'as' does appear to allow redef of a .comm by itself. assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - AssignSection(Symbol, nullptr); - getAssembler().registerSymbol(*Symbol); Symbol->setExternal(true); Symbol->setCommon(Size, ByteAlignment); @@ -417,8 +414,6 @@ void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol, if (ByteAlignment != 1) new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, Section); - AssignSection(Symbol, Section); - MCFragment *F = new MCFillFragment(0, 0, Size, Section); Symbol->setFragment(F); @@ -443,12 +438,11 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst, SmallString<256> Code; raw_svector_ostream VecOS(Code); getAssembler().getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); // Add the fixups and data. - for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { - Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size()); - DF->getFixups().push_back(Fixups[i]); + for (MCFixup &Fixup : Fixups) { + Fixup.setOffset(Fixup.getOffset() + DF->getContents().size()); + DF->getFixups().push_back(Fixup); } DF->getContents().append(Code.begin(), Code.end()); } @@ -463,7 +457,8 @@ void MCMachOStreamer::FinishImpl() { // defining symbols. DenseMap<const MCFragment *, const MCSymbol *> DefiningSymbolMap; for (const MCSymbol &Symbol : getAssembler().symbols()) { - if (getAssembler().isSymbolLinkerVisible(Symbol) && Symbol.getFragment()) { + if (getAssembler().isSymbolLinkerVisible(Symbol) && Symbol.isInSection() && + !Symbol.isVariable()) { // An atom defining symbol should never be internal to a fragment. 
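// Illustrative sketch, not part of the patch: the ImplicitDefs array scanned
// by MCInstrDesc::hasImplicitDefOfPhysReg above is null-terminated, which is
// why its loop tests "*ImpDefs". Standalone model (sub-register handling
// elided, names hypothetical):
#include <cstdint>
using MCPhysRegSketch = uint16_t;
static bool hasImplicitDefSketch(const MCPhysRegSketch *ImpDefs,
                                 MCPhysRegSketch Reg) {
  if (!ImpDefs)
    return false;
  for (; *ImpDefs; ++ImpDefs)   // register number 0 terminates the list
    if (*ImpDefs == Reg)
      return true;
  return false;
}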
assert(Symbol.getOffset() == 0 && "Invalid offset in atom defining symbol!"); @@ -473,14 +468,12 @@ void MCMachOStreamer::FinishImpl() { // Set the fragment atom associations by tracking the last seen atom defining // symbol. - for (MCAssembler::iterator it = getAssembler().begin(), - ie = getAssembler().end(); it != ie; ++it) { + for (MCSection &Sec : getAssembler()) { const MCSymbol *CurrentAtom = nullptr; - for (MCSection::iterator it2 = it->begin(), ie2 = it->end(); it2 != ie2; - ++it2) { - if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(it2)) + for (MCFragment &Frag : Sec) { + if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag)) CurrentAtom = Symbol; - it2->setAtom(CurrentAtom); + Frag.setAtom(CurrentAtom); } } @@ -493,6 +486,26 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context, MCAsmBackend &MAB, bool LabelSections) { MCMachOStreamer *S = new MCMachOStreamer(Context, MAB, OS, CE, DWARFMustBeAtTheEnd, LabelSections); + const Triple &TT = Context.getObjectFileInfo()->getTargetTriple(); + if (TT.isOSDarwin()) { + unsigned Major, Minor, Update; + TT.getOSVersion(Major, Minor, Update); + // If there is a version specified, Major will be non-zero. + if (Major) { + MCVersionMinType VersionType; + if (TT.isWatchOS()) + VersionType = MCVM_WatchOSVersionMin; + else if (TT.isTvOS()) + VersionType = MCVM_TvOSVersionMin; + else if (TT.isMacOSX()) + VersionType = MCVM_OSXVersionMin; + else { + assert(TT.isiOS() && "Must only be iOS platform left"); + VersionType = MCVM_IOSVersionMin; + } + S->EmitVersionMin(VersionType, Major, Minor, Update); + } + } if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; diff --git a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp index 576827a..f86f7e4 100644 --- a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp @@ -1,4 +1,4 @@ -//===-- MObjectFileInfo.cpp - Object File Information ---------------------===// +//===-- MCObjectFileInfo.cpp - Object File Information --------------------===// // // The LLVM Compiler Infrastructure // @@ -16,6 +16,8 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/Support/COFF.h" + using namespace llvm; static bool useCompactUnwind(const Triple &T) { @@ -27,6 +29,10 @@ static bool useCompactUnwind(const Triple &T) { if (T.getArch() == Triple::aarch64) return true; + // armv7k always has it. + if (T.isWatchOS()) + return true; + // Use it on newer version of OS X. 
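// Illustrative note, not part of the patch: createMachOStreamer above now
// emits a version-min load command whenever the triple carries an OS version
// (Major != 0). The dispatch order matters because watchOS and tvOS triples
// also satisfy isOSDarwin():
//
//   watchOS -> MCVM_WatchOSVersionMin
//   tvOS    -> MCVM_TvOSVersionMin
//   macOS   -> MCVM_OSXVersionMin
//   iOS     -> MCVM_IOSVersionMin   (asserted as the only remaining case)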
if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) return true; @@ -43,9 +49,18 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { // MachO SupportsWeakOmittedEHFrame = false; + EHFrameSection = Ctx->getMachOSection( + "__TEXT", "__eh_frame", + MachO::S_COALESCED | MachO::S_ATTR_NO_TOC | + MachO::S_ATTR_STRIP_STATIC_SYMS | MachO::S_ATTR_LIVE_SUPPORT, + SectionKind::getReadOnly()); + if (T.isOSDarwin() && T.getArch() == Triple::aarch64) SupportsCompactUnwindWithoutEHFrame = true; + if (T.isWatchOS()) + OmitDwarfIfHaveCompactUnwind = true; + PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; LSDAEncoding = FDECFIEncoding = dwarf::DW_EH_PE_pcrel; @@ -61,16 +76,15 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { MachO::S_ATTR_PURE_INSTRUCTIONS, SectionKind::getText()); DataSection // .data - = Ctx->getMachOSection("__DATA", "__data", 0, - SectionKind::getDataRel()); + = Ctx->getMachOSection("__DATA", "__data", 0, SectionKind::getData()); // BSSSection might not be expected initialized on msvc. BSSSection = nullptr; TLSDataSection // .tdata - = Ctx->getMachOSection("__DATA", "__thread_data", - MachO::S_THREAD_LOCAL_REGULAR, - SectionKind::getDataRel()); + = Ctx->getMachOSection("__DATA", "__thread_data", + MachO::S_THREAD_LOCAL_REGULAR, + SectionKind::getData()); TLSBSSSection // .tbss = Ctx->getMachOSection("__DATA", "__thread_bss", MachO::S_THREAD_LOCAL_ZEROFILL, @@ -78,14 +92,13 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { // TODO: Verify datarel below. TLSTLVSection // .tlv - = Ctx->getMachOSection("__DATA", "__thread_vars", - MachO::S_THREAD_LOCAL_VARIABLES, - SectionKind::getDataRel()); + = Ctx->getMachOSection("__DATA", "__thread_vars", + MachO::S_THREAD_LOCAL_VARIABLES, + SectionKind::getData()); - TLSThreadInitSection - = Ctx->getMachOSection("__DATA", "__thread_init", - MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS, - SectionKind::getDataRel()); + TLSThreadInitSection = Ctx->getMachOSection( + "__DATA", "__thread_init", MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS, + SectionKind::getData()); CStringSection // .cstring = Ctx->getMachOSection("__TEXT", "__cstring", @@ -112,22 +125,35 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { = Ctx->getMachOSection("__TEXT", "__const", 0, SectionKind::getReadOnly()); - TextCoalSection - = Ctx->getMachOSection("__TEXT", "__textcoal_nt", - MachO::S_COALESCED | - MachO::S_ATTR_PURE_INSTRUCTIONS, - SectionKind::getText()); - ConstTextCoalSection - = Ctx->getMachOSection("__TEXT", "__const_coal", - MachO::S_COALESCED, - SectionKind::getReadOnly()); + // If the target is not powerpc, map the coal sections to the non-coal + // sections. 
+ // + // "__TEXT/__textcoal_nt" => section "__TEXT/__text" + // "__TEXT/__const_coal" => section "__TEXT/__const" + // "__DATA/__datacoal_nt" => section "__DATA/__data" + Triple::ArchType ArchTy = T.getArch(); + + if (ArchTy == Triple::ppc || ArchTy == Triple::ppc64) { + TextCoalSection + = Ctx->getMachOSection("__TEXT", "__textcoal_nt", + MachO::S_COALESCED | + MachO::S_ATTR_PURE_INSTRUCTIONS, + SectionKind::getText()); + ConstTextCoalSection + = Ctx->getMachOSection("__TEXT", "__const_coal", + MachO::S_COALESCED, + SectionKind::getReadOnly()); + DataCoalSection = Ctx->getMachOSection( + "__DATA", "__datacoal_nt", MachO::S_COALESCED, SectionKind::getData()); + } else { + TextCoalSection = TextSection; + ConstTextCoalSection = ReadOnlySection; + DataCoalSection = DataSection; + } + ConstDataSection // .const_data = Ctx->getMachOSection("__DATA", "__const", 0, SectionKind::getReadOnlyWithRel()); - DataCoalSection - = Ctx->getMachOSection("__DATA","__datacoal_nt", - MachO::S_COALESCED, - SectionKind::getDataRel()); DataCommonSection = Ctx->getMachOSection("__DATA","__common", MachO::S_ZEROFILL, @@ -147,21 +173,17 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { SectionKind::getMetadata()); if (RelocM == Reloc::Static) { - StaticCtorSection - = Ctx->getMachOSection("__TEXT", "__constructor", 0, - SectionKind::getDataRel()); - StaticDtorSection - = Ctx->getMachOSection("__TEXT", "__destructor", 0, - SectionKind::getDataRel()); + StaticCtorSection = Ctx->getMachOSection("__TEXT", "__constructor", 0, + SectionKind::getData()); + StaticDtorSection = Ctx->getMachOSection("__TEXT", "__destructor", 0, + SectionKind::getData()); } else { - StaticCtorSection - = Ctx->getMachOSection("__DATA", "__mod_init_func", - MachO::S_MOD_INIT_FUNC_POINTERS, - SectionKind::getDataRel()); - StaticDtorSection - = Ctx->getMachOSection("__DATA", "__mod_term_func", - MachO::S_MOD_TERM_FUNC_POINTERS, - SectionKind::getDataRel()); + StaticCtorSection = Ctx->getMachOSection("__DATA", "__mod_init_func", + MachO::S_MOD_INIT_FUNC_POINTERS, + SectionKind::getData()); + StaticDtorSection = Ctx->getMachOSection("__DATA", "__mod_term_func", + MachO::S_MOD_TERM_FUNC_POINTERS, + SectionKind::getData()); } // Exception Handling. @@ -176,9 +198,11 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { SectionKind::getReadOnly()); if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) - CompactUnwindDwarfEHFrameOnly = 0x04000000; + CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_X86_64_MODE_DWARF else if (T.getArch() == Triple::aarch64) - CompactUnwindDwarfEHFrameOnly = 0x03000000; + CompactUnwindDwarfEHFrameOnly = 0x03000000; // UNWIND_ARM64_MODE_DWARF + else if (T.getArch() == Triple::arm || T.getArch() == Triple::thumb) + CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_ARM_MODE_DWARF } // Debug Information. 
@@ -232,9 +256,18 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(Triple T) { DwarfRangesSection = Ctx->getMachOSection("__DWARF", "__debug_ranges", MachO::S_ATTR_DEBUG, SectionKind::getMetadata(), "debug_range"); + DwarfMacinfoSection = + Ctx->getMachOSection("__DWARF", "__debug_macinfo", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); DwarfDebugInlineSection = Ctx->getMachOSection("__DWARF", "__debug_inlined", MachO::S_ATTR_DEBUG, SectionKind::getMetadata()); + DwarfCUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_cu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); + DwarfTUIndexSection = + Ctx->getMachOSection("__DWARF", "__debug_tu_index", MachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); StackMapSection = Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0, SectionKind::getMetadata()); @@ -258,7 +291,6 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { FDECFIEncoding = dwarf::DW_EH_PE_pcrel | ((CMModel == CodeModel::Large) ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4); - break; default: FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; @@ -391,17 +423,15 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { break; } + unsigned EHSectionType = T.getArch() == Triple::x86_64 + ? ELF::SHT_X86_64_UNWIND + : ELF::SHT_PROGBITS; + // Solaris requires different flags for .eh_frame to seemingly every other // platform. - EHSectionType = ELF::SHT_PROGBITS; - EHSectionFlags = ELF::SHF_ALLOC; - if (T.isOSSolaris()) { - if (T.getArch() == Triple::x86_64) - EHSectionType = ELF::SHT_X86_64_UNWIND; - else - EHSectionFlags |= ELF::SHF_WRITE; - } - + unsigned EHSectionFlags = ELF::SHF_ALLOC; + if (T.isOSSolaris() && T.getArch() != Triple::x86_64) + EHSectionFlags |= ELF::SHF_WRITE; // ELF BSSSection = Ctx->getELFSection(".bss", ELF::SHT_NOBITS, @@ -423,18 +453,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { TLSBSSSection = Ctx->getELFSection( ".tbss", ELF::SHT_NOBITS, ELF::SHF_ALLOC | ELF::SHF_TLS | ELF::SHF_WRITE); - DataRelSection = Ctx->getELFSection(".data.rel", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_WRITE); - - DataRelLocalSection = Ctx->getELFSection(".data.rel.local", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | ELF::SHF_WRITE); - DataRelROSection = Ctx->getELFSection(".data.rel.ro", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE); - DataRelROLocalSection = Ctx->getELFSection( - ".data.rel.ro.local", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_WRITE); - MergeableConst4Section = Ctx->getELFSection(".rodata.cst4", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_MERGE, 4, ""); @@ -487,6 +508,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0); DwarfRangesSection = Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0, "debug_range"); + DwarfMacinfoSection = + Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0); // DWARF5 Experimental Debug Info @@ -519,14 +542,28 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(Triple T) { DwarfAddrSection = Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec"); + // DWP Sections + DwarfCUIndexSection = + Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0); + DwarfTUIndexSection = + Ctx->getELFSection(".debug_tu_index", ELF::SHT_PROGBITS, 0); + StackMapSection = Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); FaultMapSection = Ctx->getELFSection(".llvm_faultmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); + + EHFrameSection = + 
Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); } void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { + EHFrameSection = Ctx->getCOFFSection( + ".eh_frame", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getData()); + bool IsWoA = T.getArch() == Triple::arm || T.getArch() == Triple::thumb; CommDirectiveSupportsAlignment = true; @@ -545,7 +582,7 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { DataSection = Ctx->getCOFFSection( ".data", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + SectionKind::getData()); ReadOnlySection = Ctx->getCOFFSection( ".rdata", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); @@ -563,21 +600,20 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { StaticCtorSection = Ctx->getCOFFSection( ".ctors", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + SectionKind::getData()); StaticDtorSection = Ctx->getCOFFSection( ".dtors", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + SectionKind::getData()); } // FIXME: We're emitting LSDA info into a readonly section on COFF, even // though it contains relocatable pointers. In PIC mode, this is probably a // big runtime hit for C++ apps. Either the contents of the LSDA need to be // adjusted or this should be a data section. - assert(T.isOSWindows() && "Windows is the only supported COFF target"); if (T.getArch() == Triple::x86_64) { // On Windows 64 with SEH, the LSDA is emitted into the .xdata section - LSDASection = 0; + LSDASection = nullptr; } else { LSDASection = Ctx->getCOFFSection(".gcc_except_table", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -653,6 +689,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata(), "debug_range"); + DwarfMacinfoSection = Ctx->getCOFFSection( + ".debug_macinfo", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfInfoDWOSection = Ctx->getCOFFSection( ".debug_info.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -693,6 +734,16 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata(), "addr_sec"); + DwarfCUIndexSection = Ctx->getCOFFSection( + ".debug_cu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); + DwarfTUIndexSection = Ctx->getCOFFSection( + ".debug_tu_index", + COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfAccelNamesSection = Ctx->getCOFFSection( ".apple_names", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | @@ -720,11 +771,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { PDataSection = Ctx->getCOFFSection( ".pdata", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getDataRel()); + SectionKind::getData()); XDataSection = Ctx->getCOFFSection( ".xdata", 
COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getDataRel()); + SectionKind::getData()); SXDataSection = Ctx->getCOFFSection(".sxdata", COFF::IMAGE_SCN_LNK_INFO, SectionKind::getMetadata()); @@ -732,12 +783,12 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(Triple T) { TLSDataSection = Ctx->getCOFFSection( ".tls$", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); - + SectionKind::getData()); + StackMapSection = Ctx->getCOFFSection(".llvm_stackmaps", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getReadOnly()); + SectionKind::getReadOnly()); } void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, @@ -752,6 +803,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, CommDirectiveSupportsAlignment = true; SupportsWeakOmittedEHFrame = true; SupportsCompactUnwindWithoutEHFrame = false; + OmitDwarfIfHaveCompactUnwind = false; PersonalityEncoding = LSDAEncoding = FDECFIEncoding = TTypeEncoding = dwarf::DW_EH_PE_absptr; @@ -767,25 +819,26 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, TT = TheTriple; - Triple::ArchType Arch = TT.getArch(); - // FIXME: Checking for Arch here to filter out bogus triples such as - // cellspu-apple-darwin. Perhaps we should fix in Triple? - if ((Arch == Triple::x86 || Arch == Triple::x86_64 || - Arch == Triple::arm || Arch == Triple::thumb || - Arch == Triple::aarch64 || - Arch == Triple::ppc || Arch == Triple::ppc64 || - Arch == Triple::UnknownArch) && - TT.isOSBinFormatMachO()) { + switch (TT.getObjectFormat()) { + case Triple::MachO: Env = IsMachO; initMachOMCObjectFileInfo(TT); - } else if ((Arch == Triple::x86 || Arch == Triple::x86_64 || - Arch == Triple::arm || Arch == Triple::thumb) && - (TT.isOSWindows() && TT.getObjectFormat() == Triple::COFF)) { + break; + case Triple::COFF: + if (!TT.isOSWindows()) + report_fatal_error( + "Cannot initialize MC for non-Windows COFF object files."); + Env = IsCOFF; initCOFFMCObjectFileInfo(TT); - } else { + break; + case Triple::ELF: Env = IsELF; initELFMCObjectFileInfo(TT); + break; + case Triple::UnknownObjectFormat: + report_fatal_error("Cannot initialize MC for unknown object file format."); + break; } } @@ -799,24 +852,3 @@ MCSection *MCObjectFileInfo::getDwarfTypesSection(uint64_t Hash) const { return Ctx->getELFSection(".debug_types", ELF::SHT_PROGBITS, ELF::SHF_GROUP, 0, utostr(Hash)); } - -void MCObjectFileInfo::InitEHFrameSection() { - if (Env == IsMachO) - EHFrameSection = - Ctx->getMachOSection("__TEXT", "__eh_frame", - MachO::S_COALESCED | - MachO::S_ATTR_NO_TOC | - MachO::S_ATTR_STRIP_STATIC_SYMS | - MachO::S_ATTR_LIVE_SUPPORT, - SectionKind::getReadOnly()); - else if (Env == IsELF) - EHFrameSection = - Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); - else - EHFrameSection = - Ctx->getCOFFSection(".eh_frame", - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - COFF::IMAGE_SCN_MEM_READ | - COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); -} diff --git a/contrib/llvm/lib/MC/MCObjectStreamer.cpp b/contrib/llvm/lib/MC/MCObjectStreamer.cpp index 0a63777..972610a 100644 --- a/contrib/llvm/lib/MC/MCObjectStreamer.cpp +++ b/contrib/llvm/lib/MC/MCObjectStreamer.cpp @@ -28,7 +28,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, MCCodeEmitter *Emitter_) : MCStreamer(Context), Assembler(new MCAssembler(Context, TAB, *Emitter_, - *TAB.createObjectWriter(OS), OS)), + 
*TAB.createObjectWriter(OS))), EmitEHFrame(true), EmitDebugFrame(false) {} MCObjectStreamer::~MCObjectStreamer() { @@ -39,32 +39,31 @@ MCObjectStreamer::~MCObjectStreamer() { } void MCObjectStreamer::flushPendingLabels(MCFragment *F, uint64_t FOffset) { - if (PendingLabels.size()) { - if (!F) { - F = new MCDataFragment(); - MCSection *CurSection = getCurrentSectionOnly(); - CurSection->getFragmentList().insert(CurInsertionPoint, F); - F->setParent(CurSection); - } - for (MCSymbol *Sym : PendingLabels) { - Sym->setFragment(F); - Sym->setOffset(FOffset); - } - PendingLabels.clear(); + if (PendingLabels.empty()) + return; + if (!F) { + F = new MCDataFragment(); + MCSection *CurSection = getCurrentSectionOnly(); + CurSection->getFragmentList().insert(CurInsertionPoint, F); + F->setParent(CurSection); + } + for (MCSymbol *Sym : PendingLabels) { + Sym->setFragment(F); + Sym->setOffset(FOffset); } + PendingLabels.clear(); } void MCObjectStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo, unsigned Size) { // If not assigned to the same (valid) fragment, fallback. - if (!Hi->getFragment() || Hi->getFragment() != Lo->getFragment()) { + if (!Hi->getFragment() || Hi->getFragment() != Lo->getFragment() || + Hi->isVariable() || Lo->isVariable()) { MCStreamer::emitAbsoluteSymbolDiff(Hi, Lo, Size); return; } - assert(Hi->getOffset() >= Lo->getOffset() && - "Expected Hi to be greater than Lo"); EmitIntValue(Hi->getOffset() - Lo->getOffset(), Size); } @@ -93,7 +92,7 @@ MCFragment *MCObjectStreamer::getCurrentFragment() const { assert(getCurrentSectionOnly() && "No current section!"); if (CurInsertionPoint != getCurrentSectionOnly()->getFragmentList().begin()) - return std::prev(CurInsertionPoint); + return &*std::prev(CurInsertionPoint); return nullptr; } @@ -121,7 +120,7 @@ void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) { } void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { MCStreamer::EmitValueImpl(Value, Size, Loc); MCDataFragment *DF = getOrCreateDataFragment(); flushPendingLabels(DF, DF->getContents().size()); @@ -155,7 +154,6 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) { MCStreamer::EmitLabel(Symbol); getAssembler().registerSymbol(*Symbol); - assert(!Symbol->getFragment() && "Unexpected fragment on symbol data!"); // If there is a current fragment, mark the symbol as pointing into it. // Otherwise queue the label and set its fragment pointer when we emit the @@ -276,7 +274,6 @@ void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst, raw_svector_ostream VecOS(Code); getAssembler().getEmitter().encodeInstruction(Inst, VecOS, IF->getFixups(), STI); - VecOS.flush(); IF->getContents().append(Code.begin(), Code.end()); } @@ -321,8 +318,10 @@ static const MCExpr *buildSymbolDiff(MCObjectStreamer &OS, const MCSymbol *A, return AddrDelta; } -static void emitDwarfSetLineAddr(MCObjectStreamer &OS, int64_t LineDelta, - const MCSymbol *Label, int PointerSize) { +static void emitDwarfSetLineAddr(MCObjectStreamer &OS, + MCDwarfLineTableParams Params, + int64_t LineDelta, const MCSymbol *Label, + int PointerSize) { // emit the sequence to set the address OS.EmitIntValue(dwarf::DW_LNS_extended_op, 1); OS.EmitULEB128IntValue(PointerSize + 1); @@ -330,7 +329,7 @@ static void emitDwarfSetLineAddr(MCObjectStreamer &OS, int64_t LineDelta, OS.EmitSymbolValue(Label, PointerSize); // emit the sequence for the LineDelta (from 1) and a zero address delta. 
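// Illustrative sketch, not part of the patch: the flushPendingLabels rewrite
// above is a pure guard-clause flattening; behavior is unchanged. Reduced
// model (the lazy creation of the fragment is elided):
#include <vector>
struct SymSketch { const void *Frag = nullptr; unsigned Off = 0; };
static void flushPendingSketch(std::vector<SymSketch *> &Pending,
                               const void *F, unsigned FOffset) {
  if (Pending.empty())          // early return replaces the enclosing if
    return;
  for (SymSketch *S : Pending) {
    S->Frag = F;                // bind each deferred label to the fragment
    S->Off = FOffset;
  }
  Pending.clear();
}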
- MCDwarfLineAddr::Emit(&OS, LineDelta, 0); + MCDwarfLineAddr::Emit(&OS, Params, LineDelta, 0); } void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, @@ -338,13 +337,15 @@ void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *Label, unsigned PointerSize) { if (!LastLabel) { - emitDwarfSetLineAddr(*this, LineDelta, Label, PointerSize); + emitDwarfSetLineAddr(*this, Assembler->getDWARFLinetableParams(), LineDelta, + Label, PointerSize); return; } const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel); int64_t Res; if (AddrDelta->evaluateAsAbsolute(Res, getAssembler())) { - MCDwarfLineAddr::Emit(this, LineDelta, Res); + MCDwarfLineAddr::Emit(this, Assembler->getDWARFLinetableParams(), LineDelta, + Res); return; } insert(new MCDwarfLineAddrFragment(LineDelta, *AddrDelta)); @@ -388,26 +389,9 @@ void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment, cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true); } -bool MCObjectStreamer::EmitValueToOffset(const MCExpr *Offset, +void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value) { - int64_t Res; - if (Offset->evaluateAsAbsolute(Res, getAssembler())) { - insert(new MCOrgFragment(*Offset, Value)); - return false; - } - - MCSymbol *CurrentPos = getContext().createTempSymbol(); - EmitLabel(CurrentPos); - MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; - const MCExpr *Ref = - MCSymbolRefExpr::create(CurrentPos, Variant, getContext()); - const MCExpr *Delta = - MCBinaryExpr::create(MCBinaryExpr::Sub, Offset, Ref, getContext()); - - if (!Delta->evaluateAsAbsolute(Res, getAssembler())) - return true; - EmitFill(Res, Value); - return false; + insert(new MCOrgFragment(*Offset, Value)); } // Associate GPRel32 fixup with data and resize data area @@ -430,19 +414,31 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) { DF->getContents().resize(DF->getContents().size() + 8, 0); } -void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) { - // FIXME: A MCFillFragment would be more memory efficient but MCExpr has - // problems evaluating expressions across multiple fragments. +bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) { + int64_t OffsetValue; + if (!Offset.evaluateAsAbsolute(OffsetValue)) + llvm_unreachable("Offset is not absolute"); + MCDataFragment *DF = getOrCreateDataFragment(); flushPendingLabels(DF, DF->getContents().size()); - DF->getContents().append(NumBytes, FillValue); + + MCFixupKind Kind; + if (!Assembler->getBackend().getFixupKind(Name, Kind)) + return true; + + if (Expr == nullptr) + Expr = + MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext()); + DF->getFixups().push_back(MCFixup::create(OffsetValue, Expr, Kind, Loc)); + return false; } -void MCObjectStreamer::EmitZeros(uint64_t NumBytes) { +void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) { const MCSection *Sec = getCurrentSection().first; assert(Sec && "need a section"); unsigned ItemSize = Sec->isVirtualSection() ? 0 : 1; - insert(new MCFillFragment(0, ItemSize, NumBytes)); + insert(new MCFillFragment(FillValue, ItemSize, NumBytes)); } void MCObjectStreamer::FinishImpl() { @@ -451,7 +447,7 @@ void MCObjectStreamer::FinishImpl() { MCGenDwarfInfo::Emit(this); // Dump out the dwarf file & directory tables and line tables. 
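// Illustrative note, not part of the patch: EmitRelocDirective above returns
// true when the backend does not recognize the fixup name and false on
// success, and when ".reloc" carries no third operand it synthesizes a
// reference to a fresh temporary symbol. Hypothetical uses (relocation names
// are backend-specific, resolved via MCAsmBackend::getFixupKind):
//
//   .reloc 8, R_EXAMPLE          // no expression: temp-symbol ref created
//   .reloc 8, R_EXAMPLE, foo+4   // explicit target expression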
- MCDwarfLineTable::Emit(this); + MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams()); flushPendingLabels(nullptr); getAssembler().Finish(); diff --git a/contrib/llvm/lib/MC/MCObjectWriter.cpp b/contrib/llvm/lib/MC/MCObjectWriter.cpp index 3479034..e84f74a 100644 --- a/contrib/llvm/lib/MC/MCObjectWriter.cpp +++ b/contrib/llvm/lib/MC/MCObjectWriter.cpp @@ -33,8 +33,14 @@ bool MCObjectWriter::isSymbolRefDifferenceFullyResolved( if (!SA.getFragment() || !SB.getFragment()) return false; - return isSymbolRefDifferenceFullyResolvedImpl(Asm, SA, *SB.getFragment(), - InSet, false); + return isSymbolRefDifferenceFullyResolvedImpl(Asm, SA, SB, InSet); +} + +bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( + const MCAssembler &Asm, const MCSymbol &A, const MCSymbol &B, + bool InSet) const { + return isSymbolRefDifferenceFullyResolvedImpl(Asm, A, *B.getFragment(), InSet, + false); } bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( diff --git a/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp index b983d99..36c1920 100644 --- a/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -436,7 +436,8 @@ StringRef AsmLexer::LexUntilEndOfLine() { return StringRef(TokStart, CurPtr-TokStart); } -const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { +size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf, + bool ShouldSkipSpace) { const char *SavedTokStart = TokStart; const char *SavedCurPtr = CurPtr; bool SavedAtStartOfLine = isAtStartOfLine; @@ -446,7 +447,16 @@ const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { SMLoc SavedErrLoc = getErrLoc(); SkipSpace = ShouldSkipSpace; - AsmToken Token = LexToken(); + + size_t ReadCount; + for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) { + AsmToken Token = LexToken(); + + Buf[ReadCount] = Token; + + if (Token.is(AsmToken::Eof)) + break; + } SetError(SavedErrLoc, SavedErr); @@ -455,7 +465,7 @@ const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { CurPtr = SavedCurPtr; TokStart = SavedTokStart; - return Token; + return ReadCount; } bool AsmLexer::isAtStartOfComment(const char *Ptr) { diff --git a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp index 04d1413..646cbb4 100644 --- a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp @@ -33,6 +33,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -251,14 +252,14 @@ private: bool parseStatement(ParseStatementInfo &Info, MCAsmParserSemaCallback *SI); void eatToEndOfLine(); - bool parseCppHashLineFilenameComment(const SMLoc &L); + bool parseCppHashLineFilenameComment(SMLoc L); void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body, ArrayRef<MCAsmMacroParameter> Parameters); bool expandMacro(raw_svector_ostream &OS, StringRef Body, ArrayRef<MCAsmMacroParameter> Parameters, ArrayRef<MCAsmMacroArgument> A, bool EnableAtPseudoVariable, - const SMLoc &L); + SMLoc L); /// \brief Are macros enabled in the parser? 
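// Illustrative sketch, not part of the patch: AsmLexer::peekTokens above
// generalizes the old single-token peekTok to N-token lookahead by saving the
// lexer cursor, lexing ahead, and restoring. Reduced standalone model over a
// character stream:
#include <cstddef>
struct MiniLexer {
  const char *Cur;
  char lex() { return *Cur ? *Cur++ : '\0'; }   // '\0' stands in for Eof
  size_t peek(char *Buf, size_t N) {            // fill Buf without consuming
    const char *Saved = Cur;                    // cf. SavedCurPtr
    size_t Read = 0;
    for (; Read < N; ++Read) {
      Buf[Read] = lex();
      if (Buf[Read] == '\0')
        break;                                  // stop at Eof, like peekTokens
    }
    Cur = Saved;                                // restore the lexer state
    return Read;
  }
};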
bool areMacrosEnabled() {return MacrosEnabledFlag;} @@ -342,6 +343,7 @@ private: enum DirectiveKind { DK_NO_DIRECTIVE, // Placeholder DK_SET, DK_EQU, DK_EQUIV, DK_ASCII, DK_ASCIZ, DK_STRING, DK_BYTE, DK_SHORT, + DK_RELOC, DK_VALUE, DK_2BYTE, DK_LONG, DK_INT, DK_4BYTE, DK_QUAD, DK_8BYTE, DK_OCTA, DK_SINGLE, DK_FLOAT, DK_DOUBLE, DK_ALIGN, DK_ALIGN32, DK_BALIGN, DK_BALIGNW, DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR, @@ -374,6 +376,7 @@ private: // ".ascii", ".asciz", ".string" bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated); + bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc" bool parseDirectiveValue(unsigned Size); // ".byte", ".long", ... bool parseDirectiveOctaValue(); // ".octa" bool parseDirectiveRealValue(const fltSemantics &); // ".single", ... @@ -553,6 +556,8 @@ void AsmParser::Note(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) { } bool AsmParser::Warning(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) { + if(getTargetParser().getTargetOptions().MCNoWarn) + return false; if (getTargetParser().getTargetOptions().MCFatalWarnings) return Error(L, Msg, Ranges); printMessage(L, SourceMgr::DK_Warning, Msg, Ranges); @@ -679,11 +684,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // so conservatively exclude them. Only do this if we're finalizing, though, // as otherwise we won't necessarilly have seen everything yet. if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) { - const MCContext::SymbolTable &Symbols = getContext().getSymbols(); - for (MCContext::SymbolTable::const_iterator i = Symbols.begin(), - e = Symbols.end(); - i != e; ++i) { - MCSymbol *Sym = i->getValue(); + for (const auto &TableEntry : getContext().getSymbols()) { + MCSymbol *Sym = TableEntry.getValue(); // Variable symbols may not be marked as defined, so check those // explicitly. If we know it's a variable, we have a definition for // the purposes of this check. @@ -691,9 +693,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // FIXME: We would really like to refer back to where the symbol was // first referenced for a source location. We need to add something // to track that. Currently, we just point to the end of the file. - printMessage( - getLexer().getLoc(), SourceMgr::DK_Error, - "assembler local symbol '" + Sym->getName() + "' not defined"); + return Error(getLexer().getLoc(), "assembler local symbol '" + + Sym->getName() + "' not defined"); } } @@ -702,7 +703,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { if (!HadError && !NoFinalize) Out.Finish(); - return HadError; + return HadError || getContext().hadError(); } void AsmParser::checkForValidSection() { @@ -865,11 +866,12 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { // If this is an absolute variable reference, substitute it now to preserve // semantics in the face of reassignment. 
- if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) { + if (Sym->isVariable() && + isa<MCConstantExpr>(Sym->getVariableValue(/*SetUsed*/ false))) { if (Variant) return Error(EndLoc, "unexpected modifier on variable reference"); - Res = Sym->getVariableValue(); + Res = Sym->getVariableValue(/*SetUsed*/ false); return false; } @@ -1102,8 +1104,9 @@ bool AsmParser::parseAbsoluteExpression(int64_t &Res) { return false; } -unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, - MCBinaryExpr::Opcode &Kind) { +static unsigned getDarwinBinOpPrecedence(AsmToken::TokenKind K, + MCBinaryExpr::Opcode &Kind, + bool ShouldUseLogicalShr) { switch (K) { default: return 0; // not a binop. @@ -1155,7 +1158,7 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, Kind = MCBinaryExpr::Shl; return 4; case AsmToken::GreaterGreater: - Kind = MAI.shouldUseLogicalShr() ? MCBinaryExpr::LShr : MCBinaryExpr::AShr; + Kind = ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr; return 4; // High Intermediate Precedence: +, - @@ -1179,6 +1182,89 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, } } +static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K, + MCBinaryExpr::Opcode &Kind, + bool ShouldUseLogicalShr) { + switch (K) { + default: + return 0; // not a binop. + + // Lowest Precedence: &&, || + case AsmToken::AmpAmp: + Kind = MCBinaryExpr::LAnd; + return 2; + case AsmToken::PipePipe: + Kind = MCBinaryExpr::LOr; + return 1; + + // Low Precedence: ==, !=, <>, <, <=, >, >= + case AsmToken::EqualEqual: + Kind = MCBinaryExpr::EQ; + return 3; + case AsmToken::ExclaimEqual: + case AsmToken::LessGreater: + Kind = MCBinaryExpr::NE; + return 3; + case AsmToken::Less: + Kind = MCBinaryExpr::LT; + return 3; + case AsmToken::LessEqual: + Kind = MCBinaryExpr::LTE; + return 3; + case AsmToken::Greater: + Kind = MCBinaryExpr::GT; + return 3; + case AsmToken::GreaterEqual: + Kind = MCBinaryExpr::GTE; + return 3; + + // Low Intermediate Precedence: +, - + case AsmToken::Plus: + Kind = MCBinaryExpr::Add; + return 4; + case AsmToken::Minus: + Kind = MCBinaryExpr::Sub; + return 4; + + // High Intermediate Precedence: |, &, ^ + // + // FIXME: gas seems to support '!' as an infix operator? + case AsmToken::Pipe: + Kind = MCBinaryExpr::Or; + return 5; + case AsmToken::Caret: + Kind = MCBinaryExpr::Xor; + return 5; + case AsmToken::Amp: + Kind = MCBinaryExpr::And; + return 5; + + // Highest Precedence: *, /, %, <<, >> + case AsmToken::Star: + Kind = MCBinaryExpr::Mul; + return 6; + case AsmToken::Slash: + Kind = MCBinaryExpr::Div; + return 6; + case AsmToken::Percent: + Kind = MCBinaryExpr::Mod; + return 6; + case AsmToken::LessLess: + Kind = MCBinaryExpr::Shl; + return 6; + case AsmToken::GreaterGreater: + Kind = ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr; + return 6; + } +} + +unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K, + MCBinaryExpr::Opcode &Kind) { + bool ShouldUseLogicalShr = MAI.shouldUseLogicalShr(); + return IsDarwin ? getDarwinBinOpPrecedence(K, Kind, ShouldUseLogicalShr) + : getGNUBinOpPrecedence(K, Kind, ShouldUseLogicalShr); +} + /// \brief Parse all binary operators with precedence >= 'Precedence'. /// Res contains the LHS of the expression on input. bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, @@ -1251,6 +1337,15 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // Treat '.' as a valid identifier in this context. 
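// Worked example, not part of the patch: in getGNUBinOpPrecedence above a
// larger return value binds tighter, and |, & and ^ (5) sit above + and -
// (4), matching gas rather than C. Consequences:
//
//   a + b & c    parses as  a + (b & c)    // & (5) binds before + (4)
//   a << 1 + 2   parses as  (a << 1) + 2   // << (6) binds before + (4)
//
// A C-style table would instead group (a + b) & c and a << (1 + 2).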
Lex(); IDVal = "."; + } else if (Lexer.is(AsmToken::LCurly)) { + // Treat '{' as a valid identifier in this context. + Lex(); + IDVal = "{"; + + } else if (Lexer.is(AsmToken::RCurly)) { + // Treat '}' as a valid identifier in this context. + Lex(); + IDVal = "}"; } else if (parseIdentifier(IDVal)) { if (!TheCondState.Ignore) return TokError("unexpected token at start of statement"); @@ -1313,6 +1408,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // See what kind of statement we have. switch (Lexer.getKind()) { case AsmToken::Colon: { + if (!getTargetParser().isLabel(ID)) + break; checkForValidSection(); // identifier ':' -> Label. @@ -1334,8 +1431,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true); assert(RewrittenLabel.size() && "We should have an internal name here."); - Info.AsmRewrites->push_back(AsmRewrite(AOK_Label, IDLoc, - IDVal.size(), RewrittenLabel)); + Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(), + RewrittenLabel); IDVal = RewrittenLabel; } Sym = getContext().getOrCreateSymbol(IDVal); @@ -1371,6 +1468,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, } case AsmToken::Equal: + if (!getTargetParser().equalIsAsmAssignment()) + break; // identifier '=' ... -> assignment statement Lex(); @@ -1599,6 +1698,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, return parseDirectiveError(IDLoc, true); case DK_WARNING: return parseDirectiveWarning(IDLoc); + case DK_RELOC: + return parseDirectiveReloc(IDLoc); } return Error(IDLoc, "unknown directive"); @@ -1613,12 +1714,14 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN")) return parseDirectiveMSAlign(IDLoc, Info); + if (ParsingInlineAsm && (IDVal == "even")) + Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4); checkForValidSection(); // Canonicalize the opcode to lower case. std::string OpcodeStr = IDVal.lower(); ParseInstructionInfo IInfo(Info.AsmRewrites); - bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, IDLoc, + bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID, Info.ParsedOperands); Info.ParseError = HadError; @@ -1703,7 +1806,7 @@ void AsmParser::eatToEndOfLine() { /// parseCppHashLineFilenameComment as this: /// ::= # number "filename" /// or just as a full line comment if it doesn't have a number and a string. -bool AsmParser::parseCppHashLineFilenameComment(const SMLoc &L) { +bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) { Lex(); // Eat the hash token. if (getLexer().isNot(AsmToken::Integer)) { @@ -1743,7 +1846,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { raw_ostream &OS = errs(); const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr(); - const SMLoc &DiagLoc = Diag.getLoc(); + SMLoc DiagLoc = Diag.getLoc(); unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc); unsigned CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc); @@ -1802,7 +1905,7 @@ static bool isIdentifierChar(char c) { bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, ArrayRef<MCAsmMacroParameter> Parameters, ArrayRef<MCAsmMacroArgument> A, - bool EnableAtPseudoVariable, const SMLoc &L) { + bool EnableAtPseudoVariable, SMLoc L) { unsigned NParameters = Parameters.size(); bool HasVararg = NParameters ? 
Parameters.back().Vararg : false; if ((!IsDarwin || NParameters != 0) && NParameters != A.size()) @@ -1858,10 +1961,8 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, break; // Otherwise substitute with the token values, with spaces eliminated. - for (MCAsmMacroArgument::const_iterator it = A[Index].begin(), - ie = A[Index].end(); - it != ie; ++it) - OS << it->getString(); + for (const AsmToken &Token : A[Index]) + OS << Token.getString(); break; } } @@ -1897,15 +1998,13 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, } } else { bool VarargParameter = HasVararg && Index == (NParameters - 1); - for (MCAsmMacroArgument::const_iterator it = A[Index].begin(), - ie = A[Index].end(); - it != ie; ++it) + for (const AsmToken &Token : A[Index]) // We expect no quotes around the string's contents when // parsing for varargs. - if (it->getKind() != AsmToken::String || VarargParameter) - OS << it->getString(); + if (Token.getKind() != AsmToken::String || VarargParameter) + OS << Token.getString(); else - OS << it->getStringContents(); + OS << Token.getStringContents(); Pos += 1 + Argument.size(); } @@ -2371,6 +2470,51 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { return false; } +/// parseDirectiveReloc +/// ::= .reloc expression , identifier [ , expression ] +bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { + const MCExpr *Offset; + const MCExpr *Expr = nullptr; + + SMLoc OffsetLoc = Lexer.getTok().getLoc(); + if (parseExpression(Offset)) + return true; + + // We can only deal with constant expressions at the moment. + int64_t OffsetValue; + if (!Offset->evaluateAsAbsolute(OffsetValue)) + return Error(OffsetLoc, "expression is not a constant value"); + + if (Lexer.isNot(AsmToken::Comma)) + return TokError("expected comma"); + Lexer.Lex(); + + if (Lexer.isNot(AsmToken::Identifier)) + return TokError("expected relocation name"); + SMLoc NameLoc = Lexer.getTok().getLoc(); + StringRef Name = Lexer.getTok().getIdentifier(); + Lexer.Lex(); + + if (Lexer.is(AsmToken::Comma)) { + Lexer.Lex(); + SMLoc ExprLoc = Lexer.getLoc(); + if (parseExpression(Expr)) + return true; + + MCValue Value; + if (!Expr->evaluateAsRelocatable(Value, nullptr, nullptr)) + return Error(ExprLoc, "expression must be relocatable"); + } + + if (Lexer.isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in .reloc directive"); + + if (getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc)) + return Error(NameLoc, "unknown relocation name"); + + return false; +} + /// parseDirectiveValue /// ::= (.byte | .short | ... ) [ expression (, expression)* ] bool AsmParser::parseDirectiveValue(unsigned Size) { @@ -2617,7 +2761,6 @@ bool AsmParser::parseDirectiveOrg() { checkForValidSection(); const MCExpr *Offset; - SMLoc Loc = getTok().getLoc(); if (parseExpression(Offset)) return true; @@ -2636,13 +2779,7 @@ bool AsmParser::parseDirectiveOrg() { } Lex(); - - // Only limited forms of relocatable expressions are accepted here, it - // has to be relative to the current section. The streamer will return - // 'true' if the expression wasn't evaluatable. 
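// Illustrative note, not part of the patch: the checks in parseDirectiveReloc
// above fire in source order (placeholder relocation names used here):
//
//   .reloc sym, R_EXAMPLE       // "expression is not a constant value"
//                               //   (the offset must fold to an absolute)
//   .reloc 4 R_EXAMPLE          // "expected comma"
//   .reloc 4, 8                 // "expected relocation name"
//   .reloc 4, R_EXAMPLE, x-y-z  // "expression must be relocatable" if the
//                               //   third operand cannot be encoded
//   .reloc 4, R_BOGUS           // "unknown relocation name" once the
//                               //   streamer asks the backend for the fixup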
- if (getStreamer().EmitValueToOffset(Offset, FillExpr)) - return Error(Loc, "expected assembly-time absolute expression"); - + getStreamer().emitValueToOffset(Offset, FillExpr); return false; } @@ -2703,7 +2840,11 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) { Alignment = 1ULL << Alignment; } else { - // Reject alignments that aren't a power of two, for gas compatibility. + // Reject alignments that aren't either a power of two or zero, + // for gas compatibility. Alignment of zero is silently rounded + // up to one. + if (Alignment == 0) + Alignment = 1; if (!isPowerOf2_64(Alignment)) Error(AlignmentLoc, "alignment must be a power of 2"); } @@ -4269,6 +4410,7 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".err"] = DK_ERR; DirectiveKindMap[".error"] = DK_ERROR; DirectiveKindMap[".warning"] = DK_WARNING; + DirectiveKindMap[".reloc"] = DK_RELOC; } MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) { @@ -4405,10 +4547,10 @@ bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) { SmallString<256> Buf; raw_svector_ostream OS(Buf); - for (MCAsmMacroArguments::iterator i = A.begin(), e = A.end(); i != e; ++i) { + for (const MCAsmMacroArgument &Arg : A) { // Note that the AtPseudoVariable is enabled for instantiations of .irp. // This is undocumented, but GAS seems to support it. - if (expandMacro(OS, M->Body, Parameter, *i, true, getTok().getLoc())) + if (expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc())) return true; } @@ -4488,10 +4630,10 @@ bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info, if (!MCE) return Error(ExprLoc, "unexpected expression in _emit"); uint64_t IntValue = MCE->getValue(); - if (!isUIntN(8, IntValue) && !isIntN(8, IntValue)) + if (!isUInt<8>(IntValue) && !isInt<8>(IntValue)) return Error(ExprLoc, "literal value out of range for directive"); - Info.AsmRewrites->push_back(AsmRewrite(AOK_Emit, IDLoc, Len)); + Info.AsmRewrites->emplace_back(AOK_Emit, IDLoc, Len); return false; } @@ -4507,8 +4649,7 @@ bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) { if (!isPowerOf2_64(IntValue)) return Error(ExprLoc, "literal value not a power of two greater then zero"); - Info.AsmRewrites->push_back( - AsmRewrite(AOK_Align, IDLoc, 5, Log2_64(IntValue))); + Info.AsmRewrites->emplace_back(AOK_Align, IDLoc, 5, Log2_64(IntValue)); return false; } @@ -4604,18 +4745,18 @@ bool AsmParser::parseMSInlineAsm( OutputDecls.push_back(OpDecl); OutputDeclsAddressOf.push_back(Operand.needAddressOf()); OutputConstraints.push_back(("=" + Operand.getConstraint()).str()); - AsmStrRewrites.push_back(AsmRewrite(AOK_Output, Start, SymName.size())); + AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size()); } else { InputDecls.push_back(OpDecl); InputDeclsAddressOf.push_back(Operand.needAddressOf()); InputConstraints.push_back(Operand.getConstraint().str()); - AsmStrRewrites.push_back(AsmRewrite(AOK_Input, Start, SymName.size())); + AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size()); } } // Consider implicit defs to be clobbers. Think of cpuid and push. 
- ArrayRef<uint16_t> ImpDefs(Desc.getImplicitDefs(), - Desc.getNumImplicitDefs()); + ArrayRef<MCPhysReg> ImpDefs(Desc.getImplicitDefs(), + Desc.getNumImplicitDefs()); ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end()); } @@ -4710,14 +4851,23 @@ bool AsmParser::parseMSInlineAsm( OS << ".byte"; break; case AOK_Align: { - unsigned Val = AR.Val; - OS << ".align " << Val; + // MS alignment directives are measured in bytes. If the native assembler + // measures alignment in bytes, we can pass it straight through. + OS << ".align"; + if (getContext().getAsmInfo()->getAlignmentIsInBytes()) + break; - // Skip the original immediate. + // Alignment is in log2 form, so print that instead and skip the original + // immediate. + unsigned Val = AR.Val; + OS << ' ' << Val; assert(Val < 10 && "Expected alignment less then 2^10."); AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4; break; } + case AOK_EVEN: + OS << ".even"; + break; case AOK_DotOperator: // Insert the dot if the user omitted it. OS.flush(); @@ -4803,7 +4953,8 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef, // FIXME: Diagnose assignment to protected identifier (e.g., register name). if (isSymbolUsedInExpression(Sym, Value)) return Parser.Error(EqualLoc, "Recursive use of '" + Name + "'"); - else if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable()) + else if (Sym->isUndefined(/*SetUsed*/ false) && !Sym->isUsed() && + !Sym->isVariable()) ; // Allow redefinitions of undefined symbols only used in directives. else if (Sym->isVariable() && !Sym->isUsed() && allow_redef) ; // Allow redefinitions of variables that haven't yet been used. @@ -4815,15 +4966,8 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef, return Parser.Error(EqualLoc, "invalid reassignment of non-absolute variable '" + Name + "'"); - - // Don't count these checks as uses. 
- Sym->setUsed(false); } else if (Name == ".") { - if (Parser.getStreamer().EmitValueToOffset(Value, 0)) { - Parser.Error(EqualLoc, "expected absolute expression"); - Parser.eatToEndOfStatement(); - return true; - } + Parser.getStreamer().emitValueToOffset(Value, 0); return false; } else Sym = Parser.getContext().getOrCreateSymbol(Name); diff --git a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp index f09bce0..a4b2b19 100644 --- a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -98,11 +98,10 @@ class COFFAsmParser : public MCAsmParserExtension { SectionKind::getText()); } bool ParseSectionDirectiveData(StringRef, SMLoc) { - return ParseSectionSwitch(".data", - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA - | COFF::IMAGE_SCN_MEM_READ - | COFF::IMAGE_SCN_MEM_WRITE, - SectionKind::getDataRel()); + return ParseSectionSwitch(".data", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, + SectionKind::getData()); } bool ParseSectionDirectiveBSS(StringRef, SMLoc) { return ParseSectionSwitch(".bss", @@ -153,7 +152,7 @@ static SectionKind computeSectionKind(unsigned Flags) { if (Flags & COFF::IMAGE_SCN_MEM_READ && (Flags & COFF::IMAGE_SCN_MEM_WRITE) == 0) return SectionKind::getReadOnly(); - return SectionKind::getDataRel(); + return SectionKind::getData(); } bool COFFAsmParser::ParseSectionFlags(StringRef FlagsString, unsigned* Flags) { diff --git a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp index dc664e8..73e068a 100644 --- a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp @@ -8,10 +8,13 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCSectionMachO.h" @@ -38,6 +41,8 @@ class DarwinAsmParser : public MCAsmParserExtension { unsigned TAA = 0, unsigned ImplicitAlign = 0, unsigned StubSize = 0); + SMLoc LastVersionMinDirective; + public: DarwinAsmParser() {} @@ -164,9 +169,14 @@ public: addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveTLV>(".tlv"); addDirectiveHandler<&DarwinAsmParser::parseSectionDirectiveIdent>(".ident"); + addDirectiveHandler<&DarwinAsmParser::parseVersionMin>( + ".watchos_version_min"); + addDirectiveHandler<&DarwinAsmParser::parseVersionMin>(".tvos_version_min"); addDirectiveHandler<&DarwinAsmParser::parseVersionMin>(".ios_version_min"); addDirectiveHandler<&DarwinAsmParser::parseVersionMin>( ".macosx_version_min"); + + LastVersionMinDirective = SMLoc(); } bool parseDirectiveDesc(StringRef, SMLoc); @@ -381,9 +391,8 @@ bool DarwinAsmParser::parseSectionSwitch(const char *Segment, // FIXME: Arch specific. bool isText = TAA & MachO::S_ATTR_PURE_INSTRUCTIONS; getStreamer().SwitchSection(getContext().getMachOSection( - Segment, Section, TAA, StubSize, - isText ? SectionKind::getText() - : SectionKind::getDataRel())); + Segment, Section, TAA, StubSize, + isText ? SectionKind::getText() : SectionKind::getData())); // Set the implicit alignment, if any. 
// @@ -579,12 +588,34 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) { if (!ErrorStr.empty()) return Error(Loc, ErrorStr.c_str()); + // Issue a warning if the target is not powerpc and Section is a *coal* section. + Triple TT = getParser().getContext().getObjectFileInfo()->getTargetTriple(); + Triple::ArchType ArchTy = TT.getArch(); + + if (ArchTy != Triple::ppc && ArchTy != Triple::ppc64) { + StringRef NonCoalSection = StringSwitch<StringRef>(Section) + .Case("__textcoal_nt", "__text") + .Case("__const_coal", "__const") + .Case("__datacoal_nt", "__data") + .Default(Section); + + if (!Section.equals(NonCoalSection)) { + StringRef SectionVal(Loc.getPointer()); + size_t B = SectionVal.find(',') + 1, E = SectionVal.find(',', B); + SMLoc BLoc = SMLoc::getFromPointer(SectionVal.data() + B); + SMLoc ELoc = SMLoc::getFromPointer(SectionVal.data() + E); + getParser().Warning(Loc, "section \"" + Section + "\" is deprecated", + SMRange(BLoc, ELoc)); + getParser().Note(Loc, "change section name to \"" + NonCoalSection + + "\"", SMRange(BLoc, ELoc)); + } + } + // FIXME: Arch specific. bool isText = Segment == "__TEXT"; // FIXME: Hack. getStreamer().SwitchSection(getContext().getMachOSection( - Segment, Section, TAA, StubSize, - isText ? SectionKind::getText() - : SectionKind::getDataRel())); + Segment, Section, TAA, StubSize, + isText ? SectionKind::getText() : SectionKind::getData())); return false; } @@ -636,17 +667,16 @@ bool DarwinAsmParser::parseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { "environment variable unset."); // Open the secure log file if we haven't already. - raw_ostream *OS = getContext().getSecureLog(); + raw_fd_ostream *OS = getContext().getSecureLog(); if (!OS) { std::error_code EC; - OS = new raw_fd_ostream(SecureLogFile, EC, - sys::fs::F_Append | sys::fs::F_Text); - if (EC) { - delete OS; + auto NewOS = llvm::make_unique<raw_fd_ostream>( + SecureLogFile, EC, sys::fs::F_Append | sys::fs::F_Text); + if (EC) return Error(IDLoc, Twine("can't open secure log file: ") + SecureLogFile + " (" + EC.message() + ")"); - } - getContext().setSecureLog(OS); + OS = NewOS.get(); + getContext().setSecureLog(std::move(NewOS)); } // Write the message. @@ -867,9 +897,11 @@ bool DarwinAsmParser::parseDirectiveDataRegionEnd(StringRef, SMLoc) { /// parseVersionMin /// ::= .ios_version_min major,minor[,update] /// ::= .macosx_version_min major,minor[,update] -bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc) { +bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc) { int64_t Major = 0, Minor = 0, Update = 0; int Kind = StringSwitch<int>(Directive) + .Case(".watchos_version_min", MCVM_WatchOSVersionMin) + .Case(".tvos_version_min", MCVM_TvOSVersionMin) .Case(".ios_version_min", MCVM_IOSVersionMin) .Case(".macosx_version_min", MCVM_OSXVersionMin); // Get the major version number. 
@@ -902,6 +934,24 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc) { Lex(); } + const Triple &T = getContext().getObjectFileInfo()->getTargetTriple(); + Triple::OSType ExpectedOS = Triple::UnknownOS; + switch ((MCVersionMinType)Kind) { + case MCVM_WatchOSVersionMin: ExpectedOS = Triple::WatchOS; break; + case MCVM_TvOSVersionMin: ExpectedOS = Triple::TvOS; break; + case MCVM_IOSVersionMin: ExpectedOS = Triple::IOS; break; + case MCVM_OSXVersionMin: ExpectedOS = Triple::MacOSX; break; + } + if (T.getOS() != ExpectedOS) + Warning(Loc, Directive + " should only be used for " + + Triple::getOSTypeName(ExpectedOS) + " targets"); + + if (LastVersionMinDirective.isValid()) { + Warning(Loc, "overriding previous version_min directive"); + Note(LastVersionMinDirective, "previous definition is here"); + } + LastVersionMinDirective = Loc; + // We've parsed a correct version specifier, so send it to the streamer. getStreamer().EmitVersionMin((MCVersionMinType)Kind, Major, Minor, Update); diff --git a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 5f8a603..6cbcdec 100644 --- a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -52,8 +52,6 @@ public: addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro"); addDirectiveHandler< - &ELFAsmParser::ParseSectionDirectiveDataRelRoLocal>(".data.rel.ro.local"); - addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame"); addDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section"); addDirectiveHandler< @@ -81,8 +79,8 @@ public: // the best way for us to get access to it? bool ParseSectionDirectiveData(StringRef, SMLoc) { return ParseSectionSwitch(".data", ELF::SHT_PROGBITS, - ELF::SHF_WRITE |ELF::SHF_ALLOC, - SectionKind::getDataRel()); + ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getData()); } bool ParseSectionDirectiveText(StringRef, SMLoc) { return ParseSectionSwitch(".text", ELF::SHT_PROGBITS, @@ -113,9 +111,8 @@ public: } bool ParseSectionDirectiveDataRel(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | - ELF::SHF_WRITE, - SectionKind::getDataRel()); + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getData()); } bool ParseSectionDirectiveDataRelRo(StringRef, SMLoc) { return ParseSectionSwitch(".data.rel.ro", ELF::SHT_PROGBITS, @@ -123,17 +120,10 @@ public: ELF::SHF_WRITE, SectionKind::getReadOnlyWithRel()); } - bool ParseSectionDirectiveDataRelRoLocal(StringRef, SMLoc) { - return ParseSectionSwitch(".data.rel.ro.local", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | - ELF::SHF_WRITE, - SectionKind::getReadOnlyWithRelLocal()); - } bool ParseSectionDirectiveEhFrame(StringRef, SMLoc) { return ParseSectionSwitch(".eh_frame", ELF::SHT_PROGBITS, - ELF::SHF_ALLOC | - ELF::SHF_WRITE, - SectionKind::getDataRel()); + ELF::SHF_ALLOC | ELF::SHF_WRITE, + SectionKind::getData()); } bool ParseDirectivePushSection(StringRef, SMLoc); bool ParseDirectivePopSection(StringRef, SMLoc); diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp index 795cc85..e891bd2 100644 --- a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp +++ b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp @@ -12,8 +12,8 @@ using namespace llvm; -MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()), - TokStart(nullptr), SkipSpace(true) { +MCAsmLexer::MCAsmLexer() : TokStart(nullptr), SkipSpace(true) { + 
CurTok.emplace_back(AsmToken::Error, StringRef()); } MCAsmLexer::~MCAsmLexer() { diff --git a/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp index 60a3a3b..4e4b478 100644 --- a/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp +++ b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp @@ -7,13 +7,26 @@ // //===----------------------------------------------------------------------===// +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCTargetAsmParser.h" using namespace llvm; -MCTargetAsmParser::MCTargetAsmParser() - : AvailableFeatures(0), ParsingInlineAsm(false) +MCTargetAsmParser::MCTargetAsmParser(MCTargetOptions const &MCOptions, + const MCSubtargetInfo &STI) + : AvailableFeatures(0), ParsingInlineAsm(false), MCOptions(MCOptions), + STI(&STI) { } MCTargetAsmParser::~MCTargetAsmParser() { } + +MCSubtargetInfo &MCTargetAsmParser::copySTI() { + MCSubtargetInfo &STICopy = getContext().getSubtargetCopy(getSTI()); + STI = &STICopy; + return STICopy; +} + +const MCSubtargetInfo &MCTargetAsmParser::getSTI() const { + return *STI; +} diff --git a/contrib/llvm/lib/MC/MCSection.cpp b/contrib/llvm/lib/MC/MCSection.cpp index 9152f2b..dbd544a 100644 --- a/contrib/llvm/lib/MC/MCSection.cpp +++ b/contrib/llvm/lib/MC/MCSection.cpp @@ -21,7 +21,7 @@ using namespace llvm; MCSection::MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin) : Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false), - IsRegistered(false), Variant(V), Kind(K) {} + IsRegistered(false), DummyFragment(this), Variant(V), Kind(K) {} MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) { if (!End) @@ -72,7 +72,7 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) { if (MI == SubsectionFragmentMap.end()) IP = end(); else - IP = MI->second; + IP = MI->second->getIterator(); if (!ExactMatch && Subsection != 0) { // The GNU as documentation claims that subsections have an alignment of 4, // although this appears not to be the case. diff --git a/contrib/llvm/lib/MC/MCSectionCOFF.cpp b/contrib/llvm/lib/MC/MCSectionCOFF.cpp index ce0b4f5..b8373f4 100644 --- a/contrib/llvm/lib/MC/MCSectionCOFF.cpp +++ b/contrib/llvm/lib/MC/MCSectionCOFF.cpp @@ -11,6 +11,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/contrib/llvm/lib/MC/MCSectionELF.cpp b/contrib/llvm/lib/MC/MCSectionELF.cpp index b4448d7..5a0bb7f 100644 --- a/contrib/llvm/lib/MC/MCSectionELF.cpp +++ b/contrib/llvm/lib/MC/MCSectionELF.cpp @@ -27,12 +27,7 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name, if (isUnique()) return false; - // FIXME: Does .section .bss/.data/.text work everywhere?? 
- if (Name == ".text" || Name == ".data" || - (Name == ".bss" && !MAI.usesELFSectionDirectiveForBSS())) - return true; - - return false; + return MAI.shouldOmitSectionDirective(Name); } static void printName(raw_ostream &OS, StringRef Name) { @@ -138,6 +133,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << "note"; else if (Type == ELF::SHT_PROGBITS) OS << "progbits"; + else if (Type == ELF::SHT_X86_64_UNWIND) + OS << "unwind"; if (EntrySize) { assert(Flags & ELF::SHF_MERGE); diff --git a/contrib/llvm/lib/MC/MCSectionMachO.cpp b/contrib/llvm/lib/MC/MCSectionMachO.cpp index c9f1591..879c6e5 100644 --- a/contrib/llvm/lib/MC/MCSectionMachO.cpp +++ b/contrib/llvm/lib/MC/MCSectionMachO.cpp @@ -177,7 +177,7 @@ std::string MCSectionMachO::ParseSectionSpecifier(StringRef Spec, // In. TAAParsed = false; SmallVector<StringRef, 5> SplitSpec; - Spec.split(SplitSpec, ","); + Spec.split(SplitSpec, ','); // Remove leading and trailing whitespace. auto GetEmptyOrTrim = [&SplitSpec](size_t Idx) -> StringRef { return SplitSpec.size() > Idx ? SplitSpec[Idx].trim() : StringRef(); @@ -235,7 +235,7 @@ std::string MCSectionMachO::ParseSectionSpecifier(StringRef Spec, // In. // The attribute list is a '+' separated list of attributes. SmallVector<StringRef, 1> SectionAttrs; - Attrs.split(SectionAttrs, "+", /*MaxSplit=*/-1, /*KeepEmpty=*/false); + Attrs.split(SectionAttrs, '+', /*MaxSplit=*/-1, /*KeepEmpty=*/false); for (StringRef &SectionAttr : SectionAttrs) { auto AttrDescriptorI = std::find_if( diff --git a/contrib/llvm/lib/MC/MCStreamer.cpp b/contrib/llvm/lib/MC/MCStreamer.cpp index 7fbbbd9..836b405 100644 --- a/contrib/llvm/lib/MC/MCStreamer.cpp +++ b/contrib/llvm/lib/MC/MCStreamer.cpp @@ -107,8 +107,7 @@ void MCStreamer::EmitSLEB128IntValue(int64_t Value) { EmitBytes(OSE.str()); } -void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { +void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size, SMLoc Loc) { EmitValueImpl(Value, Size, Loc); } @@ -189,11 +188,9 @@ void MCStreamer::InitSections(bool NoExecStack) { SwitchSection(getContext().getObjectFileInfo()->getTextSection()); } -void MCStreamer::AssignSection(MCSymbol *Symbol, MCSection *Section) { - if (Section) - Symbol->setSection(*Section); - else - Symbol->setUndefined(); +void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) { + assert(Fragment); + Symbol->setFragment(Fragment); // As we emit symbols into a section, track the order so that they can // be sorted upon later. Zero is reserved to mean 'unemitted'. 
@@ -203,7 +200,8 @@ void MCStreamer::AssignSection(MCSymbol *Symbol, MCSection *Section) { void MCStreamer::EmitLabel(MCSymbol *Symbol) { assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); assert(getCurrentSection().first && "Cannot emit before setting section!"); - AssignSection(Symbol, getCurrentSection().first); + assert(!Symbol->getFragment() && "Unexpected fragment on symbol data!"); + Symbol->setFragment(&getCurrentSectionOnly()->getDummyFragment()); MCTargetStreamer *TS = getTargetStreamer(); if (TS) @@ -361,6 +359,14 @@ void MCStreamer::EmitCFIEscape(StringRef Values) { CurFrame->Instructions.push_back(Instruction); } +void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) { + MCSymbol *Label = EmitCFICommon(); + MCCFIInstruction Instruction = + MCCFIInstruction::createGnuArgsSize(Label, Size); + MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); + CurFrame->Instructions.push_back(Instruction); +} + void MCStreamer::EmitCFISignalFrame() { EnsureValidDwarfFrame(); MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo(); @@ -467,6 +473,8 @@ void MCStreamer::EmitWinEHHandlerData() { report_fatal_error("Chained unwind areas can't have handlers!"); } +void MCStreamer::EmitSyntaxDirective() {} + void MCStreamer::EmitWinCFIPushReg(unsigned Register) { EnsureValidWinFrameInfo(); @@ -679,8 +687,7 @@ void MCStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, void MCStreamer::ChangeSection(MCSection *, const MCExpr *) {} void MCStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {} void MCStreamer::EmitBytes(StringRef Data) {} -void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { +void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) { visitUsedExpr(*Value); } void MCStreamer::EmitULEB128Value(const MCExpr *Value) {} @@ -690,9 +697,7 @@ void MCStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, unsigned MaxBytesToEmit) {} void MCStreamer::EmitCodeAlignment(unsigned ByteAlignment, unsigned MaxBytesToEmit) {} -bool MCStreamer::EmitValueToOffset(const MCExpr *Offset, unsigned char Value) { - return false; -} +void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value) {} void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {} void MCStreamer::EmitBundleLock(bool AlignToEnd) {} void MCStreamer::FinishImpl() {} diff --git a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp index 9210cf5..1b59250 100644 --- a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp +++ b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp @@ -32,8 +32,8 @@ void MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) { CPUSchedModel = &MCSchedModel::GetDefaultSchedModel(); } -void MCSubtargetInfo::setDefaultFeatures(StringRef CPU) { - FeatureBits = getFeatures(CPU, "", ProcDesc, ProcFeatures); +void MCSubtargetInfo::setDefaultFeatures(StringRef CPU, StringRef FS) { + FeatureBits = getFeatures(CPU, FS, ProcDesc, ProcFeatures); } MCSubtargetInfo::MCSubtargetInfo( @@ -63,32 +63,30 @@ FeatureBitset MCSubtargetInfo::ToggleFeature(const FeatureBitset &FB) { /// ToggleFeature - Toggle a feature and returns the re-computed feature /// bits. This version will also change all implied bits. 
FeatureBitset MCSubtargetInfo::ToggleFeature(StringRef FS) { - SubtargetFeatures Features; - FeatureBits = Features.ToggleFeature(FeatureBits, FS, ProcFeatures); + SubtargetFeatures::ToggleFeature(FeatureBits, FS, ProcFeatures); return FeatureBits; } FeatureBitset MCSubtargetInfo::ApplyFeatureFlag(StringRef FS) { - SubtargetFeatures Features; - FeatureBits = Features.ApplyFeatureFlag(FeatureBits, FS, ProcFeatures); + SubtargetFeatures::ApplyFeatureFlag(FeatureBits, FS, ProcFeatures); return FeatureBits; } const MCSchedModel &MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const { assert(ProcSchedModels && "Processor machine model not available!"); - unsigned NumProcs = ProcDesc.size(); -#ifndef NDEBUG - for (size_t i = 1; i < NumProcs; i++) { - assert(strcmp(ProcSchedModels[i - 1].Key, ProcSchedModels[i].Key) < 0 && - "Processor machine model table is not sorted"); - } -#endif + ArrayRef<SubtargetInfoKV> SchedModels(ProcSchedModels, ProcDesc.size()); + + assert(std::is_sorted(SchedModels.begin(), SchedModels.end(), + [](const SubtargetInfoKV &LHS, const SubtargetInfoKV &RHS) { + return strcmp(LHS.Key, RHS.Key) < 0; + }) && + "Processor machine model table is not sorted"); // Find entry - const SubtargetInfoKV *Found = - std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, CPU); - if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) { + auto Found = + std::lower_bound(SchedModels.begin(), SchedModels.end(), CPU); + if (Found == SchedModels.end() || StringRef(Found->Key) != CPU) { if (CPU != "help") // Don't error if the user asked for help. errs() << "'" << CPU << "' is not a recognized processor for this target" diff --git a/contrib/llvm/lib/MC/MCSymbol.cpp b/contrib/llvm/lib/MC/MCSymbol.cpp index 125380a..ab3b8eb 100644 --- a/contrib/llvm/lib/MC/MCSymbol.cpp +++ b/contrib/llvm/lib/MC/MCSymbol.cpp @@ -16,8 +16,11 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -// Sentinel value for the absolute pseudo section. -MCSection *MCSymbol::AbsolutePseudoSection = reinterpret_cast<MCSection *>(1); +// Only the address of this fragment is ever actually used. +static MCDummyFragment SentinelFragment(nullptr); + +// Sentinel value for the absolute pseudo fragment. 
+MCFragment *MCSymbol::AbsolutePseudoFragment = &SentinelFragment; void *MCSymbol::operator new(size_t s, const StringMapEntry<bool> *Name, MCContext &Ctx) { diff --git a/contrib/llvm/lib/MC/MCTargetOptions.cpp b/contrib/llvm/lib/MC/MCTargetOptions.cpp index 1258d9e..4656227 100644 --- a/contrib/llvm/lib/MC/MCTargetOptions.cpp +++ b/contrib/llvm/lib/MC/MCTargetOptions.cpp @@ -14,9 +14,10 @@ namespace llvm { MCTargetOptions::MCTargetOptions() : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false), - MCFatalWarnings(false), MCSaveTempLabels(false), - MCUseDwarfDirectory(false), ShowMCEncoding(false), ShowMCInst(false), - AsmVerbose(false), DwarfVersion(0), ABIName() {} + MCFatalWarnings(false), MCNoWarn(false), MCSaveTempLabels(false), + MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false), + ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false), + DwarfVersion(0), ABIName() {} StringRef MCTargetOptions::getABIName() const { return ABIName; diff --git a/contrib/llvm/lib/MC/MCWinEH.cpp b/contrib/llvm/lib/MC/MCWinEH.cpp index d5d9ead..83af203 100644 --- a/contrib/llvm/lib/MC/MCWinEH.cpp +++ b/contrib/llvm/lib/MC/MCWinEH.cpp @@ -49,10 +49,10 @@ static MCSection *getUnwindInfoSection(StringRef SecName, if (CodeSecName.startswith(".text$")) CodeSecName = CodeSecName.substr(6); - return Context.getCOFFSection( - (SecName + Twine('$') + CodeSecName).str(), - COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, - SectionKind::getDataRel()); + return Context.getCOFFSection((SecName + Twine('$') + CodeSecName).str(), + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getData()); } } diff --git a/contrib/llvm/lib/MC/MachObjectWriter.cpp b/contrib/llvm/lib/MC/MachObjectWriter.cpp index 8ce6127..324385f 100644 --- a/contrib/llvm/lib/MC/MachObjectWriter.cpp +++ b/contrib/llvm/lib/MC/MachObjectWriter.cpp @@ -78,7 +78,6 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S, dyn_cast<const MCConstantExpr>(S.getVariableValue())) return C->getValue(); - MCValue Target; if (!S.getVariableValue()->evaluateAsRelocatable(Target, &Layout, nullptr)) report_fatal_error("unable to evaluate offset for variable '" + @@ -117,7 +116,8 @@ uint64_t MachObjectWriter::getPaddingSize(const MCSection *Sec, return OffsetToAlignment(EndAddr, NextSec.getAlignment()); } -void MachObjectWriter::writeHeader(unsigned NumLoadCommands, +void MachObjectWriter::writeHeader(MachO::HeaderFileType Type, + unsigned NumLoadCommands, unsigned LoadCommandsSize, bool SubsectionsViaSymbols) { uint32_t Flags = 0; @@ -128,7 +128,7 @@ void MachObjectWriter::writeHeader(unsigned NumLoadCommands, // struct mach_header (28 bytes) or // struct mach_header_64 (32 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(is64Bit() ? MachO::MH_MAGIC_64 : MachO::MH_MAGIC); @@ -136,29 +136,30 @@ void MachObjectWriter::writeHeader(unsigned NumLoadCommands, write32(TargetObjectWriter->getCPUType()); write32(TargetObjectWriter->getCPUSubtype()); - write32(MachO::MH_OBJECT); + write32(Type); write32(NumLoadCommands); write32(LoadCommandsSize); write32(Flags); if (is64Bit()) write32(0); // reserved - assert(OS.tell() - Start == - (is64Bit()?sizeof(MachO::mach_header_64): sizeof(MachO::mach_header))); + assert( + getStream().tell() - Start == + (is64Bit() ? sizeof(MachO::mach_header_64) : sizeof(MachO::mach_header))); } /// writeSegmentLoadCommand - Write a segment load command. 
/// /// \param NumSections The number of sections in this segment. /// \param SectionDataSize The total size of the sections. -void MachObjectWriter::writeSegmentLoadCommand(unsigned NumSections, - uint64_t VMSize, - uint64_t SectionDataStartOffset, - uint64_t SectionDataSize) { +void MachObjectWriter::writeSegmentLoadCommand( + StringRef Name, unsigned NumSections, uint64_t VMAddr, uint64_t VMSize, + uint64_t SectionDataStartOffset, uint64_t SectionDataSize, uint32_t MaxProt, + uint32_t InitProt) { // struct segment_command (56 bytes) or // struct segment_command_64 (72 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; unsigned SegmentLoadCommandSize = @@ -169,31 +170,32 @@ void MachObjectWriter::writeSegmentLoadCommand(unsigned NumSections, NumSections * (is64Bit() ? sizeof(MachO::section_64) : sizeof(MachO::section))); - writeBytes("", 16); + assert(Name.size() <= 16); + writeBytes(Name, 16); if (is64Bit()) { - write64(0); // vmaddr + write64(VMAddr); // vmaddr write64(VMSize); // vmsize write64(SectionDataStartOffset); // file offset write64(SectionDataSize); // file size } else { - write32(0); // vmaddr + write32(VMAddr); // vmaddr write32(VMSize); // vmsize write32(SectionDataStartOffset); // file offset write32(SectionDataSize); // file size } // maxprot - write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); + write32(MaxProt); // initprot - write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); + write32(InitProt); write32(NumSections); write32(0); // flags - assert(OS.tell() - Start == SegmentLoadCommandSize); + assert(getStream().tell() - Start == SegmentLoadCommandSize); } -void MachObjectWriter::writeSection(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCSection &Sec, uint64_t FileOffset, +void MachObjectWriter::writeSection(const MCAsmLayout &Layout, + const MCSection &Sec, uint64_t VMAddr, + uint64_t FileOffset, unsigned Flags, uint64_t RelocationsStart, unsigned NumRelocations) { uint64_t SectionSize = Layout.getSectionAddressSize(&Sec); @@ -208,24 +210,20 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, // struct section (68 bytes) or // struct section_64 (80 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; writeBytes(Section.getSectionName(), 16); writeBytes(Section.getSegmentName(), 16); if (is64Bit()) { - write64(getSectionAddress(&Sec)); // address + write64(VMAddr); // address write64(SectionSize); // size } else { - write32(getSectionAddress(&Sec)); // address + write32(VMAddr); // address write32(SectionSize); // size } write32(FileOffset); - unsigned Flags = Section.getTypeAndAttributes(); - if (Section.hasInstructions()) - Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS; - assert(isPowerOf2_32(Section.getAlignment()) && "Invalid alignment!"); write32(Log2_32(Section.getAlignment())); write32(NumRelocations ? RelocationsStart : 0); @@ -236,8 +234,8 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm, if (is64Bit()) write32(0); // reserved3 - assert(OS.tell() - Start == (is64Bit() ? sizeof(MachO::section_64) : - sizeof(MachO::section))); + assert(getStream().tell() - Start == + (is64Bit() ? 
sizeof(MachO::section_64) : sizeof(MachO::section))); } void MachObjectWriter::writeSymtabLoadCommand(uint32_t SymbolOffset, @@ -246,7 +244,7 @@ void MachObjectWriter::writeSymtabLoadCommand(uint32_t SymbolOffset, uint32_t StringTableSize) { // struct symtab_command (24 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(MachO::LC_SYMTAB); @@ -256,7 +254,7 @@ void MachObjectWriter::writeSymtabLoadCommand(uint32_t SymbolOffset, write32(StringTableOffset); write32(StringTableSize); - assert(OS.tell() - Start == sizeof(MachO::symtab_command)); + assert(getStream().tell() - Start == sizeof(MachO::symtab_command)); } void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol, @@ -269,7 +267,7 @@ void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol, uint32_t NumIndirectSymbols) { // struct dysymtab_command (80 bytes) - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(MachO::LC_DYSYMTAB); @@ -293,7 +291,7 @@ void MachObjectWriter::writeDysymtabLoadCommand(uint32_t FirstLocalSymbol, write32(0); // locreloff write32(0); // nlocrel - assert(OS.tell() - Start == sizeof(MachO::dysymtab_command)); + assert(getStream().tell() - Start == sizeof(MachO::dysymtab_command)); } MachObjectWriter::MachSymbolData * @@ -389,7 +387,7 @@ void MachObjectWriter::writeNlist(MachSymbolData &MSD, void MachObjectWriter::writeLinkeditLoadCommand(uint32_t Type, uint32_t DataOffset, uint32_t DataSize) { - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(Type); @@ -397,7 +395,7 @@ void MachObjectWriter::writeLinkeditLoadCommand(uint32_t Type, write32(DataOffset); write32(DataSize); - assert(OS.tell() - Start == sizeof(MachO::linkedit_data_command)); + assert(getStream().tell() - Start == sizeof(MachO::linkedit_data_command)); } static unsigned ComputeLinkerOptionsLoadCommandSize( @@ -413,7 +411,7 @@ void MachObjectWriter::writeLinkerOptionsLoadCommand( const std::vector<std::string> &Options) { unsigned Size = ComputeLinkerOptionsLoadCommandSize(Options, is64Bit()); - uint64_t Start = OS.tell(); + uint64_t Start = getStream().tell(); (void) Start; write32(MachO::LC_LINKER_OPTION); @@ -429,7 +427,7 @@ void MachObjectWriter::writeLinkerOptionsLoadCommand( // Pad to a multiple of the pointer size. writeBytes("", OffsetToAlignment(BytesWritten, is64Bit() ? 8 : 4)); - assert(OS.tell() - Start == Size); + assert(getStream().tell() - Start == Size); } void MachObjectWriter::recordRelocation(MCAssembler &Asm, @@ -458,9 +456,9 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_SYMBOL_STUBS) { - MCSymbol &Symbol = *it->Symbol; - report_fatal_error("indirect symbol '" + Symbol.getName() + - "' not in a symbol pointer or stub section"); + MCSymbol &Symbol = *it->Symbol; + report_fatal_error("indirect symbol '" + Symbol.getName() + + "' not in a symbol pointer or stub section"); } } @@ -522,7 +520,7 @@ void MachObjectWriter::computeSymbolTable( StringTable.add(Symbol.getName()); } - StringTable.finalize(StringTableBuilder::MachO); + StringTable.finalize(); // Build the symbol arrays but only for non-local symbols. 
// @@ -628,6 +626,18 @@ void MachObjectWriter::executePostLayoutBinding(MCAssembler &Asm, } bool MachObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( + const MCAssembler &Asm, const MCSymbol &A, const MCSymbol &B, + bool InSet) const { + // FIXME: We don't handle things like + // foo = . + // creating atoms. + if (A.isVariable() || B.isVariable()) + return false; + return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, A, B, + InSet); +} + +bool MachObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( const MCAssembler &Asm, const MCSymbol &SymA, const MCFragment &FB, bool InSet, bool IsPCRel) const { if (InSet) @@ -746,7 +756,7 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, ++NumLoadCommands; LoadCommandsSize += ComputeLinkerOptionsLoadCommandSize(Option, is64Bit()); } - + // Compute the total size of the section data, as well as its file size and vm // size. uint64_t SectionDataStart = (is64Bit() ? sizeof(MachO::mach_header_64) : @@ -776,18 +786,25 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, SectionDataFileSize += SectionDataPadding; // Write the prolog, starting with the header and load command... - writeHeader(NumLoadCommands, LoadCommandsSize, + writeHeader(MachO::MH_OBJECT, NumLoadCommands, LoadCommandsSize, Asm.getSubsectionsViaSymbols()); - writeSegmentLoadCommand(NumSections, VMSize, - SectionDataStart, SectionDataSize); + uint32_t Prot = + MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE; + writeSegmentLoadCommand("", NumSections, 0, VMSize, SectionDataStart, + SectionDataSize, Prot, Prot); // ... and then the section headers. uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize; - for (const MCSection &Sec : Asm) { + for (const MCSection &Section : Asm) { + const auto &Sec = cast<MCSectionMachO>(Section); std::vector<RelAndSymbol> &Relocs = Relocations[&Sec]; unsigned NumRelocs = Relocs.size(); uint64_t SectionStart = SectionDataStart + getSectionAddress(&Sec); - writeSection(Asm, Layout, Sec, SectionStart, RelocTableEnd, NumRelocs); + unsigned Flags = Sec.getTypeAndAttributes(); + if (Sec.hasInstructions()) + Flags |= MachO::S_ATTR_SOME_INSTRUCTIONS; + writeSection(Layout, Sec, getSectionAddress(&Sec), SectionStart, Flags, + RelocTableEnd, NumRelocs); RelocTableEnd += NumRelocs * sizeof(MachO::any_relocation_info); } @@ -798,8 +815,22 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, assert(VersionInfo.Major < 65536 && "unencodable major target version"); uint32_t EncodedVersion = VersionInfo.Update | (VersionInfo.Minor << 8) | (VersionInfo.Major << 16); - write32(VersionInfo.Kind == MCVM_OSXVersionMin ? MachO::LC_VERSION_MIN_MACOSX : - MachO::LC_VERSION_MIN_IPHONEOS); + MachO::LoadCommandType LCType; + switch (VersionInfo.Kind) { + case MCVM_OSXVersionMin: + LCType = MachO::LC_VERSION_MIN_MACOSX; + break; + case MCVM_IOSVersionMin: + LCType = MachO::LC_VERSION_MIN_IPHONEOS; + break; + case MCVM_TvOSVersionMin: + LCType = MachO::LC_VERSION_MIN_TVOS; + break; + case MCVM_WatchOSVersionMin: + LCType = MachO::LC_VERSION_MIN_WATCHOS; + break; + } + write32(LCType); write32(sizeof(MachO::version_min_command)); write32(EncodedVersion); write32(0); // reserved. @@ -901,12 +932,12 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, // Write out the loh commands, if there is one. if (LOHSize) { #ifndef NDEBUG - unsigned Start = OS.tell(); + unsigned Start = getStream().tell(); #endif Asm.getLOHContainer().emit(*this, Layout); // Pad to a multiple of the pointer size. 
writeBytes("", OffsetToAlignment(LOHRawSize, is64Bit() ? 8 : 4)); - assert(OS.tell() - Start == LOHSize); + assert(getStream().tell() - Start == LOHSize); } // Write the symbol table data, if used. @@ -942,7 +973,7 @@ void MachObjectWriter::writeObject(MCAssembler &Asm, writeNlist(Entry, Layout); // Write the string table. - OS << StringTable.data(); + getStream() << StringTable.data(); } } diff --git a/contrib/llvm/lib/MC/StringTableBuilder.cpp b/contrib/llvm/lib/MC/StringTableBuilder.cpp index 9de9363..80e5522 100644 --- a/contrib/llvm/lib/MC/StringTableBuilder.cpp +++ b/contrib/llvm/lib/MC/StringTableBuilder.cpp @@ -8,35 +8,71 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/StringTableBuilder.h" -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/COFF.h" #include "llvm/Support/Endian.h" +#include <vector> + using namespace llvm; -static bool compareBySuffix(StringRef a, StringRef b) { - size_t sizeA = a.size(); - size_t sizeB = b.size(); - size_t len = std::min(sizeA, sizeB); - for (size_t i = 0; i < len; ++i) { - char ca = a[sizeA - i - 1]; - char cb = b[sizeB - i - 1]; - if (ca != cb) - return ca > cb; +StringTableBuilder::StringTableBuilder(Kind K) : K(K) {} + +typedef std::pair<StringRef, size_t> StringPair; + +// Returns the character at Pos from end of a string. +static int charTailAt(StringPair *P, size_t Pos) { + StringRef S = P->first; + if (Pos >= S.size()) + return -1; + return (unsigned char)S[S.size() - Pos - 1]; +} + +// Three-way radix quicksort. This is much faster than std::sort with strcmp +// because it does not compare characters that we already know the same. +static void multikey_qsort(StringPair **Begin, StringPair **End, int Pos) { +tailcall: + if (End - Begin <= 1) + return; + + // Partition items. Items in [Begin, P) are greater than the pivot, + // [P, Q) are the same as the pivot, and [Q, End) are less than the pivot. + int Pivot = charTailAt(*Begin, Pos); + StringPair **P = Begin; + StringPair **Q = End; + for (StringPair **R = Begin + 1; R < Q;) { + int C = charTailAt(*R, Pos); + if (C > Pivot) + std::swap(*P++, *R++); + else if (C < Pivot) + std::swap(*--Q, *R); + else + R++; + } + + multikey_qsort(Begin, P, Pos); + multikey_qsort(Q, End, Pos); + if (Pivot != -1) { + // qsort(P, Q, Pos + 1), but with tail call optimization. + Begin = P; + End = Q; + ++Pos; + goto tailcall; } - return sizeA > sizeB; } -void StringTableBuilder::finalize(Kind kind) { - SmallVector<StringRef, 8> Strings; +void StringTableBuilder::finalize() { + std::vector<std::pair<StringRef, size_t> *> Strings; Strings.reserve(StringIndexMap.size()); + for (std::pair<StringRef, size_t> &P : StringIndexMap) + Strings.push_back(&P); - for (auto i = StringIndexMap.begin(), e = StringIndexMap.end(); i != e; ++i) - Strings.push_back(i->getKey()); - - std::sort(Strings.begin(), Strings.end(), compareBySuffix); + if (!Strings.empty()) + multikey_qsort(&Strings[0], &Strings[0] + Strings.size(), 0); - switch (kind) { + switch (K) { + case RAW: + break; case ELF: case MachO: // Start the table with a NUL byte. 
@@ -49,22 +85,25 @@ void StringTableBuilder::finalize(Kind kind) { } StringRef Previous; - for (StringRef s : Strings) { - if (kind == WinCOFF) - assert(s.size() > COFF::NameSize && "Short string in COFF string table!"); + for (std::pair<StringRef, size_t> *P : Strings) { + StringRef S = P->first; + if (K == WinCOFF) + assert(S.size() > COFF::NameSize && "Short string in COFF string table!"); - if (Previous.endswith(s)) { - StringIndexMap[s] = StringTable.size() - 1 - s.size(); + if (Previous.endswith(S)) { + P->second = StringTable.size() - S.size() - (K != RAW); continue; } - StringIndexMap[s] = StringTable.size(); - StringTable += s; - StringTable += '\x00'; - Previous = s; + P->second = StringTable.size(); + StringTable += S; + if (K != RAW) + StringTable += '\x00'; + Previous = S; } - switch (kind) { + switch (K) { + case RAW: case ELF: break; case MachO: @@ -75,14 +114,31 @@ void StringTableBuilder::finalize(Kind kind) { case WinCOFF: // Write the table size in the first word. assert(StringTable.size() <= std::numeric_limits<uint32_t>::max()); - uint32_t size = static_cast<uint32_t>(StringTable.size()); + uint32_t Size = static_cast<uint32_t>(StringTable.size()); support::endian::write<uint32_t, support::little, support::unaligned>( - StringTable.data(), size); + StringTable.data(), Size); break; } + + Size = StringTable.size(); } void StringTableBuilder::clear() { StringTable.clear(); StringIndexMap.clear(); } + +size_t StringTableBuilder::getOffset(StringRef S) const { + assert(isFinalized()); + auto I = StringIndexMap.find(S); + assert(I != StringIndexMap.end() && "String is not in table!"); + return I->second; +} + +size_t StringTableBuilder::add(StringRef S) { + assert(!isFinalized()); + auto P = StringIndexMap.insert(std::make_pair(S, Size)); + if (P.second) + Size += S.size() + (K != RAW); + return P.first->second; +} diff --git a/contrib/llvm/lib/MC/SubtargetFeature.cpp b/contrib/llvm/lib/MC/SubtargetFeature.cpp index 76574e9..7cce0fe 100644 --- a/contrib/llvm/lib/MC/SubtargetFeature.cpp +++ b/contrib/llvm/lib/MC/SubtargetFeature.cpp @@ -56,7 +56,7 @@ static inline bool isEnabled(StringRef Feature) { /// static void Split(std::vector<std::string> &V, StringRef S) { SmallVector<StringRef, 3> Tmp; - S.split(Tmp, ",", -1, false /* KeepEmpty */); + S.split(Tmp, ',', -1, false /* KeepEmpty */); V.assign(Tmp.begin(), Tmp.end()); } @@ -160,10 +160,9 @@ void ClearImpliedBits(FeatureBitset &Bits, } } -/// ToggleFeature - Toggle a feature and returns the newly updated feature -/// bits. -FeatureBitset -SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature, +/// ToggleFeature - Toggle a feature and update the feature bits. +void +SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature, ArrayRef<SubtargetFeatureKV> FeatureTable) { // Find feature in table. 
@@ -186,12 +185,9 @@ SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature, << "' is not a recognized feature for this target" << " (ignoring feature)\n"; } - - return Bits; } -FeatureBitset -SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature, +void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature, ArrayRef<SubtargetFeatureKV> FeatureTable) { assert(hasFlag(Feature)); @@ -203,7 +199,7 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature, if (FeatureEntry) { // Enable/disable feature in bits if (isEnabled(Feature)) { - Bits |= FeatureEntry->Value; + Bits |= FeatureEntry->Value; // For each feature that this implies, set it. SetImpliedBits(Bits, FeatureEntry, FeatureTable); @@ -218,8 +214,6 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature, << "' is not a recognized feature for this target" << " (ignoring feature)\n"; } - - return Bits; } @@ -234,14 +228,10 @@ SubtargetFeatures::getFeatureBits(StringRef CPU, return FeatureBitset(); #ifndef NDEBUG - for (size_t i = 1, e = CPUTable.size(); i != e; ++i) { - assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 && - "CPU table is not sorted"); - } - for (size_t i = 1, e = FeatureTable.size(); i != e; ++i) { - assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 && - "CPU features table is not sorted"); - } + assert(std::is_sorted(std::begin(CPUTable), std::end(CPUTable)) && + "CPU table is not sorted"); + assert(std::is_sorted(std::begin(FeatureTable), std::end(FeatureTable)) && + "CPU features table is not sorted"); #endif // Resulting bits FeatureBitset Bits; @@ -277,7 +267,7 @@ SubtargetFeatures::getFeatureBits(StringRef CPU, if (Feature == "+help") Help(CPUTable, FeatureTable); - Bits = ApplyFeatureFlag(Bits, Feature, FeatureTable); + ApplyFeatureFlag(Bits, Feature, FeatureTable); } return Bits; diff --git a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp index 56ef1c7..a76cbdb 100644 --- a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/Config/config.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -32,8 +33,10 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/JamCRC.h" #include "llvm/Support/TimeValue.h" #include <cstdio> +#include <ctime> using namespace llvm; @@ -76,8 +79,6 @@ public: COFFSymbol(StringRef name); void set_name_offset(uint32_t Offset); - bool should_keep() const; - int64_t getIndex() const { return Index; } void setIndex(int Value) { Index = Value; @@ -125,7 +126,7 @@ public: COFF::header Header; sections Sections; symbols Symbols; - StringTableBuilder Strings; + StringTableBuilder Strings{StringTableBuilder::WinCOFF}; // Maps used during object file creation. section_map SectionMap; @@ -160,8 +161,6 @@ public: void SetSymbolName(COFFSymbol &S); void SetSectionName(COFFSection &S); - bool ExportSymbol(const MCSymbol &Symbol, MCAssembler &Asm); - bool IsPhysicalSection(COFFSection *S); // Entity writing methods. 
@@ -215,38 +214,6 @@ void COFFSymbol::set_name_offset(uint32_t Offset) { write_uint32_le(Data.Name + 4, Offset); } -/// logic to decide if the symbol should be reported in the symbol table -bool COFFSymbol::should_keep() const { - // no section means its external, keep it - if (!Section) - return true; - - // if it has relocations pointing at it, keep it - if (Relocations > 0) { - assert(Section->Number != -1 && "Sections with relocations must be real!"); - return true; - } - - // if this is a safeseh handler, keep it - if (MC && (cast<MCSymbolCOFF>(MC)->isSafeSEH())) - return true; - - // if the section its in is being droped, drop it - if (Section->Number == -1) - return false; - - // if it is the section symbol, keep it - if (Section->Symbol == this) - return true; - - // if its temporary, drop it - if (MC && MC->isTemporary()) - return false; - - // otherwise, keep it - return true; -} - //------------------------------------------------------------------------------ // Section class implementation @@ -392,7 +359,6 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol, MCAssembler &Assembler, const MCAsmLayout &Layout) { COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol); - SymbolMap[&Symbol] = coff_symbol; if (cast<MCSymbolCOFF>(Symbol).isWeakExternal()) { coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; @@ -515,25 +481,6 @@ void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) { std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size()); } -bool WinCOFFObjectWriter::ExportSymbol(const MCSymbol &Symbol, - MCAssembler &Asm) { - // This doesn't seem to be right. Strings referred to from the .data section - // need symbols so they can be linked to code in the .text section right? - - // return Asm.isSymbolLinkerVisible(Symbol); - - // Non-temporary labels should always be visible to the linker. - if (!Symbol.isTemporary()) - return true; - - // Temporary variable symbols are invisible. - if (Symbol.isVariable()) - return false; - - // Absolute temporary labels are never visible. - return !Symbol.isAbsolute(); -} - bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) { return (S->Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) == 0; @@ -663,7 +610,7 @@ void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm, defineSection(static_cast<const MCSectionCOFF &>(Section)); for (const MCSymbol &Symbol : Asm.symbols()) - if (ExportSymbol(Symbol, Asm)) + if (!Symbol.isTemporary()) DefineSymbol(Symbol, Asm, Layout); } @@ -674,7 +621,8 @@ bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl( // thunk to implement their /INCREMENTAL feature. Make sure we don't optimize // away any relocations to functions. 
uint16_t Type = cast<MCSymbolCOFF>(SymA).getType(); - if ((Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION) + if (Asm.isIncrementalLinkerCompatible() && + (Type >> COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION) return false; return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB, InSet, IsPCRel); @@ -702,41 +650,49 @@ void WinCOFFObjectWriter::recordRelocation( const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) { assert(Target.getSymA() && "Relocation must reference a symbol!"); - const MCSymbol &Symbol = Target.getSymA()->getSymbol(); - const MCSymbol &A = Symbol; - if (!A.isRegistered()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + const MCSymbol &A = Target.getSymA()->getSymbol(); + if (!A.isRegistered()) { + Asm.getContext().reportError(Fixup.getLoc(), Twine("symbol '") + A.getName() + "' can not be undefined"); + return; + } + if (A.isTemporary() && A.isUndefined()) { + Asm.getContext().reportError(Fixup.getLoc(), + Twine("assembler label '") + A.getName() + + "' can not be undefined"); + return; + } MCSection *Section = Fragment->getParent(); // Mark this symbol as requiring an entry in the symbol table. assert(SectionMap.find(Section) != SectionMap.end() && "Section must already have been defined in executePostLayoutBinding!"); - assert(SymbolMap.find(&A) != SymbolMap.end() && - "Symbol must already have been defined in executePostLayoutBinding!"); COFFSection *coff_section = SectionMap[Section]; - COFFSymbol *coff_symbol = SymbolMap[&A]; const MCSymbolRefExpr *SymB = Target.getSymB(); bool CrossSection = false; if (SymB) { const MCSymbol *B = &SymB->getSymbol(); - if (!B->getFragment()) - Asm.getContext().reportFatalError( + if (!B->getFragment()) { + Asm.getContext().reportError( Fixup.getLoc(), Twine("symbol '") + B->getName() + "' can not be undefined in a subtraction expression"); + return; + } - if (!A.getFragment()) - Asm.getContext().reportFatalError( + if (!A.getFragment()) { + Asm.getContext().reportError( Fixup.getLoc(), - Twine("symbol '") + Symbol.getName() + + Twine("symbol '") + A.getName() + "' can not be undefined in a subtraction expression"); + return; + } - CrossSection = &Symbol.getSection() != &B->getSection(); + CrossSection = &A.getSection() != &B->getSection(); // Offset of the symbol in the section int64_t OffsetOfB = Layout.getSymbolOffset(*B); @@ -765,12 +721,19 @@ void WinCOFFObjectWriter::recordRelocation( Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment); // Turn relocations for temporary symbols into section relocations. - if (coff_symbol->MC->isTemporary() || CrossSection) { - Reloc.Symb = coff_symbol->Section->Symbol; - FixedValue += Layout.getFragmentOffset(coff_symbol->MC->getFragment()) + - coff_symbol->MC->getOffset(); - } else - Reloc.Symb = coff_symbol; + if (A.isTemporary() || CrossSection) { + MCSection *TargetSection = &A.getSection(); + assert( + SectionMap.find(TargetSection) != SectionMap.end() && + "Section must already have been defined in executePostLayoutBinding!"); + Reloc.Symb = SectionMap[TargetSection]->Symbol; + FixedValue += Layout.getSymbolOffset(A); + } else { + assert( + SymbolMap.find(&A) != SymbolMap.end() && + "Symbol must already have been defined in executePostLayoutBinding!"); + Reloc.Symb = SymbolMap[&A]; + } ++Reloc.Symb->Relocations; @@ -884,14 +847,10 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, // Update section number & offset for symbols that have them. 
if (Symbol->Section) Symbol->Data.SectionNumber = Symbol->Section->Number; - if (Symbol->should_keep()) { - Symbol->setIndex(Header.NumberOfSymbols++); - // Update auxiliary symbol info. - Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size(); - Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols; - } else { - Symbol->setIndex(-1); - } + Symbol->setIndex(Header.NumberOfSymbols++); + // Update auxiliary symbol info. + Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size(); + Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols; } // Build string table. @@ -899,16 +858,15 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, if (S->Name.size() > COFF::NameSize) Strings.add(S->Name); for (const auto &S : Symbols) - if (S->should_keep() && S->Name.size() > COFF::NameSize) + if (S->Name.size() > COFF::NameSize) Strings.add(S->Name); - Strings.finalize(StringTableBuilder::WinCOFF); + Strings.finalize(); // Set names. for (const auto &S : Sections) SetSectionName(*S); for (auto &S : Symbols) - if (S->should_keep()) - SetSymbolName(*S); + SetSymbolName(*S); // Fixup weak external references. for (auto &Symbol : Symbols) { @@ -948,7 +906,7 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, // Assign file offsets to COFF object file structures. - unsigned offset = 0; + unsigned offset = getInitialOffset(); if (UseBigObj) offset += COFF::Header32Size; @@ -1011,8 +969,17 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, Header.PointerToSymbolTable = offset; - // We want a deterministic output. It looks like GNU as also writes 0 in here. - Header.TimeDateStamp = 0; + // MS LINK expects to be able to use this timestamp to implement their + // /INCREMENTAL feature. + if (Asm.isIncrementalLinkerCompatible()) { + std::time_t Now = time(nullptr); + if (Now < 0 || !isUInt<32>(Now)) + Now = UINT32_MAX; + Header.TimeDateStamp = Now; + } else { + // Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU. + Header.TimeDateStamp = 0; + } // Write it all to disk... WriteFileHeader(Header); @@ -1029,6 +996,7 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, } } + SmallVector<char, 128> SectionContents; for (i = Sections.begin(), ie = Sections.end(), j = Asm.begin(), je = Asm.end(); (i != ie) && (j != je); ++i, ++j) { @@ -1037,20 +1005,47 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, continue; if ((*i)->Header.PointerToRawData != 0) { - assert(OS.tell() <= (*i)->Header.PointerToRawData && + assert(getStream().tell() <= (*i)->Header.PointerToRawData && "Section::PointerToRawData is insane!"); - unsigned SectionDataPadding = (*i)->Header.PointerToRawData - OS.tell(); + unsigned SectionDataPadding = + (*i)->Header.PointerToRawData - getStream().tell(); assert(SectionDataPadding < 4 && "Should only need at most three bytes of padding!"); WriteZeros(SectionDataPadding); + // Save the contents of the section to a temporary buffer, we need this + // to CRC the data before we dump it into the object file. + SectionContents.clear(); + raw_svector_ostream VecOS(SectionContents); + raw_pwrite_stream &OldStream = getStream(); + // Redirect the output stream to our buffer. + setStream(VecOS); + // Fill our buffer with the section data. Asm.writeSectionData(&*j, Layout); + // Reset the stream back to what it was before. + setStream(OldStream); + + // Calculate our CRC with an initial value of '0', this is not how + // JamCRC is specified but it aligns with the expected output. 
+ JamCRC JC(/*Init=*/0x00000000U); + JC.update(SectionContents); + + // Write the section contents to the object file. + getStream() << SectionContents; + + // Update the section definition auxiliary symbol to record the CRC. + COFFSection *Sec = SectionMap[&*j]; + COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux; + assert(AuxSyms.size() == 1 && + AuxSyms[0].AuxType == ATSectionDefinition); + AuxSymbol &SecDef = AuxSyms[0]; + SecDef.Aux.SectionDefinition.CheckSum = JC.getCRC(); } if ((*i)->Relocations.size() > 0) { - assert(OS.tell() == (*i)->Header.PointerToRelocations && + assert(getStream().tell() == (*i)->Header.PointerToRelocations && "Section::PointerToRelocations is insane!"); if ((*i)->Relocations.size() >= 0xffff) { @@ -1071,14 +1066,14 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm, } } - assert(OS.tell() == Header.PointerToSymbolTable && + assert(getStream().tell() == Header.PointerToSymbolTable && "Header::PointerToSymbolTable is insane!"); for (auto &Symbol : Symbols) if (Symbol->getIndex() != -1) WriteSymbol(*Symbol); - OS.write(Strings.data().data(), Strings.data().size()); + getStream().write(Strings.data().data(), Strings.data().size()); } MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) diff --git a/contrib/llvm/lib/MC/WinCOFFStreamer.cpp b/contrib/llvm/lib/MC/WinCOFFStreamer.cpp index 36dd691..a38b1a4 100644 --- a/contrib/llvm/lib/MC/WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/MC/WinCOFFStreamer.cpp @@ -49,7 +49,6 @@ void MCWinCOFFStreamer::EmitInstToData(const MCInst &Inst, SmallString<256> Code; raw_svector_ostream VecOS(Code); getAssembler().getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); // Add the fixups and data. for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { @@ -123,29 +122,37 @@ void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) { "Got non-COFF section in the COFF backend!"); if (CurSymbol) - FatalError("starting a new symbol definition without completing the " - "previous one"); + Error("starting a new symbol definition without completing the " + "previous one"); CurSymbol = Symbol; } void MCWinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { - if (!CurSymbol) - FatalError("storage class specified outside of symbol definition"); + if (!CurSymbol) { + Error("storage class specified outside of symbol definition"); + return; + } - if (StorageClass & ~COFF::SSC_Invalid) - FatalError("storage class value '" + Twine(StorageClass) + + if (StorageClass & ~COFF::SSC_Invalid) { + Error("storage class value '" + Twine(StorageClass) + "' out of range"); + return; + } getAssembler().registerSymbol(*CurSymbol); cast<MCSymbolCOFF>(CurSymbol)->setClass((uint16_t)StorageClass); } void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { - if (!CurSymbol) - FatalError("symbol type specified outside of a symbol definition"); + if (!CurSymbol) { + Error("symbol type specified outside of a symbol definition"); + return; + } - if (Type & ~0xffff) - FatalError("type value '" + Twine(Type) + "' out of range"); + if (Type & ~0xffff) { + Error("type value '" + Twine(Type) + "' out of range"); + return; + } getAssembler().registerSymbol(*CurSymbol); cast<MCSymbolCOFF>(CurSymbol)->setType((uint16_t)Type); @@ -153,7 +160,7 @@ void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { void MCWinCOFFStreamer::EndCOFFSymbolDef() { if (!CurSymbol) - FatalError("ending symbol definition without starting one"); + Error("ending symbol definition without starting one"); CurSymbol = nullptr; } @@ 
-215,8 +222,6 @@ void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, Size = std::max(Size, static_cast<uint64_t>(ByteAlignment)); } - AssignSection(Symbol, nullptr); - getAssembler().registerSymbol(*Symbol); Symbol->setExternal(true); Symbol->setCommon(Size, ByteAlignment); @@ -228,7 +233,6 @@ void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, OS << " -aligncomm:\"" << Symbol->getName() << "\"," << Log2_32_Ceil(ByteAlignment); - OS.flush(); PushSection(); SwitchSection(MFI->getDrectveSection()); @@ -249,8 +253,6 @@ void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, getAssembler().registerSymbol(*Symbol); Symbol->setExternal(false); - AssignSection(Symbol, Section); - if (ByteAlignment != 1) new MCAlignFragment(ByteAlignment, /*Value=*/0, /*ValueSize=*/0, ByteAlignment, Section); @@ -287,9 +289,8 @@ void MCWinCOFFStreamer::FinishImpl() { MCObjectStreamer::FinishImpl(); } -LLVM_ATTRIBUTE_NORETURN -void MCWinCOFFStreamer::FatalError(const Twine &Msg) const { - getContext().reportFatalError(SMLoc(), Msg); +void MCWinCOFFStreamer::Error(const Twine &Msg) const { + getContext().reportError(SMLoc(), Msg); } } diff --git a/contrib/llvm/lib/Object/Archive.cpp b/contrib/llvm/lib/Object/Archive.cpp index d482119..99b0650 100644 --- a/contrib/llvm/lib/Object/Archive.cpp +++ b/contrib/llvm/lib/Object/Archive.cpp @@ -43,10 +43,10 @@ StringRef ArchiveMemberHeader::getName() const { return llvm::StringRef(Name, end); } -uint32_t ArchiveMemberHeader::getSize() const { +ErrorOr<uint32_t> ArchiveMemberHeader::getSize() const { uint32_t Ret; if (llvm::StringRef(Size, sizeof(Size)).rtrim(" ").getAsInteger(10, Ret)) - llvm_unreachable("Size is not a decimal number."); + return object_error::parse_failed; // Size is not a decimal number. return Ret; } @@ -82,22 +82,30 @@ unsigned ArchiveMemberHeader::getGID() const { return Ret; } -Archive::Child::Child(const Archive *Parent, const char *Start) +Archive::Child::Child(const Archive *Parent, StringRef Data, + uint16_t StartOfFile) + : Parent(Parent), Data(Data), StartOfFile(StartOfFile) {} + +Archive::Child::Child(const Archive *Parent, const char *Start, + std::error_code *EC) : Parent(Parent) { if (!Start) return; - const ArchiveMemberHeader *Header = - reinterpret_cast<const ArchiveMemberHeader *>(Start); uint64_t Size = sizeof(ArchiveMemberHeader); - if (!Parent->IsThin || Header->getName() == "/" || Header->getName() == "//") - Size += Header->getSize(); Data = StringRef(Start, Size); + if (!isThinMember()) { + ErrorOr<uint64_t> MemberSize = getRawSize(); + if ((*EC = MemberSize.getError())) + return; + Size += MemberSize.get(); + Data = StringRef(Start, Size); + } // Setup StartOfFile and PaddingBytes. StartOfFile = sizeof(ArchiveMemberHeader); // Don't include attached name. 
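
The parse that follows handles the BSD convention where a header name of the form "#1/<len>" means the real name is stored inline, directly after the header. Just the length extraction, as a hedged standalone sketch (helper name illustrative):

#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Extracts <len> from a "#1/<len>" member name; the digits may be padded
// with trailing spaces inside the fixed-width header field.
static bool parseBSDInlineNameLength(llvm::StringRef Name, uint64_t &Len) {
  if (!Name.startswith("#1/"))
    return false;
  // StringRef::getAsInteger returns true on failure.
  return !Name.substr(3).rtrim(" ").getAsInteger(10, Len);
}
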
- StringRef Name = Header->getName(); + StringRef Name = getRawName(); if (Name.startswith("#1/")) { uint64_t NameSize; if (Name.substr(3).rtrim(" ").getAsInteger(10, NameSize)) @@ -106,25 +114,40 @@ Archive::Child::Child(const Archive *Parent, const char *Start) } } -uint64_t Archive::Child::getSize() const { - if (Parent->IsThin) - return getHeader()->getSize(); +ErrorOr<uint64_t> Archive::Child::getSize() const { + if (Parent->IsThin) { + ErrorOr<uint32_t> Size = getHeader()->getSize(); + if (std::error_code EC = Size.getError()) + return EC; + return Size.get(); + } return Data.size() - StartOfFile; } -uint64_t Archive::Child::getRawSize() const { - return getHeader()->getSize(); +ErrorOr<uint64_t> Archive::Child::getRawSize() const { + ErrorOr<uint32_t> Size = getHeader()->getSize(); + if (std::error_code EC = Size.getError()) + return EC; + return Size.get(); +} + +bool Archive::Child::isThinMember() const { + StringRef Name = getHeader()->getName(); + return Parent->IsThin && Name != "/" && Name != "//"; } ErrorOr<StringRef> Archive::Child::getBuffer() const { - if (!Parent->IsThin) - return StringRef(Data.data() + StartOfFile, getSize()); + if (!isThinMember()) { + ErrorOr<uint32_t> Size = getSize(); + if (std::error_code EC = Size.getError()) + return EC; + return StringRef(Data.data() + StartOfFile, Size.get()); + } ErrorOr<StringRef> Name = getName(); if (std::error_code EC = Name.getError()) return EC; - SmallString<128> FullName = - Parent->getMemoryBufferRef().getBufferIdentifier(); - sys::path::remove_filename(FullName); + SmallString<128> FullName = sys::path::parent_path( + Parent->getMemoryBufferRef().getBufferIdentifier()); sys::path::append(FullName, *Name); ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(FullName); if (std::error_code EC = Buf.getError()) @@ -133,7 +156,7 @@ ErrorOr<StringRef> Archive::Child::getBuffer() const { return Parent->ThinBuffers.back()->getBuffer(); } -Archive::Child Archive::Child::getNext() const { +ErrorOr<Archive::Child> Archive::Child::getNext() const { size_t SpaceToSkip = Data.size(); // If it's odd, add 1 to make it even. if (SpaceToSkip & 1) @@ -141,11 +164,19 @@ Archive::Child Archive::Child::getNext() const { const char *NextLoc = Data.data() + SpaceToSkip; + // Check to see if this is at the end of the archive. + if (NextLoc == Parent->Data.getBufferEnd()) + return Child(Parent, nullptr, nullptr); + // Check to see if this is past the end of the archive. - if (NextLoc >= Parent->Data.getBufferEnd()) - return Child(Parent, nullptr); + if (NextLoc > Parent->Data.getBufferEnd()) + return object_error::parse_failed; - return Child(Parent, NextLoc); + std::error_code EC; + Child Ret(Parent, NextLoc, &EC); + if (EC) + return EC; + return Ret; } uint64_t Archive::Child::getChildOffset() const { @@ -168,17 +199,11 @@ ErrorOr<StringRef> Archive::Child::getName() const { std::size_t offset; if (name.substr(1).rtrim(" ").getAsInteger(10, offset)) llvm_unreachable("Long name offset is not an integer"); - const char *addr = Parent->StringTable->Data.begin() - + sizeof(ArchiveMemberHeader) - + offset; + // Verify it. 
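
For GNU-style archives, a member name of the form "/<decimal>" is an offset into the "//" string table, and the verification introduced here is a simple bounds check against that table. A standalone sketch of the lookup, using the "/\n" terminator the surrounding code relies on (names illustrative):

#include "llvm/ADT/StringRef.h"
#include <cstdint>

// Resolves a "/<offset>" long name against the "//" string table.
static bool lookupLongName(llvm::StringRef StrTab, uint64_t Offset,
                           llvm::StringRef &Name) {
  if (Offset >= StrTab.size())
    return false; // The same out-of-range check added above.
  size_t End = StrTab.find("/\n", Offset);
  if (End == llvm::StringRef::npos)
    return false; // Unterminated entry: treat as a parse failure.
  Name = StrTab.slice(Offset, End);
  return true;
}
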
- if (Parent->StringTable == Parent->child_end() - || addr < (Parent->StringTable->Data.begin() - + sizeof(ArchiveMemberHeader)) - || addr > (Parent->StringTable->Data.begin() - + sizeof(ArchiveMemberHeader) - + Parent->StringTable->getSize())) + if (offset >= Parent->StringTable.size()) return object_error::parse_failed; + const char *addr = Parent->StringTable.begin() + offset; // GNU long file names end with a "/\n". if (Parent->kind() == K_GNU || Parent->kind() == K_MIPS64) { @@ -227,9 +252,13 @@ ErrorOr<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) { return std::move(Ret); } +void Archive::setFirstRegular(const Child &C) { + FirstRegularData = C.Data; + FirstRegularStartOfFile = C.StartOfFile; +} + Archive::Archive(MemoryBufferRef Source, std::error_code &ec) - : Binary(Binary::ID_Archive, Source), SymbolTable(child_end()), - StringTable(child_end()), FirstRegular(child_end()) { + : Binary(Binary::ID_Archive, Source) { StringRef Buffer = Data.getBuffer(); // Check for sufficient magic. if (Buffer.startswith(ThinMagic)) { @@ -242,15 +271,26 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) } // Get the special members. - child_iterator i = child_begin(false); - child_iterator e = child_end(); + child_iterator I = child_begin(false); + if ((ec = I->getError())) + return; + child_iterator E = child_end(); - if (i == e) { + if (I == E) { ec = std::error_code(); return; } + const Child *C = &**I; - StringRef Name = i->getRawName(); + auto Increment = [&]() { + ++I; + if ((ec = I->getError())) + return true; + C = &**I; + return false; + }; + + StringRef Name = C->getRawName(); // Below is the pattern that is used to figure out the archive format // GNU archive format @@ -273,9 +313,13 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) if (Name == "__.SYMDEF") { Format = K_BSD; - SymbolTable = i; - ++i; - FirstRegular = i; + // We know that the symbol table is not an external file, so we just assert + // there is no error. + SymbolTable = *C->getBuffer(); + if (Increment()) + return; + setFirstRegular(*C); + ec = std::error_code(); return; } @@ -283,16 +327,19 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) if (Name.startswith("#1/")) { Format = K_BSD; // We know this is BSD, so getName will work since there is no string table. - ErrorOr<StringRef> NameOrErr = i->getName(); + ErrorOr<StringRef> NameOrErr = C->getName(); ec = NameOrErr.getError(); if (ec) return; Name = NameOrErr.get(); if (Name == "__.SYMDEF SORTED" || Name == "__.SYMDEF") { - SymbolTable = i; - ++i; + // We know that the symbol table is not an external file, so we just + // assert there is no error. + SymbolTable = *C->getBuffer(); + if (Increment()) + return; } - FirstRegular = i; + setFirstRegular(*C); return; } @@ -303,30 +350,36 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) bool has64SymTable = false; if (Name == "/" || Name == "/SYM64/") { - SymbolTable = i; + // We know that the symbol table is not an external file, so we just assert + // there is no error. + SymbolTable = *C->getBuffer(); if (Name == "/SYM64/") has64SymTable = true; - ++i; - if (i == e) { + if (Increment()) + return; + if (I == E) { ec = std::error_code(); return; } - Name = i->getRawName(); + Name = C->getRawName(); } if (Name == "//") { Format = has64SymTable ? K_MIPS64 : K_GNU; - StringTable = i; - ++i; - FirstRegular = i; + // The string table is never an external member, so we just assert on the + // ErrorOr. 
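
This constructor's detection cascade keys entirely off the raw names of the first one or two members. For orientation, the naming conventions it matches, restated as a descriptive sketch rather than the parser itself:

#include "llvm/ADT/StringRef.h"

// Maps the raw name of a leading archive member to its conventional role.
static const char *describeSpecialMember(llvm::StringRef RawName) {
  if (RawName == "__.SYMDEF" || RawName == "__.SYMDEF SORTED")
    return "BSD symbol table";
  if (RawName == "/SYM64/")
    return "64-bit (MIPS64) symbol table";
  if (RawName == "/")
    return "GNU or COFF symbol table";
  if (RawName == "//")
    return "GNU string table";
  return "regular member";
}
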
+ StringTable = *C->getBuffer(); + if (Increment()) + return; + setFirstRegular(*C); ec = std::error_code(); return; } if (Name[0] != '/') { Format = has64SymTable ? K_MIPS64 : K_GNU; - FirstRegular = i; + setFirstRegular(*C); ec = std::error_code(); return; } @@ -337,23 +390,30 @@ Archive::Archive(MemoryBufferRef Source, std::error_code &ec) } Format = K_COFF; - SymbolTable = i; + // We know that the symbol table is not an external file, so we just assert + // there is no error. + SymbolTable = *C->getBuffer(); - ++i; - if (i == e) { - FirstRegular = i; + if (Increment()) + return; + + if (I == E) { + setFirstRegular(*C); ec = std::error_code(); return; } - Name = i->getRawName(); + Name = C->getRawName(); if (Name == "//") { - StringTable = i; - ++i; + // The string table is never an external member, so we just assert on the + // ErrorOr. + StringTable = *C->getBuffer(); + if (Increment()) + return; } - FirstRegular = i; + setFirstRegular(*C); ec = std::error_code(); } @@ -362,22 +422,25 @@ Archive::child_iterator Archive::child_begin(bool SkipInternal) const { return child_end(); if (SkipInternal) - return FirstRegular; + return Child(this, FirstRegularData, FirstRegularStartOfFile); const char *Loc = Data.getBufferStart() + strlen(Magic); - Child c(this, Loc); - return c; + std::error_code EC; + Child c(this, Loc, &EC); + if (EC) + return child_iterator(EC); + return child_iterator(c); } Archive::child_iterator Archive::child_end() const { - return Child(this, nullptr); + return Child(this, nullptr, nullptr); } StringRef Archive::Symbol::getName() const { return Parent->getSymbolTable().begin() + StringIndex; } -ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const { +ErrorOr<Archive::Child> Archive::Symbol::getMember() const { const char *Buf = Parent->getSymbolTable().begin(); const char *Offsets = Buf; if (Parent->kind() == K_MIPS64) @@ -422,8 +485,11 @@ ErrorOr<Archive::child_iterator> Archive::Symbol::getMember() const { } const char *Loc = Parent->getData().begin() + Offset; - child_iterator Iter(Child(Parent, Loc)); - return Iter; + std::error_code EC; + Child C(Parent, Loc, &EC); + if (EC) + return EC; + return C; } Archive::Symbol Archive::Symbol::getNext() const { @@ -506,12 +572,12 @@ Archive::symbol_iterator Archive::symbol_begin() const { } Archive::symbol_iterator Archive::symbol_end() const { - if (!hasSymbolTable()) - return symbol_iterator(Symbol(this, 0, 0)); return symbol_iterator(Symbol(this, getNumberOfSymbols(), 0)); } uint32_t Archive::getNumberOfSymbols() const { + if (!hasSymbolTable()) + return 0; const char *buf = getSymbolTable().begin(); if (kind() == K_GNU) return read32be(buf); @@ -542,6 +608,4 @@ Archive::child_iterator Archive::findSym(StringRef name) const { return child_end(); } -bool Archive::hasSymbolTable() const { - return SymbolTable != child_end(); -} +bool Archive::hasSymbolTable() const { return !SymbolTable.empty(); } diff --git a/contrib/llvm/lib/Object/ArchiveWriter.cpp b/contrib/llvm/lib/Object/ArchiveWriter.cpp index a40901c..c7343fd 100644 --- a/contrib/llvm/lib/Object/ArchiveWriter.cpp +++ b/contrib/llvm/lib/Object/ArchiveWriter.cpp @@ -34,32 +34,32 @@ using namespace llvm; -NewArchiveIterator::NewArchiveIterator(object::Archive::child_iterator I, +NewArchiveIterator::NewArchiveIterator(const object::Archive::Child &OldMember, StringRef Name) - : IsNewMember(false), Name(Name), OldI(I) {} + : IsNewMember(false), Name(Name), OldMember(OldMember) {} -NewArchiveIterator::NewArchiveIterator(StringRef NewFilename, StringRef 
Name) - : IsNewMember(true), Name(Name), NewFilename(NewFilename) {} +NewArchiveIterator::NewArchiveIterator(StringRef FileName) + : IsNewMember(true), Name(FileName), OldMember(nullptr, nullptr, nullptr) {} StringRef NewArchiveIterator::getName() const { return Name; } bool NewArchiveIterator::isNewMember() const { return IsNewMember; } -object::Archive::child_iterator NewArchiveIterator::getOld() const { +const object::Archive::Child &NewArchiveIterator::getOld() const { assert(!IsNewMember); - return OldI; + return OldMember; } StringRef NewArchiveIterator::getNew() const { assert(IsNewMember); - return NewFilename; + return Name; } llvm::ErrorOr<int> NewArchiveIterator::getFD(sys::fs::file_status &NewStatus) const { assert(IsNewMember); int NewFD; - if (auto EC = sys::fs::openFileForRead(NewFilename, NewFD)) + if (auto EC = sys::fs::openFileForRead(Name, NewFD)) return EC; assert(NewFD != -1); @@ -77,7 +77,7 @@ NewArchiveIterator::getFD(sys::fs::file_status &NewStatus) const { template <typename T> static void printWithSpacePadding(raw_fd_ostream &OS, T Data, unsigned Size, - bool MayTruncate = false) { + bool MayTruncate = false) { uint64_t OldPos = OS.tell(); OS << Data; unsigned SizeSoFar = OS.tell() - OldPos; @@ -135,30 +135,56 @@ static void printBSDMemberHeader(raw_fd_ostream &Out, StringRef Name, Out.write(uint8_t(0)); } +static bool useStringTable(bool Thin, StringRef Name) { + return Thin || Name.size() >= 16; +} + static void -printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind, +printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind, bool Thin, StringRef Name, std::vector<unsigned>::iterator &StringMapIndexIter, const sys::TimeValue &ModTime, unsigned UID, unsigned GID, unsigned Perms, unsigned Size) { if (Kind == object::Archive::K_BSD) return printBSDMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size); - if (Name.size() < 16) + if (!useStringTable(Thin, Name)) return printGNUSmallMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size); Out << '/'; printWithSpacePadding(Out, *StringMapIndexIter++, 15); printRestOfMemberHeader(Out, ModTime, UID, GID, Perms, Size); } -static void writeStringTable(raw_fd_ostream &Out, +// Compute the relative path from From to To. 
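
Illustrative expectations for the helper defined next, which lets thin archives record member paths relative to the archive itself (the paths are hypothetical, and the asserts assume the helper were visible to a test):

#include <cassert>
#include <string>

void computeRelativePathExamples() {
  // Sibling directories: climb out of "lib", then descend into "src".
  assert(computeRelativePath("lib/libfoo.a", "src/bar.o") == "../src/bar.o");
  // Absolute paths are passed through unchanged.
  assert(computeRelativePath("/tmp/libfoo.a", "src/bar.o") == "src/bar.o");
}
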
+static std::string computeRelativePath(StringRef From, StringRef To) { + if (sys::path::is_absolute(From) || sys::path::is_absolute(To)) + return To; + + StringRef DirFrom = sys::path::parent_path(From); + auto FromI = sys::path::begin(DirFrom); + auto ToI = sys::path::begin(To); + while (*FromI == *ToI) { + ++FromI; + ++ToI; + } + + SmallString<128> Relative; + for (auto FromE = sys::path::end(DirFrom); FromI != FromE; ++FromI) + sys::path::append(Relative, ".."); + + for (auto ToE = sys::path::end(To); ToI != ToE; ++ToI) + sys::path::append(Relative, *ToI); + + return Relative.str(); +} + +static void writeStringTable(raw_fd_ostream &Out, StringRef ArcName, ArrayRef<NewArchiveIterator> Members, - std::vector<unsigned> &StringMapIndexes) { + std::vector<unsigned> &StringMapIndexes, + bool Thin) { unsigned StartOffset = 0; - for (ArrayRef<NewArchiveIterator>::iterator I = Members.begin(), - E = Members.end(); - I != E; ++I) { - StringRef Name = I->getName(); - if (Name.size() < 16) + for (const NewArchiveIterator &I : Members) { + StringRef Name = sys::path::filename(I.getName()); + if (!useStringTable(Thin, Name)) continue; if (StartOffset == 0) { printWithSpacePadding(Out, "//", 58); @@ -166,7 +192,13 @@ static void writeStringTable(raw_fd_ostream &Out, StartOffset = Out.tell(); } StringMapIndexes.push_back(Out.tell() - StartOffset); - Out << Name << "/\n"; + + if (Thin) + Out << computeRelativePath(ArcName, I.getName()); + else + Out << Name; + + Out << "/\n"; } if (StartOffset == 0) return; @@ -268,9 +300,11 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind, return BodyStartOffset + 4; } -std::pair<StringRef, std::error_code> llvm::writeArchive( - StringRef ArcName, std::vector<NewArchiveIterator> &NewMembers, - bool WriteSymtab, object::Archive::Kind Kind, bool Deterministic) { +std::pair<StringRef, std::error_code> +llvm::writeArchive(StringRef ArcName, + std::vector<NewArchiveIterator> &NewMembers, + bool WriteSymtab, object::Archive::Kind Kind, + bool Deterministic, bool Thin) { SmallString<128> TmpArchive; int TmpArchiveFD; if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a", @@ -279,7 +313,10 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( tool_output_file Output(TmpArchive, TmpArchiveFD); raw_fd_ostream &Out = Output.os(); - Out << "!<arch>\n"; + if (Thin) + Out << "!<thin>\n"; + else + Out << "!<arch>\n"; std::vector<unsigned> MemberOffsetRefs; @@ -309,9 +346,11 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( Buffers.push_back(std::move(MemberBufferOrErr.get())); MemberRef = Buffers.back()->getMemBufferRef(); } else { - object::Archive::child_iterator OldMember = Member.getOld(); + const object::Archive::Child &OldMember = Member.getOld(); + assert((!Thin || OldMember.getParent()->isThin()) && + "Thin archives cannot refer to members of other archives"); ErrorOr<MemoryBufferRef> MemberBufferOrErr = - OldMember->getMemoryBufferRef(); + OldMember.getMemoryBufferRef(); if (auto EC = MemberBufferOrErr.getError()) return std::make_pair("", EC); MemberRef = MemberBufferOrErr.get(); @@ -330,7 +369,7 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( std::vector<unsigned> StringMapIndexes; if (Kind != object::Archive::K_BSD) - writeStringTable(Out, NewMembers, StringMapIndexes); + writeStringTable(Out, ArcName, NewMembers, StringMapIndexes, Thin); unsigned MemberNum = 0; unsigned NewMemberNum = 0; @@ -358,26 +397,32 @@ std::pair<StringRef, std::error_code> llvm::writeArchive( GID = Status.getGroup(); Perms = 
Status.permissions(); } else { - object::Archive::child_iterator OldMember = I.getOld(); - ModTime = OldMember->getLastModified(); - UID = OldMember->getUID(); - GID = OldMember->getGID(); - Perms = OldMember->getAccessMode(); + const object::Archive::Child &OldMember = I.getOld(); + ModTime = OldMember.getLastModified(); + UID = OldMember.getUID(); + GID = OldMember.getGID(); + Perms = OldMember.getAccessMode(); } if (I.isNewMember()) { StringRef FileName = I.getNew(); const sys::fs::file_status &Status = NewMemberStatus[NewMemberNum++]; - printMemberHeader(Out, Kind, sys::path::filename(FileName), + printMemberHeader(Out, Kind, Thin, sys::path::filename(FileName), StringMapIndexIter, ModTime, UID, GID, Perms, Status.getSize()); } else { - object::Archive::child_iterator OldMember = I.getOld(); - printMemberHeader(Out, Kind, I.getName(), StringMapIndexIter, ModTime, - UID, GID, Perms, OldMember->getSize()); + const object::Archive::Child &OldMember = I.getOld(); + ErrorOr<uint32_t> Size = OldMember.getSize(); + if (std::error_code EC = Size.getError()) + return std::make_pair("", EC); + StringRef FileName = I.getName(); + printMemberHeader(Out, Kind, Thin, sys::path::filename(FileName), + StringMapIndexIter, ModTime, UID, GID, Perms, + Size.get()); } - Out << File.getBuffer(); + if (!Thin) + Out << File.getBuffer(); if (Out.tell() % 2) Out << '\n'; diff --git a/contrib/llvm/lib/Object/COFFObjectFile.cpp b/contrib/llvm/lib/Object/COFFObjectFile.cpp index bcca983..4cd6aff 100644 --- a/contrib/llvm/lib/Object/COFFObjectFile.cpp +++ b/contrib/llvm/lib/Object/COFFObjectFile.cpp @@ -171,6 +171,11 @@ ErrorOr<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const { if (std::error_code EC = getSection(SectionNumber, Section)) return EC; Result += Section->VirtualAddress; + + // The section VirtualAddress does not include ImageBase, and we want to + // return virtual addresses. 
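
COFF section VirtualAddress fields are RVAs, so both address getters in this file now add the image base back before returning. The relationship reduces to a one-line conversion; the example base below is the common PE32+ default and purely illustrative:

#include <cassert>
#include <cstdint>

// What the getters now do once the base is known: VA = ImageBase + RVA.
static uint64_t rvaToVA(uint64_t RVA, uint64_t ImageBase) {
  return ImageBase + RVA;
}

void imageBaseExample() {
  const uint64_t Base = 0x140000000ULL; // Typical PE32+ default image base.
  assert(rvaToVA(0x1000, Base) == 0x140001000ULL);
  // Plain object files carry no optional header, so the base is 0 and
  // VA == RVA; that is the "comes up in practice" case handled below.
}
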
+ Result += getImageBase(); + return Result; } @@ -178,10 +183,10 @@ SymbolRef::Type COFFObjectFile::getSymbolType(DataRefImpl Ref) const { COFFSymbolRef Symb = getCOFFSymbol(Ref); int32_t SectionNumber = Symb.getSectionNumber(); + if (Symb.getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) + return SymbolRef::ST_Function; if (Symb.isAnyUndefined()) return SymbolRef::ST_Unknown; - if (Symb.isFunctionDefinition()) - return SymbolRef::ST_Function; if (Symb.isCommon()) return SymbolRef::ST_Data; if (Symb.isFileRecord()) @@ -230,21 +235,17 @@ uint64_t COFFObjectFile::getCommonSymbolSizeImpl(DataRefImpl Ref) const { return Symb.getValue(); } -std::error_code -COFFObjectFile::getSymbolSection(DataRefImpl Ref, - section_iterator &Result) const { +ErrorOr<section_iterator> +COFFObjectFile::getSymbolSection(DataRefImpl Ref) const { COFFSymbolRef Symb = getCOFFSymbol(Ref); - if (COFF::isReservedSectionNumber(Symb.getSectionNumber())) { - Result = section_end(); - } else { - const coff_section *Sec = nullptr; - if (std::error_code EC = getSection(Symb.getSectionNumber(), Sec)) - return EC; - DataRefImpl Ref; - Ref.p = reinterpret_cast<uintptr_t>(Sec); - Result = section_iterator(SectionRef(Ref, this)); - } - return std::error_code(); + if (COFF::isReservedSectionNumber(Symb.getSectionNumber())) + return section_end(); + const coff_section *Sec = nullptr; + if (std::error_code EC = getSection(Symb.getSectionNumber(), Sec)) + return EC; + DataRefImpl Ret; + Ret.p = reinterpret_cast<uintptr_t>(Sec); + return section_iterator(SectionRef(Ret, this)); } unsigned COFFObjectFile::getSymbolSectionID(SymbolRef Sym) const { @@ -266,7 +267,12 @@ std::error_code COFFObjectFile::getSectionName(DataRefImpl Ref, uint64_t COFFObjectFile::getSectionAddress(DataRefImpl Ref) const { const coff_section *Sec = toSec(Ref); - return Sec->VirtualAddress; + uint64_t Result = Sec->VirtualAddress; + + // The section VirtualAddress does not include ImageBase, and we want to + // return virtual addresses. + Result += getImageBase(); + return Result; } uint64_t COFFObjectFile::getSectionSize(DataRefImpl Ref) const { @@ -412,10 +418,18 @@ std::error_code COFFObjectFile::initSymbolTablePtr() { return std::error_code(); } +uint64_t COFFObjectFile::getImageBase() const { + if (PE32Header) + return PE32Header->ImageBase; + else if (PE32PlusHeader) + return PE32PlusHeader->ImageBase; + // This actually comes up in practice. + return 0; +} + // Returns the file offset for the given VA. std::error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const { - uint64_t ImageBase = PE32Header ? 
(uint64_t)PE32Header->ImageBase - : (uint64_t)PE32PlusHeader->ImageBase; + uint64_t ImageBase = getImageBase(); uint64_t Rva = Addr - ImageBase; assert(Rva <= UINT32_MAX); return getRvaPtr((uint32_t)Rva, Res); @@ -744,6 +758,8 @@ StringRef COFFObjectFile::getFileFormatName() const { return "COFF-x86-64"; case COFF::IMAGE_FILE_MACHINE_ARMNT: return "COFF-ARM"; + case COFF::IMAGE_FILE_MACHINE_ARM64: + return "COFF-ARM64"; default: return "COFF-<unknown arch>"; } @@ -757,6 +773,8 @@ unsigned COFFObjectFile::getArch() const { return Triple::x86_64; case COFF::IMAGE_FILE_MACHINE_ARMNT: return Triple::thumb; + case COFF::IMAGE_FILE_MACHINE_ARM64: + return Triple::aarch64; default: return Triple::UnknownArch; } @@ -1318,6 +1336,30 @@ ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const { return std::error_code(); } +std::error_code ExportDirectoryEntryRef::isForwarder(bool &Result) const { + const data_directory *DataEntry; + if (auto EC = OwningObject->getDataDirectory(COFF::EXPORT_TABLE, DataEntry)) + return EC; + uint32_t RVA; + if (auto EC = getExportRVA(RVA)) + return EC; + uint32_t Begin = DataEntry->RelativeVirtualAddress; + uint32_t End = DataEntry->RelativeVirtualAddress + DataEntry->Size; + Result = (Begin <= RVA && RVA < End); + return std::error_code(); +} + +std::error_code ExportDirectoryEntryRef::getForwardTo(StringRef &Result) const { + uint32_t RVA; + if (auto EC = getExportRVA(RVA)) + return EC; + uintptr_t IntPtr = 0; + if (auto EC = OwningObject->getRvaPtr(RVA, IntPtr)) + return EC; + Result = StringRef(reinterpret_cast<const char *>(IntPtr)); + return std::error_code(); +} + bool ImportedSymbolRef:: operator==(const ImportedSymbolRef &Other) const { return Entry32 == Other.Entry32 && Entry64 == Other.Entry64 diff --git a/contrib/llvm/lib/Object/COFFYAML.cpp b/contrib/llvm/lib/Object/COFFYAML.cpp index 9a24b53..4c1fca1 100644 --- a/contrib/llvm/lib/Object/COFFYAML.cpp +++ b/contrib/llvm/lib/Object/COFFYAML.cpp @@ -56,6 +56,7 @@ void ScalarEnumerationTraits<COFF::MachineTypes>::enumeration( ECase(IMAGE_FILE_MACHINE_AMD64); ECase(IMAGE_FILE_MACHINE_ARM); ECase(IMAGE_FILE_MACHINE_ARMNT); + ECase(IMAGE_FILE_MACHINE_ARM64); ECase(IMAGE_FILE_MACHINE_EBC); ECase(IMAGE_FILE_MACHINE_I386); ECase(IMAGE_FILE_MACHINE_IA64); @@ -210,6 +211,7 @@ void ScalarBitSetTraits<COFF::Characteristics>::bitset( void ScalarBitSetTraits<COFF::SectionCharacteristics>::bitset( IO &IO, COFF::SectionCharacteristics &Value) { + BCase(IMAGE_SCN_TYPE_NOLOAD); BCase(IMAGE_SCN_TYPE_NO_PAD); BCase(IMAGE_SCN_CNT_CODE); BCase(IMAGE_SCN_CNT_INITIALIZED_DATA); diff --git a/contrib/llvm/lib/Object/ELF.cpp b/contrib/llvm/lib/Object/ELF.cpp index 398e9e4..12b772d 100644 --- a/contrib/llvm/lib/Object/ELF.cpp +++ b/contrib/llvm/lib/Object/ELF.cpp @@ -26,6 +26,7 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) { } break; case ELF::EM_386: + case ELF::EM_IAMCU: switch (Type) { #include "llvm/Support/ELFRelocs/i386.def" default: @@ -90,6 +91,13 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) { break; } break; + case ELF::EM_WEBASSEMBLY: + switch (Type) { +#include "llvm/Support/ELFRelocs/WebAssembly.def" + default: + break; + } + break; default: break; } diff --git a/contrib/llvm/lib/Object/ELFYAML.cpp b/contrib/llvm/lib/Object/ELFYAML.cpp index 72c232c..4a4b227 100644 --- a/contrib/llvm/lib/Object/ELFYAML.cpp +++ b/contrib/llvm/lib/Object/ELFYAML.cpp @@ -193,6 +193,7 @@ ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(IO &IO, ECase(EM_VIDEOCORE5) ECase(EM_78KOR) 
ECase(EM_56800EX) + ECase(EM_AMDGPU) #undef ECase } @@ -316,6 +317,25 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCase(EF_HEXAGON_ISA_V4) BCase(EF_HEXAGON_ISA_V5) break; + case ELF::EM_AVR: + BCase(EF_AVR_ARCH_AVR1) + BCase(EF_AVR_ARCH_AVR2) + BCase(EF_AVR_ARCH_AVR25) + BCase(EF_AVR_ARCH_AVR3) + BCase(EF_AVR_ARCH_AVR31) + BCase(EF_AVR_ARCH_AVR35) + BCase(EF_AVR_ARCH_AVR4) + BCase(EF_AVR_ARCH_AVR51) + BCase(EF_AVR_ARCH_AVR6) + BCase(EF_AVR_ARCH_AVRTINY) + BCase(EF_AVR_ARCH_XMEGA1) + BCase(EF_AVR_ARCH_XMEGA2) + BCase(EF_AVR_ARCH_XMEGA3) + BCase(EF_AVR_ARCH_XMEGA4) + BCase(EF_AVR_ARCH_XMEGA5) + BCase(EF_AVR_ARCH_XMEGA6) + BCase(EF_AVR_ARCH_XMEGA7) + break; default: llvm_unreachable("Unsupported architecture"); } @@ -382,6 +402,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration( void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO, ELFYAML::ELF_SHF &Value) { + const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext()); #define BCase(X) IO.bitSetCase(Value, #X, ELF::X); BCase(SHF_WRITE) BCase(SHF_ALLOC) @@ -394,6 +415,17 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO, BCase(SHF_OS_NONCONFORMING) BCase(SHF_GROUP) BCase(SHF_TLS) + switch(Object->Header.Machine) { + case ELF::EM_AMDGPU: + BCase(SHF_AMDGPU_HSA_GLOBAL) + BCase(SHF_AMDGPU_HSA_READONLY) + BCase(SHF_AMDGPU_HSA_CODE) + BCase(SHF_AMDGPU_HSA_AGENT) + break; + default: + // Nothing to do. + break; + } #undef BCase } @@ -466,6 +498,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration( #include "llvm/Support/ELFRelocs/Hexagon.def" break; case ELF::EM_386: + case ELF::EM_IAMCU: #include "llvm/Support/ELFRelocs/i386.def" break; case ELF::EM_AARCH64: diff --git a/contrib/llvm/lib/Object/Error.cpp b/contrib/llvm/lib/Object/Error.cpp index 7ca2f12..7ecc3a1 100644 --- a/contrib/llvm/lib/Object/Error.cpp +++ b/contrib/llvm/lib/Object/Error.cpp @@ -47,6 +47,8 @@ std::string _object_error_category::message(int EV) const { return "Invalid section index"; case object_error::bitcode_section_not_found: return "Bitcode section not found in object file"; + case object_error::elf_invalid_dynamic_table_size: + return "Invalid dynamic table size"; case object_error::macho_small_load_command: return "Mach-O load command with size < 8 bytes"; case object_error::macho_load_segment_too_many_sections: diff --git a/contrib/llvm/lib/Object/FunctionIndexObjectFile.cpp b/contrib/llvm/lib/Object/FunctionIndexObjectFile.cpp new file mode 100644 index 0000000..fe111de --- /dev/null +++ b/contrib/llvm/lib/Object/FunctionIndexObjectFile.cpp @@ -0,0 +1,143 @@ +//===- FunctionIndexObjectFile.cpp - Function index file implementation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Part of the FunctionIndexObjectFile class implementation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/FunctionIndexObjectFile.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/IR/FunctionInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; +using namespace object; + +FunctionIndexObjectFile::FunctionIndexObjectFile( + MemoryBufferRef Object, std::unique_ptr<FunctionInfoIndex> I) + : SymbolicFile(Binary::ID_FunctionIndex, Object), Index(std::move(I)) {} + +FunctionIndexObjectFile::~FunctionIndexObjectFile() {} + +std::unique_ptr<FunctionInfoIndex> FunctionIndexObjectFile::takeIndex() { + return std::move(Index); +} + +ErrorOr<MemoryBufferRef> +FunctionIndexObjectFile::findBitcodeInObject(const ObjectFile &Obj) { + for (const SectionRef &Sec : Obj.sections()) { + StringRef SecName; + if (std::error_code EC = Sec.getName(SecName)) + return EC; + if (SecName == ".llvmbc") { + StringRef SecContents; + if (std::error_code EC = Sec.getContents(SecContents)) + return EC; + return MemoryBufferRef(SecContents, Obj.getFileName()); + } + } + + return object_error::bitcode_section_not_found; +} + +ErrorOr<MemoryBufferRef> +FunctionIndexObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) { + sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer()); + switch (Type) { + case sys::fs::file_magic::bitcode: + return Object; + case sys::fs::file_magic::elf_relocatable: + case sys::fs::file_magic::macho_object: + case sys::fs::file_magic::coff_object: { + ErrorOr<std::unique_ptr<ObjectFile>> ObjFile = + ObjectFile::createObjectFile(Object, Type); + if (!ObjFile) + return ObjFile.getError(); + return findBitcodeInObject(*ObjFile->get()); + } + default: + return object_error::invalid_file_type; + } +} + +// Looks for a function index in the given memory buffer. +// Returns true if found, else false. +bool FunctionIndexObjectFile::hasFunctionSummaryInMemBuffer( + MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler) { + ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object); + if (!BCOrErr) + return false; + + return hasFunctionSummary(BCOrErr.get(), DiagnosticHandler); +} + +// Parse the function index in the given memory buffer. +// Return a new FunctionIndexObjectFile instance containing the parsed +// function summary/index. +ErrorOr<std::unique_ptr<FunctionIndexObjectFile>> +FunctionIndexObjectFile::create(MemoryBufferRef Object, + DiagnosticHandlerFunction DiagnosticHandler, + bool IsLazy) { + std::unique_ptr<FunctionInfoIndex> Index; + + ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object); + if (!BCOrErr) + return BCOrErr.getError(); + + ErrorOr<std::unique_ptr<FunctionInfoIndex>> IOrErr = getFunctionInfoIndex( + BCOrErr.get(), DiagnosticHandler, IsLazy); + + if (std::error_code EC = IOrErr.getError()) + return EC; + + Index = std::move(IOrErr.get()); + + return llvm::make_unique<FunctionIndexObjectFile>(Object, std::move(Index)); +} + +// Parse the function summary information for the function with the +// given name out of the given buffer. Parsed information is +// stored on the index object saved in this object. 
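
A usage-level sketch of the entry point this file ultimately provides, getFunctionIndexForFile (defined at the bottom of the file); the diagnostic handler and the wrapper function are illustrative:

#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Object/FunctionIndexObjectFile.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

// A trivial diagnostic handler; a real tool would print or collect these.
static void ignoreDiagnostics(const llvm::DiagnosticInfo &DI) {}

bool hasUsableIndex(llvm::StringRef Path) {
  llvm::ErrorOr<std::unique_ptr<llvm::FunctionInfoIndex>> IndexOrErr =
      llvm::getFunctionIndexForFile(Path, ignoreDiagnostics);
  if (std::error_code EC = IndexOrErr.getError()) {
    llvm::errs() << Path << ": no function index (" << EC.message() << ")\n";
    return false;
  }
  return *IndexOrErr != nullptr;
}
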
+std::error_code FunctionIndexObjectFile::findFunctionSummaryInMemBuffer( + MemoryBufferRef Object, DiagnosticHandlerFunction DiagnosticHandler, + StringRef FunctionName) { + sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer()); + switch (Type) { + case sys::fs::file_magic::bitcode: { + return readFunctionSummary(Object, DiagnosticHandler, FunctionName, + std::move(Index)); + } + default: + return object_error::invalid_file_type; + } +} + +// Parse the function index out of an IR file and return the function +// index object if found, or nullptr if not. +ErrorOr<std::unique_ptr<FunctionInfoIndex>> +llvm::getFunctionIndexForFile(StringRef Path, + DiagnosticHandlerFunction DiagnosticHandler) { + ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr = + MemoryBuffer::getFileOrSTDIN(Path); + std::error_code EC = FileOrErr.getError(); + if (EC) + return EC; + MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef(); + ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr = + object::FunctionIndexObjectFile::create(BufferRef, DiagnosticHandler); + EC = ObjOrErr.getError(); + if (EC) + return EC; + + object::FunctionIndexObjectFile &Obj = **ObjOrErr; + return Obj.takeIndex(); +} diff --git a/contrib/llvm/lib/Object/IRObjectFile.cpp b/contrib/llvm/lib/Object/IRObjectFile.cpp index 9f5132e..c35c413 100644 --- a/contrib/llvm/lib/Object/IRObjectFile.cpp +++ b/contrib/llvm/lib/Object/IRObjectFile.cpp @@ -219,6 +219,12 @@ uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const { uint32_t Res = BasicSymbolRef::SF_None; if (GV->isDeclarationForLinker()) Res |= BasicSymbolRef::SF_Undefined; + else if (GV->hasHiddenVisibility() && !GV->hasLocalLinkage()) + Res |= BasicSymbolRef::SF_Hidden; + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) { + if (GVar->isConstant()) + Res |= BasicSymbolRef::SF_Const; + } if (GV->hasPrivateLinkage()) Res |= BasicSymbolRef::SF_FormatSpecific; if (!GV->hasLocalLinkage()) @@ -303,7 +309,7 @@ llvm::object::IRObjectFile::create(MemoryBufferRef Object, MemoryBuffer::getMemBuffer(BCOrErr.get(), false)); ErrorOr<std::unique_ptr<Module>> MOrErr = - getLazyBitcodeModule(std::move(Buff), Context, nullptr, + getLazyBitcodeModule(std::move(Buff), Context, /*ShouldLazyLoadMetadata*/ true); if (std::error_code EC = MOrErr.getError()) return EC; diff --git a/contrib/llvm/lib/Object/MachOObjectFile.cpp b/contrib/llvm/lib/Object/MachOObjectFile.cpp index 0590063..d1f79b2 100644 --- a/contrib/llvm/lib/Object/MachOObjectFile.cpp +++ b/contrib/llvm/lib/Object/MachOObjectFile.cpp @@ -278,7 +278,7 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, return; } LinkOptHintsLoadCmd = Load.Ptr; - } else if (Load.C.cmd == MachO::LC_DYLD_INFO || + } else if (Load.C.cmd == MachO::LC_DYLD_INFO || Load.C.cmd == MachO::LC_DYLD_INFO_ONLY) { // Multiple dyldinfo load commands if (DyldInfoLoadCmd) { @@ -401,6 +401,9 @@ SymbolRef::Type MachOObjectFile::getSymbolType(DataRefImpl Symb) const { case MachO::N_UNDF : return SymbolRef::ST_Unknown; case MachO::N_SECT : + section_iterator Sec = *getSymbolSection(Symb); + if (Sec->isData() || Sec->isBSS()) + return SymbolRef::ST_Data; return SymbolRef::ST_Function; } return SymbolRef::ST_Other; @@ -445,22 +448,18 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { return Result; } -std::error_code MachOObjectFile::getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const { +ErrorOr<section_iterator> +MachOObjectFile::getSymbolSection(DataRefImpl Symb) const { 
MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb); uint8_t index = Entry.n_sect; - if (index == 0) { - Res = section_end(); - } else { - DataRefImpl DRI; - DRI.d.a = index - 1; - if (DRI.d.a >= Sections.size()) - report_fatal_error("getSymbolSection: Invalid section index."); - Res = section_iterator(SectionRef(DRI, this)); - } - - return std::error_code(); + if (index == 0) + return section_end(); + DataRefImpl DRI; + DRI.d.a = index - 1; + if (DRI.d.a >= Sections.size()) + report_fatal_error("getSymbolSection: Invalid section index."); + return section_iterator(SectionRef(DRI, this)); } unsigned MachOObjectFile::getSymbolSectionID(SymbolRef Sym) const { @@ -487,9 +486,32 @@ uint64_t MachOObjectFile::getSectionAddress(DataRefImpl Sec) const { } uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { - if (is64Bit()) - return getSection64(Sec).size; - return getSection(Sec).size; + // In the case of a malformed Mach-O file where the section offset is past + // the end of the file, or some part of the section size is past the end of + // the file, return a size of zero or a size that covers the rest of the file + // but does not extend past the end of the file. + uint32_t SectOffset, SectType; + uint64_t SectSize; + + if (is64Bit()) { + MachO::section_64 Sect = getSection64(Sec); + SectOffset = Sect.offset; + SectSize = Sect.size; + SectType = Sect.flags & MachO::SECTION_TYPE; + } else { + MachO::section Sect = getSection(Sec); + SectOffset = Sect.offset; + SectSize = Sect.size; + SectType = Sect.flags & MachO::SECTION_TYPE; + } + if (SectType == MachO::S_ZEROFILL || SectType == MachO::S_GB_ZEROFILL) + return SectSize; + uint64_t FileSize = getData().size(); + if (SectOffset > FileSize) + return 0; + if (FileSize - SectOffset < SectSize) + return FileSize - SectOffset; + return SectSize; } std::error_code MachOObjectFile::getSectionContents(DataRefImpl Sec, @@ -1136,8 +1158,7 @@ Triple MachOObjectFile::getThumbArch(uint32_t CPUType, uint32_t CPUSubType, } Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType, - const char **McpuDefault, - Triple *ThumbTriple) { + const char **McpuDefault, Triple *ThumbTriple) { Triple T = MachOObjectFile::getArch(CPUType, CPUSubType, McpuDefault); *ThumbTriple = MachOObjectFile::getThumbArch(CPUType, CPUSubType, McpuDefault); @@ -1212,8 +1233,8 @@ dice_iterator MachOObjectFile::end_dices() const { return dice_iterator(DiceRef(DRI, this)); } -ExportEntry::ExportEntry(ArrayRef<uint8_t> T) - : Trie(T), Malformed(false), Done(false) { } +ExportEntry::ExportEntry(ArrayRef<uint8_t> T) + : Trie(T), Malformed(false), Done(false) {} void ExportEntry::moveToFirst() { pushNode(0); @@ -1226,7 +1247,7 @@ void ExportEntry::moveToEnd() { } bool ExportEntry::operator==(const ExportEntry &Other) const { - // Common case, one at end, other iterating from begin. + // Common case, one at end, other iterating from begin. if (Done || Other.Done) return (Done == Other.Done); // Not equal if different stack sizes. 
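
The getSectionSize change above defends against malformed section headers by clamping to the bytes the file can actually back; its core reduces to the following sketch (names illustrative):

#include <cstdint>

// Zerofill sections occupy no file bytes; a section starting past EOF
// contributes nothing; one running past EOF is truncated at EOF.
static uint64_t clampedSectionSize(uint64_t SectOffset, uint64_t SectSize,
                                   uint64_t FileSize, bool IsZeroFill) {
  if (IsZeroFill)
    return SectSize;
  if (SectOffset > FileSize)
    return 0;
  if (FileSize - SectOffset < SectSize)
    return FileSize - SectOffset;
  return SectSize;
}
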
@@ -1240,7 +1261,7 @@ bool ExportEntry::operator==(const ExportEntry &Other) const { if (Stack[i].Start != Other.Stack[i].Start) return false; } - return true; + return true; } uint64_t ExportEntry::readULEB128(const uint8_t *&Ptr) { @@ -1281,11 +1302,10 @@ uint32_t ExportEntry::nodeOffset() const { return Stack.back().Start - Trie.begin(); } -ExportEntry::NodeState::NodeState(const uint8_t *Ptr) - : Start(Ptr), Current(Ptr), Flags(0), Address(0), Other(0), - ImportName(nullptr), ChildCount(0), NextChildIndex(0), - ParentStringLength(0), IsExportNode(false) { -} +ExportEntry::NodeState::NodeState(const uint8_t *Ptr) + : Start(Ptr), Current(Ptr), Flags(0), Address(0), Other(0), + ImportName(nullptr), ChildCount(0), NextChildIndex(0), + ParentStringLength(0), IsExportNode(false) {} void ExportEntry::pushNode(uint64_t offset) { const uint8_t *Ptr = Trie.begin() + offset; @@ -1302,7 +1322,7 @@ void ExportEntry::pushNode(uint64_t offset) { } else { State.Address = readULEB128(State.Current); if (State.Flags & MachO::EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER) - State.Other = readULEB128(State.Current); + State.Other = readULEB128(State.Current); } } State.ChildCount = *Children; @@ -1339,7 +1359,7 @@ void ExportEntry::pushDownUntilBottom() { // // There is one "export" node for each exported symbol. But because some // symbols may be a prefix of another symbol (e.g. _dup and _dup2), an export -// node may have child nodes too. +// node may have child nodes too. // // The algorithm for moveNext() is to keep moving down the leftmost unvisited // child until hitting a node with no children (which is an export node or @@ -1372,7 +1392,7 @@ void ExportEntry::moveNext() { Done = true; } -iterator_range<export_iterator> +iterator_range<export_iterator> MachOObjectFile::exports(ArrayRef<uint8_t> Trie) { ExportEntry Start(Trie); if (Trie.size() == 0) @@ -1383,15 +1403,13 @@ MachOObjectFile::exports(ArrayRef<uint8_t> Trie) { ExportEntry Finish(Trie); Finish.moveToEnd(); - return iterator_range<export_iterator>(export_iterator(Start), - export_iterator(Finish)); + return make_range(export_iterator(Start), export_iterator(Finish)); } iterator_range<export_iterator> MachOObjectFile::exports() const { return exports(getDyldInfoExportsTrie()); } - MachORebaseEntry::MachORebaseEntry(ArrayRef<uint8_t> Bytes, bool is64Bit) : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0), RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0), @@ -1555,17 +1573,14 @@ MachOObjectFile::rebaseTable(ArrayRef<uint8_t> Opcodes, bool is64) { MachORebaseEntry Finish(Opcodes, is64); Finish.moveToEnd(); - return iterator_range<rebase_iterator>(rebase_iterator(Start), - rebase_iterator(Finish)); + return make_range(rebase_iterator(Start), rebase_iterator(Finish)); } iterator_range<rebase_iterator> MachOObjectFile::rebaseTable() const { return rebaseTable(getDyldInfoRebaseOpcodes(), is64Bit()); } - -MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit, - Kind BK) +MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK) : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0), Ordinal(0), Flags(0), Addend(0), RemainingLoopCount(0), AdvanceAmount(0), BindType(0), PointerSize(is64Bit ? 
8 : 4), @@ -1769,7 +1784,6 @@ int64_t MachOBindEntry::readSLEB128() { return Result; } - uint32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; } uint64_t MachOBindEntry::segmentOffset() const { return SegmentOffset; } @@ -1810,8 +1824,7 @@ MachOObjectFile::bindTable(ArrayRef<uint8_t> Opcodes, bool is64, MachOBindEntry Finish(Opcodes, is64, BKind); Finish.moveToEnd(); - return iterator_range<bind_iterator>(bind_iterator(Start), - bind_iterator(Finish)); + return make_range(bind_iterator(Start), bind_iterator(Finish)); } iterator_range<bind_iterator> MachOObjectFile::bindTable() const { @@ -1841,8 +1854,7 @@ MachOObjectFile::end_load_commands() const { iterator_range<MachOObjectFile::load_command_iterator> MachOObjectFile::load_commands() const { - return iterator_range<load_command_iterator>(begin_load_commands(), - end_load_commands()); + return make_range(begin_load_commands(), end_load_commands()); } StringRef @@ -2207,66 +2219,66 @@ MachOObjectFile::getLinkOptHintsLoadCommand() const { } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoRebaseOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.rebase_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.rebase_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.rebase_off)); + return makeArrayRef(Ptr, DyldInfo.rebase_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoBindOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.bind_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.bind_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.bind_off)); + return makeArrayRef(Ptr, DyldInfo.bind_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoWeakBindOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.weak_bind_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.weak_bind_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.weak_bind_off)); + return makeArrayRef(Ptr, DyldInfo.weak_bind_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoLazyBindOpcodes() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.lazy_bind_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.lazy_bind_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + 
reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.lazy_bind_off)); + return makeArrayRef(Ptr, DyldInfo.lazy_bind_size); } ArrayRef<uint8_t> MachOObjectFile::getDyldInfoExportsTrie() const { - if (!DyldInfoLoadCmd) - return ArrayRef<uint8_t>(); + if (!DyldInfoLoadCmd) + return None; - MachO::dyld_info_command DyldInfo - = getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); - const uint8_t *Ptr = reinterpret_cast<const uint8_t*>( - getPtr(this, DyldInfo.export_off)); - return ArrayRef<uint8_t>(Ptr, DyldInfo.export_size); + MachO::dyld_info_command DyldInfo = + getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd); + const uint8_t *Ptr = + reinterpret_cast<const uint8_t *>(getPtr(this, DyldInfo.export_off)); + return makeArrayRef(Ptr, DyldInfo.export_size); } ArrayRef<uint8_t> MachOObjectFile::getUuid() const { if (!UuidLoadCmd) - return ArrayRef<uint8_t>(); + return None; // Returning a pointer is fine as uuid doesn't need endian swapping. const char *Ptr = UuidLoadCmd + offsetof(MachO::uuid_command, uuid); - return ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Ptr), 16); + return makeArrayRef(reinterpret_cast<const uint8_t *>(Ptr), 16); } StringRef MachOObjectFile::getStringTableData() const { @@ -2315,4 +2327,3 @@ ObjectFile::createMachOObjectFile(MemoryBufferRef Buffer) { return EC; return std::move(Ret); } - diff --git a/contrib/llvm/lib/Object/MachOUniversal.cpp b/contrib/llvm/lib/Object/MachOUniversal.cpp index 1d0e69e..a1c83b9 100644 --- a/contrib/llvm/lib/Object/MachOUniversal.cpp +++ b/contrib/llvm/lib/Object/MachOUniversal.cpp @@ -69,14 +69,14 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch( ErrorOr<std::unique_ptr<MachOObjectFile>> MachOUniversalBinary::ObjectForArch::getAsObjectFile() const { - if (Parent) { - StringRef ParentData = Parent->getData(); - StringRef ObjectData = ParentData.substr(Header.offset, Header.size); - StringRef ObjectName = Parent->getFileName(); - MemoryBufferRef ObjBuffer(ObjectData, ObjectName); - return ObjectFile::createMachOObjectFile(ObjBuffer); - } - return object_error::parse_failed; + if (!Parent) + return object_error::parse_failed; + + StringRef ParentData = Parent->getData(); + StringRef ObjectData = ParentData.substr(Header.offset, Header.size); + StringRef ObjectName = Parent->getFileName(); + MemoryBufferRef ObjBuffer(ObjectData, ObjectName); + return ObjectFile::createMachOObjectFile(ObjBuffer); } ErrorOr<std::unique_ptr<Archive>> diff --git a/contrib/llvm/lib/Object/Object.cpp b/contrib/llvm/lib/Object/Object.cpp index 5c4b7a6..b44c1a1 100644 --- a/contrib/llvm/lib/Object/Object.cpp +++ b/contrib/llvm/lib/Object/Object.cpp @@ -98,8 +98,10 @@ void LLVMMoveToNextSection(LLVMSectionIteratorRef SI) { void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect, LLVMSymbolIteratorRef Sym) { - if (std::error_code ec = (*unwrap(Sym))->getSection(*unwrap(Sect))) + ErrorOr<section_iterator> SecOrErr = (*unwrap(Sym))->getSection(); + if (std::error_code ec = SecOrErr.getError()) report_fatal_error(ec.message()); + *unwrap(Sect) = *SecOrErr; } // ObjectFile Symbol iterators diff --git a/contrib/llvm/lib/Object/ObjectFile.cpp b/contrib/llvm/lib/Object/ObjectFile.cpp index f82edae..d12dc41 100644 --- a/contrib/llvm/lib/Object/ObjectFile.cpp +++ b/contrib/llvm/lib/Object/ObjectFile.cpp @@ -29,10 +29,10 @@ ObjectFile::ObjectFile(unsigned int Type, MemoryBufferRef Source) : SymbolicFile(Type, Source) {} bool SectionRef::containsSymbol(SymbolRef S) const { - section_iterator SymSec = getObject()->section_end(); - if 
(S.getSection(SymSec)) + ErrorOr<section_iterator> SymSec = S.getSection(); + if (!SymSec) return false; - return *this == *SymSec; + return *this == **SymSec; } uint64_t ObjectFile::getSymbolValue(DataRefImpl Ref) const { diff --git a/contrib/llvm/lib/Object/SymbolicFile.cpp b/contrib/llvm/lib/Object/SymbolicFile.cpp index 854e68e..bf79dfb 100644 --- a/contrib/llvm/lib/Object/SymbolicFile.cpp +++ b/contrib/llvm/lib/Object/SymbolicFile.cpp @@ -11,6 +11,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Object/COFF.h" +#include "llvm/Object/COFFImportFile.h" #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/SymbolicFile.h" @@ -54,9 +56,10 @@ ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile( case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub: case sys::fs::file_magic::macho_dsym_companion: case sys::fs::file_magic::macho_kext_bundle: - case sys::fs::file_magic::coff_import_library: case sys::fs::file_magic::pecoff_executable: return ObjectFile::createObjectFile(Object, Type); + case sys::fs::file_magic::coff_import_library: + return std::unique_ptr<SymbolicFile>(new COFFImportFile(Object)); case sys::fs::file_magic::elf_relocatable: case sys::fs::file_magic::macho_object: case sys::fs::file_magic::coff_object: { diff --git a/contrib/llvm/lib/Option/Arg.cpp b/contrib/llvm/lib/Option/Arg.cpp index ac00073..c3de2d1 100644 --- a/contrib/llvm/lib/Option/Arg.cpp +++ b/contrib/llvm/lib/Option/Arg.cpp @@ -13,6 +13,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" using namespace llvm; using namespace llvm::opt; @@ -43,23 +44,25 @@ Arg::~Arg() { } } -void Arg::dump() const { - llvm::errs() << "<"; +void Arg::print(raw_ostream& O) const { + O << "<"; - llvm::errs() << " Opt:"; - Opt.dump(); + O << " Opt:"; + Opt.print(O); - llvm::errs() << " Index:" << Index; + O << " Index:" << Index; - llvm::errs() << " Values: ["; + O << " Values: ["; for (unsigned i = 0, e = Values.size(); i != e; ++i) { - if (i) llvm::errs() << ", "; - llvm::errs() << "'" << Values[i] << "'"; + if (i) O << ", "; + O << "'" << Values[i] << "'"; } - llvm::errs() << "]>\n"; + O << "]>\n"; } +LLVM_DUMP_METHOD void Arg::dump() const { print(dbgs()); } + std::string Arg::getAsString(const ArgList &Args) const { SmallString<256> Res; llvm::raw_svector_ostream OS(Res); diff --git a/contrib/llvm/lib/Option/ArgList.cpp b/contrib/llvm/lib/Option/ArgList.cpp index a74ead6..0826ef8 100644 --- a/contrib/llvm/lib/Option/ArgList.cpp +++ b/contrib/llvm/lib/Option/ArgList.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Option/Arg.h" #include "llvm/Option/Option.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -258,6 +259,21 @@ void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id0, } } +void ArgList::AddAllArgs(ArgStringList &Output, + ArrayRef<OptSpecifier> Ids) const { + for (const Arg *Arg : Args) { + for (OptSpecifier Id : Ids) { + if (Arg->getOption().matches(Id)) { + Arg->claim(); + Arg->render(*this, Output); + break; + } + } + } +} + +/// This 3-opt variant of AddAllArgs could be eliminated in favor of one +/// that accepts a single specifier, given that the variant above accepts any +/// number of specifiers. 
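
A usage sketch for the ArrayRef overload of AddAllArgs added above; the option IDs are hypothetical stand-ins for values a tool's generated option table would define:

#include "llvm/Option/ArgList.h"
using namespace llvm::opt;

// Hypothetical option IDs; real ones come from a tool's Options.inc.
enum HypotheticalOptID : unsigned { OPT_I = 1, OPT_isystem, OPT_F };

static void renderIncludeArgs(const ArgList &Args, ArgStringList &Out) {
  // Claims and renders every -I/-isystem/-F argument, preserving the
  // relative command-line order across all three options.
  Args.AddAllArgs(Out, {OPT_I, OPT_isystem, OPT_F});
}
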
void ArgList::AddAllArgs(ArgStringList &Output, OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2) const { for (auto Arg: filtered(Id0, Id1, Id2)) { @@ -313,6 +329,15 @@ const char *ArgList::GetOrMakeJoinedArgString(unsigned Index, return MakeArgString(LHS + RHS); } +void ArgList::print(raw_ostream &O) const { + for (Arg *A : *this) { + O << "* "; + A->print(O); + } +} + +LLVM_DUMP_METHOD void ArgList::dump() const { print(dbgs()); } + // void InputArgList::releaseMemory() { diff --git a/contrib/llvm/lib/Option/OptTable.cpp b/contrib/llvm/lib/Option/OptTable.cpp index e83536f..09d4ceb 100644 --- a/contrib/llvm/lib/Option/OptTable.cpp +++ b/contrib/llvm/lib/Option/OptTable.cpp @@ -84,11 +84,9 @@ static inline bool operator<(const OptTable::Info &I, const char *Name) { OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {} -OptTable::OptTable(const Info *OptionInfos, unsigned NumOptionInfos, - bool IgnoreCase) - : OptionInfos(OptionInfos), NumOptionInfos(NumOptionInfos), - IgnoreCase(IgnoreCase), TheInputOptionID(0), TheUnknownOptionID(0), - FirstSearchableIndex(0) { +OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase) + : OptionInfos(OptionInfos), IgnoreCase(IgnoreCase), TheInputOptionID(0), + TheUnknownOptionID(0), FirstSearchableIndex(0) { // Explicitly zero initialize the error to work around a bug in array // value-initialization on MinGW with gcc 4.3.5. @@ -199,8 +197,8 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index, if (isInput(PrefixesUnion, Str)) return new Arg(getOption(TheInputOptionID), Str, Index++, Str); - const Info *Start = OptionInfos + FirstSearchableIndex; - const Info *End = OptionInfos + getNumOptions(); + const Info *Start = OptionInfos.begin() + FirstSearchableIndex; + const Info *End = OptionInfos.end(); StringRef Name = StringRef(Str).ltrim(PrefixChars); // Search for the first next option which could be a prefix. diff --git a/contrib/llvm/lib/Option/Option.cpp b/contrib/llvm/lib/Option/Option.cpp index 221414d..ebf05aa 100644 --- a/contrib/llvm/lib/Option/Option.cpp +++ b/contrib/llvm/lib/Option/Option.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -35,10 +36,10 @@ Option::Option(const OptTable::Info *info, const OptTable *owner) } } -void Option::dump() const { - llvm::errs() << "<"; +void Option::print(raw_ostream &O) const { + O << "<"; switch (getKind()) { -#define P(N) case N: llvm::errs() << #N; break +#define P(N) case N: O << #N; break P(GroupClass); P(InputClass); P(UnknownClass); @@ -54,33 +55,35 @@ void Option::dump() const { } if (Info->Prefixes) { - llvm::errs() << " Prefixes:["; - for (const char * const *Pre = Info->Prefixes; *Pre != nullptr; ++Pre) { - llvm::errs() << '"' << *Pre << (*(Pre + 1) == nullptr ? "\"" : "\", "); + O << " Prefixes:["; + for (const char *const *Pre = Info->Prefixes; *Pre != nullptr; ++Pre) { + O << '"' << *Pre << (*(Pre + 1) == nullptr ? 
"\"" : "\", "); } - llvm::errs() << ']'; + O << ']'; } - llvm::errs() << " Name:\"" << getName() << '"'; + O << " Name:\"" << getName() << '"'; const Option Group = getGroup(); if (Group.isValid()) { - llvm::errs() << " Group:"; - Group.dump(); + O << " Group:"; + Group.print(O); } const Option Alias = getAlias(); if (Alias.isValid()) { - llvm::errs() << " Alias:"; - Alias.dump(); + O << " Alias:"; + Alias.print(O); } if (getKind() == MultiArgClass) - llvm::errs() << " NumArgs:" << getNumArgs(); + O << " NumArgs:" << getNumArgs(); - llvm::errs() << ">\n"; + O << ">\n"; } +void Option::dump() const { print(dbgs()); } + bool Option::matches(OptSpecifier Opt) const { // Aliases are never considered in matching, look through them. const Option Alias = getAlias(); diff --git a/contrib/llvm/lib/Passes/PassBuilder.cpp b/contrib/llvm/lib/Passes/PassBuilder.cpp index ba71320..8ba81f7 100644 --- a/contrib/llvm/lib/Passes/PassBuilder.cpp +++ b/contrib/llvm/lib/Passes/PassBuilder.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" @@ -28,9 +29,14 @@ #include "llvm/IR/Verifier.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" +#include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" +#include "llvm/Transforms/Scalar/SROA.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" using namespace llvm; diff --git a/contrib/llvm/lib/Passes/PassRegistry.def b/contrib/llvm/lib/Passes/PassRegistry.def index d768a3a..241a789 100644 --- a/contrib/llvm/lib/Passes/PassRegistry.def +++ b/contrib/llvm/lib/Passes/PassRegistry.def @@ -27,10 +27,13 @@ MODULE_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif +MODULE_PASS("forceattrs", ForceFunctionAttrsPass()) +MODULE_PASS("inferattrs", InferFunctionAttrsPass()) MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("print", PrintModulePass(dbgs())) MODULE_PASS("print-cg", LazyCallGraphPrinterPass(dbgs())) +MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass()) MODULE_PASS("verify", VerifierPass()) #undef MODULE_PASS @@ -54,6 +57,7 @@ FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis()) +FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis()) FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) FUNCTION_ANALYSIS("targetir", TM ? 
TM->getTargetIRAnalysis() : TargetIRAnalysis()) @@ -62,6 +66,7 @@ FUNCTION_ANALYSIS("targetir", #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("adce", ADCEPass()) FUNCTION_PASS("early-cse", EarlyCSEPass()) FUNCTION_PASS("instcombine", InstCombinePass()) FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass()) @@ -71,7 +76,9 @@ FUNCTION_PASS("print", PrintFunctionPass(dbgs())) FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs())) FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print<loops>", LoopPrinterPass(dbgs())) +FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(dbgs())) FUNCTION_PASS("simplify-cfg", SimplifyCFGPass()) +FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass()) #undef FUNCTION_PASS diff --git a/contrib/llvm/lib/ProfileData/CoverageMapping.cpp b/contrib/llvm/lib/ProfileData/CoverageMapping.cpp index cf04fea..f5d477b 100644 --- a/contrib/llvm/lib/ProfileData/CoverageMapping.cpp +++ b/contrib/llvm/lib/ProfileData/CoverageMapping.cpp @@ -181,18 +181,6 @@ void FunctionRecordIterator::skipOtherFiles() { *this = FunctionRecordIterator(); } -/// Get the function name from the record, removing the filename prefix if -/// necessary. -static StringRef getFuncNameWithoutPrefix(const CoverageMappingRecord &Record) { - StringRef FunctionName = Record.FunctionName; - if (Record.Filenames.empty()) - return FunctionName; - StringRef Filename = sys::path::filename(Record.Filenames[0]); - if (FunctionName.startswith(Filename)) - FunctionName = FunctionName.drop_front(Filename.size() + 1); - return FunctionName; -} - ErrorOr<std::unique_ptr<CoverageMapping>> CoverageMapping::load(CoverageMappingReader &CoverageReader, IndexedInstrProfReader &ProfileReader) { @@ -216,7 +204,11 @@ CoverageMapping::load(CoverageMappingReader &CoverageReader, assert(!Record.MappingRegions.empty() && "Function has no regions"); - FunctionRecord Function(getFuncNameWithoutPrefix(Record), Record.Filenames); + StringRef OrigFuncName = Record.FunctionName; + if (!Record.Filenames.empty()) + OrigFuncName = + getFuncNameWithoutPrefix(OrigFuncName, Record.Filenames[0]); + FunctionRecord Function(OrigFuncName, Record.Filenames); for (const auto &Region : Record.MappingRegions) { ErrorOr<int64_t> ExecutionCount = Ctx.evaluate(Region.Count); if (!ExecutionCount) @@ -525,6 +517,6 @@ class CoverageMappingErrorCategoryType : public std::error_category { static ManagedStatic<CoverageMappingErrorCategoryType> ErrorCategory; -const std::error_category &llvm::coveragemap_category() { +const std::error_category &llvm::coverage::coveragemap_category() { return *ErrorCategory; } diff --git a/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp b/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp index 334a3f5..89e1cf4 100644 --- a/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp +++ b/contrib/llvm/lib/ProfileData/CoverageMappingReader.cpp @@ -290,36 +290,25 @@ std::error_code RawCoverageMappingReader::read() { return std::error_code(); } -namespace { - -/// \brief A helper structure to access the data from a section -/// in an object file. 
-struct SectionData { - StringRef Data; - uint64_t Address; - - std::error_code load(SectionRef &Section) { - if (auto Err = Section.getContents(Data)) - return Err; - Address = Section.getAddress(); - return std::error_code(); - } +std::error_code InstrProfSymtab::create(SectionRef &Section) { + if (auto Err = Section.getContents(Data)) + return Err; + Address = Section.getAddress(); + return std::error_code(); +} - std::error_code get(uint64_t Pointer, size_t Size, StringRef &Result) { - if (Pointer < Address) - return coveragemap_error::malformed; - auto Offset = Pointer - Address; - if (Offset + Size > Data.size()) - return coveragemap_error::malformed; - Result = Data.substr(Pointer - Address, Size); - return std::error_code(); - } -}; +StringRef InstrProfSymtab::getFuncName(uint64_t Pointer, size_t Size) { + if (Pointer < Address) + return StringRef(); + auto Offset = Pointer - Address; + if (Offset + Size > Data.size()) + return StringRef(); + return Data.substr(Pointer - Address, Size); } template <typename T, support::endianness Endian> -std::error_code readCoverageMappingData( - SectionData &ProfileNames, StringRef Data, +static std::error_code readCoverageMappingData( + InstrProfSymtab &ProfileNames, StringRef Data, std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records, std::vector<StringRef> &Filenames) { using namespace support; @@ -327,23 +316,21 @@ std::error_code readCoverageMappingData( // Read the records in the coverage data section. for (const char *Buf = Data.data(), *End = Buf + Data.size(); Buf < End;) { - if (Buf + 4 * sizeof(uint32_t) > End) + if (Buf + sizeof(CovMapHeader) > End) return coveragemap_error::malformed; - uint32_t NRecords = endian::readNext<uint32_t, Endian, unaligned>(Buf); - uint32_t FilenamesSize = endian::readNext<uint32_t, Endian, unaligned>(Buf); - uint32_t CoverageSize = endian::readNext<uint32_t, Endian, unaligned>(Buf); - uint32_t Version = endian::readNext<uint32_t, Endian, unaligned>(Buf); - - switch (Version) { - case CoverageMappingVersion1: - break; - default: + auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf); + uint32_t NRecords = CovHeader->getNRecords<Endian>(); + uint32_t FilenamesSize = CovHeader->getFilenamesSize<Endian>(); + uint32_t CoverageSize = CovHeader->getCoverageSize<Endian>(); + uint32_t Version = CovHeader->getVersion<Endian>(); + Buf = reinterpret_cast<const char *>(++CovHeader); + + if (Version > coverage::CoverageMappingCurrentVersion) return coveragemap_error::unsupported_version; - } // Skip past the function records, saving the start and end for later. const char *FunBuf = Buf; - Buf += NRecords * (sizeof(T) + 2 * sizeof(uint32_t) + sizeof(uint64_t)); + Buf += NRecords * sizeof(coverage::CovMapFunctionRecord<T>); const char *FunEnd = Buf; // Get the filenames. @@ -366,12 +353,12 @@ std::error_code readCoverageMappingData( // before reading the next map. 
Buf += alignmentAdjustment(Buf, 8); - while (FunBuf < FunEnd) { + auto CFR = + reinterpret_cast<const coverage::CovMapFunctionRecord<T> *>(FunBuf); + while ((const char *)CFR < FunEnd) { // Read the function information - T NamePtr = endian::readNext<T, Endian, unaligned>(FunBuf); - uint32_t NameSize = endian::readNext<uint32_t, Endian, unaligned>(FunBuf); - uint32_t DataSize = endian::readNext<uint32_t, Endian, unaligned>(FunBuf); - uint64_t FuncHash = endian::readNext<uint64_t, Endian, unaligned>(FunBuf); + uint32_t DataSize = CFR->template getDataSize<Endian>(); + uint64_t FuncHash = CFR->template getFuncHash<Endian>(); // Now use that to read the coverage data. if (CovBuf + DataSize > CovEnd) @@ -382,16 +369,18 @@ std::error_code readCoverageMappingData( // Ignore this record if we already have a record that points to the same // function name. This is useful to ignore the redundant records for the // functions with ODR linkage. - if (!UniqueFunctionMappingData.insert(NamePtr).second) + T NameRef = CFR->template getFuncNameRef<Endian>(); + if (!UniqueFunctionMappingData.insert(NameRef).second) continue; - // Finally, grab the name and create a record. StringRef FuncName; - if (std::error_code EC = ProfileNames.get(NamePtr, NameSize, FuncName)) + if (std::error_code EC = + CFR->template getFuncName<Endian>(ProfileNames, FuncName)) return EC; Records.push_back(BinaryCoverageReader::ProfileMappingRecord( CoverageMappingVersion(Version), FuncName, FuncHash, Mapping, FilenamesBegin, Filenames.size() - FilenamesBegin)); + CFR++; } } @@ -401,7 +390,7 @@ std::error_code readCoverageMappingData( static const char *TestingFormatMagic = "llvmcovmtestdata"; static std::error_code loadTestingFormat(StringRef Data, - SectionData &ProfileNames, + InstrProfSymtab &ProfileNames, StringRef &CoverageMapping, uint8_t &BytesInAddress, support::endianness &Endian) { @@ -420,14 +409,14 @@ static std::error_code loadTestingFormat(StringRef Data, if (Data.size() < 1) return coveragemap_error::truncated; N = 0; - ProfileNames.Address = + uint64_t Address = decodeULEB128(reinterpret_cast<const uint8_t *>(Data.data()), &N); if (N > Data.size()) return coveragemap_error::malformed; Data = Data.substr(N); if (Data.size() < ProfileNamesSize) return coveragemap_error::malformed; - ProfileNames.Data = Data.substr(0, ProfileNamesSize); + ProfileNames.create(Data.substr(0, ProfileNamesSize), Address); CoverageMapping = Data.substr(ProfileNamesSize); return std::error_code(); } @@ -443,12 +432,10 @@ static ErrorOr<SectionRef> lookupSection(ObjectFile &OF, StringRef Name) { return coveragemap_error::no_data_found; } -static std::error_code loadBinaryFormat(MemoryBufferRef ObjectBuffer, - SectionData &ProfileNames, - StringRef &CoverageMapping, - uint8_t &BytesInAddress, - support::endianness &Endian, - StringRef Arch) { +static std::error_code +loadBinaryFormat(MemoryBufferRef ObjectBuffer, InstrProfSymtab &ProfileNames, + StringRef &CoverageMapping, uint8_t &BytesInAddress, + support::endianness &Endian, StringRef Arch) { auto BinOrErr = object::createBinary(ObjectBuffer); if (std::error_code EC = BinOrErr.getError()) return EC; @@ -477,17 +464,18 @@ static std::error_code loadBinaryFormat(MemoryBufferRef ObjectBuffer, : support::endianness::big; // Look for the sections that we are interested in. 
- auto NamesSection = lookupSection(*OF, "__llvm_prf_names"); + auto NamesSection = lookupSection(*OF, getInstrProfNameSectionName(false)); if (auto EC = NamesSection.getError()) return EC; - auto CoverageSection = lookupSection(*OF, "__llvm_covmap"); + auto CoverageSection = + lookupSection(*OF, getInstrProfCoverageSectionName(false)); if (auto EC = CoverageSection.getError()) return EC; // Get the contents of the given sections. if (std::error_code EC = CoverageSection->getContents(CoverageMapping)) return EC; - if (std::error_code EC = ProfileNames.load(*NamesSection)) + if (std::error_code EC = ProfileNames.create(*NamesSection)) return EC; return std::error_code(); @@ -498,33 +486,33 @@ BinaryCoverageReader::create(std::unique_ptr<MemoryBuffer> &ObjectBuffer, StringRef Arch) { std::unique_ptr<BinaryCoverageReader> Reader(new BinaryCoverageReader()); - SectionData Profile; + InstrProfSymtab ProfileNames; StringRef Coverage; uint8_t BytesInAddress; support::endianness Endian; std::error_code EC; if (ObjectBuffer->getBuffer().startswith(TestingFormatMagic)) // This is a special format used for testing. - EC = loadTestingFormat(ObjectBuffer->getBuffer(), Profile, Coverage, + EC = loadTestingFormat(ObjectBuffer->getBuffer(), ProfileNames, Coverage, BytesInAddress, Endian); else - EC = loadBinaryFormat(ObjectBuffer->getMemBufferRef(), Profile, Coverage, - BytesInAddress, Endian, Arch); + EC = loadBinaryFormat(ObjectBuffer->getMemBufferRef(), ProfileNames, + Coverage, BytesInAddress, Endian, Arch); if (EC) return EC; if (BytesInAddress == 4 && Endian == support::endianness::little) EC = readCoverageMappingData<uint32_t, support::endianness::little>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else if (BytesInAddress == 4 && Endian == support::endianness::big) EC = readCoverageMappingData<uint32_t, support::endianness::big>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else if (BytesInAddress == 8 && Endian == support::endianness::little) EC = readCoverageMappingData<uint64_t, support::endianness::little>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else if (BytesInAddress == 8 && Endian == support::endianness::big) EC = readCoverageMappingData<uint64_t, support::endianness::big>( - Profile, Coverage, Reader->MappingRecords, Reader->Filenames); + ProfileNames, Coverage, Reader->MappingRecords, Reader->Filenames); else return coveragemap_error::malformed; if (EC) diff --git a/contrib/llvm/lib/ProfileData/InstrProf.cpp b/contrib/llvm/lib/ProfileData/InstrProf.cpp index 92822a7..d677763 100644 --- a/contrib/llvm/lib/ProfileData/InstrProf.cpp +++ b/contrib/llvm/lib/ProfileData/InstrProf.cpp @@ -13,7 +13,14 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -28,28 +35,32 @@ class InstrProfErrorCategoryType : public std::error_category { return "Success"; case instrprof_error::eof: return "End of File"; + case 
instrprof_error::unrecognized_format: + return "Unrecognized instrumentation profile encoding format"; case instrprof_error::bad_magic: - return "Invalid profile data (bad magic)"; + return "Invalid instrumentation profile data (bad magic)"; case instrprof_error::bad_header: - return "Invalid profile data (file header is corrupt)"; + return "Invalid instrumentation profile data (file header is corrupt)"; case instrprof_error::unsupported_version: - return "Unsupported profiling format version"; + return "Unsupported instrumentation profile format version"; case instrprof_error::unsupported_hash_type: - return "Unsupported profiling hash"; + return "Unsupported instrumentation profile hash type"; case instrprof_error::too_large: return "Too much profile data"; case instrprof_error::truncated: return "Truncated profile data"; case instrprof_error::malformed: - return "Malformed profile data"; + return "Malformed instrumentation profile data"; case instrprof_error::unknown_function: return "No profile data available for function"; case instrprof_error::hash_mismatch: - return "Function hash mismatch"; + return "Function control flow change detected (hash mismatch)"; case instrprof_error::count_mismatch: - return "Function count mismatch"; + return "Function basic block count change detected (counter mismatch)"; case instrprof_error::counter_overflow: return "Counter overflow"; + case instrprof_error::value_site_count_mismatch: + return "Function value site count change detected (counter mismatch)"; } llvm_unreachable("A value of instrprof_error has no message."); } @@ -61,3 +72,531 @@ static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory; const std::error_category &llvm::instrprof_category() { return *ErrorCategory; } + +namespace llvm { + +std::string getPGOFuncName(StringRef RawFuncName, + GlobalValue::LinkageTypes Linkage, + StringRef FileName, + uint64_t Version LLVM_ATTRIBUTE_UNUSED) { + + // Function names may be prefixed with a binary '1' to indicate + // that the backend should not modify the symbols due to any platform + // naming convention. Do not include that '1' in the PGO profile name. + if (RawFuncName[0] == '\1') + RawFuncName = RawFuncName.substr(1); + + std::string FuncName = RawFuncName; + if (llvm::GlobalValue::isLocalLinkage(Linkage)) { + // For local symbols, prepend the main file name to distinguish them. + // Do not include the full path in the file name since there's no guarantee + // that it will stay the same, e.g., if the files are checked out from + // version control in different locations. + if (FileName.empty()) + FuncName = FuncName.insert(0, "<unknown>:"); + else + FuncName = FuncName.insert(0, FileName.str() + ":"); + } + return FuncName; +} + +std::string getPGOFuncName(const Function &F, uint64_t Version) { + return getPGOFuncName(F.getName(), F.getLinkage(), F.getParent()->getName(), + Version); +} + +StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName) { + if (FileName.empty()) + return PGOFuncName; + // Drop the file name including ':'. See also getPGOFuncName. + if (PGOFuncName.startswith(FileName)) + PGOFuncName = PGOFuncName.drop_front(FileName.size() + 1); + return PGOFuncName; +} + +// \p FuncName is the string used as profile lookup key for the function. A +// symbol is created to hold the name. Return the legalized symbol name. 
+static std::string getPGOFuncNameVarName(StringRef FuncName, + GlobalValue::LinkageTypes Linkage) { + std::string VarName = getInstrProfNameVarPrefix(); + VarName += FuncName; + + if (!GlobalValue::isLocalLinkage(Linkage)) + return VarName; + + // Now fix up illegal chars in local VarName that may upset the assembler. + const char *InvalidChars = "-:<>\"'"; + size_t found = VarName.find_first_of(InvalidChars); + while (found != std::string::npos) { + VarName[found] = '_'; + found = VarName.find_first_of(InvalidChars, found + 1); + } + return VarName; +} + +GlobalVariable *createPGOFuncNameVar(Module &M, + GlobalValue::LinkageTypes Linkage, + StringRef FuncName) { + + // We generally want to match the function's linkage, but available_externally + // and extern_weak both have the wrong semantics, and anything that doesn't + // need to link across compilation units doesn't need to be visible at all. + if (Linkage == GlobalValue::ExternalWeakLinkage) + Linkage = GlobalValue::LinkOnceAnyLinkage; + else if (Linkage == GlobalValue::AvailableExternallyLinkage) + Linkage = GlobalValue::LinkOnceODRLinkage; + else if (Linkage == GlobalValue::InternalLinkage || + Linkage == GlobalValue::ExternalLinkage) + Linkage = GlobalValue::PrivateLinkage; + + auto *Value = ConstantDataArray::getString(M.getContext(), FuncName, false); + auto FuncNameVar = + new GlobalVariable(M, Value->getType(), true, Linkage, Value, + getPGOFuncNameVarName(FuncName, Linkage)); + + // Hide the symbol so that we correctly get a copy for each executable. + if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); + + return FuncNameVar; +} + +GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName) { + return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), FuncName); +} + +int collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs, + bool doCompression, std::string &Result) { + uint8_t Header[16], *P = Header; + std::string UncompressedNameStrings = + join(NameStrs.begin(), NameStrs.end(), StringRef(" ")); + + unsigned EncLen = encodeULEB128(UncompressedNameStrings.length(), P); + P += EncLen; + + auto WriteStringToResult = [&](size_t CompressedLen, + const std::string &InputStr) { + EncLen = encodeULEB128(CompressedLen, P); + P += EncLen; + char *HeaderStr = reinterpret_cast<char *>(&Header[0]); + unsigned HeaderLen = P - &Header[0]; + Result.append(HeaderStr, HeaderLen); + Result += InputStr; + return 0; + }; + + if (!doCompression) + return WriteStringToResult(0, UncompressedNameStrings); + + SmallVector<char, 128> CompressedNameStrings; + zlib::Status Success = + zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings, + zlib::BestSizeCompression); + + if (Success != zlib::StatusOK) + return 1; + + return WriteStringToResult( + CompressedNameStrings.size(), + std::string(CompressedNameStrings.data(), CompressedNameStrings.size())); +} + +StringRef getPGOFuncNameInitializer(GlobalVariable *NameVar) { + auto *Arr = cast<ConstantDataArray>(NameVar->getInitializer()); + StringRef NameStr = + Arr->isCString() ? 
Arr->getAsCString() : Arr->getAsString(); + return NameStr; +} + +int collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars, + std::string &Result) { + std::vector<std::string> NameStrs; + for (auto *NameVar : NameVars) { + NameStrs.push_back(getPGOFuncNameInitializer(NameVar)); + } + return collectPGOFuncNameStrings(NameStrs, zlib::isAvailable(), Result); +} + +int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { + const uint8_t *P = reinterpret_cast<const uint8_t *>(NameStrings.data()); + const uint8_t *EndP = reinterpret_cast<const uint8_t *>(NameStrings.data() + + NameStrings.size()); + while (P < EndP) { + uint32_t N; + uint64_t UncompressedSize = decodeULEB128(P, &N); + P += N; + uint64_t CompressedSize = decodeULEB128(P, &N); + P += N; + bool isCompressed = (CompressedSize != 0); + SmallString<128> UncompressedNameStrings; + StringRef NameStrings; + if (isCompressed) { + StringRef CompressedNameStrings(reinterpret_cast<const char *>(P), + CompressedSize); + if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings, + UncompressedSize) != zlib::StatusOK) + return 1; + P += CompressedSize; + NameStrings = StringRef(UncompressedNameStrings.data(), + UncompressedNameStrings.size()); + } else { + NameStrings = + StringRef(reinterpret_cast<const char *>(P), UncompressedSize); + P += UncompressedSize; + } + // Now parse the name strings. + SmallVector<StringRef, 0> Names; + NameStrings.split(Names, ' '); + for (StringRef &Name : Names) + Symtab.addFuncName(Name); + + while (P < EndP && *P == 0) + P++; + } + Symtab.finalizeSymtab(); + return 0; +} + +instrprof_error InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, + uint64_t Weight) { + this->sortByTargetValues(); + Input.sortByTargetValues(); + auto I = ValueData.begin(); + auto IE = ValueData.end(); + instrprof_error Result = instrprof_error::success; + for (auto J = Input.ValueData.begin(), JE = Input.ValueData.end(); J != JE; + ++J) { + while (I != IE && I->Value < J->Value) + ++I; + if (I != IE && I->Value == J->Value) { + bool Overflowed; + I->Count = SaturatingMultiplyAdd(J->Count, Weight, I->Count, &Overflowed); + if (Overflowed) + Result = instrprof_error::counter_overflow; + ++I; + continue; + } + ValueData.insert(I, *J); + } + return Result; +} + +instrprof_error InstrProfValueSiteRecord::scale(uint64_t Weight) { + instrprof_error Result = instrprof_error::success; + for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) { + bool Overflowed; + I->Count = SaturatingMultiply(I->Count, Weight, &Overflowed); + if (Overflowed) + Result = instrprof_error::counter_overflow; + } + return Result; +} + +// Merge Value Profile data from Src record to this record for ValueKind. +// Scale merged value counts by \p Weight. 
+instrprof_error InstrProfRecord::mergeValueProfData(uint32_t ValueKind, + InstrProfRecord &Src, + uint64_t Weight) { + uint32_t ThisNumValueSites = getNumValueSites(ValueKind); + uint32_t OtherNumValueSites = Src.getNumValueSites(ValueKind); + if (ThisNumValueSites != OtherNumValueSites) + return instrprof_error::value_site_count_mismatch; + std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = + getValueSitesForKind(ValueKind); + std::vector<InstrProfValueSiteRecord> &OtherSiteRecords = + Src.getValueSitesForKind(ValueKind); + instrprof_error Result = instrprof_error::success; + for (uint32_t I = 0; I < ThisNumValueSites; I++) + MergeResult(Result, ThisSiteRecords[I].merge(OtherSiteRecords[I], Weight)); + return Result; +} + +instrprof_error InstrProfRecord::merge(InstrProfRecord &Other, + uint64_t Weight) { + // If the number of counters doesn't match we either have bad data + // or a hash collision. + if (Counts.size() != Other.Counts.size()) + return instrprof_error::count_mismatch; + + instrprof_error Result = instrprof_error::success; + + for (size_t I = 0, E = Other.Counts.size(); I < E; ++I) { + bool Overflowed; + Counts[I] = + SaturatingMultiplyAdd(Other.Counts[I], Weight, Counts[I], &Overflowed); + if (Overflowed) + Result = instrprof_error::counter_overflow; + } + + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + MergeResult(Result, mergeValueProfData(Kind, Other, Weight)); + + return Result; +} + +instrprof_error InstrProfRecord::scaleValueProfData(uint32_t ValueKind, + uint64_t Weight) { + uint32_t ThisNumValueSites = getNumValueSites(ValueKind); + std::vector<InstrProfValueSiteRecord> &ThisSiteRecords = + getValueSitesForKind(ValueKind); + instrprof_error Result = instrprof_error::success; + for (uint32_t I = 0; I < ThisNumValueSites; I++) + MergeResult(Result, ThisSiteRecords[I].scale(Weight)); + return Result; +} + +instrprof_error InstrProfRecord::scale(uint64_t Weight) { + instrprof_error Result = instrprof_error::success; + for (auto &Count : this->Counts) { + bool Overflowed; + Count = SaturatingMultiply(Count, Weight, &Overflowed); + if (Overflowed && Result == instrprof_error::success) { + Result = instrprof_error::counter_overflow; + } + } + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + MergeResult(Result, scaleValueProfData(Kind, Weight)); + + return Result; +} + +// Map indirect call target name hash to name string. +uint64_t InstrProfRecord::remapValue(uint64_t Value, uint32_t ValueKind, + ValueMapType *ValueMap) { + if (!ValueMap) + return Value; + switch (ValueKind) { + case IPVK_IndirectCallTarget: { + auto Result = + std::lower_bound(ValueMap->begin(), ValueMap->end(), Value, + [](const std::pair<uint64_t, uint64_t> &LHS, + uint64_t RHS) { return LHS.first < RHS; }); + if (Result != ValueMap->end()) + Value = (uint64_t)Result->second; + break; + } + } + return Value; +} + +void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site, + InstrProfValueData *VData, uint32_t N, + ValueMapType *ValueMap) { + for (uint32_t I = 0; I < N; I++) { + VData[I].Value = remapValue(VData[I].Value, ValueKind, ValueMap); + } + std::vector<InstrProfValueSiteRecord> &ValueSites = + getValueSitesForKind(ValueKind); + if (N == 0) + ValueSites.push_back(InstrProfValueSiteRecord()); + else + ValueSites.emplace_back(VData, VData + N); +} + +#define INSTR_PROF_COMMON_API_IMPL +#include "llvm/ProfileData/InstrProfData.inc" + +/*! + * \brief ValueProfRecordClosure Interface implementation for InstrProfRecord + * class. 
These C wrappers are used as adaptors so that C++ code can be + * invoked as callbacks. + */ +uint32_t getNumValueKindsInstrProf(const void *Record) { + return reinterpret_cast<const InstrProfRecord *>(Record)->getNumValueKinds(); +} + +uint32_t getNumValueSitesInstrProf(const void *Record, uint32_t VKind) { + return reinterpret_cast<const InstrProfRecord *>(Record) + ->getNumValueSites(VKind); +} + +uint32_t getNumValueDataInstrProf(const void *Record, uint32_t VKind) { + return reinterpret_cast<const InstrProfRecord *>(Record) + ->getNumValueData(VKind); +} + +uint32_t getNumValueDataForSiteInstrProf(const void *R, uint32_t VK, + uint32_t S) { + return reinterpret_cast<const InstrProfRecord *>(R) + ->getNumValueDataForSite(VK, S); +} + +void getValueForSiteInstrProf(const void *R, InstrProfValueData *Dst, + uint32_t K, uint32_t S, + uint64_t (*Mapper)(uint32_t, uint64_t)) { + return reinterpret_cast<const InstrProfRecord *>(R)->getValueForSite( + Dst, K, S, Mapper); +} + +ValueProfData *allocValueProfDataInstrProf(size_t TotalSizeInBytes) { + ValueProfData *VD = + (ValueProfData *)(new (::operator new(TotalSizeInBytes)) ValueProfData()); + memset(VD, 0, TotalSizeInBytes); + return VD; +} + +static ValueProfRecordClosure InstrProfRecordClosure = { + 0, + getNumValueKindsInstrProf, + getNumValueSitesInstrProf, + getNumValueDataInstrProf, + getNumValueDataForSiteInstrProf, + 0, + getValueForSiteInstrProf, + allocValueProfDataInstrProf}; + +// Wrapper implementation using the closure mechanism. +uint32_t ValueProfData::getSize(const InstrProfRecord &Record) { + InstrProfRecordClosure.Record = &Record; + return getValueProfDataSize(&InstrProfRecordClosure); +} + +// Wrapper implementation using the closure mechanism. +std::unique_ptr<ValueProfData> +ValueProfData::serializeFrom(const InstrProfRecord &Record) { + InstrProfRecordClosure.Record = &Record; + + std::unique_ptr<ValueProfData> VPD( + serializeValueProfDataFrom(&InstrProfRecordClosure, nullptr)); + return VPD; +} + +void ValueProfRecord::deserializeTo(InstrProfRecord &Record, + InstrProfRecord::ValueMapType *VMap) { + Record.reserveSites(Kind, NumValueSites); + + InstrProfValueData *ValueData = getValueProfRecordValueData(this); + for (uint64_t VSite = 0; VSite < NumValueSites; ++VSite) { + uint8_t ValueDataCount = this->SiteCountArray[VSite]; + Record.addValueData(Kind, VSite, ValueData, ValueDataCount, VMap); + ValueData += ValueDataCount; + } +} + +// For writing/serializing, Old is the host endianness, and New is +// byte order intended on disk. For Reading/deserialization, Old +// is the on-disk source endianness, and New is the host endianness. +void ValueProfRecord::swapBytes(support::endianness Old, + support::endianness New) { + using namespace support; + if (Old == New) + return; + + if (getHostEndianness() != Old) { + sys::swapByteOrder<uint32_t>(NumValueSites); + sys::swapByteOrder<uint32_t>(Kind); + } + uint32_t ND = getValueProfRecordNumValueData(this); + InstrProfValueData *VD = getValueProfRecordValueData(this); + + // No need to swap byte array: SiteCountArrray. 
+ for (uint32_t I = 0; I < ND; I++) { + sys::swapByteOrder<uint64_t>(VD[I].Value); + sys::swapByteOrder<uint64_t>(VD[I].Count); + } + if (getHostEndianness() == Old) { + sys::swapByteOrder<uint32_t>(NumValueSites); + sys::swapByteOrder<uint32_t>(Kind); + } +} + +void ValueProfData::deserializeTo(InstrProfRecord &Record, + InstrProfRecord::ValueMapType *VMap) { + if (NumValueKinds == 0) + return; + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < NumValueKinds; K++) { + VR->deserializeTo(Record, VMap); + VR = getValueProfRecordNext(VR); + } +} + +template <class T> +static T swapToHostOrder(const unsigned char *&D, support::endianness Orig) { + using namespace support; + if (Orig == little) + return endian::readNext<T, little, unaligned>(D); + else + return endian::readNext<T, big, unaligned>(D); +} + +static std::unique_ptr<ValueProfData> allocValueProfData(uint32_t TotalSize) { + return std::unique_ptr<ValueProfData>(new (::operator new(TotalSize)) + ValueProfData()); +} + +instrprof_error ValueProfData::checkIntegrity() { + if (NumValueKinds > IPVK_Last + 1) + return instrprof_error::malformed; + // Total size needs to be mulltiple of quadword size. + if (TotalSize % sizeof(uint64_t)) + return instrprof_error::malformed; + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < this->NumValueKinds; K++) { + if (VR->Kind > IPVK_Last) + return instrprof_error::malformed; + VR = getValueProfRecordNext(VR); + if ((char *)VR - (char *)this > (ptrdiff_t)TotalSize) + return instrprof_error::malformed; + } + return instrprof_error::success; +} + +ErrorOr<std::unique_ptr<ValueProfData>> +ValueProfData::getValueProfData(const unsigned char *D, + const unsigned char *const BufferEnd, + support::endianness Endianness) { + using namespace support; + if (D + sizeof(ValueProfData) > BufferEnd) + return instrprof_error::truncated; + + const unsigned char *Header = D; + uint32_t TotalSize = swapToHostOrder<uint32_t>(Header, Endianness); + if (D + TotalSize > BufferEnd) + return instrprof_error::too_large; + + std::unique_ptr<ValueProfData> VPD = allocValueProfData(TotalSize); + memcpy(VPD.get(), D, TotalSize); + // Byte swap. 
+ VPD->swapBytesToHost(Endianness); + + instrprof_error EC = VPD->checkIntegrity(); + if (EC != instrprof_error::success) + return EC; + + return std::move(VPD); +} + +void ValueProfData::swapBytesToHost(support::endianness Endianness) { + using namespace support; + if (Endianness == getHostEndianness()) + return; + + sys::swapByteOrder<uint32_t>(TotalSize); + sys::swapByteOrder<uint32_t>(NumValueKinds); + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < NumValueKinds; K++) { + VR->swapBytes(Endianness, getHostEndianness()); + VR = getValueProfRecordNext(VR); + } +} + +void ValueProfData::swapBytesFromHost(support::endianness Endianness) { + using namespace support; + if (Endianness == getHostEndianness()) + return; + + ValueProfRecord *VR = getFirstValueProfRecord(this); + for (uint32_t K = 0; K < NumValueKinds; K++) { + ValueProfRecord *NVR = getValueProfRecordNext(VR); + VR->swapBytes(getHostEndianness(), Endianness); + VR = NVR; + } + sys::swapByteOrder<uint32_t>(TotalSize); + sys::swapByteOrder<uint32_t>(NumValueKinds); +} + +} diff --git a/contrib/llvm/lib/ProfileData/InstrProfIndexed.h b/contrib/llvm/lib/ProfileData/InstrProfIndexed.h deleted file mode 100644 index ebca7b2..0000000 --- a/contrib/llvm/lib/ProfileData/InstrProfIndexed.h +++ /dev/null @@ -1,56 +0,0 @@ -//=-- InstrProfIndexed.h - Indexed profiling format support -------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Shared header for the instrumented profile data reader and writer. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H -#define LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H - -#include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MD5.h" - -namespace llvm { - -namespace IndexedInstrProf { -enum class HashT : uint32_t { - MD5, - - Last = MD5 -}; - -static inline uint64_t MD5Hash(StringRef Str) { - MD5 Hash; - Hash.update(Str); - llvm::MD5::MD5Result Result; - Hash.final(Result); - // Return the least significant 8 bytes. Our MD5 implementation returns the - // result in little endian, so we may need to swap bytes. 
- using namespace llvm::support; - return endian::read<uint64_t, little, unaligned>(Result); -} - -static inline uint64_t ComputeHash(HashT Type, StringRef K) { - switch (Type) { - case HashT::MD5: - return IndexedInstrProf::MD5Hash(K); - } - llvm_unreachable("Unhandled hash type"); -} - -const uint64_t Magic = 0x8169666f72706cff; // "\xfflprofi\x81" -const uint64_t Version = 2; -const HashT HashType = HashT::MD5; -} - -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp index 8a529a0..5e83456 100644 --- a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProfReader.h" -#include "InstrProfIndexed.h" #include "llvm/ADT/STLExtras.h" #include <cassert> @@ -55,8 +54,10 @@ InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { Result.reset(new RawInstrProfReader64(std::move(Buffer))); else if (RawInstrProfReader32::hasFormat(*Buffer)) Result.reset(new RawInstrProfReader32(std::move(Buffer))); - else + else if (TextInstrProfReader::hasFormat(*Buffer)) Result.reset(new TextInstrProfReader(std::move(Buffer))); + else + return instrprof_error::unrecognized_format; // Initialize the reader and return the result. if (std::error_code EC = initializeReader(*Result)) @@ -98,16 +99,98 @@ void InstrProfIterator::Increment() { *this = InstrProfIterator(); } +bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) { + // Verify that this really looks like plain ASCII text by checking a + // 'reasonable' number of characters (up to profile magic size). + size_t count = std::min(Buffer.getBufferSize(), sizeof(uint64_t)); + StringRef buffer = Buffer.getBufferStart(); + return count == 0 || + std::all_of(buffer.begin(), buffer.begin() + count, + [](char c) { return ::isprint(c) || ::isspace(c); }); +} + +std::error_code TextInstrProfReader::readHeader() { + Symtab.reset(new InstrProfSymtab()); + return success(); +} + +std::error_code +TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { + +#define CHECK_LINE_END(Line) \ + if (Line.is_at_end()) \ + return error(instrprof_error::truncated); +#define READ_NUM(Str, Dst) \ + if ((Str).getAsInteger(10, (Dst))) \ + return error(instrprof_error::malformed); +#define VP_READ_ADVANCE(Val) \ + CHECK_LINE_END(Line); \ + uint32_t Val; \ + READ_NUM((*Line), (Val)); \ + Line++; + + if (Line.is_at_end()) + return success(); + + uint32_t NumValueKinds; + if (Line->getAsInteger(10, NumValueKinds)) { + // No value profile data + return success(); + } + if (NumValueKinds == 0 || NumValueKinds > IPVK_Last + 1) + return error(instrprof_error::malformed); + Line++; + + for (uint32_t VK = 0; VK < NumValueKinds; VK++) { + VP_READ_ADVANCE(ValueKind); + if (ValueKind > IPVK_Last) + return error(instrprof_error::malformed); + VP_READ_ADVANCE(NumValueSites); + if (!NumValueSites) + continue; + + Record.reserveSites(VK, NumValueSites); + for (uint32_t S = 0; S < NumValueSites; S++) { + VP_READ_ADVANCE(NumValueData); + + std::vector<InstrProfValueData> CurrentValues; + for (uint32_t V = 0; V < NumValueData; V++) { + CHECK_LINE_END(Line); + std::pair<StringRef, StringRef> VD = Line->split(':'); + uint64_t TakenCount, Value; + if (VK == IPVK_IndirectCallTarget) { + Symtab->addFuncName(VD.first); + Value = IndexedInstrProf::ComputeHash(VD.first); + } else { + READ_NUM(VD.first, Value); 
+ } + READ_NUM(VD.second, TakenCount); + CurrentValues.push_back({Value, TakenCount}); + Line++; + } + Record.addValueData(VK, S, CurrentValues.data(), NumValueData, nullptr); + } + } + return success(); + +#undef CHECK_LINE_END +#undef READ_NUM +#undef VP_READ_ADVANCE +} + std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { // Skip empty lines and comments. while (!Line.is_at_end() && (Line->empty() || Line->startswith("#"))) ++Line; // If we hit EOF while looking for a name, we're done. - if (Line.is_at_end()) + if (Line.is_at_end()) { + Symtab->finalizeSymtab(); return error(instrprof_error::eof); + } // Read the function name. Record.Name = *Line++; + Symtab->addFuncName(Record.Name); // Read the function hash. if (Line.is_at_end()) @@ -136,36 +219,14 @@ std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { Record.Counts.push_back(Count); } - return success(); -} + // Check if value profile data exists and read it if so. + if (std::error_code EC = readValueProfileData(Record)) + return EC; -template <class IntPtrT> -static uint64_t getRawMagic(); - -template <> -uint64_t getRawMagic<uint64_t>() { - return - uint64_t(255) << 56 | - uint64_t('l') << 48 | - uint64_t('p') << 40 | - uint64_t('r') << 32 | - uint64_t('o') << 24 | - uint64_t('f') << 16 | - uint64_t('r') << 8 | - uint64_t(129); -} - -template <> -uint64_t getRawMagic<uint32_t>() { - return - uint64_t(255) << 56 | - uint64_t('l') << 48 | - uint64_t('p') << 40 | - uint64_t('r') << 32 | - uint64_t('o') << 24 | - uint64_t('f') << 16 | - uint64_t('R') << 8 | - uint64_t(129); + // This is needed to avoid two pass parsing because llvm-profdata + // does dumping while reading. + Symtab->finalizeSymtab(); + return success(); } template <class IntPtrT> @@ -174,19 +235,19 @@ bool RawInstrProfReader<IntPtrT>::hasFormat(const MemoryBuffer &DataBuffer) { return false; uint64_t Magic = *reinterpret_cast<const uint64_t *>(DataBuffer.getBufferStart()); - return getRawMagic<IntPtrT>() == Magic || - sys::getSwappedBytes(getRawMagic<IntPtrT>()) == Magic; + return RawInstrProf::getMagic<IntPtrT>() == Magic || + sys::getSwappedBytes(RawInstrProf::getMagic<IntPtrT>()) == Magic; } template <class IntPtrT> std::error_code RawInstrProfReader<IntPtrT>::readHeader() { if (!hasFormat(*DataBuffer)) return error(instrprof_error::bad_magic); - if (DataBuffer->getBufferSize() < sizeof(RawHeader)) + if (DataBuffer->getBufferSize() < sizeof(RawInstrProf::Header)) return error(instrprof_error::bad_header); - auto *Header = - reinterpret_cast<const RawHeader *>(DataBuffer->getBufferStart()); - ShouldSwapBytes = Header->Magic != getRawMagic<IntPtrT>(); + auto *Header = reinterpret_cast<const RawInstrProf::Header *>( + DataBuffer->getBufferStart()); + ShouldSwapBytes = Header->Magic != RawInstrProf::getMagic<IntPtrT>(); return readHeader(*Header); } @@ -202,29 +263,38 @@ RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) { return instrprof_error::eof; // If there isn't enough space for another header, this is probably just // garbage at the end of the file. - if (CurrentPos + sizeof(RawHeader) > End) + if (CurrentPos + sizeof(RawInstrProf::Header) > End) return instrprof_error::malformed; // The writer ensures each profile is padded to start at an aligned address. if (reinterpret_cast<size_t>(CurrentPos) % alignOf<uint64_t>()) return instrprof_error::malformed; // The magic should have the same byte order as in the previous header. 
uint64_t Magic = *reinterpret_cast<const uint64_t *>(CurrentPos); - if (Magic != swap(getRawMagic<IntPtrT>())) + if (Magic != swap(RawInstrProf::getMagic<IntPtrT>())) return instrprof_error::bad_magic; // There's another profile to read, so we need to process the header. - auto *Header = reinterpret_cast<const RawHeader *>(CurrentPos); + auto *Header = reinterpret_cast<const RawInstrProf::Header *>(CurrentPos); return readHeader(*Header); } -static uint64_t getRawVersion() { - return 1; +template <class IntPtrT> +void RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) { + for (const RawInstrProf::ProfileData<IntPtrT> *I = Data; I != DataEnd; ++I) { + StringRef FunctionName(getName(I->NamePtr), swap(I->NameSize)); + Symtab.addFuncName(FunctionName); + const IntPtrT FPtr = swap(I->FunctionPointer); + if (!FPtr) + continue; + Symtab.mapAddress(FPtr, IndexedInstrProf::ComputeHash(FunctionName)); + } + Symtab.finalizeSymtab(); } template <class IntPtrT> std::error_code -RawInstrProfReader<IntPtrT>::readHeader(const RawHeader &Header) { - if (swap(Header.Version) != getRawVersion()) +RawInstrProfReader<IntPtrT>::readHeader(const RawInstrProf::Header &Header) { + if (swap(Header.Version) != RawInstrProf::Version) return error(instrprof_error::unsupported_version); CountersDelta = swap(Header.CountersDelta); @@ -232,50 +302,69 @@ RawInstrProfReader<IntPtrT>::readHeader(const RawHeader &Header) { auto DataSize = swap(Header.DataSize); auto CountersSize = swap(Header.CountersSize); auto NamesSize = swap(Header.NamesSize); + auto ValueDataSize = swap(Header.ValueDataSize); + ValueKindLast = swap(Header.ValueKindLast); + + auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>); + auto PaddingSize = getNumPaddingBytes(NamesSize); - ptrdiff_t DataOffset = sizeof(RawHeader); - ptrdiff_t CountersOffset = DataOffset + sizeof(ProfileData) * DataSize; + ptrdiff_t DataOffset = sizeof(RawInstrProf::Header); + ptrdiff_t CountersOffset = DataOffset + DataSizeInBytes; ptrdiff_t NamesOffset = CountersOffset + sizeof(uint64_t) * CountersSize; - size_t ProfileSize = NamesOffset + sizeof(char) * NamesSize; + ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; + size_t ProfileSize = ValueDataOffset + ValueDataSize; auto *Start = reinterpret_cast<const char *>(&Header); if (Start + ProfileSize > DataBuffer->getBufferEnd()) return error(instrprof_error::bad_header); - Data = reinterpret_cast<const ProfileData *>(Start + DataOffset); + Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>( + Start + DataOffset); DataEnd = Data + DataSize; CountersStart = reinterpret_cast<const uint64_t *>(Start + CountersOffset); NamesStart = Start + NamesOffset; + ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset); ProfileEnd = Start + ProfileSize; + std::unique_ptr<InstrProfSymtab> NewSymtab = make_unique<InstrProfSymtab>(); + createSymtab(*NewSymtab.get()); + Symtab = std::move(NewSymtab); return success(); } template <class IntPtrT> -std::error_code -RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) { - if (Data == DataEnd) - if (std::error_code EC = readNextHeader(ProfileEnd)) - return EC; +std::error_code RawInstrProfReader<IntPtrT>::readName(InstrProfRecord &Record) { + Record.Name = StringRef(getName(Data->NamePtr), swap(Data->NameSize)); + if (Record.Name.data() < NamesStart || + Record.Name.data() + Record.Name.size() > + reinterpret_cast<const char *>(ValueDataStart)) + return error(instrprof_error::malformed); + 
return success(); +} + +template <class IntPtrT> +std::error_code RawInstrProfReader<IntPtrT>::readFuncHash( + InstrProfRecord &Record) { + Record.Hash = swap(Data->FuncHash); + return success(); +} - // Get the raw data. - StringRef RawName(getName(Data->NamePtr), swap(Data->NameSize)); +template <class IntPtrT> +std::error_code RawInstrProfReader<IntPtrT>::readRawCounts( + InstrProfRecord &Record) { uint32_t NumCounters = swap(Data->NumCounters); + IntPtrT CounterPtr = Data->CounterPtr; if (NumCounters == 0) return error(instrprof_error::malformed); - auto RawCounts = makeArrayRef(getCounter(Data->CounterPtr), NumCounters); - // Check bounds. + auto RawCounts = makeArrayRef(getCounter(CounterPtr), NumCounters); auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart); - if (RawName.data() < NamesStart || - RawName.data() + RawName.size() > DataBuffer->getBufferEnd() || - RawCounts.data() < CountersStart || + + // Check bounds. + if (RawCounts.data() < CountersStart || RawCounts.data() + RawCounts.size() > NamesStartAsCounter) return error(instrprof_error::malformed); - // Store the data in Record, byte-swapping as necessary. - Record.Hash = swap(Data->FuncHash); - Record.Name = RawName; if (ShouldSwapBytes) { Record.Counts.clear(); Record.Counts.reserve(RawCounts.size()); @@ -284,8 +373,61 @@ RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) { } else Record.Counts = RawCounts; + return success(); +} + +template <class IntPtrT> +std::error_code +RawInstrProfReader<IntPtrT>::readValueProfilingData(InstrProfRecord &Record) { + + Record.clearValueData(); + CurValueDataSize = 0; + // Need to match the logic in value profile dumper code in compiler-rt: + uint32_t NumValueKinds = 0; + for (uint32_t I = 0; I < IPVK_Last + 1; I++) + NumValueKinds += (Data->NumValueSites[I] != 0); + + if (!NumValueKinds) + return success(); + + ErrorOr<std::unique_ptr<ValueProfData>> VDataPtrOrErr = + ValueProfData::getValueProfData(ValueDataStart, + (const unsigned char *)ProfileEnd, + getDataEndianness()); + + if (VDataPtrOrErr.getError()) + return VDataPtrOrErr.getError(); + + VDataPtrOrErr.get()->deserializeTo(Record, &Symtab->getAddrHashMap()); + CurValueDataSize = VDataPtrOrErr.get()->getSize(); + return success(); +} + +template <class IntPtrT> +std::error_code +RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) { + if (atEnd()) + if (std::error_code EC = readNextHeader(ProfileEnd)) + return EC; + + // Read name ad set it in Record. + if (std::error_code EC = readName(Record)) + return EC; + + // Read FuncHash and set it in Record. + if (std::error_code EC = readFuncHash(Record)) + return EC; + + // Read raw counts and set Record. + if (std::error_code EC = readRawCounts(Record)) + return EC; + + // Read value data and set Record. + if (std::error_code EC = readValueProfilingData(Record)) + return EC; + // Iterate. 
- ++Data; + advanceData(); return success(); } @@ -302,52 +444,112 @@ InstrProfLookupTrait::ComputeHash(StringRef K) { typedef InstrProfLookupTrait::data_type data_type; typedef InstrProfLookupTrait::offset_type offset_type; +bool InstrProfLookupTrait::readValueProfilingData( + const unsigned char *&D, const unsigned char *const End) { + ErrorOr<std::unique_ptr<ValueProfData>> VDataPtrOrErr = + ValueProfData::getValueProfData(D, End, ValueProfDataEndianness); + + if (VDataPtrOrErr.getError()) + return false; + + VDataPtrOrErr.get()->deserializeTo(DataBuffer.back(), nullptr); + D += VDataPtrOrErr.get()->TotalSize; + + return true; +} + data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, offset_type N) { - // Check if the data is corrupt. If so, don't try to read it. if (N % sizeof(uint64_t)) return data_type(); DataBuffer.clear(); - uint64_t NumCounts; - uint64_t NumEntries = N / sizeof(uint64_t); std::vector<uint64_t> CounterBuffer; - for (uint64_t I = 0; I < NumEntries; I += NumCounts) { - using namespace support; - // The function hash comes first. - uint64_t Hash = endian::readNext<uint64_t, little, unaligned>(D); - if (++I >= NumEntries) + using namespace support; + const unsigned char *End = D + N; + while (D < End) { + // Read hash. + if (D + sizeof(uint64_t) >= End) return data_type(); + uint64_t Hash = endian::readNext<uint64_t, little, unaligned>(D); - // In v1, we have at least one count. - // Later, we have the number of counts. - NumCounts = (1 == FormatVersion) - ? NumEntries - I - : endian::readNext<uint64_t, little, unaligned>(D); - if (1 != FormatVersion) - ++I; - - // If we have more counts than data, this is bogus. - if (I + NumCounts > NumEntries) + // Initialize number of counters for FormatVersion == 1. + uint64_t CountsSize = N / sizeof(uint64_t) - 1; + // If format version is different then read the number of counters. + if (FormatVersion != 1) { + if (D + sizeof(uint64_t) > End) + return data_type(); + CountsSize = endian::readNext<uint64_t, little, unaligned>(D); + } + // Read counter values. + if (D + CountsSize * sizeof(uint64_t) > End) return data_type(); CounterBuffer.clear(); - for (unsigned J = 0; J < NumCounts; ++J) + CounterBuffer.reserve(CountsSize); + for (uint64_t J = 0; J < CountsSize; ++J) CounterBuffer.push_back(endian::readNext<uint64_t, little, unaligned>(D)); - DataBuffer.push_back(InstrProfRecord(K, Hash, std::move(CounterBuffer))); + DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer)); + + // Read value profiling data. 
+ if (FormatVersion > 2 && !readValueProfilingData(D, End)) { + DataBuffer.clear(); + return data_type(); + } } return DataBuffer; } +template <typename HashTableImpl> +std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords( + StringRef FuncName, ArrayRef<InstrProfRecord> &Data) { + auto Iter = HashTable->find(FuncName); + if (Iter == HashTable->end()) + return instrprof_error::unknown_function; + + Data = (*Iter); + if (Data.empty()) + return instrprof_error::malformed; + + return instrprof_error::success; +} + +template <typename HashTableImpl> +std::error_code InstrProfReaderIndex<HashTableImpl>::getRecords( + ArrayRef<InstrProfRecord> &Data) { + if (atEnd()) + return instrprof_error::eof; + + Data = *RecordIterator; + + if (Data.empty()) + return instrprof_error::malformed; + + return instrprof_error::success; +} + +template <typename HashTableImpl> +InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex( + const unsigned char *Buckets, const unsigned char *const Payload, + const unsigned char *const Base, IndexedInstrProf::HashT HashType, + uint64_t Version) { + FormatVersion = Version; + HashTable.reset(HashTableImpl::Create( + Buckets, Payload, Base, + typename HashTableImpl::InfoType(HashType, Version))); + RecordIterator = HashTable->data_begin(); +} + bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { if (DataBuffer.getBufferSize() < 8) return false; using namespace support; uint64_t Magic = endian::read<uint64_t, little, aligned>(DataBuffer.getBufferStart()); + // Verify that it's magical. return Magic == IndexedInstrProf::Magic; } @@ -360,71 +562,91 @@ std::error_code IndexedInstrProfReader::readHeader() { using namespace support; + auto *Header = reinterpret_cast<const IndexedInstrProf::Header *>(Cur); + Cur += sizeof(IndexedInstrProf::Header); + // Check the magic number. - uint64_t Magic = endian::readNext<uint64_t, little, unaligned>(Cur); + uint64_t Magic = endian::byte_swap<uint64_t, little>(Header->Magic); if (Magic != IndexedInstrProf::Magic) return error(instrprof_error::bad_magic); // Read the version. - FormatVersion = endian::readNext<uint64_t, little, unaligned>(Cur); + uint64_t FormatVersion = endian::byte_swap<uint64_t, little>(Header->Version); if (FormatVersion > IndexedInstrProf::Version) return error(instrprof_error::unsupported_version); // Read the maximal function count. - MaxFunctionCount = endian::readNext<uint64_t, little, unaligned>(Cur); + MaxFunctionCount = + endian::byte_swap<uint64_t, little>(Header->MaxFunctionCount); // Read the hash type and start offset. IndexedInstrProf::HashT HashType = static_cast<IndexedInstrProf::HashT>( - endian::readNext<uint64_t, little, unaligned>(Cur)); + endian::byte_swap<uint64_t, little>(Header->HashType)); if (HashType > IndexedInstrProf::HashT::Last) return error(instrprof_error::unsupported_hash_type); - uint64_t HashOffset = endian::readNext<uint64_t, little, unaligned>(Cur); - // The rest of the file is an on disk hash table. - Index.reset(InstrProfReaderIndex::Create( - Start + HashOffset, Cur, Start, - InstrProfLookupTrait(HashType, FormatVersion))); - // Set up our iterator for readNextRecord. - RecordIterator = Index->data_begin(); + uint64_t HashOffset = endian::byte_swap<uint64_t, little>(Header->HashOffset); + // The rest of the file is an on disk hash table. 
+ InstrProfReaderIndexBase *IndexPtr = nullptr; + IndexPtr = new InstrProfReaderIndex<OnDiskHashTableImplV3>( + Start + HashOffset, Cur, Start, HashType, FormatVersion); + Index.reset(IndexPtr); return success(); } -std::error_code IndexedInstrProfReader::getFunctionCounts( - StringRef FuncName, uint64_t FuncHash, std::vector<uint64_t> &Counts) { - auto Iter = Index->find(FuncName); - if (Iter == Index->end()) - return error(instrprof_error::unknown_function); +InstrProfSymtab &IndexedInstrProfReader::getSymtab() { + if (Symtab.get()) + return *Symtab.get(); - // Found it. Look for counters with the right hash. - ArrayRef<InstrProfRecord> Data = (*Iter); - if (Data.empty()) - return error(instrprof_error::malformed); + std::unique_ptr<InstrProfSymtab> NewSymtab = make_unique<InstrProfSymtab>(); + Index->populateSymtab(*NewSymtab.get()); + + Symtab = std::move(NewSymtab); + return *Symtab.get(); +} +ErrorOr<InstrProfRecord> +IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, + uint64_t FuncHash) { + ArrayRef<InstrProfRecord> Data; + std::error_code EC = Index->getRecords(FuncName, Data); + if (EC != instrprof_error::success) + return EC; + // Found it. Look for counters with the right hash. for (unsigned I = 0, E = Data.size(); I < E; ++I) { // Check for a match and fill the vector if there is one. if (Data[I].Hash == FuncHash) { - Counts = Data[I].Counts; - return success(); + return std::move(Data[I]); } } return error(instrprof_error::hash_mismatch); } std::error_code -IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) { - // Are we out of records? - if (RecordIterator == Index->data_end()) - return error(instrprof_error::eof); +IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, uint64_t FuncHash, + std::vector<uint64_t> &Counts) { + ErrorOr<InstrProfRecord> Record = getInstrProfRecord(FuncName, FuncHash); + if (std::error_code EC = Record.getError()) + return EC; - if ((*RecordIterator).empty()) - return error(instrprof_error::malformed); + Counts = Record.get().Counts; + return success(); +} +std::error_code IndexedInstrProfReader::readNextRecord( + InstrProfRecord &Record) { static unsigned RecordIndex = 0; - ArrayRef<InstrProfRecord> Data = (*RecordIterator); + + ArrayRef<InstrProfRecord> Data; + + std::error_code EC = Index->getRecords(Data); + if (EC != instrprof_error::success) + return error(EC); + Record = Data[RecordIndex++]; if (RecordIndex >= Data.size()) { - ++RecordIterator; + Index->advanceToNextKey(); RecordIndex = 0; } return success(); diff --git a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp index 2188543..f522724 100644 --- a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -13,27 +13,29 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProfWriter.h" -#include "InstrProfIndexed.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/OnDiskHashTable.h" +#include <tuple> using namespace llvm; namespace { +static support::endianness ValueProfDataEndianness = support::little; + class InstrProfRecordTrait { public: typedef StringRef key_type; typedef StringRef key_type_ref; - typedef const InstrProfWriter::CounterData *const data_type; - typedef const InstrProfWriter::CounterData *const data_type_ref; + typedef const InstrProfWriter::ProfilingData *const data_type; + typedef const InstrProfWriter::ProfilingData *const 
data_type_ref; typedef uint64_t hash_value_type; typedef uint64_t offset_type; static hash_value_type ComputeHash(key_type_ref K) { - return IndexedInstrProf::ComputeHash(IndexedInstrProf::HashType, K); + return IndexedInstrProf::ComputeHash(K); } static std::pair<offset_type, offset_type> @@ -45,8 +47,15 @@ public: LE.write<offset_type>(N); offset_type M = 0; - for (const auto &Counts : *V) - M += (2 + Counts.second.size()) * sizeof(uint64_t); + for (const auto &ProfileData : *V) { + const InstrProfRecord &ProfRecord = ProfileData.second; + M += sizeof(uint64_t); // The function hash + M += sizeof(uint64_t); // The size of the Counts vector + M += ProfRecord.Counts.size() * sizeof(uint64_t); + + // Value data + M += ValueProfData::getSize(ProfileData.second); + } LE.write<offset_type>(M); return std::make_pair(N, M); @@ -60,50 +69,62 @@ public: offset_type) { using namespace llvm::support; endian::Writer<little> LE(Out); + for (const auto &ProfileData : *V) { + const InstrProfRecord &ProfRecord = ProfileData.second; - for (const auto &Counts : *V) { - LE.write<uint64_t>(Counts.first); - LE.write<uint64_t>(Counts.second.size()); - for (uint64_t I : Counts.second) + LE.write<uint64_t>(ProfileData.first); // Function hash + LE.write<uint64_t>(ProfRecord.Counts.size()); + for (uint64_t I : ProfRecord.Counts) LE.write<uint64_t>(I); + + // Write value data + std::unique_ptr<ValueProfData> VDataPtr = + ValueProfData::serializeFrom(ProfileData.second); + uint32_t S = VDataPtr->getSize(); + VDataPtr->swapBytesFromHost(ValueProfDataEndianness); + Out.write((const char *)VDataPtr.get(), S); } } }; } -std::error_code -InstrProfWriter::addFunctionCounts(StringRef FunctionName, - uint64_t FunctionHash, - ArrayRef<uint64_t> Counters) { - auto &CounterData = FunctionData[FunctionName]; +// Internal interface for testing purpose only. +void InstrProfWriter::setValueProfDataEndianness( + support::endianness Endianness) { + ValueProfDataEndianness = Endianness; +} + +std::error_code InstrProfWriter::addRecord(InstrProfRecord &&I, + uint64_t Weight) { + auto &ProfileDataMap = FunctionData[I.Name]; + + bool NewFunc; + ProfilingData::iterator Where; + std::tie(Where, NewFunc) = + ProfileDataMap.insert(std::make_pair(I.Hash, InstrProfRecord())); + InstrProfRecord &Dest = Where->second; - auto Where = CounterData.find(FunctionHash); - if (Where == CounterData.end()) { + instrprof_error Result = instrprof_error::success; + if (NewFunc) { // We've never seen a function with this name and hash, add it. - CounterData[FunctionHash] = Counters; - // We keep track of the max function count as we go for simplicity. - if (Counters[0] > MaxFunctionCount) - MaxFunctionCount = Counters[0]; - return instrprof_error::success; + Dest = std::move(I); + // Fix up the name to avoid dangling reference. + Dest.Name = FunctionData.find(Dest.Name)->getKey(); + if (Weight > 1) + Result = Dest.scale(Weight); + } else { + // We're updating a function we've seen before. + Result = Dest.merge(I, Weight); } - // We're updating a function we've seen before. - auto &FoundCounters = Where->second; - // If the number of counters doesn't match we either have bad data or a hash - // collision. 
- if (FoundCounters.size() != Counters.size()) - return instrprof_error::count_mismatch; - - for (size_t I = 0, E = Counters.size(); I < E; ++I) { - if (FoundCounters[I] + Counters[I] < FoundCounters[I]) - return instrprof_error::counter_overflow; - FoundCounters[I] += Counters[I]; - } + Dest.sortValueData(); + // We keep track of the max function count as we go for simplicity. - if (FoundCounters[0] > MaxFunctionCount) - MaxFunctionCount = FoundCounters[0]; + // Update this statistic no matter the result of the merge. + if (Dest.Counts[0] > MaxFunctionCount) + MaxFunctionCount = Dest.Counts[0]; - return instrprof_error::success; + return Result; } std::pair<uint64_t, uint64_t> InstrProfWriter::writeImpl(raw_ostream &OS) { @@ -117,13 +138,23 @@ std::pair<uint64_t, uint64_t> InstrProfWriter::writeImpl(raw_ostream &OS) { endian::Writer<little> LE(OS); // Write the header. - LE.write<uint64_t>(IndexedInstrProf::Magic); - LE.write<uint64_t>(IndexedInstrProf::Version); - LE.write<uint64_t>(MaxFunctionCount); - LE.write<uint64_t>(static_cast<uint64_t>(IndexedInstrProf::HashType)); + IndexedInstrProf::Header Header; + Header.Magic = IndexedInstrProf::Magic; + Header.Version = IndexedInstrProf::Version; + Header.MaxFunctionCount = MaxFunctionCount; + Header.HashType = static_cast<uint64_t>(IndexedInstrProf::HashType); + Header.HashOffset = 0; + int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); + + // Only write out all the fields except 'HashOffset'. We need + // to remember the offset of that field to allow back patching + // later. + for (int I = 0; I < N - 1; I++) + LE.write<uint64_t>(reinterpret_cast<uint64_t *>(&Header)[I]); // Save a space to write the hash table start location. uint64_t HashTableStartLoc = OS.tell(); + // Reserve the space for HashOffset field. LE.write<uint64_t>(0); // Write the hash table. uint64_t HashTableStart = Generator.Emit(OS); @@ -138,9 +169,65 @@ void InstrProfWriter::write(raw_fd_ostream &OS) { // Go back and fill in the hash table start. using namespace support; OS.seek(TableStart.first); + // Now patch the HashOffset field previously reserved. 
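Reserving a fixed-width slot and patching it once the payload has been emitted is the pattern at work here; a minimal sketch on a seekable stream (emitWithPatchedOffset is illustrative, and unlike the code above it restores the write position afterwards):

  #include "llvm/Support/EndianStream.h"
  #include "llvm/Support/raw_ostream.h"
  #include <cstdint>

  using namespace llvm;
  using namespace llvm::support;

  // Assumes a seekable stream, e.g. a raw_fd_ostream over a regular file.
  static void emitWithPatchedOffset(raw_fd_ostream &OS) {
    endian::Writer<little> LE(OS);
    uint64_t SlotPos = OS.tell();
    LE.write<uint64_t>(0);            // placeholder, like the reserved HashOffset
    uint64_t PayloadStart = OS.tell();
    // ... emit the variable-sized payload (the on-disk hash table) here ...
    uint64_t EndPos = OS.tell();
    OS.seek(SlotPos);
    LE.write<uint64_t>(PayloadStart); // back-patch with the real offset
    OS.seek(EndPos);                  // restore position for further writes
  }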
endian::Writer<little>(OS).write<uint64_t>(TableStart.second); } +static const char *ValueProfKindStr[] = { +#define VALUE_PROF_KIND(Enumerator, Value) #Enumerator, +#include "llvm/ProfileData/InstrProfData.inc" +}; + +void InstrProfWriter::writeRecordInText(const InstrProfRecord &Func, + InstrProfSymtab &Symtab, + raw_fd_ostream &OS) { + OS << Func.Name << "\n"; + OS << "# Func Hash:\n" << Func.Hash << "\n"; + OS << "# Num Counters:\n" << Func.Counts.size() << "\n"; + OS << "# Counter Values:\n"; + for (uint64_t Count : Func.Counts) + OS << Count << "\n"; + + uint32_t NumValueKinds = Func.getNumValueKinds(); + if (!NumValueKinds) { + OS << "\n"; + return; + } + + OS << "# Num Value Kinds:\n" << Func.getNumValueKinds() << "\n"; + for (uint32_t VK = 0; VK < IPVK_Last + 1; VK++) { + uint32_t NS = Func.getNumValueSites(VK); + if (!NS) + continue; + OS << "# ValueKind = " << ValueProfKindStr[VK] << ":\n" << VK << "\n"; + OS << "# NumValueSites:\n" << NS << "\n"; + for (uint32_t S = 0; S < NS; S++) { + uint32_t ND = Func.getNumValueDataForSite(VK, S); + OS << ND << "\n"; + std::unique_ptr<InstrProfValueData[]> VD = Func.getValueForSite(VK, S); + for (uint32_t I = 0; I < ND; I++) { + if (VK == IPVK_IndirectCallTarget) + OS << Symtab.getFuncName(VD[I].Value) << ":" << VD[I].Count << "\n"; + else + OS << VD[I].Value << ":" << VD[I].Count << "\n"; + } + } + } + + OS << "\n"; +} + +void InstrProfWriter::writeText(raw_fd_ostream &OS) { + InstrProfSymtab Symtab; + for (const auto &I : FunctionData) + Symtab.addFuncName(I.getKey()); + Symtab.finalizeSymtab(); + + for (const auto &I : FunctionData) + for (const auto &Func : I.getValue()) + writeRecordInText(Func.second, Symtab, OS); +} + std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() { std::string Data; llvm::raw_string_ostream OS(Data); diff --git a/contrib/llvm/lib/ProfileData/SampleProf.cpp b/contrib/llvm/lib/ProfileData/SampleProf.cpp index 920c48a..9ded757 100644 --- a/contrib/llvm/lib/ProfileData/SampleProf.cpp +++ b/contrib/llvm/lib/ProfileData/SampleProf.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" +using namespace llvm::sampleprof; using namespace llvm; namespace { @@ -27,17 +28,25 @@ class SampleProfErrorCategoryType : public std::error_category { case sampleprof_error::success: return "Success"; case sampleprof_error::bad_magic: - return "Invalid file format (bad magic)"; + return "Invalid sample profile data (bad magic)"; case sampleprof_error::unsupported_version: - return "Unsupported format version"; + return "Unsupported sample profile format version"; case sampleprof_error::too_large: return "Too much profile data"; case sampleprof_error::truncated: return "Truncated profile data"; case sampleprof_error::malformed: - return "Malformed profile data"; + return "Malformed sample profile data"; case sampleprof_error::unrecognized_format: - return "Unrecognized profile encoding format"; + return "Unrecognized sample profile encoding format"; + case sampleprof_error::unsupported_writing_format: + return "Profile encoding format unsupported for writing operations"; + case sampleprof_error::truncated_name_table: + return "Truncated function name table"; + case sampleprof_error::not_implemented: + return "Unimplemented feature"; + case sampleprof_error::counter_overflow: + return "Counter overflow"; } llvm_unreachable("A value of sampleprof_error has no message."); } @@ -49,3 +58,92 @@ static ManagedStatic<SampleProfErrorCategoryType> ErrorCategory; const std::error_category 
&llvm::sampleprof_category() { return *ErrorCategory; } + +void LineLocation::print(raw_ostream &OS) const { + OS << LineOffset; + if (Discriminator > 0) + OS << "." << Discriminator; +} + +raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const LineLocation &Loc) { + Loc.print(OS); + return OS; +} + +void LineLocation::dump() const { print(dbgs()); } + +void CallsiteLocation::print(raw_ostream &OS) const { + LineLocation::print(OS); + OS << ": inlined callee: " << CalleeName; +} + +void CallsiteLocation::dump() const { print(dbgs()); } + +inline raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const CallsiteLocation &Loc) { + Loc.print(OS); + return OS; +} + +/// \brief Print the sample record to the stream \p OS indented by \p Indent. +void SampleRecord::print(raw_ostream &OS, unsigned Indent) const { + OS << NumSamples; + if (hasCalls()) { + OS << ", calls:"; + for (const auto &I : getCallTargets()) + OS << " " << I.first() << ":" << I.second; + } + OS << "\n"; +} + +void SampleRecord::dump() const { print(dbgs(), 0); } + +raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const SampleRecord &Sample) { + Sample.print(OS, 0); + return OS; +} + +/// \brief Print the samples collected for a function on stream \p OS. +void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const { + OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size() + << " sampled lines\n"; + + OS.indent(Indent); + if (BodySamples.size() > 0) { + OS << "Samples collected in the function's body {\n"; + SampleSorter<LineLocation, SampleRecord> SortedBodySamples(BodySamples); + for (const auto &SI : SortedBodySamples.get()) { + OS.indent(Indent + 2); + OS << SI->first << ": " << SI->second; + } + OS.indent(Indent); + OS << "}\n"; + } else { + OS << "No samples collected in the function's body\n"; + } + + OS.indent(Indent); + if (CallsiteSamples.size() > 0) { + OS << "Samples collected in inlined callsites {\n"; + SampleSorter<CallsiteLocation, FunctionSamples> SortedCallsiteSamples( + CallsiteSamples); + for (const auto &CS : SortedCallsiteSamples.get()) { + OS.indent(Indent + 2); + OS << CS->first << ": "; + CS->second.print(OS, Indent + 4); + } + OS << "}\n"; + } else { + OS << "No inlined callsites in this function\n"; + } +} + +raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, + const FunctionSamples &FS) { + FS.print(OS); + return OS; +} + +void FunctionSamples::dump(void) const { print(dbgs(), 0); } diff --git a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp index b39bfd6..93cd87b 100644 --- a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp @@ -8,133 +8,37 @@ //===----------------------------------------------------------------------===// // // This file implements the class that reads LLVM sample profiles. It -// supports two file formats: text and binary. The textual representation -// is useful for debugging and testing purposes. The binary representation -// is more compact, resulting in smaller file sizes. However, they can -// both be used interchangeably. +// supports three file formats: text, binary and gcov. // -// NOTE: If you are making changes to the file format, please remember -// to document them in the Clang documentation at -// tools/clang/docs/UsersManual.rst. +// The textual representation is useful for debugging and testing purposes. The +// binary representation is more compact, resulting in smaller file sizes. 
// -// Text format -// ----------- +// The gcov encoding is the one generated by GCC's AutoFDO profile creation +// tool (https://github.com/google/autofdo) // -// Sample profiles are written as ASCII text. The file is divided into -// sections, which correspond to each of the functions executed at runtime. -// Each section has the following format -// -// function1:total_samples:total_head_samples -// offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ] -// offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ] -// ... -// offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ] -// -// The file may contain blank lines between sections and within a -// section. However, the spacing within a single line is fixed. Additional -// spaces will result in an error while reading the file. -// -// Function names must be mangled in order for the profile loader to -// match them in the current translation unit. The two numbers in the -// function header specify how many total samples were accumulated in the -// function (first number), and the total number of samples accumulated -// in the prologue of the function (second number). This head sample -// count provides an indicator of how frequently the function is invoked. -// -// Each sampled line may contain several items. Some are optional (marked -// below): -// -// a. Source line offset. This number represents the line number -// in the function where the sample was collected. The line number is -// always relative to the line where the symbol of the function is -// defined. So, if the function has its header at line 280, the offset -// 13 is at line 293 in the file. -// -// Note that this offset should never be a negative number. This could -// happen in cases like macros. The debug machinery will register the -// line number at the point of macro expansion. So, if the macro was -// expanded in a line before the start of the function, the profile -// converter should emit a 0 as the offset (this means that the optimizers -// will not be able to associate a meaningful weight to the instructions -// in the macro). -// -// b. [OPTIONAL] Discriminator. This is used if the sampled program -// was compiled with DWARF discriminator support -// (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators). -// DWARF discriminators are unsigned integer values that allow the -// compiler to distinguish between multiple execution paths on the -// same source line location. -// -// For example, consider the line of code ``if (cond) foo(); else bar();``. -// If the predicate ``cond`` is true 80% of the time, then the edge -// into function ``foo`` should be considered to be taken most of the -// time. But both calls to ``foo`` and ``bar`` are at the same source -// line, so a sample count at that line is not sufficient. The -// compiler needs to know which part of that line is taken more -// frequently. -// -// This is what discriminators provide. In this case, the calls to -// ``foo`` and ``bar`` will be at the same line, but will have -// different discriminator values. This allows the compiler to correctly -// set edge weights into ``foo`` and ``bar``. -// -// c. Number of samples. This is an integer quantity representing the -// number of samples collected by the profiler at this source -// location. -// -// d. [OPTIONAL] Potential call targets and samples. If present, this -// line contains a call instruction. This models both direct and -// indirect calls. Each call target is listed together with its -// number of samples. 
For example, -// -// 130: 7 foo:3 bar:2 baz:7 -// -// The above means that at relative line offset 130 there is a call -// instruction that calls one of ``foo()``, ``bar()`` and ``baz()``, -// with ``baz()`` being the relatively more frequently called target. +// All three encodings can be used interchangeably as an input sample profile. // //===----------------------------------------------------------------------===// #include "llvm/ProfileData/SampleProfReader.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Regex.h" using namespace llvm::sampleprof; using namespace llvm; -/// \brief Print the samples collected for a function on stream \p OS. -/// -/// \param OS Stream to emit the output to. -void FunctionSamples::print(raw_ostream &OS) { - OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size() - << " sampled lines\n"; - for (const auto &SI : BodySamples) { - LineLocation Loc = SI.first; - const SampleRecord &Sample = SI.second; - OS << "\tline offset: " << Loc.LineOffset - << ", discriminator: " << Loc.Discriminator - << ", number of samples: " << Sample.getSamples(); - if (Sample.hasCalls()) { - OS << ", calls:"; - for (const auto &I : Sample.getCallTargets()) - OS << " " << I.first() << ":" << I.second; - } - OS << "\n"; - } - OS << "\n"; -} - /// \brief Dump the function profile for \p FName. /// /// \param FName Name of the function to print. /// \param OS Stream to emit the output to. void SampleProfileReader::dumpFunctionProfile(StringRef FName, raw_ostream &OS) { - OS << "Function: " << FName << ": "; - Profiles[FName].print(OS); + OS << "Function: " << FName << ": " << Profiles[FName]; } /// \brief Dump all the function profiles found on stream \p OS. @@ -143,6 +47,102 @@ void SampleProfileReader::dump(raw_ostream &OS) { dumpFunctionProfile(I.getKey(), OS); } +/// \brief Parse \p Input as function head. +/// +/// Parse one line of \p Input, and update function name in \p FName, +/// function's total sample count in \p NumSamples, function's entry +/// count in \p NumHeadSamples. +/// +/// \returns true if parsing is successful. +static bool ParseHead(const StringRef &Input, StringRef &FName, + uint64_t &NumSamples, uint64_t &NumHeadSamples) { + if (Input[0] == ' ') + return false; + size_t n2 = Input.rfind(':'); + size_t n1 = Input.rfind(':', n2 - 1); + FName = Input.substr(0, n1); + if (Input.substr(n1 + 1, n2 - n1 - 1).getAsInteger(10, NumSamples)) + return false; + if (Input.substr(n2 + 1).getAsInteger(10, NumHeadSamples)) + return false; + return true; +} + + +/// \brief Returns true if line offset \p L is legal (only has 16 bits). +static bool isOffsetLegal(unsigned L) { + return (L & 0xffff) == L; +} + +/// \brief Parse \p Input as line sample. +/// +/// \param Input input line. +/// \param IsCallsite true if the line represents an inlined callsite. +/// \param Depth the depth of the inline stack. +/// \param NumSamples total samples of the line/inlined callsite. +/// \param LineOffset line offset to the start of the function. +/// \param Discriminator discriminator of the line. +/// \param TargetCountMap map from indirect call target to count. +/// +/// returns true if parsing is successful. 
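Concretely, the two helpers split an illustrative profile (function names invented) like this:

  main:184019:0
   4: 534
   6: 2080 _Z3fooi:1471 _Z3bari:609
   9: inline1:1000
    1: 1000

ParseHead on the unindented header yields FName = "main", NumSamples = 184019, NumHeadSamples = 0. ParseLine on " 6: 2080 _Z3fooi:1471 _Z3bari:609" yields Depth = 1, LineOffset = 6, Discriminator = 0, NumSamples = 2080 and TargetCountMap = {_Z3fooi: 1471, _Z3bari: 609}; on " 9: inline1:1000" it sets IsCallsite = true with CalleeName = "inline1", and the doubly indented "  1: 1000" (Depth = 2) then lands in the inlined callee's body samples.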
+static bool ParseLine(const StringRef &Input, bool &IsCallsite, uint32_t &Depth, + uint64_t &NumSamples, uint32_t &LineOffset, + uint32_t &Discriminator, StringRef &CalleeName, + DenseMap<StringRef, uint64_t> &TargetCountMap) { + for (Depth = 0; Input[Depth] == ' '; Depth++) + ; + if (Depth == 0) + return false; + + size_t n1 = Input.find(':'); + StringRef Loc = Input.substr(Depth, n1 - Depth); + size_t n2 = Loc.find('.'); + if (n2 == StringRef::npos) { + if (Loc.getAsInteger(10, LineOffset) || !isOffsetLegal(LineOffset)) + return false; + Discriminator = 0; + } else { + if (Loc.substr(0, n2).getAsInteger(10, LineOffset)) + return false; + if (Loc.substr(n2 + 1).getAsInteger(10, Discriminator)) + return false; + } + + StringRef Rest = Input.substr(n1 + 2); + if (Rest[0] >= '0' && Rest[0] <= '9') { + IsCallsite = false; + size_t n3 = Rest.find(' '); + if (n3 == StringRef::npos) { + if (Rest.getAsInteger(10, NumSamples)) + return false; + } else { + if (Rest.substr(0, n3).getAsInteger(10, NumSamples)) + return false; + } + while (n3 != StringRef::npos) { + n3 += Rest.substr(n3).find_first_not_of(' '); + Rest = Rest.substr(n3); + n3 = Rest.find(' '); + StringRef pair = Rest; + if (n3 != StringRef::npos) { + pair = Rest.substr(0, n3); + } + size_t n4 = pair.find(':'); + uint64_t count; + if (pair.substr(n4 + 1).getAsInteger(10, count)) + return false; + TargetCountMap[pair.substr(0, n4)] = count; + } + } else { + IsCallsite = true; + size_t n3 = Rest.find_last_of(':'); + CalleeName = Rest.substr(0, n3); + if (Rest.substr(n3 + 1).getAsInteger(10, NumSamples)) + return false; + } + return true; +} + /// \brief Load samples from a text file. /// /// See the documentation at the top of the file for an explanation of @@ -151,14 +151,13 @@ void SampleProfileReader::dump(raw_ostream &OS) { /// \returns true if the file was loaded successfully, false otherwise. std::error_code SampleProfileReaderText::read() { line_iterator LineIt(*Buffer, /*SkipBlanks=*/true, '#'); + sampleprof_error Result = sampleprof_error::success; + + InlineCallStack InlineStack; - // Read the profile of each function. Since each function may be - // mentioned more than once, and we are collecting flat profiles, - // accumulate samples as we parse them. - Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$"); - Regex LineSampleRE("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$"); - Regex CallSampleRE(" +([^0-9 ][^ ]*):([0-9]+)"); - while (!LineIt.is_at_eof()) { + for (; !LineIt.is_at_eof(); ++LineIt) { + if ((*LineIt)[(*LineIt).find_first_not_of(' ')] == '#') + continue; // Read the header of each function. // // Note that for function identifiers we are actually expecting @@ -171,63 +170,74 @@ std::error_code SampleProfileReaderText::read() { // // The only requirement we place on the identifier, then, is that it // should not begin with a number. - SmallVector<StringRef, 4> Matches; - if (!HeadRE.match(*LineIt, &Matches)) { - reportParseError(LineIt.line_number(), - "Expected 'mangled_name:NUM:NUM', found " + *LineIt); - return sampleprof_error::malformed; - } - assert(Matches.size() == 4); - StringRef FName = Matches[1]; - unsigned NumSamples, NumHeadSamples; - Matches[2].getAsInteger(10, NumSamples); - Matches[3].getAsInteger(10, NumHeadSamples); - Profiles[FName] = FunctionSamples(); - FunctionSamples &FProfile = Profiles[FName]; - FProfile.addTotalSamples(NumSamples); - FProfile.addHeadSamples(NumHeadSamples); - ++LineIt; - - // Now read the body. 
The body of the function ends when we reach - // EOF or when we see the start of the next function. - while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) { - if (!LineSampleRE.match(*LineIt, &Matches)) { - reportParseError( - LineIt.line_number(), - "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt); + if ((*LineIt)[0] != ' ') { + uint64_t NumSamples, NumHeadSamples; + StringRef FName; + if (!ParseHead(*LineIt, FName, NumSamples, NumHeadSamples)) { + reportError(LineIt.line_number(), + "Expected 'mangled_name:NUM:NUM', found " + *LineIt); + return sampleprof_error::malformed; + } + Profiles[FName] = FunctionSamples(); + FunctionSamples &FProfile = Profiles[FName]; + MergeResult(Result, FProfile.addTotalSamples(NumSamples)); + MergeResult(Result, FProfile.addHeadSamples(NumHeadSamples)); + InlineStack.clear(); + InlineStack.push_back(&FProfile); + } else { + uint64_t NumSamples; + StringRef FName; + DenseMap<StringRef, uint64_t> TargetCountMap; + bool IsCallsite; + uint32_t Depth, LineOffset, Discriminator; + if (!ParseLine(*LineIt, IsCallsite, Depth, NumSamples, LineOffset, + Discriminator, FName, TargetCountMap)) { + reportError(LineIt.line_number(), + "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + + *LineIt); return sampleprof_error::malformed; } - assert(Matches.size() == 5); - unsigned LineOffset, NumSamples, Discriminator = 0; - Matches[1].getAsInteger(10, LineOffset); - if (Matches[2] != "") - Matches[2].getAsInteger(10, Discriminator); - Matches[3].getAsInteger(10, NumSamples); - - // If there are function calls in this line, generate a call sample - // entry for each call. - std::string CallsLine(Matches[4]); - while (CallsLine != "") { - SmallVector<StringRef, 3> CallSample; - if (!CallSampleRE.match(CallsLine, &CallSample)) { - reportParseError(LineIt.line_number(), - "Expected 'mangled_name:NUM', found " + CallsLine); - return sampleprof_error::malformed; + if (IsCallsite) { + while (InlineStack.size() > Depth) { + InlineStack.pop_back(); } - StringRef CalledFunction = CallSample[1]; - unsigned CalledFunctionSamples; - CallSample[2].getAsInteger(10, CalledFunctionSamples); - FProfile.addCalledTargetSamples(LineOffset, Discriminator, - CalledFunction, CalledFunctionSamples); - CallsLine = CallSampleRE.sub("", CallsLine); + FunctionSamples &FSamples = InlineStack.back()->functionSamplesAt( + CallsiteLocation(LineOffset, Discriminator, FName)); + MergeResult(Result, FSamples.addTotalSamples(NumSamples)); + InlineStack.push_back(&FSamples); + } else { + while (InlineStack.size() > Depth) { + InlineStack.pop_back(); + } + FunctionSamples &FProfile = *InlineStack.back(); + for (const auto &name_count : TargetCountMap) { + MergeResult(Result, FProfile.addCalledTargetSamples( + LineOffset, Discriminator, name_count.first, + name_count.second)); + } + MergeResult(Result, FProfile.addBodySamples(LineOffset, Discriminator, + NumSamples)); } + } + } - FProfile.addBodySamples(LineOffset, Discriminator, NumSamples); - ++LineIt; + return Result; +} + +bool SampleProfileReaderText::hasFormat(const MemoryBuffer &Buffer) { + bool result = false; + + // Check that the first non-comment line is a valid function header. 
+ line_iterator LineIt(Buffer, /*SkipBlanks=*/true, '#'); + if (!LineIt.is_at_eof()) { + if ((*LineIt)[0] != ' ') { + uint64_t NumSamples, NumHeadSamples; + StringRef FName; + result = ParseHead(*LineIt, FName, NumSamples, NumHeadSamples); } } - return sampleprof_error::success; + return result; } template <typename T> ErrorOr<T> SampleProfileReaderBinary::readNumber() { @@ -243,7 +253,7 @@ template <typename T> ErrorOr<T> SampleProfileReaderBinary::readNumber() { EC = sampleprof_error::success; if (EC) { - reportParseError(0, EC.message()); + reportError(0, EC.message()); return EC; } @@ -256,7 +266,7 @@ ErrorOr<StringRef> SampleProfileReaderBinary::readString() { StringRef Str(reinterpret_cast<const char *>(Data)); if (Data + Str.size() + 1 > End) { EC = sampleprof_error::truncated; - reportParseError(0, EC.message()); + reportError(0, EC.message()); return EC; } @@ -264,62 +274,109 @@ ErrorOr<StringRef> SampleProfileReaderBinary::readString() { return Str; } -std::error_code SampleProfileReaderBinary::read() { - while (!at_eof()) { - auto FName(readString()); - if (std::error_code EC = FName.getError()) +ErrorOr<StringRef> SampleProfileReaderBinary::readStringFromTable() { + std::error_code EC; + auto Idx = readNumber<uint32_t>(); + if (std::error_code EC = Idx.getError()) + return EC; + if (*Idx >= NameTable.size()) + return sampleprof_error::truncated_name_table; + return NameTable[*Idx]; +} + +std::error_code +SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) { + auto NumSamples = readNumber<uint64_t>(); + if (std::error_code EC = NumSamples.getError()) + return EC; + FProfile.addTotalSamples(*NumSamples); + + // Read the samples in the body. + auto NumRecords = readNumber<uint32_t>(); + if (std::error_code EC = NumRecords.getError()) + return EC; + + for (uint32_t I = 0; I < *NumRecords; ++I) { + auto LineOffset = readNumber<uint64_t>(); + if (std::error_code EC = LineOffset.getError()) return EC; - Profiles[*FName] = FunctionSamples(); - FunctionSamples &FProfile = Profiles[*FName]; + if (!isOffsetLegal(*LineOffset)) { + return std::error_code(); + } - auto Val = readNumber<unsigned>(); - if (std::error_code EC = Val.getError()) + auto Discriminator = readNumber<uint64_t>(); + if (std::error_code EC = Discriminator.getError()) return EC; - FProfile.addTotalSamples(*Val); - Val = readNumber<unsigned>(); - if (std::error_code EC = Val.getError()) + auto NumSamples = readNumber<uint64_t>(); + if (std::error_code EC = NumSamples.getError()) return EC; - FProfile.addHeadSamples(*Val); - // Read the samples in the body. 
- auto NumRecords = readNumber<unsigned>(); - if (std::error_code EC = NumRecords.getError()) + auto NumCalls = readNumber<uint32_t>(); + if (std::error_code EC = NumCalls.getError()) return EC; - for (unsigned I = 0; I < *NumRecords; ++I) { - auto LineOffset = readNumber<uint64_t>(); - if (std::error_code EC = LineOffset.getError()) - return EC; - auto Discriminator = readNumber<uint64_t>(); - if (std::error_code EC = Discriminator.getError()) + for (uint32_t J = 0; J < *NumCalls; ++J) { + auto CalledFunction(readStringFromTable()); + if (std::error_code EC = CalledFunction.getError()) return EC; - auto NumSamples = readNumber<uint64_t>(); - if (std::error_code EC = NumSamples.getError()) + auto CalledFunctionSamples = readNumber<uint64_t>(); + if (std::error_code EC = CalledFunctionSamples.getError()) return EC; - auto NumCalls = readNumber<unsigned>(); - if (std::error_code EC = NumCalls.getError()) - return EC; + FProfile.addCalledTargetSamples(*LineOffset, *Discriminator, + *CalledFunction, *CalledFunctionSamples); + } + + FProfile.addBodySamples(*LineOffset, *Discriminator, *NumSamples); + } - for (unsigned J = 0; J < *NumCalls; ++J) { - auto CalledFunction(readString()); - if (std::error_code EC = CalledFunction.getError()) - return EC; + // Read all the samples for inlined function calls. + auto NumCallsites = readNumber<uint32_t>(); + if (std::error_code EC = NumCallsites.getError()) + return EC; - auto CalledFunctionSamples = readNumber<uint64_t>(); - if (std::error_code EC = CalledFunctionSamples.getError()) - return EC; + for (uint32_t J = 0; J < *NumCallsites; ++J) { + auto LineOffset = readNumber<uint64_t>(); + if (std::error_code EC = LineOffset.getError()) + return EC; - FProfile.addCalledTargetSamples(*LineOffset, *Discriminator, - *CalledFunction, - *CalledFunctionSamples); - } + auto Discriminator = readNumber<uint64_t>(); + if (std::error_code EC = Discriminator.getError()) + return EC; - FProfile.addBodySamples(*LineOffset, *Discriminator, *NumSamples); - } + auto FName(readStringFromTable()); + if (std::error_code EC = FName.getError()) + return EC; + + FunctionSamples &CalleeProfile = FProfile.functionSamplesAt( + CallsiteLocation(*LineOffset, *Discriminator, *FName)); + if (std::error_code EC = readProfile(CalleeProfile)) + return EC; + } + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderBinary::read() { + while (!at_eof()) { + auto NumHeadSamples = readNumber<uint64_t>(); + if (std::error_code EC = NumHeadSamples.getError()) + return EC; + + auto FName(readStringFromTable()); + if (std::error_code EC = FName.getError()) + return EC; + + Profiles[*FName] = FunctionSamples(); + FunctionSamples &FProfile = Profiles[*FName]; + + FProfile.addHeadSamples(*NumHeadSamples); + + if (std::error_code EC = readProfile(FProfile)) + return EC; } return sampleprof_error::success; @@ -343,6 +400,18 @@ std::error_code SampleProfileReaderBinary::readHeader() { else if (*Version != SPVersion()) return sampleprof_error::unsupported_version; + // Read the name table. 
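The name table the comment announces exists to cut string duplication: every function and call-target name is stored once, NUL-terminated, and referenced everywhere else by a ULEB128 index. A sketch of both sides of that contract (helper names illustrative; the writer half mirrors SampleProfileWriterBinary later in this diff):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/LEB128.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Writer side: count, then each name followed by a NUL (a ULEB128 zero).
  static void writeNameTable(raw_ostream &OS, ArrayRef<StringRef> Names) {
    encodeULEB128(Names.size(), OS);
    for (StringRef N : Names) {
      OS << N;
      encodeULEB128(0, OS);
    }
  }

  // Reader side: later name references are just indices into the table;
  // an out-of-range index is diagnosed as a truncated name table.
  static bool resolveNameIdx(ArrayRef<StringRef> Table, uint64_t Idx,
                             StringRef &Out) {
    if (Idx >= Table.size())
      return false;
    Out = Table[Idx];
    return true;
  }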
+ auto Size = readNumber<uint32_t>(); + if (std::error_code EC = Size.getError()) + return EC; + NameTable.reserve(*Size); + for (uint32_t I = 0; I < *Size; ++I) { + auto Name(readString()); + if (std::error_code EC = Name.getError()) + return EC; + NameTable.push_back(*Name); + } + return sampleprof_error::success; } @@ -353,6 +422,249 @@ bool SampleProfileReaderBinary::hasFormat(const MemoryBuffer &Buffer) { return Magic == SPMagic(); } +std::error_code SampleProfileReaderGCC::skipNextWord() { + uint32_t dummy; + if (!GcovBuffer.readInt(dummy)) + return sampleprof_error::truncated; + return sampleprof_error::success; +} + +template <typename T> ErrorOr<T> SampleProfileReaderGCC::readNumber() { + if (sizeof(T) <= sizeof(uint32_t)) { + uint32_t Val; + if (GcovBuffer.readInt(Val) && Val <= std::numeric_limits<T>::max()) + return static_cast<T>(Val); + } else if (sizeof(T) <= sizeof(uint64_t)) { + uint64_t Val; + if (GcovBuffer.readInt64(Val) && Val <= std::numeric_limits<T>::max()) + return static_cast<T>(Val); + } + + std::error_code EC = sampleprof_error::malformed; + reportError(0, EC.message()); + return EC; +} + +ErrorOr<StringRef> SampleProfileReaderGCC::readString() { + StringRef Str; + if (!GcovBuffer.readString(Str)) + return sampleprof_error::truncated; + return Str; +} + +std::error_code SampleProfileReaderGCC::readHeader() { + // Read the magic identifier. + if (!GcovBuffer.readGCDAFormat()) + return sampleprof_error::unrecognized_format; + + // Read the version number. Note - the GCC reader does not validate this + // version, but the profile creator generates v704. + GCOV::GCOVVersion version; + if (!GcovBuffer.readGCOVVersion(version)) + return sampleprof_error::unrecognized_format; + + if (version != GCOV::V704) + return sampleprof_error::unsupported_version; + + // Skip the empty integer. 
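This header is also what routes a buffer to the GCC reader in the first place: hasFormat(), further down, compares the opening bytes against the GCDA magic concatenated with the version word. Roughly (the explicit size guard is an addition in this sketch; the in-tree check leans on MemoryBuffer's NUL terminator):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/MemoryBuffer.h"

  using namespace llvm;

  // "adcg" is the GCDA container magic, "*704" the version word; together
  // they form an eight-byte signature for AutoFDO/GCC sample profiles.
  static bool looksLikeAutoFDO(const MemoryBuffer &B) {
    return B.getBufferSize() >= 8 &&
           StringRef(B.getBufferStart(), 8) == "adcg*704";
  }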
+ if (std::error_code EC = skipNextWord()) + return EC; + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readSectionTag(uint32_t Expected) { + uint32_t Tag; + if (!GcovBuffer.readInt(Tag)) + return sampleprof_error::truncated; + + if (Tag != Expected) + return sampleprof_error::malformed; + + if (std::error_code EC = skipNextWord()) + return EC; + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readNameTable() { + if (std::error_code EC = readSectionTag(GCOVTagAFDOFileNames)) + return EC; + + uint32_t Size; + if (!GcovBuffer.readInt(Size)) + return sampleprof_error::truncated; + + for (uint32_t I = 0; I < Size; ++I) { + StringRef Str; + if (!GcovBuffer.readString(Str)) + return sampleprof_error::truncated; + Names.push_back(Str); + } + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readFunctionProfiles() { + if (std::error_code EC = readSectionTag(GCOVTagAFDOFunction)) + return EC; + + uint32_t NumFunctions; + if (!GcovBuffer.readInt(NumFunctions)) + return sampleprof_error::truncated; + + InlineCallStack Stack; + for (uint32_t I = 0; I < NumFunctions; ++I) + if (std::error_code EC = readOneFunctionProfile(Stack, true, 0)) + return EC; + + return sampleprof_error::success; +} + +std::error_code SampleProfileReaderGCC::readOneFunctionProfile( + const InlineCallStack &InlineStack, bool Update, uint32_t Offset) { + uint64_t HeadCount = 0; + if (InlineStack.size() == 0) + if (!GcovBuffer.readInt64(HeadCount)) + return sampleprof_error::truncated; + + uint32_t NameIdx; + if (!GcovBuffer.readInt(NameIdx)) + return sampleprof_error::truncated; + + StringRef Name(Names[NameIdx]); + + uint32_t NumPosCounts; + if (!GcovBuffer.readInt(NumPosCounts)) + return sampleprof_error::truncated; + + uint32_t NumCallsites; + if (!GcovBuffer.readInt(NumCallsites)) + return sampleprof_error::truncated; + + FunctionSamples *FProfile = nullptr; + if (InlineStack.size() == 0) { + // If this is a top function that we have already processed, do not + // update its profile again. This happens in the presence of + // function aliases. Since these aliases share the same function + // body, there will be identical replicated profiles for the + // original function. In this case, we simply do not bother updating + // the profile of the original function. + FProfile = &Profiles[Name]; + FProfile->addHeadSamples(HeadCount); + if (FProfile->getTotalSamples() > 0) + Update = false; + } else { + // Otherwise, we are reading an inlined instance. The top of the + // inline stack contains the profile of the caller. Insert this + // callee in the caller's CallsiteMap. + FunctionSamples *CallerProfile = InlineStack.front(); + uint32_t LineOffset = Offset >> 16; + uint32_t Discriminator = Offset & 0xffff; + FProfile = &CallerProfile->functionSamplesAt( + CallsiteLocation(LineOffset, Discriminator, Name)); + } + + for (uint32_t I = 0; I < NumPosCounts; ++I) { + uint32_t Offset; + if (!GcovBuffer.readInt(Offset)) + return sampleprof_error::truncated; + + uint32_t NumTargets; + if (!GcovBuffer.readInt(NumTargets)) + return sampleprof_error::truncated; + + uint64_t Count; + if (!GcovBuffer.readInt64(Count)) + return sampleprof_error::truncated; + + // The line location is encoded in the offset as: + // high 16 bits: line offset to the start of the function. + // low 16 bits: discriminator. 
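That packed encoding recurs at every callsite record, so it is worth seeing it round-trip; a worked example with illustrative helper names:

  #include <cstdint>

  // One 32-bit word carries a source position: line offset from the
  // function start in the high half, DWARF discriminator in the low half.
  static inline uint32_t packLocation(uint32_t LineOffset, uint32_t Disc) {
    return (LineOffset << 16) | (Disc & 0xffff);
  }
  static inline void unpackLocation(uint32_t Offset, uint32_t &LineOffset,
                                    uint32_t &Disc) {
    LineOffset = Offset >> 16;
    Disc = Offset & 0xffff;
  }
  // e.g. packLocation(9, 2) == 0x00090002, and unpacking restores (9, 2).

The 16-bit budget for the line offset is also why the text reader's isOffsetLegal() rejects anything wider.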
+ uint32_t LineOffset = Offset >> 16; + uint32_t Discriminator = Offset & 0xffff; + + InlineCallStack NewStack; + NewStack.push_back(FProfile); + NewStack.insert(NewStack.end(), InlineStack.begin(), InlineStack.end()); + if (Update) { + // Walk up the inline stack, adding the samples on this line to + // the total sample count of the callers in the chain. + for (auto CallerProfile : NewStack) + CallerProfile->addTotalSamples(Count); + + // Update the body samples for the current profile. + FProfile->addBodySamples(LineOffset, Discriminator, Count); + } + + // Process the list of functions called at an indirect call site. + // These are all the targets that a function pointer (or virtual + // function) resolved at runtime. + for (uint32_t J = 0; J < NumTargets; J++) { + uint32_t HistVal; + if (!GcovBuffer.readInt(HistVal)) + return sampleprof_error::truncated; + + if (HistVal != HIST_TYPE_INDIR_CALL_TOPN) + return sampleprof_error::malformed; + + uint64_t TargetIdx; + if (!GcovBuffer.readInt64(TargetIdx)) + return sampleprof_error::truncated; + StringRef TargetName(Names[TargetIdx]); + + uint64_t TargetCount; + if (!GcovBuffer.readInt64(TargetCount)) + return sampleprof_error::truncated; + + if (Update) { + FunctionSamples &TargetProfile = Profiles[TargetName]; + TargetProfile.addCalledTargetSamples(LineOffset, Discriminator, + TargetName, TargetCount); + } + } + } + + // Process all the inlined callers into the current function. These + // are all the callsites that were inlined into this function. + for (uint32_t I = 0; I < NumCallsites; I++) { + // The offset is encoded as: + // high 16 bits: line offset to the start of the function. + // low 16 bits: discriminator. + uint32_t Offset; + if (!GcovBuffer.readInt(Offset)) + return sampleprof_error::truncated; + InlineCallStack NewStack; + NewStack.push_back(FProfile); + NewStack.insert(NewStack.end(), InlineStack.begin(), InlineStack.end()); + if (std::error_code EC = readOneFunctionProfile(NewStack, Update, Offset)) + return EC; + } + + return sampleprof_error::success; +} + +/// \brief Read a GCC AutoFDO profile. +/// +/// This format is generated by the Linux Perf conversion tool at +/// https://github.com/google/autofdo. +std::error_code SampleProfileReaderGCC::read() { + // Read the string table. + if (std::error_code EC = readNameTable()) + return EC; + + // Read the source profile. + if (std::error_code EC = readFunctionProfiles()) + return EC; + + return sampleprof_error::success; +} + +bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) { + StringRef Magic(reinterpret_cast<const char *>(Buffer.getBufferStart())); + return Magic == "adcg*704"; +} + /// \brief Prepare a memory buffer for the contents of \p Filename. /// /// \returns an error code indicating the status of the buffer. @@ -364,7 +676,7 @@ setupMemoryBuffer(std::string Filename) { auto Buffer = std::move(BufferOrErr.get()); // Sanity check the file. - if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max()) + if (Buffer->getBufferSize() > std::numeric_limits<uint32_t>::max()) return sampleprof_error::too_large; return std::move(Buffer); @@ -384,13 +696,29 @@ SampleProfileReader::create(StringRef Filename, LLVMContext &C) { auto BufferOrError = setupMemoryBuffer(Filename); if (std::error_code EC = BufferOrError.getError()) return EC; + return create(BufferOrError.get(), C); +} - auto Buffer = std::move(BufferOrError.get()); +/// \brief Create a sample profile reader based on the format of the input data. 
+/// +/// \param B The memory buffer to create the reader from (assumes ownership). +/// +/// \param Reader The reader to instantiate according to \p B's format. +/// +/// \param C The LLVM context to use to emit diagnostics. +/// +/// \returns an error code indicating the status of the created reader. +ErrorOr<std::unique_ptr<SampleProfileReader>> +SampleProfileReader::create(std::unique_ptr<MemoryBuffer> &B, LLVMContext &C) { std::unique_ptr<SampleProfileReader> Reader; - if (SampleProfileReaderBinary::hasFormat(*Buffer)) - Reader.reset(new SampleProfileReaderBinary(std::move(Buffer), C)); + if (SampleProfileReaderBinary::hasFormat(*B)) + Reader.reset(new SampleProfileReaderBinary(std::move(B), C)); + else if (SampleProfileReaderGCC::hasFormat(*B)) + Reader.reset(new SampleProfileReaderGCC(std::move(B), C)); + else if (SampleProfileReaderText::hasFormat(*B)) + Reader.reset(new SampleProfileReaderText(std::move(B), C)); else - Reader.reset(new SampleProfileReaderText(std::move(Buffer), C)); + return sampleprof_error::unrecognized_format; if (std::error_code EC = Reader->readHeader()) return EC; diff --git a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp index c95267a..51feee5 100644 --- a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -30,16 +30,27 @@ using namespace llvm::sampleprof; using namespace llvm; /// \brief Write samples to a text file. -bool SampleProfileWriterText::write(StringRef FName, const FunctionSamples &S) { - if (S.empty()) - return true; - - OS << FName << ":" << S.getTotalSamples() << ":" << S.getHeadSamples() - << "\n"; - - for (const auto &I : S.getBodySamples()) { - LineLocation Loc = I.first; - const SampleRecord &Sample = I.second; +/// +/// Note: it may be tempting to implement this in terms of +/// FunctionSamples::print(). Please don't. The dump functionality is intended +/// for debugging and has no specified form. +/// +/// The format used here is more structured and deliberate because +/// it needs to be parsed by the SampleProfileReaderText class. +std::error_code SampleProfileWriterText::write(StringRef FName, + const FunctionSamples &S) { + auto &OS = *OutputStream; + + OS << FName << ":" << S.getTotalSamples(); + if (Indent == 0) + OS << ":" << S.getHeadSamples(); + OS << "\n"; + + SampleSorter<LineLocation, SampleRecord> SortedSamples(S.getBodySamples()); + for (const auto &I : SortedSamples.get()) { + LineLocation Loc = I->first; + const SampleRecord &Sample = I->second; + OS.indent(Indent + 1); if (Loc.Discriminator == 0) OS << Loc.LineOffset << ": "; else OS << Loc.LineOffset << "." 
<< Loc.Discriminator << ": "; + if (std::error_code EC = write(Loc.CalleeName, CalleeSamples)) + return EC; + } + Indent -= 1; + + return sampleprof_error::success; } -SampleProfileWriterBinary::SampleProfileWriterBinary(StringRef F, - std::error_code &EC) - : SampleProfileWriter(F, EC, sys::fs::F_None) { - if (EC) - return; +std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) { + const auto &ret = NameTable.find(FName); + if (ret == NameTable.end()) + return sampleprof_error::truncated_name_table; + encodeULEB128(ret->second, *OutputStream); + return sampleprof_error::success; +} - // Write the file header. +void SampleProfileWriterBinary::addName(StringRef FName) { + auto NextIdx = NameTable.size(); + NameTable.insert(std::make_pair(FName, NextIdx)); +} + +void SampleProfileWriterBinary::addNames(const FunctionSamples &S) { + // Add all the names in indirect call targets. + for (const auto &I : S.getBodySamples()) { + const SampleRecord &Sample = I.second; + for (const auto &J : Sample.getCallTargets()) + addName(J.first()); + } + + // Recursively add all the names for inlined callsites. + for (const auto &J : S.getCallsiteSamples()) { + CallsiteLocation Loc = J.first; + const FunctionSamples &CalleeSamples = J.second; + addName(Loc.CalleeName); + addNames(CalleeSamples); + } +} + +std::error_code SampleProfileWriterBinary::writeHeader( + const StringMap<FunctionSamples> &ProfileMap) { + auto &OS = *OutputStream; + + // Write file magic identifier. encodeULEB128(SPMagic(), OS); encodeULEB128(SPVersion(), OS); + + // Generate the name table for all the functions referenced in the profile. + for (const auto &I : ProfileMap) { + addName(I.first()); + addNames(I.second); + } + + // Write out the name table. + encodeULEB128(NameTable.size(), OS); + for (auto N : NameTable) { + OS << N.first; + encodeULEB128(0, OS); + } + + return sampleprof_error::success; } -/// \brief Write samples to a binary file. -/// -/// \returns true if the samples were written successfully, false otherwise. -bool SampleProfileWriterBinary::write(StringRef FName, - const FunctionSamples &S) { - if (S.empty()) - return true; +std::error_code SampleProfileWriterBinary::writeBody(StringRef FName, + const FunctionSamples &S) { + auto &OS = *OutputStream; + + if (std::error_code EC = writeNameIdx(FName)) + return EC; - OS << FName; - encodeULEB128(0, OS); encodeULEB128(S.getTotalSamples(), OS); - encodeULEB128(S.getHeadSamples(), OS); + + // Emit all the body samples. encodeULEB128(S.getBodySamples().size(), OS); for (const auto &I : S.getBodySamples()) { LineLocation Loc = I.first; @@ -87,18 +155,38 @@ bool SampleProfileWriterBinary::write(StringRef FName, encodeULEB128(Sample.getSamples(), OS); encodeULEB128(Sample.getCallTargets().size(), OS); for (const auto &J : Sample.getCallTargets()) { - std::string Callee = J.first(); - unsigned CalleeSamples = J.second; - OS << Callee; - encodeULEB128(0, OS); + StringRef Callee = J.first(); + uint64_t CalleeSamples = J.second; + if (std::error_code EC = writeNameIdx(Callee)) + return EC; encodeULEB128(CalleeSamples, OS); } } - return true; + // Recursively emit all the callsite samples. 
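Paired with the reader's readProfile() earlier in this diff, the writer fixes a recursive on-disk grammar; reconstructed from the two routines, with every scalar ULEB128-encoded (field names here are descriptive only):

  top_level_function := head_samples function_body
  function_body :=
    name_idx               ; index into the name table
    total_samples
    num_body_records
    num_body_records x { line_offset discriminator num_samples
                         num_call_targets x { target_name_idx target_count } }
    num_callsites
    num_callsites x { line_offset discriminator function_body }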
+ encodeULEB128(S.getCallsiteSamples().size(), OS); + for (const auto &J : S.getCallsiteSamples()) { + CallsiteLocation Loc = J.first; + const FunctionSamples &CalleeSamples = J.second; + encodeULEB128(Loc.LineOffset, OS); + encodeULEB128(Loc.Discriminator, OS); + if (std::error_code EC = writeBody(Loc.CalleeName, CalleeSamples)) + return EC; + } + + return sampleprof_error::success; } -/// \brief Create a sample profile writer based on the specified format. +/// \brief Write samples of a top-level function to a binary file. +/// +/// \returns an error code indicating the status of the write. +std::error_code SampleProfileWriterBinary::write(StringRef FName, + const FunctionSamples &S) { + encodeULEB128(S.getHeadSamples(), *OutputStream); + return writeBody(FName, S); +} + +/// \brief Create a sample profile file writer based on the specified format. /// /// \param Filename The file to create. /// @@ -110,12 +198,38 @@ bool SampleProfileWriterBinary::write(StringRef FName, const FunctionSamples &S) { ErrorOr<std::unique_ptr<SampleProfileWriter>> SampleProfileWriter::create(StringRef Filename, SampleProfileFormat Format) { std::error_code EC; + std::unique_ptr<raw_ostream> OS; + if (Format == SPF_Binary) + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_None)); + else + OS.reset(new raw_fd_ostream(Filename, EC, sys::fs::F_Text)); + if (EC) + return EC; + + return create(OS, Format); +} + +/// \brief Create a sample profile stream writer based on the specified format. +/// +/// \param OS The output stream to store the profile data to. +/// +/// \param Writer The writer to instantiate according to the specified format. +/// +/// \param Format Encoding format for the profile file. +/// +/// \returns an error code indicating the status of the created writer. +ErrorOr<std::unique_ptr<SampleProfileWriter>> +SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS, + SampleProfileFormat Format) { + std::error_code EC; std::unique_ptr<SampleProfileWriter> Writer; if (Format == SPF_Binary) - Writer.reset(new SampleProfileWriterBinary(Filename, EC)); + Writer.reset(new SampleProfileWriterBinary(OS)); else if (Format == SPF_Text) - Writer.reset(new SampleProfileWriterText(Filename, EC)); + Writer.reset(new SampleProfileWriterText(OS)); + else if (Format == SPF_GCC) + EC = sampleprof_error::unsupported_writing_format; else EC = sampleprof_error::unrecognized_format; diff --git a/contrib/llvm/lib/Support/APFloat.cpp b/contrib/llvm/lib/Support/APFloat.cpp index 5d31225..19b8221 100644 --- a/contrib/llvm/lib/Support/APFloat.cpp +++ b/contrib/llvm/lib/Support/APFloat.cpp @@ -768,6 +768,15 @@ APFloat::isLargest() const { } bool +APFloat::isInteger() const { + // This could be made more efficient; I'm going for obviously correct. 
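A few concrete cases of the trunc-and-compare test that follows; a minimal usage sketch assuming the 3.8-era API, where IEEEdouble is a static fltSemantics member:

  #include "llvm/ADT/APFloat.h"

  using namespace llvm;

  static void isIntegerExamples() {
    APFloat A(3.0), B(3.5);
    APFloat Inf = APFloat::getInf(APFloat::IEEEdouble);
    bool T = A.isInteger();   // true: truncating 3.0 leaves it unchanged
    bool F = B.isInteger();   // false: 3.5 truncates to 3.0, compares unequal
    bool N = Inf.isInteger(); // false: non-finite values bail out early
    (void)T; (void)F; (void)N;
  }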
+ if (!isFinite()) return false; + APFloat truncated = *this; + truncated.roundToIntegral(rmTowardZero); + return compare(truncated) == cmpEqual; +} + +bool APFloat::bitwiseIsEqual(const APFloat &rhs) const { if (this == &rhs) return true; @@ -777,18 +786,12 @@ APFloat::bitwiseIsEqual(const APFloat &rhs) const { return false; if (category==fcZero || category==fcInfinity) return true; - else if (isFiniteNonZero() && exponent!=rhs.exponent) + + if (isFiniteNonZero() && exponent != rhs.exponent) return false; - else { - int i= partCount(); - const integerPart* p=significandParts(); - const integerPart* q=rhs.significandParts(); - for (; i>0; i--, p++, q++) { - if (*p != *q) - return false; - } - return true; - } + + return std::equal(significandParts(), significandParts() + partCount(), + rhs.significandParts()); } APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value) { @@ -847,6 +850,21 @@ APFloat::semanticsPrecision(const fltSemantics &semantics) { return semantics.precision; } +APFloat::ExponentType +APFloat::semanticsMaxExponent(const fltSemantics &semantics) +{ + return semantics.maxExponent; +} +APFloat::ExponentType +APFloat::semanticsMinExponent(const fltSemantics &semantics) +{ + return semantics.minExponent; +} +unsigned int +APFloat::semanticsSizeInBits(const fltSemantics &semantics) +{ + return semantics.sizeInBits; +} const integerPart * APFloat::significandParts() const @@ -1762,7 +1780,7 @@ APFloat::remainder(const APFloat &rhs) /* Normalized llvm frem (C fmod). This is not currently correct in all cases. */ APFloat::opStatus -APFloat::mod(const APFloat &rhs, roundingMode rounding_mode) +APFloat::mod(const APFloat &rhs) { opStatus fs; fs = modSpecials(rhs); @@ -1787,10 +1805,10 @@ APFloat::mod(const APFloat &rhs, roundingMode rounding_mode) rmNearestTiesToEven); assert(fs==opOK); // should always work - fs = V.multiply(rhs, rounding_mode); + fs = V.multiply(rhs, rmNearestTiesToEven); assert(fs==opOK || fs==opInexact); // should not overflow or underflow - fs = subtract(V, rounding_mode); + fs = subtract(V, rmNearestTiesToEven); assert(fs==opOK || fs==opInexact); // likewise if (isZero()) diff --git a/contrib/llvm/lib/Support/BlockFrequency.cpp b/contrib/llvm/lib/Support/BlockFrequency.cpp index 6f7e341..e7f3e17 100644 --- a/contrib/llvm/lib/Support/BlockFrequency.cpp +++ b/contrib/llvm/lib/Support/BlockFrequency.cpp @@ -11,37 +11,35 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/BranchProbability.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/raw_ostream.h" #include <cassert> using namespace llvm; -BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) { +BlockFrequency &BlockFrequency::operator*=(BranchProbability Prob) { Frequency = Prob.scale(Frequency); return *this; } -const BlockFrequency -BlockFrequency::operator*(const BranchProbability &Prob) const { +BlockFrequency BlockFrequency::operator*(BranchProbability Prob) const { BlockFrequency Freq(Frequency); Freq *= Prob; return Freq; } -BlockFrequency &BlockFrequency::operator/=(const BranchProbability &Prob) { +BlockFrequency &BlockFrequency::operator/=(BranchProbability Prob) { Frequency = Prob.scaleByInverse(Frequency); return *this; } -BlockFrequency BlockFrequency::operator/(const BranchProbability &Prob) const { +BlockFrequency BlockFrequency::operator/(BranchProbability Prob) const { BlockFrequency Freq(Frequency); Freq /= Prob; return Freq; } -BlockFrequency &BlockFrequency::operator+=(const 
BlockFrequency &Freq) { +BlockFrequency &BlockFrequency::operator+=(BlockFrequency Freq) { uint64_t Before = Freq.Frequency; Frequency += Freq.Frequency; @@ -52,11 +50,25 @@ BlockFrequency &BlockFrequency::operator+=(const BlockFrequency &Freq) { return *this; } -const BlockFrequency -BlockFrequency::operator+(const BlockFrequency &Prob) const { - BlockFrequency Freq(Frequency); - Freq += Prob; - return Freq; +BlockFrequency BlockFrequency::operator+(BlockFrequency Freq) const { + BlockFrequency NewFreq(Frequency); + NewFreq += Freq; + return NewFreq; +} + +BlockFrequency &BlockFrequency::operator-=(BlockFrequency Freq) { + // If underflow, set frequency to 0. + if (Frequency <= Freq.Frequency) + Frequency = 0; + else + Frequency -= Freq.Frequency; + return *this; +} + +BlockFrequency BlockFrequency::operator-(BlockFrequency Freq) const { + BlockFrequency NewFreq(Frequency); + NewFreq -= Freq; + return NewFreq; } BlockFrequency &BlockFrequency::operator>>=(const unsigned count) { diff --git a/contrib/llvm/lib/Support/BranchProbability.cpp b/contrib/llvm/lib/Support/BranchProbability.cpp index 65878d6..771d02c 100644 --- a/contrib/llvm/lib/Support/BranchProbability.cpp +++ b/contrib/llvm/lib/Support/BranchProbability.cpp @@ -15,17 +15,58 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> using namespace llvm; +const uint32_t BranchProbability::D; + raw_ostream &BranchProbability::print(raw_ostream &OS) const { - return OS << N << " / " << D << " = " - << format("%g%%", ((double)N / D) * 100.0); + if (isUnknown()) + return OS << "?%"; + + // Get a percentage rounded to two decimal digits. This avoids + // implementation-defined rounding inside printf. + double Percent = rint(((double)N / D) * 100.0 * 100.0) / 100.0; + return OS << format("0x%08" PRIx32 " / 0x%08" PRIx32 " = %.2f%%", N, D, + Percent); } void BranchProbability::dump() const { print(dbgs()) << '\n'; } +BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) { + assert(Denominator > 0 && "Denominator cannot be 0!"); + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + if (Denominator == D) + N = Numerator; + else { + uint64_t Prob64 = + (Numerator * static_cast<uint64_t>(D) + Denominator / 2) / Denominator; + N = static_cast<uint32_t>(Prob64); + } +} + +BranchProbability +BranchProbability::getBranchProbability(uint64_t Numerator, + uint64_t Denominator) { + assert(Numerator <= Denominator && "Probability cannot be bigger than 1!"); + // Scale down Denominator to fit in a 32-bit integer. + int Scale = 0; + while (Denominator > UINT32_MAX) { + Denominator >>= 1; + Scale++; + } + return BranchProbability(Numerator >> Scale, Denominator); +} + +// If ConstD is not zero, then replace D by ConstD so that division and modulo +// operations by D can be optimized, in case this function is not inlined by the +// compiler. +template <uint32_t ConstD> static uint64_t scale(uint64_t Num, uint32_t N, uint32_t D) { + if (ConstD > 0) + D = ConstD; + assert(D && "divide by 0"); // Fast path for multiplying by 1.0. 
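With the denominator a class-wide constant (1u << 31 in this rewrite; the value itself is not visible in these hunks, so treat it as an assumption), a probability is a 32-bit fixed-point fraction and the constructor rescales with rounding. A worked construction and scale:

  #include "llvm/Support/BranchProbability.h"
  #include <cstdint>

  using namespace llvm;

  static void branchProbabilityExample() {
    // N = (1 * 0x80000000 + 3 / 2) / 3 = 0x2AAAAAAB, i.e. ~33.33%.
    BranchProbability OneThird(1, 3);
    // scale() multiplies by N/D: 900 * 0x2AAAAAAB / 0x80000000 == 300.
    uint64_t Scaled = OneThird.scale(900);
    (void)Scaled;
  }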
@@ -65,9 +106,9 @@ static uint64_t scale(uint64_t Num, uint32_t N, uint32_t D) { } uint64_t BranchProbability::scale(uint64_t Num) const { - return ::scale(Num, N, D); + return ::scale<D>(Num, N, D); } uint64_t BranchProbability::scaleByInverse(uint64_t Num) const { - return ::scale(Num, D, N); + return ::scale<0>(Num, D, N); } diff --git a/contrib/llvm/lib/Support/CommandLine.cpp b/contrib/llvm/lib/Support/CommandLine.cpp index 17fba95..fdcdb03 100644 --- a/contrib/llvm/lib/Support/CommandLine.cpp +++ b/contrib/llvm/lib/Support/CommandLine.cpp @@ -120,7 +120,7 @@ public: void addOption(Option *O) { bool HadErrors = false; - if (O->ArgStr[0]) { + if (O->hasArgStr()) { // Add argument to the argument map! if (!OptionsMap.insert(std::make_pair(O->ArgStr, O)).second) { errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr @@ -151,12 +151,12 @@ public: } void removeOption(Option *O) { - SmallVector<const char *, 16> OptionNames; + SmallVector<StringRef, 16> OptionNames; O->getExtraOptionNames(OptionNames); - if (O->ArgStr[0]) + if (O->hasArgStr()) OptionNames.push_back(O->ArgStr); for (auto Name : OptionNames) - OptionsMap.erase(StringRef(Name)); + OptionsMap.erase(Name); if (O->getFormattingFlag() == cl::Positional) for (auto Opt = PositionalOpts.begin(); Opt != PositionalOpts.end(); @@ -182,13 +182,13 @@ public: nullptr != ConsumeAfterOpt); } - void updateArgStr(Option *O, const char *NewName) { + void updateArgStr(Option *O, StringRef NewName) { if (!OptionsMap.insert(std::make_pair(NewName, O)).second) { errs() << ProgramName << ": CommandLine Error: Option '" << O->ArgStr << "' registered more than once!\n"; report_fatal_error("inconsistency in registered CommandLine options"); } - OptionsMap.erase(StringRef(O->ArgStr)); + OptionsMap.erase(O->ArgStr); } void printOptionValues(); @@ -227,7 +227,7 @@ void Option::addArgument() { void Option::removeArgument() { GlobalParser->removeOption(this); } -void Option::setArgStr(const char *S) { +void Option::setArgStr(StringRef S) { if (FullyInitialized) GlobalParser->updateArgStr(this, S); ArgStr = S; @@ -296,24 +296,23 @@ static Option *LookupNearestOption(StringRef Arg, ie = OptionsMap.end(); it != ie; ++it) { Option *O = it->second; - SmallVector<const char *, 16> OptionNames; + SmallVector<StringRef, 16> OptionNames; O->getExtraOptionNames(OptionNames); - if (O->ArgStr[0]) + if (O->hasArgStr()) OptionNames.push_back(O->ArgStr); bool PermitValue = O->getValueExpectedFlag() != cl::ValueDisallowed; StringRef Flag = PermitValue ? LHS : Arg; - for (size_t i = 0, e = OptionNames.size(); i != e; ++i) { - StringRef Name = OptionNames[i]; + for (auto Name : OptionNames) { unsigned Distance = StringRef(Name).edit_distance( Flag, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance); if (!Best || Distance < BestDistance) { Best = O; BestDistance = Distance; if (RHS.empty() || !PermitValue) - NearestString = OptionNames[i]; + NearestString = Name; else - NearestString = (Twine(OptionNames[i]) + "=" + RHS).str(); + NearestString = (Twine(Name) + "=" + RHS).str(); } } } @@ -346,10 +345,7 @@ static bool CommaSeparateAndAddOccurrence(Option *Handler, unsigned pos, Value = Val; } - if (Handler->addOccurrence(pos, ArgName, Value, MultiArg)) - return true; - - return false; + return Handler->addOccurrence(pos, ArgName, Value, MultiArg); } /// ProvideOption - For Value, this differentiates between an empty value ("") @@ -799,7 +795,7 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar, // telling us. 
SmallVector<const char *, 20> newArgv; BumpPtrAllocator A; - BumpPtrStringSaver Saver(A); + StringSaver Saver(A); newArgv.push_back(Saver.save(progName)); // Parse the value of the environment variable into a "command line" @@ -822,7 +818,7 @@ void CommandLineParser::ParseCommandLineOptions(int argc, // Expand response files. SmallVector<const char *, 20> newArgv(argv, argv + argc); BumpPtrAllocator A; - BumpPtrStringSaver Saver(A); + StringSaver Saver(A); ExpandResponseFiles(Saver, TokenizeGNUCommandLine, newArgv); argv = &newArgv[0]; argc = static_cast<int>(newArgv.size()); @@ -859,7 +855,7 @@ void CommandLineParser::ParseCommandLineOptions(int argc, "error - this positional option will never be matched, " "because it does not Require a value, and a " "cl::ConsumeAfter option is active!"); - } else if (UnboundedFound && !Opt->ArgStr[0]) { + } else if (UnboundedFound && !Opt->hasArgStr()) { // This option does not "require" a value... Make sure this option is // not specified after an option that eats all extra arguments, or this // one will never get any! @@ -1144,8 +1140,8 @@ bool Option::addOccurrence(unsigned pos, StringRef ArgName, StringRef Value, // getValueStr - Get the value description string, using "DefaultMsg" if nothing // has been specified yet. // -static const char *getValueStr(const Option &O, const char *DefaultMsg) { - if (O.ValueStr[0] == 0) +static StringRef getValueStr(const Option &O, StringRef DefaultMsg) { + if (O.ValueStr.empty()) return DefaultMsg; return O.ValueStr; } @@ -1155,7 +1151,7 @@ static const char *getValueStr(const Option &O, const char *DefaultMsg) { // // Return the width of the option tag for printing... -size_t alias::getOptionWidth() const { return std::strlen(ArgStr) + 6; } +size_t alias::getOptionWidth() const { return ArgStr.size() + 6; } static void printHelpStr(StringRef HelpStr, size_t Indent, size_t FirstLineIndentedBy) { @@ -1170,7 +1166,7 @@ static void printHelpStr(StringRef HelpStr, size_t Indent, // Print out the option for the alias. void alias::printOptionInfo(size_t GlobalWidth) const { outs() << " -" << ArgStr; - printHelpStr(HelpStr, GlobalWidth, std::strlen(ArgStr) + 6); + printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6); } //===----------------------------------------------------------------------===// @@ -1182,9 +1178,9 @@ void alias::printOptionInfo(size_t GlobalWidth) const { // Return the width of the option tag for printing... size_t basic_parser_impl::getOptionWidth(const Option &O) const { - size_t Len = std::strlen(O.ArgStr); + size_t Len = O.ArgStr.size(); if (const char *ValName = getValueName()) - Len += std::strlen(getValueStr(O, ValName)) + 3; + Len += getValueStr(O, ValName).size() + 3; return Len + 6; } @@ -1205,7 +1201,7 @@ void basic_parser_impl::printOptionInfo(const Option &O, void basic_parser_impl::printOptionName(const Option &O, size_t GlobalWidth) const { outs() << " -" << O.ArgStr; - outs().indent(GlobalWidth - std::strlen(O.ArgStr)); + outs().indent(GlobalWidth - O.ArgStr.size()); } // parser<bool> implementation @@ -1319,7 +1315,7 @@ unsigned generic_parser_base::findOption(const char *Name) { // Return the width of the option tag for printing... 
size_t generic_parser_base::getOptionWidth(const Option &O) const { if (O.hasArgStr()) { - size_t Size = std::strlen(O.ArgStr) + 6; + size_t Size = O.ArgStr.size() + 6; for (unsigned i = 0, e = getNumOptions(); i != e; ++i) Size = std::max(Size, std::strlen(getOption(i)) + 8); return Size; @@ -1338,7 +1334,7 @@ void generic_parser_base::printOptionInfo(const Option &O, size_t GlobalWidth) const { if (O.hasArgStr()) { outs() << " -" << O.ArgStr; - printHelpStr(O.HelpStr, GlobalWidth, std::strlen(O.ArgStr) + 6); + printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6); for (unsigned i = 0, e = getNumOptions(); i != e; ++i) { size_t NumSpaces = GlobalWidth - strlen(getOption(i)) - 8; @@ -1346,7 +1342,7 @@ void generic_parser_base::printOptionInfo(const Option &O, outs().indent(NumSpaces) << " - " << getDescription(i) << '\n'; } } else { - if (O.HelpStr[0]) + if (!O.HelpStr.empty()) outs() << " " << O.HelpStr << '\n'; for (unsigned i = 0, e = getNumOptions(); i != e; ++i) { const char *Option = getOption(i); @@ -1365,7 +1361,7 @@ void generic_parser_base::printGenericOptionDiff( const Option &O, const GenericOptionValue &Value, const GenericOptionValue &Default, size_t GlobalWidth) const { outs() << " -" << O.ArgStr; - outs().indent(GlobalWidth - std::strlen(O.ArgStr)); + outs().indent(GlobalWidth - O.ArgStr.size()); unsigned NumOpts = getNumOptions(); for (unsigned i = 0; i != NumOpts; ++i) { @@ -1508,7 +1504,7 @@ public: outs() << "USAGE: " << GlobalParser->ProgramName << " [options]"; for (auto Opt : GlobalParser->PositionalOpts) { - if (Opt->ArgStr[0]) + if (Opt->hasArgStr()) outs() << " --" << Opt->ArgStr; outs() << " " << Opt->HelpStr; } diff --git a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp index aba0f1d..3f4ef9d 100644 --- a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp +++ b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp @@ -24,6 +24,12 @@ static ManagedStatic< sys::ThreadLocal<const CrashRecoveryContextImpl> > CurrentContext; struct CrashRecoveryContextImpl { + // When threads are disabled, this links up all active + // CrashRecoveryContextImpls. When threads are enabled there's one thread + // per CrashRecoveryContext and CurrentContext is a thread-local, so only one + // CrashRecoveryContextImpl is active per thread and this is always null. + const CrashRecoveryContextImpl *Next; + CrashRecoveryContext *CRC; std::string Backtrace; ::jmp_buf JumpBuffer; @@ -34,21 +40,26 @@ public: CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC), Failed(false), SwitchedThread(false) { + Next = CurrentContext->get(); CurrentContext->set(this); } ~CrashRecoveryContextImpl() { if (!SwitchedThread) - CurrentContext->erase(); + CurrentContext->set(Next); } /// \brief Called when the separate crash-recovery thread was finished, to /// indicate that we don't need to clear the thread-local CurrentContext. - void setSwitchedThread() { SwitchedThread = true; } + void setSwitchedThread() { +#if defined(LLVM_ENABLE_THREADS) && LLVM_ENABLE_THREADS != 0 + SwitchedThread = true; +#endif + } void HandleCrash() { // Eliminate the current context entry, to avoid re-entering in case the // cleanup code crashes. 
- CurrentContext->erase(); + CurrentContext->set(Next); assert(!Failed && "Crash recovery context already failed!"); Failed = true; @@ -65,7 +76,7 @@ public: static ManagedStatic<sys::Mutex> gCrashRecoveryContextMutex; static bool gCrashRecoveryEnabled = false; -static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextCleanup> > +static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContext>> tlIsRecoveringFromCrash; CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {} @@ -73,7 +84,8 @@ CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {} CrashRecoveryContext::~CrashRecoveryContext() { // Reclaim registered resources. CrashRecoveryContextCleanup *i = head; - tlIsRecoveringFromCrash->set(head); + const CrashRecoveryContext *PC = tlIsRecoveringFromCrash->get(); + tlIsRecoveringFromCrash->set(this); while (i) { CrashRecoveryContextCleanup *tmp = i; i = tmp->next; @@ -81,7 +93,7 @@ CrashRecoveryContext::~CrashRecoveryContext() { tmp->recoverResources(); delete tmp; } - tlIsRecoveringFromCrash->erase(); + tlIsRecoveringFromCrash->set(PC); CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl; delete CRCI; @@ -232,7 +244,7 @@ void CrashRecoveryContext::Disable() { static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP }; -static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]); +static const unsigned NumSignals = array_lengthof(Signals); static struct sigaction PrevActions[NumSignals]; static void CrashRecoverySignalHandler(int Signal) { diff --git a/contrib/llvm/lib/Support/Debug.cpp b/contrib/llvm/lib/Support/Debug.cpp index 47751fc..323d532 100644 --- a/contrib/llvm/lib/Support/Debug.cpp +++ b/contrib/llvm/lib/Support/Debug.cpp @@ -95,7 +95,10 @@ struct DebugOnlyOpt { if (Val.empty()) return; DebugFlag = true; - CurrentDebugType->push_back(Val); + SmallVector<StringRef,8> dbgTypes; + StringRef(Val).split(dbgTypes, ',', -1, false); + for (auto dbgType : dbgTypes) + CurrentDebugType->push_back(dbgType); } }; @@ -104,10 +107,9 @@ struct DebugOnlyOpt { static DebugOnlyOpt DebugOnlyOptLoc; static cl::opt<DebugOnlyOpt, true, cl::parser<std::string> > -DebugOnly("debug-only", cl::desc("Enable a specific type of debug output"), +DebugOnly("debug-only", cl::desc("Enable a specific type of debug output (comma separated list of types)"), cl::Hidden, cl::ZeroOrMore, cl::value_desc("debug string"), cl::location(DebugOnlyOptLoc), cl::ValueRequired); - // Signal handlers - dump debug output on termination. static void debug_user_sig_handler(void *Cookie) { // This is a bit sneaky. 
Since this is under #ifndef NDEBUG, we diff --git a/contrib/llvm/lib/Support/Dwarf.cpp b/contrib/llvm/lib/Support/Dwarf.cpp index 13a4155..7d72256 100644 --- a/contrib/llvm/lib/Support/Dwarf.cpp +++ b/contrib/llvm/lib/Support/Dwarf.cpp @@ -177,6 +177,23 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) { case DW_AT_MIPS_assumed_size: return "DW_AT_MIPS_assumed_size"; case DW_AT_lo_user: return "DW_AT_lo_user"; case DW_AT_hi_user: return "DW_AT_hi_user"; + case DW_AT_BORLAND_property_read: return "DW_AT_BORLAND_property_read"; + case DW_AT_BORLAND_property_write: return "DW_AT_BORLAND_property_write"; + case DW_AT_BORLAND_property_implements: return "DW_AT_BORLAND_property_implements"; + case DW_AT_BORLAND_property_index: return "DW_AT_BORLAND_property_index"; + case DW_AT_BORLAND_property_default: return "DW_AT_BORLAND_property_default"; + case DW_AT_BORLAND_Delphi_unit: return "DW_AT_BORLAND_Delphi_unit"; + case DW_AT_BORLAND_Delphi_class: return "DW_AT_BORLAND_Delphi_class"; + case DW_AT_BORLAND_Delphi_record: return "DW_AT_BORLAND_Delphi_record"; + case DW_AT_BORLAND_Delphi_metaclass: return "DW_AT_BORLAND_Delphi_metaclass"; + case DW_AT_BORLAND_Delphi_constructor: return "DW_AT_BORLAND_Delphi_constructor"; + case DW_AT_BORLAND_Delphi_destructor: return "DW_AT_BORLAND_Delphi_destructor"; + case DW_AT_BORLAND_Delphi_anonymous_method: return "DW_AT_BORLAND_Delphi_anonymous_method"; + case DW_AT_BORLAND_Delphi_interface: return "DW_AT_BORLAND_Delphi_interface"; + case DW_AT_BORLAND_Delphi_ABI: return "DW_AT_BORLAND_Delphi_ABI"; + case DW_AT_BORLAND_Delphi_return: return "DW_AT_BORLAND_Delphi_return"; + case DW_AT_BORLAND_Delphi_frameptr: return "DW_AT_BORLAND_Delphi_frameptr"; + case DW_AT_BORLAND_closure: return "DW_AT_BORLAND_closure"; case DW_AT_APPLE_optimized: return "DW_AT_APPLE_optimized"; case DW_AT_APPLE_flags: return "DW_AT_APPLE_flags"; case DW_AT_APPLE_isa: return "DW_AT_APPLE_isa"; @@ -201,6 +218,7 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) { case DW_AT_GNU_addr_base: return "DW_AT_GNU_addr_base"; case DW_AT_GNU_pubnames: return "DW_AT_GNU_pubnames"; case DW_AT_GNU_pubtypes: return "DW_AT_GNU_pubtypes"; + case DW_AT_GNU_discriminator: return "DW_AT_GNU_discriminator"; } return nullptr; } @@ -373,6 +391,14 @@ const char *llvm::dwarf::ConventionString(unsigned Convention) { case DW_CC_nocall: return "DW_CC_nocall"; case DW_CC_lo_user: return "DW_CC_lo_user"; case DW_CC_hi_user: return "DW_CC_hi_user"; + case DW_CC_GNU_borland_fastcall_i386: return "DW_CC_GNU_borland_fastcall_i386"; + case DW_CC_BORLAND_safecall: return "DW_CC_BORLAND_safecall"; + case DW_CC_BORLAND_stdcall: return "DW_CC_BORLAND_stdcall"; + case DW_CC_BORLAND_pascal: return "DW_CC_BORLAND_pascal"; + case DW_CC_BORLAND_msfastcall: return "DW_CC_BORLAND_msfastcall"; + case DW_CC_BORLAND_msreturn: return "DW_CC_BORLAND_msreturn"; + case DW_CC_BORLAND_thiscall: return "DW_CC_BORLAND_thiscall"; + case DW_CC_BORLAND_fastcall: return "DW_CC_BORLAND_fastcall"; } return nullptr; } @@ -442,10 +468,21 @@ const char *llvm::dwarf::MacinfoString(unsigned Encoding) { case DW_MACINFO_start_file: return "DW_MACINFO_start_file"; case DW_MACINFO_end_file: return "DW_MACINFO_end_file"; case DW_MACINFO_vendor_ext: return "DW_MACINFO_vendor_ext"; + case DW_MACINFO_invalid: return "DW_MACINFO_invalid"; } return nullptr; } +unsigned llvm::dwarf::getMacinfo(StringRef MacinfoString) { + return StringSwitch<unsigned>(MacinfoString) + .Case("DW_MACINFO_define", DW_MACINFO_define) + 
.Case("DW_MACINFO_undef", DW_MACINFO_undef) + .Case("DW_MACINFO_start_file", DW_MACINFO_start_file) + .Case("DW_MACINFO_end_file", DW_MACINFO_end_file) + .Case("DW_MACINFO_vendor_ext", DW_MACINFO_vendor_ext) + .Default(DW_MACINFO_invalid); +} + const char *llvm::dwarf::CallFrameString(unsigned Encoding) { switch (Encoding) { case DW_CFA_nop: return "DW_CFA_nop"; diff --git a/contrib/llvm/lib/Support/ErrorHandling.cpp b/contrib/llvm/lib/Support/ErrorHandling.cpp index a25e21a..2808bd3 100644 --- a/contrib/llvm/lib/Support/ErrorHandling.cpp +++ b/contrib/llvm/lib/Support/ErrorHandling.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ErrorHandling.h" -#include "llvm-c/Core.h" +#include "llvm-c/ErrorHandling.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" diff --git a/contrib/llvm/lib/Support/FileOutputBuffer.cpp b/contrib/llvm/lib/Support/FileOutputBuffer.cpp index 307ff09..651e679 100644 --- a/contrib/llvm/lib/Support/FileOutputBuffer.cpp +++ b/contrib/llvm/lib/Support/FileOutputBuffer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/Errc.h" +#include "llvm/Support/Signals.h" #include <system_error> #if !defined(_MSC_VER) && !defined(__MINGW32__) @@ -34,10 +35,8 @@ FileOutputBuffer::~FileOutputBuffer() { sys::fs::remove(Twine(TempPath)); } -std::error_code -FileOutputBuffer::create(StringRef FilePath, size_t Size, - std::unique_ptr<FileOutputBuffer> &Result, - unsigned Flags) { +ErrorOr<std::unique_ptr<FileOutputBuffer>> +FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { // If file already exists, it must be a regular file (to be mappable). sys::fs::file_status Stat; std::error_code EC = sys::fs::status(FilePath, Stat); @@ -76,6 +75,8 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, if (EC) return EC; + sys::RemoveFileOnSignal(TempFilePath); + #ifndef LLVM_ON_WIN32 // On Windows, CreateFileMapping (the mmap function on Windows) // automatically extends the underlying file. We don't need to @@ -95,10 +96,9 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, if (Ret) return std::error_code(errno, std::generic_category()); - Result.reset( + std::unique_ptr<FileOutputBuffer> Buf( new FileOutputBuffer(std::move(MappedFile), FilePath, TempFilePath)); - - return std::error_code(); + return std::move(Buf); } std::error_code FileOutputBuffer::commit() { @@ -107,6 +107,8 @@ std::error_code FileOutputBuffer::commit() { // Rename file to final name. - return sys::fs::rename(Twine(TempPath), Twine(FinalPath)); + std::error_code EC = sys::fs::rename(Twine(TempPath), Twine(FinalPath)); + sys::DontRemoveFileOnSignal(TempPath); + return EC; } } // namespace diff --git a/contrib/llvm/lib/Support/FoldingSet.cpp b/contrib/llvm/lib/Support/FoldingSet.cpp index b8538ff..bb0ec2d 100644 --- a/contrib/llvm/lib/Support/FoldingSet.cpp +++ b/contrib/llvm/lib/Support/FoldingSet.cpp @@ -232,9 +232,29 @@ FoldingSetImpl::FoldingSetImpl(unsigned Log2InitSize) { Buckets = AllocateBuckets(NumBuckets); NumNodes = 0; } + +FoldingSetImpl::FoldingSetImpl(FoldingSetImpl &&Arg) + : Buckets(Arg.Buckets), NumBuckets(Arg.NumBuckets), NumNodes(Arg.NumNodes) { + Arg.Buckets = nullptr; + Arg.NumBuckets = 0; + Arg.NumNodes = 0; +} + +FoldingSetImpl &FoldingSetImpl::operator=(FoldingSetImpl &&RHS) { + free(Buckets); // This may be null if the set is in a moved-from state. 
+ Buckets = RHS.Buckets; + NumBuckets = RHS.NumBuckets; + NumNodes = RHS.NumNodes; + RHS.Buckets = nullptr; + RHS.NumBuckets = 0; + RHS.NumNodes = 0; + return *this; +} + FoldingSetImpl::~FoldingSetImpl() { free(Buckets); } + void FoldingSetImpl::clear() { // Set all but the last bucket to null pointers. memset(Buckets, 0, NumBuckets*sizeof(void*)); diff --git a/contrib/llvm/lib/Support/GraphWriter.cpp b/contrib/llvm/lib/Support/GraphWriter.cpp index a9b0220..d0e1d50 100644 --- a/contrib/llvm/lib/Support/GraphWriter.cpp +++ b/contrib/llvm/lib/Support/GraphWriter.cpp @@ -103,7 +103,7 @@ struct GraphSession { bool TryFindProgram(StringRef Names, std::string &ProgramPath) { raw_string_ostream Log(LogBuffer); SmallVector<StringRef, 8> parts; - Names.split(parts, "|"); + Names.split(parts, '|'); for (auto Name : parts) { if (ErrorOr<std::string> P = sys::findProgramByName(Name)) { ProgramPath = *P; @@ -189,61 +189,87 @@ bool llvm::DisplayGraph(StringRef FilenameRef, bool wait, return ExecGraphViewer(ViewerPath, args, Filename, wait, ErrMsg); } - enum PSViewerKind { PSV_None, PSV_OSXOpen, PSV_XDGOpen, PSV_Ghostview }; - PSViewerKind PSViewer = PSV_None; + enum ViewerKind { + VK_None, + VK_OSXOpen, + VK_XDGOpen, + VK_Ghostview, + VK_CmdStart + }; + ViewerKind Viewer = VK_None; #ifdef __APPLE__ - if (!PSViewer && S.TryFindProgram("open", ViewerPath)) - PSViewer = PSV_OSXOpen; + if (!Viewer && S.TryFindProgram("open", ViewerPath)) + Viewer = VK_OSXOpen; +#endif + if (!Viewer && S.TryFindProgram("gv", ViewerPath)) + Viewer = VK_Ghostview; + if (!Viewer && S.TryFindProgram("xdg-open", ViewerPath)) + Viewer = VK_XDGOpen; +#ifdef LLVM_ON_WIN32 + if (!Viewer && S.TryFindProgram("cmd", ViewerPath)) { + Viewer = VK_CmdStart; + } #endif - if (!PSViewer && S.TryFindProgram("gv", ViewerPath)) - PSViewer = PSV_Ghostview; - if (!PSViewer && S.TryFindProgram("xdg-open", ViewerPath)) - PSViewer = PSV_XDGOpen; - // PostScript graph generator + PostScript viewer + // PostScript or PDF graph generator + PostScript/PDF viewer std::string GeneratorPath; - if (PSViewer && + if (Viewer && (S.TryFindProgram(getProgramName(program), GeneratorPath) || S.TryFindProgram("dot|fdp|neato|twopi|circo", GeneratorPath))) { - std::string PSFilename = Filename + ".ps"; + std::string OutputFilename = + Filename + (Viewer == VK_CmdStart ? ".pdf" : ".ps"); std::vector<const char *> args; args.push_back(GeneratorPath.c_str()); - args.push_back("-Tps"); + if (Viewer == VK_CmdStart) + args.push_back("-Tpdf"); + else + args.push_back("-Tps"); args.push_back("-Nfontname=Courier"); args.push_back("-Gsize=7.5,10"); args.push_back(Filename.c_str()); args.push_back("-o"); - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); args.push_back(nullptr); errs() << "Running '" << GeneratorPath << "' program... "; - if (ExecGraphViewer(GeneratorPath, args, Filename, wait, ErrMsg)) + if (ExecGraphViewer(GeneratorPath, args, Filename, true, ErrMsg)) return true; + // The lifetime of StartArg must include the call of ExecGraphViewer + // because the args are passed as vector of char*. 
+ std::string StartArg; + args.clear(); args.push_back(ViewerPath.c_str()); - switch (PSViewer) { - case PSV_OSXOpen: + switch (Viewer) { + case VK_OSXOpen: args.push_back("-W"); - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); break; - case PSV_XDGOpen: + case VK_XDGOpen: wait = false; - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); break; - case PSV_Ghostview: + case VK_Ghostview: args.push_back("--spartan"); - args.push_back(PSFilename.c_str()); + args.push_back(OutputFilename.c_str()); + break; + case VK_CmdStart: + args.push_back("/S"); + args.push_back("/C"); + StartArg = + (StringRef("start ") + (wait ? "/WAIT " : "") + OutputFilename).str(); + args.push_back(StartArg.c_str()); break; - case PSV_None: + case VK_None: llvm_unreachable("Invalid viewer"); } args.push_back(nullptr); ErrMsg.clear(); - return ExecGraphViewer(ViewerPath, args, PSFilename, wait, ErrMsg); + return ExecGraphViewer(ViewerPath, args, OutputFilename, wait, ErrMsg); } // dotty diff --git a/contrib/llvm/lib/Support/Host.cpp b/contrib/llvm/lib/Support/Host.cpp index 1bd1fe2..c0f9e07 100644 --- a/contrib/llvm/lib/Support/Host.cpp +++ b/contrib/llvm/lib/Support/Host.cpp @@ -368,8 +368,14 @@ StringRef sys::getHostCPUName() { // Broadwell: case 61: + case 71: return "broadwell"; + // Skylake: + case 78: + case 94: + return "skylake"; + case 28: // Most 45 nm Intel Atom processors case 38: // 45 nm Atom Lincroft case 39: // 32 nm Atom Medfield @@ -381,6 +387,8 @@ StringRef sys::getHostCPUName() { case 55: case 74: case 77: + case 90: + case 93: return "silvermont"; default: // Unknown family 6 CPU, try to guess. @@ -689,7 +697,7 @@ StringRef sys::getHostCPUName() { if (Lines[I].startswith("features")) { size_t Pos = Lines[I].find(":"); if (Pos != StringRef::npos) { - Lines[I].drop_front(Pos + 1).split(CPUFeatures, " "); + Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' '); break; } } @@ -766,14 +774,17 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV // indicates that the AVX registers will be saved and restored on context // switch, then we have full AVX support. - bool HasAVX = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) && - !GetX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6); - Features["avx"] = HasAVX; - Features["fma"] = HasAVX && (ECX >> 12) & 1; - Features["f16c"] = HasAVX && (ECX >> 29) & 1; + bool HasAVXSave = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) && + !GetX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6); + Features["avx"] = HasAVXSave; + Features["fma"] = HasAVXSave && (ECX >> 12) & 1; + Features["f16c"] = HasAVXSave && (ECX >> 29) & 1; + + // Only enable XSAVE if OS has enabled support for saving YMM state. + Features["xsave"] = HasAVXSave && (ECX >> 26) & 1; // AVX512 requires additional context to be saved by the OS. 
- bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0); + bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0); unsigned MaxExtLevel; GetX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); @@ -783,15 +794,15 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { Features["lzcnt"] = HasExtLeaf1 && ((ECX >> 5) & 1); Features["sse4a"] = HasExtLeaf1 && ((ECX >> 6) & 1); Features["prfchw"] = HasExtLeaf1 && ((ECX >> 8) & 1); - Features["xop"] = HasAVX && HasExtLeaf1 && ((ECX >> 11) & 1); - Features["fma4"] = HasAVX && HasExtLeaf1 && ((ECX >> 16) & 1); + Features["xop"] = HasExtLeaf1 && ((ECX >> 11) & 1) && HasAVXSave; + Features["fma4"] = HasExtLeaf1 && ((ECX >> 16) & 1) && HasAVXSave; Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1); bool HasLeaf7 = MaxLevel >= 7 && !GetX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); // AVX2 is only supported if we have the OS save support from AVX. - Features["avx2"] = HasAVX && HasLeaf7 && (EBX >> 5) & 1; + Features["avx2"] = HasAVXSave && HasLeaf7 && ((EBX >> 5) & 1); Features["fsgsbase"] = HasLeaf7 && ((EBX >> 0) & 1); Features["bmi"] = HasLeaf7 && ((EBX >> 3) & 1); @@ -801,6 +812,8 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1); Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1); Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1); + // Enable protection keys + Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1); // AVX512 is only supported if the OS supports the context save for it. Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save; @@ -811,6 +824,14 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { Features["avx512bw"] = HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save; Features["avx512vl"] = HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save; + bool HasLeafD = MaxLevel >= 0xd && + !GetX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); + + // Only enable XSAVE if OS has enabled support for saving YMM state. + Features["xsaveopt"] = HasAVXSave && HasLeafD && ((EAX >> 0) & 1); + Features["xsavec"] = HasAVXSave && HasLeafD && ((EAX >> 1) & 1); + Features["xsaves"] = HasAVXSave && HasLeafD && ((EAX >> 3) & 1); + return true; } #elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__)) @@ -832,7 +853,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { // Look for the CPU features. 
for (unsigned I = 0, E = Lines.size(); I != E; ++I) if (Lines[I].startswith("Features")) { - Lines[I].split(CPUFeatures, " "); + Lines[I].split(CPUFeatures, ' '); break; } diff --git a/contrib/llvm/lib/Support/IntEqClasses.cpp b/contrib/llvm/lib/Support/IntEqClasses.cpp index 1134495..ff21357 100644 --- a/contrib/llvm/lib/Support/IntEqClasses.cpp +++ b/contrib/llvm/lib/Support/IntEqClasses.cpp @@ -29,7 +29,7 @@ void IntEqClasses::grow(unsigned N) { EC.push_back(EC.size()); } -void IntEqClasses::join(unsigned a, unsigned b) { +unsigned IntEqClasses::join(unsigned a, unsigned b) { assert(NumClasses == 0 && "join() called after compress()."); unsigned eca = EC[a]; unsigned ecb = EC[b]; @@ -41,6 +41,8 @@ void IntEqClasses::join(unsigned a, unsigned b) { EC[b] = eca, b = ecb, ecb = EC[b]; else EC[a] = ecb, a = eca, eca = EC[a]; + + return eca; } unsigned IntEqClasses::findLeader(unsigned a) const { diff --git a/contrib/llvm/lib/Support/JamCRC.cpp b/contrib/llvm/lib/Support/JamCRC.cpp new file mode 100644 index 0000000..bc21c91 --- /dev/null +++ b/contrib/llvm/lib/Support/JamCRC.cpp @@ -0,0 +1,96 @@ +//===-- JamCRC.cpp - Cyclic Redundancy Check --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains an implementation of JamCRC. +// +//===----------------------------------------------------------------------===// +// +// The implementation technique is the one mentioned in: +// D. V. Sarwate. 1988. Computation of cyclic redundancy checks via table +// look-up. Commun. ACM 31, 8 (August 1988) +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/JamCRC.h" + +using namespace llvm; + +static const uint32_t CRCTable[256] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 
+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +void JamCRC::update(ArrayRef<char> Data) { + for (char Byte : Data) { + int TableIdx = (CRC ^ Byte) & 0xff; + CRC = CRCTable[TableIdx] ^ (CRC >> 8); + } +} diff --git a/contrib/llvm/lib/Support/Locale.cpp b/contrib/llvm/lib/Support/Locale.cpp index d5cb72b..53bc0e3 100644 --- a/contrib/llvm/lib/Support/Locale.cpp +++ b/contrib/llvm/lib/Support/Locale.cpp @@ -1,3 +1,4 @@ +#include "llvm/Config/llvm-config.h" #include "llvm/Support/Locale.h" #include "llvm/Support/Unicode.h" diff --git a/contrib/llvm/lib/Support/ManagedStatic.cpp b/contrib/llvm/lib/Support/ManagedStatic.cpp index b8fb284..9868207 100644 --- a/contrib/llvm/lib/Support/ManagedStatic.cpp +++ b/contrib/llvm/lib/Support/ManagedStatic.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Config/config.h" #include "llvm/Support/Atomic.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/MutexGuard.h" #include <cassert> diff --git a/contrib/llvm/lib/Support/MemoryBuffer.cpp b/contrib/llvm/lib/Support/MemoryBuffer.cpp index d09ef3a..faee10b 100644 --- a/contrib/llvm/lib/Support/MemoryBuffer.cpp +++ b/contrib/llvm/lib/Support/MemoryBuffer.cpp @@ -162,13 +162,14 @@ MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { } ErrorOr<std::unique_ptr<MemoryBuffer>> -MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize) { +MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize, + bool RequiresNullTerminator) { SmallString<256> NameBuf; StringRef NameRef = Filename.toStringRef(NameBuf); if (NameRef == "-") return getSTDIN(); - return getFile(Filename, FileSize); + return getFile(Filename, FileSize, RequiresNullTerminator); } ErrorOr<std::unique_ptr<MemoryBuffer>> diff --git a/contrib/llvm/lib/Support/Path.cpp b/contrib/llvm/lib/Support/Path.cpp index cf46738..4952f59 100644 --- a/contrib/llvm/lib/Support/Path.cpp +++ b/contrib/llvm/lib/Support/Path.cpp @@ -455,17 
+455,15 @@ void append(SmallVectorImpl<char> &path, const Twine &a, if (!c.isTriviallyEmpty()) components.push_back(c.toStringRef(c_storage)); if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage)); - for (SmallVectorImpl<StringRef>::const_iterator i = components.begin(), - e = components.end(); - i != e; ++i) { + for (auto &component : components) { bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]); - bool component_has_sep = !i->empty() && is_separator((*i)[0]); - bool is_root_name = has_root_name(*i); + bool component_has_sep = !component.empty() && is_separator(component[0]); + bool is_root_name = has_root_name(component); if (path_has_sep) { // Strip separators from beginning of component. - size_t loc = i->find_first_not_of(separators); - StringRef c = i->substr(loc); + size_t loc = component.find_first_not_of(separators); + StringRef c = component.substr(loc); // Append it. path.append(c.begin(), c.end()); @@ -477,7 +475,7 @@ void append(SmallVectorImpl<char> &path, const Twine &a, path.push_back(preferred_separator); } - path.append(i->begin(), i->end()); + path.append(component.begin(), component.end()); } } @@ -661,8 +659,51 @@ bool is_absolute(const Twine &path) { return rootDir && rootName; } -bool is_relative(const Twine &path) { - return !is_absolute(path); +bool is_relative(const Twine &path) { return !is_absolute(path); } + +StringRef remove_leading_dotslash(StringRef Path) { + // Remove leading "./" (or ".//" or "././" etc.) + while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1])) { + Path = Path.substr(2); + while (Path.size() > 0 && is_separator(Path[0])) + Path = Path.substr(1); + } + return Path; +} + +static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) { + SmallVector<StringRef, 16> components; + + // Skip the root path, then look for traversal in the components. + StringRef rel = path::relative_path(path); + for (StringRef C : llvm::make_range(path::begin(rel), path::end(rel))) { + if (C == ".") + continue; + if (remove_dot_dot) { + if (C == "..") { + if (!components.empty()) + components.pop_back(); + continue; + } + } + components.push_back(C); + } + + SmallString<256> buffer = path::root_path(path); + for (StringRef C : components) + path::append(buffer, C); + return buffer; +} + +bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot) { + StringRef p(path.data(), path.size()); + + SmallString<256> result = remove_dots(p, remove_dot_dot); + if (result == path) + return false; + + path.swap(result); + return true; } } // end namespace path @@ -732,7 +773,9 @@ std::error_code createUniqueDirectory(const Twine &Prefix, true, 0, FS_Dir); } -std::error_code make_absolute(SmallVectorImpl<char> &path) { +static std::error_code make_absolute(const Twine ¤t_directory, + SmallVectorImpl<char> &path, + bool use_current_directory) { StringRef p(path.data(), path.size()); bool rootDirectory = path::has_root_directory(p), @@ -748,7 +791,9 @@ std::error_code make_absolute(SmallVectorImpl<char> &path) { // All of the following conditions will need the current directory. SmallString<128> current_dir; - if (std::error_code ec = current_path(current_dir)) + if (use_current_directory) + current_directory.toVector(current_dir); + else if (std::error_code ec = current_path(current_dir)) return ec; // Relative path. Prepend the current directory. 
@@ -785,12 +830,22 @@ std::error_code make_absolute(SmallVectorImpl<char> &path) { "occurred above!"); } -std::error_code create_directories(const Twine &Path, bool IgnoreExisting) { +std::error_code make_absolute(const Twine ¤t_directory, + SmallVectorImpl<char> &path) { + return make_absolute(current_directory, path, true); +} + +std::error_code make_absolute(SmallVectorImpl<char> &path) { + return make_absolute(Twine(), path, false); +} + +std::error_code create_directories(const Twine &Path, bool IgnoreExisting, + perms Perms) { SmallString<128> PathStorage; StringRef P = Path.toStringRef(PathStorage); // Be optimistic and try to create the directory - std::error_code EC = create_directory(P, IgnoreExisting); + std::error_code EC = create_directory(P, IgnoreExisting, Perms); // If we succeeded, or had any error other than the parent not existing, just // return it. if (EC != errc::no_such_file_or_directory) @@ -802,10 +857,10 @@ std::error_code create_directories(const Twine &Path, bool IgnoreExisting) { if (Parent.empty()) return EC; - if ((EC = create_directories(Parent))) + if ((EC = create_directories(Parent, IgnoreExisting, Perms))) return EC; - return create_directory(P, IgnoreExisting); + return create_directory(P, IgnoreExisting, Perms); } std::error_code copy_file(const Twine &From, const Twine &To) { @@ -889,8 +944,7 @@ std::error_code is_other(const Twine &Path, bool &Result) { } void directory_entry::replace_filename(const Twine &filename, file_status st) { - SmallString<128> path(Path.begin(), Path.end()); - path::remove_filename(path); + SmallString<128> path = path::parent_path(Path); path::append(path, filename); Path = path.str(); Status = st; @@ -940,7 +994,8 @@ file_magic identify_magic(StringRef Magic) { break; case '!': if (Magic.size() >= 8) - if (memcmp(Magic.data(),"!<arch>\n",8) == 0) + if (memcmp(Magic.data(), "!<arch>\n", 8) == 0 || + memcmp(Magic.data(), "!<thin>\n", 8) == 0) return file_magic::archive; break; @@ -1074,3 +1129,20 @@ std::error_code directory_entry::status(file_status &result) const { #if defined(LLVM_ON_WIN32) #include "Windows/Path.inc" #endif + +namespace llvm { +namespace sys { +namespace path { + +bool user_cache_directory(SmallVectorImpl<char> &Result, const Twine &Path1, + const Twine &Path2, const Twine &Path3) { + if (getUserCacheDir(Result)) { + append(Result, Path1, Path2, Path3); + return true; + } + return false; +} + +} // end namespace path +} // end namsspace sys +} // end namespace llvm diff --git a/contrib/llvm/lib/Support/PrettyStackTrace.cpp b/contrib/llvm/lib/Support/PrettyStackTrace.cpp index f9f8cab..05b3e31 100644 --- a/contrib/llvm/lib/Support/PrettyStackTrace.cpp +++ b/contrib/llvm/lib/Support/PrettyStackTrace.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/PrettyStackTrace.h" -#include "llvm-c/Core.h" +#include "llvm-c/ErrorHandling.h" #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" // Get autoconf configuration settings #include "llvm/Support/Compiler.h" @@ -154,6 +154,20 @@ void llvm::EnablePrettyStackTrace() { #endif } +const void* llvm::SavePrettyStackState() { +#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) + return PrettyStackTraceHead; +#else + return nullptr; +#endif +} + +void llvm::RestorePrettyStackState(const void* Top) { +#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) + PrettyStackTraceHead = (const PrettyStackTraceEntry*)Top; +#endif +} + void LLVMEnablePrettyStackTrace() { 
EnablePrettyStackTrace(); } diff --git a/contrib/llvm/lib/Support/Signals.cpp b/contrib/llvm/lib/Support/Signals.cpp index a117893..3dc6b7c 100644 --- a/contrib/llvm/lib/Support/Signals.cpp +++ b/contrib/llvm/lib/Support/Signals.cpp @@ -12,8 +12,21 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/Signals.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Mutex.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/raw_ostream.h" +#include <vector> namespace llvm { using namespace sys; @@ -23,6 +36,131 @@ using namespace sys; //=== independent code. //===----------------------------------------------------------------------===// +static ManagedStatic<std::vector<std::pair<void (*)(void *), void *>>> + CallBacksToRun; +void sys::RunSignalHandlers() { + if (!CallBacksToRun.isConstructed()) + return; + for (auto &I : *CallBacksToRun) + I.first(I.second); + CallBacksToRun->clear(); +} +} + +using namespace llvm; + +static bool findModulesAndOffsets(void **StackTrace, int Depth, + const char **Modules, intptr_t *Offsets, + const char *MainExecutableName, + StringSaver &StrPool); + +/// Format a pointer value as hexadecimal. Zero pad it out so its always the +/// same width. +static FormattedNumber format_ptr(void *PC) { + // Each byte is two hex digits plus 2 for the 0x prefix. + unsigned PtrWidth = 2 + 2 * sizeof(void *); + return format_hex((uint64_t)PC, PtrWidth); +} + +static bool printSymbolizedStackTrace(void **StackTrace, int Depth, + llvm::raw_ostream &OS) + LLVM_ATTRIBUTE_USED; + +/// Helper that launches llvm-symbolizer and symbolizes a backtrace. +static bool printSymbolizedStackTrace(void **StackTrace, int Depth, + llvm::raw_ostream &OS) { + // FIXME: Subtract necessary number from StackTrace entries to turn return addresses + // into actual instruction addresses. + // Use llvm-symbolizer tool to symbolize the stack traces. + ErrorOr<std::string> LLVMSymbolizerPathOrErr = + sys::findProgramByName("llvm-symbolizer"); + if (!LLVMSymbolizerPathOrErr) + return false; + const std::string &LLVMSymbolizerPath = *LLVMSymbolizerPathOrErr; + // We don't know argv0 or the address of main() at this point, but try + // to guess it anyway (it's possible on some platforms). 
+ std::string MainExecutableName = sys::fs::getMainExecutable(nullptr, nullptr); + if (MainExecutableName.empty() || + MainExecutableName.find("llvm-symbolizer") != std::string::npos) + return false; + + BumpPtrAllocator Allocator; + StringSaver StrPool(Allocator); + std::vector<const char *> Modules(Depth, nullptr); + std::vector<intptr_t> Offsets(Depth, 0); + if (!findModulesAndOffsets(StackTrace, Depth, Modules.data(), Offsets.data(), + MainExecutableName.c_str(), StrPool)) + return false; + int InputFD; + SmallString<32> InputFile, OutputFile; + sys::fs::createTemporaryFile("symbolizer-input", "", InputFD, InputFile); + sys::fs::createTemporaryFile("symbolizer-output", "", OutputFile); + FileRemover InputRemover(InputFile.c_str()); + FileRemover OutputRemover(OutputFile.c_str()); + + { + raw_fd_ostream Input(InputFD, true); + for (int i = 0; i < Depth; i++) { + if (Modules[i]) + Input << Modules[i] << " " << (void*)Offsets[i] << "\n"; + } + } + + StringRef InputFileStr(InputFile); + StringRef OutputFileStr(OutputFile); + StringRef StderrFileStr; + const StringRef *Redirects[] = {&InputFileStr, &OutputFileStr, + &StderrFileStr}; + const char *Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining", +#ifdef LLVM_ON_WIN32 + // Pass --relative-address on Windows so that we don't + // have to add ImageBase from PE file. + // FIXME: Make this the default for llvm-symbolizer. + "--relative-address", +#endif + "--demangle", nullptr}; + int RunResult = + sys::ExecuteAndWait(LLVMSymbolizerPath, Args, nullptr, Redirects); + if (RunResult != 0) + return false; + + // This report format is based on the sanitizer stack trace printer. See + // sanitizer_stacktrace_printer.cc in compiler-rt. + auto OutputBuf = MemoryBuffer::getFile(OutputFile.c_str()); + if (!OutputBuf) + return false; + StringRef Output = OutputBuf.get()->getBuffer(); + SmallVector<StringRef, 32> Lines; + Output.split(Lines, "\n"); + auto CurLine = Lines.begin(); + int frame_no = 0; + for (int i = 0; i < Depth; i++) { + if (!Modules[i]) { + OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << '\n'; + continue; + } + // Read pairs of lines (function name and file/line info) until we + // encounter empty line. + for (;;) { + if (CurLine == Lines.end()) + return false; + StringRef FunctionName = *CurLine++; + if (FunctionName.empty()) + break; + OS << '#' << frame_no++ << ' ' << format_ptr(StackTrace[i]) << ' '; + if (!FunctionName.startswith("??")) + OS << FunctionName << ' '; + if (CurLine == Lines.end()) + return false; + StringRef FileLineInfo = *CurLine++; + if (!FileLineInfo.startswith("??")) + OS << FileLineInfo; + else + OS << "(" << Modules[i] << '+' << format_hex(Offsets[i], 0) << ")"; + OS << "\n"; + } + } + return true; } // Include the platform-specific parts of this class. diff --git a/contrib/llvm/lib/Support/Statistic.cpp b/contrib/llvm/lib/Support/Statistic.cpp index 56c3b0f..e49d1cb 100644 --- a/contrib/llvm/lib/Support/Statistic.cpp +++ b/contrib/llvm/lib/Support/Statistic.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" @@ -33,9 +34,6 @@ #include <cstring> using namespace llvm; -// CreateInfoOutputFile - Return a file stream to print our output on. 
-namespace llvm { extern raw_ostream *CreateInfoOutputFile(); } - /// -stats - Command line option to cause transformations to emit stats about /// what they did. /// @@ -144,20 +142,18 @@ void llvm::PrintStatistics() { if (Stats.Stats.empty()) return; // Get the stream to write to. - raw_ostream &OutStream = *CreateInfoOutputFile(); - PrintStatistics(OutStream); - delete &OutStream; // Close the file. + std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile(); + PrintStatistics(*OutStream); + #else // Check if the -stats option is set instead of checking // !Stats.Stats.empty(). In release builds, Statistics operators // do nothing, so stats are never Registered. if (Enabled) { // Get the stream to write to. - raw_ostream &OutStream = *CreateInfoOutputFile(); - OutStream << "Statistics are disabled. " - << "Build with asserts or with -DLLVM_ENABLE_STATS\n"; - OutStream.flush(); - delete &OutStream; // Close the file. + std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile(); + (*OutStream) << "Statistics are disabled. " + << "Build with asserts or with -DLLVM_ENABLE_STATS\n"; } #endif } diff --git a/contrib/llvm/lib/Support/StringRef.cpp b/contrib/llvm/lib/Support/StringRef.cpp index ddece08..7ecff29 100644 --- a/contrib/llvm/lib/Support/StringRef.cpp +++ b/contrib/llvm/lib/Support/StringRef.cpp @@ -140,37 +140,44 @@ std::string StringRef::upper() const { /// \return - The index of the first occurrence of \arg Str, or npos if not /// found. size_t StringRef::find(StringRef Str, size_t From) const { + if (From > Length) + return npos; + + const char *Needle = Str.data(); size_t N = Str.size(); - if (N > Length) + if (N == 0) + return From; + + size_t Size = Length - From; + if (Size < N) return npos; + const char *Start = Data + From; + const char *Stop = Start + (Size - N + 1); + // For short haystacks or unsupported needles fall back to the naive algorithm - if (Length < 16 || N > 255 || N == 0) { - for (size_t e = Length - N + 1, i = std::min(From, e); i != e; ++i) - if (substr(i, N).equals(Str)) - return i; + if (Size < 16 || N > 255) { + do { + if (std::memcmp(Start, Needle, N) == 0) + return Start - Data; + ++Start; + } while (Start < Stop); return npos; } - if (From >= Length) - return npos; - // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. uint8_t BadCharSkip[256]; std::memset(BadCharSkip, N, 256); for (unsigned i = 0; i != N-1; ++i) BadCharSkip[(uint8_t)Str[i]] = N-1-i; - unsigned Len = Length-From, Pos = From; - while (Len >= N) { - if (substr(Pos, N).equals(Str)) // See if this is the correct substring. - return Pos; + do { + if (std::memcmp(Start, Needle, N) == 0) + return Start - Data; // Otherwise skip the appropriate number of bytes. - uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; - Len -= Skip; - Pos += Skip; - } + Start += BadCharSkip[(uint8_t)Start[N-1]]; + } while (Start < Stop); return npos; } @@ -274,24 +281,56 @@ StringRef::size_type StringRef::find_last_not_of(StringRef Chars, } void StringRef::split(SmallVectorImpl<StringRef> &A, - StringRef Separators, int MaxSplit, + StringRef Separator, int MaxSplit, bool KeepEmpty) const { - StringRef rest = *this; - - // rest.data() is used to distinguish cases like "a," that splits into - // "a" + "" and "a" that splits into "a" + 0. 
- for (int splits = 0; - rest.data() != nullptr && (MaxSplit < 0 || splits < MaxSplit); - ++splits) { - std::pair<StringRef, StringRef> p = rest.split(Separators); - - if (KeepEmpty || p.first.size() != 0) - A.push_back(p.first); - rest = p.second; + StringRef S = *this; + + // Count down from MaxSplit. When MaxSplit is -1, this will just split + // "forever". This doesn't support splitting more than 2^31 times + // intentionally; if we ever want that we can make MaxSplit a 64-bit integer + // but that seems unlikely to be useful. + while (MaxSplit-- != 0) { + size_t Idx = S.find(Separator); + if (Idx == npos) + break; + + // Push this split. + if (KeepEmpty || Idx > 0) + A.push_back(S.slice(0, Idx)); + + // Jump forward. + S = S.slice(Idx + Separator.size(), npos); + } + + // Push the tail. + if (KeepEmpty || !S.empty()) + A.push_back(S); +} + +void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, + int MaxSplit, bool KeepEmpty) const { + StringRef S = *this; + + // Count down from MaxSplit. When MaxSplit is -1, this will just split + // "forever". This doesn't support splitting more than 2^31 times + // intentionally; if we ever want that we can make MaxSplit a 64-bit integer + // but that seems unlikely to be useful. + while (MaxSplit-- != 0) { + size_t Idx = S.find(Separator); + if (Idx == npos) + break; + + // Push this split. + if (KeepEmpty || Idx > 0) + A.push_back(S.slice(0, Idx)); + + // Jump forward. + S = S.slice(Idx + 1, npos); } - // If we have a tail left, add it. - if (rest.data() != nullptr && (rest.size() != 0 || KeepEmpty)) - A.push_back(rest); + + // Push the tail. + if (KeepEmpty || !S.empty()) + A.push_back(S); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Support/StringSaver.cpp b/contrib/llvm/lib/Support/StringSaver.cpp index d6b84e5..bbc1fd2 100644 --- a/contrib/llvm/lib/Support/StringSaver.cpp +++ b/contrib/llvm/lib/Support/StringSaver.cpp @@ -11,7 +11,7 @@ using namespace llvm; -const char *StringSaver::saveImpl(StringRef S) { +const char *StringSaver::save(StringRef S) { char *P = Alloc.Allocate<char>(S.size() + 1); memcpy(P, S.data(), S.size()); P[S.size()] = '\0'; diff --git a/contrib/llvm/lib/Support/TargetParser.cpp b/contrib/llvm/lib/Support/TargetParser.cpp index 4d4c041..337532e 100644 --- a/contrib/llvm/lib/Support/TargetParser.cpp +++ b/contrib/llvm/lib/Support/TargetParser.cpp @@ -16,9 +16,11 @@ #include "llvm/Support/TargetParser.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" #include <cctype> using namespace llvm; +using namespace ARM; namespace { @@ -26,36 +28,19 @@ namespace { // features they correspond to (use getFPUFeatures). // FIXME: TableGen this. 
// The entries must appear in the order listed in ARM::FPUKind for correct indexing -struct { - const char * Name; +static const struct { + const char *NameCStr; + size_t NameLength; ARM::FPUKind ID; ARM::FPUVersion FPUVersion; ARM::NeonSupportLevel NeonSupport; ARM::FPURestriction Restriction; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } } FPUNames[] = { - { "invalid", ARM::FK_INVALID, ARM::FV_NONE, ARM::NS_None, ARM::FR_None}, - { "none", ARM::FK_NONE, ARM::FV_NONE, ARM::NS_None, ARM::FR_None}, - { "vfp", ARM::FK_VFP, ARM::FV_VFPV2, ARM::NS_None, ARM::FR_None}, - { "vfpv2", ARM::FK_VFPV2, ARM::FV_VFPV2, ARM::NS_None, ARM::FR_None}, - { "vfpv3", ARM::FK_VFPV3, ARM::FV_VFPV3, ARM::NS_None, ARM::FR_None}, - { "vfpv3-fp16", ARM::FK_VFPV3_FP16, ARM::FV_VFPV3_FP16, ARM::NS_None, ARM::FR_None}, - { "vfpv3-d16", ARM::FK_VFPV3_D16, ARM::FV_VFPV3, ARM::NS_None, ARM::FR_D16}, - { "vfpv3-d16-fp16", ARM::FK_VFPV3_D16_FP16, ARM::FV_VFPV3_FP16, ARM::NS_None, ARM::FR_D16}, - { "vfpv3xd", ARM::FK_VFPV3XD, ARM::FV_VFPV3, ARM::NS_None, ARM::FR_SP_D16}, - { "vfpv3xd-fp16", ARM::FK_VFPV3XD_FP16, ARM::FV_VFPV3_FP16, ARM::NS_None, ARM::FR_SP_D16}, - { "vfpv4", ARM::FK_VFPV4, ARM::FV_VFPV4, ARM::NS_None, ARM::FR_None}, - { "vfpv4-d16", ARM::FK_VFPV4_D16, ARM::FV_VFPV4, ARM::NS_None, ARM::FR_D16}, - { "fpv4-sp-d16", ARM::FK_FPV4_SP_D16, ARM::FV_VFPV4, ARM::NS_None, ARM::FR_SP_D16}, - { "fpv5-d16", ARM::FK_FPV5_D16, ARM::FV_VFPV5, ARM::NS_None, ARM::FR_D16}, - { "fpv5-sp-d16", ARM::FK_FPV5_SP_D16, ARM::FV_VFPV5, ARM::NS_None, ARM::FR_SP_D16}, - { "fp-armv8", ARM::FK_FP_ARMV8, ARM::FV_VFPV5, ARM::NS_None, ARM::FR_None}, - { "neon", ARM::FK_NEON, ARM::FV_VFPV3, ARM::NS_Neon, ARM::FR_None}, - { "neon-fp16", ARM::FK_NEON_FP16, ARM::FV_VFPV3_FP16, ARM::NS_Neon, ARM::FR_None}, - { "neon-vfpv4", ARM::FK_NEON_VFPV4, ARM::FV_VFPV4, ARM::NS_Neon, ARM::FR_None}, - { "neon-fp-armv8", ARM::FK_NEON_FP_ARMV8, ARM::FV_VFPV5, ARM::NS_Neon, ARM::FR_None}, - { "crypto-neon-fp-armv8", - ARM::FK_CRYPTO_NEON_FP_ARMV8, ARM::FV_VFPV5, ARM::NS_Crypto, ARM::FR_None}, - { "softvfp", ARM::FK_SOFTVFP, ARM::FV_NONE, ARM::NS_None, ARM::FR_None}, +#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) \ + { NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION }, +#include "llvm/Support/ARMTargetParser.def" }; // List of canonical arch names (use getArchSynonym). @@ -66,165 +51,79 @@ struct { // of the triples and are not conforming with their official names. // Check to see if the expectation should be changed. // FIXME: TableGen this. -struct { - const char *Name; +static const struct { + const char *NameCStr; + size_t NameLength; + const char *CPUAttrCStr; + size_t CPUAttrLength; + const char *SubArchCStr; + size_t SubArchLength; + unsigned DefaultFPU; + unsigned ArchBaseExtensions; ARM::ArchKind ID; - const char *CPUAttr; // CPU class in build attributes. - const char *SubArch; // Sub-Arch name. ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes. + + StringRef getName() const { return StringRef(NameCStr, NameLength); } + + // CPU class in build attributes. + StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); } + + // Sub-Arch name. 
+ StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); } } ARCHNames[] = { - { "invalid", ARM::AK_INVALID, nullptr, nullptr, ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv2", ARM::AK_ARMV2, "2", "v2", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv2a", ARM::AK_ARMV2A, "2A", "v2a", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv3", ARM::AK_ARMV3, "3", "v3", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv3m", ARM::AK_ARMV3M, "3M", "v3m", ARMBuildAttrs::CPUArch::Pre_v4 }, - { "armv4", ARM::AK_ARMV4, "4", "v4", ARMBuildAttrs::CPUArch::v4 }, - { "armv4t", ARM::AK_ARMV4T, "4T", "v4t", ARMBuildAttrs::CPUArch::v4T }, - { "armv5t", ARM::AK_ARMV5T, "5T", "v5", ARMBuildAttrs::CPUArch::v5T }, - { "armv5te", ARM::AK_ARMV5TE, "5TE", "v5e", ARMBuildAttrs::CPUArch::v5TE }, - { "armv5tej", ARM::AK_ARMV5TEJ, "5TEJ", "v5e", ARMBuildAttrs::CPUArch::v5TEJ }, - { "armv6", ARM::AK_ARMV6, "6", "v6", ARMBuildAttrs::CPUArch::v6 }, - { "armv6k", ARM::AK_ARMV6K, "6K", "v6k", ARMBuildAttrs::CPUArch::v6K }, - { "armv6t2", ARM::AK_ARMV6T2, "6T2", "v6t2", ARMBuildAttrs::CPUArch::v6T2 }, - { "armv6z", ARM::AK_ARMV6Z, "6Z", "v6z", ARMBuildAttrs::CPUArch::v6KZ }, - { "armv6zk", ARM::AK_ARMV6ZK, "6ZK", "v6zk", ARMBuildAttrs::CPUArch::v6KZ }, - { "armv6-m", ARM::AK_ARMV6M, "6-M", "v6m", ARMBuildAttrs::CPUArch::v6_M }, - { "armv6s-m", ARM::AK_ARMV6SM, "6S-M", "v6sm", ARMBuildAttrs::CPUArch::v6S_M }, - { "armv7-a", ARM::AK_ARMV7A, "7-A", "v7", ARMBuildAttrs::CPUArch::v7 }, - { "armv7-r", ARM::AK_ARMV7R, "7-R", "v7r", ARMBuildAttrs::CPUArch::v7 }, - { "armv7-m", ARM::AK_ARMV7M, "7-M", "v7m", ARMBuildAttrs::CPUArch::v7 }, - { "armv7e-m", ARM::AK_ARMV7EM, "7E-M", "v7em", ARMBuildAttrs::CPUArch::v7E_M }, - { "armv8-a", ARM::AK_ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8 }, - { "armv8.1-a", ARM::AK_ARMV8_1A, "8.1-A", "v8.1a", ARMBuildAttrs::CPUArch::v8 }, - // Non-standard Arch names. - { "iwmmxt", ARM::AK_IWMMXT, "iwmmxt", "", ARMBuildAttrs::CPUArch::v5TE }, - { "iwmmxt2", ARM::AK_IWMMXT2, "iwmmxt2", "", ARMBuildAttrs::CPUArch::v5TE }, - { "xscale", ARM::AK_XSCALE, "xscale", "", ARMBuildAttrs::CPUArch::v5TE }, - { "armv5", ARM::AK_ARMV5, "5T", "v5", ARMBuildAttrs::CPUArch::v5T }, - { "armv5e", ARM::AK_ARMV5E, "5TE", "v5e", ARMBuildAttrs::CPUArch::v5TE }, - { "armv6j", ARM::AK_ARMV6J, "6J", "v6", ARMBuildAttrs::CPUArch::v6 }, - { "armv6hl", ARM::AK_ARMV6HL, "6-M", "v6hl", ARMBuildAttrs::CPUArch::v6_M }, - { "armv7", ARM::AK_ARMV7, "7", "v7", ARMBuildAttrs::CPUArch::v7 }, - { "armv7l", ARM::AK_ARMV7L, "7-L", "v7l", ARMBuildAttrs::CPUArch::v7 }, - { "armv7hl", ARM::AK_ARMV7HL, "7-L", "v7hl", ARMBuildAttrs::CPUArch::v7 }, - { "armv7s", ARM::AK_ARMV7S, "7-S", "v7s", ARMBuildAttrs::CPUArch::v7 } +#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \ + {NAME, sizeof(NAME) - 1, CPU_ATTR, sizeof(CPU_ATTR) - 1, SUB_ARCH, \ + sizeof(SUB_ARCH) - 1, ARCH_FPU, ARCH_BASE_EXT, ID, ARCH_ATTR}, +#include "llvm/Support/ARMTargetParser.def" }; + // List of Arch Extension names. // FIXME: TableGen this. 
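// A miniature of the .def-file ("X macro") pattern now used for all of
// these tables (illustrative only; FOO_ENTRY and Foo.def are hypothetical
// stand-ins for the real ARM_FPU/ARM_ARCH/... macros in
// llvm/Support/ARMTargetParser.def). Storing the length next to the
// literal lets getName() build a StringRef without a strlen() call,
// because sizeof(NAME) - 1 is a compile-time constant for a literal:
//
//   // Foo.def would contain:
//   //   FOO_ENTRY("alpha", 0)
//   //   FOO_ENTRY("beta", 1)
//   //   #undef FOO_ENTRY
//
//   static const struct {
//     const char *NameCStr;
//     size_t NameLength;
//     unsigned ID;
//     StringRef getName() const { return StringRef(NameCStr, NameLength); }
//   } FooNames[] = {
//   #define FOO_ENTRY(NAME, ID) { NAME, sizeof(NAME) - 1, ID },
//   #include "Foo.def"
//   };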
-struct { - const char *Name; - ARM::ArchExtKind ID; +static const struct { + const char *NameCStr; + size_t NameLength; + unsigned ID; + const char *Feature; + const char *NegFeature; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } } ARCHExtNames[] = { - { "invalid", ARM::AEK_INVALID }, - { "crc", ARM::AEK_CRC }, - { "crypto", ARM::AEK_CRYPTO }, - { "fp", ARM::AEK_FP }, - { "idiv", ARM::AEK_HWDIV }, - { "mp", ARM::AEK_MP }, - { "simd", ARM::AEK_SIMD }, - { "sec", ARM::AEK_SEC }, - { "virt", ARM::AEK_VIRT }, - { "os", ARM::AEK_OS }, - { "iwmmxt", ARM::AEK_IWMMXT }, - { "iwmmxt2", ARM::AEK_IWMMXT2 }, - { "maverick", ARM::AEK_MAVERICK }, - { "xscale", ARM::AEK_XSCALE } +#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \ + { NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE }, +#include "llvm/Support/ARMTargetParser.def" }; + +// List of HWDiv names (use getHWDivSynonym) and which architectural +// features they correspond to (use getHWDivFeatures). +// FIXME: TableGen this. +static const struct { + const char *NameCStr; + size_t NameLength; + unsigned ID; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } +} HWDivNames[] = { +#define ARM_HW_DIV_NAME(NAME, ID) { NAME, sizeof(NAME) - 1, ID }, +#include "llvm/Support/ARMTargetParser.def" +}; + // List of CPU names and their arches. // The same CPU can have multiple arches and can be default on multiple arches. // When finding the Arch for a CPU, first-found prevails. Sort them accordingly. // When this becomes table-generated, we'd probably need two tables. // FIXME: TableGen this. -struct { - const char *Name; +static const struct { + const char *NameCStr; + size_t NameLength; ARM::ArchKind ArchID; - bool Default; + bool Default; // is $Name the default CPU for $ArchID ? 
+ unsigned DefaultExtensions; + + StringRef getName() const { return StringRef(NameCStr, NameLength); } } CPUNames[] = { - { "arm2", ARM::AK_ARMV2, true }, - { "arm3", ARM::AK_ARMV2A, true }, - { "arm6", ARM::AK_ARMV3, true }, - { "arm7m", ARM::AK_ARMV3M, true }, - { "arm8", ARM::AK_ARMV4, false }, - { "arm810", ARM::AK_ARMV4, false }, - { "strongarm", ARM::AK_ARMV4, true }, - { "strongarm110", ARM::AK_ARMV4, false }, - { "strongarm1100", ARM::AK_ARMV4, false }, - { "strongarm1110", ARM::AK_ARMV4, false }, - { "arm7tdmi", ARM::AK_ARMV4T, true }, - { "arm7tdmi-s", ARM::AK_ARMV4T, false }, - { "arm710t", ARM::AK_ARMV4T, false }, - { "arm720t", ARM::AK_ARMV4T, false }, - { "arm9", ARM::AK_ARMV4T, false }, - { "arm9tdmi", ARM::AK_ARMV4T, false }, - { "arm920", ARM::AK_ARMV4T, false }, - { "arm920t", ARM::AK_ARMV4T, false }, - { "arm922t", ARM::AK_ARMV4T, false }, - { "arm9312", ARM::AK_ARMV4T, false }, - { "arm940t", ARM::AK_ARMV4T, false }, - { "ep9312", ARM::AK_ARMV4T, false }, - { "arm10tdmi", ARM::AK_ARMV5T, true }, - { "arm1020t", ARM::AK_ARMV5T, false }, - { "arm9e", ARM::AK_ARMV5TE, false }, - { "arm946e-s", ARM::AK_ARMV5TE, false }, - { "arm966e-s", ARM::AK_ARMV5TE, false }, - { "arm968e-s", ARM::AK_ARMV5TE, false }, - { "arm10e", ARM::AK_ARMV5TE, false }, - { "arm1020e", ARM::AK_ARMV5TE, false }, - { "arm1022e", ARM::AK_ARMV5TE, true }, - { "iwmmxt", ARM::AK_ARMV5TE, false }, - { "xscale", ARM::AK_ARMV5TE, false }, - { "arm926ej-s", ARM::AK_ARMV5TEJ, true }, - { "arm1136jf-s", ARM::AK_ARMV6, true }, - { "arm1176j-s", ARM::AK_ARMV6K, false }, - { "arm1176jz-s", ARM::AK_ARMV6K, false }, - { "mpcore", ARM::AK_ARMV6K, false }, - { "mpcorenovfp", ARM::AK_ARMV6K, false }, - { "arm1176jzf-s", ARM::AK_ARMV6K, true }, - { "arm1176jzf-s", ARM::AK_ARMV6Z, true }, - { "arm1176jzf-s", ARM::AK_ARMV6ZK, true }, - { "arm1156t2-s", ARM::AK_ARMV6T2, true }, - { "arm1156t2f-s", ARM::AK_ARMV6T2, false }, - { "cortex-m0", ARM::AK_ARMV6M, true }, - { "cortex-m0plus", ARM::AK_ARMV6M, false }, - { "cortex-m1", ARM::AK_ARMV6M, false }, - { "sc000", ARM::AK_ARMV6M, false }, - { "cortex-a5", ARM::AK_ARMV7A, false }, - { "cortex-a7", ARM::AK_ARMV7A, false }, - { "cortex-a8", ARM::AK_ARMV7A, true }, - { "cortex-a9", ARM::AK_ARMV7A, false }, - { "cortex-a12", ARM::AK_ARMV7A, false }, - { "cortex-a15", ARM::AK_ARMV7A, false }, - { "cortex-a17", ARM::AK_ARMV7A, false }, - { "krait", ARM::AK_ARMV7A, false }, - { "cortex-r4", ARM::AK_ARMV7R, true }, - { "cortex-r4f", ARM::AK_ARMV7R, false }, - { "cortex-r5", ARM::AK_ARMV7R, false }, - { "cortex-r7", ARM::AK_ARMV7R, false }, - { "sc300", ARM::AK_ARMV7M, false }, - { "cortex-m3", ARM::AK_ARMV7M, true }, - { "cortex-m4", ARM::AK_ARMV7EM, true }, - { "cortex-m7", ARM::AK_ARMV7EM, false }, - { "cortex-a53", ARM::AK_ARMV8A, true }, - { "cortex-a57", ARM::AK_ARMV8A, false }, - { "cortex-a72", ARM::AK_ARMV8A, false }, - { "cyclone", ARM::AK_ARMV8A, false }, - { "generic", ARM::AK_ARMV8_1A, true }, - // Non-standard Arch names. 
- { "iwmmxt", ARM::AK_IWMMXT, true }, - { "xscale", ARM::AK_XSCALE, true }, - { "arm10tdmi", ARM::AK_ARMV5, true }, - { "arm1022e", ARM::AK_ARMV5E, true }, - { "arm1136j-s", ARM::AK_ARMV6J, true }, - { "arm1136jz-s", ARM::AK_ARMV6J, false }, - { "cortex-m0", ARM::AK_ARMV6SM, true }, - { "arm1176jzf-s", ARM::AK_ARMV6HL, true }, - { "cortex-a8", ARM::AK_ARMV7, true }, - { "cortex-a8", ARM::AK_ARMV7L, true }, - { "cortex-a8", ARM::AK_ARMV7HL, true }, - { "cortex-m4", ARM::AK_ARMV7EM, true }, - { "swift", ARM::AK_ARMV7S, true }, - // Invalid CPU - { "invalid", ARM::AK_INVALID, true } +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + { NAME, sizeof(NAME) - 1, ID, IS_DEFAULT, DEFAULT_EXT }, +#include "llvm/Support/ARMTargetParser.def" }; } // namespace @@ -233,33 +132,93 @@ struct { // Information by ID // ======================================================= // -const char *ARMTargetParser::getFPUName(unsigned FPUKind) { +StringRef llvm::ARM::getFPUName(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) - return nullptr; - return FPUNames[FPUKind].Name; + return StringRef(); + return FPUNames[FPUKind].getName(); } -unsigned ARMTargetParser::getFPUVersion(unsigned FPUKind) { +unsigned llvm::ARM::getFPUVersion(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) return 0; return FPUNames[FPUKind].FPUVersion; } -unsigned ARMTargetParser::getFPUNeonSupportLevel(unsigned FPUKind) { +unsigned llvm::ARM::getFPUNeonSupportLevel(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) return 0; return FPUNames[FPUKind].NeonSupport; } -unsigned ARMTargetParser::getFPURestriction(unsigned FPUKind) { +unsigned llvm::ARM::getFPURestriction(unsigned FPUKind) { if (FPUKind >= ARM::FK_LAST) return 0; return FPUNames[FPUKind].Restriction; } -bool ARMTargetParser::getFPUFeatures(unsigned FPUKind, +unsigned llvm::ARM::getDefaultFPU(StringRef CPU, unsigned ArchKind) { + if (CPU == "generic") + return ARCHNames[ArchKind].DefaultFPU; + + return StringSwitch<unsigned>(CPU) +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, DEFAULT_FPU) +#include "llvm/Support/ARMTargetParser.def" + .Default(ARM::FK_INVALID); +} + +unsigned llvm::ARM::getDefaultExtensions(StringRef CPU, unsigned ArchKind) { + if (CPU == "generic") + return ARCHNames[ArchKind].ArchBaseExtensions; + + return StringSwitch<unsigned>(CPU) +#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \ + .Case(NAME, ARCHNames[ID].ArchBaseExtensions | DEFAULT_EXT) +#include "llvm/Support/ARMTargetParser.def" + .Default(ARM::AEK_INVALID); +} + +bool llvm::ARM::getHWDivFeatures(unsigned HWDivKind, + std::vector<const char *> &Features) { + + if (HWDivKind == ARM::AEK_INVALID) + return false; + + if (HWDivKind & ARM::AEK_HWDIVARM) + Features.push_back("+hwdiv-arm"); + else + Features.push_back("-hwdiv-arm"); + + if (HWDivKind & ARM::AEK_HWDIV) + Features.push_back("+hwdiv"); + else + Features.push_back("-hwdiv"); + + return true; +} + +bool llvm::ARM::getExtensionFeatures(unsigned Extensions, std::vector<const char *> &Features) { + if (Extensions == ARM::AEK_INVALID) + return false; + + if (Extensions & ARM::AEK_CRC) + Features.push_back("+crc"); + else + Features.push_back("-crc"); + + if (Extensions & ARM::AEK_DSP) + Features.push_back("+dsp"); + else + Features.push_back("-dsp"); + + return getHWDivFeatures(Extensions, Features); +} + +bool llvm::ARM::getFPUFeatures(unsigned FPUKind, + std::vector<const char *> &Features) { + if (FPUKind >= ARM::FK_LAST || FPUKind == ARM::FK_INVALID) return false; @@ 
-323,6 +282,7 @@ bool ARMTargetParser::getFPUFeatures(unsigned FPUKind, // crypto includes neon, so we handle this similarly to FPU version. switch (FPUNames[FPUKind].NeonSupport) { case ARM::NS_Crypto: + Features.push_back("+neon"); Features.push_back("+crypto"); break; case ARM::NS_Neon: @@ -338,88 +298,127 @@ bool ARMTargetParser::getFPUFeatures(unsigned FPUKind, return true; } -const char *ARMTargetParser::getArchName(unsigned ArchKind) { +StringRef llvm::ARM::getArchName(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) - return nullptr; - return ARCHNames[ArchKind].Name; + return StringRef(); + return ARCHNames[ArchKind].getName(); } -const char *ARMTargetParser::getCPUAttr(unsigned ArchKind) { +StringRef llvm::ARM::getCPUAttr(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) - return nullptr; - return ARCHNames[ArchKind].CPUAttr; + return StringRef(); + return ARCHNames[ArchKind].getCPUAttr(); } -const char *ARMTargetParser::getSubArch(unsigned ArchKind) { +StringRef llvm::ARM::getSubArch(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) - return nullptr; - return ARCHNames[ArchKind].SubArch; + return StringRef(); + return ARCHNames[ArchKind].getSubArch(); } -unsigned ARMTargetParser::getArchAttr(unsigned ArchKind) { +unsigned llvm::ARM::getArchAttr(unsigned ArchKind) { if (ArchKind >= ARM::AK_LAST) return ARMBuildAttrs::CPUArch::Pre_v4; return ARCHNames[ArchKind].ArchAttr; } -const char *ARMTargetParser::getArchExtName(unsigned ArchExtKind) { - if (ArchExtKind >= ARM::AEK_LAST) - return nullptr; - return ARCHExtNames[ArchExtKind].Name; +StringRef llvm::ARM::getArchExtName(unsigned ArchExtKind) { + for (const auto AE : ARCHExtNames) { + if (ArchExtKind == AE.ID) + return AE.getName(); + } + return StringRef(); } -const char *ARMTargetParser::getDefaultCPU(StringRef Arch) { +const char *llvm::ARM::getArchExtFeature(StringRef ArchExt) { + if (ArchExt.startswith("no")) { + StringRef ArchExtBase(ArchExt.substr(2)); + for (const auto AE : ARCHExtNames) { + if (AE.NegFeature && ArchExtBase == AE.getName()) + return AE.NegFeature; + } + } + for (const auto AE : ARCHExtNames) { + if (AE.Feature && ArchExt == AE.getName()) + return AE.Feature; + } + + return nullptr; +} + +StringRef llvm::ARM::getHWDivName(unsigned HWDivKind) { + for (const auto D : HWDivNames) { + if (HWDivKind == D.ID) + return D.getName(); + } + return StringRef(); +} + +StringRef llvm::ARM::getDefaultCPU(StringRef Arch) { unsigned AK = parseArch(Arch); if (AK == ARM::AK_INVALID) - return nullptr; + return StringRef(); // Look for multiple AKs to find the default for pair AK+Name. 
for (const auto CPU : CPUNames) { if (CPU.ArchID == AK && CPU.Default) - return CPU.Name; + return CPU.getName(); } - return nullptr; + + // If we can't find a default then target the architecture instead + return "generic"; } // ======================================================= // // Parsers // ======================================================= // -StringRef ARMTargetParser::getFPUSynonym(StringRef FPU) { +static StringRef getHWDivSynonym(StringRef HWDiv) { + return StringSwitch<StringRef>(HWDiv) + .Case("thumb,arm", "arm,thumb") + .Default(HWDiv); +} + +static StringRef getFPUSynonym(StringRef FPU) { return StringSwitch<StringRef>(FPU) - .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported - .Case("vfp2", "vfpv2") - .Case("vfp3", "vfpv3") - .Case("vfp4", "vfpv4") - .Case("vfp3-d16", "vfpv3-d16") - .Case("vfp4-d16", "vfpv4-d16") - .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16") - .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16") - .Case("fp5-sp-d16", "fpv5-sp-d16") - .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16") - // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3. - .Case("neon-vfpv3", "neon") - .Default(FPU); + .Cases("fpa", "fpe2", "fpe3", "maverick", "invalid") // Unsupported + .Case("vfp2", "vfpv2") + .Case("vfp3", "vfpv3") + .Case("vfp4", "vfpv4") + .Case("vfp3-d16", "vfpv3-d16") + .Case("vfp4-d16", "vfpv4-d16") + .Cases("fp4-sp-d16", "vfpv4-sp-d16", "fpv4-sp-d16") + .Cases("fp4-dp-d16", "fpv4-dp-d16", "vfpv4-d16") + .Case("fp5-sp-d16", "fpv5-sp-d16") + .Cases("fp5-dp-d16", "fpv5-dp-d16", "fpv5-d16") + // FIXME: Clang uses it, but it's bogus, since neon defaults to vfpv3. + .Case("neon-vfpv3", "neon") + .Default(FPU); } -StringRef ARMTargetParser::getArchSynonym(StringRef Arch) { +static StringRef getArchSynonym(StringRef Arch) { return StringSwitch<StringRef>(Arch) - .Case("v6sm", "v6s-m") - .Case("v6m", "v6-m") - .Case("v7a", "v7-a") - .Case("v7r", "v7-r") - .Case("v7m", "v7-m") - .Case("v7em", "v7e-m") - .Cases("v8", "v8a", "aarch64", "arm64", "v8-a") - .Case("v8.1a", "v8.1-a") - .Default(Arch); + .Case("v5", "v5t") + .Case("v5e", "v5te") + .Case("v6j", "v6") + .Case("v6hl", "v6k") + .Cases("v6m", "v6sm", "v6s-m", "v6-m") + .Cases("v6z", "v6zk", "v6kz") + .Cases("v7", "v7a", "v7hl", "v7l", "v7-a") + .Case("v7r", "v7-r") + .Case("v7m", "v7-m") + .Case("v7em", "v7e-m") + .Cases("v8", "v8a", "aarch64", "arm64", "v8-a") + .Case("v8.1a", "v8.1-a") + .Case("v8.2a", "v8.2-a") + .Default(Arch); } // MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but // (iwmmxt|xscale)(eb)? is also permitted. If the former, return // "v.+", if the latter, return unmodified string, minus 'eb'. // If invalid, return empty string. -StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { +StringRef llvm::ARM::getCanonicalArchName(StringRef Arch) { size_t offset = StringRef::npos; StringRef A = Arch; StringRef Error = ""; @@ -436,7 +435,7 @@ StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { // AArch64 uses "_be", not "eb" suffix. if (A.find("eb") != StringRef::npos) return Error; - if (A.substr(offset,3) == "_be") + if (A.substr(offset, 3) == "_be") offset += 3; } @@ -456,7 +455,7 @@ StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { // Only match non-marketing names if (offset != StringRef::npos) { - // Must start with 'vN'. + // Must start with 'vN'. if (A[0] != 'v' || !std::isdigit(A[1])) return Error; // Can't have an extra 'eb'. 
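// Behavior sketch for the new lookup helpers above (illustrative, not from
// the commit; assumes the .def table maps "crc" to "+crc"/"-crc"):
//
//   ARM::getArchExtFeature("crc");   // "+crc"
//   ARM::getArchExtFeature("nocrc"); // "-crc"  (a "no" prefix negates)
//   ARM::getArchExtFeature("bogus"); // nullptr (unknown extension)
//   ARM::getDefaultCPU("armv8.1-a"); // the CPU marked default for the
//                                    // arch, otherwise "generic"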
@@ -468,56 +467,64 @@ StringRef ARMTargetParser::getCanonicalArchName(StringRef Arch) { return A; } -unsigned ARMTargetParser::parseFPU(StringRef FPU) { +unsigned llvm::ARM::parseHWDiv(StringRef HWDiv) { + StringRef Syn = getHWDivSynonym(HWDiv); + for (const auto D : HWDivNames) { + if (Syn == D.getName()) + return D.ID; + } + return ARM::AEK_INVALID; +} + +unsigned llvm::ARM::parseFPU(StringRef FPU) { StringRef Syn = getFPUSynonym(FPU); for (const auto F : FPUNames) { - if (Syn == F.Name) + if (Syn == F.getName()) return F.ID; } return ARM::FK_INVALID; } // Allows partial match, ex. "v7a" matches "armv7a". -unsigned ARMTargetParser::parseArch(StringRef Arch) { +unsigned llvm::ARM::parseArch(StringRef Arch) { Arch = getCanonicalArchName(Arch); StringRef Syn = getArchSynonym(Arch); for (const auto A : ARCHNames) { - if (StringRef(A.Name).endswith(Syn)) + if (A.getName().endswith(Syn)) return A.ID; } return ARM::AK_INVALID; } -unsigned ARMTargetParser::parseArchExt(StringRef ArchExt) { +unsigned llvm::ARM::parseArchExt(StringRef ArchExt) { for (const auto A : ARCHExtNames) { - if (ArchExt == A.Name) + if (ArchExt == A.getName()) return A.ID; } return ARM::AEK_INVALID; } -unsigned ARMTargetParser::parseCPUArch(StringRef CPU) { +unsigned llvm::ARM::parseCPUArch(StringRef CPU) { for (const auto C : CPUNames) { - if (CPU == C.Name) + if (CPU == C.getName()) return C.ArchID; } return ARM::AK_INVALID; } // ARM, Thumb, AArch64 -unsigned ARMTargetParser::parseArchISA(StringRef Arch) { +unsigned llvm::ARM::parseArchISA(StringRef Arch) { return StringSwitch<unsigned>(Arch) .StartsWith("aarch64", ARM::IK_AARCH64) - .StartsWith("arm64", ARM::IK_AARCH64) - .StartsWith("thumb", ARM::IK_THUMB) - .StartsWith("arm", ARM::IK_ARM) + .StartsWith("arm64", ARM::IK_AARCH64) + .StartsWith("thumb", ARM::IK_THUMB) + .StartsWith("arm", ARM::IK_ARM) .Default(ARM::EK_INVALID); } // Little/Big endian -unsigned ARMTargetParser::parseArchEndian(StringRef Arch) { - if (Arch.startswith("armeb") || - Arch.startswith("thumbeb") || +unsigned llvm::ARM::parseArchEndian(StringRef Arch) { + if (Arch.startswith("armeb") || Arch.startswith("thumbeb") || Arch.startswith("aarch64_be")) return ARM::EK_BIG; @@ -535,29 +542,29 @@ unsigned ARMTargetParser::parseArchEndian(StringRef Arch) { } // Profile A/R/M -unsigned ARMTargetParser::parseArchProfile(StringRef Arch) { +unsigned llvm::ARM::parseArchProfile(StringRef Arch) { Arch = getCanonicalArchName(Arch); - switch(parseArch(Arch)) { + switch (parseArch(Arch)) { case ARM::AK_ARMV6M: case ARM::AK_ARMV7M: - case ARM::AK_ARMV6SM: case ARM::AK_ARMV7EM: return ARM::PK_M; case ARM::AK_ARMV7R: return ARM::PK_R; - case ARM::AK_ARMV7: case ARM::AK_ARMV7A: + case ARM::AK_ARMV7K: case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return ARM::PK_A; } return ARM::PK_INVALID; } // Version number (ex. v7 = 7). 
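// How the renamed llvm::ARM::parse* family composes (an illustrative
// sketch, not part of the diff): inputs are canonicalized first, then
// matched against the tables via synonyms:
//
//   ARM::getCanonicalArchName("armv7a");  // "v7a"
//   ARM::parseArch("armv7a");             // ARM::AK_ARMV7A (synonym "v7-a")
//   ARM::parseArchEndian("armeb");        // ARM::EK_BIG
//   ARM::parseArchProfile("armv7a");      // ARM::PK_A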
-unsigned ARMTargetParser::parseArchVersion(StringRef Arch) { +unsigned llvm::ARM::parseArchVersion(StringRef Arch) { Arch = getCanonicalArchName(Arch); - switch(parseArch(Arch)) { + switch (parseArch(Arch)) { case ARM::AK_ARMV2: case ARM::AK_ARMV2A: return 2; @@ -567,36 +574,29 @@ unsigned ARMTargetParser::parseArchVersion(StringRef Arch) { case ARM::AK_ARMV4: case ARM::AK_ARMV4T: return 4; - case ARM::AK_ARMV5: case ARM::AK_ARMV5T: case ARM::AK_ARMV5TE: case ARM::AK_IWMMXT: case ARM::AK_IWMMXT2: case ARM::AK_XSCALE: - case ARM::AK_ARMV5E: case ARM::AK_ARMV5TEJ: return 5; case ARM::AK_ARMV6: - case ARM::AK_ARMV6J: case ARM::AK_ARMV6K: case ARM::AK_ARMV6T2: - case ARM::AK_ARMV6Z: - case ARM::AK_ARMV6ZK: + case ARM::AK_ARMV6KZ: case ARM::AK_ARMV6M: - case ARM::AK_ARMV6SM: - case ARM::AK_ARMV6HL: return 6; - case ARM::AK_ARMV7: case ARM::AK_ARMV7A: case ARM::AK_ARMV7R: case ARM::AK_ARMV7M: - case ARM::AK_ARMV7L: - case ARM::AK_ARMV7HL: case ARM::AK_ARMV7S: case ARM::AK_ARMV7EM: + case ARM::AK_ARMV7K: return 7; case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: return 8; } return 0; diff --git a/contrib/llvm/lib/Support/ThreadPool.cpp b/contrib/llvm/lib/Support/ThreadPool.cpp new file mode 100644 index 0000000..d4dcb2e --- /dev/null +++ b/contrib/llvm/lib/Support/ThreadPool.cpp @@ -0,0 +1,155 @@ +//==-- llvm/Support/ThreadPool.cpp - A ThreadPool implementation -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a crude C++11 based thread pool. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ThreadPool.h" + +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#if LLVM_ENABLE_THREADS + +// Default to std::thread::hardware_concurrency +ThreadPool::ThreadPool() : ThreadPool(std::thread::hardware_concurrency()) {} + +ThreadPool::ThreadPool(unsigned ThreadCount) + : ActiveThreads(0), EnableFlag(true) { + // Create ThreadCount threads that will loop forever, wait on QueueCondition + // for tasks to be queued or the Pool to be destroyed. + Threads.reserve(ThreadCount); + for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) { + Threads.emplace_back([&] { + while (true) { + PackagedTaskTy Task; + { + std::unique_lock<std::mutex> LockGuard(QueueLock); + // Wait for tasks to be pushed in the queue + QueueCondition.wait(LockGuard, + [&] { return !EnableFlag || !Tasks.empty(); }); + // Exit condition + if (!EnableFlag && Tasks.empty()) + return; + // Yeah, we have a task, grab it and release the lock on the queue + + // We first need to signal that we are active before popping the queue + // in order for wait() to properly detect that even if the queue is + // empty, there is still a task in flight. 
+ { + ++ActiveThreads; + std::unique_lock<std::mutex> LockGuard(CompletionLock); + } + Task = std::move(Tasks.front()); + Tasks.pop(); + } + // Run the task we just grabbed +#ifndef _MSC_VER + Task(); +#else + Task(/* unused */ false); +#endif + + { + // Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait() + std::unique_lock<std::mutex> LockGuard(CompletionLock); + --ActiveThreads; + } + + // Notify task completion, in case someone waits on ThreadPool::wait() + CompletionCondition.notify_all(); + } + }); + } +} + +void ThreadPool::wait() { + // Wait for all threads to complete and the queue to be empty + std::unique_lock<std::mutex> LockGuard(CompletionLock); + CompletionCondition.wait(LockGuard, + [&] { return Tasks.empty() && !ActiveThreads; }); +} + +std::shared_future<ThreadPool::VoidTy> ThreadPool::asyncImpl(TaskTy Task) { + /// Wrap the Task in a packaged_task to return a future object. + PackagedTaskTy PackagedTask(std::move(Task)); + auto Future = PackagedTask.get_future(); + { + // Lock the queue and push the new task + std::unique_lock<std::mutex> LockGuard(QueueLock); + + // Don't allow enqueueing after disabling the pool + assert(EnableFlag && "Queuing a thread during ThreadPool destruction"); + + Tasks.push(std::move(PackagedTask)); + } + QueueCondition.notify_one(); + return Future.share(); +} + +// The destructor joins all threads, waiting for completion. +ThreadPool::~ThreadPool() { + { + std::unique_lock<std::mutex> LockGuard(QueueLock); + EnableFlag = false; + } + QueueCondition.notify_all(); + for (auto &Worker : Threads) + Worker.join(); +} + +#else // LLVM_ENABLE_THREADS Disabled + +ThreadPool::ThreadPool() : ThreadPool(0) {} + +// No threads are launched, issue a warning if ThreadCount is not 0 +ThreadPool::ThreadPool(unsigned ThreadCount) + : ActiveThreads(0) { + if (ThreadCount) { + errs() << "Warning: request a ThreadPool with " << ThreadCount + << " threads, but LLVM_ENABLE_THREADS has been turned off\n"; + } +} + +void ThreadPool::wait() { + // Sequential implementation running the tasks + while (!Tasks.empty()) { + auto Task = std::move(Tasks.front()); + Tasks.pop(); +#ifndef _MSC_VER + Task(); +#else + Task(/* unused */ false); +#endif + } +} + +std::shared_future<ThreadPool::VoidTy> ThreadPool::asyncImpl(TaskTy Task) { +#ifndef _MSC_VER + // Get a Future with launch::deferred execution using std::async + auto Future = std::async(std::launch::deferred, std::move(Task)).share(); + // Wrap the future so that both ThreadPool::wait() can operate and the + // returned future can be sync'ed on. 
+ PackagedTaskTy PackagedTask([Future]() { Future.get(); }); +#else + auto Future = std::async(std::launch::deferred, std::move(Task), false).share(); + PackagedTaskTy PackagedTask([Future](bool) -> bool { Future.get(); return false; }); +#endif + Tasks.push(std::move(PackagedTask)); + return Future; +} + +ThreadPool::~ThreadPool() { + wait(); +} + +#endif diff --git a/contrib/llvm/lib/Support/TimeValue.cpp b/contrib/llvm/lib/Support/TimeValue.cpp index 136b93e..94a4c01 100644 --- a/contrib/llvm/lib/Support/TimeValue.cpp +++ b/contrib/llvm/lib/Support/TimeValue.cpp @@ -15,6 +15,7 @@ #include "llvm/Config/config.h" namespace llvm { + using namespace sys; const TimeValue::SecondsType @@ -22,8 +23,7 @@ const TimeValue::SecondsType const TimeValue::SecondsType TimeValue::Win32ZeroTimeSeconds = -12591158400ULL; -void -TimeValue::normalize( void ) { +void TimeValue::normalize() { if ( nanos_ >= NANOSECONDS_PER_SECOND ) { do { seconds_++; @@ -45,7 +45,7 @@ TimeValue::normalize( void ) { } } -} +} // namespace llvm /// Include the platform-specific portion of TimeValue class #ifdef LLVM_ON_UNIX diff --git a/contrib/llvm/lib/Support/Timer.cpp b/contrib/llvm/lib/Support/Timer.cpp index d7b6515..414f559 100644 --- a/contrib/llvm/lib/Support/Timer.cpp +++ b/contrib/llvm/lib/Support/Timer.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Timer.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -22,9 +23,6 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -// CreateInfoOutputFile - Return a file stream to print our output on. -namespace llvm { extern raw_ostream *CreateInfoOutputFile(); } - // getLibSupportInfoOutputFilename - This ugly hack is brought to you courtesy // of constructor/destructor ordering being unspecified by C++. Basically the // problem is that a Statistic object gets destroyed, which ends up calling @@ -52,28 +50,27 @@ namespace { cl::Hidden, cl::location(getLibSupportInfoOutputFilename())); } -// CreateInfoOutputFile - Return a file stream to print our output on. -raw_ostream *llvm::CreateInfoOutputFile() { +// Return a file stream to print our output on. +std::unique_ptr<raw_fd_ostream> llvm::CreateInfoOutputFile() { const std::string &OutputFilename = getLibSupportInfoOutputFilename(); if (OutputFilename.empty()) - return new raw_fd_ostream(2, false); // stderr. + return llvm::make_unique<raw_fd_ostream>(2, false); // stderr. if (OutputFilename == "-") - return new raw_fd_ostream(1, false); // stdout. - + return llvm::make_unique<raw_fd_ostream>(1, false); // stdout. + // Append mode is used because the info output file is opened and closed // each time -stats or -time-passes wants to print output to it. To // compensate for this, the test-suite Makefiles have code to delete the // info output file before running commands which write to it. std::error_code EC; - raw_ostream *Result = new raw_fd_ostream(OutputFilename, EC, - sys::fs::F_Append | sys::fs::F_Text); + auto Result = llvm::make_unique<raw_fd_ostream>( + OutputFilename, EC, sys::fs::F_Append | sys::fs::F_Text); if (!EC) return Result; - + errs() << "Error opening info-output-file '" << OutputFilename << " for appending!\n"; - delete Result; - return new raw_fd_ostream(2, false); // stderr. + return llvm::make_unique<raw_fd_ostream>(2, false); // stderr. 
} @@ -99,17 +96,13 @@ static TimerGroup *getDefaultTimerGroup() { //===----------------------------------------------------------------------===// void Timer::init(StringRef N) { - assert(!TG && "Timer already initialized"); - Name.assign(N.begin(), N.end()); - Started = false; - TG = getDefaultTimerGroup(); - TG->addTimer(*this); + init(N, *getDefaultTimerGroup()); } void Timer::init(StringRef N, TimerGroup &tg) { assert(!TG && "Timer already initialized"); Name.assign(N.begin(), N.end()); - Started = false; + Running = Triggered = false; TG = &tg; TG->addTimer(*this); } @@ -142,25 +135,22 @@ TimeRecord TimeRecord::getCurrentTime(bool Start) { return Result; } -static ManagedStatic<std::vector<Timer*> > ActiveTimers; - void Timer::startTimer() { - Started = true; - ActiveTimers->push_back(this); - Time -= TimeRecord::getCurrentTime(true); + assert(!Running && "Cannot start a running timer"); + Running = Triggered = true; + StartTime = TimeRecord::getCurrentTime(true); } void Timer::stopTimer() { + assert(Running && "Cannot stop a paused timer"); + Running = false; Time += TimeRecord::getCurrentTime(false); + Time -= StartTime; +} - if (ActiveTimers->back() == this) { - ActiveTimers->pop_back(); - } else { - std::vector<Timer*>::iterator I = - std::find(ActiveTimers->begin(), ActiveTimers->end(), this); - assert(I != ActiveTimers->end() && "stop but no startTimer?"); - ActiveTimers->erase(I); - } +void Timer::clear() { + Running = Triggered = false; + Time = StartTime = TimeRecord(); } static void printVal(double Val, double Total, raw_ostream &OS) { @@ -278,8 +268,8 @@ void TimerGroup::removeTimer(Timer &T) { sys::SmartScopedLock<true> L(*TimerLock); // If the timer was started, move its data to TimersToPrint. - if (T.Started) - TimersToPrint.push_back(std::make_pair(T.Time, T.Name)); + if (T.hasTriggered()) + TimersToPrint.emplace_back(T.Time, T.Name); T.TG = nullptr; @@ -292,10 +282,9 @@ void TimerGroup::removeTimer(Timer &T) { // them were started. if (FirstTimer || TimersToPrint.empty()) return; - - raw_ostream *OutStream = CreateInfoOutputFile(); + + std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile(); PrintQueuedTimers(*OutStream); - delete OutStream; // Close the file. } void TimerGroup::addTimer(Timer &T) { @@ -314,8 +303,8 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) { std::sort(TimersToPrint.begin(), TimersToPrint.end()); TimeRecord Total; - for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) - Total += TimersToPrint[i].first; + for (auto &RecordNamePair : TimersToPrint) + Total += RecordNamePair.first; // Print out timing header. OS << "===" << std::string(73, '-') << "===\n"; @@ -365,12 +354,11 @@ void TimerGroup::print(raw_ostream &OS) { // See if any of our timers were started, if so add them to TimersToPrint and // reset them. for (Timer *T = FirstTimer; T; T = T->Next) { - if (!T->Started) continue; - TimersToPrint.push_back(std::make_pair(T->Time, T->Name)); + if (!T->hasTriggered()) continue; + TimersToPrint.emplace_back(T->Time, T->Name); // Clear out the time. - T->Started = 0; - T->Time = TimeRecord(); + T->clear(); } // If any timers were started, print the group. 
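// Usage sketch for the ThreadPool added in Support/ThreadPool.cpp above
// (illustrative, not part of the diff; async() is the header-side wrapper
// around asyncImpl() and returns a std::shared_future<void>, modulo the
// MSVC workaround visible above):

#include "llvm/Support/ThreadPool.h"
#include <atomic>

static void poolExample() {
  std::atomic<int> Sum(0);
  llvm::ThreadPool Pool(4); // the default ctor uses hardware_concurrency()
  for (int I = 1; I <= 10; ++I)
    Pool.async([&Sum, I] { Sum += I; });
  Pool.wait(); // blocks until the queue drains and no task is in flight
  // Sum == 55 here; the destructor would likewise join all workers.
}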
diff --git a/contrib/llvm/lib/Support/Triple.cpp b/contrib/llvm/lib/Support/Triple.cpp index c6646fb..0e5d3ac 100644 --- a/contrib/llvm/lib/Support/Triple.cpp +++ b/contrib/llvm/lib/Support/Triple.cpp @@ -25,6 +25,7 @@ const char *Triple::getArchTypeName(ArchType Kind) { case aarch64_be: return "aarch64_be"; case arm: return "arm"; case armeb: return "armeb"; + case avr: return "avr"; case bpfel: return "bpfel"; case bpfeb: return "bpfeb"; case hexagon: return "hexagon"; @@ -80,6 +81,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case thumb: case thumbeb: return "arm"; + case avr: return "avr"; + case ppc64: case ppc64le: case ppc: return "ppc"; @@ -124,8 +127,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case spir64: return "spir"; case kalimba: return "kalimba"; case shave: return "shave"; - case wasm32: return "wasm32"; - case wasm64: return "wasm64"; + case wasm32: + case wasm64: return "wasm"; } } @@ -144,6 +147,7 @@ const char *Triple::getVendorTypeName(VendorType Kind) { case MipsTechnologies: return "mti"; case NVIDIA: return "nvidia"; case CSR: return "csr"; + case Myriad: return "myriad"; } llvm_unreachable("Invalid VendorType!"); @@ -177,6 +181,9 @@ const char *Triple::getOSTypeName(OSType Kind) { case NVCL: return "nvcl"; case AMDHSA: return "amdhsa"; case PS4: return "ps4"; + case ELFIAMCU: return "elfiamcu"; + case TvOS: return "tvos"; + case WatchOS: return "watchos"; } llvm_unreachable("Invalid OSType"); @@ -196,6 +203,8 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { case MSVC: return "msvc"; case Itanium: return "itanium"; case Cygnus: return "cygnus"; + case AMDOpenCL: return "amdopencl"; + case CoreCLR: return "coreclr"; } llvm_unreachable("Invalid EnvironmentType!"); @@ -224,6 +233,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" .Case("arm", arm) .Case("armeb", armeb) + .Case("avr", avr) .StartsWith("bpf", BPFArch) .Case("mips", mips) .Case("mipsel", mipsel) @@ -265,8 +275,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { } static Triple::ArchType parseARMArch(StringRef ArchName) { - unsigned ISA = ARMTargetParser::parseArchISA(ArchName); - unsigned ENDIAN = ARMTargetParser::parseArchEndian(ArchName); + unsigned ISA = ARM::parseArchISA(ArchName); + unsigned ENDIAN = ARM::parseArchEndian(ArchName); Triple::ArchType arch = Triple::UnknownArch; switch (ENDIAN) { @@ -300,7 +310,7 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } } - ArchName = ARMTargetParser::getCanonicalArchName(ArchName); + ArchName = ARM::getCanonicalArchName(ArchName); if (ArchName.empty()) return Triple::UnknownArch; @@ -310,8 +320,8 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { return Triple::UnknownArch; // Thumb only for v6m - unsigned Profile = ARMTargetParser::parseArchProfile(ArchName); - unsigned Version = ARMTargetParser::parseArchVersion(ArchName); + unsigned Profile = ARM::parseArchProfile(ArchName); + unsigned Version = ARM::parseArchVersion(ArchName); if (Profile == ARM::PK_M && Version == 6) { if (ENDIAN == ARM::EK_BIG) return Triple::thumbeb; @@ -323,10 +333,7 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } static Triple::ArchType parseArch(StringRef ArchName) { - Triple::ArchType ARMArch(parseARMArch(ArchName)); - Triple::ArchType BPFArch(parseBPFArch(ArchName)); - - return StringSwitch<Triple::ArchType>(ArchName) + auto AT = StringSwitch<Triple::ArchType>(ArchName) .Cases("i386", "i486", 
"i586", "i686", Triple::x86) // FIXME: Do we need to support these? .Cases("i786", "i886", "i986", Triple::x86) @@ -336,9 +343,14 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("powerpc64le", Triple::ppc64le) .Case("xscale", Triple::arm) .Case("xscaleeb", Triple::armeb) - .StartsWith("arm", ARMArch) - .StartsWith("thumb", ARMArch) - .StartsWith("aarch64", ARMArch) + .Case("aarch64", Triple::aarch64) + .Case("aarch64_be", Triple::aarch64_be) + .Case("arm64", Triple::aarch64) + .Case("arm", Triple::arm) + .Case("armeb", Triple::armeb) + .Case("thumb", Triple::thumb) + .Case("thumbeb", Triple::thumbeb) + .Case("avr", Triple::avr) .Case("msp430", Triple::msp430) .Cases("mips", "mipseb", "mipsallegrex", Triple::mips) .Cases("mipsel", "mipsallegrexel", Triple::mipsel) @@ -346,7 +358,6 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("mips64el", Triple::mips64el) .Case("r600", Triple::r600) .Case("amdgcn", Triple::amdgcn) - .StartsWith("bpf", BPFArch) .Case("hexagon", Triple::hexagon) .Case("s390x", Triple::systemz) .Case("sparc", Triple::sparc) @@ -369,6 +380,18 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("wasm32", Triple::wasm32) .Case("wasm64", Triple::wasm64) .Default(Triple::UnknownArch); + + // Some architectures require special parsing logic just to compute the + // ArchType result. + if (AT == Triple::UnknownArch) { + if (ArchName.startswith("arm") || ArchName.startswith("thumb") || + ArchName.startswith("aarch64")) + return parseARMArch(ArchName); + if (ArchName.startswith("bpf")) + return parseBPFArch(ArchName); + } + + return AT; } static Triple::VendorType parseVendor(StringRef VendorName) { @@ -384,6 +407,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("mti", Triple::MipsTechnologies) .Case("nvidia", Triple::NVIDIA) .Case("csr", Triple::CSR) + .Case("myriad", Triple::Myriad) .Default(Triple::UnknownVendor); } @@ -414,6 +438,9 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("nvcl", Triple::NVCL) .StartsWith("amdhsa", Triple::AMDHSA) .StartsWith("ps4", Triple::PS4) + .StartsWith("elfiamcu", Triple::ELFIAMCU) + .StartsWith("tvos", Triple::TvOS) + .StartsWith("watchos", Triple::WatchOS) .Default(Triple::UnknownOS); } @@ -430,6 +457,8 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("msvc", Triple::MSVC) .StartsWith("itanium", Triple::Itanium) .StartsWith("cygnus", Triple::Cygnus) + .StartsWith("amdopencl", Triple::AMDOpenCL) + .StartsWith("coreclr", Triple::CoreCLR) .Default(Triple::UnknownEnvironment); } @@ -442,7 +471,7 @@ static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) { } static Triple::SubArchType parseSubArch(StringRef SubArchName) { - StringRef ARMSubArch = ARMTargetParser::getCanonicalArchName(SubArchName); + StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName); // For now, this is the small part. Early return. if (ARMSubArch.empty()) @@ -453,14 +482,12 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { .Default(Triple::NoSubArch); // ARM sub arch. 
- switch(ARMTargetParser::parseArch(ARMSubArch)) { + switch(ARM::parseArch(ARMSubArch)) { case ARM::AK_ARMV4: return Triple::NoSubArch; case ARM::AK_ARMV4T: return Triple::ARMSubArch_v4t; - case ARM::AK_ARMV5: case ARM::AK_ARMV5T: - case ARM::AK_ARMV5E: return Triple::ARMSubArch_v5; case ARM::AK_ARMV5TE: case ARM::AK_IWMMXT: @@ -469,24 +496,19 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { case ARM::AK_ARMV5TEJ: return Triple::ARMSubArch_v5te; case ARM::AK_ARMV6: - case ARM::AK_ARMV6J: - case ARM::AK_ARMV6Z: return Triple::ARMSubArch_v6; case ARM::AK_ARMV6K: - case ARM::AK_ARMV6ZK: - case ARM::AK_ARMV6HL: + case ARM::AK_ARMV6KZ: return Triple::ARMSubArch_v6k; case ARM::AK_ARMV6T2: return Triple::ARMSubArch_v6t2; case ARM::AK_ARMV6M: - case ARM::AK_ARMV6SM: return Triple::ARMSubArch_v6m; - case ARM::AK_ARMV7: case ARM::AK_ARMV7A: case ARM::AK_ARMV7R: - case ARM::AK_ARMV7L: - case ARM::AK_ARMV7HL: return Triple::ARMSubArch_v7; + case ARM::AK_ARMV7K: + return Triple::ARMSubArch_v7k; case ARM::AK_ARMV7M: return Triple::ARMSubArch_v7m; case ARM::AK_ARMV7S: @@ -497,6 +519,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { return Triple::ARMSubArch_v8; case ARM::AK_ARMV8_1A: return Triple::ARMSubArch_v8_1a; + case ARM::AK_ARMV8_2A: + return Triple::ARMSubArch_v8_2a; default: return Triple::NoSubArch; } @@ -514,20 +538,53 @@ static const char *getObjectFormatTypeName(Triple::ObjectFormatType Kind) { static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { switch (T.getArch()) { - default: - break; + case Triple::UnknownArch: + case Triple::aarch64: + case Triple::arm: + case Triple::thumb: + case Triple::x86: + case Triple::x86_64: + if (T.isOSDarwin()) + return Triple::MachO; + else if (T.isOSWindows()) + return Triple::COFF; + return Triple::ELF; + + case Triple::aarch64_be: + case Triple::amdgcn: + case Triple::amdil: + case Triple::amdil64: + case Triple::armeb: + case Triple::avr: + case Triple::bpfeb: + case Triple::bpfel: case Triple::hexagon: + case Triple::hsail: + case Triple::hsail64: + case Triple::kalimba: + case Triple::le32: + case Triple::le64: case Triple::mips: - case Triple::mipsel: case Triple::mips64: case Triple::mips64el: + case Triple::mipsel: + case Triple::msp430: + case Triple::nvptx: + case Triple::nvptx64: + case Triple::ppc64le: case Triple::r600: - case Triple::amdgcn: + case Triple::shave: case Triple::sparc: + case Triple::sparcel: case Triple::sparcv9: + case Triple::spir: + case Triple::spir64: case Triple::systemz: + case Triple::tce: + case Triple::thumbeb: + case Triple::wasm32: + case Triple::wasm64: case Triple::xcore: - case Triple::ppc64le: return Triple::ELF; case Triple::ppc: @@ -536,12 +593,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { return Triple::MachO; return Triple::ELF; } - - if (T.isOSDarwin()) - return Triple::MachO; - else if (T.isOSWindows()) - return Triple::COFF; - return Triple::ELF; + llvm_unreachable("unknown architecture"); } /// \brief Construct a triple from the string representation provided. @@ -549,14 +601,27 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { /// This stores the string representation and parses the various pieces into /// enum members. 
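// What the hand-rolled component split in the constructor below yields
// (illustrative): with MaxSplit = 3, anything past the third '-' stays
// glued to the environment component instead of being dropped.
//
//   SmallVector<StringRef, 4> Components;
//   StringRef("armv7-none-linux-gnueabihf").split(Components, '-',
//                                                 /*MaxSplit*/ 3);
//   // Components == {"armv7", "none", "linux", "gnueabihf"}
//   // "a-b-c-d-e" would yield {"a", "b", "c", "d-e"}.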
Triple::Triple(const Twine &Str) - : Data(Str.str()), - Arch(parseArch(getArchName())), - SubArch(parseSubArch(getArchName())), - Vendor(parseVendor(getVendorName())), - OS(parseOS(getOSName())), - Environment(parseEnvironment(getEnvironmentName())), - ObjectFormat(parseFormat(getEnvironmentName())) { - if (ObjectFormat == Triple::UnknownObjectFormat) + : Data(Str.str()), Arch(UnknownArch), SubArch(NoSubArch), + Vendor(UnknownVendor), OS(UnknownOS), Environment(UnknownEnvironment), + ObjectFormat(UnknownObjectFormat) { + // Do minimal parsing by hand here. + SmallVector<StringRef, 4> Components; + StringRef(Data).split(Components, '-', /*MaxSplit*/ 3); + if (Components.size() > 0) { + Arch = parseArch(Components[0]); + SubArch = parseSubArch(Components[0]); + if (Components.size() > 1) { + Vendor = parseVendor(Components[1]); + if (Components.size() > 2) { + OS = parseOS(Components[2]); + if (Components.size() > 3) { + Environment = parseEnvironment(Components[3]); + ObjectFormat = parseFormat(Components[3]); + } + } + } + } + if (ObjectFormat == UnknownObjectFormat) ObjectFormat = getDefaultFormat(*this); } @@ -601,7 +666,7 @@ std::string Triple::normalize(StringRef Str) { // Parse into components. SmallVector<StringRef, 4> Components; - Str.split(Components, "-"); + Str.split(Components, '-'); // If the first component corresponds to a known architecture, preferentially // use it for the architecture. If the second component corresponds to a @@ -889,6 +954,8 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, return false; break; case IOS: + case TvOS: + case WatchOS: // Ignore the version from the triple. This is only handled because the // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the OS X version number even when targeting @@ -916,11 +983,38 @@ void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, Micro = 0; break; case IOS: + case TvOS: getOSVersion(Major, Minor, Micro); // Default to 5.0 (or 7.0 for arm64). if (Major == 0) Major = (getArch() == aarch64) ? 7 : 5; break; + case WatchOS: + llvm_unreachable("conflicting triple info"); + } +} + +void Triple::getWatchOSVersion(unsigned &Major, unsigned &Minor, + unsigned &Micro) const { + switch (getOS()) { + default: llvm_unreachable("unexpected OS for Darwin triple"); + case Darwin: + case MacOSX: + // Ignore the version from the triple. This is only handled because the + // the clang driver combines OS X and IOS support into a common Darwin + // toolchain that wants to know the iOS version number even when targeting + // OS X. + Major = 2; + Minor = 0; + Micro = 0; + break; + case WatchOS: + getOSVersion(Major, Minor, Micro); + if (Major == 0) + Major = 2; + break; + case IOS: + llvm_unreachable("conflicting triple info"); } } @@ -993,6 +1087,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::UnknownArch: return 0; + case llvm::Triple::avr: case llvm::Triple::msp430: return 16; @@ -1059,9 +1154,8 @@ Triple Triple::get32BitArchVariant() const { Triple T(*this); switch (getArch()) { case Triple::UnknownArch: - case Triple::aarch64: - case Triple::aarch64_be: case Triple::amdgcn: + case Triple::avr: case Triple::bpfel: case Triple::bpfeb: case Triple::msp430: @@ -1095,17 +1189,19 @@ Triple Triple::get32BitArchVariant() const { // Already 32-bit. 
break; - case Triple::le64: T.setArch(Triple::le32); break; - case Triple::mips64: T.setArch(Triple::mips); break; - case Triple::mips64el: T.setArch(Triple::mipsel); break; - case Triple::nvptx64: T.setArch(Triple::nvptx); break; - case Triple::ppc64: T.setArch(Triple::ppc); break; - case Triple::sparcv9: T.setArch(Triple::sparc); break; - case Triple::x86_64: T.setArch(Triple::x86); break; - case Triple::amdil64: T.setArch(Triple::amdil); break; - case Triple::hsail64: T.setArch(Triple::hsail); break; - case Triple::spir64: T.setArch(Triple::spir); break; - case Triple::wasm64: T.setArch(Triple::wasm32); break; + case Triple::aarch64: T.setArch(Triple::arm); break; + case Triple::aarch64_be: T.setArch(Triple::armeb); break; + case Triple::le64: T.setArch(Triple::le32); break; + case Triple::mips64: T.setArch(Triple::mips); break; + case Triple::mips64el: T.setArch(Triple::mipsel); break; + case Triple::nvptx64: T.setArch(Triple::nvptx); break; + case Triple::ppc64: T.setArch(Triple::ppc); break; + case Triple::sparcv9: T.setArch(Triple::sparc); break; + case Triple::x86_64: T.setArch(Triple::x86); break; + case Triple::amdil64: T.setArch(Triple::amdil); break; + case Triple::hsail64: T.setArch(Triple::hsail); break; + case Triple::spir64: T.setArch(Triple::spir); break; + case Triple::wasm64: T.setArch(Triple::wasm32); break; } return T; } @@ -1114,15 +1210,12 @@ Triple Triple::get64BitArchVariant() const { Triple T(*this); switch (getArch()) { case Triple::UnknownArch: - case Triple::arm: - case Triple::armeb: + case Triple::avr: case Triple::hexagon: case Triple::kalimba: case Triple::msp430: case Triple::r600: case Triple::tce: - case Triple::thumb: - case Triple::thumbeb: case Triple::xcore: case Triple::sparcel: case Triple::shave: @@ -1150,17 +1243,21 @@ Triple Triple::get64BitArchVariant() const { // Already 64-bit. 
break; - case Triple::le32: T.setArch(Triple::le64); break; - case Triple::mips: T.setArch(Triple::mips64); break; - case Triple::mipsel: T.setArch(Triple::mips64el); break; - case Triple::nvptx: T.setArch(Triple::nvptx64); break; - case Triple::ppc: T.setArch(Triple::ppc64); break; - case Triple::sparc: T.setArch(Triple::sparcv9); break; - case Triple::x86: T.setArch(Triple::x86_64); break; - case Triple::amdil: T.setArch(Triple::amdil64); break; - case Triple::hsail: T.setArch(Triple::hsail64); break; - case Triple::spir: T.setArch(Triple::spir64); break; - case Triple::wasm32: T.setArch(Triple::wasm64); break; + case Triple::arm: T.setArch(Triple::aarch64); break; + case Triple::armeb: T.setArch(Triple::aarch64_be); break; + case Triple::le32: T.setArch(Triple::le64); break; + case Triple::mips: T.setArch(Triple::mips64); break; + case Triple::mipsel: T.setArch(Triple::mips64el); break; + case Triple::nvptx: T.setArch(Triple::nvptx64); break; + case Triple::ppc: T.setArch(Triple::ppc64); break; + case Triple::sparc: T.setArch(Triple::sparcv9); break; + case Triple::x86: T.setArch(Triple::x86_64); break; + case Triple::amdil: T.setArch(Triple::amdil64); break; + case Triple::hsail: T.setArch(Triple::hsail64); break; + case Triple::spir: T.setArch(Triple::spir64); break; + case Triple::thumb: T.setArch(Triple::aarch64); break; + case Triple::thumbeb: T.setArch(Triple::aarch64_be); break; + case Triple::wasm32: T.setArch(Triple::wasm64); break; } return T; } @@ -1172,6 +1269,7 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::amdgcn: case Triple::amdil64: case Triple::amdil: + case Triple::avr: case Triple::hexagon: case Triple::hsail64: case Triple::hsail: @@ -1244,6 +1342,7 @@ Triple Triple::getLittleEndianArchVariant() const { case Triple::amdil64: case Triple::amdil: case Triple::arm: + case Triple::avr: case Triple::bpfel: case Triple::hexagon: case Triple::hsail64: @@ -1281,10 +1380,10 @@ Triple Triple::getLittleEndianArchVariant() const { return T; } -const char *Triple::getARMCPUForArch(StringRef MArch) const { +StringRef Triple::getARMCPUForArch(StringRef MArch) const { if (MArch.empty()) MArch = getArchName(); - MArch = ARMTargetParser::getCanonicalArchName(MArch); + MArch = ARM::getCanonicalArchName(MArch); // Some defaults are forced. 
switch (getOS()) { @@ -1296,15 +1395,21 @@ const char *Triple::getARMCPUForArch(StringRef MArch) const { case llvm::Triple::Win32: // FIXME: this is invalid for WindowsCE return "cortex-a9"; + case llvm::Triple::MacOSX: + case llvm::Triple::IOS: + case llvm::Triple::WatchOS: + if (MArch == "v7k") + return "cortex-a7"; + break; default: break; } if (MArch.empty()) - return nullptr; + return StringRef(); - const char *CPU = ARMTargetParser::getDefaultCPU(MArch); - if (CPU) + StringRef CPU = ARM::getDefaultCPU(MArch); + if (!CPU.empty()) return CPU; // If no specific architecture version is requested, return the minimum CPU diff --git a/contrib/llvm/lib/Support/Unix/Memory.inc b/contrib/llvm/lib/Support/Unix/Memory.inc index c421ee8..d703191 100644 --- a/contrib/llvm/lib/Support/Unix/Memory.inc +++ b/contrib/llvm/lib/Support/Unix/Memory.inc @@ -50,9 +50,8 @@ int getPosixProtectionFlags(unsigned Flags) { return PROT_READ | PROT_WRITE; case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC: return PROT_READ | PROT_EXEC; - case llvm::sys::Memory::MF_READ | - llvm::sys::Memory::MF_WRITE | - llvm::sys::Memory::MF_EXEC: + case llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_WRITE | + llvm::sys::Memory::MF_EXEC: return PROT_READ | PROT_WRITE | PROT_EXEC; case llvm::sys::Memory::MF_EXEC: #if defined(__FreeBSD__) @@ -153,6 +152,7 @@ Memory::releaseMappedMemory(MemoryBlock &M) { std::error_code Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { + static const size_t PageSize = Process::getPageSize(); if (M.Address == nullptr || M.Size == 0) return std::error_code(); @@ -161,7 +161,7 @@ Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { int Protect = getPosixProtectionFlags(Flags); - int Result = ::mprotect(M.Address, M.Size, Protect); + int Result = ::mprotect((void*)((uintptr_t)M.Address & ~(PageSize-1)), PageSize*((M.Size+PageSize-1)/PageSize), Protect); if (Result != 0) return std::error_code(errno, std::generic_category()); @@ -181,7 +181,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock, std::string *ErrMsg) { if (NumBytes == 0) return MemoryBlock(); - size_t PageSize = Process::getPageSize(); + static const size_t PageSize = Process::getPageSize(); size_t NumPages = (NumBytes+PageSize-1)/PageSize; int fd = -1; @@ -265,15 +265,12 @@ bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) { } bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) { -#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__)) if (M.Address == 0 || M.Size == 0) return false; Memory::InvalidateInstructionCache(M.Address, M.Size); +#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__)) kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address, (vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY); return KERN_SUCCESS == kr; -#elif defined(__arm__) || defined(__aarch64__) - Memory::InvalidateInstructionCache(M.Address, M.Size); - return true; #else return true; #endif diff --git a/contrib/llvm/lib/Support/Unix/Path.inc b/contrib/llvm/lib/Support/Unix/Path.inc index 973d010..d85c37a 100644 --- a/contrib/llvm/lib/Support/Unix/Path.inc +++ b/contrib/llvm/lib/Support/Unix/Path.inc @@ -75,12 +75,12 @@ test_dir(char ret[PATH_MAX], const char *dir, const char *bin) char fullpath[PATH_MAX]; snprintf(fullpath, PATH_MAX, "%s/%s", dir, bin); - if (realpath(fullpath, ret) == NULL) - return (1); + if (!realpath(fullpath, ret)) + return 1; if (stat(fullpath, &sb) != 0) - return (1); + return 1; - return (0); 
+ return 0; } static char * @@ -91,34 +91,34 @@ getprogpath(char ret[PATH_MAX], const char *bin) /* First approach: absolute path. */ if (bin[0] == '/') { if (test_dir(ret, "/", bin) == 0) - return (ret); - return (NULL); + return ret; + return nullptr; } /* Second approach: relative path. */ - if (strchr(bin, '/') != NULL) { + if (strchr(bin, '/')) { char cwd[PATH_MAX]; - if (getcwd(cwd, PATH_MAX) == NULL) - return (NULL); + if (!getcwd(cwd, PATH_MAX)) + return nullptr; if (test_dir(ret, cwd, bin) == 0) - return (ret); - return (NULL); + return ret; + return nullptr; } /* Third approach: $PATH */ - if ((pv = getenv("PATH")) == NULL) - return (NULL); + if ((pv = getenv("PATH")) == nullptr) + return nullptr; s = pv = strdup(pv); - if (pv == NULL) - return (NULL); - while ((t = strsep(&s, ":")) != NULL) { + if (!pv) + return nullptr; + while ((t = strsep(&s, ":")) != nullptr) { if (test_dir(ret, t, bin) == 0) { free(pv); - return (ret); + return ret; } } free(pv); - return (NULL); + return nullptr; } #endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__ @@ -153,8 +153,8 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { return std::string(exe_path, len); } else { // Fall back to the classical detection. - if (getprogpath(exe_path, argv0) != NULL) - return exe_path; + if (getprogpath(exe_path, argv0)) + return exe_path; } #elif defined(HAVE_DLFCN_H) // Use dladdr to get executable path if available. @@ -219,11 +219,12 @@ std::error_code current_path(SmallVectorImpl<char> &result) { return std::error_code(); } -std::error_code create_directory(const Twine &path, bool IgnoreExisting) { +std::error_code create_directory(const Twine &path, bool IgnoreExisting, + perms Perms) { SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); - if (::mkdir(p.begin(), S_IRWXU | S_IRWXG) == -1) { + if (::mkdir(p.begin(), Perms) == -1) { if (errno != EEXIST || !IgnoreExisting) return std::error_code(errno, std::generic_category()); } @@ -324,6 +325,10 @@ std::error_code access(const Twine &Path, AccessMode Mode) { return std::error_code(); } +bool can_execute(const Twine &Path) { + return !access(Path, AccessMode::Execute); +} + bool equivalent(file_status A, file_status B) { assert(status_known(A) && status_known(B)); return A.fs_st_dev == B.fs_st_dev && @@ -555,6 +560,54 @@ bool home_directory(SmallVectorImpl<char> &result) { return false; } +static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) { + #if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR) + // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR. + // macros defined in <unistd.h> on darwin >= 9 + int ConfName = TempDir ? 
_CS_DARWIN_USER_TEMP_DIR + : _CS_DARWIN_USER_CACHE_DIR; + size_t ConfLen = confstr(ConfName, nullptr, 0); + if (ConfLen > 0) { + do { + Result.resize(ConfLen); + ConfLen = confstr(ConfName, Result.data(), Result.size()); + } while (ConfLen > 0 && ConfLen != Result.size()); + + if (ConfLen > 0) { + assert(Result.back() == 0); + Result.pop_back(); + return true; + } + + Result.clear(); + } + #endif + return false; +} + +static bool getUserCacheDir(SmallVectorImpl<char> &Result) { + // First try using XDS_CACHE_HOME env variable, + // as specified in XDG Base Directory Specification at + // http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html + if (const char *XdsCacheDir = std::getenv("XDS_CACHE_HOME")) { + Result.clear(); + Result.append(XdsCacheDir, XdsCacheDir + strlen(XdsCacheDir)); + return true; + } + + // Try Darwin configuration query + if (getDarwinConfDir(false, Result)) + return true; + + // Use "$HOME/.cache" if $HOME is available + if (home_directory(Result)) { + append(Result, ".cache"); + return true; + } + + return false; +} + static const char *getEnvTempDir() { // Check whether the temporary directory is specified by an environment // variable. @@ -589,27 +642,8 @@ void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) { } } -#if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR) - // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR. - // macros defined in <unistd.h> on darwin >= 9 - int ConfName = ErasedOnReboot? _CS_DARWIN_USER_TEMP_DIR - : _CS_DARWIN_USER_CACHE_DIR; - size_t ConfLen = confstr(ConfName, nullptr, 0); - if (ConfLen > 0) { - do { - Result.resize(ConfLen); - ConfLen = confstr(ConfName, Result.data(), Result.size()); - } while (ConfLen > 0 && ConfLen != Result.size()); - - if (ConfLen > 0) { - assert(Result.back() == 0); - Result.pop_back(); - return; - } - - Result.clear(); - } -#endif + if (getDarwinConfDir(ErasedOnReboot, Result)) + return; const char *RequestedDir = getDefaultTempDir(ErasedOnReboot); Result.append(RequestedDir, RequestedDir + strlen(RequestedDir)); diff --git a/contrib/llvm/lib/Support/Unix/Process.inc b/contrib/llvm/lib/Support/Unix/Process.inc index df13bd2..27083ee 100644 --- a/contrib/llvm/lib/Support/Unix/Process.inc +++ b/contrib/llvm/lib/Support/Unix/Process.inc @@ -430,13 +430,18 @@ const char *Process::ResetColor() { #if !defined(HAVE_DECL_ARC4RANDOM) || !HAVE_DECL_ARC4RANDOM static unsigned GetRandomNumberSeed() { // Attempt to get the initial seed from /dev/urandom, if possible. - if (FILE *RandomSource = ::fopen("/dev/urandom", "r")) { + int urandomFD = open("/dev/urandom", O_RDONLY); + + if (urandomFD != -1) { unsigned seed; - int count = ::fread((void *)&seed, sizeof(seed), 1, RandomSource); - ::fclose(RandomSource); + // Don't use a buffered read to avoid reading more data + // from /dev/urandom than we need. + int count = read(urandomFD, (void *)&seed, sizeof(seed)); + + close(urandomFD); // Return the seed if the read was successful. 
- if (count == 1) + if (count == sizeof(seed)) return seed; } diff --git a/contrib/llvm/lib/Support/Unix/Program.inc b/contrib/llvm/lib/Support/Unix/Program.inc index 8947b62..7d3537e 100644 --- a/contrib/llvm/lib/Support/Unix/Program.inc +++ b/contrib/llvm/lib/Support/Unix/Program.inc @@ -323,7 +323,6 @@ namespace llvm { ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, bool WaitUntilTerminates, std::string *ErrMsg) { -#ifdef HAVE_SYS_WAIT_H struct sigaction Act, Old; assert(PI.Pid && "invalid pid to wait on, process not started?"); @@ -417,12 +416,6 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, // signal during execution as opposed to failing to execute. WaitResult.ReturnCode = -2; } -#else - if (ErrMsg) - *ErrMsg = "Program::Wait is not implemented on this platform yet!"; - ProcessInfo WaitResult; - WaitResult.ReturnCode = -2; -#endif return WaitResult; } @@ -453,7 +446,7 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, return EC; } -bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) { +bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) { static long ArgMax = sysconf(_SC_ARG_MAX); // System says no practical limit. @@ -463,7 +456,7 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) { // Conservatively account for space required by environment variables. long HalfArgMax = ArgMax / 2; - size_t ArgLength = 0; + size_t ArgLength = Program.size() + 1; for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end(); I != E; ++I) { ArgLength += strlen(*I) + 1; diff --git a/contrib/llvm/lib/Support/Unix/Signals.inc b/contrib/llvm/lib/Support/Unix/Signals.inc index bfe2a3a..061cdb3 100644 --- a/contrib/llvm/lib/Support/Unix/Signals.inc +++ b/contrib/llvm/lib/Support/Unix/Signals.inc @@ -17,7 +17,6 @@ #include "llvm/Support/Format.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/Program.h" @@ -25,7 +24,6 @@ #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <string> -#include <vector> #if HAVE_EXECINFO_H # include <execinfo.h> // For backtrace(). #endif @@ -58,8 +56,6 @@ static ManagedStatic<SmartMutex<true> > SignalsMutex; static void (*InterruptFunction)() = nullptr; static ManagedStatic<std::vector<std::string>> FilesToRemove; -static ManagedStatic<std::vector<std::pair<void (*)(void *), void *>>> - CallBacksToRun; // IntSigs - Signals that represent requested termination. There's no bug // or failure, or if there is, it's not our direct responsibility. For whatever @@ -90,12 +86,11 @@ static unsigned NumRegisteredSignals = 0; static struct { struct sigaction SA; int SigNo; -} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])]; +} RegisteredSignalInfo[array_lengthof(IntSigs) + array_lengthof(KillSigs)]; static void RegisterHandler(int Signal) { - assert(NumRegisteredSignals < - sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) && + assert(NumRegisteredSignals < array_lengthof(RegisteredSignalInfo) && "Out of space for signal handlers!"); struct sigaction NewHandler; @@ -117,7 +112,7 @@ static void RegisterHandlers() { // during handling an actual signal because you can't safely call new in a // signal handler. *SignalsMutex; - + // If the handlers are already registered, we're done. 
if (NumRegisteredSignals != 0) return; @@ -148,9 +143,6 @@ static void RemoveFilesToRemove() { // memory. std::vector<std::string>& FilesToRemoveRef = *FilesToRemove; for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i) { - // We rely on a std::string implementation for which repeated calls to - // 'c_str()' don't allocate memory. We pre-call 'c_str()' on all of these - // strings to try to ensure this is safe. const char *path = FilesToRemoveRef[i].c_str(); // Get the status so we can determine if it's a file or directory. If we @@ -164,7 +156,7 @@ static void RemoveFilesToRemove() { // super-user permissions. if (!S_ISREG(buf.st_mode)) continue; - + // Otherwise, remove the file. We ignore any errors here as there is nothing // else we can do. unlink(path); @@ -205,11 +197,7 @@ static RETSIGTYPE SignalHandler(int Sig) { } // Otherwise if it is a fault (like SEGV) run any handler. - if (CallBacksToRun.isConstructed()) { - auto &CallBacksToRunRef = *CallBacksToRun; - for (unsigned i = 0, e = CallBacksToRun->size(); i != e; ++i) - CallBacksToRunRef[i].first(CallBacksToRunRef[i].second); - } + llvm::sys::RunSignalHandlers(); #ifdef __s390__ // On S/390, certain signals are delivered with PSW Address pointing to @@ -239,21 +227,7 @@ bool llvm::sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) { { sys::SmartScopedLock<true> Guard(*SignalsMutex); - std::vector<std::string>& FilesToRemoveRef = *FilesToRemove; - std::string *OldPtr = - FilesToRemoveRef.empty() ? nullptr : &FilesToRemoveRef[0]; - FilesToRemoveRef.push_back(Filename); - - // We want to call 'c_str()' on every std::string in this vector so that if - // the underlying implementation requires a re-allocation, it happens here - // rather than inside of the signal handler. If we see the vector grow, we - // have to call it on every entry. If it remains in place, we only need to - // call it on the latest one. - if (OldPtr == &FilesToRemoveRef[0]) - FilesToRemoveRef.back().c_str(); - else - for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i) - FilesToRemoveRef[i].c_str(); + FilesToRemove->push_back(Filename); } RegisterHandlers(); @@ -268,13 +242,6 @@ void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) { std::vector<std::string>::iterator I = FilesToRemove->end(); if (RI != FilesToRemove->rend()) I = FilesToRemove->erase(RI.base()-1); - - // We need to call c_str() on every element which would have been moved by - // the erase. These elements, in a C++98 implementation where c_str() - // requires a reallocation on the first call may have had the call to c_str() - // made on insertion become invalid by being copied down an element. 
- for (std::vector<std::string>::iterator E = FilesToRemove->end(); I != E; ++I) - I->c_str(); } /// AddSignalHandler - Add a function to be called when a signal is delivered @@ -285,10 +252,9 @@ void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) { RegisterHandlers(); } -#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) - -#if HAVE_LINK_H && (defined(__linux__) || defined(__FreeBSD__) || \ - defined(__FreeBSD_kernel__) || defined(__NetBSD__)) +#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) && HAVE_LINK_H && \ + (defined(__linux__) || defined(__FreeBSD__) || \ + defined(__FreeBSD_kernel__) || defined(__NetBSD__)) struct DlIteratePhdrData { void **StackTrace; int depth; @@ -321,108 +287,27 @@ static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) { return 0; } +/// If this is an ELF platform, we can find all loaded modules and their virtual +/// addresses with dl_iterate_phdr. static bool findModulesAndOffsets(void **StackTrace, int Depth, const char **Modules, intptr_t *Offsets, - const char *MainExecutableName) { + const char *MainExecutableName, + StringSaver &StrPool) { DlIteratePhdrData data = {StackTrace, Depth, true, Modules, Offsets, MainExecutableName}; dl_iterate_phdr(dl_iterate_phdr_cb, &data); return true; } #else +/// This platform does not have dl_iterate_phdr, so we do not yet know how to +/// find all loaded DSOs. static bool findModulesAndOffsets(void **StackTrace, int Depth, const char **Modules, intptr_t *Offsets, - const char *MainExecutableName) { + const char *MainExecutableName, + StringSaver &StrPool) { return false; } -#endif - -static bool printSymbolizedStackTrace(void **StackTrace, int Depth, - llvm::raw_ostream &OS) { - // FIXME: Subtract necessary number from StackTrace entries to turn return addresses - // into actual instruction addresses. - // Use llvm-symbolizer tool to symbolize the stack traces. - ErrorOr<std::string> LLVMSymbolizerPathOrErr = - sys::findProgramByName("llvm-symbolizer"); - if (!LLVMSymbolizerPathOrErr) - return false; - const std::string &LLVMSymbolizerPath = *LLVMSymbolizerPathOrErr; - // We don't know argv0 or the address of main() at this point, but try - // to guess it anyway (it's possible on some platforms). 
- std::string MainExecutableName = sys::fs::getMainExecutable(nullptr, nullptr); - if (MainExecutableName.empty() || - MainExecutableName.find("llvm-symbolizer") != std::string::npos) - return false; - - std::vector<const char *> Modules(Depth, nullptr); - std::vector<intptr_t> Offsets(Depth, 0); - if (!findModulesAndOffsets(StackTrace, Depth, Modules.data(), Offsets.data(), - MainExecutableName.c_str())) - return false; - int InputFD; - SmallString<32> InputFile, OutputFile; - sys::fs::createTemporaryFile("symbolizer-input", "", InputFD, InputFile); - sys::fs::createTemporaryFile("symbolizer-output", "", OutputFile); - FileRemover InputRemover(InputFile.c_str()); - FileRemover OutputRemover(OutputFile.c_str()); - - { - raw_fd_ostream Input(InputFD, true); - for (int i = 0; i < Depth; i++) { - if (Modules[i]) - Input << Modules[i] << " " << (void*)Offsets[i] << "\n"; - } - } - - StringRef InputFileStr(InputFile); - StringRef OutputFileStr(OutputFile); - StringRef StderrFileStr; - const StringRef *Redirects[] = {&InputFileStr, &OutputFileStr, - &StderrFileStr}; - const char *Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining", - "--demangle", nullptr}; - int RunResult = - sys::ExecuteAndWait(LLVMSymbolizerPath, Args, nullptr, Redirects); - if (RunResult != 0) - return false; - - auto OutputBuf = MemoryBuffer::getFile(OutputFile.c_str()); - if (!OutputBuf) - return false; - StringRef Output = OutputBuf.get()->getBuffer(); - SmallVector<StringRef, 32> Lines; - Output.split(Lines, "\n"); - auto CurLine = Lines.begin(); - int frame_no = 0; - for (int i = 0; i < Depth; i++) { - if (!Modules[i]) { - OS << format("#%d %p\n", frame_no++, StackTrace[i]); - continue; - } - // Read pairs of lines (function name and file/line info) until we - // encounter empty line. - for (;;) { - if (CurLine == Lines.end()) - return false; - StringRef FunctionName = *CurLine++; - if (FunctionName.empty()) - break; - OS << format("#%d %p ", frame_no++, StackTrace[i]); - if (!FunctionName.startswith("??")) - OS << format("%s ", FunctionName.str().c_str()); - if (CurLine == Lines.end()) - return false; - StringRef FileLineInfo = *CurLine++; - if (!FileLineInfo.startswith("??")) - OS << format("%s", FileLineInfo.str().c_str()); - else - OS << format("(%s+%p)", Modules[i], (void *)Offsets[i]); - OS << "\n"; - } - } - return true; -} -#endif // defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) +#endif // defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) && ... // PrintStackTrace - In the case of a program crash or fault, print out a stack // trace so that the user has an indication of why and where we died. diff --git a/contrib/llvm/lib/Support/Unix/Unix.h b/contrib/llvm/lib/Support/Unix/Unix.h index e16a226..871e612 100644 --- a/contrib/llvm/lib/Support/Unix/Unix.h +++ b/contrib/llvm/lib/Support/Unix/Unix.h @@ -29,6 +29,7 @@ #include <cstring> #include <string> #include <sys/types.h> +#include <sys/wait.h> #ifdef HAVE_UNISTD_H #include <unistd.h> @@ -43,22 +44,10 @@ #endif #include <time.h> -#ifdef HAVE_SYS_WAIT_H -# include <sys/wait.h> -#endif - #ifdef HAVE_DLFCN_H # include <dlfcn.h> #endif -#ifndef WEXITSTATUS -# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8) -#endif - -#ifndef WIFEXITED -# define WIFEXITED(stat_val) (((stat_val) & 255) == 0) -#endif - /// This function builds an error message into \p ErrMsg using the \p prefix /// string and the Unix error number given by \p errnum. If errnum is -1, the /// default then the value of errno is used. 
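The printSymbolizedStackTrace body deleted from Signals.inc above is being hoisted into shared code; what it implements is a small line-oriented protocol for driving an external llvm-symbolizer process (spawned with --functions=linkage --inlining --demangle). Below is a minimal, self-contained sketch of that protocol only: the temporary-file plumbing of the original is elided, and writeQuery/readFrameGroup are hypothetical helper names for illustration, not LLVM API.

#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// One query per stack frame that resolved to a module: "<module> <offset>".
static void writeQuery(std::FILE *ToSymbolizer, const std::string &Module,
                       std::intptr_t Offset) {
  std::fprintf(ToSymbolizer, "%s 0x%llx\n", Module.c_str(),
               (unsigned long long)Offset);
}

// The symbolizer answers each query with (function, file:line) line pairs --
// one pair per inlined frame when --inlining is enabled -- and terminates
// the group with an empty line. "??" in either slot means "unknown".
static std::vector<std::pair<std::string, std::string>>
readFrameGroup(std::FILE *FromSymbolizer) {
  std::vector<std::pair<std::string, std::string>> Frames;
  char Fn[4096], Loc[4096];
  while (std::fgets(Fn, sizeof(Fn), FromSymbolizer) && Fn[0] != '\n') {
    if (!std::fgets(Loc, sizeof(Loc), FromSymbolizer))
      break; // truncated output; the caller falls back to raw addresses
    Frames.emplace_back(Fn, Loc);
  }
  return Frames;
}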
diff --git a/contrib/llvm/lib/Support/Valgrind.cpp b/contrib/llvm/lib/Support/Valgrind.cpp index facf8d9..8d852a6 100644 --- a/contrib/llvm/lib/Support/Valgrind.cpp +++ b/contrib/llvm/lib/Support/Valgrind.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/Valgrind.h" #include "llvm/Config/config.h" +#include <cstddef> #if HAVE_VALGRIND_VALGRIND_H #include <valgrind/valgrind.h> @@ -52,23 +53,3 @@ void llvm::sys::ValgrindDiscardTranslations(const void *Addr, size_t Len) { } #endif // !HAVE_VALGRIND_VALGRIND_H - -// These functions require no implementation, tsan just looks at the arguments -// they're called with. However, they are required to be weak as some other -// application or library may already be providing these definitions for the -// same reason we are. -extern "C" { -LLVM_ATTRIBUTE_WEAK void AnnotateHappensAfter(const char *file, int line, - const volatile void *cv); -void AnnotateHappensAfter(const char *file, int line, const volatile void *cv) { -} -LLVM_ATTRIBUTE_WEAK void AnnotateHappensBefore(const char *file, int line, - const volatile void *cv); -void AnnotateHappensBefore(const char *file, int line, - const volatile void *cv) {} -LLVM_ATTRIBUTE_WEAK void AnnotateIgnoreWritesBegin(const char *file, int line); -void AnnotateIgnoreWritesBegin(const char *file, int line) {} -LLVM_ATTRIBUTE_WEAK void AnnotateIgnoreWritesEnd(const char *file, int line); -void AnnotateIgnoreWritesEnd(const char *file, int line) {} -} - diff --git a/contrib/llvm/lib/Support/Windows/COM.inc b/contrib/llvm/lib/Support/Windows/COM.inc index 0c50d6f..54f3ecf 100644 --- a/contrib/llvm/lib/Support/Windows/COM.inc +++ b/contrib/llvm/lib/Support/Windows/COM.inc @@ -1,4 +1,4 @@ -//===- llvm/Support/Windows/COM.inc - Windows COM Implementation *- C++ -*-===// +//==- llvm/Support/Windows/COM.inc - Windows COM Implementation -*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc index d38f197..17418b0 100644 --- a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc +++ b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc @@ -76,14 +76,14 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, SmallVector<wchar_t, MAX_PATH> filenameUnicode; if (std::error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) { SetLastError(ec.value()); - MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16: "); + MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16"); return DynamicLibrary(); } HMODULE a_handle = LoadLibraryW(filenameUnicode.data()); if (a_handle == 0) { - MakeErrMsg(errMsg, std::string(filename) + ": Can't open : "); + MakeErrMsg(errMsg, std::string(filename) + ": Can't open"); return DynamicLibrary(); } diff --git a/contrib/llvm/lib/Support/Windows/Memory.inc b/contrib/llvm/lib/Support/Windows/Memory.inc index 4b2ff2e..7eab9ff 100644 --- a/contrib/llvm/lib/Support/Windows/Memory.inc +++ b/contrib/llvm/lib/Support/Windows/Memory.inc @@ -192,14 +192,14 @@ static DWORD getProtection(const void *addr) { bool Memory::setWritable(MemoryBlock &M, std::string *ErrMsg) { if (!setRangeWritable(M.Address, M.Size)) { - return MakeErrMsg(ErrMsg, "Cannot set memory to writeable: "); + return MakeErrMsg(ErrMsg, "Cannot set memory to writeable"); } return true; } bool Memory::setExecutable(MemoryBlock &M, std::string *ErrMsg) { if (!setRangeExecutable(M.Address, M.Size)) { - return MakeErrMsg(ErrMsg, "Cannot set memory to executable: "); + return 
MakeErrMsg(ErrMsg, "Cannot set memory to executable"); } return true; } diff --git a/contrib/llvm/lib/Support/Windows/Path.inc b/contrib/llvm/lib/Support/Windows/Path.inc index 72da7c5..5ef77b1 100644 --- a/contrib/llvm/lib/Support/Windows/Path.inc +++ b/contrib/llvm/lib/Support/Windows/Path.inc @@ -38,6 +38,7 @@ typedef int errno_t; #ifdef _MSC_VER # pragma comment(lib, "advapi32.lib") // This provides CryptAcquireContextW. +# pragma comment(lib, "ole32.lib") // This provides CoTaskMemFree #endif using namespace llvm; @@ -182,7 +183,8 @@ std::error_code current_path(SmallVectorImpl<char> &result) { return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result); } -std::error_code create_directory(const Twine &path, bool IgnoreExisting) { +std::error_code create_directory(const Twine &path, bool IgnoreExisting, + perms Perms) { SmallVector<wchar_t, 128> path_utf16; if (std::error_code ec = widenPath(path, path_utf16)) @@ -252,17 +254,34 @@ std::error_code rename(const Twine &from, const Twine &to) { return ec; std::error_code ec = std::error_code(); + + // Retry while we see ERROR_ACCESS_DENIED. + // System scanners (eg. indexer) might open the source file when it is written + // and closed. + for (int i = 0; i < 2000; i++) { + // Try ReplaceFile first, as it is able to associate a new data stream with + // the destination even if the destination file is currently open. + if (::ReplaceFileW(wide_to.begin(), wide_from.begin(), NULL, 0, NULL, NULL)) + return std::error_code(); + + // We get ERROR_FILE_NOT_FOUND if the destination file is missing. + // MoveFileEx can handle this case. + DWORD ReplaceError = ::GetLastError(); + ec = mapWindowsError(ReplaceError); + if (ReplaceError != ERROR_ACCESS_DENIED && + ReplaceError != ERROR_FILE_NOT_FOUND && + ReplaceError != ERROR_SHARING_VIOLATION) + break; + if (::MoveFileExW(wide_from.begin(), wide_to.begin(), MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING)) return std::error_code(); - DWORD LastError = ::GetLastError(); - ec = mapWindowsError(LastError); - if (LastError != ERROR_ACCESS_DENIED) - break; - // Retry MoveFile() at ACCESS_DENIED. - // System scanners (eg. indexer) might open the source file when - // It is written and closed. 
+ + DWORD MoveError = ::GetLastError(); + ec = mapWindowsError(MoveError); + if (MoveError != ERROR_ACCESS_DENIED) break; + ::Sleep(1); } @@ -301,6 +320,11 @@ std::error_code access(const Twine &Path, AccessMode Mode) { return std::error_code(); } +bool can_execute(const Twine &Path) { + return !access(Path, AccessMode::Execute) || + !access(Path + ".exe", AccessMode::Execute); +} + bool equivalent(file_status A, file_status B) { assert(status_known(A) && status_known(B)); return A.FileIndexHigh == B.FileIndexHigh && @@ -325,10 +349,12 @@ std::error_code equivalent(const Twine &A, const Twine &B, bool &result) { static bool isReservedName(StringRef path) { // This list of reserved names comes from MSDN, at: // http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx - static const char *sReservedNames[] = { "nul", "con", "prn", "aux", - "com1", "com2", "com3", "com4", "com5", "com6", - "com7", "com8", "com9", "lpt1", "lpt2", "lpt3", - "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9" }; + static const char *const sReservedNames[] = { "nul", "con", "prn", "aux", + "com1", "com2", "com3", "com4", + "com5", "com6", "com7", "com8", + "com9", "lpt1", "lpt2", "lpt3", + "lpt4", "lpt5", "lpt6", "lpt7", + "lpt8", "lpt9" }; // First, check to see if this is a device namespace, which always // starts with \\.\, since device namespaces are not legal file paths. @@ -643,9 +669,10 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD) { if (std::error_code EC = widenPath(Name, PathUTF16)) return EC; - HANDLE H = ::CreateFileW(PathUTF16.begin(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, - OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + HANDLE H = + ::CreateFileW(PathUTF16.begin(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if (H == INVALID_HANDLE_VALUE) { DWORD LastError = ::GetLastError(); std::error_code EC = mapWindowsError(LastError); @@ -728,30 +755,31 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD, } // end namespace fs namespace path { - -bool home_directory(SmallVectorImpl<char> &result) { - wchar_t Path[MAX_PATH]; - if (::SHGetFolderPathW(0, CSIDL_APPDATA | CSIDL_FLAG_CREATE, 0, - /*SHGFP_TYPE_CURRENT*/0, Path) != S_OK) +static bool getKnownFolderPath(KNOWNFOLDERID folderId, + SmallVectorImpl<char> &result) { + wchar_t *path = nullptr; + if (::SHGetKnownFolderPath(folderId, KF_FLAG_CREATE, nullptr, &path) != S_OK) return false; - if (UTF16ToUTF8(Path, ::wcslen(Path), result)) - return false; + bool ok = !UTF16ToUTF8(path, ::wcslen(path), result); + ::CoTaskMemFree(path); + return ok; +} - return true; +bool getUserCacheDir(SmallVectorImpl<char> &Result) { + return getKnownFolderPath(FOLDERID_LocalAppData, Result); } -static bool getTempDirEnvVar(const char *Var, SmallVectorImpl<char> &Res) { - SmallVector<wchar_t, 128> NameUTF16; - if (windows::UTF8ToUTF16(Var, NameUTF16)) - return false; +bool home_directory(SmallVectorImpl<char> &result) { + return getKnownFolderPath(FOLDERID_Profile, result); +} +static bool getTempDirEnvVar(const wchar_t *Var, SmallVectorImpl<char> &Res) { SmallVector<wchar_t, 1024> Buf; size_t Size = 1024; do { Buf.reserve(Size); - Size = - GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity()); + Size = GetEnvironmentVariableW(Var, Buf.data(), Buf.capacity()); if (Size == 0) return false; @@ -759,14 +787,12 @@ static bool getTempDirEnvVar(const char *Var, SmallVectorImpl<char> &Res) { } while (Size > Buf.capacity()); 
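// Aside: the loop above is the usual Win32 size-negotiation pattern. When
// the supplied buffer is too small, GetEnvironmentVariableW returns the
// required length in characters (including the terminator) instead of the
// number of characters copied, so the caller grows the buffer and retries.
// A minimal sketch of the same pattern, assuming std::wstring storage
// rather than SmallVector (hypothetical, for illustration only):
//
//   DWORD Len = ::GetEnvironmentVariableW(L"TMP", nullptr, 0);
//   if (Len == 0)
//     return false;               // variable unset, or an error occurred
//   std::wstring Val(Len, L'\0');
//   Len = ::GetEnvironmentVariableW(L"TMP", &Val[0], Len);
//   Val.resize(Len);              // on success, Len excludes the terminator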
Buf.set_size(Size); - if (windows::UTF16ToUTF8(Buf.data(), Size, Res)) - return false; - return true; + return !windows::UTF16ToUTF8(Buf.data(), Size, Res); } static bool getTempDirEnvVar(SmallVectorImpl<char> &Res) { - const char *EnvironmentVariables[] = {"TMP", "TEMP", "USERPROFILE"}; - for (const char *Env : EnvironmentVariables) { + const wchar_t *EnvironmentVariables[] = {L"TMP", L"TEMP", L"USERPROFILE"}; + for (auto *Env : EnvironmentVariables) { if (getTempDirEnvVar(Env, Res)) return true; } @@ -777,13 +803,19 @@ void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) { (void)ErasedOnReboot; Result.clear(); - // Check whether the temporary directory is specified by an environment - // variable. - if (getTempDirEnvVar(Result)) + // Check whether the temporary directory is specified by an environment var. + // This matches GetTempPath logic to some degree. GetTempPath is not used + // directly as it cannot handle env vars longer than 130 chars on Windows 7 + // (fixed on Windows 8). + if (getTempDirEnvVar(Result)) { + assert(!Result.empty() && "Unexpected empty path"); + native(Result); // Some Unix-like shells use Unix path separator in $TMP. + fs::make_absolute(Result); // Make it absolute if not already. return; + } // Fall back to a system default. - const char *DefaultResult = "C:\\TEMP"; + const char *DefaultResult = "C:\\Temp"; Result.append(DefaultResult, DefaultResult + strlen(DefaultResult)); } } // end namespace path diff --git a/contrib/llvm/lib/Support/Windows/Process.inc b/contrib/llvm/lib/Support/Windows/Process.inc index 8164956..dae35a8 100644 --- a/contrib/llvm/lib/Support/Windows/Process.inc +++ b/contrib/llvm/lib/Support/Windows/Process.inc @@ -417,16 +417,23 @@ const char *Process::ResetColor() { return 0; } +// Include GetLastError() in a fatal error message. +static void ReportLastErrorFatal(const char *Msg) { + std::string ErrMsg; + MakeErrMsg(&ErrMsg, Msg); + report_fatal_error(ErrMsg); +} + unsigned Process::GetRandomNumber() { HCRYPTPROV HCPC; if (!::CryptAcquireContextW(&HCPC, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) - report_fatal_error("Could not acquire a cryptographic context"); + ReportLastErrorFatal("Could not acquire a cryptographic context"); ScopedCryptContext CryptoProvider(HCPC); unsigned Ret; if (!::CryptGenRandom(CryptoProvider, sizeof(Ret), reinterpret_cast<BYTE *>(&Ret))) - report_fatal_error("Could not generate a random number"); + ReportLastErrorFatal("Could not generate a random number"); return Ret; } diff --git a/contrib/llvm/lib/Support/Windows/Program.inc b/contrib/llvm/lib/Support/Windows/Program.inc index c29d872..78fc538 100644 --- a/contrib/llvm/lib/Support/Windows/Program.inc +++ b/contrib/llvm/lib/Support/Windows/Program.inc @@ -75,8 +75,15 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name, do { U16Result.reserve(Len); - Len = ::SearchPathW(Path, c_str(U16Name), - U16Ext.empty() ? nullptr : c_str(U16Ext), + // Let's attach the extension manually. That is needed for files + // with a dot in the name, like aaa.bbb. SearchPathW will not add the extension + // from its argument to such files because it thinks they already have one.
+ SmallVector<wchar_t, MAX_PATH> U16NameExt; + if (std::error_code EC = + windows::UTF8ToUTF16(Twine(Name + Ext).str(), U16NameExt)) + return EC; + + Len = ::SearchPathW(Path, c_str(U16NameExt), nullptr, U16Result.capacity(), U16Result.data(), nullptr); } while (Len > U16Result.capacity()); @@ -132,7 +139,7 @@ static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) { FILE_ATTRIBUTE_NORMAL, NULL); if (h == INVALID_HANDLE_VALUE) { MakeErrMsg(ErrMsg, fname + ": Can't open file for " + - (fd ? "input: " : "output: ")); + (fd ? "input" : "output")); } return h; @@ -251,6 +258,14 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, return false; } + // can_execute may succeed by looking at Program + ".exe". CreateProcessW + // will implicitly add the .exe if we provide a command line without an + // executable path, but since we use an explicit executable, we have to add + // ".exe" ourselves. + SmallString<64> ProgramStorage; + if (!sys::fs::exists(Program)) + Program = Twine(Program + ".exe").toStringRef(ProgramStorage); + // Windows wants a command line, not an array of args, to pass to the new // process. We have to concatenate them all, while quoting the args that // have embedded spaces (or are empty). @@ -416,7 +431,7 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, if (SecondsToWait) { if (!TerminateProcess(PI.ProcessHandle, 1)) { if (ErrMsg) - MakeErrMsg(ErrMsg, "Failed to terminate timed-out program."); + MakeErrMsg(ErrMsg, "Failed to terminate timed-out program"); // -2 indicates a crash or timeout as opposed to failure to execute. WaitResult.ReturnCode = -2; @@ -441,7 +456,7 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, if (!rc) { SetLastError(err); if (ErrMsg) - MakeErrMsg(ErrMsg, "Failed getting status for program."); + MakeErrMsg(ErrMsg, "Failed getting status for program"); // -2 indicates a crash or timeout as opposed to failure to execute. WaitResult.ReturnCode = -2; @@ -520,14 +535,15 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, return EC; } -bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) { +bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) { // The documented max length of the command line passed to CreateProcess. static const size_t MaxCommandStringLength = 32768; - size_t ArgLength = 0; + // Account for the trailing space for the program path and the + // trailing NULL of the last argument. + size_t ArgLength = ArgLenWithQuotes(Program.str().c_str()) + 2; for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end(); I != E; ++I) { - // Account for the trailing space for every arg but the last one and the - // trailing NULL of the last argument. 
+ // Account for the trailing space for every arg ArgLength += ArgLenWithQuotes(*I) + 1; if (ArgLength > MaxCommandStringLength) { return false; diff --git a/contrib/llvm/lib/Support/Windows/Signals.inc b/contrib/llvm/lib/Support/Windows/Signals.inc index 5c8c239..f40ca72 100644 --- a/contrib/llvm/lib/Support/Windows/Signals.inc +++ b/contrib/llvm/lib/Support/Windows/Signals.inc @@ -14,7 +14,6 @@ #include <algorithm> #include <signal.h> #include <stdio.h> -#include <vector> #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" @@ -136,6 +135,10 @@ typedef BOOL (WINAPI *fpSymGetLineFromAddr64)(HANDLE, DWORD64, PDWORD, PIMAGEHLP_LINE64); static fpSymGetLineFromAddr64 fSymGetLineFromAddr64; +typedef BOOL(WINAPI *fpSymGetModuleInfo64)(HANDLE hProcess, DWORD64 dwAddr, + PIMAGEHLP_MODULE64 ModuleInfo); +static fpSymGetModuleInfo64 fSymGetModuleInfo64; + typedef PVOID (WINAPI *fpSymFunctionTableAccess64)(HANDLE, DWORD64); static fpSymFunctionTableAccess64 fSymFunctionTableAccess64; @@ -145,6 +148,9 @@ static fpSymSetOptions fSymSetOptions; typedef BOOL (WINAPI *fpSymInitialize)(HANDLE, PCSTR, BOOL); static fpSymInitialize fSymInitialize; +typedef BOOL (WINAPI *fpEnumerateLoadedModules)(HANDLE,PENUMLOADED_MODULES_CALLBACK64,PVOID); +static fpEnumerateLoadedModules fEnumerateLoadedModules; + static bool load64BitDebugHelp(void) { HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll"); if (hLib) { @@ -156,14 +162,20 @@ static bool load64BitDebugHelp(void) { ::GetProcAddress(hLib, "SymGetSymFromAddr64"); fSymGetLineFromAddr64 = (fpSymGetLineFromAddr64) ::GetProcAddress(hLib, "SymGetLineFromAddr64"); + fSymGetModuleInfo64 = (fpSymGetModuleInfo64) + ::GetProcAddress(hLib, "SymGetModuleInfo64"); fSymFunctionTableAccess64 = (fpSymFunctionTableAccess64) ::GetProcAddress(hLib, "SymFunctionTableAccess64"); fSymSetOptions = (fpSymSetOptions)::GetProcAddress(hLib, "SymSetOptions"); fSymInitialize = (fpSymInitialize)::GetProcAddress(hLib, "SymInitialize"); + fEnumerateLoadedModules = (fpEnumerateLoadedModules) + ::GetProcAddress(hLib, "EnumerateLoadedModules64"); } return fStackWalk64 && fSymInitialize && fSymSetOptions; } +using namespace llvm; + // Forward declare. static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep); static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType); @@ -172,7 +184,6 @@ static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType); static void (*InterruptFunction)() = 0; static std::vector<std::string> *FilesToRemove = NULL; -static std::vector<std::pair<void(*)(void*), void*> > *CallBacksToRun = 0; static bool RegisteredUnhandledExceptionFilter = false; static bool CleanupExecuted = false; static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL; @@ -183,23 +194,106 @@ static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL; static CRITICAL_SECTION CriticalSection; static bool CriticalSectionInitialized = false; -static void PrintStackTraceForThread(llvm::raw_ostream &OS, HANDLE hProcess, - HANDLE hThread, STACKFRAME64 &StackFrame, - CONTEXT *Context) { - DWORD machineType; +enum { #if defined(_M_X64) - machineType = IMAGE_FILE_MACHINE_AMD64; + NativeMachineType = IMAGE_FILE_MACHINE_AMD64 #else - machineType = IMAGE_FILE_MACHINE_I386; + NativeMachineType = IMAGE_FILE_MACHINE_I386 #endif +}; + +static bool printStackTraceWithLLVMSymbolizer(llvm::raw_ostream &OS, + HANDLE hProcess, HANDLE hThread, + STACKFRAME64 &StackFrameOrig, + CONTEXT *ContextOrig) { + // StackWalk64 modifies the incoming stack frame and context, so copy them. 
+ STACKFRAME64 StackFrame = StackFrameOrig; + + // Copy the register context so that we don't modify it while we unwind. We + // could use InitializeContext + CopyContext, but that's only required to get + // at AVX registers, which typically aren't needed by StackWalk64. Reduce the + // flag set to indicate that there's less data. + CONTEXT Context = *ContextOrig; + Context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER; + + static void *StackTrace[256]; + size_t Depth = 0; + while (fStackWalk64(NativeMachineType, hProcess, hThread, &StackFrame, + &Context, 0, fSymFunctionTableAccess64, + fSymGetModuleBase64, 0)) { + if (StackFrame.AddrFrame.Offset == 0) + break; + StackTrace[Depth++] = (void *)(uintptr_t)StackFrame.AddrPC.Offset; + if (Depth >= array_lengthof(StackTrace)) + break; + } + + return printSymbolizedStackTrace(&StackTrace[0], Depth, OS); +} + +namespace { +struct FindModuleData { + void **StackTrace; + int Depth; + const char **Modules; + intptr_t *Offsets; + StringSaver *StrPool; +}; +} + +static BOOL CALLBACK findModuleCallback(WIN32_ELMCB_PCSTR ModuleName, + DWORD64 ModuleBase, ULONG ModuleSize, + void *VoidData) { + FindModuleData *Data = (FindModuleData*)VoidData; + intptr_t Beg = ModuleBase; + intptr_t End = Beg + ModuleSize; + for (int I = 0; I < Data->Depth; I++) { + if (Data->Modules[I]) + continue; + intptr_t Addr = (intptr_t)Data->StackTrace[I]; + if (Beg <= Addr && Addr < End) { + Data->Modules[I] = Data->StrPool->save(ModuleName); + Data->Offsets[I] = Addr - Beg; + } + } + return TRUE; +} +static bool findModulesAndOffsets(void **StackTrace, int Depth, + const char **Modules, intptr_t *Offsets, + const char *MainExecutableName, + StringSaver &StrPool) { + if (!fEnumerateLoadedModules) + return false; + FindModuleData Data; + Data.StackTrace = StackTrace; + Data.Depth = Depth; + Data.Modules = Modules; + Data.Offsets = Offsets; + Data.StrPool = &StrPool; + fEnumerateLoadedModules(GetCurrentProcess(), findModuleCallback, &Data); + return true; +} + +static void PrintStackTraceForThread(llvm::raw_ostream &OS, HANDLE hProcess, + HANDLE hThread, STACKFRAME64 &StackFrame, + CONTEXT *Context) { // Initialize the symbol handler. fSymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_LOAD_LINES); fSymInitialize(hProcess, NULL, TRUE); + // Try llvm-symbolizer first. llvm-symbolizer knows how to deal with both PDBs + // and DWARF, so it should do a good job regardless of what debug info or + // linker is in use. + if (printStackTraceWithLLVMSymbolizer(OS, hProcess, hThread, StackFrame, + Context)) { + return; + } + while (true) { - if (!fStackWalk64(machineType, hProcess, hThread, &StackFrame, Context, 0, - fSymFunctionTableAccess64, fSymGetModuleBase64, 0)) { + if (!fStackWalk64(NativeMachineType, hProcess, hThread, &StackFrame, + Context, 0, fSymFunctionTableAccess64, + fSymGetModuleBase64, 0)) { break; } @@ -404,7 +498,6 @@ extern "C" VOID WINAPI RtlCaptureContext(PCONTEXT ContextRecord); #endif void llvm::sys::PrintStackTrace(raw_ostream &OS) { - STACKFRAME64 StackFrame = {}; CONTEXT Context = {}; ::RtlCaptureContext(&Context); @@ -436,8 +529,6 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) { /// to the process. The handler can have a cookie passed to it to identify /// what instance of the handler it is. 
void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) { - if (CallBacksToRun == 0) - CallBacksToRun = new std::vector<std::pair<void(*)(void*), void*> >(); CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie)); RegisterHandler(); LeaveCriticalSection(&CriticalSection); } @@ -454,17 +545,12 @@ static void Cleanup() { CleanupExecuted = true; // FIXME: open files cannot be deleted. - if (FilesToRemove != NULL) while (!FilesToRemove->empty()) { llvm::sys::fs::remove(FilesToRemove->back()); FilesToRemove->pop_back(); } - - if (CallBacksToRun) - for (auto &I : *CallBacksToRun) - I.first(I.second); - + llvm::sys::RunSignalHandlers(); LeaveCriticalSection(&CriticalSection); } diff --git a/contrib/llvm/lib/Support/Windows/WindowsSupport.h b/contrib/llvm/lib/Support/Windows/WindowsSupport.h index 5bb0b8d..60490f2 100644 --- a/contrib/llvm/lib/Support/Windows/WindowsSupport.h +++ b/contrib/llvm/lib/Support/Windows/WindowsSupport.h @@ -26,12 +26,16 @@ #undef _WIN32_WINNT #undef _WIN32_IE -// Require at least Windows XP(5.1) API. -#define _WIN32_WINNT 0x0501 -#define _WIN32_IE 0x0600 // MinGW at it again. +// Require at least Windows 7 API. +#define _WIN32_WINNT 0x0601 +#define _WIN32_IE 0x0800 // MinGW at it again. FIXME: verify if still needed. #define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" // Get build system configuration settings @@ -43,17 +47,42 @@ #include <string> #include <vector> +/// Determines if the program is running on Windows 8 or newer. This +/// reimplements one of the helpers in the Windows 8.1 SDK, which are intended +/// to supersede raw calls to GetVersionEx. Old SDKs, Cygwin, and MinGW don't +/// yet have VersionHelpers.h, so we have our own helper. +inline bool RunningWindows8OrGreater() { + // Windows 8 is version 6.2, service pack 0.
+ OSVERSIONINFOEXW osvi = {}; + osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + osvi.dwMajorVersion = 6; + osvi.dwMinorVersion = 2; + osvi.wServicePackMajor = 0; + + DWORDLONG Mask = 0; + Mask = VerSetConditionMask(Mask, VER_MAJORVERSION, VER_GREATER_EQUAL); + Mask = VerSetConditionMask(Mask, VER_MINORVERSION, VER_GREATER_EQUAL); + Mask = VerSetConditionMask(Mask, VER_SERVICEPACKMAJOR, VER_GREATER_EQUAL); + + return VerifyVersionInfoW(&osvi, VER_MAJORVERSION | VER_MINORVERSION | + VER_SERVICEPACKMAJOR, + Mask) != FALSE; +} + inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) { if (!ErrMsg) return true; char *buffer = NULL; + DWORD LastError = GetLastError(); DWORD R = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | - FORMAT_MESSAGE_FROM_SYSTEM, - NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL); + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_MAX_WIDTH_MASK, + NULL, LastError, 0, (LPSTR)&buffer, 1, NULL); if (R) - *ErrMsg = prefix + buffer; + *ErrMsg = prefix + ": " + buffer; else - *ErrMsg = prefix + "Unknown error"; + *ErrMsg = prefix + ": Unknown error"; + *ErrMsg += " (0x" + llvm::utohexstr(LastError) + ")"; LocalFree(buffer); return R != 0; diff --git a/contrib/llvm/lib/Support/YAMLParser.cpp b/contrib/llvm/lib/Support/YAMLParser.cpp index d55da5e..c4384ca 100644 --- a/contrib/llvm/lib/Support/YAMLParser.cpp +++ b/contrib/llvm/lib/Support/YAMLParser.cpp @@ -801,7 +801,7 @@ Token &Scanner::peekNext() { removeStaleSimpleKeyCandidates(); SimpleKey SK; - SK.Tok = TokenQueue.front(); + SK.Tok = TokenQueue.begin(); if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) == SimpleKeys.end()) break; @@ -962,10 +962,8 @@ void Scanner::skip(uint32_t Distance) { bool Scanner::isBlankOrBreak(StringRef::iterator Position) { if (Position == End) return false; - if ( *Position == ' ' || *Position == '\t' - || *Position == '\r' || *Position == '\n') - return true; - return false; + return *Position == ' ' || *Position == '\t' || *Position == '\r' || + *Position == '\n'; } bool Scanner::consumeLineBreakIfPresent() { @@ -1163,7 +1161,7 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) { TokenQueue.push_back(T); // [ and { may begin a simple key. - saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); + saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); // And may also be followed by a simple key. IsSimpleKeyAllowed = true; @@ -1326,7 +1324,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { T.Range = StringRef(Start, Current - Start); TokenQueue.push_back(T); - saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; @@ -1404,7 +1402,7 @@ bool Scanner::scanPlainScalar() { TokenQueue.push_back(T); // Plain scalars can be simple keys. - saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; @@ -1439,7 +1437,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) { TokenQueue.push_back(T); // Alias and anchors can be simple keys. - saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; @@ -1669,7 +1667,7 @@ bool Scanner::scanTag() { TokenQueue.push_back(T); // Tags can be simple keys. 
- saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); IsSimpleKeyAllowed = false; diff --git a/contrib/llvm/lib/Support/YAMLTraits.cpp b/contrib/llvm/lib/Support/YAMLTraits.cpp index 6b59a16..2aa6e9b 100644 --- a/contrib/llvm/lib/Support/YAMLTraits.cpp +++ b/contrib/llvm/lib/Support/YAMLTraits.cpp @@ -332,17 +332,12 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) { StringRef KeyStr = SN->getValue(StringStorage); if (!StringStorage.empty()) { // Copy string to permanent storage - unsigned Len = StringStorage.size(); - char *Buf = StringAllocator.Allocate<char>(Len); - memcpy(Buf, &StringStorage[0], Len); - KeyStr = StringRef(Buf, Len); + KeyStr = StringStorage.str().copy(StringAllocator); } return llvm::make_unique<ScalarHNode>(N, KeyStr); } else if (BlockScalarNode *BSN = dyn_cast<BlockScalarNode>(N)) { - StringRef Value = BSN->getValue(); - char *Buf = StringAllocator.Allocate<char>(Value.size()); - memcpy(Buf, Value.data(), Value.size()); - return llvm::make_unique<ScalarHNode>(N, StringRef(Buf, Value.size())); + StringRef ValueCopy = BSN->getValue().copy(StringAllocator); + return llvm::make_unique<ScalarHNode>(N, ValueCopy); } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) { auto SQHNode = llvm::make_unique<SequenceHNode>(N); for (Node &SN : *SQ) { @@ -365,10 +360,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) { StringRef KeyStr = KeyScalar->getValue(StringStorage); if (!StringStorage.empty()) { // Copy string to permanent storage - unsigned Len = StringStorage.size(); - char *Buf = StringAllocator.Allocate<char>(Len); - memcpy(Buf, &StringStorage[0], Len); - KeyStr = StringRef(Buf, Len); + KeyStr = StringStorage.str().copy(StringAllocator); } auto ValueHNode = this->createHNodes(KVN.getValue()); if (EC) diff --git a/contrib/llvm/lib/Support/raw_ostream.cpp b/contrib/llvm/lib/Support/raw_ostream.cpp index 42f830b..15813fd 100644 --- a/contrib/llvm/lib/Support/raw_ostream.cpp +++ b/contrib/llvm/lib/Support/raw_ostream.cpp @@ -57,6 +57,10 @@ #endif #endif +#ifdef LLVM_ON_WIN32 +#include "Windows/WindowsSupport.h" +#endif + using namespace llvm; raw_ostream::~raw_ostream() { @@ -517,7 +521,7 @@ raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC, /// closes the file when the stream is destroyed. raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered) : raw_pwrite_stream(unbuffered), FD(fd), ShouldClose(shouldClose), - Error(false), UseAtomicWrites(false) { + Error(false) { if (FD < 0 ) { ShouldClose = false; return; @@ -567,22 +571,21 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) { assert(FD >= 0 && "File already closed."); pos += Size; - do { - ssize_t ret; - - // Check whether we should attempt to use atomic writes. - if (LLVM_LIKELY(!UseAtomicWrites)) { - ret = ::write(FD, Ptr, Size); - } else { - // Use ::writev() where available. -#if defined(HAVE_WRITEV) - const void *Addr = static_cast<const void *>(Ptr); - struct iovec IOV = {const_cast<void *>(Addr), Size }; - ret = ::writev(FD, &IOV, 1); +#ifndef LLVM_ON_WIN32 + bool ShouldWriteInChunks = false; #else - ret = ::write(FD, Ptr, Size); + // Writing a large amount of output to the Windows console returns ENOMEM. + // It seems that, prior to Windows 8, WriteFile() redirects to WriteConsole(), + // and the latter has a size limit (66000 bytes or less, depending on heap usage).
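// Aside: the chunking below caps each ::write() at 32767 bytes, comfortably
// under the observed ~66000-byte WriteConsole() ceiling, and only when FD is
// actually a console (::_isatty) on a pre-Windows 8 system; files and pipes
// keep full-sized writes. Roughly, as a sketch (EINTR/retry handling elided):
//
//   while (Size > 0) {
//     size_t Chunk = ShouldWriteInChunks ? std::min<size_t>(Size, 32767)
//                                        : Size;
//     ssize_t Ret = ::write(FD, Ptr, Chunk);
//     if (Ret < 0)
//       break;                    // the real code retries recoverable errors
//     Ptr += Ret;
//     Size -= Ret;
//   }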
+ bool ShouldWriteInChunks = !!::_isatty(FD) && !RunningWindows8OrGreater(); #endif - } + + do { + size_t ChunkSize = Size; + if (ChunkSize > 32767 && ShouldWriteInChunks) + ChunkSize = 32767; + + ssize_t ret = ::write(FD, Ptr, ChunkSize); if (ret < 0) { // If it's a recoverable error, swallow it and retry the write. @@ -755,72 +758,15 @@ void raw_string_ostream::write_impl(const char *Ptr, size_t Size) { // raw_svector_ostream //===----------------------------------------------------------------------===// -// The raw_svector_ostream implementation uses the SmallVector itself as the -// buffer for the raw_ostream. We guarantee that the raw_ostream buffer is -// always pointing past the end of the vector, but within the vector -// capacity. This allows raw_ostream to write directly into the correct place, -// and we only need to set the vector size when the data is flushed. +uint64_t raw_svector_ostream::current_pos() const { return OS.size(); } -raw_svector_ostream::raw_svector_ostream(SmallVectorImpl<char> &O, unsigned) - : OS(O) {} - -raw_svector_ostream::raw_svector_ostream(SmallVectorImpl<char> &O) : OS(O) { - init(); -} - -void raw_svector_ostream::init() { - // Set up the initial external buffer. We make sure that the buffer has at - // least 128 bytes free; raw_ostream itself only requires 64, but we want to - // make sure that we don't grow the buffer unnecessarily on destruction (when - // the data is flushed). See the FIXME below. - OS.reserve(OS.size() + 128); - SetBuffer(OS.end(), OS.capacity() - OS.size()); -} - -raw_svector_ostream::~raw_svector_ostream() { - // FIXME: Prevent resizing during this flush(). - flush(); +void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) { + OS.append(Ptr, Ptr + Size); } void raw_svector_ostream::pwrite_impl(const char *Ptr, size_t Size, uint64_t Offset) { - flush(); - memcpy(OS.begin() + Offset, Ptr, Size); -} - -/// resync - This is called when the SmallVector we're appending to is changed -/// outside of the raw_svector_ostream's control. It is only safe to do this -/// if the raw_svector_ostream has previously been flushed. -void raw_svector_ostream::resync() { - assert(GetNumBytesInBuffer() == 0 && "Didn't flush before mutating vector"); - - if (OS.capacity() - OS.size() < 64) - OS.reserve(OS.capacity() * 2); - SetBuffer(OS.end(), OS.capacity() - OS.size()); -} - -void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) { - if (Ptr == OS.end()) { - // Grow the buffer to include the scratch area without copying. 
- size_t NewSize = OS.size() + Size; - assert(NewSize <= OS.capacity() && "Invalid write_impl() call!"); - OS.set_size(NewSize); - } else { - assert(!GetNumBytesInBuffer()); - OS.append(Ptr, Ptr + Size); - } - - OS.reserve(OS.size() + 64); - SetBuffer(OS.end(), OS.capacity() - OS.size()); -} - -uint64_t raw_svector_ostream::current_pos() const { - return OS.size(); -} - -StringRef raw_svector_ostream::str() { - flush(); - return StringRef(OS.begin(), OS.size()); + memcpy(OS.data() + Offset, Ptr, Size); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/TableGen/Record.cpp b/contrib/llvm/lib/TableGen/Record.cpp index c9a31b6..11e35b7 100644 --- a/contrib/llvm/lib/TableGen/Record.cpp +++ b/contrib/llvm/lib/TableGen/Record.cpp @@ -673,6 +673,14 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { PrintFatalError(CurRec->getLoc(), "Undefined reference:'" + Name + "'\n"); } + + if (isa<IntRecTy>(getType())) { + if (BitsInit *BI = dyn_cast<BitsInit>(LHS)) { + if (Init *NewInit = BI->convertInitializerTo(IntRecTy::get())) + return NewInit; + break; + } + } } break; } @@ -714,7 +722,7 @@ Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const { std::string UnOpInit::getAsString() const { std::string Result; - switch (Opc) { + switch (getOpcode()) { case CAST: Result = "!cast<" + getType()->getAsString() + ">"; break; case HEAD: Result = "!head"; break; case TAIL: Result = "!tail"; break; @@ -842,7 +850,7 @@ Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const { std::string BinOpInit::getAsString() const { std::string Result; - switch (Opc) { + switch (getOpcode()) { case CONCAT: Result = "!con"; break; case ADD: Result = "!add"; break; case AND: Result = "!and"; break; @@ -1046,7 +1054,7 @@ Init *TernOpInit::resolveReferences(Record &R, const RecordVal *RV) const { Init *lhs = LHS->resolveReferences(R, RV); - if (Opc == IF && lhs != LHS) { + if (getOpcode() == IF && lhs != LHS) { IntInit *Value = dyn_cast<IntInit>(lhs); if (Init *I = lhs->convertInitializerTo(IntRecTy::get())) Value = dyn_cast<IntInit>(I); @@ -1074,7 +1082,7 @@ Init *TernOpInit::resolveReferences(Record &R, std::string TernOpInit::getAsString() const { std::string Result; - switch (Opc) { + switch (getOpcode()) { case SUBST: Result = "!subst"; break; case FOREACH: Result = "!foreach"; break; case IF: Result = "!if"; break; @@ -1633,7 +1641,7 @@ void Record::dump() const { errs() << *this; } raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) { OS << R.getNameInitAsString(); - const std::vector<Init *> &TArgs = R.getTemplateArgs(); + ArrayRef<Init *> TArgs = R.getTemplateArgs(); if (!TArgs.empty()) { OS << "<"; bool NeedComma = false; diff --git a/contrib/llvm/lib/TableGen/SetTheory.cpp b/contrib/llvm/lib/TableGen/SetTheory.cpp index 07c5381..f56b17a 100644 --- a/contrib/llvm/lib/TableGen/SetTheory.cpp +++ b/contrib/llvm/lib/TableGen/SetTheory.cpp @@ -196,7 +196,7 @@ struct SequenceOp : public SetTheory::Operator { if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[2])) To = II->getValue(); else - PrintFatalError(Loc, "From must be an integer: " + Expr->getAsString()); + PrintFatalError(Loc, "To must be an integer: " + Expr->getAsString()); if (To < 0 || To >= (1 << 30)) PrintFatalError(Loc, "To out of range"); diff --git a/contrib/llvm/lib/TableGen/TGParser.cpp b/contrib/llvm/lib/TableGen/TGParser.cpp index 5c36fda..1506a71 100644 --- a/contrib/llvm/lib/TableGen/TGParser.cpp +++ 
b/contrib/llvm/lib/TableGen/TGParser.cpp @@ -77,7 +77,8 @@ bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) { /// SetValue - /// Return true on error, false on success. bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, - const std::vector<unsigned> &BitList, Init *V) { + ArrayRef<unsigned> BitList, Init *V, + bool AllowSelfAssignment) { if (!V) return false; if (!CurRec) CurRec = &CurMultiClass->Rec; @@ -91,8 +92,8 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, // in the resolution machinery. if (BitList.empty()) if (VarInit *VI = dyn_cast<VarInit>(V)) - if (VI->getNameInit() == ValName) - return false; + if (VI->getNameInit() == ValName && !AllowSelfAssignment) + return true; // If we are assigning to a subset of the bits in the value... then we must be // assigning to a field of BitsRecTy, which must have a BitsInit @@ -152,7 +153,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { if (AddValue(CurRec, SubClass.RefRange.Start, Val)) return true; - const std::vector<Init *> &TArgs = SC->getTemplateArgs(); + ArrayRef<Init *> TArgs = SC->getTemplateArgs(); // Ensure that an appropriate number of template arguments are specified. if (TArgs.size() < SubClass.TemplateArgs.size()) @@ -165,7 +166,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { if (i < SubClass.TemplateArgs.size()) { // If a value is specified for this template arg, set it now. if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i], - std::vector<unsigned>(), SubClass.TemplateArgs[i])) + None, SubClass.TemplateArgs[i])) return true; // Resolve it next. @@ -228,7 +229,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, CurMC->DefPrototypes.push_back(std::move(NewDef)); } - const std::vector<Init *> &SMCTArgs = SMC->Rec.getTemplateArgs(); + ArrayRef<Init *> SMCTArgs = SMC->Rec.getTemplateArgs(); // Ensure that an appropriate number of template arguments are // specified. @@ -243,8 +244,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, // If a value is specified for this template arg, set it in the // superclass now. if (SetValue(CurRec, SubMultiClass.RefRange.Start, SMCTArgs[i], - std::vector<unsigned>(), - SubMultiClass.TemplateArgs[i])) + None, SubMultiClass.TemplateArgs[i])) return true; // Resolve it next. @@ -258,8 +258,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, for (const auto &Def : makeArrayRef(CurMC->DefPrototypes).slice(newDefStart)) { if (SetValue(Def.get(), SubMultiClass.RefRange.Start, SMCTArgs[i], - std::vector<unsigned>(), - SubMultiClass.TemplateArgs[i])) + None, SubMultiClass.TemplateArgs[i])) return true; // Resolve it next. @@ -332,8 +331,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false)); - if (SetValue(IterRec.get(), Loc, IterVar->getName(), - std::vector<unsigned>(), IVal)) + if (SetValue(IterRec.get(), Loc, IterVar->getName(), None, IVal)) return Error(Loc, "when instantiating this def"); // Resolve it next. 
@@ -1641,7 +1639,7 @@ std::vector<Init*> TGParser::ParseValueList(Record *CurRec, Record *ArgsRec, RecTy *ItemType = EltTy; unsigned int ArgN = 0; if (ArgsRec && !EltTy) { - const std::vector<Init *> &TArgs = ArgsRec->getTemplateArgs(); + ArrayRef<Init *> TArgs = ArgsRec->getTemplateArgs(); if (TArgs.empty()) { TokError("template argument provided to non-template class"); return std::vector<Init*>(); @@ -1662,7 +1660,7 @@ std::vector<Init*> TGParser::ParseValueList(Record *CurRec, Record *ArgsRec, Lex.Lex(); // Eat the comma if (ArgsRec && !EltTy) { - const std::vector<Init *> &TArgs = ArgsRec->getTemplateArgs(); + ArrayRef<Init *> TArgs = ArgsRec->getTemplateArgs(); if (ArgN >= TArgs.size()) { TokError("too many template arguments"); return std::vector<Init*>(); @@ -1728,7 +1726,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec, SMLoc ValLoc = Lex.getLoc(); Init *Val = ParseValue(CurRec, Type); if (!Val || - SetValue(CurRec, ValLoc, DeclName, std::vector<unsigned>(), Val)) + SetValue(CurRec, ValLoc, DeclName, None, Val)) // Return the name, even if an error is thrown. This is so that we can // continue to make some progress, even without the value having been // initialized. @@ -2313,13 +2311,11 @@ bool TGParser::ParseMultiClass() { return false; } -Record *TGParser:: -InstantiateMulticlassDef(MultiClass &MC, - Record *DefProto, - Init *&DefmPrefix, - SMRange DefmPrefixRange, - const std::vector<Init *> &TArgs, - std::vector<Init *> &TemplateVals) { +Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto, + Init *&DefmPrefix, + SMRange DefmPrefixRange, + ArrayRef<Init *> TArgs, + std::vector<Init *> &TemplateVals) { // We need to preserve DefProto so it can be reused for later // instantiations, so create a new Record to inherit from it. @@ -2360,8 +2356,8 @@ InstantiateMulticlassDef(MultiClass &MC, // Set the value for NAME. We don't resolve references to it 'til later, // though, so that uses in nested multiclass names don't get // confused. - if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME", - std::vector<unsigned>(), DefmPrefix)) { + if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME", None, DefmPrefix, + /*AllowSelfAssignment*/true)) { Error(DefmPrefixRange.Start, "Could not resolve " + CurRec->getNameInitAsString() + ":NAME to '" + DefmPrefix->getAsUnquotedString() + "'"); @@ -2437,11 +2433,9 @@ InstantiateMulticlassDef(MultiClass &MC, return CurRec.release(); } -bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, - Record *CurRec, - SMLoc DefmPrefixLoc, - SMLoc SubClassLoc, - const std::vector<Init *> &TArgs, +bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, Record *CurRec, + SMLoc DefmPrefixLoc, SMLoc SubClassLoc, + ArrayRef<Init *> TArgs, std::vector<Init *> &TemplateVals, bool DeleteArgs) { // Loop over all of the template arguments, setting them to the specified @@ -2450,8 +2444,7 @@ bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, // Check if a value is specified for this temp-arg. if (i < TemplateVals.size()) { // Set it now. - if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], std::vector<unsigned>(), - TemplateVals[i])) + if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], None, TemplateVals[i])) return true; // Resolve it next. @@ -2540,7 +2533,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { std::vector<Init*> &TemplateVals = Ref.TemplateArgs; // Verify that the correct number of template arguments were specified. 
- const std::vector<Init *> &TArgs = MC->Rec.getTemplateArgs(); + ArrayRef<Init *> TArgs = MC->Rec.getTemplateArgs(); if (TArgs.size() < TemplateVals.size()) return Error(SubClassLoc, "more template args specified than multiclass expects"); diff --git a/contrib/llvm/lib/TableGen/TGParser.h b/contrib/llvm/lib/TableGen/TGParser.h index d69d1f4..739d9a9 100644 --- a/contrib/llvm/lib/TableGen/TGParser.h +++ b/contrib/llvm/lib/TableGen/TGParser.h @@ -105,10 +105,13 @@ public: private: // Semantic analysis methods. bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV); bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName, - const std::vector<unsigned> &BitList, Init *V); + ArrayRef<unsigned> BitList, Init *V, + bool AllowSelfAssignment = false); bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName, - const std::vector<unsigned> &BitList, Init *V) { - return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V); + ArrayRef<unsigned> BitList, Init *V, + bool AllowSelfAssignment = false) { + return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V, + AllowSelfAssignment); } bool AddSubClass(Record *Rec, SubClassReference &SubClass); bool AddSubMultiClass(MultiClass *CurMC, @@ -135,17 +138,13 @@ private: // Parser methods. bool ParseObject(MultiClass *MC); bool ParseClass(); bool ParseMultiClass(); - Record *InstantiateMulticlassDef(MultiClass &MC, - Record *DefProto, - Init *&DefmPrefix, - SMRange DefmPrefixRange, - const std::vector<Init *> &TArgs, + Record *InstantiateMulticlassDef(MultiClass &MC, Record *DefProto, + Init *&DefmPrefix, SMRange DefmPrefixRange, + ArrayRef<Init *> TArgs, std::vector<Init *> &TemplateVals); - bool ResolveMulticlassDefArgs(MultiClass &MC, - Record *DefProto, - SMLoc DefmPrefixLoc, - SMLoc SubClassLoc, - const std::vector<Init *> &TArgs, + bool ResolveMulticlassDefArgs(MultiClass &MC, Record *DefProto, + SMLoc DefmPrefixLoc, SMLoc SubClassLoc, + ArrayRef<Init *> TArgs, std::vector<Init *> &TemplateVals, bool DeleteArgs); bool ResolveMulticlassDef(MultiClass &MC, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index 9a7d6c8..46ef2c1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -32,6 +32,15 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable ARMv8 PMUv3 Performance Monitors extension">; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Full FP16", [FeatureFPARMv8]>; + +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + /// Cyclone has register move instructions which are "free". 
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -40,6 +49,15 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", + "Reserve X18, making it unavailable " + "as a GPR">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -47,6 +65,9 @@ def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [FeatureCRC]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", [HasV8_1aOps]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -70,19 +91,29 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", "Cyclone", @@ -90,17 +121,31 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", FeatureNEON, FeatureCrypto, FeatureCRC, + FeaturePerfMon, FeatureZCRegMove, FeatureZCZeroing]>; +def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", + "Samsung Exynos-M1 processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; +// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; +// FIXME: Exynos-M1 is currently modelled without a specific SchedModel. 
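[Note: the `ProcessorModel` records above are what `-mcpu=` resolves against: a CPU name selects a scheduling model plus a feature bundle, which is why `cortex-a35` can borrow `CortexA53Model` while `exynos-m1` falls back to `NoSchedModel` until a dedicated model lands. Sketched as a plain lookup table; the names come from the diff, the structure is hypothetical:

    #include <map>
    #include <string>
    #include <vector>

    struct CPUInfo {
      std::string SchedModel;            // e.g. "CortexA53Model"
      std::vector<std::string> Features; // bundle from the Proc* record
    };

    static const std::map<std::string, CPUInfo> CPUTable = {
        {"cortex-a35", {"CortexA53Model", {"fp-armv8", "neon", "crypto", "crc", "perfmon"}}},
        {"cortex-a53", {"CortexA53Model", {"fp-armv8", "neon", "crypto", "crc", "perfmon"}}},
        {"exynos-m1",  {"NoSchedModel",   {"fp-armv8", "neon", "crypto", "crc", "perfmon"}}},
    };
]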
+def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>; //===----------------------------------------------------------------------===// // Assembly parser @@ -109,11 +154,13 @@ def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def GenericAsmParserVariant : AsmParserVariant { int Variant = 0; string Name = "generic"; + string BreakCharacters = "."; } def AppleAsmParserVariant : AsmParserVariant { int Variant = 1; string Name = "apple-neon"; + string BreakCharacters = "."; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index d7ef3f4..d215d9e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -122,7 +122,7 @@ AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { // Get the previous machine basic block in the function. - MachineFunction::iterator MBBI = *MBB; + MachineFunction::iterator MBBI(MBB); // Can't go off top of function. if (MBBI == MBB->getParent()->begin()) @@ -131,7 +131,7 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 2> Cond; - MachineBasicBlock *PrevBB = std::prev(MBBI); + MachineBasicBlock *PrevBB = &*std::prev(MBBI); for (MachineBasicBlock *S : MBB->predecessors()) if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB && !FBB) @@ -151,10 +151,9 @@ static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB, // If there is no non-pseudo in the current block, loop back around and try // the previous block (if there is one). while ((FMBB = getBBFallenThrough(FMBB, TII))) { - for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) { - if (!I->isPseudo()) - return &*I; - } + for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend())) + if (!I.isPseudo()) + return &I; } // There was no previous non-pseudo in the fallen through blocks @@ -217,8 +216,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { ++Idx; } - DEBUG(dbgs() << "Scan complete, "<< Sequences.size() - << " occurences of pattern found.\n"); + DEBUG(dbgs() << "Scan complete, " << Sequences.size() + << " occurrences of pattern found.\n"); // Then update the basic block, inserting nops between the detected sequences. for (auto &MI : Sequences) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 9d6dbd6..3d1ab4e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -158,7 +158,7 @@ INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE, "AArch64 A57 FP Load-Balancing", false, false) namespace { -/// A Chain is a sequence of instructions that are linked together by +/// A Chain is a sequence of instructions that are linked together by /// an accumulation operand. For example: /// /// fmul d0<def>, ? @@ -285,7 +285,7 @@ public: std::string str() const { std::string S; raw_string_ostream OS(S); - + OS << "{"; StartInst->print(OS, /* SkipOpers= */true); OS << " -> "; @@ -427,7 +427,7 @@ Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, return Ch; } } - + // Bailout case - just return the first item. 
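[Note: the `MachineFunction::iterator MBBI(MBB)` and `&*std::prev(MBBI)` rewrites above track an LLVM-wide ilist change that removed the implicit conversions between node pointers and iterators, so positions and nodes must now be converted explicitly. The same discipline shown with a plain `std::list`, as a standalone analogy rather than the ilist code itself:

    #include <cassert>
    #include <list>

    int demo() {
      std::list<int> Blocks = {10, 20, 30};
      auto It = std::next(Blocks.begin()); // iterator from a position, like MBBI(MBB)
      int *Prev = &*std::prev(It);         // node pointer via explicit &*, like &*std::prev(MBBI)
      assert(*Prev == 10);
      return *Prev;
    }
]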
Chain *Ch = L.front(); L.erase(L.begin()); @@ -495,7 +495,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, RS.enterBasicBlock(&MBB); RS.forward(MachineBasicBlock::iterator(G->getStart())); - // Can we find an appropriate register that is available throughout the life + // Can we find an appropriate register that is available throughout the life // of the chain? unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); @@ -593,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); - MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 716e1a3..3afcdfb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -57,6 +57,8 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden, " the other."), cl::init(true)); +#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion" + //===----------------------------------------------------------------------===// // AArch64AddressTypePromotion //===----------------------------------------------------------------------===// @@ -76,7 +78,7 @@ public: } const char *getPassName() const override { - return "AArch64 Address Type Promotion"; + return AARCH64_TYPE_PROMO_NAME; } /// Iterate over the functions and promote the computation of interesting @@ -143,10 +145,10 @@ private: char AArch64AddressTypePromotion::ID = 0; INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) FunctionPass *llvm::createAArch64AddressTypePromotionPass() { return new AArch64AddressTypePromotion(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 18d21fd..1644d71 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -61,6 +61,12 @@ STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); +namespace llvm { +void initializeAArch64AdvSIMDScalarPass(PassRegistry &); +} + +#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization" + namespace { class AArch64AdvSIMDScalar : public MachineFunctionPass { MachineRegisterInfo *MRI; @@ -82,12 +88,14 @@ private: public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} + explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) { + initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; const char *getPassName() const override { - return "AdvSIMD Scalar Operation Optimization"; + return AARCH64_ADVSIMD_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -98,6 +106,9 @@ public: char AArch64AdvSIMDScalar::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar", + AARCH64_ADVSIMD_NAME, false, false) + static bool isGPR64(unsigned Reg, unsigned SubReg, const MachineRegisterInfo *MRI) { if (SubReg) @@ -381,7 +392,7 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { // Just check things on a one-block-at-a-time basis. for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(I)) + if (processMachineBasicBlock(&*I)) Changed = true; return Changed; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp index d973234..a614f55 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -45,6 +45,12 @@ BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), STATISTIC(NumSplit, "Number of basic blocks split"); STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); +namespace llvm { +void initializeAArch64BranchRelaxationPass(PassRegistry &); +} + +#define AARCH64_BR_RELAX_NAME "AArch64 branch relaxation pass" + namespace { class AArch64BranchRelaxation : public MachineFunctionPass { /// BasicBlockInfo - Information about the offset and size of a single @@ -93,17 +99,22 @@ class AArch64BranchRelaxation : public MachineFunctionPass { public: static char ID; - AArch64BranchRelaxation() : MachineFunctionPass(ID) {} + AArch64BranchRelaxation() : MachineFunctionPass(ID) { + initializeAArch64BranchRelaxationPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 branch relaxation pass"; + return AARCH64_BR_RELAX_NAME; } }; char AArch64BranchRelaxation::ID = 0; } +INITIALIZE_PASS(AArch64BranchRelaxation, "aarch64-branch-relax", + AARCH64_BR_RELAX_NAME, false, false) + /// verify - check BBOffsets, BBSizes, alignment of islands void AArch64BranchRelaxation::verify() { #ifndef NDEBUG @@ -131,14 +142,14 @@ void AArch64BranchRelaxation::dumpBBs() { /// into the block immediately after it. static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI(MBB); // Can't fall off end of function. - MachineBasicBlock *NextBB = std::next(MBBI); + auto NextBB = std::next(MBBI); if (NextBB == MBB->getParent()->end()) return false; for (MachineBasicBlock *S : MBB->successors()) - if (S == NextBB) + if (S == &*NextBB) return true; return false; @@ -216,9 +227,7 @@ AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { // Create a new MBB for the code after the OrigBB. 
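[Note: the recurring hunk shape above (declare `initializeFooPass(PassRegistry &)`, call it from the constructor, add an `INITIALIZE_PASS` sharing a name macro with `getPassName`) is the standard way to register a machine pass so tools can address it by name. A condensed skeleton of the pattern with placeholder names, compilable only against LLVM headers of this era:

    #include "llvm/CodeGen/MachineFunctionPass.h"
    #include "llvm/PassRegistry.h"
    using namespace llvm;

    namespace llvm {
    void initializeExamplePassPass(PassRegistry &); // INITIALIZE_PASS generates the definition
    }

    #define EXAMPLE_PASS_NAME "Example machine pass"

    namespace {
    class ExamplePass : public MachineFunctionPass {
    public:
      static char ID;
      ExamplePass() : MachineFunctionPass(ID) {
        initializeExamplePassPass(*PassRegistry::getPassRegistry());
      }
      bool runOnMachineFunction(MachineFunction &) override { return false; }
      const char *getPassName() const override { return EXAMPLE_PASS_NAME; }
    };
    char ExamplePass::ID = 0;
    } // end anonymous namespace

    INITIALIZE_PASS(ExamplePass, "example-pass", EXAMPLE_PASS_NAME, false, false)

The macro keeps the human-readable name in one place, which is exactly what the AARCH64_*_NAME defines in these hunks do.]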
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; - ++MBBI; - MF->insert(MBBI, NewBB); + MF->insert(++OrigBB->getIterator(), NewBB); // Splice the instructions starting with MI over to NewBB. NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); @@ -421,7 +430,7 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { MBB->replaceSuccessor(FBB, NewBB); NewBB->addSuccessor(FBB); } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*std::next(MachineFunction::iterator(MBB)); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << ", invert condition and change dest. to BB#" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 1e2d1c3..bc44bc5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,30 +25,28 @@ namespace { using namespace llvm; -static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7}; -static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, - AArch64::H3, AArch64::H4, AArch64::H5, - AArch64::H6, AArch64::H7}; -static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, - AArch64::S3, AArch64::S4, AArch64::S5, - AArch64::S6, AArch64::S7}; -static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, - AArch64::D3, AArch64::D4, AArch64::D5, - AArch64::D6, AArch64::D7}; -static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, - AArch64::Q3, AArch64::Q4, AArch64::Q5, - AArch64::Q6, AArch64::Q7}; +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, unsigned SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = State.getMachineFunction() - .getTarget() - .getDataLayout() - ->getStackAlignment(); + unsigned StackAlign = + State.getMachineFunction().getDataLayout().getStackAlignment(); unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); for (auto &It : PendingMembers) { @@ -88,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
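[Note: switching the register tables from `uint16_t` to `MCPhysReg` above is type hygiene (the width was the same at the time, but the intent becomes explicit), and `ArrayRef<MCPhysReg>` then binds directly to whichever C array matches. How block allocation consumes such a table, as a simplified standalone sketch of "N consecutive free registers", not the real calling-convention code:

    #include <cstdint>
    #include <set>
    #include <vector>

    using MCPhysRegLike = uint16_t; // stand-in for llvm::MCPhysReg

    // Find NumMembers consecutive unallocated entries in RegList, mimicking
    // the contiguous-block search a custom CC routine performs.
    static int findRegBlock(const std::vector<MCPhysRegLike> &RegList,
                            unsigned NumMembers,
                            const std::set<MCPhysRegLike> &Allocated) {
      for (size_t I = 0; I + NumMembers <= RegList.size(); ++I) {
        bool Free = true;
        for (unsigned J = 0; J < NumMembers; ++J)
          Free &= !Allocated.count(RegList[I + J]);
        if (Free)
          return static_cast<int>(I);
      }
      return -1; // no block; fall back to the stack
    }
]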
- ArrayRef<uint16_t> RegList; + ArrayRef<MCPhysReg> RegList; if (LocVT.SimpleTy == MVT::i64) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 815ebef..388d64e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> : CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian<CCAction A> : - CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>; + CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention @@ -279,6 +279,23 @@ def CSR_AArch64_TLS_Darwin FP, (sequence "Q%u", 0, 31))>; +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_AArch64_TLS_Darwin, +// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. +def CSR_AArch64_CXX_TLS_Darwin + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_AArch64_CXX_TLS_Darwin_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_AArch64_CXX_TLS_Darwin_ViaCopy + : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 06ff9af..9310ac4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -117,10 +117,10 @@ struct LDTLSCleanup : public MachineFunctionPass { *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); // Insert a copy from X0 to TLSBaseAddrReg for later. 
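[Note: the new `CSR_AArch64_CXX_TLS_Darwin` definitions above are built with TableGen set algebra: `add` unions register sequences and `sub` removes members, so "X1..X28 except X15..X18" is expressed directly. The same computation in ordinary code, register names only and purely illustrative:

    #include <set>
    #include <string>

    static std::set<std::string> xRange(unsigned Lo, unsigned Hi) {
      std::set<std::string> S;
      for (unsigned I = Lo; I <= Hi; ++I)
        S.insert("X" + std::to_string(I));
      return S;
    }

    // (sub (sequence "X%u", 1, 28), X15, X16, X17, X18)
    static std::set<std::string> cxxTLSGPRs() {
      std::set<std::string> S = xRange(1, 28);
      for (unsigned R : {15u, 16u, 17u, 18u})
        S.erase("X" + std::to_string(R));
      return S;
    }
]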
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(AArch64::X0); + MachineInstr *Copy = + BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) + .addReg(AArch64::X0); return Copy; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index efdb2e3..78c239b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -168,6 +168,8 @@ namespace llvm { void initializeAArch64CollectLOHPass(PassRegistry &); } +#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)" + namespace { struct AArch64CollectLOH : public MachineFunctionPass { static char ID; @@ -178,7 +180,7 @@ struct AArch64CollectLOH : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 Collect Linker Optimization Hint (LOH)"; + return AARCH64_COLLECT_LOH_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -220,12 +222,10 @@ typedef SmallVector<unsigned, 32> MapIdToReg; char AArch64CollectLOH::ID = 0; INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) /// Given a couple (MBB, reg) get the corresponding set of instruction from /// the given "sets". @@ -353,9 +353,17 @@ static void initReachingDef(const MachineFunction &MF, for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - assert(ItRegId != RegToId.end() && - "Sub-register of an " - "involved register, not recorded as involved!"); + // If this alias has not been recorded, then it is not interesting + // for the current analysis. + // We can end up in this situation because of tuple registers. + // E.g., Let say we are interested in S1. When we register + // S1, we will also register its aliases and in particular + // the tuple Q1_Q2. + // Now, when we encounter Q1_Q2, we will look through its aliases + // and will find that S2 is not registered. + if (ItRegId == RegToId.end()) + continue; + BBKillSet.set(ItRegId->second); BBGen[ItRegId->second] = &MI; } @@ -523,6 +531,8 @@ static bool isCandidateStore(const MachineInstr *Instr) { switch (Instr->getOpcode()) { default: return false; + case AArch64::STRBBui: + case AArch64::STRHHui: case AArch64::STRBui: case AArch64::STRHui: case AArch64::STRWui: @@ -884,7 +894,8 @@ static void computeOthers(const InstrToInstrs &UseToDefs, bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); // If the chain is three instructions long and ldr is the second element, // then this ldr must load form GOT, otherwise this is not a correct chain. 
- if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT) + if (L2 && !IsL2Add && + !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) continue; SmallVector<const MachineInstr *, 3> Args; MCLOHType Kind; @@ -1000,7 +1011,8 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, DEBUG(dbgs() << "** Collect Involved Register\n"); for (const auto &MBB : MF) { for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI)) + if (!canDefBePartOfLOH(&MI) && + !isCandidateLoad(&MI) && !isCandidateStore(&MI)) continue; // Process defs diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index b9e41c6..fc27bfe 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -59,6 +59,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -153,13 +154,20 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( case AArch64::SUBSXri: // cmn is an alias for adds with a dead destination register. case AArch64::ADDSWri: - case AArch64::ADDSXri: - if (MRI->use_empty(I->getOperand(0).getReg())) - return I; - - DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); - return nullptr; - + case AArch64::ADDSXri: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm()); + if (!I->getOperand(2).isImm()) { + DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n'); + return nullptr; + } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) { + DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n'); + return nullptr; + } else if (!MRI->use_empty(I->getOperand(0).getReg())) { + DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + return nullptr; + } + return I; + } // Prevent false positive case like: // cmp w19, #0 // cinc w0, w19, gt diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2b0c92f..df1320f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -353,7 +353,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { MIOperands::PhysRegInfo PRI = MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); - if (PRI.Reads) { + if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original // compare, so reject the transform if there are uses of the flags // besides the terminators. @@ -362,7 +362,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { return nullptr; } - if (PRI.Clobbers) { + if (PRI.Defined || PRI.Clobbered) { DEBUG(dbgs() << "Not convertible compare: " << *I); ++NumUnknNZCVDefs; return nullptr; @@ -567,8 +567,8 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. 
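[Note: the CollectLOH change at the top of this hunk, from `getTargetFlags() != AArch64II::MO_GOT` to `!(... & AArch64II::MO_GOT)`, is the classic flags-as-bitmask fix: once operand flags can carry several bits at once, an equality test silently misses combined values. In miniature, with illustrative flag values rather than AArch64II's:

    enum OperandFlags : unsigned {
      MO_PAGE    = 1u << 0,
      MO_PAGEOFF = 1u << 1,
      MO_GOT     = 1u << 2,
      MO_NC      = 1u << 3,
    };

    // Wrong: fails for (MO_GOT | MO_PAGE) even though the GOT bit is set.
    bool loadsFromGOTBuggy(unsigned Flags) { return Flags == MO_GOT; }

    // Right: test the bit, ignore whatever else is set.
    bool loadsFromGOT(unsigned Flags) { return (Flags & MO_GOT) != 0; }
]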
updateTailPHIs(); - Head->removeSuccessor(CmpBB); - CmpBB->removeSuccessor(Tail); + Head->removeSuccessor(CmpBB, true); + CmpBB->removeSuccessor(Tail, true); Head->transferSuccessorsAndUpdatePHIs(CmpBB); DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); TII->RemoveBranch(*Head); @@ -786,13 +786,13 @@ void AArch64ConditionalCompares::updateDomTree( // convert() removes CmpBB which was previously dominated by Head. // CmpBB children should be transferred to Head. MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); - for (unsigned i = 0, e = Removed.size(); i != e; ++i) { - MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) { + MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB); assert(Node != HeadNode && "Cannot erase the head node"); assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); while (Node->getNumChildren()) DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); - DomTree->eraseNode(Removed[i]); + DomTree->eraseNode(RemovedMBB); } } @@ -801,8 +801,8 @@ void AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) { if (!Loops) return; - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) + Loops->removeBlock(RemovedMBB); } /// Invalidate MachineTraceMetrics before if-conversion. @@ -899,7 +899,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { Loops = getAnalysisIfAvailable<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; - MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); + MinSize = MF.getFunction()->optForMinSize(); bool Changed = false; CmpConv.runOnMachineFunction(MF); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 74fc167..576cf4a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -26,6 +26,12 @@ using namespace llvm; STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); +namespace llvm { +void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry &); +} + +#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions" + namespace { class AArch64DeadRegisterDefinitions : public MachineFunctionPass { private: @@ -35,11 +41,14 @@ private: bool usesFrameIndex(const MachineInstr &MI); public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) { + initializeAArch64DeadRegisterDefinitionsPass( + *PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; - const char *getPassName() const override { return "Dead register definitions"; } + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -49,6 +58,9 @@ public: char AArch64DeadRegisterDefinitions::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs", + AARCH64_DEAD_REG_DEF_NAME, false, false) + bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( unsigned Reg, const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index c2470f7..d24e42a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -22,18 +22,26 @@ #include "llvm/Support/MathExtras.h" using namespace llvm; +namespace llvm { +void initializeAArch64ExpandPseudoPass(PassRegistry &); +} + +#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass" + namespace { class AArch64ExpandPseudo : public MachineFunctionPass { public: static char ID; - AArch64ExpandPseudo() : MachineFunctionPass(ID) {} + AArch64ExpandPseudo() : MachineFunctionPass(ID) { + initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 pseudo instruction expansion pass"; + return AARCH64_EXPAND_PSEUDO_NAME; } private: @@ -45,6 +53,9 @@ private: char AArch64ExpandPseudo::ID = 0; } +INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", + AARCH64_EXPAND_PSEUDO_NAME, false, false) + /// \brief Transfer implicit operands on the pseudo instruction to the /// instructions created from the expansion. static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 0728198..0ac4b39 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -523,7 +523,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) U = C; } - if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (auto *Ty = dyn_cast<PointerType>(Obj->getType())) if (Ty->getAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. @@ -969,7 +969,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Cannot encode an offset register and an immediate offset in the same // instruction. Fold the immediate offset into the load/store instruction and - // emit an additonal add to take care of the offset register. + // emit an additional add to take care of the offset register. 
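[Note: the `simplifyAddress` comment just above describes a real encoding limit: an AArch64 load/store takes a base plus either an immediate offset or a register offset, never both, so when both are present the immediate is folded away through an extra `add`. Roughly, with a hypothetical helper and pseudo-lowering:

    // Lowering choice for addr = Base + Imm + OffReg on AArch64-like ISAs:
    //   ldr x0, [xBase, #Imm]   ; base + immediate  (one addend)
    //   ldr x0, [xBase, xOff]   ; base + register   (one addend)
    // Both addends at once are not encodable, so materialize one of them:
    //   add xTmp, xBase, #Imm
    //   ldr x0, [xTmp, xOff]
    struct Address {
      unsigned BaseReg;
      unsigned OffsetReg; // 0 means "none"
      long Offset;
    };

    bool needsExtraAdd(const Address &A) {
      return A.Offset != 0 && A.OffsetReg != 0;
    }
]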
if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) RegisterOffsetNeedsLowering = true; @@ -1058,8 +1058,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size // and alignment should be based on the VT. MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI).addImm(Offset); } else { @@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, } // Check if the mul can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1193,12 +1193,16 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, - AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) { if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; @@ -1214,12 +1218,15 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftType, ShiftVal, SetFlags, - WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } } } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1323,6 +1330,10 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrs, AArch64::SUBXrs }, { AArch64::ADDWrs, AArch64::ADDXrs } }, @@ -1360,6 +1371,9 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + if (ShiftImm >= 4) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrx, AArch64::SUBXrx }, { AArch64::ADDWrx, AArch64::ADDXrx } }, @@ -1542,7 +1556,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, return ResultReg; // Check if the mul can be folded into the instruction. 
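[Note: the new `ShiftImm >= RetVT.getSizeInBits()` bail-outs above guard against shift amounts that are undefined behavior in C++ and unencodable in the instruction anyway; a folded `shl` whose amount reaches the operand width must be rejected rather than emitted. The invariant as a checked standalone helper:

    #include <cassert>
    #include <cstdint>

    // Shifting by >= the bit width is UB in C++ and has no valid encoding
    // in AArch64's shifted-register forms, so callers must pre-filter it.
    uint64_t shiftLeftChecked(uint64_t V, unsigned Amt, unsigned Bits) {
      assert(Bits <= 64 && Amt < Bits && "shift amount out of range");
      return (V << Amt) & (Bits == 64 ? ~0ULL : ((1ULL << Bits) - 1));
    }
]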
- if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1558,12 +1572,15 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<ShlOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { uint64_t ShiftVal = C->getZExtValue(); @@ -1571,9 +1588,12 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1646,6 +1666,11 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, { AArch64::ORRWrs, AArch64::ORRXrs }, { AArch64::EORWrs, AArch64::EORXrs } }; + + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + const TargetRegisterClass *RC; unsigned Opc; switch (RetVT.SimpleTy) { @@ -2235,14 +2260,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { MIB.addImm(TestBit); MIB.addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - fastEmitBranch(FBB, DbgLoc); - + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2257,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && isValueAvailable(CI)) { // Try to optimize or fold the cmp. @@ -2289,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch // instruction. - CC = getCompareCC(Predicate); + AArch64CC::CondCode CC = getCompareCC(Predicate); AArch64CC::CondCode ExtraCC = AArch64CC::AL; switch (Predicate) { default: @@ -2317,52 +2334,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { .addImm(CC) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. 
- uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && isValueAvailable(TI) && - isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (!CondReg) - return false; - bool CondIsKill = hasTrivialKill(TI->getOperand(0)); - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) { - CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill, - AArch64::sub_32); - CondIsKill = true; - } - - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = AArch64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); - - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { @@ -2371,34 +2343,31 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) .addMBB(Target); - // Obtain the branch weight and add the target to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - Target->getBasicBlock()); - FuncInfo.MBB->addSuccessor(Target, BranchWeight); + // Obtain the branch probability and add the target to the successor list. + if (FuncInfo.BPI) { + auto BranchProbability = FuncInfo.BPI->getEdgeProbability( + BI->getParent(), Target->getBasicBlock()); + FuncInfo.MBB->addSuccessor(Target, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(Target); return true; - } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) { - // Fake request the condition, otherwise the intrinsic might be completely - // optimized away. - unsigned CondReg = getRegForValue(BI->getCondition()); - if (!CondReg) - return false; - - // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); + } else { + AArch64CC::CondCode CC = AArch64CC::NE; + if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + // Emit the branch. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - return true; + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } } unsigned CondReg = getRegForValue(BI->getCondition()); @@ -2406,32 +2375,22 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { return false; bool CondRegIsKill = hasTrivialKill(BI->getCondition()); - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0); - + // i1 conditions come as i32 values, test the lowest bit with tb(n)z. + unsigned Opcode = AArch64::TBNZW; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = AArch64CC::EQ; + Opcode = AArch64::TBZW; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) + const MCInstrDesc &II = TII.get(Opcode); + unsigned ConstrainedCondReg + = constrainOperandRegClass(II, CondReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addImm(0) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2447,8 +2406,8 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg); // Make sure the CFG is up-to-date. - for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); + for (auto *Succ : BI->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]); return true; } @@ -2456,6 +2415,10 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { bool AArch64FastISel::selectCmp(const Instruction *I) { const CmpInst *CI = cast<CmpInst>(I); + // Vectors of i1 are weird: bail out. + if (CI->getType()->isVectorTy()) + return false; + // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; @@ -2954,8 +2917,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, .addImm(NumBytes); // Process the args. 
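[Note: the FastISel rewrite above replaces raw `uint32_t` edge weights with `BranchProbability` and routes the conditional-branch epilogues through one `finishCondBranch` helper; when no BranchProbabilityInfo exists, `addSuccessorWithoutProb` lets the remaining probability be shared instead of inventing a magic weight. A toy model of that idea, not MachineBasicBlock's actual bookkeeping:

    #include <cstdint>
    #include <vector>

    struct Prob {
      uint32_t N; // numerator over a fixed denominator
      static constexpr uint32_t D = 1u << 31;
    };

    // Successors added "without prob" split whatever probability the
    // explicit edges have not claimed.
    std::vector<Prob> fillUnknown(std::vector<Prob> Edges, unsigned NumUnknown) {
      uint64_t Claimed = 0;
      for (const Prob &P : Edges)
        Claimed += P.N;
      uint32_t Left = Claimed >= Prob::D ? 0 : (uint32_t)(Prob::D - Claimed);
      for (unsigned I = 0; I < NumUnknown; ++I)
        Edges.push_back({Left / NumUnknown});
      return Edges;
    }
]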
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (CCValAssign &VA : ArgLocs) { const Value *ArgVal = CLI.OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; @@ -3018,8 +2980,8 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(Addr.getOffset()), - MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (!emitStore(ArgVT, ArgReg, Addr, MMO)) return false; @@ -3318,8 +3280,8 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. @@ -3684,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; @@ -3763,8 +3728,8 @@ bool AArch64FastISel::selectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::RET_ReallyLR)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned RetReg : RetRegs) + MIB.addReg(RetReg, RegState::Implicit); return true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a76473f..11ae800 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -72,9 +72,9 @@ // // For most functions, some of the frame areas are empty. For those functions, // it may not be necessary to set up fp or bp: -// * A base pointer is definitly needed when there are both VLAs and local +// * A base pointer is definitely needed when there are both VLAs and local // variables with more-than-default alignment requirements. -// * A frame pointer is definitly needed when there are local variables with +// * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // // In some cases when a base pointer is not strictly needed, it is generated @@ -216,11 +216,11 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout *TD = MF.getTarget().getDataLayout(); + const DataLayout &TD = MF.getDataLayout(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD->getPointerSize(0); + int stackGrowth = -TD.getPointerSize(0); // Calculate offsets. int64_t saveAreaOffset = (HasFP ? 
2 : 1) * stackGrowth; @@ -280,14 +280,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); bool HasFP = hasFP(MF); - DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -354,7 +357,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes && NeedsRealignment) { // Use the first callee-saved register as a scratch register. scratchSPReg = AArch64::X9; - MF.getRegInfo().setPhysRegUsed(scratchSPReg); } // If we're a leaf function, try using the red zone. @@ -400,8 +402,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); + const DataLayout &TD = MF.getDataLayout(); + const int StackGrowth = -TD.getPointerSize(0); unsigned FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // @@ -513,33 +515,33 @@ static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { return false; } -static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { +/// Checks whether the given instruction restores callee save registers +/// and if so returns how many. 
+static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { unsigned RtIdx = 0; - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost) + switch (MI.getOpcode()) { + case AArch64::LDPXpost: + case AArch64::LDPDpost: RtIdx = 1; - - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost || - MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) { - if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || - MI->getOperand(RtIdx + 2).getReg() != AArch64::SP) - return false; - return true; + // FALLTHROUGH + case AArch64::LDPXi: + case AArch64::LDPDi: + if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || + MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) + return 0; + return 2; } - - return false; + return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; if (MBB.end() != MBBI) { @@ -585,7 +587,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 16) | | | + // | (NumRestores * 8) | | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -606,17 +608,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - if (LastPopI != MBB.begin()) { - do { - ++NumRestores; - --LastPopI; - } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); - if (!isCSRestore(LastPopI, CSRegs)) { + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastPopI != Begin) { + --LastPopI; + unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); + NumRestores += Restores; + if (Restores == 0) { ++LastPopI; - --NumRestores; + break; } } - NumBytes -= NumRestores * 16; + NumBytes -= NumRestores * 8; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { @@ -634,15 +636,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // be able to save any instructions. if (NumBytes || MFI->hasVarSizedObjects()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. 
-int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); + -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -739,9 +733,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( DebugLoc DL; assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { unsigned idx = Count - i - 2; unsigned Reg1 = CSI[idx].getReg(); @@ -911,7 +902,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Check pairs of consecutive callee-saved registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 731f031..427afdf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -37,7 +37,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; int resolveFrameIndexReference(const MachineFunction &MF, int FI, @@ -61,6 +60,11 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 772e894..6c86888 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -34,7 +34,6 @@ using namespace llvm; namespace { class AArch64DAGToDAGISel : public SelectionDAGISel { - AArch64TargetMachine &TM; /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
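[Note: the epilogue hunk above fixes an accounting subtlety: each `LDP` restores two 8-byte callee-saved registers, and the rewritten `getNumCSRestores` now counts registers (2 per `LDP`) rather than instructions, so the stack adjustment becomes `NumRestores * 8` where the old code used instruction count times 16. The arithmetic spelled out with a hypothetical helper:

    // Bytes of callee-save area popped by the epilogue's LDP sequence.
    // Each "ldp xA, xB, [sp, ...]" restores two 8-byte registers.
    unsigned calleeSaveBytes(unsigned NumLdpInstrs) {
      unsigned NumRestoredRegs = NumLdpInstrs * 2; // what getNumCSRestores sums
      return NumRestoredRegs * 8;                  // == NumLdpInstrs * 16, as before
    }

Counting registers also handles a mix of paired and (future) unpaired restores uniformly, which the instruction-count version could not.]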
@@ -45,7 +44,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr), + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {} const char *getPassName() const override { @@ -53,9 +52,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - ForCodeSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); + ForCodeSize = MF.getFunction()->optForSize(); Subtarget = &MF.getSubtarget<AArch64Subtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -79,6 +76,21 @@ public: bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { return SelectShiftedRegister(N, true, Reg, Shift); } + bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 16, Base, OffImm); + } bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectAddrModeIndexed(N, 1, Base, OffImm); } @@ -153,8 +165,7 @@ public: SDNode *SelectBitfieldExtractOp(SDNode *N); SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); + SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); SDNode *SelectReadRegister(SDNode *N); SDNode *SelectWriteRegister(SDNode *N); @@ -165,6 +176,8 @@ public: private: bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, SDValue &Shift); + bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, SDValue &OffImm); bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, @@ -422,7 +435,7 @@ static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { return true; } -// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a +// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a // high lane extract. static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, SDValue &LaneOp, int &LaneIdx) { @@ -572,7 +585,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, } // AArch64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, + // register class that could contain the size being extended from. Thus, // if we're folding a (sext i8), we need the RHS to be a GPR32, even though // there might not be an actual 32-bit value in the program. We can // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. @@ -587,7 +600,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, /// need to create a real ADD instruction from it anyway and there's no point in /// folding it into the mem op. 
Theoretically, it shouldn't matter, but there's /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding -/// leads to duplaicated ADRP instructions. +/// leads to duplicated ADRP instructions. static bool isWorthFoldingADDlow(SDValue N) { for (auto Use : N->uses()) { if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && @@ -604,6 +617,51 @@ static bool isWorthFoldingADDlow(SDValue N) { return true; } +/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed + // variant selected here doesn't support labels/immediates, only base+offset. + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) && + RHSC < (0x40 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); + return true; + } + } + } + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // stp x1, x2, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; +} + /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit /// immediate" address. The "Size" argument is the size in bytes of the memory /// reference, which determines the scale. @@ -867,7 +925,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, if (isa<ConstantSDNode>(RHS)) { int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue(); unsigned Scale = Log2_32(Size); - // Skip the immediate can be seleced by load/store addressing mode. + // Skip immediates that can be selected by the load/store addressing mode. // Also skip the immediate can be encoded by a single ADD (SUB is also // checked by using -ImmOff). if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) || @@ -1034,6 +1092,8 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { // it into an i64. DstVT = MVT::i32; } + } else if (VT == MVT::f16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; } else if (VT == MVT::f32) { Opcode = IsPre ?
AArch64::LDRSpre : AArch64::LDRSpost; } else if (VT == MVT::f64 || VT.is64BitVector()) { @@ -1222,8 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, SDValue SuperReg = SDValue(Ld, 0); EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); if (Narrow) @@ -1275,8 +1335,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); } else { EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); @@ -1420,7 +1480,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // The resulting code will be at least as good as the original one // plus it may expose more opportunities for bitfield insert pattern. // FIXME: Currently we limit this to the bigger pattern, because - // some optimizations expect AND and not UBFM + // some optimizations expect AND and not UBFM. Opd0 = N->getOperand(0); } else return false; @@ -1852,6 +1912,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { /// Does this tree qualify as an attempt to move a bitfield into position, /// essentially "(and (shl VAL, N), Mask)". static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + bool BiggerPattern, SDValue &Src, int &ShiftAmount, int &MaskWidth) { EVT VT = Op.getValueType(); @@ -1874,6 +1935,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, Op = Op.getOperand(0); } + // Don't match if the SHL has more than one use, since then we'll end up + // generating SHL+UBFIZ instead of just keeping SHL+AND. + if (!BiggerPattern && !Op.hasOneUse()) + return false; + uint64_t ShlImm; if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) return false; @@ -1887,7 +1953,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // BFI encompasses sufficiently many nodes that it's worth inserting an extra // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL - // amount. + // amount. BiggerPattern is true when this pattern is being matched for BFI, + // BiggerPattern is false when this pattern is being matched for UBFIZ, in + // which case it is not profitable to insert an extra shift. 
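// (Illustrative sketch, not part of the patch: with BiggerPattern=false the
// exact match "(and (shl x, 3), 0xf8)" - a 5-bit field moved up to bit 3 -
// selects directly to "UBFIZ Wd, Wn, #3, #5", while a mask that disagrees
// with the shift amount is rejected below and left for the BFI path above.)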
+ if (ShlImm - ShiftAmount != 0 && !BiggerPattern) + return false; Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); return true; @@ -1904,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1918,23 +1989,30 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); - // OR is commutative, check both possibilities (does llvm provide a - // way to do that directely, e.g., via code matcher?) - SDValue OrOpd1Val = N->getOperand(1); - SDNode *OrOpd0 = N->getOperand(0).getNode(); - SDNode *OrOpd1 = N->getOperand(1).getNode(); - for (int i = 0; i < 2; - ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + // OR is commutative, check all combinations of operand order and values of + // BiggerPattern, i.e. + // Opd0, Opd1, BiggerPattern=false + // Opd1, Opd0, BiggerPattern=false + // Opd0, Opd1, BiggerPattern=true + // Opd1, Opd0, BiggerPattern=true + // Several of these combinations may match, so check with BiggerPattern=false + // first since that will produce better results by matching more instructions + // and/or inserting fewer extra instructions. + for (int I = 0; I < 4; ++I) { + + bool BiggerPattern = I / 2; + SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd1Val = N->getOperand((I + 1) % 2); + SDNode *OrOpd1 = OrOpd1Val.getNode(); + unsigned BFXOpc; int DstLSB, Width; if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, - NumberOfIgnoredLowBits, true)) { + NumberOfIgnoredLowBits, BiggerPattern)) { // Check that the returned opcode is compatible with the pattern, // i.e., same type and zero extended (U and not S) if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || @@ -1952,8 +2030,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // If the mask on the insertee is correct, we have a BFXIL operation. We // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, - DstLSB, Width)) { + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + BiggerPattern, + Src, DstLSB, Width)) { ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); ImmS = Width - 1; } else @@ -2003,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { unsigned Opc; unsigned LSB, MSB; SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. 
+ if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) return nullptr; - EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[] = { Opd0, Opd1, @@ -2016,58 +2102,37 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } -SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { +/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the +/// equivalent of a left shift by a constant amount followed by an and masking +/// out a contiguous set of bits. +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { + if (N->getOpcode() != ISD::AND) + return nullptr; + EVT VT = N->getValueType(0); - unsigned Variant; unsigned Opc; - unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; - - if (VT == MVT::f32) { - Variant = 0; - } else if (VT == MVT::f64) { - Variant = 1; - } else - return nullptr; // Unrecognized argument type. Fall back on default codegen. - - // Pick the FRINTX variant needed to set the flags. - unsigned FRINTXOpc = FRINTXOpcs[Variant]; - - switch (N->getOpcode()) { - default: - return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. - case ISD::FCEIL: { - unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; - Opc = FRINTPOpcs[Variant]; - break; - } - case ISD::FFLOOR: { - unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; - Opc = FRINTMOpcs[Variant]; - break; - } - case ISD::FTRUNC: { - unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; - Opc = FRINTZOpcs[Variant]; - break; - } - case ISD::FROUND: { - unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; - Opc = FRINTAOpcs[Variant]; - break; - } - } + if (VT == MVT::i32) + Opc = AArch64::UBFMWri; + else if (VT == MVT::i64) + Opc = AArch64::UBFMXri; + else + return nullptr; - SDLoc dl(N); - SDValue In = N->getOperand(0); - SmallVector<SDValue, 2> Ops; - Ops.push_back(In); + SDValue Op0; + int DstLSB, Width; + if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, + Op0, DstLSB, Width)) + return nullptr; - if (!TM.Options.UnsafeFPMath) { - SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); - Ops.push_back(SDValue(FRINTX, 1)); - } + // ImmR is the rotate right amount. + unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + // ImmS is the most significant bit of the source to be moved. + unsigned ImmS = Width - 1; - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + SDLoc DL(N); + SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } bool @@ -2119,7 +2184,7 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, // into a single value to be used in the MRS/MSR instruction. 
static int getIntOperandFromRegisterString(StringRef RegString) { SmallVector<StringRef, 5> Fields; - RegString.split(Fields, ":"); + RegString.split(Fields, ':'); if (Fields.size() == 1) return -1; @@ -2206,7 +2271,15 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { assert (isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - return CurDAG->getMachineNode(AArch64::MSRpstate, DL, MVT::Other, + unsigned State; + if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { + assert(Immed < 2 && "Bad imm"); + State = AArch64::MSRpstateImm1; + } else { + assert(Immed < 16 && "Bad imm"); + State = AArch64::MSRpstateImm4; + } + return CurDAG->getMachineNode(State, DL, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), CurDAG->getTargetConstant(Immed, DL, MVT::i16), N->getOperand(0)); @@ -2279,6 +2352,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case ISD::SRA: if (SDNode *I = SelectBitfieldExtractOp(Node)) return I; + if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) + return I; break; case ISD::OR: @@ -2802,6 +2877,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { break; } } + break; } case AArch64ISD::LD2post: { if (VT == MVT::v8i8) @@ -3214,14 +3290,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); break; } - - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FROUND: - if (SDNode *I = SelectLIBM(Node)) - return I; - break; } // Select the default instruction diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3e8f46c..4ecfbe9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,23 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -namespace { -enum AlignMode { - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt<AlignMode> -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "aarch64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "aarch64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - // Place holder until extr generation is tested fully. static cl::opt<bool> EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, @@ -76,6 +59,9 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +/// Value type used for condition codes. +static const MVT MVT_CC = MVT::i32; + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -210,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? 
- setExceptionPointerRegister(AArch64::X0); - setExceptionSelectorRegister(AArch64::X1); - // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -234,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); @@ -252,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); @@ -315,6 +304,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); // v4f16 is also a storage-only type, so promote it to v4f32 when that is // known to be safe. @@ -403,10 +394,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); + setOperationAction(ISD::FMINNUM, Ty, Legal); + setOperationAction(ISD::FMAXNUM, Ty, Legal); + setOperationAction(ISD::FMINNAN, Ty, Legal); + setOperationAction(ISD::FMAXNAN, Ty, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. + // This requires the Performance Monitors extension. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + if (Subtarget->isTargetMachO()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret to avoid memory @@ -456,12 +456,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. 
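For readers skimming the legalization hunks above, the three verbs in play behave as follows (illustrative sketch, not lines from this patch; the operations and types mirror the hunks):

    // Legal:   select the node as-is (READCYCLECOUNTER once PerfMon exists).
    // Promote: perform the operation in a wider type (f16 math done in f32).
    // Expand:  let the legalizer rewrite the node into supported operations
    //          (for example, a vector ROTL becomes shifts plus an OR).
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
    setOperationAction(ISD::ROTL, MVT::v4i32, Expand);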
@@ -479,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); @@ -487,16 +493,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); + if (Subtarget->supportsAddressTopByteIgnored()) + setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -512,10 +520,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMinFunctionAlignment(2); - RequireStrictAlign = (Align == StrictAlign); - setHasExtractBitsInsn(true); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: @@ -646,6 +654,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + + // But we do support custom-lowering for FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); @@ -686,6 +697,12 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT.getSimpleVT(), Legal); + // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). + if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) + for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, + ISD::FMINNUM, ISD::FMAXNUM}) + setOperationAction(Opcode, VT.getSimpleVT(), Legal); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -730,7 +747,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( break; } case ISD::INTRINSIC_W_CHAIN: { - ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); + ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); switch (IntID) { default: return; @@ -780,6 +797,34 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, return MVT::i64; } +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + // FIXME: This is mostly true for Cyclone, but not necessarily others. + if (Fast) { + // FIXME: Define an attribute for slow unaligned accesses instead of + // relying on the CPU type as a proxy. + // On Cyclone, unaligned 128-bit stores are slow. 
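(Illustrative sketch, not part of the patch: a typical caller probes this hook as below, so the value stored through Fast steers memcpy/memset lowering toward or away from wide unaligned accesses.)

    bool Fast;
    if (TLI.allowsMisalignedMemoryAccesses(MVT::v2i64, /*AddrSpace=*/0,
                                           /*Align=*/1, &Fast) &&
        Fast) {
      // Prefer wide, unaligned loads/stores for the copy.
    }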
+ *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. + VT == MVT::v2i64; + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -809,9 +854,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; + case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; + case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; - case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; @@ -931,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; + MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI->getOperand(0).getReg(); unsigned IfTrueReg = MI->getOperand(1).getReg(); @@ -1141,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; - if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) && - cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 && + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags @@ -1156,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // the absence of information about op2. Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && + } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one @@ -1167,14 +1210,230 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = LHS.getOperand(0); } - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } +/// \defgroup AArch64CCMP CMP;CCMP matching +/// +/// These functions deal with the formation of CMP;CCMP;... sequences. +/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of +/// a comparison. They set the NZCV flags to a predefined value if their +/// predicate is false. 
This allows expressing arbitrary conjunctions, for +/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" +/// expressed as: +/// cmp A +/// ccmp B, inv(CB), CA +/// check for CB flags +/// +/// In general we can create code for arbitrary "... (and (and A B) C)" +/// sequences. We can also implement some "or" expressions, because "(or A B)" +/// is equivalent to "not (and (not A) (not B))" and we can implement some +/// negation operations: +/// We can negate the results of a single comparison by inverting the flags +/// used when the predicate fails and inverting the flags tested in the next +/// instruction; We can also negate the results of the whole previous +/// conditional compare sequence by inverting the flags tested in the next +/// instruction. However, there is no way to negate the result of a partial +/// sequence. +/// +/// Therefore, on encountering an "or" expression we can negate the subtree on +/// one side and have to be able to push the negate to the leaves of the subtree +/// on the other side (see also the comments in code). As a complete example: +/// "or (or (setCA (cmp A)) (setCB (cmp B))) +/// (and (setCC (cmp C)) (setCD (cmp D)))" +/// is transformed to +/// "not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) +/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))" +/// and implemented as: +/// cmp C +/// ccmp D, inv(CD), CC +/// ccmp A, CA, inv(CD) +/// ccmp B, CB, inv(CA) +/// check for CB flags +/// A counterexample is "or (and A B) (and C D)" which cannot be implemented +/// by conditional compare sequences. +/// @{ + +/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. +static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDValue CCOp, + SDValue Condition, unsigned NZCV, + SDLoc DL, SelectionDAG &DAG) { + unsigned Opcode = 0; + if (LHS.getValueType().isFloatingPoint()) + Opcode = AArch64ISD::FCCMP; + else if (RHS.getOpcode() == ISD::SUB) { + SDValue SubOp0 = RHS.getOperand(0); + if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } + } + if (Opcode == 0) + Opcode = AArch64ISD::CCMP; + + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); +} + +/// Returns true if @p Val is a tree of AND/OR/SETCC operations. +/// CanPushNegate is set to true if we can push a negate operation through +/// the tree in a way that we are left with AND operations and negate operations +/// at the leaves only, i.e. "not (or (or x y) z)" can be changed to +/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be +/// brought into such a form. +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, + unsigned Depth = 0) { + if (!Val.hasOneUse()) + return false; + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + CanPushNegate = true; + return true; + } + // Protect against stack overflow.
+ if (Depth > 15) + return false; + if (Opcode == ISD::AND || Opcode == ISD::OR) { + SDValue O0 = Val->getOperand(0); + SDValue O1 = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + return false; + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + return false; + // We cannot push a negate through an AND operation (it would become an OR); + // we can, however, change a (not (or x y)) to (and (not x) (not y)) if we can + // push the negate through the x/y subtrees. + CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + return true; + } + return false; +} + +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/FCCMP ops. See @ref AArch64CCMP. +/// Tries to transform the given i1 producing node @p Val to a series of compare +/// and conditional compare operations. @returns an NZCV flags producing node +/// and sets @p OutCC to the flags that should be tested, or returns SDValue() if +/// the transformation was not possible. +/// On recursive invocations @p PushNegate may be set to true to have negation +/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate +/// for the comparisons in the current subtree; @p Depth limits the search +/// depth to avoid stack overflow. +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool PushNegate = false, + SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, + unsigned Depth = 0) { + // We're at a tree leaf; produce a conditional comparison operation. + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); + bool isInteger = LHS.getValueType().isInteger(); + if (PushNegate) + CC = getSetCCInverse(CC, isInteger); + SDLoc DL(Val); + // Determine OutCC and handle FP special case. + if (isInteger) { + OutCC = changeIntCCToAArch64CC(CC); + } else { + assert(LHS.getValueType().isFloatingPoint()); + AArch64CC::CondCode ExtraCC; + changeFPCCToAArch64CC(CC, OutCC, ExtraCC); + // Surprisingly, some floating point conditions can't be tested with a + // single condition code. Construct an additional comparison in this case. + // See comment below on how we deal with OR conditions. + if (ExtraCC != AArch64CC::AL) { + SDValue ExtraCmp; + if (!CCOp.getNode()) + ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); + else { + SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + // Note that we want the inverse of ExtraCC, so NZCV is not inverted. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, + NZCV, DL, DAG); + } + CCOp = ExtraCmp; + Predicate = AArch64CC::getInvertedCondCode(ExtraCC); + OutCC = AArch64CC::getInvertedCondCode(OutCC); + } + } + + // Produce a normal comparison if we are first in the chain. + if (!CCOp.getNode()) + return emitComparison(LHS, RHS, CC, DL, DAG); + // Otherwise produce a ccmp.
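// (Illustrative, not part of the patch: for an i1 such as "(a == 0) && (b > 5)"
// this path emits roughly
//   cmp  w_a, #0
//   ccmp w_b, #5, #4, eq  ; on eq compare b, else NZCV := 0100 so "gt" fails
//   cset w_r, gt
// where #4 is the NZCV immediate satisfying the inverted final condition and
// w_a, w_b, w_r stand in for whatever registers the values live in.)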
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + DAG); + } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) + return SDValue(); + + assert((Opcode == ISD::OR || !PushNegate) + && "Can only push negate through OR operation"); + + // Check if both sides can be transformed. + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) + return SDValue(); + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) + return SDValue(); + + // Do we need to negate our operands? + bool NegateOperands = Opcode == ISD::OR; + // We can negate the results of all previous operations by inverting the + // predicate flags, giving us a free negation for one side. For the other side + // we need to be able to push the negation to the leaves of the tree. + if (NegateOperands) { + if (!CanPushNegateL && !CanPushNegateR) + return SDValue(); + // Order the side where we can push the negate through to LHS. + if (!CanPushNegateL && CanPushNegateR) + std::swap(LHS, RHS); + } else { + bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; + bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return SDValue(); + // Order the side where we need to negate the output flags to RHS so it + // gets emitted first. + if (NeedsNegOutL) + std::swap(LHS, RHS); + } + + // Emit RHS. If we want to negate the tree we only need to push a negate + // through if we are already in a PushNegate case; otherwise we can negate + // the "flags to test" afterwards. + AArch64CC::CondCode RHSCC; + SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, + CCOp, Predicate, Depth+1); + if (NegateOperands && !PushNegate) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + // Emit LHS. We must push the negate through if we need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, + CmpR, RHSCC, Depth+1); + // If we transformed an OR to an AND then we have to negate the result + // (or absorb a PushNegate resulting in a double negation). + if (Opcode == ISD::OR && !PushNegate) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +/// @} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { - SDValue Cmp; - AArch64CC::CondCode AArch64CC; if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1229,47 +1488,56 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } - // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. - // For the i8 operand, the largest immediate is 255, so this can be easily - // encoded in the compare instruction. For the i16 operand, however, the - // largest immediate cannot be encoded in the compare. - // Therefore, use a sign extending load and cmn to avoid materializing the -1 - // constant. For example, - // movz w1, #65535 - // ldrh w0, [x0, #0] - // cmp w0, w1 - // > - // ldrsh w0, [x0, #0] - // cmn w0, #1 - // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
The checks are in place to ensure - both the LHS and RHS are truely zero extended and to make sure the - transformation is profitable. + SDValue Cmp; + AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { - if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && - isa<LoadSDNode>(LHS)) { - if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && - cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && - LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); - if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { - SDValue SExt = - DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, - DAG.getValueType(MVT::i16)); - Cmp = emitComparison(SExt, - DAG.getConstant(ValueofRHS, dl, - RHS.getValueType()), - CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); - return Cmp; - } + const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); + + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the + // -1 constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to + // ensure both the LHS and RHS are truly zero extended and to make sure the + // transformation is profitable. + if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && + cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + } } + + if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { + if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { + if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); + } + } + } + + if (!Cmp) { + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); } - Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } @@ -1391,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -1571,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise.
That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { @@ -1581,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); + unsigned NumElts = InVT.getVectorNumElements(); + + // f16 vectors are promoted to f32 before a conversion. + if (InVT.getVectorElementType() == MVT::f16) { + MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); + } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); @@ -1628,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -1931,6 +2207,31 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::aarch64_thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::aarch64_neon_smax: + return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umax: + return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_smin: + return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umin: + return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -2032,14 +2333,11 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFSINCOS(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: + return LowerINTRINSIC_WO_CHAIN(Op, DAG); } } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -2128,7 +2426,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( continue; } - + if (VA.isRegLoc()) { // Arguments stored in registers. 
EVT RegVT = VA.getLocVT(); @@ -2214,9 +2512,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( break; } - ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - MemVT, false, false, false, 0); + ArgValue = DAG.getExtLoad( + ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT, false, false, false, 0); InVals.push_back(ArgValue); } @@ -2289,9 +2588,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, + false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2318,9 +2618,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), + false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2453,8 +2754,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) + for (const CCValAssign &ArgLoc : ArgLocs) + if (!ArgLoc.isRegLoc()) return false; } @@ -2758,7 +3059,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); - DstInfo = MachinePointerInfo::getFixedStack(FI); + DstInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be @@ -2768,7 +3070,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); + DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), + LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2802,9 +3105,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); InFlag = Chain.getValue(1); } @@ -2860,9 +3163,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add argument registers to the end of the list so that they are known live // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + for (auto &RegToPass : RegsToPass) + Ops.push_back(DAG.getRegister(RegToPass.first, + RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; @@ -2968,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AArch64::GPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AArch64::FPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } RetOps[0] = Chain; // Update chain. @@ -3010,11 +3326,12 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /*isVolatile=*/ false, - /*isNonTemporal=*/ true, - /*isInvariant=*/ true, 8); + SDValue GlobalAddr = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*isVolatile=*/false, + /*isNonTemporal=*/true, + /*isInvariant=*/true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, DAG.getConstant(GN->getOffset(), DL, PtrVT)); @@ -3087,8 +3404,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, + true, true, 8); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3160,6 +3478,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; @@ -3277,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. 
unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->isOne() && + if (LHS.getResNo() == 1 && isOneConstant(RHS) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && @@ -3392,17 +3713,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, - DAG.getIntPtrConstant(0, DL)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } + + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; EVT EltVT; @@ -3410,7 +3725,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; - VecVT = MVT::v4i32; + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; if (!VT.isVector()) { @@ -3571,32 +3886,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return (Cmp.getValueType() == MVT::f32 || - Cmp.getValueType() == MVT::f64); - - ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp); - ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, SDLoc dl, @@ -3614,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } } - // Handle integers first. + // Also handle f16, for which we need to do a f32 comparison. + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + } + + // Next, handle integers. 
if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); @@ -3637,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { + if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -3647,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { + if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -4109,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? 
DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, dl, - MVT::i64)) - : DAG.getConstant(0, dl, VT); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } + /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -4156,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. 
- SDValue TrueValLo = DAG.getConstant(0, dl, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); @@ -4362,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); - if (!C || C->getZExtValue() != 0) + if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) @@ -4763,7 +5074,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. E.g. + // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. @@ -5653,11 +5964,10 @@ static SDValue NormalizeBuildVector(SDValue Op, return Op; SmallVector<SDValue, 16> Ops; - for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { - SDValue Lane = Op.getOperand(I); - if (Lane.getOpcode() == ISD::Constant) { + for (SDValue Lane : Op->ops()) { + if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { APInt LowBits(EltTy.getSizeInBits(), - cast<ConstantSDNode>(Lane)->getZExtValue()); + CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); @@ -5997,8 +6307,7 @@ FailedModImm: // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } @@ -6017,7 +6326,10 @@ FailedModImm: // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + // Do not do this for UNDEF/LOAD nodes because we have better patterns + // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. + if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? 
AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, @@ -6123,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } + + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. + if (Val == 0) + return Op; + // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) @@ -6213,26 +6512,20 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { +/// operand of a vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits for a right shift; or +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) - Cnt = -Cnt; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } @@ -6261,8 +6554,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SRA: case ISD::SRL: // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { + if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), @@ -6451,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
- uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6477,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6720,10 +7012,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); - // Skip illegal vector types. - if (VecSize != 64 && VecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to @@ -6806,10 +7098,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - // Skip illegal vector types. - if (SubVecSize != 64 && SubVecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) return false; Value *Op0 = SVI->getOperand(0); @@ -7228,8 +7520,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); @@ -7242,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. + // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. @@ -7265,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Fold a floating-point multiply by power of two into floating-point to +/// fixed-point conversion. 
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + SDValue ConstVec = Op->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., float -> i64). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t Bits = IntBits == 64 ? 64 : 32; + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); + if (C == -1 || C == 0 || C > Bits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs + : Intrinsic::aarch64_neon_vcvtfp2fxu; + SDValue FixConv = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); + // We can handle smaller integers by generating an extra trunc. + if (IntBits < FloatBits) + FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); + + return FixConv; +} + +/// Fold a floating-point divide by power of two into fixed-point to +/// floating-point conversion. +static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned Opc = Op->getOpcode(); + if (!Op.getValueType().isVector() || + (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) + return SDValue(); + + SDValue ConstVec = N->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + int32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + int32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., i64 -> float). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); + if (C == -1 || C == 0 || C > FloatBits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? 
MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + SDValue ConvInput = Op.getOperand(0); + bool IsSigned = Opc == ISD::SINT_TO_FP; + if (IntBits < FloatBits) + ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + ResTy, ConvInput); + + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp + : Intrinsic::aarch64_neon_vcvtfxu2fp; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, + DAG.getConstant(C, DL, MVT::i32)); +} + /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, @@ -7964,7 +8383,6 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); - break; case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: @@ -7978,10 +8396,16 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: - return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: - return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmaxnm: + return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fminnm: + return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: @@ -8141,7 +8565,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { unsigned Alignment = std::min(OrigAlignment, EltOffset); // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. + // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(St); @@ -8162,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { return NewST1; } -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { +static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -8173,15 +8596,17 @@ static SDValue performSTORECombine(SDNode *N, if (S->isVolatile()) return SDValue(); + // FIXME: The logic for deciding if an unaligned store should be split should + // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be + // a call to that function here. + // Cyclone has bad performance on unaligned 16B stores when crossing line and // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); - // Don't split at Oz. 
- MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); - if (IsMinSize) + // Don't split at -Oz. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); SDValue StVal = S->getValue(); @@ -8204,8 +8629,7 @@ static SDValue performSTORECombine(SDNode *N, // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. - SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) + if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) return ReplacedSplat; SDLoc DL(S); @@ -8326,6 +8750,299 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } +/// Simplify \Addr given that the top byte of it is ignored by HW during +/// address translation. +static bool performTBISimplification(SDValue Addr, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + APInt DemandedMask = APInt::getLowBitsSet(64, 56); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + return true; + } + return false; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + SDValue Split = split16BStores(N, DCI, DAG, Subtarget); + if (Split.getNode()) + return Split; + + if (Subtarget->supportsAddressTopByteIgnored() && + performTBISimplification(N->getOperand(2), DCI, DAG)) + return SDValue(N, 0); + + return SDValue(); +} + + /// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) + return SDValue(); + + int NumVecElts = VTy.getVectorNumElements(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (NumVecElts != 4) + return SDValue(); + } else { + if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) + return SDValue(); + } + + int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); + SDValue PreOp = OpV; + // Iterate over each step of the across vector reduction. + for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { + SDValue CurOp = PreOp.getOperand(0); + SDValue Shuffle = PreOp.getOperand(1); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. + CurOp = PreOp.getOperand(1); + Shuffle = PreOp.getOperand(0); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + + // Check if it forms one step of the across vector reduction. 
+ // E.g., + // %cur = add %1, %0 + // %shuffle = vector_shuffle %cur, <2, 3, u, u> + // %pre = add %cur, %shuffle + if (Shuffle.getOperand(0) != CurOp) + return SDValue(); + + int NumMaskElts = 1 << CurStep; + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); + // Check mask values in each step. + // We expect the shuffle mask in each step follows a specific pattern + // denoted here by the <M, U> form, where M is a sequence of integers + // starting from NumMaskElts, increasing by 1, and the number integers + // in M should be NumMaskElts. U is a sequence of UNDEFs and the number + // of undef in U should be NumVecElts - NumMaskElts. + // E.g., for <8 x i16>, mask values in each step should be : + // step 0 : <1,u,u,u,u,u,u,u> + // step 1 : <2,3,u,u,u,u,u,u> + // step 2 : <4,5,6,7,u,u,u,u> + for (int i = 0; i < NumVecElts; ++i) + if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || + (i >= NumMaskElts && !(Mask[i] < 0))) + return SDValue(); + + PreOp = CurOp; + } + unsigned Opcode; + bool IsIntrinsic = false; + + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + case ISD::FMAXNUM: + Opcode = Intrinsic::aarch64_neon_fmaxnmv; + IsIntrinsic = true; + break; + case ISD::FMINNUM: + Opcode = Intrinsic::aarch64_neon_fminnmv; + IsIntrinsic = true; + break; + } + SDLoc DL(N); + + return IsIntrinsic + ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), + DAG.getConstant(Opcode, DL, MVT::i32), PreOp) + : DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. 
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. + if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && + Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (EltTy != MVT::f32) + return SDValue(); + } else { + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + } + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || + (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && + CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && + CC != ISD::SETGE) || + (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && + CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && + CC != ISD::SETLE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isNullConstant(N0.getOperand(1))) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isNullConstant(IfTrue.getOperand(1))) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isOneConstant(IfFalse.getOperand(1))) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. 
+ if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isNullConstant(N1)) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -8751,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N, if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); - if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue()) + if (isNullConstant(LHS)) std::swap(LHS, RHS); - if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue()) + if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || @@ -8774,6 +9491,103 @@ static SDValue performBRCONDCombine(SDNode *N, return SDValue(); } +// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test +// as well as whether the test should be inverted. This code is required to +// catch these cases (as opposed to standard dag combines) because +// AArch64ISD::TBZ is matched during legalization. +static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, + SelectionDAG &DAG) { + + if (!Op->hasOneUse()) + return Op; + + // We don't handle undef/constant-fold cases below, as they should have + // already been taken care of (e.g. and of 0, test of undefined shifted bits, + // etc.) + + // (tbz (trunc x), b) -> (tbz x, b) + // This case is just here to enable more of the below cases to be caught. 
+ if (Op->getOpcode() == ISD::TRUNCATE && + Bit < Op->getValueType(0).getSizeInBits()) { + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + + if (Op->getNumOperands() != 2) + return Op; + + auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!C) + return Op; + + switch (Op->getOpcode()) { + default: + return Op; + + // (tbz (and x, m), b) -> (tbz x, b) + case ISD::AND: + if ((C->getZExtValue() >> Bit) & 1) + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + return Op; + + // (tbz (shl x, c), b) -> (tbz x, b-c) + case ISD::SHL: + if (C->getZExtValue() <= Bit && + (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit - C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x + case ISD::SRA: + Bit = Bit + C->getZExtValue(); + if (Bit >= Op->getValueType(0).getSizeInBits()) + Bit = Op->getValueType(0).getSizeInBits() - 1; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + + // (tbz (srl x, c), b) -> (tbz x, b+c) + case ISD::SRL: + if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { + Bit = Bit + C->getZExtValue(); + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } + return Op; + + // (tbz (xor x, -1), b) -> (tbnz x, b) + case ISD::XOR: + if ((C->getZExtValue() >> Bit) & 1) + Invert = !Invert; + return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); + } +} + +// Optimize test single bit zero/non-zero and branch. +static SDValue performTBZCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + bool Invert = false; + SDValue TestSrc = N->getOperand(1); + SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); + + if (TestSrc == NewTestSrc) + return SDValue(); + + unsigned NewOpc = N->getOpcode(); + if (Invert) { + if (NewOpc == AArch64ISD::TBZ) + NewOpc = AArch64ISD::TBNZ; + else { + assert(NewOpc == AArch64ISD::TBNZ); + NewOpc = AArch64ISD::TBZ; + } + } + + SDLoc DL(N); + return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, + DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); +} + // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as @@ -8868,75 +9682,6 @@ static SDValue performSelectCombine(SDNode *N, return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } -/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match FMIN/FMAX patterns. -static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) { - // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y". - // Unless the NoNaNsFPMath option is set, be careful about NaNs: - // vmax/vmin return NaN if either operand is a NaN; - // only do the transformation when it matches that behavior. - - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - - unsigned Opcode; - bool IsReversed; - if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) && - selectCCOpsAreFMaxCompatible(CondRHS, RHS)) { - IsReversed = false; // x CC y ? 
x : y - } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) && - selectCCOpsAreFMaxCompatible(CondLHS, RHS)) { - IsReversed = true ; // x CC y ? y : x - } else { - return SDValue(); - } - - bool IsUnordered = false, IsOrEqual; - switch (CC) { - default: - return SDValue(); - case ISD::SETULT: - case ISD::SETULE: - IsUnordered = true; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE); - Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN; - break; - - case ISD::SETUGT: - case ISD::SETUGE: - IsUnordered = true; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE); - Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX; - break; - } - - // If LHS is NaN, an ordered comparison will be false and the result will be - // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking - // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - return SDValue(); - - // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true, - // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be - // used for unsafe math or if one of the operands is known to be nonzero. - if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - return SDValue(); - - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); -} - /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) @@ -8961,6 +9706,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFpToIntCombine(N, DAG, Subtarget); + case ISD::FDIV: + return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -8973,16 +9723,25 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); - case ISD::SELECT_CC: - return performSelectCCCombine(N, DCI.DAG); + case ISD::LOAD: + if (performTBISimplification(N->getOperand(1), DCI, DAG)) + return SDValue(N, 0); + break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::TBNZ: + case AArch64ISD::TBZ: + return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: @@ -8991,6 +9750,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case 
ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -9157,6 +9918,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } +static void ReplaceReductionResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, unsigned InterOp, + unsigned AcrossOp) { + EVT LoVT, HiVT; + SDValue Lo, Hi; + SDLoc dl(N); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); + SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); + Results.push_back(SplitVal); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9165,6 +9940,24 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case AArch64ISD::SADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); + return; + case AArch64ISD::UADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); + return; + case AArch64ISD::SMINV: + ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); + return; + case AArch64ISD::UMINV: + ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); + return; + case AArch64ISD::SMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); + return; + case AArch64ISD::UMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); + return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); @@ -9177,10 +9970,10 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } -bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. - return NumUsers > 2; + return 3; } TargetLoweringBase::LegalizeTypeAction @@ -9206,20 +9999,21 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. -bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return Size == 128; + return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + return Size <= 128 ? 
AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } -bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { +bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { return true; } @@ -9258,6 +10052,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast<PointerType>(Addr->getType())->getElementType()); } +void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall( + llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); +} + Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -9294,3 +10095,70 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } + +bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, + EVT) const { + return false; +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x48; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64unctionInfo. + AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. 
+ assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index c73ce1e..e99616c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H +#include "AArch64.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/CallingConv.h" @@ -58,13 +59,14 @@ enum NodeType : unsigned { SBCS, ANDS, + // Conditional compares. Operands: left,right,falsecc,cc,flags + CCMP, + CCMN, + FCCMP, + // Floating point comparison FCMP, - // Floating point max and min instructions. - FMAX, - FMIN, - // Scalar extract EXTR, @@ -217,8 +219,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { - bool RequireStrictAlign; - public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); @@ -226,46 +226,35 @@ public: /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - /// computeKnownBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. + /// Determine which of the bits specified in Mask are known to be either zero + /// or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; - /// allowsMisalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses of the specified type. + /// Returns true if the target allows unaligned memory accesses of the + /// specified type. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, - bool *Fast = nullptr) const override { - if (RequireStrictAlign) - return false; - // FIXME: True for Cyclone, but not necessary others. - if (Fast) - *Fast = true; - return true; - } + bool *Fast = nullptr) const override; - /// LowerOperation - Provide custom lowering hooks for some operations. + /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; - /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. 
+ /// This method returns a target specific FastISel object, or null if the + /// target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; @@ -273,11 +262,11 @@ public: bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - /// isShuffleMaskLegal - Return true if the given shuffle mask can be - /// codegen'd directly, or if it should be stack expanded. + /// Return true if the given shuffle mask can be codegen'd directly, or if it + /// should be stack expanded. bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; - /// getSetCCResultType - Return the ISD::SETCC ValueType + /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -322,8 +311,8 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. + /// Return true if the addressing mode represented by AM is legal for this + /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; @@ -335,10 +324,9 @@ public: int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this method + /// returns true, otherwise fmuladd is expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; @@ -351,25 +339,65 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + /// If the target has a standard location for the unsafe stack pointer, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. 
+ unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X1; + } + + bool isCheapToSpeculateCttz() const override { + return true; + } + + bool isCheapToSpeculateCtlz() const override { + return true; + } + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; - /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; @@ -392,6 +420,8 @@ private: SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, @@ -470,7 +500,7 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT, @@ -516,6 +546,8 @@ private: bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + + bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; }; namespace AArch64 { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 3f2e772..6ac2175 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> { let PrintMethod = "printImmScale<16>"; } +def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; +def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; +def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; +def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>; +def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; + class AsmImmRange<int Low, int High> : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; @@ -346,9 +352,11 @@ class fixedpoint_i64<ValueType FloatVT> let ParserMatchClass = Imm1_64Operand; } +def fixedpoint_f16_i32 : fixedpoint_i32<f16>; def fixedpoint_f32_i32 : fixedpoint_i32<f32>; def fixedpoint_f64_i32 : fixedpoint_i32<f64>; +def fixedpoint_f16_i64 : fixedpoint_i64<f16>; def fixedpoint_f32_i64 : 
fixedpoint_i64<f32>; def fixedpoint_f64_i64 : fixedpoint_i64<f64>; @@ -402,6 +410,7 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm1_32Operand; } +def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; def Imm0_31Operand : AsmImmRange<0, 31>; @@ -525,6 +534,20 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_31Operand; } +// True if the 32-bit immediate is in the range [0,31] +def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 32; +}]> { + let ParserMatchClass = Imm0_31Operand; +} + +// imm0_1 predicate - True if the immediate is in the range [0,1] +def imm0_1 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand<i64>, ImmLeaf<i64, [{ return ((uint64_t)Imm) < 16; @@ -542,7 +565,9 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 16; -}]>; +}]> { + let ParserMatchClass = Imm0_15Operand; +} // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr @@ -690,6 +715,17 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, } // Floating-point immediate. +def fpimm16 : Operand<f16>, + PatLeaf<(f16 fpimm), [{ + return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP16Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} def fpimm32 : Operand<f32>, PatLeaf<(f32 fpimm), [{ return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; @@ -822,7 +858,7 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands> // model patterns with sufficiently fine granularity let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in class HintI<string mnemonic> - : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "", + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "", [(int_aarch64_hint imm0_127:$imm)]>, Sched<[WriteHint]> { bits <7> imm; @@ -875,6 +911,25 @@ def msr_sysreg_op : Operand<i32> { let PrintMethod = "printMSRSystemRegister"; } +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand<i32> { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. 
+ if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; @@ -890,19 +945,19 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), let Inst{20-5} = systemreg; } -def SystemPStateFieldOperand : AsmOperandClass { - let Name = "SystemPStateField"; +def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_15"; let ParserMethod = "tryParseSysReg"; } -def pstatefield_op : Operand<i32> { - let ParserMatchClass = SystemPStateFieldOperand; +def pstatefield4_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_15Operand; let PrintMethod = "printSystemPStateField"; } let Defs = [NZCV] in -class MSRpstateI - : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), - "msr", "\t$pstate_field, $imm">, +class MSRpstateImm0_15 + : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm), + "msr", "\t$pstatefield, $imm">, Sched<[WriteSys]> { bits<6> pstatefield; bits<4> imm; @@ -913,6 +968,37 @@ class MSRpstateI let Inst{7-5} = pstatefield{2-0}; let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; +} + +def SystemPStateFieldWithImm0_1Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_1"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield1_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_1Operand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateImm0_1 + : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm), + "msr", "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bit imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-9} = 0b0100000; + let Inst{8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; } // SYS and SYSL generic system instructions. 
@@ -1341,7 +1427,7 @@ multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> { } class ShiftAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>; class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, @@ -1407,13 +1493,13 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode> } class MulAccumWAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>; class MulAccumXAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>; class WideMulAccumAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>; class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg, @@ -1643,7 +1729,7 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype, class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype, RegisterClass src1Regtype, RegisterClass src2Regtype, int shiftExt> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2, shiftExt)>; @@ -1701,10 +1787,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, } // add Rd, Rb, -imm -> sub Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1776,43 +1862,43 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, } // Defs = [NZCV] // Support negative immediates, e.g. 
adds Rd, Rn, -imm -> subs Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; // Compare aliases - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; // Compare shorthands - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; // Register/register aliases with no shift when SP is not used. @@ -1998,7 +2084,7 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype, // Aliases for register+register logical instructions. 
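The InstAlias strings in the surrounding hunks all switch from a plain space to "\t" between mnemonic and operands, matching the asm strings of the instruction definitions themselves (compare the existing "\t$Rt, $systemreg" and "\t$Rn, $imm, $nzcv, $cond" forms). A toy illustration of why the separator matters, assuming a printer that treats the first tab as the mnemonic/operand boundary; this sketches the convention, not LLVM's actual printer:

    #include <cstdio>
    #include <cstring>

    // Pad the mnemonic (everything before the first '\t') to a fixed
    // column, the way tab-aware assembly printers typically render output.
    static void printInst(const char *AsmString) {
      const char *Tab = strchr(AsmString, '\t');
      if (!Tab) {
        puts(AsmString);
        return;
      }
      printf("%-8.*s%s\n", (int)(Tab - AsmString), AsmString, Tab + 1);
    }

    int main() {
      printInst("cmp\tx0, #4");     // alias renders like any instruction
      printInst("mul\tw0, w1, w2"); // MulAccumWAlias shorthand for madd
      return 0;
    }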
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>; multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, @@ -2017,10 +2103,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, let Inst{31} = 1; } - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2039,10 +2125,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, } } // end Defs = [NZCV] - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2105,9 +2191,12 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, +class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype, + string mnemonic, SDNode OpNode> + : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $imm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2127,19 +2216,13 @@ class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsImm<bit op, string asm> { - def Wi : BaseCondSetFlagsImm<op, GPR32, asm> { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm<op, GPR64, asm> { - let Inst{31} = 1; - } -} - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic, + SDNode OpNode> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2159,11 +2242,19 @@ class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsReg<bit op, string asm> { - def Wr : BaseCondSetFlagsReg<op, GPR32, asm> { +multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> { + // immediate operand variants + def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> { let Inst{31} = 0; } - def Xr : BaseCondSetFlagsReg<op, GPR64, asm> { + def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> { + let Inst{31} = 1; + } + // register operand variants + def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> { + let Inst{31} = 
0; + } + def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2328,7 +2419,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2340,7 +2431,7 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2508,7 +2599,7 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, } class ROInstAlias<string asm, RegisterClass regtype, Instruction INST> - : InstAlias<asm # " $Rt, [$Rn, $Rm]", + : InstAlias<asm # "\t$Rt, [$Rn, $Rm]", (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, @@ -2934,7 +3025,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2946,7 +3037,7 @@ multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2958,7 +3049,7 @@ multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm, asm, pat>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; } @@ -2993,7 +3084,7 @@ multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc, (ins GPR64sp:$Rn, simm9:$offset), asm>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3005,7 +3096,7 @@ multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc, asm>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3136,7 +3227,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype, (ins GPR64sp:$Rn, indextype:$offset), asm>, Sched<[WriteLD, WriteLDHi]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3151,7 +3242,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype, asm>, Sched<[WriteSTP]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3230,8 +3321,8 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype, let mayStore = 1, mayLoad = 0 in class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype, Operand idxtype, string asm> - : BaseLoadStorePairPostIdx<opc, V, 0, (outs), - (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2, + : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback), + (ins regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, idxtype:$offset), asm>, Sched<[WriteAdr, WriteSTP]>; @@ -3477,6 +3568,20 @@ class 
BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode, multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Unscaled half-precision to 32-bit + def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Predicates = [HasFullFP16]; + } + + // Unscaled half-precision to 64-bit + def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Unscaled single-precision to 32-bit def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { @@ -3504,6 +3609,25 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Scaled half-precision to 32-bit + def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, + fixedpoint_f16_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + + // Scaled half-precision to 64-bit + def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, + fixedpoint_f16_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Scaled single-precision to 32-bit def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, fixedpoint_f32_i32, asm, @@ -3553,7 +3677,7 @@ class BaseIntegerToFP<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b00001; let Inst{16} = isUnsigned; let Inst{15-10} = scale; @@ -3570,7 +3694,7 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b10001; let Inst{16} = isUnsigned; let Inst{15-10} = 0b000000; @@ -3580,33 +3704,55 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { // Unscaled + def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } // 
Scaled + def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm, + [(set FPR16:$Rd, + (fdiv (node GPR32:$Rn), + fixedpoint_f16_i32:$scale))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm, [(set FPR32:$Rd, (fdiv (node GPR32:$Rn), fixedpoint_f32_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag let scale{5} = 1; } @@ -3615,16 +3761,25 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR32:$Rn), fixedpoint_f64_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag let scale{5} = 1; } + def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm, + [(set FPR16:$Rd, + (fdiv (node GPR64:$Rn), + fixedpoint_f16_i64:$scale))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm, [(set FPR32:$Rd, (fdiv (node GPR64:$Rn), fixedpoint_f32_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm, @@ -3632,7 +3787,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR64:$Rn), fixedpoint_f64_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } } @@ -3654,7 +3809,7 @@ class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode, Sched<[WriteFCopy]> { bits<5> Rd; bits<5> Rn; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; @@ -3704,26 +3859,49 @@ class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode, } - multiclass UnscaledConversion<string asm> { + def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def DXr : 
BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, @@ -3796,7 +3974,7 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, Sched<[WriteF]> { bits<5> Rd; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21-19} = 0b100; let Inst{18-15} = opcode; let Inst{14-10} = 0b10000; @@ -3806,12 +3984,17 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass SingleOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3828,7 +4011,7 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, bits<5> Rd; bits<5> Rn; bits<5> Rm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = opcode; @@ -3839,28 +4022,41 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass TwoOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set (f16 FPR16:$Rd), + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set (f32 FPR32:$Rd), (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set (f64 FPR64:$Rd), (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3878,7 +4074,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, bits<5> Rn; bits<5> Rm; bits<5> Ra; - let Inst{31-23} = 0b000111110; + let Inst{31-24} = 0b00011111; let Inst{21} = isNegated; let Inst{20-16} = Rm; let Inst{15} = isSub; @@ -3889,16 +4085,23 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm, SDPatternOperator node> { + def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm, + [(set FPR16:$Rd, + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let 
Predicates = [HasFullFP16]; + } + def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm, [(set FPR32:$Rd, (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm, [(set FPR64:$Rd, (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3913,7 +4116,7 @@ class BaseOneOperandFPComparison<bit signalAllNans, : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, Sched<[WriteFCmp]> { bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{15-10} = 0b001000; @@ -3932,7 +4135,7 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, Sched<[WriteFCmp]> { bits<5> Rm; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-10} = 0b001000; @@ -3944,24 +4147,36 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, multiclass FPComparison<bit signalAllNans, string asm, SDPatternOperator OpNode = null_frag> { let Defs = [NZCV] in { + def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Defs = [NZCV] } @@ -3971,17 +4186,20 @@ multiclass FPComparison<bit signalAllNans, string asm, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison<bit signalAllNans, - RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, + string mnemonic, list<dag> pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>, Sched<[WriteFCmp]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + bits<5> Rn; bits<5> Rm; bits<4> nzcv; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -3991,16 +4209,24 @@ class BaseFPCondComparison<bit signalAllNans, let Inst{3-0} = nzcv; } -multiclass FPCondComparison<bit signalAllNans, string asm> { - let Defs = [NZCV], Uses = [NZCV] in { - def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> { - let Inst{22} = 0; +multiclass FPCondComparison<bit signalAllNans, string mnemonic, + SDPatternOperator OpNode = null_frag> { + def 
Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; } - def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> { - let Inst{22} = 1; + def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic, + [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b00; + } + + def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic, + [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b01; } - } // Defs = [NZCV], Uses = [NZCV] } //--- @@ -4019,7 +4245,7 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> bits<5> Rm; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -4030,12 +4256,17 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> multiclass FPCondSelect<string asm> { let Uses = [NZCV] in { + def Hrrr : BaseFPCondSelect<FPR16, f16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srrr : BaseFPCondSelect<FPR32, f32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drrr : BaseFPCondSelect<FPR64, f64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Uses = [NZCV] } @@ -4050,7 +4281,7 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> Sched<[WriteFImm]> { bits<5> Rd; bits<8> imm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-13} = imm; let Inst{12-5} = 0b10000000; @@ -4058,12 +4289,17 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> } multiclass FPMoveImmediate<string asm> { + def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } } // end of 'let Predicates = [HasFPARMv8]' @@ -4079,7 +4315,7 @@ let Predicates = [HasNEON] in { //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -4093,8 +4329,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4103,7 +4338,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, @@ -4117,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 
0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4129,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // All operand sizes distinguished in the encoding. multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } @@ -4155,49 +4389,49 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, // As above, but D sized elements unsupported. 
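Two mechanical rewrites recur throughout the hunks above. First, the scalar FP formats free up bit 23 (Inst{30-23} = 0b00111100 becomes Inst{30-24} = 0b0011110) so the old one-bit type flag at Inst{22} can grow into a two-bit field at Inst{23-22}, with 0b11 selecting the new half-precision forms behind HasFullFP16. Second, BaseSIMDThreeSameVector folds the constant Inst{21} = 1 into a three-bit size field. A small sketch of both mappings; the helper names are illustrative only:

    // Scalar FP type field at Inst{23-22} (was a single bit at Inst{22}).
    enum class FPType { F32, F64, F16 };
    unsigned ftypeBits(FPType T) {
      switch (T) {
      case FPType::F32: return 0b00;
      case FPType::F64: return 0b01;
      case FPType::F16: return 0b11; // new; gated on HasFullFP16
      }
      return 0; // unreachable
    }

    // BaseSIMDThreeSameVector: the old 2-bit size plus the constant
    // Inst{21} = 1 become one 3-bit field, so every existing def now
    // passes (size << 1) | 1:
    // 0b00 -> 0b001, 0b01 -> 0b011, 0b10 -> 0b101, 0b11 -> 0b111.
    unsigned threeSameSize(unsigned OldSize2) { return (OldSize2 << 1) | 1; }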
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; @@ -4206,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, // As above, but only B sized elements supported. multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; } -// As above, but only S and D sized floating point elements supported. 
-multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc, +// As above, but only floating point elements supported. +multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$dst), + (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$dst), + (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode 
(v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; @@ -4262,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, // As above, but D and B sized elements unsupported. multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } @@ -4279,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, // Logical three vector ops share opcode bits, and only use B sized elements. multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; @@ -4303,11 +4563,11 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), @@ -4347,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, string asm, + string dstkind, string srckind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, @@ -4360,7 +4620,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ 
-4369,8 +4631,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, + string asm, string dstkind, string srckind, + list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, @@ -4382,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4392,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // Supports B, H, and S element sizes. multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4450,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS { // Supports all element sizes. 
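BaseSIMDTwoSameVector and BaseSIMDTwoSameVectorTied get the same treatment: the constant Inst{21-17} = 0b10000 is split so a new size2 field lands in Inst{20-19}. Every pre-existing def passes size2 = 0b00, which keeps its encoding bit-identical, while the half-precision vector forms pass 0b11. A quick sanity check of the split (illustrative code, not from the patch):

    #include <cassert>

    // Inst{21-17} decomposed as Inst{21} = 1, Inst{20-19} = size2,
    // Inst{18-17} = 0b00.
    unsigned bits21to17(unsigned Size2) { return (1u << 4) | (Size2 << 2); }

    int main() {
      assert(bits21to17(0b00) == 0b10000); // legacy constant, unchanged
      assert(bits21to17(0b11) == 0b11100); // new FP16 .4h/.8h variants
      return 0;
    }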
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v4i32 V128:$Rn)))]>; @@ -4501,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, // Supports all element sizes, except 1xD. 
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; } multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4553,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, // Supports only B element sizes. 
multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; @@ -4565,16 +4830,16 @@ multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm, // Supports only B and H element sizes. multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; } @@ -4583,13 +4848,21 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, // as an extra opcode bit. multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } @@ -4597,10 +4870,10 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, // Supports only S element size. 
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4608,26 +4881,42 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4706,10 +4995,10 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } -class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> +class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2, + bits<5> opcode, RegisterOperand regtype, string asm, + string kind, string zero, ValueType dty, + ValueType sty, SDNode OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", @@ -4722,7 +5011,9 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - 
let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4732,54 +5023,74 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm, SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, asm, ".16b", "0", v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, asm, ".4h", "0", v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, asm, ".8h", "0", v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, asm, ".2s", "0", v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, asm, ".4s", "0", v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, asm, ".2d", "0", v2i64, v2i64, OpNode>; } -// FP Comparisons support only S and D element sizes. +// FP Comparisons support only S and D element sizes (and H for v8.2a). multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, string asm, SDNode OpNode> { - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, + asm, ".4h", "0.0", + v4i16, v4f16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, + asm, ".8h", "0.0", + v8i16, v8f16, OpNode>; + } // Predicates = [HasNEON, HasFullFP16] + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, asm, ".2s", "0.0", v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, asm, ".4s", "0.0", v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; - def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0", + def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0", + def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2s $Vd, $Vn, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # ".4h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # 
".8h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # ".2s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # ".4s $Vd, $Vn, #0", + def : InstAlias<asm # ".4s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2d $Vd, $Vn, #0", + def : InstAlias<asm # ".2d\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; } @@ -5325,7 +5636,7 @@ multiclass SIMDZipVector<bits<3>opc, string asm, //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode, RegisterClass regtype, string asm, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -5337,8 +5648,7 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -5369,17 +5679,17 @@ class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode, multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; } multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; - def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>; + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; + def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; @@ -5389,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; } multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, @@ -5404,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, asm, []>; } -multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, 
{S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } -multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + []>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), @@ -5482,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm, //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list<dag> pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, @@ -5494,7 +5812,9 @@ class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5523,7 +5843,7 @@ class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, @@ -5534,7 +5854,9 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5556,21 +5878,28 @@ class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm> multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">; def : Pat<(v1i64 (OpNode FPR64:$Rn)), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; } -multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 
{S,1}, opc, FPR64, asm, "0.0">; - def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">; + def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">; + } - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Rd, $Rn, #0", + (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; + } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; @@ -5578,35 +5907,42 @@ multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>; } -multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>; +multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> { + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>; + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>; + } } -multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm, [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm, + [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>; + } } multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), @@ 
-5633,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm, [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>; } //---------------------------------------------------------------------------- @@ -5668,10 +6004,14 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> { asm, ".2d">; } -multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64, +multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, + asm, ".2h">; + } + def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, asm, ".2s">; - def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128, + def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128, asm, ".2d">; } @@ -5727,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> { asm, ".4s", []>; } -multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm, +multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, Intrinsic intOp> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, + asm, ".4h", + [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, + asm, ".8h", + [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; @@ -5925,7 +6273,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst, class SIMDInsElementMovAlias<string size, Instruction inst, Operand idxtype> : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", + # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; @@ -6215,7 +6563,7 @@ multiclass SIMDScalarCPY<string asm> { // AdvSIMD modified immediate instructions //---------------------------------------------------------------------------- -class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, +class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops, string asm, string op_string, string cstr, list<dag> pattern> : I<oops, iops, asm, op_string, cstr, pattern>, @@ -6227,16 +6575,17 @@ class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, let Inst{29} = op; let Inst{28-19} = 0b0111100000; let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; + let Inst{11} = op2; + let Inst{10} = 1; let Inst{9-5} = imm8{4-0}; let Inst{4-0} = Rd; } -class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype, +class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> 
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd), + : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd), !con((ins immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6248,7 +6597,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst), + : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst), !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6259,7 +6608,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6284,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12, class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_hw_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6349,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode, class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins move_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<1> shift; @@ -6357,18 +6706,18 @@ class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, let Inst{12} = shift; } -class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode, +class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode, RegisterOperand vectype, Operand imm_type, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "", + : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "", asm, kind, pattern> { let Inst{15-12} = cmode; } class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, + : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, "\t$Rd, $imm8", "", pattern> { let Inst{15-12} = cmode; let DecoderMethod = "DecodeModImmInstruction"; @@ -6438,8 +6787,36 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, let Inst{4-0} = Rd; } -multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, - SDPatternOperator OpNode> { +multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4f16 V64:$Rd), + (OpNode (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + 
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8f16 V128:$Rd), + (OpNode (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, @@ -6476,6 +6853,21 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", + [(set (f16 FPR16Op:$Rd), + (OpNode (f16 FPR16Op:$Rn), + (f16 (vector_extract (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", @@ -6501,7 +6893,7 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, } } -multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { +multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -6553,7 +6945,28 @@ multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { V128:$Rm, VectorIndexD:$idx)>; } -multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { +multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", []> { @@ -6580,6 +6993,16 @@ multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, @@ -7117,7 +7540,13 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm, } -multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> { +multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR16, vecshiftR16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, FPR32, FPR32, 
vecshiftR32, asm, []> { let Inst{20-16} = imm{4-0}; @@ -7297,6 +7726,23 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm, multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -7322,8 +7768,26 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, } } -multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm, +multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -8604,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in { class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> - : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind, + : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind, pattern> { - let Inst{21}=0; } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm, SDPatternOperator Accum> { @@ -9041,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">; def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".2H", ".2h">; def : TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index c0b3f2c..f398117 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" -#include "AArch64MachineCombinerPattern.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -533,6 +532,14 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, CC); } +/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 
+static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { + uint64_t Imm = MI->getOperand(1).getImm(); + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { @@ -573,6 +580,12 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or + // ORRXri, it is as cheap as MOV + case AArch64::MOVi32imm: + return canBeExpandedToORR(MI, 32); + case AArch64::MOVi64imm: + return canBeExpandedToORR(MI, 64); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -1379,42 +1392,34 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDRQui: + case AArch64::STRQui: + Scale = Width = 16; + break; case AArch64::LDRXui: + case AArch64::LDRDui: case AArch64::STRXui: + case AArch64::STRDui: Scale = Width = 8; break; case AArch64::LDRWui: + case AArch64::LDRSui: case AArch64::STRWui: + case AArch64::STRSui: Scale = Width = 4; break; - case AArch64::LDRBui: - case AArch64::STRBui: - Scale = Width = 1; - break; case AArch64::LDRHui: + case AArch64::LDRHHui: case AArch64::STRHui: + case AArch64::STRHHui: Scale = Width = 2; break; - case AArch64::LDRSui: - case AArch64::STRSui: - Scale = Width = 4; - break; - case AArch64::LDRDui: - case AArch64::STRDui: - Scale = Width = 8; - break; - case AArch64::LDRQui: - case AArch64::STRQui: - Scale = Width = 16; - break; + case AArch64::LDRBui: case AArch64::LDRBBui: + case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; break; - case AArch64::LDRHHui: - case AArch64::STRHHui: - Scale = Width = 2; - break; }; BaseReg = LdSt->getOperand(1).getReg(); @@ -1445,23 +1450,43 @@ bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. - if (Second->getOpcode() != AArch64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case AArch64::SUBSWri: - case AArch64::ADDSWri: - case AArch64::ANDSWri: - case AArch64::SUBSXri: - case AArch64::ADDSXri: - case AArch64::ANDSXri: - return true; + if (Subtarget.isCyclone()) { + // Cyclone can fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second->getOpcode(); + if (SecondOpcode == AArch64::Bcc) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; + } + } + // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. 
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || + SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + case AArch64::EORWri: + case AArch64::EORXri: + case AArch64::ORRWri: + case AArch64::ORRXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return true; + } + } } + return false; } MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( @@ -1814,7 +1839,7 @@ void AArch64InstrInfo::storeRegToStackSlot( MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; @@ -1911,7 +1936,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); @@ -2226,11 +2251,19 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPDi: case AArch64::STPXi: case AArch64::STPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + ImmIdx = 3; IsSigned = true; Scale = 8; break; case AArch64::LDPQi: case AArch64::STPQi: + case AArch64::LDNPQi: + case AArch64::STNPQi: + ImmIdx = 3; IsSigned = true; Scale = 16; break; @@ -2238,6 +2271,11 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPSi: case AArch64::STPWi: case AArch64::STPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + ImmIdx = 3; IsSigned = true; Scale = 4; break; @@ -2449,15 +2487,36 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, return true; } -/// Return true when there is potentially a faster code sequence -/// for an instruction chain ending in \p Root. All potential patterns are -/// listed -/// in the \p Pattern vector. Pattern should be sorted in priority order since -/// the pattern evaluator stops checking as soon as it finds a faster sequence. +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (xor, or) +// 3. 
Other forms of the same operation (intrinsics and other variants) +bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case AArch64::FADDDrr: + case AArch64::FADDSrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FMULDrr: + case AArch64::FMULSrr: + case AArch64::FMULX32: + case AArch64::FMULX64: + case AArch64::FMULXv2f32: + case AArch64::FMULXv2f64: + case AArch64::FMULXv4f32: + case AArch64::FMULv2f32: + case AArch64::FMULv2f64: + case AArch64::FMULv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + return false; + } +} -bool AArch64InstrInfo::getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { +/// Find instructions that can be turned into madd. +static bool getMaddPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -2485,76 +2544,76 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( "ADDWrr does not have register operands"); if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP2); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); Found = true; } break; case AArch64::ADDXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP2); + Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); Found = true; } break; case AArch64::SUBWrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); Found = true; } break; case AArch64::SUBXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); Found = true; } break; case AArch64::ADDWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); Found = true; } break; case AArch64::ADDXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); Found = true; } 
break; case AArch64::SUBWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); Found = true; } break; case AArch64::SUBXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); Found = true; } break; @@ -2562,6 +2621,20 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( return Found; } +/// Return true when there is potentially a faster code sequence for an +/// instruction chain ending in \p Root. All potential patterns are listed in +/// the \p Pattern vector. Pattern should be sorted in priority order since the +/// pattern evaluator stops checking as soon as it finds a faster sequence. + +bool AArch64InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + if (getMaddPatterns(Root, Patterns)) + return true; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + /// genMadd - Generate madd instruction and combine mul and add. /// Example: /// MUL I=A,B,0 @@ -2661,7 +2734,7 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, /// this function generates the instructions that could replace the /// original code sequence void AArch64InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { @@ -2675,15 +2748,17 @@ void AArch64InstrInfo::genAlternativeCodeSequence( unsigned Opc; switch (Pattern) { default: - // signal error. - break; - case MachineCombinerPattern::MC_MULADDW_OP1: - case MachineCombinerPattern::MC_MULADDX_OP1: + // Reassociate instructions. 
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + return; + case MachineCombinerPattern::MULADDW_OP1: + case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 // ADD R,I,C // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) { + if (Pattern == MachineCombinerPattern::MULADDW_OP1) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2692,13 +2767,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDW_OP2: - case MachineCombinerPattern::MC_MULADDX_OP2: + case MachineCombinerPattern::MULADDW_OP2: + case MachineCombinerPattern::MULADDX_OP2: // MUL I=A,B,0 // ADD R,C,I // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) { + if (Pattern == MachineCombinerPattern::MULADDW_OP2) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2707,8 +2782,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDWI_OP1: - case MachineCombinerPattern::MC_MULADDXI_OP1: { + case MachineCombinerPattern::MULADDWI_OP1: + case MachineCombinerPattern::MULADDXI_OP1: { // MUL I=A,B,0 // ADD R,I,Imm // ==> ORR V, ZR, Imm @@ -2716,7 +2791,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) { + if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2751,8 +2826,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } - case MachineCombinerPattern::MC_MULSUBW_OP1: - case MachineCombinerPattern::MC_MULSUBX_OP1: { + case MachineCombinerPattern::MULSUBW_OP1: + case MachineCombinerPattern::MULSUBX_OP1: { // MUL I=A,B,0 // SUB R,I, C // ==> SUB V, 0, C @@ -2760,7 +2835,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *SubRC; unsigned SubOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { SubOpc = AArch64::SUBWrr; SubRC = &AArch64::GPR32spRegClass; ZeroReg = AArch64::WZR; @@ -2784,13 +2859,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); break; } - case MachineCombinerPattern::MC_MULSUBW_OP2: - case MachineCombinerPattern::MC_MULSUBX_OP2: + case MachineCombinerPattern::MULSUBW_OP2: + case MachineCombinerPattern::MULSUBX_OP2: // MUL I=A,B,0 // SUB R,C,I // ==> MSUB R,A,B,C (computes C - A*B) // --- Create(MSUB); - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { Opc = AArch64::MSUBWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2799,8 +2874,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULSUBWI_OP1: - case MachineCombinerPattern::MC_MULSUBXI_OP1: { + case MachineCombinerPattern::MULSUBWI_OP1: + case MachineCombinerPattern::MULSUBXI_OP1: { // MUL I=A,B,0 // SUB R,I, Imm // ==> ORR V, ZR, -Imm @@ -2808,7 +2883,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned 
BitSize, OrrOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2944,3 +3019,34 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { MI->eraseFromParent(); return true; } + +std::pair<unsigned, unsigned> +AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = AArch64II::MO_FRAGMENT; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PAGE, "aarch64-page"}, + {MO_PAGEOFF, "aarch64-pageoff"}, + {MO_G3, "aarch64-g3"}, + {MO_G2, "aarch64-g2"}, + {MO_G1, "aarch64-g1"}, + {MO_G0, "aarch64-g0"}, + {MO_HI12, "aarch64-hi12"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT, "aarch64-got"}, + {MO_NC, "aarch64-nc"}, + {MO_TLS, "aarch64-tls"}, + {MO_CONSTPOOL, "aarch64-constant-pool"}}; + return makeArrayRef(TargetFlags); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 68c2a28..b5bb446 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -167,13 +167,15 @@ public: /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. bool getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) + SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; - + /// Return true when Inst is associative and commutative so that it can be + /// reassociated. 
+ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; /// When getMachineCombinerPatterns() finds patterns, this function generates /// the instructions that could replace the original code sequence void genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; @@ -181,6 +183,14 @@ public: bool useMachineCombiner() const override; bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + private: void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fa1a46a..d02bc9f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -16,6 +16,8 @@ // def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, @@ -24,6 +26,12 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; + def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsCyclone : Predicate<"Subtarget->isCyclone()">; @@ -66,6 +74,20 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisVT<4, i32>]>; +def SDT_AArch64CCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisInt<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; +def SDT_AArch64FCCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisFP<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; def SDT_AArch64FCmp : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>; @@ -160,13 +182,14 @@ def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; +def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>; +def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>; +def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; + def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; -def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; -def AArch64fmin : 
SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; - def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; @@ -361,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +// v8.2a Statistical Profiling extension +def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. let mayLoad = 1, mayStore = 1 in def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">; @@ -383,12 +409,17 @@ def : InstAlias<"isb", (ISB 0xf)>; def MRS : MRSI; def MSR : MSRI; -def MSRpstate: MSRpstateI; +def MSRpstateImm1 : MSRpstateImm0_1; +def MSRpstateImm4 : MSRpstateImm0_15; // The thread pointer (on Linux, at least, where this has been implemented) is // TPIDR_EL0. def : Pat<(AArch64threadpointer), (MRS 0xde82)>; +// The cycle counter PMC register is PMCCNTR_EL0. +let Predicates = [HasPerfMon] in +def : Pat<(readcyclecounter), (MRS 0xdce8)>; + // Generic system instructions def SYSxt : SystemXtI<0, "sys">; def SYSLxt : SystemLXtI<1, "sysl">; @@ -595,10 +626,12 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +let AddedComplexity = 1 in { def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3), (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>; def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3), (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>; +} // Because of the immediate format for add/sub-imm instructions, the // expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). @@ -823,7 +856,7 @@ defm AND : LogicalReg<0b00, 0, "and", and>; defm BIC : LogicalReg<0b00, 1, "bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>; defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; + BinOpFrag<(not (xor node:$LHS, node:$RHS))>>; defm EOR : LogicalReg<0b10, 0, "eor", xor>; defm ORN : LogicalReg<0b01, 1, "orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>; @@ -1020,13 +1053,10 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; //===----------------------------------------------------------------------===// -// Conditionally set flags instructions. +// Conditional comparison instructions. //===----------------------------------------------------------------------===// -defm CCMN : CondSetFlagsImm<0, "ccmn">; -defm CCMP : CondSetFlagsImm<1, "ccmp">; - -defm CCMN : CondSetFlagsReg<0, "ccmn">; -defm CCMP : CondSetFlagsReg<1, "ccmp">; +defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>; +defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>; //===----------------------------------------------------------------------===// // Conditional select instructions. 
@@ -2421,6 +2451,26 @@ defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvt defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; } +multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { + def : Pat<(i32 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">; + //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// @@ -2466,14 +2516,7 @@ defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), (FRINTNDr FPR64:$Rn)>; -// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior -// in the C spec. Setting hasSideEffects ensures it is not DCE'd. -// <rdar://problem/13715968> -// TODO: We should really model the FPSR flags correctly. This is really ugly. -let hasSideEffects = 1 in { defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -} - defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; let SchedRW = [WriteFDiv] in { @@ -2488,23 +2531,23 @@ defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; let SchedRW = [WriteFDiv] in { defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; } -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>; let SchedRW = [WriteFMul] in { defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; } defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; -def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; //===----------------------------------------------------------------------===// @@ 
-2556,7 +2599,7 @@ defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; //===----------------------------------------------------------------------===// defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp">; +defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>; //===----------------------------------------------------------------------===// // Floating point conditional select instruction. @@ -2589,6 +2632,40 @@ defm FMOV : FPMoveImmediate<"fmov">; // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +// Match UABDL in log2-shuffle patterns. +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (v8i8 V64:$opA)), + (zext (v8i8 V64:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), + (zext (extract_high_v16i8 V128:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (v4i16 V64:$opA)), + (zext (v4i16 V64:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)), + (zext (extract_high_v8i16 V128:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (v2i32 V64:$opA)), + (zext (v2i32 V64:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)), + (zext (extract_high_v4i32 V128:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; + defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), @@ -2780,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; -defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; -defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; -defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; -defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; -defm 
FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>; -defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>; -defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. -defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; // The following def pats catch the case where the LHS of an FMA is negated. 
@@ -2816,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", @@ -2833,9 +2910,9 @@ defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; -defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; -defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>; defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; @@ -2852,9 +2929,9 @@ defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; -defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; -defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>; defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; @@ -2879,54 +2956,6 @@ defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; -def : Pat<(v8i8 (smin V64:$Rn, V64:$Rm)), - (SMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smin V64:$Rn, V64:$Rm)), - (SMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smin V64:$Rn, V64:$Rm)), - (SMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smin V128:$Rn, V128:$Rm)), - (SMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smin V128:$Rn, V128:$Rm)), - (SMINv8i16 V128:$Rn, V128:$Rm)>; 
-def : Pat<(v4i32 (smin V128:$Rn, V128:$Rm)), - (SMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (smax V64:$Rn, V64:$Rm)), - (SMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smax V64:$Rn, V64:$Rm)), - (SMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smax V64:$Rn, V64:$Rm)), - (SMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smax V128:$Rn, V128:$Rm)), - (SMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smax V128:$Rn, V128:$Rm)), - (SMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (smax V128:$Rn, V128:$Rm)), - (SMAXv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umin V64:$Rn, V64:$Rm)), - (UMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umin V64:$Rn, V64:$Rm)), - (UMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umin V64:$Rn, V64:$Rm)), - (UMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umin V128:$Rn, V128:$Rm)), - (UMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umin V128:$Rn, V128:$Rm)), - (UMINv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umin V128:$Rn, V128:$Rm)), - (UMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umax V64:$Rn, V64:$Rm)), - (UMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umax V64:$Rn, V64:$Rm)), - (UMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umax V64:$Rn, V64:$Rm)), - (UMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umax V128:$Rn, V128:$Rm)), - (UMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umax V128:$Rn, V128:$Rm)), - (UMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umax V128:$Rn, V128:$Rm)), - (UMAXv4i32 V128:$Rn, V128:$Rm)>; def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; @@ -3052,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # "|cmlt.2d\t$dst, $src1, $src2}", (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmle.4h\t$dst, $src1, $src2}", + (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmle.8h\t$dst, $src1, $src2}", + (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # "|fcmle.2s\t$dst, $src1, $src2}", (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3062,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # "|fcmle.2d\t$dst, $src1, $src2}", (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmlt.4h\t$dst, $src1, $src2}", + (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmlt.8h\t$dst, $src1, $src2}", + (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # "|fcmlt.2s\t$dst, $src1, $src2}", (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3072,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # "|fcmlt.2d\t$dst, $src1, $src2}", (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" # + "|facle.4h\t$dst, $src1, $src2}", + (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" # + "|facle.8h\t$dst, $src1, $src2}", + (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # "|facle.2s\t$dst, $src1, $src2}", (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3082,6 +3135,14 @@ def : 
InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # "|facle.2d\t$dst, $src1, $src2}", (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" # + "|faclt.4h\t$dst, $src1, $src2}", + (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" # + "|faclt.8h\t$dst, $src1, $src2}", + (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # "|faclt.2s\t$dst, $src1, $src2}", (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3103,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", int_aarch64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -3198,35 +3259,35 @@ defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; -defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; -defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; -defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; -defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; -defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; -defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDTwoScalarSD< 0, 0, 
0b11010, "fcvtns">; -defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", int_aarch64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>; defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; @@ -3390,8 +3451,6 @@ defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", int_aarch64_neon_uabd>; -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_aarch64_neon_uabd>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", @@ -3449,8 +3508,8 @@ defm : Neon_mulacc_widen_patterns< // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; -def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), - (vector_extract (v2i64 V128:$Rm), (i64 1))), +def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)), + (extractelt (v2i64 V128:$Rm), (i64 1))), (PMULLv2i64 V128:$Rn, 
V128:$Rm)>; // CodeGen patterns for addhn and subhn instructions, which can actually be @@ -3593,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">; //---------------------------------------------------------------------------- defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">; +defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; +defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), @@ -3713,12 +3772,12 @@ defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>; multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, SDNodeXForm IdxXFORM> { - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn), imm:$idx))))), (DUP V128:$Rn, (IdxXFORM imm:$idx))>; - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), - imm:$idx))))), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn), + imm:$idx))))), (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; } @@ -3747,6 +3806,13 @@ def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))), (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn), + VectorIndexB:$idx)))), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), + VectorIndexH:$idx)))), i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; + // Extracting i8 or i16 elements will have the zero-extend transformed to // an 'and' mask by type legalization since neither i8 nor i16 are legal types // for AArch64. 
Match these patterns here since UMOV already zeroes out the high @@ -3784,6 +3850,11 @@ def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; +def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -3949,10 +4020,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; +defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. @@ -4199,15 +4270,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; // AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, "fmov", ".2d", [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8, "fmov", ".2s", [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8, "fmov", ".4s", [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +let Predicates = [HasNEON, HasFullFP16] in { +def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8, + "fmov", ".4h", + [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8, + "fmov", ".8h", + [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +} // Predicates = [HasNEON, HasFullFP16] // AdvSIMD MOVI @@ -4235,7 +4314,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 in the pattern let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, simdimmtype10, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; @@ -4296,10 +4375,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", (AArch64movi_msl imm0_255:$imm8, (i32 
imm:$shift)))]>; // Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255, "movi", ".8b", [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255, "movi", ".16b", [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; @@ -4340,8 +4419,8 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", //---------------------------------------------------------------------------- let hasSideEffects = 0 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; + defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">; } // NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the @@ -4349,18 +4428,18 @@ let hasSideEffects = 0 in { // On the other hand, there are quite a few valid combinatorial options due to // the commutativity of multiplication and the fact that (-x) * y = x * (-y). -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { @@ -4424,7 +4503,9 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), + (vector_extract (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), VectorIndexS:$idx))), (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; @@ -4442,8 +4523,8 @@ defm : FMLSIndexedAfterNegPatterns< defm : FMLSIndexedAfterNegPatterns< TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; +defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, @@ -4497,10 +4578,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : 
SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">; +defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">; // Codegen patterns for the above. We don't put these directly on the // instructions because TableGen's type inference can't handle the truth. // Having the same base pattern for fp <--> int totally freaks it out. @@ -4573,7 +4654,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", //---------------------------------------------------------------------------- defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", +defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", int_aarch64_neon_vcvtfxs2fp>; defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", int_aarch64_neon_rshrn>; @@ -4608,7 +4689,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", +defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; @@ -5133,10 +5214,10 @@ def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; def : Pat<(i64 (anyext GPR32:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. +// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and +// then assert the extension has happened. def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; + (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; // To sign extend, we use a signed bitfield move instruction (SBFM) on the // containing super-reg. @@ -5801,6 +5882,21 @@ def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; } +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; + def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), @@ -5852,6 +5948,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +// Patterns for nontemporal/no-allocate stores. 
+// We have to resort to tricks to turn a single-input store into a store pair, +// because there is no single-input nontemporal store, only STNP. +let Predicates = [IsLE] in { +let AddedComplexity = 15 in { +class NTStore128Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR128:$Rt), + (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), + (CPYi64 FPR128:$Rt, (i64 1)), + GPR64sp:$Rn, simm7s8:$offset)>; + +def : NTStore128Pat<v2i64>; +def : NTStore128Pat<v4i32>; +def : NTStore128Pat<v8i16>; +def : NTStore128Pat<v16i8>; + +class NTStore64Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR64:$Rt), + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), + (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + GPR64sp:$Rn, simm7s4:$offset)>; + +// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? +def : NTStore64Pat<v1f64>; +def : NTStore64Pat<v1i64>; +def : NTStore64Pat<v2i32>; +def : NTStore64Pat<v4i16>; +def : NTStore64Pat<v8i8>; + +def : Pat<(nontemporalstore GPR64:$Rt, + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + GPR64sp:$Rn, simm7s4:$offset)>; +} // AddedComplexity=10 +} // Predicates = [IsLE] + // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 82f77a7..43664df 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -41,54 +41,85 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); +STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); +STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); +STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); -// Place holder while testing unscaled load/store combining -static cl::opt<bool> EnableAArch64UnscaledMemOp( - "aarch64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); +namespace llvm { +void initializeAArch64LoadStoreOptPass(PassRegistry &); +} + +#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" namespace { + +typedef struct LdStPairFlags { + // If a matching instruction is found, MergeForward is set to true if the + // merge is to remove the first instruction and replace the second with + // a pair-wise insn, and false if the reverse is true. + bool MergeForward; + + // SExtIdx gives the index of the result of the load pair that must be + // extended. The value of SExtIdx assumes that the paired load produces the + // value in this order: (I, returned iterator), i.e., -1 means no value has + // to be extended, 0 means I, and 1 means the returned iterator. 
+ int SExtIdx; + + LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} + + void setMergeForward(bool V = true) { MergeForward = V; } + bool getMergeForward() const { return MergeForward; } + + void setSExtIdx(int V) { SExtIdx = V; } + int getSExtIdx() const { return SExtIdx; } + +} LdStPairFlags; + struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; - AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} + AArch64LoadStoreOpt() : MachineFunctionPass(ID) { + initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; + const AArch64Subtarget *Subtarget; // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, MergeForward is set to true if the - // merge is to remove the first instruction and replace the second with - // a pair-wise insn, and false if the reverse is true. - // \p SExtIdx[out] gives the index of the result of the load pair that - // must be extended. The value of SExtIdx assumes that the paired load - // produces the value in this order: (I, returned iterator), i.e., - // -1 means no value has to be extended, 0 means I, and 1 means the - // returned iterator. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, + LdStPairFlags &Flags, unsigned Limit); + + // Scan the instructions looking for a store that writes to the address from + // which the current load instruction reads. Return true if one is found. + bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a single pair-wise instruction. // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. Return the instruction // following the first instruction (which may change during processing). - // \p SExtIdx index of the result that must be extended for a paired load. - // -1 means none, 0 means I, and 1 means Paired. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool MergeForward, - int SExtIdx); + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags); + + // Promote the load that reads directly from the address stored to. + MachineBasicBlock::iterator + promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); + int UnscaledOffset); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -96,97 +127,177 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { MachineBasicBlock::iterator findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - // Merge a pre-index base register update into a ld/st instruction. 
- MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + // Find an instruction that updates the base register of the ld/st + // instruction. + bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + unsigned BaseReg, int Offset); - // Merge a post-index base register update into a ld/st instruction. + // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, bool IsPreIdx); + + // Find and merge foldable ldr/str instructions. + bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); - bool optimizeBlock(MachineBasicBlock &MBB); + // Find and promote load instructions which read directly from store. + bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); + + // Check if converting two narrow loads into a single wider load with + // bitfield extracts could be enabled. + bool enableNarrowLdMerge(MachineFunction &Fn); + + bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 load / store optimization pass"; + return AARCH64_LOAD_STORE_OPT_NAME; } - -private: - int getMemSize(MachineInstr *MemMI); }; char AArch64LoadStoreOpt::ID = 0; } // namespace -static bool isUnscaledLdst(unsigned Opc) { +INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", + AARCH64_LOAD_STORE_OPT_NAME, false, false) + +static bool isUnscaledLdSt(unsigned Opc) { switch (Opc) { default: return false; case AArch64::STURSi: - return true; case AArch64::STURDi: - return true; case AArch64::STURQi: - return true; + case AArch64::STURBBi: + case AArch64::STURHHi: case AArch64::STURWi: - return true; case AArch64::STURXi: - return true; case AArch64::LDURSi: - return true; case AArch64::LDURDi: - return true; case AArch64::LDURQi: - return true; case AArch64::LDURWi: - return true; case AArch64::LDURXi: - return true; case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } +} + +static bool isUnscaledLdSt(MachineInstr *MI) { + return isUnscaledLdSt(MI->getOpcode()); +} + +static unsigned getBitExtrOpcode(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode."); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + return AArch64::UBFMWri; + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + return AArch64::SBFMWri; + } +} + +static bool isNarrowStore(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return true; + } +} + +static bool isNarrowStore(MachineInstr *MI) { + return isNarrowStore(MI->getOpcode()); +} + +static bool isNarrowLoad(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: return true; } } -// Size in bytes of the data moved by an unscaled load or store -int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { - 
switch (MemMI->getOpcode()) { +static bool isNarrowLoad(MachineInstr *MI) { + return isNarrowLoad(MI->getOpcode()); +} + +// Scaling factor for unscaled load or store. +static int getMemScale(MachineInstr *MI) { + switch (MI->getOpcode()) { default: - llvm_unreachable("Opcode has unknown size!"); + llvm_unreachable("Opcode has unknown scale!"); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::STRBBui: + case AArch64::STURBBi: + return 1; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return 2; + case AArch64::LDRSui: + case AArch64::LDURSi: + case AArch64::LDRSWui: + case AArch64::LDURSWi: + case AArch64::LDRWui: + case AArch64::LDURWi: case AArch64::STRSui: case AArch64::STURSi: - return 4; - case AArch64::STRDui: - case AArch64::STURDi: - return 8; - case AArch64::STRQui: - case AArch64::STURQi: - return 16; case AArch64::STRWui: case AArch64::STURWi: - return 4; - case AArch64::STRXui: - case AArch64::STURXi: - return 8; - case AArch64::LDRSui: - case AArch64::LDURSi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::STPSi: + case AArch64::STPWi: return 4; case AArch64::LDRDui: case AArch64::LDURDi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDPDi: + case AArch64::LDPXi: + case AArch64::STPDi: + case AArch64::STPXi: return 8; case AArch64::LDRQui: case AArch64::LDURQi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::LDPQi: + case AArch64::STPQi: return 16; - case AArch64::LDRWui: - case AArch64::LDURWi: - return 4; - case AArch64::LDRXui: - case AArch64::LDURXi: - return 8; - case AArch64::LDRSWui: - case AArch64::LDURSWi: - return 4; } } @@ -203,6 +314,10 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURDi: case AArch64::STRQui: case AArch64::STURQi: + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: case AArch64::STRWui: case AArch64::STURWi: case AArch64::STRXui: @@ -219,11 +334,23 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURSi: case AArch64::LDRSui: case AArch64::LDURSi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: return Opc; case AArch64::LDRSWui: return AArch64::LDRWui; case AArch64::LDURSWi: return AArch64::LDURWi; + case AArch64::LDRSBWui: + return AArch64::LDRBBui; + case AArch64::LDRSHWui: + return AArch64::LDRHHui; + case AArch64::LDURSBWi: + return AArch64::LDURBBi; + case AArch64::LDURSHWi: + return AArch64::LDURHHi; } } @@ -240,6 +367,14 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -264,6 +399,48 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + 
return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + +static unsigned isMatchingStore(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + unsigned LdOpc = LoadInst->getOpcode(); + unsigned StOpc = StoreInst->getOpcode(); + switch (LdOpc) { + default: + llvm_unreachable("Unsupported load instruction!"); + case AArch64::LDRBBui: + return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || + StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURBBi: + return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || + StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRHHui: + return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || + StOpc == AArch64::STRXui; + case AArch64::LDURHHi: + return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || + StOpc == AArch64::STURXi; + case AArch64::LDRWui: + return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURWi: + return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRXui: + return StOpc == AArch64::STRXui; + case AArch64::LDURXi: + return StOpc == AArch64::STURXi; } } @@ -277,6 +454,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STRDpre; case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRBBui: + return AArch64::STRBBpre; + case AArch64::STRHHui: + return AArch64::STRHHpre; case AArch64::STRWui: return AArch64::STRWpre; case AArch64::STRXui: @@ -287,12 +468,38 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::LDRDpre; case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRBBui: + return AArch64::LDRBBpre; + case AArch64::LDRHHui: + return AArch64::LDRHHpre; case AArch64::LDRWui: return AArch64::LDRWpre; case AArch64::LDRXui: return AArch64::LDRXpre; case AArch64::LDRSWui: return AArch64::LDRSWpre; + case AArch64::LDPSi: + return AArch64::LDPSpre; + case AArch64::LDPSWi: + return AArch64::LDPSWpre; + case AArch64::LDPDi: + return AArch64::LDPDpre; + case AArch64::LDPQi: + return AArch64::LDPQpre; + case AArch64::LDPWi: + return AArch64::LDPWpre; + case AArch64::LDPXi: + return AArch64::LDPXpre; + case AArch64::STPSi: + return AArch64::STPSpre; + case AArch64::STPDi: + return AArch64::STPDpre; + case AArch64::STPQi: + return AArch64::STPQpre; + case AArch64::STPWi: + return AArch64::STPWpre; + case AArch64::STPXi: + return AArch64::STPXpre; } } @@ -306,6 +513,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STRDpost; case AArch64::STRQui: return AArch64::STRQpost; + case AArch64::STRBBui: + return AArch64::STRBBpost; + case AArch64::STRHHui: + return AArch64::STRHHpost; case AArch64::STRWui: return AArch64::STRWpost; case AArch64::STRXui: @@ -316,19 +527,96 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::LDRDpost; case AArch64::LDRQui: return AArch64::LDRQpost; + case AArch64::LDRBBui: + return AArch64::LDRBBpost; + case AArch64::LDRHHui: + return AArch64::LDRHHpost; case AArch64::LDRWui: return AArch64::LDRWpost; case AArch64::LDRXui: return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; + case AArch64::LDPSi: + return AArch64::LDPSpost; + case AArch64::LDPSWi: + return AArch64::LDPSWpost; + case AArch64::LDPDi: + return AArch64::LDPDpost; + case AArch64::LDPQi: + return AArch64::LDPQpost; + case AArch64::LDPWi: + return AArch64::LDPWpost; + case AArch64::LDPXi: + return AArch64::LDPXpost; + case AArch64::STPSi: + return AArch64::STPSpost; 
+ case AArch64::STPDi: + return AArch64::STPDpost; + case AArch64::STPQi: + return AArch64::STPQpost; + case AArch64::STPWi: + return AArch64::STPWpost; + case AArch64::STPXi: + return AArch64::STPXpost; } } +static bool isPairedLdSt(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + return true; + } +} + +static const MachineOperand &getLdStRegOp(const MachineInstr *MI, + unsigned PairedRegOp = 0) { + assert(PairedRegOp < 2 && "Unexpected register operand idx."); + unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 2 : 1; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 3 : 2; + return MI->getOperand(Idx); +} + +static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); + int LoadSize = getMemScale(LoadInst); + int StoreSize = getMemScale(StoreInst); + int UnscaledStOffset = isUnscaledLdSt(StoreInst) + ? getLdStOffsetOp(StoreInst).getImm() + : getLdStOffsetOp(StoreInst).getImm() * StoreSize; + int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + ? getLdStOffsetOp(LoadInst).getImm() + : getLdStOffsetOp(LoadInst).getImm() * LoadSize; + return (UnscaledStOffset <= UnscaledLdOffset) && + (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - bool MergeForward, int SExtIdx) { + const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -338,25 +626,26 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (NextI == Paired) ++NextI; + int SExtIdx = Flags.getSExtIdx(); unsigned Opc = SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdst(Opc); - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + bool IsUnscaled = isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + bool MergeForward = Flags.getMergeForward(); unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. - MachineOperand &BaseRegOp = - MergeForward ? Paired->getOperand(1) : I->getOperand(1); + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (I->getOperand(2).getImm() == - Paired->getOperand(2).getImm() + OffsetStride) { + if (getLdStOffsetOp(I).getImm() == + getLdStOffsetOp(Paired).getImm() + OffsetStride) { RtMI = Paired; Rt2MI = I; // Here we swapped the assumption made for SExtIdx. 
@@ -368,18 +657,132 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, RtMI = I; Rt2MI = Paired; } - // Handle Unscaled - int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableAArch64UnscaledMemOp) - OffsetImm /= OffsetStride; + + int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + + if (isNarrowLoad(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MachineInstr *RtNewDest = MergeForward ? I : Paired; + // When merging small (< 32 bit) loads for big-endian targets, the order of + // the component parts gets swapped. + if (!Subtarget->isLittleEndian()) + std::swap(RtMI, Rt2MI); + // Construct the new load instruction. + MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; + NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); + + DEBUG( + dbgs() + << "Creating the new load and extract. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG((NewMemMI)->print(dbgs())); + + int Width = getMemScale(I) == 1 ? 8 : 16; + int LSBLow = 0; + int LSBHigh = Width; + int ImmsLow = LSBLow + Width - 1; + int ImmsHigh = LSBHigh + Width - 1; + MachineInstr *ExtDestMI = MergeForward ? Paired : I; + if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { + // Create the bitfield extract for high bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + } else { + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + + // Create the bitfield extract for high bits. 
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + } + DEBUG(dbgs() << " "); + DEBUG((BitExtMI1)->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI2)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + return NextI; + } // Construct the new instruction. - MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, - I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(RtMI->getOperand(0)) - .addOperand(Rt2MI->getOperand(0)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + MachineInstrBuilder MIB; + if (isNarrowStore(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(I)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); + } else { + // Handle Unscaled + if (IsUnscaled) + OffsetImm /= OffsetStride; + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtMI)) + .addOperand(getLdStRegOp(Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + } + (void)MIB; // FIXME: Do we need/want to copy the mem operands from the source @@ -439,13 +842,112 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, return NextI; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI) { + MachineBasicBlock::iterator NextI = LoadI; + ++NextI; + + int LoadSize = getMemScale(LoadI); + int StoreSize = getMemScale(StoreI); + unsigned LdRt = getLdStRegOp(LoadI).getReg(); + unsigned StRt = getLdStRegOp(StoreI).getReg(); + bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); + + assert((IsStoreXReg || + TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && + "Unexpected RegClass"); + + MachineInstr *BitExtMI; + if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { + // Remove the load, if the destination register of the loads is the same + // register for stored value. + if (StRt == LdRt && LoadSize == 8) { + DEBUG(dbgs() << "Remove load instruction:\n "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << "\n"); + LoadI->eraseFromParent(); + return NextI; + } + // Replace the load with a mov if the load and store are in the same size. + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) + .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) + .addReg(StRt) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // FIXME: Currently we disable this transformation in big-endian targets as + // performance and correctness are verified only in little-endian. + if (!Subtarget->isLittleEndian()) + return NextI; + bool IsUnscaled = isUnscaledLdSt(LoadI); + assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + assert(LoadSize <= StoreSize && "Invalid load size"); + int UnscaledLdOffset = IsUnscaled + ? getLdStOffsetOp(LoadI).getImm() + : getLdStOffsetOp(LoadI).getImm() * LoadSize; + int UnscaledStOffset = IsUnscaled + ? 
getLdStOffsetOp(StoreI).getImm() + : getLdStOffsetOp(StoreI).getImm() * StoreSize; + int Width = LoadSize * 8; + int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + int Imms = Immr + Width - 1; + unsigned DestReg = IsStoreXReg + ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, + &AArch64::GPR64RegClass) + : LdRt; + + assert((UnscaledLdOffset >= UnscaledStOffset && + (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && + "Invalid offset"); + + Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + Imms = Immr + Width - 1; + if (UnscaledLdOffset == UnscaledStOffset) { + uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N + | ((Immr) << 6) // immr + | ((Imms) << 0) // imms + ; + + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), + DestReg) + .addReg(StRt) + .addImm(AndMaskEncoded); + } else { + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), + DestReg) + .addReg(StRt) + .addImm(Immr) + .addImm(Imms); + } + } + + DEBUG(dbgs() << "Promoting load by replacing :\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + LoadI->eraseFromParent(); + return NextI; +} + /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); @@ -464,16 +966,12 @@ static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, } static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled into an "element" offset used - // by the scaled pair load/store instructions. - int ElemOffset = Offset / OffsetStride; - if (ElemOffset > 63 || ElemOffset < -64) - return false; - } - return true; + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + if (IsUnscaled) + Offset /= OffsetStride; + + return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, @@ -507,12 +1005,65 @@ static bool mayAlias(MachineInstr *MIa, return false; } +bool AArch64LoadStoreOpt::findMatchingStore( + MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI) { + MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + + // Track which registers have been modified and used between the first insn + // and the second insn. 
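The immr/imms arithmetic in promoteLoadFromStore above is dense, so here is a minimal, self-contained C++ sketch of the value it computes. This is an illustration only, not pass code: the helper name is invented and the little-endian layout is an assumption of the example.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Model of the UBFM the pass emits: immr = 8 * (LdOff - StOff) and
// imms = immr + 8 * LoadSize - 1 select exactly the bytes the narrow load
// would have read out of the stored value (little-endian layout assumed).
static uint32_t extractLoadedBits(uint32_t StoredVal, int LdOff, int StOff,
                                  int LoadSize) {
  assert(LdOff >= StOff && "load must begin inside the stored bytes");
  int Immr = 8 * (LdOff - StOff);                  // right-shift amount
  int Width = 8 * LoadSize;                        // bits the load reads
  uint32_t Mask = Width == 32 ? ~0u : ((1u << Width) - 1);
  return (StoredVal >> Immr) & Mask;               // ubfm/lsr equivalent
}

int main() {
  // str w1, [x0, #4] ; ldrh w2, [x0, #6]  -->  lsr w2, w1, #16
  std::printf("%#x\n", extractLoadedBits(0xAABBCCDDu, 6, 4, 2)); // 0xaabb
}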
+ BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + + for (unsigned Count = 0; MBBI != E && Count < Limit;) { + --MBBI; + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + // Now that we know this is a real instruction, count it. + ++Count; + + // If the load instruction reads directly from the address to which the + // store instruction writes and the stored value is not modified, we can + // promote the load. Since we do not handle stores with pre-/post-index, + // it's unnecessary to check if BaseReg is modified by the store itself. + if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + BaseReg == getLdStBaseOp(MI).getReg() && + isLdOffsetInRangeOfSt(FirstMI, MI) && + !ModifiedRegs[getLdStRegOp(MI).getReg()]) { + StoreI = MBBI; + return true; + } + + if (MI->isCall()) + return false; + + // Update modified / used register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return false; + + // If we encounter a store aliased with the load, return early. + if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + return false; + } + return false; +} + /// findMatchingInsn - Scan the instructions looking for a load/store that can /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, - unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; @@ -520,21 +1071,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, unsigned Opc = FirstMI->getOpcode(); bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdst(Opc); - unsigned Reg = FirstMI->getOperand(0).getReg(); - unsigned BaseReg = FirstMI->getOperand(1).getReg(); - int Offset = FirstMI->getOperand(2).getImm(); + bool IsUnscaled = isUnscaledLdSt(FirstMI); + unsigned Reg = getLdStRegOp(FirstMI).getReg(); + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + int Offset = getLdStOffsetOp(FirstMI).getImm(); + bool IsNarrowStore = isNarrowStore(Opc); + + // For narrow stores, find only the case where the stored value is WZR. + if (IsNarrowStore && Reg != AArch64::WZR) + return E; // Early exit if the first instruction modifies the base register. // e.g., ldr x0, [x0] - // Early exit if the offset is not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1 if (FirstMI->modifiesRegister(BaseReg, TRI)) return E; - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + + // Early exit if the offset is not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + int OffsetStride = IsUnscaled ? 
getMemScale(FirstMI) : 1; + if (!(isNarrowLoad(Opc) || IsNarrowStore) && + !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) return E; // Track which registers have been modified and used between the first insn @@ -557,18 +1114,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, ++Count; bool CanMergeOpc = Opc == MI->getOpcode(); - SExtIdx = -1; + Flags.setSExtIdx(-1); if (!CanMergeOpc) { bool IsValidLdStrOpc; unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - if (!IsValidLdStrOpc) - continue; + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); // Opc will be the first instruction in the pair. - SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0; + Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); } - if (CanMergeOpc && MI->getOperand(2).isImm()) { + if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { + assert(MI->mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just @@ -579,8 +1137,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Pairwise instructions have a 7-bit signed offset field. Single insns // have a 12-bit unsigned offset field. To be a valid combine, the // final offset must be in range. - unsigned MIBaseReg = MI->getOperand(1).getReg(); - int MIOffset = MI->getOperand(2).getImm(); + unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + int MIOffset = getLdStOffsetOp(MI).getImm(); if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; @@ -591,30 +1149,43 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // If the resultant immediate offset of merging these instructions // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + bool MIIsUnscaled = isUnscaledLdSt(MI); + bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); + if (!IsNarrowLoad && + !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. - if (IsUnscaled && EnableAArch64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) + + if (IsNarrowLoad || IsNarrowStore) { + // If the alignment requirements of the scaled wide load/store + // instruction can't express the offset of the scaled narrow + // input, bail and keep looking. + if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(MI); - continue; + continue; + } + } else { + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. 
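For intuition, here is a rough standalone model of the adjacency and range tests this scan performs. It is hedged: the function names are invented, and the model ignores the register, alignment, and alias checks the real code layers on top.

#include <cstdio>

// Two offsets can pair when they are exactly one element apart and the lower
// one fits LDP/STP's signed 7-bit scaled immediate (mirrors inBoundsForPair).
static bool inBoundsForPairModel(bool IsUnscaled, int Offset,
                                 int OffsetStride) {
  if (IsUnscaled)
    Offset /= OffsetStride; // byte offset -> element offset
  return Offset <= 63 && Offset >= -64;
}

static bool offsetsCanPair(int OffsetA, int OffsetB, int OffsetStride,
                           bool IsUnscaled) {
  if (OffsetA != OffsetB + OffsetStride && OffsetA + OffsetStride != OffsetB)
    return false; // not adjacent elements
  int MinOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  return inBoundsForPairModel(IsUnscaled, MinOffset, OffsetStride);
}

int main() {
  // ldr x0, [x2] and ldr x1, [x2, #8] carry scaled immediates 0 and 1.
  std::printf("%d\n", offsetsCanPair(0, 1, /*OffsetStride=*/1,
                                     /*IsUnscaled=*/false)); // 1
}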
+ if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - if (MayLoad && Reg == MI->getOperand(0).getReg()) { + // For narrow stores, allow only when the stored value is the same + // (i.e., WZR). + if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || + (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } @@ -622,10 +1193,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // the two instructions and none of the instructions between the second // and first alias with the second, we can combine the second into the // first. - if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !(MI->mayLoad() && UsedRegs[MI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && + !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { - MergeForward = false; + Flags.setMergeForward(false); return MBBI; } @@ -633,11 +1204,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // between the two instructions and none of the instructions between the // first and the second alias with the first, we can combine the first // into the second. - if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !(FirstMI->mayLoad() && - UsedRegs[FirstMI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && + !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && !mayAlias(FirstMI, MemInsns, TII)) { - MergeForward = true; + Flags.setMergeForward(true); return MBBI; } // Unable to combine these instructions due to interference in between. @@ -666,51 +1236,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, } MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == AArch64::ADDXri || - Update->getOpcode() == AArch64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. 
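The two symmetric legality tests above decide the merge direction. The following condensed sketch (plain C++, with invented names, and with booleans standing in for the bitvector and memory-alias queries) captures just that decision.

// MergeForward == false: pull the second instruction up to the first;
// MergeForward == true: push the first instruction down to the second.
struct MergeDecision {
  bool CanMerge;
  bool MergeForward;
};

static MergeDecision chooseMergeDirection(bool SecondRegModified,
                                          bool SecondLoadRegUsed,
                                          bool SecondAliasesInBetween,
                                          bool FirstRegModified,
                                          bool FirstLoadRegUsed,
                                          bool FirstAliasesInBetween) {
  if (!SecondRegModified && !SecondLoadRegUsed && !SecondAliasesInBetween)
    return {true, false};
  if (!FirstRegModified && !FirstLoadRegUsed && !FirstAliasesInBetween)
    return {true, true};
  return {false, false}; // interference in between: keep scanning or give up
}

int main() {
  return chooseMergeDirection(false, false, false,
                              true, true, true).MergeForward ? 1 : 0;
}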
- if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == AArch64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( - MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { +AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + bool IsPreIdx) { assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); @@ -723,20 +1251,36 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); + "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); + unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) + : getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB; + if (!isPairedLdSt(I)) { + // Non-paired instruction. + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value); + } else { + // Paired instruction. 
+ int Scale = getMemScale(I); + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I, 0)) + .addOperand(getLdStRegOp(I, 1)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value / Scale); + } (void)MIB; - DEBUG(dbgs() << "Creating post-indexed load/store."); + if (IsPreIdx) + DEBUG(dbgs() << "Creating pre-indexed load/store."); + else + DEBUG(dbgs() << "Creating post-indexed load/store."); DEBUG(dbgs() << " Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); @@ -752,8 +1296,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( return NextI; } -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, + MachineInstr *MI, + unsigned BaseReg, int Offset) { switch (MI->getOpcode()) { default: break; @@ -769,44 +1314,65 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, // Watch out for 1 << 12 shifted value. if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; + + // The update instruction source and destination register must be the + // same as the load/store base register. + if (MI->getOperand(0).getReg() != BaseReg || + MI->getOperand(1).getReg() != BaseReg) + break; + + bool IsPairedInsn = isPairedLdSt(MemMI); + int UpdateOffset = MI->getOperand(2).getImm(); + // For non-paired load/store instructions, the immediate must fit in a + // signed 9-bit integer. + if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + break; + + // For paired load/store instructions, the immediate must be a multiple of + // the scaling factor. The scaled offset must also fit into a signed 7-bit + // integer. + if (IsPairedInsn) { + int Scale = getMemScale(MemMI); + if (UpdateOffset % Scale != 0) + break; + + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > 63 || ScaledOffset < -64) + break; } + + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; break; } return false; } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); - // If the base register overlaps the destination register, we can't - // merge the update. 
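A small self-contained restatement of the immediate constraints just encoded (illustrative only; the helper name is ours): non-paired writeback forms take a signed 9-bit byte offset, paired forms a signed 7-bit offset in units of the element size.

#include <cstdio>

static bool isFoldableUpdateImm(bool IsPaired, int ScaleBytes,
                                int UpdateBytes) {
  if (!IsPaired)
    return UpdateBytes <= 255 && UpdateBytes >= -256; // simm9, in bytes
  if (UpdateBytes % ScaleBytes != 0)
    return false;              // must advance a whole number of elements
  int Scaled = UpdateBytes / ScaleBytes;
  return Scaled <= 63 && Scaled >= -64;               // simm7, in elements
}

int main() {
  // add x2, x2, #32 after an X-register LDP (8-byte elements): 32/8 = 4,
  // which is in range, so the add can fold into a writeback form.
  std::printf("%d\n", isFoldableUpdateImm(true, 8, 32)); // 1
}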
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + // Scan forward looking for post-index opportunities. Updating instructions + // can't be formed if the memory instruction doesn't have the offset we're + // looking for. + if (MIUnscaledOffset != UnscaledOffset) return E; - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; + // If the base register overlaps a destination register, we can't + // merge the update. + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -825,7 +1391,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) + if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -845,21 +1411,22 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. if (MBBI == B || Offset != 0) return E; - // If the base register overlaps the destination register, we can't + // If the base register overlaps a destination register, we can't // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -878,7 +1445,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -892,17 +1459,101 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } -bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { +bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + // If this is a volatile load, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm. + // FIXME: It is possible to extend it to handle reg+reg cases. 
+ if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Look backward up to ScanLimit instructions. + MachineBasicBlock::iterator StoreI; + if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + ++NumLoadsFromStoresPromoted; + // Promote the load. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = promoteLoadFromStore(MBBI, StoreI); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::tryToMergeLdStInst( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + MachineBasicBlock::iterator E = MI->getParent()->end(); + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Check if this load/store has a hint to avoid pair formation. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) + return false; + + // Look ahead up to ScanLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); + if (Paired != E) { + if (isNarrowLoad(MI)) { + ++NumNarrowLoadsPromoted; + } else if (isNarrowStore(MI)) { + ++NumZeroStoresPromoted; + } else { + ++NumPairCreated; + if (isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + } + + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, Flags); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, + bool enableNarrowLdOpt) { bool Modified = false; - // Two transformations to do here: - // 1) Find loads and stores that can be merged into a single load or store + // Four transformations to do here: + // 1) Find loads that directly read from stores and promote them by + // replacing with mov instructions. If the store is wider than the load, + // the load will be replaced with a bitfield extract. + // e.g., + // str w1, [x0, #4] + // ldrh w2, [x0, #6] + // ; becomes + // str w1, [x0, #4] + // lsr w2, w1, #16 + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., // ldr x0, [x2] // ldr x1, [x2, #8] // ; becomes // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store + // 4) Find base register updates that can be merged into the load or store // as a base-reg writeback. // e.g., // ldr x0, [x2] @@ -918,6 +1569,69 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + if (tryToPromoteLoadFromStore(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. 
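Condensing the gating at the top of tryToPromoteLoadFromStore and tryToMergeLdStInst above into one place (a sketch with invented field names, not LLVM API): a memory instruction is even considered only when all three gates pass.

struct MemCandidate {
  bool HasOrderedMemRef; // volatile/atomic ordering: leave it alone
  bool OffsetIsImm;      // reg+imm addressing, not an address relocation
  bool PairSuppressed;   // hint set by AArch64StorePairSuppress
};

static bool isMergeCandidate(const MemCandidate &C) {
  return !C.HasOrderedMemRef && C.OffsetIsImm && !C.PairSuppressed;
}

int main() {
  MemCandidate C{false, true, false};
  return isMergeCandidate(C) ? 0 : 1;
}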
+ } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + enableNarrowLdOpt && MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::STRBBui: + case AArch64::STRHHui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::STURBBi: + case AArch64::STURHHi: { + if (tryToMergeLdStInst(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -929,7 +1643,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: - // do the unscaled versions as well + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -941,37 +1655,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - bool MergeForward = false; - int SExtIdx = -1; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx); - + if (tryToMergeLdStInst(MBBI)) { Modified = true; - ++NumPairCreated; - if (isUnscaledLdst(MI->getOpcode())) - ++NumUnscaledPairCreated; break; } ++MBBI; @@ -992,17 +1677,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - // do the unscaled versions as well + case AArch64::LDRHHui: + case AArch64::LDRBBui: + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -1012,25 +1702,41 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: + // Paired instructions. 
+ case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: { // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { + if (!getLdStOffsetOp(MI).isImm()) { ++MBBI; break; } - // Look ahead up to ScanLimit instructions for a mergable instruction. + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); Modified = true; ++NumPostFolded; break; } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdst(Opc)) { + if (isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1043,28 +1749,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; } + // The immediate in the load/store is scaled by the size of the memory + // operation. The immediate in the add we're looking for, + // however, is not, so adjust here. + int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); // Look forward to try to find a post-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); + Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; @@ -1081,13 +1784,24 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { return Modified; } +bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { + bool ProfitableArch = Subtarget->isCortexA57(); + // FIXME: The benefit from converting narrow loads into a wider load could be + // microarchitectural as it assumes that a single load with two bitfield + // extracts is cheaper than two narrow loads. Currently, this conversion is + // enabled only in cortex-a57 on which performance benefits were verified. 
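As a reminder of the unit mismatch the UnscaledOffset computation above corrects (a sketch; the helper name is invented): scaled loads and stores encode their immediate in elements, while unscaled (LDUR/STUR) forms and the add/sub being folded count bytes.

#include <cstdio>

static int byteOffsetOfMemImm(int Imm, int MemScaleBytes, bool IsUnscaled) {
  return IsUnscaled ? Imm : Imm * MemScaleBytes;
}

int main() {
  // ldr x1, [x0, #64] carries machine immediate 8 (elements of 8 bytes),
  // so it matches "add x0, x0, #64" only after scaling: 8 * 8 == 64.
  std::printf("%d\n", byteOffsetOfMemImm(8, 8, /*IsUnscaled=*/false)); // 64
}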
+ return ProfitableArch && !Subtarget->requiresStrictAlign(); +} + bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo()); - TRI = Fn.getSubtarget().getRegisterInfo(); + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); + TRI = Subtarget->getRegisterInfo(); bool Modified = false; + bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MBB, enableNarrowLdOpt); return Modified; } @@ -1095,8 +1809,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. +/// createAArch64LoadStoreOptimizationPass - returns an instance of the +/// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { return new AArch64LoadStoreOpt(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 580427a..2b4cdf1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -207,9 +207,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) + if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h deleted file mode 100644 index 4164b33..0000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h +++ /dev/null @@ -1,42 +0,0 @@ -//===- AArch64MachineCombinerPattern.h -===// -//===- AArch64 instruction pattern supported by combiner -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines instruction pattern supported by combiner -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H - -namespace llvm { - -/// Enumeration of instruction pattern supported by machine combiner -/// -/// -namespace MachineCombinerPattern { -enum MC_PATTERN : int { - MC_NONE = 0, - MC_MULADDW_OP1 = 1, - MC_MULADDW_OP2 = 2, - MC_MULSUBW_OP1 = 3, - MC_MULSUBW_OP2 = 4, - MC_MULADDWI_OP1 = 5, - MC_MULSUBWI_OP1 = 6, - MC_MULADDX_OP1 = 7, - MC_MULADDX_OP2 = 8, - MC_MULSUBX_OP1 = 9, - MC_MULSUBX_OP2 = 10, - MC_MULADDXI_OP1 = 11, - MC_MULSUBXI_OP1 = 12 -}; -} // end namespace MachineCombinerPattern -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 536a8d0..318f839 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=// +//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -42,7 +42,7 @@ class AArch64FunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackToRestore; /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). + /// determineCalleeSaves(). bool HasStackFrame; /// \brief Amount of stack frame size, not including callee-saved registers. @@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// registers. unsigned VarArgsFPRSize; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) { (void)MF; } @@ -96,6 +102,9 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index e1b93bf..79c09d9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -489,7 +489,7 @@ bool AArch64PromoteConstant::insertDefinitions( for (const auto &IPI : InsertPts) { // Create the load of the global variable. 
- IRBuilder<> Builder(IPI.first->getParent(), IPI.first); + IRBuilder<> Builder(IPI.first); LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); DEBUG(dbgs() << "**********\n"); DEBUG(dbgs() << "New def: "); @@ -540,7 +540,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) { bool LocalChange = false; SmallPtrSet<Constant *, 8> AlreadyChecked; - for (Instruction &I : inst_range(&F)) { + for (Instruction &I : instructions(&F)) { // Traverse the operand, looking for constant vectors. Replace them by a // load of a global variable of constant vector type. for (Value *Op : I.operand_values()) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 841af55..32b4888 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -34,10 +35,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" -static cl::opt<bool> -ReserveX18("aarch64-reserve-x18", cl::Hidden, - cl::desc("Reserve X18, making it unavailable as GPR")); - AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} @@ -50,10 +47,23 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? 
+ CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : + CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()) + return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -62,6 +72,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_RegMask; else return CSR_AArch64_AAPCS_RegMask; } @@ -104,7 +116,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AArch64::W29); } - if (TT.isOSDarwin() || ReserveX18) { + if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) { Reserved.set(AArch64::X18); // Platform register Reserved.set(AArch64::W18); } @@ -131,7 +143,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, return true; case AArch64::X18: case AArch64::W18: - return TT.isOSDarwin() || ReserveX18; + return MF.getSubtarget<AArch64Subtarget>().isX18Reserved(); case AArch64::FP: case AArch64::W29: return TFI->hasFP(MF) || TT.isOSDarwin(); @@ -186,29 +198,6 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } -bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const { - - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - return true; -} - -// FIXME: share this with other backends with identical implementation? 
-bool -AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -424,10 +413,11 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case AArch64::GPR64RegClassID: case AArch64::GPR32commonRegClassID: case AArch64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP - - (TT.isOSDarwin() || ReserveX18) // X18 reserved as platform register - - hasBasePointer(MF); // X19 + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP + - MF.getSubtarget<AArch64Subtarget>() + .isX18Reserved() // X18 reserved as platform register + - hasBasePointer(MF); // X19 case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: case AArch64::FPR32RegClassID: diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 8c379d9..f33f788 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -35,6 +35,8 @@ public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; @@ -93,9 +95,6 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - // Base pointer (stack realignment) support. - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index b2efca0..a8c8b17 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64", // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 486efd6..f6ee8cf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -31,6 +31,11 @@ static cl::opt<bool> EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " "converter pass"), cl::init(true), cl::Hidden); +// If OS supports TBI, use this flag to enable it. 
+static cl::opt<bool> +UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " + "an address is ignored"), cl::init(false), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { // Determine default and user-specified characteristics @@ -46,9 +51,11 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), - HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), + HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), + HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), + CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), TLInfo(TM, *this) {} @@ -113,12 +120,30 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; Policy.OnlyBottomUp = false; + // Enabling or disabling the latency heuristic is a close call: it seems to + // help nearly no benchmark on out-of-order architectures; on the other hand, + // it regresses register pressure on a few benchmarks. + if (isCyclone()) + Policy.DisableLatencyHeuristic = true; } bool AArch64Subtarget::enableEarlyIfConversion() const { return EnableEarlyIfConvert; } +bool AArch64Subtarget::supportsAddressTopByteIgnored() const { + if (!UseAddressTopByteIgnored) + return false; + + if (TargetTriple.isiOS()) { + unsigned Major, Minor, Micro; + TargetTriple.getiOSVersion(Major, Minor, Micro); + return Major >= 8; + } + + return false; +} + std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { if (!isCortexA57()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index 6bb0694..151133b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -33,17 +33,28 @@ class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + enum ARMProcFamilyEnum { + Others, + CortexA35, + CortexA53, + CortexA57, + Cyclone, + ExynosM1 + }; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; bool HasV8_1aOps; + bool HasV8_2aOps; bool HasFPARMv8; bool HasNEON; bool HasCrypto; bool HasCRC; + bool HasPerfMon; + bool HasFullFP16; + bool HasSPE; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove; @@ -51,6 +62,12 @@ protected: // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. bool HasZeroCycleZeroing; + // StrictAlign - Disallow unaligned memory accesses. + bool StrictAlign; + + // ReserveX18 - X18 is not available as a general purpose register. + bool ReserveX18; + bool IsLittle; /// CPUString - String name of used CPU. 
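For background on what the aarch64-use-tbi flag above assumes (an illustrative sketch, not LLVM code): with top-byte-ignore, bits [63:56] of a pointer do not participate in address translation, so software can carry a tag there. This sketch only packs and strips the bits and never dereferences, so it runs anywhere.

#include <cstdint>
#include <cstdio>

static uint64_t withTopByteTag(uint64_t Addr, uint8_t Tag) {
  return (Addr & 0x00FFFFFFFFFFFFFFull) | (uint64_t(Tag) << 56);
}

static uint64_t stripTopByteTag(uint64_t Tagged) {
  return Tagged & 0x00FFFFFFFFFFFFFFull;
}

int main() {
  uint64_t P = 0x00007FDEADBEEF00ull;
  uint64_t T = withTopByteTag(P, 0x2A);
  std::printf("%#llx %#llx\n", (unsigned long long)T,
              (unsigned long long)stripTopByteTag(T)); // tagged, then clean
}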
@@ -92,19 +109,30 @@ public: const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isCortexA53() || isCortexA57(); + return isGeneric() || isCortexA53() || isCortexA57(); } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + bool requiresStrictAlign() const { return StrictAlign; } + + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + /// CPU has TBI (top byte of addresses is ignored during HW address + /// translation) and OS enables it. + bool supportsAddressTopByteIgnored() const; + + bool hasPerfMon() const { return HasPerfMon; } + bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } bool isLittleEndian() const { return IsLittle; } @@ -112,14 +140,17 @@ public: bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isGeneric() const { return CPUString == "generic"; } bool isCyclone() const { return CPUString == "cyclone"; } bool isCortexA57() const { return CPUString == "cortex-a57"; } bool isCortexA53() const { return CPUString == "cortex-a53"; } + bool isExynosM1() const { return CPUString == "exynos-m1"; } bool useAA() const override { return isCortexA53(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index db6e244..c52c554 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -203,7 +203,7 @@ public: } // namespace TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(AArch64TTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e085cca..9af0e64 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,7 +23,7 @@ using namespace llvm; /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { +int AArch64TTIImpl::getIntImmCost(int64_t Val) { // Check if the immediate can be encoded within an instruction. if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) return 0; @@ -37,7 +37,7 @@ unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { } /// \brief Calculate the cost of materializing the given constant. 
-unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -51,18 +51,18 @@ unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialize the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -118,17 +118,17 @@ unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return AArch64TTIImpl::getIntImmCost(Imm, Ty); } -unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -147,10 +147,10 @@ unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: if (Idx == 1) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<int>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } break; @@ -176,8 +176,7 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) { +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -187,7 +186,31 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); - static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = { + static const TypeConversionCostTblEntry + ConversionTbl[] = { + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + + // The number of shll instructions for the extension. 
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + // LowerVectorINT_TO_FP: { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, @@ -210,6 +233,16 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + // Complex: to v8f32 + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + + // Complex: to v16f32 + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + // Complex: to v2f64 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, @@ -250,22 +283,21 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, }; - int Idx = ConvertCostTableLookup<MVT>( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -281,15 +313,15 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } // All other insert/extracts cost this much. - return 2; + return 3; } -unsigned AArch64TTIImpl::getArithmeticInstrCost( +int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -300,10 +332,9 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties may not be the same as those of the previous // operation; conservatively assume OP_None. 
- unsigned Cost = - getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -331,7 +362,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( } } -unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -346,19 +377,20 @@ unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return 1; } -unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects well that are wider than the register width. + // We don't lower some vector selects well that are wider than the register + // width. if (ValTy->isVectorTy() && ISD == ISD::SELECT) { // We would need this many instructions to hide the scalarization happening. - const unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> + const int AmortizationCost = 20; + static const TypeConversionCostTblEntry VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, + { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, + { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } @@ -367,20 +399,18 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; } } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isIntegerTy(64)) { @@ -389,7 +419,7 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // practice on inlined memcpy code. 
// We make v2i64 stores expensive so that we will only vectorize if there // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; + int AmortizationCost = 6; return LT.first * 2 * AmortizationCost; } @@ -407,16 +437,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace) { +int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // ldN/stN only support legal vector types of size 64 or 128 in bits. if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) @@ -427,8 +459,8 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( Alignment, AddressSpace); } -unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { - unsigned Cost = 0; +int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { + int Cost = 0; for (auto *I : Tys) { if (!I->isVectorTy()) continue; @@ -506,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; @@ -515,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 444d3cc..ec58c4f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -48,7 +48,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { }; public: - explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F) + explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -63,12 +63,11 @@ public: /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(int64_t Val); - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(int64_t Val); + int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} @@ -76,6 +75,8 @@ public: /// \name Vector TTI Implementations /// @{ + bool enableInterleavedAccessVectorization() { 
return true; } + unsigned getNumberOfRegisters(bool Vector) { if (Vector) { if (ST->hasNEON()) @@ -96,25 +97,25 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getAddressComputationCost(Type *Ty, bool IsComplex); + int getAddressComputationCost(Type *Ty, bool IsComplex); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); - unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); + int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -123,11 +124,9 @@ public: bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 38e8b4d..394c8e7 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -43,7 +43,6 @@ class AArch64Operand; class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; // Map of register aliases registers via the .req directive. StringMap<std::pair<bool, unsigned> > RegisterReqs; @@ -101,6 +100,7 @@ private: OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); OperandMatchResultTy tryParseFPImm(OperandVector &Operands); @@ -115,16 +115,16 @@ public: #define GET_OPERAND_DIAGNOSTIC_TYPES #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI) { + : MCTargetAsmParser(Options, STI) { MCAsmParserExtension::Initialize(Parser); MCStreamer &S = getParser().getStreamer(); if (S.getTargetStreamer() == nullptr) new AArch64TargetStreamer(S); // Initialize the set of available features. 
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -160,7 +160,8 @@ private: k_Prefetch, k_ShiftExtend, k_FPImm, - k_Barrier + k_Barrier, + k_PSBHint, } Kind; SMLoc StartLoc, EndLoc; @@ -228,6 +229,12 @@ private: unsigned Length; }; + struct PSBHintOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + struct ShiftExtendOp { AArch64_AM::ShiftExtendType Type; unsigned Amount; @@ -251,6 +258,7 @@ private: struct SysRegOp SysReg; struct SysCRImmOp SysCRImm; struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; struct ShiftExtendOp ShiftExtend; }; @@ -302,6 +310,9 @@ public: case k_Prefetch: Prefetch = o.Prefetch; break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -393,6 +404,16 @@ public: return Prefetch.Val; } + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + StringRef getPrefetchName() const { assert(Kind == k_Prefetch && "Invalid access!"); return StringRef(Prefetch.Data, Prefetch.Length); @@ -497,6 +518,15 @@ public: return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } + bool isImm0_1() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 2); + } bool isImm0_7() const { if (!isImm()) return false; @@ -876,12 +906,15 @@ public: } bool isMSRSystemRegister() const { if (!isSysReg()) return false; - return SysReg.MSRReg != -1U; } - bool isSystemPStateField() const { + bool isSystemPStateFieldWithImm0_1() const { if (!isSysReg()) return false; - + return (SysReg.PStateField == AArch64PState::PAN || + SysReg.PStateField == AArch64PState::UAO); + } + bool isSystemPStateFieldWithImm0_15() const { + if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; return SysReg.PStateField != -1U; } bool isReg() const override { return Kind == k_Register && !Reg.isVector; } @@ -950,6 +983,7 @@ public: } bool isSysCR() const { return Kind == k_SysCR; } bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -1175,8 +1209,10 @@ public: template <unsigned NumRegs> void addVectorList64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, - AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + static const unsigned FirstRegs[] = { AArch64::D0, + AArch64::D0_D1, + AArch64::D0_D1_D2, + AArch64::D0_D1_D2_D3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1186,8 +1222,10 @@ public: template <unsigned NumRegs> void addVectorList128Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, - AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + static const unsigned FirstRegs[] = { AArch64::Q0, + AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, + AArch64::Q0_Q1_Q2_Q3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1304,6 +1342,12 @@ public: 
Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16)); } + void addImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + void addImm0_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); @@ -1491,7 +1535,13 @@ public: Inst.addOperand(MCOperand::createImm(SysReg.MSRReg)); } - void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); + } + + void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); @@ -1507,6 +1557,11 @@ public: Inst.addOperand(MCOperand::createImm(getPrefetch())); } + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -1703,6 +1758,19 @@ public: return Op; } + static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + static std::unique_ptr<AArch64Operand> CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { @@ -1776,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << "<prfop invalid #" << getPrefetch() << ">"; break; } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } case k_ShiftExtend: { OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); @@ -1849,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) { .Case(".h", true) .Case(".s", true) .Case(".d", true) + // Needed for fp16 scalar pairwise reductions + .Case(".2h", true) .Default(false); } @@ -2016,7 +2090,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, S, getContext())); return MatchOperand_Success; @@ -2030,7 +2104,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); unsigned prfop = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; @@ -2042,6 +2116,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_Success; } +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = 
getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. AArch64AsmParser::OperandMatchResultTy @@ -2439,6 +2539,13 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("cisw")) { // SYS #0, C7, C14, #2 SYS_ALIAS(0, 7, 14, 2); + } else if (!Op.compare_lower("cvap")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #3, C7, C12, #1 + SYS_ALIAS(3, 7, 12, 1); + } else { + return TokError("DC CVAP requires ARMv8.2a"); + } } else { return TokError("invalid operand for DC instruction"); } @@ -2479,6 +2586,20 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("s12e0w")) { // SYS #4, C7, C8, #7 SYS_ALIAS(4, 7, 8, 7); + } else if (!Op.compare_lower("s1e1rp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #0 + SYS_ALIAS(0, 7, 9, 0); + } else { + return TokError("AT S1E1RP requires ARMv8.2a"); + } + } else if (!Op.compare_lower("s1e1wp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #1 + SYS_ALIAS(0, 7, 9, 1); + } else { + return TokError("AT S1E1WP requires ARMv8.2a"); + } } else { return TokError("invalid operand for AT instruction"); } @@ -2644,7 +2765,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, ExprLoc, getContext())); return MatchOperand_Success; @@ -2658,7 +2779,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); unsigned Opt = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; @@ -2687,20 +2808,21 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { bool IsKnown; auto MRSMapper = AArch64SysReg::MRSMapper(); - uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MRSReg != -1U) && "register should be -1 if and only if it's unknown"); auto MSRMapper = AArch64SysReg::MSRMapper(); - uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MSRReg != -1U) && "register should be -1 if and only if it's unknown"); auto PStateMapper = AArch64PState::PStateMapper(); uint32_t PStateField = - 
PStateMapper.fromString(Tok.getString(), STI.getFeatureBits(), IsKnown); + PStateMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (PStateField != -1U) && "register should be -1 if and only if it's unknown"); @@ -3151,7 +3273,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, if (Operands.size() < 2 || !static_cast<AArch64Operand &>(*Operands[1]).isReg()) - return true; + return Error(Loc, "Only valid when first operand is register"); bool IsXReg = AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( @@ -3183,7 +3305,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, } // If it is a label or an imm that cannot fit in a movz, put it into CP. const MCExpr *CPLoc = - getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4); + getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc); Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx)); return false; } @@ -3601,6 +3723,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); case Match_InvalidMemoryIndexed16: return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_1: + return Error(Loc, "immediate must be an integer in range [0, 1]."); case Match_InvalidImm0_7: return Error(Loc, "immediate must be an integer in range [0, 7]."); case Match_InvalidImm0_15: @@ -3912,7 +4036,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { unsigned zreg = - AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains( RegOp.getReg()) ? AArch64::WZR : AArch64::XZR; @@ -3929,10 +4053,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If that fails, try against the alternate table containing long-form NEON: // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) + if (MatchResult != Match_Success) { + // But first, save the short-form match result: we can use it in case the + // long-form match also fails. + auto ShortFormNEONErrorInfo = ErrorInfo; + auto ShortFormNEONMatchResult = MatchResult; + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + // Now, both matches failed, and the long-form match failed on the mnemonic + // suffix token operand. The short-form match failure is probably more + // relevant: use it instead. 
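The fallback-matching strategy in this hunk is a small state machine worth spelling out: run the short-form match, stash its diagnostic, retry against the long-form NEON table, and if the retry dies on the mnemonic-suffix token (operand index 1), surface the saved short-form error as the more useful one. A hedged sketch of the same shape, with invented names:

#include <functional>

enum MatchResult { Success, InvalidOperand, NoMatch };
struct Diag { MatchResult Result; unsigned ErrorInfo; };

// Returns the diagnostic most likely to describe the user's actual mistake.
Diag matchWithFallback(const std::function<Diag()> &ShortForm,
                       const std::function<Diag()> &LongForm) {
  Diag Short = ShortForm();
  if (Short.Result == Success)
    return Short;
  Diag Long = LongForm();
  if (Long.Result == Success)
    return Long;
  // A long-form failure on operand 1 (the suffix token) means the input was
  // probably short-form syntax all along; prefer the short-form diagnostic.
  if (Long.Result == InvalidOperand && Long.ErrorInfo == 1)
    return Short;
  return Long;
}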
+ if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 && + Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() && + ((AArch64Operand &)*Operands[1]).isTokenSuffix()) { + MatchResult = ShortFormNEONMatchResult; + ErrorInfo = ShortFormNEONErrorInfo; + } + } + + switch (MatchResult) { case Match_Success: { // Perform range checking and other semantic validations @@ -3944,7 +4085,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; } case Match_MissingFeature: { @@ -3966,6 +4107,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return showMatchError(IDLoc, MatchResult); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); @@ -4011,6 +4153,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_1: case Match_InvalidImm0_7: case Match_InvalidImm0_15: case Match_InvalidImm0_31: @@ -4083,7 +4226,7 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size); + getParser().getStreamer().EmitValue(Value, Size, L); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -4155,7 +4298,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { Inst.setOpcode(AArch64::TLSDESCCALL); Inst.addOperand(MCOperand::createExpr(Expr)); - getParser().getStreamer().EmitInstruction(Inst, STI); + getParser().getStreamer().EmitInstruction(Inst, getSTI()); return false; } diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index db9fb0e..f1f968e 100644 --- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1516,6 +1516,10 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, uint64_t pstate_field = (op1 << 3) | op2; + if ((pstate_field == AArch64PState::PAN || + pstate_field == AArch64PState::UAO) && crm > 1) + return Fail; + Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 7f56c2c..d8a8108 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -55,7 +56,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, unsigned Opcode = MI->getOpcode(); if (Opcode == AArch64::SYSxt) - if (printSysAlias(MI, O)) { + if (printSysAlias(MI, STI, O)) { printAnnotation(O, Annot); return; } @@ -269,7 +270,7 @@ struct LdStNInstrDesc { int NaturalOffset; }; -static LdStNInstrDesc LdStNInstInfo[] = { +static const LdStNInstrDesc LdStNInstInfo[] = { { AArch64::LD1i8, "ld1", 
".b", 1, true, 0 }, { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, @@ -612,7 +613,7 @@ static LdStNInstrDesc LdStNInstInfo[] = { { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, }; -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { +static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { unsigned Idx; for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) if (LdStNInstInfo[Idx].Opcode == Opcode) @@ -641,7 +642,7 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; // Now onto the operands: first a vector list with possible lane @@ -674,7 +675,9 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, AArch64InstPrinter::printInst(MI, O, Annot, STI); } -bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { #ifndef NDEBUG unsigned Opcode = MI->getOpcode(); assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); @@ -729,6 +732,11 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcvau"; break; + case 12: + if (Op1Val == 3 && Op2Val == 1 && + (STI.getFeatureBits()[AArch64::HasV8_2aOps])) + Asm = "dc\tcvap"; + break; case 14: if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcivac"; @@ -773,6 +781,21 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { break; } break; + case 9: + switch (Op1Val) { + default: + break; + case 0: + if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) { + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1rp"; break; + case 1: Asm = "at\ts1e1wp"; break; + } + } + break; + } } } else if (CnVal == 8) { // TLBI aliases @@ -1122,6 +1145,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, O << '#' << prfop; } +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 15dee97..ea68d98 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -15,14 +15,10 @@ #define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class AArch64InstPrinter : public MCInstPrinter { public: AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, @@ -48,7 +44,8 @@ public: unsigned AltIdx = AArch64::NoRegAltName); protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); + bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI, + 
raw_ostream &O); // Operand printers void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -122,6 +119,9 @@ protected: void printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index ed24343..648b1df 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -364,6 +364,32 @@ static inline float getFPImmFloat(unsigned Imm) { return FPUnion.F; } +/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP16Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(15).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15 + int32_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x3f) + return -1; + Mantissa >>= 6; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP16Imm(const APFloat &FPImm) { + return getFP16Imm(FPImm.bitcastToAPInt()); +} + /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit /// floating-point value. If the value cannot be represented as an 8-bit /// floating-point value, then return -1. diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 16d5356..d26604f 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -128,10 +128,9 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); } private: diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 921c4b9..fbce26e 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -48,10 +48,6 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { UseDataRegionDirectives = true; ExceptionsType = ExceptionHandling::DwarfCFI; - - // AArch64 Darwin doesn't have the baggage of X86/ARM, so it's fine to use - // LShr instead of AShr. 
- UseLogicalShr = true; } const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 2870341..a540f49 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -85,13 +85,13 @@ void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const { Streamer.visitUsedExpr(*getSubExpr()); } -MCSection *AArch64MCExpr::findAssociatedSection() const { +MCFragment *AArch64MCExpr::findAssociatedFragment() const { llvm_unreachable("FIXME: what goes here?"); } bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup)) return false; diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 1165314..db36a65 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -149,11 +149,10 @@ public: void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override; + MCFragment *findAssociatedFragment() const override; - bool evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; @@ -162,7 +161,6 @@ public: } static bool classof(const AArch64MCExpr *) { return true; } - }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 741b273..61c96f1 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -90,9 +90,11 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( Log2Size = llvm::Log2_32(4); // This encompasses the relocation for the whole 21-bit value. switch (Sym->getKind()) { - default: - Asm.getContext().reportFatalError(Fixup.getLoc(), - "ADR/ADRP relocations must be GOT relative"); + default: { + Asm.getContext().reportError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + return false; + } case MCSymbolRefExpr::VK_PAGE: RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); return true; @@ -170,25 +172,25 @@ void AArch64MachObjectWriter::recordRelocation( // assembler local symbols. If we got here, that's not what we have, // so complain loudly. if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); + Asm.getContext().reportError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); return; } // 14-bit branch relocations should only target internal labels, and so // should never get here. 
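The getFP16Imm helper introduced in the AArch64AddressingModes.h hunk earlier packs a half-precision value into the 8-bit abcdefgh immediate: four mantissa bits (the value must have the form (16+efgh)/16) and three exponent bits (the exponent must fall in [-3, 4], stored as NOT(b):c:d biased by 3). A self-contained restatement operating on raw IEEE half bits, offered as an illustrative cross-check rather than the LLVM code itself:

#include <cassert>
#include <cstdint>

int fp16ToImm8(uint16_t Bits) {
  uint32_t Sign = (Bits >> 15) & 1;
  int32_t Exp = ((Bits >> 10) & 0x1f) - 15; // unbiased exponent
  int32_t Mantissa = Bits & 0x3ff;          // 10 fraction bits

  if (Mantissa & 0x3f)     // only the top 4 mantissa bits are encodable
    return -1;
  Mantissa >>= 6;
  if (Exp < -3 || Exp > 4) // only 3 exponent bits are encodable
    return -1;
  Exp = ((Exp + 3) & 0x7) ^ 4; // NOT(b):c:d form

  return (static_cast<int>(Sign) << 7) | (Exp << 4) | Mantissa;
}

int main() {
  assert(fp16ToImm8(0x3C00) == 0x70); // 1.0 (fp16) -> imm8 0x70
  assert(fp16ToImm8(0x3800) == 0x60); // 0.5 (fp16) -> imm8 0x60
  assert(fp16ToImm8(0x3C01) == -1);   // 1.0 plus one ulp: not encodable
}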
if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "Invalid relocation on conditional branch!"); + Asm.getContext().reportError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); return; } if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { - Asm.getContext().reportFatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + Asm)) { + Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); return; } @@ -200,8 +202,9 @@ void AArch64MachObjectWriter::recordRelocation( Type = MachO::ARM64_RELOC_UNSIGNED; if (IsPCRel) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "PC relative absolute relocation!"); + Asm.getContext().reportError(Fixup.getLoc(), + "PC relative absolute relocation!"); + return; // FIXME: x86_64 sets the type to a branch reloc here. Should we do // something similar? @@ -229,16 +232,20 @@ void AArch64MachObjectWriter::recordRelocation( Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { // Otherwise, neither symbol can be modified. - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation of modified symbol"); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. - if (IsPCRel) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported pc-relative relocation of " - "difference"); + if (IsPCRel) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + return; + } // AArch64 always uses external relocations. If there is no symbol to use as // a base address (a local symbol with no preceding non-local symbol), @@ -246,20 +253,26 @@ void AArch64MachObjectWriter::recordRelocation( // // FIXME: We should probably just synthesize an external symbol and use // that. - if (!A_Base) - Asm.getContext().reportFatalError( + if (!A_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + A->getName() + "'. Must have non-local symbol earlier in section."); - if (!B_Base) - Asm.getContext().reportFatalError( + return; + } + if (!B_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + B->getName() + "'. Must have non-local symbol earlier in section."); + return; + } - if (A_Base == B_Base && A_Base) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation with identical base"); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) - (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress( @@ -309,10 +322,12 @@ void AArch64MachObjectWriter::recordRelocation( // we need to preserve and merge with the new Target? How about // the FixedValue? 
if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout, - &Fixup)) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unable to resolve variable '" + - Symbol->getName() + "'"); + &Fixup)) { + Asm.getContext().reportError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return; + } return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); } @@ -337,11 +352,13 @@ void AArch64MachObjectWriter::recordRelocation( Value += Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base); } else if (Symbol->isInSection()) { - if (!CanUseLocalRelocation) - Asm.getContext().reportFatalError( + if (!CanUseLocalRelocation) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + Symbol->getName() + "'. Must have non-local symbol earlier in section."); + return; + } // Adjust the relocation to be section-relative. // The index is the section ordinal (1-based). const MCSection &Sec = Symbol->getSection(); @@ -361,9 +378,10 @@ void AArch64MachObjectWriter::recordRelocation( return; } } - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of variable '" + Symbol->getName() + "'"); + return; } } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 52b000d..3e86a42 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -26,8 +26,9 @@ AArch64TargetStreamer::~AArch64TargetStreamer() {} // The constant pool handling is shared by all AArch64TargetStreamer // implementations. const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr, - unsigned Size) { - return ConstantPools->addEntry(Streamer, Expr, Size); + unsigned Size, + SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, Size, Loc); } void AArch64TargetStreamer::emitCurrentConstantPool() { diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index fcc0d05..51432830 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -24,7 +24,7 @@ public: /// Callback used to implement the ldr= pseudo. /// Add a new entry to the constant pool for the current section and return an /// MCExpr that can be used to refer to the constant pool location. - const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size); + const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc); /// Callback used to implement the .ltorg directive. /// Emit contents of constant pool for the current section.
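The SMLoc parameter threaded through addConstantPoolEntry above is purely for diagnostics: the pool is emitted long after the ldr= pseudo that filled it, so each entry has to carry its originating source location along. A simplified sketch of the idea, with stand-in types rather than LLVM's:

#include <cstdint>
#include <string>
#include <vector>

struct SrcLoc { std::string File; unsigned Line; };

struct PoolEntry {
  int64_t Value;
  unsigned Size;
  SrcLoc Loc; // kept so emission-time errors can point at the ldr= site
};

class ConstantPool {
  std::vector<PoolEntry> Entries;
public:
  // Returns the entry's index, playing the role of the MCExpr label the real
  // API hands back for the load to reference.
  unsigned addEntry(int64_t V, unsigned Size, SrcLoc Loc) {
    Entries.push_back({V, Size, std::move(Loc)});
    return static_cast<unsigned>(Entries.size() - 1);
  }
};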
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index ee85b65b..cde1c6d 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -146,11 +146,22 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings // v8.1a "Privileged Access Never" extension-specific PStates {"pan", PAN, {AArch64::HasV8_1aOps}}, + + // v8.2a + {"uao", UAO, {AArch64::HasV8_2aOps}}, }; AArch64PState::PStateMapper::PStateMapper() : AArch64NamedImmMapper(PStateMappings, 0) {} +const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0, {}}, {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, @@ -192,6 +203,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, + {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, {"mvfr0_el1", MVFR0_EL1, {}}, {"mvfr1_el1", MVFR1_EL1, {}}, {"mvfr2_el1", MVFR2_EL1, {}}, @@ -275,9 +287,6 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, - - // v8.1a "Privileged Access Never" extension-specific system registers - {"pan", PAN, {AArch64::HasV8_1aOps}}, }; AArch64SysReg::MSRMapper::MSRMapper() { @@ -804,10 +813,28 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, + + // v8.2a registers + {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, }; uint32_t -AArch64SysReg::SysRegMapper::fromString(StringRef Name, +AArch64SysReg::SysRegMapper::fromString(StringRef Name, const FeatureBitset& FeatureBits, bool &Valid) const { std::string NameLower = Name.lower(); @@ -851,7 +878,7 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, } std::string -AArch64SysReg::SysRegMapper::toString(uint32_t Bits, +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, const FeatureBitset& FeatureBits) const { // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { diff --git 
a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 7e42f8e..e63627e 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -285,17 +285,17 @@ struct AArch64NamedImmMapper { // Zero value of FeatureBitSet means the mapping is always available FeatureBitset FeatureBitSet; - bool isNameEqual(std::string Other, + bool isNameEqual(std::string Other, const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && + if (FeatureBitSet.any() && (FeatureBitSet & FeatureBits).none()) return false; return Name == Other; } - bool isValueEqual(uint32_t Other, + bool isValueEqual(uint32_t Other, const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && + if (FeatureBitSet.any() && (FeatureBitSet & FeatureBits).none()) return false; return Value == Other; @@ -310,7 +310,7 @@ struct AArch64NamedImmMapper { StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits, bool &Valid) const; // Maps string to value, depending on availability for FeatureBits given - uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, + uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, bool &Valid) const; /// Many of the instructions allow an alternative assembly form consisting of @@ -337,7 +337,9 @@ namespace AArch64AT { S12E1R = 0x63c4, // 01 100 0111 1000 100 S12E1W = 0x63c5, // 01 100 0111 1000 101 S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7 // 01 100 0111 1000 111 + S12E0W = 0x63c7, // 01 100 0111 1000 111 + S1E1RP = 0x43c8, // 01 000 0111 1001 000 + S1E1WP = 0x43c9 // 01 000 0111 1001 001 }; struct ATMapper : AArch64NamedImmMapper { @@ -463,6 +465,9 @@ namespace AArch64PState { // v8.1a "Privileged Access Never" extension-specific PStates PAN = 0x04, + + // v8.2a "User Access Override" extension-specific PStates + UAO = 0x03 }; struct PStateMapper : AArch64NamedImmMapper { @@ -473,6 +478,21 @@ namespace AArch64PState { } +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + PSBHintMapper(); + }; + +} + namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, @@ -594,6 +614,7 @@ namespace AArch64SysReg { ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 @@ -1190,6 +1211,24 @@ namespace AArch64SysReg { SPSR_EL12 = 0xea00, // 11 101 0100 0000 000 ELR_EL12 = 0xea01, // 11 101 0100 0000 001 + // v8.2a registers + UAO = 0xc214, // 11 000 0100 0010 100 + + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 
000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90, }; @@ -1283,7 +1322,7 @@ namespace AArch64TLBI { return true; } } -} +} namespace AArch64II { /// Target Operand Flag enum. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 0a05d25..5d00e1c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -20,8 +20,10 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class MachineSchedContext; class MCAsmInfo; class raw_ostream; +class ScheduleDAGInstrs; class Target; class TargetMachine; @@ -44,15 +46,23 @@ FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); -FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); + +ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); + +ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); +extern char &AMDGPUAnnotateKernelFeaturesID; void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; +void initializeSIFixSGPRCopiesPass(PassRegistry &); +extern char &SIFixSGPRCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -64,6 +74,8 @@ FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ModulePass *createAMDGPUAlwaysInlinePass(); +ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); +FunctionPass *createAMDGPUAnnotateUniformValues(); void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); extern char &SIFixControlFlowLiveIntervalsID; @@ -71,6 +83,8 @@ extern char &SIFixControlFlowLiveIntervalsID; void initializeSIFixSGPRLiveRangesPass(PassRegistry&); extern char &SIFixSGPRLiveRangesID; +void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); +extern char &AMDGPUAnnotateUniformValuesPassID; extern Target TheAMDGPUTarget; extern Target TheGCNTarget; @@ -85,8 +99,6 @@ enum TargetIndex { }; } -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - } // End namespace llvm namespace ShaderType { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td index 68b5050..db869cf 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -108,11 +108,21 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol "true", "Force using DS instruction immediate offsets on SI">; +def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", + "FlatForGlobal", + "true", + "Force to generate flat instruction for global">; + def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", "Support flat address space">; +def FeatureXNACK : SubtargetFeature<"xnack", + "EnableXNACK", + "true", + "Enable XNACK support">; + def FeatureVGPRSpilling : 
SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", @@ -272,9 +282,14 @@ def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" >, AssemblerPredicate<"FeatureGCN1Encoding">; +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGCN3Encoding">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; + Predicate VIAssemblerPredicate = isVI; list<Predicate> AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; list<Predicate> OtherPredicates = []; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp new file mode 100644 index 0000000..3781839 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -0,0 +1,126 @@ +//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass adds target attributes to functions which use intrinsics +/// which will impact calling convention lowering. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +#define DEBUG_TYPE "amdgpu-annotate-kernel-features" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateKernelFeatures : public ModulePass { +private: + void addAttrToCallers(Function *Intrin, StringRef AttrName); + bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); + +public: + static char ID; + + AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { + return "AMDGPU Annotate Kernel Features"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } +}; + +} + +char AMDGPUAnnotateKernelFeatures::ID = 0; + +char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; + + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) +INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) + + +void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, + StringRef AttrName) { + SmallPtrSet<Function *, 4> SeenFuncs; + + for (User *U : Intrin->users()) { + // CallInst is the only valid user for an intrinsic. 
+ CallInst *CI = cast<CallInst>(U); + + Function *CallingFunction = CI->getParent()->getParent(); + if (SeenFuncs.insert(CallingFunction).second) + CallingFunction->addFnAttr(AttrName); + } +} + +bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( + Module &M, + ArrayRef<StringRef[2]> IntrinsicToAttr) { + bool Changed = false; + + for (const StringRef *Arr : IntrinsicToAttr) { + if (Function *Fn = M.getFunction(Arr[0])) { + addAttrToCallers(Fn, Arr[1]); + Changed = true; + } + } + + return Changed; +} + +bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { + Triple TT(M.getTargetTriple()); + + static const StringRef IntrinsicToAttr[][2] = { + // .x omitted + { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, + { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, + + // .x omitted + { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, + { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } + + }; + + static const StringRef HSAIntrinsicToAttr[][2] = { + { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" }, + + { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" }, + { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" } + }; + + // TODO: Intrinsics that require queue ptr. + + // We do not need to note the x workitem or workgroup id because they are + // always initialized. + + bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); + if (TT.getOS() == Triple::AMDHSA) + Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); + + return Changed; +} + +ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { + return new AMDGPUAnnotateKernelFeatures(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp new file mode 100644 index 0000000..dfddc34 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -0,0 +1,84 @@ +//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass adds amdgpu.uniform metadata to IR values so this information +/// can be used during instruction selection. 
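As a consumer-side sketch (not from this patch; the helper name is hypothetical), later code can test for the marker the pass attaches:

    // True if AMDGPUAnnotateUniformValues proved this pointer-producing
    // instruction uniform across the wavefront.
    static bool hasUniformAddress(const Instruction &PtrI) {
      return PtrI.getMetadata("amdgpu.uniform") != nullptr;
    }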
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-annotate-uniform" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateUniformValues : public FunctionPass, + public InstVisitor<AMDGPUAnnotateUniformValues> { + DivergenceAnalysis *DA; + +public: + static char ID; + AMDGPUAnnotateUniformValues() : + FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DivergenceAnalysis>(); + AU.setPreservesAll(); + } + + void visitLoadInst(LoadInst &I); + +}; + +} // End anonymous namespace + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) + +char AMDGPUAnnotateUniformValues::ID = 0; + +void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + if (!DA->isUniform(Ptr)) + return; + + if (Instruction *PtrI = dyn_cast<Instruction>(Ptr)) + PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {})); + +} + +bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { + return false; +} + +bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { + DA = &getAnalysis<DivergenceAnalysis>(); + visit(F); + + return true; +} + +FunctionPass * +llvm::createAMDGPUAnnotateUniformValues() { + return new AMDGPUAnnotateUniformValues(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 0a5309b..1239dfb2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -91,6 +91,25 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) {} +void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + + // Need to construct an MCSubtargetInfo here in case we have no functions + // in the module. 
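// (No MachineFunction exists yet when EmitStartOfAsmFile runs, so the
// usual per-function subtarget is unavailable; a standalone
// MCSubtargetInfo keeps the HSA ISA directives emitted below correct
// even for a module with no function definitions.)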
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + + TS->EmitDirectiveHSACodeObjectVersion(1, 0); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); + TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, + "AMD", "AMDGPU"); +} + void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; @@ -100,14 +119,67 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { } } -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { +void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); + if (MFI->isKernel() && STM.isAmdHsaOS()) { + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(), + ELF::STT_AMDGPU_HSA_KERNEL); + } + + AsmPrinter::EmitFunctionEntryLabel(); +} - // This label is used to mark the end of the .text section. - const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); +static bool isModuleLinkage(const GlobalValue *GV) { + switch (GV->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::CommonLinkage: + return true; + case GlobalValue::ExternalLinkage: + return false; + default: llvm_unreachable("unknown linkage type"); + } +} + +void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + // Group segment variables aren't emitted in HSA. 
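// (The group segment is LDS: it is allocated per work-group at dispatch
// time and has no initialized backing store, so there is nothing to
// emit into the object file for it.)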
+ if (AMDGPU::isGroupSegment(GV)) + return; + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + if (isModuleLinkage(GV)) { + TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); + } else { + TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); + } + + MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV)); + const DataLayout &DL = getDataLayout(); + + // Emit the size + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext)); + OutStreamer->PushSection(); + OutStreamer->SwitchSection( + getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); + const Constant *C = GV->getInitializer(); + OutStreamer->EmitLabel(GVSym); + EmitGlobalConstant(DL, C); + OutStreamer->PopSection(); } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -125,17 +197,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + getSIProgramInfo(KernelInfo, MF); if (!STM.isAmdHsaOS()) { - getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } - // Emit directives - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->EmitDirectiveHSACodeObjectVersion(1, 0); - AMDGPU::IsaVersion ISA = STM.getIsaVersion(); - TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, - "AMD", "AMDGPU"); } else { EmitProgramInfoR600(MF); } @@ -165,6 +230,23 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), false); + + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)), + false); + } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer->emitRawComment( @@ -278,27 +360,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned width = 0; bool isSGPR = false; - if (!MO.isReg()) { + if (!MO.isReg()) continue; - } + unsigned reg = MO.getReg(); - if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || - reg == AMDGPU::VCC_HI) { + switch (reg) { + case AMDGPU::EXEC: + case AMDGPU::SCC: + case AMDGPU::M0: + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: VCCUsed = true; continue; - } else if (reg == AMDGPU::FLAT_SCR || - reg == AMDGPU::FLAT_SCR_LO || - reg == AMDGPU::FLAT_SCR_HI) { + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: FlatUsed = true; continue; - } - switch (reg) { - default: break; - case AMDGPU::SCC: - case AMDGPU::EXEC: - case AMDGPU::M0: - continue; + default: + break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { @@ -348,11 +433,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } + unsigned 
ExtraSGPRs = 0; + if (VCCUsed) - MaxSGPR += 2; + ExtraSGPRs = 2; - if (FlatUsed) - MaxSGPR += 2; + if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (FlatUsed) + ExtraSGPRs = 4; + } else { + if (STM.isXNACKEnabled()) + ExtraSGPRs = 4; + + if (FlatUsed) + ExtraSGPRs = 6; + } + + MaxSGPR += ExtraSGPRs; // We found the maximum register index. They start at 0, so add one to get the // number of registers. @@ -368,6 +465,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } + if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many user SGPRs used"); + } + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode @@ -419,18 +521,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); } static unsigned getRsrcReg(unsigned ShaderType) { @@ -476,7 +587,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); + OutStreamer->EmitIntValue(MFI->PSInputEna, 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } } @@ -491,14 +604,56 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if 
(MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (STM.isXNACKEnabled()) + header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; header.kernarg_segment_byte_size = MFI->ABIArgOffset; header.wavefront_sgpr_count = KernelInfo.NumSGPR; header.workitem_vgpr_count = KernelInfo.NumVGPR; - + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; AMDGPUTargetStreamer *TS = static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 345af9b..99d4091 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -99,7 +99,11 @@ public: void EmitFunctionBodyStart() override; - void EmitEndOfAsmFile(Module &M) override; + void EmitFunctionEntryLabel() override; + + void EmitGlobalVariable(const GlobalVariable *GV) override; + + void EmitStartOfAsmFile(Module &M) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 6ffa7a0..b0db261 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -20,28 +20,83 @@ def CC_SI : CallingConv<[ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, - SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21 + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 ]>>>, CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>>, + // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
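// (For the count: 32 vec4 inputs need 32 * 4 = 128 VGPRs, and the "+ 4"
// presumably covers a few system-generated values, giving a minimum of
// 132, which fits in the 136 registers (VGPR0..VGPR135) listed below.)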
CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, - VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 ]>>>, CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>> ]>; +def RetCC_SI : CallingConv<[ + CCIfType<[i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + ]>>, + + // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
+ CCIfType<[f32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 + ]>> +]>; + // Calling convention for R600 def CC_R600 : CallingConv<[ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp new file mode 100644 index 0000000..2f6b302 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp @@ -0,0 +1,26 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" + +using namespace llvm; + +DiagnosticInfoUnsupported::DiagnosticInfoUnsupported( + const Function &Fn, + const Twine &Desc, + DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + +int DiagnosticInfoUnsupported::KindID = 0; + +void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h new file mode 100644 index 0000000..0fd37e1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h @@ -0,0 +1,48 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H + +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" + +namespace llvm { + +/// Diagnostic information for unimplemented or unsupported feature reporting. 
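The DiagnosticInfoUnsupported class declared below is used by other files in this patch (see SelectAddrSpaceCast and LowerDYNAMIC_STACKALLOC); a minimal usage sketch, assuming Fn and DAG are in scope:

    DiagnosticInfoUnsupported Diag(Fn, "dynamic alloca"); // severity defaults to DS_Error
    DAG.getContext()->diagnose(Diag);
    // Prints: "unsupported dynamic alloca in <function name>"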
+class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error); + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index 8175786..4d84d28 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -71,9 +71,15 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { } /// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); + // Start the offset at 2 so we don't overwrite work group information. // XXX: We should only do this when the shader actually uses this // information. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 9f31be1..257a3da 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -8,14 +8,12 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface to describe a layout of a stack frame on a AMDIL target -/// machine. +/// \brief Interface to describe a layout of a stack frame on an AMDGPU target. // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -34,7 +32,8 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. unsigned getStackWidth(const MachineFunction &MF) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 64c54cc..b33040b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -11,6 +11,8 @@ /// \brief Defines an instruction selector for the AMDGPU target. 
// //===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" @@ -20,9 +22,9 @@ #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Function.h" @@ -40,12 +42,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. const AMDGPUSubtarget *Subtarget; + public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); bool runOnMachineFunction(MachineFunction &MF) override; SDNode *Select(SDNode *N) override; const char *getPassName() const override; + void PreprocessISelDAG() override; void PostprocessISelDAG() override; private: @@ -91,7 +95,7 @@ private: bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const; - void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -108,6 +112,16 @@ private: SDValue &TFE) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &GLC) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, + bool &Imm) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, + bool &Imm) const; + bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; SDNode *SelectAddrSpaceCast(SDNode *N); bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; @@ -273,6 +287,23 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { return N; } +static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { + switch (NumVectorElts) { + case 1: + return AMDGPU::SReg_32RegClassID; + case 2: + return AMDGPU::SReg_64RegClassID; + case 4: + return AMDGPU::SReg_128RegClassID; + case 8: + return AMDGPU::SReg_256RegClassID; + case 16: + return AMDGPU::SReg_512RegClassID; + } + + llvm_unreachable("invalid vector size"); +} + SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -306,38 +337,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { EVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsEq(MVT::i32)); if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - bool UseVReg = true; - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if 
(!U->isMachineOpcode()) { - continue; - } - const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - if (!RC) { - continue; - } - if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) { - UseVReg = false; - } - } - switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : - AMDGPU::SReg_32RegClassID; - break; - case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : - AMDGPU::SReg_64RegClassID; - break; - case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : - AMDGPU::SReg_128RegClassID; - break; - case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : - AMDGPU::SReg_256RegClassID; - break; - case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : - AMDGPU::SReg_512RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } + RegClassID = selectSGPRVectorRegClassID(NumVectorElts); } else { // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG // that adds a 128 bits reg copy when going through TwoAddressInstructions @@ -455,98 +455,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, N->getValueType(0), Ops); } - - case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(N); - SDLoc SL(N); - EVT VT = N->getValueType(0); - - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { - N = glueCopyToM0(N); - break; - } - - // To simplify the TableGen patters, we replace all i64 loads with - // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 - // during DAG legalization, however, so places (ExpandUnalignedLoad) - // in the DAG legalizer assume that if i64 is legal, so doing this - // promotion early can cause problems. - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); - SDNode *Load = glueCopyToM0(NewLoad.getNode()); - SelectCode(Load); - N = BitCast.getNode(); - break; - } - + case ISD::LOAD: case ISD::STORE: { - // Handle i64 stores here for the same reason mentioned above for loads. - StoreSDNode *ST = cast<StoreSDNode>(N); - SDValue Value = ST->getValue(); - if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { - - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), - MVT::v2i32, Value); - SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, - ST->getBasePtr(), ST->getMemOperand()); - - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); - - if (NewValue.getOpcode() == ISD::BITCAST) { - Select(NewStore.getNode()); - return SelectCode(NewValue.getNode()); - } - - // getNode() may fold the bitcast if its input was another bitcast. If that - // happens we should only select the new store. 
- N = NewStore.getNode(); - } - N = glueCopyToM0(N); break; } - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } - case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -575,7 +489,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), N->getOperand(0), OffsetVal, WidthVal); - } case AMDGPUISD::DIV_SCALE: { return SelectDIV_SCALE(N); @@ -601,7 +514,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return SelectCode(N); } - bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { assert(AS != 0 && "Use checkPrivateAddress instead."); if (!Ptr) @@ -681,7 +593,7 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { if (checkPrivateAddress(N->getMemOperand())) { if (MMO) { const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + if (PSV && PSV->isConstantPool()) { return true; } } @@ -847,7 +759,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, + // omod SDValue Ops[8]; SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); @@ -883,15 +796,39 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Offset = N1; return true; } - } + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + int64_t ByteOffset = C->getSExtValue(); + if (isUInt<16>(ByteOffset)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. We need to emit the selected node + // here, so this is thrown away. + SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset = Addr.getOperand(0); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + // If we have a constant address, prefer to put the constant into the + // offset. 
This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. - SDLoc DL(Addr); + SDLoc DL(Addr); - // If we have a constant address, prefer to put the constant into the - // offset. This can save moves to load the constant address since multiple - // operations can share the zero base address register, and enables merging - // into read2 / write2 instructions. - if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { if (isUInt<16>(CAddr->getZExtValue())) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, @@ -904,10 +841,11 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, // default case Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16); return true; } +// TODO: If offset is too big, put low 16-bit into offset. bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const { @@ -926,9 +864,35 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); return true; } - } - - if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + unsigned DWordOffset0 = C->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + + if (isUInt<8>(DWordOffset0)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. We need to emit the selected node + // here, so this is thrown away. + SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { unsigned DWordOffset0 = CAddr->getZExtValue() / 4; unsigned DWordOffset1 = DWordOffset0 + 1; assert(4 * DWordOffset0 == CAddr->getZExtValue()); @@ -956,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { return isUInt<12>(Imm->getZExtValue()); } -void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, +bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const { + // Subtarget prefers to use flat instruction + if (Subtarget->useFlatForGlobal()) + return false; + SDLoc DL(Addr); GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -994,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, if (isLegalMUBUFImmOffset(C1)) { Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return; + return true; } else if (isUInt<32>(C1->getZExtValue())) { // Illegal offset, store it in soffset. 
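// (The MUBUF immediate offset field is only 12 bits wide, per
// isLegalMUBUFImmOffset above, so a larger 32-bit constant must be
// materialized with S_MOV_B32 and carried in the SOffset operand
// instead.)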
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), 0); - return; + return true; } } @@ -1013,7 +981,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = N0; VAddr = N1; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return; + return true; } // default case -> offset @@ -1021,6 +989,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = Addr; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, @@ -1033,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return false; - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); if (C->getSExtValue()) { @@ -1052,8 +1022,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { + SDValue &Offset, + SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); SDValue GLC, TFE; @@ -1066,36 +1036,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast<const SITargetLowering*>(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); - - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { @@ -1126,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, 
Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && !cast<ConstantSDNode>(Idxen)->getSExtValue() && @@ -1153,18 +1098,134 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } +/// +/// \param EncodedOffset This is the immediate value that will be encoded +/// directly into the instruction. On SI/CI the \p EncodedOffset +/// will be in units of dwords and on VI+ it will be units of bytes. +static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST, + int64_t EncodedOffset) { + return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, + SDValue &Offset, bool &Imm) const { + + // FIXME: Handle non-constant offsets. + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); + if (!C) + return false; + + SDLoc SL(ByteOffsetNode); + AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); + int64_t ByteOffset = C->getSExtValue(); + int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + ByteOffset >> 2 : ByteOffset; + + if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) { + Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + Imm = true; + return true; + } + + if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset)) + return false; + + if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) { + // 32-bit Immediates are supported on Sea Islands. + Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + } else { + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, + C32Bit), 0); + } + Imm = false; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, + SDValue &Offset, bool &Imm) const { + + SDLoc SL(Addr); + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + + if (SelectSMRDOffset(N1, Offset, Imm)) { + SBase = N0; + return true; + } + } + SBase = Addr; + Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); + Imm = true; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRD(Addr, SBase, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, + SDValue &Offset) const { + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRDOffset(Addr, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool 
AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + // FIXME: This is incorrect and only enough to be able to compile. SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); SDLoc DL(N); + const MachineFunction &MF = CurDAG->getMachineFunction(); + DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), + "addrspacecast not implemented"); + CurDAG->getContext()->diagnose(NotImplemented); + assert(Subtarget->hasFlatAddressSpace() && "addrspacecast only supported with flat address space!"); - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && "Can only cast to / from flat address space!"); @@ -1190,7 +1251,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); } - if (DestSize > SrcSize) { assert(SrcSize == 32 && DestSize == 64); @@ -1371,6 +1431,65 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +void AMDGPUDAGToDAGISel::PreprocessISelDAG() { + bool Modified = false; + + // XXX - Other targets seem to be able to do this without a worklist. + SmallVector<LoadSDNode *, 8> LoadsToReplace; + SmallVector<StoreSDNode *, 8> StoresToReplace; + + for (SDNode &Node : CurDAG->allnodes()) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { + EVT VT = LD->getValueType(0); + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) + continue; + + // To simplify the TableGen patterns, we replace all i64 loads with v2i32 + // loads. We could instead promote i64 loads to v2i32 during DAG + // legalization, but places in the DAG legalizer (e.g. ExpandUnalignedLoad) + // assume that if i64 is a legal type its loads stay i64, so doing the + // promotion that early can cause problems. + LoadsToReplace.push_back(LD); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { + // Handle i64 stores here for the same reason mentioned above for loads. + SDValue Value = ST->getValue(); + if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) + continue; + StoresToReplace.push_back(ST); + } + } + + for (LoadSDNode *LD : LoadsToReplace) { + SDLoc SL(LD); + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); + Modified = true; + } + + for (StoreSDNode *ST : StoresToReplace) { + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), + MVT::v2i32, ST->getValue()); + const SDValue StoreOps[] = { + ST->getChain(), + NewValue, + ST->getBasePtr(), + ST->getOffset() + }; + + CurDAG->UpdateNodeOperands(ST, StoreOps); + Modified = true; + } + + // XXX - Is this necessary?
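In DAG terms, the rewrite performed by PreprocessISelDAG above amounts to (sketch):

    (i64 (load addr))          -->  (i64 (bitcast (v2i32 (load addr))))
    (store (i64 val), addr)    -->  (store (v2i32 (bitcast val)), addr)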
+ if (Modified) + CurDAG->RemoveDeadNodes(); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3a65f3b..1a59a46 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -27,50 +28,9 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. -class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); + // This is totally unsupported, just custom lower to produce an error. 
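For reference, the kind of input that reaches this path is a function whose alloca size is not a compile-time constant; hypothetical IR:

    %buf = alloca i32, i32 %n    ; dynamic size
    ; LowerDYNAMIC_STACKALLOC diagnoses this as
    ; "unsupported dynamic alloca in <function>"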
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -319,12 +282,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); - if (!Subtarget->hasFFBH()) + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + else setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); if (!Subtarget->hasFFBL()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTLZ, MVT::i64, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -352,7 +322,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::ADDC, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -429,12 +399,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); setFsqrtIsCheap(true); + // We want to find all load dependencies for long chains of stores to enable + // merging into very wide vectors. The problem is with vectors with > 4 + // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 + // vectors are a legal type, even though we have to split the loads + // usually. When we can more precisely specify load legality per address + // space, we should be able to make FindBetterChain/MergeConsecutiveStores + // smarter so that they can figure out what to do in 2 iterations without all + // N > 4 stores on the same chain. + GatherAllAliasesMaxDepth = 16; + // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; MaxStoresPerMemmove = 4096; @@ -534,6 +510,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, return true; } +bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { + // There are few operations which truly have vector input operands. Any vector + // operation is going to involve operations on each component, and a + // build_vector will be a copy per element, so it always makes sense to use a + // build_vector input in place of the extracted element to avoid a copy into a + // super register. + // + // We should probably only do this if all users are extracts only, but this + // should be the common case. + return true; +} + bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. 
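// For example, truncating i64 to i32 is free: the result is just the
// low 32-bit subregister (sub0) of the 64-bit register pair, so no
// instruction is needed. The % 32 check below restricts this to
// register-sized destinations.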
return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); @@ -584,6 +572,12 @@ void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, State.AnalyzeFormalArguments(Ins, CC_AMDGPU); } +void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const { + + State.AnalyzeReturn(Outs, RetCC_SI); +} + SDValue AMDGPUTargetLowering::LowerReturn( SDValue Chain, CallingConv::ID CallConv, @@ -617,6 +611,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } +SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DAG.getContext()->diagnose(NoDynamicAlloca); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -643,6 +646,10 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: + return LowerCTLZ(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; } @@ -892,7 +899,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); - unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), Op.getValueType()); } @@ -1043,9 +1052,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - case Intrinsic::AMDGPU_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -1057,6 +1063,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name + return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); } } @@ -1077,6 +1085,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); + // TODO: Should this propagate fast-math-flags? SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, DAG.getConstantFP(1.0f, DL, MVT::f32), Op.getOperand(1)); @@ -1167,45 +1176,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -// FIXME: Remove this when combines added to DAGCombiner. -SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); - switch (CCOpcode) { - case ISD::SETULE: - case ISD::SETULT: { - unsigned Opc = (LHS == True) ? 
ISD::UMIN : ISD::UMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETLE: - case ISD::SETLT: { - unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: { - unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - default: - return SDValue(); - } -} - SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1260,7 +1230,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); EVT LoVT, HiVT; EVT LoMemVT, HiMemVT; @@ -1269,23 +1240,27 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + + unsigned Size = LoMemVT.getStoreSize(); + unsigned BaseAlign = Load->getAlignment(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), BaseAlign); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); + DAG.getConstant(Size, SL, PtrVT)); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), HiAlign); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), @@ -1415,7 +1390,11 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, DAG.getConstant(LoMemVT.getStoreSize(), SL, PtrVT)); - MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Store->getAlignment(); + unsigned Size = LoMemVT.getStoreSize(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, @@ -1423,15 +1402,15 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, LoMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + BaseAlign); SDValue HiStore = DAG.getTruncStore(Chain, SL, Hi, HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), + SrcValue.getWithOffset(Size), HiMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + HiAlign); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -1529,7 +1508,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); @@ -1630,6 +1609,7 @@ SDValue 
AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // float fb = (float)ib; SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + // TODO: Should this propagate fast-math-flags? // float fq = native_divide(fa, fb); SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); @@ -1940,6 +1920,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); SDValue Y = Op.getOperand(1); + // TODO: Should this propagate fast-math-flags? + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); @@ -1968,6 +1950,7 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2045,6 +2028,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + // TODO: Should this propagate fast-math-flags? + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); @@ -2074,6 +2059,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + // TODO: Should this propagate fast-math-flags? + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); @@ -2184,9 +2171,149 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + if (ZeroUndef && Src.getValueType() == MVT::i32) + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::i32); + + SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + + SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); + SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + + const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); + SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); + + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + + if (!ZeroUndef) { + // Test if the full 64-bit input is zero. + + // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, + // which we probably don't want. 
+ SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + + // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction + // with the same cycles, otherwise it is slower. + // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, + // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); + + const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); + + // The instruction returns -1 for 0 input, but the defined intrinsic + // behavior is to return the number of bits. + NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewCtlz); + } + + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + // Unsigned + // cul2f(ulong u) + //{ + // uint lz = clz(u); + // uint e = (u != 0) ? 127U + 63U - lz : 0; + // u = (u << lz) & 0x7fffffffffffffffUL; + // ulong t = u & 0xffffffffffUL; + // uint v = (e << 23) | (uint)(u >> 40); + // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); + // return as_float(v + r); + //} + // Signed + // cl2f(long l) + //{ + // long s = l >> 63; + // float r = cul2f((l + s) ^ s); + // return s ? -r : r; + //} + + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + SDValue L = Src; + + SDValue S; + if (Signed) { + const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); + S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); + + SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); + L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); + } + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::f32); + + + SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); + SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); + SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); + LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); + + SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); + SDValue E = DAG.getSelect(SL, MVT::i32, + DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), + DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), + ZeroI32); + + SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, + DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), + DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); + + SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, + DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); + + SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, + U, DAG.getConstant(40, SL, MVT::i64)); + + SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, + DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), + DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); + + SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); + SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); + SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); + + SDValue R = DAG.getSelect(SL, MVT::i32, + RCmp, + One, + DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); + R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); + R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); + + if (!Signed) + return R; + + SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); + return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); +} + SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const { SDLoc SL(Op); @@ -2206,40 +2333,35 @@ SDValue 
AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, DAG.getConstant(32, SL, MVT::i32)); - + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); } SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); EVT DestVT = Op.getValueType(); if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, false); - assert(DestVT == MVT::f32); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, false); - SDLoc DL(Op); - - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); + return SDValue(); } SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, true); + + if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, true); return SDValue(); @@ -2257,7 +2379,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, MVT::f64); SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, MVT::f64); - + // TODO: Should this propagate fast-math-flags? SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); @@ -2474,6 +2596,97 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +static bool isNegativeOne(SDValue Val) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) + return C->isAllOnesValue(); + return false; +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +// Get FFBH node if the incoming op may have been type legalized from a smaller +// type VT. +// Need to match pre-legalized type because the generic legalization inserts the +// add/sub between the select and compare. +static SDValue getFFBH_U32(const TargetLowering &TLI, + SelectionDAG &DAG, SDLoc SL, SDValue Op) { + EVT VT = Op.getValueType(); + EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + if (LegalVT != MVT::i32) + return SDValue(); + + if (VT != MVT::i32) + Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op); + + SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op); + if (VT != MVT::i32) + FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH); + + return FFBH; +} + +// The native instructions return -1 on 0 input. Optimize out a select that +// produces -1 on 0. +// +// TODO: If zero is not undef, we could also do this if the output is compared +// against the bitwidth. 
+// +// TODO: Should probably combine against FFBH_U32 instead of ctlz directly. +SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, + SDValue Cond, + SDValue LHS, + SDValue RHS, + DAGCombinerInfo &DCI) const { + ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (!CmpRhs || !CmpRhs->isNullValue()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue CmpLHS = Cond.getOperand(0); + + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + if (CCOpcode == ISD::SETEQ && + isCtlzOpc(RHS.getOpcode()) && + RHS.getOperand(0) == CmpLHS && + isNegativeOne(LHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + if (CCOpcode == ISD::SETNE && + isCtlzOpc(LHS.getOpcode()) && + LHS.getOperand(0) == CmpLHS && + isNegativeOne(RHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32 && Cond.hasOneUse()) + return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + + // There's no reason to not do this if the condition has other uses. + return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2498,29 +2711,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } - } - - break; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2652,20 +2844,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->isExactlyValue(1.0); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isAllOnesValue(); - } - return false; + return isAllOnesConstant(Op); } bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->getValueAPF().isZero(); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isNullValue(); - } - return false; + return isNullConstant(Op); } SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, @@ -2738,7 +2924,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) + NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) @@ -2893,8 +3079,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return 1; unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) + if (!isNullConstant(Op.getOperand(1))) return SignBits; // TODO: Could probably figure something out with non-0 offsets. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 478b203..3792541 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -54,6 +54,9 @@ private: SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -67,6 +70,9 @@ private: SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS, + DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -109,6 +115,8 @@ protected: SmallVectorImpl<ISD::InputArg> &OrigIns) const; void AnalyzeFormalArguments(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const; + void AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const; public: AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); @@ -138,6 +146,7 @@ public: bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AS) const override; + bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ 
-149,6 +158,9 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, @@ -165,14 +177,6 @@ public: SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - SDValue CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; const char* getTargetNodeName(unsigned Opcode) const override; @@ -216,7 +220,7 @@ public: /// \brief Helper function that returns the byte offset of the given /// type of implicit parameter. - unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const; }; @@ -267,7 +271,7 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. + FFBH_U32, // ctlz with -1 if input is zero. MUL_U24, MUL_I24, MAD_U24, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 15a3d54..a266e71 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -164,11 +164,6 @@ MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( // TODO: Implement this function return nullptr; } -bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // TODO: Implement this function - return false; -} bool AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, @@ -312,7 +307,9 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + unsigned IgnoredFrameReg; + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( + MF, -1, IgnoredFrameReg); return getIndirectIndexBegin(MF) + Offset; } @@ -367,3 +364,14 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +ArrayRef<std::pair<int, const char *>> +AMDGPUInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 86d3962..53e8b23 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -103,8 +103,6 @@ public: /// read or write or -1 if indirect addressing is not used by this program. 
int getIndirectIndexEnd(const MachineFunction &MF) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const override; @@ -147,6 +145,9 @@ public: return get(pseudoToMCOpcode(Opcode)); } + ArrayRef<std::pair<int, const char *>> + getSerializableTargetIndices() const override; + //===---------------------------------------------------------------------===// // Pure virtual funtions to be implemented by sub-classes. //===---------------------------------------------------------------------===// @@ -195,6 +196,7 @@ public: }; namespace AMDGPU { + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); } // End namespace AMDGPU diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b413897..575dfe4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,7 +191,7 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; +def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. @@ -242,4 +242,4 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai // Call/Return DAG Nodes //===----------------------------------------------------------------------===// def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 72cab39..2a7ce6a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -204,14 +204,6 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; -def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -243,14 +235,6 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; -def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -299,16 +283,6 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), return isGlobalStore(dyn_cast<StoreSDNode>(N)); }]>; -def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return 
isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - -def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - def local_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isLocalStore(dyn_cast<StoreSDNode>(N)); @@ -385,15 +359,6 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; -def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def flat_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)); -}]>; - def mskor_flat : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; @@ -514,7 +479,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), + (sub_type (extractelt vec_type:$src, sub_idx)), (EXTRACT_SUBREG $src, sub_reg) >; @@ -522,7 +487,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, class Insert_Element <ValueType elem_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (insertelt vec_type:$vec, elem_type:$elem, sub_idx), (INSERT_SUBREG $vec, $elem, sub_reg) >; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index ab489cd..1de3546 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; } // Legacy names for compatibility. 
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2083146..dfc652f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -61,7 +61,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createImm(MO.getImm()); break; case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( @@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); @@ -104,10 +97,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { #endif if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator I = MI; - ++I; - while (I != MBB->end() && I->isInsideBundle()) { - EmitInstruction(I); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); ++I; } } else { @@ -136,8 +128,6 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>()); - CodeStream.flush(); - HexLines.resize(HexLines.size() + 1); std::string &HexLine = HexLines.back(); raw_string_ostream HexStream(HexLine); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 21c7da6..5413717 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,11 +1,10 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" using namespace llvm; -static const char *const ShaderTypeAttribute = "ShaderType"; - // Pin the vtable to this file. 
void AMDGPUMachineFunction::anchor() {} @@ -13,13 +12,9 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), LDSSize(0), + ABIArgOffset(0), ScratchSize(0), IsKernel(true) { - Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) - llvm_unreachable("Can't parse shader type!"); - } + ShaderType = AMDGPU::getShaderType(*MF.getFunction()); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index f5e4694..46fcee8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -37,6 +37,11 @@ public: return ShaderType; } + bool isKernel() const { + // FIXME: Assume everything is a kernel until function calls are supported. + return true; + } + unsigned ScratchSize; bool IsKernel; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp new file mode 100644 index 0000000..554bf1d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -0,0 +1,373 @@ +//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass resolves calls to OpenCL image attribute, image resource ID and +/// sampler resource ID getter functions. +/// +/// Image attributes (size and format) are expected to be passed to the kernel +/// as kernel arguments immediately following the image argument itself, +/// therefore this pass adds image size and format arguments to the kernel +/// functions in the module. The kernel functions with image arguments are +/// re-created using the new signature. The new arguments are added to the +/// kernel metadata with kernel_arg_type set to "image_size" or "image_format". +/// Note: this pass may invalidate pointers to functions. +/// +/// Resource IDs of read-only images, write-only images and samplers are +/// defined to be their index among the kernel arguments of the same +/// type and access qualifier. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; +StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; +StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; +StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id"; + +StringRef ImageSizeArgMDType = "__llvm_image_size"; +StringRef ImageFormatArgMDType = "__llvm_image_format"; + +StringRef KernelsMDNodeName = "opencl.kernels"; +StringRef KernelArgMDNodeNames[] = { + "kernel_arg_addr_space", + "kernel_arg_access_qual", + "kernel_arg_type", + "kernel_arg_base_type", + "kernel_arg_type_qual"}; +const unsigned NumKernelArgMDNodes = 5; + +typedef SmallVector<Metadata *, 8> MDVector; +struct KernelArgMD { + MDVector ArgVector[NumKernelArgMDNodes]; +}; + +} // end anonymous namespace + +static inline bool +IsImageType(StringRef TypeString) { + return TypeString == "image2d_t" || TypeString == "image3d_t"; +} + +static inline bool +IsSamplerType(StringRef TypeString) { + return TypeString == "sampler_t"; +} + +static Function * +GetFunctionFromMDNode(MDNode *Node) { + if (!Node) + return nullptr; + + size_t NumOps = Node->getNumOperands(); + if (NumOps != NumKernelArgMDNodes + 1) + return nullptr; + + auto F = mdconst::dyn_extract<Function>(Node->getOperand(0)); + if (!F) + return nullptr; + + // Sanity checks. + size_t ExpectNumArgNodeOps = F->arg_size() + 1; + for (size_t i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1)); + if (ArgNode->getNumOperands() != ExpectNumArgNodeOps) + return nullptr; + if (!ArgNode->getOperand(0)) + return nullptr; + + // FIXME: It should be possible to do image lowering when some metadata + // args missing or not in the expected order. 
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0)); + if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i]) + return nullptr; + } + + return F; +} + +static StringRef +AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2)); + return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString(); +} + +static StringRef +ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3)); + return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString(); +} + +static MDVector +GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) { + MDVector Res; + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1)); + Res.push_back(Node->getOperand(OpIdx)); + } + return Res; +} + +static void +PushArgMD(KernelArgMD &MD, const MDVector &V) { + assert(V.size() == NumKernelArgMDNodes); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MD.ArgVector[i].push_back(V[i]); + } +} + +namespace { + +class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { + static char ID; + + LLVMContext *Context; + Type *Int32Type; + Type *ImageSizeType; + Type *ImageFormatType; + SmallVector<Instruction *, 4> InstsToErase; + + bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID, + Argument &ImageSizeArg, + Argument &ImageFormatArg) { + bool Modified = false; + + for (auto &Use : ImageArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name.startswith(GetImageResourceIDFunc)) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else if (Name.startswith(GetImageSizeFunc)) { + Replacement = &ImageSizeArg; + } else if (Name.startswith(GetImageFormatFunc)) { + Replacement = &ImageFormatArg; + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) { + bool Modified = false; + + for (const auto &Use : SamplerArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name == GetSamplerResourceIDFunc) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) { + uint32_t NumReadOnlyImageArgs = 0; + uint32_t NumWriteOnlyImageArgs = 0; + uint32_t NumSamplerArgs = 0; + + bool Modified = false; + InstsToErase.clear(); + for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) { + Argument &Arg = *ArgI; + StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo()); + + // Handle image types. 
+ if (IsImageType(Type)) { + StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo()); + uint32_t ResourceID; + if (AccessQual == "read_only") { + ResourceID = NumReadOnlyImageArgs++; + } else if (AccessQual == "write_only") { + ResourceID = NumWriteOnlyImageArgs++; + } else { + llvm_unreachable("Wrong image access qualifier."); + } + + Argument &SizeArg = *(++ArgI); + Argument &FormatArg = *(++ArgI); + Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg); + + // Handle sampler type. + } else if (IsSamplerType(Type)) { + uint32_t ResourceID = NumSamplerArgs++; + Modified |= replaceSamplerUses(Arg, ResourceID); + } + } + for (unsigned i = 0; i < InstsToErase.size(); ++i) { + InstsToErase[i]->eraseFromParent(); + } + + return Modified; + } + + std::tuple<Function *, MDNode *> + addImplicitArgs(Function *F, MDNode *KernelMDNode) { + bool Modified = false; + + FunctionType *FT = F->getFunctionType(); + SmallVector<Type *, 8> ArgTypes; + + // Metadata operands for new MDNode. + KernelArgMD NewArgMDs; + PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0)); + + // Add implicit arguments to the signature. + for (unsigned i = 0; i < FT->getNumParams(); ++i) { + ArgTypes.push_back(FT->getParamType(i)); + MDVector ArgMD = GetArgMD(KernelMDNode, i + 1); + PushArgMD(NewArgMDs, ArgMD); + + if (!IsImageType(ArgTypeFromMD(KernelMDNode, i))) + continue; + + // Add size implicit argument. + ArgTypes.push_back(ImageSizeType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + // Add format implicit argument. + ArgTypes.push_back(ImageFormatType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + Modified = true; + } + if (!Modified) { + return std::make_tuple(nullptr, nullptr); + } + + // Create function with new signature and clone the old body into it. + auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false); + auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName()); + ValueToValueMapTy VMap; + auto NewFArgIt = NewF->arg_begin(); + for (auto &Arg: F->args()) { + auto ArgName = Arg.getName(); + NewFArgIt->setName(ArgName); + VMap[&Arg] = &(*NewFArgIt++); + if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) { + (NewFArgIt++)->setName(Twine("__size_") + ArgName); + (NewFArgIt++)->setName(Twine("__format_") + ArgName); + } + } + SmallVector<ReturnInst*, 8> Returns; + CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + + // Build new MDNode. + SmallVector<llvm::Metadata *, 6> KernelMDArgs; + KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) + KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); + + return std::make_tuple(NewF, NewMDNode); + } + + bool transformKernels(Module &M) { + NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName); + if (!KernelsMDNode) + return false; + + bool Modified = false; + for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) { + MDNode *KernelMDNode = KernelsMDNode->getOperand(i); + Function *F = GetFunctionFromMDNode(KernelMDNode); + if (!F) + continue; + + Function *NewF; + MDNode *NewMDNode; + std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode); + if (NewF) { + // Replace old function and metadata with new ones. 
+ F->eraseFromParent(); + M.getFunctionList().push_back(NewF); + M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(), + NewF->getAttributes()); + KernelsMDNode->setOperand(i, NewMDNode); + + F = NewF; + KernelMDNode = NewMDNode; + Modified = true; + } + + Modified |= replaceImageAndSamplerUses(F, KernelMDNode); + } + + return Modified; + } + + public: + AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + Context = &M.getContext(); + Int32Type = Type::getInt32Ty(M.getContext()); + ImageSizeType = ArrayType::get(Int32Type, 3); + ImageFormatType = ArrayType::get(Int32Type, 2); + + return transformKernels(M); + } + + const char *getPassName() const override { + return "AMDGPU OpenCL Image Type Pass"; + } +}; + +char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; + +} // end anonymous namespace + +ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { + return new AMDGPUOpenCLImageTypeLoweringPass(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 57b7a73..87d50d5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -54,7 +54,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) { bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - const FunctionType *FTy = F.getFunctionType(); + FunctionType *FTy = F.getFunctionType(); LocalMemAvailable = ST.getLocalMemorySize(); @@ -63,7 +63,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); + Type *ParamTy = FTy->getParamType(i); if (ParamTy->isPointerTy() && ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemAvailable = 0; @@ -77,7 +77,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Check how much local memory is being used by global objects for (Module::global_iterator I = Mod->global_begin(), E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; + GlobalVariable *GV = &*I; PointerType *GVTy = GV->getType(); if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; @@ -101,7 +101,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { return false; } -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { +static VectorType *arrayTypeToVecType(Type *ArrayTy) { return VectorType::get(ArrayTy->getArrayElementType(), ArrayTy->getArrayNumElements()); } @@ -276,6 +276,9 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + if (!I.isStaticAlloca()) + return; + IRBuilder<> Builder(&I); // First try to replace the alloca with a vector diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h index cfd800b..0344834 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -37,10 +37,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { assert(!"Unimplemented"); return BitVector(); } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - virtual unsigned getHWRegIndex(unsigned Reg) const { assert(!"Unimplemented"); return 0; } diff --git 
a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 5f32a65..c6af5b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -16,6 +16,7 @@ #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" +#include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -44,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // disable it. SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. + FullFS += "+flat-for-global,"; FullFS += FS; if (GPU == "" && TT.getArch() == Triple::amdgcn) @@ -67,26 +70,37 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), + CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), + EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), + EnableXNACK(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), + FrameLowering(nullptr), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { initializeSubtargetDependencies(TT, GPU, FS); + const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16) + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(TM, *this)); + + // FIXME: Should have R600 specific FrameLowering + FrameLowering.reset(new AMDGPUFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } else { InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(TM, *this)); + FrameLowering.reset(new SIFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 735f01d..d371227 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1,4 +1,4 @@ -//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -12,17 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H + #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUISelLowering.h" #include "AMDGPUSubtarget.h" -#include 
"R600ISelLowering.h" -#include "AMDKernelCodeT.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -72,11 +70,13 @@ private: bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; + bool FlatForGlobal; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; + bool EnableXNACK; unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; @@ -88,10 +88,10 @@ private: bool CIInsts; bool FeatureDisable; int LDSBankCount; - unsigned IsaVersion; + unsigned IsaVersion; bool EnableHugeScratchBuffer; - AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUFrameLowering> FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; @@ -104,7 +104,7 @@ public: StringRef GPU, StringRef FS); const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return FrameLowering.get(); } const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); @@ -161,6 +161,10 @@ public: return FlatAddressSpace; } + bool useFlatForGlobal() const { + return FlatForGlobal; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } @@ -287,6 +291,10 @@ public: } bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + bool isXNACKEnabled() const { + return EnableXNACK; + } + unsigned getMaxWavesPerCU() const { if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) return 10; @@ -305,6 +313,9 @@ public: return isAmdHsaOS() ? 0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2297b52..b1be619 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" @@ -41,6 +42,23 @@ extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); + + PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeSILowerI1CopiesPass(*PR); + initializeSIFixSGPRCopiesPass(*PR); + initializeSIFoldOperandsPass(*PR); + initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUAnnotateUniformValuesPass(*PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.getOS() == Triple::AMDHSA) + return make_unique<AMDGPUHSATargetObjectFile>(); + + return make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -48,8 +66,12 @@ static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { } static MachineSchedRegistry -SchedCustomRegistry("r600", "Run R600's custom scheduler", - createR600MachineScheduler); +R600SchedRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static MachineSchedRegistry +SISchedRegistry("si", "Run 
SI's custom scheduler", + createSIMachineScheduler); static std::string computeDataLayout(const Triple &TT) { std::string Ret = "e-p:32:32"; @@ -72,15 +94,13 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} +AMDGPUTargetMachine::~AMDGPUTargetMachine() { } //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) @@ -110,7 +130,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + + // Exceptions and StackMaps are not supported, so these passes will never do + // anything. + disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { return getTM<AMDGPUTargetMachine>(); @@ -126,8 +152,9 @@ public: void addIRPasses() override; void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; + bool addPreISel() override; + bool addInstSelector() override; + bool addGCPasses() override; }; class R600PassConfig : public AMDGPUPassConfig { @@ -147,6 +174,8 @@ public: : AMDGPUPassConfig(TM, PM) { } bool addPreISel() override; bool addInstSelector() override; + void addFastRegAlloc(FunctionPass *RegAllocPass) override; + void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreSched2() override; @@ -156,7 +185,7 @@ public: } // End of anonymous namespace TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo( AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); }); @@ -172,6 +201,10 @@ void AMDGPUPassConfig::addIRPasses() { // functions, then we will generate code for the first function // without ever running any passes on the second. addPass(createBarrierNoopPass()); + + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. + addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + TargetPassConfig::addIRPasses(); } @@ -198,6 +231,11 @@ bool AMDGPUPassConfig::addInstSelector() { return false; } +bool AMDGPUPassConfig::addGCPasses() { + // Do nothing. GC is not supported. + return false; +} + //===----------------------------------------------------------------------===// // R600 Pass Setup //===----------------------------------------------------------------------===// @@ -238,16 +276,23 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + + // FIXME: We need to run a pass to propagate the attributes when calls are + // supported. 
+ addPass(&AMDGPUAnnotateKernelFeaturesID); + addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createSIAnnotateControlFlowPass()); + addPass(createAMDGPUAnnotateUniformValues()); + return false; } bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(&SIFixSGPRCopiesID); addPass(createSIFoldOperandsPass()); return false; } @@ -259,7 +304,6 @@ void GCNPassConfig::addPreRegAlloc() { // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass if (getOptLevel() > CodeGenOpt::None) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } @@ -269,16 +313,27 @@ void GCNPassConfig::addPreRegAlloc() { // This should be run after scheduling, but before register allocation. It // also need extra copies to the address operand to be eliminated. - initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + addPass(&SIFixSGPRLiveRangesID); + TargetPassConfig::addFastRegAlloc(RegAllocPass); +} + +void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // We want to run this after LiveVariables is computed to avoid computing them + // twice. + // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure + // that needs to be fixed. + insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 14792e3..236e3f8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -32,7 +32,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { private: protected: - TargetLoweringObjectFile *TLOF; + std::unique_ptr<TargetLoweringObjectFile> TLOF; AMDGPUSubtarget Subtarget; AMDGPUIntrinsicInfo IntrinsicInfo; @@ -52,7 +52,7 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; + return TLOF.get(); } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp new file mode 100644 index 0000000..e050f21 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetObjectFile.h" +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + +void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM){ + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); + + TextSection = AMDGPU::getHSATextSection(Ctx); + + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast<MCSectionELF>(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. + return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); +} + +MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isText() && !GV->hasComdat()) + return getTextSection(); + + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h new file mode 100644 index 0000000..921341e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -0,0 +1,51 @@ +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares the AMDGPU-specific subclass of +/// TargetLoweringObjectFile. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + +public: + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc74..54a003d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -74,9 +74,109 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { + return Vector ? 0 : 32; +} unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, + const IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::not_intrinsic: + // This means we have an intrinsic that isn't defined in + // IntrinsicsAMDGPU.td + break; + + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + return true; + } + + StringRef Name = I->getCalledFunction()->getName(); + switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { + default: + return false; + case AMDGPUIntrinsic::SI_tid: + case AMDGPUIntrinsic::SI_fs_interp: + return true; + } +} + +static bool isArgPassedInSGPR(const Argument *A) { + const Function *F = A->getParent(); + unsigned ShaderType = AMDGPU::getShaderType(*F); + + // Arguments to compute shaders are never a source of divergence. 
+ if (ShaderType == ShaderType::COMPUTE) + return true; + + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || + F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + return true; + + // Everything else is in VGPRs. + return false; +} + +/// +/// \returns true if the result of the value could potentially be +/// different across workitems in a wavefront. +bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { + + if (const Argument *A = dyn_cast<Argument>(V)) + return !isArgPassedInSGPR(A); + + // Loads from the private address space are divergent, because threads + // can execute the load instruction with the same inputs and get different + // results. + // + // All other loads are not divergent, because if threads issue loads with the + // same arguments, they will always get the same result. + if (const LoadInst *Load = dyn_cast<LoadInst>(V)) + return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + const TargetMachine &TM = getTLI()->getTargetMachine(); + return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); + } + + // Assume all function calls are a source of divergence. + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + return true; + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69..976afb0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,11 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + unsigned getCFInstrCost(unsigned Opcode); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + bool isSourceOfDivergence(const Value *V) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index d918ac3..917efd1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -185,7 +185,7 @@ protected: MachinePostDominatorTree *PDT; MachineLoopInfo *MLI; const R600InstrInfo *TII; - const AMDGPURegisterInfo *TRI; + const R600RegisterInfo *TRI; // PRINT FUNCTIONS /// Print the ordered Blocks. @@ -881,7 +881,7 @@ bool AMDGPUCFGStructurizer::run() { } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = - GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; DEBUG( @@ -904,7 +904,7 @@ bool AMDGPUCFGStructurizer::run() { } while (!Finish && MakeProgress); // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. 
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); @@ -1164,7 +1164,7 @@ int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); + (*It)->removeSuccessor(LoopHeader, true); } numLoopcontPatternMatch += NumCont; @@ -1353,7 +1353,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // If MigrateTrue is true, then TrueBB is the block being "branched into" // and if MigrateFalse is true, then FalseBB is the block being // "branched into" - // + // // Here is the pseudo code for how I think the optimization should work: // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. @@ -1372,7 +1372,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // the late machine optimization passes, however if we implement // bool TargetRegisterInfo::requiresRegisterScavenging( // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly + // and have it return true, liveness will be tracked correctly // by generic optimization passes. We will also need to make sure that // all of our target-specific passes that run after regalloc and before // the CFGStructurizer track liveness and we will need to modify this pass @@ -1487,7 +1487,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, ); DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - DstMBB->removeSuccessor(SrcMBB); + DstMBB->removeSuccessor(SrcMBB, true); cloneSuccessorList(DstMBB, SrcMBB); removeSuccessor(SrcMBB); @@ -1537,9 +1537,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, if (TrueMBB) { MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); + MBB->removeSuccessor(TrueMBB, true); if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); + TrueMBB->removeSuccessor(LandMBB, true); retireBlock(TrueMBB); MLI->removeBlock(TrueMBB); } @@ -1548,9 +1548,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, insertInstrBefore(I, AMDGPU::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); + MBB->removeSuccessor(FalseMBB, true); if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); + FalseMBB->removeSuccessor(LandMBB, true); retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1592,7 +1591,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, //now branchInst can be erase safely BranchMI->eraseFromParent(); //now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB); + ExitingMBB->removeSuccessor(LandMBB, true); } void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); - 
PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); @@ -1695,10 +1693,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, ); SpliceEnd = SrcMBB->end(); } else { - DEBUG( - dbgs() << "migrateInstruction see branch instr\n" ; - BranchMI->dump(); - ); + DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); SpliceEnd = BranchMI; } DEBUG( @@ -1711,7 +1706,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, DEBUG( dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; + << "srcSize = " << SrcMBB->size() << '\n'; ); } @@ -1743,7 +1738,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. while ((BranchMI = getLoopendBlockBranchInstr(MBB)) && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); } } @@ -1759,10 +1754,10 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); + MBB->removeSuccessor(MBB1, true); } void AMDGPUCFGStructurizer::addDummyExitBlock( diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2018983..d9f753f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -28,7 +28,9 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -83,6 +85,7 @@ public: unsigned RegNo; int Modifiers; const MCRegisterInfo *TRI; + const MCSubtargetInfo *STI; bool IsForcedVOP3; }; @@ -102,7 +105,7 @@ public: } void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); } void addRegOrImmOperands(MCInst &Inst, unsigned N) const { @@ -215,6 +218,10 @@ public: (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); } + bool isSCSrc64() const { + return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + } + bool isVCSrc32() const { return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); } @@ -251,7 +258,22 @@ public: return EndLoc; } - void print(raw_ostream &OS) const override { } + void print(raw_ostream &OS) const override { + switch (Kind) { + case Register: + OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>'; + break; + case Immediate: + OS << getImm(); + break; + case Token: + OS << '\'' << getToken() << '\''; + break; + case Expression: + OS << "<expr " << *Expr << '>'; + break; + } + } static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc 
Loc, enum ImmTy Type = ImmTyNone, @@ -278,10 +300,12 @@ public: static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, SMLoc E, const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, bool ForceVOP3) { auto Op = llvm::make_unique<AMDGPUOperand>(Register); Op->Reg.RegNo = RegNo; Op->Reg.TRI = TRI; + Op->Reg.STI = STI; Op->Reg.Modifiers = -1; Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; @@ -301,14 +325,32 @@ public: bool isDSOffset01() const; bool isSWaitCnt() const; bool isMubufOffset() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; }; class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize; + + bool isSI() const { + return AMDGPU::isSI(getSTI()); + } + + bool isCI() const { + return AMDGPU::isCI(getSTI()); + } + + bool isVI() const { + return AMDGPU::isVI(getSTI()); + } + + bool hasSGPR102_SGPR103() const { + return !isVI(); + } + /// @name Auto-generated Match Functions /// { @@ -323,20 +365,34 @@ private: bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); + bool ParseSectionDirectiveHSAText(); + bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; + bool ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, +public: + enum AMDGPUMatchResultTy { + Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY + }; + + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0) { + MCAsmParserExtension::Initialize(Parser); - if (STI.getFeatureBits().none()) { + if (getSTI().getFeatureBits().none()) { // Set default features. 
- STI.ToggleFeature("SOUTHERN_ISLANDS"); + copySTI().ToggleFeature("SOUTHERN_ISLANDS"); } - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } AMDGPUTargetStreamer &getTargetStreamer() { @@ -420,10 +476,10 @@ struct OptionalOperand { } -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { +static int getRegClass(bool IsVgpr, unsigned RegWidth) { if (IsVgpr) { switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; @@ -434,7 +490,7 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::SGPR_32RegClassID; case 2: return AMDGPU::SGPR_64RegClassID; case 4: return AMDGPU::SReg_128RegClassID; @@ -443,16 +499,16 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } } -static unsigned getRegForName(const StringRef &RegName) { +static unsigned getRegForName(StringRef RegName) { return StringSwitch<unsigned>(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -464,12 +520,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End const AsmToken Tok = Parser.getTok(); StartLoc = Tok.getLoc(); EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + StringRef RegName = Tok.getString(); RegNo = getRegForName(RegName); if (RegNo) { Parser.Lex(); - return false; + return !subtargetHasRegister(*TRI, RegNo); } // Match vgprs and sgprs @@ -514,16 +572,24 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End RegIndexInClass = RegLo; } else { // SGPR registers are aligned. Max alignment is 4 dwords. 
- RegIndexInClass = RegLo / std::min(RegWidth, 4u); + unsigned Size = std::min(RegWidth, 4u); + if (RegLo % Size != 0) + return true; + + RegIndexInClass = RegLo / Size; } } - const MCRegisterInfo *TRC = getContext().getRegisterInfo(); - unsigned RC = getRegClass(IsVgpr, RegWidth); - if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + int RCID = getRegClass(IsVgpr, RegWidth); + if (RCID == -1) return true; - RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); - return false; + + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIndexInClass >= RC.getNumRegs()) + return true; + + RegNo = RC.getRegister(RegIndexInClass); + return !subtargetHasRegister(*TRI, RegNo); } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -534,6 +600,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) return Match_InvalidOperand; + if ((TSFlags & SIInstrFlags::VOP3) && + (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) && + getForcedEncodingSize() != 64) + return Match_PreferE32; + return Match_Success; } @@ -549,7 +620,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, default: break; case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction not supported on this GPU"); @@ -592,6 +663,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } return Error(ErrorLoc, "invalid operand for instruction"); } + case Match_PreferE32: + return Error(IDLoc, "internal error: instruction without _e64 suffix " + "should be encoded as e32"); } llvm_unreachable("Implement any new match types added!"); } @@ -640,7 +714,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, Isa.Stepping, "AMD", "AMDGPU"); @@ -852,7 +926,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, STI.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); while (true) { @@ -882,6 +956,64 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { return false; } +bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSATextSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef KernelName = Parser.getTok().getString(); + + getTargetStreamer().EmitAMDGPUSymbolType(KernelName, + ELF::STT_AMDGPU_HSA_KERNEL); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -894,6 +1026,55 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); + if (IDVal == ".hsatext" || IDVal == ".text") + return ParseSectionDirectiveHSAText(); + + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); + + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + + return true; +} + +bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, + unsigned RegNo) const { + if (isCI()) + return true; + + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + 
case AMDGPU::FLAT_SCR_HI: + return false; + default: + return true; + } + } + + // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that + // SI/CI have. + for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); + R.isValid(); ++R) { + if (*R == RegNo) + return false; + } + return true; } @@ -943,13 +1124,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { int64_t IntVal; if (getParser().parseAbsoluteExpression(IntVal)) return MatchOperand_ParseFail; - APInt IntVal32(32, IntVal); - if (IntVal32.getSExtValue() != IntVal) { + if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { Error(S, "invalid immediate: only 32-bit values are legal"); return MatchOperand_ParseFail; } - IntVal = IntVal32.getSExtValue(); if (Negate) IntVal *= -1; Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); @@ -1002,7 +1181,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { Operands.push_back(AMDGPUOperand::CreateReg( - RegNo, S, E, getContext().getRegisterInfo(), + RegNo, S, E, getContext().getRegisterInfo(), &getSTI(), isForcedVOP3())); if (HasModifiers || Modifiers) { @@ -1571,6 +1750,23 @@ AMDGPUAsmParser::parseR128(OperandVector &Operands) { } //===----------------------------------------------------------------------===// +// smrd +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isSMRDOffset() const { + + // FIXME: Support 20-bit offsets on VI. We need to to pass subtarget + // information here. + return isImm() && isUInt<8>(getImm()); +} + +bool AMDGPUOperand::isSMRDLiteralOffset() const { + // 32-bit literals are only supported on CI and we only want to use them + // when the offset is > 8-bits. + return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); +} + +//===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -1653,8 +1849,12 @@ AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { } void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); - unsigned i = 2; + + unsigned i = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + if (Desc.getNumDefs() > 0) { + ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1); + } std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td index 2f5fdbe..c543814 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td @@ -8,6 +8,22 @@ //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. 
//===----------------------------------------------------------------------===// +// Remaining instructions: +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 def isCIVI : Predicate < @@ -23,6 +39,7 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; let SubtargetPredicate = isCIVI in { +let SchedRW = [WriteDoubleAdd] in { defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", VOP_F64_F64, ftrunc >; @@ -35,115 +52,282 @@ defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", VOP_F64_F64, frint >; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", VOP_F32_F32 >; defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", VOP_F32_F32 >; +} // End SchedRW = [WriteQuarterRate32] + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; + +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, + "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, + "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol +>; //===----------------------------------------------------------------------===// // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; -def 
FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; -def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x1d, "flat_store_dwordx2", VReg_64 +defm FLAT_LOAD_UBYTE : FLAT_Load_Helper < + flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32 +>; +defm FLAT_LOAD_SBYTE : FLAT_Load_Helper < + flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32 +>; +defm FLAT_LOAD_USHORT : FLAT_Load_Helper < + flat<0xa, 0x12>, "flat_load_ushort", VGPR_32 +>; +defm FLAT_LOAD_SSHORT : FLAT_Load_Helper < + flat<0xb, 0x13>, "flat_load_sshort", VGPR_32> +; +defm FLAT_LOAD_DWORD : FLAT_Load_Helper < + flat<0xc, 0x14>, "flat_load_dword", VGPR_32 +>; +defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper < + flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64 +>; +defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper < + flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128 +>; +defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper < + flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96 >; -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x1e, "flat_store_dwordx4", VReg_128 +defm FLAT_STORE_BYTE : FLAT_Store_Helper < + flat<0x18>, "flat_store_byte", VGPR_32 >; -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x1f, "flat_store_dwordx3", VReg_96 +defm FLAT_STORE_SHORT : FLAT_Store_Helper < + flat <0x1a>, "flat_store_short", VGPR_32 +>; +defm FLAT_STORE_DWORD : FLAT_Store_Helper < + flat<0x1c>, "flat_store_dword", VGPR_32 +>; +defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + flat<0x1d>, "flat_store_dwordx2", VReg_64 +>; +defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128 +>; +defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 +>; +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 >; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; -defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; -defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 +>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 +>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 +>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 +>; +defm 
FLAT_ATOMIC_SMAX : FLAT_ATOMIC < + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 +>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 +>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC < + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 +>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC < + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 +>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < + flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32 +>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC < + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 +>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 +>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < + flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; -defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 +>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 +>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 +>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 +>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 +>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 +>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; } // End SubtargetPredicate = isCIVI +// CI Only flat instructions + +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { + +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 
flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < + flat<0x3f>, "flat_atomic_fmin", VGPR_32 +>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < + flat<0x40>, "flat_atomic_fmax", VGPR_32 +>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 +>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < + flat<0x60>, "flat_atomic_fmax_x2", VReg_64 +>; + +} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// -let Predicates = [HasFlatAddressSpace] in { +let Predicates = [isCIVI] in { -class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt, - PatFrag flat_ld> : - Pat <(vt (flat_ld i64:$ptr)), - (Instr_ADDR64 $ptr, 0, 0, 0) +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) >; -def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>; -def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; -class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> : - Pat <(st vt:$value, i64:$ptr), - (Instr $value, $ptr, 0, 0, 0) - >; +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; -def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>; -def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>; -def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>; -def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; +def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, flat_store, 
i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; -} // End HasFlatAddressSpace predicate +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; +} // End Predicates = [isCIVI] diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td index ba4df82..a6c3785 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -82,6 +82,10 @@ def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>; def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>; +def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { + let eop = 0; // This bit is not used on Cayman. +} + class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 7adcd46..2245f14 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -40,6 +40,15 @@ class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, "MEM_RAT "#name, pattern>; +class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> + : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), + "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" + #!if(has_eop, ", $eop", ""), + [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, + R600_Reg128:$index_gpr, + (i32 imm:$rat_id))]>; + def RAT_MSKOR : CF_MEM_RAT <0x11, 0, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), "MSKOR $rw_gpr.XW, $index_gpr", @@ -105,6 +114,8 @@ def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, [(global_store v4i32:$rw_gpr, i32:$index_gpr)] >; +def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; + } // End usesCustomInserter = 1 class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> @@ -338,7 +349,7 @@ def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; let hasSideEffects = 1 in { diff --git 
a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index e811d5c..a187de8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -283,8 +284,13 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == DoubleToBits(-4.0)) O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); + else { + assert(isUInt<32>(Imm)); + + // In rare situations, we will have a 32-bit literal in a 64-bit + // operand. This is technically allowed for the encoding of s_mov_b64. + O << formatHex(static_cast<uint64_t>(Imm)); + } } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -592,11 +598,11 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, } else { unsigned Stream = (SImm16 >> 8) & 0x3; if (Op == 1) - O << "cut"; + O << "cut"; else if (Op == 2) - O << "emit"; + O << "emit"; else if (Op == 3) - O << "emit-cut"; + O << "emit-cut"; O << " stream " << Stream; } O << "), [m0] "; diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 14fb511..90541d8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -13,9 +13,7 @@ #ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H #define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 4434d9b..60e8c8f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, case AMDGPU::fixup_si_rodata: { uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. + // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. 
*Dst = Value + 4; break; } @@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp new file mode 100644 index 0000000..9ff9fe7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -0,0 +1,26 @@ +//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUELFStreamer.h" +#include "Utils/AMDGPUBaseInfo.h" + +using namespace llvm; + +void AMDGPUELFStreamer::InitSections(bool NoExecStack) { + // Start with the .hsatext section by default. + SwitchSection(AMDGPU::getHSATextSection(getContext())); +} + +MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context, + MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll) { + return new AMDGPUELFStreamer(Context, MAB, OS, Emitter); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h new file mode 100644 index 0000000..488d7e7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -0,0 +1,40 @@ +//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a custom MCELFStreamer which allows us to insert some hooks before +// emitting data into an actual object file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCSubtargetInfo; + +class AMDGPUELFStreamer : public MCELFStreamer { +public: + AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, MAB, OS, Emitter) { } + + virtual void InitSections(bool NoExecStac) override; +}; + +MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll); +} // namespace llvm. 
+ +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 01021d6..59a9178 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -21,9 +21,6 @@ enum Fixups { /// fixup for global addresses with constant initializers fixup_si_rodata, - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 028a86d..4bc80a0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,22 +22,21 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; //===--- Global Variable Emission Directives --------------------------===// HasAggressiveSymbolFolding = true; COMMDirectiveAlignmentIsInBytes = false; - HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; } + +bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || + MCAsmInfo::shouldOmitSectionDirective(SectionName); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index a5bac51..a546961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -21,12 +21,13 @@ class Triple; // If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, // you will need to make sure your new class sets PrivateGlobalPrefix to -// a prefix that won't appeary in a fuction name. The default value +// a prefix that won't appear in a function name. The default value // for PrivateGlobalPrefix is 'L', so it will consider any function starting // with 'L' as a local symbol. 
class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(const Triple &TT); + bool shouldOmitSectionDirective(StringRef SectionName) const override; }; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c709741..f704094 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCTargetDesc.h" +#include "AMDGPUELFStreamer.h" #include "AMDGPUMCAsmInfo.h" #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" @@ -85,6 +86,15 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer( return new AMDGPUTargetELFStreamer(S); } +static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + if (T.getOS() == Triple::AMDHSA) + return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll); + + return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); +} + extern "C" void LLVMInitializeAMDGPUTargetMC() { for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); @@ -95,6 +105,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); } // R600 specific registration diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 09e6cb1..b91134d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" @@ -220,6 +221,26 @@ AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { } +void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + switch (Type) { + default: llvm_unreachable("Invalid AMDGPU symbol type"); + case ELF::STT_AMDGPU_HSA_KERNEL: + OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ; + break; + } +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -291,7 +312,35 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.SwitchSection(OS.getContext().getObjectFileInfo()->getTextSection()); + // The MCObjectFileInfo that is available to the assembler is a generic + // implementation and not AMDGPUHSATargetObjectFile, so we 
can't use + // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. + OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } + +void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(SymbolName)); + Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index d37677c..83bb728 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H + #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -27,6 +30,12 @@ public: StringRef ArchName) = 0; virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; + + virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -41,6 +50,12 @@ public: StringRef ArchName) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { @@ -72,6 +87,12 @@ public: void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; } +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index e683498..3c1142d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -37,7 +37,6 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; public: - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) : MCII(mcii), MRI(mri) { } @@ -50,8 +49,8 @@ public: uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; -private: 
+private: void EmitByte(unsigned int byte, raw_ostream &OS) const; void Emit(uint32_t value, raw_ostream &OS) const; @@ -59,7 +58,6 @@ private: unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; - }; } // End anonymous namespace @@ -83,7 +81,7 @@ enum FCInstr { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - MCContext &Ctx) { + MCContext &Ctx) { return new R600MCCodeEmitter(MCII, MRI); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 65a0eeb..9eb3dad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } + : MCII(mcii), MRI(mri) { } ~SIMCCodeEmitter() override {} @@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, if (MO.isExpr()) { const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. 
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index d9a0723..a1584a2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -142,3 +142,7 @@ def : ProcessorModel<"carrizo", SIQuarterSpeedModel, def : ProcessorModel<"fiji", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index c8f37f6..bd80bb2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -405,8 +405,8 @@ private: if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); + getLiteral(&*BI, Literals); + ClauseContent.push_back(&*BI); } I = BI; DeleteMI->eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 4e4d554..124a9c6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -190,6 +190,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::Source); } +static inline bool isEOP(MachineBasicBlock::iterator I) { + return std::next(I)->getOpcode() == AMDGPU::RETURN; +} + MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineFunction * MF = BB->getParent(); @@ -276,12 +280,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit + .addImm(isEOP(I)); // Set End of program bit + break; + } + case AMDGPU::RAT_STORE_TYPED_eg: { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } @@ -539,7 +549,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; @@ -946,6 +956,8 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); SDLoc DL(Op); + + // TODO: Should this propagate fast-math-flags? 
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FADD, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, @@ -1936,6 +1948,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, Arg->getOperand(0).getOperand(Element)); } } + break; } case ISD::SELECT_CC: { diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 855fa9f..8b6eea1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -922,7 +922,7 @@ bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const{ + BranchProbability Probability) const{ return true; } @@ -933,14 +933,14 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return true; } bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) + BranchProbability Probability) const { return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h index dee4c2b..e7251c3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -174,18 +174,18 @@ namespace llvm { bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override ; + BranchProbability Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td index 7beed09..33ef6a4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1655,7 +1655,7 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; // ISel Patterns //===----------------------------------------------------------------------===// -// CND*_INT Pattterns for f32 True / False values +// CND*_INT Patterns for f32 True / False values class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 0c06ccc..5efb3b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -318,7 +318,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { MRI = &(Fn.getRegInfo()); for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; + MachineBasicBlock *MB = &*MBB; PreviousRegSeq.clear(); PreviousRegSeqByReg.clear(); PreviousRegSeqByUndefCount.clear(); diff --git 
a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index deee5bc..2126961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -81,11 +81,11 @@ private: int LastDstChan = -1; do { bool isTrans = false; - int BISlot = getSlot(BI); + int BISlot = getSlot(&*BI); if (LastDstChan >= BISlot) isTrans = true; LastDstChan = BISlot; - if (TII->isPredicated(BI)) + if (TII->isPredicated(&*BI)) continue; int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) @@ -95,7 +95,7 @@ private: continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { + if (isTrans || TII->isTransOnly(&*BI)) { Result[Dst] = AMDGPU::PS; continue; } @@ -149,7 +149,7 @@ private: public: // Ctor. R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), + : VLIWPacketizerList(MF, MLI, nullptr), TII(static_cast<const R600InstrInfo *>( MF.getSubtarget().getInstrInfo())), TRI(TII->getRegisterInfo()) { @@ -162,14 +162,14 @@ public: } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { + bool ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { + bool isSoloInstruction(const MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -375,7 +375,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // instruction stream until we find the nearest boundary. 
MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) break; } I = MBB->begin(); @@ -392,7 +392,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { continue; } - Packetizer.PacketizeMIs(MBB, I, RegionEnd); + Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); RegionEnd = I; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h index 9713e60..4f8a129 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -35,7 +35,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { /// \brief get the register class of the specified type to use in the /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; const RegClassWeight & getRegClassWeight(const TargetRegisterClass *RC) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index ccfbf1b..fa4d24a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -312,11 +312,10 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) Preds.push_back(*PI); } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); + CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); } /// \brief Annotate the control flow with intrinsics so the backend can diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 4c32639..aa1e352 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -37,7 +37,8 @@ enum { MIMG = 1 << 18, FLAT = 1 << 19, WQM = 1 << 20, - VGPRSpill = 1 << 21 + VGPRSpill = 1 << 21, + VOPAsmPrefer32Bit = 1 << 22 }; } @@ -136,7 +137,7 @@ namespace SIOutMods { #define C_00B84C_EXCP_EN #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC - +#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp index 5fe8d19..636750d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -16,15 +16,9 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 23502b4..f59d994 100644 --- 
a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -82,22 +82,10 @@ using namespace llvm; namespace { class SIFixSGPRCopies : public MachineFunctionPass { - -private: +public: static char ID; - const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + SIFixSGPRCopies() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -105,14 +93,23 @@ public: return "SI Fix SGPR copies"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace +INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + char SIFixSGPRCopies::ID = 0; -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); +char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; + +FunctionPass *llvm::createSIFixSGPRCopiesPass() { + return new SIFixSGPRCopies(); } static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { @@ -128,77 +125,115 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { return false; } -/// This functions walks the use list of Reg until it finds an Instruction -/// that isn't a COPY returns the register class of that instruction. -/// \return The register defined by the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ? - MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } +static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> +getCopyRegClasses(const MachineInstr &Copy, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + + const TargetRegisterClass *SrcRC = + TargetRegisterInfo::isVirtualRegister(SrcReg) ? + MRI.getRegClass(SrcReg) : + TRI.getPhysRegClass(SrcReg); - return RC; + // We don't really care about the subregister here. + // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); + + const TargetRegisterClass *DstRC = + TargetRegisterInfo::isVirtualRegister(DstReg) ? 
+ MRI.getRegClass(DstReg) : + TRI.getPhysRegClass(DstReg); + + return std::make_pair(SrcRC, DstRC); } -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } +static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); +} - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); +static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { +// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. +// +// SGPRx = ... +// SGPRy = REG_SEQUENCE SGPRx, sub0 ... +// VGPRz = COPY SGPRy +// +// ==> +// +// VGPRx = COPY SGPRx +// VGPRz = REG_SEQUENCE VGPRx, sub0 +// +// This exposes immediate folding opportunities when materializing 64-bit +// immediates. +static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII, + MachineRegisterInfo &MRI) { + assert(MI.isRegSequence()); + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) + return false; - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + if (!MRI.hasOneUse(DstReg)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. + MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); + if (!CopyUse.isCopy()) return false; - } - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); - const TargetRegisterClass *SrcRC; + if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + // TODO: Could have multiple extracts? + unsigned SubReg = CopyUse.getOperand(1).getSubReg(); + if (SubReg != AMDGPU::NoSubRegister) return false; - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); + MRI.setRegClass(DstReg, DstRC); + + // SGPRx = ... + // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 
+ // VGPRz = COPY SGPRy + + // => + // VGPRx = COPY SGPRx + // VGPRz = REG_SEQUENCE VGPRx, sub0 + + MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + + for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { + unsigned SrcReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getSubReg(); + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + assert(TRI->isSGPRClass(SrcRC) && + "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); + + SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); + + unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) + .addOperand(MI.getOperand(I)); + + MI.getOperand(I).setReg(TmpReg); + } + + CopyUse.eraseFromParent(); + return true; } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { @@ -207,40 +242,38 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + SmallVector<MachineInstr *, 16> Worklist; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { + I != E; ++I) { MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + default: + continue; + case AMDGPU::COPY: { + // If the destination register is a physical register there isn't really + // much we can do to fix this. 
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) + continue; - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); + TII->moveToVALU(MI); } + break; + } + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + unsigned Reg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; @@ -310,8 +343,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } case AMDGPU::REG_SEQUENCE: { if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) + !hasVGPROperands(MI, TRI)) { + foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); continue; + } DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp index 0c54446..8bda283 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -7,9 +7,8 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define in some cases. +/// \file SALU instructions ignore the execution mask, so we need to modify the +/// live ranges of the registers they define in some cases. /// /// The main case we need to handle is when a def is used in one side of a /// branch and not another. For example: @@ -42,13 +41,15 @@ /// ENDIF /// %use /// -/// Adding this use will make the def live thoughout the IF branch, which is +/// Adding this use will make the def live throughout the IF branch, which is /// what we want. 
#include "AMDGPU.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -79,9 +80,13 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveIntervals>(); + AU.addRequired<LiveVariables>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -90,7 +95,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) @@ -108,40 +113,48 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); - MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); - std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + bool MadeChange = false; + + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + SmallVector<unsigned, 16> SGPRLiveRanges; + + LiveVariables *LV = &getAnalysis<LiveVariables>(); + MachineBasicBlock *Entry = &MF.front(); - // First pass, collect all live intervals for SGPRs - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + // Use a depth first order so that in SSA, we encounter all defs before + // uses. Once the defs of the block have been found, attempt to insert + // SGPR_USE instructions in successor blocks if required. + for (MachineBasicBlock *MBB : depth_first(Entry)) { + for (const MachineInstr &MI : *MBB) { for (const MachineOperand &MO : MI.defs()) { - if (MO.isImplicit()) - continue; + // We should never see a live out def of a physical register, so we also + // do not need to worry about implicit_defs(). unsigned Def = MO.getReg(); if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getInterval(Def))); - } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getRegUnit(Def))); + if (TRI->isSGPRClass(MRI.getRegClass(Def))) { + // Only consider defs that are live outs. We don't care about def / + // use within the same block. + + // LiveVariables does not consider registers that are only used in a + // phi in a sucessor block as live out, unlike LiveIntervals. + // + // This is OK because SIFixSGPRCopies replaced any SGPR phis with + // VGPRs. + if (LV->isLiveOut(Def, *MBB)) + SGPRLiveRanges.push_back(Def); + } } } } - } - // Second pass fix the intervals - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - if (MBB.succ_size() < 2) + if (MBB->succ_size() < 2) continue; - // We have structured control flow, so number of succesors should be two. 
- assert(MBB.succ_size() == 2); - MachineBasicBlock *SuccA = *MBB.succ_begin(); - MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + // We have structured control flow, so the number of successors should be + // two. + assert(MBB->succ_size() == 2); + MachineBasicBlock *SuccA = *MBB->succ_begin(); + MachineBasicBlock *SuccB = *(++MBB->succ_begin()); MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); if (!NCD) @@ -156,37 +169,51 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), *(++NCD->succ_begin())); } - assert(SuccA && SuccB); - for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { - unsigned Reg = RegLR.first; - LiveRange *LR = RegLR.second; - - // FIXME: We could be smarter here. If the register is Live-In to - // one block, but the other doesn't have any SGPR defs, then there - // won't be a conflict. Also, if the branch decision is based on - // a value in an SGPR, then there will be no conflict. - bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); - bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); - - if ((!LiveInToA && !LiveInToB) || - (LiveInToA && LiveInToB)) + + for (unsigned Reg : SGPRLiveRanges) { + // FIXME: We could be smarter here. If the register is Live-In to one + // block, but the other doesn't have any SGPR defs, then there won't be a + // conflict. Also, if the branch condition is uniform then there will be + // no conflict. + bool LiveInToA = LV->isLiveIn(Reg, *SuccA); + bool LiveInToB = LV->isLiveIn(Reg, *SuccB); + + if (!LiveInToA && !LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into neither successor\n"); continue; + } + + if (LiveInToA && LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into both successors\n"); + continue; + } // This interval is live in to one successor, but not the other, so // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << - " BB#" << SuccA->getNumber() << ", BB#" << - SuccB->getNumber() << - " with NCD = " << NCD->getNumber() << '\n'); + DEBUG(dbgs() << "Possible SGPR conflict detected for " + << PrintReg(Reg, TRI, 0) + << " BB#" << SuccA->getNumber() + << ", BB#" << SuccB->getNumber() + << " with NCD = BB#" << NCD->getNumber() << '\n'); + + assert(TargetRegisterInfo::isVirtualRegister(Reg) && + "Not expecting to extend live range of physreg"); // FIXME: Need to figure out how to update LiveRange here so this pass // will be able to preserve LiveInterval analysis. 
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - DEBUG(NCD->getFirstNonPHI()->dump()); + MachineInstr *NCDSGPRUse = + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + + MadeChange = true; + LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); + + DEBUG(NCDSGPRUse->dump()); } } - return false; + return MadeChange; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c288725..6230d1e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -45,6 +45,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. - unsigned CommuteIdx0; - unsigned CommuteIdx1; + unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; + unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { @@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, OpNo = CommuteIdx0; } - if (!CanCommute || !TII->commuteInstruction(MI)) + // One of operands might be an Imm operand, and OpNo may refer to it after + // the call of commuteInstruction() below. Such situations are avoided + // here explicitly as OpNo must be a register operand to be a candidate + // for memory folding. + if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() || + !MI->getOperand(CommuteIdx1).isReg())) + return false; + + if (!CanCommute || + !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) return false; if (!TII->isOperandLegal(MI, OpNo, OpToFold)) @@ -186,6 +196,110 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, return true; } +static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, + unsigned UseOpIdx, + std::vector<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace, + const SIInstrInfo *TII, const SIRegisterInfo &TRI, + MachineRegisterInfo &MRI) { + const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + return; + } + + bool FoldingImm = OpToFold.isImm(); + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode()); + const TargetRegisterClass *FoldRC = + TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + + // Split 64-bit constants into 32-bits for folding. + if (FoldRC->getSize() == 8 && UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + return; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. 
+ if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + return; + + UseMI->setDesc(TII->get(MovOp)); + CopiesToReplace.push_back(UseMI); + } + } + + // Special case for REG_SEQUENCE: We can't fold literals into + // REG_SEQUENCE instructions, so we have to fold them into the + // uses of REG_SEQUENCE. + if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) { + unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + + for (MachineRegisterInfo::use_iterator + RSUse = MRI.use_begin(RegSeqDstReg), + RSE = MRI.use_end(); RSUse != RSE; ++RSUse) { + + MachineInstr *RSUseMI = RSUse->getParent(); + if (RSUse->getSubReg() != RegSeqDstSubReg) + continue; + + foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); + } + return; + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[UseOpIdx].RegClass == -1) + return; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; + } + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + return; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -220,94 +334,50 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !MRI.hasOneUse(MI.getOperand(0).getReg())) continue; - // FIXME: Fold operands with subregs. if (OpToFold.isReg() && - (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || - OpToFold.getSubReg())) + !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) continue; + // Prevent folding operands backwards in the function. For example, + // the COPY opcode must not be replaced by 1 in this example: + // + // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3 + // ... + // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use> + MachineOperand &Dst = MI.getOperand(0); + if (Dst.isReg() && + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; + + // We need mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + std::vector<FoldCandidate> FoldList; for (MachineRegisterInfo::use_iterator Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? 
- MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. - if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. The shrink operands pass - // already does this. + foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); } + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(MF); + for (FoldCandidate &Fold : FoldList) { if (updateOperand(Fold, TRI)) { // Clear kill flags. if (!Fold.isImm()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI.clearKillFlags(Fold.OpToFold->getReg()); } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp new file mode 100644 index 0000000..7d20509 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -0,0 +1,245 @@ +//===----------------------- SIFrameLowering.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +#include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" + +using namespace llvm; + + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef<MCPhysReg> getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef<MCPhysReg> getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + // We need to insert initialization of the scratch resource descriptor. + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + if (!ST.hasSGPRInitBug()) { + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. 
+ + // FIXME: We should be able to eliminate unused input registers. We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven’t added its uses yet. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. 
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + +void SIFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); + + assert((RS || !MayNeedScavengingEmergencySlot) && + "RegScavenger required if spilling"); + + if (MayNeedScavengingEmergencySlot) { + int ScavengeFI = MFI->CreateSpillStackObject( + AMDGPU::SGPR_32RegClass.getSize(), + AMDGPU::SGPR_32RegClass.getAlignment()); + RS->addScavengingFrameIndex(ScavengeFI); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h new file mode 100644 index 0000000..a9152fd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -0,0 +1,34 @@ +//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class SIFrameLowering final : public AMDGPUFrameLowering { +public: + SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + ~SIFrameLowering() override {} + + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + + void processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS = nullptr) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2db9ff..5448675 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" @@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); @@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: break; case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } } + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -215,7 +259,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); @@ -261,6 +304,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); } +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } +} + bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { @@ -269,7 +347,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: { if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. 
@@ -282,51 +360,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // because it has never been validated. return isLegalFlatAddressingMode(AM); } - // fall-through - case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and - // additionally can do r + r + i with addr64. 32-bit has more addressing - // mode options. Depending on the resource constant, it can also do - // (i64 r0) + (i32 r1) * (i14 i). - // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? + return isLegalMUBUFAddressingMode(AM); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // If the offset isn't a multiple of 4, it probably isn't going to be + // correctly aligned. + if (AM.BaseOffs % 4 != 0) + return isLegalMUBUFAddressingMode(AM); + + // There are no SMRD extloads, so if we have to do a small type access we + // will use a MUBUF load. + // FIXME?: We also need to do this if unaligned, but we don't know the + // alignment here. + if (DL.getTypeStoreSize(Ty) < 4) + return isLegalMUBUFAddressingMode(AM); + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // SMRD instructions have an 8-bit, dword offset on SI. + if (!isUInt<8>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + // On CI+, this can also be a 32-bit literal constant offset. If it fits + // in 8-bits, it can use a smaller encoding. + if (!isUInt<32>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // On VI, these use the SMEM format and the offset is 20-bit in bytes. + if (!isUInt<20>(AM.BaseOffs)) + return false; + } else + llvm_unreachable("unhandled generation"); - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. 
+ if (AM.Scale == 1 && AM.HasBaseReg) return true; - default: // Don't allow n * r - return false; - } + + return false; } + + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + return isLegalMUBUFAddressingMode(AM); + case AMDGPUAS::LOCAL_ADDRESS: case AMDGPUAS::REGION_ADDRESS: { // Basic, single offset DS instructions allow a 16-bit unsigned immediate @@ -374,7 +452,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. - return Align % 4 == 0; + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + return AlignedBy4; } // Smaller than dword value must be aligned. @@ -411,6 +492,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast<MemSDNode>(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers + if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || + isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -426,12 +533,6 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return TII->isInlineConstant(Imm); } -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { @@ -439,7 +540,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -455,30 +556,10 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, unsigned Align = DL.getABITypeAlignment(Ty); - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. 
- assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - SDValue Ops[] = { - DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load), - Load.getValue(1) - }; - - return DAG.getMergeValues(Ops, SL); - } - ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + if (MemVT.isFloatingPoint()) + ExtTy = ISD::EXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, false, // isVolatile @@ -497,8 +578,16 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); - assert(CallConv == CallingConv::C); + if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DAG.getContext()->diagnose(NoGraphicsHSA); + return SDValue(); + } + + // FIXME: We currently assume all calling conventions are kernels. SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -508,18 +597,20 @@ SDValue SITargetLowering::LowerFormalArguments( // First check if it's a PS input addr if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { - - assert((PSInputNum <= 15) && "Too many PS inputs!"); + !Arg.Flags.isByVal() && PSInputNum <= 15) { - if (!Arg.Used) { - // We can savely skip PS inputs + if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { + // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; } - Info->PSInputAddr |= 1 << PSInputNum++; + Info->markPSInputAllocated(PSInputNum); + if (Arg.Used) + Info->PSInputEna |= 1 << PSInputNum; + + ++PSInputNum; } // Second split vertices into their elements @@ -530,7 +621,7 @@ SDValue SITargetLowering::LowerFormalArguments( // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, - // NOT four or eigth. + // NOT four or eight. Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -549,41 +640,25 @@ SDValue SITargetLowering::LowerFormalArguments( *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. + // + // Check PSInputAddr instead of PSInputEna. The idea is that if the user set + // PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. 
if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); - } - - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); + Info->markPSInputAllocated(0); + Info->PSInputEna |= 1; } if (Info->getShaderType() == ShaderType::COMPUTE) { @@ -591,6 +666,25 @@ SDValue SITargetLowering::LowerFormalArguments( Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? + if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -617,7 +711,7 @@ SDValue SITargetLowering::LowerFormalArguments( Offset, Ins[i].Flags.isSExt()); Chains.push_back(Arg.getValue(1)); - const PointerType *ParamTy = + auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { @@ -678,10 +772,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. 
+ if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. + + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. 
+ Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -690,30 +887,105 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } +SDValue SITargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (Info->getShaderType() == ShaderType::COMPUTE) + return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, + OutVals, DL, DAG); + + Info->setIfReturnsVoid(Outs.size() == 0); + + SmallVector<ISD::OutputArg, 48> Splits; + SmallVector<SDValue, 48> SplitVals; + + // Split vectors into their elements. + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + const ISD::OutputArg &Out = Outs[i]; + + if (Out.VT.isVector()) { + MVT VT = Out.VT.getVectorElementType(); + ISD::OutputArg NewOut = Out; + NewOut.Flags.setSplit(); + NewOut.VT = VT; + + // We want the original number of vector elements here, e.g. + // three or five, not four or eight. + unsigned NumElements = Out.ArgVT.getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], + DAG.getConstant(j, DL, MVT::i32)); + SplitVals.push_back(Elem); + Splits.push_back(NewOut); + NewOut.PartOffset += NewOut.VT.getStoreSize(); + } + } else { + SplitVals.push_back(OutVals[i]); + Splits.push_back(Out); + } + } + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 48> RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + AnalyzeReturn(CCInfo, Splits); + + SDValue Flag; + SmallVector<SDValue, 48> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = SplitVals[realRVLocIdx]; + + // Copied from other backends. 
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. + RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); +} + MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -944,20 +1216,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -977,6 +1237,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -988,7 +1260,20 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc DL(Op); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + // TODO: Should this propagate fast-math-flags? 
+ switch (IntrinsicID) { + case Intrinsic::amdgcn_dispatch_ptr: + if (!Subtarget->isAmdHsaOS()) { + DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), + "hsa intrinsic without hsa target"); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); + case Intrinsic::r600_read_ngroups_x: return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); @@ -1008,37 +1293,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. + return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -1077,6 +1361,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(2, DL, MVT::i32), // P0 Op.getOperand(1), Op.getOperand(2), Glue); } + case AMDGPUIntrinsic::SI_packf16: + if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) + return DAG.getUNDEF(MVT::i32); + return Op; case AMDGPUIntrinsic::SI_fs_interp: { SDValue IJ = Op.getOperand(4); SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, 
DL, MVT::i32, IJ, @@ -1092,6 +1380,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1152,16 +1453,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { "Custom lowering for non-i32 vectors hasn't been implemented."); unsigned NumElements = Op.getValueType().getVectorNumElements(); assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: + if (NumElements >= 8) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. if (NumElements <= 4) break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); } } @@ -1236,8 +1550,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { if (Unsafe) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); } return SDValue(); @@ -1274,6 +1590,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); @@ -1379,7 +1697,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, @@ -1393,6 +1711,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); + // TODO: Should this propagate fast-math-flags? 
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, DAG.getConstantFP(0.5/M_PI, DL, @@ -1821,7 +2140,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); - + } case ISD::FADD: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; @@ -1903,7 +2222,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: @@ -2125,9 +2443,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - if (TII->isMIMG(MI->getOpcode())) { + if (TII->isVOP3(MI->getOpcode())) { + // Make sure constant bus requirements are respected. + TII->legalizeOperandsVOP3(MRI, MI); + return; + } + + if (TII->isMIMG(*MI)) { unsigned VReg = MI->getOperand(0).getReg(); unsigned Writemask = MI->getOperand(1).getImm(); unsigned BitsSet = 0; @@ -2169,53 +2492,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. - const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + // Build the half of the subregister with the constants before building the + // full 128-bit register. If we are building multiple resource descriptors, + // this will allow CSEing of the 2-component register. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); -#else - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) - }; + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + // Combine the constants and the pointer. 
+ const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; -#endif + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } /// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. +/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to +/// the resource pointer. MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr, @@ -2248,15 +2556,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2274,13 +2573,41 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2301,3 +2628,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index d84c32e..f01b2c0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue 
LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; @@ -57,6 +60,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); @@ -76,6 +80,9 @@ public: bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isMemOpUniform(const SDNode *N) const; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -88,6 +95,13 @@ public: SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, MachineBasicBlock * BB) const override; bool enableAggressiveFMAFusion(EVT VT) const override; @@ -112,13 +126,10 @@ public: SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 90a37f1..94e6147 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -84,6 +84,9 @@ private: bool LastInstWritesM0; + /// \brief Whether the machine function returns void + bool ReturnsVoid; + /// \brief Get increment/decrement amount for this instruction. Counters getHwCounts(MachineInstr &MI); @@ -91,7 +94,8 @@ private: bool isOpRelevant(MachineOperand &Op); /// \brief Get register interval an operand affects. 
- RegInterval getRegInterval(MachineOperand &Op); + RegInterval getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const; /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, @@ -121,9 +125,13 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "SI insert wait instructions"; + return "SI insert wait instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -138,9 +146,8 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; + uint64_t TSFlags = MI.getDesc().TSFlags; + Counters Result = { { 0, 0, 0 } }; Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); @@ -151,15 +158,22 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // LGKM may uses larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - + if (TII->isSMRD(MI)) { + + if (MI.getNumOperands() != 0) { + assert(MI.getOperand(0).isReg() && + "First LGKM operand must be a register!"); + + // XXX - What if this is a write into a super register? + const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); + unsigned Size = RC->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + } else { + // s_dcache_inv etc. do not have a a destination register. Assume we + // want a wait on these. + // XXX - What is the right value? + Result.Named.LGKM = 1; + } } else { // DS Result.Named.LGKM = 1; @@ -173,9 +187,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { } bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg()) + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) return false; // Defines are always relevant @@ -196,7 +209,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { // operand comes before the value operand and it may have // multiple data operands. 
- if (TII->isDS(MI.getOpcode())) { + if (TII->isDS(MI)) { MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); if (Data && Op.isIdenticalTo(*Data)) return true; @@ -224,18 +237,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; } -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - +RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const { + unsigned Size = RC->getSize(); assert(Size >= 4); RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); + Result.first = TRI->getEncodingValue(Reg.getReg()); Result.second = Result.first + Size / 4; return Result; @@ -246,10 +254,13 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // Get the hardware counter increments and sum them up Counters Increment = getHwCounts(*I); + Counters Limit = ZeroCounts; unsigned Sum = 0; for (unsigned i = 0; i < 3; ++i) { LastIssued.Array[i] += Increment.Array[i]; + if (Increment.Array[i]) + Limit.Array[i] = LastIssued.Array[i]; Sum += Increment.Array[i]; } @@ -261,7 +272,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // // The temporary workaround is to break the clauses with S_NOP. @@ -270,7 +281,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // and destination registers don't overlap, e.g. this is illegal: // r0 = load r2 // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || (LastOpcodeType == VMEM && Increment.Named.VM)) { // Insert a NOP to break the clause. BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) @@ -278,7 +289,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, LastInstWritesM0 = false; } - if (TII->isSMRD(I->getOpcode())) + if (TII->isSMRD(*I)) LastOpcodeType = SMEM; else if (Increment.Named.VM) LastOpcodeType = VMEM; @@ -290,21 +301,21 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; - RegInterval Interval = getRegInterval(Op); + const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { // Remember which registers we define if (Op.isDef()) - DefinedRegs[j] = LastIssued; + DefinedRegs[j] = Limit; // and which one we are using if (Op.isUse()) - UsedRegs[j] = LastIssued; + UsedRegs[j] = Limit; } } } @@ -314,7 +325,9 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, const Counters &Required) { // End of program? No need to wait on anything - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + // A function not returning void needs to wait, because other bytecode will + // be appended after it and we don't know what it will be. 
+ if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid) return false; // Figure out if the async instructions execute in order @@ -390,12 +403,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { if (MI.getOpcode() == AMDGPU::S_SENDMSG) return LastIssued; - // For each register affected by this - // instruction increase the result sequence + // For each register affected by this instruction increase the result + // sequence. + // + // TODO: We could probably just look at explicit operands if we removed VCC / + // EXEC from SMRD dest reg classes. for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { if (Op.isDef()) { @@ -451,6 +470,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; + ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); @@ -474,6 +494,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + + // Functions returning something shouldn't contain S_ENDPGM, because other + // bytecode will be appended after it. + if (!ReturnsVoid) { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + I->eraseFromParent(); + } } return Changes; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 211666a..0e883f6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,10 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; + // This bit tells the assembler to use the 32-bit encoding in case it + // is unable to infer the encoding from the operands. + field bits<1> VOPAsmPrefer32Bit = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -68,10 +72,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{19} = FLAT; let TSFlags{20} = WQM; let TSFlags{21} = VGPRSpill; + let TSFlags{22} = VOPAsmPrefer32Bit; - // Most instructions require adjustments after selection to satisfy - // operand requirements. 
- let hasPostISelHook = 1; let SchedRW = [Write32Bit]; } @@ -86,7 +88,6 @@ class Enc64 { } class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; -def VOPDstVCC : VOPDstOperand <VCCReg>; let Uses = [EXEC] in { @@ -101,11 +102,11 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : } class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + VOPAnyCommon <(outs), ins, asm, pattern> { - let DisableEncoding = "$dst"; let VOPC = 1; let Size = 4; + let Defs = [VCC]; } class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : @@ -138,6 +139,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let isCodeGenOnly = 0; int Size = 8; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. + let hasPostISelHook = 1; } } // End Uses = [EXEC] @@ -222,6 +228,20 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +class SMRD_IMMe_ci <bits<5> op> : Enc64 { + bits<7> sdst; + bits<7> sbase; + bits<32> offset; + + let Inst{7-0} = 0xff; + let Inst{8} = 0; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let Inst{63-32} = offset; +} + let SchedRW = [WriteSALU] in { class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { @@ -249,13 +269,13 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SOPCe <op> { - let DisableEncoding = "$dst"; let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; let SOPC = 1; let isCodeGenOnly = 0; + let Defs = [SCC]; let UseNamedOperandTable = 1; } @@ -598,15 +618,13 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : // Vector I/O operations //===----------------------------------------------------------------------===// -let Uses = [EXEC] in { - class DS <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { let LGKM_CNT = 1; let DS = 1; let UseNamedOperandTable = 1; - let Uses = [M0]; + let Uses = [M0, EXEC]; // Most instruction load and store data, so set this as the default. let mayLoad = 1; @@ -623,6 +641,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MUBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -636,6 +655,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -665,9 +685,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; + let Uses = [EXEC]; let hasSideEffects = 0; // XXX ???? 
} - - -} // End Uses = [EXEC] diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index cfd2c42..1e10d25 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -82,6 +82,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, switch (MI->getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: return true; default: return false; @@ -204,7 +205,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, unsigned &Offset, const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { + + if (isDS(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (OffsetImm) { @@ -254,7 +256,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return false; } - if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; @@ -270,7 +272,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return true; } - if (isSMRD(Opc)) { + if (isSMRD(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (!OffsetImm) @@ -289,20 +291,18 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - // TODO: This needs finer tuning if (NumLoads > 4) return false; - if (isDS(Opc0) && isDS(Opc1)) + if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) return true; - if (isSMRD(Opc0) && isSMRD(Opc1)) + if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) return true; - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && + (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) return true; return false; @@ -323,28 +323,45 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, }; static const int16_t Sub0_7[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + }; + + static const int16_t Sub0_7_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, }; static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + }; + + static const int16_t Sub0_3_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, }; static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, }; static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 + AMDGPU::sub0, AMDGPU::sub1, }; unsigned Opcode; - 
const int16_t *SubIndices; + ArrayRef<int16_t> SubIndices; + bool Forward; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); @@ -360,7 +377,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } @@ -375,18 +392,18 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_3_64; } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_7_64; } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_15_64; } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || @@ -428,13 +445,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Can't copy register!"); } - while (unsigned SubIdx = *SubIndices++) { + if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) + Forward = true; + else + Forward = false; + + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + unsigned SubIdx; + if (Forward) + SubIdx = SubIndices[Idx]; + else + SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); - if (*SubIndices) + if (Idx == SubIndices.size() - 1) + Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); + + if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); } } @@ -471,6 +502,40 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { return AMDGPU::COPY; } +static unsigned getSGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_SAVE; + case 8: + return AMDGPU::SI_SPILL_S64_SAVE; + case 16: + return AMDGPU::SI_SPILL_S128_SAVE; + case 32: + return AMDGPU::SI_SPILL_S256_SAVE; + case 64: + return AMDGPU::SI_SPILL_S512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_SAVE; + case 8: + return AMDGPU::SI_SPILL_V64_SAVE; + case 16: + return AMDGPU::SI_SPILL_V128_SAVE; + case 32: + return AMDGPU::SI_SPILL_V256_SAVE; + case 64: + return AMDGPU::SI_SPILL_V512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, @@ -481,47 +546,83 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; + + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = 
FrameInfo->getObjectAlignment(FrameIndex); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } + unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); + + return; } - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); + .addReg(SrcReg); + + return; + } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + MFI->setHasSpilledVGPRs(); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); +} + +static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_S64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_S128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_S256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_S512_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_V64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_V128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_V256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_V512_RESTORE; + default: + llvm_unreachable("unknown register size"); } } @@ -534,42 +635,43 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = 
AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } + unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); + if (RI.isSGPRClass(RC)) { + // FIXME: Maybe this should not include a memoperand because it will be + // lowered to non-memory instructions. + unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); - } else { + return; + } + + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + + return; } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled @@ -601,17 +703,21 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } RS->enterBasicBlock(&Entry); + // FIXME: Can we scavenge an SReg_64 and access the subregs? 
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) @@ -667,8 +773,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, return TmpReg; } -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { +void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, + int Count) const { while (Count > 0) { int Arg; if (Count >= 8) @@ -687,26 +793,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } case AMDGPU::SGPR_USE: // This is just a placeholder for register allocation. MI->eraseFromParent(); @@ -760,49 +846,90 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::SI_CONSTDATA_PTR: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + MachineFunction &MF = *MBB.getParent(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + + // Add 32-bit offset from this instruction to the start of the + // constant data. + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1))); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0)); + + llvm::finalizeBundle(MBB, Bundler.begin()); + + MI->eraseFromParent(); + break; + } } return true; } -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - +/// Commutes the operands in the given instruction. /// The commutable operands are specified by their indices OpIdx0 and OpIdx1. /// /// Do not call this method for a non-commutable instruction or for a /// non-commutable pair of operand indices OpIdx0 and OpIdx1. /// Even though the instruction is commutable, the method may still /// fail to commute the operands; a null pointer is returned in such cases.
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const { int CommutedOpcode = commuteOpcode(*MI); if (CommutedOpcode == -1) return nullptr; int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - MachineOperand &Src0 = MI->getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1); - if (Src1Idx == -1) + + if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || + OpIdx1 != static_cast<unsigned>(Src1Idx)) && + (OpIdx0 != static_cast<unsigned>(Src1Idx) || + OpIdx1 != static_cast<unsigned>(Src0Idx))) return nullptr; MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + (!isVOP2(*MI) && !isVOP3(*MI))) { return nullptr; } - // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { @@ -832,7 +959,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } if (MI) @@ -845,8 +972,8 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { + unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; @@ -857,7 +984,8 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, return false; // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. + // immediate. Also, immediate src0 operand is not handled in + // SIInstrInfo::commuteInstruction(); if (!MI->getOperand(Src0Idx).isReg()) return false; @@ -865,18 +993,22 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, if (Src1Idx == -1) return false; - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + MachineOperand &Src1 = MI->getOperand(Src1Idx); + if (Src1.isImm()) { + // SIInstrInfo::commuteInstruction() does support commuting the immediate + // operand src1 in 2 and 3 operand instructions. + if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + return false; + } else if (Src1.isReg()) { + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + } else return false; - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; + return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, @@ -898,11 +1030,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const { } } -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, @@ -984,9 +1111,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2)); - // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); removeModOperands(*UseMI); @@ -1045,18 +1169,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, return false; } -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; @@ -1088,9 +1200,6 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && "MIa must load from or modify a memory location"); assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && @@ -1105,32 +1214,32 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the - // underlying addres space, even if it was lowered to a different one, + // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
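For reference, the overlap helper whose body is truncated by the hunk boundary above presumably completes along these lines (a sketch reconstructed from the visible first statement; two accesses are disjoint exactly when the lower-starting interval ends at or before the higher one begins):

static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  // Order the two accesses by start offset.
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  // Disjoint iff the lower access ends at or before the higher one starts.
  return LowOffset + LowWidth <= HighOffset;
}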
- if (isDS(Opc0)) { - if (isDS(Opc1)) + if (isDS(*MIa)) { + if (isDS(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1); + return !isFLAT(*MIb); } - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) + if (isMUBUF(*MIa) || isMTBUF(*MIa)) { + if (isMUBUF(*MIb) || isMTBUF(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isSMRD(Opc1); + return !isFLAT(*MIb) && !isSMRD(*MIb); } - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) + if (isSMRD(*MIa)) { + if (isSMRD(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); } - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) + if (isFLAT(*MIa)) { + if (isFLAT(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; @@ -1319,6 +1428,26 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return false; } +static unsigned findImplicitSGPRRead(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + switch (MO.getReg()) { + case AMDGPU::VCC: + case AMDGPU::M0: + case AMDGPU::FLAT_SCR: + return MO.getReg(); + + default: + break; + } + } + + return AMDGPU::NoRegister; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -1335,7 +1464,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, return false; } - // Make sure the register classes are correct + // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " @@ -1392,14 +1521,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; + unsigned SGPRUsed = findImplicitSGPRRead(*MI); + if (SGPRUsed != AMDGPU::NoRegister) + ++ConstantBusCount; + for (int OpIdx : OpIndices) { if (OpIdx == -1) break; @@ -1435,6 +1567,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } + // Make sure we aren't losing exec uses in the td files. This mostly requires + // being careful when using let Uses to try to add other use registers. 
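The pitfall that new check guards against is mechanical: a 'let Uses = [...]' in the .td files replaces the inherited default rather than extending it, so writing 'let Uses = [M0]' where 'let Uses = [M0, EXEC]' was meant silently drops the EXEC read (the SIInstrFormats.td hunks earlier in this commit fix exactly that for DS, MUBUF, MTBUF and MIMG). A condensed restatement of the rule enforced by the code below, with the MachineInstr plumbing stripped out (names illustrative):

// Generic, SALU and SMRD instructions legitimately run without reading the
// exec mask; any other instruction must carry an implicit EXEC use, or the
// use was lost in its .td description.
bool implicitExecUseIsPresent(bool IsGeneric, bool IsSALU, bool IsSMRD,
                              bool HasImplicitExecUse) {
  if (IsGeneric || IsSALU || IsSMRD)
    return true;
  return HasImplicitExecUse;
}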
+ if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { + const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); + if (!Exec || !Exec->isImplicit()) { + ErrInfo = "VALU instruction does not implicitly read exec mask"; + return false; + } + } + return true; } @@ -1483,11 +1625,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORD_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -1562,17 +1710,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubIdx, const TargetRegisterClass *SubRC) const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); unsigned SubReg = MRI.createVirtualRegister(SubRC); + if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(SuperReg.getReg(), 0, SubIdx); + return SubReg; + } + // Just in case the super register is itself a sub-register, copy it to a new // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
- MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); @@ -1605,36 +1757,6 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( return MachineOperand::CreateReg(SubReg, false); } -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - // Change the order of operands from (0, 1, 2) to (0, 2, 1) void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { assert(Inst->getNumExplicitOperands() == 3); @@ -1643,6 +1765,45 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { Inst->addOperand(Op1); } +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? + MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + RC = TRI->getSubRegClass(RC, MO.getSubReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -1653,7 +1814,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (!MO) MO = &MI->getOperand(OpIdx); - if (isVALU(InstDesc.Opcode) && + if (isVALU(*MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { unsigned SGPRUsed = MO->isReg() ? 
MO->getReg() : (unsigned)AMDGPU::NoRegister; @@ -1670,21 +1831,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? - MRI.getRegClass(MO->getReg()) : - RI.getPhysRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + return isLegalRegOperand(MRI, OpInfo, *MO); } @@ -1699,81 +1846,143 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); + } - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. 
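Schematically, the situation the comment above describes (assuming the usual VOP2 encoding rule that src0 may read the constant bus while src1 must be a VGPR): v_add_i32 v0, vcc, v1, s2 is illegal because the SGPR sits in src1, while the commuted v_add_i32 v0, vcc, s2, v1 is legal. A sketch of the decision the code below makes (helper names are illustrative, not the LLVM API):

// Commute only when it actually buys legality; otherwise fall back to
// copying the offending operand into a VGPR (legalizeOpWithMove).
bool commuteWouldLegalize(bool Src1AlreadyLegal, bool HasImplicitSGPR,
                          bool IsCommutable, bool Src0ValidAsSrc1,
                          bool CommutedOpcodeExists) {
  if (Src1AlreadyLegal)
    return false; // nothing to fix
  if (HasImplicitSGPR || !IsCommutable)
    return false; // a VGPR copy is the only option
  // The old src0 must be acceptable in the src1 slot, and an encoding for
  // the swapped form must exist.
  return Src0ValidAsSrc1 && CommutedOpcodeExists;
}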
+ if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. + if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { legalizeOpWithMove(MI, Src1Idx); return; } - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + MI->setDesc(get(CommutedOpc)); - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); +// Legalize VOP3 operands. Because all operand types are supported for any +// operand, and since literal constants are not allowed and should never be +// seen, we only need to worry about inserting copies if we use multiple SGPR +// operands. +void SIInstrInfo::legalizeOperandsVOP3( + MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + int VOP3Idx[3] = { + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) + }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + // We should never see a VOP3 instruction with an illegal immediate operand. + if (!MO.isReg()) + continue; + + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; } + + // If we make it this far, then the operand is not legal and we must + // legalize it. 
+ legalizeOpWithMove(MI, Idx); + } +} + +void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + // Legalize VOP2 + if (isVOP2(*MI)) { + legalizeOperandsVOP2(MRI, MI); + return; + } + + // Legalize VOP3 + if (isVOP3(*MI)) { + legalizeOperandsVOP3(MRI, MI); + return; } // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || - MI->getOpcode() == AMDGPU::PHI) { + if (MI->getOpcode() == AMDGPU::PHI) { const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { if (!MI->getOperand(i).isReg() || @@ -1802,26 +2011,53 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } // Update all the operands so they have the same type. - for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); - MachineBasicBlock *InsertBB; - MachineBasicBlock::iterator Insert; - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - InsertBB = MI->getParent(); - Insert = MI; - } else { - // MI is a PHI instruction. - InsertBB = MI->getOperand(i + 1).getMBB(); - Insert = InsertBB->getFirstTerminator(); + + // MI is a PHI instruction. + MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB(); + MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); + + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + Op.setReg(DstReg); + } + } + + // REG_SEQUENCE doesn't really require operand legalization, but if one has a + // VGPR dest type and SGPR sources, insert copies so all operands are + // VGPRs. This seems to help operand folding / the register coalescer. + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + MachineBasicBlock *MBB = MI->getParent(); + const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0); + if (RI.hasVGPRs(DstRC)) { + // Update all the operands so they are VGPR register classes. These may + // not be the same register class because REG_SEQUENCE supports mixing + // subregister index types e.g. sub0_sub1 + sub2 + sub3 + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + continue; + + const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); + if (VRC == OpRC) + continue; + + unsigned DstReg = MRI.createVirtualRegister(VRC); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setIsKill(); } - BuildMI(*InsertBB, Insert, MI->getDebugLoc(), - get(AMDGPU::COPY), DstReg) - .addOperand(MI->getOperand(i)); - MI->getOperand(i).setReg(DstReg); } + + return; } // Legalize INSERT_SUBREG @@ -1858,15 +2094,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } MachineBasicBlock &MBB = *MI->getParent(); - // Extract the ptr from the resource descriptor. 
- - // SRsrcPtrLo = srsrc:sub0 - unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); - // SRsrcPtrHi = srsrc:sub1 - unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); + // Extract the ptr from the resource descriptor. + unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); // Create an empty resource descriptor unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -1891,80 +2122,112 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned NewVAddrLo; - unsigned NewVAddrHi; if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. - NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) - .addReg(SRsrcPtrLo) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - - // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) - .addReg(SRsrcPtrHi) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine) - .addReg(AMDGPU::VCC, RegState::Implicit); - + unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 + DebugLoc DL = MI->getDebugLoc(); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0); + + // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1); + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); } else { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. + assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() + < AMDGPUSubtarget::VOLCANIC_ISLANDS && + "FIXME: Need to emit flat atomics here"); + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - - // Create the new instruction. 
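The VAddr update built above forms a 64-bit pointer add out of two 32-bit halves: V_ADD_I32 produces the low word and leaves the carry in VCC (now an implicit def, hence the dropped explicit VCC operands), and V_ADDC_U32 consumes that carry for the high word. A standalone arithmetic check of the decomposition:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Ptr = 0x00000001fffffff8ull, Off = 0x10;
  uint32_t Lo = (uint32_t)Ptr + (uint32_t)Off;   // V_ADD_I32 (low word)
  uint32_t Carry = Lo < (uint32_t)Ptr;           // carry out, held in VCC
  uint32_t Hi = (uint32_t)(Ptr >> 32) + (uint32_t)(Off >> 32) + Carry; // V_ADDC_U32
  assert((((uint64_t)Hi << 32) | Lo) == Ptr + Off);
  return 0;
}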
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); - MachineInstr *Addr64 = - BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // tfe + + // Atomics with return have an additional tied operand and are + // missing some of the special bits. + MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in); + MachineInstr *Addr64; + + if (!VDataIn) { + // Regular buffer load / store. + MachineInstrBuilder MIB + = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset); + + // Atomics do not have this operand. + if (const MachineOperand *GLC + = getNamedOperand(*MI, AMDGPU::OpName::glc)) { + MIB.addImm(GLC->getImm()); + } + + MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); + + if (const MachineOperand *TFE + = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { + MIB.addImm(TFE->getImm()); + } + + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + Addr64 = MIB; + } else { + // Atomics with return. + Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*VDataIn) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + } MI->removeFromParent(); MI = Addr64; - NewVAddrLo = SRsrcPtrLo; - NewVAddrHi = SRsrcPtrHi; + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addImm(AMDGPU::sub1); + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); } - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); - - // Update the instruction to use NewVaddr VAddr->setReg(NewVAddr); // Update the instruction to use NewSRsrc @@ -2028,53 +2291,64 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, .addOperand(*SOff); unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addOperand(*SOff) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addReg(SOff->getReg(), 0, SOff->getSubReg()) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) .addReg(SBase->getReg(), getKillRegState(IsKill), SBase->getSubReg()) .addReg(OffsetSGPR); } unsigned SubLo, SubHi; + const TargetRegisterClass *NewDstRC; switch (HalfSize) { case 4: SubLo = AMDGPU::sub0; SubHi = AMDGPU::sub1; + NewDstRC = &AMDGPU::VReg_64RegClass; break; case 8: SubLo = AMDGPU::sub0_sub1; SubHi = AMDGPU::sub2_sub3; + NewDstRC = &AMDGPU::VReg_128RegClass; break; case 16: SubLo = AMDGPU::sub0_sub1_sub2_sub3; SubHi = AMDGPU::sub4_sub5_sub6_sub7; + NewDstRC =
&AMDGPU::VReg_256RegClass; break; case 32: SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + NewDstRC = &AMDGPU::VReg_512RegClass; break; default: llvm_unreachable("Unhandled HalfSize"); } - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); + unsigned OldDst = MI->getOperand(0).getReg(); + unsigned NewDst = MRI.createVirtualRegister(NewDstRC); + + MRI.replaceRegWith(OldDst, NewDst); + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); } -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { + int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; + switch(RI.getRegClass(DstRCID)->getSize()) { + case 4: + case 8: + case 16: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -2118,53 +2392,55 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) .addImm(RsrcDataFormat >> 32); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + + const MCInstrDesc &NewInstDesc = get(NewOpcode); + const TargetRegisterClass *NewDstRC + = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + unsigned DstReg = MI->getOperand(0).getReg(); MRI.replaceRegWith(DstReg, NewDstReg); + + MachineInstr *NewInst = + BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) + .addOperand(MI->getOperand(1)) // sbase + .addReg(SRsrc) + .addImm(0) + .addImm(ImmOffset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI->eraseFromParent(); + + legalizeOperands(NewInst); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); break; } - case 
AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { + case 32: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { + case 64: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } } @@ -2185,51 +2461,28 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Handle some special cases switch (Opcode) { default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); + if (isSMRD(*Inst)) { + moveSMRDToVALU(Inst, MRI, Worklist); + continue; } break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy. - if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); Inst->eraseFromParent(); continue; @@ -2281,6 +2534,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } break; + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); + Inst->eraseFromParent(); + continue; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2319,7 +2577,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->addOperand(MachineOperand::CreateImm(0)); } - addDescImplicitUseDef(NewDesc, Inst); + Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst->getOperand(2); @@ -2337,27 +2595,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } // Update the destination register class. 
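The rewrites above for S_AND_B64, S_OR_B64, S_XOR_B64 and S_NOT_B64 now split directly into per-half VALU ops, which is exact because bitwise operations treat each bit position independently; no 64-bit VALU equivalents exist. A quick standalone check of the split:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 0x123456789abcdef0ull, B = 0x0fedcba987654321ull;
  uint32_t Lo = (uint32_t)A & (uint32_t)B;                 // V_AND_B32 on sub0
  uint32_t Hi = (uint32_t)(A >> 32) & (uint32_t)(B >> 32); // V_AND_B32 on sub1
  assert((((uint64_t)Hi << 32) | Lo) == (A & B));
  return 0;
}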
- - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); + if (!NewDstRC) + continue; unsigned DstReg = Inst->getOperand(0).getReg(); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); @@ -2366,13 +2606,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Legalize the operands legalizeOperands(Inst); - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } @@ -2390,6 +2624,30 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VGPR_32RegClass; } +void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + .addImm(0) + .addReg(Src.getReg()); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(Src.getReg()) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitUnaryOp( SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, @@ -2414,20 +2672,21 @@ void SIInstrInfo::splitScalar64BitUnaryOp( AMDGPU::sub0, Src0SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) 
.addImm(AMDGPU::sub0) @@ -2436,10 +2695,11 @@ void SIInstrInfo::splitScalar64BitUnaryOp( MRI.replaceRegWith(Dest.getReg(), FullDestReg); - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); + // We don't need to legalizeOperands here because for a single operand, src0 + // will support any kind of input. + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBinaryOp( @@ -2474,9 +2734,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp( AMDGPU::sub0, Src1SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0) .addOperand(SrcReg1Sub0); @@ -2486,12 +2747,12 @@ void SIInstrInfo::splitScalar64BitBinaryOp( MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1) .addOperand(SrcReg1Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -2502,8 +2763,11 @@ void SIInstrInfo::splitScalar64BitBinaryOp( // Try to legalize the operands in case we need to swap the order to keep it // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); + legalizeOperands(LoHalf); + legalizeOperands(HiHalf); + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2532,18 +2796,19 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); - MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + BuildMI(MBB, MII, DL, InstDesc, MidReg) .addOperand(SrcRegSub0) .addImm(0); - MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + BuildMI(MBB, MII, DL, InstDesc, ResultReg) .addOperand(SrcRegSub1) .addReg(MidReg); MRI.replaceRegWith(Dest.getReg(), ResultReg); - Worklist.push_back(First); - Worklist.push_back(Second); + // We don't need to legalize operands here. src0 for either instruction can be + // an SGPR, and the second input is unused or determined here.
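The splitScalar64BitBinaryOp hunk above works because 64-bit bitwise operations carry nothing between the two halves, so each 32-bit half can be computed independently and the results reassembled with a REG_SEQUENCE. A minimal stand-alone check of that identity for AND; and64_split is a hypothetical helper, not code from the patch:

    #include <cassert>
    #include <cstdint>

    static uint64_t and64_split(uint64_t A, uint64_t B) {
      uint32_t Lo = uint32_t(A) & uint32_t(B);               // V_AND_B32 on sub0
      uint32_t Hi = uint32_t(A >> 32) & uint32_t(B >> 32);   // V_AND_B32 on sub1
      return (uint64_t(Hi) << 32) | Lo;                      // REG_SEQUENCE
    }

    int main() {
      uint64_t A = 0x123456789abcdef0, B = 0x0ff0f00fdeadbeef;
      assert(and64_split(A, B) == (A & B));
    }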
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2587,6 +2852,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return; } @@ -2605,33 +2871,53 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, - MachineInstr *Inst) const { - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); +void SIInstrInfo::addUsersToMoveToVALUWorklist( + unsigned DstReg, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); } } +} - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } +const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( + const MachineInstr &Inst) const { + const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); + + switch (Inst.getOpcode()) { + // For target instructions, getOpRegClass just returns the virtual register + // class associated with the operand, so we need to find an equivalent VGPR + // register class in order to move the instruction to the VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + return NewDstRC; + default: + return NewDstRC; } } +// Find the one SGPR operand we are allowed to use. unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); + const MCInstrDesc &Desc = MI->getDesc(); // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - + // // First we need to consider the instruction's operand requirements before // legalizing. Some operands are required to be SGPRs, such as implicit uses // of VCC, but we are still bound by the constant bus requirement to only use @@ -2639,17 +2925,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. 
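addUsersToMoveToVALUWorklist, introduced above, is the fixpoint step of moveToVALU: once a definition has been rewritten to produce a VGPR, every user that cannot accept a VGPR input must itself be queued for rewriting. A toy worklist over a hypothetical def-use graph, just to show the propagation shape (the Users/CanReadVGPR representation is invented for this sketch and is not the MachineRegisterInfo API):

    #include <cstdio>
    #include <queue>
    #include <set>
    #include <vector>

    static void moveToVALU(int Root,
                           const std::vector<std::vector<int>> &Users,
                           const std::vector<bool> &CanReadVGPR) {
      std::queue<int> Worklist;
      std::set<int> Moved;
      Worklist.push(Root);
      while (!Worklist.empty()) {
        int I = Worklist.front();
        Worklist.pop();
        if (!Moved.insert(I).second)
          continue; // already rewritten
        std::printf("rewriting instruction %d to a VALU op\n", I);
        // Mirrors addUsersToMoveToVALUWorklist: once I's result lives in
        // a VGPR, every user that cannot read a VGPR must move as well.
        for (int U : Users[I])
          if (!CanReadVGPR[U])
            Worklist.push(U);
      }
    }

    int main() {
      std::vector<std::vector<int>> Users = {{1, 2}, {3}, {}, {}};
      std::vector<bool> CanReadVGPR = {false, false, true, false};
      moveToVALU(0, Users, CanReadVGPR); // rewrites 0, 1, 3; 2 is left alone
    }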
- if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } + unsigned SGPRReg = findImplicitSGPRRead(*MI); + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -2660,15 +2938,22 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, break; const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); + if (!MO.isReg()) + continue; - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } + // Is this operand statically required to be an SGPR based on the operand + // constraints? + const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); + bool IsRequiredSGPR = RI.isSGPRClass(OpRC); + if (IsRequiredSGPR) + return MO.getReg(); - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; + // If this could be a VGPR or an SGPR, Check the dynamic register class. + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); + if (RI.isSGPRClass(RegRC)) + UsedSGPRs[i] = Reg; + } // We don't have a required SGPR operand, so we have a bit more freedom in // selecting operands to move. @@ -2680,6 +2965,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // V_FMA_F32 v0, s0, s0, s0 -> No moves // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + // TODO: If some of the operands are 64-bit SGPRs and some 32, we should + // prefer those. + if (UsedSGPRs[0] != AMDGPU::NoRegister) { if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) SGPRReg = UsedSGPRs[0]; @@ -2720,7 +3008,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead( unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) .addOperand(I->getOperand(0)) .addOperand(I->getOperand(1)) .addReg(IndirectBaseReg) @@ -2791,3 +3079,15 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { return Rsrc23; } + +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isSMRD(Opc); +} + +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5053786..cce1ae7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -39,14 +39,11 @@ private: unsigned SubIdx, const TargetRegisterClass *SubRC) const; - unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const; - void swapOperands(MachineBasicBlock::iterator Inst) const; + void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, unsigned Opcode) const; @@ -58,13 +55,24 @@ private: void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst) const; - void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; 
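findUsedSGPR above exists because the VALU reads at most one unique SGPR per instruction over the constant bus: when several source operands are SGPRs, one is chosen to stay and the others are copied into VGPRs, and a repeated SGPR only occupies one bus slot, hence the preference for duplicates in the V_FMA_F32 comments above. A toy version of that choice, using an invented register numbering (positive = SGPR, everything else treated as no SGPR), not the real findUsedSGPR logic:

    #include <cstdio>

    static const int NoReg = 0;

    static int pickSGPRToKeep(int Src0, int Src1, int Src2) {
      int S[3] = {Src0 > 0 ? Src0 : NoReg,
                  Src1 > 0 ? Src1 : NoReg,
                  Src2 > 0 ? Src2 : NoReg};
      // A repeated SGPR costs only one constant-bus slot, so keep it.
      if (S[0] != NoReg && (S[0] == S[1] || S[0] == S[2]))
        return S[0];
      if (S[1] != NoReg && S[1] == S[2])
        return S[1];
      // Otherwise keep the first SGPR found; the rest get VGPR copies.
      for (int R : S)
        if (R != NoReg)
          return R;
      return NoReg;
    }

    int main() {
      // V_FMA_F32 v0, s0, s1, s0 -> keep s0, move s1 (s0 = 1, s1 = 2 here).
      std::printf("keep SGPR %d\n", pickSGPRToKeep(1, 2, 1));
    }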
+ void addUsersToMoveToVALUWorklist( + unsigned Reg, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; + + const TargetRegisterClass * + getDestEquivalentVGPRClass(const MachineInstr &Inst) const; bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, MachineInstr *MIb) const; unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; +protected: + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const override; + public: explicit SIInstrInfo(const AMDGPUSubtarget &st); @@ -117,17 +125,14 @@ public: // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + + LLVM_READONLY int commuteOpcode(const MachineInstr &MI) const; - MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; - bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = nullptr) const; - bool areMemAccessesTriviallyDisjoint( MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA = nullptr) const override; @@ -137,8 +142,6 @@ public: unsigned DstReg, unsigned SrcReg) const override; bool isMov(unsigned Opcode) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; @@ -148,78 +151,154 @@ public: MachineBasicBlock::iterator &MI, LiveVariables *LV) const override; + static bool isSALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SALU; + } + bool isSALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } + static bool isVALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VALU; + } + bool isVALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isSOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP1; + } + bool isSOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP1; } + static bool isSOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP2; + } + bool isSOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP2; } + static bool isSOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPC; + } + bool isSOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPC; } + static bool isSOPK(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPK; + } + bool isSOPK(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPK; } + static bool isSOPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPP; + } + bool isSOPP(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPP; } + static bool isVOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP1; + } + bool isVOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } + static bool isVOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP2; + } + bool isVOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP2; } + static bool isVOP3(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3; + } + bool isVOP3(uint16_t Opcode) const { 
return get(Opcode).TSFlags & SIInstrFlags::VOP3; } + static bool isVOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOPC; + } + bool isVOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOPC; } + static bool isMUBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; + } + bool isMUBUF(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MUBUF; } + static bool isMTBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; + } + bool isMTBUF(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isSMRD(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SMRD; + } + bool isSMRD(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SMRD; } + static bool isDS(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DS; + } + bool isDS(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DS; } + static bool isMIMG(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MIMG; + } + bool isMIMG(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isFLAT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FLAT; + } + bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + static bool isWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::WQM; + } + bool isWQM(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isVGPRSpill(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; + } + bool isVGPRSpill(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } @@ -302,6 +381,26 @@ public: bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + + /// \brief Fix operands in \p MI to satisfy constant bus requirements. + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; @@ -312,7 +411,8 @@ public: unsigned HalfImmOp, unsigned HalfSGPROp, MachineInstr *&Lo, MachineInstr *&Hi) const; - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. 
This function will also move the users of \p MI to the @@ -341,29 +441,52 @@ public: void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, unsigned SavReg, unsigned IndexReg) const; - void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const; /// \brief Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. + LLVM_READONLY MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + LLVM_READONLY const MachineOperand *getNamedOperand(const MachineInstr &MI, unsigned OpName) const { return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); } + /// Get required immediate operand + int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); + return MI.getOperand(Idx).getImm(); + } + uint64_t getDefaultRsrcDataFormat() const; uint64_t getScratchRsrcWords23() const; + + bool isLowLatencyInstruction(const MachineInstr *MI) const; + bool isHighLatencyInstruction(const MachineInstr *MI) const; }; namespace AMDGPU { - + LLVM_READONLY int getVOPe64(uint16_t Opcode); + + LLVM_READONLY int getVOPe32(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); + + LLVM_READONLY int getCommuteOrig(uint16_t Opcode); + + LLVM_READONLY int getAddr64Inst(uint16_t Opcode); + + LLVM_READONLY int getAtomicRetOp(uint16_t Opcode); + + LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 8d8110b..8735277 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// def isCI : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGCN3Encoding">; +def isCIOnly : Predicate<"Subtarget->getGeneration() ==" + "AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate <"FeatureSeaIslands">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; @@ -69,6 +69,15 @@ class sopk <bits<5> si, bits<5> vi = si> { field bits<5> VI = vi; } +// Specify an SMRD opcode for SI and SMEM opcode for VI + +// FIXME: This should really be bits<5> si, Tablegen crashes if +// parameter default value is other parameter with different bit size +class smrd<bits<8> si, bits<8> vi = si> { + field bits<5> SI = si{4-0}; + field bits<8> VI = vi; +} + // Execpt for the NONE field, this must be kept in sync with the SISubtarget enum // in AMDGPUInstrInfo.cpp def SISubtarget { @@ -121,10 +130,49 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>, + SDTCisVT<0, i64>]> >; //===----------------------------------------------------------------------===// +// PatFrags for FLAT instructions +//===----------------------------------------------------------------------===// + +class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), + (ld node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)) || + 
isGlobalLoad(dyn_cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def flat_load : flat_ld <load>; +def flat_az_extloadi8 : flat_ld <az_extloadi8>; +def flat_sextloadi8 : flat_ld <sextloadi8>; +def flat_az_extloadi16 : flat_ld <az_extloadi16>; +def flat_sextloadi16 : flat_ld <sextloadi16>; + +class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)) || + isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def flat_store: flat_st <store>; +def flat_truncstorei8 : flat_st <truncstorei8>; +def flat_truncstorei16 : flat_st <truncstorei16>; + + +def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(cast<LoadSDNode>(N), -1) && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N); +}]>; + +//===----------------------------------------------------------------------===// // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 // to be glued to the memory instructions. //===----------------------------------------------------------------------===// @@ -328,9 +376,9 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (RC && SIRI->isSGPRClass(RC)) return true; - } } return false; }]>; @@ -354,6 +402,8 @@ def sopp_brtarget : Operand<OtherVT> { let ParserMatchClass = SoppBrTarget; } +def const_ga : Operand<iPTR>; + include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -393,7 +443,7 @@ def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; class GLCBaseMatchClass <string parser> : AsmOperandClass { let Name = "GLC"#parser; let PredicateMethod = "isImm"; - let ParserMethod = parser; + let ParserMethod = parser; let RenderMethod = "addImmOperands"; } @@ -436,6 +486,17 @@ def ClampMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass { + let Name = "SMRDOffset"#predicate; + let PredicateMethod = predicate; + let RenderMethod = "addImmOperands"; +} + +def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; +def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass < + "isSMRDLiteralOffset" +>; + let OperandType = "OPERAND_IMMEDIATE" in { def offen : Operand<i1> { @@ -510,6 +571,16 @@ def ClampMod : Operand <i1> { let ParserMatchClass = ClampMatchClass; } +def smrd_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDOffsetMatchClass; +} + +def smrd_literal_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDLiteralOffsetMatchClass; +} + } // End OperandType = "OPERAND_IMMEDIATE" def VOPDstS64 : VOPDstOperand <SReg_64>; @@ -528,6 +599,13 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr 
: ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; + def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; @@ -717,19 +795,6 @@ class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; - - def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; - - def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; -} - multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { @@ -758,8 +823,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; + op, (outs), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []> { + let Defs = [SCC]; +} class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; @@ -812,15 +879,20 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { } multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), pattern>; + def "" : SOPK_Pseudo <opName, (outs), + (ins SReg_32:$src0, u16imm:$src1), pattern> { + let Defs = [SCC]; + } + - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _si : SOPK_Real_si <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; + } - def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _vi : SOPK_Real_vi <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; } } @@ -868,35 +940,68 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, } class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD <outs, ins, asm, []>, + string asm, list<dag> pattern = []> : + SMRD <outs, ins, asm, pattern>, SMEMe_vi <op, imm>, SIMCInstr<opName, SISubtarget.VI> { let AssemblerPredicates = [isVI]; } -multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, +multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins, string asm, list<dag> pattern> { def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>; // glc is only applicable to scalar stores, which are not yet // implemented. 
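The smrd TableGen class used by SMRD_m above carries two opcode fields because the same logical instruction encodes as a 5-bit SMRD opcode on SI/CI and an 8-bit SMEM opcode on VI. A plain C++ analogue of that dual-encoding table; the opcode values below are illustrative only, not the real encodings:

    #include <cstdint>
    #include <cstdio>

    enum Subtarget { SI, VI };

    struct SMRDEncoding {
      const char *Name;
      uint8_t SIOp; // only 5 bits used, like the si field above
      uint8_t VIOp; // full 8 bits, like the vi field above
    };

    static const SMRDEncoding Encodings[] = {
        {"S_LOAD_DWORD", 0x00, 0x00},
        {"S_LOAD_DWORDX4", 0x02, 0x08}, // hypothetical values
    };

    static uint8_t encodeOpcode(const SMRDEncoding &E, Subtarget ST) {
      return ST == SI ? uint8_t(E.SIOp & 0x1f) : E.VIOp;
    }

    int main() {
      for (const auto &E : Encodings)
        std::printf("%s: SI=0x%02x VI=0x%02x\n", E.Name,
                    encodeOpcode(E, SI), encodeOpcode(E, VI));
    }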
let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>; + } +} + +multiclass SMRD_Inval <smrd op, string opName, + SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1 in { + def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>; + + let sbase = 0, offset = 0 in { + let sdst = 0 in { + def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>; + } + + let glc = 0, sdata = 0 in { + def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>; + } + } } } -multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, +class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : + SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> { + let hasSideEffects = 1; + let mayStore = 1; + let sbase = 0; + let sdata = 0; + let glc = 0; + let offset = 0; +} + +multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, RegisterClass dstClass> { defm _IMM : SMRD_m < op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), + (ins baseClass:$sbase, smrd_offset:$offset), opName#" $dst, $sbase, $offset", [] >; + def _IMM_ci : SMRD < + (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { + let AssemblerPredicates = [isCIOnly]; + } + defm _SGPR : SMRD_m < op, opName#"_SGPR", 0, (outs dstClass:$dst), (ins baseClass:$sbase, SReg_32:$soff), @@ -922,11 +1027,12 @@ def InputModsNoDefault : Operand <i32> { let ParserMatchClass = InputModsMatchClass; } -class getNumSrcArgs<ValueType Src1, ValueType Src2> { +class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src0.Value, untyped.Value), 0, + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -934,28 +1040,37 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - VOPDstOperand<SReg_64>)); // else VT == i1 + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); + RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); } // Returns the register class to use for source 1 of VOP[12C] for the // given VT. class getVOPSrc1ForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); + RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); + RegisterOperand ret = + !if(!eq(VT.Size, 64), + VCSrc_64, + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32 + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? 
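The getNumSrcArgs and getVALUDstForVT helpers above are nested !if chains keyed on value types; the same selection is easier to read as ordinary control flow. A sketch in C++, with an invented VT enum standing in for TableGen's ValueType:

    #include <cstdio>

    enum VT { Untyped = 0, I1 = 1, F16 = 16, I32 = 32, F64 = 64 };

    static int numSrcArgs(VT Src0, VT Src1, VT Src2) {
      if (Src0 == Untyped) return 0;
      if (Src1 == Untyped) return 1; // VOP1
      if (Src2 == Untyped) return 2; // VOP2
      return 3;                      // VOP3
    }

    static const char *valuDstClass(VT Dst) {
      switch (Dst) {
      case I32: case F16: return "VGPR_32"; // 16-bit results still use a VGPR_32
      case F64:           return "VReg_64";
      default:            return "SReg_64"; // i1 results live in an SGPR pair
      }
    }

    int main() {
      std::printf("V_ADD_F32: %d srcs, dst %s\n",
                  numSrcArgs(I32, I32, Untyped), valuDstClass(I32));
    }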
class hasModifiers<ValueType SrcVT> { bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, 0)); @@ -1009,17 +1124,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, // Returns the assembly string for the inputs and outputs of a VOP[12C] // instruction. This does not add the _e32 suffix, so it can be reused // by getAsm64. -class getAsm32 <int NumSrcArgs> { +class getAsm32 <bit HasDst, int NumSrcArgs> { + string dst = "$dst"; + string src0 = ", $src0"; string src1 = ", $src1"; string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); + string ret = !if(HasDst, dst, "") # + !if(!eq(NumSrcArgs, 1), src0, "") # + !if(!eq(NumSrcArgs, 2), src0#src1, "") # + !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 <int NumSrcArgs, bit HasModifiers> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", @@ -1027,11 +1145,10 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> { string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32<NumSrcArgs>.ret, + getAsm32<HasDst, NumSrcArgs>.ret, "$dst, "#src0#src1#src2#"$clamp"#"$omod"); } - class VOPProfile <list<ValueType> _ArgVT> { field list<ValueType> ArgVT = _ArgVT; @@ -1047,29 +1164,38 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; - field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); + field bit HasDst32 = HasDst; + field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; field bit HasModifiers = hasModifiers<Src0VT>.ret; - field dag Outs = (outs DstRC:$dst); + field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs)); + + // VOP3b instructions are a special case with a second explicit + // output. This is manually overridden for them. + field dag Outs32 = Outs; + field dag Outs64 = Outs; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; - field string Asm32 = getAsm32<NumSrcArgs>.ret; - field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; + field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret; } // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order // for the instruction patterns to work. 
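getAsm32 above builds the operand string from HasDst and NumSrcArgs, which is what lets destination-less encodings such as VOPC share the machinery. The equivalent string assembly in C++, for illustration only:

    #include <cstdio>
    #include <string>

    static std::string asm32(bool HasDst, int NumSrcArgs) {
      std::string Ret = HasDst ? "$dst" : "";
      static const char *Srcs[] = {"$src0", "$src1", "$src2"};
      for (int I = 0; I < NumSrcArgs; ++I) {
        if (!Ret.empty())
          Ret += ", ";
        Ret += Srcs[I];
      }
      return Ret;
    }

    int main() {
      std::printf("VOP1: %s\n", asm32(true, 1).c_str());  // "$dst, $src0"
      // The VOPC profile defined below additionally prepends "vcc, " to
      // show the implicit destination.
      std::printf("VOPC: %s\n", asm32(false, 2).c_str()); // "$src0, $src1"
    }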
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; + def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; @@ -1087,25 +1213,76 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + +// Write out to vcc or arbitrary SGPR. +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Asm32 = "$dst, vcc, $src0, $src1"; + let Asm64 = "$dst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); +} + +// Write out to vcc or arbitrary SGPR and read in from vcc or +// arbitrary SGPR. +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + // We use VCSrc_32 to exclude literal constants, even though the + // encoding normally allows them since the implicit VCC use means + // using one would always violate the constant bus + // restriction. SGPRs are still allowed because it should + // technically be possible to use VCC again as src0. let Src0RC32 = VCSrc_32; + let Asm32 = "$dst, vcc, $src0, $src1, vcc"; + let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); } -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; +class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod"; +} + +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; +} + +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; +} + +// VOPC instructions are a special case because for the 32-bit +// encoding, we want to display the implicit vcc write as if it were +// an explicit $dst. +class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + let Asm32 = "vcc, $src0, $src1"; + // The destination for 32-bit encoding is implicit. 
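The VOP2b profiles above give carry-producing instructions such as V_ADD_I32 a second, scalar result: a carry bit that is implicitly VCC in the 32-bit encoding and an arbitrary SGPR pair ($sdst) in the VOP3 encoding. Per lane, the underlying arithmetic is just an unsigned add with carry-out; a host-side sketch under that reading:

    #include <cassert>
    #include <cstdint>

    static uint32_t addWithCarry(uint32_t A, uint32_t B, bool &CarryOut) {
      uint32_t Sum = A + B;
      CarryOut = Sum < A; // unsigned wraparound means the add carried
      return Sum;
    }

    int main() {
      bool Carry;
      uint32_t S = addWithCarry(0xffffffffu, 1u, Carry);
      assert(S == 0 && Carry); // the carry lands in the "sdst" output
    }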
+ let HasDst32 = 0; } -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { +class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$dst, $src0_modifiers, $src1"; } +def VOPC_I1_F32_F32 : VOPC_Profile<f32>; +def VOPC_I1_F64_F64 : VOPC_Profile<f64>; +def VOPC_I1_I32_I32 : VOPC_Profile<i32>; +def VOPC_I1_I64_I64 : VOPC_Profile<i64>; + +def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>; +def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; + def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); let Asm64 = "$dst, $src0, $src1, $src2"; } @@ -1119,13 +1296,60 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers>.ret; - let Asm32 = getAsm32<2>.ret; - let Asm64 = getAsm64<2, HasModifiers>.ret; + let Asm32 = getAsm32<1, 2>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers>.ret; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +class SIInstAlias <string asm, Instruction inst, VOPProfile p> : + InstAlias <asm, (inst)>, PredicateControl { + + field bit isCompare; + field bit isCommutable; + + let ResultInst = + !if (p.HasDst32, + !if (!eq(p.NumSrcArgs, 0), + // 1 dst, 0 src + (inst p.DstRC:$dst), + !if (!eq(p.NumSrcArgs, 1), + // 1 dst, 1 src + (inst p.DstRC:$dst, p.Src0RC32:$src0), + !if (!eq(p.NumSrcArgs, 2), + // 1 dst, 2 src + (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + // else - unreachable + (inst)))), + // else + !if (!eq(p.NumSrcArgs, 2), + // 0 dst, 2 src + (inst p.Src0RC32:$src0, p.Src1RC32:$src1), + !if (!eq(p.NumSrcArgs, 1), + // 0 dst, 1 src + (inst p.Src0RC32:$src1), + // else + // 0 dst, 0 src + (inst)))); +} + +class SIInstAliasSI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class SIInstAliasVI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> { + let AssemblerPredicates = [isVI]; +} + +multiclass SIInstAliasBuilder <string asm, VOPProfile p> { + + def : SIInstAliasSI <asm, NAME, p>; + + def : SIInstAliasVI <asm, NAME, p>; +} class VOP <string opName> { string OpName = opName; @@ -1165,20 +1389,22 @@ class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; + + def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>; - def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; 
} -multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; } class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : @@ -1202,22 +1428,24 @@ class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; } -multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; - def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>; } @@ -1250,6 +1478,9 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : MnemonicAlias<opName#"_e64", opName> { let isPseudo = 1; let isCodeGenOnly = 1; + + field bit vdst; + field bit src0; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : @@ -1295,22 +1526,6 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, HasMods>; } -// VOP3_m without source modifiers -multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; - } -} - multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods = 1> { @@ -1335,7 +1550,7 @@ multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1349,7 +1564,7 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1360,54 +1575,41 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, // No VI instruction. 
This class is for SI only. } -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { +// Two operand VOP3b instruction that may have a 3rd SGPR bool operand +// instead of an implicit VCC as in the VOP2b format. +multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; } multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, - bit HasMods, bit defExec, string revOp> { + bit HasMods, bit defExec, + string revOp, list<SchedReadWrite> sched> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } } @@ -1432,32 +1634,28 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, } } -multiclass VOP1_Helper <vop1 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - bit HasMods> { +multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64> { - defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; + defm _e32 : VOP1_m <op, opName, p, pat32>; - defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; + defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + p.HasModifiers>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) >; multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> { 
- defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; + defm _e32 : VOP1SI_m <op, opName, P, []>; defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1467,36 +1665,33 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, opName, P.HasModifiers>; } -multiclass VOP2_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> { - defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e32 : VOP2SI_m <op, opName, P, [], revOp>; defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1508,58 +1703,55 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, opName, revOp, P.HasModifiers>; } -multiclass VOP2b_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { +multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, + string revOp, bit useSGPRInput> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + } - defm _e64 : VOP3b_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, + opName, revOp, p.HasModifiers, useSGPRInput>; + } } multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp, !eq(P.NumSrcArgs, 3) >; // A VOP2 instruction that is VOP3-only on VI. 
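The useSGPRInput case above covers instructions like V_ADDC_U32 that also read a carry bit (VCC in the e32 form, any SGPR pair in the e64 form), which is why the 32-bit encoding lists VCC among its uses. Per lane this amounts to a 32-bit full adder; a host-side sketch (addc is an illustrative name, not the patch's code):

    #include <cassert>
    #include <cstdint>

    static uint32_t addc(uint32_t A, uint32_t B, bool CarryIn, bool &CarryOut) {
      uint32_t Sum = A + B + (CarryIn ? 1u : 0u);
      // Carry out iff the true sum exceeded 32 bits; with a carry-in,
      // wrapping back exactly to A also indicates a carry.
      CarryOut = Sum < A || (CarryIn && Sum == A);
      return Sum;
    }

    int main() {
      bool C;
      // The low half of a 64-bit add carries into the high half.
      uint32_t Lo = addc(0xffffffffu, 0x00000001u, false, C);
      assert(Lo == 0 && C);
      uint32_t Hi = addc(0x00000001u, 0x00000000u, C, C);
      assert(Hi == 2 && !C);
    }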
-multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, string revOp> { + + defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>; - defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName, - revOp, HasMods>; + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { @@ -1583,64 +1775,75 @@ let isCodeGenOnly = 0 in { } // End isCodeGenOnly = 0 } -class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : +class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE>, - MnemonicAlias<opName#"_e32", opName> { + SIMCInstr<opName#"_e32", SISubtarget.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } -multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo <outs, ins, pattern, opName>; - - def _si : VOPC<op.SI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isSICI]; +multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, + string opName, bit DefExec, VOPProfile p, + list<SchedReadWrite> sched, + string revOpName = "", string asm = opName#"_e32 "#op_asm, + string alias_asm = opName#" "#op_asm> { + def "" : VOPC_Pseudo <ins, pattern, opName> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = sched; } - def _vi : VOPC<op.VI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isVI]; - } + let AssemblerPredicates = [isSICI] in { + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isSICI] + + let AssemblerPredicates = [isVI] in { + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isVI] + + defm : SIInstAliasBuilder<alias_asm, p>; } -multiclass VOPC_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; +multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, + list<dag> 
pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>; + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>; } // Special case for class instructions which only have modifiers on // the 1st source operand. -multiclass VOPC_Class_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; - - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>, +multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, + list<dag> pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, string revOp = opName, - bit DefExec = 0> : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched = [Write32Bit]> : + VOPC_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, @@ -1648,51 +1851,51 @@ multiclass VOPCInst <vopc op, string opName, (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp + DefExec, revOp, P, sched >; multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, - bit DefExec = 0> : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched> : VOPC_Class_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName + DefExec, opName, P, sched >; multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>; multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>; multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>; multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>; multiclass VOPCX <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, + list<SchedReadWrite> sched, string revOp = ""> - : VOPCInst <op, opName, P, cond, revOp, 1>; + 
: VOPCInst <op, opName, P, cond, revOp, 1, sched>; multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>; multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < @@ -1700,16 +1903,16 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, >; multiclass VOPC_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; multiclass VOPCX_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>; multiclass VOPC_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>; multiclass VOPCX_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < @@ -1761,25 +1964,13 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, 3, 1 >; -multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, - string opName, list<dag> pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 +multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> : + VOP3b_2_3_m < + op, P.Outs64, P.Ins64, + opName#" "#P.Asm64, pattern, + opName, "", 1, 1 >; -multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; - -multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; - - class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), @@ -1925,12 +2116,14 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - let data1 = 0 in { - def _si : DS_Off16_Real_si <op, 
opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + let data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } } @@ -1939,11 +2132,13 @@ multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, @@ -2214,7 +2409,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 >; @@ -2233,7 +2428,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, @@ -2245,7 +2440,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 @@ -2256,6 +2451,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } +// FIXME: tfe can't be an operand because it requires a separate +// opcode because it needs an N+1 register class dest register. multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -2368,47 +2565,121 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, } // End mayLoad = 0, mayStore = 1 } -class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : - FLAT <op, (outs regClass:$vdst), - (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> { - let data = 0; - let mayLoad = 1; +// For cache invalidation instructions. +multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in { + def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>; + + // Set everything to 0. 
+ let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0, + vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>; + } + + def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>; + } + } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" } -class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : - FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr, - glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - name#" $data, $addr"#"$glc"#"$slc"#"$tfe", - []> { +//===----------------------------------------------------------------------===// +// FLAT classes +//===----------------------------------------------------------------------===// - let mayLoad = 0; - let mayStore = 1; +class flat <bits<7> ci, bits<7> vi = ci> { + field bits<7> CI = ci; + field bits<7> VI = vi; +} - // Encoding - let vdst = 0; +class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + FLAT <0, outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; } -multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc, - RegisterClass data_rc = vdst_rc> { +class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicate = isCIOnly; +} - let mayLoad = 1, mayStore = 1 in { - def "" : FLAT <op, (outs), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $addr, $data"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 0> { - let glc = 0; - let vdst = 0; - } +class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicate = VIAssemblerPredicate; +} - def _RTN : FLAT <op, (outs vdst_rc:$vdst), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 1> { - let glc = 1; - } +multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, + list<dag> pattern> { + def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>, + AtomicNoRet <NAME, 1>; + + def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>; +} + +multiclass FLAT_Load_Helper <flat op, string asm_name, + RegisterClass regClass, + dag outs = (outs regClass:$vdst), + dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> { + + let data = 0, mayLoad = 1 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_Store_Helper <flat op, string asm_name, + RegisterClass vdataClass, + dag outs = (outs), + dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc, + slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> { + + let mayLoad = 0, mayStore = 1, vdst = 0 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc, + dag outs_noret = (outs), + string asm_noret = asm_name#" 
$addr, $data"#"$slc"#"$tfe"> { + + let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { + def "" : FLAT_Pseudo <NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>, + AtomicNoRet <NAME, 0>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + } + + let glc = 1, hasPostISelHook = 1 in { + defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index e0eeea9..89692ab 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,9 @@ def isGCN : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; @@ -57,41 +59,39 @@ defm EXP : EXP_m; // SMRD Instructions //===----------------------------------------------------------------------===// -let mayLoad = 1 in { - // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. 
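// A minimal sketch (hypothetical helper, not part of this patch) of the
// invariant stated above: SMRD results are constrained to SGPR_32 precisely
// because that class, unlike SReg_32, excludes M0, so a GPU-hanging write to
// M0 can never be selected.
//
//   static bool isSafeSMRDDst(unsigned Reg) {
//     return AMDGPU::SGPR_32RegClass.contains(Reg); // SGPR_32 excludes M0
//   }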
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 + smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 + smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 + smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 + smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 + smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; -} // mayLoad = 1 - //def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; + +defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv", + int_amdgcn_s_dcache_inv>; //===----------------------------------------------------------------------===// // SOP1 Instructions @@ -123,7 +123,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] + [(set i32:$dst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; @@ -144,7 +144,7 @@ defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] + [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; @@ -183,10 +183,14 @@ defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []> defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; + +let Uses = [M0] in { defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +} // End Uses = [M0] + defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; let Defs = [SCC] in { @@ -354,7 +358,7 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; // SOPK Instructions //===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { +let 
isReMaterializable = 1, isMoveImm = 1 in { defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; } // End isReMaterializable = 1 let Uses = [SCC] in { @@ -438,36 +442,38 @@ def S_BRANCH : SOPP < let isBarrier = 1; } -let DisableEncoding = "$scc" in { +let Uses = [SCC] in { def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16" >; -} // End DisableEncoding = "$scc" +} // End Uses = [SCC] +let Uses = [VCC] in { def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; +} // End Uses = [VCC] -let DisableEncoding = "$exec" in { +let Uses = [EXEC] in { def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; -} // End DisableEncoding = "$exec" +} // End Uses = [EXEC] } // End isBranch = 1 @@ -477,11 +483,11 @@ let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_AMDGPU_barrier_local)] > { + let SchedRW = [WriteBarrier]; let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; let mayLoad = 1; let mayStore = 1; + let isConvergent = 1; } def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; @@ -805,9 +811,6 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; @@ -905,11 +908,6 @@ defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; -//let SubtargetPredicate = isCI in { -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 -//} // End isCI - //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// @@ -951,13 +949,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 
0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < @@ -1034,9 +1032,12 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>; + +let SubtargetPredicate = isSI in { +defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI +} + +defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1155,8 +1156,8 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o" // VOP1 Instructions //===----------------------------------------------------------------------===// -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>; } let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -1292,7 +1293,9 @@ defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", VOP_F64_F64, fsqrt >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteQuarterRate32] in { defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", VOP_F32_F32, AMDGPUsin @@ -1300,6 +1303,9 @@ defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", VOP_F32_F32, AMDGPUcos >; + +} // End SchedRW = [WriteQuarterRate32] + defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; @@ -1308,24 +1314,33 @@ defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", VOP_I32_F64 >; + +let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", VOP_F64_F64 >; -defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; + +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", + VOP_F64_F64 +>; +} // End SchedRW = [WriteDoubleAdd] + + defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", VOP_I32_F32 >; defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", VOP_F32_F32 >; -let vdst = 0, src0 = 0 in { -defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], - "v_clrexcp" ->; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>; } + +let Uses = [M0, EXEC] in { defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; 
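// Illustrative semantics for the movrel family gated on M0 above and below
// (an assumed model of the relative VGPR indexing, not taken from this patch):
//
//   uint32_t v_movrels_b32(const uint32_t *VGPR, unsigned SrcIdx, uint32_t M0) {
//     return VGPR[SrcIdx + M0]; // relative source read
//   }
//   void v_movreld_b32(uint32_t *VGPR, unsigned DstIdx, uint32_t M0, uint32_t Src) {
//     VGPR[DstIdx + M0] = Src;  // relative destination write
//   }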
defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +} // End Uses = [M0, EXEC] // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1343,7 +1358,7 @@ defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy >; -} // End let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -1360,7 +1375,7 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", // VINTRP Instructions //===----------------------------------------------------------------------===// -let Uses = [M0] in { +let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP instructions. @@ -1405,16 +1420,14 @@ defm V_INTERP_MOV_F32 : VINTRP_m < [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), (i32 imm:$attr)))]>; -} // End Uses = [M0] +} // End Uses = [M0, EXEC] //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// multiclass V_CNDMASK <vop2 op, string name> { - defm _e32 : VOP2_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], - name, name>; + defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>; defm _e64 : VOP3_m < op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, @@ -1500,34 +1513,32 @@ let isCommutable = 1 in { defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; } // End isCommutable = 1 -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +let isCommutable = 1 in { // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI, // but the VI instructions behave the same as the SI versions.
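// The VOP2b profiles below give the carry an explicit i1 result instead of an
// implicit VCC def. A sketch of the carry-out rule they encode (assumed model,
// not from this patch):
//
//   uint32_t v_add_i32(uint32_t A, uint32_t B, bool &Carry) {
//     uint64_t R = uint64_t(A) + uint64_t(B);
//     Carry = R > UINT32_MAX; // carry-out, normally materialized in VCC
//     return uint32_t(R);
//   }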
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", - VOP_I32_I32_I32, add + VOP2b_I32_I1_I32_I32 >; -defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>; defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" + VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" >; -let Uses = [VCC] in { // Carry-in comes from VCC defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" + VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" >; -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] +} // End isCommutable = 1 defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, @@ -1575,10 +1586,10 @@ defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", VOP_I32_I32_I32 >; defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_lo >; defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_hi >; defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp @@ -1704,15 +1715,15 @@ defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", @@ -1735,7 +1746,7 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -} // let SchedRW = [WriteDouble] +} // let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { @@ -1756,16 +1767,21 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", } // isCommutable = 1, SchedRW = [WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", + VOP3b_F32_I1_F32_F32_F32 +>; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale.
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", + VOP3b_F64_I1_F64_F64_F64 +>; } // let SchedRW = [WriteDouble] -let isCommutable = 1, Uses = [VCC] in { +let isCommutable = 1, Uses = [VCC, EXEC] in { +let SchedRW = [WriteFloatFMA] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) @@ -1774,6 +1790,7 @@ let isCommutable = 1, Uses = [VCC] in { defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; +} let SchedRW = [WriteDouble] in { // v_div_fmas_f64: @@ -1786,7 +1803,7 @@ defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", >; } // End SchedRW = [WriteDouble] -} // End isCommutable = 1 +} // End isCommutable = 1, Uses = [VCC, EXEC] //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; @@ -1835,13 +1852,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] >; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; } // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 -let hasSideEffects = 1 in { +let hasSideEffects = 1, SALU = 1 in { def SGPR_USE : InstSI <(outs),(ins), "", []>; } @@ -1921,39 +1938,9 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < +class SI_INDIRECT_SRC<RegisterClass rc> : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore<dag outs> : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - -def SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + (ins rc:$src, VSrc_32:$idx, i32imm:$off), "si_indirect_src $dst, $temp, $src, $idx, $off", [] >; @@ -1967,6 +1954,13 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI < let Constraints = "$src = $dst"; } +// TODO: We can support indirect SGPR access. 
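// The per-width SI_INDIRECT_SRC pseudos below replace the old single pseudo
// that took an unknown:$src; one pseudo per register class lets the machine
// verifier check the vector operand. Intended behaviour as a sketch (assumed
// model; SILowerControlFlowPass::IndirectSrc later expands it with
// V_MOVRELS_B32 after loading $idx into M0):
//
//   uint32_t si_indirect_src(const uint32_t *Vec, uint32_t Idx, uint32_t Off) {
//     return Vec[Idx + Off]; // dst = vec[idx + off]
//   }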
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; +def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; +def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; + def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; @@ -1977,19 +1971,24 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - let UseNamedOperandTable = 1 in { + let UseNamedOperandTable = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), + (ins sgpr_class:$src, i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs sgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + (ins i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1 } @@ -2003,19 +2002,25 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { + let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs vgpr_class:$dst), (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1, VGPRSpill = 1 } @@ -2030,9 +2035,11 @@ let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] +> { + let SALU = 1; +} } // End Defs = [SCC] @@ -2072,84 +2079,63 @@ def : Pat < // SMRD Patterns //===----------------------------------------------------------------------===// -multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { +multiclass SMRD_Pattern <string Instr, ValueType vt> { - // 1. SI-CI: Offset as 8bit DWORD immediate + // 1. IMM offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset)) >; - // 2. Offset loaded in an 32bit SGPR + // 2. SGPR offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset)) >; - // 3. No offset at all def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset)) + > { + let Predicates = [isCIOnly]; + } } -multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. 
Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -let Predicates = [isSICI] in { -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; -} // End Predicates = [isSICI] +// Global and constant loads can be selected to either MUBUF or SMRD +// instructions, but SMRD instructions are faster so we want the instruction +// selector to prefer those. let AddedComplexity = 100 in { -let Predicates = [isVI] in { -defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; -} // End Predicates = [isVI] +defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; -let Predicates = [isSICI] in { +// 1. Offset as an immediate def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset) >; -// 1. Offset as 8bit DWORD immediate +// 2. Offset loaded in a 32-bit SGPR def : Pat < - (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) + (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset) >; -} // End Predicates = [isSICI] +let Predicates = [isCI] in { -// 2.
Offset loaded in an 32bit SGPR def : Pat < - (SIload_constant v4i32:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) + (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset) >; +} // End Predicates = [isCI] + +} // End let AddedComplexity = 100 + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -2161,6 +2147,11 @@ def : Pat < (S_MOV_B32 0), sub1)) >; +def : Pat < + (i32 (smax i32:$x, (i32 (ineg i32:$x)))), + (S_ABS_I32 $x) +>; + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// @@ -2488,6 +2479,11 @@ def : Pat < /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ +//def : Extract_Element<i64, v2i64, 0, sub0_sub1>; +//def : Extract_Element<i64, v2i64, 1, sub2_sub3>; +//def : Extract_Element<f64, v2f64, 0, sub0_sub1>; +//def : Extract_Element<f64, v2f64, 1, sub2_sub3>; + foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) @@ -2568,11 +2564,25 @@ def : BitConvert <v2i32, i64, VReg_64>; def : BitConvert <i64, v2i32, VReg_64>; def : BitConvert <v2f32, i64, VReg_64>; def : BitConvert <i64, v2f32, VReg_64>; +def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <f64, v2f32, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v4f32, v4i32, VReg_128>; def : BitConvert <v4i32, v4f32, VReg_128>; + +def : BitConvert <v2i64, v4i32, SReg_128>; +def : BitConvert <v4i32, v2i64, SReg_128>; + +def : BitConvert <v2f64, v4f32, VReg_128>; +def : BitConvert <v2f64, v4i32, VReg_128>; +def : BitConvert <v4f32, v2f64, VReg_128>; +def : BitConvert <v4i32, v2f64, VReg_128>; + + + + def : BitConvert <v8f32, v8i32, SReg_256>; def : BitConvert <v8i32, v8f32, SReg_256>; def : BitConvert <v8i32, v32i8, SReg_256>; @@ -2601,10 +2611,9 @@ def : Pat < // Prevent expanding both fneg and fabs. -// FIXME: Should use S_OR_B32 def : Pat < (fneg (fabs f32:$src)), - (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ + (S_OR_B32 $src, 0x80000000) /* Set sign bit */ >; // FIXME: Should use S_OR_B32 @@ -2836,10 +2845,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < // -1. For the non-rtn variants, the manual says it does // DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max // will always do the increment so I'm assuming it's the same. class DSAtomicIncRetPat<DS inst, ValueType vt, Instruction LoadImm, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), @@ -2855,9 +2860,9 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < // 32-bit atomics.
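// A worked model of the wrapping-increment rule quoted above (taken directly
// from that comment, not from vendor documentation):
//
//   uint32_t ds_inc(uint32_t Mem, uint32_t D0) {
//     return (Mem >= D0) ? 0 : Mem + 1; // D0 == UINT32_MAX never wraps early
//   }
//
// Passing the all-ones immediate therefore makes the returning and
// non-returning variants increment identically, which the patterns below
// rely on.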
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_add_local>; + V_MOV_B32_e32, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_sub_local>; + V_MOV_B32_e32, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; @@ -2874,9 +2879,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; // 64-bit atomics. def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_add_local>; + V_MOV_B64_PSEUDO, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_sub_local>; + V_MOV_B64_PSEUDO, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; @@ -3019,90 +3024,46 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? -defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - -// Remaining instructions: -// FLAT_* -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// S_DCACHE_INV_VOL -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -} // End isCI - /********** ====================== **********/ /********** Indirect addressing **********/ /********** ====================== **********/ -multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> { +multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { // 1. Extract with offset def : Pat< - (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), - (SI_INDIRECT_SRC $vec, $idx, imm:$off) + (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) >; // 2. Extract without offset def : Pat< - (eltvt (vector_extract vt:$vec, i32:$idx)), - (SI_INDIRECT_SRC $vec, $idx, 0) + (eltvt (extractelt vt:$vec, i32:$idx)), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) >; // 3. Insert with offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst $vec, $idx, imm:$off, $val) + (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) >; // 4.
Insert without offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst $vec, $idx, 0, $val) + (insertelt vt:$vec, eltvt:$val, i32:$idx), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) >; } -defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; +defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; -defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; +defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; //===----------------------------------------------------------------------===// // Conversion Patterns @@ -3215,12 +3176,12 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) >; def : Pat < (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), (EXTRACT_SUBREG $a, sub0)), 1) >; @@ -3301,24 +3262,6 @@ def : Pat < } // End Predicates = [isSI] -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index c319b32..126f624 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,6 +103,10 @@ public: return "SI Lower control flow instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -140,8 +144,7 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { DebugLoc DL = From.getDebugLoc(); BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); + .addOperand(To); } void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { @@ -159,8 +162,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { // If the exec mask is non-zero, skip the next two instructions BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); + .addImm(3); // Exec mask is zero: Export to NULL target... 
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) @@ -269,8 +271,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) { .addReg(Src); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); + .addOperand(MI.getOperand(1)); MI.eraseFromParent(); } @@ -316,7 +317,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { .addImm(0); } } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) .addImm(0) .addOperand(Op); } @@ -362,9 +363,9 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int .addReg(AMDGPU::VCC_LO); // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(Idx); // Update EXEC, save the original EXEC value to VCC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) @@ -385,8 +386,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); + .addImm(-7); // Restore EXEC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -438,7 +438,6 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Vec, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -460,7 +459,6 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) .addReg(Reg, RegState::Define) .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Dst, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -486,11 +484,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + if (TII->isWQM(MI) || TII->isDS(MI)) NeedWQM = true; // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) + if (TII->isFLAT(MI)) NeedFlat = true; switch (MI.getOpcode()) { @@ -541,7 +539,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Branch(MI); break; - case AMDGPU::SI_INDIRECT_SRC: + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: IndirectSrc(MI); break; diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 67421e2..a2fa5fd 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -48,6 +48,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 587ea63..49677fc 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -29,10 +29,118 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), + ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), PSInputAddr(0), + ReturnsVoid(true), + LDSWaveSpillSize(0), + PSInputEna(0), NumUserSGPRs(0), - LDSWaveSpillSize(0) { } + NumSystemSGPRs(0), + HasSpilledSGPRs(false), + HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), + DispatchPtr(false), + QueuePtr(false), + DispatchID(false), + KernargSegmentPtr(false), + FlatScratchInit(false), + GridWorkgroupCountX(false), + GridWorkgroupCountY(false), + GridWorkgroupCountZ(false), + WorkGroupIDX(true), + WorkGroupIDY(false), + WorkGroupIDZ(false), + WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), + WorkItemIDX(true), + WorkItemIDY(false), + WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const Function *F = MF.getFunction(); + + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = 
FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; +} SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, @@ -52,8 +160,18 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + + if (LaneVGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + + // When compiling from inside Mesa, the compilation continues. + // Select an arbitrary register to avoid triggering assertions + // during subsequent passes. + LaneVGPR = AMDGPU::VGPR0; + } + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); // Add this register as live-in to all blocks to avoid machine verifier // complaining about use of an undefined physical register. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 667da4c..846ee5d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,13 +26,87 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; - bool HasSpilledVGPRs; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. + unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order.
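// Each field below records the first SGPR of one preloaded input; the add*
// helpers in SIMachineFunctionInfo.cpp above allocate them contiguously. The
// shared pattern, as a sketch (mirroring addDispatchPtr; the 64-bit case is
// an example, not new API):
//
//   Reg = TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0,
//                                 &AMDGPU::SReg_64RegClass);
//   NumUserSGPRs += 2; // a 64-bit pointer consumes two user SGPRs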
+ unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. + unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + + // Graphics info. + unsigned PSInputAddr; + bool ReturnsVoid; public: + // FIXME: Make private + unsigned LDSWaveSpillSize; + unsigned PSInputEna; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned ScratchOffsetReg; + unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; + +private: + bool HasSpilledSGPRs; + bool HasSpilledVGPRs; + + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; + bool DispatchPtr : 1; + bool QueuePtr : 1; + bool DispatchID : 1; + bool KernargSegmentPtr : 1; + bool FlatScratchInit : 1; + bool GridWorkgroupCountX : 1; + bool GridWorkgroupCountY : 1; + bool GridWorkgroupCountZ : 1; + + // Feature bits required for inputs passed in system SGPRs. + bool WorkGroupIDX : 1; // Always initialized. + bool WorkGroupIDY : 1; + bool WorkGroupIDZ : 1; + bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; + + bool WorkItemIDX : 1; // Always initialized. + bool WorkItemIDY : 1; + bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + +public: struct SpilledReg { unsigned VGPR; int Lane; @@ -46,16 +120,182 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map<unsigned, unsigned> LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. 
+ unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + + bool hasDispatchPtr() const { + return DispatchPtr; + } + + bool hasQueuePtr() const { + return QueuePtr; + } + + bool hasDispatchID() const { + return DispatchID; + } + + bool hasKernargSegmentPtr() const { + return KernargSegmentPtr; + } + + bool hasFlatScratchInit() const { + return FlatScratchInit; + } + + bool hasGridWorkgroupCountX() const { + return GridWorkgroupCountX; + } + + bool hasGridWorkgroupCountY() const { + return GridWorkgroupCountY; + } + + bool hasGridWorkgroupCountZ() const { + return GridWorkgroupCountZ; + } + + bool hasWorkGroupIDX() const { + return WorkGroupIDX; + } + + bool hasWorkGroupIDY() const { + return WorkGroupIDY; + } + + bool hasWorkGroupIDZ() const { + return WorkGroupIDZ; + } + + bool hasWorkGroupInfo() const { + return WorkGroupInfo; + } + + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + + bool hasWorkItemIDX() const { + return WorkItemIDX; + } + + bool hasWorkItemIDY() const { + return WorkItemIDY; + } + + bool hasWorkItemIDZ() const { + return WorkItemIDZ; + } + + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + /// \brief Returns the physical register reserved for use as the resource + /// descriptor for scratch accesses. 
+  unsigned getScratchRSrcReg() const {
+    return ScratchRSrcReg;
+  }
+
+  void setScratchRSrcReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchRSrcReg = Reg;
+  }
+
+  unsigned getScratchWaveOffsetReg() const {
+    return ScratchWaveOffsetReg;
+  }
+
+  void setScratchWaveOffsetReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchWaveOffsetReg = Reg;
+  }
+
+  bool hasSpilledSGPRs() const {
+    return HasSpilledSGPRs;
+  }
+
+  void setHasSpilledSGPRs(bool Spill = true) {
+    HasSpilledSGPRs = Spill;
+  }
+
+  bool hasSpilledVGPRs() const {
+    return HasSpilledVGPRs;
+  }
+
+  void setHasSpilledVGPRs(bool Spill = true) {
+    HasSpilledVGPRs = Spill;
+  }
+
+  unsigned getPSInputAddr() const {
+    return PSInputAddr;
+  }
+
+  bool isPSInputAllocated(unsigned Index) const {
+    return PSInputAddr & (1 << Index);
+  }
+
+  void markPSInputAllocated(unsigned Index) {
+    PSInputAddr |= 1 << Index;
+  }
+
+  bool returnsVoid() const {
+    return ReturnsVoid;
+  }
+
+  void setIfReturnsVoid(bool Value) {
+    ReturnsVoid = Value;
+  }
 
   unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
 };
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
new file mode 100644
index 0000000..1cfa984
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -0,0 +1,1968 @@
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineScheduler.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several specific architecture behaviours that can't be modelled
+// by GenericScheduler:
+// . When accessing the result of an SGPR load instruction, you have to wait
+//   for all the SGPR load instructions before your current instruction to
+//   have finished.
+// . When accessing the result of a VGPR load instruction, you have to wait
+//   for all the VGPR load instructions previous to the VGPR load instruction
+//   you are interested in to finish.
+// . The lower the register pressure, the better load latencies are hidden.
+//
+// Moreover some specificities (like the fact that a lot of instructions in
+// the shader have few dependencies) make the generic scheduler have some
+// unpredictable behaviours. For example when register pressure becomes high,
+// it can either manage to prevent register pressure from going too high, or
+// it can increase register pressure even more than if it hadn't taken
+// register pressure into account.
+//
+// Also some other bad behaviours are generated, like loading at the beginning
+// of the shader a constant in a VGPR you won't need until the end of the
+// shader.
+//
+// The scheduling problem for SI can be split into three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+//   performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact the HW can issue several instructions from different
+// wavefronts if they are of different types, etc)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks also helps keep register usage
+// under control.
+//
+// First the instructions are put into blocks.
+// We want the blocks to help control register usage and hide high latencies
+// later. To help control register usage, we typically want all local
+// computations, for example when you create a result that can be consumed
+// right away, to be contained in a block. Block inputs and outputs would
+// typically be important results that are needed in several locations of
+// the shader. Since we do want blocks to help hide high latencies, we want
+// the instructions inside the block to have a minimal set of dependencies
+// on high latencies. That will make it easy to pick blocks to hide specific
+// high latencies.
+// The block creation algorithm is divided into several steps, and several
+// variants can be tried during the scheduling process.
+//
+// Second the order of the instructions inside the blocks is chosen.
+// At that step we take into account only register usage and hiding
+// low latency instructions.
+//
+// Third the block order is chosen; there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// Actually when talking about 'low latency' or 'high latency' it includes
+// both the latency for the cache (or global mem) data to reach the register,
+// and the bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, which is why even if the wavefront count is high,
+// we have to try to have as many instructions hiding high latencies as
+// possible.
+// The OpenCL doc gives for example a latency of 400 cycles for a global mem
+// access, which is hidden by 10 instructions if the wavefront count is 10.
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64-byte
+// lines.
+// Constant cache is shared by 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated with the 3 other VALUs
+// of the CU do texture sampling too. (Don't take these figures too seriously,
+// as I'm not 100% sure of the computation)
+// Data exports should get similar latency.
+// For constant loading, the cache is shared by 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
+// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as
+// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
+// same cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says global memory access can be
+// 300-600 cycles. We do not specially take that into account when scheduling,
+// as we expect the driver to be able to preload the constants soon.
+
+
+// common code //
+
+#ifndef NDEBUG
+
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+  switch (Reason) {
+  case NoCand:    return "NOCAND";
+  case RegUsage:  return "REGUSAGE";
+  case Latency:   return "LATENCY";
+  case Successor: return "SUCCESSOR";
+  case Depth:     return "DEPTH";
+  case NodeOrder: return "ORDER";
+  }
+  llvm_unreachable("Unknown reason!");
+}
+
+#endif
+
+static bool tryLess(int TryVal, int CandVal,
+                    SISchedulerCandidate &TryCand,
+                    SISchedulerCandidate &Cand,
+                    SIScheduleCandReason Reason) {
+  if (TryVal < CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal > CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+                       SISchedulerCandidate &TryCand,
+                       SISchedulerCandidate &Cand,
+                       SIScheduleCandReason Reason) {
+  if (TryVal > CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal < CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+// SIScheduleBlock //
+
+void SIScheduleBlock::addUnit(SUnit *SU) {
+  NodeNum2Index[SU->NodeNum] = SUnits.size();
+  SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+  dbgs() << '\n';
+}
+#endif
+
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+                                          SISchedCandidate &TryCand) {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return;
+  }
+
+  if (Cand.SGPRUsage > 60 &&
+      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Schedule low latency instructions as close to the top as possible.
+  // Order of priority is:
+  // . Low latency instructions which do not depend on other low latency
+  //   instructions we haven't waited for
+  // . Other instructions which do not depend on low latency instructions
+  //   we haven't waited for
+  // . Low latencies
+  // . All other instructions
+  // Goal is to get: low latency instructions - independent instructions
+  //   - (possibly some more low latency instructions)
+  //   - instructions that depend on the first low latency instructions.
+  // If the block contains a lot of constant loads, SGPR usage can get quite
+  // high; thus, above the arbitrary limit of 60, we encourage using the
+  // already loaded constants (in order to release some SGPRs) before
+  // loading more.
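+  // For example (hypothetical): if candidate A is an s_load whose address
+  // comes from an earlier s_load we have not yet waited for, and candidate
+  // B is a v_add with no such parent, B is picked first (second category
+  // above), even though A is itself a low latency instruction.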
+  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+              Cand.HasLowLatencyNonWaitedParent,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+                 TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (TryCand.IsLowLatency &&
+      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Fall through to original instruction order.
+  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+    TryCand.Reason = NodeOrder;
+  }
+}
+
+SUnit* SIScheduleBlock::pickNode() {
+  SISchedCandidate TopCand;
+
+  for (SUnit* SU : TopReadySUs) {
+    SISchedCandidate TryCand;
+    std::vector<unsigned> pressure;
+    std::vector<unsigned> MaxPressure;
+    // Predict register usage after this instruction.
+    TryCand.SU = SU;
+    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
+    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
+    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
+    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
+    TryCand.HasLowLatencyNonWaitedParent =
+      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
+    tryCandidateTopDown(TopCand, TryCand);
+    if (TryCand.Reason != NoCand)
+      TopCand.setBest(TryCand);
+  }
+
+  return TopCand.SU;
+}
+
+
+// Schedule something valid.
+void SIScheduleBlock::fastSchedule() {
+  TopReadySUs.clear();
+  if (Scheduled)
+    undoSchedule();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = TopReadySUs[0];
+    ScheduledSUnits.push_back(SU);
+    nodeScheduled(SU);
+  }
+
+  Scheduled = true;
+}
+
+// Returns true if the register was defined between First and Last.
+static bool isDefBetween(unsigned Reg,
+                         SlotIndex First, SlotIndex Last,
+                         const MachineRegisterInfo *MRI,
+                         const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::def_instr_iterator
+       UI = MRI->def_instr_begin(Reg),
+       UE = MRI->def_instr_end(); UI != UE; ++UI) {
+    const MachineInstr* MI = &*UI;
+    if (MI->isDebugValue())
+      continue;
+    SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+    if (InstSlot >= First && InstSlot <= Last)
+      return true;
+  }
+  return false;
+}
+
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                                      MachineBasicBlock::iterator EndBlock) {
+  IntervalPressure Pressure, BotPressure;
+  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+  LiveIntervals *LIS = DAG->getLIS();
+  MachineRegisterInfo *MRI = DAG->getMRI();
+  DAG->initRPTracker(TopRPTracker);
+  DAG->initRPTracker(BotRPTracker);
+  DAG->initRPTracker(RPTracker);
+
+  // Goes through all SUs. RPTracker captures what had to be alive for the
+  // SUs to execute, and what is still alive at the end.
+  for (SUnit* SU : ScheduledSUnits) {
+    RPTracker.setPos(SU->getInstr());
+    RPTracker.advance();
+  }
+
+  // Close the RPTracker to finalize live ins/outs.
+  RPTracker.closeRegion();
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Do not track physical registers, because it messes up.
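+  // (Only virtual registers are kept in LiveInRegs/LiveOutRegs below; the
+  // block scheduler applies the same filter in addLiveRegs.)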
+  for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      LiveInRegs.insert(Reg);
+  }
+  LiveOutRegs.clear();
+  // There are several possibilities to distinguish:
+  // 1) Reg is not input to any instruction in the block, but is output of one
+  // 2) 1) + read in the block and not needed after it
+  // 3) 1) + read in the block but needed in another block
+  // 4) Reg is input of an instruction but another block will read it too
+  // 5) Reg is input of an instruction and then rewritten in the block.
+  //    result is not read in the block (implies used in another block)
+  // 6) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block and not needed in another block
+  // 7) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block but also needed in another block
+  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7
+  // We want LiveOutRegs to contain only Regs whose content will be read after
+  // in another block, and whose content was written in the current block,
+  // that is we want it to get 1, 3, 5, 7
+  // Since we made the MIs of a block be packed all together before
+  // scheduling, the LiveIntervals were correct, and the RPTracker was
+  // able to correctly handle 5 vs 6, 2 vs 3.
+  // (Note: This is not sufficient for the RPTracker not to make mistakes
+  // for case 4)
+  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7
+  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
+  // The use of isDefBetween removes the case 4.
+  for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+        isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
+                     LIS->getInstructionIndex(EndBlock).getRegSlot(),
+                     MRI, LIS)) {
+      LiveOutRegs.insert(Reg);
+    }
+  }
+
+  // Pressure = sum_alive_registers register size
+  // Internally llvm will represent some registers as big 128-bit registers
+  // for example, but they actually correspond to 4 actual 32-bit registers.
+  // Thus Pressure is not equal to num_alive_registers * constant.
+  LiveInPressure = TopPressure.MaxSetPressure;
+  LiveOutPressure = BotPressure.MaxSetPressure;
+
+  // Prepares TopRPTracker for top down scheduling.
+  TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+                               MachineBasicBlock::iterator EndBlock) {
+  if (!Scheduled)
+    fastSchedule();
+
+  // PreScheduling phase to set LiveIn and LiveOut.
+  initRegPressure(BeginBlock, EndBlock);
+  undoSchedule();
+
+  // Schedule for real now.
+
+  TopReadySUs.clear();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = pickNode();
+    ScheduledSUnits.push_back(SU);
+    TopRPTracker.setPos(SU->getInstr());
+    TopRPTracker.advance();
+    nodeScheduled(SU);
+  }
+
+  // TODO: compute InternalAdditionnalPressure.
+  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+  // Check everything is right.
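+  // (The NDEBUG-guarded block below asserts that every SU was scheduled and
+  // that none has predecessors left.)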
+#ifndef NDEBUG
+  assert(SUnits.size() == ScheduledSUnits.size() &&
+         TopReadySUs.empty());
+  for (SUnit* SU : SUnits) {
+    assert(SU->isScheduled &&
+           SU->NumPredsLeft == 0);
+  }
+#endif
+
+  Scheduled = true;
+}
+
+void SIScheduleBlock::undoSchedule() {
+  for (SUnit* SU : SUnits) {
+    SU->isScheduled = false;
+    for (SDep& Succ : SU->Succs) {
+      if (BC->isSUInBlock(Succ.getSUnit(), ID))
+        undoReleaseSucc(SU, &Succ);
+    }
+  }
+  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+  ScheduledSUnits.clear();
+  Scheduled = false;
+}
+
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    ++SuccSU->WeakPredsLeft;
+    return;
+  }
+  ++SuccSU->NumPredsLeft;
+}
+
+void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    --SuccSU->WeakPredsLeft;
+    return;
+  }
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(DAG);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(nullptr);
+  }
+#endif
+
+  --SuccSU->NumPredsLeft;
+}
+
+/// Release the successors of SU that are inside the block (if InOrOutBlock
+/// is true) or outside it (if InOrOutBlock is false).
+void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
+  for (SDep& Succ : SU->Succs) {
+    SUnit *SuccSU = Succ.getSUnit();
+
+    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
+      continue;
+
+    releaseSucc(SU, &Succ);
+    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
+      TopReadySUs.push_back(SuccSU);
+  }
+}
+
+void SIScheduleBlock::nodeScheduled(SUnit *SU) {
+  // SU must be in TopReadySUs.
+  assert(!SU->NumPredsLeft);
+  std::vector<SUnit*>::iterator I =
+    std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
+  if (I == TopReadySUs.end()) {
+    dbgs() << "Data Structure Bug in SI Scheduler\n";
+    llvm_unreachable(nullptr);
+  }
+  TopReadySUs.erase(I);
+
+  releaseSuccessors(SU, true);
+  // Scheduling this node will trigger a wait,
+  // thus propagate to other instructions that they do not need to wait either.
+  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
+    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+
+  if (DAG->IsLowLatencySU[SU->NodeNum]) {
+    for (SDep& Succ : SU->Succs) {
+      std::map<unsigned, unsigned>::iterator I =
+        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
+      if (I != NodeNum2Index.end())
+        HasLowLatencyNonWaitedParent[I->second] = 1;
+    }
+  }
+  SU->isScheduled = true;
+}
+
+void SIScheduleBlock::finalizeUnits() {
+  // We remove links from outside blocks to enable scheduling inside the block.
+  for (SUnit* SU : SUnits) {
+    releaseSuccessors(SU, false);
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      HighLatencyBlock = true;
+  }
+  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
+}
+
+// We maintain ascending order of IDs.
+void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
+  unsigned PredID = Pred->getID();
+
+  // Check if not already a predecessor.
+  for (SIScheduleBlock* P : Preds) {
+    if (PredID == P->getID())
+      return;
+  }
+  Preds.push_back(Pred);
+
+#ifndef NDEBUG
+  for (SIScheduleBlock* S : Succs) {
+    if (PredID == S->getID())
+      assert(!"Loop in the Block Graph!\n");
+  }
+#endif
+}
+
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+  unsigned SuccID = Succ->getID();
+
+  // Check if not already a successor.
+  for (SIScheduleBlock* S : Succs) {
+    if (SuccID == S->getID())
+      return;
+  }
+  if (Succ->isHighLatencyBlock())
+    ++NumHighLatencySuccessors;
+  Succs.push_back(Succ);
+#ifndef NDEBUG
+  for (SIScheduleBlock* P : Preds) {
+    if (SuccID == P->getID())
+      assert(!"Loop in the Block Graph!\n");
+  }
+#endif
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::printDebug(bool full) {
+  dbgs() << "Block (" << ID << ")\n";
+  if (!full)
+    return;
+
+  dbgs() << "\nContains High Latency Instruction: "
+         << HighLatencyBlock << '\n';
+  dbgs() << "\nDepends On:\n";
+  for (SIScheduleBlock* P : Preds) {
+    P->printDebug(false);
+  }
+
+  dbgs() << "\nSuccessors:\n";
+  for (SIScheduleBlock* S : Succs) {
+    S->printDebug(false);
+  }
+
+  if (Scheduled) {
+    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
+           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
+    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
+           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+    dbgs() << "LiveIns:\n";
+    for (unsigned Reg : LiveInRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+
+    dbgs() << "\nLiveOuts:\n";
+    for (unsigned Reg : LiveOutRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+  }
+
+  dbgs() << "\nInstructions:\n";
+  for (SUnit* SU : SUnits) {
+    SU->dump(DAG);
+  }
+
+  dbgs() << "///////////////////////\n";
+}
+
+#endif
+
+// SIScheduleBlockCreator //
+
+SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
+DAG(DAG) {
+}
+
+SIScheduleBlockCreator::~SIScheduleBlockCreator() {
+}
+
+SIScheduleBlocks
+SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
+  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
+    Blocks.find(BlockVariant);
+  if (B == Blocks.end()) {
+    SIScheduleBlocks Res;
+    createBlocksForVariant(BlockVariant);
+    topologicalSort();
+    scheduleInsideBlocks();
+    fillStats();
+    Res.Blocks = CurrentBlocks;
+    Res.TopDownIndex2Block = TopDownIndex2Block;
+    Res.TopDownBlock2Index = TopDownBlock2Index;
+    Blocks[BlockVariant] = Res;
+    return Res;
+  } else {
+    return B->second;
+  }
+}
+
+bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
+  if (SU->NodeNum >= DAG->SUnits.size())
+    return false;
+  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesAlone() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      CurrentColoring[SU->NodeNum] = NextReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesGroups() {
+  unsigned DAGSize = DAG->SUnits.size();
+  unsigned NumHighLatencies = 0;
+  unsigned GroupSize;
+  unsigned Color = NextReservedID;
+  unsigned Count = 0;
+  std::set<unsigned> FormingGroup;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      ++NumHighLatencies;
+  }
+
+  if (NumHighLatencies == 0)
+    return;
+
+  if (NumHighLatencies <= 6)
+    GroupSize = 2;
+  else if (NumHighLatencies <= 12)
+    GroupSize = 3;
+  else
+    GroupSize = 4;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      bool CompatibleGroup = true;
+      unsigned ProposedColor = Color;
+      for (unsigned j : FormingGroup) {
+        // TODO: Currently CompatibleGroup will always be false,
+        // because the graph enforces the load order. This
+        // can be fixed, but as keeping the load order is often
+        // good for performance, breaking it causes a performance hit
+        // (with both the default scheduler and this scheduler).
+        // When this scheduler determines a good load order,
+        // this can be fixed.
+        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
+            !DAG->canAddEdge(&DAG->SUnits[j], SU))
+          CompatibleGroup = false;
+      }
+      if (!CompatibleGroup || ++Count == GroupSize) {
+        FormingGroup.clear();
+        Color = ++NextReservedID;
+        if (!CompatibleGroup) {
+          ProposedColor = Color;
+          FormingGroup.insert(SU->NodeNum);
+        }
+        Count = 0;
+      } else {
+        FormingGroup.insert(SU->NodeNum);
+      }
+      CurrentColoring[SU->NodeNum] = ProposedColor;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+  CurrentTopDownReservedDependencyColoring.clear();
+  CurrentBottomUpReservedDependencyColoring.clear();
+
+  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+  // Traverse TopDown, and give different colors to SUs depending
+  // on which combination of High Latencies they depend on.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+    }
+    // Color 0 by default.
+    if (SUColors.empty())
+      continue;
+    // Same color as parents.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+
+  ColorCombinations.clear();
+
+  // Same as before, but BottomUp.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+    }
+    // Keep color 0.
+    if (SUColors.empty())
+      continue;
+    // Same color as parents.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
+
+  // Every combination of colors given by the top down
+  // and bottom up reserved node dependency colorings
+  // gets its own color.
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    std::pair<unsigned, unsigned> SUColors;
+
+    // High latency instructions: already given.
+    if (CurrentColoring[SU->NodeNum])
+      continue;
+
+    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
+    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
+
+    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
+      ColorCombinations.find(SUColors);
+    if (Pos != ColorCombinations.end()) {
+      CurrentColoring[SU->NodeNum] = Pos->second;
+    } else {
+      CurrentColoring[SU->NodeNum] = NextNonReservedID;
+      ColorCombinations[SUColors] = NextNonReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::vector<int> PendingColoring = CurrentColoring;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    std::set<unsigned> SUColors;
+    std::set<unsigned> SUColorsPending;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
+          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentColoring[Succ->NodeNum]);
+      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
+      PendingColoring[SU->NodeNum] = *SUColors.begin();
+    else // TODO: Attribute new colors depending on color
+         // combination of children.
+ PendingColoring[SU->NodeNum] = NextNonReservedID++; + } + CurrentColoring = PendingColoring; +} + + +void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned PreviousColor; + std::set<unsigned> SeenColors; + + if (DAGSize <= 1) + return; + + PreviousColor = CurrentColoring[0]; + + for (unsigned i = 1, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned CurrentColor = CurrentColoring[i]; + unsigned PreviousColorSave = PreviousColor; + assert(i == SU->NodeNum); + + if (CurrentColor != PreviousColor) + SeenColors.insert(PreviousColor); + PreviousColor = CurrentColor; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (SeenColors.find(CurrentColor) == SeenColors.end()) + continue; + + if (PreviousColorSave != CurrentColor) + CurrentColoring[i] = NextNonReservedID++; + else + CurrentColoring[i] = CurrentColoring[i-1]; + } +} + +void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + // No predecessor: Vgpr constant loading. + // Low latency instructions usually have a predecessor (the address) + if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned, unsigned> ColorCount; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + unsigned color = CurrentColoring[SU->NodeNum]; + std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); + if (Pos != ColorCount.end()) { + ++ColorCount[color]; + } else { + ColorCount[color] = 1; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + 
unsigned color = CurrentColoring[SU->NodeNum];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (ColorCount[color] > 1)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && *SUColors.begin() != color) {
+      --ColorCount[color];
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+      ++ColorCount[*SUColors.begin()];
+    }
+  }
+}
+
+void SIScheduleBlockCreator::cutHugeBlocks() {
+  // TODO
+}
+
+void SIScheduleBlockCreator::regroupNoUserInstructions() {
+  unsigned DAGSize = DAG->SUnits.size();
+  int GroupID = NextNonReservedID++;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+    bool hasSuccessor = false;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      hasSuccessor = true;
+    }
+    if (!hasSuccessor)
+      CurrentColoring[SU->NodeNum] = GroupID;
+  }
+}
+
+void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<unsigned,unsigned> RealID;
+
+  CurrentBlocks.clear();
+  CurrentColoring.clear();
+  CurrentColoring.resize(DAGSize, 0);
+  Node2CurrentBlock.clear();
+
+  // Restore links a previous scheduling variant may have overridden.
+  DAG->restoreSULinksLeft();
+
+  NextReservedID = 1;
+  NextNonReservedID = DAGSize + 1;
+
+  DEBUG(dbgs() << "Coloring the graph\n");
+
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
+    colorHighLatenciesGroups();
+  else
+    colorHighLatenciesAlone();
+  colorComputeReservedDependencies();
+  colorAccordingToReservedDependencies();
+  colorEndsAccordingToDependencies();
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
+    colorForceConsecutiveOrderInGroup();
+  regroupNoUserInstructions();
+  colorMergeConstantLoadsNextGroup();
+  colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Put SUs of the same color into the same block.
+  Node2CurrentBlock.resize(DAGSize, -1);
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    unsigned Color = CurrentColoring[SU->NodeNum];
+    if (RealID.find(Color) == RealID.end()) {
+      int ID = CurrentBlocks.size();
+      BlockPtrs.push_back(
+        make_unique<SIScheduleBlock>(DAG, this, ID));
+      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+      RealID[Color] = ID;
+    }
+    CurrentBlocks[RealID[Color]]->addUnit(SU);
+    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+  }
+
+  // Build dependencies between blocks.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    int SUID = Node2CurrentBlock[i];
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+    }
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+    }
+  }
+
+  // Free roots and leaves of all blocks to enable scheduling inside them.
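+  // (finalizeUnits() calls releaseSuccessors(SU, false), which drops the
+  // dependency counts of edges leaving the block, so that scheduling inside
+  // a block only sees intra-block dependencies.)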
+  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->finalizeUnits();
+  }
+  DEBUG(
+    dbgs() << "Blocks created:\n\n";
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+// Two functions taken from CodeGen/MachineScheduler.cpp
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+            MachineBasicBlock::const_iterator End) {
+  for(; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  // Cast the return value to nonconst MachineInstr, then cast to an
+  // instr_iterator, which does not check for null, and finally return a
+  // bundle_iterator.
+  return MachineBasicBlock::instr_iterator(
+    const_cast<MachineInstr*>(
+      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+void SIScheduleBlockCreator::topologicalSort() {
+  unsigned DAGSize = CurrentBlocks.size();
+  std::vector<int> WorkList;
+
+  DEBUG(dbgs() << "Topological Sort\n");
+
+  WorkList.reserve(DAGSize);
+  TopDownIndex2Block.resize(DAGSize);
+  TopDownBlock2Index.resize(DAGSize);
+  BottomUpIndex2Block.resize(DAGSize);
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    unsigned Degree = Block->getSuccs().size();
+    TopDownBlock2Index[i] = Degree;
+    if (Degree == 0) {
+      WorkList.push_back(i);
+    }
+  }
+
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    int i = WorkList.back();
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    WorkList.pop_back();
+    TopDownBlock2Index[i] = --Id;
+    TopDownIndex2Block[Id] = i;
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      if (!--TopDownBlock2Index[Pred->getID()])
+        WorkList.push_back(Pred->getID());
+    }
+  }
+
+#ifndef NDEBUG
+  // Check correctness of the ordering.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
+             "Wrong Top Down topological sorting");
+    }
+  }
+#endif
+
+  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
+                                         TopDownIndex2Block.rend());
+}
+
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+
+  // We first compute a valid schedule, such that each Block corresponds
+  // to a contiguous range of instructions.
+  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->fastSchedule();
+  }
+
+  // Note: the following code, and the part restoring the previous position,
+  // is by far the most expensive operation of the Scheduler.
+
+  // Do not update CurrentTop.
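+  // (We advance a separate iterator, CurrentTopFastSched, so that
+  // DAG->getCurrentTop() still points at the start of the region when the
+  // original order is restored further down.)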
+  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+  std::vector<MachineBasicBlock::iterator> PosOld;
+  std::vector<MachineBasicBlock::iterator> PosNew;
+  PosOld.reserve(DAG->SUnits.size());
+  PosNew.reserve(DAG->SUnits.size());
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (SUnit* SU : SUs) {
+      MachineInstr *MI = SU->getInstr();
+      MachineBasicBlock::iterator Pos = MI;
+      PosOld.push_back(Pos);
+      if (&*CurrentTopFastSched == MI) {
+        PosNew.push_back(Pos);
+        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+                                          DAG->getCurrentBottom());
+      } else {
+        // Update the instruction stream.
+        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+        // Update LiveIntervals.
+        // Note: Moving all instructions and calling handleMove every time
+        // is the most CPU intensive operation of the scheduler.
+        // It would gain a lot if there were a way to recompute the
+        // LiveIntervals for the entire scheduling region.
+        DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+        PosNew.push_back(CurrentTopFastSched);
+      }
+    }
+  }
+
+  // Now we have Block of SUs == Block of MI.
+  // We do the final schedule for the instructions inside the block.
+  // The property that all the SUs of the Block are grouped together as MI
+  // is used for correct reg usage tracking.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+  }
+
+  DEBUG(dbgs() << "Restoring MI Pos\n");
+  // Restore old ordering (which prevents a LIS->handleMove bug).
+  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+    MachineBasicBlock::iterator POld = PosOld[i-1];
+    MachineBasicBlock::iterator PNew = PosNew[i-1];
+    if (PNew != POld) {
+      // Update the instruction stream.
+      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+      // Update LiveIntervals.
+      DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true);
+    }
+  }
+
+  DEBUG(
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+void SIScheduleBlockCreator::fillStats() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getPreds().size() == 0)
+      Block->Depth = 0;
+    else {
+      unsigned Depth = 0;
+      for (SIScheduleBlock *Pred : Block->getPreds()) {
+        if (Depth < Pred->Depth + 1)
+          Depth = Pred->Depth + 1;
+      }
+      Block->Depth = Depth;
+    }
+  }
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = BottomUpIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getSuccs().size() == 0)
+      Block->Height = 0;
+    else {
+      unsigned Height = 0;
+      for (SIScheduleBlock *Succ : Block->getSuccs()) {
+        if (Height < Succ->Height + 1)
+          Height = Succ->Height + 1;
+      }
+      Block->Height = Height;
+    }
+  }
+}
+
+// SIScheduleBlockScheduler //
+
+SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                          SISchedulerBlockSchedulerVariant Variant,
+                          SIScheduleBlocks BlocksStruct) :
+  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
+  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
+  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
+
+  // Fill the usage of every output.
+  // Warning: while by construction we always have a link between two blocks
+  // when one needs a result from the other, the number of users of an output
+  // is not simply the number of child blocks taking the same virtual
+  // register as input, because the register coalescer can merge registers.
+  // Here is an example. A produces x and y. B consumes x and produces x'.
+  // C consumes x' and y. The register coalescer may have attributed the same
+  // virtual register to x and x'.
+  // To count accurately, we do a topological sort. In case the register is
+  // found for several parents, we increment the usage of the one with the
+  // highest topological index.
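+  // In the example above: if x and x' were coalesced into one virtual
+  // register, C appears to consume it from both A and B; the usage is then
+  // credited to B only, since B has the higher topological index (B depends
+  // on A).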
+ LiveOutRegsNumUsages.resize(Blocks.size()); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + int topoInd = -1; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { + topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; + } + } + } + + if (!Found) + continue; + + int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; + std::map<unsigned, unsigned>::iterator RegPos = + LiveOutRegsNumUsages[PredID].find(Reg); + if (RegPos != LiveOutRegsNumUsages[PredID].end()) { + ++LiveOutRegsNumUsages[PredID][Reg]; + } else { + LiveOutRegsNumUsages[PredID][Reg] = 1; + } + } + } + + LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); + BlockNumPredsLeft.resize(Blocks.size()); + BlockNumSuccsLeft.resize(Blocks.size()); + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + BlockNumPredsLeft[i] = Block->getPreds().size(); + BlockNumSuccsLeft[i] = Block->getSuccs().size(); + } + +#ifndef NDEBUG + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + assert(Block->getID() == i); + } +#endif + + std::set<unsigned> InRegs = DAG->getInRegs(); + addLiveRegs(InRegs); + + // Fill LiveRegsConsumers for regs that were already + // defined before scheduling. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + break; + } + } + + if (!Found) { + if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) + LiveRegsConsumers[Reg] = 1; + else + ++LiveRegsConsumers[Reg]; + } + } + } + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + if (BlockNumPredsLeft[i] == 0) { + ReadyBlocks.push_back(Block); + } + } + + while (SIScheduleBlock *Block = pickBlock()) { + BlocksScheduled.push_back(Block); + blockScheduled(Block); + } + + DEBUG( + dbgs() << "Block Order:"; + for (SIScheduleBlock* Block : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } + ); +} + +bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Try to hide high latencies. + if (tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + return true; + // Schedule high latencies early so you can hide them better. 
+  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+                 TryCand, Cand, Latency))
+    return true;
+  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
+                                          TryCand, Cand, Depth))
+    return true;
+  if (tryGreater(TryCand.NumHighLatencySuccessors,
+                 Cand.NumHighLatencySuccessors,
+                 TryCand, Cand, Successor))
+    return true;
+  return false;
+}
+
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                                                    SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+              TryCand, Cand, RegUsage))
+    return true;
+  if (tryGreater(TryCand.NumSuccessors > 0,
+                 Cand.NumSuccessors > 0,
+                 TryCand, Cand, Successor))
+    return true;
+  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+    return true;
+  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+              TryCand, Cand, RegUsage))
+    return true;
+  return false;
+}
+
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+  SIBlockSchedCandidate Cand;
+  std::vector<SIScheduleBlock*>::iterator Best;
+  SIScheduleBlock *Block;
+  if (ReadyBlocks.empty())
+    return nullptr;
+
+  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+                        VregCurrentUsage, SregCurrentUsage);
+  if (VregCurrentUsage > maxVregUsage)
+    maxVregUsage = VregCurrentUsage;
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
+  DEBUG(
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (SIScheduleBlock* Block : ReadyBlocks)
+      dbgs() << Block->getID() << ' ';
+    dbgs() << "\nCurrent Live:\n";
+    for (unsigned Reg : LiveRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+    dbgs() << '\n';
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+  );
+
+  Cand.Block = nullptr;
+  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+       E = ReadyBlocks.end(); I != E; ++I) {
+    SIBlockSchedCandidate TryCand;
+    TryCand.Block = *I;
+    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+    TryCand.VGPRUsageDiff =
+      checkRegUsageImpact(TryCand.Block->getInRegs(),
+                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+    TryCand.NumHighLatencySuccessors =
+      TryCand.Block->getNumHighLatencySuccessors();
+    TryCand.LastPosHighLatParentScheduled =
+      (unsigned int) std::max<int> (0,
+         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+           LastPosWaitedHighLatency);
+    TryCand.Height = TryCand.Block->Height;
+    // Try not to increase VGPR usage too much, else we may spill.
+    if (VregCurrentUsage > 120 ||
+        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+      if (!tryCandidateRegUsage(Cand, TryCand) &&
+          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+        tryCandidateLatency(Cand, TryCand);
+    } else {
+      if (!tryCandidateLatency(Cand, TryCand))
+        tryCandidateRegUsage(Cand, TryCand);
+    }
+    if (TryCand.Reason != NoCand) {
+      Cand.setBest(TryCand);
+      Best = I;
+      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+                   << getReasonStr(Cand.Reason) << '\n');
+    }
+  }
+
+  DEBUG(
+    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+    dbgs() << "Is a block with high latency instruction: "
+           << (Cand.IsHighLatency ?
"yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n'; + ); + + Block = Cand.Block; + ReadyBlocks.erase(Best); + return Block; +} + +// Tracking of currently alive registers to determine VGPR Usage. + +void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + // If not already in the live set, then add it. + (void) LiveRegs.insert(Reg); + } +} + +void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, + std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); + assert (Pos != LiveRegs.end() && // Reg must be live. + LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && + LiveRegsConsumers[Reg] >= 1); + --LiveRegsConsumers[Reg]; + if (LiveRegsConsumers[Reg] == 0) + LiveRegs.erase(Pos); + } +} + +void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { + for (SIScheduleBlock* Block : Parent->getSuccs()) { + --BlockNumPredsLeft[Block->getID()]; + if (BlockNumPredsLeft[Block->getID()] == 0) { + ReadyBlocks.push_back(Block); + } + // TODO: Improve check. When the dependency between the high latency + // instructions and the instructions of the other blocks are WAR or WAW + // there will be no wait triggered. We would like these cases to not + // update LastPosHighLatencyParentScheduled. + if (Parent->isHighLatencyBlock()) + LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + } +} + +void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { + decreaseLiveRegs(Block, Block->getInRegs()); + addLiveRegs(Block->getOutRegs()); + releaseBlockSuccs(Block); + for (std::map<unsigned, unsigned>::iterator RegI = + LiveOutRegsNumUsages[Block->getID()].begin(), + E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { + std::pair<unsigned, unsigned> RegP = *RegI; + if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) + LiveRegsConsumers[RegP.first] = RegP.second; + else { + assert(LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; + } + } + if (LastPosHighLatencyParentScheduled[Block->getID()] > + (unsigned)LastPosWaitedHighLatency) + LastPosWaitedHighLatency = + LastPosHighLatencyParentScheduled[Block->getID()]; + ++NumBlockScheduled; +} + +std::vector<int> +SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs) { + std::vector<int> DiffSetPressure; + DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); + + for (unsigned Reg : InRegs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + if (LiveRegsConsumers[Reg] > 1) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] -= PSetI.getWeight(); + } + } + + for (unsigned Reg : OutRegs) { + // For now only track virtual registers. 
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] += PSetI.getWeight(); + } + } + + return DiffSetPressure; +} + +// SIScheduler // + +struct SIScheduleBlockResult +SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant) { + SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); + SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); + std::vector<SIScheduleBlock*> ScheduledBlocks; + struct SIScheduleBlockResult Res; + + ScheduledBlocks = Scheduler.getBlocks(); + + for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { + SIScheduleBlock *Block = ScheduledBlocks[b]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) + Res.SUs.push_back(SU->NodeNum); + } + + Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); + Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); + return Res; +} + +// SIScheduleDAGMI // + +SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : + ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) { + SITII = static_cast<const SIInstrInfo*>(TII); + SITRI = static_cast<const SIRegisterInfo*>(TRI); + + VGPRSetID = SITRI->getVGPR32PressureSet(); + SGPRSetID = SITRI->getSGPR32PressureSet(); +} + +SIScheduleDAGMI::~SIScheduleDAGMI() { +} + +ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { + return new SIScheduleDAGMI(C); +} + +// Code adapted from scheduleDAG.cpp +// Does a topological sort over the SUs. +// Both TopDown and BottomUp +void SIScheduleDAGMI::topologicalSort() { + std::vector<int> TopDownSU2Index; + unsigned DAGSize = SUnits.size(); + std::vector<SUnit*> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + WorkList.reserve(DAGSize); + + TopDownIndex2SU.resize(DAGSize); + TopDownSU2Index.resize(DAGSize); + BottomUpIndex2SU.resize(DAGSize); + + WorkList.push_back(&getExitSU()); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + TopDownSU2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + TopDownSU2Index[SU->NodeNum] = --Id; + TopDownIndex2SU[Id] = SU->NodeNum; + } + for (SDep& Pred : SU->Preds) { + SUnit *SU = Pred.getSUnit(); + if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } + + BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(), + TopDownIndex2SU.rend()); + +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Pred : SU->Preds) { + if (Pred.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] > + TopDownSU2Index[Pred.getSUnit()->NodeNum] && + "Wrong Top Down topological sorting"); + } + } + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Succ : SU->Succs) { + if (Succ.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] < + TopDownSU2Index[Succ.getSUnit()->NodeNum] && + "Wrong Bottom Up topological sorting"); + } + } +#endif +} + +// Move low latencies further from their user without +// increasing SGPR usage (in general) +// 
This is to be replaced by a better pass that would
+// take into account SGPR usage (based on VGPR usage
+// and the corresponding wavefront count), that would
+// try to merge groups of loads if it makes sense, etc.
+void SIScheduleDAGMI::moveLowLatencies() {
+  unsigned DAGSize = SUnits.size();
+  int LastLowLatencyUser = -1;
+  int LastLowLatencyPos = -1;
+
+  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[ScheduledSUnits[i]];
+    bool IsLowLatencyUser = false;
+    unsigned MinPos = 0;
+
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
+        IsLowLatencyUser = true;
+      }
+      if (Pred->NodeNum >= DAGSize)
+        continue;
+      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+      if (PredPos >= MinPos)
+        MinPos = PredPos + 1;
+    }
+
+    if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+      unsigned BestPos = LastLowLatencyUser + 1;
+      if ((int)BestPos <= LastLowLatencyPos)
+        BestPos = LastLowLatencyPos + 1;
+      if (BestPos < MinPos)
+        BestPos = MinPos;
+      if (BestPos < i) {
+        for (unsigned u = i; u > BestPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[BestPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+      }
+      LastLowLatencyPos = BestPos;
+      if (IsLowLatencyUser)
+        LastLowLatencyUser = BestPos;
+    } else if (IsLowLatencyUser) {
+      LastLowLatencyUser = i;
+    // Also move the COPY instructions that the low latency
+    // instructions depend on.
+    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+      bool CopyForLowLat = false;
+      for (SDep& SuccDep : SU->Succs) {
+        SUnit *Succ = SuccDep.getSUnit();
+        if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
+          CopyForLowLat = true;
+        }
+      }
+      if (!CopyForLowLat)
+        continue;
+      if (MinPos < i) {
+        for (unsigned u = i; u > MinPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[MinPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+      }
+    }
+  }
+}
+
+void SIScheduleDAGMI::restoreSULinksLeft() {
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    SUnits[i].isScheduled = false;
+    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
+    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
+    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
+    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
+  }
+}
+
+// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
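+//
+// Illustrative example (hypothetical register set, not taken from a real
+// run): a single VReg_64 virtual register contributes weight 2 to the
+// VGPR_32 pressure set, so with Regs holding only that register,
+//   unsigned VGPRs, SGPRs;
+//   fillVgprSgprCost(Regs.begin(), Regs.end(), VGPRs, SGPRs);
+// would report VGPRs == 2 and SGPRs == 0.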
+template<typename _Iterator> void +SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, + unsigned &VgprUsage, unsigned &SgprUsage) { + VgprUsage = 0; + SgprUsage = 0; + for (_Iterator RegI = First; RegI != End; ++RegI) { + unsigned Reg = *RegI; + // For now only track virtual registers + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = MRI.getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + if (*PSetI == VGPRSetID) + VgprUsage += PSetI.getWeight(); + else if (*PSetI == SGPRSetID) + SgprUsage += PSetI.getWeight(); + } + } +} + +void SIScheduleDAGMI::schedule() +{ + SmallVector<SUnit*, 8> TopRoots, BotRoots; + SIScheduleBlockResult Best, Temp; + DEBUG(dbgs() << "Preparing Scheduling\n"); + + buildDAGWithRegPressure(); + DEBUG( + for(SUnit& SU : SUnits) + SU.dumpAll(this) + ); + + Topo.InitDAGTopologicalSorting(); + topologicalSort(); + findRootsAndBiasEdges(TopRoots, BotRoots); + // We reuse several ScheduleDAGMI and ScheduleDAGMILive + // functions, but to make them happy we must initialize + // the default Scheduler implementation (even if we do not + // run it) + SchedImpl->initialize(this); + initQueues(TopRoots, BotRoots); + + // Fill some stats to help scheduling. + + SUnitsLinksBackup = SUnits; + IsLowLatencySU.clear(); + LowLatencyOffset.clear(); + IsHighLatencySU.clear(); + + IsLowLatencySU.resize(SUnits.size(), 0); + LowLatencyOffset.resize(SUnits.size(), 0); + IsHighLatencySU.resize(SUnits.size(), 0); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[i]; + unsigned BaseLatReg, OffLatReg; + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + IsLowLatencySU[i] = 1; + if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, + OffLatReg, TRI)) + LowLatencyOffset[i] = OffLatReg; + } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + IsHighLatencySU[i] = 1; + } + + SIScheduler Scheduler(this); + Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, + SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); +#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants + // which could lead to lower VGPR usage + if (Best.MaxVGPRUsage > 180) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + { LatenciesAlone, BlockRegUsageLatency }, +// { LatenciesAlone, BlockRegUsage }, + { LatenciesGrouped, BlockLatencyRegUsage }, +// { LatenciesGrouped, BlockRegUsageLatency }, +// { LatenciesGrouped, BlockRegUsage }, + { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, +// { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } + // if VGPR usage is still extremely high, we may spill. Try other variants + // which are less performing, but that could lead to lower VGPR usage. 
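+  // (On GCN a wave can use at most 256 VGPRs, and the number of waves in
+  // flight per SIMD is roughly 256 / VGPRs-per-wave, so at 200 VGPRs only
+  // a single wave fits and any further growth risks spilling.)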
+ if (Best.MaxVGPRUsage > 200) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { +// { LatenciesAlone, BlockRegUsageLatency }, + { LatenciesAlone, BlockRegUsage }, +// { LatenciesGrouped, BlockLatencyRegUsage }, + { LatenciesGrouped, BlockRegUsageLatency }, + { LatenciesGrouped, BlockRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, + { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, + { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } +#endif + ScheduledSUnits = Best.SUs; + ScheduledSUnitsInv.resize(SUnits.size()); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + ScheduledSUnitsInv[ScheduledSUnits[i]] = i; + } + + moveLowLatencies(); + + // Tell the outside world about the result of the scheduling. + + assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); + TopRPTracker.setPos(CurrentTop); + + for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), + E = ScheduledSUnits.end(); I != E; ++I) { + SUnit *SU = &SUnits[*I]; + + scheduleMI(SU, true); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); + } + + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + DEBUG({ + unsigned BBNum = begin()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h new file mode 100644 index 0000000..b270136 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -0,0 +1,489 @@ +//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H + +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +namespace llvm { + +enum SIScheduleCandReason { + NoCand, + RegUsage, + Latency, + Successor, + Depth, + NodeOrder +}; + +struct SISchedulerCandidate { + // The reason for this candidate. + SIScheduleCandReason Reason; + + // Set of reasons that apply to multiple candidates. + uint32_t RepeatReasonSet; + + SISchedulerCandidate() + : Reason(NoCand), RepeatReasonSet(0) {} + + bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } + void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } +}; + +class SIScheduleDAGMI; +class SIScheduleBlockCreator; + +class SIScheduleBlock { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator *BC; + + std::vector<SUnit*> SUnits; + std::map<unsigned, unsigned> NodeNum2Index; + std::vector<SUnit*> TopReadySUs; + std::vector<SUnit*> ScheduledSUnits; + + /// The top of the unscheduled zone. 
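+  /// (TopPressure and TopRPTracker mirror the top-down tracker of
+  /// ScheduleDAGMILive: they follow the boundary between the
+  /// already-scheduled and the still-unscheduled instructions of the block.)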
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  // Pressure: number of registers of a given class needed to hold the live
+  // virtual and physical registers.
+  // We care only about SGPR_32 and VGPR_32, and track only virtual registers.
+  // Pressure of additional registers required inside the block.
+  std::vector<unsigned> InternalAdditionnalPressure;
+  // Pressure of input and output registers
+  std::vector<unsigned> LiveInPressure;
+  std::vector<unsigned> LiveOutPressure;
+  // Registers required by the block (inputs), and the registers it defines
+  // for later blocks (outputs).
+  // We track only virtual registers.
+  // Note that some registers are not 32 bits,
+  // and thus the pressure is not equal
+  // to the number of live registers.
+  std::set<unsigned> LiveInRegs;
+  std::set<unsigned> LiveOutRegs;
+
+  bool Scheduled;
+  bool HighLatencyBlock;
+
+  std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+  // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
+  unsigned ID;
+
+  std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
+  std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
+  unsigned NumHighLatencySuccessors;
+
+public:
+  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+                  unsigned ID):
+    DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
+    TopRPTracker(TopPressure), Scheduled(false),
+    HighLatencyBlock(false), ID(ID),
+    Preds(), Succs(), NumHighLatencySuccessors(0) {};
+
+  ~SIScheduleBlock() {};
+
+  unsigned getID() const { return ID; }
+
+  /// Functions for Block construction.
+  void addUnit(SUnit *SU);
+
+  // When all SUs have been added.
+  void finalizeUnits();
+
+  // Add a block predecessor, i.e. a block containing an instruction
+  // predecessor of one of our SUs.
+  void addPred(SIScheduleBlock *Pred);
+  void addSucc(SIScheduleBlock *Succ);
+
+  const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
+  const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+
+  unsigned Height;  // Maximum top-down path length to a block without outputs
+  unsigned Depth;   // Maximum bottom-up path length to a block without inputs
+
+  unsigned getNumHighLatencySuccessors() const {
+    return NumHighLatencySuccessors;
+  }
+
+  bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+  // This is approximate.
+  // Ideally it should take into account that some instructions
+  // (rcp, etc.) are 4 times slower.
+  int getCost() { return SUnits.size(); }
+
+  // The block Predecessors and Successors must all be registered
+  // before fastSchedule().
+  // Fast schedule with no particular requirement.
+  void fastSchedule();
+
+  std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+  // Complete schedule that will try to minimize reg pressure and
+  // low latencies, and will fill liveins and liveouts.
+  // Needs all MIs to be grouped between BeginBlock and EndBlock.
+  // The MIs can be moved after the scheduling; the grouping is just used
+  // to allow correct tracking of the live registers.
+  void schedule(MachineBasicBlock::iterator BeginBlock,
+                MachineBasicBlock::iterator EndBlock);
+
+  bool isScheduled() { return Scheduled; }
+
+
+  // Needs the block to have been scheduled first.
+  // TODO: find a way to compute it.
+  std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+    return InternalAdditionnalPressure;
+  }
+
+  std::set<unsigned> &getInRegs() { return LiveInRegs; }
+  std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+  void printDebug(bool Full);
+
+private:
+  struct SISchedCandidate : SISchedulerCandidate {
+    // The best SUnit candidate.
+ SUnit *SU; + + unsigned SGPRUsage; + unsigned VGPRUsage; + bool IsLowLatency; + unsigned LowLatencyOffset; + bool HasLowLatencyNonWaitedParent; + + SISchedCandidate() + : SU(nullptr) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SISchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + SGPRUsage = Best.SGPRUsage; + VGPRUsage = Best.VGPRUsage; + IsLowLatency = Best.IsLowLatency; + LowLatencyOffset = Best.LowLatencyOffset; + HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; + } + }; + + void undoSchedule(); + + void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); + void releaseSucc(SUnit *SU, SDep *SuccEdge); + // InOrOutBlock: restrict to links pointing inside the block (true), + // or restrict to links pointing outside the block (false). + void releaseSuccessors(SUnit *SU, bool InOrOutBlock); + + void nodeScheduled(SUnit *SU); + void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); + void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); + SUnit* pickNode(); + void traceCandidate(const SISchedCandidate &Cand); + void initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); +}; + +struct SIScheduleBlocks { + std::vector<SIScheduleBlock*> Blocks; + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; +}; + +enum SISchedulerBlockCreatorVariant { + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive +}; + +class SIScheduleBlockCreator { + SIScheduleDAGMI *DAG; + // unique_ptr handles freeing memory for us. + std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs; + std::map<SISchedulerBlockCreatorVariant, + SIScheduleBlocks> Blocks; + std::vector<SIScheduleBlock*> CurrentBlocks; + std::vector<int> Node2CurrentBlock; + + // Topological sort + // Maps topological index to the node number. + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; + std::vector<int> BottomUpIndex2Block; + + // 0 -> Color not given. + // 1 to SUnits.size() -> Reserved group (you should only add elements to them). + // Above -> Other groups. + int NextReservedID; + int NextNonReservedID; + std::vector<int> CurrentColoring; + std::vector<int> CurrentTopDownReservedDependencyColoring; + std::vector<int> CurrentBottomUpReservedDependencyColoring; + +public: + SIScheduleBlockCreator(SIScheduleDAGMI *DAG); + ~SIScheduleBlockCreator(); + + SIScheduleBlocks + getBlocks(SISchedulerBlockCreatorVariant BlockVariant); + + bool isSUInBlock(SUnit *SU, unsigned ID); + +private: + // Give a Reserved color to every high latency. + void colorHighLatenciesAlone(); + + // Create groups of high latencies with a Reserved color. + void colorHighLatenciesGroups(); + + // Compute coloring for topdown and bottom traversals with + // different colors depending on dependencies on Reserved colors. + void colorComputeReservedDependencies(); + + // Give color to all non-colored SUs according to Reserved groups dependencies. + void colorAccordingToReservedDependencies(); + + // Divides Blocks having no bottom up or top down dependencies on Reserved groups. + // The new colors are computed according to the dependencies on the other blocks + // formed with colorAccordingToReservedDependencies. + void colorEndsAccordingToDependencies(); + + // Cut groups into groups with SUs in consecutive order (except for Reserved groups). 
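+  // For example (illustrative SU numbering): a group {SU0, SU1, SU4, SU5}
+  // would be split into {SU0, SU1} and {SU4, SU5}, since SU2 and SU3
+  // belong to other groups.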
+  void colorForceConsecutiveOrderInGroup();
+
+  // Merge constant loads whose users all sit in a single other group into
+  // that group.
+  // (TODO: else if all their users depend on the same group, put them there)
+  void colorMergeConstantLoadsNextGroup();
+
+  // Merge SUs whose users all sit in a single other group into that group.
+  void colorMergeIfPossibleNextGroup();
+
+  // Merge SUs whose users all sit in a single other group into that group,
+  // but only for Reserved groups.
+  void colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Merge SUs whose users all sit in a single other group into that group,
+  // but only if the group contains no more than a few SUs.
+  void colorMergeIfPossibleSmallGroupsToNextGroup();
+
+  // Divide blocks that are too large.
+  // Implementation idea: assign new colors depending on the top-down and
+  // bottom-up links to other blocks.
+  void cutHugeBlocks();
+
+  // Put in one group all instructions with no users in this scheduling region
+  // (we want these groups to be at the end).
+  void regroupNoUserInstructions();
+
+  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+
+  void topologicalSort();
+
+  void scheduleInsideBlocks();
+
+  void fillStats();
+};
+
+enum SISchedulerBlockSchedulerVariant {
+  BlockLatencyRegUsage,
+  BlockRegUsageLatency,
+  BlockRegUsage
+};
+
+class SIScheduleBlockScheduler {
+  SIScheduleDAGMI *DAG;
+  SISchedulerBlockSchedulerVariant Variant;
+  std::vector<SIScheduleBlock*> Blocks;
+
+  std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+  std::set<unsigned> LiveRegs;
+  // Number of schedulable, not yet scheduled blocks reading the register.
+  std::map<unsigned, unsigned> LiveRegsConsumers;
+
+  std::vector<unsigned> LastPosHighLatencyParentScheduled;
+  int LastPosWaitedHighLatency;
+
+  std::vector<SIScheduleBlock*> BlocksScheduled;
+  unsigned NumBlockScheduled;
+  std::vector<SIScheduleBlock*> ReadyBlocks;
+
+  unsigned VregCurrentUsage;
+  unsigned SregCurrentUsage;
+
+  // Currently only an approximation.
+  unsigned maxVregUsage;
+  unsigned maxSregUsage;
+
+  std::vector<unsigned> BlockNumPredsLeft;
+  std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+  SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                           SISchedulerBlockSchedulerVariant Variant,
+                           SIScheduleBlocks BlocksStruct);
+  ~SIScheduleBlockScheduler() {};
+
+  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; };
+
+  unsigned getVGPRUsage() { return maxVregUsage; };
+  unsigned getSGPRUsage() { return maxSregUsage; };
+
+private:
+  struct SIBlockSchedCandidate : SISchedulerCandidate {
+    // The best Block candidate.
+    SIScheduleBlock *Block;
+
+    bool IsHighLatency;
+    int VGPRUsageDiff;
+    unsigned NumSuccessors;
+    unsigned NumHighLatencySuccessors;
+    unsigned LastPosHighLatParentScheduled;
+    unsigned Height;
+
+    SIBlockSchedCandidate()
+      : Block(nullptr) {}
+
+    bool isValid() const { return Block; }
+
+    // Copy the status of another candidate without changing policy.
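+    // Typical use (a sketch of the intended protocol): fill in a TryCand,
+    // let tryCandidateLatency() or tryCandidateRegUsage() pick a Reason,
+    // then keep the winner:
+    //   if (tryCandidateLatency(Cand, TryCand))
+    //     Cand.setBest(TryCand);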
+ void setBest(SIBlockSchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + Block = Best.Block; + Reason = Best.Reason; + IsHighLatency = Best.IsHighLatency; + VGPRUsageDiff = Best.VGPRUsageDiff; + NumSuccessors = Best.NumSuccessors; + NumHighLatencySuccessors = Best.NumHighLatencySuccessors; + LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; + Height = Best.Height; + } + }; + + bool tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + SIScheduleBlock *pickBlock(); + + void addLiveRegs(std::set<unsigned> &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs); + void releaseBlockSuccs(SIScheduleBlock *Parent); + void blockScheduled(SIScheduleBlock *Block); + + // Check register pressure change + // by scheduling a block with these LiveIn and LiveOut. + std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs); + + void schedule(); +}; + +struct SIScheduleBlockResult { + std::vector<unsigned> SUs; + unsigned MaxSGPRUsage; + unsigned MaxVGPRUsage; +}; + +class SIScheduler { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator BlockCreator; + +public: + SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}; + + ~SIScheduler() {}; + + struct SIScheduleBlockResult + scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant); +}; + +class SIScheduleDAGMI : public ScheduleDAGMILive { + const SIInstrInfo *SITII; + const SIRegisterInfo *SITRI; + + std::vector<SUnit> SUnitsLinksBackup; + + // For moveLowLatencies. After all Scheduling variants are tested. + std::vector<unsigned> ScheduledSUnits; + std::vector<unsigned> ScheduledSUnitsInv; + + unsigned VGPRSetID; + unsigned SGPRSetID; + +public: + SIScheduleDAGMI(MachineSchedContext *C); + + ~SIScheduleDAGMI() override; + + // Entry point for the schedule. + void schedule() override; + + // To init Block's RPTracker. + void initRPTracker(RegPressureTracker &RPTracker) { + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + } + + MachineBasicBlock *getBB() { return BB; } + MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }; + MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }; + LiveIntervals *getLIS() { return LIS; } + MachineRegisterInfo *getMRI() { return &MRI; } + const TargetRegisterInfo *getTRI() { return TRI; } + SUnit& getEntrySU() { return EntrySU; }; + SUnit& getExitSU() { return ExitSU; }; + + void restoreSULinksLeft(); + + template<typename _Iterator> void fillVgprSgprCost(_Iterator First, + _Iterator End, + unsigned &VgprUsage, + unsigned &SgprUsage); + std::set<unsigned> getInRegs() { + std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(), + RPTracker.getPressure().LiveInRegs.end()); + return InRegs; + }; + + unsigned getVGPRSetID() const { return VGPRSetID; } + unsigned getSGPRSetID() const { return SGPRSetID; } + +private: + void topologicalSort(); + // After scheduling is done, improve low latency placements. + void moveLowLatencies(); + +public: + // Some stats for scheduling inside blocks. + std::vector<unsigned> IsLowLatencySU; + std::vector<unsigned> LowLatencyOffset; + std::vector<unsigned> IsHighLatencySU; + // Topological sort + // Maps topological index to the node number. 
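+  // e.g. TopDownIndex2SU[0] is the NodeNum of the SU that comes first in
+  // the top-down order; BottomUpIndex2SU is simply that order reversed.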
+ std::vector<int> TopDownIndex2SU; + std::vector<int> BottomUpIndex2SU; +}; + +} // namespace llvm + +#endif /* SIMACHINESCHEDULER_H_ */ diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index 2cd600d..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,193 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. - if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. 
- ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. - if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 
e9e8412..609f5e7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// - #include "SIRegisterInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -24,7 +23,20 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { + unsigned NumRegPressureSets = getNumRegPressureSets(); + + SGPR32SetID = NumRegPressureSets; + VGPR32SetID = NumRegPressureSets; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { + if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0) + SGPR32SetID = i; + else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) + VGPR32SetID = i; + } + assert(SGPR32SetID < NumRegPressureSets && + VGPR32SetID < NumRegPressureSets); +} void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { MCRegAliasIterator R(Reg, this, true); @@ -33,6 +45,42 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + // Leave space for flat_scr, xnack_mask, vcc, and alignment + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and + // 100/101 for vcc. This is the next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr, xnack_mask, vcc, + // and scratch resource. + return AMDGPU::SGPR91; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -42,17 +90,30 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - reserveRegisterTuples(Reserved, AMDGPU::VGPR254); - reserveRegisterTuples(Reserved, AMDGPU::VGPR255); + // Reserve the last 2 registers so we will always have at least 2 more that + // will physically contain VCC. + reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); + + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation + // for VCC/XNACK_MASK/FLAT_SCR. 
+ // + // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose + // SGPRs when the XNACK feature is not used. This is currently not done + // because the code that counts SGPRs cannot account for such holes. + reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); + reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); + reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + } // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. - if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { + if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - // Assume XNACK_MASK is unused. - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs). + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6; for (unsigned i = Limit; i < NumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); @@ -60,34 +121,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + unsigned VSLimit = SGPRLimit + VGPRLimit; + for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { + const TargetRegisterClass *RC = *I; - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); unsigned Limit; - if (isSGPRClass(*I)) { + if (isPseudoRegClass(RC)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. + Limit = VSLimit; + } else if (isSGPRClass(RC)) { Limit = SGPRLimit / NumSubRegs; } else { Limit = VGPRLimit / NumSubRegs; } - const int *Sets = getRegClassPressureSets(*I); + const int *Sets = getRegClassPressureSets(RC); assert(Sets); for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) + if (Sets[i] == (int)Idx) return Limit; } } @@ -174,17 +258,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); } } @@ -217,17 +301,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. } MI->eraseFromParent(); break; @@ -247,11 +329,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg) @@ -263,16 +340,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // TODO: only do this when it is needed switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states + // ("S_NOP 3") on SI + TII->insertWaitStates(MI, 4); break; case AMDGPUSubtarget::SEA_ISLANDS: break; default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. - TII->insertNOPs(MI, 4); + // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states + // ("S_NOP 4") on VI and later. This also applies to VALUs which write + // VCC, but we're unlikely to see VMEM use VCC. + TII->insertWaitStates(MI, 5); } MI->eraseFromParent(); @@ -322,22 +400,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } +// FIXME: This is very slow. It might be worth creating a map from physreg to +// register class. 
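+// One possible shape for such a cache (a sketch, not part of this change):
+//   DenseMap<unsigned, const TargetRegisterClass *> PhysRegClassCache;
+//   auto It = PhysRegClassCache.find(Reg);
+//   if (It != PhysRegClassCache.end())
+//     return It->second;
+//   // ...otherwise fall back to the linear search below and memoize.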
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - static const TargetRegisterClass *BaseClasses[] = { + static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, @@ -359,33 +431,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { return nullptr; } +// TODO: It might be helpful to have some target specific flags in +// TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); + switch (RC->getSize()) { + case 4: + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; + case 8: + return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; + case 12: + return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; + case 16: + return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; + case 32: + return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } } const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SRC) const { - if (hasVGPRs(SRC)) { - return SRC; - } else if (SRC == &AMDGPU::SCCRegRegClass) { - return &AMDGPU::VCCRegRegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VGPR_32RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { - return &AMDGPU::VReg_64RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { - return &AMDGPU::VReg_128RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { - return &AMDGPU::VReg_256RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { - return &AMDGPU::VReg_512RegClass; - } - return nullptr; + switch (SRC->getSize()) { + case 4: + return &AMDGPU::VGPR_32RegClass; + case 8: + return &AMDGPU::VReg_64RegClass; + case 12: + return &AMDGPU::VReg_96RegClass; + case 16: + return &AMDGPU::VReg_128RegClass; + case 32: + return &AMDGPU::VReg_256RegClass; + case 64: + return &AMDGPU::VReg_512RegClass; + default: + llvm_unreachable("Invalid register class size"); + } } const TargetRegisterClass *SIRegisterInfo::getSubRegClass( @@ -393,15 +477,65 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( if (SubIdx == AMDGPU::NoSubRegister) return RC; - // If this register has a sub-register, we can safely assume it is a 32-bit - // register, because all of SI's sub-registers are 32-bit. + // We can assume that each lane corresponds to one 32-bit register. 
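+  // e.g. for AMDGPU::sub0_sub1 the lane mask has two bits set, so Count
+  // below is 2 and we return the matching 64-bit register class.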
+  unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
   if (isSGPRClass(RC)) {
-    return &AMDGPU::SGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::SGPR_32RegClass;
+    case 2:
+      return &AMDGPU::SReg_64RegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   } else {
-    return &AMDGPU::VGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::VGPR_32RegClass;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    case 8:
+      return &AMDGPU::VReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   }
 }
 
+bool SIRegisterInfo::shouldRewriteCopySrc(
+  const TargetRegisterClass *DefRC,
+  unsigned DefSubReg,
+  const TargetRegisterClass *SrcRC,
+  unsigned SrcSubReg) const {
+  // We want to prefer the smallest register class possible, so we don't want
+  // to stop and rewrite on anything that looks like a subregister
+  // extract. Operations mostly don't care about the super register class, so
+  // we only want to stop on the most basic of copies between the same register
+  // class.
+  //
+  // e.g. if we have something like
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+  // vreg3 = COPY vreg2, sub0
+  //
+  // We want to look through the COPY to find:
+  // => vreg3 = COPY vreg0
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
 unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                           const TargetRegisterClass *SubRC,
                                           unsigned Channel) const {
@@ -462,30 +596,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
   return OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }
 
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should
+// be easy to detect from the intrinsics used. Scratch setup is harder to know.
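+//
+// Minimal usage sketch (hypothetical caller code):
+//   unsigned TgidX =
+//     TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X);
+// TgidX then names the system SGPR holding the X workgroup id; the call
+// asserts if that input was not set up for the function.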
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + (void)ST; switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_PTR: + assert(MFI->hasDispatchPtr()); + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); @@ -496,12 +647,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, // AMDGPU::NoRegister. unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } + for (unsigned Reg : *RC) + if (!MRI.isPhysRegUsed(Reg)) + return Reg; return AMDGPU::NoRegister; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7da6de2..9410e20 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -18,17 +18,30 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" namespace llvm { struct SIRegisterInfo : public AMDGPURegisterInfo { private: + unsigned SGPR32SetID; + unsigned VGPR32SetID; + void reserveRegisterTuples(BitVector &, unsigned Reg) const; public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. 
+ unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. + unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureSetLimit(const MachineFunction &MF, @@ -40,10 +53,6 @@ public: unsigned FIOperandNum, RegScavenger *RS) const override; - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. @@ -52,23 +61,30 @@ public: /// \returns true if this class contains only SGPR registers bool isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) - return false; - return !hasVGPRs(RC); } /// \returns true if this class ID contains only SGPR registers bool isSGPRClassID(unsigned RCID) const { - if (static_cast<int>(RCID) == -1) - return false; - return isSGPRClass(getRegClass(RCID)); } + bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return isSGPRClass(MRI.getRegClass(Reg)); + return getPhysRegClass(Reg); + } + /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; + /// returns true if this is a pseudoregister class combination of VGPRs and + /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on + /// them. + static bool isPseudoRegClass(const TargetRegisterClass *RC) { + return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass; + } + /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; @@ -79,6 +95,11 @@ public: const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const override; + /// \p Channel This is the register channel (e.g. a value from 0-16), not the /// SubReg index. /// \returns The sub-register of Reg that is in Channel. @@ -91,19 +112,25 @@ public: /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { - TGID_X, - TGID_Y, - TGID_Z, - SCRATCH_WAVE_OFFSET, - SCRATCH_PTR, - INPUT_PTR, - TIDIG_X, - TIDIG_Y, - TIDIG_Z + // SGPRS: + PRIVATE_SEGMENT_BUFFER = 0, + DISPATCH_PTR = 1, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + + // VGPRS: + FIRST_VGPR_VALUE = 15, + WORKITEM_ID_X = FIRST_VGPR_VALUE, + WORKITEM_ID_Y = 16, + WORKITEM_ID_Z = 17 }; /// \brief Returns the physical register that \p Value is stored in. 
@@ -122,6 +149,9 @@ public: unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; + unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; + unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + private: void buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, unsigned Value, diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 2a9017f..bfaf937 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -10,10 +10,13 @@ //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// - -class SIReg <string n, bits<16> encoding = 0> : Register<n> { +class SIReg <string n, bits<16> regIdx = 0> : Register<n>, + DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; - let HWEncoding = encoding; + + // This is the not yet the complete register encoding. An additional + // bit is set for VGPRs. + let HWEncoding = regIdx; } // Special Registers @@ -21,7 +24,8 @@ def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, + DwarfRegAlias<VCC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; @@ -30,7 +34,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, + DwarfRegAlias<EXEC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; @@ -39,18 +44,29 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; -def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. -def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. +multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { + def _ci : SIReg<n, ci_e>; + def _vi : SIReg<n, vi_e>; + def "" : SIReg<"", 0>; +} -// Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { +class FlatReg <Register lo, Register hi, bits<16> encoding> : + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; + let HWEncoding = encoding; } +defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes. +defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes. 
+ +def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>; +def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>; +def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; + // SGPR registers -foreach Index = 0-101 in { +foreach Index = 0-103 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; } @@ -65,25 +81,27 @@ foreach Index = 0-255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +// TODO: Do we need to set DwarfRegAlias on register tuples? + // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 101))>; + (add (sequence "SGPR%u", 0, 103))>; // SGPR 64-bit registers def SGPR_64Regs : RegisterTuples<[sub0, sub1], - [(add (decimate (trunc SGPR_32, 101), 2)), + [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]>; // SGPR 128-bit registers def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], - [(add (decimate (trunc SGPR_32, 99), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4))]>; // SGPR 256-bit registers def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (decimate (trunc SGPR_32, 95), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4)), @@ -95,7 +113,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], // SGPR 512-bit registers def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], - [(add (decimate (trunc SGPR_32, 87), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4)), @@ -174,44 +192,57 @@ class RegImmMatcher<string name> : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } -// Special register classes for predicates and the M0 register -def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { - let CopyCost = -1; // Theoretically it is possible to read from SCC, - // but it should never be necessary. 
-} - -def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; -def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; - // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, - (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, + (add SGPR_64, VCC, EXEC, FLAT_SCR) >; -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { + // Requires 2 s_mov_b64 to copy + let CopyCost = 2; +} -def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; +def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> { + // Requires 4 s_mov_b64 to copy + let CopyCost = 4; +} -def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; +def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { + // Requires 8 s_mov_b64 to copy + let CopyCost = 8; +} // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { + // Requires 2 v_mov_b32 to copy + let CopyCost = 2; +} -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { let Size = 96; + + // Requires 3 v_mov_b32 to copy + let CopyCost = 3; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { + // Requires 4 v_mov_b32 to copy + let CopyCost = 4; +} -def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; +def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> { + let CopyCost = 8; +} -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { + let CopyCost = 16; +} def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; @@ -253,7 +284,9 @@ def SCSrc_32 : RegInlineOperand<SReg_32> { def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { + let CopyCost = 2; +} def VSrc_32 : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; @@ -282,3 +315,13 @@ def VCSrc_64 : RegisterOperand<VS_64> { let OperandType = "OPERAND_REG_INLINE_C"; let ParserMatchClass = RegImmMatcher<"VCSrc64">; } + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or an inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_64 : RegisterOperand<SReg_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"SCSrc64">; +} diff --git 
a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td index 9b1f676..cd77e51 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td @@ -17,16 +17,28 @@ def WriteLDS : SchedWrite; def WriteSALU : SchedWrite; def WriteSMEM : SchedWrite; def WriteVMEM : SchedWrite; +def WriteBarrier : SchedWrite; // Vector ALU instructions def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; +def WriteFullOrQuarterRate32 : SchedWrite; def WriteFloatFMA : SchedWrite; -def WriteDouble : SchedWrite; +// Slow quarter rate f64 instruction. +def WriteDouble : SchedWrite; + +// Half rate f64 instruction (same as v_add_f64). def WriteDoubleAdd : SchedWrite; +// Half rate 64-bit instructions. +def Write64Bit : SchedWrite; + +// FIXME: Should there be a class for instructions which are VALU +// instructions and have VALU rates, but write to the SALU (i.e. VOPC +// instructions)? + def SIFullSpeedModel : SchedMachineModel; def SIQuarterSpeedModel : SchedMachineModel; @@ -53,7 +65,7 @@ class HWVALUWriteRes<SchedWrite write, int latency> : // The latency numbers are taken from AMD Accelerated Parallel Processing -// guide. They may not be acurate. +// guide. They may not be accurate. // The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { @@ -64,8 +76,10 @@ multiclass SICommonWriteRes { def : HWWriteRes<WriteSALU, [HWSALU], 1>; def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ??? def : HWVALUWriteRes<Write32Bit, 1>; + def : HWVALUWriteRes<Write64Bit, 2>; def : HWVALUWriteRes<WriteQuarterRate32, 4>; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 5d00bdd..4f0913f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -141,8 +141,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, if (!MRI.isSSA()) return; - assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || - TII->isVOPC(MI.getOpcode())); + assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); const SIRegisterInfo &TRI = TII->getRegisterInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -187,6 +186,21 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, } +// Copy a register MachineOperand, preserving all flags but marking the copy +// implicit. +static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) { + assert(!Orig.isImplicit()); + return MachineOperand::CreateReg(Orig.getReg(), + Orig.isDef(), + true, + Orig.isKill(), + Orig.isDead(), + Orig.isUndef(), + Orig.isEarlyClobber(), + Orig.getSubReg(), + Orig.isDebug(), + Orig.isInternalRead()); +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -236,14 +250,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (TII->isVOPC(Op32)) { unsigned DstReg = MI.getOperand(0).getReg(); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { - // VOPC instructions can only write to the VCC register.
We can't - // force them to use VCC here, because the register allocator has - // trouble with sequences like this, which cause the allocator to run - // out of registers if vreg0 and vreg1 belong to the VCCReg register - // class: - // vreg0 = VOPC; - // vreg1 = VOPC; - // S_AND_B64 vreg0, vreg1 + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because this is only one register and + // cannot deal with sequences which would require multiple copies of + // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) // // So, instead of forcing the instruction to write to VCC, we provide // a hint to the register allocator to use VCC and then we we will run @@ -272,13 +282,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } // We can shrink this instruction - DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); + DEBUG(dbgs() << "Shrinking " << MI); MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - // dst - Inst32.addOperand(MI.getOperand(0)); + // Add the dst operand if the 32-bit encoding also has an explicit $dst. + // For VOPC instructions, this is replaced by an implicit def of vcc. + int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst); + if (Op32DstIdx != -1) { + // dst + Inst32.addOperand(MI.getOperand(0)); + } else { + assert(MI.getOperand(0).getReg() == AMDGPU::VCC && + "Unexpected case"); + } + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); @@ -288,9 +307,19 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { Inst32.addOperand(*Src1); const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); - if (Src2) - Inst32.addOperand(*Src2); + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2) { + int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); + if (Op32Src2Idx != -1) { + Inst32.addOperand(*Src2); + } else { + // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is + // replaced with an implicit read of vcc. 
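The comment above talks about hinting rather than constraining the result to VCC. The hint API it refers to is MachineRegisterInfo::setRegAllocationHint; a minimal sketch of such a hint, with the surrounding pass wiring omitted and AMDGPU::VCC taken from the generated register enum:

  #include "llvm/CodeGen/MachineRegisterInfo.h"

  // Ask the allocator to prefer VCC for this virtual register without
  // making VCC a hard constraint; the shrink pass re-runs after RA and
  // only shrinks the instruction if the hint was honored.
  static void hintVCC(llvm::MachineRegisterInfo &MRI, unsigned VReg) {
    MRI.setRegAllocationHint(VReg, /*Type=*/0, llvm::AMDGPU::VCC);
  }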
+ assert(Src2->getReg() == AMDGPU::VCC && + "Unexpected missing register operand"); + Inst32.addOperand(copyRegOperandAsImplicit(*Src2)); + } + } ++NumInstructionsShrunk; MI.eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp index 591ce85..d36c5d2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -22,6 +22,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" @@ -61,14 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) { } bool SITypeRewriter::runOnFunction(Function &F) { - Attribute A = F.getFnAttribute("ShaderType"); - - unsigned ShaderType = ShaderType::COMPUTE; - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - Str.getAsInteger(0, ShaderType); - } - if (ShaderType == ShaderType::COMPUTE) + if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE) return false; visit(F); @@ -104,6 +98,9 @@ void SITypeRewriter::visitCallInst(CallInst &I) { SmallVector <Type*, 8> Types; bool NeedToReplace = false; Function *F = I.getCalledFunction(); + if (!F) + return; + std::string Name = F->getName(); for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { Value *Arg = I.getArgOperand(i); diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b76b400..3b4c235 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -7,12 +7,23 @@ // //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPU.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" +#undef GET_REGINFO_ENUM + namespace llvm { namespace AMDGPU { @@ -56,5 +67,98 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.private_segment_alignment = 4; } +MCSection *getHSATextSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_EXECINSTR | + ELF::SHF_AMDGPU_HSA_AGENT | + ELF::SHF_AMDGPU_HSA_CODE); +} + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL); +} + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +bool isGroupSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +} + +bool isGlobalSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == 
AMDGPUAS::GLOBAL_ADDRESS; +} + +bool isReadOnlySegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +} + +static unsigned getIntegerAttribute(const Function &F, const char *Name, + unsigned Default) { + Attribute A = F.getFnAttribute(Name); + unsigned Result = Default; + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, Result)) { + LLVMContext &Ctx = F.getContext(); + Ctx.emitError("can't parse shader type"); + } + } + return Result; +} + +unsigned getShaderType(const Function &F) { + return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); +} + +unsigned getInitialPSInputAddr(const Function &F) { + return getIntegerAttribute(F, "InitialPSInputAddr", 0); +} + +bool isSI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; +} + +bool isCI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands]; +} + +bool isVI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; +} + +unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { + + switch(Reg) { + default: break; + case AMDGPU::FLAT_SCR: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi; + + case AMDGPU::FLAT_SCR_LO: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi; + + case AMDGPU::FLAT_SCR_HI: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi; + } + return Reg; +} + } // End namespace AMDGPU } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f57028c..57cbe1b5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -15,6 +15,11 @@ namespace llvm { class FeatureBitset; +class Function; +class GlobalValue; +class MCContext; +class MCSection; +class MCSubtargetInfo; namespace AMDGPU { @@ -27,6 +32,29 @@ struct IsaVersion { IsaVersion getIsaVersion(const FeatureBitset &Features); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); +MCSection *getHSATextSection(MCContext &Ctx); + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx); + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); + +bool isGroupSegment(const GlobalValue *GV); +bool isGlobalSegment(const GlobalValue *GV); +bool isReadOnlySegment(const GlobalValue *GV); + +unsigned getShaderType(const Function &F); +unsigned getInitialPSInputAddr(const Function &F); + + +bool isSI(const MCSubtargetInfo &STI); +bool isCI(const MCSubtargetInfo &STI); +bool isVI(const MCSubtargetInfo &STI); + +/// If \p Reg is a pseudo reg, return the correct hardware register given +/// \p STI otherwise return \p Reg. 
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); } // end namespace AMDGPU } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td index aca4673..1a7801c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -73,8 +73,8 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>; } // End isCommutable = 1 defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>; -// Aliases to simplify matching of floating-pint instructions that are VOP2 on -// SI and VOP3 on VI. +// Aliases to simplify matching of floating-point instructions that +// are VOP2 on SI and VOP3 on VI. class SI2_VI3Alias <string name, Instruction inst> : InstAlias < name#" $dst, $src0, $src1", @@ -89,12 +89,18 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; -} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI - //===----------------------------------------------------------------------===// -// SMEM Patterns +// SMEM Instructions //===----------------------------------------------------------------------===// +def S_DCACHE_WB : SMEM_Inval <0x21, + "s_dcache_wb", int_amdgcn_s_dcache_wb>; + +def S_DCACHE_WB_VOL : SMEM_Inval <0x23, + "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; + +} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI + let Predicates = [isVI] in { // 1. Offset as 20bit DWORD immediate @@ -103,46 +109,4 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) >; -// Patterns for global loads with no offset -class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) ->; - -def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; - -class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (node vt:$data, i64:$addr), - (inst $data, $addr, 0, 0, 0) ->; - -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; - -class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr, vt:$data)), - (inst $addr, $data, 0, 0) ->; - -def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; -def 
: FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; - - } // End Predicates = [isVI] diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h index 9550a3a..cd7540e 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -35,7 +35,6 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); -FunctionPass *createARMGlobalBaseRegPass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td index ef609a6..c171656 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -17,6 +17,17 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// +// ARM Helper classes. +// + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +class Architecture<string fname, string aname, list<SubtargetFeature> features > + : SubtargetFeature<fname, "ARMArch", aname, + !strconcat(aname, " architecture"), features>; + +//===----------------------------------------------------------------------===// // ARM Subtarget state. // @@ -51,8 +62,11 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP", [FeatureVFP4]>; +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision floating point", + [FeatureFPARMv8]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict VFP3 to 16 double registers">; + "Restrict FP to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", "Enable divide instructions">; def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", @@ -119,9 +133,9 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true", "Has return address stack">; -/// Some M architectures don't have the DSP extension (v7E-M vs. v7M) -def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true", - "Supports v7 DSP instructions in Thumb2">; +/// DSP extension. +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in ARM and/or Thumb2">; // Multiprocessing extension. def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", @@ -150,11 +164,28 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", "NaCl trap">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", "Generate calls via indirect call " "instructions">; -// ARM ISAs. 
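The ISA feature definitions that follow rely on SubtargetFeature implication lists: enabling one feature transitively enables everything it implies, so v8.2a pulls in v8.1a, which pulls in v8, and so on. A self-contained sketch of that closure, in plain C++ rather than TableGen:

  #include <map>
  #include <set>
  #include <string>
  #include <vector>

  using FeatureGraph = std::map<std::string, std::vector<std::string>>;

  // Enable F and, recursively, every feature it implies.
  static void enableFeature(const FeatureGraph &Implies, const std::string &F,
                            std::set<std::string> &Enabled) {
    if (!Enabled.insert(F).second)
      return; // already enabled
    auto It = Implies.find(F);
    if (It == Implies.end())
      return;
    for (const auto &Dep : It->second)
      enableFeature(Implies, Dep, Enabled);
  }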
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable as " + "GPR">; + +def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", + "Don't use movt/movw pairs for 32-bit " + "imms">; + + +//===----------------------------------------------------------------------===// +// ARM ISAs. +// + def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", @@ -180,302 +211,452 @@ def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", [HasV6T2Ops, FeaturePerfMon]>; def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", - [HasV7Ops, FeatureVirtualization, - FeatureMP]>; + [HasV7Ops]>; def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", - [HasV8Ops, FeatureAClass, FeatureCRC]>; + [HasV8Ops]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", + [HasV8_1aOps]>; + //===----------------------------------------------------------------------===// -// ARM Processors supported. +// ARM Processor subtarget features. // -include "ARMSchedule.td" - -// ARM processor families. def ProcA5 : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5", - "Cortex-A5 ARM processors", - [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVMLxForwarding, FeatureT2XtPk, - FeatureTrustZone, FeatureMP]>; + "Cortex-A5 ARM processors", []>; def ProcA7 : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7", - "Cortex-A7 ARM processors", - [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVMLxForwarding, FeatureT2XtPk, - FeatureVFP4, FeatureMP, - FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureVirtualization]>; + "Cortex-A7 ARM processors", []>; def ProcA8 : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8", - "Cortex-A8 ARM processors", - [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVMLxForwarding, FeatureT2XtPk, - FeatureTrustZone]>; + "Cortex-A8 ARM processors", []>; def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9", - "Cortex-A9 ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureFP16, - FeatureAvoidPartialCPSR, - FeatureTrustZone]>; -def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", - "Swift ARM processors", - [FeatureNEONForFP, FeatureT2XtPk, - FeatureVFP4, FeatureMP, FeatureHWDiv, - FeatureHWDivARM, FeatureAvoidPartialCPSR, - FeatureAvoidMOVsShOp, - FeatureHasSlowFPVMLx, FeatureTrustZone]>; + "Cortex-A9 ARM processors", []>; def ProcA12 : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12", - "Cortex-A12 ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureVFP4, - FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureVirtualization, - FeatureTrustZone]>; - - -// FIXME: It has not been determined if A15 has these features.
-def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", - "Cortex-A15 ARM processors", - [FeatureT2XtPk, FeatureVFP4, - FeatureMP, FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureTrustZone, FeatureVirtualization]>; - + "Cortex-A12 ARM processors", []>; +def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", + "Cortex-A15 ARM processors", []>; def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17", - "Cortex-A17 ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureVFP4, - FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureVirtualization, - FeatureTrustZone]>; - + "Cortex-A17 ARM processors", []>; +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", []>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureT2XtPk, - FeatureCrypto, FeatureCRC]>; - + "Cortex-A53 ARM processors", []>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureT2XtPk, - FeatureCrypto, FeatureCRC]>; + "Cortex-A57 ARM processors", []>; +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", []>; -def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", - "Cortex-R4 ARM processors", - [FeatureHWDiv, - FeatureAvoidPartialCPSR, - FeatureDSPThumb2, FeatureT2XtPk, - HasV7Ops, FeatureDB, FeatureHasRAS, - FeatureRClass]>; +def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", + "Qualcomm ARM processors", []>; +def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", + "Swift ARM processors", []>; +def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", + "Samsung Exynos-M1 processors", []>; + +def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", + "Cortex-R4 ARM processors", []>; def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", - "Cortex-R5 ARM processors", - [FeatureSlowFPBrcc, - FeatureHWDiv, FeatureHWDivARM, - FeatureHasSlowFPVMLx, - FeatureAvoidPartialCPSR, - FeatureT2XtPk]>; - -// FIXME: krait has currently the same features as A9 -// plus VFP4 and hardware division features. -def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", - "Qualcomm ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureFP16, - FeatureAvoidPartialCPSR, - FeatureTrustZone, - FeatureVFP4, - FeatureHWDiv, - FeatureHWDivARM]>; + "Cortex-R5 ARM processors", []>; +def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7", + "Cortex-R7 ARM processors", []>; -class ProcNoItin<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; +//===----------------------------------------------------------------------===// +// ARM schedules. 
+// + +include "ARMSchedule.td" + + +//===----------------------------------------------------------------------===// +// ARM architectures +// + +def ARMv2 : Architecture<"armv2", "ARMv2", []>; + +def ARMv2a : Architecture<"armv2a", "ARMv2a", []>; + +def ARMv3 : Architecture<"armv3", "ARMv3", []>; + +def ARMv3m : Architecture<"armv3m", "ARMv3m", []>; + +def ARMv4 : Architecture<"armv4", "ARMv4", []>; + +def ARMv4t : Architecture<"armv4t", "ARMv4t", [HasV4TOps]>; + +def ARMv5t : Architecture<"armv5t", "ARMv5t", [HasV5TOps]>; + +def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>; + +def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>; + +def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>; + +def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops, + FeatureDSP]>; + +def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>; + +def ARMv6kz : Architecture<"armv6kz", "ARMv6kz", [HasV6KOps, + FeatureTrustZone]>; + +def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops, + FeatureNEON, + FeatureDB, + FeatureDSP, + FeatureAClass]>; + +def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops, + FeatureDB, + FeatureDSP, + FeatureHWDiv, + FeatureRClass]>; + +def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass]>; + +def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass, + FeatureDSP, + FeatureT2XtPk]>; + +def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +// Aliases +def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; +def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; +def XScale : Architecture<"xscale", "ARMv5te", [ARMv5te]>; +def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>; +def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>; +def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; + + +//===----------------------------------------------------------------------===// +// ARM processors +// + +// Dummy CPU, used to target architectures +def : ProcNoItin<"generic", []>; + +def : ProcNoItin<"arm8", [ARMv4]>; +def : ProcNoItin<"arm810", [ARMv4]>; +def : ProcNoItin<"strongarm", [ARMv4]>; +def : ProcNoItin<"strongarm110", [ARMv4]>; +def : ProcNoItin<"strongarm1100", [ARMv4]>; +def : ProcNoItin<"strongarm1110", [ARMv4]>; + +def : ProcNoItin<"arm7tdmi", [ARMv4t]>; +def : ProcNoItin<"arm7tdmi-s", [ARMv4t]>; +def : ProcNoItin<"arm710t", [ARMv4t]>; +def : ProcNoItin<"arm720t", [ARMv4t]>; +def : ProcNoItin<"arm9", [ARMv4t]>; +def : ProcNoItin<"arm9tdmi", [ARMv4t]>; +def : ProcNoItin<"arm920", [ARMv4t]>; +def : 
ProcNoItin<"arm920t", [ARMv4t]>; +def : ProcNoItin<"arm922t", [ARMv4t]>; +def : ProcNoItin<"arm940t", [ARMv4t]>; +def : ProcNoItin<"ep9312", [ARMv4t]>; + +def : ProcNoItin<"arm10tdmi", [ARMv5t]>; +def : ProcNoItin<"arm1020t", [ARMv5t]>; + +def : ProcNoItin<"arm9e", [ARMv5te]>; +def : ProcNoItin<"arm926ej-s", [ARMv5te]>; +def : ProcNoItin<"arm946e-s", [ARMv5te]>; +def : ProcNoItin<"arm966e-s", [ARMv5te]>; +def : ProcNoItin<"arm968e-s", [ARMv5te]>; +def : ProcNoItin<"arm10e", [ARMv5te]>; +def : ProcNoItin<"arm1020e", [ARMv5te]>; +def : ProcNoItin<"arm1022e", [ARMv5te]>; +def : ProcNoItin<"xscale", [ARMv5te]>; +def : ProcNoItin<"iwmmxt", [ARMv5te]>; + +def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>; + +def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"mpcorenovfp", ARMV6Itineraries, [ARMv6k]>; +def : Processor<"mpcore", ARMV6Itineraries, [ARMv6k, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"arm1156t2-s", ARMV6Itineraries, [ARMv6t2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; -// V4 Processors. -def : ProcNoItin<"generic", []>; -def : ProcNoItin<"arm8", []>; -def : ProcNoItin<"arm810", []>; -def : ProcNoItin<"strongarm", []>; -def : ProcNoItin<"strongarm110", []>; -def : ProcNoItin<"strongarm1100", []>; -def : ProcNoItin<"strongarm1110", []>; - -// V4T Processors. -def : ProcNoItin<"arm7tdmi", [HasV4TOps]>; -def : ProcNoItin<"arm7tdmi-s", [HasV4TOps]>; -def : ProcNoItin<"arm710t", [HasV4TOps]>; -def : ProcNoItin<"arm720t", [HasV4TOps]>; -def : ProcNoItin<"arm9", [HasV4TOps]>; -def : ProcNoItin<"arm9tdmi", [HasV4TOps]>; -def : ProcNoItin<"arm920", [HasV4TOps]>; -def : ProcNoItin<"arm920t", [HasV4TOps]>; -def : ProcNoItin<"arm922t", [HasV4TOps]>; -def : ProcNoItin<"arm940t", [HasV4TOps]>; -def : ProcNoItin<"ep9312", [HasV4TOps]>; - -// V5T Processors. -def : ProcNoItin<"arm10tdmi", [HasV5TOps]>; -def : ProcNoItin<"arm1020t", [HasV5TOps]>; - -// V5TE Processors. -def : ProcNoItin<"arm9e", [HasV5TEOps]>; -def : ProcNoItin<"arm926ej-s", [HasV5TEOps]>; -def : ProcNoItin<"arm946e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm966e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm968e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm10e", [HasV5TEOps]>; -def : ProcNoItin<"arm1020e", [HasV5TEOps]>; -def : ProcNoItin<"arm1022e", [HasV5TEOps]>; -def : ProcNoItin<"xscale", [HasV5TEOps]>; -def : ProcNoItin<"iwmmxt", [HasV5TEOps]>; - -// V6 Processors. -def : Processor<"arm1136j-s", ARMV6Itineraries, [HasV6Ops]>; -def : Processor<"arm1136jf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, - FeatureHasSlowFPVMLx]>; - -// V6M Processors. -def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"cortex-m0plus", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"cortex-m1", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"sc000", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; - -// V6K Processors. 
-def : Processor<"arm1176jz-s", ARMV6Itineraries, [HasV6KOps]>; -def : Processor<"arm1176jzf-s", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, - FeatureHasSlowFPVMLx]>; -def : Processor<"mpcorenovfp", ARMV6Itineraries, [HasV6KOps]>; -def : Processor<"mpcore", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, - FeatureHasSlowFPVMLx]>; - -// V6T2 Processors. -def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops, - FeatureDSPThumb2]>; -def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, - FeatureHasSlowFPVMLx, - FeatureDSPThumb2]>; - -// V7a Processors. // FIXME: A5 has currently the same Schedule model as A8 -def : ProcessorModel<"cortex-a5", CortexA8Model, - [ProcA5, HasV7Ops, FeatureNEON, FeatureDB, - FeatureVFP4, FeatureDSPThumb2, - FeatureHasRAS, FeatureAClass]>; -def : ProcessorModel<"cortex-a7", CortexA8Model, - [ProcA7, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; -def : ProcessorModel<"cortex-a8", CortexA8Model, - [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; -def : ProcessorModel<"cortex-a9", CortexA9Model, - [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, FeatureMP, - FeatureAClass]>; +def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4]>; + +def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureVirtualization]>; + +def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk]>; + +def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureMP]>; // FIXME: A12 has currently the same Schedule model as A9 -def : ProcessorModel<"cortex-a12", CortexA9Model, - [ProcA12, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureMP, - FeatureHasRAS, FeatureAClass]>; - -// FIXME: A15 has currently the same ProcessorModel as A9. -def : ProcessorModel<"cortex-a15", CortexA9Model, - [ProcA15, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; +def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization, + FeatureMP]>; + +// FIXME: A15 has currently the same Schedule model as A9. 
+def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, + FeatureHasRAS, + FeatureTrustZone, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; // FIXME: A17 has currently the same Schedule model as A9 -def : ProcessorModel<"cortex-a17", CortexA9Model, - [ProcA17, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureMP, - FeatureHasRAS, FeatureAClass]>; +def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, + FeatureHasRAS, + FeatureTrustZone, + FeatureMP, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; // FIXME: krait has currently the same Schedule model as A9 -def : ProcessorModel<"krait", CortexA9Model, - [ProcKrait, HasV7Ops, - FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; +// FIXME: krait has currently the same features as A9 plus VFP4 and hardware +// division features. +def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, + FeatureHasRAS, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM]>; + +def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx]>; // FIXME: R4 has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r4", CortexA8Model, - [ProcR4]>; +def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R4F has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r4f", CortexA8Model, - [ProcR4, - FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVFP3, FeatureD16]>; +def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVFP3, + FeatureD16, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R5 has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r5", CortexA8Model, - [ProcR5, HasV7Ops, FeatureDB, - FeatureVFP3, FeatureDSPThumb2, - FeatureHasRAS, - FeatureD16, FeatureRClass]>; +def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, + FeatureHasRAS, + FeatureVFP3, + FeatureD16, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. -def : ProcessorModel<"cortex-r7", CortexA8Model, - [ProcR5, HasV7Ops, FeatureDB, - FeatureVFP3, FeatureDSPThumb2, - FeatureHasRAS, FeatureVFPOnlySP, - FeatureD16, FeatureMP, FeatureRClass]>; - -// V7M Processors. -def : ProcNoItin<"cortex-m3", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureMClass]>; -def : ProcNoItin<"sc300", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureMClass]>; - -// V7EM Processors. 
-def : ProcNoItin<"cortex-m4", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureDSPThumb2, - FeatureT2XtPk, FeatureVFP4, - FeatureVFPOnlySP, FeatureD16, - FeatureMClass]>; -def : ProcNoItin<"cortex-m7", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureDSPThumb2, - FeatureT2XtPk, FeatureFPARMv8, - FeatureD16, FeatureMClass]>; - - -// Swift uArch Processors. -def : ProcessorModel<"swift", SwiftModel, - [ProcSwift, HasV7Ops, FeatureNEON, - FeatureDB, FeatureDSPThumb2, - FeatureHasRAS, FeatureAClass]>; - -// V8 Processors -def : ProcNoItin<"cortex-a53", [ProcA53, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; -def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; -// FIXME: Cortex-A72 is currently modelled as an Cortex-A57. -def : ProcNoItin<"cortex-a72", [ProcA57, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; +def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, + FeatureHasRAS, + FeatureVFP3, + FeatureVFPOnlySP, + FeatureD16, + FeatureFP16, + FeatureMP, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +def : ProcNoItin<"cortex-m3", [ARMv7m]>; +def : ProcNoItin<"sc300", [ARMv7m]>; + +def : ProcNoItin<"cortex-m4", [ARMv7em, + FeatureVFP4, + FeatureVFPOnlySP, + FeatureD16]>; + +def : ProcNoItin<"cortex-m7", [ARMv7em, + FeatureFPARMv8, + FeatureD16]>; + + +def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; // Cyclone is very similar to swift -def : ProcessorModel<"cyclone", SwiftModel, - [ProcSwift, HasV8Ops, HasV7Ops, - FeatureCrypto, FeatureFPARMv8, - FeatureDB,FeatureDSPThumb2, - FeatureHasRAS, FeatureZCZeroing]>; +def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx, + FeatureCrypto, + FeatureZCZeroing]>; + +def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; //===----------------------------------------------------------------------===// // Register File Description @@ -504,8 +685,15 @@ def ARMAsmWriter : AsmWriter { bit isMCAsmWriter = 1; } +def ARMAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "ARM"; + string BreakCharacters = "."; +} + def ARM : Target { // Pull in Instruction Info: let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; + let AssemblyParserVariants = [ARMAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 738dded..206db96 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -60,7 +60,7 @@ using namespace llvm; 
ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), - InConstantPool(false) {} + InConstantPool(false), OptimizationGoals(-1) {} void ARMAsmPrinter::EmitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end @@ -80,8 +80,8 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() { OutStreamer->EmitLabel(CurrentFnSym); } -void ARMAsmPrinter::EmitXXStructor(const Constant *CV) { - uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType()); +void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) { + uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType()); assert(Size && "C++ constructor pointer had zero size!"); const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts()); @@ -106,9 +106,38 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<ARMSubtarget>(); SetupMachineFunction(MF); + const Function* F = MF.getFunction(); + const TargetMachine& TM = MF.getTarget(); + + // Calculate this function's optimization goal. + unsigned OptimizationGoal; + if (F->hasFnAttribute(Attribute::OptimizeNone)) + // For best debugging illusion, speed and small size sacrificed + OptimizationGoal = 6; + else if (F->optForMinSize()) + // Aggressively for small size, speed and debug illusion sacrificed + OptimizationGoal = 4; + else if (F->optForSize()) + // For small size, but speed and debugging illusion preserved + OptimizationGoal = 3; + else if (TM.getOptLevel() == CodeGenOpt::Aggressive) + // Aggressively for speed, small size and debug illusion sacrificed + OptimizationGoal = 2; + else if (TM.getOptLevel() > CodeGenOpt::None) + // For speed, but small size and good debug illusion preserved + OptimizationGoal = 1; + else // TM.getOptLevel() == CodeGenOpt::None + // For good debugging, but speed and small size preserved + OptimizationGoal = 5; + + // Combine a new optimization goal with existing ones. + if (OptimizationGoals == -1) // uninitialized goals + OptimizationGoals = OptimizationGoal; + else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals + OptimizationGoals = 0; if (Subtarget->isTargetCOFF()) { - bool Internal = MF.getFunction()->hasInternalLinkage(); + bool Internal = F->hasInternalLinkage(); COFF::SymbolStorageClass Scl = Internal ? 
COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL; int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; @@ -198,22 +227,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, MCSymbol *ARMAsmPrinter:: GetARMJTIPICJumpTableLabel(unsigned uid) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); SmallString<60> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << uid; return OutContext.getOrCreateSymbol(Name); } - -MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const { - const DataLayout *DL = TM.getDataLayout(); - SmallString<60> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH" - << getFunctionNumber(); - return OutContext.getOrCreateSymbol(Name); -} - bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { @@ -515,6 +535,17 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + + // The last attribute to be emitted is ABI_optimization_goals + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + + if (OptimizationGoals > 0 && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI())) + ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); + OptimizationGoals = -1; + + ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -532,7 +563,7 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU, if (Subtarget->hasV8Ops()) return ARMBuildAttrs::v8; else if (Subtarget->hasV7Ops()) { - if (Subtarget->isMClass() && Subtarget->hasThumb2DSP()) + if (Subtarget->isMClass() && Subtarget->hasDSP()) return ARMBuildAttrs::v7E_M; return ARMBuildAttrs::v7; } else if (Subtarget->hasV6T2Ops()) @@ -587,7 +618,7 @@ void ARMAsmPrinter::emitAttributes() { // We consider krait as a "cortex-a9" + hwdiv CPU // Enable hwdiv through ".arch_extension idiv" if (STI.hasDivide() || STI.hasDivideInARMMode()) - ATS.emitArchExtension(ARM::AEK_HWDIV); + ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM); } else ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString); } @@ -807,8 +838,6 @@ void ARMAsmPrinter::emitAttributes() { else if (STI.hasVirtualization()) ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowVirtualization); - - ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -828,8 +857,7 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD; case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF; case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF; - case ARMCP::GOT: return MCSymbolRefExpr::VK_GOT; - case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_GOTOFF; + case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL; } llvm_unreachable("Invalid ARMCPModifier!"); } @@ -875,8 +903,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, void ARMAsmPrinter:: EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { - const DataLayout *DL = TM.getDataLayout(); - int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType()); + const DataLayout &DL 
= getDataLayout(); + int Size = DL.getTypeAllocSize(MCPV->getType()); ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); @@ -909,10 +937,9 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { OutContext); if (ACPV->getPCAdjustment()) { - MCSymbol *PCLabel = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - ACPV->getLabelId(), - OutContext); + MCSymbol *PCLabel = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + ACPV->getLabelId(), OutContext); const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext); PCRelExpr = MCBinaryExpr::createAdd(PCRelExpr, @@ -1136,6 +1163,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { Offset = 0; break; case ARM::ADDri: + case ARM::t2ADDri: Offset = -MI->getOperand(2).getImm(); break; case ARM::SUBri: @@ -1198,7 +1226,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { #include "ARMGenMCPseudoLowering.inc" void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // If we just ended a constant pool, mark it as such. if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) { @@ -1355,9 +1383,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *GVSym = GetARMGVSymbol(GV, TF); const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); - MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - MI->getOperand(2).getImm(), OutContext); + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext); const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4; const MCExpr *PCRelExpr = @@ -1388,9 +1416,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *GVSym = GetARMGVSymbol(GV, TF); const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); - MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - MI->getOperand(3).getImm(), OutContext); + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(3).getImm(), OutContext); const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4; const MCExpr *PCRelExpr = @@ -1414,10 +1442,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the add. EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr) @@ -1436,10 +1463,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the add. EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr) @@ -1468,10 +1494,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // a PC-relative address at the ldr instruction. 
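The repetitive DataLayout edits running through this file are one mechanical change: AsmPrinter::getDataLayout() hands back a reference, so the former pointer dereferences become member accesses. A minimal sketch of the new shape, assuming an AsmPrinter is available as in the surrounding code (the "PC" suffix here is illustrative, not a name from the patch):

  #include "llvm/ADT/SmallString.h"
  #include "llvm/CodeGen/AsmPrinter.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/Support/raw_ostream.h"

  static void appendPrivateLabel(const llvm::AsmPrinter &AP,
                                 llvm::SmallString<60> &Name) {
    const llvm::DataLayout &DL = AP.getDataLayout(); // reference, not pointer
    llvm::raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "PC";
  }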
// Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the load unsigned Opcode; @@ -1519,7 +1544,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (MCPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else - EmitGlobalConstant(MCPE.Val.ConstVal); + EmitGlobalConstant(DL, MCPE.Val.ConstVal); return; } case ARM::JUMPTABLE_ADDRS: @@ -1653,12 +1678,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // adds $val, #7 // str $val, [$src, #4] // movs r0, #0 - // b 1f + // b LSJLJEH // movs r0, #1 - // 1: + // LSJLJEH: unsigned SrcReg = MI->getOperand(0).getReg(); unsigned ValReg = MI->getOperand(1).getReg(); - MCSymbol *Label = GetARMSJLJEHLabel(); + MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) .addReg(ValReg) diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h index 3d25121..ed7be2d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -51,6 +51,11 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { /// labels used for ARMv4t thumb code to make register indirect calls. SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads; + /// OptimizationGoals - Maintain a combined optimization goal for all + /// functions in a module: one of Tag_ABI_optimization_goals values, + /// -1 if uninitialized, 0 if conflicting goals + int OptimizationGoals; + public: explicit ARMAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); @@ -84,7 +89,7 @@ public: void EmitFunctionEntryLabel() override; void EmitStartOfAsmFile(Module &M) override; void EmitEndOfAsmFile(Module &M) override; - void EmitXXStructor(const Constant *CV) override; + void EmitXXStructor(const DataLayout &DL, const Constant *CV) override; // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. 
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); @@ -119,8 +124,6 @@ private: MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); MCSymbol *GetARMJTIPICJumpTableLabel(unsigned uid) const; - MCSymbol *GetARMSJLJEHLabel() const; - MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags); public: diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 9f43e73..49f3288 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -97,7 +97,7 @@ ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) Subtarget(STI) { for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) - assert(false && "Duplicated entries?"); + llvm_unreachable("Duplicated entries?"); MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc); MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc); } @@ -440,7 +440,7 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const { if (MI->isBundle()) { - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { int PIdx = I->findFirstPredOperandIdx(); @@ -518,7 +518,7 @@ bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, static bool isCPSRDefined(const MachineInstr *MI) { for (const auto &MO : MI->operands()) - if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef()) + if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead()) return true; return false; } @@ -647,7 +647,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const { unsigned Size = 0; - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { assert(!I->isBundle() && "No nested bundle!"); @@ -853,11 +853,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), Align); switch (RC->getSize()) { case 4: @@ -1043,12 +1041,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), Align); switch (RC->getSize()) { case 4: @@ -1224,6 +1219,60 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, 
FrameIndex); } +/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD +/// depending on whether the result is used. +void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const { + bool isThumb1 = Subtarget.isThumb1Only(); + bool isThumb2 = Subtarget.isThumb2(); + const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo(); + + MachineInstr *MI = MBBI; + DebugLoc dl = MI->getDebugLoc(); + MachineBasicBlock *BB = MI->getParent(); + + MachineInstrBuilder LDM, STM; + if (isThumb1 || !MI->getOperand(1).isDead()) { + LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD + : isThumb1 ? ARM::tLDMIA_UPD + : ARM::LDMIA_UPD)) + .addOperand(MI->getOperand(1)); + } else { + LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA)); + } + + if (isThumb1 || !MI->getOperand(0).isDead()) { + STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD + : isThumb1 ? ARM::tSTMIA_UPD + : ARM::STMIA_UPD)) + .addOperand(MI->getOperand(0)); + } else { + STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA)); + } + + AddDefaultPred(LDM.addOperand(MI->getOperand(3))); + AddDefaultPred(STM.addOperand(MI->getOperand(2))); + + // Sort the scratch registers into ascending order. + const TargetRegisterInfo &TRI = getRegisterInfo(); + llvm::SmallVector<unsigned, 6> ScratchRegs; + for(unsigned I = 5; I < MI->getNumOperands(); ++I) + ScratchRegs.push_back(MI->getOperand(I).getReg()); + std::sort(ScratchRegs.begin(), ScratchRegs.end(), + [&TRI](const unsigned &Reg1, + const unsigned &Reg2) -> bool { + return TRI.getEncodingValue(Reg1) < + TRI.getEncodingValue(Reg2); + }); + + for (const auto &Reg : ScratchRegs) { + LDM.addReg(Reg, RegState::Define); + STM.addReg(Reg, RegState::Kill); + } + + BB->erase(MBBI); +} + + bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MachineFunction &MF = *MI->getParent()->getParent(); @@ -1237,6 +1286,11 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return true; } + if (MI->getOpcode() == ARM::MEMCPY) { + expandMEMCPY(MI); + return true; + } + // This hook gets to expand COPY instructions before they become // copyPhysReg() calls. Look for VMOVS instructions that can legally be // widened to VMOVD. We prefer the VMOVD when possible because it may be @@ -1325,9 +1379,9 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { // instructions, so that's probably OK, but is PIC always correct when // we get here? if (ACPV->isGlobalValue()) - NewCPV = ARMConstantPoolConstant:: - Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, - ARMCP::CPValue, 4); + NewCPV = ARMConstantPoolConstant::Create( + cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, ARMCP::CPValue, + 4, ACPV->getModifier(), ACPV->mustAddCurrentAddress()); else if (ACPV->isExtSymbol()) NewCPV = ARMConstantPoolSymbol:: Create(MF.getFunction()->getContext(), @@ -1645,16 +1699,14 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, bool ARMBaseInstrInfo:: isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { if (!NumCycles) return false; // If we are optimizing for size, see if the branch in the predecessor can be // lowered to cbn?z by the constant island lowering pass, and return false if // so. This results in a shorter instruction sequence.
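A note on the ordering step in the expandMEMCPY hunk above: LDM/STM encode their register list as a bitmask and transfer registers in ascending numeric order, so the scratch registers have to be appended to both lists already sorted by hardware encoding. A standalone sketch of that step, where getEncodingValue is a toy stand-in for TargetRegisterInfo::getEncodingValue:

#include <algorithm>
#include <vector>

static unsigned getEncodingValue(unsigned Reg) { return Reg & 0xf; } // toy

// Sort a candidate LDM/STM register list into ascending encoding order.
void sortRegList(std::vector<unsigned> &Regs) {
  std::sort(Regs.begin(), Regs.end(), [](unsigned R1, unsigned R2) {
    return getEncodingValue(R1) < getEncodingValue(R2);
  });
}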
- const Function *F = MBB.getParent()->getFunction(); - if (F->hasFnAttribute(Attribute::OptimizeForSize) || - F->hasFnAttribute(Attribute::MinSize)) { + if (MBB.getParent()->getFunction()->optForSize()) { MachineBasicBlock *Pred = *MBB.pred_begin(); if (!Pred->empty()) { MachineInstr *LastMI = &*Pred->rbegin(); @@ -1677,12 +1729,14 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, } // Attempt to estimate the relative costs of predication versus branching. - unsigned UnpredCost = Probability.getNumerator() * NumCycles; - UnpredCost /= Probability.getDenominator(); - UnpredCost += 1; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() / 10; - - return (NumCycles + ExtraPredCycles) <= UnpredCost; + // Here we scale up each component of UnpredCost to avoid precision issues when + // scaling NumCycles by Probability. + const unsigned ScalingUpFactor = 1024; + unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor); + UnpredCost += ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + + return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost; } bool ARMBaseInstrInfo:: @@ -1690,23 +1744,22 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned TCycles, unsigned TExtra, MachineBasicBlock &FMBB, unsigned FCycles, unsigned FExtra, - const BranchProbability &Probability) const { + BranchProbability Probability) const { if (!TCycles || !FCycles) return false; // Attempt to estimate the relative costs of predication versus branching. - unsigned TUnpredCost = Probability.getNumerator() * TCycles; - TUnpredCost /= Probability.getDenominator(); - - uint32_t Comp = Probability.getDenominator() - Probability.getNumerator(); - unsigned FUnpredCost = Comp * FCycles; - FUnpredCost /= Probability.getDenominator(); - + // Here we scale up each component of UnpredCost to avoid precision issues when + // scaling TCycles/FCycles by Probability. + const unsigned ScalingUpFactor = 1024; + unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); + unsigned FUnpredCost = + Probability.getCompl().scale(FCycles * ScalingUpFactor); unsigned UnpredCost = TUnpredCost + FUnpredCost; - UnpredCost += 1; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() / 10; + UnpredCost += 1 * ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; - return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost; + return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost; } bool @@ -1744,9 +1797,10 @@ unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) { llvm_unreachable("Unknown unconditional branch opcode!"); } -/// commuteInstruction - Handle commutable instructions. -MachineInstr * -ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case ARM::MOVCCr: case ARM::t2MOVCCr: { @@ -1756,7 +1810,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // MOVCC AL can't be inverted. Shouldn't happen. if (CC == ARMCC::AL || PredReg != ARM::CPSR) return nullptr; - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); if (!MI) return nullptr; // After swapping the MOVCC operands, also invert the condition.
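To see why the ScalingUpFactor in the two isProfitableToIfCvt hunks above matters, here is an illustrative, self-contained program (not the LLVM API). scale() mimics BranchProbability::scale: it multiplies in 64-bit arithmetic and truncates once, instead of truncating every intermediate term to whole cycles the way the old code did. The input values are a hypothetical borderline case chosen so that the decision flips:

#include <cstdint>
#include <iostream>

static unsigned scale(uint64_t Num, uint64_t Denom, uint64_t Value) {
  return static_cast<unsigned>(Value * Num / Denom);
}

int main() {
  // 2-cycle block, 2 extra predication cycles, branch probability 3/4,
  // misprediction penalty of 15 cycles (all illustrative numbers).
  const unsigned NumCycles = 2, ExtraPredCycles = 2, Penalty = 15;
  const uint64_t Num = 3, Denom = 4;

  // Old formulation: each term is truncated to whole cycles before summing.
  unsigned OldCost = scale(Num, Denom, NumCycles) + 1 + Penalty / 10; // 1+1+1
  bool OldProfitable = (NumCycles + ExtraPredCycles) <= OldCost;      // 4 <= 3

  // New formulation: every component works in 1/1024ths of a cycle.
  const unsigned K = 1024;
  unsigned NewCost = scale(Num, Denom, NumCycles * K) + K + Penalty * K / 10;
  bool NewProfitable = (NumCycles + ExtraPredCycles) * K <= NewCost; // 4096 <= 4096

  std::cout << OldProfitable << ' ' << NewProfitable << '\n'; // prints: 0 1
}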
@@ -1765,7 +1819,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { return MI; } } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } /// Identify instructions that can be folded into a MOVCC instruction, and @@ -1975,21 +2029,12 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, } } -static bool isAnySubRegLive(unsigned Reg, const TargetRegisterInfo *TRI, - MachineInstr *MI) { - for (MCSubRegIterator Subreg(Reg, TRI, /* IncludeSelf */ true); - Subreg.isValid(); ++Subreg) - if (MI->getParent()->computeRegisterLiveness(TRI, *Subreg, MI) != - MachineBasicBlock::LQR_Dead) - return true; - return false; -} bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, MachineFunction &MF, MachineInstr *MI, unsigned NumBytes) { // This optimisation potentially adds lots of load and store // micro-operations, it's only really a great benefit to code-size. - if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize)) + if (!MF.getFunction()->optForMinSize()) return false; // If only one register is pushed/popped, LLVM can use an LDR/STR @@ -2058,11 +2103,9 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, // registers live within the function we might clobber a return value // register; the other way a register can be live here is if it's // callee-saved. - // TODO: Currently, computeRegisterLiveness() does not report "live" if a - // sub reg is live. When computeRegisterLiveness() works for sub reg, it - // can replace isAnySubRegLive(). if (isCalleeSavedRegister(CurReg, CSRegs) || - isAnySubRegLive(CurReg, TRI, MI)) { + MI->getParent()->computeRegisterLiveness(TRI, CurReg, MI) != + MachineBasicBlock::LQR_Dead) { // VFP pops don't allow holes in the register list, so any skip is fatal // for our transformation. GPR pops do, so we should just keep looking. if (IsVFPPushPop) @@ -3381,7 +3424,7 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI, assert(Idx != -1 && "Cannot find bundled definition!"); DefIdx = Idx; - return II; + return &*II; } static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, @@ -3389,7 +3432,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, unsigned &UseIdx, unsigned &Dist) { Dist = 0; - MachineBasicBlock::const_instr_iterator II = MI; ++II; + MachineBasicBlock::const_instr_iterator II = ++MI->getIterator(); assert(II->isInsideBundle() && "Empty bundle?"); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); @@ -3410,7 +3453,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, } UseIdx = Idx; - return II; + return &*II; } /// Return the number of cycles to add to (or subtract from) the static @@ -3652,6 +3695,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI->getParent()->getParent(); + // FIXME: Use Function::optForSize(). if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) --Latency; } @@ -3931,11 +3975,11 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, // other passes may query the latency of a bundled instruction. 
if (MI->isBundle()) { unsigned Latency = 0; - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { if (I->getOpcode() != ARM::t2IT) - Latency += getInstrLatency(ItinData, I, PredCost); + Latency += getInstrLatency(ItinData, &*I, PredCost); } return Latency; } @@ -4054,8 +4098,8 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 4, 4); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); MIB.addMemOperand(MMO); AddDefaultPred(MIB); } diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index b4706e3..d80c494 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -86,6 +86,18 @@ protected: RegSubRegPair &BaseReg, RegSubRegPairAndIdx &InsertedReg) const override; + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for + /// a non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands; a null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; @@ -188,9 +200,6 @@ public: MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const override; - MachineInstr *commuteInstruction(MachineInstr*, - bool=false) const override; - const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, const TargetRegisterInfo *TRI) const; @@ -224,15 +233,15 @@ public: bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - const BranchProbability &Probability) const override { + BranchProbability Probability) const override { return NumCycles == 1; } @@ -343,6 +352,8 @@ private: virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI, Reloc::Model RM) const = 0; + void expandMEMCPY(MachineBasicBlock::iterator) const; + private: /// Modeling special VFP / NEON fp MLA / MLS hazards.
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e7d5be77..a520770 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -87,9 +87,22 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } } + if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<ARMFunctionInfo>()->isSplitCSR() + ? CSR_iOS_CXX_TLS_PE_SaveList + : CSR_iOS_CXX_TLS_SaveList; return RegList; } +const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<ARMFunctionInfo>()->isSplitCSR()) + return CSR_iOS_CXX_TLS_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -97,6 +110,8 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; + if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS) + return CSR_iOS_CXX_TLS_RegMask; return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; } @@ -106,6 +121,14 @@ ARMBaseRegisterInfo::getNoPreservedMask() const { } const uint32_t * +ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const { + assert(MF.getSubtarget<ARMSubtarget>().isTargetDarwin() && + "only know about special TLS call on Darwin"); + return CSR_iOS_TLSCall_RegMask; +} + + +const uint32_t * ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); @@ -225,7 +248,8 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const { + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg); @@ -338,7 +362,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // 1. Dynamic stack realignment is explicitly disabled, // 2. This is a Thumb1 function (it's not useful, so we don't bother), or // 3. There are VLAs in the function and the base pointer is disabled.
- if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + if (!TargetRegisterInfo::canRealignStack(MF)) return false; if (AFI->isThumb1OnlyFunction()) return false; @@ -356,18 +380,6 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo:: -needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARMFrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - -bool ARMBaseRegisterInfo:: cannotEliminateFrame(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack()) diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index fdc1ef9..6a9a45a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -62,6 +62,12 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) { switch (Reg) { case D15: case D14: case D13: case D12: case D11: case D10: case D9: case D8: + case D7: case D6: case D5: case D4: + case D3: case D2: case D1: case D0: + case D31: case D30: case D29: case D28: + case D27: case D26: case D25: case D24: + case D23: case D22: case D21: case D20: + case D19: case D18: case D17: case D16: return true; default: return false; @@ -92,9 +98,12 @@ protected: public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; + const uint32_t *getTLSCallPreservedMask(const MachineFunction &MF) const; /// getThisReturnPreservedMask - Returns a call preserved mask specific to the /// case that 'returned' is on an i32 first argument if the calling convention @@ -126,15 +135,15 @@ public: ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const override; + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const override; void updateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const override; bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h index d687568..a731d00 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -160,15 +160,15 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State); } -static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; +static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, 
ARM::R3 }; -static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, - ARM::S4, ARM::S5, ARM::S6, ARM::S7, - ARM::S8, ARM::S9, ARM::S10, ARM::S11, - ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; -static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; -static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; +static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; +static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; +static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA @@ -199,9 +199,11 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. - unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U); + auto &DL = State.getMachineFunction().getDataLayout(); + unsigned StackAlign = DL.getStackAlignment(); + unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); - ArrayRef<uint16_t> RegList; + ArrayRef<MCPhysReg> RegList; switch (LocVT.SimpleTy) { case MVT::i32: { RegList = RRegList; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td index 27cf06b..847ef87 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -125,6 +125,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[ CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>, CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>, + CCIfType<[v2f64], CCIfAlign<"16", + CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>, CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>> ]>; @@ -223,6 +225,21 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS_ThisReturn, R9))>; +def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP, + (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// C++ TLS access function saves all registers except SP. Try to match +// the order of CSRs in CSR_iOS. +def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>; + +// CSRs that are handled explicitly via copies. +def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>; + // The "interrupt" attribute is used to generate code that is acceptable in // exception-handlers of various kinds. 
It makes us use a different return // instruction (handled elsewhere) and affects which registers we must return to diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index f4ec8c6..55c1684 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -340,12 +340,12 @@ namespace { /// verify - check BBOffsets, BBSizes, alignment of islands void ARMConstantIslands::verify() { #ifndef NDEBUG - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - unsigned MBBId = MBB->getNumber(); - assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); - } + assert(std::is_sorted(MF->begin(), MF->end(), + [this](const MachineBasicBlock &LHS, + const MachineBasicBlock &RHS) { + return BBInfo[LHS.getNumber()].postOffset() < + BBInfo[RHS.getNumber()].postOffset(); + })); DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { CPUser &U = CPUsers[i]; @@ -542,7 +542,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // identity mapping of CPI's to CPE's. const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); - const DataLayout &TD = *MF->getTarget().getDataLayout(); + const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); assert(Size >= 4 && "Too small constant pool entry"); @@ -589,6 +589,8 @@ void ARMConstantIslands::doInitialJumpTablePlacement( MachineBasicBlock *LastCorrectlyNumberedBB = nullptr; for (MachineBasicBlock &MBB : *MF) { auto MI = MBB.getLastNonDebugInstr(); + if (MI == MBB.end()) + continue; unsigned JTOpcode; switch (MI->getOpcode()) { @@ -639,12 +641,12 @@ void ARMConstantIslands::doInitialJumpTablePlacement( /// into the block immediately after it. bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI = MBB->getIterator(); // Can't fall off end of function. if (std::next(MBBI) == MBB->getParent()->end()) return false; - MachineBasicBlock *NextBB = std::next(MBBI); + MachineBasicBlock *NextBB = &*std::next(MBBI); if (std::find(MBB->succ_begin(), MBB->succ_end(), NextBB) == MBB->succ_end()) return false; @@ -722,15 +724,15 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // has any inline assembly in it. If so, we have to be conservative about // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(&MBB); // The known bits of the entry block offset are determined by the function // alignment. BBInfo.front().KnownBits = MF->getAlignment(); // Compute block offsets and known bits. - adjustBBOffsetsAfter(MF->begin()); + adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); @@ -968,7 +970,7 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // Create a new MBB for the code after the OrigBB. 
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); MF->insert(MBBI, NewBB); // Splice the instructions starting with MI over to NewBB. @@ -1088,7 +1090,7 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, unsigned CPELogAlign = getCPELogAlign(U.CPEMI); unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); unsigned NextBlockOffset, NextBlockAlignment; - MachineFunction::const_iterator NextBlock = Water; + MachineFunction::const_iterator NextBlock = Water->getIterator(); if (++NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); NextBlockAlignment = 0; @@ -1350,7 +1352,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() << format(", expected CPE offset %#x\n", CPEOffset)); - NewMBB = std::next(MachineFunction::iterator(UserMBB)); + NewMBB = &*++UserMBB->getIterator(); // Add an unconditional branch from UserMBB to fallthrough block. Record // it for branch lengthening; this new branch will not get out of range, // but if the preceding conditional branch is out of range, the targets @@ -1503,8 +1505,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { NewWaterList.insert(NewIsland); // The new CPE goes before the following block (NewMBB). - NewMBB = std::next(MachineFunction::iterator(WaterBB)); - + NewMBB = &*++WaterBB->getIterator(); } else { // No water found. DEBUG(dbgs() << "No water found\n"); @@ -1515,7 +1516,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // next iteration for constant pools, but in this context, we don't want // it. Check for this so it will be removed from the WaterList. // Also remove any entry from NewWaterList. - MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB)); + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); if (IP != WaterList.end()) NewWaterList.erase(WaterBB); @@ -1532,7 +1533,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { WaterList.erase(IP); // Okay, we know we can put an island before NewMBB now, do it! - MF->insert(NewMBB, NewIsland); + MF->insert(NewMBB->getIterator(), NewIsland); // Update internal data structures to account for the newly inserted MBB. updateForInsertedWaterBlock(NewIsland); @@ -1553,7 +1554,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; - adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland))); + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1732,7 +1733,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { MBB->back().eraseFromParent(); // BBInfo[SplitBB].Offset is wrong temporarily, fixed below } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*++MBB->getIterator(); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << " also invert condition and change dest. 
to BB#" @@ -2058,9 +2059,9 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI, /// \brief Returns whether CPEMI is the first instruction in the block /// immediately following JTMI (assumed to be a TBB or TBH terminator). If so, /// we can switch the first register to PC and usually remove the address -/// calculation that preceeded it. +/// calculation that preceded it. static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) { - MachineFunction::iterator MBB = JTMI->getParent(); + MachineFunction::iterator MBB = JTMI->getParent()->getIterator(); MachineFunction *MF = MBB->getParent(); ++MBB; @@ -2235,7 +2236,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; SmallVector<MachineOperand, 4> CondPrior; - MachineFunction::iterator BBi = BB; + MachineFunction::iterator BBi = BB->getIterator(); MachineFunction::iterator OldPrior = std::prev(BBi); // If the block terminator isn't analyzable, don't try to move the block @@ -2258,7 +2259,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Create a new MBB for the code after the jump BB. MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(JTBB->getBasicBlock()); - MachineFunction::iterator MBBI = JTBB; ++MBBI; + MachineFunction::iterator MBBI = ++JTBB->getIterator(); MF->insert(MBBI, NewBB); // Add an unconditional branch from NewBB to BB. @@ -2273,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->removeSuccessor(BB); - JTBB->addSuccessor(NewBB); + JTBB->replaceSuccessor(BB, NewBB); ++NumJTInserted; return NewBB; diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp index 7d41c69..c9849b2 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -52,8 +52,7 @@ const char *ARMConstantPoolValue::getModifierText() const { // strings if that's legal. case ARMCP::no_modifier: return "none"; case ARMCP::TLSGD: return "tlsgd"; - case ARMCP::GOT: return "GOT"; - case ARMCP::GOTOFF: return "GOTOFF"; + case ARMCP::GOT_PREL: return "GOT_PREL"; case ARMCP::GOTTPOFF: return "gottpoff"; case ARMCP::TPOFF: return "tpoff"; } diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 36f63e2..6b18a4e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -39,8 +39,7 @@ namespace ARMCP { enum ARMCPModifier { no_modifier, TLSGD, - GOT, - GOTOFF, + GOT_PREL, GOTTPOFF, TPOFF }; @@ -103,8 +102,6 @@ public: bool isLSDA() const { return Kind == ARMCP::CPLSDA; } bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; } - unsigned getRelocationInfo() const override { return 2; } - int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) override; diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 4438f50..56f3498 100644 --- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -330,22 +330,19 @@ static const NEONLdStTableEntry NEONLdStTable[] = { /// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON /// load or store pseudo instruction. 
static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { - const unsigned NumEntries = array_lengthof(NEONLdStTable); - #ifndef NDEBUG // Make sure the table is sorted. static bool TableChecked = false; if (!TableChecked) { - for (unsigned i = 0; i != NumEntries-1; ++i) - assert(NEONLdStTable[i] < NEONLdStTable[i+1] && - "NEONLdStTable is not sorted!"); + assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) && + "NEONLdStTable is not sorted!"); TableChecked = true; } #endif - const NEONLdStTableEntry *I = - std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode); - if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode) + auto I = std::lower_bound(std::begin(NEONLdStTable), + std::end(NEONLdStTable), Opcode); + if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode) return I; return nullptr; } @@ -734,7 +731,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16.addImm(Pred).addReg(PredReg); if (RequiresBundling) - finalizeBundle(MBB, &*LO16, &*MBBI); + finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator()); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); @@ -747,6 +744,55 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, switch (Opcode) { default: return false; + + case ARM::TCRETURNdi: + case ARM::TCRETURNri: { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->isReturn() && + "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc dl = MBBI->getDebugLoc(); + const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>( + MBB.getParent()->getSubtarget().getInstrInfo()); + + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi) { + unsigned TCOpcode = + STI->isThumb() + ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) + : ARM::TAILJMPd; + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + + // Add the default predicate in Thumb mode. + if (STI->isThumb()) + MIB.addImm(ARMCC::AL).addReg(0); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, + TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + MBBI = NewMI; + return true; + } case ARM::VMOVScc: case ARM::VMOVDcc: { unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index fdd0763..ff2fcfa 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -578,7 +578,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // For now 32-bit only. 
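The LookupNEONLdSt hunk above is the common sorted-static-table idiom: assert sortedness once with std::is_sorted (debug builds only), then binary-search with std::lower_bound. A minimal sketch of the pattern, with illustrative names (Entry, Table, lookup are not LLVM's):

#include <algorithm>
#include <cassert>
#include <iterator>

struct Entry {
  unsigned PseudoOpc; // sort key
  unsigned RealOpc;
  bool operator<(const Entry &E) const { return PseudoOpc < E.PseudoOpc; }
  bool operator<(unsigned Opc) const { return PseudoOpc < Opc; }
};

static const Entry Table[] = {{10, 100}, {20, 200}, {30, 300}};

static const Entry *lookup(unsigned Opcode) {
  assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
         "table is not sorted!");
  auto I = std::lower_bound(std::begin(Table), std::end(Table), Opcode);
  return (I != std::end(Table) && I->PseudoOpc == Opcode) ? I : nullptr;
}

int main() { return lookup(20) && !lookup(25) ? 0 : 1; }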
- if (VT != MVT::i32) return 0; + if (VT != MVT::i32 || GV->isThreadLocal()) return 0; Reloc::Model RelocM = TM.getRelocationModel(); bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM); @@ -922,12 +922,9 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, if (Addr.BaseType == Address::FrameIndexBase) { int FI = Addr.Base.FI; int Offset = Addr.Offset; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), - Flags, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI); @@ -1278,8 +1275,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1303,8 +1299,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const ConstantInt *CI = @@ -1341,8 +1336,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -1355,8 +1349,8 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { TII.get(Opc)).addReg(AddrReg)); const IndirectBrInst *IB = cast<IndirectBrInst>(I); - for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]); + for (const BasicBlock *SuccBB : IB->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]); return true; } @@ -1860,8 +1854,9 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); else return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); - } else - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } else { + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } case CallingConv::ARM_AAPCS_VFP: if (!isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); @@ -2088,6 +2083,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; @@ -2944,48 +2942,51 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT) { - bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); - ARMConstantPoolConstant *CPV = - ARMConstantPoolConstant::Create(GV, UseGOTOFF ? 
ARMCP::GOTOFF : ARMCP::GOT); - unsigned Idx = MCP.getConstantPoolIndex(CPV, Align); + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + LLVMContext *Context = &MF->getFunction()->getContext(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); + + unsigned ConstAlign = + MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); + unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign); + + unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); + unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp; + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) + .addConstantPoolIndex(Idx); + if (Opc == ARM::LDRcp) + MIB.addImm(0); + AddDefaultPred(MIB); - unsigned Opc; - unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT)); - // Load value. - if (isThumb2) { - DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM::t2LDRpci), DestReg1) - .addConstantPoolIndex(Idx)); - Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs; - } else { - // The extra immediate is for addrmode2. - DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(ARM::LDRcp), DestReg1) - .addConstantPoolIndex(Idx).addImm(0)); - Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs; - } + // Fix the address by adding pc. + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? 
ARM::PICLDR + : ARM::PICADD; + DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(TempReg) + .addImm(ARMPCLabelIndex); + if (!Subtarget->isThumb()) + AddDefaultPred(MIB); - unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); - if (GlobalBaseReg == 0) { - GlobalBaseReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT)); - AFI->setGlobalBaseReg(GlobalBaseReg); + if (UseGOT_PREL && Subtarget->isThumb()) { + unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::t2LDRi12), NewDestReg) + .addReg(DestReg) + .addImm(0); + DestReg = NewDestReg; + AddOptionalDefs(MIB); } - - unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT)); - DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0); - DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1); - GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2); - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc), DestReg2) - .addReg(DestReg1) - .addReg(GlobalBaseReg); - if (!UseGOTOFF) - MIB.addImm(0); - AddOptionalDefs(MIB); - - return DestReg2; + return DestReg; } bool ARMFastISel::fastLowerArguments() { @@ -3038,7 +3039,7 @@ bool ARMFastISel::fastLowerArguments() { } - static const uint16_t GPRArgRegs[] = { + static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; @@ -3055,7 +3056,7 @@ bool ARMFastISel::fastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); - updateValueMap(I, ResultReg); + updateValueMap(&*I, ResultReg); } return true; diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 6744000..c5990bb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCContext.h" @@ -58,7 +59,7 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); // iOS requires FP not to be clobbered for backtracing purpose. - if (STI.isTargetIOS()) + if (STI.isTargetIOS() || STI.isTargetWatchOS()) return true; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -288,7 +289,6 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, void ARMFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -305,7 +305,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI->getStackSize(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. 
+ DebugLoc dl; + unsigned FramePtr = RegInfo->getFrameRegister(MF); // Determine the size of each callee-save spill area and record which frame @@ -489,7 +493,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes) { // Adjust SP after all the callee-save spills. - if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) + if (AFI->getNumAlignedDPRCS2Regs() == 0 && + tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes); else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, @@ -689,60 +694,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, AFI->setShouldRestoreSPFromFP(true); } -// Resolve TCReturn pseudo-instruction -void ARMFrameLowering::fixTCReturn(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc dl = MBBI->getDebugLoc(); - const ARMBaseInstrInfo &TII = - *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - - if (!(RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri)) - return; - - // Tail call return: adjust the stack pointer and jump to callee. - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - - // Jump to label or value in register. - if (RetOpcode == ARM::TCRETURNdi) { - unsigned TCOpcode = STI.isThumb() ? - (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) : - ARM::TAILJMPd; - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); - if (JumpTarget.isGlobal()) - MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); - else { - assert(JumpTarget.isSymbol()); - MIB.addExternalSymbol(JumpTarget.getSymbolName(), - JumpTarget.getTargetFlags()); - } - - // Add the default predicate in Thumb mode. - if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0); - } else if (RetOpcode == ARM::TCRETURNri) { - BuildMI(MBB, MBBI, dl, - TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). - addReg(JumpTarget.getReg(), RegState::Kill); - } - - MachineInstr *NewMI = std::prev(MBBI); - for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) - NewMI->addOperand(MBBI->getOperand(i)); - - // Delete the pseudo instruction TCRETURN. - MBB.erase(MBBI); - MBBI = NewMI; -} - void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); @@ -758,10 +711,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) { - fixTCReturn(MF, MBB); + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) return; - } + + // First put ourselves on the first (from the top) terminator instruction. + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl = MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc(); if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) @@ -840,8 +795,6 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; } - fixTCReturn(MF, MBB); - if (ArgRegsSaveSize) emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize); } @@ -932,12 +885,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, return Offset; } -int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); -} - void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -950,7 +897,6 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; - if (MI != MBB.end()) DL = MI->getDebugLoc(); SmallVector<std::pair<unsigned,bool>, 4> Regs; unsigned i = CSI.size(); @@ -1008,7 +954,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, // Put any subsequent vpush instructions before this one: they will refer to // higher register numbers so need to be pushed first in order to preserve // monotonicity. - --MI; + if (MI != MBB.begin()) + --MI; } } @@ -1022,12 +969,20 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); - unsigned RetOpcode = MI->getOpcode(); - bool isTailCall = (RetOpcode == ARM::TCRETURNdi || - RetOpcode == ARM::TCRETURNri); - bool isInterrupt = - RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; + DebugLoc DL; + bool isTailCall = false; + bool isInterrupt = false; + bool isTrap = false; + if (MBB.end() != MI) { + DL = MI->getDebugLoc(); + unsigned RetOpcode = MI->getOpcode(); + isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri); + isInterrupt = + RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; + isTrap = + RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl || + RetOpcode == ARM::tTRAP; + } SmallVector<unsigned, 4> Regs; unsigned i = CSI.size(); @@ -1043,11 +998,14 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, continue; if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && - STI.hasV5TOps()) { - Reg = ARM::PC; - LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + !isTrap && STI.hasV5TOps()) { + if (MBB.succ_empty()) { + Reg = ARM::PC; + DeleteRet = true; + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + } else + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; // Fold the return instruction into the LDM. - DeleteRet = true; } // If NoGap is true, pop consecutive registers and then leave the rest @@ -1068,7 +1026,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, .addReg(ARM::SP)); for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i], getDefRegState(true)); - if (DeleteRet) { + if (DeleteRet && MI != MBB.end()) { MIB.copyImplicitOps(&*MI); MI->eraseFromParent(); } @@ -1095,7 +1053,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, // Put any subsequent vpop instructions after this one: they will refer to // higher register numbers so need to be popped afterwards. 
- ++MI; + if (MI != MBB.end()) + ++MI; } } @@ -1109,7 +1068,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineFrameInfo &MFI = *MF.getFrameInfo(); @@ -1118,7 +1077,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // slot offsets can be wrong. The offset for d8 will always be correct. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned DNum = CSI[i].getReg() - ARM::D8; - if (DNum >= 8) + if (DNum > NumAlignedDPRCS2Regs - 1) continue; int FI = CSI[i].getFrameIdx(); // The even-numbered registers will be 16-byte aligned, the odd-numbered @@ -1269,7 +1228,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Find the frame index assigned to d8. @@ -1654,13 +1613,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // FIXME: We could add logic to be more precise about negative offsets // and which instructions will need a scratch register for them. Is it // worth the effort and added fragility? - bool BigStack = - (RS && - (MFI->estimateStackSize(MF) + - ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= - estimateRSStackSizeLimit(MF, this))) - || MFI->hasVarSizedObjects() - || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); + bool BigStack = (RS && (MFI->estimateStackSize(MF) + + ((hasFP(MF) && AFI->hasStackFrame()) ? 4 : 0) >= + estimateRSStackSizeLimit(MF, this))) || + MFI->hasVarSizedObjects() || + (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { @@ -1698,8 +1655,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, if (CS1Spilled && !UnspilledCS1GPRs.empty()) { for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { unsigned Reg = UnspilledCS1GPRs[i]; - // Don't spill high register if the function is thumb + // Don't spill a high register if the function is Thumb. In the case of + // Windows on ARM, accept R11 (the frame pointer). if (!AFI->isThumbFunction() || + (STI.isTargetWindows() && Reg == ARM::R11) || isARMLowRegister(Reg) || Reg == ARM::LR) { SavedRegs.set(Reg); if (!MRI.isReserved(Reg)) @@ -1784,8 +1743,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary.
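For reference, the open-coded round-up that the next hunk replaces with alignSPAdjust() computes the smallest multiple of Align that is not less than Amount; a minimal standalone sketch (roundUpToAlignment is an illustrative name):

#include <cassert>

unsigned roundUpToAlignment(unsigned Amount, unsigned Align) {
  return (Amount + Align - 1) / Align * Align;
}

int main() {
  assert(roundUpToAlignment(13, 8) == 16); // rounds up
  assert(roundUpToAlignment(16, 8) == 16); // already aligned, unchanged
}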
- unsigned Align = getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; + Amount = alignSPAdjust(Amount); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && @@ -1885,7 +1843,6 @@ void ARMFrameLowering::adjustForSegmentedStacks( if (!ST->isTargetAndroid() && !ST->isTargetLinux()) report_fatal_error("Segmented stacks not supported on this platform."); - assert(&PrologueMBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); MCContext &Context = MMI.getContext(); @@ -1913,21 +1870,48 @@ void ARMFrameLowering::adjustForSegmentedStacks( MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), - e = PrologueMBB.livein_end(); - i != e; ++i) { - AllocMBB->addLiveIn(*i); - GetMBB->addLiveIn(*i); - McrMBB->addLiveIn(*i); - PrevStackMBB->addLiveIn(*i); - PostStackMBB->addLiveIn(*i); + // Grab everything that reaches PrologueMBB to update their liveness as well. + SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion; + SmallVector<MachineBasicBlock *, 2> WalkList; + WalkList.push_back(&PrologueMBB); + + do { + MachineBasicBlock *CurMBB = WalkList.pop_back_val(); + for (MachineBasicBlock *PredBB : CurMBB->predecessors()) { + if (BeforePrologueRegion.insert(PredBB).second) + WalkList.push_back(PredBB); + } + } while (!WalkList.empty()); + + // The order in that list is important. + // The blocks will all be inserted before PrologueMBB using that order. + // Therefore the block that should appear first in the CFG should appear + // first in the list. + MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB, + PostStackMBB}; + + for (MachineBasicBlock *B : AddedBlocks) + BeforePrologueRegion.insert(B); + + for (const auto &LI : PrologueMBB.liveins()) { + for (MachineBasicBlock *PredBB : BeforePrologueRegion) + PredBB->addLiveIn(LI); + } + + // Remove the newly added blocks from the list, since we know + // we do not have to do the following updates for them. + for (MachineBasicBlock *B : AddedBlocks) { + BeforePrologueRegion.erase(B); + MF.insert(PrologueMBB.getIterator(), B); } - MF.push_front(PostStackMBB); - MF.push_front(AllocMBB); - MF.push_front(GetMBB); - MF.push_front(McrMBB); - MF.push_front(PrevStackMBB); + for (MachineBasicBlock *MBB : BeforePrologueRegion) { + // Make sure the LiveIns are still sorted and unique. + MBB->sortUniqueLiveIns(); + // Replace the edges to PrologueMBB with edges to the sequences + // we are about to add. + MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); + } // The required stack size that is aligned to the ARM constant criterion.
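The adjustForSegmentedStacks hunk above drops the shrink-wrapping assert and so can no longer assume PrologueMBB is the function entry; the do/while walk collects every block that can reach PrologueMBB so that live-ins and CFG edges can be fixed up for all of them. The same worklist idiom as a standalone sketch (Node stands in for MachineBasicBlock):

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Preds; // CFG predecessors
};

// Collect every node from which Start is reachable, visiting each at most once.
std::set<Node *> nodesReaching(Node *Start) {
  std::set<Node *> Region;
  std::vector<Node *> WorkList{Start};
  do {
    Node *Cur = WorkList.back();
    WorkList.pop_back();
    for (Node *Pred : Cur->Preds)
      if (Region.insert(Pred).second) // true only on first insertion
        WorkList.push_back(Pred);
  } while (!WorkList.empty());
  return Region; // like BeforePrologueRegion, excludes Start itself
}

int main() {
  Node A, B;
  B.Preds = {&A};
  return nodesReaching(&B).count(&A) == 1 ? 0 : 1;
}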
AlignedStackSize = alignToARMConstant(StackSize); @@ -1991,7 +1975,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create( MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0); MachineConstantPool *MCP = MF.getConstantPool(); - unsigned CPI = MCP->getConstantPoolIndex(NewCPV, MF.getAlignment()); + unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4); // ldr SR0, [pc, offset(STACK_LIMIT)] AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0) diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h index 6fdc5ef..66f4dfb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -31,8 +31,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void fixTCReturn(MachineFunction &MF, MachineBasicBlock &MBB) const; - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -52,7 +50,6 @@ public: unsigned &FrameReg) const override; int ResolveFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg, int SPAdj) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; @@ -60,6 +57,11 @@ public: void adjustForSegmentedStacks(MachineFunction &MF, MachineBasicBlock &MBB) const override; + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } + private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index b110628..dfbb969 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -160,11 +160,6 @@ public: // Thumb Addressing Modes: bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset, - unsigned Scale); - bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset); bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, @@ -176,8 +171,6 @@ public: bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); // Thumb 2 Addressing Modes: - bool SelectT2ShifterOperandReg(SDValue N, - SDValue &BaseReg, SDValue &Opc); bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); @@ -278,6 +271,22 @@ private: // Get the alignment operand for a NEON VLD or VST instruction. SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs, bool is64BitVector); + + /// Returns the number of instructions required to materialize the given + /// constant in a register, or 3 if a literal pool load is needed. 
+ unsigned ConstantMaterializationCost(unsigned Val) const; + + /// Checks if N is a multiplication by a constant where we can extract out a + /// power of two from the constant so that it can be used in a shift, but only + /// if it simplifies the materialization of the constant. Returns true if it + /// is, and assigns to PowerOfTwo the power of two that should be extracted + /// out and to NewMulConst the new constant to be multiplied by. + bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, + unsigned &PowerOfTwo, SDValue &NewMulConst) const; + + /// Replace N with M in CurDAG, in a way that also ensures that M gets + /// selected when N would have been selected. + void replaceDAGValue(const SDValue &N, SDValue M); }; } @@ -334,7 +343,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { bool isThumb2 = Subtarget->isThumb(); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (N->getOpcode() != ISD::ADD) continue; @@ -388,7 +397,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { SDValue CPTmp1; SDValue CPTmp2; if (isThumb2) { - if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1)) + if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1)) continue; } else { if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) || @@ -471,6 +480,61 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1)); } +unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const { + if (Subtarget->isThumb()) { + if (Val <= 255) return 1; // MOV + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW + if (~Val <= 255) return 2; // MOV + MVN + if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL + } else { + if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV + if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW + if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs + } + if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT + return 3; // Literal pool load +} + +bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, + unsigned MaxShift, + unsigned &PowerOfTwo, + SDValue &NewMulConst) const { + assert(N.getOpcode() == ISD::MUL); + assert(MaxShift > 0); + + // If the multiply is used in more than one place then changing the constant + // will make other uses incorrect, so don't. + if (!N.hasOneUse()) return false; + // Check if the multiply is by a constant + ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!MulConst) return false; + // If the constant is used in more than one place then modifying it will mean + // we need to materialize two constants instead of one, which is a bad idea. 
+ if (!MulConst->hasOneUse()) return false; + unsigned MulConstVal = MulConst->getZExtValue(); + if (MulConstVal == 0) return false; + + // Find the largest power of 2 that MulConstVal is a multiple of + PowerOfTwo = MaxShift; + while ((MulConstVal % (1 << PowerOfTwo)) != 0) { + --PowerOfTwo; + if (PowerOfTwo == 0) return false; + } + + // Only optimise if the new cost is better + unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); + NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); + unsigned OldCost = ConstantMaterializationCost(MulConstVal); + unsigned NewCost = ConstantMaterializationCost(NewMulConstVal); + return NewCost < OldCost; +} + +void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) { + CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode()); + CurDAG->ReplaceAllUsesWith(N, M); +} + bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, SDValue &BaseReg, SDValue &Opc, @@ -478,6 +542,24 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, if (DisableShifterOp) return false; + // If N is a multiply-by-constant and it's profitable to extract a shift and + // use it in a shifted operand do so. + if (N.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { + BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32, + N.getOperand(0), NewMulConst) + .getNode()), + 0); + replaceDAGValue(N.getOperand(1), NewMulConst); + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl, + PowerOfTwo), + SDLoc(N), MVT::i32); + return true; + } + } + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); // Don't match base register only case. That is matched to a separate @@ -540,7 +622,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else Base = N; @@ -662,6 +745,18 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, } } + // If Offset is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. 
+ if (Offset.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { + replaceDAGValue(Offset.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + ShOpcVal = ARM_AM::lsl; + } + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), SDLoc(N), MVT::i32); return true; @@ -707,7 +802,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -973,7 +1069,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), @@ -1086,77 +1183,14 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, } bool -ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base, - SDValue &Offset, unsigned Scale) { - if (Scale == 4) { - SDValue TmpBase, TmpOffImm; - if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) - return false; // We want to select tLDRspi / tSTRspi instead. - - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() == ISD::TargetConstantPool) - return false; // We want to select tLDRpci instead. - } - - if (!CurDAG->isBaseWithConstantOffset(N)) - return false; - - // Thumb does not have [sp, r] address mode. - RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); - RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); - if ((LHSR && LHSR->getReg() == ARM::SP) || - (RHSR && RHSR->getReg() == ARM::SP)) - return false; - - // FIXME: Why do we explicitly check for a match here and then return false? - // Presumably to allow something else to match, but shouldn't this be - // documented? - int RHSC; - if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) - return false; - - Base = N.getOperand(0); - Offset = N.getOperand(1); - return true; -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 1); -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 2); -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 4); -} - -bool ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm) { - if (Scale == 4) { - SDValue TmpBase, TmpOffImm; - if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) - return false; // We want to select tLDRspi / tSTRspi instead. - - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() == ISD::TargetConstantPool) - return false; // We want to select tLDRpci instead. 
- } - if (!CurDAG->isBaseWithConstantOffset(N)) { - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + if (N.getOpcode() == ISD::ADD) { + return false; // We want to select register offset instead + } else if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else { Base = N; @@ -1166,23 +1200,6 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, return true; } - RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); - RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); - if ((LHSR && LHSR->getReg() == ARM::SP) || - (RHSR && RHSR->getReg() == ARM::SP)) { - ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0)); - ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); - unsigned LHSC = LHS ? LHS->getZExtValue() : 0; - unsigned RHSC = RHS ? RHS->getZExtValue() : 0; - - // Thumb does not have [sp, #imm5] address mode for non-zero imm5. - if (LHSC != 0 || RHSC != 0) return false; - - Base = N; - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); - return true; - } - // If the RHS is + imm5 * scale, fold into addr mode. int RHSC; if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { @@ -1191,9 +1208,8 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, return true; } - Base = N.getOperand(0); - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); - return true; + // Offset is too large, so use register offset instead. + return false; } bool @@ -1263,28 +1279,6 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, //===----------------------------------------------------------------------===// -bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg, - SDValue &Opc) { - if (DisableShifterOp) - return false; - - ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); - - // Don't match base register only case. That is matched to a separate - // lower complexity pattern with explicit register operand. - if (ShOpcVal == ARM_AM::no_shift) return false; - - BaseReg = N.getOperand(0); - unsigned ShImmVal = 0; - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - ShImmVal = RHS->getZExtValue() & 31; - Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), SDLoc(N)); - return true; - } - - return false; -} - bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R + imm12 operands. @@ -1302,7 +1296,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::TargetConstantPool) return false; // We want to select t2LDRpci instead. @@ -1425,6 +1420,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, } } + // If OffReg is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. 
+ if (OffReg.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { + replaceDAGValue(OffReg.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + } + } + ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32); return true; @@ -2503,25 +2509,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ISD::Constant: { unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); - bool UseCP = true; - if (Subtarget->useMovt(*MF)) - // Thumb2-aware targets have the MOVT instruction, so all immediates can - // be done with MOV + MOVT, at worst. - UseCP = false; - else { - if (Subtarget->isThumb()) { - UseCP = (Val > 255 && // MOV - ~Val > 255 && // MOV + MVN - !ARM_AM::isThumbImmShiftedVal(Val) && // MOV + LSL - !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW - } else - UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV - ARM_AM::getSOImmVal(~Val) == -1 && // MVN - !ARM_AM::isSOImmTwoPartVal(Val) && // two instrs. - !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW - } - - if (UseCP) { + // If we can't materialize the constant we need to use a literal pool + if (ConstantMaterializationCost(Val) > 2) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), TLI->getPointerTy(CurDAG->getDataLayout())); @@ -3376,7 +3365,7 @@ static void getIntOperandsFromRegisterString(StringRef RegString, SelectionDAG *CurDAG, SDLoc DL, std::vector<SDValue>& Ops) { SmallVector<StringRef, 5> Fields; - RegString.split(Fields, ":"); + RegString.split(Fields, ':'); if (Fields.size() > 1) { bool AllIntFields = true; @@ -3461,9 +3450,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) { // The flags here are common to those allowed for apsr in the A class cores and // those allowed for the special registers in the M class cores. Returns a // value representing which flags were present, -1 if invalid. -static inline int getMClassFlagsMask(StringRef Flags) { +static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) { if (Flags.empty()) - return 0x3; + return 0x2 | (int)hasDSP; return StringSwitch<int>(Flags) .Case("g", 0x1) @@ -3492,7 +3481,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, } // We know we are now handling a write so need to get the mask for the flags. - int Mask = getMClassFlagsMask(Flags); + int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP()); // Only apsr, iapsr, eapsr, xpsr can have flags. The other register values // shouldn't have flags present. @@ -3501,7 +3490,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, // The _g and _nzcvqg versions are only valid if the DSP extension is // available. - if (!Subtarget->hasThumb2DSP() && (Mask & 0x2)) + if (!Subtarget->hasDSP() && (Mask & 0x1)) return -1; // The register was valid so need to put the mask in the correct place @@ -3523,7 +3512,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) { // The flags permitted for apsr are the same flags that are allowed in // M class registers. We get the flag value and then shift the flags into // the correct place to combine with the mask. 
- Mask = getMClassFlagsMask(Flags); + Mask = getMClassFlagsMask(Flags, true); if (Mask == -1) return -1; return Mask << 2; @@ -3742,7 +3731,7 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ } SmallVector<StringRef, 5> Fields; - StringRef(SpecialReg).split(Fields, "_", 1, false); + StringRef(SpecialReg).split(Fields, '_', 1, false); std::string Reg = Fields[0].str(); StringRef Flags = Fields.size() == 2 ? Fields[1] : ""; @@ -3943,6 +3932,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, // be an immediate and not a memory constraint. // Fallthrough. case InlineAsm::Constraint_m: + case InlineAsm::Constraint_o: case InlineAsm::Constraint_Q: case InlineAsm::Constraint_Um: case InlineAsm::Constraint_Un: diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8cc06df..37c0795 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -142,6 +142,11 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + + if (!VT.isFloatingPoint() && + VT != MVT::v2i64 && VT != MVT::v1i64) + for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { @@ -166,77 +171,78 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { - // Single-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); - setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); - setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp"); - setLibcallName(RTLIB::DIV_F32, "__divsf3vfp"); - - // Double-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F64, "__adddf3vfp"); - setLibcallName(RTLIB::SUB_F64, "__subdf3vfp"); - setLibcallName(RTLIB::MUL_F64, "__muldf3vfp"); - setLibcallName(RTLIB::DIV_F64, "__divdf3vfp"); - - // Single-precision comparisons. - setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp"); - setLibcallName(RTLIB::UNE_F32, "__nesf2vfp"); - setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp"); - setLibcallName(RTLIB::OLE_F32, "__lesf2vfp"); - setLibcallName(RTLIB::OGE_F32, "__gesf2vfp"); - setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp"); - setLibcallName(RTLIB::UO_F32, "__unordsf2vfp"); - setLibcallName(RTLIB::O_F32, "__unordsf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); - - // Double-precision comparisons. 
- setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp"); - setLibcallName(RTLIB::UNE_F64, "__nedf2vfp"); - setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp"); - setLibcallName(RTLIB::OLE_F64, "__ledf2vfp"); - setLibcallName(RTLIB::OGE_F64, "__gedf2vfp"); - setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp"); - setLibcallName(RTLIB::UO_F64, "__unorddf2vfp"); - setLibcallName(RTLIB::O_F64, "__unorddf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); - - // Floating-point to integer conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp"); - - // Conversions between floating types. - setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp"); - setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp"); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Single-precision floating-point arithmetic. + { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, + + // Double-precision floating-point arithmetic. + { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, + + // Single-precision comparisons. + { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, + { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, + { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, + { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, + { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, + { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, + { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, + { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, + + // Double-precision comparisons. + { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, + { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, + { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, + { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, + { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, + { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, + { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, + { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, + + // Conversions between floating types. + { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, + + // Integer to floating-point conversions. 
+ // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + // FIXME: There appears to be some naming inconsistency in ARM libgcc: + // e.g., __floatunsidf vs. __floatunssidfvfp. + { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } - // Integer to floating-point conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. - // FIXME: There appears to be some naming inconsistency in ARM libgcc: - // e.g., __floatunsidf vs. __floatunssidfvfp. - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp"); + // Set the correct calling convention for ARMv7k WatchOS. It's just + // AAPCS_VFP for functions as simple as libcalls. + if (Subtarget->isTargetWatchOS()) { + for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) + setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); } } @@ -245,8 +251,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); - if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() && - !Subtarget->isTargetWindows()) { + // RTLIB + if (Subtarget->isAAPCS_ABI() && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || + Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; @@ -334,12 +342,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - - // Memory operations - // RTABI chapter 4.3.4 - { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { @@ -348,6 +350,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } + + // EABI dependent RTLIB + if (TM.Options.EABIVersion == EABI::EABI4 || + TM.Options.EABIVersion == EABI::EABI5) { + static const struct { + const RTLIB::Libcall Op; + const char *const Name; + const CallingConv::ID CC; + const ISD::CondCode Cond; + } MemOpsLibraryCalls[] = { + // Memory operations + // RTABI chapter 4.3.4 + { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + }; + + for (const auto &LC : MemOpsLibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + if (LC.Cond != ISD::SETCC_INVALID) 
+ setCmpLibcallCC(LC.Op, LC.Cond); + } + } } if (Subtarget->isTargetWindows()) { @@ -364,6 +390,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { @@ -373,8 +403,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. - if (Subtarget->getTargetTriple().isiOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) { + if (Subtarget->isTargetWatchOS() || + (Subtarget->isTargetIOS() && + !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -392,6 +423,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); } + // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have + // a __gnu_ prefix (which is the default). + if (Subtarget->isTargetAEABI()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); + setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); + } + if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else @@ -579,7 +618,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); @@ -605,7 +643,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ADDC); if (Subtarget->isFPOnlySP()) { - // When targetting a floating-point unit with only single-precision + // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present However, no double-precision operations other than moves, // loads and stores are provided by the hardware. @@ -689,7 +727,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); } if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() - || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP())) + || (Subtarget->isThumb2() && !Subtarget->hasDSP())) setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -706,8 +744,15 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, MVT::i32, Custom); } + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + // ARM does not have ROTL. 
- setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) @@ -717,7 +762,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); - setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // @llvm.readcyclecounter requires the Performance Monitors extension. + // Default to the 0 expansion on unsupported platforms. + // FIXME: Technically there are older ARM CPUs that have + // implementation-specific ways of obtaining this information. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) @@ -726,15 +776,17 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { // These are expanded into libcalls if the cpu doesn't have HW divider. - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, LibCall); + setOperationAction(ISD::UDIV, MVT::i32, LibCall); } - // FIXME: Also set divmod for SREM on EABI setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) - if (Subtarget->isTargetAEABI()) { + if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) { + setOperationAction(ISD::SREM, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); @@ -762,7 +814,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); - setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); @@ -776,13 +827,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (!Subtarget->isTargetMachO()) { - // Non-MachO platforms may return values in these registers via the - // personality function. - setExceptionPointerRegister(ARM::R0); - setExceptionSelectorRegister(ARM::R1); - } - if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else @@ -849,11 +893,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget->isTargetDarwin()) { - setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); - setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); + if (Subtarget->useSjLjEH()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); - } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -912,7 +956,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); - if (Subtarget->getTargetTriple().isiOS()) { + if (Subtarget->isTargetWatchOS()) { + setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); + setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); + } + if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); @@ -928,6 +976,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + if (!Subtarget->isFPOnlySP()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -935,8 +990,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); } } + + if (Subtarget->hasNEON()) { + // vmin and vmax aren't available in a scalar form, so we use + // a NEON instruction with an undef lane instead. + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + } + // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); @@ -959,11 +1028,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, //// temporary - rewrite interface to use type MaxStoresPerMemset = 8; - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemsetOptSize = 4; MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemcpyOptSize = 2; MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 
4 : 2; + MaxStoresPerMemmoveOptSize = 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. @@ -1054,8 +1123,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CMOV: return "ARMISD::CMOV"; - case ARMISD::RBIT: return "ARMISD::RBIT"; - case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1069,7 +1136,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; - case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; @@ -1082,6 +1150,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::WIN__CHKSTK: return "ARMISD:::WIN__CHKSTK"; + case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; @@ -1133,14 +1202,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; - case ARMISD::FMAX: return "ARMISD::FMAX"; - case ARMISD::FMIN: return "ARMISD::FMIN"; - case ARMISD::VMAXNM: return "ARMISD::VMAX"; - case ARMISD::VMINNM: return "ARMISD::VMIN"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -1319,6 +1385,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, else return CallingConv::ARM_AAPCS; case CallingConv::Fast: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; @@ -1449,9 +1516,10 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, @@ -1734,9 +1802,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 
const char *Sym = S->getSymbol(); @@ -1748,9 +1817,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); @@ -1768,7 +1838,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMISD::WrapperPIC, dl, PtrVt, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, true, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); @@ -1781,7 +1852,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; @@ -1804,9 +1876,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { @@ -1821,7 +1894,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; - bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -1831,8 +1903,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRAS() && - // Emit regular call when code size is the priority - !HasMinSizeAttr) + // Emit regular call when code size is the priority + !MF.getFunction()->optForMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else @@ -2014,6 +2086,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + assert(Subtarget->supportsTailCall()); + // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. 
@@ -2033,26 +2107,6 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: - // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as - // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation - // support in the assembler and linker to be used. This would need to be - // fixed to fully support tail calls in Thumb1. - // - // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take - // LR. This means if we need to reload LR, it takes an extra instructions, - // which outweighs the value of the tail call; but here we don't know yet - // whether LR is going to be used. Probably the right approach is to - // generate the tail call here and turn it back into CALL/RET in - // emitEpilogue if LR is used. - - // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, - // but we need to make sure there are enough registers; the only valid - // registers are the 4 used for parameters. We don't currently do this - // case. - if (Subtarget->isThumb1Only()) - return false; - // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls @@ -2294,6 +2348,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (ARM::GPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i32)); + else if (ARM::DPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } // Update chain and glue. RetOps[0] = Chain; @@ -2400,7 +2467,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; - return !Subtarget->isThumb1Only(); + return true; } // Trying to write a 64 bit value so need to split into two 32 bit values first, @@ -2467,15 +2534,82 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = + DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); if (RelocM == Reloc::Static) return Result; SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address for Darwin, and return an +/// SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. 
+/// +/// The general system is that each __thread variable has a [3 x i32] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first word, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "r0". +/// +/// Since this descriptor may be in a different unit, in general access must +/// proceed along the usual ARM rules. A common sequence to produce is: +/// +/// movw rT1, :lower16:_var$non_lazy_ptr +/// movt rT1, :upper16:_var$non_lazy_ptr +/// ldr r0, [rT1] +/// ldr rT2, [r0] +/// blx rT2 +/// [...address now in r0...] +SDValue +ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + SDLoc DL(Op); + + // First step is to get the address of the actua global symbol. This is where + // the TLS descriptor lives. + SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); + + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. + SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i32, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, true, true, 4); + Chain = FuncTLVGet.getValue(1); + + MachineFunction &F = DAG.getMachineFunction(); + MachineFrameInfo *MFI = F.getFrameInfo(); + MFI->setAdjustsStack(true); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be + // silly). + auto TRI = + getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo(); + auto ARI = static_cast<const ARMRegisterInfo *>(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: r0 takes the address of the descriptor, and + // returns the address of the variable in this thread. 
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); + Chain = + DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); +} + // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, @@ -2491,9 +2625,10 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); - Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Argument = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2543,17 +2678,19 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else { // local exec model assert(model == TLSModel::LocalExec); @@ -2561,9 +2698,10 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } // The address of the thread local variable is the add of the thread @@ -2573,10 +2711,14 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDValue ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerGlobalTLSAddressDarwin(Op, DAG); + // TODO: implement the "local dynamic" model - assert(Subtarget->isTargetELF() && - "TLS not implemented for non-ELF targets"); + assert(Subtarget->isTargetELF() && "Only ELF implemented here"); GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -2597,22 +2739,31 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); if 
(getTargetMachine().getRelocationModel() == Reloc::PIC_) { - bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); - ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GV, - UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue Chain = Result.getValue(1); - SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); - Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); - if (!UseGOTOFF) + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + if (UseGOT_PREL) Result = DAG.getLoad(PtrVT, dl, Chain, Result, - MachinePointerInfo::getGOT(), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } @@ -2628,9 +2779,10 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + return DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } @@ -2654,7 +2806,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -2680,32 +2833,11 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, TargetFlags)); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } -SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && - "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); - MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc dl(Op); - unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; - ARMConstantPoolValue *CPV = - ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", - ARMPCLabelIndex, PCAdj); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); - return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); -} - SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -2722,6 +2854,13 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); } +SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, + Op.getOperand(0)); +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { @@ -2732,7 +2871,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_rbit: { assert(Op.getOperand(1).getValueType() == MVT::i32 && "RBIT intrinsic must have i32 type!"); - return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); + return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2752,10 +2891,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMCP::CPLSDA, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = - DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); if (RelocM == Reloc::PIC_) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2770,6 +2909,36 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::arm_neon_vminnm: + case Intrinsic::arm_neon_vmaxnm: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) + ? ISD::FMINNUM : ISD::FMAXNUM; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vminu: + case Intrinsic::arm_neon_vmaxu: { + if (Op.getValueType().isFloatingPoint()) + return SDValue(); + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) + ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vmins: + case Intrinsic::arm_neon_vmaxs: { + // v{min,max}s is overloaded between signed integers and floats. + if (!Op.getValueType().isFloatingPoint()) { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? 
ISD::FMINNAN : ISD::FMAXNAN; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -2870,9 +3039,10 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + ArgValue2 = DAG.getLoad( + MVT::i32, dl, Root, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); @@ -3056,9 +3226,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + ArgValue2 = DAG.getLoad( + MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); @@ -3139,9 +3310,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); - int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, - CurByValIndex, VA.getLocMemOffset(), - Flags.getByValSize()); + int FrameIndex = StoreByValRegs( + CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, + VA.getLocMemOffset(), Flags.getByValSize()); InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { @@ -3151,9 +3322,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0)); + InVals.push_back(DAG.getLoad( + VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0)); } lastInsIndex = index; } @@ -3188,13 +3360,9 @@ static bool isFloatingPointZero(SDValue Op) { // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) // created by LowerConstantFP(). SDValue BitcastOp = Op->getOperand(0); - if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) { - SDValue MoveOp = BitcastOp->getOperand(0); - if (MoveOp->getOpcode() == ISD::TargetConstant && - cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) { - return true; - } - } + if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(BitcastOp->getOperand(0))) + return true; } return false; } @@ -3559,113 +3727,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to generate VMAXNM/VMINNM on ARMv8. if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { - // We can use VMAXNM/VMINNM for a compare followed by a select with the - // same operands, as follows: - // c = fcmp [?gt, ?ge, ?lt, ?le] a, b - // select c, a, b - // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'. 
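
The block being deleted here reasoned carefully about IEEE-754 maxNum/minNum semantics. A minimal scalar sketch (standalone, function names hypothetical, not part of the patch) of the distinction it had to respect: std::fmax implements maxNum and returns the number when exactly one operand is NaN, while the compare-and-select form falls through to the "false" operand whenever a NaN makes the comparison fail.

#include <cmath>

float selectOGT(float a, float b) { return a > b ? a : b; } // the select form
float maxnm(float a, float b) { return std::fmax(a, b); }   // VMAXNM semantics

// selectOGT(NaN, 1.0f) == 1.0f and maxnm(NaN, 1.0f) == 1.0f, but
// selectOGT(1.0f, NaN) == NaN while maxnm(1.0f, NaN) == 1.0f, so the rewrite
// is only sound when the operand the select might keep is known never to be
// NaN -- exactly the isKnownNeverNaN checks in the code that follows.
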
- bool swapSides = false; - if (!getTargetMachine().Options.NoNaNsFPMath) { - // transformability may depend on which way around we compare - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETOLT: - case ISD::SETOLE: - // the non-NaN should be RHS - swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS); - break; - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETULT: - case ISD::SETULE: - // the non-NaN should be LHS - swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS); - break; - } - } - swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal); - if (swapSides) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(LHS, RHS); - } - if (LHS == TrueVal && RHS == FalseVal) { - bool canTransform = true; - // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here - if (!getTargetMachine().Options.UnsafeFPMath && - !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { - const ConstantFPSDNode *Zero; - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETUGT: - case ISD::SETGT: - // RHS must not be -0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) && - !Zero->isNegative(); - break; - case ISD::SETOGE: - case ISD::SETUGE: - case ISD::SETGE: - // LHS must not be -0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) && - !Zero->isNegative(); - break; - case ISD::SETOLT: - case ISD::SETULT: - case ISD::SETLT: - // RHS must not be +0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) && - Zero->isNegative(); - break; - case ISD::SETOLE: - case ISD::SETULE: - case ISD::SETLE: - // LHS must not be +0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) && - Zero->isNegative(); - break; - } - } - if (canTransform) { - // Note: If one of the elements in a pair is a number and the other - // element is NaN, the corresponding result element is the number. - // This is consistent with the IEEE 754-2008 standard. - // Therefore, a > b ? 
a : b <=> vmax(a,b), if b is constant and a is NaN - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETUGT: - case ISD::SETUGE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETGT: - case ISD::SETGE: - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETOLT: - case ISD::SETOLE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - case ISD::SETULT: - case ISD::SETULE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETLT: - case ISD::SETLE: - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - } - } - } - bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); @@ -3890,16 +3951,18 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { Addr, Op.getOperand(2), JTI); } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { - Addr = DAG.getLoad(PTy, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad(PTy, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } @@ -3936,7 +3999,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -3988,7 +4051,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -4153,6 +4216,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(Read.getOperand(0)); } +/// \p BC is a bitcast that is about to be turned into a VMOVDRR. +/// When \p DstVT, the destination type of \p BC, is on the vector +/// register bank and the source of bitcast, \p Op, operates on the same bank, +/// it might be possible to combine them, such that everything stays on the +/// vector register bank. +/// \p return The node that would replace \p BT, if the combine +/// is possible. +static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, + SelectionDAG &DAG) { + SDValue Op = BC->getOperand(0); + EVT DstVT = BC->getValueType(0); + + // The only vector instruction that can produce a scalar (remember, + // since the bitcast was about to be turned into VMOVDRR, the source + // type is i64) from a vector is EXTRACT_VECTOR_ELT. + // Moreover, we can do this combine only if there is one use. 
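
Concretely, the remapping this combine performs further down is plain lane arithmetic; a minimal standalone sketch (helper name hypothetical, not from the patch):

#include <cstdint>
#include <optional>

// Lane Index of a vNi64 source covers lanes [Index*DstNumElt,
// (Index+1)*DstNumElt) of the bitcast vector, so the replacement
// EXTRACT_SUBVECTOR starts at Index * DstNumElt -- the NewIndex computation
// in the function body.
std::optional<uint32_t> newSubvectorIndex(uint64_t Index, unsigned DstNumElt) {
  uint64_t NewIndex = Index * DstNumElt;
  if (NewIndex > UINT32_MAX) // mirrors the "fits into i32" bail-out
    return std::nullopt;
  return static_cast<uint32_t>(NewIndex);
}

// e.g. with a v8i8 destination (DstNumElt == 8), extracting lane 1 of a
// v2i64 source becomes an extract of lanes [8, 16) of the v16i8 bitcast.
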
+ // Finally, if the destination type is not a vector, there is not + // much point on forcing everything on the vector bank. + if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !Op.hasOneUse()) + return SDValue(); + + // If the index is not constant, we will introduce an additional + // multiply that will stick. + // Give up in that case. + ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Index) + return SDValue(); + unsigned DstNumElt = DstVT.getVectorNumElements(); + + // Compute the new index. + const APInt &APIntIndex = Index->getAPIntValue(); + APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); + NewIndex *= APIntIndex; + // Check if the new constant index fits into i32. + if (NewIndex.getBitWidth() > 32) + return SDValue(); + + // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> + // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) + SDLoc dl(Op); + SDValue ExtractSrc = Op.getOperand(0); + EVT VecVT = EVT::getVectorVT( + *DAG.getContext(), DstVT.getScalarType(), + ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, + DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -4172,6 +4285,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + // Do not force values to GPRs (this is what VMOVDRR does for the inputs) + // if we can combine the bitcast with its source. + if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) + return Val; + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, @@ -4383,7 +4501,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, if (!ST->hasV6T2Ops()) return SDValue(); - SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0)); + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } @@ -4544,8 +4662,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, "Unknown shift to lower!"); // We only lower SRA, SRL of 1 here, all others use generic lowering. - if (!isa<ConstantSDNode>(N->getOperand(1)) || - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1) + if (!isOneConstant(N->getOperand(1))) return SDValue(); // If we are in thumb mode, we don't have RRX. @@ -5036,18 +5153,56 @@ static bool isVTBLMask(ArrayRef<int> M, EVT VT) { return VT == MVT::v8i8 && M.size() == 8; } +// Checks whether the shuffle mask represents a vector transpose (VTRN) by +// checking that pairs of elements in the shuffle mask represent the same index +// in each vector, incrementing the expected index by 2 at each step. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} +// v2={e,f,g,h} +// WhichResult gives the offset for each element in the mask based on which +// of the two results it belongs to. 
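
To make the two accepted mask shapes concrete, here is a standalone sketch (a simplified mirror of the new isVTRNMask below, with std::vector in place of ArrayRef; negative entries mean undef and match anything):

#include <vector>

bool looksLikeVTRNMask(const std::vector<int> &M, unsigned NumElts,
                       unsigned &WhichResult) {
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    // Double-length masks fix WhichResult per half; otherwise M[0] decides.
    WhichResult =
        (M.size() == NumElts * 2) ? i / NumElts : (M[i] == 0 ? 0u : 1u);
    for (unsigned j = 0; j < NumElts; j += 2)
      if ((M[i + j] >= 0 && (unsigned)M[i + j] != j + WhichResult) ||
          (M[i + j + 1] >= 0 &&
           (unsigned)M[i + j + 1] != j + NumElts + WhichResult))
        return false;
  }
  if (M.size() == NumElts * 2)
    WhichResult = 0;
  return true;
}

// For v4i32: {0, 4, 2, 6} matches with WhichResult == 0, {1, 5, 3, 7} with
// WhichResult == 1, and the combined form {0, 4, 2, 6, 1, 5, 3, 7} matches
// both halves and reports WhichResult == 0.
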
+// +// The transpose can be represented either as: +// result1 = shufflevector v1, v2, result1_shuffle_mask +// result2 = shufflevector v1, v2, result2_shuffle_mask +// where v1/v2 and the shuffle masks have the same number of elements +// (here WhichResult (see below) indicates which result is being checked) +// +// or as: +// results = shufflevector v1, v2, shuffle_mask +// where both results are returned in one vector and the shuffle mask has twice +// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we +// want to check the low half and high half of the shuffle mask as if it were +// the other case static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + // If the mask is twice as long as the input vector then we need to check the + // upper and lower parts of the mask with a matching value for WhichResult + // FIXME: A mask with only even values will be rejected in case the first + // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only + // M[0] is used to determine WhichResult + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } @@ -5060,28 +5215,55 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } +// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking +// that the mask elements are either all even and in steps of size 2 or all odd +// and in steps of size 2. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with +// respect the how results are returned. static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) continue; // ignore UNDEF indices - if ((unsigned) M[i] != 2 * i + WhichResult) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; ++j) { + if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) + return false; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5097,18 +5279,27 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ if (EltSz == 64) return false; - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned) MIdx != Idx) - return false; - Idx += 2; + unsigned NumElts = VT.getVectorNumElements(); + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + unsigned Half = NumElts / 2; + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += Half) { + unsigned Idx = WhichResult; + for (unsigned k = 0; k < Half; ++k) { + int MIdx = M[i + j + k]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) + return false; + Idx += 2; + } } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5116,21 +5307,37 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return true; } +// Checks whether the shuffle mask represents a vector zip (VZIP) by checking +// that pairs of elements of the shufflemask represent the same index in each +// vector incrementing sequentially through the vectors. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with respect the how results +// are returned. static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5147,15 +5354,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5329,16 +5544,14 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // just use VDUPLANE. We can only do this if the lane being extracted // is at a constant index, as the VDUP from lane instructions only have // constant-index forms. + ConstantSDNode *constIndex; if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(Value->getOperand(1))) { + (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { // We need to create a new undef vector to use for the VDUPLANE if the // size of the vector from which we get the value is different than the // size of the vector that we need to create. We will insert the element // such that the register coalescer will remove unnecessary copies. if (VT != Value->getOperand(0).getValueType()) { - ConstantSDNode *constIndex; - constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)); - assert(constIndex && "The index is not a constant!"); unsigned index = constIndex->getAPIntValue().getLimitedValue() % VT.getVectorNumElements(); N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, @@ -5437,14 +5650,35 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); - SmallVector<SDValue, 2> SourceVecs; - SmallVector<unsigned, 2> MinElts; - SmallVector<unsigned, 2> MaxElts; + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -5453,127 +5687,166 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // A shuffle can only come from building a vector from various // elements of other vectors. 
return SDValue(); - } else if (V.getOperand(0).getValueType().getVectorElementType() != - VT.getVectorElementType()) { - // This code doesn't know how to handle shuffles where the vector - // element types do not match (this happens because type legalization - // promotes the return type of EXTRACT_VECTOR_ELT). - // FIXME: It might be appropriate to extend this code to handle - // mismatched types. + } else if (!isa<ConstantSDNode>(V.getOperand(1))) { + // Furthermore, shuffles require a constant mask, whereas extractelts + // accept variable indices. return SDValue(); } - // Record this extraction against the appropriate vector if possible... + // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); - // If the element number isn't a constant, we can't effectively - // analyze what's going on. - if (!isa<ConstantSDNode>(V.getOperand(1))) - return SDValue(); - unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) + // are involved. + if (Sources.size() > 2) return SDValue(); - SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = {0, 0}; + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. + EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) + SmallestEltTy = SrcEltTy; + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); + + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. + for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); + + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) + continue; + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. 
- for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // It probably isn't worth padding out a smaller vector just to - // break it down again in a shuffle. - return SDValue(); } - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... - assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts && - "unexpected vector sizes in ReconstructShuffle"); + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) + return SDValue(); - if (MaxElts[i] - MinElts[i] >= NumElts) { + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } - if (MinElts[i] >= NumElts) { + if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - } else if (MaxElts[i] < NumElts) { + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); - SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(VEXTOffsets[i], dl, - MVT::i32)); - } - } - - SmallVector<int, 8> Mask; - - for (unsigned i = 0; i < NumElts; ++i) { + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + + Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, + DAG.getConstant(Src.MinElt, dl, MVT::i32)); + Src.WindowBase = -Src.MinElt; + } + } + + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. 
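
The invariant this bookkeeping maintains is worth making concrete; a tiny sketch (function name hypothetical) of the lane mapping used when the mask is finally built:

// After padding, extracting, or bitcasting a source, element EltNo of the
// original vector lives at this lane of ShuffleVec -- the same arithmetic as
// the ExtractBase computation below.
int shuffleLane(int EltNo, int WindowScale, int WindowBase) {
  return EltNo * WindowScale + WindowBase;
}

// e.g. a v2i64 source bitcast into a v8i16-typed shuffle has WindowScale == 4
// (one i64 lane spans four i16 lanes), so element 1 maps to lane 4 when
// WindowBase == 0.
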
+ for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } + + // Final sanity check before we try to actually produce a shuffle. + DEBUG( + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + ); + + // The stars all align, our next step is to produce the mask for the shuffle. + SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); + if (Entry.getOpcode() == ISD::UNDEF) continue; - } - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i) - .getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); - return SDValue(); + // We can't handle more than two sources. This should have already + // been checked before this point. + assert(Sources.size() <= 2 && "Too many sources!"); + + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } /// isShuffleMaskLegal - Targets can use this to indicate that they only @@ -6235,6 +6508,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? 
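
A scalar sketch (illustrative only; assumes a nonzero divisor and ignores the INT8_MIN / -1 corner) of the strategy the comments below describe: widen the narrow integers to float, divide there, and truncate back. The actual vector lowering replaces the true division with a NEON reciprocal estimate plus a refinement step.

#include <cstdint>

int8_t sdivViaFloat(int8_t a, int8_t b) {
  float xf = static_cast<float>(a); // vcvt_f32_s32(vmovl_s16(a.lo))
  float yf = static_cast<float>(b); // vcvt_f32_s32(vmovl_s16(b.lo))
  float q = xf / yf;                // recip-estimate + refine in the real code
  return static_cast<int8_t>(q);    // truncating convert back to integer
}
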
+ // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); @@ -6265,6 +6540,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? + SDValue N2; // Convert to float. // float4 yf = vcvt_f32_s32(vmovl_s16(y)); @@ -6337,6 +6614,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); @@ -6445,45 +6723,56 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); - - // Create stack object for sret. + Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); auto &DL = DAG.getDataLayout(); - const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); - int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); - SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = SRet; - Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isSRet = true; - Args.push_back(Entry); + bool ShouldUseSRet = Subtarget->isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); + int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); + + ArgListEntry Entry; + Entry.Node = SRet; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); - const char *LibcallName = (ArgVT == MVT::f64) - ? "__sincos_stret" : "__sincosf_stret"; + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + RTLIB::Libcall LC = + (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; + CallingConv::ID CC = getLibcallCallingConv(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee, - std::move(Args), 0) - .setDiscardResult(); - + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setCallee(CC, RetTy, Callee, std::move(Args), 0) + .setDiscardResult(ShouldUseSRet); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + if (!ShouldUseSRet) + return CallResult.first; + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo(), false, false, false, 0); @@ -6498,6 +6787,85 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { LoadSin.getValue(0), LoadCos.getValue(0)); } +SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, + bool Signed, + SDValue &Chain) const { + EVT VT = Op.getValueType(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + const char *Name = nullptr; + if (Signed) + Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; + else + Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; + + SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); + + ARMTargetLowering::ArgListTy Args; + + for (auto AI : {1, 0}) { + ArgListEntry Arg; + Arg.Node = Op.getOperand(AI); + Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Args.push_back(Arg); + } + + CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), + ES, std::move(Args), 0); + + return LowerCallTo(CLI).first; +} + +SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + assert(Op.getValueType() == MVT::i32 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, + DAG.getEntryNode(), Op.getOperand(1)); + + return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); +} + +void ARMTargetLowering::ExpandDIV_Windows( + SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const { + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + assert(Op.getValueType() == MVT::i64 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(1, dl, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); + + SDValue DBZCHK = + DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); + + SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); + + SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); + SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, + DAG.getConstant(32, dl, TLI.getPointerTy(DL))); + Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); + + Results.push_back(Lower); + Results.push_back(Upper); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { // Monotonic load/store is legal for all targets if (cast<AtomicSDNode>(Op)->getOrdering() <= 
Monotonic) @@ -6513,36 +6881,22 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc DL(N); - SDValue Cycles32, OutChain; - - if (Subtarget->hasPerfMon()) { - // Under Power Management extensions, the cycle-count is: - // mrc p15, #0, <Rt>, c9, c13, #0 - SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(9, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32) - }; - - Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, - DAG.getVTList(MVT::i32, MVT::Other), Ops); - OutChain = Cycles32.getValue(1); - } else { - // Intrinsic is defined to return 0 on unsupported platforms. Technically - // there are older ARM CPUs that have implementation-specific ways of - // obtaining this information (FIXME!). - Cycles32 = DAG.getConstant(0, DL, MVT::i32); - OutChain = DAG.getEntryNode(); - } - + // Under Power Management extensions, the cycle-count is: + // mrc p15, #0, <Rt>, c9, c13, #0 + SDValue Ops[] = { N->getOperand(0), // Chain + DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getConstant(15, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(9, DL, MVT::i32), + DAG.getConstant(13, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32) + }; - SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, - Cycles32, DAG.getConstant(0, DL, MVT::i32)); - Results.push_back(Cycles64); - Results.push_back(OutChain); + SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), Ops); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, + DAG.getConstant(0, DL, MVT::i32))); + Results.push_back(Cycles32.getValue(1)); } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -6576,15 +6930,17 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); - case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::SREM: return LowerREM(Op.getNode(), DAG); + case ISD::UREM: return LowerREM(Op.getNode(), DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); @@ -6622,13 +6978,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Don't know how to custom lower this!"); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ARMISD::WIN__DBZCHK: return SDValue(); } } /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. 
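
One detail of the ExpandDIV_Windows hunk above deserves a note: the {1, 0} loop builds the libcall argument list divisor-first, and the divide-by-zero check ORs the two 32-bit halves of the i64 divisor so a single WIN__DBZCHK can test the whole value. A standalone sketch of that zero test (helper name hypothetical):

#include <cstdint>

// Zero iff the full 64-bit divisor is zero -- the value fed to WIN__DBZCHK.
bool divisorIsZero64(uint64_t Divisor) {
  uint32_t Lo = static_cast<uint32_t>(Divisor);
  uint32_t Hi = static_cast<uint32_t>(Divisor >> 32);
  return (Lo | Hi) == 0;
}
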
void ARMTargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl<SDValue>&Results, + SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { @@ -6644,9 +7001,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; + case ISD::SREM: + case ISD::UREM: + Res = LowerREM(N, DAG); + break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; + case ISD::UDIV: + case ISD::SDIV: + assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); + return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, + Results); } if (Res.getNode()) Results.push_back(Res); @@ -6683,12 +7049,12 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = - MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(), - MachineMemOperand::MOLoad, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { @@ -6792,7 +7158,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, MachineModuleInfo &MMI = MF->getMMI(); for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { - if (!BB->isLandingPad()) continue; + if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. @@ -6807,7 +7173,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, for (SmallVectorImpl<unsigned>::iterator CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); CSI != CSE; ++CSI) { - CallSiteNumToLPad[*CSI].push_back(BB); + CallSiteNumToLPad[*CSI].push_back(&*BB); MaxCSNum = std::max(MaxCSNum, *CSI); } break; @@ -6840,7 +7206,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); - DispatchBB->setIsLandingPad(); + DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); unsigned trap_opcode; @@ -6864,10 +7230,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // context. 
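
A simplified sketch (types reduced to standard containers, names hypothetical) of the bookkeeping EmitSjLjDispatchBlock performs above: each EH pad's EH_LABEL carries call-site numbers, and the pass records which landing pads belong to each number so the dispatch block can branch on the call-site value stored in the function context.

#include <algorithm>
#include <map>
#include <vector>

struct Block; // stand-in for MachineBasicBlock

void recordCallSites(std::map<unsigned, std::vector<Block *>> &CallSiteNumToLPad,
                     unsigned &MaxCSNum, Block *LPad,
                     const std::vector<unsigned> &CallSiteIdxs) {
  for (unsigned CSI : CallSiteIdxs) {
    CallSiteNumToLPad[CSI].push_back(LPad); // one call site may map to many pads
    MaxCSNum = std::max(MaxCSNum, CSI);
  }
}
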
SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); - MachineMemOperand *FIMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad | - MachineMemOperand::MOVolatile, 4, 4); + MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -6982,9 +7347,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -7066,9 +7430,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred( BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) @@ -7109,13 +7472,14 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); - if (SMBB->isLandingPad()) { + if (SMBB->isEHPad()) { BB->removeSuccessor(SMBB); MBBLPads.push_back(SMBB); } } - BB->addSuccessor(DispatchBB); + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); + BB->normalizeSuccProbs(); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from @@ -7157,7 +7521,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // landing pad now. for (SmallVectorImpl<MachineBasicBlock*>::iterator I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) - (*I)->setIsLandingPad(false); + (*I)->setIsEHPad(false); // The instruction is gone now. MI->eraseFromParent(); @@ -7280,8 +7644,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Otherwise, we will generate unrolled scalar copies. 
const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned src = MI->getOperand(1).getReg(); @@ -7574,6 +7937,32 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, } MachineBasicBlock * +ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); + MF->push_back(ContBB); + ContBB->splice(ContBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + MBB->addSuccessor(ContBB); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + MF->push_back(TrapBB); + BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); + MBB->addSuccessor(TrapBB); + + BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) + .addReg(MI->getOperand(0).getReg()) + .addMBB(TrapBB); + + MI->eraseFromParent(); + return ContBB; +} + +MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); @@ -7643,8 +8032,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -7741,6 +8129,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: + return BB; + + case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; @@ -7759,8 +8150,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) // SinkBB: V1 = PHI(V2, V3) const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator BBI = BB; - ++BBI; + MachineFunction::iterator BBI = ++BB->getIterator(); MachineFunction *Fn = BB->getParent(); MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); @@ -7824,11 +8214,46 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitStructByval(MI, BB); case ARM::WIN__CHKSTK: return EmitLowered__chkstk(MI, BB); + case ARM::WIN__DBZCHK: + return EmitLowered__dbzchk(MI, BB); + } +} + +/// \brief Attaches vregs to MEMCPY that it will use as scratch registers +/// when it is expanded into LDM/STM. This is done as a post-isel lowering +/// instead of as a custom inserter because we need the use list from the SDNode. +static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, + MachineInstr *MI, const SDNode *Node) { + bool isThumb1 = Subtarget->isThumb1Only(); + + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstrBuilder MIB(*MF, MI); + + // If the new dst/src is unused mark it as dead. 
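
Looking back at the EmitLowered__dbzchk hunk above: the expansion is a compare-against-zero with a trapping slow path. A C-level sketch (the __builtin_trap here stands in for the `udf #249` the trap block emits, the immediate Windows uses to signal integer division by zero):

#include <cstdint>

int32_t checkedSDiv(int32_t N, int32_t D) {
  if (D == 0)         // tCBZ on the divisor, branching to TrapBB
    __builtin_trap(); // Clang/GCC builtin; `udf #249` in the real TrapBB
  return N / D;       // performed by the __rt_sdiv call in the real lowering
}
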
+ if (!Node->hasAnyUseOfValue(0)) { + MI->getOperand(0).setIsDead(true); + } + if (!Node->hasAnyUseOfValue(1)) { + MI->getOperand(1).setIsDead(true); + } + + // The MEMCPY both defines and kills the scratch registers. + for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass); + MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { + if (MI->getOpcode() == ARM::MEMCPY) { + attachMEMCPYScratchRegs(Subtarget, MI, Node); + return; + } + const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional @@ -7898,10 +8323,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // Helper function that checks if N is a null or all ones constant. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N); - if (!C) - return false; - return AllOnes ? C->isAllOnesValue() : C->isNullValue(); + return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); } // Return true if N is conditionally 0 or all ones. @@ -8723,12 +9145,88 @@ static SDValue PerformXORCombine(SDNode *N, return SDValue(); } -/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff -/// the bits being cleared by the AND are not demanded by the BFI. +// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, +// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and +// their position in "to" (Rd). +static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { + assert(N->getOpcode() == ARMISD::BFI); + + SDValue From = N->getOperand(1); + ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); + FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); + + // If the Base came from a SHR #C, we can deduce that it is really testing bit + // #C in the base of the SHR. + if (From->getOpcode() == ISD::SRL && + isa<ConstantSDNode>(From->getOperand(1))) { + APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); + assert(Shift.getLimitedValue() < 32 && "Shift too large!"); + FromMask <<= Shift.getLimitedValue(31); + From = From->getOperand(0); + } + + return From; +} + +// If A and B contain one contiguous set of bits, does A | B == A . B? +// +// Neither A nor B must be zero. +static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { + unsigned LastActiveBitInA = A.countTrailingZeros(); + unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; + return LastActiveBitInA - 1 == FirstActiveBitInB; +} + +static SDValue FindBFIToCombineWith(SDNode *N) { + // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, + // if one exists. + APInt ToMask, FromMask; + SDValue From = ParseBFI(N, ToMask, FromMask); + SDValue To = N->getOperand(0); + + // Now check for a compatible BFI to merge with. We can pass through BFIs that + // aren't compatible, but not if they set the same bit in their destination as + // we do (or that of any BFI we're going to combine with). 
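
A worked example (values illustrative) of the masks ParseBFI above produces: for a BFI writing an 8-bit field at bit 8, the instruction's keep-mask operand is 0xFFFF00FF, so ToMask = ~0xFFFF00FF = 0x0000FF00 and FromMask = the low popcount(ToMask) bits = 0x000000FF. A standalone mirror (name hypothetical) of the adjacency test BitsProperlyConcatenate performs on such masks:

#include <cstdint>

// Two nonzero masks concatenate properly when the lowest set bit of A sits
// directly above the highest set bit of B, so together they cover a single
// contiguous run of bits. Uses GCC/Clang bit-scan builtins.
bool properlyConcatenates(uint32_t A, uint32_t B) {
  int LastActiveBitInA = __builtin_ctz(A);       // A.countTrailingZeros()
  int FirstActiveBitInB = 31 - __builtin_clz(B); // index of B's highest bit
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

// properlyConcatenates(0x0000FF00, 0x000000FF) == true: combined they form
// the contiguous field 0x0000FFFF, so the two BFIs can be merged into one.
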
+ SDValue V = To; + APInt CombinedToMask = ToMask; + while (V.getOpcode() == ARMISD::BFI) { + APInt NewToMask, NewFromMask; + SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); + if (NewFrom != From) { + // This BFI has a different base. Keep going. + CombinedToMask |= NewToMask; + V = V.getOperand(0); + continue; + } + + // Do the written bits conflict with any we've seen so far? + if ((NewToMask & CombinedToMask).getBoolValue()) + // Conflicting bits - bail out because going further is unsafe. + return SDValue(); + + // Are the new bits contiguous when combined with the old bits? + if (BitsProperlyConcatenate(ToMask, NewToMask) && + BitsProperlyConcatenate(FromMask, NewFromMask)) + return V; + if (BitsProperlyConcatenate(NewToMask, ToMask) && + BitsProperlyConcatenate(NewFromMask, FromMask)) + return V; + + // We've seen a write to some bits, so track it. + CombinedToMask |= NewToMask; + // Keep going... + V = V.getOperand(0); + } + + return SDValue(); +} + static SDValue PerformBFICombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); if (N1.getOpcode() == ISD::AND) { + // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff + // the bits being cleared by the AND are not demanded by the BFI. ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); if (!N11C) return SDValue(); @@ -8744,6 +9242,38 @@ static SDValue PerformBFICombine(SDNode *N, return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); + } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { + // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. + // Keep track of any consecutive bits set that all come from the same base + // value. We can combine these together into a single BFI. + SDValue CombineBFI = FindBFIToCombineWith(N); + if (CombineBFI == SDValue()) + return SDValue(); + + // We've found a BFI. + APInt ToMask1, FromMask1; + SDValue From1 = ParseBFI(N, ToMask1, FromMask1); + + APInt ToMask2, FromMask2; + SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); + assert(From1 == From2); + (void)From2; + + // First, unlink CombineBFI. + DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); + // Then create a new BFI, combining the two together. + APInt NewFromMask = FromMask1 | FromMask2; + APInt NewToMask = ToMask1 | ToMask2; + + EVT VT = N->getValueType(0); + SDLoc dl(N); + + if (NewFromMask[0] == 0) + From1 = DCI.DAG.getNode( + ISD::SRL, dl, VT, From1, + DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); + return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, + DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } @@ -9521,32 +10051,6 @@ static SDValue PerformSTORECombine(SDNode *N, return SDValue(); } -// isConstVecPow2 - Return true if each vector element is a power of 2, all -// elements are the same constant, C, and Log2(C) ranges from 1 to 32. -static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) -{ - integerPart cN; - integerPart c0 = 0; - for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements(); - I != E; I++) { - ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I)); - if (!C) - return false; - - bool isExact; - APFloat APF = C->getValueAPF(); - if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact) - != APFloat::opOK || !isExact) - return false; - - c0 = (I == 0) ? 
cN : c0; - if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32) - return false; - } - C = c0; - return true; -} - /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) /// can replace combinations of VMUL and VCVT (floating-point to integer) /// when the VMUL has a constant operand that is a power of 2. @@ -9556,30 +10060,25 @@ static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) /// vcvt.s32.f32 d16, d16 /// becomes: /// vcvt.s32.f32 d16, d16, #3 -static SDValue PerformVCVTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDValue Op = N->getOperand(0); + if (!Subtarget->hasNEON()) + return SDValue(); - if (!Subtarget->hasNEON() || !Op.getValueType().isVector() || - Op.getOpcode() != ISD::FMUL) + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) return SDValue(); - uint64_t C; - SDValue N0 = Op->getOperand(0); SDValue ConstVec = Op->getOperand(1); - bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa<BuildVectorSDNode>(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 || - NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions @@ -9587,16 +10086,22 @@ static SDValue PerformVCVTCombine(SDNode *N, return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, - NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, - DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - N0, - DAG.getConstant(Log2_64(C), dl, MVT::i32)); + SDValue FixConv = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, + DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), + DAG.getConstant(C, dl, MVT::i32)); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); return FixConv; @@ -9611,38 +10116,44 @@ static SDValue PerformVCVTCombine(SDNode *N, /// vdiv.f32 d16, d17, d16 /// becomes: /// vcvt.f32.s32 d16, d16, #3 -static SDValue PerformVDIVCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue Op = N->getOperand(0); unsigned OpOpcode = Op.getNode()->getOpcode(); - - if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() || + if (!N->getValueType(0).isVector() || (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) return SDValue(); - uint64_t C; SDValue ConstVec = N->getOperand(1); - bool isSigned = OpOpcode == ISD::SINT_TO_FP; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa<BuildVectorSDNode>(ConstVec)) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + uint32_t IntBits = IntTy.getSizeInBits(); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would - // be lossy. + // be lossy. We also can't handle more then 4 lanes, since these intructions + // only support v2i32/v4i32 types. return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = OpOpcode == ISD::SINT_TO_FP; SDValue ConvInput = Op.getOperand(0); - unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput); @@ -9652,7 +10163,7 @@ static SDValue PerformVDIVCombine(SDNode *N, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32)); + ConvInput, DAG.getConstant(C, dl, MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate @@ -9680,7 +10191,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? 
Cnt-1 : Cnt) < ElementBits); @@ -9695,12 +10206,16 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + return true; + } + return false; } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. @@ -9939,89 +10454,123 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match f32 max/min patterns to use NEON vmax/vmin instructions. -static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - // If the target supports NEON, try to use vmax/vmin instructions for f32 - // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set, - // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is - // a NaN; only do the transformation when it matches that behavior. - - // For now only do this when using NEON for FP operations; if using VFP, it - // is not obvious that the benefit outweighs the cost of switching to the - // NEON pipeline. - if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() || - N->getValueType(0) != MVT::f32) +static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, + APInt &KnownOne) { + if (Op.getOpcode() == ARMISD::BFI) { + // Conservatively, we can recurse down the first operand + // and just mask out all affected bits. + computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); + + // The operand to BFI is already a mask suitable for removing the bits it + // sets. + ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); + APInt Mask = CI->getAPIntValue(); + KnownZero &= Mask; + KnownOne &= Mask; + return; + } + if (Op.getOpcode() == ARMISD::CMOV) { + APInt KZ2(KnownZero.getBitWidth(), 0); + APInt KO2(KnownOne.getBitWidth(), 0); + computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); + computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); + + KnownZero &= KZ2; + KnownOne &= KO2; + return; + } + return DAG.computeKnownBits(Op, KnownZero, KnownOne); +} + +SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { + // If we have a CMOV, OR and AND combination such as: + // if (x & CN) + // y |= CM; + // + // And: + // * CN is a single bit; + // * All bits covered by CM are known zero in y + // + // Then we can convert this into a sequence of BFI instructions. This will + // always be a win if CM is a single bit, will always be no worse than the + // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is + // three bits (due to the extra IT instruction). + + SDValue Op0 = CMOV->getOperand(0); + SDValue Op1 = CMOV->getOperand(1); + auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); + auto CC = CCNode->getAPIntValue().getLimitedValue(); + SDValue CmpZ = CMOV->getOperand(4); + + // The compare must be against zero. 
+ if (!isNullConstant(CmpZ->getOperand(1))) return SDValue(); - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - - unsigned Opcode = 0; - bool IsReversed; - if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) { - IsReversed = false; // x CC y ? x : y - } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) { - IsReversed = true ; // x CC y ? y : x - } else { + assert(CmpZ->getOpcode() == ARMISD::CMPZ); + SDValue And = CmpZ->getOperand(0); + if (And->getOpcode() != ISD::AND) return SDValue(); + ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1)); + if (!AndC || !AndC->getAPIntValue().isPowerOf2()) + return SDValue(); + SDValue X = And->getOperand(0); + + if (CC == ARMCC::EQ) { + // We're performing an "equal to zero" compare. Swap the operands so we + // canonicalize on a "not equal to zero" compare. + std::swap(Op0, Op1); + } else { + assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); } + + if (Op1->getOpcode() != ISD::OR) + return SDValue(); - bool IsUnordered; - switch (CC) { - default: break; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin - // will return -0, so vmin can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; - break; + ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); + if (!OrC) + return SDValue(); + SDValue Y = Op1->getOperand(0); - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax - // will return +0, so vmax can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX; - break; - } + if (Op0 != Y) + return SDValue(); + + // Now, is it profitable to continue? + APInt OrCI = OrC->getAPIntValue(); + unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; + if (OrCI.countPopulation() > Heuristic) + return SDValue(); - if (!Opcode) + // Lastly, can we determine that the bits defined by OrCI + // are zero in Y? 
+ APInt KnownZero, KnownOne; + computeKnownBits(DAG, Y, KnownZero, KnownOne); + if ((OrCI & KnownZero) != OrCI) return SDValue(); - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); + + // OK, we can do the combine. + SDValue V = Y; + SDLoc dl(X); + EVT VT = X.getValueType(); + unsigned BitInX = AndC->getAPIntValue().logBase2(); + + if (BitInX != 0) { + // We must shift X first. + X = DAG.getNode(ISD::SRL, dl, VT, X, + DAG.getConstant(BitInX, dl, VT)); + } + + for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); + BitInY < NumActiveBits; ++BitInY) { + if (OrCI[BitInY] == 0) + continue; + APInt Mask(VT.getSizeInBits(), 0); + Mask.setBit(BitInY); + V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, + // Confusingly, the operand is an *inverted* mask. + DAG.getConstant(~Mask, dl, VT)); + } + + return V; } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. @@ -10042,6 +10591,13 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { ARMCC::CondCodes CC = (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + // BFI is only available on V6T2+. + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { + SDValue R = PerformCMOVToBFICombine(N, DAG); + if (R) + return R; + } + // Simplify // mov r1, r0 // cmp r1, x @@ -10108,8 +10664,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget); - case ISD::FDIV: return PerformVDIVCombine(N, DCI, Subtarget); + case ISD::FP_TO_UINT: + return PerformVCVTCombine(N, DCI.DAG, Subtarget); + case ISD::FDIV: + return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: @@ -10117,7 +10675,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); - case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: @@ -10932,7 +11489,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'J': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a constant between -255 and -1, for negated ADD // immediates. This can be used in GCC with an "n" modifier that // prints the negated value, for use with SUB instructions. It is @@ -11001,7 +11558,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'M': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between 0 and 1020, for // ADD sp + immediate. 
if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) @@ -11043,37 +11600,61 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only"); - unsigned Opcode = Op->getOpcode(); - assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && - "Invalid opcode for Div/Rem lowering"); - bool isSigned = (Opcode == ISD::SDIVREM); - EVT VT = Op->getValueType(0); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - +static RTLIB::Libcall getDivRemLibcall( + const SDNode *N, MVT::SimpleValueType SVT) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemLibcall"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; RTLIB::Libcall LC; - switch (VT.getSimpleVT().SimpleTy) { + switch (SVT) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; } + return LC; +} - SDValue InChain = DAG.getEntryNode(); - +static TargetLowering::ArgListTy getDivRemArgList( + const SDNode *N, LLVMContext *Context) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemArgList"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { - EVT ArgVT = Op->getOperand(i).getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op->getOperand(i); + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + EVT ArgVT = N->getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*Context); + Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = isSigned; Entry.isZExt = !isSigned; Args.push_back(Entry); } + return Args; +} + +SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { + assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && + "Register-based DivRem lowering only"); + unsigned Opcode = Op->getOpcode(); + assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && + "Invalid opcode for Div/Rem lowering"); + bool isSigned = (Opcode == ISD::SDIVREM); + EVT VT = Op->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), + VT.getSimpleVT().SimpleTy); + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), + DAG.getContext()); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); @@ -11090,6 +11671,47 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { return CallInfo.first; } +// Lowers REM using divmod helpers +// see RTABI section 4.2/4.3 +SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { + // Build return types (div and rem) + 
std::vector<Type*> RetTyParams; + Type *RetTyElement; + + switch (N->getValueType(0).getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; + case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; + case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; + case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; + } + + RetTyParams.push_back(RetTyElement); + RetTyParams.push_back(RetTyElement); + ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); + Type *RetTy = StructType::get(*DAG.getContext(), ret); + + RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). + SimpleTy); + SDValue InChain = DAG.getEntryNode(); + TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); + bool isSigned = N->getOpcode() == ISD::SREM; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + // Lower call + CallLoweringInfo CLI(DAG); + CLI.setChain(InChain) + .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) + .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + // Return second (rem) result operand (first contains div) + SDNode *ResNode = CallResult.first.getNode(); + assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); + return ResNode->getOperand(1); +} + SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "unsupported target platform"); @@ -11124,8 +11746,8 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -11137,8 +11759,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } bool @@ -11186,7 +11808,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; @@ -11212,7 +11834,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); @@ -11295,8 +11917,6 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; } - Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11392,19 +12012,26 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that // guarantee, see DDI0406C ARM architecture reference manual, // sections A8.8.72-74 LDRD) -bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return (Size == 64) && !Subtarget->isMClass(); + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); return (Size <= (Subtarget->isMClass() ? 32U : 64U)) - ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; +} + +bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { + return true; } // This has so far only been implemented for MachO. @@ -11419,7 +12046,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; // Floating point values and vector values map to the same register file. - // Therefore, althought we could do a store extract of a vector type, this is + // Therefore, although we could do a store extract of a vector type, this is // better to leave at float as we have more freedom in the addressing mode for // those. 
if (VectorTy->isFPOrFPVectorTy()) @@ -11441,6 +12068,14 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } +bool ARMTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget->hasV6T2Ops(); +} + +bool ARMTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget->hasV6T2Ops(); +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11477,6 +12112,14 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast<PointerType>(Addr->getType())->getElementType()); } +void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + if (!Subtarget->hasV7Ops()) + return; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); +} + Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -11534,12 +12177,12 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't - // support i64/f64 element). - if ((VecSize != 64 && VecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vldN doesn't support i64/f64 elements). + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to @@ -11552,9 +12195,6 @@ bool ARMTargetLowering::lowerInterleavedLoad( Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy); - IRBuilder<> Builder(LI); SmallVector<Value *, 2> Ops; @@ -11562,6 +12202,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.getInt32(LI->getAlignment())); + Type *Tys[] = { VecTy, Int8Ptr }; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); // Replace uses of each shufflevector with the corresponding vector loaded @@ -11624,12 +12267,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip illegal sub vector types and vector types of i64/f64 element (vstN - // doesn't support i64/f64 element). - if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vstN doesn't support i64/f64 elements). 
+ if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || + EltIs64Bits) return false; Value *Op0 = SVI->getOperand(0); @@ -11650,17 +12294,18 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, NumSubElts); } - static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, - Intrinsic::arm_neon_vst3, - Intrinsic::arm_neon_vst4}; - Function *VstNFunc = Intrinsic::getDeclaration( - SI->getModule(), StoreInts[Factor - 2], SubVecTy); - + static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; SmallVector<Value *, 6> Ops; Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + Type *Tys[] = { Int8Ptr, SubVecTy }; + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], Tys); + // Split the shufflevector operands into sub vectors for the new vstN call. for (unsigned i = 0; i < Factor; i++) Ops.push_back(Builder.CreateShuffleVector( @@ -11681,14 +12326,14 @@ enum HABaseType { static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members) { - if (const StructType *ST = dyn_cast<StructType>(Ty)) { + if (auto *ST = dyn_cast<StructType>(Ty)) { for (unsigned i = 0; i < ST->getNumElements(); ++i) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) return false; Members += SubMembers; } - } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) return false; @@ -11703,7 +12348,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, return false; Members = 1; Base = HA_DOUBLE; - } else if (const VectorType *VT = dyn_cast<VectorType>(Ty)) { + } else if (auto *VT = dyn_cast<VectorType>(Ty)) { Members = 1; switch (Base) { case HA_FLOAT: @@ -11747,3 +12392,63 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); return IsHA || IsIntArray; } + +unsigned ARMTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; +} + +unsigned ARMTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; +} + +void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in ARMFunctionInfo. 
+ ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void ARMTargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (ARM::GPRRegClass.contains(*I)) + RC = &ARM::GPRRegClass; + else if (ARM::DPRRegClass.contains(*I)) + RC = &ARM::DPRRegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index efc9020..96b56c3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -63,8 +63,6 @@ namespace llvm { BCC_i64, - RBIT, // ARM bitreverse instruction - SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. @@ -79,6 +77,7 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp. EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. TC_RETURN, // Tail call return pseudo. @@ -91,6 +90,7 @@ namespace llvm { PRELOAD, // Preload WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + WIN__DBZCHK, // Windows' divide by zero check VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. @@ -172,12 +172,6 @@ namespace llvm { // BUILD_VECTOR for this purpose. BUILD_VECTOR, - // Floating-point max and min: - FMAX, - FMIN, - VMAXNM, - VMINNM, - // Bit-field insert BFI, @@ -189,6 +183,10 @@ namespace llvm { // Vector bitwise select VBSL, + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. 
+ MEMCPY, + // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, @@ -260,6 +258,7 @@ namespace llvm { SDNode *Node) const override; SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override; @@ -348,6 +347,8 @@ namespace llvm { getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; else if (ConstraintCode.size() == 2) { if (ConstraintCode[0] == 'U') { switch(ConstraintCode[1]) { @@ -420,13 +421,24 @@ namespace llvm { bool functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; - bool hasLoadLinkedStoreConditional() const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const override; Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, @@ -441,16 +453,21 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; bool useLoadStackGuardNode() const override; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -496,6 +513,7 @@ namespace llvm { ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -508,6 +526,7 @@ namespace llvm { SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const; + SDValue LowerGlobalTLSAddressDarwin(SDValue Op, 
SelectionDAG &DAG) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; @@ -526,6 +545,12 @@ namespace llvm { const ARMSubtarget *ST) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; + void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const; + SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, + SDValue &Chain) const; + SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; @@ -555,6 +580,15 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -635,6 +669,8 @@ namespace llvm { MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const; }; enum NEONModImmType { diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp index 84f95be..cf973d6 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -51,7 +51,8 @@ void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { switch (Opc) { - default: break; + default: + break; case ARM::LDR_PRE_IMM: case ARM::LDR_PRE_REG: case ARM::LDR_POST_IMM: @@ -124,82 +125,10 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(), Flag, 4, 4); + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); MIB.addMemOperand(MMO); MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); AddDefaultPred(MIB); } - -namespace { - /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC - /// global base register for ARM ELF. - struct ARMCGBR : public MachineFunctionPass { - static char ID; - ARMCGBR() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (AFI->getGlobalBaseReg() == 0) - return false; - const ARMSubtarget &STI = - static_cast<const ARMSubtarget &>(MF.getSubtarget()); - // Don't do this for Thumb1. 
- if (STI.isThumb1Only()) - return false; - - const TargetMachine &TM = MF.getTarget(); - if (TM.getRelocationModel() != Reloc::PIC_) - return false; - - LLVMContext *Context = &MF.getFunction()->getContext(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - unsigned PCAdj = STI.isThumb() ? 4 : 8; - ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create( - *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj); - - unsigned Align = TM.getDataLayout()->getPrefTypeAlignment( - Type::getInt32PtrTy(*Context)); - unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align); - - MachineBasicBlock &FirstMBB = MF.front(); - MachineBasicBlock::iterator MBBI = FirstMBB.begin(); - DebugLoc DL = FirstMBB.findDebugLoc(MBBI); - unsigned TempReg = - MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); - unsigned Opc = STI.isThumb2() ? ARM::t2LDRpci : ARM::LDRcp; - const TargetInstrInfo &TII = *STI.getInstrInfo(); - MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL, - TII.get(Opc), TempReg) - .addConstantPoolIndex(Idx); - if (Opc == ARM::LDRcp) - MIB.addImm(0); - AddDefaultPred(MIB); - - // Fix the GOT address by adding pc. - unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); - Opc = STI.isThumb2() ? ARM::tPICADD : ARM::PICADD; - MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg) - .addReg(TempReg) - .addImm(ARMPCLabelIndex); - if (Opc == ARM::PICADD) - AddDefaultPred(MIB); - - return true; - } - - const char *getPassName() const override { - return "ARM PIC Global Base Reg Initialization"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - }; -} - -char ARMCGBR::ID = 0; -FunctionPass* -llvm::createARMGlobalBaseRegPass() { return new ARMCGBR(); } diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index 9f5bde3..c446ba3 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -59,6 +59,7 @@ def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; +def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; @@ -70,8 +71,11 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; -def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; -def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; +def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; + +def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, @@ -163,21 +167,23 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain, SDNPSideEffect]>; +def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH", + SDT_ARMEH_SJLJ_SetupDispatch, + [SDNPHasChain, SDNPSideEffect]>; def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, [SDNPHasChain, SDNPSideEffect]>; def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, [SDNPHasChain, 
SDNPMayLoad, SDNPMayStore]>; -def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; - def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; -def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; -def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; +def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. @@ -209,6 +215,8 @@ def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -228,7 +236,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float">; + AssemblerPredicate<"FeatureFP16","half-float conversions">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16","full half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, @@ -236,9 +246,8 @@ def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, AssemblerPredicate<"FeatureT2XtPk", "pack/extract">; -def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">, - AssemblerPredicate<"FeatureDSPThumb2", - "thumb2-dsp">; +def HasDSP : Predicate<"Subtarget->hasDSP()">, + AssemblerPredicate<"FeatureDSP", "dsp">; def HasDB : Predicate<"Subtarget->hasDataBarrier()">, AssemblerPredicate<"FeatureDB", "data-barriers">; @@ -2322,6 +2331,7 @@ def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", let Inst{23-4} = 0b01100000000000000111; let Inst{3-0} = opt; } +def : MnemonicAlias<"smi", "smc">; // Supervisor Call (Software Interrupt) let isCall = 1, Uses = [SP] in { @@ -3671,10 +3681,10 @@ def USAT16 : AI<(outs GPRnopc:$Rd), let Inst{3-0} = Rn; } -def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos), - (SSAT imm:$pos, GPRnopc:$a, 0)>; -def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos), - (USAT imm:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, 0)>; //===----------------------------------------------------------------------===// // Bitwise Instructions. 
@@ -4186,7 +4196,7 @@ def CLZ : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm), def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "rbit", "\t$Rd, $Rm", - [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>, + [(set GPR:$Rd, (bitreverse GPR:$Rm))]>, Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]>; @@ -4578,6 +4588,19 @@ let usesCustomInserter = 1 in { [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; } +let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in { + // %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs... + // Copies N registers worth of memory from address %src to address %dst + // and returns the incremented addresses. N scratch register will + // be attached for the copy to use. + def MEMCPY : PseudoInst< + (outs GPR:$newdst, GPR:$newsrc), + (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops), + NoItinerary, + [(set GPR:$newdst, GPR:$newsrc, + (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>; +} + def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; }]>; @@ -4705,7 +4728,7 @@ def STLEXD : AIstlex<0b01, (outs GPR:$Rd), def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", [(int_arm_clrex)]>, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6K]> { let Inst{31-0} = 0b11110101011111111111000000011111; } @@ -5242,6 +5265,12 @@ def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; +def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, + [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; +let usesCustomInserter = 1, Defs = [CPSR] in + def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary, + [(win__dbzchk GPR:$divisor)]>; + //===----------------------------------------------------------------------===// // TLS Instructions // @@ -5301,6 +5330,10 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), Requires<[IsARM]>; } +let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in +def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary, + [(ARMeh_sjlj_setup_dispatch)]>; + // eh.sjlj.dispatchsetup pseudo-instruction. // This pseudo is used for both ARM and Thumb. Any differences are handled when // the pseudo is expanded (which happens before any passes that need the @@ -5365,6 +5398,27 @@ def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), Requires<[IsARM, UseMovt]>; } // isReMaterializable +// The many different faces of TLS access. 
+def : ARMPat<(ARMWrapper tglobaltlsaddr :$dst), + (MOVi32imm tglobaltlsaddr :$dst)>, + Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapper tglobaltlsaddr:$src), + (LDRLIT_ga_abs tglobaltlsaddr:$src)>, + Requires<[IsARM, DontUseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (MOV_ga_pcrel tglobaltlsaddr:$addr)>, Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsARM, DontUseMovt]>; +let AddedComplexity = 10 in +def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)), + (MOV_ga_pcrel_ldr tglobaltlsaddr:$addr)>, + Requires<[IsARM, UseMovt]>; + + // ConstantPool, GlobalAddress, and JumpTable def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, @@ -5622,16 +5676,16 @@ def : ARMInstAlias<"mvn${s}${p} $Rd, $imm", (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Same for AND <--> BIC def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm", - (ANDri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"bic${s}${p} $Rdn, $imm", - (ANDri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm", - (BICri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rdn, $imm", - (BICri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Likewise, "add Rd, mod_imm_neg" -> sub diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index f035d61..defef4e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -587,11 +587,6 @@ def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; -def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; -def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; - def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); unsigned EltBits = 0; @@ -2465,17 +2460,17 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; // Same as above, but not predicated. 
-class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7, +class N2VDIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> - : N2Vnp<0b10, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), + : N2Vnp<op19_18, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), itin, OpcodeStr, Dt, [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; -class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7, +class N2VQIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> - : N2Vnp<0b10, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), + : N2Vnp<op19_18, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), itin, OpcodeStr, Dt, [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; @@ -3255,6 +3250,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } + def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } // 128-bit vector types. def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, @@ -3275,6 +3277,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } + def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } } @@ -4110,6 +4119,12 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", v2f32, v2f32, fadd, 1>; def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", v4f32, v4f32, fadd, 1>; +def VADDhd : N3VD<0, 0, 0b01, 0b1101, 0, IIC_VBIND, "vadd", "f16", + v4f16, v4f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16", + v8f16, v8f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; // VADDL : Vector Add Long (Q = D + D) defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, "vaddl", "s", add, sext, 1>; @@ -4165,10 +4180,21 @@ def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32", v2f32, v2f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32", v4f32, v4f32, fmul, 1>; +def VMULhd : N3VD<1, 0, 0b01, 0b1101, 1, IIC_VFMULD, "vmul", "f16", + v4f16, v4f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VMULhq : N3VQ<1, 0, 0b01, 0b1101, 1, IIC_VFMULQ, "vmul", "f16", + v8f16, v8f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; defm VMULsl : N3VSL_HS<0b1000, "vmul", mul>; def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, v2f32, fmul>; +def VMULslhd : N3VDSL16<0b01, 0b1001, "vmul", "f16", v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; +def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16, + v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; def : Pat<(v8i16 (mul (v8i16 QPR:$src1), (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), @@ -4277,6 +4303,12 @@ def VMLAfd : 
N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16", + v4f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16", + v8f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4285,6 +4317,12 @@ def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", v4f32, v2f32, fmul_su, fadd_mlx>, Requires<[HasNEON, UseFPVMLx]>; +def VMLAslhd : N3VDMulOpSL16<0b01, 0b0001, IIC_VMACD, "vmla", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16", + v8f16, v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def : Pat<(v8i16 (add (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -4495,6 +4533,12 @@ def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -4503,6 +4547,12 @@ def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", v4f32, v2f32, fmul_su, fsub_mlx>, Requires<[HasNEON, UseFPVMLx]>; +def VMLSslhd : N3VDMulOpSL16<0b01, 0b0101, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16", + v8f16, v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def : Pat<(v8i16 (sub (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -4570,6 +4620,13 @@ def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", v4f32, fmul_su, fadd_mlx>, Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMAhd : N3VDMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACD, "vfma", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; + +def VFMAhq : N3VQMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACQ, "vfma", "f16", + v8f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Fused Vector Multiply Subtract (floating-point) def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", @@ -4578,6 +4635,12 @@ def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", v4f32, fmul_su, fsub_mlx>, Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMShd : N3VDMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACD, "vfms", 
"f16", + v4f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; +def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Match @llvm.fma.* intrinsics def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), @@ -4602,6 +4665,12 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32", v2f32, v2f32, fsub, 0>; def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32", v4f32, v4f32, fsub, 0>; +def VSUBhd : N3VD<0, 0, 0b11, 0b1101, 0, IIC_VBIND, "vsub", "f16", + v4f16, v4f16, fsub, 0>, + Requires<[HasNEON,HasFullFP16]>; +def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16", + v8f16, v8f16, fsub, 0>, + Requires<[HasNEON,HasFullFP16]>; // VSUBL : Vector Subtract Long (Q = D - D) defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, "vsubl", "s", sub, sext, 0>; @@ -4646,6 +4715,12 @@ def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, NEONvceq, 1>; def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; +def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", @@ -4660,6 +4735,12 @@ def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, NEONvcge, 0>; def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; +def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", @@ -4677,6 +4758,12 @@ def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, NEONvcgt, 0>; def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, NEONvcgt, 0>; +def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", @@ -4686,36 +4773,68 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", } // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) -def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", +def VACGEfd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", "f32", v2i32, v2f32, int_arm_neon_vacge, 0>; -def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", +def VACGEfq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", "f32", v4i32, v4f32, int_arm_neon_vacge, 0>; +def VACGEhd : N3VDInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f16", v4i16, v4f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VACGEhq : N3VQInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f16", v8i16, v8f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; // VACGT : Vector Absolute 
Compare Greater Than (aka VCAGT) -def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", +def VACGTfd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>; -def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", +def VACGTfq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>; +def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f16", v8f16, v8f16, int_arm_neon_vacgt, 0>, + Requires<[HasNEON, HasFullFP16]>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", - (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; + (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", - (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; + (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", - (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; + (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", - (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; + (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in { +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm", + (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm", + (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm", + (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm", + (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +} def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", - (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; + (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", - (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; + (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", - (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; + (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", - (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; + (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in { +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm", + (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm", + (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", + (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", + (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +} // Vector Bitwise Operations. 
@@ -5007,6 +5126,12 @@ def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>; def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>; +def VABDhd : N3VDInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBIND, + "vabd", "f16", v4f16, v4f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VABDhq : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ, + "vabd", "f16", v8f16, v8f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, @@ -5014,6 +5139,29 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, "vabdl", "u", int_arm_neon_vabdu, zext, 1>; +def abd_shr : + PatFrag<(ops node:$in1, node:$in2, node:$shift), + (NEONvshrs (sub (zext node:$in1), + (zext node:$in2)), (i32 $shift))>; + +def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))), + (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)), + (zext (v8i8 DPR:$opB))), + (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))), + (VABDLuv8i16 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)), + (v4i32 (add (sub (zext (v4i16 DPR:$opA)), + (zext (v4i16 DPR:$opB))), + (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))), + (VABDLuv4i32 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))), + (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)), + (zext (v2i32 DPR:$opB))), + (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))), + (VABDLuv2i64 DPR:$opA, DPR:$opB)>; + // VABA : Vector Absolute Difference and Accumulate defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, "vaba", "s", int_arm_neon_vabds, add>; @@ -5031,53 +5179,85 @@ defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD, // VMAX : Vector Maximum defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmax", "s", int_arm_neon_vmaxs, 1>; + "vmax", "s", smax, 1>; defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmax", "u", int_arm_neon_vmaxu, 1>; + "vmax", "u", umax, 1>; def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax", "f32", - v2f32, v2f32, int_arm_neon_vmaxs, 1>; + v2f32, v2f32, fmaxnan, 1>; def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", "f32", - v4f32, v4f32, int_arm_neon_vmaxs, 1>; + v4f32, v4f32, fmaxnan, 1>; +def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmax", "f16", + v4f16, v4f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmax", "f16", + v8f16, v8f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; // VMAXNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMAXNMND : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, + def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, N3RegFrm, NoItinerary, "vmaxnm", "f32", - v2f32, v2f32, int_arm_neon_vmaxnm, 1>, + v2f32, v2f32, fmaxnum, 1>, Requires<[HasV8, HasNEON]>; - def VMAXNMNQ : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, + def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, N3RegFrm, NoItinerary, "vmaxnm", "f32", - v4f32, v4f32, int_arm_neon_vmaxnm, 1>, + v4f32, v4f32, 
fmaxnum, 1>, Requires<[HasV8, HasNEON]>; + def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v4f16, v4f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v8f16, v8f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // VMIN : Vector Minimum defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmin", "s", int_arm_neon_vmins, 1>; + "vmin", "s", smin, 1>; defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmin", "u", int_arm_neon_vminu, 1>; + "vmin", "u", umin, 1>; def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin", "f32", - v2f32, v2f32, int_arm_neon_vmins, 1>; + v2f32, v2f32, fminnan, 1>; def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin", "f32", - v4f32, v4f32, int_arm_neon_vmins, 1>; + v4f32, v4f32, fminnan, 1>; +def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmin", "f16", + v4f16, v4f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmin", "f16", + v8f16, v8f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; // VMINNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMINNMND : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, + def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, N3RegFrm, NoItinerary, "vminnm", "f32", - v2f32, v2f32, int_arm_neon_vminnm, 1>, + v2f32, v2f32, fminnum, 1>, Requires<[HasV8, HasNEON]>; - def VMINNMNQ : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, + def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, N3RegFrm, NoItinerary, "vminnm", "f32", - v4f32, v4f32, int_arm_neon_vminnm, 1>, + v4f32, v4f32, fminnum, 1>, Requires<[HasV8, HasNEON]>; + def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v4f16, v4f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v8f16, v8f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // Vector Pairwise Operations. 
@@ -5095,6 +5275,10 @@ def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD, def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, IIC_VPBIND, "vpadd", "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDh : N3VDInt<1, 0, 0b01, 0b1101, 0, N3RegFrm, + IIC_VPBIND, "vpadd", "f16", + v4f16, v4f16, int_arm_neon_vpadd, 0>, + Requires<[HasNEON, HasFullFP16]>; // VPADDL : Vector Pairwise Add Long defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", @@ -5123,6 +5307,9 @@ def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXh : N3VDInt<1, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", + "f16", v4f16, v4f16, int_arm_neon_vpmaxs, 0>, + Requires<[HasNEON, HasFullFP16]>; // VPMIN : Vector Pairwise Minimum def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", @@ -5139,6 +5326,9 @@ def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINh : N3VDInt<1, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", + "f16", v4f16, v4f16, int_arm_neon_vpmins, 0>, + Requires<[HasNEON, HasFullFP16]>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -5155,6 +5345,14 @@ def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, IIC_VUNAQ, "vrecpe", "f32", v4f32, v4f32, int_arm_neon_vrecpe>; +def VRECPEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAD, "vrecpe", "f16", + v4f16, v4f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAQ, "vrecpe", "f16", + v8f16, v8f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; // VRECPS : Vector Reciprocal Step def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, @@ -5163,6 +5361,14 @@ def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrecps", "f32", v4f32, v4f32, int_arm_neon_vrecps, 1>; +def VRECPShd : N3VDInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrecps", "f16", + v4f16, v4f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPShq : N3VQInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrecps", "f16", + v8f16, v8f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; // VRSQRTE : Vector Reciprocal Square Root Estimate def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, @@ -5177,6 +5383,14 @@ def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, IIC_VUNAQ, "vrsqrte", "f32", v4f32, v4f32, int_arm_neon_vrsqrte>; +def VRSQRTEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAD, "vrsqrte", "f16", + v4f16, v4f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAQ, "vrsqrte", "f16", + v8f16, v8f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; // VRSQRTS : Vector Reciprocal Square Root Step def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, @@ -5185,6 +5399,14 @@ def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 
1, N3RegFrm, IIC_VRECSQ, "vrsqrts", "f32", v4f32, v4f32, int_arm_neon_vrsqrts, 1>; +def VRSQRTShd : N3VDInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrsqrts", "f16", + v4f16, v4f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTShq : N3VQInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrsqrts", "f16", + v8f16, v8f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; // Vector Shifts. @@ -5336,6 +5558,14 @@ def VABSfd : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0, def VABSfq : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs", "f32", v4f32, v4f32, fabs>; +def VABShd : N2VD<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v4f16, v4f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; +def VABShq : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v8f16, v8f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))), (v2i32 (bitconvert (v8i8 (add DPR:$src, @@ -5398,6 +5628,16 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, "vneg", "f32", "$Vd, $Vm", "", [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>; +def VNEGhd : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD, + "vneg", "f16", "$Vd, $Vm", "", + [(set DPR:$Vd, (v4f16 (fneg DPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; +def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, + "vneg", "f16", "$Vd, $Vm", "", + [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; @@ -5449,7 +5689,10 @@ def : NEONInstAlias<"vmov${p} $Vd, $Vm", // VMOV : Vector Move (Immediate) -let isReMaterializable = 1 in { +// Although VMOVs are not strictly speaking cheap, they are as expensive +// as their copies counterpart (VORR), so we should prefer rematerialization +// over splitting when it applies. +let isReMaterializable = 1, isAsCheapAsAMove=1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmSplatI8:$SIMM), IIC_VMOVImm, "vmov", "i8", "$Vd, $SIMM", "", @@ -5504,7 +5747,7 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, "vmov", "f32", "$Vd, $SIMM", "", [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>; -} // isReMaterializable +} // isReMaterializable, isAsCheapAsAMove // Add support for bytes replication feature, so it could be GAS compatible. // E.g. 
instructions below: @@ -5868,18 +6111,56 @@ def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", v4f32, v4i32, uint_to_fp>; +def VCVTh2sd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v4i16, v4f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2ud : N2VD<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v4i16, v4f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v4f16, v4i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v4f16, v4i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + +def VCVTh2sq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v8i16, v8f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2uq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v8i16, v8f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v8f16, v8i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v8f16, v8i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + // VCVT{A, N, P, M} multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS, SDPatternOperator IntU> { let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + def SDf : N2VDIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>; - def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + def SQf : N2VQIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>; - def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + def UDf : N2VDIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>; - def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + def UQf : N2VQIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>; + def SDh : N2VDIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v4i16, v4f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def SQh : N2VQIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v8i16, v8f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UDh : N2VDIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v4i16, v4f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UQh : N2VQIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v8i16, v8f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } } @@ -5898,6 +6179,16 @@ def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsd : N2VCvtD<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xud : N2VCvtD<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxu>; +def 
VCVTxs2hd : N2VCvtD<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v4f16, v4i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hd : N2VCvtD<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v4f16, v4i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] } let DecoderMethod = "DecodeVCVTQ" in { @@ -5909,6 +6200,16 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsq : N2VCvtQ<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xuq : N2VCvtQ<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2hq : N2VCvtQ<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v8f16, v8i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hq : N2VCvtQ<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v8f16, v8i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] } def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", @@ -5929,6 +6230,24 @@ def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.s16.f16 $Dd, $Dm, #0", + (VCVTh2sd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Dd, $Dm, #0", + (VCVTh2ud DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.s16 $Dd, $Dm, #0", + (VCVTs2hd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Dd, $Dm, #0", + (VCVTu2hd DPR:$Dd, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vcvt${p}.s16.f16 $Qd, $Qm, #0", + (VCVTh2sq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Qd, $Qm, #0", + (VCVTh2uq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.s16 $Qd, $Qm, #0", + (VCVTs2hq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Qd, $Qm, #0", + (VCVTu2hq QPR:$Qd, QPR:$Qm, pred:$p)>; + // VCVT : Vector Convert Between Half-Precision and Single-Precision. 
def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, @@ -6182,22 +6501,40 @@ def VTBX4Pseudo // VRINT : Vector Rounding multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> { let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary, + def Df : N2VDIntnp<0b10, 0b10, 0b100, 0, NoItinerary, !strconcat("vrint", op), "f32", v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> { let Inst{9-7} = op9_7; } - def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary, + def Qf : N2VQIntnp<0b10, 0b10, 0b100, 0, NoItinerary, !strconcat("vrint", op), "f32", v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> { let Inst{9-7} = op9_7; } + def Dh : N2VDIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v4f16, v4f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } + def Qh : N2VQIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v8f16, v8f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } } def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"), - (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>; + (!cast<Instruction>(NAME#"Df") DPR:$Dd, DPR:$Dm)>; def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"), - (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>; + (!cast<Instruction>(NAME#"Qf") QPR:$Qd, QPR:$Qm)>; + let Predicates = [HasNEON, HasFullFP16] in { + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Dd, $Dm"), + (!cast<Instruction>(NAME#"Dh") DPR:$Dd, DPR:$Dm)>; + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Qd, $Qm"), + (!cast<Instruction>(NAME#"Qh") QPR:$Qd, QPR:$Qm)>; + } } defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>; @@ -6343,8 +6680,8 @@ def : N3VSMulOpPat<fmul, fsub, VFMSfd>, Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N2VSPat<fabs, VABSfd>; def : N2VSPat<fneg, VNEGfd>; -def : N3VSPat<NEONfmax, VMAXfd>; -def : N3VSPat<NEONfmin, VMINfd>; +def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>; +def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>; def : NVCVTFIPat<fp_to_sint, VCVTf2sd>; def : NVCVTFIPat<fp_to_uint, VCVTf2ud>; def : NVCVTIFPat<sint_to_fp, VCVTs2fd>; @@ -7704,6 +8041,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm", (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm", (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Dd, $Dn, $Dm", + (VCGEhd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; // Q-register versions. def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm", (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; @@ -7719,6 +8059,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm", (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm", (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Qd, $Qn, $Qm", + (VCGEhq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; // VCLT (register) is an assembler alias for VCGT w/ the operands reversed. // D-register versions. 
@@ -7736,6 +8079,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm", (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm", (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Dd, $Dn, $Dm", + (VCGThd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; // Q-register versions. def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm", (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; @@ -7751,6 +8097,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm", (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm", (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Qd, $Qn, $Qm", + (VCGThq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; // VSWP allows, but does not require, a type suffix. defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index 40414da..5b1f9a0 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -591,6 +591,34 @@ def tTRAP : TI<(outs), (ins), IIC_Br, // Load Store Instructions. // +// PC-relative loads need to be matched first as constant pool accesses need to +// always be PC-relative. We do this using AddedComplexity, as the pattern is +// simpler than the patterns of the other load instructions. +let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in +def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// SP-relative loads should be matched before standard immediate-offset loads as +// it means we avoid having to move SP to another register. +let canFoldAsLoad = 1 in +def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, + T1LdStSP<{1,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + // Loads: reg/reg and reg/imm5 let canFoldAsLoad = 1, isReMaterializable = 1 in multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, @@ -598,16 +626,20 @@ multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, AddrMode am, InstrItinClass itin_r, InstrItinClass itin_i, string asm, PatFrag opnode> { - def r : // reg/reg - T1pILdStEncode<reg_opc, - (outs tGPR:$Rt), (ins AddrMode_r:$addr), - am, itin_r, asm, "\t$Rt, $addr", - [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; + // Immediate-offset loads should be matched before register-offset loads as + // when the offset is a constant it's simpler to first check if it fits in the + // immediate offset field then fall back to register-offset if it doesn't. def i : // reg/imm5 T1pILdStEncodeImm<imm_opc, 1 /* Load */, (outs tGPR:$Rt), (ins AddrMode_i:$addr), am, itin_i, asm, "\t$Rt, $addr", [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>; + // Register-offset loads are matched last. 
+ def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs tGPR:$Rt), (ins AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; } // Stores: reg/reg and reg/imm5 multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, @@ -615,32 +647,32 @@ multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, AddrMode am, InstrItinClass itin_r, InstrItinClass itin_i, string asm, PatFrag opnode> { - def r : // reg/reg - T1pILdStEncode<reg_opc, - (outs), (ins tGPR:$Rt, AddrMode_r:$addr), - am, itin_r, asm, "\t$Rt, $addr", - [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; def i : // reg/imm5 T1pILdStEncodeImm<imm_opc, 0 /* Store */, (outs), (ins tGPR:$Rt, AddrMode_i:$addr), am, itin_i, asm, "\t$Rt, $addr", [(opnode tGPR:$Rt, AddrMode_i:$addr)]>; + def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs), (ins tGPR:$Rt, AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; } // A8.6.57 & A8.6.60 -defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4, +defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iLoad_r, IIC_iLoad_i, "ldr", UnOpFrag<(load node:$Src)>>; // A8.6.64 & A8.6.61 -defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1, +defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb", UnOpFrag<(zextloadi8 node:$Src)>>; // A8.6.76 & A8.6.73 -defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2, +defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh", UnOpFrag<(zextloadi16 node:$Src)>>; @@ -659,58 +691,36 @@ def tLDRSH : // A8.6.84 "ldrsh", "\t$Rt, $addr", [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>; -let canFoldAsLoad = 1 in -def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, - "ldr", "\t$Rt, $addr", - [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, - T1LdStSP<{1,?,?}> { - bits<3> Rt; - bits<8> addr; - let Inst{10-8} = Rt; - let Inst{7-0} = addr; -} -let canFoldAsLoad = 1, isReMaterializable = 1 in -def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, - "ldr", "\t$Rt, $addr", - [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, - T1Encoding<{0,1,0,0,1,?}> { - // A6.2 & A8.6.59 +def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, + "str", "\t$Rt, $addr", + [(store tGPR:$Rt, t_addrmode_sp:$addr)]>, + T1LdStSP<{0,?,?}> { bits<3> Rt; bits<8> addr; let Inst{10-8} = Rt; - let Inst{7-0} = addr; + let Inst{7-0} = addr; } // A8.6.194 & A8.6.192 -defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, +defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iStore_r, IIC_iStore_i, "str", BinOpFrag<(store node:$LHS, node:$RHS)>>; // A8.6.197 & A8.6.195 -defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1, +defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iStore_bh_r, IIC_iStore_bh_i, "strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; // A8.6.207 & A8.6.205 -defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2, +defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; -def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, - "str", "\t$Rt, $addr", - 
[(store tGPR:$Rt, t_addrmode_sp:$addr)]>, - T1LdStSP<{0,?,?}> { - bits<3> Rt; - bits<8> addr; - let Inst{10-8} = Rt; - let Inst{7-0} = addr; -} - //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // @@ -730,6 +740,7 @@ def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), // Writeback version is just a pseudo, as there's no encoding difference. // Writeback happens iff the base register is not in the destination register // list. +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in def tLDMIA_UPD : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain, "$Rn = $wb", IIC_iLoad_mu>, @@ -1328,16 +1339,16 @@ def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs), (tSUBrr tGPR:$lhs, tGPR:$rhs)>; // Bswap 16 with load/store -def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rrs2:$addr)), (i32 16)), - (tREV16 (tLDRHr t_addrmode_rrs2:$addr))>; def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)), (tREV16 (tLDRHi t_addrmode_is2:$addr))>; -def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), - t_addrmode_rrs2:$addr), - (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rrs2:$addr)>; +def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)), + (tREV16 (tLDRHr t_addrmode_rr:$addr))>; def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), t_addrmode_is2:$addr), (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>; +def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), + t_addrmode_rr:$addr), + (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>; // ConstantPool def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; @@ -1355,6 +1366,14 @@ def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src), (ARMWrapper tglobaladdr:$src))]>, Requires<[IsThumb, DontUseMovt]>; +// TLS globals +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; +def : Pat<(ARMWrapper tglobaltlsaddr:$addr), + (tLDRLIT_ga_abs tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; + // JumpTable def : T1Pat<(ARMWrapperJT tjumptable:$dst), @@ -1372,10 +1391,10 @@ def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>, Requires<[IsThumb, HasV5T]>; // zextload i1 -> zextload i8 -def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr), - (tLDRBr t_addrmode_rrs1:$addr)>; def : T1Pat<(zextloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(zextloadi1 t_addrmode_rr:$addr), + (tLDRBr t_addrmode_rr:$addr)>; // extload from the stack -> word load from the stack, as it avoids having to // materialize the base in a separate register. 
This only works when a word @@ -1389,61 +1408,61 @@ def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>, Requires<[IsThumb, IsThumb1Only, IsLE]>; // extload -> zextload -def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; -def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; -def : T1Pat<(extloadi8 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; -def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; -def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>; -def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>; // If it's impossible to use [r,r] address mode for sextload, select to // ldr{b|h} + sxt{b|h} instead. def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), (tSXTB (tLDRBi t_addrmode_is1:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), - (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>, +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tSXTB (tLDRBr t_addrmode_rr:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), (tSXTH (tLDRHi t_addrmode_is2:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), - (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>, +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tSXTH (tLDRHr t_addrmode_rr:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), - (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>; def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>; -def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), - (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>; +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>; def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>; +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>; def : T1Pat<(atomic_load_8 t_addrmode_is1:$src), (tLDRBi t_addrmode_is1:$src)>; -def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src), - (tLDRBr t_addrmode_rrs1:$src)>; +def : T1Pat<(atomic_load_8 t_addrmode_rr:$src), + (tLDRBr t_addrmode_rr:$src)>; def : T1Pat<(atomic_load_16 t_addrmode_is2:$src), (tLDRHi t_addrmode_is2:$src)>; -def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src), - (tLDRHr t_addrmode_rrs2:$src)>; +def : T1Pat<(atomic_load_16 t_addrmode_rr:$src), + (tLDRHr t_addrmode_rr:$src)>; def : T1Pat<(atomic_load_32 t_addrmode_is4:$src), (tLDRi t_addrmode_is4:$src)>; -def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src), - (tLDRr t_addrmode_rrs4:$src)>; +def : T1Pat<(atomic_load_32 t_addrmode_rr:$src), + (tLDRr t_addrmode_rr:$src)>; def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val), (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>; -def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val), - (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>; 
+def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>; def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val), (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>; -def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val), - (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>; +def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>; def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val), (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>; -def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val), - (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>; +def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRr tGPR:$val, t_addrmode_rr:$ptr)>; // Large immediate handling. diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index aba8a7b..f42f456 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -43,7 +43,7 @@ def t2_shift_imm : Operand<i32> { // Shifted operands. No register controlled shifts for Thumb2. // Note: We do not support rrx shifted operands yet. def t2_so_reg : Operand<i32>, // reg imm - ComplexPattern<i32, 2, "SelectT2ShifterOperandReg", + ComplexPattern<i32, 2, "SelectShiftImmShifterOperand", [shl,srl,sra,rotr]> { let EncoderMethod = "getT2SORegOpValue"; let PrintMethod = "printT2SOOperand"; @@ -1554,19 +1554,21 @@ def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>; def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants -// For disassembly only. +let mayLoad = 1 in def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> { let DecoderMethod = "DecodeT2LDRDPreInstruction"; } +let mayLoad = 1 in def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm", "$addr.base = $wb", []>; +let mayStore = 1 in def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!", @@ -1574,6 +1576,7 @@ def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), let DecoderMethod = "DecodeT2STRDPreInstruction"; } +let mayStore = 1 in def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr, t2am_imm8s4_offset:$imm), @@ -2100,7 +2103,7 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR), def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-24} = 0b010; let Inst{23} = 0b1; @@ -2117,7 +2120,7 @@ class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc, dag iops = (ins rGPR:$Rn, rGPR:$Rm), string asm = "\t$Rd, $Rn, $Rm"> : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0101; let Inst{22-20} = op22_20; @@ -2215,13 +2218,13 @@ class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let 
Inst{15-12} = 0b1111; } def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary, "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; // Signed/Unsigned saturate. class T2SatI<dag oops, dag iops, InstrItinClass itin, @@ -2254,7 +2257,7 @@ def t2SSAT: T2SatI< def t2SSAT16: T2SatI< (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2278,7 +2281,7 @@ def t2USAT: T2SatI< def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-22} = 0b1111001110; let Inst{20} = 0; let Inst{15} = 0; @@ -2288,8 +2291,8 @@ def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), let Inst{5-4} = 0b00; } -def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>; -def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>; //===----------------------------------------------------------------------===// // Shift and rotate Instructions. @@ -2605,7 +2608,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110, (outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; } // hasSideEffects // Rounding variants of the below included for disassembly only @@ -2614,7 +2617,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110, def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2624,7 +2627,7 @@ def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2636,7 +2639,7 @@ def t2SMMLA : T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2646,7 +2649,7 @@ def t2SMMLA : T2FourReg< def t2SMMLAR: T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2657,7 +2660,7 @@ def t2SMMLS: T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, 
UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; @@ -2667,7 +2670,7 @@ def t2SMMLS: T2FourReg< def t2SMMLSR:T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; @@ -2679,7 +2682,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), (sext_inreg rGPR:$Rm, i16)))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2692,7 +2695,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), (sra rGPR:$Rm, (i32 16))))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2705,7 +2708,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), (sext_inreg rGPR:$Rm, i16)))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2718,7 +2721,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), (sra rGPR:$Rm, (i32 16))))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2730,7 +2733,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2742,7 +2745,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2760,7 +2763,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), (sext_inreg rGPR:$Rm, i16))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2773,7 +2776,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), (sra rGPR:$Rm, (i32 16)))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2786,7 +2789,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), (sext_inreg rGPR:$Rm, i16))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 
0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2799,7 +2802,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), (sra rGPR:$Rm, (i32 16)))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2811,7 +2814,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2823,7 +2826,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2839,79 +2842,79 @@ defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; // Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD def t2SMUAD: T2ThreeReg_mac< 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUADX:T2ThreeReg_mac< 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUSD: T2ThreeReg_mac< 0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUSDX:T2ThreeReg_mac< 0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMLAD : T2FourReg_mac< 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, 
HasDSP]>; def t2SMLADX : T2FourReg_mac< 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSD : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; //===----------------------------------------------------------------------===// // Division Instructions. @@ -2961,7 +2964,7 @@ def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, "rbit", "\t$Rd, $Rm", - [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>, + [(set rGPR:$Rd, (bitreverse rGPR:$Rm))]>, Sched<[WriteALU]>; def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, @@ -3872,6 +3875,13 @@ def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), } +def : T2Pat<(ARMWrapperPIC tglobaltlsaddr :$dst), + (t2MOV_ga_pcrel tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; +def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst), + (t2MOVi32imm tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; + // ConstantPool, GlobalAddress, and JumpTable def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index e83f8c8..63e7940 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -20,7 +20,6 @@ def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; - //===----------------------------------------------------------------------===// // Operand Definitions. // @@ -93,7 +92,7 @@ def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), IIC_fpLoad32, "vldr", "\t$Sd, $addr", - [(set SPR:$Sd, (load addrmode5:$addr))]> { + [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> { // Some single precision VFP instructions may be executed on both NEON and VFP // pipelines. 
let D = VFPNeonDomain; @@ -107,7 +106,7 @@ def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), IIC_fpStore32, "vstr", "\t$Sd, $addr", - [(store SPR:$Sd, addrmode5:$addr)]> { + [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> { // Some single precision VFP instructions may be executed on both NEON and VFP // pipelines. let D = VFPNeonDomain; @@ -393,8 +392,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { } } -defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>; -defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>; +defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; +defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; // Match reassociated forms only if not sign dependent rounding. def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -541,19 +540,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, // FIXME: Verify encoding after integrated assembler is working. def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), @@ -922,6 +925,22 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, let isRegSequence = 1; } +// Hoist an fabs or a fneg of a value coming from integer registers +// and do the fabs/fneg on the integer value. This is never a lose +// and could enable the conversion to float to be removed completely. 
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsARM, HasV6T2]>; +def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsThumb2, HasV6T2]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsARM]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsThumb2]>; + let hasSideEffects = 0 in def VMOVSRR : AVConv5I<0b11000100, 0b1010, (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2), @@ -1003,7 +1022,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(f64 (sint_to_fp GPR:$a)), (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (sint_to_fp (i32 (load addrmode5:$a)))), + def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOD (VLDRS addrmode5:$a))>; } @@ -1021,7 +1040,7 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (load addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, @@ -1035,7 +1054,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(f64 (uint_to_fp GPR:$a)), (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (uint_to_fp (i32 (load addrmode5:$a)))), + def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOD (VLDRS addrmode5:$a))>; } @@ -1053,7 +1072,7 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (load addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; // FP -> Int: @@ -1106,7 +1125,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; - def : VFPPat<(store (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; } @@ -1124,7 +1143,8 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(store (i32 (fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), + addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, @@ -1138,7 +1158,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; - def : VFPPat<(store (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; } @@ -1156,7 +1176,8 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(store (i32 (fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), + addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), 
addrmode5:$ptr)>; // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 265b86f..6e7e47b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -60,17 +60,24 @@ STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm"); STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's"); STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); +namespace llvm { +void initializeARMLoadStoreOptPass(PassRegistry &); +} + +#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass" + namespace { /// Post- register allocation pass the combine load / store instructions to /// form ldm / stm instructions. struct ARMLoadStoreOpt : public MachineFunctionPass { static char ID; - ARMLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const MachineFunction *MF; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; const ARMSubtarget *STI; const TargetLowering *TL; ARMFunctionInfo *AFI; @@ -84,7 +91,7 @@ namespace { bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM load / store optimization pass"; + return ARM_LOAD_STORE_OPT_NAME; } private: @@ -118,6 +125,7 @@ namespace { }; SpecificBumpPtrAllocator<MergeCandidate> Allocator; SmallVector<const MergeCandidate*,4> Candidates; + SmallVector<MachineInstr*,4> MergeBaseCandidates; void moveLiveRegsBefore(const MachineBasicBlock &MBB, MachineBasicBlock::const_iterator Before); @@ -140,12 +148,16 @@ namespace { MachineBasicBlock::iterator &MBBI); bool MergeBaseUpdateLoadStore(MachineInstr *MI); bool MergeBaseUpdateLSMultiple(MachineInstr *MI); + bool MergeBaseUpdateLSDouble(MachineInstr &MI) const; bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); + bool CombineMovBx(MachineBasicBlock &MBB); }; char ARMLoadStoreOpt::ID = 0; } +INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false) + static bool definesCPSR(const MachineInstr *MI) { for (const auto &MO : MI->operands()) { if (!MO.isReg()) @@ -619,9 +631,10 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, unsigned NewBase; if (isi32Load(Opcode)) { - // If it is a load, then just use one of the destination register to - // use as the new base. + // If it is a load, then just use one of the destination registers + // as the new base. Will no longer be writeback in Thumb1. NewBase = Regs[NumRegs-1].first; + Writeback = false; } else { // Find a free register that we can use as scratch register. moveLiveRegsBefore(MBB, InsertBefore); @@ -725,9 +738,12 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, MachineInstrBuilder MIB; if (Writeback) { - if (Opcode == ARM::tLDMIA) + assert(isThumb1 && "expected Writeback only inThumb1"); + if (Opcode == ARM::tLDMIA) { + assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs"); // Update tLDMIA with writeback if necessary. 
Opcode = ARM::tLDMIA_UPD; + } MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode)); @@ -784,6 +800,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { SmallVector<std::pair<unsigned, bool>, 8> Regs; SmallVector<unsigned, 4> ImpDefs; DenseSet<unsigned> KilledRegs; + DenseSet<unsigned> UsedRegs; // Determine list of registers and list of implicit super-register defs. for (const MachineInstr *MI : Cand.Instrs) { const MachineOperand &MO = getLoadStoreRegOp(*MI); @@ -792,6 +809,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { if (IsKill) KilledRegs.insert(Reg); Regs.push_back(std::make_pair(Reg, IsKill)); + UsedRegs.insert(Reg); if (IsLoad) { // Collect any implicit defs of super-registers, after merging we can't @@ -881,7 +899,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { for (MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.isKill()) continue; - if (KilledRegs.count(MO.getReg())) + if (UsedRegs.count(MO.getReg())) MO.setIsKill(false); } } @@ -995,76 +1013,6 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { } while (SIndex < EIndex); } -static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; - - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::tSUBi8: - case ARM::t2SUBri: - case ARM::SUBri: - CheckCPSRDef = true; - break; - case ARM::tSUBspi: - break; - } - - // Make sure the offset fits in 8 bits. - if (Bytes == 0 || (Limit && Bytes >= Limit)) - return false; - - unsigned Scale = (MI->getOpcode() == ARM::tSUBspi || - MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm() * Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; - - return CheckCPSRDef ? !definesCPSR(MI) : true; -} - -static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; - - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::tADDi8: - case ARM::t2ADDri: - case ARM::ADDri: - CheckCPSRDef = true; - break; - case ARM::tADDspi: - break; - } - - if (Bytes == 0 || (Limit && Bytes >= Limit)) - // Make sure the offset fits in 8 bits. - return false; - - unsigned Scale = (MI->getOpcode() == ARM::tADDspi || - MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm() * Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; - - return CheckCPSRDef ? !definesCPSR(MI) : true; -} - static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, ARM_AM::AMSubMode Mode) { switch (Opc) { @@ -1132,6 +1080,75 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } } +/// Check if the given instruction increments or decrements a register and +/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags +/// generated by the instruction are possibly read as well. 
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg) { + bool CheckCPSRDef; + int Scale; + switch (MI.getOpcode()) { + case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break; + case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break; + case ARM::t2SUBri: + case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break; + case ARM::t2ADDri: + case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break; + case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break; + case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break; + default: return 0; + } + + unsigned MIPredReg; + if (MI.getOperand(0).getReg() != Reg || + MI.getOperand(1).getReg() != Reg || + getInstrPredicate(&MI, MIPredReg) != Pred || + MIPredReg != PredReg) + return 0; + + if (CheckCPSRDef && definesCPSR(&MI)) + return 0; + return MI.getOperand(2).getImm() * Scale; +} + +/// Searches for an increment or decrement of \p Reg before \p MBBI. +static MachineBasicBlock::iterator +findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (MBBI == BeginMBBI) + return EndMBBI; + + // Skip debug values. + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI) + --PrevMBBI; + + Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : PrevMBBI; +} + +/// Searches for a increment or decrement of \p Reg after \p MBBI. +static MachineBasicBlock::iterator +findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + // Skip debug values. + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if (NextMBBI == EndMBBI) + return EndMBBI; + + Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : NextMBBI; +} + /// Fold proceeding/trailing inc/dec of base register into the /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: /// @@ -1151,7 +1168,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { const MachineOperand &BaseOP = MI->getOperand(0); unsigned Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); unsigned Opcode = MI->getOpcode(); @@ -1163,49 +1179,24 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (MI->getOperand(i).getReg() == Base) return false; - bool DoMerge = false; - ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); - - // Try merging with the previous instruction. 
+ int Bytes = getLSMultipleTransferSize(MI); MachineBasicBlock &MBB = *MI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (Mode == ARM_AM::ia && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::db; - DoMerge = true; - } else if (Mode == ARM_AM::ib && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::da; - DoMerge = true; - } - if (DoMerge) - MBB.erase(PrevMBBI); - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && - isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && - isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } - if (DoMerge) - MBB.erase(NextMBBI); + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); + if (Mode == ARM_AM::ia && Offset == -Bytes) { + Mode = ARM_AM::db; + } else if (Mode == ARM_AM::ib && Offset == -Bytes) { + Mode = ARM_AM::da; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && + ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) + return false; } - - if (!DoMerge) - return false; + MBB.erase(MergeInstr); unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) @@ -1283,7 +1274,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || @@ -1295,7 +1285,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) return false; - bool isLd = isLoadSingle(Opcode); // Can't do the merge if the destination register is the same as the would-be // writeback register. if (MI->getOperand(0).getReg() == Base) @@ -1303,55 +1292,31 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - bool DoMerge = false; - ARM_AM::AddrOpc AddSub = ARM_AM::add; - unsigned NewOpc = 0; - // AM2 - 12 bits, thumb2 - 8 bits. - unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); - - // Try merging with the previous instruction. 
+ int Bytes = getLSMultipleTransferSize(MI); MachineBasicBlock &MBB = *MI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (!isAM5 && - isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); - MBB.erase(PrevMBBI); - } - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if (!isAM5 && - isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); - MBB.erase(NextMBBI); - } + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + unsigned NewOpc; + if (!isAM5 && Offset == Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (Offset == -Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (!isAM5 && Offset == -Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else + return false; } + MBB.erase(MergeInstr); - if (!DoMerge) - return false; + ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add; + bool isLd = isLoadSingle(Opcode); if (isAM5) { // VLDM[SD]_UPD, VSTM[SD]_UPD // (There are no base-updating versions of VLDR/VSTR instructions, but the @@ -1368,18 +1333,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) @@ -1391,13 +1354,12 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. 
if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) @@ -1409,46 +1371,75 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { return true; } -/// Returns true if instruction is a memory operation that this pass is capable -/// of operating on. -static bool isMemoryOp(const MachineInstr *MI) { - // When no memory operands are present, conservatively assume unaligned, - // volatile, unfoldable. - if (!MI->hasOneMemOperand()) +bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && + "Must have t2STRDi8 or t2LDRDi8"); + if (MI.getOperand(3).getImm() != 0) return false; - const MachineMemOperand *MMO = *MI->memoperands_begin(); - - // Don't touch volatile memory accesses - we may be changing their order. - if (MMO->isVolatile()) + // Behaviour for writeback is undefined if base register is the same as one + // of the others. + const MachineOperand &BaseOp = MI.getOperand(2); + unsigned Base = BaseOp.getReg(); + const MachineOperand &Reg0Op = MI.getOperand(0); + const MachineOperand &Reg1Op = MI.getOperand(1); + if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) return false; - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is - // not. - if (MMO->getAlignment() < 4) - return false; + unsigned PredReg; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + MachineBasicBlock::iterator MBBI(MI); + MachineBasicBlock &MBB = *MI.getParent(); + int Offset; + MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, + PredReg, Offset); + unsigned NewOpc; + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST; + } else + return false; + } + MBB.erase(MergeInstr); - // str <undef> could probably be eliminated entirely, but for now we just want - // to avoid making a mess of it. - // FIXME: Use str <undef> as a wildcard to enable better stm folding. 
- if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && - MI->getOperand(0).isUndef()) - return false; + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { + MIB.addOperand(Reg0Op).addOperand(Reg1Op) + .addReg(BaseOp.getReg(), RegState::Define); + } else { + assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); + MIB.addReg(BaseOp.getReg(), RegState::Define) + .addOperand(Reg0Op).addOperand(Reg1Op); + } + MIB.addReg(BaseOp.getReg(), RegState::Kill) + .addImm(Offset).addImm(Pred).addReg(PredReg); + assert(TII->get(Opcode).getNumOperands() == 6 && + TII->get(NewOpc).getNumOperands() == 7 && + "Unexpected number of operands in Opcode specification."); - // Likewise don't mess with references to undefined addresses. - if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() && - MI->getOperand(1).isUndef()) - return false; + // Transfer implicit operands. + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - unsigned Opcode = MI->getOpcode(); + MBB.erase(MBBI); + return true; +} + +/// Returns true if instruction is a memory operation that this pass is capable +/// of operating on. +static bool isMemoryOp(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); switch (Opcode) { - default: break; case ARM::VLDRS: case ARM::VSTRS: - return MI->getOperand(1).isReg(); case ARM::VLDRD: case ARM::VSTRD: - return MI->getOperand(1).isReg(); case ARM::LDRi12: case ARM::STRi12: case ARM::tLDRi: @@ -1459,9 +1450,40 @@ static bool isMemoryOp(const MachineInstr *MI) { case ARM::t2LDRi12: case ARM::t2STRi8: case ARM::t2STRi12: - return MI->getOperand(1).isReg(); + break; + default: + return false; } - return false; + if (!MI.getOperand(1).isReg()) + return false; + + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand &MMO = **MI.memoperands_begin(); + + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO.isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO.getAlignment() < 4) + return false; + + // str <undef> could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str <undef> as a wildcard to enable better stm folding. + if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef()) + return false; + + // Likewise don't mess with references to undefined addresses. 
+ if (MI.getOperand(1).isUndef()) + return false; + + return true; } static void InsertLDR_STR(MachineBasicBlock &MBB, @@ -1616,6 +1638,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { ARMCC::CondCodes CurrPred = ARMCC::AL; unsigned Position = 0; assert(Candidates.size() == 0); + assert(MergeBaseCandidates.size() == 0); LiveRegsValid = false; for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); @@ -1626,7 +1649,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { continue; ++Position; - if (isMemoryOp(MBBI)) { + if (isMemoryOp(*MBBI)) { unsigned Opcode = MBBI->getOpcode(); const MachineOperand &MO = MBBI->getOperand(0); unsigned Reg = MO.getReg(); @@ -1694,8 +1717,15 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MBBI = I; --Position; // Fallthrough to look into existing chain. - } else if (MBBI->isDebugValue()) + } else if (MBBI->isDebugValue()) { continue; + } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || + MBBI->getOpcode() == ARM::t2STRDi8) { + // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions + // remember them because we may still be able to merge add/sub into them. + MergeBaseCandidates.push_back(MBBI); + } + // If we are here then the chain is broken; Extract candidates for a merge. if (MemOps.size() > 0) { @@ -1726,7 +1756,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (Merged) { Changed = true; unsigned Opcode = Merged->getOpcode(); - if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8) + if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) + MergeBaseUpdateLSDouble(*Merged); + else MergeBaseUpdateLSMultiple(Merged); } else { for (MachineInstr *MI : Candidate->Instrs) { @@ -1741,6 +1773,10 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } Candidates.clear(); + // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. + for (MachineInstr *MI : MergeBaseCandidates) + MergeBaseUpdateLSDouble(*MI); + MergeBaseCandidates.clear(); return Changed; } @@ -1765,7 +1801,11 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET || MBBI->getOpcode() == ARM::MOVPCLR)) { - MachineInstr *PrevMI = std::prev(MBBI); + MachineBasicBlock::iterator PrevI = std::prev(MBBI); + // Ignore any DBG_VALUE instructions. 
+ while (PrevI->isDebugValue() && PrevI != MBB.begin()) + --PrevI; + MachineInstr *PrevMI = PrevI; unsigned Opcode = PrevMI->getOpcode(); if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD || Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD || @@ -1786,6 +1826,30 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { return false; } +bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + if (MBBI == MBB.begin() || MBBI == MBB.end() || + MBBI->getOpcode() != ARM::tBX_RET) + return false; + + MachineBasicBlock::iterator Prev = MBBI; + --Prev; + if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR)) + return false; + + for (auto Use : Prev->uses()) + if (Use.isKill()) { + AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX)) + .addReg(Use.getReg(), RegState::Kill)) + .copyImplicitOps(&*MBBI); + MBB.erase(MBBI); + MBB.erase(Prev); + return true; + } + + llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?"); +} + bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { MF = &Fn; STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget()); @@ -1793,7 +1857,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { AFI = Fn.getInfo<ARMFunctionInfo>(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); - MRI = &Fn.getRegInfo(); + RegClassInfoValid = false; isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; @@ -1805,18 +1869,29 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps()) Modified |= MergeReturnIntoLDM(MBB); + if (isThumb1) + Modified |= CombineMovBx(MBB); } Allocator.DestroyAll(); return Modified; } +namespace llvm { +void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +} + +#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \ + "ARM pre- register allocation load / store optimization pass" + namespace { /// Pre- register allocation pass that move load / stores from consecutive /// locations close to make it more likely they will be combined later. 
struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ static char ID; - ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const DataLayout *TD; const TargetInstrInfo *TII; @@ -1828,7 +1903,7 @@ namespace { bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM pre- register allocation load / store optimization pass"; + return ARM_PREALLOC_LOAD_STORE_OPT_NAME; } private: @@ -1847,8 +1922,11 @@ namespace { char ARMPreAllocLoadStoreOpt::ID = 0; } +INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt", + ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) + bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TD = Fn.getTarget().getDataLayout(); + TD = &Fn.getDataLayout(); STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget()); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); @@ -1856,9 +1934,8 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { MF = &Fn; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) - Modified |= RescheduleLoadStoreInstrs(MFI); + for (MachineBasicBlock &MFI : Fn) + Modified |= RescheduleLoadStoreInstrs(&MFI); return Modified; } @@ -1909,23 +1986,6 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, return AddedRegPressure.size() <= MemRegs.size() * 2; } - -/// Copy \p Op0 and \p Op1 operands into a new array assigned to MI. -static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, - MachineInstr *Op1) { - assert(MI->memoperands_empty() && "expected a new machineinstr"); - size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) - + (Op1->memoperands_end() - Op1->memoperands_begin()); - - MachineFunction *MF = MI->getParent()->getParent(); - MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); - MachineSDNode::mmo_iterator MemEnd = - std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); - MemEnd = - std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); - MI->setMemRefs(MemBegin, MemEnd); -} - bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc, @@ -2119,7 +2179,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumLDRDFormed; } else { @@ -2133,7 +2193,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumSTRDFormed; } @@ -2187,7 +2247,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!MI->isDebugValue()) MI2LocMap[MI] = ++Loc; - if (!isMemoryOp(MI)) + if (!isMemoryOp(*MI)) continue; unsigned PredReg = 0; if (getInstrPredicate(MI, PredReg) != ARMCC::AL) @@ -2275,3 +2335,4 @@ FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { return new ARMPreAllocLoadStoreOpt(); return new ARMLoadStoreOpt(); } + diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp 
b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index f5250ff..71ad7a4 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- ARMMachineFuctionInfo.cpp - ARM machine function info -------------===// +//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===// // // The LLVM Compiler Infrastructure // @@ -21,4 +21,4 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), - GlobalBaseReg(0) {} + IsSplitCSR(false) {} diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 14dd9ef..68f9aec 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===// +//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -52,7 +52,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { unsigned ReturnRegsCount; /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). + /// determineCalleeSaves(). bool HasStackFrame; /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by @@ -110,11 +110,6 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// pass. DenseMap<unsigned, unsigned> CPEClones; - /// GlobalBaseReg - keeps track of the virtual register initialized for - /// use as the global base register. This is used for PIC in some PIC - /// relocation models. - unsigned GlobalBaseReg; - /// ArgumentStackSize - amount of bytes on stack consumed by the arguments /// being passed on the stack unsigned ArgumentStackSize; @@ -123,6 +118,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// coalesced weights. DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. 
+ bool IsSplitCSR; + public: ARMFunctionInfo() : isThumb(false), @@ -133,7 +132,7 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), NumAlignedDPRCS2Regs(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} + VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false) {} explicit ARMFunctionInfo(MachineFunction &MF); @@ -204,8 +203,8 @@ public: bool hasITBlocks() const { return HasITBlocks; } void setHasITBlocks(bool h) { HasITBlocks = h; } - unsigned getGlobalBaseReg() const { return GlobalBaseReg; } - void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td index 45cc9ea..02cbfb1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -266,12 +266,19 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { } // Scalar single precision floating point register class.. -// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to -// avoid partial-write dependencies on D registers (S registers are -// renamed as portions of D registers). -def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate - (sequence "S%u", 0, 31), 2), - (sequence "S%u", 0, 31))>; +// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack +// to avoid partial-write dependencies on D or Q (depending on platform) +// registers (S registers are renamed as portions of D/Q registers). +def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> { + let AltOrders = [(add (decimate SPR, 2), SPR), + (add (decimate SPR, 4), + (decimate SPR, 2), + (decimate (rotl SPR, 1), 4), + (decimate (rotl SPR, 1), 2))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF); + }]; +} // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations @@ -281,25 +288,29 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>; // class. // ARM requires only word alignment for double. It's more performant if it // is double-word alignment though. -def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (sequence "D%u", 0, 31)> { - // Allocate non-VFP2 registers D16-D31 first. - let AltOrders = [(rotl DPR, 16)]; - let AltOrderSelect = [{ return 1; }]; + // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on + // Darwin platforms. + let AltOrders = [(rotl DPR, 16), + (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF); + }]; } // Subset of DPR that are accessible with VFP2 (and so that also have // 32-bit SPR subregs). 
-def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (trunc DPR, 16)>; // Subset of DPR which can be used as a source of NEON scalars for 16-bit // operations -def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (trunc DPR, 8)>; // Generic 128-bit vector register class. -def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, +def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (sequence "Q%u", 0, 15)> { // Allocate non-VFP2 aliases Q8-Q15 first. let AltOrders = [(rotl QPR, 8)]; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td index b03d5ff..3ad7730 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td @@ -37,1050 +37,13 @@ def SW_FDIV : FuncUnit; // FIXME: Add preload instruction when it is documented. // FIXME: Model non-pipelined nature of FP div / sqrt unit. -def SwiftItineraries : ProcessorItineraries< - [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [ - // - // Move instructions, unconditional - InstrItinData<IIC_iMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2]>, - InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3]>, - InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_LS]>], - [5]>, - // - // MVN instructions - InstrItinData<IIC_iMVNi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - // - // No operand cycles - InstrItinData<IIC_iALUx , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>]>, - // - // Binary Instructions that produce a result - InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - 
InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1]>, - // - // Bitwise Instructions that produce a result - InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1]>, - // - // Unary Instructions that produce a result - - // CLZ, RBIT, etc. - InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - - // BFC, BFI, UBFX, SBFX - InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1]>, - - // - // Zero and sign extension instructions - InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1, 1]>, - // - // Compare instructions - InstrItinData<IIC_iCMPi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iCMPr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMPsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMPsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - // - // Test instructions - InstrItinData<IIC_iTSTi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iTSTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iTSTsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iTSTsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - // - // Move instructions, conditional - // FIXME: Correctly model the extra input dep on the destination. 
- InstrItinData<IIC_iCMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iCMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2]>, - - // Integer multiply pipeline - // - InstrItinData<IIC_iMUL16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [3, 1, 1]>, - InstrItinData<IIC_iMAC16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - InstrItinData<IIC_iMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - InstrItinData<IIC_iMUL64 , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0], 3>, - InstrStage<1, [SW_ALU0]>], - [5, 5, 1, 1]>, - InstrItinData<IIC_iMAC64 , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 6, 1, 1]>, - // - // Integer divide - InstrItinData<IIC_iDIV , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 0>, - InstrStage<14, [SW_IDIV]>], - [14, 1, 1]>, - - // Integer load pipeline - // FIXME: The timings are some rough approximations - // - // Immediate offset - InstrItinData<IIC_iLoad_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Register offset - InstrItinData<IIC_iLoad_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3, 4, 1, 1]>, - // - // Scaled register offset - InstrItinData<IIC_iLoad_si , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [5, 1, 1]>, - InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [5, 1, 1]>, - // - // Immediate offset with update - InstrItinData<IIC_iLoad_iu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - 
InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - // - // Register offset with update - InstrItinData<IIC_iLoad_ru , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3, 4, 1, 1]>, - // - // Scaled register offset with update - InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 3, 1, 1]>, - InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 3, 1, 1]>, - // - // Load multiple, def is the 5th operand. - // FIXME: This assumes 3 to 4 registers. - InstrItinData<IIC_iLoad_m , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1, 3], [], -1>, // dynamic uops - - // - // Load multiple + update, defs are the 1st and 5th operands. - InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1, 3], [], -1>, // dynamic uops - // - // Load multiple plus branch - InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1, 3], [], -1>, // dynamic uops - // - // Pop, def is the 3rd operand. - InstrItinData<IIC_iPop , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 3], [], -1>, // dynamic uops - // - // Pop + branch, def is the 3rd operand. - InstrItinData<IIC_iPop_Br, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 3], [], -1>, // dynamic uops - - // - // iLoadi + iALUr for t2LDRpci_pic. 
- InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [4, 1]>, - - // Integer store pipeline - /// - // Immediate offset - InstrItinData<IIC_iStore_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // Register offset - InstrItinData<IIC_iStore_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Scaled register offset - InstrItinData<IIC_iStore_si , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Immediate offset with update - InstrItinData<IIC_iStore_iu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Register offset with update - InstrItinData<IIC_iStore_ru , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - // - // Scaled register offset with update - InstrItinData<IIC_iStore_siu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>], - [3, 1, 1, 1]>, - // - // Store multiple - InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [], [], -1>, // dynamic uops - // - // Store multiple + update - InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - 
InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [2], [], -1>, // dynamic uops - - // - // Preload - InstrItinData<IIC_Preload, [InstrStage<1, [SW_DIS0], 0>], [1, 1]>, - - // Branch - // - // no delay slots, so the latency of a branch is unimportant - InstrItinData<IIC_Br , [InstrStage<1, [SW_DIS0], 0>]>, - - // FP Special Register to Integer Register File Move - InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - // - // Single-precision FP Unary - // - // Most floating-point moves get issued on ALU0. - InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Double-precision FP Unary - InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - - // - // Single-precision FP Compare - InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [1, 1]>, - // - // Double-precision FP Compare - InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [1, 1]>, - // - // Single to Double FP Convert - InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Double to Single FP Convert - InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - - // - // Single to Half FP Convert - InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 4>, - InstrStage<1, [SW_ALU1]>], - [6, 1]>, - // - // Half to Single FP Convert - InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - - // - // Single-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Double-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Integer to Single-Precision FP Convert - InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Integer to Double-Precision FP Convert - InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Single-precision FP ALU - InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-precision FP ALU - InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Single-precision FP Multiply - InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-precision FP Multiply - InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 1]>, - // - // Single-precision FP MAC - InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-precision FP MAC - InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, 
- InstrStage<1, [SW_ALU1]>], - [12, 1, 1]>, - // - // Single-precision Fused FP MAC - InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-precision Fused FP MAC - InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [12, 1, 1]>, - // - // Single-precision FP DIV - InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<15, [SW_FDIV]>], - [17, 1, 1]>, - // - // Double-precision FP DIV - InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<30, [SW_FDIV]>], - [32, 1, 1]>, - // - // Single-precision FP SQRT - InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<15, [SW_FDIV]>], - [17, 1]>, - // - // Double-precision FP SQRT - InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<30, [SW_FDIV]>], - [32, 1, 1]>, - - // - // Integer to Single-precision Move - InstrItinData<IIC_fpMOVIS, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - // - // Integer to Double-precision Move - InstrItinData<IIC_fpMOVID, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // Single-precision to Integer Move - InstrItinData<IIC_fpMOVSI, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - // - // Double-precision to Integer Move - InstrItinData<IIC_fpMOVDI, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Single-precision FP Load - InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // Double-precision FP Load - InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // FP Load Multiple - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 4], [], -1>, // dynamic uops - // - // FP Load Multiple + update - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1, 4], [], -1>, // dynamic uops - // - // Single-precision FP Store - InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // Double-precision FP Store - InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // FP Store Multiple - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1], [], -1>, // dynamic uops - // - // FP Store Multiple + update - // FIXME: Assumes a single Q register. 
- InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1], [], -1>, // dynamic uops - // NEON - // - // Double-register Integer Unary - InstrItinData<IIC_VUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Quad-register Integer Unary - InstrItinData<IIC_VUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Double-register Integer Q-Unary - InstrItinData<IIC_VQUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Quad-register Integer CountQ-Unary - InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Double-register Integer Binary - InstrItinData<IIC_VBINiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Binary - InstrItinData<IIC_VBINiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Subtract - InstrItinData<IIC_VSUBiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Subtract - InstrItinData<IIC_VSUBiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Shift - InstrItinData<IIC_VSHLiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Shift - InstrItinData<IIC_VSHLiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // Double-register Integer Count - InstrItinData<IIC_VCNTiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Count - InstrItinData<IIC_VCNTiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - // - // Quad-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - // - // Double-register Integer Pair Add Long - 
InstrItinData<IIC_VPALiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Pair Add Long - InstrItinData<IIC_VPALiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // Double-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - - // - // Double-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Double-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Quad-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Quad-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - - // - // Move - InstrItinData<IIC_VMOV, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Move Immediate - InstrItinData<IIC_VMOVImm, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2]>, - // - // Double-register Permute Move - InstrItinData<IIC_VMOVD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Quad-register Permute Move - InstrItinData<IIC_VMOVQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Integer to Single-precision Move - InstrItinData<IIC_VMOVIS , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - // - // Integer to Double-precision Move - InstrItinData<IIC_VMOVID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1, 1]>, - // - // Single-precision to Integer Move - InstrItinData<IIC_VMOVSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - // - // Double-precision to Integer Move - InstrItinData<IIC_VMOVDI , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Integer to Lane Move - // FIXME: I think this is correct, but it is not clear from the tuning guide. 
- InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - - // - // Vector narrow move - InstrItinData<IIC_VMOVN, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Double-register FP Unary - // FIXME: VRECPE / VRSQRTE have a longer latency than VABS, which is used here, - // and they issue on a different pipeline. - InstrItinData<IIC_VUNAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Quad-register FP Unary - // FIXME: VRECPE / VRSQRTE have a longer latency than VABS, which is used here, - // and they issue on a different pipeline. - InstrItinData<IIC_VUNAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Double-register FP Binary - // FIXME: We're using this itin for many instructions. - InstrItinData<IIC_VBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // VPADD, etc. - InstrItinData<IIC_VPBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register FP VMUL - InstrItinData<IIC_VFMULD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register FP Binary - InstrItinData<IIC_VBINQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register FP VMUL - InstrItinData<IIC_VFMULQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-register FP Multiply-Accumulate - InstrItinData<IIC_VMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register FP Multiply-Accumulate - InstrItinData<IIC_VMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Fused FP Multiply-Accumulate - InstrItinData<IIC_VFMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register Fused FP Multiply-Accumulate - InstrItinData<IIC_VFMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Reciprocal Step - InstrItinData<IIC_VRECSD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register Reciprocal Step - InstrItinData<IIC_VRECSQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Permute - // FIXME: The latencies are unclear from the documentation. - InstrItinData<IIC_VPERMD, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - // - // Quad-register Permute - // FIXME: The latencies are unclear from the documentation.
- InstrItinData<IIC_VPERMQ, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - // - // Quad-register Permute (3 cycle issue on A9) - InstrItinData<IIC_VPERMQ3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - - // - // Double-register VEXT - InstrItinData<IIC_VEXTD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - // - // Quad-register VEXT - InstrItinData<IIC_VEXTQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - // - // VTB - InstrItinData<IIC_VTB1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_VTB2, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 3, 3]>, - InstrItinData<IIC_VTB3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 3, 5, 5]>, - InstrItinData<IIC_VTB4, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 3, 5, 7, 7]>, - // - // VTBX - InstrItinData<IIC_VTBX1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_VTBX2, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 3, 3]>, - InstrItinData<IIC_VTBX3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 3, 5, 5]>, - InstrItinData<IIC_VTBX4, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 3, 5, 7, 7]> -]>; - -// ===---------------------------------------------------------------------===// -// The following definitions describe the simple machine model which -// will replace itineraries. - // Swift machine model for scheduling and other instruction cost heuristics. def SwiftModel : SchedMachineModel { let IssueWidth = 3; // 3 micro-ops are dispatched per cycle. let MicroOpBufferSize = 45; // Based on NEON renamed registers. let LoadLatency = 3; let MispredictPenalty = 14; // A branch direction mispredict. - - let Itineraries = SwiftItineraries; + let CompleteModel = 0; // FIXME: Remove if all instructions are covered. } // Swift predicates.
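The SchedMachineModel above supersedes the per-opcode itineraries deleted in this hunk: instead of walking InstrStage lists, passes read coarse whole-core parameters through llvm::MCSchedModel. A minimal, hypothetical sketch of how such a model is consumed (the helper name preferPreRAScheduler is invented for illustration; the isOutOfOrder() check it mirrors is the one this patch adds to ARMSubtarget.cpp further down):

#include "llvm/MC/MCSchedule.h"

// Hypothetical helper mirroring the checks this patch adds to
// ARMSubtarget::enableMachineScheduler()/enablePostRAScheduler().
static bool preferPreRAScheduler(const llvm::MCSchedModel &SM) {
  // For SwiftModel above: IssueWidth = 3, MicroOpBufferSize = 45 (so
  // isOutOfOrder() is true), LoadLatency = 3, MispredictPenalty = 14.
  return SM.isOutOfOrder() && SM.IssueWidth > 1;
}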
@@ -1558,6 +521,13 @@ let SchedModel = SwiftModel in { (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD", "PUSH", "tPUSH")>; + // LDRLIT pseudo-instructions; they expand to LDR + PICADD + def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU], + (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>; + // LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR + def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2ThreeCycle], + (instregex "LDRLIT_ga_pcrel_ldr")>; + // 4.2.26 Branch def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; } def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; } diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 6cafbbb..6fded9c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -160,41 +160,39 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned VTSize = 4; unsigned i = 0; // Emit a maximum of 4 loads in Thumb1 since we have fewer registers - const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6; + const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; SDValue TFOps[6]; SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. - while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, dl, MVT::i32)), - SrcPtrInfo.getWithOffset(SrcOff), isVolatile, - false, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to + // VLDM/VSTM and make this code emit it when appropriate. This would reduce + // pressure on the general-purpose registers. However, this seems harder to map + // onto the register allocator's view of the world. - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, dl, MVT::i32)), - DstPtrInfo.getWithOffset(DstOff), - isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + // The number of MEMCPY pseudo-instructions to emit. We use up to + // MaxLoadsInLDM registers per memcpy, which will get lowered into ldm/stm + // later on. This is a lower bound on the number of MEMCPY operations we must + // emit. + unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; + + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); + + for (unsigned I = 0; I != NumMEMCPYs; ++I) { + // Evenly distribute registers among MEMCPY operations to reduce register + // pressure.
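// Worked example of this distribution (illustrative numbers, not from the
// patch): with NumMemOps = 10 and MaxLoadsInLDM = 6, NumMEMCPYs =
// (10 + 5) / 6 = 2, and the loop below emits 10*1/2 = 5 registers, then
// 10*2/2 - 5 = 5 registers, i.e. a balanced 5 + 5 split where the deleted
// greedy loop above would have produced 6 + 4.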
+ unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; + unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; + + Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src, + DAG.getConstant(NumRegs, dl, MVT::i32)); + Src = Dst.getValue(1); + Chain = Dst.getValue(2); + + DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); + SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); - EmittedNumMemOps += i; + EmittedNumMemOps = NextEmittedNumMemOps; } if (BytesLeft == 0) diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp index 002c3e9..bb6ae28 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" @@ -40,37 +41,9 @@ using namespace llvm; #include "ARMGenSubtargetInfo.inc" static cl::opt<bool> -ReserveR9("arm-reserve-r9", cl::Hidden, - cl::desc("Reserve R9, making it unavailable as GPR")); - -static cl::opt<bool> -ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden); - -static cl::opt<bool> UseFusedMulOps("arm-use-mulops", cl::init(true), cl::Hidden); -namespace { -enum AlignMode { - DefaultAlign, - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt<AlignMode> -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(DefaultAlign), - cl::values( - clEnumValN(DefaultAlign, "arm-default-align", - "Generate unaligned accesses only on hardware/OS " - "combinations that are known to support them"), - clEnumValN(StrictAlign, "arm-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "arm-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - enum ITMode { DefaultIT, RestrictedIT, @@ -88,6 +61,12 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), "Allow IT blocks based on ARMv7"), clEnumValEnd)); +/// ForceFastISel - Use the fast-isel, even for subtargets where it is not +/// currently supported (for testing only). +static cl::opt<bool> +ForceFastISel("arm-force-fast-isel", + cl::init(false), cl::Hidden); + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -110,8 +89,8 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle) : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), - TargetTriple(TT), Options(TM.Options), TM(TM), + ARMProcClass(None), ARMArch(ARMv4t), stackAlignment(4), CPUString(CPU), + IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. 
@@ -133,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() { HasV7Ops = false; HasV8Ops = false; HasV8_1aOps = false; + HasV8_2aOps = false; HasVFPv2 = false; HasVFPv3 = false; HasVFPv4 = false; @@ -147,10 +127,11 @@ void ARMSubtarget::initializeEnvironment() { UseSoftFloat = false; HasThumb2 = false; NoARM = false; - IsR9Reserved = ReserveR9; - UseMovt = false; + ReserveR9 = false; + NoMovt = false; SupportsTailCall = false; HasFP16 = false; + HasFullFP16 = false; HasD16 = false; HasHardwareDivide = false; HasHardwareDivideInARM = false; @@ -168,20 +149,36 @@ void ARMSubtarget::initializeEnvironment() { HasCrypto = false; HasCRC = false; HasZeroCycleZeroing = false; - AllowsUnalignedMem = false; - Thumb2DSP = false; + StrictAlign = false; + HasDSP = false; UseNaClTrap = false; GenLongCalls = false; UnsafeFPMath = false; + + // MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this + // directly from it, but we can try to make sure they're consistent when both + // are available. + UseSjLjEH = isTargetDarwin() && !isTargetWatchOS(); + assert((!TM.getMCAsmInfo() || + (TM.getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::SjLj) == UseSjLjEH) && + "inconsistent sjlj choice between CodeGen and MC"); } void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (CPUString.empty()) { - if (isTargetDarwin() && TargetTriple.getArchName().endswith("v7s")) - // Default to the Swift CPU when targeting armv7s/thumbv7s. - CPUString = "swift"; - else - CPUString = "generic"; + CPUString = "generic"; + + if (isTargetDarwin()) { + StringRef ArchName = TargetTriple.getArchName(); + if (ArchName.endswith("v7s")) + // Default to the Swift CPU when targeting armv7s/thumbv7s. + CPUString = "swift"; + else if (ArchName.endswith("v7k")) + // Default to the Cortex-A7 CPU when targeting armv7k/thumbv7k. + // ARMv7k does not use SjLj exception handling. + CPUString = "cortex-a7"; + } } // Insert the architecture feature derived from the target triple into the @@ -212,44 +209,31 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isAAPCS_ABI()) stackAlignment = 8; - if (isTargetNaCl()) + if (isTargetNaCl() || isAAPCS16_ABI()) stackAlignment = 16; - UseMovt = hasV6T2Ops() && ArmUseMOVT; - - if (isTargetMachO()) { - IsR9Reserved = ReserveR9 || !HasV6Ops; - SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0); - } else { - IsR9Reserved = ReserveR9; - SupportsTailCall = !isThumb1Only(); - } - - if (Align == DefaultAlign) { - // Assume pre-ARMv6 doesn't support unaligned accesses. - // - // ARMv6 may or may not support unaligned accesses depending on the - // SCTLR.U bit, which is architecture-specific. We assume ARMv6 - // Darwin and NetBSD targets support unaligned accesses, and others don't. - // - // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit - // which raises an alignment fault on unaligned accesses. Linux - // defaults this bit to 0 and handles it as a system-wide (not - // per-process) setting. It is therefore safe to assume that ARMv7+ - // Linux targets support unaligned accesses. The same goes for NaCl. - // - // The above behavior is consistent with GCC.
- AllowsUnalignedMem = - (hasV7Ops() && (isTargetLinux() || isTargetNaCl() || - isTargetNetBSD())) || - (hasV6Ops() && (isTargetMachO() || isTargetNetBSD())); - } else { - AllowsUnalignedMem = !(Align == StrictAlign); - } - - // No v6M core supports unaligned memory access (v6M ARM ARM A3.2) - if (isV6M()) - AllowsUnalignedMem = false; + // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: + // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as + // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation + // support in the assembler and linker to be used. This would need to be + // fixed to fully support tail calls in Thumb1. + // + // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take + // LR. This means if we need to reload LR, it takes an extra instruction, + // which outweighs the value of the tail call; but here we don't know yet + // whether LR is going to be used. Probably the right approach is to + // generate the tail call here and turn it back into CALL/RET in + // emitEpilogue if LR is used. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently handle this + // case. + + SupportsTailCall = !isThumb1Only(); + + if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0)) + SupportsTailCall = false; switch (IT) { case DefaultIT: @@ -276,9 +260,15 @@ bool ARMSubtarget::isAPCS_ABI() const { } bool ARMSubtarget::isAAPCS_ABI() const { assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); - return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS; + return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS || + TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; +} +bool ARMSubtarget::isAAPCS16_ABI() const { + assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); + return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; } + /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. bool ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, @@ -321,11 +311,23 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { } bool ARMSubtarget::hasSinCos() const { - return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0); + return isTargetWatchOS() || + (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0)); +} + +bool ARMSubtarget::enableMachineScheduler() const { + // Enable the MachineScheduler before register allocation for out-of-order + // architectures where we do not use the PostRA scheduler anymore (for now + // restricted to swift). + return getSchedModel().isOutOfOrder() && isSwift(); } // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { + // No need for PostRA scheduling on out of order CPUs (for now restricted to + // swift). + if (getSchedModel().isOutOfOrder() && isSwift()) + return false; return (!isThumb() || hasThumb2()); } @@ -333,15 +335,30 @@ bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier() && !isThumb1Only(); } +bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const { + // For general targets, the prologue can grow when VFPs are allocated with + // stride 4 (more vpush instructions). But WatchOS uses a compact unwind + // format, which is more important to get right.
+ return isTargetWatchOS() || (isSwift() && !MF.getFunction()->optForMinSize()); +} + bool ARMSubtarget::useMovt(const MachineFunction &MF) const { // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit // immediates as it is inherently position independent, and may be out of // range otherwise. - return UseMovt && (isTargetWindows() || - !MF.getFunction()->hasFnAttribute(Attribute::MinSize)); + return !NoMovt && hasV6T2Ops() && + (isTargetWindows() || !MF.getFunction()->optForMinSize()); } bool ARMSubtarget::useFastISel() const { + // Enable fast-isel for any target, for testing only. + if (ForceFastISel) + return true; + + // Limit fast-isel to the targets that are or have been tested. + if (!hasV6Ops()) + return false; + // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl. return TM.Options.EnableFastISel && ((isTargetMachO() && !isThumb1Only()) || diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h index dd101df..4d54e57 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -43,11 +43,17 @@ class ARMSubtarget : public ARMGenSubtargetInfo { protected: enum ARMProcFamilyEnum { Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15, - CortexA17, CortexR4, CortexR4F, CortexR5, Swift, CortexA53, CortexA57, Krait, + CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53, + CortexA57, CortexA72, Krait, Swift, ExynosM1 }; enum ARMProcClassEnum { None, AClass, RClass, MClass }; + enum ARMArchEnum { + ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te, + ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r, + ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a + }; /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. ARMProcFamilyEnum ARMProcFamily; @@ -55,6 +61,9 @@ protected: /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass. ARMProcClassEnum ARMProcClass; + /// ARMArch - ARM architecture + ARMArchEnum ARMArch; + /// HasV4TOps, HasV5TOps, HasV5TEOps, /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops - /// Specify whether target support specific ARM ISA variants. @@ -68,6 +77,7 @@ protected: bool HasV7Ops; bool HasV8Ops; bool HasV8_1aOps; + bool HasV8_2aOps; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -109,22 +119,24 @@ protected: /// NoARM - True if subtarget does not support ARM mode execution. bool NoARM; - /// IsR9Reserved - True if R9 is a not available as general purpose register. - bool IsR9Reserved; + /// ReserveR9 - True if R9 is not available as a general purpose register. + bool ReserveR9; - /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit - /// imms (including global addresses). - bool UseMovt; + /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of + /// 32-bit imms (including global addresses). + bool NoMovt; /// SupportsTailCall - True if the OS supports tail call. The dynamic linker /// must be able to synthesize call stubs for interworking between ARM and /// Thumb. 
bool SupportsTailCall; - /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF - /// only so far) + /// HasFP16 - True if subtarget supports half-precision FP conversions bool HasFP16; + /// HasFullFP16 - True if subtarget supports half-precision FP operations + bool HasFullFP16; + /// HasD16 - True if subtarget is limited to 16 double precision /// FP registers for VFPv3. bool HasD16; @@ -190,18 +202,18 @@ protected: /// particularly effective at zeroing a VFP register. bool HasZeroCycleZeroing; - /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory + /// StrictAlign - If true, the subtarget disallows unaligned memory /// accesses for some types. For details, see /// ARMTargetLowering::allowsMisalignedMemoryAccesses(). - bool AllowsUnalignedMem; + bool StrictAlign; /// RestrictIT - If true, the subtarget disallows generation of deprecated IT /// blocks to conform to ARMv8 rule. bool RestrictIT; - /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith - /// and such) instructions in Thumb2 code. - bool Thumb2DSP; + /// HasDSP - If true, the subtarget supports the DSP (saturating arith + /// and such) instructions. + bool HasDSP; /// NaCl TRAP instruction is generated instead of the regular TRAP. bool UseNaClTrap; @@ -212,6 +224,9 @@ protected: /// Target machine allowed unsafe FP math (such as use of NEON fp) bool UnsafeFPMath; + /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS). + bool UseSjLjEH; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -297,6 +312,7 @@ public: bool hasV7Ops() const { return HasV7Ops; } bool hasV8Ops() const { return HasV8Ops; } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool isCortexA5() const { return ARMProcFamily == CortexA5; } bool isCortexA7() const { return ARMProcFamily == CortexA7; } @@ -343,17 +359,20 @@ public: bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRAS() const { return HasRAS; } bool hasMPExtension() const { return HasMPExtension; } - bool hasThumb2DSP() const { return Thumb2DSP; } + bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } + bool useSjLjEH() const { return UseSjLjEH; } bool genLongCalls() const { return GenLongCalls; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } + bool hasFullFP16() const { return HasFullFP16; } const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetIOS() const { return TargetTriple.isiOS(); } + bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } @@ -375,6 +394,11 @@ public: TargetTriple.getEnvironment() == Triple::EABIHF) && !isTargetDarwin() && !isTargetWindows(); } + bool isTargetGNUAEABI() const { + return (TargetTriple.getEnvironment() == Triple::GNUEABI || + TargetTriple.getEnvironment() == Triple::GNUEABIHF) && + !isTargetDarwin() && !isTargetWindows(); + } // ARM Targets that support EHABI exception handling standard // Darwin uses SjLj. Other targets might need more checks. 
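The new isTargetGNUAEABI() predicate above feeds the EABI-version defaulting added to the ARMBaseTargetMachine constructor further down. A rough sketch of the resulting mapping (triples chosen for illustration; EABI::GNU and EABI::EABI5 are the TargetOptions values this patch selects between):

#include "llvm/ADT/Triple.h"

// armv7--linux-gnueabihf -> GNUEABIHF environment -> isTargetGNUAEABI() is
// true, so the constructor defaults Options.EABIVersion to EABI::GNU; a bare
// EABI triple such as armv7--none-eabi falls back to EABI::EABI5 instead.
llvm::Triple T("armv7--linux-gnueabihf");
bool IsGNUEnv = T.getEnvironment() == llvm::Triple::GNUEABIHF; // true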
@@ -383,7 +407,7 @@ public: TargetTriple.getEnvironment() == Triple::GNUEABI || TargetTriple.getEnvironment() == Triple::EABIHF || TargetTriple.getEnvironment() == Triple::GNUEABIHF || - TargetTriple.getEnvironment() == Triple::Android) && + isTargetAndroid()) && !isTargetDarwin() && !isTargetWindows(); } @@ -391,14 +415,13 @@ public: // FIXME: this is invalid for WindowsCE return TargetTriple.getEnvironment() == Triple::GNUEABIHF || TargetTriple.getEnvironment() == Triple::EABIHF || - isTargetWindows(); - } - bool isTargetAndroid() const { - return TargetTriple.getEnvironment() == Triple::Android; + isTargetWindows() || isAAPCS16_ABI(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isAPCS_ABI() const; bool isAAPCS_ABI() const; + bool isAAPCS16_ABI() const; bool useSoftFloat() const { return UseSoftFloat; } bool isThumb() const { return InThumbMode; } @@ -409,17 +432,17 @@ public: bool isRClass() const { return ARMProcClass == RClass; } bool isAClass() const { return ARMProcClass == AClass; } - bool isV6M() const { - return isThumb1Only() && isMClass(); + bool isR9Reserved() const { + return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9; } - bool isR9Reserved() const { return IsR9Reserved; } + bool useStride4VFPs(const MachineFunction &MF) const; bool useMovt(const MachineFunction &MF) const; bool supportsTailCall() const { return SupportsTailCall; } - bool allowsUnalignedMem() const { return AllowsUnalignedMem; } + bool allowsUnalignedMem() const { return !StrictAlign; } bool restrictIT() const { return RestrictIT; } @@ -433,6 +456,9 @@ public: /// compiler runtime or math libraries. bool hasSinCos() const; + /// Returns true if machine scheduler should be enabled. + bool enableMachineScheduler() const override; + /// True for some subtargets at > -O0. bool enablePostRAScheduler() const override; diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 93495d6..fca1901 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -66,7 +66,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { static ARMBaseTargetMachine::ARMABI computeTargetABI(const Triple &TT, StringRef CPU, const TargetOptions &Options) { - if (Options.MCOptions.getABIName().startswith("aapcs")) + if (Options.MCOptions.getABIName() == "aapcs16") + return ARMBaseTargetMachine::ARM_ABI_AAPCS16; + else if (Options.MCOptions.getABIName().startswith("aapcs")) return ARMBaseTargetMachine::ARM_ABI_AAPCS; else if (Options.MCOptions.getABIName().startswith("apcs")) return ARMBaseTargetMachine::ARM_ABI_APCS; @@ -83,6 +85,8 @@ computeTargetABI(const Triple &TT, StringRef CPU, (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) || CPU.startswith("cortex-m")) { TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + } else if (TT.isWatchOS()) { + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16; } else { TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; } @@ -106,7 +110,7 @@ computeTargetABI(const Triple &TT, StringRef CPU, if (TT.isOSNetBSD()) TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; else - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; break; } } @@ -145,7 +149,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // to 64. We always try to give them natural alignment.
if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS) Ret += "-v64:32:64-v128:32:128"; - else + else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16) Ret += "-v128:64:128"; // Try to align aggregates to 32 bits (the default is 64 bits, which has no @@ -157,7 +161,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit // aligned everywhere else. - if (TT.isOSNaCl()) + if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16) Ret += "-S128"; else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS) Ret += "-S64"; @@ -184,6 +188,15 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, if (Options.FloatABIType == FloatABI::Default) this->Options.FloatABIType = Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft; + + // Default to triple-appropriate EABI + if (Options.EABIVersion == EABI::Default || + Options.EABIVersion == EABI::Unknown) { + if (Subtarget.isTargetGNUAEABI()) + this->Options.EABIVersion = EABI::GNU; + else + this->Options.EABIVersion = EABI::EABI5; + } } ARMBaseTargetMachine::~ARMBaseTargetMachine() {} @@ -225,12 +238,12 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { } TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(ARMTTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(ARMTTIImpl(this, F)); + }); } - -void ARMTargetMachine::anchor() { } +void ARMTargetMachine::anchor() {} ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -244,7 +257,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, "support ARM mode execution!"); } -void ARMLETargetMachine::anchor() { } +void ARMLETargetMachine::anchor() {} ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -253,7 +266,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} -void ARMBETargetMachine::anchor() { } +void ARMBETargetMachine::anchor() {} ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -262,7 +275,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} -void ThumbTargetMachine::anchor() { } +void ThumbTargetMachine::anchor() {} ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -273,7 +286,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -void ThumbLETargetMachine::anchor() { } +void ThumbLETargetMachine::anchor() {} ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -282,7 +295,7 @@ ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} -void ThumbBETargetMachine::anchor() { } +void ThumbBETargetMachine::anchor() {} ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -348,7 +361,13 @@ bool ARMPassConfig::addPreISel() { // tricky when doing code gen per function. 
bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && (EnableGlobalMerge == cl::BOU_UNSET); - addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize)); + // Merging of extern globals is enabled by default on non-Mach-O as we + // expect it to be generally either beneficial or harmless. On Mach-O it + // is disabled as we emit the .subsections_via_symbols directive which + // means that merging extern globals is not safe. + bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO(); + addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize, + MergeExternalByDefault)); } return false; @@ -356,9 +375,6 @@ bool ARMPassConfig::addPreISel() { bool ARMPassConfig::addInstSelector() { addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); - - if (TM->getTargetTriple().isOSBinFormatELF() && TM->Options.EnableFastISel) - addPass(createARMGlobalBaseRegPass()); return false; } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h index 8c98e08..8ad1f3d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -26,7 +26,8 @@ public: enum ARMABI { ARM_ABI_UNKNOWN, ARM_ABI_APCS, - ARM_ABI_AAPCS // ARM EABI + ARM_ABI_AAPCS, // ARM EABI + ARM_ABI_AAPCS16 } TargetABI; protected: diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 2f194cf..c152011 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -15,7 +15,7 @@ using namespace llvm; #define DEBUG_TYPE "armtti" -unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned Bits = Ty->getPrimitiveSizeInBits(); @@ -47,12 +47,12 @@ unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 3; } -unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); // Single to/from double precision conversions. - static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = { + static const CostTblEntry NEONFltDblTbl[] = { // Vector fptrunc/fpext conversions. { ISD::FP_ROUND, MVT::v2f64, 2 }, { ISD::FP_EXTEND, MVT::v2f32, 2 }, @@ -61,10 +61,9 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second); - if (Idx != -1) - return LT.first * NEONFltDblTbl[Idx].Cost; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) + return LT.first * Entry->Cost; } EVT SrcTy = TLI->getValueType(DL, Src); @@ -76,8 +75,7 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. 
- static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONVectorConversionTbl[] = { + static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, @@ -153,15 +151,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isVector() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONVectorConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar float to integer conversions. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONFloatConversionTbl[] = { + static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, @@ -184,15 +181,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } }; if (SrcTy.isFloatingPoint() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONFloatConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar integer to float conversions. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, @@ -216,15 +212,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isInteger() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONIntegerConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, + ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar integer conversion costs. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - ARMIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, @@ -236,17 +231,17 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isInteger()) { - int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return ARMIntegerConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { +int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { // Penalize inserting into a D-subregister. We end up with a three times // lower estimated throughput on Swift.
if (ST->isSwift() && @@ -255,28 +250,30 @@ unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, ValTy->getScalarSizeInBits() <= 32) return 3; - // Cross-class copies are expensive on many microarchitectures, - // so assume they are expensive by default. if ((Opcode == Instruction::InsertElement || - Opcode == Instruction::ExtractElement) && - ValTy->getVectorElementType()->isIntegerTy()) - return 3; + Opcode == Instruction::ExtractElement)) { + // Cross-class copies are expensive on many microarchitectures, + // so assume they are expensive by default. + if (ValTy->getVectorElementType()->isIntegerTy()) + return 3; + + // Even if it's not a cross-class copy, this likely leads to mixing + // of NEON and VFP code and should therefore be penalized. + if (ValTy->isVectorTy() && + ValTy->getScalarSizeInBits() <= 32) + return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); + } return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } -unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // On NEON a vector select gets lowered to vbsl. if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) { // Lowering of some vector selects is currently far from perfect. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONVectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 }, + static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 }, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } @@ -285,21 +282,20 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, - SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return NEONVectorSelectTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; } - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); return LT.first; } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -314,7 +310,7 @@ unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return 1; } -unsigned ARMTTIImpl::getFPOpCost(Type *Ty) { +int ARMTTIImpl::getFPOpCost(Type *Ty) { // Use similar logic that's in ARMISelLowering: // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access // to VFP.
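// Worked example, not part of the patch: the cost expressions in the
// NEONVectorSelectTbl entries above are left unevaluated in the source,
// presumably so the per-instruction breakdown stays readable. The v4i1/v4i64
// entry, for instance, works out to 4*4 + 1*2 + 1 == 19 estimated
// instructions. A compile-time restatement of that arithmetic:
static_assert(4 * 4 + 1 * 2 + 1 == 19, "v4i1/v4i64 select cost breakdown");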
@@ -333,14 +329,14 @@ unsigned ARMTTIImpl::getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Expensive; } -unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only handle costs of reverse and alternate shuffles for now. if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = { + static const CostTblEntry NEONShuffleTbl[] = { // Reverse shuffles cost one instruction if we are shuffling within a // double word (vrev) or two if we shuffle a quad word (vrev, vext). {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, @@ -353,16 +349,16 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; - return LT.first * NEONShuffleTbl[Idx].Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } if (Kind == TTI::SK_Alternate) { - static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = { + static const CostTblEntry NEONAltShuffleTbl[] = { // Alt shuffle cost table for ARM. Cost is the number of instructions // required to create the shuffled vector. @@ -379,27 +375,26 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int Idx = - CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - return LT.first * NEONAltShuffleTbl[Idx].Cost; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned ARMTTIImpl::getArithmeticInstrCost( +int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); const unsigned FunctionCallDivCost = 20; const unsigned ReciprocalDivCost = 10; - static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = { + static const CostTblEntry CostTbl[] = { // Division. // These costs are somewhat random. Choose a cost of 20 to indicate that // vectorizing division (added function call) is going to be very expensive. @@ -440,16 +435,12 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( // Multiplication.
}; - int Idx = -1; - if (ST->hasNEON()) - Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second); - - if (Idx != -1) - return LT.first * CostTbl[Idx].Cost; + if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) + return LT.first * Entry->Cost; - unsigned Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); // This is somewhat of a hack. The problem that we are facing is that SROA // creates a sequence of shift, and, or instructions to construct values. @@ -465,10 +456,9 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( return Cost; } -unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); +int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { @@ -479,21 +469,21 @@ unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace) { +int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. - bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; + bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. 
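// Illustrative sketch, not part of the patch: the legality test below,
// restated as standalone arithmetic. Note the hunk above also switches from
// getTypeAllocSizeInBits to getTypeSizeInBits, so alloc padding no longer
// counts toward the subvector size. The helper name is hypothetical:
static bool isVLDNLegalShape(unsigned NumElts, unsigned EltBits,
                             unsigned Factor) {
  if (Factor < 2 || NumElts % Factor != 0)
    return false;
  unsigned SubVecBits = (NumElts / Factor) * EltBits;
  // e.g. v16i8 with Factor == 2 -> v8i8 subvector = 64 bits (supported);
  //      v8i8  with Factor == 2 -> v4i8 subvector = 32 bits (fall back).
  return SubVecBits == 64 || SubVecBits == 128;
}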
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 84f256f..7d8d238 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -41,7 +41,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { const ARMTargetLowering *getTLI() const { return TLI; } public: - explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F) + explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -52,11 +52,13 @@ public: : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} + bool enableInterleavedAccessVectorization() { return true; } + /// \name Scalar TTI Implementations /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); /// @} @@ -92,34 +94,31 @@ public: return 1; } - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getAddressComputationCost(Type *Val, bool IsComplex); + int getAddressComputationCost(Type *Val, bool IsComplex); - unsigned getFPOpCost(Type *Ty); + int getFPOpCost(Type *Ty); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index cf6b892..c69a741 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -129,7 +129,6 @@ public: }; class ARMAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; const MCRegisterInfo *MRI; UnwindContext UC; @@ -247,48 +246,49 @@ class ARMAsmParser : public MCTargetAsmParser { OperandVector &Operands); bool isThumb() const { // FIXME: Can tablegen auto-generate this? 
- return STI.getFeatureBits()[ARM::ModeThumb]; + return getSTI().getFeatureBits()[ARM::ModeThumb]; } bool isThumbOne() const { - return isThumb() && !STI.getFeatureBits()[ARM::FeatureThumb2]; + return isThumb() && !getSTI().getFeatureBits()[ARM::FeatureThumb2]; } bool isThumbTwo() const { - return isThumb() && STI.getFeatureBits()[ARM::FeatureThumb2]; + return isThumb() && getSTI().getFeatureBits()[ARM::FeatureThumb2]; } bool hasThumb() const { - return STI.getFeatureBits()[ARM::HasV4TOps]; + return getSTI().getFeatureBits()[ARM::HasV4TOps]; } bool hasV6Ops() const { - return STI.getFeatureBits()[ARM::HasV6Ops]; + return getSTI().getFeatureBits()[ARM::HasV6Ops]; } bool hasV6MOps() const { - return STI.getFeatureBits()[ARM::HasV6MOps]; + return getSTI().getFeatureBits()[ARM::HasV6MOps]; } bool hasV7Ops() const { - return STI.getFeatureBits()[ARM::HasV7Ops]; + return getSTI().getFeatureBits()[ARM::HasV7Ops]; } bool hasV8Ops() const { - return STI.getFeatureBits()[ARM::HasV8Ops]; + return getSTI().getFeatureBits()[ARM::HasV8Ops]; } bool hasARM() const { - return !STI.getFeatureBits()[ARM::FeatureNoARM]; + return !getSTI().getFeatureBits()[ARM::FeatureNoARM]; } - bool hasThumb2DSP() const { - return STI.getFeatureBits()[ARM::FeatureDSPThumb2]; + bool hasDSP() const { + return getSTI().getFeatureBits()[ARM::FeatureDSP]; } bool hasD16() const { - return STI.getFeatureBits()[ARM::FeatureD16]; + return getSTI().getFeatureBits()[ARM::FeatureD16]; } bool hasV8_1aOps() const { - return STI.getFeatureBits()[ARM::HasV8_1aOps]; + return getSTI().getFeatureBits()[ARM::HasV8_1aOps]; } void SwitchMode() { + MCSubtargetInfo &STI = copySTI(); uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); setAvailableFeatures(FB); } bool isMClass() const { - return STI.getFeatureBits()[ARM::FeatureMClass]; + return getSTI().getFeatureBits()[ARM::FeatureMClass]; } /// @name Auto-generated Match Functions @@ -343,14 +343,15 @@ public: Match_RequiresNotITBlock, Match_RequiresV6, Match_RequiresThumb2, + Match_RequiresV8, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "ARMGenAsmMatcher.inc" }; - ARMAsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : STI(STI), MII(MII), UC(Parser) { + : MCTargetAsmParser(Options, STI), MII(MII), UC(Parser) { MCAsmParserExtension::Initialize(Parser); // Cache the MCRegisterInfo. 
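// Simplified model, not the real MC API: the parser hunks above drop the
// mutable "MCSubtargetInfo &STI" member. Plain feature queries now go through
// getSTI(), while anything that mutates features (SwitchMode(), the
// .cpu/.fpu/.arch/.arch_extension directives) first calls copySTI() so the
// parser works on a private copy rather than on state shared with other
// consumers. A rough sketch of that copy-on-write discipline (Subtarget is a
// placeholder type):
#include <memory>

struct Subtarget { unsigned long long FeatureBits = 0; };

class SubtargetHandle {
  const Subtarget *Shared = nullptr;        // shared, treated as read-only
  std::unique_ptr<Subtarget> Owned;         // private copy, made on demand
public:
  explicit SubtargetHandle(const Subtarget &S) : Shared(&S) {}
  const Subtarget &get() const {            // analogous to getSTI()
    return Owned ? *Owned : *Shared;
  }
  Subtarget &copy() {                       // analogous to copySTI()
    if (!Owned)
      Owned.reset(new Subtarget(*Shared));  // clone before first mutation
    return *Owned;
  }
};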
@@ -564,87 +565,6 @@ class ARMOperand : public MCParsedAsmOperand { public: ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} - ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { - Kind = o.Kind; - StartLoc = o.StartLoc; - EndLoc = o.EndLoc; - switch (Kind) { - case k_CondCode: - CC = o.CC; - break; - case k_ITCondMask: - ITMask = o.ITMask; - break; - case k_Token: - Tok = o.Tok; - break; - case k_CCOut: - case k_Register: - Reg = o.Reg; - break; - case k_RegisterList: - case k_DPRRegisterList: - case k_SPRRegisterList: - Registers = o.Registers; - break; - case k_VectorList: - case k_VectorListAllLanes: - case k_VectorListIndexed: - VectorList = o.VectorList; - break; - case k_CoprocNum: - case k_CoprocReg: - Cop = o.Cop; - break; - case k_CoprocOption: - CoprocOption = o.CoprocOption; - break; - case k_Immediate: - Imm = o.Imm; - break; - case k_MemBarrierOpt: - MBOpt = o.MBOpt; - break; - case k_InstSyncBarrierOpt: - ISBOpt = o.ISBOpt; - case k_Memory: - Memory = o.Memory; - break; - case k_PostIndexRegister: - PostIdxReg = o.PostIdxReg; - break; - case k_MSRMask: - MMask = o.MMask; - break; - case k_BankedReg: - BankedReg = o.BankedReg; - break; - case k_ProcIFlags: - IFlags = o.IFlags; - break; - case k_ShifterImmediate: - ShifterImm = o.ShifterImm; - break; - case k_ShiftedRegister: - RegShiftedReg = o.RegShiftedReg; - break; - case k_ShiftedImmediate: - RegShiftedImm = o.RegShiftedImm; - break; - case k_RotateImmediate: - RotImm = o.RotImm; - break; - case k_ModifiedImmediate: - ModImm = o.ModImm; - break; - case k_BitfieldDescriptor: - Bitfield = o.Bitfield; - break; - case k_VectorIndex: - VectorIndex = o.VectorIndex; - break; - } - } /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const override { return StartLoc; } @@ -4054,7 +3974,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) { if (FlagsVal == ~0U) return MatchOperand_NoMatch; - if (!hasThumb2DSP() && (FlagsVal & 0x400)) + if (!hasDSP() && (FlagsVal & 0x400)) // The _g and _nzcvqg versions are only valid if the DSP extension is // available. return MatchOperand_NoMatch; @@ -5202,6 +5122,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { // FALLTHROUGH } case AsmToken::Colon: { + S = Parser.getTok().getLoc(); // ":lower16:" and ":upper16:" expression prefixes // FIXME: Check it's an expression prefix, // e.g. (FOO - :lower16:BAR) isn't legal. @@ -5220,8 +5141,9 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return false; } case AsmToken::Equal: { + S = Parser.getTok().getLoc(); if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. 
ldr r0, =val) - return Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return Error(S, "unexpected token in operand"); Parser.Lex(); // Eat '=' const MCExpr *SubExprVal; @@ -5229,7 +5151,8 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return true; E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal); + const MCExpr *CPLoc = + getTargetStreamer().addConstantPoolEntry(SubExprVal, S); Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E)); return false; } @@ -5682,9 +5605,11 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON unsigned RegIdx = 3; if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32") { + (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) { if (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32") + (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16")) RegIdx = 4; if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() && @@ -8610,18 +8535,29 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR && inITBlock()) return Match_RequiresNotITBlock; + } else if (isThumbOne()) { + // Some high-register supporting Thumb1 encodings only allow both registers + // to be from r0-r7 when in Thumb2. + if (Opc == ARM::tADDhirr && !hasV6MOps() && + isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) + return Match_RequiresThumb2; + // Others only require ARMv6 or later. + else if (Opc == ARM::tMOVr && !hasV6Ops() && + isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg())) + return Match_RequiresV6; } - // Some high-register supporting Thumb1 encodings only allow both registers - // to be from r0-r7 when in Thumb2. - else if (Opc == ARM::tADDhirr && isThumbOne() && !hasV6MOps() && - isARMLowRegister(Inst.getOperand(1).getReg()) && - isARMLowRegister(Inst.getOperand(2).getReg())) - return Match_RequiresThumb2; - // Others only require ARMv6 or later. 
- else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() && - isARMLowRegister(Inst.getOperand(0).getReg()) && - isARMLowRegister(Inst.getOperand(1).getReg())) - return Match_RequiresV6; + + for (unsigned I = 0; I < MCID.NumOperands; ++I) + if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) { + // rGPRRegClass excludes PC, and also excluded SP before ARMv8 + if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops()) + return Match_RequiresV8; + else if (Inst.getOperand(I).getReg() == ARM::PC) + return Match_InvalidOperand; + } + return Match_Success; } @@ -8680,7 +8616,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return false; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: { assert(ErrorInfo && "Unknown missing feature!"); @@ -8720,6 +8656,8 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "instruction variant requires ARMv6 or later"); case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); + case Match_RequiresV8: + return Error(IDLoc, "instruction variant requires ARMv8 or later"); case Match_ImmRange0_15: { SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; @@ -8868,7 +8806,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) { return false; } - getParser().getStreamer().EmitValue(Value, Size); + getParser().getStreamer().EmitValue(Value, Size, L); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -9098,7 +9036,7 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) { bool ARMAsmParser::parseDirectiveArch(SMLoc L) { StringRef Arch = getParser().parseStringToEndOfStatement().trim(); - unsigned ID = ARMTargetParser::parseArch(Arch); + unsigned ID = ARM::parseArch(Arch); if (ID == ARM::AK_INVALID) { Error(L, "Unknown arch name"); @@ -9106,7 +9044,8 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) { } Triple T; - STI.setDefaultFeatures(T.getARMCPUForArch(Arch)); + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str()); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); getTargetStreamer().emitArch(ID); @@ -9233,12 +9172,13 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { // FIXME: This is using table-gen data, but should be moved to // ARMTargetParser once that is table-gen'd. 
- if (!STI.isCPUStringValid(CPU)) { + if (!getSTI().isCPUStringValid(CPU)) { Error(L, "Unknown CPU name"); return false; } - STI.setDefaultFeatures(CPU); + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures(CPU, ""); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); return false; @@ -9249,13 +9189,14 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) { SMLoc FPUNameLoc = getTok().getLoc(); StringRef FPU = getParser().parseStringToEndOfStatement().trim(); - unsigned ID = ARMTargetParser::parseFPU(FPU); + unsigned ID = ARM::parseFPU(FPU); std::vector<const char *> Features; - if (!ARMTargetParser::getFPUFeatures(ID, Features)) { + if (!ARM::getFPUFeatures(ID, Features)) { Error(FPUNameLoc, "Unknown FPU name"); return false; } + MCSubtargetInfo &STI = copySTI(); for (auto Feature : Features) STI.ApplyFeatureFlag(Feature); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -9895,7 +9836,7 @@ bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) { SMLoc ArchLoc = Parser.getTok().getLoc(); getLexer().Lex(); - unsigned ID = ARMTargetParser::parseArch(Arch); + unsigned ID = ARM::parseArch(Arch); if (ID == ARM::AK_INVALID) { Error(ArchLoc, "unknown architecture '" + Arch + "'"); @@ -9976,22 +9917,22 @@ extern "C" void LLVMInitializeARMAsmParser() { // when we start to table-generate them, and we can use the ARM // flags below, that were generated by table-gen. static const struct { - const ARM::ArchExtKind Kind; - const unsigned ArchCheck; + const unsigned Kind; + const uint64_t ArchCheck; const FeatureBitset Features; } Extensions[] = { { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} }, { ARM::AEK_CRYPTO, Feature_HasV8, {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} }, - { ARM::AEK_HWDIV, Feature_HasV7 | Feature_IsNotMClass, + { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} }, { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} }, { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} }, - // FIXME: Also available in ARMv6-K - { ARM::AEK_SEC, Feature_HasV7, {ARM::FeatureTrustZone} }, + { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, // FIXME: Unsupported extensions. { ARM::AEK_OS, Feature_None, {} }, { ARM::AEK_IWMMXT, Feature_None, {} }, @@ -10020,7 +9961,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { EnableFeature = false; Name = Name.substr(2); } - unsigned FeatureKind = ARMTargetParser::parseArchExt(Name); + unsigned FeatureKind = ARM::parseArchExt(Name); if (FeatureKind == ARM::AEK_INVALID) Error(ExtLoc, "unknown architectural extension: " + Name); @@ -10037,6 +9978,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { return false; } + MCSubtargetInfo &STI = copySTI(); FeatureBitset ToggleFeatures = EnableFeature ? 
(~STI.getFeatureBits() & Extension.Features) : ( STI.getFeatureBits() & Extension.Features); @@ -10078,6 +10020,10 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, "expression value must be representable in 32 bits"); } break; + case MCK_rGPR: + if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP) + return Match_Success; + break; case MCK_GPRPair: if (Op.isReg() && MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg())) diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 097ec04..e63defe 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -59,7 +59,7 @@ namespace { } // Called when decoding an IT instruction. Sets the IT state for the following - // instructions that form the IT block. Firstcond and Mask correspond to the + // instructions that form the IT block. Firstcond and Mask correspond to the // fields in the IT instruction encoding. void setITState(char Firstcond, char Mask) { // (3 - the number of trailing zeros) is the number of then / else. @@ -459,21 +459,18 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // VFP and NEON instructions, similarly, are shared between ARM // and Thumb modes. - MI.clear(); Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -485,7 +482,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -497,7 +493,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -509,7 +504,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -517,7 +511,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -525,7 +518,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Size = 0; return MCDisassembler::Fail; } @@ -718,7 +710,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this, STI); if (Result) { @@ -729,7 +720,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -763,7 +753,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, uint32_t Insn32 = (Bytes[3] << 8)
| (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16); - MI.clear(); Result = decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -774,7 +763,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -784,7 +772,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - MI.clear(); Result = decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -794,7 +781,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - MI.clear(); Result = decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -803,7 +789,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - MI.clear(); Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -814,7 +799,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) { - MI.clear(); uint32_t NEONLdStInsn = Insn32; NEONLdStInsn &= 0xF0FFFFFF; NEONLdStInsn |= 0x04000000; @@ -828,7 +812,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 24, 4) == 0xF) { - MI.clear(); uint32_t NEONDataInsn = Insn32; NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24 NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 @@ -841,7 +824,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); uint32_t NEONCryptoInsn = Insn32; NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24 NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 @@ -853,7 +835,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); uint32_t NEONv8Insn = Insn32; NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26 Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address, @@ -864,7 +845,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - MI.clear(); Size = 0; return MCDisassembler::Fail; } @@ -902,7 +882,7 @@ static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - + if (RegNo == 15) S = MCDisassembler::SoftFail; @@ -986,8 +966,13 @@ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - if (RegNo == 13 || RegNo == 15) + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15) S = MCDisassembler::SoftFail; + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); return S; } @@ -1147,7 +1132,7 @@ static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, unsigned imm = fieldFromInstruction(Val, 7, 5); // Register-immediate - if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + 
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; ARM_AM::ShiftOpc Shift = ARM_AM::lsl; @@ -1658,7 +1643,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, case ARM::STRD_POST: if (P == 0 && W == 1) S = MCDisassembler::SoftFail; - + if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2)) S = MCDisassembler::SoftFail; if (type && Rm == 15) @@ -4131,7 +4116,7 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, // indicates the move for the GE{3:0} bits, the mask{0} bit can be set // only if the processor includes the DSP extension. if (Mask == 0 || (Mask != 2 && ValLow > 3) || - (!(FeatureBits[ARM::FeatureDSPThumb2]) && (Mask & 1))) + (!(FeatureBits[ARM::FeatureDSP]) && (Mask & 1))) S = MCDisassembler::SoftFail; } } @@ -5065,6 +5050,10 @@ static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { + const FeatureBitset &featureBits = + ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction(Insn, 22, 1) << 4); unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0); @@ -5075,10 +5064,35 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; - // VMOVv2f32 is ambiguous with these decodings. - if (!(imm & 0x38) && cmode == 0xF) { - if (op == 1) return MCDisassembler::Fail; - Inst.setOpcode(ARM::VMOVv2f32); + // If the top 3 bits of imm are clear, this is a VMOV (immediate) + if (!(imm & 0x38)) { + if (cmode == 0xF) { + if (op == 1) return MCDisassembler::Fail; + Inst.setOpcode(ARM::VMOVv2f32); + } + if (hasFullFP16) { + if (cmode == 0xE) { + if (op == 1) { + Inst.setOpcode(ARM::VMOVv1i64); + } else { + Inst.setOpcode(ARM::VMOVv8i8); + } + } + if (cmode == 0xD) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv2i32); + } else { + Inst.setOpcode(ARM::VMOVv2i32); + } + } + if (cmode == 0xC) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv2i32); + } else { + Inst.setOpcode(ARM::VMOVv2i32); + } + } + } return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); } @@ -5095,6 +5109,10 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { + const FeatureBitset &featureBits = + ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction(Insn, 22, 1) << 4); unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0); @@ -5105,10 +5123,35 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; - // VMOVv4f32 is ambiguous with these decodings. 
- if (!(imm & 0x38) && cmode == 0xF) { - if (op == 1) return MCDisassembler::Fail; - Inst.setOpcode(ARM::VMOVv4f32); + // If the top 3 bits of imm are clear, this is a VMOV (immediate) + if (!(imm & 0x38)) { + if (cmode == 0xF) { + if (op == 1) return MCDisassembler::Fail; + Inst.setOpcode(ARM::VMOVv4f32); + } + if (hasFullFP16) { + if (cmode == 0xE) { + if (op == 1) { + Inst.setOpcode(ARM::VMOVv2i64); + } else { + Inst.setOpcode(ARM::VMOVv16i8); + } + } + if (cmode == 0xD) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv4i32); + } else { + Inst.setOpcode(ARM::VMOVv4i32); + } + } + if (cmode == 0xC) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv4i32); + } else { + Inst.setOpcode(ARM::VMOVv4i32); + } + } + } return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); } @@ -5132,7 +5175,7 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, unsigned Rm = fieldFromInstruction(Val, 0, 4); Rm |= (fieldFromInstruction(Val, 23, 1) << 4); unsigned Cond = fieldFromInstruction(Val, 28, 4); - + if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt) S = MCDisassembler::SoftFail; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 0bff521..33fc85a 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -804,7 +805,7 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, unsigned Opcode = MI->getOpcode(); // For writes, handle extended mask bits if the DSP extension is present. 
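// Worked detail, not part of the patch: both the parser change earlier
// (hasThumb2DSP() -> hasDSP()) and the printer hunk below hinge on the same
// encoding bit. Bit 0x400 in the special-register value selects the "_g"
// forms (apsr_g, apsr_nzcvqg, ...) that write the GE[3:0] bits, and those
// forms exist only when the DSP extension is present. Restated (helper name
// is illustrative):
static bool maskNeedsDSP(unsigned SYSm) {
  return (SYSm & 0x400) != 0;   // the "_g" bit tested in these hunks
}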
- if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSPThumb2]) { + if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) { switch (SYSm) { case 0x400: O << "apsr_g"; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index 3927c9f..52f7115 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class ARMInstPrinter : public MCInstPrinter { public: ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1114635..fa52c93 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -25,13 +25,17 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MachO.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -180,9 +184,8 @@ bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const { return false; } -bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { +const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const { switch ((unsigned)Fixup.getKind()) { case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the @@ -192,7 +195,9 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, // // Relax if the value is too big for a (signed) i8. int64_t Offset = int64_t(Value) - 4; - return Offset > 2046 || Offset < -2048; + if (Offset > 2046 || Offset < -2048) + return "out of range pc-relative fixup value"; + break; } case ARM::fixup_arm_thumb_bcc: { // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the @@ -202,23 +207,40 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, // // Relax if the value is too big for a (signed) i8. int64_t Offset = int64_t(Value) - 4; - return Offset > 254 || Offset < -256; + if (Offset > 254 || Offset < -256) + return "out of range pc-relative fixup value"; + break; } case ARM::fixup_thumb_adr_pcrel_10: case ARM::fixup_arm_thumb_cp: { // If the immediate is negative, greater than 1020, or not a multiple // of four, the wide version of the instruction must be used. int64_t Offset = int64_t(Value) - 4; - return Offset > 1020 || Offset < 0 || Offset & 3; + if (Offset & 3) + return "misaligned pc-relative fixup value"; + else if (Offset > 1020 || Offset < 0) + return "out of range pc-relative fixup value"; + break; } - case ARM::fixup_arm_thumb_cb: + case ARM::fixup_arm_thumb_cb: { // If we have a Thumb CBZ or CBNZ instruction and its target is the next // instruction, it is actually out of range for the instruction.
// It will be changed to a NOP. int64_t Offset = (Value & ~1); - return Offset == 2; + if (Offset == 2) + return "will be converted to nop"; + break; } - llvm_unreachable("Unexpected fixup kind in fixupNeedsRelaxation()!"); + default: + llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!"); + } + return nullptr; +} + +bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + return reasonForFixupRelaxation(Fixup, Value); } void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { @@ -317,9 +339,10 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, return Value; } -static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, - bool IsPCRel, MCContext *Ctx, - bool IsLittleEndian) { +unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + bool IsPCRel, MCContext *Ctx, + bool IsLittleEndian, + bool IsResolved) const { unsigned Kind = Fixup.getKind(); switch (Kind) { default: @@ -372,8 +395,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = -Value; isAdd = false; } - if (Ctx && Value >= 4096) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 4096) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, @@ -383,8 +408,6 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return Value; } - case ARM::fixup_thumb_adr_pcrel_10: - return ((Value - 4) >> 2) & 0xff; case ARM::fixup_arm_adr_pcrel_12: { // ARM PC-relative values are offset by 8. Value -= 8; @@ -393,8 +416,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = -Value; opc = 2; // 0b0010 } - if (Ctx && ARM_AM::getSOImmVal(Value) == -1) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && ARM_AM::getSOImmVal(Value) == -1) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } // Encode the immediate and shift the opcode into place. return ARM_AM::getSOImmVal(Value) | (opc << 21); } @@ -517,21 +542,44 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, ((uint16_t)imm10LBits) << 1); return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian); } + case ARM::fixup_thumb_adr_pcrel_10: case ARM::fixup_arm_thumb_cp: - // Offset by 4, and don't encode the low two bits. Two bytes of that - // 'off by 4' is implicitly handled by the half-word ordering of the - // Thumb encoding, so we only need to adjust by 2 here. - return ((Value - 2) >> 2) & 0xff; + // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we + // could have an error on our hands. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } + // Offset by 4, and don't encode the low two bits. + return ((Value - 4) >> 2) & 0xff; case ARM::fixup_arm_thumb_cb: { // Offset by 4 and don't encode the lower bit, which is always 0. + // FIXME: diagnose if no Thumb2 uint32_t Binary = (Value - 4) >> 1; return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3); } case ARM::fixup_arm_thumb_br: // Offset by 4 and don't encode the lower bit, which is always 0. 
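// Worked example, not part of the patch (helper name illustrative): the
// "offset by 4" in these Thumb fixup cases is the PC bias. When the branch
// executes, PC reads as the instruction address + 4, so the encoder stores
// (Value - 4) and then drops bit 0, which is always zero. For a tB target
// 260 bytes ahead:
//   Offset = 260 - 4 = 256; encoded field = 256 >> 1 = 128
// which fits the 11-bit field; offsets outside roughly +/-2 KiB are what
// reasonForFixupRelaxation() above reports as "out of range".
static unsigned encodeThumbBrField(unsigned long long Value) {
  return ((Value - 4) >> 1) & 0x7ff;   // same arithmetic as the case below
}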
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } return ((Value - 4) >> 1) & 0x7ff; case ARM::fixup_arm_thumb_bcc: // Offset by 4 and don't encode the lower bit, which is always 0. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } return ((Value - 4) >> 1) & 0xff; case ARM::fixup_arm_pcrel_10_unscaled: { Value = Value - 8; // ARM fixups offset by an additional word and don't @@ -542,8 +590,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, isAdd = false; } // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8]. - if (Ctx && Value >= 256) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value = (Value & 0xf) | ((Value & 0xf0) << 4); return Value | (isAdd << 23); } @@ -561,8 +611,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } // These values don't encode the low two bits since they're always zero. Value >>= 2; - if (Ctx && Value >= 256) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords @@ -582,6 +634,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, const MCValue &Target, uint64_t &Value, bool &IsResolved) { const MCSymbolRefExpr *A = Target.getSymA(); + const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; // Some fixups to thumb function symbols need the low bit (thumb bit) // twiddled. if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && @@ -590,18 +643,21 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { - if (A) { - const MCSymbol &Sym = A->getSymbol(); - if (Asm.isThumbFunc(&Sym)) + if (Sym) { + if (Asm.isThumbFunc(Sym)) Value |= 1; } } - // For Thumb1 BL instruction, it is possible to be a long jump between - // the basic blocks of the same function. Thus, we would like to resolve - // the offset when the destination has the same MCFragment. - if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { - const MCSymbol &Sym = A->getSymbol(); - IsResolved = (Sym.getFragment() == DF); + if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { + assert(Sym && "How did we resolve this?"); + + // If the symbol is external the linker will handle it. + // FIXME: Should we handle it as an optimization? + + // If the symbol is out of range, produce a relocation and hope the + // linker can handle it. GNU AS produces an error in this case. 
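// Worked equation, flagged as an interpretation rather than something the
// patch states: the 0x400004 bound below appears to be the +/-4 MiB reach of
// the 22-bit Thumb BL halfword offset plus the 4-byte PC bias, i.e.
//   4 * 1024 * 1024 + 4 == 0x400004.
// Targets at or beyond that distance (or external symbols) are left
// unresolved so a relocation is emitted and the linker decides, instead of
// the assembler erroring out the way GNU as does.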
+ if (Sym->isExternal() || Value >= 0x400004) + IsResolved = false; } // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination @@ -616,7 +672,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // the instruction. This allows adjustFixupValue() to issue a diagnostic // if the value is invalid. (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(), - IsLittleEndian); + IsLittleEndian, IsResolved); } /// getFixupKindNumBytes - The number of bytes the fixup may change. @@ -719,7 +775,8 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian); + Value = + adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true); if (!Value) return; // Doesn't change encoding. @@ -743,6 +800,249 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + UNWIND_ARM_MODE_MASK = 0x0F000000, + UNWIND_ARM_MODE_FRAME = 0x01000000, + UNWIND_ARM_MODE_FRAME_D = 0x02000000, + UNWIND_ARM_MODE_DWARF = 0x04000000, + + UNWIND_ARM_FRAME_STACK_ADJUST_MASK = 0x00C00000, + + UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001, + UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002, + UNWIND_ARM_FRAME_FIRST_PUSH_R6 = 0x00000004, + + UNWIND_ARM_FRAME_SECOND_PUSH_R8 = 0x00000008, + UNWIND_ARM_FRAME_SECOND_PUSH_R9 = 0x00000010, + UNWIND_ARM_FRAME_SECOND_PUSH_R10 = 0x00000020, + UNWIND_ARM_FRAME_SECOND_PUSH_R11 = 0x00000040, + UNWIND_ARM_FRAME_SECOND_PUSH_R12 = 0x00000080, + + UNWIND_ARM_FRAME_D_REG_COUNT_MASK = 0x00000F00, + + UNWIND_ARM_DWARF_SECTION_OFFSET = 0x00FFFFFF +}; + +} // end CU namespace + +/// Generate compact unwind encoding for the function based on the CFI +/// instructions. If the CFI instructions describe a frame that cannot be +/// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which +/// tells the runtime to fall back and unwind using DWARF. +uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n"); + // Only armv7k uses CFI-based unwinding. + if (Subtype != MachO::CPU_SUBTYPE_ARM_V7K) + return 0; + // No .cfi directives means no frame. + if (Instrs.empty()) + return 0; + // Start off assuming CFA is at SP+0. + int CFARegister = ARM::SP; + int CFARegisterOffset = 0; + // Mark savable registers as initially unsaved + DenseMap<unsigned, int> RegOffsets; + int FloatRegCount = 0; + // Process each .cfi directive and build up compact unwind info.
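// Worked example, not part of the patch, assuming the usual armv7k prologue.
// For a standard frame:
//     push {r7, lr}        ->  .cfi_def_cfa_offset 8
//                              .cfi_offset lr, -4
//                              .cfi_offset r7, -8
//     mov  r7, sp          ->  .cfi_def_cfa_register r7
// the loop below finishes with CFARegister == r7 and CFARegisterOffset == 8,
// so StackAdjust == 8 - 8 == 0, and the lr/r7 checks that follow
// (lr at -4 - StackAdjust, r7 at -8 - StackAdjust) accept the frame as
// UNWIND_ARM_MODE_FRAME.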
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + int Reg; + const MCCFIInstruction &Inst = Instrs[i]; + switch (Inst.getOperation()) { + case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa + CFARegisterOffset = -Inst.getOffset(); + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset + CFARegisterOffset = -Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpOffset: // DW_CFA_offset + Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + RegOffsets[Reg] = Inst.getOffset(); + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { + RegOffsets[Reg] = Inst.getOffset(); + ++FloatRegCount; + } else { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << ".cfi_offset on unknown register=" + << Inst.getRegister() << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + break; + case MCCFIInstruction::OpRelOffset: // DW_CFA_advance_loc + // Ignore + break; + default: + // Directive not convertible to compact unwind, bail out. + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "CFI directive not compatible with compact " "unwind encoding, opcode=" << Inst.getOperation() + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + break; + } + } + + // If no frame set up, return no unwind info. + if ((CFARegister == ARM::SP) && (CFARegisterOffset == 0)) + return 0; + + // Verify standard frame (lr/r7) was used. + if (CFARegister != ARM::R7) { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " + << CFARegister + << " instead of r7\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + int StackAdjust = CFARegisterOffset - 8; + if (RegOffsets.lookup(ARM::LR) != (-4 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "LR not saved as standard frame, StackAdjust=" + << StackAdjust + << ", CFARegisterOffset=" << CFARegisterOffset + << ", lr save at offset=" << RegOffsets[14] << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + if (RegOffsets.lookup(ARM::R7) != (-8 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "r7 not saved as standard frame\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + uint32_t CompactUnwindEncoding = CU::UNWIND_ARM_MODE_FRAME; + + // If var-args are used, there may be a stack adjust required. + switch (StackAdjust) { + case 0: + break; + case 4: + CompactUnwindEncoding |= 0x00400000; + break; + case 8: + CompactUnwindEncoding |= 0x00800000; + break; + case 12: + CompactUnwindEncoding |= 0x00C00000; + break; + default: + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() + << ".cfi_def_cfa stack adjust (" + << StackAdjust << ") out of range\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // If r6 is saved, it must be right below r7.
+ static struct { + unsigned Reg; + unsigned Encoding; + } GPRCSRegs[] = {{ARM::R6, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R6}, + {ARM::R5, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R5}, + {ARM::R4, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R4}, + {ARM::R12, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R12}, + {ARM::R11, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R11}, + {ARM::R10, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R10}, + {ARM::R9, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R9}, + {ARM::R8, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R8}}; + + int CurOffset = -8 - StackAdjust; + for (auto CSReg : GPRCSRegs) { + auto Offset = RegOffsets.find(CSReg.Reg); + if (Offset == RegOffsets.end()) + continue; + + int RegOffset = Offset->second; + if (RegOffset != CurOffset - 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << MRI.getName(CSReg.Reg) << " saved at " + << RegOffset << " but only supported at " + << CurOffset << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CompactUnwindEncoding |= CSReg.Encoding; + CurOffset -= 4; + } + + // If no floats saved, we are done. + if (FloatRegCount == 0) + return CompactUnwindEncoding; + + // Switch mode to include D register saving. + CompactUnwindEncoding &= ~CU::UNWIND_ARM_MODE_MASK; + CompactUnwindEncoding |= CU::UNWIND_ARM_MODE_FRAME_D; + + // FIXME: supporting more than 4 saved D-registers compactly would be trivial, + // but needs coordination with the linker and libunwind. + if (FloatRegCount > 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "unsupported number of D registers saved (" + << FloatRegCount << ")\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // Floating point registers must either be saved sequentially, or we defer to + // DWARF. No gaps allowed here so check that each saved d-register is + // precisely where it should be. + static unsigned FPRCSRegs[] = { ARM::D8, ARM::D10, ARM::D12, ARM::D14 }; + for (int Idx = FloatRegCount - 1; Idx >= 0; --Idx) { + auto Offset = RegOffsets.find(FPRCSRegs[Idx]); + if (Offset == RegOffsets.end()) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " not saved\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } else if (Offset->second != CurOffset - 8) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " saved at " << Offset->second + << ", expected at " << CurOffset - 8 + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CurOffset -= 8; + } + + return CompactUnwindEncoding | ((FloatRegCount - 1) << 8); +} + +static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) { + unsigned AK = ARM::parseArch(Arch); + switch (AK) { + default: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV4T: + return MachO::CPU_SUBTYPE_ARM_V4T; + case ARM::AK_ARMV5T: + case ARM::AK_ARMV5TE: + case ARM::AK_ARMV5TEJ: + return MachO::CPU_SUBTYPE_ARM_V5; + case ARM::AK_ARMV6: + case ARM::AK_ARMV6K: + return MachO::CPU_SUBTYPE_ARM_V6; + case ARM::AK_ARMV7A: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV7S: + return MachO::CPU_SUBTYPE_ARM_V7S; + case ARM::AK_ARMV7K: + return MachO::CPU_SUBTYPE_ARM_V7K; + case ARM::AK_ARMV6M: + return MachO::CPU_SUBTYPE_ARM_V6M; + case ARM::AK_ARMV7M: + return MachO::CPU_SUBTYPE_ARM_V7M; + case ARM::AK_ARMV7EM: + return MachO::CPU_SUBTYPE_ARM_V7EM; + } +} + MCAsmBackend *llvm::createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TheTriple, StringRef CPU, @@ -751,19 +1051,8 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, default: 
llvm_unreachable("unsupported object format"); case Triple::MachO: { - MachO::CPUSubTypeARM CS = - StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName()) - .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T) - .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ) - .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6) - .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M) - .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM) - .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K) - .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M) - .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S) - .Default(MachO::CPU_SUBTYPE_ARM_V7); - - return new ARMAsmBackendDarwin(T, TheTriple, CS); + MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName()); + return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS); } case Triple::COFF: assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 6b4abd5..28a6213 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -45,6 +45,10 @@ public: const MCValue &Target, uint64_t &Value, bool &IsResolved) override; + unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel, + MCContext *Ctx, bool IsLittleEndian, + bool IsResolved) const; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; @@ -52,6 +56,9 @@ public: bool mayNeedRelaxation(const MCInst &Inst) const override; + const char *reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h index a6206e3..995dd0f 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h @@ -16,11 +16,12 @@ using namespace llvm; namespace { class ARMAsmBackendDarwin : public ARMAsmBackend { + const MCRegisterInfo &MRI; public: const MachO::CPUSubTypeARM Subtype; ARMAsmBackendDarwin(const Target &T, const Triple &TT, - MachO::CPUSubTypeARM st) - : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) { + const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st) + : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) { HasDataInCodeSupport = true; } @@ -28,6 +29,9 @@ public: return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM, Subtype); } + + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override; }; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 804d353..52eba8be 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -95,7 +95,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTTPOFF: Type = ELF::R_ARM_TLS_IE32; break; - case MCSymbolRefExpr::VK_GOTPCREL: + case MCSymbolRefExpr::VK_ARM_GOT_PREL: Type = ELF::R_ARM_GOT_PREL; break; } @@ -192,7 +192,7 @@ unsigned 
ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTOFF: Type = ELF::R_ARM_GOTOFF32; break; - case MCSymbolRefExpr::VK_GOTPCREL: + case MCSymbolRefExpr::VK_ARM_GOT_PREL: Type = ELF::R_ARM_GOT_PREL; break; case MCSymbolRefExpr::VK_ARM_TARGET1: diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index d17fdb9..57577dc 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -79,7 +79,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void emitAttribute(unsigned Attribute, unsigned Value) override; void emitTextAttribute(unsigned Attribute, StringRef String) override; void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, - StringRef StrinValue) override; + StringRef StringValue) override; void emitArch(unsigned Arch) override; void emitArchExtension(unsigned ArchExt) override; void emitObjectArch(unsigned Arch) override; @@ -195,16 +195,16 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute, OS << "\n"; } void ARMTargetAsmStreamer::emitArch(unsigned Arch) { - OS << "\t.arch\t" << ARMTargetParser::getArchName(Arch) << "\n"; + OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n"; } void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) { - OS << "\t.arch_extension\t" << ARMTargetParser::getArchExtName(ArchExt) << "\n"; + OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n"; } void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) { - OS << "\t.object_arch\t" << ARMTargetParser::getArchName(Arch) << '\n'; + OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n'; } void ARMTargetAsmStreamer::emitFPU(unsigned FPU) { - OS << "\t.fpu\t" << ARMTargetParser::getFPUName(FPU) << "\n"; + OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n"; } void ARMTargetAsmStreamer::finishAttributeSection() { } @@ -243,7 +243,7 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, class ARMTargetELFStreamer : public ARMTargetStreamer { private: // This structure holds all attributes, accounting for - // their string/numeric value, so we can later emmit them + // their string/numeric value, so we can later emit them // in declaration order, keeping all in the same vector struct AttributeItem { enum { @@ -254,7 +254,7 @@ private: } Type; unsigned Tag; unsigned IntValue; - StringRef StringValue; + std::string StringValue; static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) { // The conformance tag must be emitted first when serialised @@ -388,6 +388,9 @@ private: size_t calculateContentSize() const; + // Reset state between object emissions + void reset() override; + public: ARMTargetELFStreamer(MCStreamer &S) : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID), @@ -415,7 +418,7 @@ public: MCCodeEmitter *Emitter, bool IsThumb) : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) { - Reset(); + EHReset(); } ~ARMELFStreamer() {} @@ -507,14 +510,15 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if /// necessary. 
- void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) - if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) - getContext().reportFatalError(Loc, "relocated expression must be 32-bit"); + if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) { + getContext().reportError(Loc, "relocated expression must be 32-bit"); + return; + } EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); } void EmitAssemblerFlag(MCAssemblerFlag Flag) override { @@ -578,7 +582,10 @@ private: } // Helper functions for ARM exception handling directives - void Reset(); + void EHReset(); + + // Reset state between object emissions + void reset() override; void EmitPersonalityFixup(StringRef Name); void FlushPendingOffset(); @@ -684,16 +691,16 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { using namespace ARMBuildAttrs; setAttributeItem(CPU_name, - ARMTargetParser::getCPUAttr(Arch), + ARM::getCPUAttr(Arch), false); if (EmittedArch == ARM::AK_INVALID) setAttributeItem(CPU_arch, - ARMTargetParser::getArchAttr(Arch), + ARM::getArchAttr(Arch), false); else setAttributeItem(CPU_arch, - ARMTargetParser::getArchAttr(EmittedArch), + ARM::getArchAttr(EmittedArch), false); switch (Arch) { @@ -702,7 +709,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV3: case ARM::AK_ARMV3M: case ARM::AK_ARMV4: - case ARM::AK_ARMV5: setAttributeItem(ARM_ISA_use, Allowed, false); break; @@ -710,7 +716,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV5T: case ARM::AK_ARMV5TE: case ARM::AK_ARMV6: - case ARM::AK_ARMV6J: setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, Allowed, false); break; @@ -721,8 +726,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { break; case ARM::AK_ARMV6K: - case ARM::AK_ARMV6Z: - case ARM::AK_ARMV6ZK: + case ARM::AK_ARMV6KZ: setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, Allowed, false); setAttributeItem(Virtualization_use, AllowTZ, false); @@ -732,10 +736,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { setAttributeItem(THUMB_ISA_use, Allowed, false); break; - case ARM::AK_ARMV7: - setAttributeItem(THUMB_ISA_use, AllowThumb32, false); - break; - case ARM::AK_ARMV7A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); @@ -755,6 +755,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); @@ -1045,6 +1046,8 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { getStreamer().emitInst(Inst, Suffix); } +void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } + void ARMELFStreamer::FinishImpl() { MCTargetStreamer &TS = *getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); @@ -1053,6 +1056,18 @@ void ARMELFStreamer::FinishImpl() { MCELFStreamer::FinishImpl(); } +void ARMELFStreamer::reset() { + MCTargetStreamer &TS = *getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + ATS.reset(); + MappingSymbolCounter = 0; 
+ MCELFStreamer::reset();
+ // MCELFStreamer clears the assembler's e_flags. However, for
+ // ARM we manually set the ABI version on streamer creation, so
+ // do the same here.
+ getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+}
+
 inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
 unsigned Type,
 unsigned Flags,
@@ -1084,19 +1099,14 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
 }
 inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
- SwitchToEHSection(".ARM.extab",
- ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC,
- SectionKind::getDataRel(),
- FnStart);
+ SwitchToEHSection(".ARM.extab", ELF::SHT_PROGBITS, ELF::SHF_ALLOC,
+ SectionKind::getData(), FnStart);
 }
 inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
- SwitchToEHSection(".ARM.exidx",
- ELF::SHT_ARM_EXIDX,
+ SwitchToEHSection(".ARM.exidx", ELF::SHT_ARM_EXIDX,
 ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
- SectionKind::getDataRel(),
- FnStart);
+ SectionKind::getData(), FnStart);
 }
 void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
 MCDataFragment *Frag = getOrCreateDataFragment();
@@ -1104,7 +1114,7 @@ void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
 Kind));
 }
-void ARMELFStreamer::Reset() {
+void ARMELFStreamer::EHReset() {
 ExTab = nullptr;
 FnStart = nullptr;
 Personality = nullptr;
@@ -1174,7 +1184,7 @@ void ARMELFStreamer::emitFnEnd() {
 SwitchSection(&FnStart->getSection());
 // Clean exception handling frame information
- Reset();
+ EHReset();
 }
 void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 1ac0815..bda37f6 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -33,7 +33,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
 SupportsDebugInformation = true;
 // Exceptions handling
- ExceptionsType = ExceptionHandling::SjLj;
+ ExceptionsType = TheTriple.isOSDarwin() && !TheTriple.isWatchOS()
+ ?
ExceptionHandling::SjLj + : ExceptionHandling::DwarfCFI; UseIntegratedAssembler = true; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h index 99a5fff..5e54816 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -19,34 +19,37 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; - - class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); - }; - - class ARMELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit ARMELFMCAsmInfo(const Triple &TT); - - void setUseIntegratedAssembler(bool Value) override; - }; - - class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor() override; - public: - explicit ARMCOFFMCAsmInfoMicrosoft(); - }; - - class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { - void anchor() override; - public: - explicit ARMCOFFMCAsmInfoGNU(); - }; +class Triple; + +class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); +}; + +class ARMELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit ARMELFMCAsmInfo(const Triple &TT); + + void setUseIntegratedAssembler(bool Value) override; +}; + +class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoMicrosoft(); +}; + +class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoGNU(); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index 9146d4d..75dde80 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -63,8 +63,8 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS ARMMCExprs at the moment. 
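(The ARMMCTargetDesc.cpp hunk below drops the hand-written sub-arch switch in ParseARMTriple in favour of the llvm/Support/TargetParser.h lookups. A minimal sketch of the new derivation, using only the parseArch/getArchName calls visible in the hunk; buildArchFeature is a hypothetical helper, not part of the patch:)

// Sketch only -- mirrors the logic added to ARM_MC::ParseARMTriple below.
#include <string>
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/TargetParser.h"

static std::string buildArchFeature(llvm::StringRef ArchName,
                                    llvm::StringRef CPU) {
  // parseArch maps e.g. "armv6m" to ARM::AK_ARMV6M; unknown names yield
  // AK_INVALID, and a specific CPU overrides the arch-derived features.
  unsigned ArchID = llvm::ARM::parseArch(ArchName);
  if (ArchID == llvm::ARM::AK_INVALID || !(CPU.empty() || CPU == "generic"))
    return std::string();
  // The canonical architecture name becomes a single "+<arch>" subtarget
  // feature string, e.g. "+armv6m".
  return ("+" + llvm::ARM::getArchName(ArchID)).str();
}

(For a generic CPU this replaces the per-sub-arch feature lists of the deleted switch with one arch-named feature, leaving exact feature selection to the subtarget.)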
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 21c9fc1..8c8c249 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -134,101 +135,11 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { bool isThumb = TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb; - bool NoCPU = CPU == "generic" || CPU.empty(); std::string ARMArchFeature; - switch (TT.getSubArch()) { - default: - llvm_unreachable("invalid sub-architecture for ARM"); - case Triple::ARMSubArch_v8: - if (NoCPU) - // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, - // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, - // FeatureT2XtPk, FeatureCrypto, FeatureCRC - ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," - "+trustzone,+t2xtpk,+crypto,+crc"; - else - // Use CPU to figure out the exact features - ARMArchFeature = "+v8"; - break; - case Triple::ARMSubArch_v8_1a: - if (NoCPU) - // v8.1a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, - // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, - // FeatureT2XtPk, FeatureCrypto, FeatureCRC, FeatureV8_1a - ARMArchFeature = "+v8.1a,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," - "+trustzone,+t2xtpk,+crypto,+crc"; - else - // Use CPU to figure out the exact features - ARMArchFeature = "+v8.1a"; - break; - case Triple::ARMSubArch_v7m: - isThumb = true; - if (NoCPU) - // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7em: - if (NoCPU) - // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, - // FeatureT2XtPk, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,+t2xtpk,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7s: - if (NoCPU) - // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS - // Swift - ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7: - // v7 CPUs have lots of different feature sets. If no CPU is specified, - // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return - // the "minimum" feature set and use CPU string to figure out the exact - // features. - if (NoCPU) - // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk - ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; - else - // Use CPU to figure out the exact features. 
- ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v6t2: - ARMArchFeature = "+v6t2"; - break; - case Triple::ARMSubArch_v6k: - ARMArchFeature = "+v6k"; - break; - case Triple::ARMSubArch_v6m: - isThumb = true; - if (NoCPU) - // v6m: FeatureNoARM, FeatureMClass - ARMArchFeature = "+v6m,+noarm,+mclass"; - else - ARMArchFeature = "+v6"; - break; - case Triple::ARMSubArch_v6: - ARMArchFeature = "+v6"; - break; - case Triple::ARMSubArch_v5te: - ARMArchFeature = "+v5te"; - break; - case Triple::ARMSubArch_v5: - ARMArchFeature = "+v5t"; - break; - case Triple::ARMSubArch_v4t: - ARMArchFeature = "+v4t"; - break; - case Triple::NoSubArch: - break; - } + + unsigned ArchID = ARM::parseArch(TT.getArchName()); + if (ArchID != ARM::AK_INVALID && (CPU.empty() || CPU == "generic")) + ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str(); if (isThumb) { if (ARMArchFeature.empty()) diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index fd30623..c2bbc8e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -86,7 +86,8 @@ MCAsmBackend *createThumbBEAsmBackend(const Target &T, // object file. MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll); + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible); /// Construct an ELF Mach-O object writer. MCObjectWriter *createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 95d7ea7..cfd504e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -150,10 +150,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + A->getName() + "' can not be undefined in a subtraction expression"); + return; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint32_t Value2 = 0; @@ -163,10 +165,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + B->getSymbol().getName() + "' can not be undefined in a subtraction expression"); + return; + } // Select the appropriate difference relocation type. Type = MachO::ARM_RELOC_HALF_SECTDIFF; @@ -251,10 +255,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. 
const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + A->getName() + "' can not be undefined in a subtraction expression"); + return; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -265,10 +271,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols"); const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + B->getSymbol().getName() + "' can not be undefined in a subtraction expression"); + return; + } // Select the appropriate difference relocation type. Type = MachO::ARM_RELOC_SECTDIFF; @@ -346,13 +354,15 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned Log2Size; unsigned RelocType = MachO::ARM_RELOC_VANILLA; - if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) + if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) { // If we failed to get fixup kind info, it's because there's no legal // relocation type for the fixup kind. This happens when it's a fixup that's // expected to always be resolvable at assembly time and not have any // relocations needed. - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation on symbol"); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation on symbol"); + return; + } // If this is a difference or a defined symbol plus an offset, then we need a // scattered relocation entry. Differences always require scattered diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index b680db5..c0d10c8 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -27,8 +27,8 @@ ARMTargetStreamer::~ARMTargetStreamer() {} // The constant pool handling is shared by all ARMTargetStreamer // implementations. -const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) { - return ConstantPools->addEntry(Streamer, Expr, 4); +const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, 4, Loc); } void ARMTargetStreamer::emitCurrentConstantPool() { @@ -38,6 +38,9 @@ void ARMTargetStreamer::emitCurrentConstantPool() { // finish() - write out any non-empty assembler constant pools. void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); } +// reset() - Reset any state +void ARMTargetStreamer::reset() {} + // The remaining callbacks should be handled separately by each // streamer. 
void ARMTargetStreamer::emitFnStart() {} diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index b993b1b..83fa084 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -37,11 +37,11 @@ void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { } } -MCStreamer *llvm::createARMWinCOFFStreamer(MCContext &Context, - MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll) { - return new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); +MCStreamer *llvm::createARMWinCOFFStreamer( + MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, bool IncrementalLinkerCompatible) { + auto *S = new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; } diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 3b4358b..93e0ac4 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -13,6 +13,7 @@ #include "Thumb1FrameLowering.h" #include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -84,7 +85,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -100,7 +100,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, assert(NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; @@ -168,8 +172,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { ++MBBI; - if (MBBI != MBB.end()) - dl = MBBI->getDebugLoc(); } // Determine starting offsets of spill areas. @@ -232,11 +234,10 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, } } - // Adjust FP so it point to the stack slot that contains the previous FP. 
if (HasFP) { - FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) - + GPRCS1Size + ArgRegsSaveSize; + FramePtrOffsetInBlock += + MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize; AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4) .setMIFlags(MachineInstr::FrameSetup)); @@ -321,11 +322,8 @@ static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert((MBBI->getOpcode() == ARM::tBX_RET || - MBBI->getOpcode() == ARM::tPOP_RET) && - "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const ThumbRegisterInfo *RegInfo = @@ -377,9 +375,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, ARM::SP) .addReg(FramePtr)); } else { - if (MBBI->getOpcode() == ARM::tBX_RET && - &MBB.front() != MBBI && - std::prev(MBBI)->getOpcode() == ARM::tPOP) { + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) { MachineBasicBlock::iterator PMBBI = std::prev(MBBI); if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes)) emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); @@ -388,66 +385,189 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, } } - bool IsV4PopReturn = false; - for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo()) + if (needPopSpecialFixUp(MF)) { + bool Done = emitPopSpecialFixUp(MBB, /* DoIt */ true); + (void)Done; + assert(Done && "Emission of the special fixup failed!?"); + } +} + +bool Thumb1FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + if (!needPopSpecialFixUp(*MBB.getParent())) + return true; + + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + return emitPopSpecialFixUp(*TmpMBB, /* DoIt */ false); +} + +bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { + ARMFunctionInfo *AFI = + const_cast<MachineFunction *>(&MF)->getInfo<ARMFunctionInfo>(); + if (AFI->getArgRegsSaveSize()) + return true; + + // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. + for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) - IsV4PopReturn = true; - IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps(); - - // Unlike T2 and ARM mode, the T1 pop instruction cannot restore - // to LR, and we can't pop the value directly to the PC since - // we need to update the SP after popping the value. So instead - // we have to emit: - // POP {r3} - // ADD sp, #offset - // BX r3 - // If this would clobber a return value, then generate this sequence instead: - // MOV ip, r3 - // POP {r3} - // ADD sp, #offset - // MOV lr, r3 - // MOV r3, ip - // BX lr - if (ArgRegsSaveSize || IsV4PopReturn) { - // Get the last instruction, tBX_RET - MBBI = MBB.getLastNonDebugInstr(); - assert (MBBI->getOpcode() == ARM::tBX_RET); - DebugLoc dl = MBBI->getDebugLoc(); - - if (AFI->getReturnRegsCount() <= 3) { - // Epilogue: pop saved LR to R3 and branch off it. 
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) - .addReg(ARM::R3, RegState::Define); - - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); - - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX)) - .addReg(ARM::R3, RegState::Kill); - AddDefaultPred(MIB); - MIB.copyImplicitOps(&*MBBI); - // erase the old tBX_RET instruction - MBB.erase(MBBI); - } else { - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::R12, RegState::Define) - .addReg(ARM::R3, RegState::Kill)); + return true; + + return false; +} + +bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, + bool DoIt) const { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) - .addReg(ARM::R3, RegState::Define); + // If MBBI is a return instruction, or is a tPOP followed by a return + // instruction in the successor BB, we may be able to directly restore + // LR in the PC. + // This is only possible with v5T ops (v4T can't change the Thumb bit via + // a POP PC instruction), and only if we do not need to emit any SP update. + // Otherwise, we need a temporary register to pop the value + // and copy that value into LR. + auto MBBI = MBB.getFirstTerminator(); + bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize; + if (CanRestoreDirectly) { + if (MBBI != MBB.end() && MBBI->getOpcode() != ARM::tB) + CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET); + else { + auto MBBI_prev = MBBI; + MBBI_prev--; + assert(MBBI_prev->getOpcode() == ARM::tPOP); + assert(MBB.succ_size() == 1); + if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET) + MBBI = MBBI_prev; // Replace the final tPOP with a tPOP_RET. + else + CanRestoreDirectly = false; + } + } - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); + if (CanRestoreDirectly) { + if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET) + return true; + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))); + // Copy implicit ops and popped registers, if any. + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef())) + MIB.addOperand(MO); + MIB.addReg(ARM::PC, RegState::Define); + // Erase the old instruction (tBX_RET or tPOP). + MBB.erase(MBBI); + return true; + } - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::LR, RegState::Define) - .addReg(ARM::R3, RegState::Kill)); + // Look for a temporary register to use. + // First, compute the liveness information. + LivePhysRegs UsedRegs(STI.getRegisterInfo()); + UsedRegs.addLiveOuts(&MBB, /*AddPristines*/ true); + // The semantic of pristines changed recently and now, + // the callee-saved registers that are touched in the function + // are not part of the pristines set anymore. + // Add those callee-saved now. + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + for (unsigned i = 0; CSRegs[i]; ++i) + UsedRegs.addReg(CSRegs[i]); + + DebugLoc dl = DebugLoc(); + if (MBBI != MBB.end()) { + dl = MBBI->getDebugLoc(); + auto InstUpToMBBI = MBB.end(); + while (InstUpToMBBI != MBBI) + // The pre-decrement is on purpose here. 
+ // We want to have the liveness right before MBBI.
+ UsedRegs.stepBackward(*--InstUpToMBBI);
+ }
 
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(ARM::R3, RegState::Define)
- .addReg(ARM::R12, RegState::Kill));
- // Keep the tBX_RET instruction
+ // Look for a register that can be directly used in the POP.
+ unsigned PopReg = 0;
+ // And some temporary register, just in case.
+ unsigned TemporaryReg = 0;
+ BitVector PopFriendly =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+ assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
+ // Rebuild the GPRs from the high registers because they are removed
+ // from the GPR reg class for thumb1.
+ BitVector GPRsNoLRSP =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+ GPRsNoLRSP |= PopFriendly;
+ GPRsNoLRSP.reset(ARM::LR);
+ GPRsNoLRSP.reset(ARM::SP);
+ GPRsNoLRSP.reset(ARM::PC);
+ for (int Register = GPRsNoLRSP.find_first(); Register != -1;
+ Register = GPRsNoLRSP.find_next(Register)) {
+ if (!UsedRegs.contains(Register)) {
+ // Remember the first pop-friendly register and exit.
+ if (PopFriendly.test(Register)) {
+ PopReg = Register;
+ TemporaryReg = 0;
+ break;
+ }
+ // Otherwise, remember that the register will be available to
+ // save a pop-friendly register.
+ TemporaryReg = Register;
 }
 }
+
+ if (!DoIt && !PopReg && !TemporaryReg)
+ return false;
+
+ assert((PopReg || TemporaryReg) && "Cannot get LR");
+
+ if (TemporaryReg) {
+ assert(!PopReg && "Unnecessary MOV is about to be inserted");
+ PopReg = PopFriendly.find_first();
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(TemporaryReg, RegState::Define)
+ .addReg(PopReg, RegState::Kill));
+ }
+
+ if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) {
+ // We couldn't use the direct restoration above, so
+ // perform the opposite conversion: tPOP_RET to tPOP.
+ MachineInstrBuilder MIB =
+ AddDefaultPred(
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP)));
+ bool Popped = false;
+ for (auto MO: MBBI->operands())
+ if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
+ MO.getReg() != ARM::PC) {
+ MIB.addOperand(MO);
+ if (!MO.isImplicit())
+ Popped = true;
+ }
+ // Is there anything left to pop?
+ if (!Popped)
+ MBB.erase(MIB.getInstr());
+ // Erase the old instruction.
+ MBB.erase(MBBI);
+ MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET)));
+ }
+
+ assert(PopReg && "Do not know how to get LR");
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+ .addReg(PopReg, RegState::Define);
+
+ emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(PopReg, RegState::Kill));
+
+ if (TemporaryReg)
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(PopReg, RegState::Define)
+ .addReg(TemporaryReg, RegState::Kill));
+
+ return true;
 }
 
 bool Thumb1FrameLowering::
@@ -461,8 +581,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 DebugLoc DL;
 const TargetInstrInfo &TII = *STI.getInstrInfo();
 
- if (MI != MBB.end()) DL = MI->getDebugLoc();
-
 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
 AddDefaultPred(MIB);
 for (unsigned i = CSI.size(); i != 0; --i) {
@@ -501,31 +619,38 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 const TargetInstrInfo &TII = *STI.getInstrInfo();
 bool isVarArg = AFI->getArgRegsSaveSize() > 0;
- DebugLoc DL = MI->getDebugLoc();
+ DebugLoc DL = MI != MBB.end() ?
MI->getDebugLoc() : DebugLoc();
 MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
 AddDefaultPred(MIB);
 
- bool NumRegs = false;
+ bool NeedsPop = false;
 for (unsigned i = CSI.size(); i != 0; --i) {
 unsigned Reg = CSI[i-1].getReg();
 if (Reg == ARM::LR) {
- // Special epilogue for vararg functions. See emitEpilogue
- if (isVarArg)
- continue;
- // ARMv4T requires BX, see emitEpilogue
- if (STI.hasV4TOps() && !STI.hasV5TOps())
+ if (MBB.succ_empty()) {
+ // Special epilogue for vararg functions. See emitEpilogue
+ if (isVarArg)
+ continue;
+ // ARMv4T requires BX, see emitEpilogue
+ if (!STI.hasV5TOps())
+ continue;
+ Reg = ARM::PC;
+ (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+ if (MI != MBB.end())
+ MIB.copyImplicitOps(&*MI);
+ MI = MBB.erase(MI);
+ } else
+ // LR may only be popped into PC, as part of return sequence.
+ // If this isn't the return sequence, we'll need emitPopSpecialFixUp
+ // to restore LR the hard way.
 continue;
- Reg = ARM::PC;
- (*MIB).setDesc(TII.get(ARM::tPOP_RET));
- MIB.copyImplicitOps(&*MI);
- MI = MBB.erase(MI);
 }
 MIB.addReg(Reg, getDefRegState(true));
- NumRegs = true;
+ NeedsPop = true;
 }
 
 // It's illegal to emit pop instruction without operands.
- if (NumRegs)
+ if (NeedsPop)
 MBB.insert(MI, &*MIB);
 else
 MF.DeleteMachineInstr(MIB);
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
index 31d5732..27faac6 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -45,6 +45,47 @@ public:
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
 MachineBasicBlock::iterator MI) const override;
+
+ /// Check whether or not the given \p MBB can be used as an epilogue
+ /// for the target.
+ /// The epilogue will be inserted before the first terminator of that block.
+ /// This method is used by the shrink-wrapping pass to decide if
+ /// \p MBB will be correctly handled by the target.
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+ /// Disable shrink wrap as tBfar/BL will be used to adjust for long jumps.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return false;
+ }
+
+private:
+ /// Check if the frame lowering of \p MF needs a special fixup
+ /// code sequence for the epilogue.
+ /// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+ /// to LR, and we can't pop the value directly to the PC when
+ /// we need to update the SP after popping the value. So instead
+ /// we have to emit:
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// BX r3
+ /// If this would clobber a return value, then generate this sequence instead:
+ /// MOV ip, r3
+ /// POP {r3}
+ /// ADD sp, #offset
+ /// MOV lr, r3
+ /// MOV r3, ip
+ /// BX lr
+ bool needPopSpecialFixUp(const MachineFunction &MF) const;
+
+ /// Emit the special fixup code sequence for the epilogue.
+ /// \see needPopSpecialFixUp for more details.
+ /// \p DoIt tells this method whether or not to actually insert
+ /// the code sequence in \p MBB. I.e., when \p DoIt is false,
+ /// \p MBB is left untouched.
+ /// \returns For \p DoIt == true: True when the emission succeeded,
+ /// false otherwise. For \p DoIt == false: True when the emission
+ /// would have been possible, false otherwise.
+ bool emitPopSpecialFixUp(MachineBasicBlock &MBB, bool DoIt) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 216e776..530e1d3 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -84,11 +84,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); @@ -112,11 +110,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); } diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp index 68736bc..bf0498d 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -256,8 +256,8 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); // Finalize the bundle. 
- MachineBasicBlock::instr_iterator LI = LastITMI; - finalizeBundle(MBB, InsertPos.getInstrIterator(), std::next(LI)); + finalizeBundle(MBB, InsertPos.getInstrIterator(), + ++LastITMI->getIterator()); Modified = true; ++NumITs; diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index dc74f4e..4da769f 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -131,11 +131,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || @@ -171,11 +169,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index d9ab824..bcd0e57 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -125,7 +125,10 @@ namespace { { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, - // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent + // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent. + // tSTMIA_UPD is a change in semantics which can only be used if the base + // register is killed. This difference is correctly handled elsewhere. 
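+ // (tSTMIA_UPD also writes the incremented base address back to the base
+ // register; that writeback is only safe because the kill check performed
+ // in ReduceLoadStore guarantees the original base value is dead.)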
+ { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } }; @@ -210,12 +213,12 @@ Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor) for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { unsigned FromOpc = ReduceTable[i].WideOpc; if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) - assert(false && "Duplicated entries?"); + llvm_unreachable("Duplicated entries?"); } } static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { - for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) + for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) if (*Regs == ARM::CPSR) return true; return false; @@ -435,6 +438,14 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, isLdStMul = true; break; } + case ARM::t2STMIA: { + // If the base register is killed, we don't care what its value is after the + // instruction, so we can use an updating STMIA. + if (!MI->getOperand(0).isKill()) + return false; + + break; + } case ARM::t2LDMIA_RET: { unsigned BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) @@ -492,6 +503,12 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // Add the 16-bit load / store instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); + + // tSTMIA_UPD takes a defining register operand. We've already checked that + // the register is killed, so mark it as dead here. + if (Entry.WideOpc == ARM::t2STMIA) + MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead); + if (!isLdStMul) { MIB.addOperand(MI->getOperand(0)); MIB.addOperand(MI->getOperand(1)); @@ -633,10 +650,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; - if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && - STI->avoidMOVsShifterOperand()) + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) // Don't issue movs with shifter operand for some CPUs unless we - // are optimizing / minimizing for size. + // are optimizing for size. return false; unsigned Reg0 = MI->getOperand(0).getReg(); @@ -660,11 +676,13 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, } } else if (Reg0 != Reg1) { // Try to commute the operands to make it a 2-address instruction. - unsigned CommOpIdx1, CommOpIdx2; + unsigned CommOpIdx1 = 1; + unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex; if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || - CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0) + MI->getOperand(CommOpIdx2).getReg() != Reg0) return false; - MachineInstr *CommutedMI = TII->commuteInstruction(MI); + MachineInstr *CommutedMI = + TII->commuteInstruction(MI, false, CommOpIdx1, CommOpIdx2); if (!CommutedMI) return false; } @@ -750,10 +768,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) return false; - if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && - STI->avoidMOVsShifterOperand()) + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) // Don't issue movs with shifter operand for some CPUs unless we - // are optimizing / minimizing for size. + // are optimizing for size. 
return false; unsigned Limit = ~0U; @@ -1012,9 +1029,9 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo()); - // Optimizing / minimizing size? - OptimizeSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); - MinimizeSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); + // Optimizing / minimizing size? Minimizing size implies optimizing for size. + OptimizeSize = MF.getFunction()->optForSize(); + MinimizeSize = MF.getFunction()->optForMinSize(); BlockInfo.clear(); BlockInfo.resize(MF.getNumBlockIDs()); diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h new file mode 100644 index 0000000..4c1667e --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVR.h @@ -0,0 +1,54 @@ +//===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// AVR back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_H +#define LLVM_AVR_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" + +namespace llvm { + +class AVRTargetMachine; +class FunctionPass; + +FunctionPass *createAVRISelDag(AVRTargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAVRExpandPseudoPass(); +FunctionPass *createAVRFrameAnalyzerPass(); +FunctionPass *createAVRDynAllocaSRPass(); +FunctionPass *createAVRBranchSelectionPass(); + +/** + * Contains the AVR backend. + */ +namespace AVR { + +enum AddressSpace { DataMemory, ProgramMemory }; + +template <typename T> bool isProgramMemoryAddress(T *V) { + return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory; +} + +inline bool isProgramMemoryAccess(MemSDNode const *N) { + auto V = N->getMemOperand()->getValue(); + + return (V != nullptr) ? isProgramMemoryAddress(V) : false; +} + +} // end of namespace AVR + +} // end namespace llvm + +#endif // LLVM_AVR_H diff --git a/contrib/llvm/lib/Target/AVR/AVR.td b/contrib/llvm/lib/Target/AVR/AVR.td new file mode 100644 index 0000000..9e80717 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVR.td @@ -0,0 +1,563 @@ +//===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// This is the top level entry point for the AVR target. +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===---------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===---------------------------------------------------------------------===// +// AVR Subtarget Features. 
+//===---------------------------------------------------------------------===//
+
+// :TODO: Implement the skip errata, see `gcc/config/avr/avr-arch.h` for details
+// :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD.
+// In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`.
+// avr2 (with SRAM) adds the rest of the variants.
+// :TODO: s/AVRTiny/Tiny
+
+
+// A feature set aggregates features, grouping them. We don't want to create a
+// new member in AVRSubtarget (to store a value) for each set because we do not
+// care if the set is supported, only the subfeatures inside the set. We fix
+// this by simply setting the same dummy member for all feature sets, which is
+// then ignored.
+class FeatureSet<string name, string desc, list<SubtargetFeature> i>
+ : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>;
+
+// A family of microcontrollers, defining a set of supported features.
+class Family<string name, list<SubtargetFeature> i>
+ : FeatureSet<name, !strconcat("The device is a part of the ",
+ name, " family"), i>;
+
+// The device has SRAM, and supports the bare minimum of
+// SRAM-relevant instructions.
+//
+// These are:
+// LD - all 9 variants
+// ST - all 9 variants
+// LDD - two variants for Y and Z
+// STD - two variants for Y and Z
+// `LDS Rd, K`
+// `STS k, Rr`
+// `PUSH`/`POP`
+def FeatureSRAM : SubtargetFeature<"sram", "m_hasSRAM", "true",
+ "The device has random access memory">;
+
+// The device supports the `JMP k` and `CALL k` instructions.
+def FeatureJMPCALL : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true",
+ "The device supports the `JMP` and "
+ "`CALL` instructions">;
+
+
+// The device supports the indirect branches `IJMP` and `ICALL`.
+def FeatureIJMPCALL : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL",
+ "true",
+ "The device supports `IJMP`/`ICALL` "
+ "instructions">;
+
+// The device supports the extended indirect branches `EIJMP` and `EICALL`.
+def FeatureEIJMPCALL : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL",
+ "true", "The device supports the "
+ "`EIJMP`/`EICALL` instructions">;
+
+// The device supports the `ADIW Rd, K` and `SBIW Rd, K` instructions.
+def FeatureADDSUBIW : SubtargetFeature<"addsubiw", "m_hasADDSUBIW",
+ "true", "Enable 16-bit register-immediate "
+ "addition and subtraction instructions">;
+
+// The device has an 8-bit stack pointer (SP) register.
+def FeatureSmallStack : SubtargetFeature<"smallstack", "m_hasSmallStack",
+ "true", "The device has an 8-bit "
+ "stack pointer">;
+
+// The device supports the 16-bit GPR pair MOVW instruction.
+def FeatureMOVW : SubtargetFeature<"movw", "m_hasMOVW", "true",
+ "The device supports the 16-bit MOVW "
+ "instruction">;
+
+// The device supports the `LPM` instruction, with implied destination being r0.
+def FeatureLPM : SubtargetFeature<"lpm", "m_hasLPM", "true",
+ "The device supports the `LPM` instruction">;
+
+// The device supports the `LPM Rd, Z[+]` instruction.
+def FeatureLPMX : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
+ "The device supports the `LPM Rd, Z[+]` "
+ "instruction">;
+
+// The device supports the `ELPM` instruction.
+def FeatureELPM : SubtargetFeature<"elpm", "m_hasELPM", "true",
+ "The device supports the ELPM instruction">;
+
+// The device supports the `ELPM Rd, Z[+]` instructions.
+def FeatureELPMX : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
+ "The device supports the `ELPM Rd, Z[+]` "
+ "instructions">;
+
+// The device supports the `SPM` instruction.
+def FeatureSPM : SubtargetFeature<"spm", "m_hasSPM", "true", + "The device supports the `SPM` instruction">; + +// The device supports the `SPM Z+` instruction. +def FeatureSPMX : SubtargetFeature<"spmx", "m_hasSPMX", "true", + "The device supports the `SPM Z+` " + "instruction">; + +// The device supports the `DES k` instruction. +def FeatureDES : SubtargetFeature<"des", "m_hasDES", "true", + "The device supports the `DES k` encryption " + "instruction">; + +// The device supports the Read-Write-Modify instructions +// XCH, LAS, LAC, and LAT. +def FeatureRMW : SubtargetFeature<"rmw", "m_supportsRMW", "true", + "The device supports the read-write-modify " + "instructions: XCH, LAS, LAC, LAT">; + +// The device supports the `[F]MUL[S][U]` family of instructions. +def FeatureMultiplication : SubtargetFeature<"mul", "m_supportsMultiplication", + "true", "The device supports the " + "multiplication instructions">; + +// The device supports the `BREAK` instruction. +def FeatureBREAK : SubtargetFeature<"break", "m_hasBREAK", "true", + "The device supports the `BREAK` debugging " + "instruction">; + +// The device has instruction encodings specific to the Tiny core. +def FeatureTinyEncoding : SubtargetFeature<"tinyencoding", + "m_hasTinyEncoding", "true", + "The device has Tiny core specific " + "instruction encodings">; + +class ELFArch<string name> : SubtargetFeature<"", "ELFArch", + !strconcat("ELF::",name), "">; + +// ELF e_flags architecture values +def ELFArchAVR1 : ELFArch<"EF_AVR_ARCH_AVR1">; +def ELFArchAVR2 : ELFArch<"EF_AVR_ARCH_AVR2">; +def ELFArchAVR25 : ELFArch<"EF_AVR_ARCH_AVR25">; +def ELFArchAVR3 : ELFArch<"EF_AVR_ARCH_AVR3">; +def ELFArchAVR31 : ELFArch<"EF_AVR_ARCH_AVR31">; +def ELFArchAVR35 : ELFArch<"EF_AVR_ARCH_AVR35">; +def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">; +def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">; +def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">; +def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">; +def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">; +def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">; +def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">; +def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">; +def ELFArchXMEGA4 : ELFArch<"EF_AVR_ARCH_XMEGA4">; +def ELFArchXMEGA5 : ELFArch<"EF_AVR_ARCH_XMEGA5">; +def ELFArchXMEGA6 : ELFArch<"EF_AVR_ARCH_XMEGA6">; +def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">; + +//===---------------------------------------------------------------------===// +// AVR Families +//===---------------------------------------------------------------------===// + +// The device has at least the bare minimum that **every** single AVR +// device should have. 
+def FamilyAVR0 : Family<"avr0", []>;
+
+def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM]>;
+
+def FamilyAVR2 : Family<"avr2",
+                        [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW,
+                         FeatureSRAM]>;
+
+def FamilyAVR25 : Family<"avr25",
+                         [FamilyAVR2, FeatureMOVW, FeatureLPMX,
+                          FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR3 : Family<"avr3",
+                        [FamilyAVR2, FeatureJMPCALL]>;
+
+def FamilyAVR31 : Family<"avr31",
+                         [FamilyAVR3, FeatureELPM]>;
+
+def FamilyAVR35 : Family<"avr35",
+                         [FamilyAVR3, FeatureMOVW, FeatureLPMX,
+                          FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR4 : Family<"avr4",
+                        [FamilyAVR2, FeatureMultiplication,
+                         FeatureMOVW, FeatureLPMX, FeatureSPM,
+                         FeatureBREAK]>;
+
+def FamilyAVR5 : Family<"avr5",
+                        [FamilyAVR3, FeatureMultiplication,
+                         FeatureMOVW, FeatureLPMX, FeatureSPM,
+                         FeatureBREAK]>;
+
+def FamilyAVR51 : Family<"avr51",
+                         [FamilyAVR5, FeatureELPM, FeatureELPMX]>;
+
+def FamilyAVR6 : Family<"avr6",
+                        [FamilyAVR51]>;
+
+def FamilyAVRTiny : Family<"avrtiny",
+                           [FamilyAVR0, FeatureBREAK, FeatureSRAM,
+                            FeatureTinyEncoding]>;
+
+def FamilyXMEGA : Family<"xmega",
+                         [FamilyAVR51, FeatureEIJMPCALL, FeatureSPMX,
+                          FeatureDES]>;
+
+def FamilyXMEGAU : Family<"xmegau",
+                          [FamilyXMEGA, FeatureRMW]>;
+
+def FeatureSetSpecial : FeatureSet<"special",
+                                   "Enable use of the entire instruction "
+                                   "set - used for debugging",
+                                   [FeatureSRAM, FeatureJMPCALL,
+                                    FeatureIJMPCALL, FeatureEIJMPCALL,
+                                    FeatureADDSUBIW, FeatureMOVW,
+                                    FeatureLPM, FeatureLPMX, FeatureELPM,
+                                    FeatureELPMX, FeatureSPM, FeatureSPMX,
+                                    FeatureDES, FeatureRMW,
+                                    FeatureMultiplication, FeatureBREAK]>;
+
+//===---------------------------------------------------------------------===//
+// AVR microcontrollers supported.
+//===---------------------------------------------------------------------===//
+
+class Device<string Name, Family Fam, ELFArch Arch,
+             list<SubtargetFeature> ExtraFeatures = []>
+  : Processor<Name, NoItineraries, !listconcat([Fam,Arch],ExtraFeatures)>;
+
+// Generic MCUs
+// Note that several versions of GCC have strange ELF architecture
+// settings for backwards compatibility - see `gas/config/tc-avr.c`
+// in AVR binutils. We do not replicate this.
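+//
+// Editor's sketch (hypothetical MCU name, not defined by this patch): a
+// device offering more than its family guarantees lists the extras in the
+// fourth parameter of `Device`, e.g.
+//
+//   def : Device<"examplemcu", FamilyAVR2, ELFArchAVR25,
+//                [FeatureMOVW, FeatureLPMX]>;
+//
+// which is the same pattern the at86rf401 definition below uses.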
+def : Device<"avr1", FamilyAVR1, ELFArchAVR1>; +def : Device<"avr2", FamilyAVR2, ELFArchAVR2>; +def : Device<"avr25", FamilyAVR25, ELFArchAVR25>; +def : Device<"avr3", FamilyAVR3, ELFArchAVR3>; +def : Device<"avr31", FamilyAVR31, ELFArchAVR31>; +def : Device<"avr35", FamilyAVR35, ELFArchAVR35>; +def : Device<"avr4", FamilyAVR4, ELFArchAVR4>; +def : Device<"avr5", FamilyAVR5, ELFArchAVR5>; +def : Device<"avr51", FamilyAVR51, ELFArchAVR51>; +def : Device<"avr6", FamilyAVR6, ELFArchAVR6>; +def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>; +def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"avrxmega3", FamilyXMEGA, ELFArchXMEGA3>; +def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>; +def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>; +def : Device<"avrtiny", FamilyAVRTiny, ELFArchAVRTiny>; + +// Specific MCUs +def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>; +def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>; +def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>; +def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>; +def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>; +def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25, + [FeatureMOVW, FeatureLPMX]>; +def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>; +def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny861a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>; +def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>; +def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>; +def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>; +def : 
Device<"at43usb320", FamilyAVR31, ELFArchAVR31>; +def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>; +def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>; +def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>; +def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega8", FamilyAVR4, ELFArchAVR4>; // FIXME: family may be wrong +def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega8a", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega161", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega162", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega163", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega164a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega164p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega164pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>; +def : 
Device<"atmega325p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega406", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega640", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64rfr2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644rfr2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hva", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hva2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hvb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm216", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm316", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32c1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64c1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16u4", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32u4", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32u6", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90usb646", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90usb647", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90scr100", FamilyAVR5, ELFArchAVR5>; +def : Device<"at94k", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX]>; +def : Device<"m3000", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega128", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128a", 
FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1280", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1281", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284p", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128rfa1", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128rfr2", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284rfr2", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90can128", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90usb1286", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90usb1287", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega2560", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega2561", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega256rfr2", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega2564rfr2", FamilyAVR6, ELFArchAVR6>; +def : Device<"atxmega16a4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega16a4u", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega16e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega8e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64b1", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64b3", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64c3", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64d3", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64d4", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64a1", FamilyXMEGA, ELFArchXMEGA5>; +def : Device<"atxmega64a1u", FamilyXMEGAU, ELFArchXMEGA5>; +def : Device<"atxmega128a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128b1", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128b3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128d4", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega192a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega192a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega192c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega192d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256a3b", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3bu", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega384c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>; +def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>; +def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>; +def : Device<"attiny4", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny5", FamilyAVRTiny, ELFArchAVRTiny>; +def : 
Device<"attiny9", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny10", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny20", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny40", FamilyAVRTiny, ELFArchAVRTiny>; + +//===---------------------------------------------------------------------===// +// Register File Description +//===---------------------------------------------------------------------===// + +include "AVRRegisterInfo.td" + +//===---------------------------------------------------------------------===// +// Instruction Descriptions +//===---------------------------------------------------------------------===// + +//include "AVRInstrInfo.td" + +//def AVRInstrInfo : InstrInfo; + +//===---------------------------------------------------------------------===// +// Calling Conventions +//===---------------------------------------------------------------------===// + +include "AVRCallingConv.td" + +//===---------------------------------------------------------------------===// +// Assembly Printers +//===---------------------------------------------------------------------===// + +// def AVRAsmWriter : AsmWriter { +// string AsmWriterClassName = "InstPrinter"; +// bit isMCAsmWriter = 1; +// } + +//===---------------------------------------------------------------------===// +// Assembly Parsers +//===---------------------------------------------------------------------===// + +// def AVRAsmParser : AsmParser { +// let ShouldEmitMatchRegisterName = 1; +// let ShouldEmitMatchRegisterAltName = 1; +// } + +// def AVRAsmParserVariant : AsmParserVariant { +// int Variant = 0; +// +// // Recognize hard coded registers. +// string RegisterPrefix = "$"; +// } + +//===---------------------------------------------------------------------===// +// Target Declaration +//===---------------------------------------------------------------------===// + +def AVR : Target { +// let InstructionSet = AVRInstrInfo; +// let AssemblyWriters = [AVRAsmWriter]; +// +// let AssemblyParsers = [AVRAsmParser]; +// let AssemblyParserVariants = [AVRAsmParserVariant]; +} + diff --git a/contrib/llvm/lib/Target/AVR/AVRCallingConv.td b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td new file mode 100644 index 0000000..d8cb3fe --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td @@ -0,0 +1,65 @@ +//===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for AVR architecture. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AVR Return Value Calling Convention +//===----------------------------------------------------------------------===// + +def RetCC_AVR : CallingConv +<[ + // i8 is returned in R24. + CCIfType<[i8], CCAssignToReg<[R24]>>, + + // i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18. + CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>> +]>; + +// Special return value calling convention for runtime functions. 
+def RetCC_AVR_RT : CallingConv
+<[
+  CCIfType<[i8], CCAssignToReg<[R24,R25]>>,
+  CCIfType<[i16], CCAssignToReg<[R23R22, R25R24]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// The calling conventions are implemented in custom C++ code.
+
+// Calling convention for variadic functions.
+def ArgCC_AVR_Vararg : CallingConv
+<[
+  // i16 arguments are always passed on the stack with an alignment of 1.
+  CCAssignToStack<2, 1>
+]>;
+
+// Special argument calling convention for
+// multiplication runtime functions.
+def ArgCC_AVR_RT_MUL : CallingConv
+<[
+  CCIfType<[i16], CCAssignToReg<[R27R26,R19R18]>>
+]>;
+
+// Special argument calling convention for
+// division runtime functions.
+def ArgCC_AVR_RT_DIV : CallingConv
+<[
+  CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
+  CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>;
+def CSR_Interrupts : CalleeSavedRegs<(add (sequence "R%u", 31, 0))>;
diff --git a/contrib/llvm/lib/Target/AVR/AVRConfig.h b/contrib/llvm/lib/Target/AVR/AVRConfig.h
new file mode 100644
index 0000000..65588bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRConfig.h
@@ -0,0 +1,15 @@
+//===-- AVRConfig.h - AVR Backend Configuration Header ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_CONFIG_H
+#define LLVM_AVR_CONFIG_H
+
+#define LLVM_AVR_GCC_COMPAT
+
+#endif // LLVM_AVR_CONFIG_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
new file mode 100644
index 0000000..6571d5d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -0,0 +1,73 @@
+//===-- AVRMachineFunctionInfo.h - AVR machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AVR-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MACHINE_FUNCTION_INFO_H
+#define LLVM_AVR_MACHINE_FUNCTION_INFO_H
+
+#include "AVRConfig.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/**
+ * Contains AVR-specific information for each MachineFunction.
+ */
+class AVRMachineFunctionInfo : public MachineFunctionInfo {
+  /// Indicates if a register has been spilled by the register
+  /// allocator.
+  bool HasSpills;
+
+  /// Indicates if there are any fixed size allocas present.
+  /// Note that if there are only variable sized allocas this is set to false.
+  bool HasAllocas;
+
+  /// Indicates if arguments passed using the stack are being
+  /// used inside the function.
+  bool HasStackArgs;
+
+  /// Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize; + + /// FrameIndex for start of varargs area. + int VarArgsFrameIndex; + +public: + AVRMachineFunctionInfo() + : HasSpills(false), HasAllocas(false), HasStackArgs(false), + CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {} + + explicit AVRMachineFunctionInfo(MachineFunction &MF) + : HasSpills(false), HasAllocas(false), HasStackArgs(false), + CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {} + + bool getHasSpills() const { return HasSpills; } + void setHasSpills(bool B) { HasSpills = B; } + + bool getHasAllocas() const { return HasAllocas; } + void setHasAllocas(bool B) { HasAllocas = B; } + + bool getHasStackArgs() const { return HasStackArgs; } + void setHasStackArgs(bool B) { HasStackArgs = B; } + + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + void setCalleeSavedFrameSize(unsigned Bytes) { CalleeSavedFrameSize = Bytes; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } +}; + +} // end llvm namespace + +#endif // LLVM_AVR_MACHINE_FUNCTION_INFO_H diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td new file mode 100644 index 0000000..32650fc --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -0,0 +1,216 @@ +//===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the AVR register file +//===----------------------------------------------------------------------===// + +// 8-bit General purpose register definition. +class AVRReg<bits<16> num, + string name, + list<Register> subregs = [], + list<string> altNames = []> + : RegisterWithSubRegs<name, subregs> +{ + field bits<16> Num = num; + + let HWEncoding = num; + let Namespace = "AVR"; + let SubRegs = subregs; + let AltNames = altNames; +} + +// Subregister indices. 
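+// Each 16-bit pair defined below is covered by two 8-bit subregisters:
+// sub_lo selects bits 0-7, sub_hi bits 8-15. Editor's sketch of how backend
+// code consumes these indices (standard MCRegisterInfo API, not code from
+// this patch):
+//
+//   unsigned Lo = TRI.getSubReg(AVR::R25R24, AVR::sub_lo); // AVR::R24
+//   unsigned Hi = TRI.getSubReg(AVR::R25R24, AVR::sub_hi); // AVR::R25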
+let Namespace = "AVR" in +{ + def sub_lo : SubRegIndex<8>; + def sub_hi : SubRegIndex<8, 8>; +} + +let Namespace = "AVR" in { + def ptr : RegAltNameIndex; +} + + +//===----------------------------------------------------------------------===// +// 8-bit general purpose registers +//===----------------------------------------------------------------------===// + +def R0 : AVRReg<0, "r0">, DwarfRegNum<[0]>; +def R1 : AVRReg<1, "r1">, DwarfRegNum<[1]>; +def R2 : AVRReg<2, "r2">, DwarfRegNum<[2]>; +def R3 : AVRReg<3, "r3">, DwarfRegNum<[3]>; +def R4 : AVRReg<4, "r4">, DwarfRegNum<[4]>; +def R5 : AVRReg<5, "r5">, DwarfRegNum<[5]>; +def R6 : AVRReg<6, "r6">, DwarfRegNum<[6]>; +def R7 : AVRReg<7, "r7">, DwarfRegNum<[7]>; +def R8 : AVRReg<8, "r8">, DwarfRegNum<[8]>; +def R9 : AVRReg<9, "r9">, DwarfRegNum<[9]>; +def R10 : AVRReg<10, "r10">, DwarfRegNum<[10]>; +def R11 : AVRReg<11, "r11">, DwarfRegNum<[11]>; +def R12 : AVRReg<12, "r12">, DwarfRegNum<[12]>; +def R13 : AVRReg<13, "r13">, DwarfRegNum<[13]>; +def R14 : AVRReg<14, "r14">, DwarfRegNum<[14]>; +def R15 : AVRReg<15, "r15">, DwarfRegNum<[15]>; +def R16 : AVRReg<16, "r16">, DwarfRegNum<[16]>; +def R17 : AVRReg<17, "r17">, DwarfRegNum<[17]>; +def R18 : AVRReg<18, "r18">, DwarfRegNum<[18]>; +def R19 : AVRReg<19, "r19">, DwarfRegNum<[19]>; +def R20 : AVRReg<20, "r20">, DwarfRegNum<[20]>; +def R21 : AVRReg<21, "r21">, DwarfRegNum<[21]>; +def R22 : AVRReg<22, "r22">, DwarfRegNum<[22]>; +def R23 : AVRReg<23, "r23">, DwarfRegNum<[23]>; +def R24 : AVRReg<24, "r24">, DwarfRegNum<[24]>; +def R25 : AVRReg<25, "r25">, DwarfRegNum<[25]>; +def R26 : AVRReg<26, "r26">, DwarfRegNum<[26]>; +def R27 : AVRReg<27, "r27">, DwarfRegNum<[27]>; +def R28 : AVRReg<28, "r28">, DwarfRegNum<[28]>; +def R29 : AVRReg<29, "r29">, DwarfRegNum<[29]>; +def R30 : AVRReg<30, "r30">, DwarfRegNum<[30]>; +def R31 : AVRReg<31, "r31">, DwarfRegNum<[31]>; +def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>; +def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>; + +let SubRegIndices = [sub_lo, sub_hi], +CoveredBySubRegs = 1 in +{ + // 16 bit GPR pairs. + def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>; + + // The pointer registers (X,Y,Z) are a special case because they + // are printed as a `high:low` pair when a DREG is expected, + // but printed using `X`, `Y`, `Z` when a pointer register is expected. 
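+  // For example (editor's note), the Z pair would print as `r31:r30` where
+  // a 16-bit pair operand is expected, but as plain `Z` in a memory operand
+  // such as `ld r24, Z`; the `ptr` alt-name index declared earlier carries
+  // those short names.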
+  let RegAltNameIndices = [ptr] in {
+    def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+    def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+    def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+  }
+  def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
+  def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
+  def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
+  def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
+  def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
+  def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
+  def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
+  def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
+  def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>;
+  def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>;
+  def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
+  def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
+  def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+//:TODO: use proper set instructions instead of always using "add"
+
+// Main 8-bit register class.
+def GPR8 : RegisterClass<"AVR", [i8], 8,
+  (
+    // Return value and argument registers.
+    add R24, R25, R18, R19, R20, R21, R22, R23,
+    // Scratch registers.
+    R30, R31, R26, R27,
+    // Callee saved registers.
+    R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
+    R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+  )>;
+
+// Simple lower registers r0..r15
+def GPR8lo : RegisterClass<"AVR", [i8], 8,
+  (
+    add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+  )>;
+
+// 8-bit register class for instructions which take immediates.
+def LD8 : RegisterClass<"AVR", [i8], 8,
+  (
+    // Return value and arguments.
+    add R24, R25, R18, R19, R20, R21, R22, R23,
+    // Scratch registers.
+    R30, R31, R26, R27,
+    // Callee saved registers.
+    R28, R29, R17, R16
+  )>;
+
+// Simple lower registers r16..r23
+def LD8lo : RegisterClass<"AVR", [i8], 8,
+  (
+    add R23, R22, R21, R20, R19, R18, R17, R16
+  )>;
+
+// Main 16-bit pair register class.
+def DREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28, R17R16, R15R14, R13R12, R11R10,
+    R9R8, R7R6, R5R4, R3R2, R1R0
+  )>;
+
+// 16-bit register class for immediate instructions.
+def DLDREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28, R17R16
+  )>;
+
+// 16-bit register class for the adiw/sbiw instructions.
+def IWREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28
+  )>;
+
+// 16-bit register class for the ld and st instructions.
+// AKA X, Y, and Z.
+def PTRREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    add R27R26, // X
+        R29R28, // Y
+        R31R30  // Z
+  ), ptr>;
+
+// 16-bit register class for the ldd and std instructions.
+// AKA Y and Z.
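+// X is deliberately absent: the AVR displacement forms `LDD`/`STD` exist
+// only for Y and Z, e.g. `ldd r24, Y+2` is encodable but there is no
+// `ldd r24, X+2`.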
+def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, + ( + add R31R30, R29R28 + ), ptr>; + +// We have a bunch of instructions with an explicit Z register argument. We +// model this using a register class containing only the Z register. +// :TODO: Rename to 'ZREG'. +def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>; + +// Register class used for the stack read pseudo instruction. +def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>; + +//:TODO: if we remove this we get an error in tablegen +//:TODO: this is just a hack, remove it once add16 works! +// Status register. +def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>; +def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)> +{ + let CopyCost = -1; // Don't allow copying of status registers +} + diff --git a/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h new file mode 100644 index 0000000..ee832ad --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h @@ -0,0 +1,29 @@ +//===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AVR subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_SELECTION_DAG_INFO_H +#define LLVM_AVR_SELECTION_DAG_INFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { +/** + * Holds information about the AVR instruction selection DAG. + */ +class AVRSelectionDAGInfo : public TargetSelectionDAGInfo { +public: +}; + +} // end namespace llvm + +#endif // LLVM_AVR_SELECTION_DAG_INFO_H diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp new file mode 100644 index 0000000..a91dce8 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -0,0 +1,4 @@ + +extern "C" void LLVMInitializeAVRTarget() { + +} diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp new file mode 100644 index 0000000..85f03e8 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp @@ -0,0 +1,40 @@ +//===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AVRTargetObjectFile.h" + +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +#include "AVR.h" + +namespace llvm { +void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + Base::Initialize(Ctx, TM); + ProgmemDataSection = + Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); +} + +MCSection * +AVRTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const { + // Global values in flash memory are placed in the progmem.data section + // unless they already have a user assigned section. 
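+  // (Editor's note: the backend models flash as a separate pointer address
+  // space, so a global along the lines of
+  //   @table = addrspace(1) constant [2 x i8] c"\01\02"
+  // is what isProgramMemoryAddress() is expected to match here; the exact
+  // address-space number is an assumption, not spelled out in this patch.)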
+ if (AVR::isProgramMemoryAddress(GV) && !GV->hasSection()) + return ProgmemDataSection; + + // Otherwise, we work the same way as ELF. + return Base::SelectSectionForGlobal(GV, Kind, Mang, TM); +} +} // end of namespace llvm diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h new file mode 100644 index 0000000..bdda35b --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h @@ -0,0 +1,35 @@ +//===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_TARGET_OBJECT_FILE_H +#define LLVM_AVR_TARGET_OBJECT_FILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { +/** + * Lowering for an AVR ELF32 object file. + */ +class AVRTargetObjectFile : public TargetLoweringObjectFileELF { + typedef TargetLoweringObjectFileELF Base; + +public: + void Initialize(MCContext &ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; + +private: + MCSection *ProgmemDataSection; +}; + +} // end namespace llvm + +#endif // LLVM_AVR_TARGET_OBJECT_FILE_H diff --git a/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp new file mode 100644 index 0000000..c0e0d20 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp @@ -0,0 +1,25 @@ +//===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" + +namespace llvm { +Target TheAVRTarget; +} + +extern "C" void LLVMInitializeAVRTargetInfo() { + llvm::RegisterTarget<llvm::Triple::avr> X( + llvm::TheAVRTarget, "avr", "Atmel AVR Microcontroller"); +} + +// FIXME: Temporary stub - this function must be defined for linking +// to succeed. Remove once this function is properly implemented. +extern "C" void LLVMInitializeAVRTargetMC() { +} diff --git a/contrib/llvm/lib/Target/BPF/BPF.td b/contrib/llvm/lib/Target/BPF/BPF.td index a4ce90a..8493b0f 100644 --- a/contrib/llvm/lib/Target/BPF/BPF.td +++ b/contrib/llvm/lib/Target/BPF/BPF.td @@ -25,7 +25,14 @@ def BPFInstPrinter : AsmWriter { bit isMCAsmWriter = 1; } +def BPFAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "BPF"; + string BreakCharacters = "."; +} + def BPF : Target { let InstructionSet = BPFInstrInfo; let AssemblyWriters = [BPFInstPrinter]; + let AssemblyParserVariants = [BPFAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp index 7341828..6a5b37e 100644 --- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -547,8 +547,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. 
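   //
   // Editor's sketch of the CFG this inserter builds, following the usual
   // select-lowering pattern seen in other backends:
   //
   //     thisMBB
   //     |     \
   //     |   copy0MBB
   //     |     /
   //     sinkMBB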
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator I = BB;
-  ++I;
+  MachineFunction::iterator I = ++BB->getIterator();
 
   // ThisMBB:
   // ...
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
index adcaff6..4276d08 100644
--- a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
@@ -17,8 +17,6 @@
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {
-class MCOperand;
-
 class BPFInstPrinter : public MCInstPrinter {
 public:
   BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 36f9926..8c358ca 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -68,16 +68,23 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
   if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
     assert(Value == 0);
-    return;
-  }
-  assert(Fixup.getKind() == FK_PCRel_2);
-  Value = (uint16_t)((Value - 8) / 8);
-  if (IsLittleEndian) {
-    Data[Fixup.getOffset() + 2] = Value & 0xFF;
-    Data[Fixup.getOffset() + 3] = Value >> 8;
+  } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
+    unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
+
+    for (unsigned i = 0; i != Size; ++i) {
+      unsigned Idx = IsLittleEndian ? i : Size - i - 1;
+      Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
+    }
   } else {
-    Data[Fixup.getOffset() + 2] = Value >> 8;
-    Data[Fixup.getOffset() + 3] = Value & 0xFF;
+    assert(Fixup.getKind() == FK_PCRel_2);
+    Value = (uint16_t)((Value - 8) / 8);
+    if (IsLittleEndian) {
+      Data[Fixup.getOffset() + 2] = Value & 0xFF;
+      Data[Fixup.getOffset() + 3] = Value >> 8;
+    } else {
+      Data[Fixup.getOffset() + 2] = Value >> 8;
+      Data[Fixup.getOffset() + 3] = Value & 0xFF;
+    }
   }
 }
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 05ba618..87cdd5e 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -44,6 +44,10 @@ unsigned BPFELFObjectWriter::GetRelocType(const MCValue &Target,
     return ELF::R_X86_64_64;
   case FK_SecRel_4:
     return ELF::R_X86_64_PC32;
+  case FK_Data_8:
+    return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
+  case FK_Data_4:
+    return IsPCRel ?
ELF::R_X86_64_PC32 : ELF::R_X86_64_32; } } diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index d63bbf4..1f440fe 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -34,6 +34,8 @@ public: UsesELFSectionDirectiveForBSS = true; HasSingleParameterDotFile = false; HasDotTypeDotSizeDirective = false; + + SupportsDebugInformation = true; } }; } diff --git a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp index 272688e..5ea6551 100644 --- a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp +++ b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp @@ -551,7 +551,8 @@ void CppWriter::printAttributes(const AttributeSet &PAL, void CppWriter::printType(Type* Ty) { // We don't print definitions for primitive types if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || - Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy()) + Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy() || + Ty->isTokenTy()) return; // If we already defined this type, we don't need to define it again. @@ -1355,23 +1356,18 @@ void CppWriter::printInstruction(const Instruction *I, } case Instruction::GetElementPtr: { const GetElementPtrInst* gep = cast<GetElementPtrInst>(I); - if (gep->getNumOperands() <= 2) { - Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" - << opNames[0]; - if (gep->getNumOperands() == 2) - Out << ", " << opNames[1]; - } else { - Out << "std::vector<Value*> " << iName << "_indices;"; - nl(Out); - for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { - Out << iName << "_indices.push_back(" - << opNames[i] << ");"; - nl(Out); + Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" + << getCppName(gep->getSourceElementType()) << ", " << opNames[0] << ", {"; + in(); + for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { + if (i != 1) { + Out << ", "; } - Out << "Instruction* " << iName << " = GetElementPtrInst::Create(" - << opNames[0] << ", " << iName << "_indices"; + nl(Out); + Out << opNames[i]; } - Out << ", \""; + out(); + nl(Out) << "}, \""; printEscapedString(gep->getName()); Out << "\", " << bbname << ");"; break; @@ -1803,13 +1799,12 @@ void CppWriter::printFunctionBody(const Function *F) { << "->arg_begin();"; nl(Out); } - for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - Out << "Value* " << getCppName(AI) << " = args++;"; + for (const Argument &AI : F->args()) { + Out << "Value* " << getCppName(&AI) << " = args++;"; nl(Out); - if (AI->hasName()) { - Out << getCppName(AI) << "->setName(\""; - printEscapedString(AI->getName()); + if (AI.hasName()) { + Out << getCppName(&AI) << "->setName(\""; + printEscapedString(AI.getName()); Out << "\");"; nl(Out); } @@ -1818,29 +1813,25 @@ void CppWriter::printFunctionBody(const Function *F) { // Create all the basic blocks nl(Out); - for (Function::const_iterator BI = F->begin(), BE = F->end(); - BI != BE; ++BI) { - std::string bbname(getCppName(BI)); + for (const BasicBlock &BI : *F) { + std::string bbname(getCppName(&BI)); Out << "BasicBlock* " << bbname << " = BasicBlock::Create(mod->getContext(), \""; - if (BI->hasName()) - printEscapedString(BI->getName()); - Out << "\"," << getCppName(BI->getParent()) << ",0);"; + if (BI.hasName()) + printEscapedString(BI.getName()); + Out << "\"," << getCppName(BI.getParent()) << ",0);"; nl(Out); } 
  // Output all of its basic blocks... for the function
-  for (Function::const_iterator BI = F->begin(), BE = F->end();
-       BI != BE; ++BI) {
-    std::string bbname(getCppName(BI));
-    nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")";
+  for (const BasicBlock &BI : *F) {
+    std::string bbname(getCppName(&BI));
+    nl(Out) << "// Block " << BI.getName() << " (" << bbname << ")";
     nl(Out);
 
     // Output all of the instructions in the basic block...
-    for (BasicBlock::const_iterator I = BI->begin(), E = BI->end();
-         I != E; ++I) {
-      printInstruction(I,bbname);
-    }
+    for (const Instruction &I : BI)
+      printInstruction(&I, bbname);
   }
 
   // Loop over the ForwardRefs and resolve them now that all instructions
@@ -1883,7 +1874,7 @@ void CppWriter::printInline(const std::string& fname,
   printFunctionUses(F);
   printFunctionBody(F);
   is_inline = false;
-  Out << "return " << getCppName(F->begin()) << ";";
+  Out << "return " << getCppName(&F->front()) << ";";
   nl(Out) << "}";
   nl(Out);
 }
@@ -1896,17 +1887,14 @@ void CppWriter::printModuleBody() {
   // Functions can call each other and global variables can reference them so
   // define all the functions first before emitting their function bodies.
   nl(Out) << "// Function Declarations"; nl(Out);
-  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
-       I != E; ++I)
-    printFunctionHead(I);
+  for (const Function &I : *TheModule)
+    printFunctionHead(&I);
 
   // Process the global variable declarations. We can't initialize them until
   // after the constants are printed so just print a header for each global
   nl(Out) << "// Global Variable Declarations\n"; nl(Out);
-  for (Module::const_global_iterator I = TheModule->global_begin(),
-       E = TheModule->global_end(); I != E; ++I) {
-    printVariableHead(I);
-  }
+  for (const GlobalVariable &I : TheModule->globals())
+    printVariableHead(&I);
 
   // Print out all the constant definitions. Constants don't recurse except
   // through GlobalValues. All GlobalValues have been declared at this point
@@ -1918,21 +1906,18 @@ void CppWriter::printModuleBody() {
   // been emitted. These definitions just couple the gvars with their constant
   // initializers.
   nl(Out) << "// Global Variable Definitions"; nl(Out);
-  for (Module::const_global_iterator I = TheModule->global_begin(),
-       E = TheModule->global_end(); I != E; ++I) {
-    printVariableBody(I);
-  }
+  for (const GlobalVariable &I : TheModule->globals())
+    printVariableBody(&I);
 
   // Finally, we can safely put out all of the function bodies.
   nl(Out) << "// Function Definitions"; nl(Out);
-  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
-       I != E; ++I) {
-    if (!I->isDeclaration()) {
-      nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I)
+  for (const Function &I : *TheModule) {
+    if (!I.isDeclaration()) {
+      nl(Out) << "// Function: " << I.getName() << " (" << getCppName(&I)
               << ")";
       nl(Out) << "{";
      nl(Out,1);
-      printFunctionBody(I);
+      printFunctionBody(&I);
       nl(Out,-1) << "}";
       nl(Out);
     }
diff --git a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
new file mode 100644
index 0000000..a8622a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -0,0 +1,2152 @@
+//===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mcasmparser"
+
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCExpr.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
+#include "MCTargetDesc/HexagonShuffler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableFutureRegs("mfuture-regs",
+                                      cl::desc("Enable future registers"));
+
+static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis",
+cl::desc("Warn for missing parentheses around predicate registers"),
+cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis",
+cl::desc("Error for missing parentheses around predicate registers"),
+cl::init(false));
+static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch",
+cl::desc("Warn for mismatching a signed and unsigned value"),
+cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register",
+cl::desc("Warn for register names that aren't contiguous"),
+cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register",
+cl::desc("Error for register names that aren't contiguous"),
+cl::init(false));
+
+
+namespace {
+struct HexagonOperand;
+
+class HexagonAsmParser : public MCTargetAsmParser {
+
+  HexagonTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
+    return static_cast<HexagonTargetStreamer &>(TS);
+  }
+
+  MCAsmParser &Parser;
+  MCAssembler *Assembler;
+  MCInstrInfo const &MCII;
+  MCInst MCB;
+  bool InBrackets;
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAssembler *getAssembler() const { return Assembler; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  bool equalIsAsmAssignment() override { return false; }
+  bool isLabel(AsmToken &Token) override;
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+  bool ParseDirectiveFalign(unsigned Size, SMLoc L);
+
+  virtual bool ParseRegister(unsigned &RegNo,
+                             SMLoc &StartLoc,
+                             SMLoc &EndLoc) override;
+  bool ParseDirectiveSubsection(SMLoc L);
+  bool ParseDirectiveValue(unsigned Size, SMLoc L);
+  bool ParseDirectiveComm(bool IsLocal, SMLoc L);
+  bool RegisterMatchesArch(unsigned MatchNum) const;
+
+  bool matchBundleOptions();
+  bool
handleNoncontigiousRegister(bool Contigious, SMLoc &Loc); + bool finishBundle(SMLoc IDLoc, MCStreamer &Out); + void canonicalizeImmediates(MCInst &MCI); + bool matchOneInstruction(MCInst &MCB, SMLoc IDLoc, + OperandVector &InstOperands, uint64_t &ErrorInfo, + bool MatchingInlineAsm, bool &MustExtend); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, bool MatchingInlineAsm) override; + + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; + void OutOfRange(SMLoc IDLoc, long long Val, long long Max); + int processInstruction(MCInst &Inst, OperandVector const &Operands, + SMLoc IDLoc, bool &MustExtend); + + // Check if we have an assembler and, if so, set the ELF e_header flags. + void chksetELFHeaderEFlags(unsigned flags) { + if (getAssembler()) + getAssembler()->setELFHeaderEFlags(flags); + } + +/// @name Auto-generated Match Functions +/// { + +#define GET_ASSEMBLER_HEADER +#include "HexagonGenAsmMatcher.inc" + + /// } + +public: + HexagonAsmParser(const MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, _STI), Parser(_Parser), + MCII (MII), MCB(HexagonMCInstrInfo::createBundle()), InBrackets(false) { + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + + MCAsmParserExtension::Initialize(_Parser); + + Assembler = nullptr; + // FIXME: need better way to detect AsmStreamer (upstream removed getKind()) + if (!Parser.getStreamer().hasRawTextSupport()) { + MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer()); + Assembler = &MES->getAssembler(); + } + } + + bool mustExtend(OperandVector &Operands); + bool splitIdentifier(OperandVector &Operands); + bool parseOperand(OperandVector &Operands); + bool parseInstruction(OperandVector &Operands); + bool implicitExpressionLocation(OperandVector &Operands); + bool parseExpressionOrOperand(OperandVector &Operands); + bool parseExpression(MCExpr const *& Expr); + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override + { + llvm_unreachable("Unimplemented"); + } + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + AsmToken ID, OperandVector &Operands) override; + + virtual bool ParseDirective(AsmToken DirectiveID) override; +}; + +/// HexagonOperand - Instances of this class represent a parsed Hexagon machine +/// instruction. +struct HexagonOperand : public MCParsedAsmOperand { + enum KindTy { Token, Immediate, Register } Kind; + + SMLoc StartLoc, EndLoc; + + struct TokTy { + const char *Data; + unsigned Length; + }; + + struct RegTy { + unsigned RegNum; + }; + + struct ImmTy { + const MCExpr *Val; + bool MustExtend; + }; + + struct InstTy { + OperandVector *SubInsts; + }; + + union { + struct TokTy Tok; + struct RegTy Reg; + struct ImmTy Imm; + }; + + HexagonOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + +public: + HexagonOperand(const HexagonOperand &o) : MCParsedAsmOperand() { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case Register: + Reg = o.Reg; + break; + case Immediate: + Imm = o.Imm; + break; + case Token: + Tok = o.Tok; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const { return StartLoc; } + + /// getEndLoc - Get the location of the last token of this operand. 
+  SMLoc getEndLoc() const { return EndLoc; }
+
+  unsigned getReg() const {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  bool isToken() const { return Kind == Token; }
+  bool isImm() const { return Kind == Immediate; }
+  bool isMem() const { llvm_unreachable("No isMem"); }
+  bool isReg() const { return Kind == Register; }
+
+  bool CheckImmRange(int immBits, int zeroBits, bool isSigned,
+                     bool isRelocatable, bool Extendable) const {
+    if (Kind == Immediate) {
+      const MCExpr *myMCExpr = getImm();
+      if (Imm.MustExtend && !Extendable)
+        return false;
+      int64_t Res;
+      if (myMCExpr->evaluateAsAbsolute(Res)) {
+        int bits = immBits + zeroBits;
+        // The field is 'bits' wide in total: 'immBits' significant bits
+        // scaled by 2^zeroBits, so the low 'zeroBits' bits of the value
+        // must be zero.
+        if (Res & ((1 << zeroBits) - 1))
+          return false;
+        if (isSigned) {
+          if (Res < (1LL << (bits - 1)) && Res >= -(1LL << (bits - 1)))
+            return true;
+        } else {
+          if (bits == 64)
+            return true;
+          if (Res >= 0)
+            return ((uint64_t)Res < (uint64_t)(1ULL << bits)) ? true : false;
+          else {
+            const int64_t high_bit_set = 1ULL << 63;
+            const uint64_t mask = (high_bit_set >> (63 - bits));
+            return (((uint64_t)Res & mask) == mask) ? true : false;
+          }
+        }
+      } else if (myMCExpr->getKind() == MCExpr::SymbolRef && isRelocatable)
+        return true;
+      else if (myMCExpr->getKind() == MCExpr::Binary ||
+               myMCExpr->getKind() == MCExpr::Unary)
+        return true;
+    }
+    return false;
+  }
+
+  bool isf32Ext() const { return false; }
+  bool iss32Imm() const { return CheckImmRange(32, 0, true, true, false); }
+  bool iss8Imm() const { return CheckImmRange(8, 0, true, false, false); }
+  bool iss8Imm64() const { return CheckImmRange(8, 0, true, true, false); }
+  bool iss7Imm() const { return CheckImmRange(7, 0, true, false, false); }
+  bool iss6Imm() const { return CheckImmRange(6, 0, true, false, false); }
+  bool iss4Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
+  bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
+  bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
+  bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); }
+  bool iss3Imm() const { return CheckImmRange(3, 0, true, false, false); }
+
+  bool isu64Imm() const { return CheckImmRange(64, 0, false, true, true); }
+  bool isu32Imm() const { return CheckImmRange(32, 0, false, true, false); }
+  bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
+  bool isu16Imm() const { return CheckImmRange(16, 0, false, true, false); }
+  bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
+  bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
+  bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
+  bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
+  bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
+  bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+  bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+  bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+  bool isu6_3Imm() const { return CheckImmRange(6, 3, false,
false, false); } + bool isu10Imm() const { return CheckImmRange(10, 0, false, false, false); } + bool isu9Imm() const { return CheckImmRange(9, 0, false, false, false); } + bool isu8Imm() const { return CheckImmRange(8, 0, false, false, false); } + bool isu7Imm() const { return CheckImmRange(7, 0, false, false, false); } + bool isu6Imm() const { return CheckImmRange(6, 0, false, false, false); } + bool isu5Imm() const { return CheckImmRange(5, 0, false, false, false); } + bool isu4Imm() const { return CheckImmRange(4, 0, false, false, false); } + bool isu3Imm() const { return CheckImmRange(3, 0, false, false, false); } + bool isu2Imm() const { return CheckImmRange(2, 0, false, false, false); } + bool isu1Imm() const { return CheckImmRange(1, 0, false, false, false); } + + bool ism6Imm() const { return CheckImmRange(6, 0, false, false, false); } + bool isn8Imm() const { return CheckImmRange(8, 0, false, false, false); } + + bool iss16Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); } + bool iss12Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); } + bool iss10Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); } + bool iss9Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); } + bool iss8Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); } + bool iss7Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); } + bool iss6Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); } + bool iss11_0Ext() const { + return CheckImmRange(11 + 26, 0, true, true, true); + } + bool iss11_1Ext() const { + return CheckImmRange(11 + 26, 1, true, true, true); + } + bool iss11_2Ext() const { + return CheckImmRange(11 + 26, 2, true, true, true); + } + bool iss11_3Ext() const { + return CheckImmRange(11 + 26, 3, true, true, true); + } + + bool isu6Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); } + bool isu7Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); } + bool isu8Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); } + bool isu9Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); } + bool isu10Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); } + bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); } + bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); } + bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); } + bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); } + bool isu32MustExt() const { return isImm() && Imm.MustExtend; } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createExpr(getImm())); + } + + void addSignedImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + MCExpr const *Expr = getImm(); + int64_t Value; + if (!Expr->evaluateAsAbsolute(Value)) { + Inst.addOperand(MCOperand::createExpr(Expr)); + return; + } + int64_t Extended = SignExtend64 (Value, 32); + if ((Extended < 0) == (Value < 0)) { + Inst.addOperand(MCOperand::createExpr(Expr)); + return; + } + // Flip bit 33 to signal signed unsigned mismatch + Extended ^= 0x100000000; + Inst.addOperand(MCOperand::createImm(Extended)); + } + + void addf32ExtOperands(MCInst &Inst, unsigned N) 
const { + addImmOperands(Inst, N); + } + + void adds32ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8Imm64Operands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds6ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_0ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_1ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_2ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_3ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds3ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + + void addu64ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu32ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu26_6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_0ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu11_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu10ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu9ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu8ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu7ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_0ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu5ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu4ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void addm6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addn8ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void adds16ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds12ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds10ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + 
void adds9ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds6ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_0ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_1ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_2ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_3ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + + void addu6ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu7ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu8ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu9ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu10ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_0ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_1ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_2ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_3ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu32MustExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void adds4_6ImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); + } + + void adds3_6ImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); + } + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + virtual void print(raw_ostream &OS) const; + + static std::unique_ptr<HexagonOperand> CreateToken(StringRef Str, SMLoc S) { + HexagonOperand *Op = new HexagonOperand(Token); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return std::unique_ptr<HexagonOperand>(Op); + } + + static std::unique_ptr<HexagonOperand> CreateReg(unsigned RegNum, SMLoc S, + SMLoc E) { + HexagonOperand *Op = new HexagonOperand(Register); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = E; + return std::unique_ptr<HexagonOperand>(Op); + } + + static std::unique_ptr<HexagonOperand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + HexagonOperand *Op = new HexagonOperand(Immediate); + Op->Imm.Val = Val; + Op->Imm.MustExtend = false; + Op->StartLoc = S; + Op->EndLoc = E; + return std::unique_ptr<HexagonOperand>(Op); + } +}; + +} // end anonymous namespace. 
+ +void HexagonOperand::print(raw_ostream &OS) const { + switch (Kind) { + case Immediate: + getImm()->print(OS, nullptr); + break; + case Register: + OS << "<register R"; + OS << getReg() << ">"; + break; + case Token: + OS << "'" << getToken() << "'"; + break; + } +} + +/// @name Auto-generated Match Functions +static unsigned MatchRegisterName(StringRef Name); + +bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) { + DEBUG(dbgs() << "Bundle:"); + DEBUG(MCB.dump_pretty(dbgs())); + DEBUG(dbgs() << "--\n"); + + // Check the bundle for errors. + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + HexagonMCChecker Check(MCII, getSTI(), MCB, MCB, *RI); + + bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MCII, getSTI(), + getContext(), MCB, + &Check); + + while (Check.getNextErrInfo() == true) { + unsigned Reg = Check.getErrRegister(); + Twine R(RI->getName(Reg)); + + uint64_t Err = Check.getError(); + if (Err != HexagonMCErrInfo::CHECK_SUCCESS) { + if (HexagonMCErrInfo::CHECK_ERROR_BRANCHES & Err) + Error(IDLoc, + "unconditional branch cannot precede another branch in packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_NEWP & Err || + HexagonMCErrInfo::CHECK_ERROR_NEWV & Err) + Error(IDLoc, "register `" + R + + "' used with `.new' " + "but not validly modified in the same packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_REGISTERS & Err) + Error(IDLoc, "register `" + R + "' modified more than once"); + + if (HexagonMCErrInfo::CHECK_ERROR_READONLY & Err) + Error(IDLoc, "cannot write to read-only register `" + R + "'"); + + if (HexagonMCErrInfo::CHECK_ERROR_LOOP & Err) + Error(IDLoc, "loop-setup and some branch instructions " + "cannot be in the same packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_ENDLOOP & Err) { + Twine N(HexagonMCInstrInfo::isInnerLoop(MCB) ? 
'0' : '1'); + Error(IDLoc, "packet marked with `:endloop" + N + "' " + + "cannot contain instructions that modify register " + + "`" + R + "'"); + } + + if (HexagonMCErrInfo::CHECK_ERROR_SOLO & Err) + Error(IDLoc, + "instruction cannot appear in packet with other instructions"); + + if (HexagonMCErrInfo::CHECK_ERROR_NOSLOTS & Err) + Error(IDLoc, "too many slots used in packet"); + + if (Err & HexagonMCErrInfo::CHECK_ERROR_SHUFFLE) { + uint64_t Erm = Check.getShuffleError(); + + if (HexagonShuffler::SHUFFLE_ERROR_INVALID == Erm) + Error(IDLoc, "invalid instruction packet"); + else if (HexagonShuffler::SHUFFLE_ERROR_STORES == Erm) + Error(IDLoc, "invalid instruction packet: too many stores"); + else if (HexagonShuffler::SHUFFLE_ERROR_LOADS == Erm) + Error(IDLoc, "invalid instruction packet: too many loads"); + else if (HexagonShuffler::SHUFFLE_ERROR_BRANCHES == Erm) + Error(IDLoc, "too many branches in packet"); + else if (HexagonShuffler::SHUFFLE_ERROR_NOSLOTS == Erm) + Error(IDLoc, "invalid instruction packet: out of slots"); + else if (HexagonShuffler::SHUFFLE_ERROR_SLOTS == Erm) + Error(IDLoc, "invalid instruction packet: slot error"); + else if (HexagonShuffler::SHUFFLE_ERROR_ERRATA2 == Erm) + Error(IDLoc, "v60 packet violation"); + else if (HexagonShuffler::SHUFFLE_ERROR_STORE_LOAD_CONFLICT == Erm) + Error(IDLoc, "slot 0 instruction does not allow slot 1 store"); + else + Error(IDLoc, "unknown error in instruction packet"); + } + } + + unsigned Warn = Check.getWarning(); + if (Warn != HexagonMCErrInfo::CHECK_SUCCESS) { + if (HexagonMCErrInfo::CHECK_WARN_CURRENT & Warn) + Warning(IDLoc, "register `" + R + "' used with `.cur' " + "but not used in the same packet"); + else if (HexagonMCErrInfo::CHECK_WARN_TEMPORARY & Warn) + Warning(IDLoc, "register `" + R + "' used with `.tmp' " + "but not used in the same packet"); + } + } + + if (CheckOk) { + MCB.setLoc(IDLoc); + if (HexagonMCInstrInfo::bundleSize(MCB) == 0) { + assert(!HexagonMCInstrInfo::isInnerLoop(MCB)); + assert(!HexagonMCInstrInfo::isOuterLoop(MCB)); + // Empty packets are valid yet aren't emitted + return false; + } + Out.EmitInstruction(MCB, getSTI()); + } else { + // If compounding and duplexing didn't reduce the size below + // 4 or less we have a packet that is too big. + if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) { + Error(IDLoc, "invalid instruction packet: out of slots"); + return true; // Error + } + } + + return false; // No error +} + +bool HexagonAsmParser::matchBundleOptions() { + MCAsmParser &Parser = getParser(); + MCAsmLexer &Lexer = getLexer(); + while (true) { + if (!Parser.getTok().is(AsmToken::Colon)) + return false; + Lexer.Lex(); + StringRef Option = Parser.getTok().getString(); + if (Option.compare_lower("endloop0") == 0) + HexagonMCInstrInfo::setInnerLoop(MCB); + else if (Option.compare_lower("endloop1") == 0) + HexagonMCInstrInfo::setOuterLoop(MCB); + else if (Option.compare_lower("mem_noshuf") == 0) + HexagonMCInstrInfo::setMemReorderDisabled(MCB); + else if (Option.compare_lower("mem_shuf") == 0) + HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB); + else + return true; + Lexer.Lex(); + } +} + +// For instruction aliases, immediates are generated rather than +// MCConstantExpr. Convert them for uniform MCExpr. 
+// Also check for signed/unsigned mismatches and warn +void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) { + MCInst NewInst; + NewInst.setOpcode(MCI.getOpcode()); + for (MCOperand &I : MCI) + if (I.isImm()) { + int64_t Value (I.getImm()); + if ((Value & 0x100000000) != (Value & 0x80000000)) { + // Detect flipped bit 33 wrt bit 32 and signal warning + Value ^= 0x100000000; + if (WarnSignedMismatch) + Warning (MCI.getLoc(), "Signed/Unsigned mismatch"); + } + NewInst.addOperand(MCOperand::createExpr( + MCConstantExpr::create(Value, getContext()))); + } + else + NewInst.addOperand(I); + MCI = NewInst; +} + +bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc, + OperandVector &InstOperands, + uint64_t &ErrorInfo, + bool MatchingInlineAsm, + bool &MustExtend) { + // Perform matching with tablegen asmmatcher generated function + int result = + MatchInstructionImpl(InstOperands, MCI, ErrorInfo, MatchingInlineAsm); + if (result == Match_Success) { + MCI.setLoc(IDLoc); + MustExtend = mustExtend(InstOperands); + canonicalizeImmediates(MCI); + result = processInstruction(MCI, InstOperands, IDLoc, MustExtend); + + DEBUG(dbgs() << "Insn:"); + DEBUG(MCI.dump_pretty(dbgs())); + DEBUG(dbgs() << "\n\n"); + + MCI.setLoc(IDLoc); + } + + // Create instruction operand for bundle instruction + // Break this into a separate function Code here is less readable + // Think about how to get an instruction error to report correctly. + // SMLoc will return the "{" + switch (result) { + default: + break; + case Match_Success: + return false; + case Match_MissingFeature: + return Error(IDLoc, "invalid instruction"); + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction"); + case Match_InvalidOperand: + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= InstOperands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = (static_cast<HexagonOperand *>(InstOperands[ErrorInfo].get())) + ->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); + } + llvm_unreachable("Implement any new match types added!"); +} + +bool HexagonAsmParser::mustExtend(OperandVector &Operands) { + unsigned Count = 0; + for (std::unique_ptr<MCParsedAsmOperand> &i : Operands) + if (i->isImm()) + if (static_cast<HexagonOperand *>(i.get())->Imm.MustExtend) + ++Count; + // Multiple extenders should have been filtered by iss9Ext et. al. 
+ assert(Count < 2 && "Multiple extenders"); + return Count == 1; +} + +bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + if (!InBrackets) { + MCB.clear(); + MCB.addOperand(MCOperand::createImm(0)); + } + HexagonOperand &FirstOperand = static_cast<HexagonOperand &>(*Operands[0]); + if (FirstOperand.isToken() && FirstOperand.getToken() == "{") { + assert(Operands.size() == 1 && "Brackets should be by themselves"); + if (InBrackets) { + getParser().Error(IDLoc, "Already in a packet"); + return true; + } + InBrackets = true; + return false; + } + if (FirstOperand.isToken() && FirstOperand.getToken() == "}") { + assert(Operands.size() == 1 && "Brackets should be by themselves"); + if (!InBrackets) { + getParser().Error(IDLoc, "Not in a packet"); + return true; + } + InBrackets = false; + if (matchBundleOptions()) + return true; + return finishBundle(IDLoc, Out); + } + MCInst *SubInst = new (getParser().getContext()) MCInst; + bool MustExtend = false; + if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo, + MatchingInlineAsm, MustExtend)) + return true; + HexagonMCInstrInfo::extendIfNeeded( + getParser().getContext(), MCII, MCB, *SubInst, + HexagonMCInstrInfo::isExtended(MCII, *SubInst) || MustExtend); + MCB.addOperand(MCOperand::createInst(SubInst)); + if (!InBrackets) + return finishBundle(IDLoc, Out); + return false; +} + +/// ParseDirective parses the Hexagon specific directives +bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if ((IDVal.lower() == ".word") || (IDVal.lower() == ".4byte")) + return ParseDirectiveValue(4, DirectiveID.getLoc()); + if (IDVal.lower() == ".short" || IDVal.lower() == ".hword" || + IDVal.lower() == ".half") + return ParseDirectiveValue(2, DirectiveID.getLoc()); + if (IDVal.lower() == ".falign") + return ParseDirectiveFalign(256, DirectiveID.getLoc()); + if ((IDVal.lower() == ".lcomm") || (IDVal.lower() == ".lcommon")) + return ParseDirectiveComm(true, DirectiveID.getLoc()); + if ((IDVal.lower() == ".comm") || (IDVal.lower() == ".common")) + return ParseDirectiveComm(false, DirectiveID.getLoc()); + if (IDVal.lower() == ".subsection") + return ParseDirectiveSubsection(DirectiveID.getLoc()); + + return true; +} +bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) { + const MCExpr *Subsection = 0; + int64_t Res; + + assert((getLexer().isNot(AsmToken::EndOfStatement)) && + "Invalid subsection directive"); + getParser().parseExpression(Subsection); + + if (!Subsection->evaluateAsAbsolute(Res)) + return Error(L, "Cannot evaluate subsection number"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + // 0-8192 is the hard-coded range in MCObjectStreamper.cpp, this keeps the + // negative subsections together and in the same order but at the opposite + // end of the section. Only legacy hexagon-gcc created assembly code + // used negative subsections. 
+ if ((Res < 0) && (Res > -8193)) + Subsection = MCConstantExpr::create(8192 + Res, this->getContext()); + + getStreamer().SubSection(Subsection); + return false; +} + +/// ::= .falign [expression] +bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) { + + int64_t MaxBytesToFill = 15; + + // if there is an arguement + if (getLexer().isNot(AsmToken::EndOfStatement)) { + const MCExpr *Value; + SMLoc ExprLoc = L; + + // Make sure we have a number (false is returned if expression is a number) + if (getParser().parseExpression(Value) == false) { + // Make sure this is a number that is in range + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue)) + return Error(ExprLoc, "literal value out of range (256) for falign"); + MaxBytesToFill = IntValue; + Lex(); + } else { + return Error(ExprLoc, "not a valid expression for falign directive"); + } + } + + getTargetStreamer().emitFAlign(16, MaxBytesToFill); + Lex(); + + return false; +} + +/// ::= .word [ expression (, expression)* ] +bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + + for (;;) { + const MCExpr *Value; + SMLoc ExprLoc = L; + if (getParser().parseExpression(Value)) + return true; + + // Special case constant expressions to match code generator. + if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else + getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + } + } + + Lex(); + return false; +} + +// This is largely a copy of AsmParser's ParseDirectiveComm extended to +// accept a 3rd argument, AccessAlignment which indicates the smallest +// memory access made to the symbol, expressed in bytes. If no +// AccessAlignment is specified it defaults to the Alignment Value. +// Hexagon's .lcomm: +// .lcomm Symbol, Length, Alignment, AccessAlignment +bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) { + // FIXME: need better way to detect if AsmStreamer (upstream removed + // getKind()) + if (getStreamer().hasRawTextSupport()) + return true; // Only object file output requires special treatment. + + StringRef Name; + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + // Handle the identifier as the key symbol. 
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + int64_t Size; + SMLoc SizeLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(Size)) + return true; + + int64_t ByteAlignment = 1; + SMLoc ByteAlignmentLoc; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + ByteAlignmentLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(ByteAlignment)) + return true; + if (!isPowerOf2_64(ByteAlignment)) + return Error(ByteAlignmentLoc, "alignment must be a power of 2"); + } + + int64_t AccessAlignment = 0; + if (getLexer().is(AsmToken::Comma)) { + // The optional access argument specifies the size of the smallest memory + // access to be made to the symbol, expressed in bytes. + SMLoc AccessAlignmentLoc; + Lex(); + AccessAlignmentLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(AccessAlignment)) + return true; + + if (!isPowerOf2_64(AccessAlignment)) + return Error(AccessAlignmentLoc, "access alignment must be a power of 2"); + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.comm' or '.lcomm' directive"); + + Lex(); + + // NOTE: a size of zero for a .comm should create a undefined symbol + // but a size of .lcomm creates a bss symbol of size zero. + if (Size < 0) + return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't " + "be less than zero"); + + // NOTE: The alignment in the directive is a power of 2 value, the assembler + // may internally end up wanting an alignment in bytes. + // FIXME: Diagnose overflow. + if (ByteAlignment < 0) + return Error(ByteAlignmentLoc, "invalid '.comm' or '.lcomm' directive " + "alignment, can't be less than zero"); + + if (!Sym->isUndefined()) + return Error(Loc, "invalid symbol redefinition"); + + HexagonMCELFStreamer &HexagonELFStreamer = + static_cast<HexagonMCELFStreamer &>(getStreamer()); + if (IsLocal) { + HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(Sym, Size, ByteAlignment, + AccessAlignment); + return false; + } + + HexagonELFStreamer.HexagonMCEmitCommonSymbol(Sym, Size, ByteAlignment, + AccessAlignment); + return false; +} + +// validate register against architecture +bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const { + return true; +} + +// extern "C" void LLVMInitializeHexagonAsmLexer(); + +/// Force static initialization. 
+extern "C" void LLVMInitializeHexagonAsmParser() { + RegisterMCAsmParser<HexagonAsmParser> X(TheHexagonTarget); +} + +#define GET_MATCHER_IMPLEMENTATION +#define GET_REGISTER_MATCHER +#include "HexagonGenAsmMatcher.inc" + +namespace { +bool previousEqual(OperandVector &Operands, size_t Index, StringRef String) { + if (Index >= Operands.size()) + return false; + MCParsedAsmOperand &Operand = *Operands[Operands.size() - Index - 1]; + if (!Operand.isToken()) + return false; + return static_cast<HexagonOperand &>(Operand).getToken().equals_lower(String); +} +bool previousIsLoop(OperandVector &Operands, size_t Index) { + return previousEqual(Operands, Index, "loop0") || + previousEqual(Operands, Index, "loop1") || + previousEqual(Operands, Index, "sp1loop0") || + previousEqual(Operands, Index, "sp2loop0") || + previousEqual(Operands, Index, "sp3loop0"); +} +} + +bool HexagonAsmParser::splitIdentifier(OperandVector &Operands) { + AsmToken const &Token = getParser().getTok(); + StringRef String = Token.getString(); + SMLoc Loc = Token.getLoc(); + getLexer().Lex(); + do { + std::pair<StringRef, StringRef> HeadTail = String.split('.'); + if (!HeadTail.first.empty()) + Operands.push_back(HexagonOperand::CreateToken(HeadTail.first, Loc)); + if (!HeadTail.second.empty()) + Operands.push_back(HexagonOperand::CreateToken( + String.substr(HeadTail.first.size(), 1), Loc)); + String = HeadTail.second; + } while (!String.empty()); + return false; +} + +bool HexagonAsmParser::parseOperand(OperandVector &Operands) { + unsigned Register; + SMLoc Begin; + SMLoc End; + MCAsmLexer &Lexer = getLexer(); + if (!ParseRegister(Register, Begin, End)) { + if (!ErrorMissingParenthesis) + switch (Register) { + default: + break; + case Hexagon::P0: + case Hexagon::P1: + case Hexagon::P2: + case Hexagon::P3: + if (previousEqual(Operands, 0, "if")) { + if (WarnMissingParenthesis) + Warning (Begin, "Missing parenthesis around predicate register"); + static char const *LParen = "("; + static char const *RParen = ")"; + Operands.push_back(HexagonOperand::CreateToken(LParen, Begin)); + Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End)); + AsmToken MaybeDotNew = Lexer.getTok(); + if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) && + MaybeDotNew.getString().equals_lower(".new")) + splitIdentifier(Operands); + Operands.push_back(HexagonOperand::CreateToken(RParen, Begin)); + return false; + } + if (previousEqual(Operands, 0, "!") && + previousEqual(Operands, 1, "if")) { + if (WarnMissingParenthesis) + Warning (Begin, "Missing parenthesis around predicate register"); + static char const *LParen = "("; + static char const *RParen = ")"; + Operands.insert(Operands.end () - 1, + HexagonOperand::CreateToken(LParen, Begin)); + Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End)); + AsmToken MaybeDotNew = Lexer.getTok(); + if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) && + MaybeDotNew.getString().equals_lower(".new")) + splitIdentifier(Operands); + Operands.push_back(HexagonOperand::CreateToken(RParen, Begin)); + return false; + } + break; + } + Operands.push_back(HexagonOperand::CreateReg( + Register, Begin, End)); + return false; + } + return splitIdentifier(Operands); +} + +bool HexagonAsmParser::isLabel(AsmToken &Token) { + MCAsmLexer &Lexer = getLexer(); + AsmToken const &Second = Lexer.getTok(); + AsmToken Third = Lexer.peekTok(); + StringRef String = Token.getString(); + if (Token.is(AsmToken::TokenKind::LCurly) || + Token.is(AsmToken::TokenKind::RCurly)) + return false; + if 
(!Token.is(AsmToken::TokenKind::Identifier)) + return true; + if (!MatchRegisterName(String.lower())) + return true; + (void)Second; + assert(Second.is(AsmToken::Colon)); + StringRef Raw (String.data(), Third.getString().data() - String.data() + + Third.getString().size()); + std::string Collapsed = Raw; + Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace), + Collapsed.end()); + StringRef Whole = Collapsed; + std::pair<StringRef, StringRef> DotSplit = Whole.split('.'); + if (!MatchRegisterName(DotSplit.first.lower())) + return true; + return false; +} + +bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious, SMLoc &Loc) { + if (!Contigious && ErrorNoncontigiousRegister) { + Error(Loc, "Register name is not contigious"); + return true; + } + if (!Contigious && WarnNoncontigiousRegister) + Warning(Loc, "Register name is not contigious"); + return false; +} + +bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + MCAsmLexer &Lexer = getLexer(); + StartLoc = getLexer().getLoc(); + SmallVector<AsmToken, 5> Lookahead; + StringRef RawString(Lexer.getTok().getString().data(), 0); + bool Again = Lexer.is(AsmToken::Identifier); + bool NeededWorkaround = false; + while (Again) { + AsmToken const &Token = Lexer.getTok(); + RawString = StringRef(RawString.data(), + Token.getString().data() - RawString.data () + + Token.getString().size()); + Lookahead.push_back(Token); + Lexer.Lex(); + bool Contigious = Lexer.getTok().getString().data() == + Lookahead.back().getString().data() + + Lookahead.back().getString().size(); + bool Type = Lexer.is(AsmToken::Identifier) || Lexer.is(AsmToken::Dot) || + Lexer.is(AsmToken::Integer) || Lexer.is(AsmToken::Real) || + Lexer.is(AsmToken::Colon); + bool Workaround = Lexer.is(AsmToken::Colon) || + Lookahead.back().is(AsmToken::Colon); + Again = (Contigious && Type) || (Workaround && Type); + NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type)); + } + std::string Collapsed = RawString; + Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace), + Collapsed.end()); + StringRef FullString = Collapsed; + std::pair<StringRef, StringRef> DotSplit = FullString.split('.'); + unsigned DotReg = MatchRegisterName(DotSplit.first.lower()); + if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + if (DotSplit.second.empty()) { + RegNo = DotReg; + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } else { + RegNo = DotReg; + size_t First = RawString.find('.'); + StringRef DotString (RawString.data() + First, RawString.size() - First); + Lexer.UnLex(AsmToken(AsmToken::Identifier, DotString)); + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } + } + std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':'); + unsigned ColonReg = MatchRegisterName(ColonSplit.first.lower()); + if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + RegNo = ColonReg; + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } + while (!Lookahead.empty()) { + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + } + return true; +} + +bool HexagonAsmParser::implicitExpressionLocation(OperandVector &Operands) { + if 
(previousEqual(Operands, 0, "call")) + return true; + if (previousEqual(Operands, 0, "jump")) + if (!getLexer().getTok().is(AsmToken::Colon)) + return true; + if (previousEqual(Operands, 0, "(") && previousIsLoop(Operands, 1)) + return true; + if (previousEqual(Operands, 1, ":") && previousEqual(Operands, 2, "jump") && + (previousEqual(Operands, 0, "nt") || previousEqual(Operands, 0, "t"))) + return true; + return false; +} + +bool HexagonAsmParser::parseExpression(MCExpr const *& Expr) { + llvm::SmallVector<AsmToken, 4> Tokens; + MCAsmLexer &Lexer = getLexer(); + bool Done = false; + static char const * Comma = ","; + do { + Tokens.emplace_back (Lexer.getTok()); + Lexer.Lex(); + switch (Tokens.back().getKind()) + { + case AsmToken::TokenKind::Hash: + if (Tokens.size () > 1) + if ((Tokens.end () - 2)->getKind() == AsmToken::TokenKind::Plus) { + Tokens.insert(Tokens.end() - 2, + AsmToken(AsmToken::TokenKind::Comma, Comma)); + Done = true; + } + break; + case AsmToken::TokenKind::RCurly: + case AsmToken::TokenKind::EndOfStatement: + case AsmToken::TokenKind::Eof: + Done = true; + break; + default: + break; + } + } while (!Done); + while (!Tokens.empty()) { + Lexer.UnLex(Tokens.back()); + Tokens.pop_back(); + } + return getParser().parseExpression(Expr); +} + +bool HexagonAsmParser::parseExpressionOrOperand(OperandVector &Operands) { + if (implicitExpressionLocation(Operands)) { + MCAsmParser &Parser = getParser(); + SMLoc Loc = Parser.getLexer().getLoc(); + std::unique_ptr<HexagonOperand> Expr = + HexagonOperand::CreateImm(nullptr, Loc, Loc); + MCExpr const *& Val = Expr->Imm.Val; + Operands.push_back(std::move(Expr)); + return parseExpression(Val); + } + return parseOperand(Operands); +} + +/// Parse an instruction. +bool HexagonAsmParser::parseInstruction(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + MCAsmLexer &Lexer = getLexer(); + while (true) { + AsmToken const &Token = Parser.getTok(); + switch (Token.getKind()) { + case AsmToken::EndOfStatement: { + Lexer.Lex(); + return false; + } + case AsmToken::LCurly: { + if (!Operands.empty()) + return true; + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + return false; + } + case AsmToken::RCurly: { + if (Operands.empty()) { + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + } + return false; + } + case AsmToken::Comma: { + Lexer.Lex(); + continue; + } + case AsmToken::EqualEqual: + case AsmToken::ExclaimEqual: + case AsmToken::GreaterEqual: + case AsmToken::GreaterGreater: + case AsmToken::LessEqual: + case AsmToken::LessLess: { + Operands.push_back(HexagonOperand::CreateToken( + Token.getString().substr(0, 1), Token.getLoc())); + Operands.push_back(HexagonOperand::CreateToken( + Token.getString().substr(1, 1), Token.getLoc())); + Lexer.Lex(); + continue; + } + case AsmToken::Hash: { + bool MustNotExtend = false; + bool ImplicitExpression = implicitExpressionLocation(Operands); + std::unique_ptr<HexagonOperand> Expr = HexagonOperand::CreateImm( + nullptr, Lexer.getLoc(), Lexer.getLoc()); + if (!ImplicitExpression) + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + bool MustExtend = false; + bool HiOnly = false; + bool LoOnly = false; + if (Lexer.is(AsmToken::Hash)) { + Lexer.Lex(); + MustExtend = true; + } else if (ImplicitExpression) + MustNotExtend = true; + AsmToken const &Token = Parser.getTok(); + if (Token.is(AsmToken::Identifier)) { + StringRef String 
= Token.getString(); + AsmToken IDToken = Token; + if (String.lower() == "hi") { + HiOnly = true; + } else if (String.lower() == "lo") { + LoOnly = true; + } + if (HiOnly || LoOnly) { + AsmToken LParen = Lexer.peekTok(); + if (!LParen.is(AsmToken::LParen)) { + HiOnly = false; + LoOnly = false; + } else { + Lexer.Lex(); + } + } + } + if (parseExpression(Expr->Imm.Val)) + return true; + int64_t Value; + MCContext &Context = Parser.getContext(); + assert(Expr->Imm.Val != nullptr); + if (Expr->Imm.Val->evaluateAsAbsolute(Value)) { + if (HiOnly) + Expr->Imm.Val = MCBinaryExpr::createLShr( + Expr->Imm.Val, MCConstantExpr::create(16, Context), Context); + if (HiOnly || LoOnly) + Expr->Imm.Val = MCBinaryExpr::createAnd( + Expr->Imm.Val, MCConstantExpr::create(0xffff, Context), Context); + } + if (MustNotExtend) + Expr->Imm.Val = HexagonNoExtendOperand::Create(Expr->Imm.Val, Context); + Expr->Imm.MustExtend = MustExtend; + Operands.push_back(std::move(Expr)); + continue; + } + default: + break; + } + if (parseExpressionOrOperand(Operands)) + return true; + } +} + +bool HexagonAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + AsmToken ID, + OperandVector &Operands) { + getLexer().UnLex(ID); + return parseInstruction(Operands); +} + +namespace { +MCInst makeCombineInst(int opCode, MCOperand &Rdd, + MCOperand &MO1, MCOperand &MO2) { + MCInst TmpInst; + TmpInst.setOpcode(opCode); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(MO1); + TmpInst.addOperand(MO2); + + return TmpInst; +} +} + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + HexagonOperand *Op = static_cast<HexagonOperand *>(&AsmOp); + + switch (Kind) { + case MCK_0: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 0 + ? Match_Success + : Match_InvalidOperand; + } + case MCK_1: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 1 + ? Match_Success + : Match_InvalidOperand; + } + case MCK__MINUS_1: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == -1 + ? 
Match_Success + : Match_InvalidOperand; + } + } + if (Op->Kind == HexagonOperand::Token && Kind != InvalidMatchClass) { + StringRef myStringRef = StringRef(Op->Tok.Data, Op->Tok.Length); + if (matchTokenString(myStringRef.lower()) == (MatchClassKind)Kind) + return Match_Success; + if (matchTokenString(myStringRef.upper()) == (MatchClassKind)Kind) + return Match_Success; + } + + DEBUG(dbgs() << "Unmatched Operand:"); + DEBUG(Op->dump()); + DEBUG(dbgs() << "\n"); + + return Match_InvalidOperand; +} + +void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) { + std::string errStr; + raw_string_ostream ES(errStr); + ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: "; + if (Max >= 0) + ES << "0-" << Max; + else + ES << Max << "-" << (-Max - 1); + Error(IDLoc, ES.str().c_str()); +} + +int HexagonAsmParser::processInstruction(MCInst &Inst, + OperandVector const &Operands, + SMLoc IDLoc, bool &MustExtend) { + MCContext &Context = getParser().getContext(); + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + std::string r = "r"; + std::string v = "v"; + std::string Colon = ":"; + + bool is32bit = false; // used to distinguish between CONST32 and CONST64 + switch (Inst.getOpcode()) { + default: + break; + + case Hexagon::M4_mpyrr_addr: + case Hexagon::S4_addi_asl_ri: + case Hexagon::S4_addi_lsr_ri: + case Hexagon::S4_andi_asl_ri: + case Hexagon::S4_andi_lsr_ri: + case Hexagon::S4_ori_asl_ri: + case Hexagon::S4_ori_lsr_ri: + case Hexagon::S4_or_andix: + case Hexagon::S4_subi_asl_ri: + case Hexagon::S4_subi_lsr_ri: { + MCOperand &Ry = Inst.getOperand(0); + MCOperand &src = Inst.getOperand(2); + if (RI->getEncodingValue(Ry.getReg()) != RI->getEncodingValue(src.getReg())) + return Match_InvalidOperand; + break; + } + + case Hexagon::C2_cmpgei: { + MCOperand &MO = Inst.getOperand(2); + MO.setExpr(MCBinaryExpr::createSub( + MO.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::C2_cmpgti); + break; + } + + case Hexagon::C2_cmpgeui: { + MCOperand &MO = Inst.getOperand(2); + int64_t Value; + bool Success = MO.getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success && "Assured by matcher"); + if (Value == 0) { + MCInst TmpInst; + MCOperand &Pd = Inst.getOperand(0); + MCOperand &Rt = Inst.getOperand(1); + TmpInst.setOpcode(Hexagon::C2_cmpeq); + TmpInst.addOperand(Pd); + TmpInst.addOperand(Rt); + TmpInst.addOperand(Rt); + Inst = TmpInst; + } else { + MO.setExpr(MCBinaryExpr::createSub( + MO.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::C2_cmpgtui); + } + break; + } + case Hexagon::J2_loop1r: + case Hexagon::J2_loop1i: + case Hexagon::J2_loop0r: + case Hexagon::J2_loop0i: { + MCOperand &MO = Inst.getOperand(0); + // Loop has different opcodes for extended vs not extended, but we should + // not use the other opcode as it is a legacy artifact of TD files. + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) { + // if the operand can fit within a 7:2 field + if (Value < (1 << 8) && Value >= -(1 << 8)) { + SMLoc myLoc = Operands[2]->getStartLoc(); + // # is left in startLoc in the case of ## + // If '##' found then force extension. + if (*myLoc.getPointer() == '#') { + MustExtend = true; + break; + } + } else { + // If immediate and out of 7:2 range. 
+ MustExtend = true; + } + } + break; + } + + // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" + case Hexagon::A2_tfrp: { + MCOperand &MO = Inst.getOperand(1); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode(Hexagon::A2_combinew); + break; + } + + case Hexagon::A2_tfrpt: + case Hexagon::A2_tfrpf: { + MCOperand &MO = Inst.getOperand(2); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) + ? Hexagon::C2_ccombinewt + : Hexagon::C2_ccombinewf); + break; + } + case Hexagon::A2_tfrptnew: + case Hexagon::A2_tfrpfnew: { + MCOperand &MO = Inst.getOperand(2); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) + ? Hexagon::C2_ccombinewnewt + : Hexagon::C2_ccombinewnewf); + break; + } + + // Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) " + case Hexagon::CONST32: + case Hexagon::CONST32_Float_Real: + case Hexagon::CONST32_Int_Real: + case Hexagon::FCONST32_nsdata: + is32bit = true; + // Translate a "$Rx:y = CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) " + case Hexagon::CONST64_Float_Real: + case Hexagon::CONST64_Int_Real: + + // FIXME: need better way to detect AsmStreamer (upstream removed getKind()) + if (!Parser.getStreamer().hasRawTextSupport()) { + MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer()); + MCOperand &MO_1 = Inst.getOperand(1); + MCOperand &MO_0 = Inst.getOperand(0); + + // push section onto section stack + MES->PushSection(); + + std::string myCharStr; + MCSectionELF *mySection; + + // check if this as an immediate or a symbol + int64_t Value; + bool Absolute = MO_1.getExpr()->evaluateAsAbsolute(Value); + if (Absolute) { + // Create a new section - one for each constant + // Some or all of the zeros are replaced with the given immediate. 
+ if (is32bit) { + std::string myImmStr = utohexstr(static_cast<uint32_t>(Value)); + myCharStr = StringRef(".gnu.linkonce.l4.CONST_00000000") + .drop_back(myImmStr.size()) + .str() + + myImmStr; + } else { + std::string myImmStr = utohexstr(Value); + myCharStr = StringRef(".gnu.linkonce.l8.CONST_0000000000000000") + .drop_back(myImmStr.size()) + .str() + + myImmStr; + } + + mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE); + } else if (MO_1.isExpr()) { + // .lita - for expressions + myCharStr = ".lita"; + mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE); + } else + llvm_unreachable("unexpected type of machine operand!"); + + MES->SwitchSection(mySection); + unsigned byteSize = is32bit ? 4 : 8; + getStreamer().EmitCodeAlignment(byteSize, byteSize); + + MCSymbol *Sym; + + // for symbols, get rid of prepended ".gnu.linkonce.lx." + + // emit symbol if needed + if (Absolute) { + Sym = getContext().getOrCreateSymbol(StringRef(myCharStr.c_str() + 16)); + if (Sym->isUndefined()) { + getStreamer().EmitLabel(Sym); + getStreamer().EmitSymbolAttribute(Sym, MCSA_Global); + getStreamer().EmitIntValue(Value, byteSize); + } + } else if (MO_1.isExpr()) { + const char *StringStart = 0; + const char *StringEnd = 0; + if (*Operands[4]->getStartLoc().getPointer() == '#') { + StringStart = Operands[5]->getStartLoc().getPointer(); + StringEnd = Operands[6]->getStartLoc().getPointer(); + } else { // no pound + StringStart = Operands[4]->getStartLoc().getPointer(); + StringEnd = Operands[5]->getStartLoc().getPointer(); + } + + unsigned size = StringEnd - StringStart; + std::string DotConst = ".CONST_"; + Sym = getContext().getOrCreateSymbol(DotConst + + StringRef(StringStart, size)); + + if (Sym->isUndefined()) { + // case where symbol is not yet defined: emit symbol + getStreamer().EmitLabel(Sym); + getStreamer().EmitSymbolAttribute(Sym, MCSA_Local); + getStreamer().EmitValue(MO_1.getExpr(), 4); + } + } else + llvm_unreachable("unexpected type of machine operand!"); + + MES->PopSection(); + + if (Sym) { + MCInst TmpInst; + if (is32bit) // 32 bit + TmpInst.setOpcode(Hexagon::L2_loadrigp); + else // 64 bit + TmpInst.setOpcode(Hexagon::L2_loadrdgp); + + TmpInst.addOperand(MO_0); + TmpInst.addOperand( + MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext()))); + Inst = TmpInst; + } + } + break; + + // Translate a "$Rdd = #-imm" to "$Rdd = combine(#[-1,0], #-imm)" + case Hexagon::A2_tfrpi: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO = Inst.getOperand(1); + int64_t Value; + int sVal = (MO.getExpr()->evaluateAsAbsolute(Value) && Value < 0) ? 
-1 : 0; + MCOperand imm(MCOperand::createExpr(MCConstantExpr::create(sVal, Context))); + Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, imm, MO); + break; + } + + // Translate a "$Rdd = [#]#imm" to "$Rdd = combine(#, [#]#imm)" + case Hexagon::TFRI64_V4: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO = Inst.getOperand(1); + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) { + unsigned long long u64 = Value; + signed int s8 = (u64 >> 32) & 0xFFFFFFFF; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + MCOperand imm(MCOperand::createExpr( + MCConstantExpr::create(s8, Context))); // upper 32 + MCOperand imm2(MCOperand::createExpr( + MCConstantExpr::create(u64 & 0xFFFFFFFF, Context))); // lower 32 + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, imm2); + } else { + MCOperand imm(MCOperand::createExpr( + MCConstantExpr::create(0, Context))); // upper 32 + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, MO); + } + break; + } + + // Handle $Rdd = combine(##imm, #imm)" + case Hexagon::TFRI64_V2_ext: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO1 = Inst.getOperand(1); + MCOperand &MO2 = Inst.getOperand(2); + int64_t Value; + if (MO2.getExpr()->evaluateAsAbsolute(Value)) { + int s8 = Value; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + } + Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, MO1, MO2); + break; + } + + // Handle $Rdd = combine(#imm, ##imm)" + case Hexagon::A4_combineii: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO1 = Inst.getOperand(1); + int64_t Value; + if (MO1.getExpr()->evaluateAsAbsolute(Value)) { + int s8 = Value; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + } + MCOperand &MO2 = Inst.getOperand(2); + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, MO1, MO2); + break; + } + + case Hexagon::S2_tableidxb_goodsyntax: { + Inst.setOpcode(Hexagon::S2_tableidxb); + break; + } + + case Hexagon::S2_tableidxh_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(1, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxh); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::S2_tableidxw_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(2, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxw); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::S2_tableidxd_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(3, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxd); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + 
TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::M2_mpyui: { + Inst.setOpcode(Hexagon::M2_mpyi); + break; + } + case Hexagon::M2_mpysmi: { + MCInst TmpInst; + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (!MustExtend) { + if (Value < 0 && Value > -256) { + Imm.setExpr(MCConstantExpr::create(Value * -1, Context)); + TmpInst.setOpcode(Hexagon::M2_mpysin); + } else if (Value < 256 && Value >= 0) + TmpInst.setOpcode(Hexagon::M2_mpysip); + else + return Match_InvalidOperand; + } else { + if (Value >= 0) + TmpInst.setOpcode(Hexagon::M2_mpysip); + else + return Match_InvalidOperand; + } + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm); + Inst = TmpInst; + break; + } + + case Hexagon::S2_asr_i_r_rnd_goodsyntax: { + MCOperand &Imm = Inst.getOperand(2); + MCInst TmpInst; + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { // convert to $Rd = $Rs + TmpInst.setOpcode(Hexagon::A2_tfr); + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd); + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm); + } + Inst = TmpInst; + break; + } + + case Hexagon::S2_asr_i_p_rnd_goodsyntax: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1]) + MCInst TmpInst; + unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + Rss.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. 
+ std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(Rss); + TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst = TmpInst; + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S2_asr_i_p_rnd); + } + break; + } + + case Hexagon::A4_boundscheck: { + MCOperand &Rs = Inst.getOperand(1); + unsigned int RegNum = RI->getEncodingValue(Rs.getReg()); + if (RegNum & 1) { // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2 + Inst.setOpcode(Hexagon::A4_boundscheck_hi); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } else { // raw:lo + Inst.setOpcode(Hexagon::A4_boundscheck_lo); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::A2_addsp: { + MCOperand &Rs = Inst.getOperand(1); + unsigned int RegNum = RI->getEncodingValue(Rs.getReg()); + if (RegNum & 1) { // Odd mapped to raw:hi + Inst.setOpcode(Hexagon::A2_addsph); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped raw:lo + Inst.setOpcode(Hexagon::A2_addspl); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::M2_vrcmpys_s1: { + MCOperand &Rt = Inst.getOperand(2); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to sat:raw:hi + Inst.setOpcode(Hexagon::M2_vrcmpys_s1_h); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped sat:raw:lo + Inst.setOpcode(Hexagon::M2_vrcmpys_s1_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::M2_vrcmpys_acc_s1: { + MCInst TmpInst; + MCOperand &Rxx = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(2); + MCOperand &Rt = Inst.getOperand(3); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to sat:raw:hi + TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped sat:raw:lo + TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + // Registers are in different positions + TmpInst.addOperand(Rxx); + TmpInst.addOperand(Rxx); + TmpInst.addOperand(Rss); + TmpInst.addOperand(Rt); + Inst = TmpInst; + break; + } + + case Hexagon::M2_vrcmpys_s1rp: { + MCOperand &Rt = Inst.getOperand(2); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to rnd:sat:raw:hi + Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h); + std::string Name = + r + llvm::utostr_32(RegNum) + 
Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped rnd:sat:raw:lo + Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::S5_asrhub_rnd_sat_goodsyntax: { + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) + Inst.setOpcode(Hexagon::S2_vsathub); + else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S5_asrhub_rnd_sat); + } + break; + } + + case Hexagon::S5_vasrhrnd_goodsyntax: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { + MCInst TmpInst; + unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + Rss.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(Rss); + TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst = TmpInst; + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S5_vasrhrnd); + } + break; + } + + case Hexagon::A2_not: { + MCInst TmpInst; + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.setOpcode(Hexagon::A2_subri); + TmpInst.addOperand(Rd); + TmpInst.addOperand( + MCOperand::createExpr(MCConstantExpr::create(-1, Context))); + TmpInst.addOperand(Rs); + Inst = TmpInst; + break; + } + } // switch + + return Match_Success; +} diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp index cb7e633..ea96eb0 100644 --- a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -868,7 +868,7 @@ void BT::visitNonBranch(const MachineInstr *MI) { continue; bool Changed = false; - if (!Eval || !ResMap.has(RD.Reg)) { + if (!Eval || ResMap.count(RD.Reg) == 0) { // Set to "ref" (aka "bottom"). uint16_t DefBW = ME.getRegBitWidth(RD); RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW); @@ -951,11 +951,11 @@ void BT::visitBranchesFrom(const MachineInstr *BI) { // be processed. for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) { const MachineBasicBlock *SB = *I; - if (SB->isLandingPad()) + if (SB->isEHPad()) Targets.insert(SB); } if (FallsThrough) { - MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator BIt = B.getIterator(); MachineFunction::const_iterator Next = std::next(BIt); if (Next != MF.end()) Targets.insert(&*Next); @@ -1005,7 +1005,7 @@ void BT::put(RegisterRef RR, const RegisterCell &RC) { // Replace all references to bits from OldRR with the corresponding bits // in NewRR. 
void BT::subst(RegisterRef OldRR, RegisterRef NewRR) { - assert(Map.has(OldRR.Reg) && "OldRR not present in map"); + assert(Map.count(OldRR.Reg) > 0 && "OldRR not present in map"); BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub); BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub); uint16_t OMB = OM.first(), OME = OM.last(); @@ -1104,9 +1104,9 @@ void BT::run() { } // If block end has been reached, add the fall-through edge to the queue. if (It == End) { - MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator BIt = B.getIterator(); MachineFunction::const_iterator Next = std::next(BIt); - if (Next != MF.end()) { + if (Next != MF.end() && B.isSuccessor(&*Next)) { int ThisN = B.getNumber(); int NextN = Next->getNumber(); FlowQ.push(CFGEdge(ThisN, NextN)); diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm/lib/Target/Hexagon/BitTracker.h index ed002a7..959c831 100644 --- a/contrib/llvm/lib/Target/Hexagon/BitTracker.h +++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.h @@ -36,9 +36,7 @@ struct BitTracker { typedef SetVector<const MachineBasicBlock *> BranchTargetList; - struct CellMapType : public std::map<unsigned,RegisterCell> { - bool has(unsigned Reg) const; - }; + typedef std::map<unsigned, RegisterCell> CellMapType; BitTracker(const MachineEvaluator &E, MachineFunction &F); ~BitTracker(); @@ -79,7 +77,6 @@ private: // Abstraction of a reference to bit at position Pos from a register Reg. struct BitTracker::BitRef { BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {} - BitRef(const BitRef &BR) : Reg(BR.Reg), Pos(BR.Pos) {} bool operator== (const BitRef &BR) const { // If Reg is 0, disregard Pos. return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos); @@ -146,7 +143,6 @@ struct BitTracker::BitValue { BitValue(ValueType T = Top) : Type(T) {} BitValue(bool B) : Type(B ? One : Zero) {} - BitValue(const BitValue &V) : Type(V.Type), RefI(V.RefI) {} BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {} bool operator== (const BitValue &V) const { @@ -279,11 +275,6 @@ struct BitTracker::RegisterCell { return !operator==(RC); } - const RegisterCell &operator=(const RegisterCell &RC) { - Bits = RC.Bits; - return *this; - } - // Generate a "ref" cell for the corresponding register. In the resulting // cell each bit will be described as being the same as the corresponding // bit in register Reg (i.e. the cell is "defined" by register Reg). @@ -344,11 +335,6 @@ BitTracker::RegisterCell::ref(const RegisterCell &C) { return RC; } - -inline bool BitTracker::CellMapType::has(unsigned Reg) const { - return find(Reg) != end(); -} - // A class to evaluate target's instructions and update the cell maps. // This is used internally by the bit tracker. 
A target that wants to // utilize this should implement the evaluation functions (noted below) diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 9cc1e94..4a9c341 100644 --- a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -7,42 +7,45 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "hexagon-disassembler" + #include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" -#include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" - -#include "llvm/MC/MCContext.h" +#include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonInstPrinter.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/MemoryObject.h" #include "llvm/Support/raw_ostream.h" -#include <array> +#include "llvm/Support/TargetRegistry.h" #include <vector> using namespace llvm; using namespace Hexagon; -#define DEBUG_TYPE "hexagon-disassembler" - -// Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; +typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { /// \brief Hexagon disassembler for all Hexagon platforms. class HexagonDisassembler : public MCDisassembler { public: + std::unique_ptr<MCInstrInfo const> const MCII; std::unique_ptr<MCInst *> CurrentBundle; - HexagonDisassembler(MCSubtargetInfo const &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx), CurrentBundle(new MCInst *) {} + HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + MCInstrInfo const *MCII) + : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *) {} DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -52,23 +55,57 @@ public: ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &VStream, raw_ostream &CStream) const override; + + void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const; + void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const; }; } -static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, +// Forward declare these because the auto-generated code will reference them. +// Definitions are further down. 
+ +static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - void const *Decoder); + const void *Decoder); + +static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn); +static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn, + void const *Decoder); static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op, raw_ostream &os); -static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst); +static unsigned getRegFromSubinstEncoding(unsigned encoded_reg); + +static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, + uint64_t Address, const void *Decoder); static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, @@ -95,129 +132,19 @@ static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); - -static const uint16_t IntRegDecoderTable[] = { - Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, - Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, - Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, - Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19, - Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24, - Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, - Hexagon::R30, Hexagon::R31}; - -static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, - Hexagon::P2, Hexagon::P3}; - -static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - const uint16_t Table[], size_t Size) { - if (RegNo < Size) { - Inst.addOperand(MCOperand::createReg(Table[RegNo])); - return MCDisassembler::Success; - } else - return MCDisassembler::Fail; -} - -static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Register = IntRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - static const uint16_t CtrlRegDecoderTable[] = { - Hexagon::SA0, 
Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, - Hexagon::P3_0, Hexagon::NoRegister, Hexagon::C6, Hexagon::C7, - Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, - Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPCH}; - - if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0])) - return MCDisassembler::Fail; - - if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) - return MCDisassembler::Fail; - - unsigned Register = CtrlRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - static const uint16_t CtrlReg64DecoderTable[] = { - Hexagon::C1_0, Hexagon::NoRegister, Hexagon::C3_2, - Hexagon::NoRegister, Hexagon::NoRegister, Hexagon::NoRegister, - Hexagon::C7_6, Hexagon::NoRegister, Hexagon::C9_8, - Hexagon::NoRegister, Hexagon::C11_10, Hexagon::NoRegister, - Hexagon::CS, Hexagon::NoRegister, Hexagon::UPC, - Hexagon::NoRegister}; - - if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0])) - return MCDisassembler::Fail; - - if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) - return MCDisassembler::Fail; - - unsigned Register = CtrlReg64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - unsigned Register = 0; - switch (RegNo) { - case 0: - Register = Hexagon::M0; - break; - case 1: - Register = Hexagon::M1; - break; - default: - return MCDisassembler::Fail; - } - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - static const uint16_t DoubleRegDecoderTable[] = { - Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, - Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, - Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, - Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable, - sizeof(DoubleRegDecoderTable))); -} - -static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - if (RegNo > 3) - return MCDisassembler::Fail; - - unsigned Register = PredRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} +static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); #include "HexagonGenDisassemblerTables.inc" -static MCDisassembler *createHexagonDisassembler(Target const &T, - MCSubtargetInfo const &STI, +static MCDisassembler *createHexagonDisassembler(const Target &T, + const MCSubtargetInfo &STI, MCContext &Ctx) { - return new HexagonDisassembler(STI, Ctx); + return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo()); } extern "C" void LLVMInitializeHexagonDisassembler() { @@ -235,8 +162,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Size = 0; *CurrentBundle = &MI; - MI.setOpcode(Hexagon::BUNDLE); - 
MI.addOperand(MCOperand::createImm(0)); + MI = HexagonMCInstrInfo::createBundle(); while (Result == Success && Complete == false) { if (Bytes.size() < HEXAGON_INSTR_SIZE) return MCDisassembler::Fail; @@ -246,7 +172,21 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Size += HEXAGON_INSTR_SIZE; Bytes = Bytes.slice(HEXAGON_INSTR_SIZE); } - return Result; + if(Result == MCDisassembler::Fail) + return Result; + HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo()); + if(!Checker.check()) + return MCDisassembler::Fail; + return MCDisassembler::Success; +} + +namespace { +HexagonDisassembler const &disassembler(void const *Decoder) { + return *static_cast<HexagonDisassembler const *>(Decoder); +} +MCContext &contextFromDecoder(void const *Decoder) { + return disassembler(Decoder).getContext(); +} } DecodeStatus HexagonDisassembler::getSingleInstruction( @@ -255,8 +195,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( assert(Bytes.size() >= HEXAGON_INSTR_SIZE); uint32_t Instruction = - llvm::support::endian::read<uint32_t, llvm::support::little, - llvm::support::unaligned>(Bytes.data()); + (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB); if ((Instruction & HexagonII::INST_PARSE_MASK) == @@ -360,8 +299,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( MILow->setOpcode(opLow); MCInst *MIHigh = new (getContext()) MCInst; MIHigh->setOpcode(opHigh); - AddSubinstOperands(MILow, opLow, instLow); - AddSubinstOperands(MIHigh, opHigh, instHigh); + addSubinstOperands(MILow, opLow, instLow); + addSubinstOperands(MIHigh, opHigh, instHigh); // see ConvertToSubInst() in // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -378,102 +317,774 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( // Calling the auto-generated decoder function. Result = decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI); + + // If a, "standard" insn isn't found check special cases. + if (MCDisassembler::Success != Result || + MI.getOpcode() == Hexagon::A4_ext) { + Result = decodeImmext(MI, Instruction, this); + if (MCDisassembler::Success != Result) { + Result = decodeSpecial(MI, Instruction); + } + } else { + // If the instruction is a compound instruction, register values will + // follow the duplex model, so the register values in the MCInst are + // incorrect. If the instruction is a compound, loop through the + // operands and change registers appropriately. 
+ if (llvm::HexagonMCInstrInfo::getType(*MCII, MI) == + HexagonII::TypeCOMPOUND) { + for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) { + if (i->isReg()) { + unsigned reg = i->getReg() - Hexagon::R0; + i->setReg(getRegFromSubinstEncoding(reg)); + } + } + } + } + } + + if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) { + unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI); + MCOperand &MCO = MI.getOperand(OpIndex); + assert(MCO.isReg() && "New value consumers must be registers"); + unsigned Register = + getContext().getRegisterInfo()->getEncodingValue(MCO.getReg()); + if ((Register & 0x6) == 0) + // HexagonPRM 10.11 Bit 1-2 == 0 is reserved + return MCDisassembler::Fail; + unsigned Lookback = (Register & 0x6) >> 1; + unsigned Offset = 1; + bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI); + auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle); + auto i = Instructions.end() - 1; + for (auto n = Instructions.begin() - 1;; --i, ++Offset) { + if (i == n) + // Couldn't find producer + return MCDisassembler::Fail; + if (Vector && !HexagonMCInstrInfo::isVector(*MCII, *i->getInst())) + // Skip scalars when calculating distances for vectors + ++Lookback; + if (HexagonMCInstrInfo::isImmext(*i->getInst())) + ++Lookback; + if (Offset == Lookback) + break; + } + auto const &Inst = *i->getInst(); + bool SubregBit = (Register & 0x1) != 0; + if (SubregBit && HexagonMCInstrInfo::hasNewValue2(*MCII, Inst)) { + // If subreg bit is set we're selecting the second produced newvalue + unsigned Producer = + HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg(); + assert(Producer != Hexagon::NoRegister); + MCO.setReg(Producer); + } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) { + unsigned Producer = + HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg(); + if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) + Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0; + else if (SubregBit) + // Subreg bit should not be set for non-doublevector newvalue producers + return MCDisassembler::Fail; + assert(Producer != Hexagon::NoRegister); + MCO.setReg(Producer); + } else + return MCDisassembler::Fail; } + adjustExtendedInstructions(MI, MCB); + MCInst const *Extender = + HexagonMCInstrInfo::extenderForIndex(MCB, + HexagonMCInstrInfo::bundleSize(MCB)); + if(Extender != nullptr) { + MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ? + *MI.getOperand(1).getInst() : MI; + if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) && + !HexagonMCInstrInfo::isExtended(*MCII, Inst)) + return MCDisassembler::Fail; + } return Result; } +void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI, + MCInst const &MCB) const { + if (!HexagonMCInstrInfo::hasExtenderForIndex( + MCB, HexagonMCInstrInfo::bundleSize(MCB))) { + unsigned opcode; + // This code is used by the disassembler to disambiguate between GP + // relative and absolute addressing instructions since they both have + // same encoding bits. However, an absolute addressing instruction must + // follow an immediate extender. Disassembler alwaus select absolute + // addressing instructions first and uses this code to change them into + // GP relative instruction in the absence of the corresponding immediate + // extender. 
+ switch (MCI.getOpcode()) { + case Hexagon::S2_storerbabs: + opcode = Hexagon::S2_storerbgp; + break; + case Hexagon::S2_storerhabs: + opcode = Hexagon::S2_storerhgp; + break; + case Hexagon::S2_storerfabs: + opcode = Hexagon::S2_storerfgp; + break; + case Hexagon::S2_storeriabs: + opcode = Hexagon::S2_storerigp; + break; + case Hexagon::S2_storerbnewabs: + opcode = Hexagon::S2_storerbnewgp; + break; + case Hexagon::S2_storerhnewabs: + opcode = Hexagon::S2_storerhnewgp; + break; + case Hexagon::S2_storerinewabs: + opcode = Hexagon::S2_storerinewgp; + break; + case Hexagon::S2_storerdabs: + opcode = Hexagon::S2_storerdgp; + break; + case Hexagon::L4_loadrb_abs: + opcode = Hexagon::L2_loadrbgp; + break; + case Hexagon::L4_loadrub_abs: + opcode = Hexagon::L2_loadrubgp; + break; + case Hexagon::L4_loadrh_abs: + opcode = Hexagon::L2_loadrhgp; + break; + case Hexagon::L4_loadruh_abs: + opcode = Hexagon::L2_loadruhgp; + break; + case Hexagon::L4_loadri_abs: + opcode = Hexagon::L2_loadrigp; + break; + case Hexagon::L4_loadrd_abs: + opcode = Hexagon::L2_loadrdgp; + break; + default: + opcode = MCI.getOpcode(); + } + MCI.setOpcode(opcode); + } +} + +namespace llvm { +extern const MCInstrDesc HexagonInsts[]; +} + +static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, + ArrayRef<MCPhysReg> Table) { + if (RegNo < Table.size()) { + Inst.addOperand(MCOperand::createReg(Table[RegNo])); + return MCDisassembler::Success; + } + + return MCDisassembler::Fail; +} + +static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + static const MCPhysReg IntRegDecoderTable[] = { + Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, + Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, + Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, + Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19, + Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24, + Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, + Hexagon::R30, Hexagon::R31}; + + return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable); +} + +static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecRegDecoderTable[] = { + Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4, + Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14, + Hexagon::V15, Hexagon::V16, Hexagon::V17, Hexagon::V18, Hexagon::V19, + Hexagon::V20, Hexagon::V21, Hexagon::V22, Hexagon::V23, Hexagon::V24, + Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, + Hexagon::V30, Hexagon::V31}; + + return DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable); +} + +static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg DoubleRegDecoderTable[] = { + Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, + Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, + Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, + Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; + + return DecodeRegisterClass(Inst, RegNo >> 1, 
DoubleRegDecoderTable); +} + +static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecDblRegDecoderTable[] = { + Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7, + Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, + Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; + + return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable)); +} + +static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, + Hexagon::P2, Hexagon::P3}; + + return DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable); +} + +static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, + Hexagon::Q2, Hexagon::Q3}; + + return DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable); +} + +static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg CtrlRegDecoderTable[] = { + Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, + Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7, + Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, + Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC + }; + + if (RegNo >= array_lengthof(CtrlRegDecoderTable)) + return MCDisassembler::Fail; + + if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) + return MCDisassembler::Fail; + + unsigned Register = CtrlRegDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg CtrlReg64DecoderTable[] = { + Hexagon::C1_0, Hexagon::NoRegister, + Hexagon::C3_2, Hexagon::NoRegister, + Hexagon::C7_6, Hexagon::NoRegister, + Hexagon::C9_8, Hexagon::NoRegister, + Hexagon::C11_10, Hexagon::NoRegister, + Hexagon::CS, Hexagon::NoRegister, + Hexagon::UPC, Hexagon::NoRegister + }; + + if (RegNo >= array_lengthof(CtrlReg64DecoderTable)) + return MCDisassembler::Fail; + + if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) + return MCDisassembler::Fail; + + unsigned Register = CtrlReg64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + unsigned Register = 0; + switch (RegNo) { + case 0: + Register = Hexagon::M0; + break; + case 1: + Register = Hexagon::M1; + break; + default: + return MCDisassembler::Fail; + } + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +namespace { +uint32_t fullValue(MCInstrInfo const &MCII, + MCInst &MCB, + MCInst &MI, + int64_t Value) { + MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex( + MCB, HexagonMCInstrInfo::bundleSize(MCB)); + if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI)) + return Value; + unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI); + uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f; + int64_t Bits; + bool Success = 
Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits); + assert(Success);(void)Success; + uint32_t Upper26 = static_cast<uint32_t>(Bits); + uint32_t Operand = Upper26 | Lower6; + return Operand; +} +template <size_t T> +void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + int64_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, SignExtend64<T>(tmp)); + int64_t Extended = SignExtend64<32>(FullValue); + HexagonMCInstrInfo::addConstant(MI, Extended, + Disassembler.getContext()); +} +} + +static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, + const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + int64_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, tmp); + assert(FullValue >= 0 && "Negative in unsigned decoder"); + HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext()); + return MCDisassembler::Success; +} + static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<16>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<16>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<12>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<12>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<11>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<11>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<12>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder)); return MCDisassembler::Success; } static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<13>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<13>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<14>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<14>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s10ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<10>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<10>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s8ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<8>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<8>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<6>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t 
/*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<4>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<4>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<5>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<5>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<6>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<7>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<7>(MI, tmp, Decoder); return MCDisassembler::Success; } +static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + signedDecoder<10>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + signedDecoder<19>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +// custom decoder for various jump/call immediates +static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI); + // r13_2 is not extendable, so if there are no extent bits, it's r13_2 + if (Bits == 0) + Bits = 15; + uint32_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, SignExtend64(tmp, Bits)); + int64_t Extended = SignExtend64<32>(FullValue) + Address; + if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, + 0, 4)) + HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext()); + return MCDisassembler::Success; +} + +// Addressing mode dependent load store opcode map. +// - If an insn is preceded by an extender the address is absolute. +// - memw(##symbol) = r0 +// - If an insn is not preceded by an extender the address is GP relative. +// - memw(gp + #symbol) = r0 +// Please note that the instructions must be ordered in the descending order +// of their opcode. 
+// HexagonII::INST_ICLASS_ST +static const unsigned int StoreConditionalOpcodeData[][2] = { + {S4_pstorerdfnew_abs, 0xafc02084}, + {S4_pstorerdtnew_abs, 0xafc02080}, + {S4_pstorerdf_abs, 0xafc00084}, + {S4_pstorerdt_abs, 0xafc00080}, + {S4_pstorerinewfnew_abs, 0xafa03084}, + {S4_pstorerinewtnew_abs, 0xafa03080}, + {S4_pstorerhnewfnew_abs, 0xafa02884}, + {S4_pstorerhnewtnew_abs, 0xafa02880}, + {S4_pstorerbnewfnew_abs, 0xafa02084}, + {S4_pstorerbnewtnew_abs, 0xafa02080}, + {S4_pstorerinewf_abs, 0xafa01084}, + {S4_pstorerinewt_abs, 0xafa01080}, + {S4_pstorerhnewf_abs, 0xafa00884}, + {S4_pstorerhnewt_abs, 0xafa00880}, + {S4_pstorerbnewf_abs, 0xafa00084}, + {S4_pstorerbnewt_abs, 0xafa00080}, + {S4_pstorerifnew_abs, 0xaf802084}, + {S4_pstoreritnew_abs, 0xaf802080}, + {S4_pstorerif_abs, 0xaf800084}, + {S4_pstorerit_abs, 0xaf800080}, + {S4_pstorerhfnew_abs, 0xaf402084}, + {S4_pstorerhtnew_abs, 0xaf402080}, + {S4_pstorerhf_abs, 0xaf400084}, + {S4_pstorerht_abs, 0xaf400080}, + {S4_pstorerbfnew_abs, 0xaf002084}, + {S4_pstorerbtnew_abs, 0xaf002080}, + {S4_pstorerbf_abs, 0xaf000084}, + {S4_pstorerbt_abs, 0xaf000080}}; +// HexagonII::INST_ICLASS_LD + +// HexagonII::INST_ICLASS_LD_ST_2 +static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000}, + {L4_loadri_abs, 0x49800000}, + {L4_loadruh_abs, 0x49600000}, + {L4_loadrh_abs, 0x49400000}, + {L4_loadrub_abs, 0x49200000}, + {L4_loadrb_abs, 0x49000000}, + {S2_storerdabs, 0x48c00000}, + {S2_storerinewabs, 0x48a01000}, + {S2_storerhnewabs, 0x48a00800}, + {S2_storerbnewabs, 0x48a00000}, + {S2_storeriabs, 0x48800000}, + {S2_storerfabs, 0x48600000}, + {S2_storerhabs, 0x48400000}, + {S2_storerbabs, 0x48000000}}; +static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData); +static const size_t NumLS = array_lengthof(LoadStoreOpcodeData); + +static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { + + unsigned MachineOpcode = 0; + unsigned LLVMOpcode = 0; + + if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) { + for (size_t i = 0; i < NumCondS; ++i) { + if ((insn & StoreConditionalOpcodeData[i][1]) == + StoreConditionalOpcodeData[i][1]) { + MachineOpcode = StoreConditionalOpcodeData[i][1]; + LLVMOpcode = StoreConditionalOpcodeData[i][0]; + break; + } + } + } + if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) { + for (size_t i = 0; i < NumLS; ++i) { + if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) { + MachineOpcode = LoadStoreOpcodeData[i][1]; + LLVMOpcode = LoadStoreOpcodeData[i][0]; + break; + } + } + } + + if (MachineOpcode) { + unsigned Value = 0; + unsigned shift = 0; + MI.setOpcode(LLVMOpcode); + // Remove the parse bits from the insn. 
+ insn &= ~HexagonII::INST_PARSE_MASK; + + switch (LLVMOpcode) { + default: + return MCDisassembler::Fail; + break; + + case Hexagon::S4_pstorerdf_abs: + case Hexagon::S4_pstorerdt_abs: + case Hexagon::S4_pstorerdfnew_abs: + case Hexagon::S4_pstorerdtnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Rtt + Value = (insn >> 8) & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::S4_pstorerbnewf_abs: + case Hexagon::S4_pstorerbnewt_abs: + case Hexagon::S4_pstorerbnewfnew_abs: + case Hexagon::S4_pstorerbnewtnew_abs: + case Hexagon::S4_pstorerhnewf_abs: + case Hexagon::S4_pstorerhnewt_abs: + case Hexagon::S4_pstorerhnewfnew_abs: + case Hexagon::S4_pstorerhnewtnew_abs: + case Hexagon::S4_pstorerinewf_abs: + case Hexagon::S4_pstorerinewt_abs: + case Hexagon::S4_pstorerinewfnew_abs: + case Hexagon::S4_pstorerinewtnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Nt + Value = (insn >> 8) & UINT64_C(7); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::S4_pstorerbf_abs: + case Hexagon::S4_pstorerbt_abs: + case Hexagon::S4_pstorerbfnew_abs: + case Hexagon::S4_pstorerbtnew_abs: + case Hexagon::S4_pstorerhf_abs: + case Hexagon::S4_pstorerht_abs: + case Hexagon::S4_pstorerhfnew_abs: + case Hexagon::S4_pstorerhtnew_abs: + case Hexagon::S4_pstorerif_abs: + case Hexagon::S4_pstorerit_abs: + case Hexagon::S4_pstorerifnew_abs: + case Hexagon::S4_pstoreritnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Rt + Value = (insn >> 8) & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::L4_ploadrdf_abs: + case Hexagon::L4_ploadrdt_abs: + case Hexagon::L4_ploadrdfnew_abs: + case Hexagon::L4_ploadrdtnew_abs: { + // op: Rdd + Value = insn & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + // op: Pt + Value = ((insn >> 9) & UINT64_C(3)); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = ((insn >> 15) & UINT64_C(62)); + Value |= ((insn >> 8) & UINT64_C(1)); + MI.addOperand(MCOperand::createImm(Value)); + break; + } + + case Hexagon::L4_ploadrbf_abs: + case Hexagon::L4_ploadrbt_abs: + case Hexagon::L4_ploadrbfnew_abs: + case Hexagon::L4_ploadrbtnew_abs: + case Hexagon::L4_ploadrhf_abs: + case Hexagon::L4_ploadrht_abs: + case Hexagon::L4_ploadrhfnew_abs: + case Hexagon::L4_ploadrhtnew_abs: + case Hexagon::L4_ploadrubf_abs: + case Hexagon::L4_ploadrubt_abs: + case Hexagon::L4_ploadrubfnew_abs: + case Hexagon::L4_ploadrubtnew_abs: + case Hexagon::L4_ploadruhf_abs: + case Hexagon::L4_ploadruht_abs: + case Hexagon::L4_ploadruhfnew_abs: + case Hexagon::L4_ploadruhtnew_abs: + case Hexagon::L4_ploadrif_abs: + case Hexagon::L4_ploadrit_abs: + case Hexagon::L4_ploadrifnew_abs: + case Hexagon::L4_ploadritnew_abs: + // op: Rd + Value = insn & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + // op: Pt + Value = (insn >> 9) & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 
+ Value = (insn >> 15) & UINT64_C(62); + Value |= (insn >> 8) & UINT64_C(1); + MI.addOperand(MCOperand::createImm(Value)); + break; + + // op: g16_2 + case (Hexagon::L4_loadri_abs): + ++shift; + // op: g16_1 + case Hexagon::L4_loadrh_abs: + case Hexagon::L4_loadruh_abs: + ++shift; + // op: g16_0 + case Hexagon::L4_loadrb_abs: + case Hexagon::L4_loadrub_abs: { + // op: Rd + Value |= insn & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(511); + MI.addOperand(MCOperand::createImm(Value << shift)); + break; + } + + case Hexagon::L4_loadrd_abs: { + Value = insn & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(511); + MI.addOperand(MCOperand::createImm(Value << 3)); + break; + } + + case Hexagon::S2_storerdabs: { + // op: g16_3 + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << 3)); + // op: Rtt + Value = (insn >> 8) & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + break; + } + + // op: g16_2 + case Hexagon::S2_storerinewabs: + ++shift; + // op: g16_1 + case Hexagon::S2_storerhnewabs: + ++shift; + // op: g16_0 + case Hexagon::S2_storerbnewabs: { + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << shift)); + // op: Nt + Value = (insn >> 8) & UINT64_C(7); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + // op: g16_2 + case Hexagon::S2_storeriabs: + ++shift; + // op: g16_1 + case Hexagon::S2_storerhabs: + case Hexagon::S2_storerfabs: + ++shift; + // op: g16_0 + case Hexagon::S2_storerbabs: { + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << shift)); + // op: Rt + Value = (insn >> 8) & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + } + return MCDisassembler::Success; + } + return MCDisassembler::Fail; +} + +static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn, + void const *Decoder) { + + // Instruction Class for a constant a extender: bits 31:28 = 0x0000 + if ((~insn & 0xf0000000) == 0xf0000000) { + unsigned Value; + // 27:16 High 12 bits of 26-bit extender. + Value = (insn & 0x0fff0000) << 4; + // 13:0 Low 14 bits of 26-bit extender. 
+ Value |= ((insn & 0x3fff) << 6); + MI.setOpcode(Hexagon::A4_ext); + HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder)); + return MCDisassembler::Success; + } + return MCDisassembler::Fail; +} + // These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td enum subInstBinaryValues { V4_SA1_addi_BITS = 0x0000, @@ -731,6 +1342,8 @@ static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) { return Hexagon::R0 + encoded_reg; else if (encoded_reg < 16) return Hexagon::R0 + encoded_reg + 8; + + // patently false value return Hexagon::NoRegister; } @@ -739,10 +1352,13 @@ static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) { return Hexagon::D0 + encoded_dreg; else if (encoded_dreg < 8) return Hexagon::D0 + encoded_dreg + 4; + + // patently false value return Hexagon::NoRegister; } -static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { +void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode, + unsigned inst) const { int64_t operand; MCOperand Op; switch (opcode) { @@ -762,8 +1378,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { case Hexagon::V4_SS2_allocframe: // u 8-4{5_3} operand = ((inst & 0x1f0) >> 4) << 3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL1_loadri_io: // Rd 3-0, Rs 7-4, u 11-8{4_2} @@ -774,8 +1389,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 6; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL1_loadrub_io: // Rd 3-0, Rs 7-4, u 11-8 @@ -786,8 +1400,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrb_io: // Rd 3-0, Rs 7-4, u 10-8 @@ -798,8 +1411,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x700) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrh_io: case Hexagon::V4_SL2_loadruh_io: @@ -811,8 +1423,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x700) >> 8) << 1; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrd_sp: // Rdd 2-0, u 7-3{5_3} @@ -820,8 +1431,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x0f8) >> 3) << 3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadri_sp: // Rd 3-0, u 8-4{5_2} @@ -829,8 +1439,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x1f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - 
MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_addi: // Rx 3-0 (x2), s7 10-4 @@ -839,8 +1448,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { MI->addOperand(Op); MI->addOperand(Op); operand = SignExtend64<7>((inst & 0x7f0) >> 4); - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_addrx: // Rx 3-0 (x2), Rs 7-4 @@ -873,8 +1481,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x3f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_seti: // Rd 3-0, u 9-4 @@ -882,8 +1489,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x3f0) >> 4; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_clrf: case Hexagon::V4_SA1_clrfnew: @@ -901,8 +1507,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = inst & 0x3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_combine0i: case Hexagon::V4_SA1_combine1i: @@ -913,8 +1518,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x060) >> 5; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_combinerz: case Hexagon::V4_SA1_combinezr: @@ -932,8 +1536,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -944,8 +1547,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0xf00) >> 8) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -957,8 +1559,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = inst & 0xf; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SS2_storewi0: case Hexagon::V4_SS2_storewi1: @@ -967,25 +1568,23 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SS2_stored_sp: // s 
8-3{6_3}, Rtt 2-0 operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3); - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getDRegFromSubinstEncoding(inst & 0x7); Op = MCOperand::createReg(operand); MI->addOperand(Op); + break; case Hexagon::V4_SS2_storeh_io: // Rs 7-4, u 10-8{3_1}, Rt 3-0 operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4); Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x700) >> 8) << 1; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -993,8 +1592,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { case Hexagon::V4_SS2_storew_sp: // u 8-4{5_2}, Rd 3-0 operand = ((inst & 0x1f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h index d360be2..ed7d957 100644 --- a/contrib/llvm/lib/Target/Hexagon/Hexagon.h +++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h @@ -47,15 +47,8 @@ #include "llvm/Target/TargetMachine.h" namespace llvm { - class MachineInstr; - class MCInst; - class MCInstrInfo; - class HexagonAsmPrinter; class HexagonTargetMachine; - void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, - HexagonAsmPrinter &AP); - /// \brief Creates a Hexagon-specific Target Transformation Info pass. ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM); } // end namespace llvm; diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td index 53a687c..5a7eb21 100644 --- a/contrib/llvm/lib/Target/Hexagon/Hexagon.td +++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td @@ -24,14 +24,32 @@ include "llvm/Target/Target.td" // Hexagon Architectures def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon V4">; def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">; +def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">; +def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">; + +// Hexagon ISA Extensions +def ExtensionHVX: SubtargetFeature<"hvx", "UseHVXOps", + "true", "Hexagon HVX instructions">; +def ExtensionHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", + "true", "Hexagon HVX Double instructions">; //===----------------------------------------------------------------------===// // Hexagon Instruction Predicate Definitions. 
//===----------------------------------------------------------------------===// -def HasV5T : Predicate<"HST->hasV5TOps()">; -def NoV5T : Predicate<"!HST->hasV5TOps()">; -def UseMEMOP : Predicate<"HST->useMemOps()">; -def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">; +def HasV5T : Predicate<"HST->hasV5TOps()">; +def NoV5T : Predicate<"!HST->hasV5TOps()">; +def HasV55T : Predicate<"HST->hasV55TOps()">, + AssemblerPredicate<"ArchV55">; +def HasV60T : Predicate<"HST->hasV60TOps()">, + AssemblerPredicate<"ArchV60">; +def UseMEMOP : Predicate<"HST->useMemOps()">; +def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">; +def UseHVXDbl : Predicate<"HST->useHVXDblOps()">, + AssemblerPredicate<"ExtensionHVXDbl">; +def UseHVXSgl : Predicate<"HST->useHVXSglOps()">; + +def UseHVX : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">, + AssemblerPredicate<"ExtensionHVX">; //===----------------------------------------------------------------------===// // Classes used for relation maps. @@ -53,6 +71,7 @@ class NewValueRel: PredNewRel; // NewValueRel - Filter class used to relate load/store instructions having // different addressing modes with each other. class AddrModeRel: NewValueRel; +class IntrinsicsRel; //===----------------------------------------------------------------------===// // Generate mapping table to relate non-predicate instructions with their @@ -62,7 +81,7 @@ class AddrModeRel: NewValueRel; def getPredOpcode : InstrMapping { let FilterClass = "PredRel"; // Instructions with the same BaseOpcode and isNVStore values form a row. - let RowFields = ["BaseOpcode", "isNVStore", "PNewValue"]; + let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"]; // Instructions with the same predicate sense form a column. let ColFields = ["PredSense"]; // The key column is the unpredicated instructions. 
@@ -77,7 +96,7 @@ def getPredOpcode : InstrMapping { // def getFalsePredOpcode : InstrMapping { let FilterClass = "PredRel"; - let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"]; + let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"]; let ColFields = ["PredSense"]; let KeyCol = ["true"]; let ValueCols = [["false"]]; @@ -89,7 +108,7 @@ def getFalsePredOpcode : InstrMapping { // def getTruePredOpcode : InstrMapping { let FilterClass = "PredRel"; - let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"]; + let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"]; let ColFields = ["PredSense"]; let KeyCol = ["false"]; let ValueCols = [["true"]]; @@ -125,7 +144,7 @@ def getPredOldOpcode : InstrMapping { // def getNewValueOpcode : InstrMapping { let FilterClass = "NewValueRel"; - let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"]; + let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"]; let ColFields = ["NValueST"]; let KeyCol = ["false"]; let ValueCols = [["true"]]; @@ -137,16 +156,16 @@ def getNewValueOpcode : InstrMapping { // def getNonNVStore : InstrMapping { let FilterClass = "NewValueRel"; - let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"]; + let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"]; let ColFields = ["NValueST"]; let KeyCol = ["true"]; let ValueCols = [["false"]]; } -def getBasedWithImmOffset : InstrMapping { +def getBaseWithImmOffset : InstrMapping { let FilterClass = "AddrModeRel"; let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore", - "isMEMri", "isFloat"]; + "isFloat"]; let ColFields = ["addrMode"]; let KeyCol = ["Absolute"]; let ValueCols = [["BaseImmOffset"]]; @@ -168,6 +187,37 @@ def getRegForm : InstrMapping { let ValueCols = [["reg"]]; } +def getRegShlForm : InstrMapping { + let FilterClass = "ImmRegShl"; + let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"]; + let ColFields = ["InputType"]; + let KeyCol = ["imm"]; + let ValueCols = [["reg"]]; +} + +def notTakenBranchPrediction : InstrMapping { + let FilterClass = "PredRel"; + let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"]; + let ColFields = ["isBrTaken"]; + let KeyCol = ["true"]; + let ValueCols = [["false"]]; +} + +def takenBranchPrediction : InstrMapping { + let FilterClass = "PredRel"; + let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"]; + let ColFields = ["isBrTaken"]; + let KeyCol = ["false"]; + let ValueCols = [["true"]]; +} + +def getRealHWInstr : InstrMapping { + let FilterClass = "IntrinsicsRel"; + let RowFields = ["BaseOpcode"]; + let ColFields = ["InstrType"]; + let KeyCol = ["Pseudo"]; + let ValueCols = [["Pseudo"], ["Real"]]; +} //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// @@ -192,12 +242,27 @@ def : Proc<"hexagonv4", HexagonModelV4, [ArchV4]>; def : Proc<"hexagonv5", HexagonModelV4, [ArchV4, ArchV5]>; +def : Proc<"hexagonv55", HexagonModelV55, + [ArchV4, ArchV5, ArchV55]>; +def : Proc<"hexagonv60", HexagonModelV60, + [ArchV4, ArchV5, ArchV55, ArchV60, ExtensionHVX]>; //===----------------------------------------------------------------------===// // Declare the target which we are implementing 
//===----------------------------------------------------------------------===// +def HexagonAsmParser : AsmParser { + bit HasMnemonicFirst = 0; +} + +def HexagonAsmParserVariant : AsmParserVariant { + int Variant = 0; + string TokenizingCharacters = "#()=:.<>!+*"; +} + def Hexagon : Target { // Pull in Instruction Info: let InstructionSet = HexagonInstrInfo; + let AssemblyParsers = [HexagonAsmParser]; + let AssemblyParserVariants = [HexagonAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 05728d2..4c7c039 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -40,11 +40,13 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" @@ -56,12 +58,27 @@ using namespace llvm; +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + #define DEBUG_TYPE "asm-printer" static cl::opt<bool> AlignCalls( "hexagon-align-calls", cl::Hidden, cl::init(true), cl::desc("Insert falign after call instruction for Hexagon target")); +// Given a scalar register return its pair. +inline static unsigned getHexagonRegisterPair(unsigned Reg, + const MCRegisterInfo *RI) { + assert(Hexagon::IntRegsRegClass.contains(Reg)); + MCSuperRegIterator SR(Reg, RI, false); + unsigned Pair = *SR; + assert(Hexagon::DoubleRegsRegClass.contains(Pair)); + return Pair; +} + HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr) {} @@ -102,9 +119,8 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, // bool HexagonAsmPrinter:: isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { - if (MBB->hasAddressTaken()) { + if (MBB->hasAddressTaken()) return false; - } return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB); } @@ -117,7 +133,8 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) return true; // Unknown modifier. + if (ExtraCode[1] != 0) + return true; // Unknown modifier. switch (ExtraCode[0]) { default: @@ -173,45 +190,407 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } +static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, + MCStreamer &OutStreamer, const MCOperand &Imm, + int AlignSize) { + MCSymbol *Sym; + int64_t Value; + if (Imm.getExpr()->evaluateAsAbsolute(Value)) { + StringRef sectionPrefix; + std::string ImmString; + StringRef Name; + if (AlignSize == 8) { + Name = ".CONST_0000000000000000"; + sectionPrefix = ".gnu.linkonce.l8"; + ImmString = utohexstr(Value); + } else { + Name = ".CONST_00000000"; + sectionPrefix = ".gnu.linkonce.l4"; + ImmString = utohexstr(static_cast<uint32_t>(Value)); + } + + std::string symbolName = // Yes, leading zeros are kept. 
+ Name.drop_back(ImmString.size()).str() + ImmString; + std::string sectionName = sectionPrefix.str() + symbolName; + + MCSectionELF *Section = OutStreamer.getContext().getELFSection( + sectionName, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + OutStreamer.SwitchSection(Section); + + Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName)); + if (Sym->isUndefined()) { + OutStreamer.EmitLabel(Sym); + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); + OutStreamer.EmitIntValue(Value, AlignSize); + OutStreamer.EmitCodeAlignment(AlignSize); + } + } else { + assert(Imm.isExpr() && "Expected expression and found none"); + const MachineOperand &MO = MI.getOperand(1); + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = AP.getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = AP.GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = AP.GetJTISymbol(MO.getIndex()); + else + llvm_unreachable("Unknown operand type!"); + + StringRef SymbolName = MOSymbol->getName(); + std::string LitaName = ".CONST_" + SymbolName.str(); + + MCSectionELF *Section = OutStreamer.getContext().getELFSection( + ".lita", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + + OutStreamer.SwitchSection(Section); + Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName)); + if (Sym->isUndefined()) { + OutStreamer.EmitLabel(Sym); + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Local); + OutStreamer.EmitValue(Imm.getExpr(), AlignSize); + OutStreamer.EmitCodeAlignment(AlignSize); + } + } + return Sym; +} + +void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, + const MachineInstr &MI) { + MCInst &MappedInst = static_cast<MCInst &>(Inst); + const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo(); + + switch (Inst.getOpcode()) { + default: return; + + // "$dst = CONST64(#$src1)", + case Hexagon::CONST64_Float_Real: + case Hexagon::CONST64_Int_Real: + if (!OutStreamer->hasRawTextSupport()) { + const MCOperand &Imm = MappedInst.getOperand(1); + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + + MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8); + + OutStreamer->SwitchSection(Current.first, Current.second); + MCInst TmpInst; + MCOperand &Reg = MappedInst.getOperand(0); + TmpInst.setOpcode(Hexagon::L2_loadrdgp); + TmpInst.addOperand(Reg); + TmpInst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Sym, OutContext))); + MappedInst = TmpInst; + + } + break; + case Hexagon::CONST32: + case Hexagon::CONST32_Float_Real: + case Hexagon::CONST32_Int_Real: + case Hexagon::FCONST32_nsdata: + if (!OutStreamer->hasRawTextSupport()) { + MCOperand &Imm = MappedInst.getOperand(1); + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4); + OutStreamer->SwitchSection(Current.first, Current.second); + MCInst TmpInst; + MCOperand &Reg = MappedInst.getOperand(0); + TmpInst.setOpcode(Hexagon::L2_loadrigp); + TmpInst.addOperand(Reg); + TmpInst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Sym, OutContext))); + MappedInst = TmpInst; + } + break; + + // C2_pxfer_map maps to the C2_or instruction. It is possible to use + // C2_or during instruction selection itself, but that results + // in suboptimal code. 
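One detail of the smallData() helper above that is easy to miss: the symbol name keeps its leading zeros because the hex digits are overlaid onto the right end of a fixed template of zeros. A stand-alone sketch of that computation, using plain std::string in place of LLVM's StringRef/Twine:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    std::string constSymbolName(uint64_t Value, int AlignSize) {
      std::string Name = AlignSize == 8 ? ".CONST_0000000000000000"
                                        : ".CONST_00000000";
      char Buf[17];
      std::snprintf(Buf, sizeof(Buf), "%llx",
                    AlignSize == 8 ? (unsigned long long)Value
                                   : (unsigned long long)(uint32_t)Value);
      std::string Imm(Buf);
      // Same effect as Name.drop_back(ImmString.size()).str() + ImmString.
      assert(Name.size() - Imm.size() >= 7 && "digits fit after .CONST_");
      return Name.substr(0, Name.size() - Imm.size()) + Imm;
    }
    // constSymbolName(0x2a, 4) == ".CONST_0000002a"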
+ case Hexagon::C2_pxfer_map: { + MCOperand &Ps = Inst.getOperand(1); + MappedInst.setOpcode(Hexagon::C2_or); + MappedInst.addOperand(Ps); + return; + } + + // Vector reduce complex multiply by scalar, Rt & 1 map to :hi else :lo + // The insn is mapped from the 4 operand to the 3 operand raw form taking + // 3 register pairs. + case Hexagon::M2_vrcmpys_acc_s1: { + MCOperand &Rt = Inst.getOperand(3); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + case Hexagon::M2_vrcmpys_s1: { + MCOperand &Rt = Inst.getOperand(2); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + + case Hexagon::M2_vrcmpys_s1rp: { + MCOperand &Rt = Inst.getOperand(2); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + + case Hexagon::A4_boundscheck: { + MCOperand &Rs = Inst.getOperand(1); + assert (Rs.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rs.getReg()); + if (Reg & 1) // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2 + MappedInst.setOpcode(Hexagon::A4_boundscheck_hi); + else // raw:lo + MappedInst.setOpcode(Hexagon::A4_boundscheck_lo); + Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI)); + return; + } + case Hexagon::S5_asrhub_rnd_sat_goodsyntax: { + MCOperand &MO = MappedInst.getOperand(2); + int64_t Imm; + MCExpr const *Expr = MO.getExpr(); + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::S2_vsathub); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + MappedInst = TmpInst; + return; + } + TmpInst.setOpcode(Hexagon::S5_asrhub_rnd_sat); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + case Hexagon::S5_vasrhrnd_goodsyntax: + case Hexagon::S2_asr_i_p_rnd_goodsyntax: { + MCOperand &MO2 = MappedInst.getOperand(2); + MCExpr const *Expr = MO2.getExpr(); + int64_t Imm; + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(MappedInst.getOperand(0)); + MCOperand &MO1 = MappedInst.getOperand(1); + unsigned High = RI->getSubReg(MO1.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO1.getReg(), Hexagon::subreg_loreg); + // Add a new operand for the second register in the pair. 
+ TmpInst.addOperand(MCOperand::createReg(High)); + TmpInst.addOperand(MCOperand::createReg(Low)); + MappedInst = TmpInst; + return; + } + + if (Inst.getOpcode() == Hexagon::S2_asr_i_p_rnd_goodsyntax) + TmpInst.setOpcode(Hexagon::S2_asr_i_p_rnd); + else + TmpInst.setOpcode(Hexagon::S5_vasrhrnd); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + // if ("#u5==0") Assembler mapped to: "Rd=Rs"; else Rd=asr(Rs,#u5-1):rnd + case Hexagon::S2_asr_i_r_rnd_goodsyntax: { + MCOperand &MO = Inst.getOperand(2); + MCExpr const *Expr = MO.getExpr(); + int64_t Imm; + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::A2_tfr); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + MappedInst = TmpInst; + return; + } + TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + case Hexagon::TFRI_f: + MappedInst.setOpcode(Hexagon::A2_tfrsi); + return; + case Hexagon::TFRI_cPt_f: + MappedInst.setOpcode(Hexagon::C2_cmoveit); + return; + case Hexagon::TFRI_cNotPt_f: + MappedInst.setOpcode(Hexagon::C2_cmoveif); + return; + case Hexagon::MUX_ri_f: + MappedInst.setOpcode(Hexagon::C2_muxri); + return; + case Hexagon::MUX_ir_f: + MappedInst.setOpcode(Hexagon::C2_muxir); + return; + + // Translate a "$Rdd = #imm" to "$Rdd = combine(#[-1,0], #imm)" + case Hexagon::A2_tfrpi: { + MCInst TmpInst; + MCOperand &Rdd = MappedInst.getOperand(0); + MCOperand &MO = MappedInst.getOperand(1); + + TmpInst.setOpcode(Hexagon::A2_combineii); + TmpInst.addOperand(Rdd); + int64_t Imm; + bool Success = MO.getExpr()->evaluateAsAbsolute(Imm); + if (Success && Imm < 0) { + const MCExpr *MOne = MCConstantExpr::create(-1, OutContext); + TmpInst.addOperand(MCOperand::createExpr(MOne)); + } else { + const MCExpr *Zero = MCConstantExpr::create(0, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Zero)); + } + TmpInst.addOperand(MO); + MappedInst = TmpInst; + return; + } + // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" + case Hexagon::A2_tfrp: { + MCOperand &MO = MappedInst.getOperand(1); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode(Hexagon::A2_combinew); + return; + } + + case Hexagon::A2_tfrpt: + case Hexagon::A2_tfrpf: { + MCOperand &MO = MappedInst.getOperand(2); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) + ? 
Hexagon::C2_ccombinewt + : Hexagon::C2_ccombinewf); + return; + } + case Hexagon::A2_tfrptnew: + case Hexagon::A2_tfrpfnew: { + MCOperand &MO = MappedInst.getOperand(2); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) + ? Hexagon::C2_ccombinewnewt + : Hexagon::C2_ccombinewnewf); + return; + } + + case Hexagon::M2_mpysmi: { + MCOperand &Imm = MappedInst.getOperand(2); + MCExpr const *Expr = Imm.getExpr(); + int64_t Value; + bool Success = Expr->evaluateAsAbsolute(Value); + assert(Success);(void)Success; + if (Value < 0 && Value > -256) { + MappedInst.setOpcode(Hexagon::M2_mpysin); + Imm.setExpr(MCUnaryExpr::createMinus(Expr, OutContext)); + } + else + MappedInst.setOpcode(Hexagon::M2_mpysip); + return; + } + + case Hexagon::A2_addsp: { + MCOperand &Rt = Inst.getOperand(1); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::A2_addsph); + else + MappedInst.setOpcode(Hexagon::A2_addspl); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + case Hexagon::HEXAGON_V6_vd0_pseudo: + case Hexagon::HEXAGON_V6_vd0_pseudo_128B: { + MCInst TmpInst; + assert (Inst.getOperand(0).isReg() && + "Expected register and none was found"); + + TmpInst.setOpcode(Hexagon::V6_vxor); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + MappedInst = TmpInst; + return; + } + + } +} + /// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to /// the current output stream. /// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MCInst MCB; - MCB.setOpcode(Hexagon::BUNDLE); - MCB.addOperand(MCOperand::createImm(0)); + MCInst MCB = HexagonMCInstrInfo::createBundle(); + const MCInstrInfo &MCII = *Subtarget->getInstrInfo(); if (MI->isBundle()) { const MachineBasicBlock* MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator MII = MI; + MachineBasicBlock::const_instr_iterator MII = MI->getIterator(); unsigned IgnoreCount = 0; - for (++MII; MII != MBB->end() && MII->isInsideBundle(); ++MII) { + for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) if (MII->getOpcode() == TargetOpcode::DBG_VALUE || MII->getOpcode() == TargetOpcode::IMPLICIT_DEF) ++IgnoreCount; - else { - HexagonLowerToMC(MII, MCB, *this); - } - } + else + HexagonLowerToMC(MCII, &*MII, MCB, *this); } - else { - HexagonLowerToMC(MI, MCB, *this); - HexagonMCInstrInfo::padEndloop(MCB); - } - // Examine the packet and try to find instructions that can be converted - // to compounds. - HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(), - OutStreamer->getContext(), MCB); - // Examine the packet and convert pairs of instructions to duplex - // instructions when possible. 
- SmallVector<DuplexCandidate, 8> possibleDuplexes; - possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties( - *Subtarget->getInstrInfo(), MCB); - HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget, - OutStreamer->getContext(), MCB, possibleDuplexes); - EmitToStreamer(*OutStreamer, MCB); + else + HexagonLowerToMC(MCII, MI, MCB, *this); + + bool Ok = HexagonMCInstrInfo::canonicalizePacket( + MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr); + assert(Ok); + (void)Ok; + if(HexagonMCInstrInfo::bundleSize(MCB) == 0) + return; + OutStreamer->EmitInstruction(MCB, getSubtargetInfo()); } extern "C" void LLVMInitializeHexagonAsmPrinter() { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h index 792fc8b..a78d97e 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -42,6 +42,10 @@ namespace llvm { void EmitInstruction(const MachineInstr *MI) override; + void HexagonProcessInstruction(MCInst &Inst, + const MachineInstr &MBB); + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp new file mode 100644 index 0000000..4d2b545 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -0,0 +1,2790 @@ +//===--- HexagonBitSimplify.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexbit" + +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "HexagonTargetMachine.h" +#include "HexagonBitTracker.h" + +using namespace llvm; + +namespace llvm { + void initializeHexagonBitSimplifyPass(PassRegistry& Registry); + FunctionPass *createHexagonBitSimplify(); +} + +namespace { + // Set of virtual registers, based on BitVector. 
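The RegisterSet below stores virtual registers in an ordinary BitVector by translating register numbers to compact indices and back (v2x/x2v). A minimal sketch of that translation, assuming the tag-bit convention LLVM used in this era for virtual-register numbers (which is also what makes 0 safe as the not-found sentinel of find_first/find_next):

    #include <cstdint>

    constexpr uint32_t VirtRegTag = 1u << 31; // assumed virtual-register marker

    constexpr uint32_t v2x(uint32_t VReg)  { return VReg & ~VirtRegTag; }
    constexpr uint32_t x2v(uint32_t Index) { return Index | VirtRegTag; }

    static_assert(x2v(v2x(VirtRegTag | 5)) == (VirtRegTag | 5), "round trip");
    static_assert(x2v(0) != 0, "0 never names a virtual register");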
+ struct RegisterSet : private BitVector { + RegisterSet() : BitVector() {} + explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} + RegisterSet(const RegisterSet &RS) : BitVector(RS) {} + + using BitVector::clear; + using BitVector::count; + + unsigned find_first() const { + int First = BitVector::find_first(); + if (First < 0) + return 0; + return x2v(First); + } + + unsigned find_next(unsigned Prev) const { + int Next = BitVector::find_next(v2x(Prev)); + if (Next < 0) + return 0; + return x2v(Next); + } + + RegisterSet &insert(unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return static_cast<RegisterSet&>(BitVector::set(Idx)); + } + RegisterSet &remove(unsigned R) { + unsigned Idx = v2x(R); + if (Idx >= size()) + return *this; + return static_cast<RegisterSet&>(BitVector::reset(Idx)); + } + + RegisterSet &insert(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::operator|=(Rs)); + } + RegisterSet &remove(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::reset(Rs)); + } + + reference operator[](unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return BitVector::operator[](Idx); + } + bool operator[](unsigned R) const { + unsigned Idx = v2x(R); + assert(Idx < size()); + return BitVector::operator[](Idx); + } + bool has(unsigned R) const { + unsigned Idx = v2x(R); + if (Idx >= size()) + return false; + return BitVector::test(Idx); + } + + bool empty() const { + return !BitVector::any(); + } + bool includes(const RegisterSet &Rs) const { + // A.BitVector::test(B) <=> A-B != {} + return !Rs.BitVector::test(*this); + } + bool intersects(const RegisterSet &Rs) const { + return BitVector::anyCommon(Rs); + } + + private: + void ensure(unsigned Idx) { + if (size() <= Idx) + resize(std::max(Idx+1, 32U)); + } + static inline unsigned v2x(unsigned v) { + return TargetRegisterInfo::virtReg2Index(v); + } + static inline unsigned x2v(unsigned x) { + return TargetRegisterInfo::index2VirtReg(x); + } + }; + + + struct PrintRegSet { + PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI) + : RS(S), TRI(RI) {} + friend raw_ostream &operator<< (raw_ostream &OS, + const PrintRegSet &P); + private: + const RegisterSet &RS; + const TargetRegisterInfo *TRI; + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) + LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) { + OS << '{'; + for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R)) + OS << ' ' << PrintReg(R, P.TRI); + OS << " }"; + return OS; + } +} + + +namespace { + class Transformation; + + class HexagonBitSimplify : public MachineFunctionPass { + public: + static char ID; + HexagonBitSimplify() : MachineFunctionPass(ID), MDT(0) { + initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const { + return "Hexagon bit simplification"; + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + virtual bool runOnMachineFunction(MachineFunction &MF); + + static void getInstrDefs(const MachineInstr &MI, RegisterSet &Defs); + static void getInstrUses(const MachineInstr &MI, RegisterSet &Uses); + static bool isEqual(const BitTracker::RegisterCell &RC1, uint16_t B1, + const BitTracker::RegisterCell &RC2, uint16_t B2, uint16_t W); + static bool isConst(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W); + static bool 
isZero(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W); + static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W, uint64_t &U); + static bool replaceReg(unsigned OldR, unsigned NewR, + MachineRegisterInfo &MRI); + static bool getSubregMask(const BitTracker::RegisterRef &RR, + unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI); + static bool replaceRegWithSub(unsigned OldR, unsigned NewR, + unsigned NewSR, MachineRegisterInfo &MRI); + static bool replaceSubWithSub(unsigned OldR, unsigned OldSR, + unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI); + static bool parseRegSequence(const MachineInstr &I, + BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH); + + static bool getUsedBitsInStore(unsigned Opc, BitVector &Bits, + uint16_t Begin); + static bool getUsedBits(unsigned Opc, unsigned OpN, BitVector &Bits, + uint16_t Begin, const HexagonInstrInfo &HII); + + static const TargetRegisterClass *getFinalVRegClass( + const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI); + static bool isTransparentCopy(const BitTracker::RegisterRef &RD, + const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI); + + private: + MachineDominatorTree *MDT; + + bool visitBlock(MachineBasicBlock &B, Transformation &T, RegisterSet &AVs); + }; + + char HexagonBitSimplify::ID = 0; + typedef HexagonBitSimplify HBS; + + + // The purpose of this class is to provide a common facility to traverse + // the function top-down or bottom-up via the dominator tree, and keep + // track of the available registers. + class Transformation { + public: + bool TopDown; + Transformation(bool TD) : TopDown(TD) {} + virtual bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) = 0; + virtual ~Transformation() {} + }; +} + +INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit", + "Hexagon bit simplification", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit", + "Hexagon bit simplification", false, false) + + +bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T, + RegisterSet &AVs) { + MachineDomTreeNode *N = MDT->getNode(&B); + typedef GraphTraits<MachineDomTreeNode*> GTN; + bool Changed = false; + + if (T.TopDown) + Changed = T.processBlock(B, AVs); + + RegisterSet Defs; + for (auto &I : B) + getInstrDefs(I, Defs); + RegisterSet NewAVs = AVs; + NewAVs.insert(Defs); + + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) { + MachineBasicBlock *SB = (*I)->getBlock(); + Changed |= visitBlock(*SB, T, NewAVs); + } + if (!T.TopDown) + Changed |= T.processBlock(B, AVs); + + return Changed; +} + +// +// Utility functions: +// +void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI, + RegisterSet &Defs) { + for (auto &Op : MI.operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Defs.insert(R); + } +} + +void HexagonBitSimplify::getInstrUses(const MachineInstr &MI, + RegisterSet &Uses) { + for (auto &Op : MI.operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Uses.insert(R); + } +} + +// Check if all the bits in range [B, E) in both cells are equal. 
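In simplified form, with a three-valued bit type standing in for BitTracker cells, the comparison that follows amounts to this sketch: a bit whose value is unknown ("bottom") can never be proven equal to anything.

    #include <vector>

    enum class Bit { Zero, One, Bottom }; // Bottom = value not known

    bool rangesEqual(const std::vector<Bit> &C1, unsigned B1,
                     const std::vector<Bit> &C2, unsigned B2, unsigned W) {
      for (unsigned i = 0; i < W; ++i) {
        if (C1[B1 + i] == Bit::Bottom || C2[B2 + i] == Bit::Bottom)
          return false; // unknown bits are never provably equal
        if (C1[B1 + i] != C2[B2 + i])
          return false;
      }
      return true;
    }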
+bool HexagonBitSimplify::isEqual(const BitTracker::RegisterCell &RC1, + uint16_t B1, const BitTracker::RegisterCell &RC2, uint16_t B2, + uint16_t W) { + for (uint16_t i = 0; i < W; ++i) { + // If RC1[i] is "bottom", it cannot be proven equal to RC2[i]. + if (RC1[B1+i].Type == BitTracker::BitValue::Ref && RC1[B1+i].RefI.Reg == 0) + return false; + // Same for RC2[i]. + if (RC2[B2+i].Type == BitTracker::BitValue::Ref && RC2[B2+i].RefI.Reg == 0) + return false; + if (RC1[B1+i] != RC2[B2+i]) + return false; + } + return true; +} + + +bool HexagonBitSimplify::isConst(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W) { + assert(B < RC.width() && B+W <= RC.width()); + for (uint16_t i = B; i < B+W; ++i) + if (!RC[i].num()) + return false; + return true; +} + + +bool HexagonBitSimplify::isZero(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W) { + assert(B < RC.width() && B+W <= RC.width()); + for (uint16_t i = B; i < B+W; ++i) + if (!RC[i].is(0)) + return false; + return true; +} + + +bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W, uint64_t &U) { + assert(B < RC.width() && B+W <= RC.width()); + int64_t T = 0; + for (uint16_t i = B+W; i > B; --i) { + const BitTracker::BitValue &BV = RC[i-1]; + T <<= 1; + if (BV.is(1)) + T |= 1; + else if (!BV.is(0)) + return false; + } + U = T; + return true; +} + + +bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR, + MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + I->setReg(NewR); + } + return Begin != End; +} + + +bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR, + unsigned NewSR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + I->setReg(NewR); + I->setSubReg(NewSR); + } + return Begin != End; +} + + +bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR, + unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + if (I->getSubReg() != OldSR) + continue; + I->setReg(NewR); + I->setSubReg(NewSR); + } + return Begin != End; +} + + +// For a register ref (pair Reg:Sub), set Begin to the position of the LSB +// of Sub in Reg, and set Width to the size of Sub in bits. Return true, +// if this succeeded, otherwise return false. +bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, + unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg); + if (RC == &Hexagon::IntRegsRegClass) { + assert(RR.Sub == 0); + Begin = 0; + Width = 32; + return true; + } + if (RC == &Hexagon::DoubleRegsRegClass) { + if (RR.Sub == 0) { + Begin = 0; + Width = 64; + return true; + } + assert(RR.Sub == Hexagon::subreg_loreg || RR.Sub == Hexagon::subreg_hireg); + Width = 32; + Begin = (RR.Sub == Hexagon::subreg_loreg ? 
0 : 32); + return true; + } + return false; +} + + +// For a REG_SEQUENCE, set SL to the low subregister and SH to the high +// subregister. +bool HexagonBitSimplify::parseRegSequence(const MachineInstr &I, + BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH) { + assert(I.getOpcode() == TargetOpcode::REG_SEQUENCE); + unsigned Sub1 = I.getOperand(2).getImm(), Sub2 = I.getOperand(4).getImm(); + assert(Sub1 != Sub2); + if (Sub1 == Hexagon::subreg_loreg && Sub2 == Hexagon::subreg_hireg) { + SL = I.getOperand(1); + SH = I.getOperand(3); + return true; + } + if (Sub1 == Hexagon::subreg_hireg && Sub2 == Hexagon::subreg_loreg) { + SH = I.getOperand(1); + SL = I.getOperand(3); + return true; + } + return false; +} + + +// All stores (except 64-bit stores) take a 32-bit register as the source +// of the value to be stored. If the instruction stores into a location +// that is shorter than 32 bits, some bits of the source register are not +// used. For each store instruction, calculate the set of used bits in +// the source register, and set appropriate bits in Bits. Return true if +// the bits are calculated, false otherwise. +bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits, + uint16_t Begin) { + using namespace Hexagon; + + switch (Opc) { + // Store byte + case S2_storerb_io: // memb(Rs32+#s11:0)=Rt32 + case S2_storerbnew_io: // memb(Rs32+#s11:0)=Nt8.new + case S2_pstorerbt_io: // if (Pv4) memb(Rs32+#u6:0)=Rt32 + case S2_pstorerbf_io: // if (!Pv4) memb(Rs32+#u6:0)=Rt32 + case S4_pstorerbtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Rt32 + case S4_pstorerbfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Rt32 + case S2_pstorerbnewt_io: // if (Pv4) memb(Rs32+#u6:0)=Nt8.new + case S2_pstorerbnewf_io: // if (!Pv4) memb(Rs32+#u6:0)=Nt8.new + case S4_pstorerbnewtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Nt8.new + case S4_pstorerbnewfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Nt8.new + case S2_storerb_pi: // memb(Rx32++#s4:0)=Rt32 + case S2_storerbnew_pi: // memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbt_pi: // if (Pv4) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbnewt_pi: // if (Pv4) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Nt8.new + case S4_storerb_ap: // memb(Re32=#U6)=Rt32 + case S4_storerbnew_ap: // memb(Re32=#U6)=Nt8.new + case S2_storerb_pr: // memb(Rx32++Mu2)=Rt32 + case S2_storerbnew_pr: // memb(Rx32++Mu2)=Nt8.new + case S4_storerb_ur: // memb(Ru32<<#u2+#U6)=Rt32 + case S4_storerbnew_ur: // memb(Ru32<<#u2+#U6)=Nt8.new + case S2_storerb_pbr: // memb(Rx32++Mu2:brev)=Rt32 + case S2_storerbnew_pbr: // memb(Rx32++Mu2:brev)=Nt8.new + case S2_storerb_pci: // memb(Rx32++#s4:0:circ(Mu2))=Rt32 + case S2_storerbnew_pci: // memb(Rx32++#s4:0:circ(Mu2))=Nt8.new + case S2_storerb_pcr: // memb(Rx32++I:circ(Mu2))=Rt32 + case S2_storerbnew_pcr: // memb(Rx32++I:circ(Mu2))=Nt8.new + case S4_storerb_rr: // memb(Rs32+Ru32<<#u2)=Rt32 + case S4_storerbnew_rr: // memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbfnew_rr: // if (!Pv4.new) 
memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbnewt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewfnew_rr: // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new + case S2_storerbgp: // memb(gp+#u16:0)=Rt32 + case S2_storerbnewgp: // memb(gp+#u16:0)=Nt8.new + case S4_pstorerbt_abs: // if (Pv4) memb(#u6)=Rt32 + case S4_pstorerbf_abs: // if (!Pv4) memb(#u6)=Rt32 + case S4_pstorerbtnew_abs: // if (Pv4.new) memb(#u6)=Rt32 + case S4_pstorerbfnew_abs: // if (!Pv4.new) memb(#u6)=Rt32 + case S4_pstorerbnewt_abs: // if (Pv4) memb(#u6)=Nt8.new + case S4_pstorerbnewf_abs: // if (!Pv4) memb(#u6)=Nt8.new + case S4_pstorerbnewtnew_abs: // if (Pv4.new) memb(#u6)=Nt8.new + case S4_pstorerbnewfnew_abs: // if (!Pv4.new) memb(#u6)=Nt8.new + Bits.set(Begin, Begin+8); + return true; + + // Store low half + case S2_storerh_io: // memh(Rs32+#s11:1)=Rt32 + case S2_storerhnew_io: // memh(Rs32+#s11:1)=Nt8.new + case S2_pstorerht_io: // if (Pv4) memh(Rs32+#u6:1)=Rt32 + case S2_pstorerhf_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt32 + case S4_pstorerhtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt32 + case S4_pstorerhfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt32 + case S2_pstorerhnewt_io: // if (Pv4) memh(Rs32+#u6:1)=Nt8.new + case S2_pstorerhnewf_io: // if (!Pv4) memh(Rs32+#u6:1)=Nt8.new + case S4_pstorerhnewtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Nt8.new + case S4_pstorerhnewfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Nt8.new + case S2_storerh_pi: // memh(Rx32++#s4:1)=Rt32 + case S2_storerhnew_pi: // memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerht_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhnewt_pi: // if (Pv4) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Nt8.new + case S4_storerh_ap: // memh(Re32=#U6)=Rt32 + case S4_storerhnew_ap: // memh(Re32=#U6)=Nt8.new + case S2_storerh_pr: // memh(Rx32++Mu2)=Rt32 + case S2_storerhnew_pr: // memh(Rx32++Mu2)=Nt8.new + case S4_storerh_ur: // memh(Ru32<<#u2+#U6)=Rt32 + case S4_storerhnew_ur: // memh(Ru32<<#u2+#U6)=Nt8.new + case S2_storerh_pbr: // memh(Rx32++Mu2:brev)=Rt32 + case S2_storerhnew_pbr: // memh(Rx32++Mu2:brev)=Nt8.new + case S2_storerh_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt32 + case S2_storerhnew_pci: // memh(Rx32++#s4:1:circ(Mu2))=Nt8.new + case S2_storerh_pcr: // memh(Rx32++I:circ(Mu2))=Rt32 + case S2_storerhnew_pcr: // memh(Rx32++I:circ(Mu2))=Nt8.new + case S4_storerh_rr: // memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerht_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_storerhnew_rr: // memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewt_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new + case S2_storerhgp: // memh(gp+#u16:1)=Rt32 
+ case S2_storerhnewgp: // memh(gp+#u16:1)=Nt8.new + case S4_pstorerht_abs: // if (Pv4) memh(#u6)=Rt32 + case S4_pstorerhf_abs: // if (!Pv4) memh(#u6)=Rt32 + case S4_pstorerhtnew_abs: // if (Pv4.new) memh(#u6)=Rt32 + case S4_pstorerhfnew_abs: // if (!Pv4.new) memh(#u6)=Rt32 + case S4_pstorerhnewt_abs: // if (Pv4) memh(#u6)=Nt8.new + case S4_pstorerhnewf_abs: // if (!Pv4) memh(#u6)=Nt8.new + case S4_pstorerhnewtnew_abs: // if (Pv4.new) memh(#u6)=Nt8.new + case S4_pstorerhnewfnew_abs: // if (!Pv4.new) memh(#u6)=Nt8.new + Bits.set(Begin, Begin+16); + return true; + + // Store high half + case S2_storerf_io: // memh(Rs32+#s11:1)=Rt.H32 + case S2_pstorerft_io: // if (Pv4) memh(Rs32+#u6:1)=Rt.H32 + case S2_pstorerff_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt.H32 + case S4_pstorerftnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt.H32 + case S4_pstorerffnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt.H32 + case S2_storerf_pi: // memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerft_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerff_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerftnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerffnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt.H32 + case S4_storerf_ap: // memh(Re32=#U6)=Rt.H32 + case S2_storerf_pr: // memh(Rx32++Mu2)=Rt.H32 + case S4_storerf_ur: // memh(Ru32<<#u2+#U6)=Rt.H32 + case S2_storerf_pbr: // memh(Rx32++Mu2:brev)=Rt.H32 + case S2_storerf_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt.H32 + case S2_storerf_pcr: // memh(Rx32++I:circ(Mu2))=Rt.H32 + case S4_storerf_rr: // memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerft_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerff_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerftnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerffnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S2_storerfgp: // memh(gp+#u16:1)=Rt.H32 + case S4_pstorerft_abs: // if (Pv4) memh(#u6)=Rt.H32 + case S4_pstorerff_abs: // if (!Pv4) memh(#u6)=Rt.H32 + case S4_pstorerftnew_abs: // if (Pv4.new) memh(#u6)=Rt.H32 + case S4_pstorerffnew_abs: // if (!Pv4.new) memh(#u6)=Rt.H32 + Bits.set(Begin+16, Begin+32); + return true; + } + + return false; +} + + +// For an instruction with opcode Opc, calculate the set of bits that it +// uses in a register in operand OpN. This only calculates the set of used +// bits for cases where it does not depend on any operands (as is the case +// in shifts, for example). For concrete instructions from a program, the +// operand may be a subregister of a larger register, while Bits would +// correspond to the larger register in its entirety. Because of that, +// the parameter Begin can be used to indicate which bit of Bits should be +// considered the LSB of the operand. +bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN, + BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) { + using namespace Hexagon; + + const MCInstrDesc &D = HII.get(Opc); + if (D.mayStore()) { + if (OpN == D.getNumOperands()-1) + return getUsedBitsInStore(Opc, Bits, Begin); + return false; + } + + switch (Opc) { + // One register source. Used bits: R1[0-7]. + case A2_sxtb: + case A2_zxtb: + case A4_cmpbeqi: + case A4_cmpbgti: + case A4_cmpbgtui: + if (OpN == 1) { + Bits.set(Begin, Begin+8); + return true; + } + break; + + // One register source. Used bits: R1[0-15]. 
+ case A2_aslh: + case A2_sxth: + case A2_zxth: + case A4_cmpheqi: + case A4_cmphgti: + case A4_cmphgtui: + if (OpN == 1) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // One register source. Used bits: R1[16-31]. + case A2_asrh: + if (OpN == 1) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + + // Two register sources. Used bits: R1[0-7], R2[0-7]. + case A4_cmpbeq: + case A4_cmpbgt: + case A4_cmpbgtu: + if (OpN == 1) { + Bits.set(Begin, Begin+8); + return true; + } + break; + + // Two register sources. Used bits: R1[0-15], R2[0-15]. + case A4_cmpheq: + case A4_cmphgt: + case A4_cmphgtu: + case A2_addh_h16_ll: + case A2_addh_h16_sat_ll: + case A2_addh_l16_ll: + case A2_addh_l16_sat_ll: + case A2_combine_ll: + case A2_subh_h16_ll: + case A2_subh_h16_sat_ll: + case A2_subh_l16_ll: + case A2_subh_l16_sat_ll: + case M2_mpy_acc_ll_s0: + case M2_mpy_acc_ll_s1: + case M2_mpy_acc_sat_ll_s0: + case M2_mpy_acc_sat_ll_s1: + case M2_mpy_ll_s0: + case M2_mpy_ll_s1: + case M2_mpy_nac_ll_s0: + case M2_mpy_nac_ll_s1: + case M2_mpy_nac_sat_ll_s0: + case M2_mpy_nac_sat_ll_s1: + case M2_mpy_rnd_ll_s0: + case M2_mpy_rnd_ll_s1: + case M2_mpy_sat_ll_s0: + case M2_mpy_sat_ll_s1: + case M2_mpy_sat_rnd_ll_s0: + case M2_mpy_sat_rnd_ll_s1: + case M2_mpyd_acc_ll_s0: + case M2_mpyd_acc_ll_s1: + case M2_mpyd_ll_s0: + case M2_mpyd_ll_s1: + case M2_mpyd_nac_ll_s0: + case M2_mpyd_nac_ll_s1: + case M2_mpyd_rnd_ll_s0: + case M2_mpyd_rnd_ll_s1: + case M2_mpyu_acc_ll_s0: + case M2_mpyu_acc_ll_s1: + case M2_mpyu_ll_s0: + case M2_mpyu_ll_s1: + case M2_mpyu_nac_ll_s0: + case M2_mpyu_nac_ll_s1: + case M2_mpyud_acc_ll_s0: + case M2_mpyud_acc_ll_s1: + case M2_mpyud_ll_s0: + case M2_mpyud_ll_s1: + case M2_mpyud_nac_ll_s0: + case M2_mpyud_nac_ll_s1: + if (OpN == 1 || OpN == 2) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // Two register sources. Used bits: R1[0-15], R2[16-31]. + case A2_addh_h16_lh: + case A2_addh_h16_sat_lh: + case A2_combine_lh: + case A2_subh_h16_lh: + case A2_subh_h16_sat_lh: + case M2_mpy_acc_lh_s0: + case M2_mpy_acc_lh_s1: + case M2_mpy_acc_sat_lh_s0: + case M2_mpy_acc_sat_lh_s1: + case M2_mpy_lh_s0: + case M2_mpy_lh_s1: + case M2_mpy_nac_lh_s0: + case M2_mpy_nac_lh_s1: + case M2_mpy_nac_sat_lh_s0: + case M2_mpy_nac_sat_lh_s1: + case M2_mpy_rnd_lh_s0: + case M2_mpy_rnd_lh_s1: + case M2_mpy_sat_lh_s0: + case M2_mpy_sat_lh_s1: + case M2_mpy_sat_rnd_lh_s0: + case M2_mpy_sat_rnd_lh_s1: + case M2_mpyd_acc_lh_s0: + case M2_mpyd_acc_lh_s1: + case M2_mpyd_lh_s0: + case M2_mpyd_lh_s1: + case M2_mpyd_nac_lh_s0: + case M2_mpyd_nac_lh_s1: + case M2_mpyd_rnd_lh_s0: + case M2_mpyd_rnd_lh_s1: + case M2_mpyu_acc_lh_s0: + case M2_mpyu_acc_lh_s1: + case M2_mpyu_lh_s0: + case M2_mpyu_lh_s1: + case M2_mpyu_nac_lh_s0: + case M2_mpyu_nac_lh_s1: + case M2_mpyud_acc_lh_s0: + case M2_mpyud_acc_lh_s1: + case M2_mpyud_lh_s0: + case M2_mpyud_lh_s1: + case M2_mpyud_nac_lh_s0: + case M2_mpyud_nac_lh_s1: + // These four are actually LH. + case A2_addh_l16_hl: + case A2_addh_l16_sat_hl: + case A2_subh_l16_hl: + case A2_subh_l16_sat_hl: + if (OpN == 1) { + Bits.set(Begin, Begin+16); + return true; + } + if (OpN == 2) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + + // Two register sources, used bits: R1[16-31], R2[0-15]. 
+ case A2_addh_h16_hl: + case A2_addh_h16_sat_hl: + case A2_combine_hl: + case A2_subh_h16_hl: + case A2_subh_h16_sat_hl: + case M2_mpy_acc_hl_s0: + case M2_mpy_acc_hl_s1: + case M2_mpy_acc_sat_hl_s0: + case M2_mpy_acc_sat_hl_s1: + case M2_mpy_hl_s0: + case M2_mpy_hl_s1: + case M2_mpy_nac_hl_s0: + case M2_mpy_nac_hl_s1: + case M2_mpy_nac_sat_hl_s0: + case M2_mpy_nac_sat_hl_s1: + case M2_mpy_rnd_hl_s0: + case M2_mpy_rnd_hl_s1: + case M2_mpy_sat_hl_s0: + case M2_mpy_sat_hl_s1: + case M2_mpy_sat_rnd_hl_s0: + case M2_mpy_sat_rnd_hl_s1: + case M2_mpyd_acc_hl_s0: + case M2_mpyd_acc_hl_s1: + case M2_mpyd_hl_s0: + case M2_mpyd_hl_s1: + case M2_mpyd_nac_hl_s0: + case M2_mpyd_nac_hl_s1: + case M2_mpyd_rnd_hl_s0: + case M2_mpyd_rnd_hl_s1: + case M2_mpyu_acc_hl_s0: + case M2_mpyu_acc_hl_s1: + case M2_mpyu_hl_s0: + case M2_mpyu_hl_s1: + case M2_mpyu_nac_hl_s0: + case M2_mpyu_nac_hl_s1: + case M2_mpyud_acc_hl_s0: + case M2_mpyud_acc_hl_s1: + case M2_mpyud_hl_s0: + case M2_mpyud_hl_s1: + case M2_mpyud_nac_hl_s0: + case M2_mpyud_nac_hl_s1: + if (OpN == 1) { + Bits.set(Begin+16, Begin+32); + return true; + } + if (OpN == 2) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // Two register sources, used bits: R1[16-31], R2[16-31]. + case A2_addh_h16_hh: + case A2_addh_h16_sat_hh: + case A2_combine_hh: + case A2_subh_h16_hh: + case A2_subh_h16_sat_hh: + case M2_mpy_acc_hh_s0: + case M2_mpy_acc_hh_s1: + case M2_mpy_acc_sat_hh_s0: + case M2_mpy_acc_sat_hh_s1: + case M2_mpy_hh_s0: + case M2_mpy_hh_s1: + case M2_mpy_nac_hh_s0: + case M2_mpy_nac_hh_s1: + case M2_mpy_nac_sat_hh_s0: + case M2_mpy_nac_sat_hh_s1: + case M2_mpy_rnd_hh_s0: + case M2_mpy_rnd_hh_s1: + case M2_mpy_sat_hh_s0: + case M2_mpy_sat_hh_s1: + case M2_mpy_sat_rnd_hh_s0: + case M2_mpy_sat_rnd_hh_s1: + case M2_mpyd_acc_hh_s0: + case M2_mpyd_acc_hh_s1: + case M2_mpyd_hh_s0: + case M2_mpyd_hh_s1: + case M2_mpyd_nac_hh_s0: + case M2_mpyd_nac_hh_s1: + case M2_mpyd_rnd_hh_s0: + case M2_mpyd_rnd_hh_s1: + case M2_mpyu_acc_hh_s0: + case M2_mpyu_acc_hh_s1: + case M2_mpyu_hh_s0: + case M2_mpyu_hh_s1: + case M2_mpyu_nac_hh_s0: + case M2_mpyu_nac_hh_s1: + case M2_mpyud_acc_hh_s0: + case M2_mpyud_acc_hh_s1: + case M2_mpyud_hh_s0: + case M2_mpyud_hh_s1: + case M2_mpyud_nac_hh_s0: + case M2_mpyud_nac_hh_s1: + if (OpN == 1 || OpN == 2) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + } + + return false; +} + + +// Calculate the register class that matches Reg:Sub. For example, if +// vreg1 is a double register, then vreg1:subreg_hireg would match the "int" +// register class. +const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass( + const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + return nullptr; + auto *RC = MRI.getRegClass(RR.Reg); + if (RR.Sub == 0) + return RC; + + auto VerifySR = [] (unsigned Sub) -> void { + assert(Sub == Hexagon::subreg_hireg || Sub == Hexagon::subreg_loreg); + }; + + switch (RC->getID()) { + case Hexagon::DoubleRegsRegClassID: + VerifySR(RR.Sub); + return &Hexagon::IntRegsRegClass; + } + return nullptr; +} + + +// Check if RD could be replaced with RS at any possible use of RD. +// For example, a predicate register cannot be replaced with an integer +// register, but a 64-bit register with a subregister can be replaced +// with a 32-bit register. 
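Reduced to its essence (with a hypothetical enum standing in for the real register classes), the test defined next compares "final" classes, where a 64-bit pair accessed through a 32-bit subregister behaves as a 32-bit register:

    enum class RC { Int32, Double64, Pred };

    struct Ref { RC Cls; bool Has32BitSub; };

    RC finalClass(Ref R) {
      if (R.Cls == RC::Double64 && R.Has32BitSub)
        return RC::Int32; // pair:lo and pair:hi act as 32-bit registers
      return R.Cls;
    }

    bool transparentCopy(Ref D, Ref S) {
      return finalClass(D) == finalClass(S); // Pred vs Int32 -> false, etc.
    }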
+bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD, + const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) || + !TargetRegisterInfo::isVirtualRegister(RS.Reg)) + return false; + // Return false if one (or both) classes are nullptr. + auto *DRC = getFinalVRegClass(RD, MRI); + if (!DRC) + return false; + + return DRC == getFinalVRegClass(RS, MRI); +} + + +// +// Dead code elimination +// +namespace { + class DeadCodeElimination { + public: + DeadCodeElimination(MachineFunction &mf, MachineDominatorTree &mdt) + : MF(mf), HII(*MF.getSubtarget<HexagonSubtarget>().getInstrInfo()), + MDT(mdt), MRI(mf.getRegInfo()) {} + + bool run() { + return runOnNode(MDT.getRootNode()); + } + + private: + bool isDead(unsigned R) const; + bool runOnNode(MachineDomTreeNode *N); + + MachineFunction &MF; + const HexagonInstrInfo &HII; + MachineDominatorTree &MDT; + MachineRegisterInfo &MRI; + }; +} + + +bool DeadCodeElimination::isDead(unsigned R) const { + for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) { + MachineInstr *UseI = I->getParent(); + if (UseI->isDebugValue()) + continue; + if (UseI->isPHI()) { + assert(!UseI->getOperand(0).getSubReg()); + unsigned DR = UseI->getOperand(0).getReg(); + if (DR == R) + continue; + } + return false; + } + return true; +} + + +bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { + bool Changed = false; + typedef GraphTraits<MachineDomTreeNode*> GTN; + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) + Changed |= runOnNode(*I); + + MachineBasicBlock *B = N->getBlock(); + std::vector<MachineInstr*> Instrs; + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) + Instrs.push_back(&*I); + + for (auto MI : Instrs) { + unsigned Opc = MI->getOpcode(); + // Do not touch lifetime markers. This is why the target-independent DCE + // cannot be used. + if (Opc == TargetOpcode::LIFETIME_START || + Opc == TargetOpcode::LIFETIME_END) + continue; + bool Store = false; + if (MI->isInlineAsm()) + continue; + // Delete PHIs if possible. + if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store)) + continue; + + bool AllDead = true; + SmallVector<unsigned,2> Regs; + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) { + AllDead = false; + break; + } + Regs.push_back(R); + } + if (!AllDead) + continue; + + B->erase(MI); + for (unsigned i = 0, n = Regs.size(); i != n; ++i) + MRI.markUsesInDebugValueAsUndef(Regs[i]); + Changed = true; + } + + return Changed; +} + + +// +// Eliminate redundant instructions +// +// This transformation will identify instructions where the output register +// is the same as one of its input registers. This only works on instructions +// that define a single register (unlike post-increment loads, for example). +// The equality check is actually more detailed: the code calculates which +// bits of the output are used, and only compares these bits with the input +// registers. +// If the output matches an input, the instruction is replaced with COPY. +// The copies will be removed by another transformation. 
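A toy model of that used-bits comparison, shrunk to 8-bit "registers" (the values and mask are invented for illustration): a definition is redundant when it agrees with one of its sources on every bit that downstream users actually read.

    #include <cstdint>

    bool equalOnUsedBits(uint8_t Def, uint8_t Src, uint8_t UsedMask) {
      return ((Def ^ Src) & UsedMask) == 0;
    }

    // Def = 0xA1 and Src = 0x01 differ only in the high nibble; if the sole
    // consumer is an "and #0x0F", UsedMask is 0x0F and the defining
    // instruction can be rewritten as a plain copy of Src.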
+namespace { + class RedundantInstrElimination : public Transformation { + public: + RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool isLossyShiftLeft(const MachineInstr &MI, unsigned OpN, + unsigned &LostB, unsigned &LostE); + bool isLossyShiftRight(const MachineInstr &MI, unsigned OpN, + unsigned &LostB, unsigned &LostE); + bool computeUsedBits(unsigned Reg, BitVector &Bits); + bool computeUsedBits(const MachineInstr &MI, unsigned OpN, BitVector &Bits, + uint16_t Begin); + bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + + +// Check if the instruction is a lossy shift left, where the input being +// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range +// of bit indices that are lost. +bool RedundantInstrElimination::isLossyShiftLeft(const MachineInstr &MI, + unsigned OpN, unsigned &LostB, unsigned &LostE) { + using namespace Hexagon; + unsigned Opc = MI.getOpcode(); + unsigned ImN, RegN, Width; + switch (Opc) { + case S2_asl_i_p: + ImN = 2; + RegN = 1; + Width = 64; + break; + case S2_asl_i_p_acc: + case S2_asl_i_p_and: + case S2_asl_i_p_nac: + case S2_asl_i_p_or: + case S2_asl_i_p_xacc: + ImN = 3; + RegN = 2; + Width = 64; + break; + case S2_asl_i_r: + ImN = 2; + RegN = 1; + Width = 32; + break; + case S2_addasl_rrri: + case S4_andi_asl_ri: + case S4_ori_asl_ri: + case S4_addi_asl_ri: + case S4_subi_asl_ri: + case S2_asl_i_r_acc: + case S2_asl_i_r_and: + case S2_asl_i_r_nac: + case S2_asl_i_r_or: + case S2_asl_i_r_sat: + case S2_asl_i_r_xacc: + ImN = 3; + RegN = 2; + Width = 32; + break; + default: + return false; + } + + if (RegN != OpN) + return false; + + assert(MI.getOperand(ImN).isImm()); + unsigned S = MI.getOperand(ImN).getImm(); + if (S == 0) + return false; + LostB = Width-S; + LostE = Width; + return true; +} + + +// Check if the instruction is a lossy shift right, where the input being +// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range +// of bit indices that are lost. +bool RedundantInstrElimination::isLossyShiftRight(const MachineInstr &MI, + unsigned OpN, unsigned &LostB, unsigned &LostE) { + using namespace Hexagon; + unsigned Opc = MI.getOpcode(); + unsigned ImN, RegN; + switch (Opc) { + case S2_asr_i_p: + case S2_lsr_i_p: + ImN = 2; + RegN = 1; + break; + case S2_asr_i_p_acc: + case S2_asr_i_p_and: + case S2_asr_i_p_nac: + case S2_asr_i_p_or: + case S2_lsr_i_p_acc: + case S2_lsr_i_p_and: + case S2_lsr_i_p_nac: + case S2_lsr_i_p_or: + case S2_lsr_i_p_xacc: + ImN = 3; + RegN = 2; + break; + case S2_asr_i_r: + case S2_lsr_i_r: + ImN = 2; + RegN = 1; + break; + case S4_andi_lsr_ri: + case S4_ori_lsr_ri: + case S4_addi_lsr_ri: + case S4_subi_lsr_ri: + case S2_asr_i_r_acc: + case S2_asr_i_r_and: + case S2_asr_i_r_nac: + case S2_asr_i_r_or: + case S2_lsr_i_r_acc: + case S2_lsr_i_r_and: + case S2_lsr_i_r_nac: + case S2_lsr_i_r_or: + case S2_lsr_i_r_xacc: + ImN = 3; + RegN = 2; + break; + + default: + return false; + } + + if (RegN != OpN) + return false; + + assert(MI.getOperand(ImN).isImm()); + unsigned S = MI.getOperand(ImN).getImm(); + LostB = 0; + LostE = S; + return true; +} + + +// Calculate the bit vector that corresponds to the used bits of register Reg. 
+// The vector Bits has the same size as Reg in bits. If the calculation +// fails (i.e. the used bits are unknown), it returns false. Otherwise, +// it returns true and sets the corresponding bits in Bits. +bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) { + BitVector Used(Bits.size()); + RegisterSet Visited; + std::vector<unsigned> Pending; + Pending.push_back(Reg); + + for (unsigned i = 0; i < Pending.size(); ++i) { + unsigned R = Pending[i]; + if (Visited.has(R)) + continue; + Visited.insert(R); + for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) { + BitTracker::RegisterRef UR = *I; + unsigned B, W; + if (!HBS::getSubregMask(UR, B, W, MRI)) + return false; + MachineInstr &UseI = *I->getParent(); + if (UseI.isPHI() || UseI.isCopy()) { + unsigned DefR = UseI.getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DefR)) + return false; + Pending.push_back(DefR); + } else { + if (!computeUsedBits(UseI, I.getOperandNo(), Used, B)) + return false; + } + } + } + Bits |= Used; + return true; +} + + +// Calculate the bits used by instruction MI in a register in operand OpN. +// Return true/false if the calculation succeeds/fails. If it succeeds, set +// used bits in Bits. This function does not reset any bits in Bits, so +// subsequent calls over different instructions will result in the union +// of the used bits in all these instructions. +// The register in question may be used with a sub-register, whereas Bits +// holds the bits for the entire register. To keep track of that, the +// argument Begin indicates where in Bits is the least-significant bit +// of the register used in operand OpN. For example, in instruction: +// vreg1 = S2_lsr_i_r vreg2:subreg_hireg, 10 +// operand 1 is a 32-bit register, which happens to be a subregister +// of the 64-bit register vreg2, and that subregister starts at position 32. +// In this case Begin=32, since Bits[32] would be the least-significant bit +// of vreg2:subreg_hireg. +bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI, + unsigned OpN, BitVector &Bits, uint16_t Begin) { + unsigned Opc = MI.getOpcode(); + BitVector T(Bits.size()); + bool GotBits = HBS::getUsedBits(Opc, OpN, T, Begin, HII); + // Even if we don't have bits yet, we could still provide some information + // if the instruction is a lossy shift: the lost bits will be marked as + // not used. + unsigned LB, LE; + if (isLossyShiftLeft(MI, OpN, LB, LE) || isLossyShiftRight(MI, OpN, LB, LE)) { + assert(MI.getOperand(OpN).isReg()); + BitTracker::RegisterRef RR = MI.getOperand(OpN); + const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI); + uint16_t Width = RC->getSize()*8; + + if (!GotBits) + T.set(Begin, Begin+Width); + assert(LB <= LE && LB < Width && LE <= Width); + T.reset(Begin+LB, Begin+LE); + GotBits = true; + } + if (GotBits) + Bits |= T; + return GotBits; +} + + +// Calculates the used bits in RD ("defined register"), and checks if these +// bits in RS ("used register") and RD are identical. 
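To make the Begin convention described above concrete, here is a sketch (hypothetical helper, fixed 64-bit pair) for the quoted example "vreg1 = S2_lsr_i_r vreg2:subreg_hireg, 10": the high subregister occupies bits 32..63 of the pair, and the right shift discards the operand's low 10 bits.

    #include <bitset>

    std::bitset<64> usedBitsOfHiSubregUse(unsigned ShiftAmt) {
      std::bitset<64> Bits;
      const unsigned Begin = 32; // LSB of subreg_hireg within the 64-bit pair
      for (unsigned i = ShiftAmt; i < 32; ++i)
        Bits.set(Begin + i); // lsr #10 leaves subregister bits 10..31 used
      return Bits; // bits 42..63 of vreg2 when ShiftAmt == 10
    }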
+bool RedundantInstrElimination::usedBitsEqual(BitTracker::RegisterRef RD, + BitTracker::RegisterRef RS) { + const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + + unsigned DB, DW; + if (!HBS::getSubregMask(RD, DB, DW, MRI)) + return false; + unsigned SB, SW; + if (!HBS::getSubregMask(RS, SB, SW, MRI)) + return false; + if (SW != DW) + return false; + + BitVector Used(DC.width()); + if (!computeUsedBits(RD.Reg, Used)) + return false; + + for (unsigned i = 0; i != DW; ++i) + if (Used[i+DB] && DC[DB+i] != SC[SB+i]) + return false; + return true; +} + + +bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, + const RegisterSet&) { + bool Changed = false; + + for (auto I = B.begin(), E = B.end(), NextI = I; I != E; ++I) { + NextI = std::next(I); + MachineInstr *MI = &*I; + + if (MI->getOpcode() == TargetOpcode::COPY) + continue; + if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm()) + continue; + unsigned NumD = MI->getDesc().getNumDefs(); + if (NumD != 1) + continue; + + BitTracker::RegisterRef RD = MI->getOperand(0); + if (!BT.has(RD.Reg)) + continue; + const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + + // Find a source operand that is equal to the result. + for (auto &Op : MI->uses()) { + if (!Op.isReg()) + continue; + BitTracker::RegisterRef RS = Op; + if (!BT.has(RS.Reg)) + continue; + if (!HBS::isTransparentCopy(RD, RS, MRI)) + continue; + + unsigned BN, BW; + if (!HBS::getSubregMask(RS, BN, BW, MRI)) + continue; + + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + if (!usedBitsEqual(RD, RS) && !HBS::isEqual(DC, 0, SC, BN, BW)) + continue; + + // If found, replace the instruction with a COPY. + DebugLoc DL = MI->getDebugLoc(); + const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); + unsigned NewR = MRI.createVirtualRegister(FRC); + BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) + .addReg(RS.Reg, 0, RS.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), SC); + Changed = true; + break; + } + } + + return Changed; +} + + +// +// Const generation +// +// Recognize instructions that produce constant values known at compile-time. +// Replace them with register definitions that load these constants directly. 
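A stand-alone version of the constant-recovery loop that ConstGeneration::isConst performs below on BitTracker cells, with a simplified bit type: fold the tracked bits MSB-first, and abort on the first unknown bit.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    enum class Bit { Zero, One, Unknown };

    bool foldConstant(const std::vector<Bit> &Cell, int64_t &C) {
      int64_t T = 0;
      for (std::size_t i = Cell.size(); i > 0; --i) { // Cell.back() is the MSB
        T <<= 1;
        if (Cell[i - 1] == Bit::One)
          T |= 1;
        else if (Cell[i - 1] != Bit::Zero)
          return false; // unknown bit => not a compile-time constant
      }
      C = T;
      return true;
    }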
+namespace { + class ConstGeneration : public Transformation { + public: + ConstGeneration(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool isTfrConst(const MachineInstr *MI) const; + bool isConst(unsigned R, int64_t &V) const; + unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C, + MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + +bool ConstGeneration::isConst(unsigned R, int64_t &C) const { + if (!BT.has(R)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(R); + int64_t T = 0; + for (unsigned i = RC.width(); i > 0; --i) { + const BitTracker::BitValue &V = RC[i-1]; + T <<= 1; + if (V.is(1)) + T |= 1; + else if (!V.is(0)) + return false; + } + C = T; + return true; +} + + +bool ConstGeneration::isTfrConst(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + case Hexagon::A2_tfrsi: + case Hexagon::A2_tfrpi: + case Hexagon::TFR_PdTrue: + case Hexagon::TFR_PdFalse: + case Hexagon::CONST32_Int_Real: + case Hexagon::CONST64_Int_Real: + return true; + } + return false; +} + + +// Generate a transfer-immediate instruction that is appropriate for the +// register class and the actual value being transferred. +unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C, + MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) { + unsigned Reg = MRI.createVirtualRegister(RC); + if (RC == &Hexagon::IntRegsRegClass) { + BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg) + .addImm(int32_t(C)); + return Reg; + } + + if (RC == &Hexagon::DoubleRegsRegClass) { + if (isInt<8>(C)) { + BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrpi), Reg) + .addImm(C); + return Reg; + } + + unsigned Lo = Lo_32(C), Hi = Hi_32(C); + if (isInt<8>(Lo) || isInt<8>(Hi)) { + unsigned Opc = isInt<8>(Lo) ? Hexagon::A2_combineii + : Hexagon::A4_combineii; + BuildMI(B, At, DL, HII.get(Opc), Reg) + .addImm(int32_t(Hi)) + .addImm(int32_t(Lo)); + return Reg; + } + + BuildMI(B, At, DL, HII.get(Hexagon::CONST64_Int_Real), Reg) + .addImm(C); + return Reg; + } + + if (RC == &Hexagon::PredRegsRegClass) { + unsigned Opc; + if (C == 0) + Opc = Hexagon::TFR_PdFalse; + else if ((C & 0xFF) == 0xFF) + Opc = Hexagon::TFR_PdTrue; + else + return 0; + BuildMI(B, At, DL, HII.get(Opc), Reg); + return Reg; + } + + return 0; +} + + +bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) { + bool Changed = false; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(); I != E; ++I) { + if (isTfrConst(I)) + continue; + Defs.clear(); + HBS::getInstrDefs(*I, Defs); + if (Defs.count() != 1) + continue; + unsigned DR = Defs.find_first(); + if (!TargetRegisterInfo::isVirtualRegister(DR)) + continue; + int64_t C; + if (isConst(DR, C)) { + DebugLoc DL = I->getDebugLoc(); + auto At = I->isPHI() ? B.getFirstNonPHI() : I; + unsigned ImmReg = genTfrConst(MRI.getRegClass(DR), C, B, At, DL); + if (ImmReg) { + HBS::replaceReg(DR, ImmReg, MRI); + BT.put(ImmReg, BT.lookup(DR)); + Changed = true; + } + } + } + return Changed; +} + + +// +// Copy generation +// +// Identify pairs of available registers which hold identical values. 
+// In such cases, only one of them needs to be calculated, the other one +// will be defined as a copy of the first. +// +// Copy propagation +// +// Eliminate register copies RD = RS, by replacing the uses of RD with +// with uses of RS. +namespace { + class CopyGeneration : public Transformation { + public: + CopyGeneration(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool findMatch(const BitTracker::RegisterRef &Inp, + BitTracker::RegisterRef &Out, const RegisterSet &AVs); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; + + class CopyPropagation : public Transformation { + public: + CopyPropagation(const HexagonRegisterInfo &hri, MachineRegisterInfo &mri) + : Transformation(false), MRI(mri) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + static bool isCopyReg(unsigned Opc); + private: + bool propagateRegCopy(MachineInstr &MI); + + MachineRegisterInfo &MRI; + }; + +} + + +/// Check if there is a register in AVs that is identical to Inp. If so, +/// set Out to the found register. The output may be a pair Reg:Sub. +bool CopyGeneration::findMatch(const BitTracker::RegisterRef &Inp, + BitTracker::RegisterRef &Out, const RegisterSet &AVs) { + if (!BT.has(Inp.Reg)) + return false; + const BitTracker::RegisterCell &InpRC = BT.lookup(Inp.Reg); + unsigned B, W; + if (!HBS::getSubregMask(Inp, B, W, MRI)) + return false; + + for (unsigned R = AVs.find_first(); R; R = AVs.find_next(R)) { + if (!BT.has(R) || !HBS::isTransparentCopy(R, Inp, MRI)) + continue; + const BitTracker::RegisterCell &RC = BT.lookup(R); + unsigned RW = RC.width(); + if (W == RW) { + if (MRI.getRegClass(Inp.Reg) != MRI.getRegClass(R)) + continue; + if (!HBS::isEqual(InpRC, B, RC, 0, W)) + continue; + Out.Reg = R; + Out.Sub = 0; + return true; + } + // Check if there is a super-register, whose part (with a subregister) + // is equal to the input. + // Only do double registers for now. + if (W*2 != RW) + continue; + if (MRI.getRegClass(R) != &Hexagon::DoubleRegsRegClass) + continue; + + if (HBS::isEqual(InpRC, B, RC, 0, W)) + Out.Sub = Hexagon::subreg_loreg; + else if (HBS::isEqual(InpRC, B, RC, W, W)) + Out.Sub = Hexagon::subreg_hireg; + else + continue; + Out.Reg = R; + return true; + } + return false; +} + + +bool CopyGeneration::processBlock(MachineBasicBlock &B, + const RegisterSet &AVs) { + RegisterSet AVB(AVs); + bool Changed = false; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(), NextI = I; I != E; + ++I, AVB.insert(Defs)) { + NextI = std::next(I); + Defs.clear(); + HBS::getInstrDefs(*I, Defs); + + unsigned Opc = I->getOpcode(); + if (CopyPropagation::isCopyReg(Opc)) + continue; + + for (unsigned R = Defs.find_first(); R; R = Defs.find_next(R)) { + BitTracker::RegisterRef MR; + if (!findMatch(R, MR, AVB)) + continue; + DebugLoc DL = I->getDebugLoc(); + auto *FRC = HBS::getFinalVRegClass(MR, MRI); + unsigned NewR = MRI.createVirtualRegister(FRC); + auto At = I->isPHI() ? 
B.getFirstNonPHI() : I; + BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) + .addReg(MR.Reg, 0, MR.Sub); + BT.put(BitTracker::RegisterRef(NewR), BT.get(MR)); + } + } + + return Changed; +} + + +bool CopyPropagation::isCopyReg(unsigned Opc) { + switch (Opc) { + case TargetOpcode::COPY: + case TargetOpcode::REG_SEQUENCE: + case Hexagon::A2_tfr: + case Hexagon::A2_tfrp: + case Hexagon::A2_combinew: + case Hexagon::A4_combineir: + case Hexagon::A4_combineri: + return true; + default: + break; + } + return false; +} + + +bool CopyPropagation::propagateRegCopy(MachineInstr &MI) { + bool Changed = false; + unsigned Opc = MI.getOpcode(); + BitTracker::RegisterRef RD = MI.getOperand(0); + assert(MI.getOperand(0).getSubReg() == 0); + + switch (Opc) { + case TargetOpcode::COPY: + case Hexagon::A2_tfr: + case Hexagon::A2_tfrp: { + BitTracker::RegisterRef RS = MI.getOperand(1); + if (!HBS::isTransparentCopy(RD, RS, MRI)) + break; + if (RS.Sub != 0) + Changed = HBS::replaceRegWithSub(RD.Reg, RS.Reg, RS.Sub, MRI); + else + Changed = HBS::replaceReg(RD.Reg, RS.Reg, MRI); + break; + } + case TargetOpcode::REG_SEQUENCE: { + BitTracker::RegisterRef SL, SH; + if (HBS::parseRegSequence(MI, SL, SH)) { + Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg, + SL.Reg, SL.Sub, MRI); + Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg, + SH.Reg, SH.Sub, MRI); + } + break; + } + case Hexagon::A2_combinew: { + BitTracker::RegisterRef RH = MI.getOperand(1), RL = MI.getOperand(2); + Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg, + RL.Reg, RL.Sub, MRI); + Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg, + RH.Reg, RH.Sub, MRI); + break; + } + case Hexagon::A4_combineir: + case Hexagon::A4_combineri: { + unsigned SrcX = (Opc == Hexagon::A4_combineir) ? 2 : 1; + unsigned Sub = (Opc == Hexagon::A4_combineir) ? Hexagon::subreg_loreg + : Hexagon::subreg_hireg; + BitTracker::RegisterRef RS = MI.getOperand(SrcX); + Changed = HBS::replaceSubWithSub(RD.Reg, Sub, RS.Reg, RS.Sub, MRI); + break; + } + } + return Changed; +} + + +bool CopyPropagation::processBlock(MachineBasicBlock &B, const RegisterSet&) { + std::vector<MachineInstr*> Instrs; + for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) + Instrs.push_back(&*I); + + bool Changed = false; + for (auto I : Instrs) { + unsigned Opc = I->getOpcode(); + if (!CopyPropagation::isCopyReg(Opc)) + continue; + Changed |= propagateRegCopy(*I); + } + + return Changed; +} + + +// +// Bit simplification +// +// Recognize patterns that can be simplified and replace them with the +// simpler forms. +// This is by no means complete +namespace { + class BitSimplification : public Transformation { + public: + BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + struct RegHalf : public BitTracker::RegisterRef { + bool Low; // Low/High halfword. 
+ }; + + bool matchHalf(unsigned SelfR, const BitTracker::RegisterCell &RC, + unsigned B, RegHalf &RH); + + bool matchPackhl(unsigned SelfR, const BitTracker::RegisterCell &RC, + BitTracker::RegisterRef &Rs, BitTracker::RegisterRef &Rt); + unsigned getCombineOpcode(bool HLow, bool LLow); + + bool genStoreUpperHalf(MachineInstr *MI); + bool genStoreImmediate(MachineInstr *MI); + bool genPackhl(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genExtractHalf(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genCombineHalf(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + + +// Check if the bits [B..B+16) in register cell RC form a valid halfword, +// i.e. [0..16), [16..32), etc. of some register. If so, return true and +// set the information about the found register in RH. +bool BitSimplification::matchHalf(unsigned SelfR, + const BitTracker::RegisterCell &RC, unsigned B, RegHalf &RH) { + // XXX This could be searching in the set of available registers, in case + // the match is not exact. + + // Match 16-bit chunks, where the RC[B..B+15] references exactly one + // register and all the bits B..B+15 match between RC and the register. + // This is meant to match "v1[0-15]", where v1 = { [0]:0 [1-15]:v1... }, + // and RC = { [0]:0 [1-15]:v1[1-15]... }. + bool Low = false; + unsigned I = B; + while (I < B+16 && RC[I].num()) + I++; + if (I == B+16) + return false; + + unsigned Reg = RC[I].RefI.Reg; + unsigned P = RC[I].RefI.Pos; // The RefI.Pos will be advanced by I-B. + if (P < I-B) + return false; + unsigned Pos = P - (I-B); + + if (Reg == 0 || Reg == SelfR) // Don't match "self". + return false; + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return false; + if (!BT.has(Reg)) + return false; + + const BitTracker::RegisterCell &SC = BT.lookup(Reg); + if (Pos+16 > SC.width()) + return false; + + for (unsigned i = 0; i < 16; ++i) { + const BitTracker::BitValue &RV = RC[i+B]; + if (RV.Type == BitTracker::BitValue::Ref) { + if (RV.RefI.Reg != Reg) + return false; + if (RV.RefI.Pos != i+Pos) + return false; + continue; + } + if (RC[i+B] != SC[i+Pos]) + return false; + } + + unsigned Sub = 0; + switch (Pos) { + case 0: + Sub = Hexagon::subreg_loreg; + Low = true; + break; + case 16: + Sub = Hexagon::subreg_loreg; + Low = false; + break; + case 32: + Sub = Hexagon::subreg_hireg; + Low = true; + break; + case 48: + Sub = Hexagon::subreg_hireg; + Low = false; + break; + default: + return false; + } + + RH.Reg = Reg; + RH.Sub = Sub; + RH.Low = Low; + // If the subregister is not valid with the register, set it to 0. + if (!HBS::getFinalVRegClass(RH, MRI)) + RH.Sub = 0; + + return true; +} + + +// Check if RC matches the pattern of a S2_packhl. If so, return true and +// set the inputs Rs and Rt. 
+bool BitSimplification::matchPackhl(unsigned SelfR, + const BitTracker::RegisterCell &RC, BitTracker::RegisterRef &Rs, + BitTracker::RegisterRef &Rt) { + RegHalf L1, H1, L2, H2; + + if (!matchHalf(SelfR, RC, 0, L2) || !matchHalf(SelfR, RC, 16, L1)) + return false; + if (!matchHalf(SelfR, RC, 32, H2) || !matchHalf(SelfR, RC, 48, H1)) + return false; + + // Rs = H1.L1, Rt = H2.L2 + if (H1.Reg != L1.Reg || H1.Sub != L1.Sub || H1.Low || !L1.Low) + return false; + if (H2.Reg != L2.Reg || H2.Sub != L2.Sub || H2.Low || !L2.Low) + return false; + + Rs = H1; + Rt = H2; + return true; +} + + +unsigned BitSimplification::getCombineOpcode(bool HLow, bool LLow) { + return HLow ? LLow ? Hexagon::A2_combine_ll + : Hexagon::A2_combine_lh + : LLow ? Hexagon::A2_combine_hl + : Hexagon::A2_combine_hh; +} + + +// If MI stores the upper halfword of a register (potentially obtained via +// shifts or extracts), replace it with a storerf instruction. This could +// cause the "extraction" code to become dead. +bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + if (Opc != Hexagon::S2_storerh_io) + return false; + + MachineOperand &ValOp = MI->getOperand(2); + BitTracker::RegisterRef RS = ValOp; + if (!BT.has(RS.Reg)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); + RegHalf H; + if (!matchHalf(0, RC, 0, H)) + return false; + if (H.Low) + return false; + MI->setDesc(HII.get(Hexagon::S2_storerf_io)); + ValOp.setReg(H.Reg); + ValOp.setSubReg(H.Sub); + return true; +} + + +// If MI stores a value known at compile-time, and the value is within a range +// that avoids using constant-extenders, replace it with a store-immediate. +bool BitSimplification::genStoreImmediate(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + unsigned Align = 0; + switch (Opc) { + case Hexagon::S2_storeri_io: + Align++; + case Hexagon::S2_storerh_io: + Align++; + case Hexagon::S2_storerb_io: + break; + default: + return false; + } + + // Avoid stores to frame-indices (due to an unknown offset). + if (!MI->getOperand(0).isReg()) + return false; + MachineOperand &OffOp = MI->getOperand(1); + if (!OffOp.isImm()) + return false; + + int64_t Off = OffOp.getImm(); + // Offset is u6:a. Sadly, there is no isShiftedUInt(n,x). + if (!isUIntN(6+Align, Off) || (Off & ((1<<Align)-1))) + return false; + // Source register: + BitTracker::RegisterRef RS = MI->getOperand(2); + if (!BT.has(RS.Reg)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); + uint64_t U; + if (!HBS::getConst(RC, 0, RC.width(), U)) + return false; + + // Only consider 8-bit values to avoid constant-extenders. + int V; + switch (Opc) { + case Hexagon::S2_storerb_io: + V = int8_t(U); + break; + case Hexagon::S2_storerh_io: + V = int16_t(U); + break; + case Hexagon::S2_storeri_io: + V = int32_t(U); + break; + } + if (!isInt<8>(V)) + return false; + + MI->RemoveOperand(2); + switch (Opc) { + case Hexagon::S2_storerb_io: + MI->setDesc(HII.get(Hexagon::S4_storeirb_io)); + break; + case Hexagon::S2_storerh_io: + MI->setDesc(HII.get(Hexagon::S4_storeirh_io)); + break; + case Hexagon::S2_storeri_io: + MI->setDesc(HII.get(Hexagon::S4_storeiri_io)); + break; + } + MI->addOperand(MachineOperand::CreateImm(V)); + return true; +} + + +// If MI is equivalent o S2_packhl, generate the S2_packhl. MI could be the +// last instruction in a sequence that results in something equivalent to +// the pack-halfwords. The intent is to cause the entire sequence to become +// dead. 
+bool BitSimplification::genPackhl(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + if (Opc == Hexagon::S2_packhl) + return false; + BitTracker::RegisterRef Rs, Rt; + if (!matchPackhl(RD.Reg, RC, Rs, Rt)) + return false; + + MachineBasicBlock &B = *MI->getParent(); + unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); + DebugLoc DL = MI->getDebugLoc(); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(Hexagon::S2_packhl), NewR) + .addReg(Rs.Reg, 0, Rs.Sub) + .addReg(Rt.Reg, 0, Rt.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI produces halfword of the input in the low half of the output, +// replace it with zero-extend or extractu. +bool BitSimplification::genExtractHalf(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + RegHalf L; + // Check for halfword in low 16 bits, zeros elsewhere. + if (!matchHalf(RD.Reg, RC, 0, L) || !HBS::isZero(RC, 16, 16)) + return false; + + unsigned Opc = MI->getOpcode(); + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // Prefer zxth, since zxth can go in any slot, while extractu only in + // slots 2 and 3. + unsigned NewR = 0; + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + if (L.Low && Opc != Hexagon::A2_zxth) { + NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(B, At, DL, HII.get(Hexagon::A2_zxth), NewR) + .addReg(L.Reg, 0, L.Sub); + } else if (!L.Low && Opc != Hexagon::S2_extractu) { + NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(B, MI, DL, HII.get(Hexagon::S2_extractu), NewR) + .addReg(L.Reg, 0, L.Sub) + .addImm(16) + .addImm(16); + } + if (NewR == 0) + return false; + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI is equivalent to a combine(.L/.H, .L/.H) replace with with the +// combine. +bool BitSimplification::genCombineHalf(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + RegHalf L, H; + // Check for combine h/l + if (!matchHalf(RD.Reg, RC, 0, L) || !matchHalf(RD.Reg, RC, 16, H)) + return false; + // Do nothing if this is just a reg copy. + if (L.Reg == H.Reg && L.Sub == H.Sub && !H.Low && L.Low) + return false; + + unsigned Opc = MI->getOpcode(); + unsigned COpc = getCombineOpcode(H.Low, L.Low); + if (COpc == Opc) + return false; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(COpc), NewR) + .addReg(H.Reg, 0, H.Sub) + .addReg(L.Reg, 0, L.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI resets high bits of a register and keeps the lower ones, replace it +// with zero-extend byte/half, and-immediate, or extractu, as appropriate. 
+bool BitSimplification::genExtractLow(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::A2_zxtb: + case Hexagon::A2_zxth: + case Hexagon::S2_extractu: + return false; + } + if (Opc == Hexagon::A2_andir && MI->getOperand(2).isImm()) { + int32_t Imm = MI->getOperand(2).getImm(); + if (isInt<10>(Imm)) + return false; + } + + if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm()) + return false; + unsigned W = RC.width(); + while (W > 0 && RC[W-1].is(0)) + W--; + if (W == 0 || W == RC.width()) + return false; + unsigned NewOpc = (W == 8) ? Hexagon::A2_zxtb + : (W == 16) ? Hexagon::A2_zxth + : (W < 10) ? Hexagon::A2_andir + : Hexagon::S2_extractu; + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + for (auto &Op : MI->uses()) { + if (!Op.isReg()) + continue; + BitTracker::RegisterRef RS = Op; + if (!BT.has(RS.Reg)) + continue; + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + unsigned BN, BW; + if (!HBS::getSubregMask(RS, BN, BW, MRI)) + continue; + if (BW < W || !HBS::isEqual(RC, 0, SC, BN, W)) + continue; + + unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR) + .addReg(RS.Reg, 0, RS.Sub); + if (NewOpc == Hexagon::A2_andir) + MIB.addImm((1 << W) - 1); + else if (NewOpc == Hexagon::S2_extractu) + MIB.addImm(W).addImm(0); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; + } + return false; +} + + +// Check for tstbit simplification opportunity, where the bit being checked +// can be tracked back to another register. For example: +// vreg2 = S2_lsr_i_r vreg1, 5 +// vreg3 = S2_tstbit_i vreg2, 0 +// => +// vreg3 = S2_tstbit_i vreg1, 5 +bool BitSimplification::simplifyTstbit(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + if (Opc != Hexagon::S2_tstbit_i) + return false; + + unsigned BN = MI->getOperand(2).getImm(); + BitTracker::RegisterRef RS = MI->getOperand(1); + unsigned F, W; + DebugLoc DL = MI->getDebugLoc(); + if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI)) + return false; + MachineBasicBlock &B = *MI->getParent(); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + const BitTracker::BitValue &V = SC[F+BN]; + if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != RS.Reg) { + const TargetRegisterClass *TC = MRI.getRegClass(V.RefI.Reg); + // Need to map V.RefI.Reg to a 32-bit register, i.e. if it is + // a double register, need to use a subregister and adjust bit + // number. 
+ unsigned P = UINT_MAX; + BitTracker::RegisterRef RR(V.RefI.Reg, 0); + if (TC == &Hexagon::DoubleRegsRegClass) { + P = V.RefI.Pos; + RR.Sub = Hexagon::subreg_loreg; + if (P >= 32) { + P -= 32; + RR.Sub = Hexagon::subreg_hireg; + } + } else if (TC == &Hexagon::IntRegsRegClass) { + P = V.RefI.Pos; + } + if (P != UINT_MAX) { + unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + BuildMI(B, At, DL, HII.get(Hexagon::S2_tstbit_i), NewR) + .addReg(RR.Reg, 0, RR.Sub) + .addImm(P); + HBS::replaceReg(RD.Reg, NewR, MRI); + BT.put(NewR, RC); + return true; + } + } else if (V.is(0) || V.is(1)) { + unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + unsigned NewOpc = V.is(0) ? Hexagon::TFR_PdFalse : Hexagon::TFR_PdTrue; + BuildMI(B, At, DL, HII.get(NewOpc), NewR); + HBS::replaceReg(RD.Reg, NewR, MRI); + return true; + } + + return false; +} + + +bool BitSimplification::processBlock(MachineBasicBlock &B, + const RegisterSet &AVs) { + bool Changed = false; + RegisterSet AVB = AVs; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) { + MachineInstr *MI = &*I; + Defs.clear(); + HBS::getInstrDefs(*MI, Defs); + + unsigned Opc = MI->getOpcode(); + if (Opc == TargetOpcode::COPY || Opc == TargetOpcode::REG_SEQUENCE) + continue; + + if (MI->mayStore()) { + bool T = genStoreUpperHalf(MI); + T = T || genStoreImmediate(MI); + Changed |= T; + continue; + } + + if (Defs.count() != 1) + continue; + const MachineOperand &Op0 = MI->getOperand(0); + if (!Op0.isReg() || !Op0.isDef()) + continue; + BitTracker::RegisterRef RD = Op0; + if (!BT.has(RD.Reg)) + continue; + const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); + const BitTracker::RegisterCell &RC = BT.lookup(RD.Reg); + + if (FRC->getID() == Hexagon::DoubleRegsRegClassID) { + bool T = genPackhl(MI, RD, RC); + Changed |= T; + continue; + } + + if (FRC->getID() == Hexagon::IntRegsRegClassID) { + bool T = genExtractHalf(MI, RD, RC); + T = T || genCombineHalf(MI, RD, RC); + T = T || genExtractLow(MI, RD, RC); + Changed |= T; + continue; + } + + if (FRC->getID() == Hexagon::PredRegsRegClassID) { + bool T = simplifyTstbit(MI, RD, RC); + Changed |= T; + continue; + } + } + return Changed; +} + + +bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HRI = *HST.getRegisterInfo(); + auto &HII = *HST.getInstrInfo(); + + MDT = &getAnalysis<MachineDominatorTree>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + bool Changed; + + Changed = DeadCodeElimination(MF, *MDT).run(); + + const HexagonEvaluator HE(HRI, MRI, HII, MF); + BitTracker BT(HE, MF); + DEBUG(BT.trace(true)); + BT.run(); + + MachineBasicBlock &Entry = MF.front(); + + RegisterSet AIG; // Available registers for IG. + ConstGeneration ImmG(BT, HII, MRI); + Changed |= visitBlock(Entry, ImmG, AIG); + + RegisterSet ARE; // Available registers for RIE. + RedundantInstrElimination RIE(BT, HII, MRI); + Changed |= visitBlock(Entry, RIE, ARE); + + RegisterSet ACG; // Available registers for CG. + CopyGeneration CopyG(BT, HII, MRI); + Changed |= visitBlock(Entry, CopyG, ACG); + + RegisterSet ACP; // Available registers for CP. + CopyPropagation CopyP(HRI, MRI); + Changed |= visitBlock(Entry, CopyP, ACP); + + Changed = DeadCodeElimination(MF, *MDT).run() || Changed; + + BT.run(); + RegisterSet ABS; // Available registers for BS. 
+ BitSimplification BitS(BT, HII, MRI); + Changed |= visitBlock(Entry, BitS, ABS); + + Changed = DeadCodeElimination(MF, *MDT).run() || Changed; + + if (Changed) { + for (auto &B : MF) + for (auto &I : B) + I.clearKillInfo(); + DeadCodeElimination(MF, *MDT).run(); + } + return Changed; +} + + +// Recognize loops where the code at the end of the loop matches the code +// before the entry of the loop, and the matching code is such that is can +// be simplified. This pass relies on the bit simplification above and only +// prepares code in a way that can be handled by the bit simplifcation. +// +// This is the motivating testcase (and explanation): +// +// { +// loop0(.LBB0_2, r1) // %for.body.preheader +// r5:4 = memd(r0++#8) +// } +// { +// r3 = lsr(r4, #16) +// r7:6 = combine(r5, r5) +// } +// { +// r3 = insert(r5, #16, #16) +// r7:6 = vlsrw(r7:6, #16) +// } +// .LBB0_2: +// { +// memh(r2+#4) = r5 +// memh(r2+#6) = r6 # R6 is really R5.H +// } +// { +// r2 = add(r2, #8) +// memh(r2+#0) = r4 +// memh(r2+#2) = r3 # R3 is really R4.H +// } +// { +// r5:4 = memd(r0++#8) +// } +// { # "Shuffling" code that sets up R3 and R6 +// r3 = lsr(r4, #16) # so that their halves can be stored in the +// r7:6 = combine(r5, r5) # next iteration. This could be folded into +// } # the stores if the code was at the beginning +// { # of the loop iteration. Since the same code +// r3 = insert(r5, #16, #16) # precedes the loop, it can actually be moved +// r7:6 = vlsrw(r7:6, #16) # there. +// }:endloop0 +// +// +// The outcome: +// +// { +// loop0(.LBB0_2, r1) +// r5:4 = memd(r0++#8) +// } +// .LBB0_2: +// { +// memh(r2+#4) = r5 +// memh(r2+#6) = r5.h +// } +// { +// r2 = add(r2, #8) +// memh(r2+#0) = r4 +// memh(r2+#2) = r4.h +// } +// { +// r5:4 = memd(r0++#8) +// }:endloop0 + +namespace llvm { + FunctionPass *createHexagonLoopRescheduling(); + void initializeHexagonLoopReschedulingPass(PassRegistry&); +} + +namespace { + class HexagonLoopRescheduling : public MachineFunctionPass { + public: + static char ID; + HexagonLoopRescheduling() : MachineFunctionPass(ID), + HII(0), HRI(0), MRI(0), BTP(0) { + initializeHexagonLoopReschedulingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + MachineRegisterInfo *MRI; + BitTracker *BTP; + + struct LoopCand { + LoopCand(MachineBasicBlock *lb, MachineBasicBlock *pb, + MachineBasicBlock *eb) : LB(lb), PB(pb), EB(eb) {} + MachineBasicBlock *LB, *PB, *EB; + }; + typedef std::vector<MachineInstr*> InstrList; + struct InstrGroup { + BitTracker::RegisterRef Inp, Out; + InstrList Ins; + }; + struct PhiInfo { + PhiInfo(MachineInstr &P, MachineBasicBlock &B); + unsigned DefR; + BitTracker::RegisterRef LR, PR; + MachineBasicBlock *LB, *PB; + }; + + static unsigned getDefReg(const MachineInstr *MI); + bool isConst(unsigned Reg) const; + bool isBitShuffle(const MachineInstr *MI, unsigned DefR) const; + bool isStoreInput(const MachineInstr *MI, unsigned DefR) const; + bool isShuffleOf(unsigned OutR, unsigned InpR) const; + bool isSameShuffle(unsigned OutR1, unsigned InpR1, unsigned OutR2, + unsigned &InpR2) const; + void moveGroup(InstrGroup &G, MachineBasicBlock &LB, MachineBasicBlock &PB, + MachineBasicBlock::iterator At, unsigned OldPhiR, unsigned NewPredR); + bool processLoop(LoopCand &C); + }; +} + +char HexagonLoopRescheduling::ID = 0; + +INITIALIZE_PASS(HexagonLoopRescheduling, "hexagon-loop-resched", + "Hexagon Loop Rescheduling", false, false) + 
+ +HexagonLoopRescheduling::PhiInfo::PhiInfo(MachineInstr &P, + MachineBasicBlock &B) { + DefR = HexagonLoopRescheduling::getDefReg(&P); + LB = &B; + PB = nullptr; + for (unsigned i = 1, n = P.getNumOperands(); i < n; i += 2) { + const MachineOperand &OpB = P.getOperand(i+1); + if (OpB.getMBB() == &B) { + LR = P.getOperand(i); + continue; + } + PB = OpB.getMBB(); + PR = P.getOperand(i); + } +} + + +unsigned HexagonLoopRescheduling::getDefReg(const MachineInstr *MI) { + RegisterSet Defs; + HBS::getInstrDefs(*MI, Defs); + if (Defs.count() != 1) + return 0; + return Defs.find_first(); +} + + +bool HexagonLoopRescheduling::isConst(unsigned Reg) const { + if (!BTP->has(Reg)) + return false; + const BitTracker::RegisterCell &RC = BTP->lookup(Reg); + for (unsigned i = 0, w = RC.width(); i < w; ++i) { + const BitTracker::BitValue &V = RC[i]; + if (!V.is(0) && !V.is(1)) + return false; + } + return true; +} + + +bool HexagonLoopRescheduling::isBitShuffle(const MachineInstr *MI, + unsigned DefR) const { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case TargetOpcode::COPY: + case Hexagon::S2_lsr_i_r: + case Hexagon::S2_asr_i_r: + case Hexagon::S2_asl_i_r: + case Hexagon::S2_lsr_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_asl_i_p: + case Hexagon::S2_insert: + case Hexagon::A2_or: + case Hexagon::A2_orp: + case Hexagon::A2_and: + case Hexagon::A2_andp: + case Hexagon::A2_combinew: + case Hexagon::A4_combineri: + case Hexagon::A4_combineir: + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + case Hexagon::A2_combine_ll: + case Hexagon::A2_combine_lh: + case Hexagon::A2_combine_hl: + case Hexagon::A2_combine_hh: + return true; + } + return false; +} + + +bool HexagonLoopRescheduling::isStoreInput(const MachineInstr *MI, + unsigned InpR) const { + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg()) + continue; + if (Op.getReg() == InpR) + return i == n-1; + } + return false; +} + + +bool HexagonLoopRescheduling::isShuffleOf(unsigned OutR, unsigned InpR) const { + if (!BTP->has(OutR) || !BTP->has(InpR)) + return false; + const BitTracker::RegisterCell &OutC = BTP->lookup(OutR); + for (unsigned i = 0, w = OutC.width(); i < w; ++i) { + const BitTracker::BitValue &V = OutC[i]; + if (V.Type != BitTracker::BitValue::Ref) + continue; + if (V.RefI.Reg != InpR) + return false; + } + return true; +} + + +bool HexagonLoopRescheduling::isSameShuffle(unsigned OutR1, unsigned InpR1, + unsigned OutR2, unsigned &InpR2) const { + if (!BTP->has(OutR1) || !BTP->has(InpR1) || !BTP->has(OutR2)) + return false; + const BitTracker::RegisterCell &OutC1 = BTP->lookup(OutR1); + const BitTracker::RegisterCell &OutC2 = BTP->lookup(OutR2); + unsigned W = OutC1.width(); + unsigned MatchR = 0; + if (W != OutC2.width()) + return false; + for (unsigned i = 0; i < W; ++i) { + const BitTracker::BitValue &V1 = OutC1[i], &V2 = OutC2[i]; + if (V1.Type != V2.Type || V1.Type == BitTracker::BitValue::One) + return false; + if (V1.Type != BitTracker::BitValue::Ref) + continue; + if (V1.RefI.Pos != V2.RefI.Pos) + return false; + if (V1.RefI.Reg != InpR1) + return false; + if (V2.RefI.Reg == 0 || V2.RefI.Reg == OutR2) + return false; + if (!MatchR) + MatchR = V2.RefI.Reg; + else if (V2.RefI.Reg != MatchR) + return false; + } + InpR2 = MatchR; + return true; +} + + +void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, + MachineBasicBlock &PB, MachineBasicBlock::iterator At, unsigned OldPhiR, + unsigned NewPredR) { + 
DenseMap<unsigned,unsigned> RegMap; + + const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR); + unsigned PhiR = MRI->createVirtualRegister(PhiRC); + BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR) + .addReg(NewPredR) + .addMBB(&PB) + .addReg(G.Inp.Reg) + .addMBB(&LB); + RegMap.insert(std::make_pair(G.Inp.Reg, PhiR)); + + for (unsigned i = G.Ins.size(); i > 0; --i) { + const MachineInstr *SI = G.Ins[i-1]; + unsigned DR = getDefReg(SI); + const TargetRegisterClass *RC = MRI->getRegClass(DR); + unsigned NewDR = MRI->createVirtualRegister(RC); + DebugLoc DL = SI->getDebugLoc(); + + auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR); + for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) { + const MachineOperand &Op = SI->getOperand(j); + if (!Op.isReg()) { + MIB.addOperand(Op); + continue; + } + if (!Op.isUse()) + continue; + unsigned UseR = RegMap[Op.getReg()]; + MIB.addReg(UseR, 0, Op.getSubReg()); + } + RegMap.insert(std::make_pair(DR, NewDR)); + } + + HBS::replaceReg(OldPhiR, RegMap[G.Out.Reg], *MRI); +} + + +bool HexagonLoopRescheduling::processLoop(LoopCand &C) { + DEBUG(dbgs() << "Processing loop in BB#" << C.LB->getNumber() << "\n"); + std::vector<PhiInfo> Phis; + for (auto &I : *C.LB) { + if (!I.isPHI()) + break; + unsigned PR = getDefReg(&I); + if (isConst(PR)) + continue; + bool BadUse = false, GoodUse = false; + for (auto UI = MRI->use_begin(PR), UE = MRI->use_end(); UI != UE; ++UI) { + MachineInstr *UseI = UI->getParent(); + if (UseI->getParent() != C.LB) { + BadUse = true; + break; + } + if (isBitShuffle(UseI, PR) || isStoreInput(UseI, PR)) + GoodUse = true; + } + if (BadUse || !GoodUse) + continue; + + Phis.push_back(PhiInfo(I, *C.LB)); + } + + DEBUG({ + dbgs() << "Phis: {"; + for (auto &I : Phis) { + dbgs() << ' ' << PrintReg(I.DefR, HRI) << "=phi(" + << PrintReg(I.PR.Reg, HRI, I.PR.Sub) << ":b" << I.PB->getNumber() + << ',' << PrintReg(I.LR.Reg, HRI, I.LR.Sub) << ":b" + << I.LB->getNumber() << ')'; + } + dbgs() << " }\n"; + }); + + if (Phis.empty()) + return false; + + bool Changed = false; + InstrList ShufIns; + + // Go backwards in the block: for each bit shuffling instruction, check + // if that instruction could potentially be moved to the front of the loop: + // the output of the loop cannot be used in a non-shuffling instruction + // in this loop. + for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) { + if (I->isTerminator()) + continue; + if (I->isPHI()) + break; + + RegisterSet Defs; + HBS::getInstrDefs(*I, Defs); + if (Defs.count() != 1) + continue; + unsigned DefR = Defs.find_first(); + if (!TargetRegisterInfo::isVirtualRegister(DefR)) + continue; + if (!isBitShuffle(&*I, DefR)) + continue; + + bool BadUse = false; + for (auto UI = MRI->use_begin(DefR), UE = MRI->use_end(); UI != UE; ++UI) { + MachineInstr *UseI = UI->getParent(); + if (UseI->getParent() == C.LB) { + if (UseI->isPHI()) { + // If the use is in a phi node in this loop, then it should be + // the value corresponding to the back edge. + unsigned Idx = UI.getOperandNo(); + if (UseI->getOperand(Idx+1).getMBB() != C.LB) + BadUse = true; + } else { + auto F = std::find(ShufIns.begin(), ShufIns.end(), UseI); + if (F == ShufIns.end()) + BadUse = true; + } + } else { + // There is a use outside of the loop, but there is no epilog block + // suitable for a copy-out. 
+ if (C.EB == nullptr) + BadUse = true; + } + if (BadUse) + break; + } + + if (BadUse) + continue; + ShufIns.push_back(&*I); + } + + // Partition the list of shuffling instructions into instruction groups, + // where each group has to be moved as a whole (i.e. a group is a chain of + // dependent instructions). A group produces a single live output register, + // which is meant to be the input of the loop phi node (although this is + // not checked here yet). It also uses a single register as its input, + // which is some value produced in the loop body. After moving the group + // to the beginning of the loop, that input register would need to be + // the loop-carried register (through a phi node) instead of the (currently + // loop-carried) output register. + typedef std::vector<InstrGroup> InstrGroupList; + InstrGroupList Groups; + + for (unsigned i = 0, n = ShufIns.size(); i < n; ++i) { + MachineInstr *SI = ShufIns[i]; + if (SI == nullptr) + continue; + + InstrGroup G; + G.Ins.push_back(SI); + G.Out.Reg = getDefReg(SI); + RegisterSet Inputs; + HBS::getInstrUses(*SI, Inputs); + + for (unsigned j = i+1; j < n; ++j) { + MachineInstr *MI = ShufIns[j]; + if (MI == nullptr) + continue; + RegisterSet Defs; + HBS::getInstrDefs(*MI, Defs); + // If this instruction does not define any pending inputs, skip it. + if (!Defs.intersects(Inputs)) + continue; + // Otherwise, add it to the current group and remove the inputs that + // are defined by MI. + G.Ins.push_back(MI); + Inputs.remove(Defs); + // Then add all registers used by MI. + HBS::getInstrUses(*MI, Inputs); + ShufIns[j] = nullptr; + } + + // Only add a group if it requires at most one register. + if (Inputs.count() > 1) + continue; + auto LoopInpEq = [G] (const PhiInfo &P) -> bool { + return G.Out.Reg == P.LR.Reg; + }; + if (std::find_if(Phis.begin(), Phis.end(), LoopInpEq) == Phis.end()) + continue; + + G.Inp.Reg = Inputs.find_first(); + Groups.push_back(G); + } + + DEBUG({ + for (unsigned i = 0, n = Groups.size(); i < n; ++i) { + InstrGroup &G = Groups[i]; + dbgs() << "Group[" << i << "] inp: " + << PrintReg(G.Inp.Reg, HRI, G.Inp.Sub) + << " out: " << PrintReg(G.Out.Reg, HRI, G.Out.Sub) << "\n"; + for (unsigned j = 0, m = G.Ins.size(); j < m; ++j) + dbgs() << " " << *G.Ins[j]; + } + }); + + for (unsigned i = 0, n = Groups.size(); i < n; ++i) { + InstrGroup &G = Groups[i]; + if (!isShuffleOf(G.Out.Reg, G.Inp.Reg)) + continue; + auto LoopInpEq = [G] (const PhiInfo &P) -> bool { + return G.Out.Reg == P.LR.Reg; + }; + auto F = std::find_if(Phis.begin(), Phis.end(), LoopInpEq); + if (F == Phis.end()) + continue; + unsigned PredR = 0; + if (!isSameShuffle(G.Out.Reg, G.Inp.Reg, F->PR.Reg, PredR)) { + const MachineInstr *DefPredR = MRI->getVRegDef(F->PR.Reg); + unsigned Opc = DefPredR->getOpcode(); + if (Opc != Hexagon::A2_tfrsi && Opc != Hexagon::A2_tfrpi) + continue; + if (!DefPredR->getOperand(1).isImm()) + continue; + if (DefPredR->getOperand(1).getImm() != 0) + continue; + const TargetRegisterClass *RC = MRI->getRegClass(G.Inp.Reg); + if (RC != MRI->getRegClass(F->PR.Reg)) { + PredR = MRI->createVirtualRegister(RC); + unsigned TfrI = (RC == &Hexagon::IntRegsRegClass) ? Hexagon::A2_tfrsi + : Hexagon::A2_tfrpi; + auto T = C.PB->getFirstTerminator(); + DebugLoc DL = (T != C.PB->end()) ? 
T->getDebugLoc() : DebugLoc(); + BuildMI(*C.PB, T, DL, HII->get(TfrI), PredR) + .addImm(0); + } else { + PredR = F->PR.Reg; + } + } + assert(MRI->getRegClass(PredR) == MRI->getRegClass(G.Inp.Reg)); + moveGroup(G, *F->LB, *F->PB, F->LB->getFirstNonPHI(), F->DefR, PredR); + Changed = true; + } + + return Changed; +} + + +bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) { + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + HII = HST.getInstrInfo(); + HRI = HST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); + BitTracker BT(HE, MF); + DEBUG(BT.trace(true)); + BT.run(); + BTP = &BT; + + std::vector<LoopCand> Cand; + + for (auto &B : MF) { + if (B.pred_size() != 2 || B.succ_size() != 2) + continue; + MachineBasicBlock *PB = nullptr; + bool IsLoop = false; + for (auto PI = B.pred_begin(), PE = B.pred_end(); PI != PE; ++PI) { + if (*PI != &B) + PB = *PI; + else + IsLoop = true; + } + if (!IsLoop) + continue; + + MachineBasicBlock *EB = nullptr; + for (auto SI = B.succ_begin(), SE = B.succ_end(); SI != SE; ++SI) { + if (*SI == &B) + continue; + // Set EP to the epilog block, if it has only 1 predecessor (i.e. the + // edge from B to EP is non-critical. + if ((*SI)->pred_size() == 1) + EB = *SI; + break; + } + + Cand.push_back(LoopCand(&B, PB, EB)); + } + + bool Changed = false; + for (auto &C : Cand) + Changed |= processLoop(C); + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonLoopRescheduling() { + return new HexagonLoopRescheduling(); +} + +FunctionPass *llvm::createHexagonBitSimplify() { + return new HexagonBitSimplify(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index 021e58a..d5848dc 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -84,6 +84,8 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub)); switch (ID) { case DoubleRegsRegClassID: + case VecDblRegsRegClassID: + case VecDblRegs128BRegClassID: return (Sub == subreg_loreg) ? BT::BitMask(0, RW-1) : BT::BitMask(RW, 2*RW-1); default: @@ -95,30 +97,29 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { llvm_unreachable("Unexpected register/subregister"); } - namespace { - struct RegisterRefs : public std::vector<BT::RegisterRef> { - typedef std::vector<BT::RegisterRef> Base; - RegisterRefs(const MachineInstr *MI); - const BT::RegisterRef &operator[](unsigned n) const { - // The main purpose of this operator is to assert with bad argument. - assert(n < size()); - return Base::operator[](n); - } - }; +class RegisterRefs { + std::vector<BT::RegisterRef> Vector; - RegisterRefs::RegisterRefs(const MachineInstr *MI) - : Base(MI->getNumOperands()) { - for (unsigned i = 0, n = size(); i < n; ++i) { +public: + RegisterRefs(const MachineInstr *MI) : Vector(MI->getNumOperands()) { + for (unsigned i = 0, n = Vector.size(); i < n; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg()) - at(i) = BT::RegisterRef(MO); + Vector[i] = BT::RegisterRef(MO); // For indices that don't correspond to registers, the entry will // remain constructed via the default constructor. 
} } -} + size_t size() const { return Vector.size(); } + const BT::RegisterRef &operator[](unsigned n) const { + // The main purpose of this operator is to assert with bad argument. + assert(n < Vector.size()); + return Vector[n]; + } +}; +} bool HexagonEvaluator::evaluate(const MachineInstr *MI, const CellMapType &Inputs, CellMapType &Outputs) const { @@ -189,7 +190,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI, return true; }; // Get the cell corresponding to the N-th operand. - auto cop = [this,Reg,MI,Inputs] (unsigned N, uint16_t W) + auto cop = [this,&Reg,&MI,&Inputs] (unsigned N, uint16_t W) -> BT::RegisterCell { const MachineOperand &Op = MI->getOperand(N); if (Op.isImm()) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 3753b745..efafdd0 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -102,7 +102,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. MachineBasicBlock::iterator MII = MBB->getFirstTerminator(); @@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->removeSuccessor(JumpAroundTarget); - MBB->addSuccessor(UncondTarget); + MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->removeSuccessor(UncondTarget); - LayoutSucc->addSuccessor(JumpAroundTarget); + LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). @@ -210,16 +208,15 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // The live-in to LayoutSucc is now all values live-in to // JumpAroundTarget. // - std::vector<unsigned> OrigLiveIn(LayoutSucc->livein_begin(), - LayoutSucc->livein_end()); - std::vector<unsigned> NewLiveIn(JumpAroundTarget->livein_begin(), - JumpAroundTarget->livein_end()); - for (unsigned i = 0; i < OrigLiveIn.size(); ++i) { - LayoutSucc->removeLiveIn(OrigLiveIn[i]); - } - for (unsigned i = 0; i < NewLiveIn.size(); ++i) { - LayoutSucc->addLiveIn(NewLiveIn[i]); - } + std::vector<MachineBasicBlock::RegisterMaskPair> OrigLiveIn( + LayoutSucc->livein_begin(), LayoutSucc->livein_end()); + std::vector<MachineBasicBlock::RegisterMaskPair> NewLiveIn( + JumpAroundTarget->livein_begin(), + JumpAroundTarget->livein_end()); + for (const auto &OrigLI : OrigLiveIn) + LayoutSucc->removeLiveIn(OrigLI.PhysReg); + for (const auto &NewLI : NewLiveIn) + LayoutSucc->addLiveIn(NewLI); } } } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 9f5fac1..931db66 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -59,30 +59,23 @@ namespace { // Numbering map for gep nodes. Used to keep track of ordering for // gep nodes. 
- struct NodeNumbering : public std::map<const GepNode*,unsigned> { - }; - - struct NodeOrdering : public NodeNumbering { + struct NodeOrdering { NodeOrdering() : LastNum(0) {} -#ifdef _MSC_VER - void special_insert_for_special_msvc(const GepNode *N) -#else - using NodeNumbering::insert; - void insert(const GepNode* N) -#endif - { - insert(std::make_pair(N, ++LastNum)); - } - bool operator() (const GepNode* N1, const GepNode *N2) const { - const_iterator F1 = find(N1), F2 = find(N2); - assert(F1 != end() && F2 != end()); + + void insert(const GepNode *N) { Map.insert(std::make_pair(N, ++LastNum)); } + void clear() { Map.clear(); } + + bool operator()(const GepNode *N1, const GepNode *N2) const { + auto F1 = Map.find(N1), F2 = Map.find(N2); + assert(F1 != Map.end() && F2 != Map.end()); return F1->second < F2->second; } + private: + std::map<const GepNode *, unsigned> Map; unsigned LastNum; }; - class HexagonCommonGEP : public FunctionPass { public: static char ID; @@ -360,11 +353,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, Us.insert(&UI.getUse()); } Nodes.push_back(N); -#ifdef _MSC_VER - NodeOrder.special_insert_for_special_msvc(N); -#else NodeOrder.insert(N); -#endif // Skip the first index operand, since we only handle 0. This dereferences // the pointer operand. @@ -379,11 +368,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, Nx->PTy = PtrTy; Nx->Idx = Op; Nodes.push_back(Nx); -#ifdef _MSC_VER - NodeOrder.special_insert_for_special_msvc(Nx); -#else NodeOrder.insert(Nx); -#endif PN = Nx; PtrTy = next_type(PtrTy, Op); @@ -404,7 +389,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, void HexagonCommonGEP::collect() { // Establish depth-first traversal order of the dominator tree. ValueVect BO; - getBlockTraversalOrder(Fn->begin(), BO); + getBlockTraversalOrder(&Fn->front(), BO); // The creation of gep nodes requires DT-traversal. When processing a GEP // instruction that uses another GEP instruction as the base pointer, the @@ -737,7 +722,7 @@ namespace { Instruction *In = cast<Instruction>(V); if (In->getParent() != B) continue; - BasicBlock::iterator It = In; + BasicBlock::iterator It = In->getIterator(); if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd)) FirstUse = It; } @@ -1135,7 +1120,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At, ArrayRef<Value*> A(IdxList, IdxC); Type *InpTy = Input->getType(); Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType(); - NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", At); + NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At); DEBUG(dbgs() << "new GEP: " << *NewInst << '\n'); Input = NewInst; } while (nax <= Num); @@ -1213,7 +1198,7 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) { Last = Child; } while (true); - BasicBlock::iterator InsertAt = LastB->getTerminator(); + BasicBlock::iterator InsertAt = LastB->getTerminator()->getIterator(); if (LastUsed || LastCN > 0) { ValueVect Urs; getAllUsersForNode(Root, Urs, NCM); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp new file mode 100644 index 0000000..ee0c318 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -0,0 +1,1063 @@ +//===--- HexagonEarlyIfConv.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements a Hexagon-specific if-conversion pass that runs on the +// SSA form. +// In SSA it is not straightforward to represent instructions that condi- +// tionally define registers, since a conditionally-defined register may +// only be used under the same condition on which the definition was based. +// To avoid complications of this nature, this patch will only generate +// predicated stores, and speculate other instructions from the "if-conver- +// ted" block. +// The code will recognize CFG patterns where a block with a conditional +// branch "splits" into a "true block" and a "false block". Either of these +// could be omitted (in case of a triangle, for example). +// If after conversion of the side block(s) the CFG allows it, the resul- +// ting blocks may be merged. If the "join" block contained PHI nodes, they +// will be replaced with MUX (or MUX-like) instructions to maintain the +// semantics of the PHI. +// +// Example: +// +// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1 +// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0 +// J2_jumpt %vreg41<kill>, <BB#5>, %PC<imp-def,dead> +// J2_jump <BB#4>, %PC<imp-def,dead> +// Successors according to CFG: BB#4(62) BB#5(62) +// +// BB#4: derived from LLVM BB %if.then +// Predecessors according to CFG: BB#3 +// %vreg11<def> = A2_addp %vreg6, %vreg10 +// S2_storerd_io %vreg32, 16, %vreg11 +// Successors according to CFG: BB#5 +// +// BB#5: derived from LLVM BB %if.end +// Predecessors according to CFG: BB#3 BB#4 +// %vreg12<def> = PHI %vreg6, <BB#3>, %vreg11, <BB#4> +// %vreg13<def> = A2_addp %vreg7, %vreg12 +// %vreg42<def> = C2_cmpeqi %vreg9, 10 +// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead> +// J2_jump <BB#6>, %PC<imp-def,dead> +// Successors according to CFG: BB#6(4) BB#3(124) +// +// would become: +// +// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1 +// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0 +// spec-> %vreg11<def> = A2_addp %vreg6, %vreg10 +// pred-> S2_pstorerdf_io %vreg41, %vreg32, 16, %vreg11 +// %vreg46<def> = MUX64_rr %vreg41, %vreg6, %vreg11 +// %vreg13<def> = A2_addp %vreg7, %vreg46 +// %vreg42<def> = C2_cmpeqi %vreg9, 10 +// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead> +// J2_jump <BB#6>, %PC<imp-def,dead> +// Successors according to CFG: BB#6 BB#3 + +#define DEBUG_TYPE "hexagon-eif" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "HexagonTargetMachine.h" + +#include <functional> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonEarlyIfConversion(); + void initializeHexagonEarlyIfConversionPass(PassRegistry& Registry); +} + +namespace { + cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden, + cl::init(false), cl::desc("Enable branch probability info")); + cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden, + cl::desc("Size limit in Hexagon early if-conversion")); + 
+ struct PrintMB { + PrintMB(const MachineBasicBlock *B) : MB(B) {} + const MachineBasicBlock *MB; + }; + raw_ostream &operator<< (raw_ostream &OS, const PrintMB &P) { + if (!P.MB) + return OS << "<none>"; + return OS << '#' << P.MB->getNumber(); + } + + struct FlowPattern { + FlowPattern() : SplitB(0), TrueB(0), FalseB(0), JoinB(0), PredR(0) {} + FlowPattern(MachineBasicBlock *B, unsigned PR, MachineBasicBlock *TB, + MachineBasicBlock *FB, MachineBasicBlock *JB) + : SplitB(B), TrueB(TB), FalseB(FB), JoinB(JB), PredR(PR) {} + + MachineBasicBlock *SplitB; + MachineBasicBlock *TrueB, *FalseB, *JoinB; + unsigned PredR; + }; + struct PrintFP { + PrintFP(const FlowPattern &P, const TargetRegisterInfo &T) + : FP(P), TRI(T) {} + const FlowPattern &FP; + const TargetRegisterInfo &TRI; + friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P); + }; + raw_ostream &operator<<(raw_ostream &OS, + const PrintFP &P) LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) { + OS << "{ SplitB:" << PrintMB(P.FP.SplitB) + << ", PredR:" << PrintReg(P.FP.PredR, &P.TRI) + << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:" + << PrintMB(P.FP.FalseB) + << ", JoinB:" << PrintMB(P.FP.JoinB) << " }"; + return OS; + } + + class HexagonEarlyIfConversion : public MachineFunctionPass { + public: + static char ID; + HexagonEarlyIfConversion() : MachineFunctionPass(ID), + TII(0), TRI(0), MFN(0), MRI(0), MDT(0), MLI(0) { + initializeHexagonEarlyIfConversionPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon early if conversion"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + typedef DenseSet<MachineBasicBlock*> BlockSetType; + + bool isPreheader(const MachineBasicBlock *B) const; + bool matchFlowPattern(MachineBasicBlock *B, MachineLoop *L, + FlowPattern &FP); + bool visitBlock(MachineBasicBlock *B, MachineLoop *L); + bool visitLoop(MachineLoop *L); + + bool hasEHLabel(const MachineBasicBlock *B) const; + bool hasUncondBranch(const MachineBasicBlock *B) const; + bool isValidCandidate(const MachineBasicBlock *B) const; + bool usesUndefVReg(const MachineInstr *MI) const; + bool isValid(const FlowPattern &FP) const; + unsigned countPredicateDefs(const MachineBasicBlock *B) const; + unsigned computePhiCost(MachineBasicBlock *B) const; + bool isProfitable(const FlowPattern &FP) const; + bool isPredicableStore(const MachineInstr *MI) const; + bool isSafeToSpeculate(const MachineInstr *MI) const; + + unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const; + void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At, + MachineInstr *MI, unsigned PredR, bool IfTrue); + void predicateBlockNB(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineBasicBlock *FromB, + unsigned PredR, bool IfTrue); + + void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP); + void convert(const FlowPattern &FP); + + void removeBlock(MachineBasicBlock *B); + void eliminatePhis(MachineBasicBlock *B); + void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB); + void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB); + void simplifyFlowGraph(const FlowPattern &FP); + + const 
TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineFunction *MFN; + MachineRegisterInfo *MRI; + MachineDominatorTree *MDT; + MachineLoopInfo *MLI; + BlockSetType Deleted; + const MachineBranchProbabilityInfo *MBPI; + }; + + char HexagonEarlyIfConversion::ID = 0; +} + +INITIALIZE_PASS(HexagonEarlyIfConversion, "hexagon-eif", + "Hexagon early if conversion", false, false) + +bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const { + if (B->succ_size() != 1) + return false; + MachineBasicBlock *SB = *B->succ_begin(); + MachineLoop *L = MLI->getLoopFor(SB); + return L && SB == L->getHeader(); +} + + +bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, + MachineLoop *L, FlowPattern &FP) { + DEBUG(dbgs() << "Checking flow pattern at BB#" << B->getNumber() << "\n"); + + // Interested only in conditional branches, no .new, no new-value, etc. + // Check the terminators directly, it's easier than handling all responses + // from AnalyzeBranch. + MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock::const_iterator T1I = B->getFirstTerminator(); + if (T1I == B->end()) + return false; + unsigned Opc = T1I->getOpcode(); + if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf) + return false; + unsigned PredR = T1I->getOperand(0).getReg(); + + // Get the layout successor, or 0 if B does not have one. + MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B)); + MachineBasicBlock *NextB = (NextBI != MFN->end()) ? &*NextBI : 0; + + MachineBasicBlock *T1B = T1I->getOperand(1).getMBB(); + MachineBasicBlock::const_iterator T2I = std::next(T1I); + // The second terminator should be an unconditional branch. + assert(T2I == B->end() || T2I->getOpcode() == Hexagon::J2_jump); + MachineBasicBlock *T2B = (T2I == B->end()) ? NextB + : T2I->getOperand(0).getMBB(); + if (T1B == T2B) { + // XXX merge if T1B == NextB, or convert branch to unconditional. + // mark as diamond with both sides equal? + return false; + } + // Loop could be null for both. + if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L) + return false; + + // Record the true/false blocks in such a way that "true" means "if (PredR)", + // and "false" means "if (!PredR)". + if (Opc == Hexagon::J2_jumpt) + TB = T1B, FB = T2B; + else + TB = T2B, FB = T1B; + + if (!MDT->properlyDominates(B, TB) || !MDT->properlyDominates(B, FB)) + return false; + + // Detect triangle first. In case of a triangle, one of the blocks TB/FB + // can fall through into the other, in other words, it will be executed + // in both cases. We only want to predicate the block that is executed + // conditionally. + unsigned TNP = TB->pred_size(), FNP = FB->pred_size(); + unsigned TNS = TB->succ_size(), FNS = FB->succ_size(); + + // A block is predicable if it has one predecessor (it must be B), and + // it has a single successor. In fact, the block has to end either with + // an unconditional branch (which can be predicated), or with a fall- + // through. + bool TOk = (TNP == 1) && (TNS == 1); + bool FOk = (FNP == 1) && (FNS == 1); + + // If neither is predicable, there is nothing interesting. + if (!TOk && !FOk) + return false; + + MachineBasicBlock *TSB = (TNS > 0) ? *TB->succ_begin() : 0; + MachineBasicBlock *FSB = (FNS > 0) ? *FB->succ_begin() : 0; + MachineBasicBlock *JB = 0; + + if (TOk) { + if (FOk) { + if (TSB == FSB) + JB = TSB; + // Diamond: "if (P) then TB; else FB;". 
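+      // Schematically, the two shapes being matched here and below
+      // (a sketch; JB is the join block):
+      //
+      //        B                 B
+      //       / \               / \
+      //     TB   FB           TB   |
+      //       \ /               \  |
+      //        JB                JB
+      //     (diamond)        (triangle)
+      //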
+ } else { + // TOk && !FOk + if (TSB == FB) { + JB = FB; + FB = 0; + } + } + } else { + // !TOk && FOk (at least one must be true by now). + if (FSB == TB) { + JB = TB; + TB = 0; + } + } + // Don't try to predicate loop preheaders. + if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) { + DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB) + << " is a loop preheader. Skipping.\n"); + return false; + } + + FP = FlowPattern(B, PredR, TB, FB, JB); + DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n"); + return true; +} + + +// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that +// contains EH_LABEL. +bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const { + for (auto &I : *B) + if (I.isEHLabel()) + return true; + return false; +} + + +// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize +// that a block can never fall-through. +bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B) + const { + MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end(); + while (I != E) { + if (I->isBarrier()) + return true; + ++I; + } + return false; +} + + +bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B) + const { + if (!B) + return true; + if (B->isEHPad() || B->hasAddressTaken()) + return false; + if (B->succ_size() == 0) + return false; + + for (auto &MI : *B) { + if (MI.isDebugValue()) + continue; + if (MI.isConditionalBranch()) + return false; + unsigned Opc = MI.getOpcode(); + bool IsJMP = (Opc == Hexagon::J2_jump); + if (!isPredicableStore(&MI) && !IsJMP && !isSafeToSpeculate(&MI)) + return false; + // Look for predicate registers defined by this instruction. It's ok + // to speculate such an instruction, but the predicate register cannot + // be used outside of this block (or else it won't be possible to + // update the use of it after predication). PHI uses will be updated + // to use a result of a MUX, and a MUX cannot be created for predicate + // registers. + for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isDef()) + continue; + unsigned R = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass) + continue; + for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U) + if (U->getParent()->isPHI()) + return false; + } + } + return true; +} + + +bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const { + for (ConstMIOperands MO(MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isUse()) + continue; + unsigned R = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + const MachineInstr *DefI = MRI->getVRegDef(R); + // "Undefined" virtual registers are actually defined via IMPLICIT_DEF. + assert(DefI && "Expecting a reaching def in MRI"); + if (DefI->isImplicitDef()) + return true; + } + return false; +} + + +bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const { + if (hasEHLabel(FP.SplitB)) // KLUDGE: see function definition + return false; + if (FP.TrueB && !isValidCandidate(FP.TrueB)) + return false; + if (FP.FalseB && !isValidCandidate(FP.FalseB)) + return false; + // Check the PHIs in the join block. If any of them use a register + // that is defined as IMPLICIT_DEF, do not convert this. This can + // legitimately happen if one side of the split never executes, but + // the compiler is unable to prove it. 
That side may then seem to
+  // provide an "undef" value to the join block, even though it will never
+  // execute at run time. If we convert this case, the "undef" will
+  // be used in a MUX instruction, and that may look to other optimizations
+  // like an actual use of an undefined value. This could lead
+  // to trouble further down the optimization stream, cause assertions
+  // to fail, etc.
+  if (FP.JoinB) {
+    const MachineBasicBlock &B = *FP.JoinB;
+    for (auto &MI : B) {
+      if (!MI.isPHI())
+        break;
+      if (usesUndefVReg(&MI))
+        return false;
+      unsigned DefR = MI.getOperand(0).getReg();
+      const TargetRegisterClass *RC = MRI->getRegClass(DefR);
+      if (RC == &Hexagon::PredRegsRegClass)
+        return false;
+    }
+  }
+  return true;
+}
+
+
+unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
+  assert(B->pred_size() <= 2);
+  if (B->pred_size() < 2)
+    return 0;
+
+  unsigned Cost = 0;
+  MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI();
+  for (I = B->begin(); I != E; ++I) {
+    const MachineOperand &RO1 = I->getOperand(1);
+    const MachineOperand &RO3 = I->getOperand(3);
+    assert(RO1.isReg() && RO3.isReg());
+    // Must have a MUX if the phi uses a subregister.
+    if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) {
+      Cost++;
+      continue;
+    }
+    MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
+    MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
+    if (!TII->isPredicable(Def1) || !TII->isPredicable(Def3))
+      Cost++;
+  }
+  return Cost;
+}
+
+
+unsigned HexagonEarlyIfConversion::countPredicateDefs(
+      const MachineBasicBlock *B) const {
+  unsigned PredDefs = 0;
+  for (auto &MI : *B) {
+    for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) {
+      if (!MO->isReg() || !MO->isDef())
+        continue;
+      unsigned R = MO->getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(R))
+        continue;
+      if (MRI->getRegClass(R) == &Hexagon::PredRegsRegClass)
+        PredDefs++;
+    }
+  }
+  return PredDefs;
+}
+
+
+bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
+  if (FP.TrueB && FP.FalseB) {
+
+    // Do not if-convert if the branch is one-sided.
+    if (MBPI) {
+      BranchProbability Prob(9, 10);
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob)
+        return false;
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob)
+        return false;
+    }
+
+    // If both sides are predicable, convert them if they join, and the
+    // join block has no other predecessors.
+    MachineBasicBlock *TSB = *FP.TrueB->succ_begin();
+    MachineBasicBlock *FSB = *FP.FalseB->succ_begin();
+    if (TSB != FSB)
+      return false;
+    if (TSB->pred_size() != 2)
+      return false;
+  }
+
+  // Calculate the total size of the predicated blocks.
+  // Assume instruction counts without branches to be the approximation of
+  // the code size. If the predicated blocks are smaller than a packet size,
+  // approximate the spare room in the packet that could be filled with the
+  // predicated/speculated instructions.
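+  // A worked example with illustrative numbers: a Hexagon packet holds up
+  // to HEXAGON_PACKET_SIZE = 4 instructions, so a 2-instruction TrueB and
+  // a 3-instruction FalseB give Spare = (4-2) + (4-3) = 3 and TotalIn = 5,
+  // which stays under the default SizeLimit of 6 plus Spare.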
+  unsigned TS = 0, FS = 0, Spare = 0;
+  if (FP.TrueB) {
+    TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
+    if (TS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-TS;
+  }
+  if (FP.FalseB) {
+    FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
+    if (FS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-FS;
+  }
+  unsigned TotalIn = TS+FS;
+  DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
+               << TotalIn << ", spare room: " << Spare << "\n");
+  if (TotalIn >= SizeLimit+Spare)
+    return false;
+
+  // Count the number of PHI nodes that will need to be updated (converted
+  // to MUX). Those can later be converted to predicated instructions, so
+  // they aren't always adding extra cost.
+  // KLUDGE: Also, count the number of predicate register definitions in
+  // each block. The scheduler may increase the pressure of these and cause
+  // expensive spills (e.g. bitmnp01).
+  unsigned TotalPh = 0;
+  unsigned PredDefs = countPredicateDefs(FP.SplitB);
+  if (FP.JoinB) {
+    TotalPh = computePhiCost(FP.JoinB);
+    PredDefs += countPredicateDefs(FP.JoinB);
+  } else {
+    if (FP.TrueB && FP.TrueB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.TrueB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+    if (FP.FalseB && FP.FalseB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.FalseB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+  }
+  DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
+               << TotalPh << "\n");
+  if (TotalIn+TotalPh >= SizeLimit+Spare)
+    return false;
+
+  DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n");
+  if (PredDefs > 4)
+    return false;
+
+  return true;
+}
+
+
+bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
+      MachineLoop *L) {
+  bool Changed = false;
+
+  // Visit all dominated blocks from the same loop first, then process B.
+  MachineDomTreeNode *N = MDT->getNode(B);
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  // We will change CFG/DT during this traversal, so take precautions to
+  // avoid problems related to invalidated iterators. In fact, processing
+  // a child C of B cannot cause another child to be removed, but it can
+  // cause a new child to be added (which was a child of C before C itself
+  // was removed). Such a new child, however, would already have been
+  // processed as part of C's subtree, so there is no need to process it
+  // again. Simply keep a list of children of B, and traverse that list.
+  typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+  DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+  for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+    MachineBasicBlock *SB = (*I)->getBlock();
+    if (!Deleted.count(SB))
+      Changed |= visitBlock(SB, L);
+  }
+  // When walking down the dominator tree, we want to traverse through
+  // blocks from nested (other) loops, because they can dominate blocks
+  // that are in L. Skip the non-L blocks only after the tree traversal.
+ if (MLI->getLoopFor(B) != L) + return Changed; + + FlowPattern FP; + if (!matchFlowPattern(B, L, FP)) + return Changed; + + if (!isValid(FP)) { + DEBUG(dbgs() << "Conversion is not valid\n"); + return Changed; + } + if (!isProfitable(FP)) { + DEBUG(dbgs() << "Conversion is not profitable\n"); + return Changed; + } + + convert(FP); + simplifyFlowGraph(FP); + return true; +} + + +bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) { + MachineBasicBlock *HB = L ? L->getHeader() : 0; + DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB) + : dbgs() << "Visiting function") << "\n"); + bool Changed = false; + if (L) { + for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) + Changed |= visitLoop(*I); + } + + MachineBasicBlock *EntryB = GraphTraits<MachineFunction*>::getEntryNode(MFN); + Changed |= visitBlock(L ? HB : EntryB, L); + return Changed; +} + + +bool HexagonEarlyIfConversion::isPredicableStore(const MachineInstr *MI) + const { + // Exclude post-increment stores. Those return a value, so we cannot + // predicate them. + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + switch (Opc) { + // Store byte: + case S2_storerb_io: case S4_storerb_rr: + case S2_storerbabs: case S4_storeirb_io: case S2_storerbgp: + // Store halfword: + case S2_storerh_io: case S4_storerh_rr: + case S2_storerhabs: case S4_storeirh_io: case S2_storerhgp: + // Store upper halfword: + case S2_storerf_io: case S4_storerf_rr: + case S2_storerfabs: case S2_storerfgp: + // Store word: + case S2_storeri_io: case S4_storeri_rr: + case S2_storeriabs: case S4_storeiri_io: case S2_storerigp: + // Store doubleword: + case S2_storerd_io: case S4_storerd_rr: + case S2_storerdabs: case S2_storerdgp: + return true; + } + return false; +} + + +bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI) + const { + if (MI->mayLoad() || MI->mayStore()) + return false; + if (MI->isCall() || MI->isBarrier() || MI->isBranch()) + return false; + if (MI->hasUnmodeledSideEffects()) + return false; + + return true; +} + + +unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc, + bool IfTrue) const { + // Exclude post-increment stores. + using namespace Hexagon; + switch (Opc) { + case S2_storerb_io: + return IfTrue ? S2_pstorerbt_io : S2_pstorerbf_io; + case S4_storerb_rr: + return IfTrue ? S4_pstorerbt_rr : S4_pstorerbf_rr; + case S2_storerbabs: + case S2_storerbgp: + return IfTrue ? S4_pstorerbt_abs : S4_pstorerbf_abs; + case S4_storeirb_io: + return IfTrue ? S4_storeirbt_io : S4_storeirbf_io; + case S2_storerh_io: + return IfTrue ? S2_pstorerht_io : S2_pstorerhf_io; + case S4_storerh_rr: + return IfTrue ? S4_pstorerht_rr : S4_pstorerhf_rr; + case S2_storerhabs: + case S2_storerhgp: + return IfTrue ? S4_pstorerht_abs : S4_pstorerhf_abs; + case S2_storerf_io: + return IfTrue ? S2_pstorerft_io : S2_pstorerff_io; + case S4_storerf_rr: + return IfTrue ? S4_pstorerft_rr : S4_pstorerff_rr; + case S2_storerfabs: + case S2_storerfgp: + return IfTrue ? S4_pstorerft_abs : S4_pstorerff_abs; + case S4_storeirh_io: + return IfTrue ? S4_storeirht_io : S4_storeirhf_io; + case S2_storeri_io: + return IfTrue ? S2_pstorerit_io : S2_pstorerif_io; + case S4_storeri_rr: + return IfTrue ? S4_pstorerit_rr : S4_pstorerif_rr; + case S2_storeriabs: + case S2_storerigp: + return IfTrue ? S4_pstorerit_abs : S4_pstorerif_abs; + case S4_storeiri_io: + return IfTrue ? S4_storeirit_io : S4_storeirif_io; + case S2_storerd_io: + return IfTrue ? 
S2_pstorerdt_io : S2_pstorerdf_io; + case S4_storerd_rr: + return IfTrue ? S4_pstorerdt_rr : S4_pstorerdf_rr; + case S2_storerdabs: + case S2_storerdgp: + return IfTrue ? S4_pstorerdt_abs : S4_pstorerdf_abs; + } + llvm_unreachable("Unexpected opcode"); + return 0; +} + + +void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineInstr *MI, + unsigned PredR, bool IfTrue) { + DebugLoc DL; + if (At != ToB->end()) + DL = At->getDebugLoc(); + else if (!ToB->empty()) + DL = ToB->back().getDebugLoc(); + + unsigned Opc = MI->getOpcode(); + + if (isPredicableStore(MI)) { + unsigned COpc = getCondStoreOpcode(Opc, IfTrue); + assert(COpc); + MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, TII->get(COpc)) + .addReg(PredR); + for (MIOperands MO(MI); MO.isValid(); ++MO) + MIB.addOperand(*MO); + + // Set memory references. + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + MIB.setMemRefs(MMOBegin, MMOEnd); + + MI->eraseFromParent(); + return; + } + + if (Opc == Hexagon::J2_jump) { + MachineBasicBlock *TB = MI->getOperand(0).getMBB(); + const MCInstrDesc &D = TII->get(IfTrue ? Hexagon::J2_jumpt + : Hexagon::J2_jumpf); + BuildMI(*ToB, At, DL, D) + .addReg(PredR) + .addMBB(TB); + MI->eraseFromParent(); + return; + } + + // Print the offending instruction unconditionally as we are about to + // abort. + dbgs() << *MI; + llvm_unreachable("Unexpected instruction"); +} + + +// Predicate/speculate non-branch instructions from FromB into block ToB. +// Leave the branches alone, they will be handled later. Btw, at this point +// FromB should have at most one branch, and it should be unconditional. +void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineBasicBlock *FromB, + unsigned PredR, bool IfTrue) { + DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n"); + MachineBasicBlock::iterator End = FromB->getFirstTerminator(); + MachineBasicBlock::iterator I, NextI; + + for (I = FromB->begin(); I != End; I = NextI) { + assert(!I->isPHI()); + NextI = std::next(I); + if (isSafeToSpeculate(&*I)) + ToB->splice(At, FromB, I); + else + predicateInstr(ToB, At, &*I, PredR, IfTrue); + } +} + + +void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, + const FlowPattern &FP) { + // Visit all PHI nodes in the WhereB block and generate MUX instructions + // in the split block. Update the PHI nodes with the values of the MUX. + auto NonPHI = WhereB->getFirstNonPHI(); + for (auto I = WhereB->begin(); I != NonPHI; ++I) { + MachineInstr *PN = &*I; + // Registers and subregisters corresponding to TrueB, FalseB and SplitB. + unsigned TR = 0, TSR = 0, FR = 0, FSR = 0, SR = 0, SSR = 0; + for (int i = PN->getNumOperands()-2; i > 0; i -= 2) { + const MachineOperand &RO = PN->getOperand(i), &BO = PN->getOperand(i+1); + if (BO.getMBB() == FP.SplitB) + SR = RO.getReg(), SSR = RO.getSubReg(); + else if (BO.getMBB() == FP.TrueB) + TR = RO.getReg(), TSR = RO.getSubReg(); + else if (BO.getMBB() == FP.FalseB) + FR = RO.getReg(), FSR = RO.getSubReg(); + else + continue; + PN->RemoveOperand(i+1); + PN->RemoveOperand(i); + } + if (TR == 0) + TR = SR, TSR = SSR; + else if (FR == 0) + FR = SR, FSR = SSR; + assert(TR && FR); + + using namespace Hexagon; + unsigned DR = PN->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(DR); + const MCInstrDesc &D = RC == &IntRegsRegClass ? 
TII->get(C2_mux)
+                                                  : TII->get(MUX64_rr);
+
+    MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
+    DebugLoc DL;
+    if (MuxAt != FP.SplitB->end())
+      DL = MuxAt->getDebugLoc();
+    unsigned MuxR = MRI->createVirtualRegister(RC);
+    BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
+      .addReg(FP.PredR)
+      .addReg(TR, 0, TSR)
+      .addReg(FR, 0, FSR);
+
+    PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+    PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
+  }
+}
+
+
+void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
+  MachineBasicBlock *TSB = 0, *FSB = 0;
+  MachineBasicBlock::iterator OldTI = FP.SplitB->getFirstTerminator();
+  assert(OldTI != FP.SplitB->end());
+  DebugLoc DL = OldTI->getDebugLoc();
+
+  if (FP.TrueB) {
+    TSB = *FP.TrueB->succ_begin();
+    predicateBlockNB(FP.SplitB, OldTI, FP.TrueB, FP.PredR, true);
+  }
+  if (FP.FalseB) {
+    FSB = *FP.FalseB->succ_begin();
+    MachineBasicBlock::iterator At = FP.SplitB->getFirstTerminator();
+    predicateBlockNB(FP.SplitB, At, FP.FalseB, FP.PredR, false);
+  }
+
+  // Regenerate the terminators in the split block and update the successors.
+  // First, remember any information that may be needed later and remove the
+  // existing terminators/successors from the split block.
+  MachineBasicBlock *SSB = 0;
+  FP.SplitB->erase(OldTI, FP.SplitB->end());
+  while (FP.SplitB->succ_size() > 0) {
+    MachineBasicBlock *T = *FP.SplitB->succ_begin();
+    // It's possible that the split block had a successor that is not a
+    // predicated block. This could only happen if there was only one block
+    // to be predicated. Example:
+    //   split_b:
+    //     if (p) jump true_b
+    //     jump unrelated2_b
+    //   unrelated1_b:
+    //     ...
+    //   unrelated2_b:  ; can have other predecessors, so it's not "false_b"
+    //     jump other_b
+    //   true_b:        ; only reachable from split_b, can be predicated
+    //     ...
+    //
+    // Find this successor (SSB) if it exists.
+    if (T != FP.TrueB && T != FP.FalseB) {
+      assert(!SSB);
+      SSB = T;
+    }
+    FP.SplitB->removeSuccessor(FP.SplitB->succ_begin());
+  }
+
+  // Insert new branches and update the successors of the split block. This
+  // may create unconditional branches to the layout successor, etc., but
+  // that will be cleaned up later. For now, make sure that correct code is
+  // generated.
+  if (FP.JoinB) {
+    assert(!SSB || SSB == FP.JoinB);
+    BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump))
+      .addMBB(FP.JoinB);
+    FP.SplitB->addSuccessor(FP.JoinB);
+  } else {
+    bool HasBranch = false;
+    if (TSB) {
+      BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jumpt))
+        .addReg(FP.PredR)
+        .addMBB(TSB);
+      FP.SplitB->addSuccessor(TSB);
+      HasBranch = true;
+    }
+    if (FSB) {
+      const MCInstrDesc &D = HasBranch ? TII->get(Hexagon::J2_jump)
+                                       : TII->get(Hexagon::J2_jumpf);
+      MachineInstrBuilder MIB = BuildMI(*FP.SplitB, FP.SplitB->end(), DL, D);
+      if (!HasBranch)
+        MIB.addReg(FP.PredR);
+      MIB.addMBB(FSB);
+      FP.SplitB->addSuccessor(FSB);
+    }
+    if (SSB) {
+      // This cannot happen if both TSB and FSB are set. [TF]SB are the
+      // successor blocks of the TrueB and FalseB (or null if the TrueB
+      // or FalseB block is null). SSB is the potential successor block
+      // of the SplitB that is neither TrueB nor FalseB.
+      BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump))
+        .addMBB(SSB);
+      FP.SplitB->addSuccessor(SSB);
+    }
+  }
+
+  // What is left to do is to update the PHI nodes that could have entries
+  // referring to predicated blocks.
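+  // For instance, in the example from the file header, the join-block PHI
+  //   %vreg12 = PHI %vreg6, <BB#3>, %vreg11, <BB#4>
+  // is rewritten to take a single MUX result computed in the split block:
+  //   %vreg46 = MUX64_rr %vreg41, %vreg6, %vreg11
+  // (the then-trivial PHI disappears entirely once the blocks are merged).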
+  if (FP.JoinB) {
+    updatePhiNodes(FP.JoinB, FP);
+  } else {
+    if (TSB)
+      updatePhiNodes(TSB, FP);
+    if (FSB)
+      updatePhiNodes(FSB, FP);
+    // Nothing to update in SSB, since SSB's predecessors haven't changed.
+  }
+}
+
+
+void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
+  DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
+
+  // Transfer the immediate dominator information from B to its descendants.
+  MachineDomTreeNode *N = MDT->getNode(B);
+  MachineDomTreeNode *IDN = N->getIDom();
+  if (IDN) {
+    MachineBasicBlock *IDB = IDN->getBlock();
+    typedef GraphTraits<MachineDomTreeNode*> GTN;
+    typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+    DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+    for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+      MachineBasicBlock *SB = (*I)->getBlock();
+      MDT->changeImmediateDominator(SB, IDB);
+    }
+  }
+
+  while (B->succ_size() > 0)
+    B->removeSuccessor(B->succ_begin());
+
+  for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I)
+    (*I)->removeSuccessor(B, true);
+
+  Deleted.insert(B);
+  MDT->eraseNode(B);
+  MFN->erase(B->getIterator());
+}
+
+
+void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
+  DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
+  MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI();
+  for (I = B->begin(); I != NonPHI; I = NextI) {
+    NextI = std::next(I);
+    MachineInstr *PN = &*I;
+    assert(PN->getNumOperands() == 3 && "Invalid phi node");
+    MachineOperand &UO = PN->getOperand(1);
+    unsigned UseR = UO.getReg(), UseSR = UO.getSubReg();
+    unsigned DefR = PN->getOperand(0).getReg();
+    unsigned NewR = UseR;
+    if (UseSR) {
+      // MRI.replaceVregUsesWith does not allow updating the subregister,
+      // so instead of doing the use-iteration here, create a copy into a
+      // "non-subregistered" register.
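+      // Schematically (register names illustrative):
+      //   %newR = COPY %useR:subreg
+      // after which all uses of %defR are redirected to %newR.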
+ DebugLoc DL = PN->getDebugLoc(); + const TargetRegisterClass *RC = MRI->getRegClass(DefR); + NewR = MRI->createVirtualRegister(RC); + NonPHI = BuildMI(*B, NonPHI, DL, TII->get(TargetOpcode::COPY), NewR) + .addReg(UseR, 0, UseSR); + } + MRI->replaceRegWith(DefR, NewR); + B->erase(I); + } +} + + +void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB, + MachineBasicBlock *NewB) { + for (auto I = OldB->succ_begin(), E = OldB->succ_end(); I != E; ++I) { + MachineBasicBlock *SB = *I; + MachineBasicBlock::iterator P, N = SB->getFirstNonPHI(); + for (P = SB->begin(); P != N; ++P) { + MachineInstr *PN = &*P; + for (MIOperands MO(PN); MO.isValid(); ++MO) + if (MO->isMBB() && MO->getMBB() == OldB) + MO->setMBB(NewB); + } + } +} + + +void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB, + MachineBasicBlock *SuccB) { + DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and " + << PrintMB(SuccB) << "\n"); + bool TermOk = hasUncondBranch(SuccB); + eliminatePhis(SuccB); + TII->RemoveBranch(*PredB); + PredB->removeSuccessor(SuccB); + PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end()); + MachineBasicBlock::succ_iterator I, E = SuccB->succ_end(); + for (I = SuccB->succ_begin(); I != E; ++I) + PredB->addSuccessor(*I); + PredB->normalizeSuccProbs(); + replacePhiEdges(SuccB, PredB); + removeBlock(SuccB); + if (!TermOk) + PredB->updateTerminator(); +} + + +void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) { + if (FP.TrueB) + removeBlock(FP.TrueB); + if (FP.FalseB) + removeBlock(FP.FalseB); + + FP.SplitB->updateTerminator(); + if (FP.SplitB->succ_size() != 1) + return; + + MachineBasicBlock *SB = *FP.SplitB->succ_begin(); + if (SB->pred_size() != 1) + return; + + // By now, the split block has only one successor (SB), and SB has only + // one predecessor. We can try to merge them. We will need to update ter- + // minators in FP.Split+SB, and that requires working AnalyzeBranch, which + // fails on Hexagon for blocks that have EH_LABELs. However, if SB ends + // with an unconditional branch, we won't need to touch the terminators. + if (!hasEHLabel(SB) || hasUncondBranch(SB)) + mergeBlocks(FP.SplitB, SB); +} + + +bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) { + auto &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MFN = &MF; + MRI = &MF.getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + MLI = &getAnalysis<MachineLoopInfo>(); + MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() : + nullptr; + + Deleted.clear(); + bool Changed = false; + + for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) + Changed |= visitLoop(*I); + Changed |= visitLoop(0); + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// +FunctionPass *llvm::createHexagonEarlyIfConversion() { + return new HexagonEarlyIfConversion(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index e4c8d8f..6e2dbc0 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -74,7 +74,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks. 
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); ++MII) { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 21a8996..7a52a1c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -147,6 +147,48 @@ static cl::opt<unsigned> ShrinkLimit("shrink-frame-limit", cl::init(UINT_MAX), cl::Hidden, cl::ZeroOrMore, cl::desc("Max count of stack frame " "shrink-wraps")); +static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true), + cl::Hidden, cl::desc("Use allocframe more conservatively")); + + +namespace llvm { + void initializeHexagonCallFrameInformationPass(PassRegistry&); + FunctionPass *createHexagonCallFrameInformation(); +} + +namespace { + class HexagonCallFrameInformation : public MachineFunctionPass { + public: + static char ID; + HexagonCallFrameInformation() : MachineFunctionPass(ID) { + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeHexagonCallFrameInformationPass(PR); + } + bool runOnMachineFunction(MachineFunction &MF) override; + }; + + char HexagonCallFrameInformation::ID = 0; +} + +bool HexagonCallFrameInformation::runOnMachineFunction(MachineFunction &MF) { + auto &HFI = *MF.getSubtarget<HexagonSubtarget>().getFrameLowering(); + bool NeedCFI = MF.getMMI().hasDebugInfo() || + MF.getFunction()->needsUnwindTableEntry(); + + if (!NeedCFI) + return false; + HFI.insertCFIInstructions(MF); + return true; +} + +INITIALIZE_PASS(HexagonCallFrameInformation, "hexagon-cfi", + "Hexagon call frame information", false, false) + +FunctionPass *llvm::createHexagonCallFrameInformation() { + return new HexagonCallFrameInformation(); +} + + namespace { /// Map a register pair Reg to the subregister that has the greater "number", /// i.e. D3 (aka R7:6) will be mapped to R7, etc. @@ -370,11 +412,11 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF, insertEpilogueInBlock(*EpilogB); } else { for (auto &B : MF) - if (!B.empty() && B.back().isReturn()) + if (B.isReturnBlock()) insertCSRRestoresInBlock(B, CSI, HRI); for (auto &B : MF) - if (!B.empty() && B.back().isReturn()) + if (B.isReturnBlock()) insertEpilogueInBlock(B); } } @@ -383,10 +425,7 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF, void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget()); - auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HRI = *HST.getRegisterInfo(); DebugLoc dl; @@ -405,10 +444,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { bool AlignStack = (MaxAlign > getStackAlignment()); - // Check if frame moves are needed for EH. - bool needsFrameMoves = MMI.hasDebugInfo() || - MF.getFunction()->needsUnwindTableEntry(); - // Get the number of bytes to allocate from the FrameInfo. 
unsigned NumBytes = MFI->getStackSize(); unsigned SP = HRI.getStackRegister(); @@ -424,14 +459,7 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { MI->eraseFromParent(); } - // - // Only insert ALLOCFRAME if we need to or at -O0 for the debugger. Think - // that this shouldn't be required, but doing so now because gcc does and - // gdb can't break at the start of the function without it. Will remove if - // this turns out to be a gdb bug. - // - bool NoOpt = (HTM.getOptLevel() == CodeGenOpt::None); - if (!NoOpt && !FuncInfo->hasClobberLR() && !hasFP(MF)) + if (!hasFP(MF)) return; // Check for overflow. @@ -469,92 +497,11 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { .addReg(SP) .addImm(-int64_t(MaxAlign)); } - - if (needsFrameMoves) { - std::vector<MCCFIInstruction> Instructions = MMI.getFrameInstructions(); - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - - // Advance CFA. DW_CFA_def_cfa - unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true); - unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true); - - // CFA = FP + 8 - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa( - FrameLabel, DwFPReg, -8)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // R31 (return addr) = CFA - #4 - CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - FrameLabel, DwRAReg, -4)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // R30 (frame ptr) = CFA - #8) - CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - FrameLabel, DwFPReg, -8)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - unsigned int regsToMove[] = { - Hexagon::R1, Hexagon::R0, Hexagon::R3, Hexagon::R2, - Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18, - Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22, - Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26, - Hexagon::D0, Hexagon::D1, Hexagon::D8, Hexagon::D9, Hexagon::D10, - Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::NoRegister - }; - - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - - for (unsigned i = 0; regsToMove[i] != Hexagon::NoRegister; ++i) { - for (unsigned I = 0, E = CSI.size(); I < E; ++I) { - if (CSI[I].getReg() == regsToMove[i]) { - // Subtract 8 to make room for R30 and R31, which are added above. - int64_t Offset = getFrameIndexOffset(MF, CSI[I].getFrameIdx()) - 8; - - if (regsToMove[i] < Hexagon::D0 || regsToMove[i] > Hexagon::D15) { - unsigned DwarfReg = HRI.getDwarfRegNum(regsToMove[i], true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - DwarfReg, Offset)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else { - // Split the double regs into subregs, and generate appropriate - // cfi_offsets. - // The only reason, we are split double regs is, llvm-mc does not - // understand paired registers for cfi_offset. 
- // Eg .cfi_offset r1:0, -64 - unsigned HiReg = getMax32BitSubRegister(regsToMove[i], HRI); - unsigned LoReg = getMax32BitSubRegister(regsToMove[i], HRI, false); - unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true); - unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true); - unsigned HiCFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - HiDwarfReg, Offset+4)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(HiCFIIndex); - unsigned LoCFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - LoDwarfReg, Offset)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(LoCFIIndex); - } - break; - } - } // for CSI.size() - } // for regsToMove - } // needsFrameMoves } void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); - // - // Only insert deallocframe if we need to. Also at -O0. See comment - // in insertPrologueInBlock above. - // - if (!hasFP(MF) && MF.getTarget().getOptLevel() != CodeGenOpt::None) + if (!hasFP(MF)) return; auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); @@ -630,12 +577,172 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { } +namespace { + bool IsAllocFrame(MachineBasicBlock::const_iterator It) { + if (!It->isBundle()) + return It->getOpcode() == Hexagon::S2_allocframe; + auto End = It->getParent()->instr_end(); + MachineBasicBlock::const_instr_iterator I = It.getInstrIterator(); + while (++I != End && I->isBundled()) + if (I->getOpcode() == Hexagon::S2_allocframe) + return true; + return false; + } + + MachineBasicBlock::iterator FindAllocFrame(MachineBasicBlock &B) { + for (auto &I : B) + if (IsAllocFrame(I)) + return I; + return B.end(); + } +} + + +void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const { + for (auto &B : MF) { + auto AF = FindAllocFrame(B); + if (AF == B.end()) + continue; + insertCFIInstructionsAt(B, ++AF); + } +} + + +void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator At) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HII = *HST.getInstrInfo(); + auto &HRI = *HST.getRegisterInfo(); + + // If CFI instructions have debug information attached, something goes + // wrong with the final assembly generation: the prolog_end is placed + // in a wrong location. + DebugLoc DL; + const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION); + + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); + + if (hasFP(MF)) { + unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true); + unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true); + + // Define CFA via an offset from the value of FP. + // + // -8 -4 0 (SP) + // --+----+----+--------------------- + // | FP | LR | increasing addresses --> + // --+----+----+--------------------- + // | +-- Old SP (before allocframe) + // +-- New FP (after allocframe) + // + // MCCFIInstruction::createDefCfa subtracts the offset from the register. + // MCCFIInstruction::createOffset takes the offset without sign change. 
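+    // The three directives below are thus expected to come out in the
+    // final assembly roughly as (register names per the Hexagon ABI):
+    //   .cfi_def_cfa r30, 8
+    //   .cfi_offset r31, -4
+    //   .cfi_offset r30, -8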
+    auto DefCfa = MCCFIInstruction::createDefCfa(FrameLabel, DwFPReg, -8);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MMI.addFrameInst(DefCfa));
+    // R31 (return addr) = CFA - 4
+    auto OffR31 = MCCFIInstruction::createOffset(FrameLabel, DwRAReg, -4);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MMI.addFrameInst(OffR31));
+    // R30 (frame ptr) = CFA - 8
+    auto OffR30 = MCCFIInstruction::createOffset(FrameLabel, DwFPReg, -8);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MMI.addFrameInst(OffR30));
+  }
+
+  static unsigned int RegsToMove[] = {
+    Hexagon::R1,  Hexagon::R0,  Hexagon::R3,  Hexagon::R2,
+    Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18,
+    Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22,
+    Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26,
+    Hexagon::D0,  Hexagon::D1,  Hexagon::D8,  Hexagon::D9,
+    Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13,
+    Hexagon::NoRegister
+  };
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  for (unsigned i = 0; RegsToMove[i] != Hexagon::NoRegister; ++i) {
+    unsigned Reg = RegsToMove[i];
+    auto IfR = [Reg] (const CalleeSavedInfo &C) -> bool {
+      return C.getReg() == Reg;
+    };
+    auto F = std::find_if(CSI.begin(), CSI.end(), IfR);
+    if (F == CSI.end())
+      continue;
+
+    // Subtract 8 to make room for R30 and R31, which are added above.
+    unsigned FrameReg;
+    int64_t Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg) - 8;
+
+    if (Reg < Hexagon::D0 || Reg > Hexagon::D15) {
+      unsigned DwarfReg = HRI.getDwarfRegNum(Reg, true);
+      auto OffReg = MCCFIInstruction::createOffset(FrameLabel, DwarfReg,
+                                                   Offset);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MMI.addFrameInst(OffReg));
+    } else {
+      // Split the double regs into subregs, and generate appropriate
+      // cfi_offsets.
+      // The only reason we split the double regs is that llvm-mc does not
+      // understand paired registers for cfi_offset.
+      // Eg .cfi_offset r1:0, -64
+
+      unsigned HiReg = HRI.getSubReg(Reg, Hexagon::subreg_hireg);
+      unsigned LoReg = HRI.getSubReg(Reg, Hexagon::subreg_loreg);
+      unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true);
+      unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true);
+      auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg,
+                                                  Offset+4);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MMI.addFrameInst(OffHi));
+      auto OffLo = MCCFIInstruction::createOffset(FrameLabel, LoDwarfReg,
+                                                  Offset);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MMI.addFrameInst(OffLo));
+    }
+  }
+}
+
+
 bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const HexagonMachineFunctionInfo *FuncInfo =
-    MF.getInfo<HexagonMachineFunctionInfo>();
-  return MFI->hasCalls() || MFI->getStackSize() > 0 ||
-         FuncInfo->hasClobberLR();
+  auto &MFI = *MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+  bool HasFixed = MFI.getNumFixedObjects();
+  bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI)
+                        .getLocalFrameObjectCount();
+  bool HasExtraAlign = HRI.needsStackRealignment(MF);
+  bool HasAlloca = MFI.hasVarSizedObjects();
+
+  // Insert ALLOCFRAME if we need to or at -O0 for the debugger. Think
+  // that this shouldn't be required, but doing so now because gcc does and
+  // gdb can't break at the start of the function without it. Will remove if
+  // this turns out to be a gdb bug.
+  //
+  if (MF.getTarget().getOptLevel() == CodeGenOpt::None)
+    return true;
+
+  // By default we want to use SP (since it's always there). FP requires
+  // some setup (i.e.
ALLOCFRAME). + // Fixed and preallocated objects need FP if the distance from them to + // the SP is unknown (as is with alloca or aligna). + if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign)) + return true; + + if (MFI.getStackSize() > 0) { + if (UseAllocframe) + return true; + } + + if (MFI.hasCalls() || + MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR()) + return true; + + return false; } @@ -718,9 +825,89 @@ static void addCalleeSaveRegistersAsImpOperand(MachineInstr *Inst, } -int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - return MF.getFrameInfo()->getObjectOffset(FI); +int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg) const { + auto &MFI = *MF.getFrameInfo(); + auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + + // Large parts of this code are shared with HRI::eliminateFrameIndex. + int Offset = MFI.getObjectOffset(FI); + bool HasAlloca = MFI.hasVarSizedObjects(); + bool HasExtraAlign = HRI.needsStackRealignment(MF); + bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; + + unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); + unsigned AP = 0; + if (const MachineInstr *AI = getAlignaInstr(MF)) + AP = AI->getOperand(0).getReg(); + unsigned FrameSize = MFI.getStackSize(); + + bool UseFP = false, UseAP = false; // Default: use SP (except at -O0). + // Use FP at -O0, except when there are objects with extra alignment. + // That additional alignment requirement may cause a pad to be inserted, + // which will make it impossible to use FP to access objects located + // past the pad. + if (NoOpt && !HasExtraAlign) + UseFP = true; + if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) { + // Fixed and preallocated objects will be located before any padding + // so FP must be used to access them. + UseFP |= (HasAlloca || HasExtraAlign); + } else { + if (HasAlloca) { + if (HasExtraAlign) + UseAP = true; + else + UseFP = true; + } + } + + // If FP was picked, then there had better be FP. + bool HasFP = hasFP(MF); + assert((HasFP || !UseFP) && "This function must have frame pointer"); + + // Having FP implies allocframe. Allocframe will store extra 8 bytes: + // FP/LR. If the base register is used to access an object across these + // 8 bytes, then the offset will need to be adjusted by 8. + // + // After allocframe: + // HexagonISelLowering adds 8 to ---+ + // the offsets of all stack-based | + // arguments (*) | + // | + // getObjectOffset < 0 0 8 getObjectOffset >= 8 + // ------------------------+-----+------------------------> increasing + // <local objects> |FP/LR| <input arguments> addresses + // -----------------+------+-----+------------------------> + // | | + // SP/AP point --+ +-- FP points here (**) + // somewhere on + // this side of FP/LR + // + // (*) See LowerFormalArguments. The FP/LR is assumed to be present. + // (**) *FP == old-FP. FP+0..7 are the bytes of FP/LR. + + // The lowering assumes that FP/LR is present, and so the offsets of + // the formal arguments start at 8. If FP/LR is not there we need to + // reduce the offset by 8. + if (Offset > 0 && !HasFP) + Offset -= 8; + + if (UseFP) + FrameReg = FP; + else if (UseAP) + FrameReg = AP; + else + FrameReg = SP; + + // Calculate the actual offset in the instruction. If there is no FP + // (in other words, no allocframe), then SP will not be adjusted (i.e. 
+ // there will be no SP -= FrameSize), so the frame size should not be + // added to the calculated offset. + int RealOffset = Offset; + if (!UseFP && !UseAP && HasFP) + RealOffset = FrameSize+Offset; + return RealOffset; } @@ -731,7 +918,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI = MBB.begin(); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); if (useSpillFunction(MF, CSI)) { unsigned MaxReg = getMaxCalleeSavedReg(CSI, HRI); @@ -739,7 +926,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, // Call spill function. DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); MachineInstr *SaveRegsCall = - BuildMI(MBB, MI, DL, TII.get(Hexagon::SAVE_REGISTERS_CALL_V4)) + BuildMI(MBB, MI, DL, HII.get(Hexagon::SAVE_REGISTERS_CALL_V4)) .addExternalSymbol(SpillFun); // Add callee-saved registers as use. addCalleeSaveRegistersAsImpOperand(SaveRegsCall, MaxReg, false); @@ -757,7 +944,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = CSI[i].getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); + HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); if (IsKill) MBB.addLiveIn(Reg); } @@ -772,7 +959,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI = MBB.getFirstTerminator(); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); if (useRestoreFunction(MF, CSI)) { bool HasTC = hasTailCall(MBB) || !hasReturn(MBB); @@ -787,14 +974,14 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, if (HasTC) { unsigned ROpc = Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4; - DeallocCall = BuildMI(MBB, MI, DL, TII.get(ROpc)) + DeallocCall = BuildMI(MBB, MI, DL, HII.get(ROpc)) .addExternalSymbol(RestoreFn); } else { // The block has a return. MachineBasicBlock::iterator It = MBB.getFirstTerminator(); assert(It->isReturn() && std::next(It) == MBB.end()); unsigned ROpc = Hexagon::RESTORE_DEALLOC_RET_JMP_V4; - DeallocCall = BuildMI(MBB, It, DL, TII.get(ROpc)) + DeallocCall = BuildMI(MBB, It, DL, HII.get(ROpc)) .addExternalSymbol(RestoreFn); // Transfer the function live-out registers. DeallocCall->copyImplicitOps(MF, It); @@ -807,7 +994,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, unsigned Reg = CSI[i].getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = CSI[i].getFrameIdx(); - TII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); + HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); } return true; } @@ -832,9 +1019,9 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized( // via AP, which may not be available at the particular place in the program. 
MachineFrameInfo *MFI = MF.getFrameInfo(); bool HasAlloca = MFI->hasVarSizedObjects(); - bool HasAligna = (MFI->getMaxAlignment() > getStackAlignment()); + bool NeedsAlign = (MFI->getMaxAlignment() > getStackAlignment()); - if (!HasAlloca || !HasAligna) + if (!HasAlloca || !NeedsAlign) return; unsigned LFS = MFI->getLocalFrameSize(); @@ -864,13 +1051,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF, // Check for an unused caller-saved register. for ( ; *CallerSavedRegs; ++CallerSavedRegs) { MCPhysReg FreeReg = *CallerSavedRegs; - if (MRI.isPhysRegUsed(FreeReg)) + if (!MRI.reg_nodbg_empty(FreeReg)) continue; // Check aliased register usage. bool IsCurrentRegUsed = false; for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI) - if (MRI.isPhysRegUsed(*AI)) { + if (!MRI.reg_nodbg_empty(*AI)) { IsCurrentRegUsed = true; break; } @@ -896,7 +1083,7 @@ bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF) // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. MachineBasicBlock::iterator NextII; for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); @@ -1210,7 +1397,8 @@ bool HexagonFrameLowering::needsAligna(const MachineFunction &MF) const { } -MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const { +const MachineInstr *HexagonFrameLowering::getAlignaInstr( + const MachineFunction &MF) const { for (auto &B : MF) for (auto &I : B) if (I.getOpcode() == Hexagon::ALIGNA) @@ -1219,6 +1407,7 @@ MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const { } +// FIXME: Use Function::optForSize(). 
inline static bool isOptSize(const MachineFunction &MF) { AttributeSet AF = MF.getFunction()->getAttributes(); return AF.hasAttribute(AttributeSet::FunctionIndex, @@ -1226,8 +1415,7 @@ inline static bool isOptSize(const MachineFunction &MF) { } inline static bool isMinSize(const MachineFunction &MF) { - AttributeSet AF = MF.getFunction()->getAttributes(); - return AF.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return MF.getFunction()->optForMinSize(); } @@ -1289,4 +1477,3 @@ bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF, : SpillFuncThreshold; return Threshold < NumCSI; } - diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h index d39ee2c..683b303 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -51,7 +51,8 @@ public: bool targetHandlesStackFrameRounding() const override { return true; } - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; bool hasFP(const MachineFunction &MF) const override; const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) @@ -73,7 +74,9 @@ public: const override; bool needsAligna(const MachineFunction &MF) const; - MachineInstr *getAlignaInstr(MachineFunction &MF) const; + const MachineInstr *getAlignaInstr(const MachineFunction &MF) const; + + void insertCFIInstructions(MachineFunction &MF) const; private: typedef std::vector<CalleeSavedInfo> CSIVect; @@ -86,6 +89,8 @@ private: const HexagonRegisterInfo &HRI) const; bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI, const HexagonRegisterInfo &HRI) const; + void insertCFIInstructionsAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator At) const; void adjustForCalleeSavedRegsSpillCall(MachineFunction &MF) const; bool replacePredRegPseudoSpillCode(MachineFunction &MF) const; @@ -94,7 +99,7 @@ private: void findShrunkPrologEpilog(MachineFunction &MF, MachineBasicBlock *&PrologB, MachineBasicBlock *&EpilogB) const; - bool shouldInlineCSR(llvm::MachineFunction&, const CSIVect&) const; + bool shouldInlineCSR(llvm::MachineFunction &MF, const CSIVect &CSI) const; bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const; bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const; }; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 4d32208..f26e2ff 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -195,7 +195,7 @@ bool HexagonGenExtract::convert(Instruction *In) { return false; } - IRBuilder<> IRB(BB, In); + IRBuilder<> IRB(In); Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Module *Mod = BB->getParent()->getParent(); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 096da94..64a2b6c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -77,9 +77,8 @@ namespace { namespace { // Set of virtual registers, based on BitVector. 
struct RegisterSet : private BitVector { - RegisterSet() : BitVector() {} + RegisterSet() = default; explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} - RegisterSet(const RegisterSet &RS) : BitVector(RS) {} using BitVector::clear; @@ -1496,7 +1495,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { // version of DCE that preserves lifetime markers. Without it, merging // of stack objects can fail to recognize and merge disjoint objects // leading to unnecessary stack growth. - Changed |= removeDeadCode(MDT->getRootNode()); + Changed = removeDeadCode(MDT->getRootNode()); const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); BitTracker BTLoc(HE, MF); @@ -1534,7 +1533,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { } if (IFMap.empty()) - return false; + return Changed; { NamedRegionTimer _T("pruning", "hexinsert", TimingDetail); @@ -1547,7 +1546,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { } if (IFMap.empty()) - return false; + return Changed; { NamedRegionTimer _T("selection", "hexinsert", TimingDetail); @@ -1572,13 +1571,15 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { for (unsigned i = 0, n = Out.size(); i < n; ++i) IFMap.erase(Out[i]); } + if (IFMap.empty()) + return Changed; { NamedRegionTimer _T("generation", "hexinsert", TimingDetail); - Changed = generateInserts(); + generateInserts(); } - return Changed; + return true; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp new file mode 100644 index 0000000..c059d56 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -0,0 +1,319 @@ +//===--- HexagonGenMux.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// During instruction selection, MUX instructions are generated for +// conditional assignments. Since such assignments often present an +// opportunity to predicate instructions, HexagonExpandCondsets +// expands MUXes into pairs of conditional transfers, and then proceeds +// with predication of the producers/consumers of the registers involved. +// This happens after exiting from the SSA form, but before the machine +// instruction scheduler. After the scheduler and after the register +// allocation there can be cases of pairs of conditional transfers +// resulting from a MUX where neither of them was further predicated. If +// these transfers are now placed far enough from the instruction defining +// the predicate register, they cannot use the .new form. In such cases it +// is better to collapse them back to a single MUX instruction. 
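+// For example (a sketch using post-RA physical registers): the pair
+//   r3 = A2_tfrt p0, r1      ; if (p0)  r3 = r1
+//   ...                      ; p0 not redefined in between
+//   r3 = A2_tfrf p0, r2      ; if (!p0) r3 = r2
+// placed far from the definition of p0 can be collapsed back into
+//   r3 = C2_mux p0, r1, r2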
+ +#define DEBUG_TYPE "hexmux" + +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "HexagonTargetMachine.h" + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonGenMux(); + void initializeHexagonGenMuxPass(PassRegistry& Registry); +} + +namespace { + class HexagonGenMux : public MachineFunctionPass { + public: + static char ID; + HexagonGenMux() : MachineFunctionPass(ID), HII(0), HRI(0) { + initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon generate mux instructions"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + + struct CondsetInfo { + unsigned PredR; + unsigned TrueX, FalseX; + CondsetInfo() : PredR(0), TrueX(UINT_MAX), FalseX(UINT_MAX) {} + }; + struct DefUseInfo { + BitVector Defs, Uses; + DefUseInfo() : Defs(), Uses() {} + DefUseInfo(const BitVector &D, const BitVector &U) : Defs(D), Uses(U) {} + }; + struct MuxInfo { + MachineBasicBlock::iterator At; + unsigned DefR, PredR; + MachineOperand *SrcT, *SrcF; + MachineInstr *Def1, *Def2; + MuxInfo(MachineBasicBlock::iterator It, unsigned DR, unsigned PR, + MachineOperand *TOp, MachineOperand *FOp, + MachineInstr *D1, MachineInstr *D2) + : At(It), DefR(DR), PredR(PR), SrcT(TOp), SrcF(FOp), Def1(D1), + Def2(D2) {} + }; + typedef DenseMap<MachineInstr*,unsigned> InstrIndexMap; + typedef DenseMap<unsigned,DefUseInfo> DefUseInfoMap; + typedef SmallVector<MuxInfo,4> MuxInfoList; + + bool isRegPair(unsigned Reg) const { + return Hexagon::DoubleRegsRegClass.contains(Reg); + } + void getSubRegs(unsigned Reg, BitVector &SRs) const; + void expandReg(unsigned Reg, BitVector &Set) const; + void getDefsUses(const MachineInstr *MI, BitVector &Defs, + BitVector &Uses) const; + void buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X, + DefUseInfoMap &DUM); + bool isCondTransfer(unsigned Opc) const; + unsigned getMuxOpcode(const MachineOperand &Src1, + const MachineOperand &Src2) const; + bool genMuxInBlock(MachineBasicBlock &B); + }; + + char HexagonGenMux::ID = 0; +} + +INITIALIZE_PASS(HexagonGenMux, "hexagon-mux", + "Hexagon generate mux instructions", false, false) + + +void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const { + for (MCSubRegIterator I(Reg, HRI); I.isValid(); ++I) + SRs[*I] = true; +} + + +void HexagonGenMux::expandReg(unsigned Reg, BitVector &Set) const { + if (isRegPair(Reg)) + getSubRegs(Reg, Set); + else + Set[Reg] = true; +} + + +void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs, + BitVector &Uses) const { + // First, get the implicit defs and uses for this instruction. + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &D = HII->get(Opc); + if (const MCPhysReg *R = D.ImplicitDefs) + while (*R) + expandReg(*R++, Defs); + if (const MCPhysReg *R = D.ImplicitUses) + while (*R) + expandReg(*R++, Uses); + + // Look over all operands, and collect explicit defs and uses. + for (ConstMIOperands Mo(MI); Mo.isValid(); ++Mo) { + if (!Mo->isReg() || Mo->isImplicit()) + continue; + unsigned R = Mo->getReg(); + BitVector &Set = Mo->isDef() ? 
Defs : Uses; + expandReg(R, Set); + } +} + + +void HexagonGenMux::buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X, + DefUseInfoMap &DUM) { + unsigned Index = 0; + unsigned NR = HRI->getNumRegs(); + BitVector Defs(NR), Uses(NR); + + for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { + MachineInstr *MI = &*I; + I2X.insert(std::make_pair(MI, Index)); + Defs.reset(); + Uses.reset(); + getDefsUses(MI, Defs, Uses); + DUM.insert(std::make_pair(Index, DefUseInfo(Defs, Uses))); + Index++; + } +} + + +bool HexagonGenMux::isCondTransfer(unsigned Opc) const { + switch (Opc) { + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + case Hexagon::C2_cmoveit: + case Hexagon::C2_cmoveif: + return true; + } + return false; +} + + +unsigned HexagonGenMux::getMuxOpcode(const MachineOperand &Src1, + const MachineOperand &Src2) const { + bool IsReg1 = Src1.isReg(), IsReg2 = Src2.isReg(); + if (IsReg1) + return IsReg2 ? Hexagon::C2_mux : Hexagon::C2_muxir; + if (IsReg2) + return Hexagon::C2_muxri; + + // Neither is a register. The first source is extendable, but the second + // is not (s8). + if (Src2.isImm() && isInt<8>(Src2.getImm())) + return Hexagon::C2_muxii; + + return 0; +} + + +bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { + bool Changed = false; + InstrIndexMap I2X; + DefUseInfoMap DUM; + buildMaps(B, I2X, DUM); + + typedef DenseMap<unsigned,CondsetInfo> CondsetMap; + CondsetMap CM; + MuxInfoList ML; + + MachineBasicBlock::iterator NextI, End = B.end(); + for (MachineBasicBlock::iterator I = B.begin(); I != End; I = NextI) { + MachineInstr *MI = &*I; + NextI = std::next(I); + unsigned Opc = MI->getOpcode(); + if (!isCondTransfer(Opc)) + continue; + unsigned DR = MI->getOperand(0).getReg(); + if (isRegPair(DR)) + continue; + + unsigned PR = MI->getOperand(1).getReg(); + unsigned Idx = I2X.lookup(MI); + CondsetMap::iterator F = CM.find(DR); + bool IfTrue = HII->isPredicatedTrue(Opc); + + // If there is no record of a conditional transfer for this register, + // or the predicate register differs, create a new record for it. + if (F != CM.end() && F->second.PredR != PR) { + CM.erase(F); + F = CM.end(); + } + if (F == CM.end()) { + auto It = CM.insert(std::make_pair(DR, CondsetInfo())); + F = It.first; + F->second.PredR = PR; + } + CondsetInfo &CI = F->second; + if (IfTrue) + CI.TrueX = Idx; + else + CI.FalseX = Idx; + if (CI.TrueX == UINT_MAX || CI.FalseX == UINT_MAX) + continue; + + // There is now a complete definition of DR, i.e. we have the predicate + // register, the definition if-true, and definition if-false. + + // First, check if both definitions are far enough from the definition + // of the predicate register. + unsigned MinX = std::min(CI.TrueX, CI.FalseX); + unsigned MaxX = std::max(CI.TrueX, CI.FalseX); + unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0; + bool NearDef = false; + for (unsigned X = SearchX; X < MaxX; ++X) { + const DefUseInfo &DU = DUM.lookup(X); + if (!DU.Defs[PR]) + continue; + NearDef = true; + break; + } + if (NearDef) + continue; + + // The predicate register is not defined in the last few instructions. + // Check if the conversion to MUX is possible (either "up", i.e. at the + // place of the earlier partial definition, or "down", where the later + // definition is located). Examine all defs and uses between these two + // definitions. + // SR1, SR2 - source registers from the first and the second definition. 
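[Editor's note: the loop that follows implements the up/down legality scan described in the comment above. A standalone restatement of the rule, with a hypothetical DU struct in place of the pass's DefUseInfo bitvectors:]

#include <vector>

struct DU { bool DefsPR, DefsDR, UsesDR, DefsSR1, DefsSR2; };

// Between the two partial definitions: any write to the predicate or any
// touch of the destination kills the fold outright; a write to a source
// register only rules out one placement of the mux.
bool canFold(const std::vector<DU> &Between, bool &CanUp, bool &CanDown) {
  CanUp = CanDown = true;
  for (const DU &X : Between) {
    if (X.DefsPR || X.DefsDR || X.UsesDR)
      return false;
    if (X.DefsSR1)
      CanDown = false; // SR1 clobbered before the later ("down") position
    if (X.DefsSR2)
      CanUp = false;   // SR2 not yet available at the earlier ("up") position
  }
  return CanUp || CanDown;
}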
+ MachineBasicBlock::iterator It1 = B.begin(), It2 = B.begin(); + std::advance(It1, MinX); + std::advance(It2, MaxX); + MachineInstr *Def1 = It1, *Def2 = It2; + MachineOperand *Src1 = &Def1->getOperand(2), *Src2 = &Def2->getOperand(2); + unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0; + unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0; + bool Failure = false, CanUp = true, CanDown = true; + for (unsigned X = MinX+1; X < MaxX; X++) { + const DefUseInfo &DU = DUM.lookup(X); + if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) { + Failure = true; + break; + } + if (CanDown && DU.Defs[SR1]) + CanDown = false; + if (CanUp && DU.Defs[SR2]) + CanUp = false; + } + if (Failure || (!CanUp && !CanDown)) + continue; + + MachineOperand *SrcT = (MinX == CI.TrueX) ? Src1 : Src2; + MachineOperand *SrcF = (MinX == CI.FalseX) ? Src1 : Src2; + // Prefer "down", since this will move the MUX farther away from the + // predicate definition. + MachineBasicBlock::iterator At = CanDown ? Def2 : Def1; + ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2)); + } + + for (unsigned I = 0, N = ML.size(); I < N; ++I) { + MuxInfo &MX = ML[I]; + MachineBasicBlock &B = *MX.At->getParent(); + DebugLoc DL = MX.At->getDebugLoc(); + unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); + if (!MxOpc) + continue; + BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) + .addReg(MX.PredR) + .addOperand(*MX.SrcT) + .addOperand(*MX.SrcF); + B.erase(MX.Def1); + B.erase(MX.Def2); + Changed = true; + } + + return Changed; +} + +bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) { + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + bool Changed = false; + for (auto &I : MF) + Changed |= genMuxInBlock(I); + return Changed; +} + +FunctionPass *llvm::createHexagonGenMux() { + return new HexagonGenMux(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 6905c4f..d9675b5 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -250,7 +250,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { unsigned NewPR = MRI->createVirtualRegister(PredRC); // For convertible instructions, do not modify them, so that they can - // be coverted later. Generate a copy from Reg to NewPR. + // be converted later. Generate a copy from Reg to NewPR. if (isConvertibleToPredForm(DefI)) { MachineBasicBlock::iterator DefIt = DefI; BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 53b6bf6..d20a809 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -727,9 +727,9 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, // Phis that may feed into the loop. LoopFeederMap LoopFeederPhi; - // Check if the inital value may be zero and can be decremented in the first + // Check if the initial value may be zero and can be decremented in the first // iteration. If the value is zero, the endloop instruction will not decrement - // the loop counter, so we shoudn't generate a hardware loop in this case. + // the loop counter, so we shouldn't generate a hardware loop in this case. 
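[Editor's note: to see the class of hazard the check below guards against, consider a trip count computed with unsigned arithmetic; the helper and the numbers are illustrative only.]

#include <cstdint>

// If Start can exceed End at runtime, an unsigned (End - Start) count
// underflows and the hardware loop would spin nearly 2^32 times instead
// of zero, which is why the pass refuses to emit one in that case.
uint32_t naiveTripCount(uint32_t Start, uint32_t End) {
  return End - Start; // naiveTripCount(5, 3) == 0xFFFFFFFE, not 0
}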
if (loopCountMayWrapOrUnderFlow(Start, End, Loop->getLoopPreheader(), Loop, LoopFeederPhi)) return nullptr; @@ -1288,14 +1288,14 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, typedef MachineBasicBlock::instr_iterator instr_iterator; // Check if things are in order to begin with. - for (instr_iterator I = BumpI, E = BB->instr_end(); I != E; ++I) + for (instr_iterator I(BumpI), E = BB->instr_end(); I != E; ++I) if (&*I == CmpI) return true; // Out of order. unsigned PredR = CmpI->getOperand(0).getReg(); bool FoundBump = false; - instr_iterator CmpIt = CmpI, NextIt = std::next(CmpIt); + instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt); for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) { MachineInstr *In = &*I; for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) { @@ -1307,9 +1307,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, } if (In == BumpI) { - instr_iterator After = BumpI; - instr_iterator From = CmpI; - BB->splice(std::next(After), BB, From); + BB->splice(++BumpI->getIterator(), BB, CmpI->getIterator()); FoundBump = true; break; } @@ -1440,7 +1438,7 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow( if (Comparison::isSigned(Cmp)) return false; - // Check if there is a comparison of the inital value. If the initial value + // Check if there is a comparison of the initial value. If the initial value // is greater than or not equal to another value, then assume this is a // range check. if ((Cmp & Comparison::G) || Cmp == Comparison::NE) @@ -1850,7 +1848,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( } MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock(); - MF->insert(Header, NewPH); + MF->insert(Header->getIterator(), NewPH); if (Header->pred_size() > 2) { // Ensure that the header has only two predecessors: the preheader and diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 9123057..a0da945 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -50,16 +50,21 @@ namespace { class HexagonDAGToDAGISel : public SelectionDAGISel { const HexagonTargetMachine& HTM; const HexagonSubtarget *HST; + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; public: explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), HTM(tm) { + : SelectionDAGISel(tm, OptLevel), HTM(tm), HST(nullptr), HII(nullptr), + HRI(nullptr) { initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. HST = &MF.getSubtarget<HexagonSubtarget>(); + HII = HST->getInstrInfo(); + HRI = HST->getRegisterInfo(); SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -104,7 +109,6 @@ public: SDNode *SelectConstantFP(SDNode *N); SDNode *SelectAdd(SDNode *N); SDNode *SelectBitOp(SDNode *N); - bool isConstExtProfitable(SDNode *N) const; // XformMskToBitPosU5Imm - Returns the bit position which // the single bit 32 bit mask represents. @@ -139,8 +143,8 @@ public: // type i32 where the negative literal is transformed into a positive literal // for use in -= memops. 
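[Editor's note: for reference, the mapping performed by the helper defined just below, stated as a freestanding sketch with a made-up name.]

#include <cassert>

// m5 (-31..-1) maps to u5 (1..31): "x += -5" is encoded as "memw(...) -= #5".
int xformM5ToU5(int Imm) {
  assert(Imm >= -31 && Imm <= -1 && "Constant out of range for Memops");
  return -Imm; // xformM5ToU5(-5) == 5
}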
inline SDValue XformM5ToU5Imm(signed Imm, SDLoc DL) { - assert( (Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); - return CurDAG->getTargetConstant( - Imm, DL, MVT::i32); + assert((Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); + return CurDAG->getTargetConstant(-Imm, DL, MVT::i32); } // XformU7ToU7M1Imm - Return a target constant decremented by 1, in range @@ -203,11 +207,10 @@ void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) { // Intrinsics that return a predicate. -static unsigned doesIntrinsicReturnPredicate(unsigned ID) -{ +static bool doesIntrinsicReturnPredicate(unsigned ID) { switch (ID) { default: - return 0; + return false; case Intrinsic::hexagon_C2_cmpeq: case Intrinsic::hexagon_C2_cmpgt: case Intrinsic::hexagon_C2_cmpgtu: @@ -244,7 +247,7 @@ static unsigned doesIntrinsicReturnPredicate(unsigned ID) case Intrinsic::hexagon_C2_tfrrp: case Intrinsic::hexagon_S2_tstbit_i: case Intrinsic::hexagon_S2_tstbit_r: - return 1; + return true; } } @@ -258,8 +261,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD, SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConst = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, MVT::Other, Base, TargetConst, @@ -312,8 +314,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD, SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, @@ -378,29 +379,46 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { // loads. ISD::LoadExtType ExtType = LD->getExtensionType(); bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD); + bool HasVecOffset = false; // Figure out the opcode. - const HexagonInstrInfo &TII = *HST->getInstrInfo(); if (LoadedVT == MVT::i64) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadrd_pi; else Opcode = Hexagon::L2_loadrd_io; } else if (LoadedVT == MVT::i32) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadri_pi; else Opcode = Hexagon::L2_loadri_io; } else if (LoadedVT == MVT::i16) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = IsZeroExt ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi; else Opcode = IsZeroExt ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io; } else if (LoadedVT == MVT::i8) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = IsZeroExt ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi; else Opcode = IsZeroExt ?
Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io; + } else if (LoadedVT == MVT::v16i32 || LoadedVT == MVT::v8i64 || + LoadedVT == MVT::v32i16 || LoadedVT == MVT::v64i8) { + HasVecOffset = true; + if (HII->isValidAutoIncImm(LoadedVT, Val)) { + Opcode = Hexagon::V6_vL32b_pi; + } + else + Opcode = Hexagon::V6_vL32b_ai; + // 128B + } else if (LoadedVT == MVT::v32i32 || LoadedVT == MVT::v16i64 || + LoadedVT == MVT::v64i16 || LoadedVT == MVT::v128i8) { + HasVecOffset = true; + if (HII->isValidAutoIncImm(LoadedVT, Val)) { + Opcode = Hexagon::V6_vL32b_pi_128B; + } + else + Opcode = Hexagon::V6_vL32b_ai_128B; } else llvm_unreachable("unknown memory type"); @@ -411,7 +429,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { if (LD->getValueType(0) == MVT::i64 && ExtType == ISD::SEXTLOAD) return SelectIndexedLoadSignExtend64(LD, Opcode, dl); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDNode* Result = CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), @@ -420,15 +438,25 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = LD->getMemOperand(); cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); - const SDValue Froms[] = { SDValue(LD, 0), - SDValue(LD, 1), - SDValue(LD, 2) - }; - const SDValue Tos[] = { SDValue(Result, 0), - SDValue(Result, 1), - SDValue(Result, 2) - }; - ReplaceUses(Froms, Tos, 3); + if (HasVecOffset) { + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Result, 2) + }; + ReplaceUses(Froms, Tos, 2); + } else { + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Result, 1), + SDValue(Result, 2) + }; + ReplaceUses(Froms, Tos, 3); + } return Result; } else { SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); @@ -487,8 +515,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { // Offset value must be within representable range // and must have correct alignment properties. - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(StoredVT, Val)) { + if (HII->isValidAutoIncImm(StoredVT, Val)) { unsigned Opcode = 0; // Figure out the post inc version of opcode. 
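[Editor's note: the opcode choice in the next hunk picks between plain (_io) and post-increment (_pi) store forms. In source terms, the _pi form folds the pointer bump into the access; a minimal illustration, not taken from the patch.]

// One post-increment store (roughly S2_storeri_pi) does the work of a
// store plus an add: the hardware writes *P and advances P together.
void storeAndBump(int *&P, int V) {
  *P++ = V;
}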
@@ -496,7 +523,15 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_pi; else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_pi; else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_pi; - else llvm_unreachable("unknown memory type"); + else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 || + StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) { + Opcode = Hexagon::V6_vS32b_pi; + } + // 128B + else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 || + StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) { + Opcode = Hexagon::V6_vS32b_pi_128B; + } else llvm_unreachable("unknown memory type"); if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) { assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store"); @@ -530,6 +565,13 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io; else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io; else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io; + else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 || + StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) + Opcode = Hexagon::V6_vS32b_ai; + // 128B + else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 || + StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) + Opcode = Hexagon::V6_vS32b_ai_128B; else llvm_unreachable("unknown memory type"); // Build regular store. @@ -1113,14 +1155,12 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { } if (Opc == ISD::AND) { - if (((ValueVT == MVT::i32) && - (!((Val & 0x80000000) || (Val & 0x7fffffff)))) || - ((ValueVT == MVT::i64) && - (!((Val & 0x8000000000000000) || (Val & 0x7fffffff))))) - // If it's simple AND, do the normal op. - return SelectCode(N); - else + // Check if this is a bit-clearing AND, if not select code the usual way. + if ((ValueVT == MVT::i32 && isPowerOf2_32(~Val)) || + (ValueVT == MVT::i64 && isPowerOf2_64(~Val))) Val = ~Val; + else + return SelectCode(N); } // If OR or AND is being fed by shl, srl and, sra don't do this change, @@ -1128,7 +1168,8 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { // Traverse the DAG to see if there is shl, srl and sra. if (Opc == ISD::OR || Opc == ISD::AND) { switch (N->getOperand(0)->getOpcode()) { - default: break; + default: + break; case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -1137,23 +1178,24 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { } // Make sure it's power of 2. - unsigned bitpos = 0; + unsigned BitPos = 0; if (Opc != ISD::FABS && Opc != ISD::FNEG) { - if (((ValueVT == MVT::i32) && !isPowerOf2_32(Val)) || - ((ValueVT == MVT::i64) && !isPowerOf2_64(Val))) + if ((ValueVT == MVT::i32 && !isPowerOf2_32(Val)) || + (ValueVT == MVT::i64 && !isPowerOf2_64(Val))) return SelectCode(N); // Get the bit position. - bitpos = countTrailingZeros(uint64_t(Val)); + BitPos = countTrailingZeros(uint64_t(Val)); } else { // For fabs and fneg, it's always the 31st bit. - bitpos = 31; + BitPos = 31; } unsigned BitOpc = 0; // Set the right opcode for bitwise operations. - switch(Opc) { - default: llvm_unreachable("Only bit-wise/abs/neg operations are allowed."); + switch (Opc) { + default: + llvm_unreachable("Only bit-wise/abs/neg operations are allowed."); case ISD::AND: case ISD::FABS: BitOpc = Hexagon::S2_clrbit_i; @@ -1169,7 +1211,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { SDNode *Result; // Get the right SDVal for the opcode. 
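[Editor's note: the single-bit classification performed above can be tested standalone. This sketch mirrors the isPowerOf2_32(~Val) test for the 32-bit AND case; the helper name is made up.]

#include <cstdint>

// An AND mask whose complement has exactly one set bit clears that bit;
// the trailing-zero count of the complement is the position that the
// S2_clrbit_i form needs.
bool isBitClearingAnd(uint32_t Val, unsigned &BitPos) {
  uint32_t Inv = ~Val;
  if (Inv == 0 || (Inv & (Inv - 1)) != 0)
    return false;                  // complement is not a single bit
  BitPos = __builtin_ctz(Inv);     // e.g. Val == 0xFFFFFFFB -> BitPos == 2
  return true;
}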
- SDValue SDVal = CurDAG->getTargetConstant(bitpos, dl, MVT::i32); + SDValue SDVal = CurDAG->getTargetConstant(BitPos, dl, MVT::i32); if (ValueVT == MVT::i32 || ValueVT == MVT::f32) { Result = CurDAG->getMachineNode(BitOpc, dl, ValueVT, @@ -1198,7 +1240,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { MVT::i32, SDValue(Reg, 0)); // Clear/set/toggle hi or lo registers depending on the bit position. - if (SubValueVT != MVT::f32 && bitpos < 32) { + if (SubValueVT != MVT::f32 && BitPos < 32) { SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, SubregLO, SDVal); const SDValue Ops[] = { RegClass, SubregHI, SubregHiIdx, @@ -1207,7 +1249,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { dl, ValueVT, Ops); } else { if (Opc != ISD::FABS && Opc != ISD::FNEG) - SDVal = CurDAG->getTargetConstant(bitpos - 32, dl, MVT::i32); + SDVal = CurDAG->getTargetConstant(BitPos-32, dl, MVT::i32); SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, SubregHI, SDVal); const SDValue Ops[] = { RegClass, SDValue(Result0, 0), SubregHiIdx, @@ -1328,25 +1370,12 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } -bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const { - unsigned UseCount = 0; - unsigned CallCount = 0; - for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { - // Ignore call instructions. - if (I->getOpcode() == ISD::CopyToReg) - ++CallCount; - UseCount++; - } - - return (UseCount <= 1) || (CallCount > 1); - -} void HexagonDAGToDAGISel::PreprocessISelDAG() { SelectionDAG &DAG = *CurDAG; std::vector<SDNode*> Nodes; - for (auto I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) - Nodes.push_back(I); + for (SDNode &Node : DAG.allnodes()) + Nodes.push_back(&Node); // Simplify: (or (select c x 0) z) -> (select c (or x z) z) // (or (select c 0 y) z) -> (select c z (or y z)) @@ -1397,11 +1426,10 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() { return; MachineFrameInfo *MFI = MF->getFrameInfo(); - MachineBasicBlock *EntryBB = MF->begin(); + MachineBasicBlock *EntryBB = &MF->front(); unsigned AR = FuncInfo->CreateReg(MVT::i32); unsigned MaxA = MFI->getMaxAlignment(); - auto &HII = *HST.getInstrInfo(); - BuildMI(EntryBB, DebugLoc(), HII.get(Hexagon::ALIGNA), AR) + BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::ALIGNA), AR) .addImm(MaxA); MF->getInfo<HexagonMachineFunctionInfo>()->setStackAlignBaseVReg(AR); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index c739afb..0167090 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -41,8 +41,8 @@ using namespace llvm; #define DEBUG_TYPE "hexagon-lowering" -static cl::opt<bool> -EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden, +static cl::opt<bool> EmitJumpTables("hexagon-emit-jump-tables", + cl::init(true), cl::Hidden, cl::desc("Control jump table emission on Hexagon target")); static cl::opt<bool> EnableHexSDNodeSched("enable-hexagon-sdnode-sched", @@ -98,6 +98,9 @@ public: } // Implement calling convention for Hexagon. 
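[Editor's note: the calling-convention handlers declared here and defined in the chunks that follow share one rule for memory-passed vectors: the stack slot's size and its alignment both equal the vector's byte size. Condensed as a hypothetical helper:]

#include <cassert>

// v4i32 -> 16, v8i32 -> 32, v16i32/v512i1 -> 64, v32i32/v1024i1 -> 128,
// v64i32 -> 256; each value is passed to AllocateStack as both size and
// alignment.
unsigned stackSlotBytes(unsigned VectorBits) {
  unsigned Bytes = VectorBits / 8;
  assert(Bytes >= 16 && Bytes <= 256 && (Bytes & (Bytes - 1)) == 0);
  return Bytes;
}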
+ +static bool IsHvxVectorType(MVT ty); + static bool CC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -114,6 +117,11 @@ CC_Hexagon64(unsigned ValNo, MVT ValVT, ISD::ArgFlagsTy ArgFlags, CCState &State); static bool +CC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); @@ -129,6 +137,11 @@ RetCC_Hexagon64(unsigned ValNo, MVT ValVT, ISD::ArgFlagsTy ArgFlags, CCState &State); static bool +RetCC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -169,15 +182,43 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } + if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 || + LocVT == MVT::v16i8) { + ofst = State.AllocateStack(16, 16); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || + LocVT == MVT::v32i8) { + ofst = State.AllocateStack(32, 32); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 || + LocVT == MVT::v64i8 || LocVT == MVT::v512i1) { + ofst = State.AllocateStack(64, 64); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) { + ofst = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 || + LocVT == MVT::v256i8) { + ofst = State.AllocateStack(256, 256); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + llvm_unreachable(nullptr); } -static bool -CC_Hexagon (unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - +static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { if (ArgFlags.isByVal()) { // Passed on stack. unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), @@ -213,6 +254,17 @@ CC_Hexagon (unsigned ValNo, MVT ValVT, return false; } + if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) { + unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + + if (IsHvxVectorType(LocVT)) { + if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } + return true; // CC didn't match. 
} @@ -260,10 +312,82 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } +static bool CC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + static const MCPhysReg VecLstS[] = { Hexagon::V0, Hexagon::V1, + Hexagon::V2, Hexagon::V3, + Hexagon::V4, Hexagon::V5, + Hexagon::V6, Hexagon::V7, + Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, + Hexagon::V12, Hexagon::V13, + Hexagon::V14, Hexagon::V15}; + static const MCPhysReg VecLstD[] = { Hexagon::W0, Hexagon::W1, + Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, + Hexagon::W6, Hexagon::W7}; + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + if ((UseHVX && !UseHVXDbl) && + (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 || + LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) { + if (unsigned Reg = State.AllocateReg(VecLstS)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(64, 64); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + if ((UseHVX && !UseHVXDbl) && + (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8)) { + if (unsigned Reg = State.AllocateReg(VecLstD)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + // 128B Mode + if ((UseHVX && UseHVXDbl) && + (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 || + LocVT == MVT::v256i8)) { + if (unsigned Reg = State.AllocateReg(VecLstD)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(256, 256); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + if ((UseHVX && UseHVXDbl) && + (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) { + if (unsigned Reg = State.AllocateReg(VecLstS)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + return true; +} + static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); if (LocVT == MVT::i1 || LocVT == MVT::i8 || @@ -282,8 +406,24 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) { LocVT = MVT::i64; LocInfo = CCValAssign::BCvt; + } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 || + LocVT == MVT::v16i32 || LocVT == MVT::v8i64 || + LocVT == MVT::v512i1) { + LocVT = MVT::v16i32; + ValVT = MVT::v16i32; + LocInfo = CCValAssign::Full; + } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 || + LocVT == MVT::v32i32 || LocVT == MVT::v16i64 || + (LocVT == MVT::v1024i1 && UseHVX && UseHVXDbl)) { + LocVT = MVT::v32i32; + ValVT = 
MVT::v32i32; + LocInfo = CCValAssign::Full; + } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 || + LocVT == MVT::v64i32 || LocVT == MVT::v32i64) { + LocVT = MVT::v64i32; + ValVT = MVT::v64i32; + LocInfo = CCValAssign::Full; } - if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; @@ -293,7 +433,10 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } - + if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) { + if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } return true; // CC didn't match. } @@ -328,6 +471,52 @@ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } +static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + unsigned OffSiz = 64; + if (LocVT == MVT::v16i32) { + if (unsigned Reg = State.AllocateReg(Hexagon::V0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } else if (LocVT == MVT::v32i32) { + unsigned Req = (UseHVX && UseHVXDbl) ? Hexagon::V0 : Hexagon::W0; + if (unsigned Reg = State.AllocateReg(Req)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + OffSiz = 128; + } else if (LocVT == MVT::v64i32) { + if (unsigned Reg = State.AllocateReg(Hexagon::W0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + OffSiz = 256; + } + + unsigned Offset = State.AllocateStack(OffSiz, OffSiz); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; +} + +void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) { + if (VT != PromotedLdStVT) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + } +} + SDValue HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { @@ -351,6 +540,15 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } +static bool IsHvxVectorType(MVT ty) { + return (ty == MVT::v8i64 || ty == MVT::v16i32 || ty == MVT::v32i16 || + ty == MVT::v64i8 || + ty == MVT::v16i64 || ty == MVT::v32i32 || ty == MVT::v64i16 || + ty == MVT::v128i8 || + ty == MVT::v32i64 || ty == MVT::v64i32 || ty == MVT::v128i16 || + ty == MVT::v256i8 || + ty == MVT::v512i1 || ty == MVT::v1024i1); +} // LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is // passed by value, the function prototype is modified to return void and @@ -463,19 +661,15 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Check for varargs. 
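[Editor's note: the logic just below only trusts a vararg signature when the callee is a known function declaring at least one named parameter. The same decision reduced to its inputs, as a hypothetical helper:]

// -1 means "unknown": either the callee is not a declared function, or it
// looks like a zero-arg vararg, which is disallowed and therefore treated
// as an undeclared function.
int namedVarArgParams(bool KnownFunction, bool IsVarArg, int NumParams) {
  if (KnownFunction && IsVarArg && NumParams != 0)
    return NumParams;
  return -1;
}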
int NumNamedVarArgParams = -1; - if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) - { - const Function* CalleeFn = nullptr; - Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32); - if ((CalleeFn = dyn_cast<Function>(GA->getGlobal()))) - { + if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = GAN->getGlobal(); + Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32); + if (const Function* F = dyn_cast<Function>(GV)) { // If a function has zero args and is a vararg function, that's // disallowed so it must be an undeclared function. Do not assume // varargs if the callee is undefined. - if (CalleeFn->isVarArg() && - CalleeFn->getFunctionType()->getNumParams() != 0) { - NumNamedVarArgParams = CalleeFn->getFunctionType()->getNumParams(); - } + if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0) + NumNamedVarArgParams = F->getFunctionType()->getNumParams(); } } @@ -519,11 +713,16 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT); + bool NeedsArgAlign = false; + unsigned LargestAlignSeen = 0; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Record if we need > 8 byte alignment on an argument. + bool ArgAlign = IsHvxVectorType(VA.getValVT()); + NeedsArgAlign |= ArgAlign; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -549,13 +748,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue MemAddr = DAG.getConstant(LocMemOffset, dl, StackPtr.getValueType()); MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr); + if (ArgAlign) + LargestAlignSeen = std::max(LargestAlignSeen, + VA.getLocVT().getStoreSizeInBits() >> 3); if (Flags.isByVal()) { // The argument is a struct passed by value. According to LLVM, "Arg" // is a pointer. MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain, Flags, DAG, dl)); } else { - MachinePointerInfo LocPI = MachinePointerInfo::getStack(LocMemOffset); + MachinePointerInfo LocPI = MachinePointerInfo::getStack( + DAG.getMachineFunction(), LocMemOffset); SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI, false, false, 0); MemOpChains.push_back(S); @@ -569,6 +772,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } + if (NeedsArgAlign && Subtarget.hasV60TOps()) { + DEBUG(dbgs() << "Function needs byte stack align due to call args\n"); + MachineFrameInfo* MFI = DAG.getMachineFunction().getFrameInfo(); + // V6 vectors passed by value have 64- or 128-byte alignment depending + // on whether we are in 64-byte or 128-byte vector mode. + bool UseHVXDbl = Subtarget.useHVXDblOps(); + assert(Subtarget.useHVXOps()); + const unsigned ObjAlign = UseHVXDbl ? 128 : 64; + LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign); + MFI->ensureMaxAlignment(LargestAlignSeen); + } // Transform all store nodes into one single node because all store // nodes are independent of each other.
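[Editor's note: returning to the alignment bookkeeping added above, the final frame alignment is the larger of the biggest HVX argument seen, in bytes, and the mode's base vector alignment. The same arithmetic in isolation, under those assumptions:]

#include <algorithm>

// getStoreSizeInBits() >> 3 converts bits to bytes; the floor is 64 bytes
// in single-vector mode and 128 bytes in 128B (double) mode.
unsigned frameAlignForCall(unsigned LargestArgBits, bool UseHVXDbl) {
  unsigned LargestBytes = LargestArgBits >> 3;
  const unsigned ObjAlign = UseHVXDbl ? 128u : 64u;
  return std::max(LargestBytes, ObjAlign);
}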
if (!MemOpChains.empty()) @@ -613,12 +827,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. - if (flag_aligned_memcpy) { - const char *MemcpyName = - "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; - Callee = DAG.getTargetExternalSymbol(MemcpyName, PtrVT); - flag_aligned_memcpy = false; - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { @@ -668,7 +877,19 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT, if (Ptr->getOpcode() != ISD::ADD) return false; - if (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { + auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget()); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + bool ValidHVXDblType = + (UseHVX && UseHVXDbl) && (VT == MVT::v32i32 || VT == MVT::v16i64 || + VT == MVT::v64i16 || VT == MVT::v128i8); + bool ValidHVXType = + UseHVX && !UseHVXDbl && (VT == MVT::v16i32 || VT == MVT::v8i64 || + VT == MVT::v32i16 || VT == MVT::v64i8); + + if (ValidHVXDblType || ValidHVXType || + VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { isInc = (Ptr->getOpcode() == ISD::ADD); Base = Ptr->getOperand(0); Offset = Ptr->getOperand(1); @@ -679,23 +900,6 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT, return false; } -// TODO: Put this function along with the other isS* functions in -// HexagonISelDAGToDAG.cpp into a common file. Or better still, use the -// functions defined in HexagonOperands.td. -static bool Is_PostInc_S4_Offset(SDNode * S, int ShiftAmount) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // immS4 predicate - True if the immediate fits in a 4-bit sign extended. - // field. - int64_t v = (int64_t)N->getSExtValue(); - int64_t m = 0; - if (ShiftAmount > 0) { - m = v % ShiftAmount; - v = v >> ShiftAmount; - } - return (v <= 7) && (v >= -8) && (m == 0); -} - /// getPostIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if this node can be /// combined with a load / store to form a post-indexed load / store. @@ -724,18 +928,20 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isInc = false; bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); - // ShiftAmount = number of left-shifted bits in the Hexagon instruction. - int ShiftAmount = VT.getSizeInBits() / 16; - if (isLegal && Is_PostInc_S4_Offset(Offset.getNode(), ShiftAmount)) { - AM = isInc ? ISD::POST_INC : ISD::POST_DEC; - return true; + if (isLegal) { + auto &HII = *Subtarget.getInstrInfo(); + int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue(); + if (HII.isValidAutoIncImm(VT, OffsetVal)) { + AM = isInc ? 
ISD::POST_INC : ISD::POST_DEC; + return true; + } } return false; } -SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, - SelectionDAG &DAG) const { +SDValue +HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); MachineFunction &MF = DAG.getMachineFunction(); auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>(); @@ -784,47 +990,6 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, return Op; } - -// -// Taken from the XCore backend. -// -SDValue HexagonTargetLowering:: -LowerBR_JT(SDValue Op, SelectionDAG &DAG) const -{ - SDValue Chain = Op.getOperand(0); - SDValue Table = Op.getOperand(1); - SDValue Index = Op.getOperand(2); - SDLoc dl(Op); - JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); - unsigned JTI = JT->getIndex(); - MachineFunction &MF = DAG.getMachineFunction(); - const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); - SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32); - - // Mark all jump table targets as address taken. - const std::vector<MachineJumpTableEntry> &JTE = MJTI->getJumpTables(); - const std::vector<MachineBasicBlock*> &JTBBs = JTE[JTI].MBBs; - for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { - MachineBasicBlock *MBB = JTBBs[i]; - MBB->setHasAddressTaken(); - // This line is needed to set the hasAddressTaken flag on the BasicBlock - // object. - BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock())); - } - - SDValue JumpTableBase = DAG.getNode( - HexagonISD::JT, dl, getPointerTy(DAG.getDataLayout()), TargetJT); - SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, - DAG.getConstant(2, dl, MVT::i32)); - SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase, - ShiftIndex); - SDValue LoadTarget = DAG.getLoad(MVT::i32, dl, Chain, JTAddress, - MachinePointerInfo(), false, false, false, - 0); - return DAG.getNode(HexagonISD::BR_JT, dl, MVT::Other, Chain, LoadTarget); -} - - SDValue HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { @@ -850,7 +1015,10 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue AC = DAG.getConstant(A, dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); - return DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC); + SDValue AA = DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC); + if (Op.getNode()->getHasDebugValue()) + DAG.TransferDbgValues(Op, AA); + return AA; } SDValue @@ -882,7 +1050,8 @@ const { // equal to) 8 bytes. If not, no address will be passed into the callee and // the callee returns the result directly through R0/R1.
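[Editor's note: the 8-byte threshold in that comment corresponds, at the C level, to which aggregates fit the R1:R0 return pair; a concrete pair of cases, illustrative only.]

// 8 bytes or less comes back directly in R1:R0; anything larger is
// returned through a hidden sret pointer supplied by the caller.
struct Small { int x, y; };          // 8 bytes: returned in R1:R0
struct Big   { long long a, b; };    // 16 bytes: returned via sret

Small makeSmall() { return {1, 2}; } // no hidden argument
Big   makeBig()   { return {1, 2}; } // caller passes the result address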
- SmallVector<SDValue, 4> MemOps; + SmallVector<SDValue, 8> MemOps; + bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -908,6 +1077,42 @@ const { RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + + // Single Vector + } else if ((RegVT == MVT::v8i64 || RegVT == MVT::v16i32 || + RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VectorRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (UseHVX && UseHVXDbl && + ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 || + RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VectorRegs128BRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + + // Double Vector + } else if ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 || + RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecDblRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (UseHVX && UseHVXDbl && + ((RegVT == MVT::v32i64 || RegVT == MVT::v64i32 || + RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecDblRegs128BRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) { + assert(0 && "need to support VecPred regs"); + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecPredRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } else { assert (0); } @@ -1056,8 +1261,8 @@ SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) - const { +SDValue +HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue PredOp = Op.getOperand(0); SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2); EVT OpVT = Op1.getValueType(); @@ -1163,16 +1368,33 @@ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); - SDLoc dl(Op); - ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - SDValue Res; - if (CP->isMachineConstantPoolEntry()) - Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy, - CP->getAlignment()); + ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op); + unsigned Align = CPN->getAlignment(); + Reloc::Model RM = HTM.getRelocationModel(); + unsigned char TF = (RM == Reloc::PIC_) ? 
HexagonII::MO_PCREL : 0; + + SDValue T; + if (CPN->isMachineConstantPoolEntry()) + T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align, TF); else - Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy, - CP->getAlignment()); - return DAG.getNode(HexagonISD::CP, dl, ValTy, Res); + T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, TF); + if (RM == Reloc::PIC_) + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), ValTy, T); + return DAG.getNode(HexagonISD::CP, SDLoc(Op), ValTy, T); +} + +SDValue +HexagonTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + int Idx = cast<JumpTableSDNode>(Op)->getIndex(); + Reloc::Model RM = HTM.getRelocationModel(); + if (RM == Reloc::PIC_) { + SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), VT, T); + } + + SDValue T = DAG.getTargetJumpTable(Idx, VT); + return DAG.getNode(HexagonISD::JT, SDLoc(Op), VT, T); } SDValue @@ -1219,52 +1441,70 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } -SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG& DAG) const { +SDValue +HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const { SDLoc dl(Op); return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0)); } -SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, - SelectionDAG &DAG) const { - SDValue Result; - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); +SDValue +HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + auto *GAN = cast<GlobalAddressSDNode>(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + auto *GV = GAN->getGlobal(); + int64_t Offset = GAN->getOffset(); + + auto &HLOF = *HTM.getObjFileLowering(); + Reloc::Model RM = HTM.getRelocationModel(); - const HexagonTargetObjectFile *TLOF = - static_cast<const HexagonTargetObjectFile *>( - getTargetMachine().getObjFileLowering()); - if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) { - return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Result); + if (RM == Reloc::Static) { + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + if (HLOF.IsGlobalInSmallSection(GV, HTM)) + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA); + return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA); } - return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Result); + bool UsePCRel = GV->hasInternalLinkage() || GV->hasHiddenVisibility() || + (GV->hasLocalLinkage() && !isa<Function>(GV)); + if (UsePCRel) { + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset, + HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, GA); + } + + // Use GOT index. + SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, HexagonII::MO_GOT); + SDValue Off = DAG.getConstant(Offset, dl, MVT::i32); + return DAG.getNode(HexagonISD::AT_GOT, dl, PtrVT, GOT, GA, Off); } // Specifies that for loads and stores VT can be promoted to PromotedLdStVT. 
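[Editor's note: promoteLdStType, whose old definition the next chunk removes in favor of the earlier one, makes loads and stores of a small vector type reuse the wider type's memory instructions. The effect, approximated at the source level in a standalone sketch:]

#include <cstdint>
#include <cstring>

// A v4i8 load promoted to i32: one word load plus a reinterpretation,
// rather than four separate byte loads.
void loadV4i8(const void *P, uint8_t Out[4]) {
  uint32_t W;
  std::memcpy(&W, P, sizeof(W));   // the promoted i32 load
  std::memcpy(Out, &W, sizeof(W)); // bitcast back to the vector elements
}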
-void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) { - if (VT != PromotedLdStVT) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), - PromotedLdStVT.getSimpleVT()); +SDValue +HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + SDLoc dl(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), - PromotedLdStVT.getSimpleVT()); + Reloc::Model RM = HTM.getRelocationModel(); + if (RM == Reloc::Static) { + SDValue A = DAG.getTargetBlockAddress(BA, PtrVT); + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, A); } + + SDValue A = DAG.getTargetBlockAddress(BA, PtrVT, 0, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, A); } SDValue -HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32); - SDLoc dl(Op); - return DAG.getNode(HexagonISD::CONST32_GP, dl, - getPointerTy(DAG.getDataLayout()), BA_SD); +HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) + const { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue GOTSym = DAG.getTargetExternalSymbol(HEXAGON_GOT_SYM_NAME, PtrVT, + HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), PtrVT, GOTSym); } //===----------------------------------------------------------------------===// @@ -1272,18 +1512,19 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { //===----------------------------------------------------------------------===// HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, - const HexagonSubtarget &STI) + const HexagonSubtarget &ST) : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)), - Subtarget(STI) { + Subtarget(ST) { bool IsV4 = !Subtarget.hasV5TOps(); auto &HRI = *Subtarget.getRegisterInfo(); + bool UseHVX = Subtarget.useHVXOps(); + bool UseHVXSgl = Subtarget.useHVXSglOps(); + bool UseHVXDbl = Subtarget.useHVXDblOps(); setPrefLoopAlignment(4); setPrefFunctionAlignment(4); setMinFunctionAlignment(2); setInsertFencesForAtomic(false); - setExceptionPointerRegister(Hexagon::R0); - setExceptionSelectorRegister(Hexagon::R1); setStackPointerRegisterToSaveRestore(HRI.getStackRegister()); if (EnableHexSDNodeSched) @@ -1320,6 +1561,31 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); } + if (Subtarget.hasV60TOps()) { + if (Subtarget.useHVXSglOps()) { + addRegisterClass(MVT::v64i8, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v32i16, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v16i32, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v8i64, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v128i8, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v64i16, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v32i32, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v16i64, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v512i1, &Hexagon::VecPredRegsRegClass); + } else if (Subtarget.useHVXDblOps()) { + addRegisterClass(MVT::v128i8, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v64i16, 
&Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v32i32, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v16i64, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v256i8, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v128i16, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v64i32, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v32i64, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v1024i1, &Hexagon::VecPredRegs128BRegClass); + } + + } + // // Handling of scalar operations. // @@ -1336,10 +1602,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ConstantFP, MVT::f64, Legal); // Default: expand setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction(ISD::INLINEASM, MVT::Other, Custom); setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); + setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Custom legalize GlobalAddress nodes into CONST32. @@ -1361,11 +1629,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); if (EmitJumpTables) - setOperationAction(ISD::BR_JT, MVT::Other, Custom); + setMinimumJumpTableEntries(2); else - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - // Increase jump tables cutover to 5, was 4. - setMinimumJumpTableEntries(MinimumJumpTables); + setMinimumJumpTableEntries(MinimumJumpTables); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); // Hexagon has instructions for add/sub with carry. The problem with // modeling these instructions is that they produce 2 results: Rdd and Px. @@ -1420,9 +1687,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::i64, Expand); for (unsigned IntExpOp : - {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, - ISD::ROTL, ISD::ROTR, ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, - ISD::SRL_PARTS, ISD::SMUL_LOHI, ISD::UMUL_LOHI}) { + { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, + ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR, + ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS, + ISD::SMUL_LOHI, ISD::UMUL_LOHI }) { setOperationAction(IntExpOp, MVT::i32, Expand); setOperationAction(IntExpOp, MVT::i64, Expand); } @@ -1475,7 +1743,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. 
- static unsigned VectExpOps[] = { + static const unsigned VectExpOps[] = { // Integer arithmetic: ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC, @@ -1539,7 +1807,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v2i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); - + if (UseHVX) { + if (UseHVXSgl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom); + } else if (UseHVXDbl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom); + } else { + llvm_unreachable("Unrecognized HVX mode"); + } + } // Subtarget-specific operation actions. // if (Subtarget.hasV5TOps()) { @@ -1586,7 +1868,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, for (ISD::CondCode FPExpCCV4 : {ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE, - ISD::SETUO, ISD::SETO}) { + ISD::SETUO, ISD::SETO}) { setCondCodeAction(FPExpCCV4, MVT::f32, Expand); setCondCodeAction(FPExpCCV4, MVT::f64, Expand); } @@ -1599,6 +1881,13 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setIndexedStoreAction(ISD::POST_INC, LSXTy, Legal); } + if (UseHVXDbl) { + for (MVT VT : {MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64}) { + setIndexedLoadAction(ISD::POST_INC, VT, Legal); + setIndexedStoreAction(ISD::POST_INC, VT, Legal); + } + } + computeRegisterProperties(&HRI); // @@ -1720,7 +2009,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT"; case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL"; case HexagonISD::BARRIER: return "HexagonISD::BARRIER"; - case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; case HexagonISD::CALLR: return "HexagonISD::CALLR"; case HexagonISD::CALLv3nr: return "HexagonISD::CALLv3nr"; case HexagonISD::CALLv3: return "HexagonISD::CALLv3"; @@ -1737,7 +2025,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP"; case HexagonISD::JT: return "HexagonISD::JT"; case HexagonISD::PACKHL: return "HexagonISD::PACKHL"; - case HexagonISD::PIC_ADD: return "HexagonISD::PIC_ADD"; case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT"; case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB"; @@ -1754,6 +2041,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ"; case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT"; case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU"; + case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; case HexagonISD::VSHLH: return "HexagonISD::VSHLH"; case HexagonISD::VSHLW: return "HexagonISD::VSHLW"; case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB"; @@ -1923,8 +2211,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Size = VT.getSizeInBits(); - // A vector larger than 64 
bits cannot be represented in Hexagon. - // Expand will split the vector. + // Only handle vectors of 64 bits or shorter. if (Size > 64) return SDValue(); @@ -2058,58 +2345,61 @@ SDValue HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + bool UseHVX = Subtarget.useHVXOps(); EVT VT = Op.getValueType(); unsigned NElts = Op.getNumOperands(); - SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); - SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), dl, MVT::i64); - SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width, - DAG.getConstant(32, dl, MVT::i64)); - SDValue ConstVal = DAG.getConstant(0, dl, MVT::i64); - - ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width); - ConstantSDNode *S = dyn_cast<ConstantSDNode>(Shifted); - - if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v2i16 to a single v4i16. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); + SDValue Vec0 = Op.getOperand(0); + EVT VecVT = Vec0.getValueType(); + unsigned Width = VecVT.getSizeInBits(); + + if (NElts == 2) { + MVT ST = VecVT.getSimpleVT(); + // We are trying to concat two v2i16 to a single v4i16, or two v4i8 + // into a single v8i8. + if (ST == MVT::v2i16 || ST == MVT::v4i8) + return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0); + + if (UseHVX) { + assert((Width == 64*8 && Subtarget.useHVXSglOps()) || + (Width == 128*8 && Subtarget.useHVXDblOps())); + SDValue Vec1 = Op.getOperand(1); + MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32; + MVT ReTy = Subtarget.useHVXSglOps() ? MVT::v32i32 : MVT::v64i32; + SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0); + SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1); + SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0); + return DAG.getNode(ISD::BITCAST, dl, VT, VC); } } - if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v4i8 to a single v8i8. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); - } - } + if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64) + return SDValue(); + + SDValue C0 = DAG.getConstant(0, dl, MVT::i64); + SDValue C32 = DAG.getConstant(32, dl, MVT::i64); + SDValue W = DAG.getConstant(Width, dl, MVT::i64); + // Create the "width" part of the argument to insert_rp/insertp_rp. 
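+  // For example, concatenating four 16-bit pieces into a 64-bit result
+  // uses Width = 16, so element N is inserted with the control word
+  // (16 << 32) | (N * 16): the insert width in the upper 32 bits and
+  // the bit offset in the lower 32 bits.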
+ SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32); + SDValue V = C0; for (unsigned i = 0, e = NElts; i != e; ++i) { - unsigned OpIdx = NElts - i - 1; - SDValue Operand = Op.getOperand(OpIdx); + unsigned N = NElts-i-1; + SDValue OpN = Op.getOperand(N); - if (VT.getSizeInBits() == 64 && - Operand.getValueType().getSizeInBits() == 32) { + if (VT.getSizeInBits() == 64 && OpN.getValueType().getSizeInBits() == 32) { SDValue C = DAG.getConstant(0, dl, MVT::i32); - Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand); + OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN); } - - SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64); - SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width); - SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); - const SDValue Ops[] = {ConstVal, Operand, Combined}; - + SDValue Idx = DAG.getConstant(N, dl, MVT::i64); + SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset); if (VT.getSizeInBits() == 32) - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or}); else - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or}); } - return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal); + return DAG.getNode(ISD::BITCAST, dl, VT, V); } SDValue @@ -2301,6 +2591,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SHL: case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); // Frame & Return address. Currently unimplemented. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -2308,8 +2599,8 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::BR_JT: return LowerBR_JT(Op, DAG); // Custom lower some vector loads. case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); @@ -2321,6 +2612,16 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +/// Returns relocation base for the given PIC jumptable. 
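+/// For Hexagon PIC this is the address of the table itself, computed
+/// PC-relatively, i.e. HexagonISD::AT_PCREL of the target jump table.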
+SDValue +HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const { + int Idx = cast<JumpTableSDNode>(Table)->getIndex(); + EVT VT = Table.getValueType(); + SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Table), VT, T); +} + MachineBasicBlock * HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) @@ -2343,6 +2644,8 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, std::pair<unsigned, const TargetRegisterClass *> HexagonTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); + if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': // R0-R31 @@ -2358,6 +2661,42 @@ HexagonTargetLowering::getRegForInlineAsmConstraint( case MVT::f64: return std::make_pair(0U, &Hexagon::DoubleRegsRegClass); } + case 'q': // q0-q3 + switch (VT.SimpleTy) { + default: + llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type"); + case MVT::v1024i1: + case MVT::v512i1: + case MVT::v32i16: + case MVT::v16i32: + case MVT::v64i8: + case MVT::v8i64: + return std::make_pair(0U, &Hexagon::VecPredRegsRegClass); + } + case 'v': // V0-V31 + switch (VT.SimpleTy) { + default: + llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type"); + case MVT::v16i32: + case MVT::v32i16: + case MVT::v64i8: + case MVT::v8i64: + return std::make_pair(0U, &Hexagon::VectorRegsRegClass); + case MVT::v32i32: + case MVT::v64i16: + case MVT::v16i64: + case MVT::v128i8: + if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl) + return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass); + else + return std::make_pair(0U, &Hexagon::VecDblRegsRegClass); + case MVT::v256i8: + case MVT::v128i16: + case MVT::v64i32: + case MVT::v32i64: + return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass); + } + default: llvm_unreachable("Unknown asm register class"); } @@ -2397,6 +2736,14 @@ bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; } +/// Return true if folding a constant offset with the given GlobalAddress is +/// legal. It is frequently not legal in PIC relocation models. +bool HexagonTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) + const { + return HTM.getRelocationModel() == Reloc::Static; +} + + /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can compare /// a register against the immediate without having to materialize the @@ -2428,8 +2775,8 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization( // *************************************************************************** // If this is a tail call via a function pointer, then don't do it! 
- if (!(dyn_cast<GlobalAddressSDNode>(Callee)) - && !(dyn_cast<ExternalSymbolSDNode>(Callee))) { + if (!(isa<GlobalAddressSDNode>(Callee)) && + !(isa<ExternalSymbolSDNode>(Callee))) { return false; } @@ -2467,6 +2814,41 @@ bool llvm::isPositiveHalfWord(SDNode *N) { } } +std::pair<const TargetRegisterClass*, uint8_t> +HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { + const TargetRegisterClass *RRC = nullptr; + + uint8_t Cost = 1; + switch (VT.SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(TRI, VT); + case MVT::v64i8: + case MVT::v32i16: + case MVT::v16i32: + case MVT::v8i64: + RRC = &Hexagon::VectorRegsRegClass; + break; + case MVT::v128i8: + case MVT::v64i16: + case MVT::v32i32: + case MVT::v16i64: + if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() && + Subtarget.useHVXDblOps()) + RRC = &Hexagon::VectorRegs128BRegClass; + else + RRC = &Hexagon::VecDblRegsRegClass; + break; + case MVT::v256i8: + case MVT::v128i16: + case MVT::v64i32: + case MVT::v32i64: + RRC = &Hexagon::VecDblRegs128BRegClass; + break; + } + return std::make_pair(RRC, Cost); +} + Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { BasicBlock *BB = Builder.GetInsertBlock(); @@ -2498,13 +2880,15 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder, return Ext; } -bool HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // Do not expand loads and stores that don't exceed 64 bits. - return LI->getType()->getPrimitiveSizeInBits() > 64; + return LI->getType()->getPrimitiveSizeInBits() > 64 + ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Do not expand loads and stores that don't exceed 64 bits. return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64; } - diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 2642abf..bf378b9 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -35,16 +35,14 @@ bool isPositiveHalfWord(SDNode *N); ALLOCA, ARGEXTEND, - PIC_ADD, - AT_GOT, - AT_PCREL, + AT_GOT, // Index in GOT. + AT_PCREL, // Offset relative to PC. CALLv3, // A V3+ call instruction. CALLv3nr, // A V3+ call instruction that doesn't return. CALLR, RET_FLAG, // Return with a flag operand. - BR_JT, // Branch through jump table. BARRIER, // Memory barrier. JT, // Jump table. CP, // Constant pool. 
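A note on the atomic-load change above: shouldExpandAtomicLoadInIR now picks a value from the AtomicExpansionKind lattice instead of returning a bool. A minimal standalone sketch of the resulting policy, with the enum inlined for illustration and a hypothetical helper name (in tree the enum is TargetLowering::AtomicExpansionKind):

// Mirrors the AtomicExpansionKind values used by the hook above.
enum class AtomicExpansionKind { None, LLSC, LLOnly, CmpXChg };

// Hypothetical summary of the Hexagon policy: atomic loads wider than
// 64 bits are rewritten into load-locked sequences, narrower ones are
// selected normally.
AtomicExpansionKind loadExpansionPolicy(unsigned SizeInBits) {
  return SizeInBits > 64 ? AtomicExpansionKind::LLOnly
                         : AtomicExpansionKind::None;
}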
@@ -80,6 +78,7 @@ bool isPositiveHalfWord(SDNode *N); INSERTRP, EXTRACTU, EXTRACTURP, + VCOMBINE, TC_RETURN, EH_RETURN, DCFETCH, @@ -127,7 +126,6 @@ bool isPositiveHalfWord(SDNode *N); SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const; @@ -137,6 +135,7 @@ bool isPositiveHalfWord(SDNode *N); SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; @@ -163,8 +162,23 @@ bool isPositiveHalfWord(SDNode *N); MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return Hexagon::R0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return Hexagon::R1; + } + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; EVT getSetCCResultType(const DataLayout &, LLVMContext &C, EVT VT) const override { if (!VT.isVector()) @@ -200,6 +214,10 @@ bool isPositiveHalfWord(SDNode *N); /// TODO: Handle pre/postinc as well. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// Return true if folding a constant offset with the given GlobalAddress + /// is legal. It is frequently not legal in PIC relocation models. + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal @@ -208,20 +226,26 @@ bool isPositiveHalfWord(SDNode *N); /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; + /// Returns relocation base for the given PIC jumptable. + SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) + const override; + // Handling of atomic RMW instructions. 
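+  // All atomic RMW operations expand to LL/SC loops built from the
+  // emitLoadLinked/emitStoreConditional callbacks declared below;
+  // only loads wider than 64 bits take the load-locked-only path.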
- bool hasLoadLinkedStoreConditional() const override { - return true; - } Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - AtomicRMWExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) - const override { - return AtomicRMWExpansionKind::LLSC; + AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override { + return AtomicExpansionKind::LLSC; } + + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) + const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td new file mode 100644 index 0000000..5a1a69b --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td @@ -0,0 +1,462 @@ +//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Hexagon Instruction Mappings +//===----------------------------------------------------------------------===// + + +def : InstAlias<"memb({GP}+#$addr) = $Nt.new", + (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt.new", + (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memw({GP}+#$addr) = $Nt.new", + (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memb({GP}+#$addr) = $Nt", + (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt", + (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt.h", + (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memw({GP}+#$addr) = $Nt", + (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memd({GP}+#$addr) = $Nt", + (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>; + +def : InstAlias<"$Nt = memb({GP}+#$addr)", + (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>; +def : InstAlias<"$Nt = memub({GP}+#$addr)", + (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>; +def : InstAlias<"$Nt = memh({GP}+#$addr)", + (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>; +def : InstAlias<"$Nt = memuh({GP}+#$addr)", + (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>; +def : InstAlias<"$Nt = memw({GP}+#$addr)", + (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>; +def : InstAlias<"$Nt = memd({GP}+#$addr)", + (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>; + +// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt +def : InstAlias<"memb($Rs) = $Rt", + (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt", + (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt.h", + (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memw($Rs) = $Rt", + (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memb($Rs) = $Rt.new", + (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt.new", + (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memw($Rs) 
= $Rt.new", + (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memb($Rs) = #$S8", + (S4_storeirb_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memh($Rs) = #$S8", + (S4_storeirh_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memw($Rs) = #$S8", + (S4_storeiri_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memd($Rs) = $Rtt", + (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"memb($Rs) = setbit(#$U5)", + (L4_ior_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memh($Rs) = setbit(#$U5)", + (L4_ior_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memw($Rs) = setbit(#$U5)", + (L4_ior_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memb($Rs) = clrbit(#$U5)", + (L4_iand_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memh($Rs) = clrbit(#$U5)", + (L4_iand_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memw($Rs) = clrbit(#$U5)", + (L4_iand_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs) +def : InstAlias<"$Rd = memb($Rs)", + (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memub($Rs)", + (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memh($Rs)", + (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memuh($Rs)", + (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memw($Rs)", + (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memd($Rs)", + (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memubh($Rs)", + (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memubh($Rs)", + (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = membh($Rs)", + (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = membh($Rs)", + (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memb_fifo($Rs)", + (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memh_fifo($Rs)", + (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X) +// to: if ($Pt) $Rd = memXX($Rs) +def : InstAlias<"if ($Pt) $Rd = memb($Rs)", + (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memub($Rs)", + (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memh($Rs)", + (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memuh($Rs)", + (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memw($Rs)", + (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rdd = memd($Rs)", + (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt +// to: if ($Pt) memXX($Rs) = $Rt +def : InstAlias<"if ($Pt) memb($Rs) = $Rt", + (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt", + (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h", + (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = $Rt", + (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memd($Rs) = $Rtt", 
+ (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new", + (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new", + (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new", + (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new", + (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new", + (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new", + (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + + +// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X) +// to: if (!$Pt) $Rd = memXX($Rs) +def : InstAlias<"if (!$Pt) $Rd = memb($Rs)", + (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memub($Rs)", + (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memh($Rs)", + (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)", + (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memw($Rs)", + (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)", + (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt +// to: if (!$Pt) memXX($Rs) = $Rt +def : InstAlias<"if (!$Pt) memb($Rs) = $Rt", + (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt", + (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h", + (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = $Rt", + (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt", + (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new", + (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new", + (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new", + (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new", + (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new", + (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new", + (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memb($Rs) = #$S6", + (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = #$S6", + (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = #$S6", + (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6", + (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6", + (S4_storeirhtnew_io 
PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6", + (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memb($Rs) = #$S6", + (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = #$S6", + (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = #$S6", + (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6", + (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6", + (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6", + (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -= +// to: memXX($Rs) |= $Rt +def : InstAlias<"memb($Rs) &= $Rt", + (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) |= $Rt", + (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) += $Rt", + (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) -= $Rt", + (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) += #$U5", + (L4_iadd_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) -= #$U5", + (L4_isub_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) &= $Rt", + (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) |= $Rt", + (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) += $Rt", + (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) -= $Rt", + (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) += #$U5", + (L4_iadd_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) -= #$U5", + (L4_isub_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) &= $Rt", + (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) |= $Rt", + (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) += $Rt", + (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) -= $Rt", + (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) += #$U5", + (L4_iadd_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) -= #$U5", + (L4_isub_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +// +// Alias of: if ($Pv.new) memX($Rs) = $Rt +// to: if (p3.new) memX(r17 + #0) = $Rt +def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt", + (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt", + (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h", + (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memw($Rs) = 
$Rt", + (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt", + (S4_pstorerdtnew_io + PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt", + (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt", + (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h", + (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt", + (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt", + (S4_pstorerdfnew_io + PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +// +// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ... +// to: if ($Pt.new) $Rd = memub($Rs + #$u6_0) +def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)", + (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)", + (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)", + (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)", + (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)", + (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)", + (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)", + (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)", + (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)", + (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)", + (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)", + (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)", + (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"dcfetch($Rs)", + (Y2_dcfetchbo IntRegs:$Rs, 0), 0>; + +// Alias of some insn mappings, others must be handled by the parser +def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)", + (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; +def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)", + (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; + +// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs) +def : InstAlias<"$Rd = neg($Rs)", + (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>; + +def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>; +def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>; +def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>; +def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>; + +def : InstAlias<"$Pd = $Ps", + (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>; + +def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)", + (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>; + +def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)", + (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"$Rd = mpyui($Rs,$Rt)", + (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>; + +// Assembler mapped 
insns: cmp.lt(a,b) -> cmp.gt(b,a) +def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)", + (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; +def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)", + (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td new file mode 100644 index 0000000..280832f --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td @@ -0,0 +1,1019 @@ +class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { opc{14-4}, src2}; + let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst}; +} + +class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>; +class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>; +class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>; +class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>; +class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>; +class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>; +class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>; +class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>; +class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>; +class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>; +class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>; +class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>; +class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>; +class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>; +class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>; +class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>; +class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>; +class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>; +class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>; +class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>; +class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>; +class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>; +class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>; +class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>; +class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>; +class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>; +class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>; +class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>; +class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>; +class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>; +class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>; +class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>; +class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>; +class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>; +class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>; +class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>; +class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>; +class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>; +class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>; +class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>; +class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>; +class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>; +class V6_vasrw_enc : 
Enc_COPROC_VX_3op_v<0b000110010110101>; +class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>; +class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>; +class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>; +class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>; +class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>; +class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>; +class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>; +class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>; +class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>; +class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>; +class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>; +class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>; +class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>; +class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>; +class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>; +class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>; +class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>; +class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>; +class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>; +class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>; +class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>; +class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>; +class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>; +class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>; +class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>; +class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>; +class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>; +class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>; +class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>; +class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>; +class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>; +class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>; +class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>; +class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>; +class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>; +class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>; +class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>; +class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>; +class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>; +class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>; +class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>; +class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>; +class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>; +class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>; +class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>; +class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>; +class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>; +class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>; +class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>; +class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>; +class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>; +class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>; +class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>; +class V6_vsububsat_enc : 
Enc_COPROC_VX_3op_v<0b000111000110000>; +class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>; +class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>; +class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>; +class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>; +class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>; +class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>; +class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>; +class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>; +class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>; +class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>; +class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>; +class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>; +class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>; +class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>; +class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>; +class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>; +class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>; +class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>; +class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>; +class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>; +class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>; +class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>; +class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>; +class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>; +class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>; +class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>; +class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>; +class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>; +class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>; +class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>; +class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>; +class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>; +class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>; +class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>; +class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>; +class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>; +class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>; +class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>; +class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>; +class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>; +class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>; +class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>; +class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>; +class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>; +class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>; +class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>; +class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>; +class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>; +class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>; +class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>; +class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>; +class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>; +class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>; +class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>; +class 
V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>; +class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>; +class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>; +class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>; +class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>; +class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>; +class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>; +class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>; +class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>; +class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>; +class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>; +class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>; +class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>; +class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>; +class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>; +class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>; +class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>; +class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>; +class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>; +class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>; +class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>; +class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>; +class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>; +class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>; +class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>; +class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>; +class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>; +class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>; +class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>; +class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>; +class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>; +class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>; +class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>; +class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>; + +class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon { + bits<2> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} }; + let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} }; +} + +class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>; +class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>; +class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>; +class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>; +class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>; +class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>; +class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>; +class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>; +class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>; +class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>; +class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>; +class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>; +class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>; +class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>; +class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>; +class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>; +class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>; +class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>; +class V6_vgtuh_or_enc : 
Enc_COPROC_VX_cmp<0b1001001011001>; +class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>; +class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>; +class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>; +class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>; +class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>; +class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>; +class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>; +class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>; +class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>; +class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>; +class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>; +class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>; +class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>; +class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>; +class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>; +class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>; +class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>; +class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>; +class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>; + +class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + + let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} }; + let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>; +class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>; +class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>; +class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>; +class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>; +class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>; +class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>; +class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>; +class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>; +class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>; +class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>; +class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>; + +class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + + let Inst{31-16} = { 0b00011110000000, opc{5-4} }; + let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>; +class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>; +class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>; +class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>; +class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>; +class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>; +class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>; +class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>; +class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>; +class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>; +class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>; +class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>; +class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>; +class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>; +class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>; +class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>; +class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>; +class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>; +class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>; +class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>; +class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>; +class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>; +class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>; +class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>; +class V6_vassign_enc : 
Enc_COPROC_VX_2op<0b111111>; + +class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>; +class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>; +class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>; +class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>; +class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>; +class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>; +class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>; + +class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>; +class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>; +class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>; +class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>; +class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>; +class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>; +class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>; + +class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + bits<5> src3; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + bits<5> src3; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>; +class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>; +class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>; + +class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>; +class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>; +class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>; + +class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + bits<3> src3; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>; +class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>; + +class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + bits<3> src3; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>; 
+class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>; + +class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<4> src3_vector; + bits<5> src4; + + let src3_vector = src3{9-6}; + let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<11> src3; + bits<4> src3_vector; + bits<5> src4; + + let src3_vector = src3{10-7}; + let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>; +class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>; +class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>; +class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>; +class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>; +class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>; +class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>; +class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>; +class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>; +class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>; + +class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>; +class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>; +class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>; +class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>; +class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>; +class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>; +class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>; +class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>; +class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>; +class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<4> src3_vector; + bits<3> src4; + + let src3_vector = src3{9-6}; + let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>; +class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>; +class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>; +class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<11> src3; + bits<4> src3_vector; + bits<3> src4; + + let src3_vector = src3{10-7}; + let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>; +class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>; 
+class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>; +class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>; + +// TODO: Change script to generate dst, src1, src2 instead of +// dst, dst2, src1. +class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<9> src2; + bits<3> src2_vector; + + let src2_vector = src2{8-6}; + let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>; +class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>; +class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>; +class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>; +class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>; +class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>; +class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>; + +class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<10> src2; + bits<3> src2_vector; + + let src2_vector = src2{9-7}; + let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>; +class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>; +class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>; +class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>; +class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>; +class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>; +class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>; + + +// TODO: Change script to generate src1, src2 and src3 instead of +// dst, src1, src2. +class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<9> src2; + bits<3> src2_vector; + bits<5> src3; + + let src2_vector = src2{8-6}; + let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; + let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>; +class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>; +class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>; + +class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<3> src2_vector; + bits<5> src3; + + let src2_vector = src2{9-7}; + let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; + let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>; +class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>; +class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>; + +// TODO: Change script to generate src1, src2 and src3 instead of +// dst, src1, src2. 
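+// For example, in Enc_COPROC_VMEM_vS32_b_pi above only the
+// vector-sized part src2{8-6} of the 9-bit offset is encoded: those
+// three bits go in Inst{10-8}, next to opc{2-0} in Inst{7-5} and the
+// source register in Inst{4-0}.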
+class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<9> src2;
+  bits<3> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{8-6};
+  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>;
+class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<10> src2;
+  bits<3> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{9-7};
+  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>;
+class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>;
+
+// TODO: Change script to generate src1, src2, src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<9> src3;
+  bits<3> src3_vector;
+  bits<5> src4;
+
+  let src3_vector = src3{8-6};
+  let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
+  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>;
+class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
+class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
+class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
+class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
+class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
+class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>;
+class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
+class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
+class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
+
+// TODO: Change script to generate src1, src2, src3 and src4 instead of
+// dst, src1, src2, src3.
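(That note applies to the 128B counterpart defined after this sketch.) The 5-bit opc parameter threaded through the instantiations above acts as a small selector. Inferred purely from the class names and values in this diff: opc{4} marks the non-temporal (_nt_) variants, opc{3} picks a scalar predicate (Pv) over a vector predicate (Qv), opc{2-1} = 0b11 selects the unaligned (vS32Ub) forms, and opc{0} negates the predicate. A runnable decode sketch of that reading (illustrative only, not a Hexagon decoder):

#include <cstdio>
#include <string>

// Decodes the 5-bit opc used by Enc_COPROC_VMEM_vS32_b_pred_pi above,
// per the bit assignments inferred from the instantiations in this diff.
std::string describePredStoreOpc(unsigned opc) {
  std::string s;
  if (opc & 0x10)
    s += "non-temporal ";                       // opc{4}: *_nt_* classes
  if (opc & 0x08)
    s += (opc & 1) ? "if (!Pv) " : "if (Pv) ";  // opc{3}: scalar predicate
  else
    s += (opc & 1) ? "if (!Qv) " : "if (Qv) ";  // vector predicate
  s += ((opc >> 1) & 0x3) == 0x3 ? "unaligned vmem store" : "vmem store";
  return s;
}

int main() {
  // 0b01001 -> V6_vS32b_npred_pi:    "if (!Pv) vmem store"
  std::printf("%s\n", describePredStoreOpc(0b01001).c_str());
  // 0b10000 -> V6_vS32b_nt_qpred_pi: "non-temporal if (Qv) vmem store"
  std::printf("%s\n", describePredStoreOpc(0b10000).c_str());
}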
+class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<3> src3_vector; + bits<5> src4; + + let src3_vector = src3{9-7}; + let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>; +class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>; +class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>; +class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>; +class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>; +class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>; +class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>; +class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>; +class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>; +class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<9> src3; + bits<3> src3_vector; + bits<3> src4; + + let src3_vector = src3{8-6}; + let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>; +class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>; +class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>; +class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<3> src3_vector; + bits<3> src4; + + let src3_vector = src3{9-7}; + let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>; +class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>; +class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>; +class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>; + +class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<1> src2; + + let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>; +class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>; +class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>; +class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>; +class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>; +class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>; +class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>; + +class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<1> src2; + bits<5> src3; + + let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>; +class V6_vS32Ub_ppu_enc : 
Enc_COPROC_VMEM_vS32_b_ppu<0b0111>; +class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>; + +class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<1> src2; + bits<3> src3; + + let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} }; +} + +class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>; +class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>; + +class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<1> src3; + bits<5> src4; + + let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>; +class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>; +class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>; +class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>; +class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>; +class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>; +class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>; +class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>; +class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>; +class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<1> src3; + bits<3> src4; + + let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>; +class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>; +class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>; +class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>; + + +class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<1> src3; + + let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} }; + let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} }; +} + +class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>; +class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>; +class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>; +class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>; +class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>; +class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>; + +class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon { + bits<5> dst; + bits<2> src1; + bits<5> src2; + + let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} }; + let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} }; +} + +class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>; +class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>; + +class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon { + bits<5> src1; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { 0b00011001111, src3{4-0} }; + let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} }; +} + +class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>; +class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>; + + +class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + + let Inst{31-16} = { 0b0001101000, opc{0}, 
0b00000 }; + let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} }; +} + +class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>; +class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>; + +class Enc_X_p3op<bits<8> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} }; + let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} }; +} + +class V6_vnccombine_enc : Enc_X_p3op<0b00001000>; +class V6_vccombine_enc : Enc_X_p3op<0b00001100>; + +class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<3> src3; + + let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} }; + let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>; +class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>; +class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>; +class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>; +class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>; +class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>; +class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>; +class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>; +class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>; +class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>; +class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>; +class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>; +class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>; +class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>; +class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>; + +class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<3> src3; + + let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} }; + let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} }; +} + +class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>; +class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>; +class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>; +class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>; +class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>; +class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>; +class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>; +class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>; + +class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon { + bits<2> dst; + bits<2> src1; + bits<2> src2; + + let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 }; + let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} }; +} + +class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>; +class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>; +class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>; +class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>; +class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>; + +class V6_pred_not_enc : OpcodeHexagon { + bits<2> dst; + bits<2> src1; + + let Inst{31-16} = { 0b0001111000000011 }; + let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} }; +} + +class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon { + bits<5> dst; + bits<2> src1; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} }; + let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} }; +} + +class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>; +class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>; + +class Enc_X_2op<bits<16> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + + let Inst{31-16} = { opc{15-5}, src1{4-0} }; + let Inst{13-0} = { opc{4-3}, 
0b0000, opc{2-0}, dst{4-0} }; +} + +class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>; +class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>; +class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>; + + +class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon { + bits<2> dst; + bits<5> src1; + + let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} }; + let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} }; +} + +class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>; +class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>; + +class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<6> src2; + + let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} }; + let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} }; +} + +class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>; +class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>; +class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>; +class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>; +class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>; +class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>; + +class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { opc{14-4}, src1{4-0} }; + let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} }; +} + +class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>; +class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>; +class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>; +class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>; +class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>; +class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>; +class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>; +class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>; + +class Enc_no_operands<bits<25> opc> : OpcodeHexagon { + + let Inst{31-16} = { opc{24-10}, 0 }; + let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 }; +} + +class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>; +class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>; +class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>; +class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>; + +class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon { + bits<5> src1; + + let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} }; + let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 }; +} + +class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>; +class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>; + +class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon { + bits<5> src1; + + let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 }; + let Inst{13-0} = { 0, src1{4-0}, 0b00000000 }; +} + +class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>; +class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>; + +class A5_ACS_enc : OpcodeHexagon { + bits<5> dst1; + bits<2> dst2; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b11101010101, src1{4-0} }; + let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} }; +} + +class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<2> src3; + + let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} }; + let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} }; +} + +class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>; +class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>; +class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>; + +class V6_vhistq_enc : OpcodeHexagon { + bits<2> src1; + + 
let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 }; + let Inst{13-0} = { 0b10000010000000 }; +} + +// TODO: Change script to generate dst1 instead of dst. +class A6_vminub_RdP_enc : OpcodeHexagon { + bits<5> dst1; + bits<2> dst2; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b11101010111, src2{4-0} }; + let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} }; +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td index 44bab29..3c5ec17 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td @@ -34,6 +34,8 @@ class SubTarget<bits<6> value> { def HasAnySubT : SubTarget<0x3f>; // 111111 def HasV5SubT : SubTarget<0x3e>; // 111110 +def HasV55SubT : SubTarget<0x3c>; // 111100 +def HasV60SubT : SubTarget<0x38>; // 111000 // Addressing modes for load/store instructions class AddrModeType<bits<3> value> { @@ -57,6 +59,8 @@ def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb). def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh). def WordAccess : MemAccessSize<3>;// Word access instruction (memw). def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd) +def Vector64Access : MemAccessSize<7>;// Vector access instruction (memv) +def Vector128Access : MemAccessSize<8>;// Vector access instruction (memv) //===----------------------------------------------------------------------===// @@ -167,14 +171,23 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, bits<1> isFP = 0; let TSFlags {48} = isFP; // Floating-point. + bits<1> hasNewValue2 = 0; + let TSFlags{50} = hasNewValue2; // Second New-value producer insn. + bits<3> opNewValue2 = 0; + let TSFlags{53-51} = opNewValue2; // Second New-value produced operand. + + bits<1> isAccumulator = 0; + let TSFlags{54} = isAccumulator; + // Fields used for relation models. + bit isNonTemporal = 0; + string isNT = ""; // set to "true" for non-temporal vector stores. string BaseOpcode = ""; string CextOpcode = ""; string PredSense = ""; string PNewValue = ""; string NValueST = ""; // Set to "true" for new-value stores. string InputType = ""; // Input is "imm" or "reg" type. - string isMEMri = "false"; // Set to "true" for load/store with MEMri operand. string isFloat = "false"; // Set to "true" for the floating-point load/store. string isBrTaken = !if(isTaken, "true", "false"); // Set to "true"/"false" for jump instructions @@ -182,6 +195,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, ""); let PNewValue = !if(isPredicatedNew, "new", ""); let NValueST = !if(isNVStore, "true", "false"); + let isNT = !if(isNonTemporal, "true", "false"); // *** Must match MCTargetDesc/HexagonBaseInfo.h *** } @@ -217,6 +231,11 @@ class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon; +let mayLoad = 1 in +class LD1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>; + // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. 
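The hunk above grows the TSFlags bit vector: hasNewValue2 lands at bit 50, opNewValue2 occupies bits 53-51, and isAccumulator bit 54, alongside the existing isFP at bit 48. Consumers recover these fields by shift-and-mask, and the positions must mirror the TableGen layout exactly (hence the "Must match MCTargetDesc/HexagonBaseInfo.h" note above). A standalone sketch of the extraction side, with positions copied from the hunk (the names are illustrative, not the actual HexagonII enums):

#include <cstdint>

// Positions copied from the 'let TSFlags{...}' lines above; any mismatch
// with the real MCTargetDesc/HexagonBaseInfo.h would silently misdecode.
enum : unsigned {
  FPPos           = 48, FPMask           = 0x1,
  HasNewValue2Pos = 50, HasNewValue2Mask = 0x1,
  OpNewValue2Pos  = 51, OpNewValue2Mask  = 0x7, // 3 bits: TSFlags{53-51}
  AccumulatorPos  = 54, AccumulatorMask  = 0x1,
};

inline bool isFP(uint64_t TSFlags) {
  return (TSFlags >> FPPos) & FPMask;
}
inline bool hasNewValue2(uint64_t TSFlags) {
  return (TSFlags >> HasNewValue2Pos) & HasNewValue2Mask;
}
inline unsigned opNewValue2(uint64_t TSFlags) { // which operand is produced
  return (TSFlags >> OpNewValue2Pos) & OpNewValue2Mask;
}
inline bool isAccumulator(uint64_t TSFlags) {
  return (TSFlags >> AccumulatorPos) & AccumulatorMask;
}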
@@ -234,6 +253,12 @@ class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0> : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon; +// Same as ST0Inst but doesn't derive from OpcodeHexagon. +let mayStore = 1 in +class ST1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>; + // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. @@ -277,6 +302,11 @@ class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, OpcodeHexagon; +// Same as above but doesn't derive from OpcodeHexagon +class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + // M Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. @@ -294,6 +324,10 @@ class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, OpcodeHexagon; +class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. @@ -402,3 +436,13 @@ include "HexagonInstrFormatsV4.td" //===----------------------------------------------------------------------===// // V4 Instruction Format Definitions + //===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// V60 Instruction Format Definitions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrFormatsV60.td" + +//===----------------------------------------------------------------------===// +// V60 Instruction Format Definitions + +//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td index db83ef6..2d1dea5 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -21,8 +21,6 @@ def TypeMEMOP : IType<9>; def TypeNV : IType<10>; def TypeDUPLEX : IType<11>; def TypeCOMPOUND : IType<12>; -def TypeAG_VX : IType<28>; -def TypeAG_VM : IType<29>; def TypePREFIX : IType<30>; // Duplex Instruction Class Declaration diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td new file mode 100644 index 0000000..f3d43de --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td @@ -0,0 +1,238 @@ +//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+//                        Hexagon Instruction Flags +
+//
+//                     *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeCVI_VA          : IType<13>;
+def TypeCVI_VA_DV       : IType<14>;
+def TypeCVI_VX          : IType<15>;
+def TypeCVI_VX_DV       : IType<16>;
+def TypeCVI_VP          : IType<17>;
+def TypeCVI_VP_VS       : IType<18>;
+def TypeCVI_VS          : IType<19>;
+def TypeCVI_VINLANESAT  : IType<20>;
+def TypeCVI_VM_LD       : IType<21>;
+def TypeCVI_VM_TMP_LD   : IType<22>;
+def TypeCVI_VM_CUR_LD   : IType<23>;
+def TypeCVI_VM_VP_LDU   : IType<24>;
+def TypeCVI_VM_ST       : IType<25>;
+def TypeCVI_VM_NEW_ST   : IType<26>;
+def TypeCVI_VM_STU      : IType<27>;
+def TypeCVI_HIST        : IType<28>;
+//----------------------------------------------------------------------------//
+//                      Instruction Classes Definitions +
+//----------------------------------------------------------------------------//
+
+let validSubTargets = HasV60SubT in
+{
+class CVI_VA_Resource<dag outs, dag ins, string asmstr,
+                      list<dag> pattern = [], string cstr = "",
+                      InstrItinClass itin = CVI_VA>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VA_DV_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VA_DV>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_long<dag outs, dag ins, string asmstr,
+                           list<dag> pattern = [], string cstr = "",
+                           InstrItinClass itin = CVI_VX_LONG>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_late<dag outs, dag ins, string asmstr,
+                           list<dag> pattern = [], string cstr = "",
+                           InstrItinClass itin = CVI_VX_LATE>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+    Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource<dag outs, dag ins, string asmstr,
+                      list<dag> pattern = [], string cstr = "",
+                      InstrItinClass itin = CVI_VX>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VX_DV>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Slot2_Resource<dag outs, dag ins, string asmstr,
+                               list<dag> pattern = [], string cstr = "",
+                               InstrItinClass itin = CVI_VX_DV_SLOT2>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource_long<dag outs, dag ins, string asmstr,
+                              list<dag> pattern = [], string cstr = "",
+                              InstrItinClass itin = CVI_VX_DV_LONG>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+    OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_Resource_long<dag outs, dag ins, string asmstr,
+                           list<dag> pattern = [], string cstr = "",
+                           InstrItinClass itin = CVI_VP_LONG>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP>,
+    OpcodeHexagon, 
Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_early<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_EARLY> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_LONG> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_long_early<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_LONG_EARLY> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VS_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VS> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VINLANESAT_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VINLANESAT> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VINLANESAT>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VS_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VS> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_LD_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_TMP_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_TMP_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_TMP_LD_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_TMP_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_CUR_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_CUR_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_CUR_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_VP_LDU_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_VP_LDU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_VP_LDU_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_VP_LDU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_ST_Resource<dag outs, 
dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_ST_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_NEW_ST_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_NEW_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_NEW_ST_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_NEW_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_STU_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_STU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_STU_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_STU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_HIST_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_HIST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; +} + +let validSubTargets = HasV60SubT in +{ +class CVI_VA_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VA> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>, + Requires<[HasV60T, UseHVX]>; + +class CVI_VX_DV_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>, + Requires<[HasV60T, UseHVX]>; + +class CVI_HIST_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_HIST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>, + Requires<[HasV60T, UseHVX]>; +} + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3cb0823..eb3590c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -23,9 +23,11 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <cctype> using namespace llvm; @@ -36,9 +38,41 @@ using namespace llvm; #include "HexagonGenInstrInfo.inc" #include "HexagonGenDFAPacketizer.inc" +using namespace llvm; + +cl::opt<bool> ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden, + cl::init(false), cl::desc("Do not consider inline-asm a scheduling/" + "packetization boundary.")); + +static cl::opt<bool> 
EnableBranchPrediction("hexagon-enable-branch-prediction", + cl::Hidden, cl::init(true), cl::desc("Enable branch prediction")); + +static cl::opt<bool> DisableNVSchedule("disable-hexagon-nv-schedule", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable schedule adjustment for new value stores.")); + +static cl::opt<bool> EnableTimingClassLatency( + "enable-timing-class-latency", cl::Hidden, cl::init(false), + cl::desc("Enable timing class latency")); + +static cl::opt<bool> EnableALUForwarding( + "enable-alu-forwarding", cl::Hidden, cl::init(true), + cl::desc("Enable vec alu forwarding")); + +static cl::opt<bool> EnableACCForwarding( + "enable-acc-forwarding", cl::Hidden, cl::init(true), + cl::desc("Enable vec acc forwarding")); + +static cl::opt<bool> BranchRelaxAsmLarge("branch-relax-asm-large", + cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("branch relax asm")); + /// /// Constants for Hexagon instructions. /// +const int Hexagon_MEMV_OFFSET_MAX_128B = 2047; // #s7 +const int Hexagon_MEMV_OFFSET_MIN_128B = -2048; // #s7 +const int Hexagon_MEMV_OFFSET_MAX = 1023; // #s6 +const int Hexagon_MEMV_OFFSET_MIN = -1024; // #s6 const int Hexagon_MEMW_OFFSET_MAX = 4095; const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; @@ -57,71 +91,49 @@ const int Hexagon_MEMH_AUTOINC_MAX = 14; const int Hexagon_MEMH_AUTOINC_MIN = -16; const int Hexagon_MEMB_AUTOINC_MAX = 7; const int Hexagon_MEMB_AUTOINC_MIN = -8; +const int Hexagon_MEMV_AUTOINC_MAX = 192; +const int Hexagon_MEMV_AUTOINC_MIN = -256; +const int Hexagon_MEMV_AUTOINC_MAX_128B = 384; +const int Hexagon_MEMV_AUTOINC_MIN_128B = -512; // Pin the vtable to this file. void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - RI(), Subtarget(ST) {} + RI() {} -/// isLoadFromStackSlot - If the specified machine instruction is a direct -/// load from a stack slot, return the virtual or physical register number of -/// the destination along with the FrameIndex of the loaded stack slot. If -/// not, return 0. This predicate must return 0 if the instruction has -/// any side effects other than loading from the stack slot. -unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { +static bool isIntRegForSubInst(unsigned Reg) { + return (Reg >= Hexagon::R0 && Reg <= Hexagon::R7) || + (Reg >= Hexagon::R16 && Reg <= Hexagon::R23); +} - switch (MI->getOpcode()) { - default: break; - case Hexagon::L2_loadri_io: - case Hexagon::L2_loadrd_io: - case Hexagon::L2_loadrh_io: - case Hexagon::L2_loadrb_io: - case Hexagon::L2_loadrub_io: - if (MI->getOperand(2).isFI() && - MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(2).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; + +static bool isDblRegForSubInst(unsigned Reg, const HexagonRegisterInfo &HRI) { + return isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_loreg)) && + isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_hireg)); } -/// isStoreToStackSlot - If the specified machine instruction is a direct -/// store to a stack slot, return the virtual or physical register number of -/// the source reg along with the FrameIndex of the loaded stack slot. If -/// not, return 0. This predicate must return 0 if the instruction has -/// any side effects other than storing to the stack slot. 
-unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case Hexagon::S2_storeri_io: - case Hexagon::S2_storerd_io: - case Hexagon::S2_storerh_io: - case Hexagon::S2_storerb_io: - if (MI->getOperand(2).isFI() && - MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(0).getIndex(); - return MI->getOperand(2).getReg(); - } - break; +/// Calculate number of instructions excluding the debug instructions. +static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB, + MachineBasicBlock::const_instr_iterator MIE) { + unsigned Count = 0; + for (; MIB != MIE; ++MIB) { + if (!MIB->isDebugValue()) + ++Count; } - return 0; + return Count; } -// Find the hardware loop instruction used to set-up the specified loop. -// On Hexagon, we have two instructions used to set-up the hardware loop -// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions -// to indicate the end of a loop. -static MachineInstr * -findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, - SmallPtrSet<MachineBasicBlock *, 8> &Visited) { + +/// Find the hardware loop instruction used to set-up the specified loop. +/// On Hexagon, we have two instructions used to set-up the hardware loop +/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions +/// to indicate the end of a loop. +static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, + SmallPtrSet<MachineBasicBlock *, 8> &Visited) { int LOOPi; int LOOPr; if (EndLoopOp == Hexagon::ENDLOOP0) { @@ -157,100 +169,108 @@ findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, return 0; } -unsigned HexagonInstrInfo::InsertBranch( - MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { - Opcode_t BOpc = Hexagon::J2_jump; - Opcode_t BccOpc = Hexagon::J2_jumpt; +/// Gather register def/uses from MI. +/// This treats possible (predicated) defs as actually happening ones +/// (conservatively). +static inline void parseOperands(const MachineInstr *MI, + SmallVector<unsigned, 4> &Defs, SmallVector<unsigned, 8> &Uses) { + Defs.clear(); + Uses.clear(); - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); - // Check if ReverseBranchCondition has asked to reverse this branch - // If we want to reverse the branch an odd number of times, we want - // J2_jumpf. - if (!Cond.empty() && Cond[0].isImm()) - BccOpc = Cond[0].getImm(); + if (!MO.isReg()) + continue; - if (!FBB) { - if (Cond.empty()) { - // Due to a bug in TailMerging/CFG Optimization, we need to add a - // special case handling of a predicated jump followed by an - // unconditional jump. If not, Tail Merging and CFG Optimization go - // into an infinite loop. 
- MachineBasicBlock *NewTBB, *NewFBB; - SmallVector<MachineOperand, 4> Cond; - MachineInstr *Term = MBB.getFirstTerminator(); - if (Term != MBB.end() && isPredicated(Term) && - !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) { - MachineBasicBlock *NextBB = - std::next(MachineFunction::iterator(&MBB)); - if (NewTBB == NextBB) { - ReverseBranchCondition(Cond); - RemoveBranch(MBB); - return InsertBranch(MBB, TBB, nullptr, Cond, DL); - } - } - BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); - } else if (isEndLoopN(Cond[0].getImm())) { - int EndLoopOp = Cond[0].getImm(); - assert(Cond[1].isMBB()); - // Since we're adding an ENDLOOP, there better be a LOOP instruction. - // Check for it, and change the BB target if needed. - SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; - MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); - assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); - Loop->getOperand(0).setMBB(TBB); - // Add the ENDLOOP after the finding the LOOP0. - BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); - } else if (isNewValueJump(Cond[0].getImm())) { - assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump"); - // New value jump - // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset) - // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset) - unsigned Flags1 = getUndefRegState(Cond[1].isUndef()); - DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber();); - if (Cond[2].isReg()) { - unsigned Flags2 = getUndefRegState(Cond[2].isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). - addReg(Cond[2].getReg(), Flags2).addMBB(TBB); - } else if(Cond[2].isImm()) { - BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). - addImm(Cond[2].getImm()).addMBB(TBB); - } else - llvm_unreachable("Invalid condition for branching"); - } else { - assert((Cond.size() == 2) && "Malformed cond vector"); - const MachineOperand &RO = Cond[1]; - unsigned Flags = getUndefRegState(RO.isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); - } - return 1; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + + if (MO.isUse()) + Uses.push_back(MO.getReg()); + + if (MO.isDef()) + Defs.push_back(MO.getReg()); } - assert((!Cond.empty()) && - "Cond. cannot be empty when multiple branchings are required"); - assert((!isNewValueJump(Cond[0].getImm())) && - "NV-jump cannot be inserted with another branch"); - // Special case for hardware loops. The condition is a basic block. - if (isEndLoopN(Cond[0].getImm())) { - int EndLoopOp = Cond[0].getImm(); - assert(Cond[1].isMBB()); - // Since we're adding an ENDLOOP, there better be a LOOP instruction. - // Check for it, and change the BB target if needed. - SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; - MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); - assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); - Loop->getOperand(0).setMBB(TBB); - // Add the ENDLOOP after the finding the LOOP0. - BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); - } else { - const MachineOperand &RO = Cond[1]; - unsigned Flags = getUndefRegState(RO.isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); +} + + +// Position dependent, so check twice for swap. 
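That is, the helper defined next is directional: it asks whether a sub-instruction of group Ga can legally share a duplex with one of group Gb in that slot order, so a caller probing an unordered pair tries both orders, as the comment says. A one-line sketch of that symmetric use (hypothetical wrapper name; isDuplexPairMatch is the function below, so this is not self-contained LLVM code):

// Whether two sub-instruction groups can form a duplex in either slot
// order; isDuplexPairMatch is the directional helper defined below.
static bool canFormDuplexPair(unsigned Ga, unsigned Gb) {
  return isDuplexPairMatch(Ga, Gb) || isDuplexPairMatch(Gb, Ga);
}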
+static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) { + switch (Ga) { + case HexagonII::HSIG_None: + default: + return false; + case HexagonII::HSIG_L1: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_L2: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_S1: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_S2: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_S2 || + Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_A: + return (Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_Compound: + return (Gb == HexagonII::HSIG_Compound); } - BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); + return false; +} - return 2; + + +/// isLoadFromStackSlot - If the specified machine instruction is a direct +/// load from a stack slot, return the virtual or physical register number of +/// the destination along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than loading from the stack slot. +unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case Hexagon::L2_loadri_io: + case Hexagon::L2_loadrd_io: + case Hexagon::L2_loadrh_io: + case Hexagon::L2_loadrb_io: + case Hexagon::L2_loadrub_io: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + + +/// isStoreToStackSlot - If the specified machine instruction is a direct +/// store to a stack slot, return the virtual or physical register number of +/// the source reg along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than storing to the stack slot. +unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case Hexagon::S2_storeri_io: + case Hexagon::S2_storerd_io: + case Hexagon::S2_storerh_io: + case Hexagon::S2_storerb_io: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(0).getIndex(); + return MI->getOperand(2).getReg(); + } + break; + } + return 0; } @@ -269,9 +289,6 @@ unsigned HexagonInstrInfo::InsertBranch( /// Cond[0] = Hexagon::CMPEQri_f_Jumpnv_t_V4 -- specific opcode /// Cond[1] = R /// Cond[2] = Imm -/// @note Related function is \fn findInstrPredicate which fills in -/// Cond. vector when a predicated instruction is passed to it. -/// We follow same protocol in that case too. /// bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -314,7 +331,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - + bool JumpToBlock = I->getOpcode() == Hexagon::J2_jump && I->getOperand(0).isMBB(); // Delete the J2_jump if it's equivalent to a fall-through. @@ -327,17 +344,17 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(&*I)) return false; // Get the last instruction in the block. 
- MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; MachineInstr *SecondLastInst = nullptr; // Find one more terminator if present. - do { - if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(I)) { + for (;;) { + if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) { if (!SecondLastInst) - SecondLastInst = I; + SecondLastInst = &*I; else // This is a third branch. return true; @@ -345,7 +362,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, if (I == MBB.instr_begin()) break; --I; - } while(I); + } int LastOpcode = LastInst->getOpcode(); int SecLastOpcode = SecondLastInst ? SecondLastInst->getOpcode() : 0; @@ -418,7 +435,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // executed, so remove it. if (SecLastOpcode == Hexagon::J2_jump && LastOpcode == Hexagon::J2_jump) { TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; + I = LastInst->getIterator(); if (AllowModify) I->eraseFromParent(); return false; @@ -438,6 +455,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; } + unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { DEBUG(dbgs() << "\nRemoving branches out of BB#" << MBB.getNumber()); MachineBasicBlock::iterator I = MBB.end(); @@ -458,100 +476,127 @@ unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return Count; } -/// \brief For a comparison instruction, return the source registers in -/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it -/// compares against in CmpValue. Return true if the comparison instruction -/// can be analyzed. -bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI, - unsigned &SrcReg, unsigned &SrcReg2, - int &Mask, int &Value) const { - unsigned Opc = MI->getOpcode(); - // Set mask and the first source register. - switch (Opc) { - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqp: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgtp: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtup: - case Hexagon::C4_cmpneq: - case Hexagon::C4_cmplte: - case Hexagon::C4_cmplteu: - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgti: - case Hexagon::C2_cmpgtui: - case Hexagon::C4_cmpneqi: - case Hexagon::C4_cmplteui: - case Hexagon::C4_cmpltei: - SrcReg = MI->getOperand(1).getReg(); - Mask = ~0; - break; - case Hexagon::A4_cmpbeq: - case Hexagon::A4_cmpbgt: - case Hexagon::A4_cmpbgtu: - case Hexagon::A4_cmpbeqi: - case Hexagon::A4_cmpbgti: - case Hexagon::A4_cmpbgtui: - SrcReg = MI->getOperand(1).getReg(); - Mask = 0xFF; - break; - case Hexagon::A4_cmpheq: - case Hexagon::A4_cmphgt: - case Hexagon::A4_cmphgtu: - case Hexagon::A4_cmpheqi: - case Hexagon::A4_cmphgti: - case Hexagon::A4_cmphgtui: - SrcReg = MI->getOperand(1).getReg(); - Mask = 0xFFFF; - break; - } +unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { + unsigned BOpc = Hexagon::J2_jump; + unsigned BccOpc = Hexagon::J2_jumpt; + assert(validateBranchCond(Cond) && "Invalid branching condition"); + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - // Set the value/second source register. 
- switch (Opc) { - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqp: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgtp: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtup: - case Hexagon::A4_cmpbeq: - case Hexagon::A4_cmpbgt: - case Hexagon::A4_cmpbgtu: - case Hexagon::A4_cmpheq: - case Hexagon::A4_cmphgt: - case Hexagon::A4_cmphgtu: - case Hexagon::C4_cmpneq: - case Hexagon::C4_cmplte: - case Hexagon::C4_cmplteu: - SrcReg2 = MI->getOperand(2).getReg(); - return true; + // Check if ReverseBranchCondition has asked to reverse this branch + // If we want to reverse the branch an odd number of times, we want + // J2_jumpf. + if (!Cond.empty() && Cond[0].isImm()) + BccOpc = Cond[0].getImm(); - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgtui: - case Hexagon::C2_cmpgti: - case Hexagon::C4_cmpneqi: - case Hexagon::C4_cmplteui: - case Hexagon::C4_cmpltei: - case Hexagon::A4_cmpbeqi: - case Hexagon::A4_cmpbgti: - case Hexagon::A4_cmpbgtui: - case Hexagon::A4_cmpheqi: - case Hexagon::A4_cmphgti: - case Hexagon::A4_cmphgtui: - SrcReg2 = 0; - Value = MI->getOperand(2).getImm(); - return true; + if (!FBB) { + if (Cond.empty()) { + // Due to a bug in TailMerging/CFG Optimization, we need to add a + // special case handling of a predicated jump followed by an + // unconditional jump. If not, Tail Merging and CFG Optimization go + // into an infinite loop. + MachineBasicBlock *NewTBB, *NewFBB; + SmallVector<MachineOperand, 4> Cond; + MachineInstr *Term = MBB.getFirstTerminator(); + if (Term != MBB.end() && isPredicated(Term) && + !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) { + MachineBasicBlock *NextBB = &*++MBB.getIterator(); + if (NewTBB == NextBB) { + ReverseBranchCondition(Cond); + RemoveBranch(MBB); + return InsertBranch(MBB, TBB, nullptr, Cond, DL); + } + } + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); + } else if (isEndLoopN(Cond[0].getImm())) { + int EndLoopOp = Cond[0].getImm(); + assert(Cond[1].isMBB()); + // Since we're adding an ENDLOOP, there better be a LOOP instruction. + // Check for it, and change the BB target if needed. + SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; + MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); + assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); + Loop->getOperand(0).setMBB(TBB); + // Add the ENDLOOP after the finding the LOOP0. + BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); + } else if (isNewValueJump(Cond[0].getImm())) { + assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump"); + // New value jump + // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset) + // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset) + unsigned Flags1 = getUndefRegState(Cond[1].isUndef()); + DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber();); + if (Cond[2].isReg()) { + unsigned Flags2 = getUndefRegState(Cond[2].isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). + addReg(Cond[2].getReg(), Flags2).addMBB(TBB); + } else if(Cond[2].isImm()) { + BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). + addImm(Cond[2].getImm()).addMBB(TBB); + } else + llvm_unreachable("Invalid condition for branching"); + } else { + assert((Cond.size() == 2) && "Malformed cond vector"); + const MachineOperand &RO = Cond[1]; + unsigned Flags = getUndefRegState(RO.isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); + } + return 1; } + assert((!Cond.empty()) && + "Cond. 
cannot be empty when multiple branchings are required"); + assert((!isNewValueJump(Cond[0].getImm())) && + "NV-jump cannot be inserted with another branch"); + // Special case for hardware loops. The condition is a basic block. + if (isEndLoopN(Cond[0].getImm())) { + int EndLoopOp = Cond[0].getImm(); + assert(Cond[1].isMBB()); + // Since we're adding an ENDLOOP, there better be a LOOP instruction. + // Check for it, and change the BB target if needed. + SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; + MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); + assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); + Loop->getOperand(0).setMBB(TBB); + // Add the ENDLOOP after the finding the LOOP0. + BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); + } else { + const MachineOperand &RO = Cond[1]; + unsigned Flags = getUndefRegState(RO.isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); + } + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); - return false; + return 2; +} + + +bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + BranchProbability Probability) const { + return nonDbgBBSize(&MBB) <= 3; +} + + +bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, BranchProbability Probability) + const { + return nonDbgBBSize(&TMBB) <= 3 && nonDbgBBSize(&FMBB) <= 3; +} + + +bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs, BranchProbability Probability) const { + return NumInstrs <= 4; } void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { + auto &HRI = getRegisterInfo(); if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) { BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg).addReg(SrcReg); return; @@ -599,28 +644,74 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, addReg(SrcReg, getKillRegState(KillSrc)); return; } + if (Hexagon::PredRegsRegClass.contains(SrcReg) && + Hexagon::IntRegsRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VectorRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_vassign), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecDblRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), + getKillRegState(KillSrc)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), + getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg). + addReg(SrcReg). 
+ addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(SrcReg) && + Hexagon::VectorRegsRegClass.contains(DestReg)) { + llvm_unreachable("Unimplemented pred to vec"); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(DestReg) && + Hexagon::VectorRegsRegClass.contains(SrcReg)) { + llvm_unreachable("Unimplemented vec to pred"); + return; + } + if (Hexagon::VecPredRegs128BRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), + HRI.getSubReg(DestReg, Hexagon::subreg_hireg)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), + getKillRegState(KillSrc)); + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), + HRI.getSubReg(DestReg, Hexagon::subreg_loreg)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), + getKillRegState(KillSrc)); + return; + } +#ifndef NDEBUG + // Show the invalid registers to ease debugging. + dbgs() << "Invalid registers for copy in BB#" << MBB.getNumber() + << ": " << PrintReg(DestReg, &HRI) + << " = " << PrintReg(SrcReg, &HRI) << '\n'; +#endif llvm_unreachable("Unimplemented"); } -void HexagonInstrInfo:: -storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - +void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), Align); if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io)) @@ -640,33 +731,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } -void HexagonInstrInfo::storeRegToAddr( - MachineFunction &MF, unsigned SrcReg, - bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const -{ - llvm_unreachable("Unimplemented"); -} - - -void HexagonInstrInfo:: -loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, unsigned DestReg, int FI, + const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), Align); if (RC == &Hexagon::IntRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); @@ 
-682,27 +757,136 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } -void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - llvm_unreachable("Unimplemented"); -} -bool -HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - const HexagonRegisterInfo &TRI = getRegisterInfo(); +/// expandPostRAPseudo - This function is called for all pseudo instructions +/// that remain after register allocation. Many pseudo instructions are +/// created to help register allocation. This is the place to convert them +/// into real instructions. The target can edit MI in place, or it can insert +/// new instructions and erase MI. The function should return true if +/// anything was changed. +bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) + const { + const HexagonRegisterInfo &HRI = getRegisterInfo(); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); unsigned Opc = MI->getOpcode(); + const unsigned VecOffset = 1; + bool Is128B = false; switch (Opc) { case Hexagon::ALIGNA: BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI->getOperand(0).getReg()) - .addReg(TRI.getFrameRegister()) + .addReg(HRI.getFrameRegister()) .addImm(-MI->getOperand(1).getImm()); MBB.erase(MI); return true; + case Hexagon::HEXAGON_V6_vassignp_128B: + case Hexagon::HEXAGON_V6_vassignp: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + if (SrcReg != DstReg) + copyPhysReg(MBB, MI, DL, DstReg, SrcReg, MI->getOperand(1).isKill()); + MBB.erase(MI); + return true; + } + case Hexagon::HEXAGON_V6_lo_128B: + case Hexagon::HEXAGON_V6_lo: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg); + copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI->getOperand(1).isKill()); + MBB.erase(MI); + MRI.clearKillFlags(SrcSubLo); + return true; + } + case Hexagon::HEXAGON_V6_hi_128B: + case Hexagon::HEXAGON_V6_hi: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg); + copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI->getOperand(1).isKill()); + MBB.erase(MI); + MRI.clearKillFlags(SrcSubHi); + return true; + } + case Hexagon::STrivv_indexed_128B: + Is128B = true; + case Hexagon::STrivv_indexed: { + unsigned SrcReg = MI->getOperand(2).getReg(); + unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg); + unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg); + unsigned NewOpcd = Is128B ? Hexagon::V6_vS32b_ai_128B + : Hexagon::V6_vS32b_ai; + unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6; + MachineInstr *MI1New = BuildMI(MBB, MI, DL, get(NewOpcd)) + .addOperand(MI->getOperand(0)) + .addImm(MI->getOperand(1).getImm()) + .addReg(SrcSubLo) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI1New->getOperand(0).setIsKill(false); + BuildMI(MBB, MI, DL, get(NewOpcd)) + .addOperand(MI->getOperand(0)) + // The Vectors are indexed in multiples of vector size. 
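The Offset computed above is the spacing between the two halves of a vector register pair: one full vector, i.e. VecOffset shifted by log2 of the vector size in bytes. A tiny standalone check of that arithmetic for the 64-byte (512-bit) and 128-byte (128B mode) HVX configurations; illustrative only, independent of the LLVM sources:

#include <cassert>

int main() {
  const unsigned VecOffset = 1;
  // 512-bit HVX: a vector is 64 bytes, so the high half of a pair is
  // stored one vector (1 << 6 bytes) past the low half.
  assert((VecOffset << 6) == 64);
  // 128B mode: 1024-bit vectors, so the halves are 128 bytes apart.
  assert((VecOffset << 7) == 128);
  return 0;
}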
+ .addImm(MI->getOperand(1).getImm()+Offset) + .addReg(SrcSubHi) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::LDrivv_pseudo_V6_128B: + case Hexagon::LDrivv_indexed_128B: + Is128B = true; + case Hexagon::LDrivv_pseudo_V6: + case Hexagon::LDrivv_indexed: { + unsigned NewOpcd = Is128B ? Hexagon::V6_vL32b_ai_128B + : Hexagon::V6_vL32b_ai; + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6; + MachineInstr *MI1New = + BuildMI(MBB, MI, DL, get(NewOpcd), + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)) + .addOperand(MI->getOperand(1)) + .addImm(MI->getOperand(2).getImm()); + MI1New->getOperand(1).setIsKill(false); + BuildMI(MBB, MI, DL, get(NewOpcd), + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)) + .addOperand(MI->getOperand(1)) + // The Vectors are indexed in multiples of vector size. + .addImm(MI->getOperand(2).getImm() + Offset) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::LDriv_pseudo_V6_128B: + Is128B = true; + case Hexagon::LDriv_pseudo_V6: { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewOpc = Is128B ? Hexagon::V6_vL32b_ai_128B + : Hexagon::V6_vL32b_ai; + int32_t Off = MI->getOperand(2).getImm(); + int32_t Idx = Off; + BuildMI(MBB, MI, DL, get(NewOpc), DstReg) + .addOperand(MI->getOperand(1)) + .addImm(Idx) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::STriv_pseudo_V6_128B: + Is128B = true; + case Hexagon::STriv_pseudo_V6: { + unsigned NewOpc = Is128B ? Hexagon::V6_vS32b_ai_128B + : Hexagon::V6_vS32b_ai; + int32_t Off = MI->getOperand(1).getImm(); + int32_t Idx = Is128B ? (Off >> 7) : (Off >> 6); + BuildMI(MBB, MI, DL, get(NewOpc)) + .addOperand(MI->getOperand(0)) + .addImm(Idx) + .addOperand(MI->getOperand(2)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } case Hexagon::TFR_PdTrue: { unsigned Reg = MI->getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg) @@ -724,15 +908,15 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { unsigned DstReg = MI->getOperand(0).getReg(); unsigned Src1Reg = MI->getOperand(1).getReg(); unsigned Src2Reg = MI->getOperand(2).getReg(); - unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); - unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); - unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); - unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), - TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) .addReg(Src2SubHi); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), - TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) .addReg(Src2SubLo); MBB.erase(MI); MRI.clearKillFlags(Src1SubHi); @@ -747,17 +931,17 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { unsigned Src1Reg = MI->getOperand(1).getReg(); 
unsigned Src2Reg = MI->getOperand(2).getReg(); unsigned Src3Reg = MI->getOperand(3).getReg(); - unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); - unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); - unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); - unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); - unsigned Src3SubHi = TRI.getSubReg(Src3Reg, Hexagon::subreg_hireg); - unsigned Src3SubLo = TRI.getSubReg(Src3Reg, Hexagon::subreg_loreg); + unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::subreg_hireg); + unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::subreg_loreg); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), - TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) .addReg(Src2SubHi).addReg(Src3SubHi); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), - TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) .addReg(Src2SubLo).addReg(Src3SubLo); MBB.erase(MI); MRI.clearKillFlags(Src1SubHi); @@ -768,104 +952,168 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MRI.clearKillFlags(Src3SubLo); return true; } + case Hexagon::MUX64_rr: { + const MachineOperand &Op0 = MI->getOperand(0); + const MachineOperand &Op1 = MI->getOperand(1); + const MachineOperand &Op2 = MI->getOperand(2); + const MachineOperand &Op3 = MI->getOperand(3); + unsigned Rd = Op0.getReg(); + unsigned Pu = Op1.getReg(); + unsigned Rs = Op2.getReg(); + unsigned Rt = Op3.getReg(); + DebugLoc DL = MI->getDebugLoc(); + unsigned K1 = getKillRegState(Op1.isKill()); + unsigned K2 = getKillRegState(Op2.isKill()); + unsigned K3 = getKillRegState(Op3.isKill()); + if (Rd != Rs) + BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpt), Rd) + .addReg(Pu, (Rd == Rt) ? K1 : 0) + .addReg(Rs, K2); + if (Rd != Rt) + BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpf), Rd) + .addReg(Pu, K1) + .addReg(Rt, K3); + MBB.erase(MI); + return true; + } case Hexagon::TCRETURNi: MI->setDesc(get(Hexagon::J2_jump)); return true; case Hexagon::TCRETURNr: MI->setDesc(get(Hexagon::J2_jumpr)); return true; + case Hexagon::TFRI_f: + case Hexagon::TFRI_cPt_f: + case Hexagon::TFRI_cNotPt_f: { + unsigned Opx = (Opc == Hexagon::TFRI_f) ? 1 : 2; + APFloat FVal = MI->getOperand(Opx).getFPImm()->getValueAPF(); + APInt IVal = FVal.bitcastToAPInt(); + MI->RemoveOperand(Opx); + unsigned NewOpc = (Opc == Hexagon::TFRI_f) ? Hexagon::A2_tfrsi : + (Opc == Hexagon::TFRI_cPt_f) ? Hexagon::C2_cmoveit : + Hexagon::C2_cmoveif; + MI->setDesc(get(NewOpc)); + MI->addOperand(MachineOperand::CreateImm(IVal.getZExtValue())); + return true; + } } return false; } -MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FI) const { - // Hexagon_TODO: Implement. - return nullptr; + +// We indicate that we want to reverse the branch by +// inserting the reversed branching opcode. 
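An aside on the mechanics this comment describes: reversing a Hexagon branch is a pure opcode substitution on Cond[0]. A minimal standalone sketch of the idea; the opcode names are real Hexagon jump mnemonics, but the lookup table is an illustrative stand-in for the TableGen-generated relation maps (Hexagon::getFalsePredOpcode / Hexagon::getTruePredOpcode) that the backend actually consults:

#include <cassert>
#include <map>

enum Opcode { J2_jumpt, J2_jumpf, J2_jumptnew, J2_jumpfnew };

// Swap a predicated jump for its opposite-sense twin.
Opcode invert(Opcode Opc) {
  static const std::map<Opcode, Opcode> Inv = {
      {J2_jumpt, J2_jumpf},       {J2_jumpf, J2_jumpt},
      {J2_jumptnew, J2_jumpfnew}, {J2_jumpfnew, J2_jumptnew}};
  auto It = Inv.find(Opc);
  assert(It != Inv.end() && "not an invertible branch");
  return It->second;
}

int main() {
  // "if (p0) jump" becomes "if (!p0) jump" and vice versa.
  assert(invert(J2_jumpt) == J2_jumpf);
  assert(invert(invert(J2_jumpfnew)) == J2_jumpfnew);
  return 0;
}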
+bool HexagonInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + if (Cond.empty()) + return true; + assert(Cond[0].isImm() && "First entry in the cond vector not imm-val"); + unsigned opcode = Cond[0].getImm(); + //unsigned temp; + assert(get(opcode).isBranch() && "Should be a branching condition."); + if (isEndLoopN(opcode)) + return true; + unsigned NewOpcode = getInvertedPredicatedOpcode(opcode); + Cond[0].setImm(NewOpcode); + return false; } -unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - const TargetRegisterClass *TRC; - if (VT == MVT::i1) { - TRC = &Hexagon::PredRegsRegClass; - } else if (VT == MVT::i32 || VT == MVT::f32) { - TRC = &Hexagon::IntRegsRegClass; - } else if (VT == MVT::i64 || VT == MVT::f64) { - TRC = &Hexagon::DoubleRegsRegClass; - } else { - llvm_unreachable("Cannot handle this register class"); - } +void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + DebugLoc DL; + BuildMI(MBB, MI, DL, get(Hexagon::A2_nop)); +} - unsigned NewReg = RegInfo.createVirtualRegister(TRC); - return NewReg; + +// Returns true if an instruction is predicated irrespective of the predicate +// sense. For example, all of the following will return true. +// if (p0) R1 = add(R2, R3) +// if (!p0) R1 = add(R2, R3) +// if (p0.new) R1 = add(R2, R3) +// if (!p0.new) R1 = add(R2, R3) +// Note: New-value stores are not included here as in the current +// implementation, we don't need to check their predicate sense. +bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask; } -bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const { - const MCInstrDesc &MID = MI->getDesc(); - const uint64_t F = MID.TSFlags; - if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask) - return true; - // TODO: This is largely obsolete now. Will need to be removed - // in consecutive patches. - switch(MI->getOpcode()) { - // TFR_FI Remains a special case. - case Hexagon::TFR_FI: - return true; - default: - return false; +bool HexagonInstrInfo::PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Cond) const { + if (Cond.empty() || isNewValueJump(Cond[0].getImm()) || + isEndLoopN(Cond[0].getImm())) { + DEBUG(dbgs() << "\nCannot predicate:"; MI->dump();); + return false; } - return false; -} + int Opc = MI->getOpcode(); + assert (isPredicable(MI) && "Expected predicable instruction"); + bool invertJump = predOpcodeHasNot(Cond); -// This returns true in two cases: -// - The OP code itself indicates that this is an extended instruction. -// - One of MOs has been marked with HMOTF_ConstExtended flag. -bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { - // First check if this is permanently extended op code. - const uint64_t F = MI->getDesc().TSFlags; - if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask) - return true; - // Use MO operand flags to determine if one of MI's operands - // has HMOTF_ConstExtended flag set. - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended) - return true; + // We have to predicate MI "in place", i.e. after this function returns, + // MI will need to be transformed into a predicated form. 
To avoid com-
+  // plicated manipulations with the operands (handling tied operands,
+  // etc.), build a new temporary instruction, then overwrite MI with it.
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned PredOpc = getCondOpcode(Opc, invertJump);
+  MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc));
+  unsigned NOp = 0, NumOps = MI->getNumOperands();
+  while (NOp < NumOps) {
+    MachineOperand &Op = MI->getOperand(NOp);
+    if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+      break;
+    T.addOperand(Op);
+    NOp++;
+  }
-  return false;
-}
-bool HexagonInstrInfo::isBranch (const MachineInstr *MI) const {
-  return MI->getDesc().isBranch();
-}
+  unsigned PredReg, PredRegPos, PredRegFlags;
+  bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags);
+  (void)GotPredReg;
+  assert(GotPredReg);
+  T.addReg(PredReg, PredRegFlags);
+  while (NOp < NumOps)
+    T.addOperand(MI->getOperand(NOp++));
-bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
-  if (isNewValueJump(MI))
-    return true;
+  MI->setDesc(get(PredOpc));
+  while (unsigned n = MI->getNumOperands())
+    MI->RemoveOperand(n-1);
+  for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
+    MI->addOperand(T->getOperand(i));
-  if (isNewValueStore(MI))
-    return true;
+  MachineBasicBlock::instr_iterator TI = T->getIterator();
+  B.erase(TI);
-  return false;
+  MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+  MRI.clearKillFlags(PredReg);
+  return true;
 }
-bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const {
-  const uint64_t F = MI->getDesc().TSFlags;
-  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
-}
-bool HexagonInstrInfo::isNewValue(Opcode_t Opcode) const {
-  const uint64_t F = get(Opcode).TSFlags;
-  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+
+bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+      ArrayRef<MachineOperand> Pred2) const {
+  // TODO: Fix this
+  return false;
 }
-bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
-  return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4;
+
+bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
+      std::vector<MachineOperand> &Pred) const {
+  auto &HRI = getRegisterInfo();
+  for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) {
+    MachineOperand MO = MI->getOperand(oper);
+    if (MO.isReg() && MO.isDef()) {
+      const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
+      if (RC == &Hexagon::PredRegsRegClass) {
+        Pred.push_back(MO);
+        return true;
+      }
+    }
+  }
+  return false;
 }
 
 bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
@@ -875,10 +1123,21 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
     return false;
 
   const int Opc = MI->getOpcode();
+  int NumOperands = MI->getNumOperands();
+
+  // Keep a flag for up to 4 operands in the instructions, to indicate if
+  // that operand has been constant extended.
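The predicates used in the switch below (isInt<8>, isUInt<6>, isShiftedUInt<6,3>, ...) are plain bit-width tests from llvm/Support/MathExtras.h, and OpCExtended simply lets a constant-extended operand bypass them. A self-contained sketch of the same checks, with the templates re-implemented so the example compiles on its own:

#include <cassert>
#include <cstdint>

// Re-implementations of the MathExtras-style predicates used here.
template <unsigned N> bool isIntN(int64_t X) {
  return X >= -(int64_t(1) << (N - 1)) && X < (int64_t(1) << (N - 1));
}
template <unsigned N> bool isUIntN(uint64_t X) {
  return X < (uint64_t(1) << N);
}
// An unsigned N-bit value scaled by 2^S, e.g. isShiftedUInt<6,3> for
// the 8-byte-aligned offset of S2_storerd_io.
template <unsigned N, unsigned S> bool isShiftedUIntN(uint64_t X) {
  return isUIntN<N + S>(X) && (X % (uint64_t(1) << S)) == 0;
}

int main() {
  assert(isIntN<8>(-128) && !isIntN<8>(128)); // A2_addi operand range
  assert(isShiftedUIntN<6, 3>(504));          // 63 * 8: in range, aligned
  assert(!isShiftedUIntN<6, 3>(505));         // not 8-byte aligned
  return 0;
}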
+  bool OpCExtended[4];
+  if (NumOperands > 4)
+    NumOperands = 4;
+
+  for (int i = 0; i < NumOperands; i++)
+    OpCExtended[i] = (isOperandExtended(MI, i) && isConstExtended(MI));
 
   switch(Opc) {
   case Hexagon::A2_tfrsi:
-    return (isOperandExtended(MI, 1) && isConstExtended(MI)) || isInt<12>(MI->getOperand(1).getImm());
+    return (isOperandExtended(MI, 1) && isConstExtended(MI)) ||
+           isInt<12>(MI->getOperand(1).getImm());
 
   case Hexagon::S2_storerd_io:
     return isShiftedUInt<6,3>(MI->getOperand(1).getImm());
@@ -926,8 +1185,8 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
   case Hexagon::S4_storeirb_io:
   case Hexagon::S4_storeirh_io:
   case Hexagon::S4_storeiri_io:
-    return (isUInt<6>(MI->getOperand(1).getImm()) &&
-            isInt<6>(MI->getOperand(2).getImm()));
+    return (OpCExtended[1] || isUInt<6>(MI->getOperand(1).getImm())) &&
+           (OpCExtended[2] || isInt<6>(MI->getOperand(2).getImm()));
 
   case Hexagon::A2_addi:
     return isInt<8>(MI->getOperand(2).getImm());
@@ -944,269 +1203,1117 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
   return true;
 }
 
-// This function performs the following inversions:
-//
-// cPt    --->  cNotPt
-// cNotPt --->  cPt
-//
-unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
-  int InvPredOpcode;
-  InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
-                                        : Hexagon::getTruePredOpcode(Opc);
-  if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate.
-    return InvPredOpcode;
-
-  switch(Opc) {
-  default: llvm_unreachable("Unexpected predicated instruction");
-  case Hexagon::C2_ccombinewt:
-    return Hexagon::C2_ccombinewf;
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+      const MachineBasicBlock *MBB, const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI->isDebugValue())
+    return false;
+
+  // Throwing call is a boundary.
+  if (MI->isCall()) {
+    // If any of the block's successors is a landing pad, this could be a
+    // throwing call.
+    for (auto I : MBB->successors())
+      if (I->isEHPad())
+        return true;
+  }
+
+  // Don't mess around with no return calls.
+  if (MI->getOpcode() == Hexagon::CALLv3nr)
+    return true;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI->getDesc().isTerminator() || MI->isPosition())
+    return true;
+
+  if (MI->isInlineAsm() && !ScheduleInlineAsm)
+    return true;
+
+  return false;
+}
+
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// Hexagon counts the number of ##'s and adjusts for that many
+/// constant extenders.
+unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
+      const MCAsmInfo &MAI) const {
+  StringRef AStr(Str);
+  // Count the number of instructions in the asm.
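A worked example of the accounting described above, as a standalone sketch. It assumes a 4-byte maximum instruction length (the Hexagon word size), newline-separated statements, and "//" comments; the real implementation below takes all three from MCAsmInfo:

#include <cassert>
#include <sstream>
#include <string>

unsigned inlineAsmLength(const std::string &Asm) {
  const unsigned MaxInstLength = 4;
  unsigned Len = 0;
  std::string Stmt;
  std::istringstream SS(Asm);
  // Charge one maximum-length instruction per non-blank, non-comment line.
  while (std::getline(SS, Stmt)) {
    size_t First = Stmt.find_first_not_of(" \t");
    if (First != std::string::npos && Stmt.compare(First, 2, "//") != 0)
      Len += MaxInstLength;
  }
  // Each "##" marks an immediate carried by a 4-byte constant extender.
  for (size_t Pos = Asm.find("##"); Pos != std::string::npos;
       Pos = Asm.find("##", Pos + 2))
    Len += 4;
  return Len;
}

int main() {
  // Two instructions plus one constant extender: 2*4 + 4 = 12 bytes.
  assert(inlineAsmLength("r0 = ##12345\nr1 = add(r1, r0)") == 12);
  return 0;
}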
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+                               strlen(MAI.getCommentString())) == 0)
+      atInsnStart = false;
+  }
+
+  // Add 4 bytes for each constant extender (##) seen.
+  StringRef Occ("##");
+  Length += AStr.count(Occ)*4;
+  return Length;
+}
+
+
+ScheduleHazardRecognizer*
+HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
+      const InstrItineraryData *II, const ScheduleDAG *DAG) const {
+  return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+}
+
+
+/// \brief For a comparison instruction, return the source registers in
+/// \p SrcReg and \p SrcReg2 if it has two register operands, and the value it
+/// compares against in \p Value. Return true if the comparison instruction
+/// can be analyzed.
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
+      unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const {
+  unsigned Opc = MI->getOpcode();
+
+  // Set mask and the first source register.
+  switch (Opc) {
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpeqp:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtp:
+  case Hexagon::C2_cmpgtu:
+  case Hexagon::C2_cmpgtup:
+  case Hexagon::C4_cmpneq:
+  case Hexagon::C4_cmplte:
+  case Hexagon::C4_cmplteu:
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgtui:
+  case Hexagon::C4_cmpneqi:
+  case Hexagon::C4_cmplteui:
+  case Hexagon::C4_cmpltei:
+    SrcReg = MI->getOperand(1).getReg();
+    Mask = ~0;
+    break;
+  case Hexagon::A4_cmpbeq:
+  case Hexagon::A4_cmpbgt:
+  case Hexagon::A4_cmpbgtu:
+  case Hexagon::A4_cmpbeqi:
+  case Hexagon::A4_cmpbgti:
+  case Hexagon::A4_cmpbgtui:
+    SrcReg = MI->getOperand(1).getReg();
+    Mask = 0xFF;
+    break;
+  case Hexagon::A4_cmpheq:
+  case Hexagon::A4_cmphgt:
+  case Hexagon::A4_cmphgtu:
+  case Hexagon::A4_cmpheqi:
+  case Hexagon::A4_cmphgti:
+  case Hexagon::A4_cmphgtui:
+    SrcReg = MI->getOperand(1).getReg();
+    Mask = 0xFFFF;
+    break;
+  }
+
+  // Set the value/second source register.
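For a concrete feel of the contract before the second switch below: a word compare such as p0 = cmp.eq(r5, #10) (C2_cmpeqi) should come back with SrcReg = r5, SrcReg2 = 0, Mask = ~0 and Value = 10, while the byte forms report Mask = 0xFF. A simplified standalone model of the two operand shapes; the Inst struct is a stand-in for MachineInstr invented for the example:

#include <cassert>
#include <cstdint>

enum Opc { C2_cmpeq, C2_cmpeqi, A4_cmpbeqi };
struct Inst { Opc Op; unsigned Reg1; unsigned Reg2; int64_t Imm; };

bool analyzeCompare(const Inst &MI, unsigned &SrcReg, unsigned &SrcReg2,
                    int &Mask, int &Value) {
  SrcReg = MI.Reg1;
  switch (MI.Op) {
  case C2_cmpeq:      // reg-reg word compare
    Mask = ~0;
    SrcReg2 = MI.Reg2;
    return true;
  case C2_cmpeqi:     // reg-imm word compare
    Mask = ~0;
    SrcReg2 = 0;
    Value = (int)MI.Imm;
    return true;
  case A4_cmpbeqi:    // reg-imm byte compare: only the low 8 bits matter
    Mask = 0xFF;
    SrcReg2 = 0;
    Value = (int)MI.Imm;
    return true;
  }
  return false;
}

int main() {
  unsigned R1, R2;
  int M, V;
  Inst Cmp{A4_cmpbeqi, /*Reg1=*/5, /*Reg2=*/0, /*Imm=*/10};
  assert(analyzeCompare(Cmp, R1, R2, M, V));
  assert(R1 == 5 && R2 == 0 && M == 0xFF && V == 10);
  return 0;
}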
+  switch (Opc) {
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpeqp:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtp:
+  case Hexagon::C2_cmpgtu:
+  case Hexagon::C2_cmpgtup:
+  case Hexagon::A4_cmpbeq:
+  case Hexagon::A4_cmpbgt:
+  case Hexagon::A4_cmpbgtu:
+  case Hexagon::A4_cmpheq:
+  case Hexagon::A4_cmphgt:
+  case Hexagon::A4_cmphgtu:
+  case Hexagon::C4_cmpneq:
+  case Hexagon::C4_cmplte:
+  case Hexagon::C4_cmplteu:
+    SrcReg2 = MI->getOperand(2).getReg();
+    return true;
+
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgtui:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C4_cmpneqi:
+  case Hexagon::C4_cmplteui:
+  case Hexagon::C4_cmpltei:
+  case Hexagon::A4_cmpbeqi:
+  case Hexagon::A4_cmpbgti:
+  case Hexagon::A4_cmpbgtui:
+  case Hexagon::A4_cmpheqi:
+  case Hexagon::A4_cmphgti:
+  case Hexagon::A4_cmphgtui:
+    SrcReg2 = 0;
+    Value = MI->getOperand(2).getImm();
+    return true;
+  }
+
+  return false;
+}
+
+
+unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+      const MachineInstr *MI, unsigned *PredCost) const {
+  return getInstrTimingClassLatency(ItinData, MI);
+}
+
+
+DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
+      const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *II = STI.getInstrItineraryData();
+  return static_cast<const HexagonSubtarget&>(STI).createDFAPacketizer(II);
+}
+
+
+// Inspired by this pair:
+//   %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
+//   S2_storeri_io %R29, 132, %R1<kill>; flags: mem:ST4[FixedStack1]
+// Currently AA considers the addresses in these instructions to be aliasing.
+bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+      MachineInstr *MIb, AliasAnalysis *AA) const {
+  int OffsetA = 0, OffsetB = 0;
+  unsigned SizeA = 0, SizeB = 0;
+
+  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
+      MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+    return false;
+
+  // Instructions that are pure loads, not loads and stores like memops are not
+  // dependent.
+  if (MIa->mayLoad() && !isMemOp(MIa) && MIb->mayLoad() && !isMemOp(MIb))
+    return true;
+
+  // Get base, offset, and access size in MIa.
+  unsigned BaseRegA = getBaseAndOffset(MIa, OffsetA, SizeA);
+  if (!BaseRegA || !SizeA)
+    return false;
+
+  // Get base, offset, and access size in MIb.
+  unsigned BaseRegB = getBaseAndOffset(MIb, OffsetB, SizeB);
+  if (!BaseRegB || !SizeB)
+    return false;
+
+  if (BaseRegA != BaseRegB)
+    return false;
+
+  // This is a mem access with the same base register and known offsets from it.
+  // Reason about it.
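The interval reasoning that follows is simple: given a common base register, the two accesses are disjoint exactly when the lower one ends at or before the higher one starts. A standalone check, exercised with the pair from the comment above (a 4-byte load at r29+136 against a 4-byte store at r29+132):

#include <cassert>
#include <cstdint>

bool disjoint(int64_t OffA, uint64_t SizeA, int64_t OffB, uint64_t SizeB) {
  if (OffA > OffB)
    return SizeB <= (uint64_t)(OffA - OffB);
  if (OffA < OffB)
    return SizeA <= (uint64_t)(OffB - OffA);
  return false; // identical offsets always overlap
}

int main() {
  // LD4 at r29+136 vs. ST4 at r29+132: 136 - 132 = 4 >= 4, so no overlap.
  assert(disjoint(136, 4, 132, 4));
  // An 8-byte store at r29+132 reaches up to r29+140 and does overlap.
  assert(!disjoint(136, 4, 132, 8));
  return 0;
}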
+  if (OffsetA > OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetA - (int64_t)OffsetB);
+    return (SizeB <= offDiff);
+  } else if (OffsetA < OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetB - (int64_t)OffsetA);
+    return (SizeA <= offDiff);
+  }
+
+  return false;
+}
+
+
+unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *TRC;
+  if (VT == MVT::i1) {
+    TRC = &Hexagon::PredRegsRegClass;
+  } else if (VT == MVT::i32 || VT == MVT::f32) {
+    TRC = &Hexagon::IntRegsRegClass;
+  } else if (VT == MVT::i64 || VT == MVT::f64) {
+    TRC = &Hexagon::DoubleRegsRegClass;
+  } else {
+    llvm_unreachable("Cannot handle this register class");
+  }
+
+  unsigned NewReg = MRI.createVirtualRegister(TRC);
+  return NewReg;
+}
+
+
+bool HexagonInstrInfo::isAbsoluteSet(const MachineInstr* MI) const {
+  return (getAddrMode(MI) == HexagonII::AbsoluteSet);
+}
+
+
+bool HexagonInstrInfo::isAccumulator(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  return ((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
+
+bool HexagonInstrInfo::isComplex(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  if (!(isTC1(MI))
+      && !(QII->isTC2Early(MI))
+      && !(MI->getDesc().mayLoad())
+      && !(MI->getDesc().mayStore())
+      && (MI->getDesc().getOpcode() != Hexagon::S2_allocframe)
+      && (MI->getDesc().getOpcode() != Hexagon::L2_deallocframe)
+      && !(QII->isMemOp(MI))
+      && !(MI->isBranch())
+      && !(MI->isReturn())
+      && !MI->isCall())
+    return true;
+
+  return false;
+}
+
+
+// Return true if the instruction is a compound branch instruction.
+bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr *MI) const {
+  return (getType(MI) == HexagonII::TypeCOMPOUND && MI->isBranch());
+}
+
+
+bool HexagonInstrInfo::isCondInst(const MachineInstr *MI) const {
+  return (MI->isBranch() && isPredicated(MI)) ||
+          isConditionalTransfer(MI) ||
+          isConditionalALU32(MI) ||
+          isConditionalLoad(MI) ||
+          // Predicated stores which don't have a .new on any operands.
+          (MI->mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
+           !isPredicatedNew(MI));
+}
+
+
+bool HexagonInstrInfo::isConditionalALU32(const MachineInstr* MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::A2_paddf:
+  case Hexagon::A2_paddfnew:
+  case Hexagon::A2_paddif:
+  case Hexagon::A2_paddifnew:
+  case Hexagon::A2_paddit:
+  case Hexagon::A2_padditnew:
+  case Hexagon::A2_paddt:
+  case Hexagon::A2_paddtnew:
+  case Hexagon::A2_pandf:
+  case Hexagon::A2_pandfnew:
+  case Hexagon::A2_pandt:
+  case Hexagon::A2_pandtnew:
+  case Hexagon::A2_porf:
+  case Hexagon::A2_porfnew:
+  case Hexagon::A2_port:
+  case Hexagon::A2_portnew:
+  case Hexagon::A2_psubf:
+  case Hexagon::A2_psubfnew:
+  case Hexagon::A2_psubt:
+  case Hexagon::A2_psubtnew:
+  case Hexagon::A2_pxorf:
+  case Hexagon::A2_pxorfnew:
+  case Hexagon::A2_pxort:
+  case Hexagon::A2_pxortnew:
+  case Hexagon::A4_paslhf:
+  case Hexagon::A4_paslhfnew:
+  case Hexagon::A4_paslht:
+  case Hexagon::A4_paslhtnew:
+  case Hexagon::A4_pasrhf:
+  case Hexagon::A4_pasrhfnew:
+  case Hexagon::A4_pasrht:
+  case Hexagon::A4_pasrhtnew:
+  case Hexagon::A4_psxtbf:
+  case Hexagon::A4_psxtbfnew:
+  case Hexagon::A4_psxtbt:
+  case Hexagon::A4_psxtbtnew:
+  case Hexagon::A4_psxthf:
+  case Hexagon::A4_psxthfnew:
+  case Hexagon::A4_psxtht:
+  case Hexagon::A4_psxthtnew:
+  case Hexagon::A4_pzxtbf:
+  case Hexagon::A4_pzxtbfnew:
+  case Hexagon::A4_pzxtbt:
+  case Hexagon::A4_pzxtbtnew:
+  case Hexagon::A4_pzxthf:
+  case Hexagon::A4_pzxthfnew:
+  case Hexagon::A4_pzxtht:
+  case Hexagon::A4_pzxthtnew:
   case Hexagon::C2_ccombinewf:
-    return Hexagon::C2_ccombinewt;
+  case Hexagon::C2_ccombinewt:
+    return true;
+  }
+  return false;
+}
+
+
+// FIXME - Function name and its functionality don't match.
+// It should be renamed to hasPredNewOpcode()
+bool HexagonInstrInfo::isConditionalLoad(const MachineInstr* MI) const {
+  if (!MI->getDesc().mayLoad() || !isPredicated(MI))
+    return false;
+
+  int PNewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
+  // Instruction with valid predicated-new opcode can be promoted to .new.
+  return PNewOpcode >= 0;
+}
+
-  // Dealloc_return.
-  case Hexagon::L4_return_t:
-    return Hexagon::L4_return_f;
-  case Hexagon::L4_return_f:
-    return Hexagon::L4_return_t;
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
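A side note on the opcode lists in this family of predicates: the mnemonics encode the predicate sense with a trailing 't'/'f' and a .new predicate with a trailing "new" (A2_paddt, A2_paddf, A2_padditnew, ...). A throwaway sketch decoding that convention from the name; purely illustrative, since the backend reads TSFlags bits rather than strings:

#include <cassert>
#include <string>

struct PredInfo { bool Predicated; bool SenseTrue; bool DotNew; };

PredInfo decode(std::string Name) {
  PredInfo P{false, false, false};
  // Strip a trailing "new" first, then read the sense letter.
  if (Name.size() > 3 && Name.compare(Name.size() - 3, 3, "new") == 0) {
    P.DotNew = true;
    Name.erase(Name.size() - 3);
  }
  char Sense = Name.empty() ? 0 : Name.back();
  if (Sense == 't' || Sense == 'f') {
    P.Predicated = true;
    P.SenseTrue = (Sense == 't');
  }
  return P;
}

int main() {
  assert(decode("A2_paddt").SenseTrue && !decode("A2_paddt").DotNew);
  assert(!decode("A2_paddf").SenseTrue);
  assert(decode("A2_padditnew").DotNew && decode("A2_padditnew").SenseTrue);
  return 0;
}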
+bool HexagonInstrInfo::isConditionalStore(const MachineInstr* MI) const {
+  switch (MI->getOpcode()) {
+  default: return false;
+  case Hexagon::S4_storeirbt_io:
+  case Hexagon::S4_storeirbf_io:
+  case Hexagon::S4_pstorerbt_rr:
+  case Hexagon::S4_pstorerbf_rr:
+  case Hexagon::S2_pstorerbt_io:
+  case Hexagon::S2_pstorerbf_io:
+  case Hexagon::S2_pstorerbt_pi:
+  case Hexagon::S2_pstorerbf_pi:
+  case Hexagon::S2_pstorerdt_io:
+  case Hexagon::S2_pstorerdf_io:
+  case Hexagon::S4_pstorerdt_rr:
+  case Hexagon::S4_pstorerdf_rr:
+  case Hexagon::S2_pstorerdt_pi:
+  case Hexagon::S2_pstorerdf_pi:
+  case Hexagon::S2_pstorerht_io:
+  case Hexagon::S2_pstorerhf_io:
+  case Hexagon::S4_storeirht_io:
+  case Hexagon::S4_storeirhf_io:
+  case Hexagon::S4_pstorerht_rr:
+  case Hexagon::S4_pstorerhf_rr:
+  case Hexagon::S2_pstorerht_pi:
+  case Hexagon::S2_pstorerhf_pi:
+  case Hexagon::S2_pstorerit_io:
+  case Hexagon::S2_pstorerif_io:
+  case Hexagon::S4_storeirit_io:
+  case Hexagon::S4_storeirif_io:
+  case Hexagon::S4_pstorerit_rr:
+  case Hexagon::S4_pstorerif_rr:
+  case Hexagon::S2_pstorerit_pi:
+  case Hexagon::S2_pstorerif_pi:
+
+  // V4 global address store before promoting to dot new.
+  case Hexagon::S4_pstorerdt_abs:
+  case Hexagon::S4_pstorerdf_abs:
+  case Hexagon::S4_pstorerbt_abs:
+  case Hexagon::S4_pstorerbf_abs:
+  case Hexagon::S4_pstorerht_abs:
+  case Hexagon::S4_pstorerhf_abs:
+  case Hexagon::S4_pstorerit_abs:
+  case Hexagon::S4_pstorerif_abs:
+    return true;
+
+  // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+  // from the "Conditional Store" list, because a predicated new value store
+  // would NOT be promoted to a double dot new store.
+  // This function returns true for those stores that are predicated but not
+  // yet promoted to predicate dot new instructions.
+  }
 }
 
-// New Value Store instructions.
-bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+
+bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::A2_tfrt:
+  case Hexagon::A2_tfrf:
+  case Hexagon::C2_cmoveit:
+  case Hexagon::C2_cmoveif:
+  case Hexagon::A2_tfrtnew:
+  case Hexagon::A2_tfrfnew:
+  case Hexagon::C2_cmovenewit:
+  case Hexagon::C2_cmovenewif:
+  case Hexagon::A2_tfrpt:
+  case Hexagon::A2_tfrpf:
+    return true;
+
+  default:
+    return false;
+  }
+  return false;
+}
+
+
+// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
+// isFPImm and later getFPImm as well.
+bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const {
   const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+  if (isExtended) // Instruction must be extended.
+    return true;
+
+  unsigned isExtendable =
+    (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+  if (!isExtendable)
+    return false;
+
+  if (MI->isCall())
+    return false;
+
+  short ExtOpNum = getCExtOpNum(MI);
+  const MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // Use MO operand flags to determine if MO
+  // has the HMOTF_ConstExtended flag set.
+  if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+    return true;
+  // If this is a Machine BB address we are talking about, and it is
+  // not marked as extended, say so.
+  if (MO.isMBB())
+    return false;
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() ||
+      MO.isJTI() || MO.isCPI())
+    return true;
-  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int MinValue = getMinValue(MI);
+  int MaxValue = getMaxValue(MI);
+  int ImmValue = MO.getImm();
+
+  return (ImmValue < MinValue || ImmValue > MaxValue);
 }
 
-bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+
+bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::L4_return :
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+  case Hexagon::L4_return_fnew_pt :
+    return true;
+  }
+  return false;
+}
+
+
+// Return true when ConsMI uses a register defined by ProdMI.
+bool HexagonInstrInfo::isDependent(const MachineInstr *ProdMI,
+      const MachineInstr *ConsMI) const {
+  const MCInstrDesc &ProdMCID = ProdMI->getDesc();
+  if (!ProdMCID.getNumDefs())
+    return false;
+
+  auto &HRI = getRegisterInfo();
+
+  SmallVector<unsigned, 4> DefsA;
+  SmallVector<unsigned, 4> DefsB;
+  SmallVector<unsigned, 8> UsesA;
+  SmallVector<unsigned, 8> UsesB;
+
+  parseOperands(ProdMI, DefsA, UsesA);
+  parseOperands(ConsMI, DefsB, UsesB);
+
+  for (auto &RegA : DefsA)
+    for (auto &RegB : UsesB) {
+      // True data dependency.
+      if (RegA == RegB)
+        return true;
+
+      if (Hexagon::DoubleRegsRegClass.contains(RegA))
+        for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs)
+          if (RegB == *SubRegs)
+            return true;
+
+      if (Hexagon::DoubleRegsRegClass.contains(RegB))
+        for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs)
+          if (RegA == *SubRegs)
+            return true;
+    }
+
+  return false;
+}
+
+
+// Returns true if the instruction is already a .cur.
+bool HexagonInstrInfo::isDotCurInst(const MachineInstr* MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::V6_vL32b_cur_pi:
+  case Hexagon::V6_vL32b_cur_ai:
+  case Hexagon::V6_vL32b_cur_pi_128B:
+  case Hexagon::V6_vL32b_cur_ai_128B:
+    return true;
+  }
+  return false;
+}
+
+
+// Returns true if any one of the operands is a dot new
+// insn, whether it is predicated dot new or register dot new.
+bool HexagonInstrInfo::isDotNewInst(const MachineInstr* MI) const {
+  if (isNewValueInst(MI) ||
+      (isPredicated(MI) && isPredicatedNew(MI)))
+    return true;
+
+  return false;
+}
+
+
+/// Symmetrical. See if these two instructions are fit for a duplex pair.
+bool HexagonInstrInfo::isDuplexPair(const MachineInstr *MIa,
+      const MachineInstr *MIb) const {
+  HexagonII::SubInstructionGroup MIaG = getDuplexCandidateGroup(MIa);
+  HexagonII::SubInstructionGroup MIbG = getDuplexCandidateGroup(MIb);
+  return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG));
+}
+
+
+bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr *MI) const {
+  if (!MI)
+    return false;
+
+  if (MI->mayLoad() || MI->mayStore() || MI->isCompare())
+    return true;
+
+  // Multiply
+  unsigned SchedClass = MI->getDesc().getSchedClass();
+  if (SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23)
+    return true;
+  return false;
+}
+
+
+bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const {
+  return (Opcode == Hexagon::ENDLOOP0 ||
+          Opcode == Hexagon::ENDLOOP1);
+}
+
+
+bool HexagonInstrInfo::isExpr(unsigned OpType) const {
+  switch(OpType) {
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
+  const MCInstrDesc &MID = MI->getDesc();
+  const uint64_t F = MID.TSFlags;
+  if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
+    return true;
+
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in consecutive patches.
+  switch(MI->getOpcode()) {
+  // TFR_FI Remains a special case.
+  case Hexagon::TFR_FI:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
+bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
+  // First check if this is permanently extended op code.
+  const uint64_t F = MI->getDesc().TSFlags;
+  if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+    return true;
+  // Use MO operand flags to determine if one of MI's operands
+  // has HMOTF_ConstExtended flag set.
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+       E = MI->operands_end(); I != E; ++I) {
+    if (I->getTargetFlags() & HexagonII::HMOTF_ConstExtended)
+      return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::isFloat(const MachineInstr *MI) const {
+  unsigned Opcode = MI->getOpcode();
   const uint64_t F = get(Opcode).TSFlags;
+  return (F >> HexagonII::FPPos) & HexagonII::FPMask;
+}
-  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+
+// No V60 HVX VMEM with A_INDIRECT.
+bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr *I,
+      const MachineInstr *J) const {
+  if (!isV60VectorInstruction(I))
+    return false;
+  if (!I->mayLoad() && !I->mayStore())
+    return false;
+  return J->isIndirectBranch() || isIndirectCall(J) || isIndirectL4Return(J);
 }
 
-int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
-  enum Hexagon::PredSense inPredSense;
-  inPredSense = invertPredicate ? Hexagon::PredSense_false :
-                                  Hexagon::PredSense_true;
-  int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
-  if (CondOpcode >= 0) // Valid Conditional opcode/instruction
-    return CondOpcode;
-
-  // This switch case will be removed once all the instructions have been
-  // modified to use relation maps.
-  switch(Opc) {
-  case Hexagon::TFRI_f:
-    return !invertPredicate ? Hexagon::TFRI_cPt_f :
-                              Hexagon::TFRI_cNotPt_f;
-  case Hexagon::A2_combinew:
-    return !invertPredicate ? Hexagon::C2_ccombinewt :
-                              Hexagon::C2_ccombinewf;
+bool HexagonInstrInfo::isIndirectCall(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::J2_callr :
+  case Hexagon::J2_callrf :
+  case Hexagon::J2_callrt :
+    return true;
+  }
+  return false;
+}
 
-  // DEALLOC_RETURN.
-  case Hexagon::L4_return:
-    return !invertPredicate ? Hexagon::L4_return_t:
-                              Hexagon::L4_return_f;
+
+bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::L4_return :
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_fnew_pt :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+    return true;
   }
-  llvm_unreachable("Unexpected predicable instruction");
+  return false;
 }
 
-bool HexagonInstrInfo::
-PredicateInstruction(MachineInstr *MI,
-                     ArrayRef<MachineOperand> Cond) const {
-  if (Cond.empty() || isEndLoopN(Cond[0].getImm())) {
-    DEBUG(dbgs() << "\nCannot predicate:"; MI->dump(););
+
+bool HexagonInstrInfo::isJumpR(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::J2_jumpr :
+  case Hexagon::J2_jumprt :
+  case Hexagon::J2_jumprf :
+  case Hexagon::J2_jumprtnewpt :
+  case Hexagon::J2_jumprfnewpt :
+  case Hexagon::J2_jumprtnew :
+  case Hexagon::J2_jumprfnew :
+    return true;
+  }
+  return false;
+}
+
+
+// Return true if a given MI can accommodate the given offset.
+// Use an abs estimate as opposed to the exact number.
+// TODO: This will need to be changed to use MC level
+// definition of instruction extendable field size.
+bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr *MI,
+      unsigned offset) const {
+  // This selection of jump instructions matches what
+  // AnalyzeBranch can parse, plus NVJ.
+  if (isNewValueJump(MI)) // r9:2
+    return isInt<11>(offset);
+
+  switch (MI->getOpcode()) {
+  // Still missing Jump to address condition on register value.
+  default: return false;
+  case Hexagon::J2_jump: // bits<24> dst; // r22:2
+  case Hexagon::J2_call:
+  case Hexagon::CALLv3nr:
+    return isInt<24>(offset);
+  case Hexagon::J2_jumpt: //bits<17> dst; // r15:2
+  case Hexagon::J2_jumpf:
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumpfnewpt:
+  case Hexagon::J2_callt:
+  case Hexagon::J2_callf:
+    return isInt<17>(offset);
+  case Hexagon::J2_loop0i:
+  case Hexagon::J2_loop0iext:
+  case Hexagon::J2_loop0r:
+  case Hexagon::J2_loop0rext:
+  case Hexagon::J2_loop1i:
+  case Hexagon::J2_loop1iext:
+  case Hexagon::J2_loop1r:
+  case Hexagon::J2_loop1rext:
+    return isInt<9>(offset);
+  // TODO: Add all the compound branches here. Can we do this in Relation model?
+  case Hexagon::J4_cmpeqi_tp0_jump_nt:
+  case Hexagon::J4_cmpeqi_tp1_jump_nt:
+    return isInt<11>(offset);
   }
-  int Opc = MI->getOpcode();
-  assert (isPredicable(MI) && "Expected predicable instruction");
-  bool invertJump = predOpcodeHasNot(Cond);
+}
 
-  // We have to predicate MI "in place", i.e. after this function returns,
-  // MI will need to be transformed into a predicated form. To avoid com-
-  // plicated manipulations with the operands (handling tied operands,
-  // etc.), build a new temporary instruction, then overwrite MI with it.
- MachineBasicBlock &B = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned PredOpc = getCondOpcode(Opc, invertJump); - MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc)); - unsigned NOp = 0, NumOps = MI->getNumOperands(); - while (NOp < NumOps) { - MachineOperand &Op = MI->getOperand(NOp); - if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) - break; - T.addOperand(Op); - NOp++; +bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI, + const MachineInstr *ESMI) const { + if (!LRMI || !ESMI) + return false; + + bool isLate = isLateResultInstr(LRMI); + bool isEarly = isEarlySourceInstr(ESMI); + + DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- ")); + DEBUG(LRMI->dump()); + DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- ")); + DEBUG(ESMI->dump()); + + if (isLate && isEarly) { + DEBUG(dbgs() << "++Is Late Result feeding Early Source\n"); + return true; } - unsigned PredReg, PredRegPos, PredRegFlags; - bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags); - (void)GotPredReg; - assert(GotPredReg); - T.addReg(PredReg, PredRegFlags); - while (NOp < NumOps) - T.addOperand(MI->getOperand(NOp++)); + return false; +} - MI->setDesc(get(PredOpc)); - while (unsigned n = MI->getNumOperands()) - MI->RemoveOperand(n-1); - for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i) - MI->addOperand(T->getOperand(i)); - MachineBasicBlock::instr_iterator TI = &*T; - B.erase(TI); +bool HexagonInstrInfo::isLateResultInstr(const MachineInstr *MI) const { + if (!MI) + return false; - MachineRegisterInfo &MRI = B.getParent()->getRegInfo(); - MRI.clearKillFlags(PredReg); + switch (MI->getOpcode()) { + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + case TargetOpcode::PHI: + return false; + default: + break; + } + unsigned SchedClass = MI->getDesc().getSchedClass(); + + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123: + case Hexagon::Sched::ALU64_tc_1_SLOT23: + case Hexagon::Sched::EXTENDER_tc_1_SLOT0123: + case Hexagon::Sched::S_2op_tc_1_SLOT23: + case Hexagon::Sched::S_3op_tc_1_SLOT23: + case Hexagon::Sched::V2LDST_tc_ld_SLOT01: + case Hexagon::Sched::V2LDST_tc_st_SLOT0: + case Hexagon::Sched::V2LDST_tc_st_SLOT01: + case Hexagon::Sched::V4LDST_tc_ld_SLOT01: + case Hexagon::Sched::V4LDST_tc_st_SLOT0: + case Hexagon::Sched::V4LDST_tc_st_SLOT01: + return false; + } return true; } -bool -HexagonInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const { - return true; +bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr *MI) const { + if (!MI) + return false; + + // Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply + // resource, but all operands can be received late like an ALU instruction. 
+ return MI->getDesc().getSchedClass() == Hexagon::Sched::CVI_VX_LATE; } -bool -HexagonInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, - unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, - unsigned ExtraFCycles, - const BranchProbability &Probability) const { - return true; +bool HexagonInstrInfo::isLoopN(const MachineInstr *MI) const { + unsigned Opcode = MI->getOpcode(); + return Opcode == Hexagon::J2_loop0i || + Opcode == Hexagon::J2_loop0r || + Opcode == Hexagon::J2_loop0iext || + Opcode == Hexagon::J2_loop0rext || + Opcode == Hexagon::J2_loop1i || + Opcode == Hexagon::J2_loop1r || + Opcode == Hexagon::J2_loop1iext || + Opcode == Hexagon::J2_loop1rext; +} + + +bool HexagonInstrInfo::isMemOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::L4_iadd_memopw_io : + case Hexagon::L4_isub_memopw_io : + case Hexagon::L4_add_memopw_io : + case Hexagon::L4_sub_memopw_io : + case Hexagon::L4_and_memopw_io : + case Hexagon::L4_or_memopw_io : + case Hexagon::L4_iadd_memoph_io : + case Hexagon::L4_isub_memoph_io : + case Hexagon::L4_add_memoph_io : + case Hexagon::L4_sub_memoph_io : + case Hexagon::L4_and_memoph_io : + case Hexagon::L4_or_memoph_io : + case Hexagon::L4_iadd_memopb_io : + case Hexagon::L4_isub_memopb_io : + case Hexagon::L4_add_memopb_io : + case Hexagon::L4_sub_memopb_io : + case Hexagon::L4_and_memopb_io : + case Hexagon::L4_or_memopb_io : + case Hexagon::L4_ior_memopb_io: + case Hexagon::L4_ior_memoph_io: + case Hexagon::L4_ior_memopw_io: + case Hexagon::L4_iand_memopb_io: + case Hexagon::L4_iand_memoph_io: + case Hexagon::L4_iand_memopw_io: + return true; + } + return false; } -// Returns true if an instruction is predicated irrespective of the predicate -// sense. For example, all of the following will return true. 
-// if (p0) R1 = add(R2, R3) -// if (!p0) R1 = add(R2, R3) -// if (p0.new) R1 = add(R2, R3) -// if (!p0.new) R1 = add(R2, R3) -bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); +bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask; } -bool HexagonInstrInfo::isPredicated(unsigned Opcode) const { + +bool HexagonInstrInfo::isNewValue(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask; +} + - return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); +bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const { + return isNewValueJump(MI) || isNewValueStore(MI); } -bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - assert(isPredicated(MI)); - return (!((F >> HexagonII::PredicatedFalsePos) & - HexagonII::PredicatedFalseMask)); +bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { + return isNewValue(MI) && MI->isBranch(); } -bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const { + +bool HexagonInstrInfo::isNewValueJump(unsigned Opcode) const { + return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode); +} + + +bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask; +} + + +bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask; +} - // Make sure that the instruction is predicated. - assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); - return (!((F >> HexagonII::PredicatedFalsePos) & - HexagonII::PredicatedFalseMask)); + +// Returns true if a particular operand is extendable for an instruction. +bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI, + unsigned OperandNum) const { + const uint64_t F = MI->getDesc().TSFlags; + return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) + == OperandNum; } + +bool HexagonInstrInfo::isPostIncrement(const MachineInstr* MI) const { + return getAddrMode(MI) == HexagonII::PostInc; +} + + bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; - assert(isPredicated(MI)); - return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); + return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask; } + bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; - assert(isPredicated(Opcode)); - return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); + return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask; } -// Returns true, if a ST insn can be promoted to a new-value store. 
-bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const { + +bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; + return !((F >> HexagonII::PredicatedFalsePos) & + HexagonII::PredicatedFalseMask); +} + - return ((F >> HexagonII::mayNVStorePos) & - HexagonII::mayNVStoreMask); +bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + // Make sure that the instruction is predicated. + assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); + return !((F >> HexagonII::PredicatedFalsePos) & + HexagonII::PredicatedFalseMask); } -bool -HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const { - for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) { - MachineOperand MO = MI->getOperand(oper); - if (MO.isReg() && MO.isDef()) { - const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg()); - if (RC == &Hexagon::PredRegsRegClass) { - Pred.push_back(MO); - return true; - } - } + +bool HexagonInstrInfo::isPredicated(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask; +} + + +bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask; +} + + +bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + assert(get(Opcode).isBranch() && + (isPredicatedNew(Opcode) || isNewValue(Opcode))); + return (F >> HexagonII::TakenPos) & HexagonII::TakenMask; +} + + +bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const { + return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4 || + MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT; +} + + +bool HexagonInstrInfo::isSolo(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::SoloPos) & HexagonII::SoloMask; +} + + +bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case Hexagon::STriw_pred : + case Hexagon::LDriw_pred : + return true; + default: + return false; } - return false; } -bool -HexagonInstrInfo:: -SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const { - // TODO: Fix this - return false; +// Returns true when SU has a timing class TC1. +bool HexagonInstrInfo::isTC1(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123: + case Hexagon::Sched::ALU64_tc_1_SLOT23: + case Hexagon::Sched::EXTENDER_tc_1_SLOT0123: + //case Hexagon::Sched::M_tc_1_SLOT23: + case Hexagon::Sched::S_2op_tc_1_SLOT23: + case Hexagon::Sched::S_3op_tc_1_SLOT23: + return true; + + default: + return false; + } } -// -// We indicate that we want to reverse the branch by -// inserting the reversed branching opcode. 
-// -bool HexagonInstrInfo::ReverseBranchCondition( - SmallVectorImpl<MachineOperand> &Cond) const { - if (Cond.empty()) +bool HexagonInstrInfo::isTC2(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123: + case Hexagon::Sched::ALU64_tc_2_SLOT23: + case Hexagon::Sched::CR_tc_2_SLOT3: + case Hexagon::Sched::M_tc_2_SLOT23: + case Hexagon::Sched::S_2op_tc_2_SLOT23: + case Hexagon::Sched::S_3op_tc_2_SLOT23: return true; - assert(Cond[0].isImm() && "First entry in the cond vector not imm-val"); - Opcode_t opcode = Cond[0].getImm(); - //unsigned temp; - assert(get(opcode).isBranch() && "Should be a branching condition."); - if (isEndLoopN(opcode)) + + default: + return false; + } +} + + +bool HexagonInstrInfo::isTC2Early(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_2early_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_2early_SLOT0123: + case Hexagon::Sched::ALU64_tc_2early_SLOT23: + case Hexagon::Sched::CR_tc_2early_SLOT23: + case Hexagon::Sched::CR_tc_2early_SLOT3: + case Hexagon::Sched::J_tc_2early_SLOT0123: + case Hexagon::Sched::J_tc_2early_SLOT2: + case Hexagon::Sched::J_tc_2early_SLOT23: + case Hexagon::Sched::S_2op_tc_2early_SLOT23: + case Hexagon::Sched::S_3op_tc_2early_SLOT23: return true; - Opcode_t NewOpcode = getInvertedPredicatedOpcode(opcode); - Cond[0].setImm(NewOpcode); - return false; + + default: + return false; + } +} + + +bool HexagonInstrInfo::isTC4x(const MachineInstr *MI) const { + if (!MI) + return false; + + unsigned SchedClass = MI->getDesc().getSchedClass(); + return SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23; } -bool HexagonInstrInfo:: -isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs, - const BranchProbability &Probability) const { - return (NumInstrs <= 4); +bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr *MI) const { + if (!MI) + return false; + + const uint64_t V = getType(MI); + return HexagonII::TypeCVI_FIRST <= V && V <= HexagonII::TypeCVI_LAST; } -bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::L4_return: - case Hexagon::L4_return_t: - case Hexagon::L4_return_f: - case Hexagon::L4_return_tnew_pnt: - case Hexagon::L4_return_fnew_pnt: - case Hexagon::L4_return_tnew_pt: - case Hexagon::L4_return_fnew_pt: - return true; + +// Check if the Offset is a valid auto-inc imm by Load/Store Type. 
+// +bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const { + if (VT == MVT::v16i32 || VT == MVT::v8i64 || + VT == MVT::v32i16 || VT == MVT::v64i8) { + return (Offset >= Hexagon_MEMV_AUTOINC_MIN && + Offset <= Hexagon_MEMV_AUTOINC_MAX && + (Offset & 0x3f) == 0); + } + // 128B + if (VT == MVT::v32i32 || VT == MVT::v16i64 || + VT == MVT::v64i16 || VT == MVT::v128i8) { + return (Offset >= Hexagon_MEMV_AUTOINC_MIN_128B && + Offset <= Hexagon_MEMV_AUTOINC_MAX_128B && + (Offset & 0x7f) == 0); + } + if (VT == MVT::i64) { + return (Offset >= Hexagon_MEMD_AUTOINC_MIN && + Offset <= Hexagon_MEMD_AUTOINC_MAX && + (Offset & 0x7) == 0); + } + if (VT == MVT::i32) { + return (Offset >= Hexagon_MEMW_AUTOINC_MIN && + Offset <= Hexagon_MEMW_AUTOINC_MAX && + (Offset & 0x3) == 0); + } + if (VT == MVT::i16) { + return (Offset >= Hexagon_MEMH_AUTOINC_MIN && + Offset <= Hexagon_MEMH_AUTOINC_MAX && + (Offset & 0x1) == 0); + } + if (VT == MVT::i8) { + return (Offset >= Hexagon_MEMB_AUTOINC_MIN && + Offset <= Hexagon_MEMB_AUTOINC_MAX); } + llvm_unreachable("Not an auto-inc opc!"); } @@ -1222,6 +2329,40 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, // misaligns with respect to load size. switch (Opcode) { + case Hexagon::STriq_pred_V6: + case Hexagon::STriq_pred_vec_V6: + case Hexagon::STriv_pseudo_V6: + case Hexagon::STrivv_pseudo_V6: + case Hexagon::LDriq_pred_V6: + case Hexagon::LDriq_pred_vec_V6: + case Hexagon::LDriv_pseudo_V6: + case Hexagon::LDrivv_pseudo_V6: + case Hexagon::LDrivv_indexed: + case Hexagon::STrivv_indexed: + case Hexagon::V6_vL32b_ai: + case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vL32Ub_ai: + case Hexagon::V6_vS32Ub_ai: + return (Offset >= Hexagon_MEMV_OFFSET_MIN) && + (Offset <= Hexagon_MEMV_OFFSET_MAX); + + case Hexagon::STriq_pred_V6_128B: + case Hexagon::STriq_pred_vec_V6_128B: + case Hexagon::STriv_pseudo_V6_128B: + case Hexagon::STrivv_pseudo_V6_128B: + case Hexagon::LDriq_pred_V6_128B: + case Hexagon::LDriq_pred_vec_V6_128B: + case Hexagon::LDriv_pseudo_V6_128B: + case Hexagon::LDrivv_pseudo_V6_128B: + case Hexagon::LDrivv_indexed_128B: + case Hexagon::STrivv_indexed_128B: + case Hexagon::V6_vL32b_ai_128B: + case Hexagon::V6_vS32b_ai_128B: + case Hexagon::V6_vL32Ub_ai_128B: + case Hexagon::V6_vS32Ub_ai_128B: + return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) && + (Offset <= Hexagon_MEMV_OFFSET_MAX_128B); + case Hexagon::J2_loop0i: case Hexagon::J2_loop1i: return isUInt<10>(Offset); @@ -1248,8 +2389,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, (Offset <= Hexagon_MEMH_OFFSET_MAX); case Hexagon::L2_loadrb_io: - case Hexagon::S2_storerb_io: case Hexagon::L2_loadrub_io: + case Hexagon::S2_storerb_io: return (Offset >= Hexagon_MEMB_OFFSET_MIN) && (Offset <= Hexagon_MEMB_OFFSET_MAX); @@ -1257,28 +2398,28 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, return (Offset >= Hexagon_ADDI_OFFSET_MIN) && (Offset <= Hexagon_ADDI_OFFSET_MAX); - case Hexagon::L4_iadd_memopw_io: - case Hexagon::L4_isub_memopw_io: - case Hexagon::L4_add_memopw_io: - case Hexagon::L4_sub_memopw_io: - case Hexagon::L4_and_memopw_io: - case Hexagon::L4_or_memopw_io: + case Hexagon::L4_iadd_memopw_io : + case Hexagon::L4_isub_memopw_io : + case Hexagon::L4_add_memopw_io : + case Hexagon::L4_sub_memopw_io : + case Hexagon::L4_and_memopw_io : + case Hexagon::L4_or_memopw_io : return (0 <= Offset && Offset <= 255); - case Hexagon::L4_iadd_memoph_io: - case Hexagon::L4_isub_memoph_io: - case Hexagon::L4_add_memoph_io: - case 
Hexagon::L4_sub_memoph_io: - case Hexagon::L4_and_memoph_io: - case Hexagon::L4_or_memoph_io: + case Hexagon::L4_iadd_memoph_io : + case Hexagon::L4_isub_memoph_io : + case Hexagon::L4_add_memoph_io : + case Hexagon::L4_sub_memoph_io : + case Hexagon::L4_and_memoph_io : + case Hexagon::L4_or_memoph_io : return (0 <= Offset && Offset <= 127); - case Hexagon::L4_iadd_memopb_io: - case Hexagon::L4_isub_memopb_io: - case Hexagon::L4_add_memopb_io: - case Hexagon::L4_sub_memopb_io: - case Hexagon::L4_and_memopb_io: - case Hexagon::L4_or_memopb_io: + case Hexagon::L4_iadd_memopb_io : + case Hexagon::L4_isub_memopb_io : + case Hexagon::L4_add_memopb_io : + case Hexagon::L4_sub_memopb_io : + case Hexagon::L4_and_memopb_io : + case Hexagon::L4_or_memopb_io : return (0 <= Offset && Offset <= 63); // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of @@ -1291,223 +2432,556 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::TFR_FIA: case Hexagon::INLINEASM: return true; - } + + case Hexagon::L2_ploadrbt_io: + case Hexagon::L2_ploadrbf_io: + case Hexagon::L2_ploadrubt_io: + case Hexagon::L2_ploadrubf_io: + case Hexagon::S2_pstorerbt_io: + case Hexagon::S2_pstorerbf_io: + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirbt_io: + case Hexagon::S4_storeirbf_io: + return isUInt<6>(Offset); + + case Hexagon::L2_ploadrht_io: + case Hexagon::L2_ploadrhf_io: + case Hexagon::L2_ploadruht_io: + case Hexagon::L2_ploadruhf_io: + case Hexagon::S2_pstorerht_io: + case Hexagon::S2_pstorerhf_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeirht_io: + case Hexagon::S4_storeirhf_io: + return isShiftedUInt<6,1>(Offset); + + case Hexagon::L2_ploadrit_io: + case Hexagon::L2_ploadrif_io: + case Hexagon::S2_pstorerit_io: + case Hexagon::S2_pstorerif_io: + case Hexagon::S4_storeiri_io: + case Hexagon::S4_storeirit_io: + case Hexagon::S4_storeirif_io: + return isShiftedUInt<6,2>(Offset); + + case Hexagon::L2_ploadrdt_io: + case Hexagon::L2_ploadrdf_io: + case Hexagon::S2_pstorerdt_io: + case Hexagon::S2_pstorerdf_io: + return isShiftedUInt<6,3>(Offset); + } // switch llvm_unreachable("No offset range is defined for this opcode. " "Please define it in the above switch statement!"); } -// -// Check if the Offset is a valid auto-inc imm by Load/Store Type. 
-// -bool HexagonInstrInfo:: -isValidAutoIncImm(const EVT VT, const int Offset) const { +bool HexagonInstrInfo::isVecAcc(const MachineInstr *MI) const { + return MI && isV60VectorInstruction(MI) && isAccumulator(MI); +} - if (VT == MVT::i64) { - return (Offset >= Hexagon_MEMD_AUTOINC_MIN && - Offset <= Hexagon_MEMD_AUTOINC_MAX && - (Offset & 0x7) == 0); - } - if (VT == MVT::i32) { - return (Offset >= Hexagon_MEMW_AUTOINC_MIN && - Offset <= Hexagon_MEMW_AUTOINC_MAX && - (Offset & 0x3) == 0); - } - if (VT == MVT::i16) { - return (Offset >= Hexagon_MEMH_AUTOINC_MIN && - Offset <= Hexagon_MEMH_AUTOINC_MAX && - (Offset & 0x1) == 0); - } - if (VT == MVT::i8) { - return (Offset >= Hexagon_MEMB_AUTOINC_MIN && - Offset <= Hexagon_MEMB_AUTOINC_MAX); + +bool HexagonInstrInfo::isVecALU(const MachineInstr *MI) const { + if (!MI) + return false; + const uint64_t F = get(MI->getOpcode()).TSFlags; + const uint64_t V = ((F >> HexagonII::TypePos) & HexagonII::TypeMask); + return + V == HexagonII::TypeCVI_VA || + V == HexagonII::TypeCVI_VA_DV; +} + + +bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const { + if (EnableACCForwarding && isVecAcc(ProdMI) && isVecAcc(ConsMI)) + return true; + + if (EnableALUForwarding && (isVecALU(ConsMI) || isLateSourceInstr(ConsMI))) + return true; + + if (mayBeNewStore(ConsMI)) + return true; + + return false; +} + + +/// \brief Can these instructions execute at the same time in a bundle. +bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First, + const MachineInstr *Second) const { + if (DisableNVSchedule) + return false; + if (mayBeNewStore(Second)) { + // Make sure the definition of the first instruction is the value being + // stored. + const MachineOperand &Stored = + Second->getOperand(Second->getNumOperands() - 1); + if (!Stored.isReg()) + return false; + for (unsigned i = 0, e = First->getNumOperands(); i < e; ++i) { + const MachineOperand &Op = First->getOperand(i); + if (Op.isReg() && Op.isDef() && Op.getReg() == Stored.getReg()) + return true; + } } - llvm_unreachable("Not an auto-inc opc!"); + return false; +} + + +bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const { + for (auto &I : *B) + if (I.isEHLabel()) + return true; + return false; } -bool HexagonInstrInfo:: -isMemOp(const MachineInstr *MI) const { -// return MI->getDesc().mayLoad() && MI->getDesc().mayStore(); - - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::L4_iadd_memopw_io: - case Hexagon::L4_isub_memopw_io: - case Hexagon::L4_add_memopw_io: - case Hexagon::L4_sub_memopw_io: - case Hexagon::L4_and_memopw_io: - case Hexagon::L4_or_memopw_io: - case Hexagon::L4_iadd_memoph_io: - case Hexagon::L4_isub_memoph_io: - case Hexagon::L4_add_memoph_io: - case Hexagon::L4_sub_memoph_io: - case Hexagon::L4_and_memoph_io: - case Hexagon::L4_or_memoph_io: - case Hexagon::L4_iadd_memopb_io: - case Hexagon::L4_isub_memopb_io: - case Hexagon::L4_add_memopb_io: - case Hexagon::L4_sub_memopb_io: - case Hexagon::L4_and_memopb_io: - case Hexagon::L4_or_memopb_io: - case Hexagon::L4_ior_memopb_io: - case Hexagon::L4_ior_memoph_io: - case Hexagon::L4_ior_memopw_io: - case Hexagon::L4_iand_memopb_io: - case Hexagon::L4_iand_memoph_io: - case Hexagon::L4_iand_memopw_io: +// Returns true if an instruction can be converted into a non-extended +// equivalent instruction. 
+bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr *MI) const { + short NonExtOpcode; + // Check if the instruction has a register form that uses register in place + // of the extended operand, if so return that as the non-extended form. + if (Hexagon::getRegForm(MI->getOpcode()) >= 0) + return true; + + if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) { + // Check addressing mode and retrieve non-ext equivalent instruction. + + switch (getAddrMode(MI)) { + case HexagonII::Absolute : + // Load/store with absolute addressing mode can be converted into + // base+offset mode. + NonExtOpcode = Hexagon::getBaseWithImmOffset(MI->getOpcode()); + break; + case HexagonII::BaseImmOffset : + // Load/store with base+offset addressing mode can be converted into + // base+register offset addressing mode. However left shift operand should + // be set to 0. + NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode()); + break; + case HexagonII::BaseLongOffset: + NonExtOpcode = Hexagon::getRegShlForm(MI->getOpcode()); + break; + default: + return false; + } + if (NonExtOpcode < 0) + return false; return true; } return false; } -bool HexagonInstrInfo:: -isSpillPredRegOp(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::STriw_pred : - case Hexagon::LDriw_pred : +bool HexagonInstrInfo::hasPseudoInstrPair(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), + Hexagon::InstrType_Pseudo) >= 0; +} + + +bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B) + const { + MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end(); + while (I != E) { + if (I->isBarrier()) return true; + ++I; } + return false; } -bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgti: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtui: + +// Returns true, if a LD insn can be promoted to a cur load. +bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr *MI) const { + auto &HST = MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>(); + const uint64_t F = MI->getDesc().TSFlags; + return ((F >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) && + HST.hasV60TOps(); +} + + +// Returns true, if a ST insn can be promoted to a new-value store. +bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask; +} + + +bool HexagonInstrInfo::producesStall(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const { + // There is no stall when ProdMI is not a V60 vector. + if (!isV60VectorInstruction(ProdMI)) + return false; + + // There is no stall when ProdMI and ConsMI are not dependent. + if (!isDependent(ProdMI, ConsMI)) + return false; + + // When Forward Scheduling is enabled, there is no stall if ProdMI and ConsMI + // are scheduled in consecutive packets. + if (isVecUsableNextPacket(ProdMI, ConsMI)) + return false; + + return true; +} + + +bool HexagonInstrInfo::producesStall(const MachineInstr *MI, + MachineBasicBlock::const_instr_iterator BII) const { + // There is no stall when I is not a V60 vector. 
+ if (!isV60VectorInstruction(MI)) + return false; + + MachineBasicBlock::const_instr_iterator MII = BII; + MachineBasicBlock::const_instr_iterator MIE = MII->getParent()->instr_end(); + + if (!(*MII).isBundle()) { + const MachineInstr *J = &*MII; + if (!isV60VectorInstruction(J)) + return false; + else if (isVecUsableNextPacket(J, MI)) + return false; + return true; + } + + for (++MII; MII != MIE && MII->isInsideBundle(); ++MII) { + const MachineInstr *J = &*MII; + if (producesStall(J, MI)) return true; } + return false; +} + + +bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr *MI, + unsigned PredReg) const { + for (unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg)) + return false; // Predicate register must be explicitly defined. + } + + // Hexagon Programmer's Reference says that decbin, memw_locked, and + // memd_locked cannot be used as .new as well, + // but we don't seem to have these instructions defined. + return MI->getOpcode() != Hexagon::A4_tlbmatch; +} + + +bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const { + return (Opcode == Hexagon::J2_jumpt) || + (Opcode == Hexagon::J2_jumpf) || + (Opcode == Hexagon::J2_jumptnew) || + (Opcode == Hexagon::J2_jumpfnew) || + (Opcode == Hexagon::J2_jumptnewpt) || + (Opcode == Hexagon::J2_jumpfnewpt); +} + + +bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const { + if (Cond.empty() || !isPredicated(Cond[0].getImm())) + return false; + return !isPredicatedTrue(Cond[0].getImm()); +} + + +unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask; +} + + +// Returns the base register in a memory access (load/store). The offset is +// returned in Offset and the access size is returned in AccessSize. +unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr *MI, + int &Offset, unsigned &AccessSize) const { + // Return if it is not a base+offset type instruction or a MemOp. + if (getAddrMode(MI) != HexagonII::BaseImmOffset && + getAddrMode(MI) != HexagonII::BaseLongOffset && + !isMemOp(MI) && !isPostIncrement(MI)) + return 0; + + // Since it is a memory access instruction, getMemAccessSize() should never + // return 0. + assert (getMemAccessSize(MI) && + "BaseImmOffset or BaseLongOffset or MemOp without accessSize"); + + // Return Values of getMemAccessSize() are + // 0 - Checked in the assert above. + // 1, 2, 3, 4 & 7, 8 - The statement below is correct for all these. + // MemAccessSize is represented as 1+log2(N) where N is size in bits. + AccessSize = (1U << (getMemAccessSize(MI) - 1)); + + unsigned basePos = 0, offsetPos = 0; + if (!getBaseAndOffsetPosition(MI, basePos, offsetPos)) + return 0; + + // Post increment updates its EA after the mem access, + // so we need to treat its offset as zero. + if (isPostIncrement(MI)) + Offset = 0; + else { + Offset = MI->getOperand(offsetPos).getImm(); + } + + return MI->getOperand(basePos).getReg(); +} + + +/// Return the position of the base and offset operands for this instruction. +bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr *MI, + unsigned &BasePos, unsigned &OffsetPos) const { + // Deal with memops first. 
+ if (isMemOp(MI)) { + assert (MI->getOperand(0).isReg() && MI->getOperand(1).isImm() && + "Bad Memop."); + BasePos = 0; + OffsetPos = 1; + } else if (MI->mayStore()) { + BasePos = 0; + OffsetPos = 1; + } else if (MI->mayLoad()) { + BasePos = 1; + OffsetPos = 2; + } else + return false; + + if (isPredicated(MI)) { + BasePos++; + OffsetPos++; + } + if (isPostIncrement(MI)) { + BasePos++; + OffsetPos++; + } + + if (!MI->getOperand(BasePos).isReg() || !MI->getOperand(OffsetPos).isImm()) + return false; + + return true; +} + + +// Inserts branching instructions in reverse order of their occurence. +// e.g. jump_t t1 (i1) +// jump t2 (i2) +// Jumpers = {i2, i1} +SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs( + MachineBasicBlock& MBB) const { + SmallVector<MachineInstr*, 2> Jumpers; + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::instr_iterator I = MBB.instr_end(); + if (I == MBB.instr_begin()) + return Jumpers; + + // A basic block may looks like this: + // + // [ insn + // EH_LABEL + // insn + // insn + // insn + // EH_LABEL + // insn ] + // + // It has two succs but does not have a terminator + // Don't know how to handle it. + do { + --I; + if (I->isEHLabel()) + return Jumpers; + } while (I != MBB.instr_begin()); + + I = MBB.instr_end(); + --I; + + while (I->isDebugValue()) { + if (I == MBB.instr_begin()) + return Jumpers; + --I; + } + if (!isUnpredicatedTerminator(&*I)) + return Jumpers; + + // Get the last instruction in the block. + MachineInstr *LastInst = &*I; + Jumpers.push_back(LastInst); + MachineInstr *SecondLastInst = nullptr; + // Find one more terminator if present. + do { + if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) { + if (!SecondLastInst) { + SecondLastInst = &*I; + Jumpers.push_back(SecondLastInst); + } else // This is a third branch. + return Jumpers; + } + if (I == MBB.instr_begin()) + break; + --I; + } while (true); + return Jumpers; } -bool HexagonInstrInfo:: -isConditionalTransfer (const MachineInstr *MI) const { + +// Returns Operand Index for the constant extended instruction. +unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask; +} + +// See if instruction could potentially be a duplex candidate. +// If so, return its group. Zero otherwise. +HexagonII::CompoundGroup HexagonInstrInfo::getCompoundCandidateGroup( + const MachineInstr *MI) const { + unsigned DstReg, SrcReg, Src1Reg, Src2Reg; + switch (MI->getOpcode()) { - default: return false; - case Hexagon::A2_tfrt: - case Hexagon::A2_tfrf: - case Hexagon::C2_cmoveit: - case Hexagon::C2_cmoveif: - case Hexagon::A2_tfrtnew: - case Hexagon::A2_tfrfnew: - case Hexagon::C2_cmovenewit: - case Hexagon::C2_cmovenewif: - return true; + default: + return HexagonII::HCG_None; + // + // Compound pairs. 
+ // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2" + // "Rd16=#U6 ; jump #r9:2" + // "Rd16=Rs16 ; jump #r9:2" + // + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgtu: + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg)) + return HexagonII::HCG_A; + break; + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgtui: + // P0 = cmp.eq(Rs,#u2) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() && + ((isUInt<5>(MI->getOperand(2).getImm())) || + (MI->getOperand(2).getImm() == -1))) + return HexagonII::HCG_A; + break; + case Hexagon::A2_tfr: + // Rd = Rs + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HCG_A; + break; + case Hexagon::A2_tfrsi: + // Rd = #u6 + // Do not test for #u6 size since the const is getting extended + // regardless and compound could be formed. + DstReg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(DstReg)) + return HexagonII::HCG_A; + break; + case Hexagon::S2_tstbit_i: + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + MI->getOperand(2).isImm() && + isIntRegForSubInst(Src1Reg) && (MI->getOperand(2).getImm() == 0)) + return HexagonII::HCG_A; + break; + // The fact that .new form is used pretty much guarantees + // that predicate register will match. Nevertheless, + // there could be some false positives without additional + // checking. + case Hexagon::J2_jumptnew: + case Hexagon::J2_jumpfnew: + case Hexagon::J2_jumptnewpt: + case Hexagon::J2_jumpfnewpt: + Src1Reg = MI->getOperand(0).getReg(); + if (Hexagon::PredRegsRegClass.contains(Src1Reg) && + (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg)) + return HexagonII::HCG_B; + break; + // Transfer and jump: + // Rd=#U6 ; jump #r9:2 + // Rd=Rs ; jump #r9:2 + // Do not test for jump range here. + case Hexagon::J2_jump: + case Hexagon::RESTORE_DEALLOC_RET_JMP_V4: + return HexagonII::HCG_C; + break; } + + return HexagonII::HCG_None; +} + + +// Returns -1 when there is no opcode found. 
+unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr *GA, + const MachineInstr *GB) const { + assert(getCompoundCandidateGroup(GA) == HexagonII::HCG_A); + assert(getCompoundCandidateGroup(GB) == HexagonII::HCG_B); + if ((GA->getOpcode() != Hexagon::C2_cmpeqi) || + (GB->getOpcode() != Hexagon::J2_jumptnew)) + return -1; + unsigned DestReg = GA->getOperand(0).getReg(); + if (!GB->readsRegister(DestReg)) + return -1; + if (DestReg == Hexagon::P0) + return Hexagon::J4_cmpeqi_tp0_jump_nt; + if (DestReg == Hexagon::P1) + return Hexagon::J4_cmpeqi_tp1_jump_nt; + return -1; } -bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::A2_paddf: - case Hexagon::A2_paddfnew: - case Hexagon::A2_paddt: - case Hexagon::A2_paddtnew: - case Hexagon::A2_pandf: - case Hexagon::A2_pandfnew: - case Hexagon::A2_pandt: - case Hexagon::A2_pandtnew: - case Hexagon::A4_paslhf: - case Hexagon::A4_paslhfnew: - case Hexagon::A4_paslht: - case Hexagon::A4_paslhtnew: - case Hexagon::A4_pasrhf: - case Hexagon::A4_pasrhfnew: - case Hexagon::A4_pasrht: - case Hexagon::A4_pasrhtnew: - case Hexagon::A2_porf: - case Hexagon::A2_porfnew: - case Hexagon::A2_port: - case Hexagon::A2_portnew: - case Hexagon::A2_psubf: - case Hexagon::A2_psubfnew: - case Hexagon::A2_psubt: - case Hexagon::A2_psubtnew: - case Hexagon::A2_pxorf: - case Hexagon::A2_pxorfnew: - case Hexagon::A2_pxort: - case Hexagon::A2_pxortnew: - case Hexagon::A4_psxthf: - case Hexagon::A4_psxthfnew: - case Hexagon::A4_psxtht: - case Hexagon::A4_psxthtnew: - case Hexagon::A4_psxtbf: - case Hexagon::A4_psxtbfnew: - case Hexagon::A4_psxtbt: - case Hexagon::A4_psxtbtnew: - case Hexagon::A4_pzxtbf: - case Hexagon::A4_pzxtbfnew: - case Hexagon::A4_pzxtbt: - case Hexagon::A4_pzxtbtnew: - case Hexagon::A4_pzxthf: - case Hexagon::A4_pzxthfnew: - case Hexagon::A4_pzxtht: - case Hexagon::A4_pzxthtnew: - case Hexagon::A2_paddit: - case Hexagon::A2_paddif: - case Hexagon::C2_ccombinewt: - case Hexagon::C2_ccombinewf: - return true; + +int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const { + enum Hexagon::PredSense inPredSense; + inPredSense = invertPredicate ? Hexagon::PredSense_false : + Hexagon::PredSense_true; + int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense); + if (CondOpcode >= 0) // Valid Conditional opcode/instruction + return CondOpcode; + + // This switch case will be removed once all the instructions have been + // modified to use relation maps. + switch(Opc) { + case Hexagon::TFRI_f: + return !invertPredicate ? 
Hexagon::TFRI_cPt_f : + Hexagon::TFRI_cNotPt_f; } + + llvm_unreachable("Unexpected predicable instruction"); } -bool HexagonInstrInfo:: -isConditionalLoad (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::L2_ploadrdt_io : - case Hexagon::L2_ploadrdf_io: - case Hexagon::L2_ploadrit_io: - case Hexagon::L2_ploadrif_io: - case Hexagon::L2_ploadrht_io: - case Hexagon::L2_ploadrhf_io: - case Hexagon::L2_ploadrbt_io: - case Hexagon::L2_ploadrbf_io: - case Hexagon::L2_ploadruht_io: - case Hexagon::L2_ploadruhf_io: - case Hexagon::L2_ploadrubt_io: - case Hexagon::L2_ploadrubf_io: - case Hexagon::L2_ploadrdt_pi: - case Hexagon::L2_ploadrdf_pi: - case Hexagon::L2_ploadrit_pi: - case Hexagon::L2_ploadrif_pi: - case Hexagon::L2_ploadrht_pi: - case Hexagon::L2_ploadrhf_pi: - case Hexagon::L2_ploadrbt_pi: - case Hexagon::L2_ploadrbf_pi: - case Hexagon::L2_ploadruht_pi: - case Hexagon::L2_ploadruhf_pi: - case Hexagon::L2_ploadrubt_pi: - case Hexagon::L2_ploadrubf_pi: - case Hexagon::L4_ploadrdt_rr: - case Hexagon::L4_ploadrdf_rr: - case Hexagon::L4_ploadrbt_rr: - case Hexagon::L4_ploadrbf_rr: - case Hexagon::L4_ploadrubt_rr: - case Hexagon::L4_ploadrubf_rr: - case Hexagon::L4_ploadrht_rr: - case Hexagon::L4_ploadrhf_rr: - case Hexagon::L4_ploadruht_rr: - case Hexagon::L4_ploadruhf_rr: - case Hexagon::L4_ploadrit_rr: - case Hexagon::L4_ploadrif_rr: - return true; + +// Return the cur value instruction for a given store. +int HexagonInstrInfo::getDotCurOp(const MachineInstr* MI) const { + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown .cur type"); + case Hexagon::V6_vL32b_pi: + return Hexagon::V6_vL32b_cur_pi; + case Hexagon::V6_vL32b_ai: + return Hexagon::V6_vL32b_cur_ai; + //128B + case Hexagon::V6_vL32b_pi_128B: + return Hexagon::V6_vL32b_cur_pi_128B; + case Hexagon::V6_vL32b_ai_128B: + return Hexagon::V6_vL32b_cur_ai_128B; } + return 0; } -// Returns true if an instruction is a conditional store. -// -// Note: It doesn't include conditional new-value stores as they can't be -// converted to .new predicate. + + +// The diagram below shows the steps involved in the conversion of a predicated +// store instruction to its .new predicated new-value form. // // p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ] // ^ ^ @@ -1524,8 +2998,6 @@ isConditionalLoad (const MachineInstr* MI) const { // p.old store // [if (p0)memw(R0+#0)=R2] // -// The above diagram shows the steps involoved in the conversion of a predicated -// store instruction to its .new predicated new-value form. // // The following set of instructions further explains the scenario where // conditional new-value store becomes invalid when promoted to .new predicate @@ -1538,105 +3010,33 @@ isConditionalLoad (const MachineInstr* MI) const { // the first two instructions because in instr 1, r0 is conditional on old value // of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which // is not valid for new-value stores. 
-bool HexagonInstrInfo:: -isConditionalStore (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::S4_storeirbt_io: - case Hexagon::S4_storeirbf_io: - case Hexagon::S4_pstorerbt_rr: - case Hexagon::S4_pstorerbf_rr: - case Hexagon::S2_pstorerbt_io: - case Hexagon::S2_pstorerbf_io: - case Hexagon::S2_pstorerbt_pi: - case Hexagon::S2_pstorerbf_pi: - case Hexagon::S2_pstorerdt_io: - case Hexagon::S2_pstorerdf_io: - case Hexagon::S4_pstorerdt_rr: - case Hexagon::S4_pstorerdf_rr: - case Hexagon::S2_pstorerdt_pi: - case Hexagon::S2_pstorerdf_pi: - case Hexagon::S2_pstorerht_io: - case Hexagon::S2_pstorerhf_io: - case Hexagon::S4_storeirht_io: - case Hexagon::S4_storeirhf_io: - case Hexagon::S4_pstorerht_rr: - case Hexagon::S4_pstorerhf_rr: - case Hexagon::S2_pstorerht_pi: - case Hexagon::S2_pstorerhf_pi: - case Hexagon::S2_pstorerit_io: - case Hexagon::S2_pstorerif_io: - case Hexagon::S4_storeirit_io: - case Hexagon::S4_storeirif_io: - case Hexagon::S4_pstorerit_rr: - case Hexagon::S4_pstorerif_rr: - case Hexagon::S2_pstorerit_pi: - case Hexagon::S2_pstorerif_pi: - - // V4 global address store before promoting to dot new. - case Hexagon::S4_pstorerdt_abs: - case Hexagon::S4_pstorerdf_abs: - case Hexagon::S4_pstorerbt_abs: - case Hexagon::S4_pstorerbf_abs: - case Hexagon::S4_pstorerht_abs: - case Hexagon::S4_pstorerhf_abs: - case Hexagon::S4_pstorerit_abs: - case Hexagon::S4_pstorerif_abs: - return true; - - // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded - // from the "Conditional Store" list. Because a predicated new value store - // would NOT be promoted to a double dot new store. See diagram below: - // This function returns yes for those stores that are predicated but not - // yet promoted to predicate dot new instructions. - // - // +---------------------+ - // /-----| if (p0) memw(..)=r0 |---------\~ - // || +---------------------+ || - // promote || /\ /\ || promote - // || /||\ /||\ || - // \||/ demote || \||/ - // \/ || || \/ - // +-------------------------+ || +-------------------------+ - // | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new | - // +-------------------------+ || +-------------------------+ - // || || || - // || demote \||/ - // promote || \/ NOT possible - // || || /\~ - // \||/ || /||\~ - // \/ || || - // +-----------------------------+ - // | if (p0.new) memw(..)=r0.new | - // +-----------------------------+ - // Double Dot New Store - // - } -} - - -bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { - if (isNewValue(MI) && isBranch(MI)) - return true; - return false; -} - -bool HexagonInstrInfo::isNewValueJump(Opcode_t Opcode) const { - return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode); -} - -bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const { - return (getAddrMode(MI) == HexagonII::PostInc); -} - -// Returns true, if any one of the operands is a dot new -// insn, whether it is predicated dot new or register dot new. -bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const { - return (isNewValueInst(MI) || - (isPredicated(MI) && isPredicatedNew(MI))); -} - +// Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded +// from the "Conditional Store" list. Because a predicated new value store +// would NOT be promoted to a double dot new store. See diagram below: +// This function returns yes for those stores that are predicated but not +// yet promoted to predicate dot new instructions. 
+// +// +---------------------+ +// /-----| if (p0) memw(..)=r0 |---------\~ +// || +---------------------+ || +// promote || /\ /\ || promote +// || /||\ /||\ || +// \||/ demote || \||/ +// \/ || || \/ +// +-------------------------+ || +-------------------------+ +// | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new | +// +-------------------------+ || +-------------------------+ +// || || || +// || demote \||/ +// promote || \/ NOT possible +// || || /\~ +// \||/ || /||\~ +// \/ || || +// +-----------------------------+ +// | if (p0.new) memw(..)=r0.new | +// +-----------------------------+ +// Double Dot New Store +// // Returns the most basic instruction for the .new predicated instructions and // new-value stores. // For example, all of the following instructions will be converted back to the @@ -1645,24 +3045,23 @@ bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const { // 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1 // 3) if (p0.new) memw(R0+#0) = R1 ---> // +// To understand the translation of instruction 1 to its original form, consider +// a packet with 3 instructions. +// { p0 = cmp.eq(R0,R1) +// if (p0.new) R2 = add(R3, R4) +// R5 = add (R3, R1) +// } +// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet +// +// This instruction can be part of the previous packet only if both p0 and R2 +// are promoted to .new values. This promotion happens in steps, first +// predicate register is promoted to .new and in the next iteration R2 is +// promoted. Therefore, in case of dependence check failure (due to R5) during +// next iteration, it should be converted back to its most basic form. -int HexagonInstrInfo::GetDotOldOp(const int opc) const { - int NewOp = opc; - if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form - NewOp = Hexagon::getPredOldOpcode(NewOp); - assert(NewOp >= 0 && - "Couldn't change predicate new instruction to its old form."); - } - - if (isNewValueStore(NewOp)) { // Convert into non-new-value format - NewOp = Hexagon::getNonNVStore(NewOp); - assert(NewOp >= 0 && "Couldn't change new-value store to its old form."); - } - return NewOp; -} // Return the new value instruction for a given store. -int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { +int HexagonInstrInfo::getDotNewOp(const MachineInstr* MI) const { int NVOpcode = Hexagon::getNewValueOpcode(MI->getOpcode()); if (NVOpcode >= 0) // Valid new-value store instruction. return NVOpcode; @@ -1672,12 +3071,6 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { case Hexagon::S4_storerb_ur: return Hexagon::S4_storerbnew_ur; - case Hexagon::S4_storerh_ur: - return Hexagon::S4_storerhnew_ur; - - case Hexagon::S4_storeri_ur: - return Hexagon::S4_storerinew_ur; - case Hexagon::S2_storerb_pci: return Hexagon::S2_storerb_pci; @@ -1692,203 +3085,496 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { case Hexagon::S2_storerf_pci: return Hexagon::S2_storerf_pci; + + case Hexagon::V6_vS32b_ai: + return Hexagon::V6_vS32b_new_ai; + + case Hexagon::V6_vS32b_pi: + return Hexagon::V6_vS32b_new_pi; + + // 128B + case Hexagon::V6_vS32b_ai_128B: + return Hexagon::V6_vS32b_new_ai_128B; + + case Hexagon::V6_vS32b_pi_128B: + return Hexagon::V6_vS32b_new_pi_128B; } return 0; } -// Return .new predicate version for an instruction. 
-int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI, - const MachineBranchProbabilityInfo - *MBPI) const { +// Returns the opcode to use when converting MI, which is a conditional jump, +// into a conditional instruction which uses the .new value of the predicate. +// We also use branch probabilities to add a hint to the jump. +int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const { + // We assume that block can have at most two successors. + bool taken = false; + const MachineBasicBlock *Src = MI->getParent(); + const MachineOperand *BrTarget = &MI->getOperand(1); + const MachineBasicBlock *Dst = BrTarget->getMBB(); + const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst); + if (Prediction >= BranchProbability(1,2)) + taken = true; + + switch (MI->getOpcode()) { + case Hexagon::J2_jumpt: + return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew; + case Hexagon::J2_jumpf: + return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew; + + default: + llvm_unreachable("Unexpected jump instruction."); + } +} + + +// Return .new predicate version for an instruction. +int HexagonInstrInfo::getDotNewPredOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const { int NewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode()); if (NewOpcode >= 0) // Valid predicate new instruction return NewOpcode; switch (MI->getOpcode()) { - default: llvm_unreachable("Unknown .new type"); // Condtional Jumps case Hexagon::J2_jumpt: case Hexagon::J2_jumpf: return getDotNewPredJumpOp(MI, MBPI); - case Hexagon::J2_jumprt: - return Hexagon::J2_jumptnewpt; - - case Hexagon::J2_jumprf: - return Hexagon::J2_jumprfnewpt; - - case Hexagon::JMPrett: - return Hexagon::J2_jumprtnewpt; - - case Hexagon::JMPretf: - return Hexagon::J2_jumprfnewpt; - - - // Conditional combine - case Hexagon::C2_ccombinewt: - return Hexagon::C2_ccombinewnewt; - case Hexagon::C2_ccombinewf: - return Hexagon::C2_ccombinewnewf; + default: + assert(0 && "Unknown .new type"); } + return 0; } -unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const { - const uint64_t F = MI->getDesc().TSFlags; +int HexagonInstrInfo::getDotOldOp(const int opc) const { + int NewOp = opc; + if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form + NewOp = Hexagon::getPredOldOpcode(NewOp); + assert(NewOp >= 0 && + "Couldn't change predicate new instruction to its old form."); + } - return((F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask); + if (isNewValueStore(NewOp)) { // Convert into non-new-value format + NewOp = Hexagon::getNonNVStore(NewOp); + assert(NewOp >= 0 && "Couldn't change new-value store to its old form."); + } + return NewOp; } -/// immediateExtend - Changes the instruction in place to one using an immediate -/// extender. -void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const { - assert((isExtendable(MI)||isConstExtended(MI)) && - "Instruction must be extendable"); - // Find which operand is extendable. - short ExtOpNum = getCExtOpNum(MI); - MachineOperand &MO = MI->getOperand(ExtOpNum); - // This needs to be something we understand. - assert((MO.isMBB() || MO.isImm()) && - "Branch with unknown extendable field type"); - // Mark given operand as extended. 
- MO.addTargetFlag(HexagonII::HMOTF_ConstExtended); -} -DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState( - const TargetSubtargetInfo &STI) const { - const InstrItineraryData *II = STI.getInstrItineraryData(); - return static_cast<const HexagonSubtarget &>(STI).createDFAPacketizer(II); -} - -bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - // Debug info is never a scheduling boundary. It's necessary to be explicit - // due to the special treatment of IT instructions below, otherwise a - // dbg_value followed by an IT will result in the IT instruction being - // considered a scheduling hazard, which is wrong. It should be the actual - // instruction preceding the dbg_value instruction(s), just like it is - // when debug info is not present. - if (MI->isDebugValue()) - return false; +// See if instruction could potentially be a duplex candidate. +// If so, return its group. Zero otherwise. +HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup( + const MachineInstr *MI) const { + unsigned DstReg, SrcReg, Src1Reg, Src2Reg; + auto &HRI = getRegisterInfo(); - // Terminators and labels can't be scheduled around. - if (MI->getDesc().isTerminator() || MI->isPosition() || MI->isInlineAsm()) - return true; + switch (MI->getOpcode()) { + default: + return HexagonII::HSIG_None; + // + // Group L1: + // + // Rd = memw(Rs+#u4:2) + // Rd = memub(Rs+#u4:0) + case Hexagon::L2_loadri_io: + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + // Special case this one from Group L2. + // Rd = memw(r29+#u5:2) + if (isIntRegForSubInst(DstReg)) { + if (Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && + MI->getOperand(2).isImm() && + isShiftedUInt<5,2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + // Rd = memw(Rs+#u4:2) + if (isIntRegForSubInst(SrcReg) && + (MI->getOperand(2).isImm() && + isShiftedUInt<4,2>(MI->getOperand(2).getImm()))) + return HexagonII::HSIG_L1; + } + break; + case Hexagon::L2_loadrub_io: + // Rd = memub(Rs+#u4:0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && isUInt<4>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L1; + break; + // + // Group L2: + // + // Rd = memh/memuh(Rs+#u3:1) + // Rd = memb(Rs+#u3:0) + // Rd = memw(r29+#u5:2) - Handled above. 
+ // Rdd = memd(r29+#u5:3) + // deallocframe + // [if ([!]p0[.new])] dealloc_return + // [if ([!]p0[.new])] jumpr r31 + case Hexagon::L2_loadrh_io: + case Hexagon::L2_loadruh_io: + // Rd = memh/memuh(Rs+#u3:1) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + isShiftedUInt<3,1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + case Hexagon::L2_loadrb_io: + // Rd = memb(Rs+#u3:0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + isUInt<3>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + case Hexagon::L2_loadrd_io: + // Rdd = memd(r29+#u5:3) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && + Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && + MI->getOperand(2).isImm() && + isShiftedUInt<5,3>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + // dealloc_return is not documented in Hexagon Manual, but marked + // with A_SUBINSN attribute in iset_v4classic.py. + case Hexagon::RESTORE_DEALLOC_RET_JMP_V4: + case Hexagon::L4_return: + case Hexagon::L2_deallocframe: + return HexagonII::HSIG_L2; + case Hexagon::EH_RETURN_JMPR: + case Hexagon::JMPret : + // jumpr r31 + // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>. + DstReg = MI->getOperand(0).getReg(); + if (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg)) + return HexagonII::HSIG_L2; + break; + case Hexagon::JMPrett: + case Hexagon::JMPretf: + case Hexagon::JMPrettnewpt: + case Hexagon::JMPretfnewpt : + case Hexagon::JMPrettnew : + case Hexagon::JMPretfnew : + DstReg = MI->getOperand(1).getReg(); + SrcReg = MI->getOperand(0).getReg(); + // [if ([!]p0[.new])] jumpr r31 + if ((Hexagon::PredRegsRegClass.contains(SrcReg) && + (Hexagon::P0 == SrcReg)) && + (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg))) + return HexagonII::HSIG_L2; + break; + case Hexagon::L4_return_t : + case Hexagon::L4_return_f : + case Hexagon::L4_return_tnew_pnt : + case Hexagon::L4_return_fnew_pnt : + case Hexagon::L4_return_tnew_pt : + case Hexagon::L4_return_fnew_pt : + // [if ([!]p0[.new])] dealloc_return + SrcReg = MI->getOperand(0).getReg(); + if (Hexagon::PredRegsRegClass.contains(SrcReg) && (Hexagon::P0 == SrcReg)) + return HexagonII::HSIG_L2; + break; + // + // Group S1: + // + // memw(Rs+#u4:2) = Rt + // memb(Rs+#u4:0) = Rt + case Hexagon::S2_storeri_io: + // Special case this one from Group S2. 
+ // memw(r29+#u5:2) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (Hexagon::IntRegsRegClass.contains(Src1Reg) && + isIntRegForSubInst(Src2Reg) && + HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() && + isShiftedUInt<5,2>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S2; + // memw(Rs+#u4:2) = Rt + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && + isShiftedUInt<4,2>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + case Hexagon::S2_storerb_io: + // memb(Rs+#u4:0) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && isUInt<4>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + // + // Group S2: + // + // memh(Rs+#u3:1) = Rt + // memw(r29+#u5:2) = Rt + // memd(r29+#s6:3) = Rtt + // memw(Rs+#u4:2) = #U1 + // memb(Rs+#u4) = #U1 + // allocframe(#u5:3) + case Hexagon::S2_storerh_io: + // memh(Rs+#u3:1) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && + isShiftedUInt<3,1>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + case Hexagon::S2_storerd_io: + // memd(r29+#s6:3) = Rtt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isDblRegForSubInst(Src2Reg, HRI) && + Hexagon::IntRegsRegClass.contains(Src1Reg) && + HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() && + isShiftedInt<6,3>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S4_storeiri_io: + // memw(Rs+#u4:2) = #U1 + Src1Reg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() && + isShiftedUInt<4,2>(MI->getOperand(1).getImm()) && + MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S4_storeirb_io: + // memb(Rs+#u4) = #U1 + Src1Reg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() && + isUInt<4>(MI->getOperand(1).getImm()) && MI->getOperand(2).isImm() && + MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S2_allocframe: + if (MI->getOperand(0).isImm() && + isShiftedUInt<5,3>(MI->getOperand(0).getImm())) + return HexagonII::HSIG_S1; + break; + // + // Group A: + // + // Rx = add(Rx,#s7) + // Rd = Rs + // Rd = #u6 + // Rd = #-1 + // if ([!]P0[.new]) Rd = #0 + // Rd = add(r29,#u6:2) + // Rx = add(Rx,Rs) + // P0 = cmp.eq(Rs,#u2) + // Rdd = combine(#0,Rs) + // Rdd = combine(Rs,#0) + // Rdd = combine(#u2,#U2) + // Rd = add(Rs,#1) + // Rd = add(Rs,#-1) + // Rd = sxth/sxtb/zxtb/zxth(Rs) + // Rd = and(Rs,#1) + case Hexagon::A2_addi: + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg)) { + // Rd = add(r29,#u6:2) + if (Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && MI->getOperand(2).isImm() && + isShiftedUInt<6,2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + // Rx = add(Rx,#s7) + if ((DstReg == SrcReg) && MI->getOperand(2).isImm() && + isInt<7>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + // Rd = add(Rs,#1) + // Rd = add(Rs,#-1) + if (isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() && + 
((MI->getOperand(2).getImm() == 1) || + (MI->getOperand(2).getImm() == -1))) + return HexagonII::HSIG_A; + } + break; + case Hexagon::A2_add: + // Rx = add(Rx,Rs) + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) && + isIntRegForSubInst(Src2Reg)) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_andir: + // Same as zxtb. + // Rd16=and(Rs16,#255) + // Rd16=and(Rs16,#1) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + ((MI->getOperand(2).getImm() == 1) || + (MI->getOperand(2).getImm() == 255))) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_tfr: + // Rd = Rs + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_tfrsi: + // Rd = #u6 + // Do not test for #u6 size since the const is getting extended + // regardless and compound could be formed. + // Rd = #-1 + DstReg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(DstReg)) + return HexagonII::HSIG_A; + break; + case Hexagon::C2_cmoveit: + case Hexagon::C2_cmovenewit: + case Hexagon::C2_cmoveif: + case Hexagon::C2_cmovenewif: + // if ([!]P0[.new]) Rd = #0 + // Actual form: + // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>; + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && + Hexagon::PredRegsRegClass.contains(SrcReg) && Hexagon::P0 == SrcReg && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) + return HexagonII::HSIG_A; + break; + case Hexagon::C2_cmpeqi: + // P0 = cmp.eq(Rs,#u2) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + Hexagon::P0 == DstReg && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + // Rdd = combine(#u2,#U2) + DstReg = MI->getOperand(0).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && + ((MI->getOperand(1).isImm() && isUInt<2>(MI->getOperand(1).getImm())) || + (MI->getOperand(1).isGlobal() && + isUInt<2>(MI->getOperand(1).getOffset()))) && + ((MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm())) || + (MI->getOperand(2).isGlobal() && + isUInt<2>(MI->getOperand(2).getOffset())))) + return HexagonII::HSIG_A; + break; + case Hexagon::A4_combineri: + // Rdd = combine(Rs,#0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) && + ((MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) || + (MI->getOperand(2).isGlobal() && MI->getOperand(2).getOffset() == 0))) + return HexagonII::HSIG_A; + break; + case Hexagon::A4_combineir: + // Rdd = combine(#0,Rs) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(2).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) && + ((MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) || + (MI->getOperand(1).isGlobal() && MI->getOperand(1).getOffset() == 0))) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_sxtb: + case Hexagon::A2_sxth: + case Hexagon::A2_zxtb: + case Hexagon::A2_zxth: + // Rd = 
sxth/sxtb/zxtb/zxth(Rs) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HSIG_A; + break; + } - return false; + return HexagonII::HSIG_None; } -bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; - if (isExtended) // Instruction must be extended. - return true; - unsigned isExtendable = - (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; - if (!isExtendable) - return false; - - short ExtOpNum = getCExtOpNum(MI); - const MachineOperand &MO = MI->getOperand(ExtOpNum); - // Use MO operand flags to determine if MO - // has the HMOTF_ConstExtended flag set. - if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended) - return true; - // If this is a Machine BB address we are talking about, and it is - // not marked as extended, say so. - if (MO.isMBB()) - return false; - - // We could be using an instruction with an extendable immediate and shoehorn - // a global address into it. If it is a global address it will be constant - // extended. We do this for COMBINE. - // We currently only handle isGlobal() because it is the only kind of - // object we are going to end up with here for now. - // In the future we probably should add isSymbol(), etc. - if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() || - MO.isJTI() || MO.isCPI()) - return true; - - // If the extendable operand is not 'Immediate' type, the instruction should - // have 'isExtended' flag set. - assert(MO.isImm() && "Extendable operand must be Immediate type"); +short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Real); +} - int MinValue = getMinValue(MI); - int MaxValue = getMaxValue(MI); - int ImmValue = MO.getImm(); - return (ImmValue < MinValue || ImmValue > MaxValue); +// Return first non-debug instruction in the basic block. +MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB) + const { + for (auto MII = BB->instr_begin(), End = BB->instr_end(); MII != End; MII++) { + MachineInstr *MI = &*MII; + if (MI->isDebugValue()) + continue; + return MI; + } + return nullptr; } -// Return the number of bytes required to encode the instruction. -// Hexagon instructions are fixed length, 4 bytes, unless they -// use a constant extender, which requires another 4 bytes. -// For debug instructions and prolog labels, return 0. -unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const { - if (MI->isDebugValue() || MI->isPosition()) - return 0; +unsigned HexagonInstrInfo::getInstrTimingClassLatency( + const InstrItineraryData *ItinData, const MachineInstr *MI) const { + // Default to one cycle for no itinerary. However, an "empty" itinerary may + // still have a MinLatency property, which getStageLatency checks. + if (!ItinData) + return getInstrLatency(ItinData, MI); - unsigned Size = MI->getDesc().getSize(); - if (!Size) - // Assume the default insn size in case it cannot be determined - // for whatever reason. - Size = HEXAGON_INSTR_SIZE; - - if (isConstExtended(MI) || isExtended(MI)) - Size += HEXAGON_INSTR_SIZE; - - return Size; + // Get the latency embedded in the itinerary. If we're not using timing class + // latencies or if we using BSB scheduling, then restrict the maximum latency + // to 1 (that is, either 0 or 1). 
+ if (MI->isTransient()) + return 0; + unsigned Latency = ItinData->getStageLatency(MI->getDesc().getSchedClass()); + if (!EnableTimingClassLatency || + MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>(). + useBSBScheduling()) + if (Latency > 1) + Latency = 1; + return Latency; } -// Returns the opcode to use when converting MI, which is a conditional jump, -// into a conditional instruction which uses the .new value of the predicate. -// We also use branch probabilities to add a hint to the jump. -int -HexagonInstrInfo::getDotNewPredJumpOp(MachineInstr *MI, - const - MachineBranchProbabilityInfo *MBPI) const { - - // We assume that block can have at most two successors. - bool taken = false; - MachineBasicBlock *Src = MI->getParent(); - MachineOperand *BrTarget = &MI->getOperand(1); - MachineBasicBlock *Dst = BrTarget->getMBB(); - const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst); - if (Prediction >= BranchProbability(1,2)) - taken = true; +// inverts the predication logic. +// p -> NotP +// NotP -> P +bool HexagonInstrInfo::getInvertedPredSense( + SmallVectorImpl<MachineOperand> &Cond) const { + if (Cond.empty()) + return false; + unsigned Opc = getInvertedPredicatedOpcode(Cond[0].getImm()); + Cond[0].setImm(Opc); + return true; +} - switch (MI->getOpcode()) { - case Hexagon::J2_jumpt: - return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew; - case Hexagon::J2_jumpf: - return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew; - default: - llvm_unreachable("Unexpected jump instruction."); - } -} -// Returns true if a particular operand is extendable for an instruction. -bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI, - unsigned short OperandNum) const { - const uint64_t F = MI->getDesc().TSFlags; +unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { + int InvPredOpcode; + InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc) + : Hexagon::getTruePredOpcode(Opc); + if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate. + return InvPredOpcode; - return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) - == OperandNum; + llvm_unreachable("Unexpected predicated instruction"); } -// Returns Operand Index for the constant extended instruction. -unsigned short HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask); -} -// Returns the min value that doesn't need to be extended. -int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { +// Returns the max value that doesn't need to be extended. +int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; unsigned isSigned = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; @@ -1896,13 +3582,20 @@ int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { & HexagonII::ExtentBitsMask; if (isSigned) // if value is signed - return -1U << (bits - 1); + return ~(-1U << (bits - 1)); else - return 0; + return ~(-1U << bits); } -// Returns the max value that doesn't need to be extended. -int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { + +unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::MemAccessSizePos) & HexagonII::MemAccesSizeMask; +} + + +// Returns the min value that doesn't need to be extended. 
+int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; unsigned isSigned = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; @@ -1910,49 +3603,14 @@ int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { & HexagonII::ExtentBitsMask; if (isSigned) // if value is signed - return ~(-1U << (bits - 1)); + return -1U << (bits - 1); else - return ~(-1U << bits); + return 0; } -// Returns true if an instruction can be converted into a non-extended -// equivalent instruction. -bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const { - - short NonExtOpcode; - // Check if the instruction has a register form that uses register in place - // of the extended operand, if so return that as the non-extended form. - if (Hexagon::getRegForm(MI->getOpcode()) >= 0) - return true; - - if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) { - // Check addressing mode and retrieve non-ext equivalent instruction. - - switch (getAddrMode(MI)) { - case HexagonII::Absolute : - // Load/store with absolute addressing mode can be converted into - // base+offset mode. - NonExtOpcode = Hexagon::getBasedWithImmOffset(MI->getOpcode()); - break; - case HexagonII::BaseImmOffset : - // Load/store with base+offset addressing mode can be converted into - // base+register offset addressing mode. However left shift operand should - // be set to 0. - NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode()); - break; - default: - return false; - } - if (NonExtOpcode < 0) - return false; - return true; - } - return false; -} // Returns opcode of the non-extended equivalent instruction. -short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { - +short HexagonInstrInfo::getNonExtOpcode(const MachineInstr *MI) const { // Check if the instruction has a register form that uses register in place // of the extended operand, if so return that as the non-extended form. short NonExtOpcode = Hexagon::getRegForm(MI->getOpcode()); @@ -1963,9 +3621,12 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { // Check addressing mode and retrieve non-ext equivalent instruction. 
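// --- Editorial aside (not part of this commit) -------------------------------
// A hedged sketch of how a caller might pair this lookup with the range
// helpers above; a real conversion must also rewrite the operands to match
// the new addressing mode:
//   short NewOpc = TII.getNonExtOpcode(MI);
//   int64_t Imm = MI->getOperand(TII.getCExtOpNum(MI)).getImm();
//   if (NewOpc >= 0 && Imm >= TII.getMinValue(MI) && Imm <= TII.getMaxValue(MI))
//     MI->setDesc(TII.get(NewOpc)); // drop the constant extender
// ------------------------------------------------------------------------------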
switch (getAddrMode(MI)) { case HexagonII::Absolute : - return Hexagon::getBasedWithImmOffset(MI->getOpcode()); + return Hexagon::getBaseWithImmOffset(MI->getOpcode()); case HexagonII::BaseImmOffset : return Hexagon::getBaseWithRegOffset(MI->getOpcode()); + case HexagonII::BaseLongOffset: + return Hexagon::getRegShlForm(MI->getOpcode()); + default: return -1; } @@ -1973,29 +3634,9 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { return -1; } -bool HexagonInstrInfo::PredOpcodeHasJMP_c(Opcode_t Opcode) const { - return (Opcode == Hexagon::J2_jumpt) || - (Opcode == Hexagon::J2_jumpf) || - (Opcode == Hexagon::J2_jumptnewpt) || - (Opcode == Hexagon::J2_jumpfnewpt) || - (Opcode == Hexagon::J2_jumpt) || - (Opcode == Hexagon::J2_jumpf); -} - -bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const { - if (Cond.empty() || !isPredicated(Cond[0].getImm())) - return false; - return !isPredicatedTrue(Cond[0].getImm()); -} - -bool HexagonInstrInfo::isEndLoopN(Opcode_t Opcode) const { - return (Opcode == Hexagon::ENDLOOP0 || - Opcode == Hexagon::ENDLOOP1); -} bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond, - unsigned &PredReg, unsigned &PredRegPos, - unsigned &PredRegFlags) const { + unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const { if (Cond.empty()) return false; assert(Cond.size() == 2); @@ -2014,3 +3655,174 @@ bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond, return true; } + +short HexagonInstrInfo::getPseudoInstrPair(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Pseudo); +} + + +short HexagonInstrInfo::getRegForm(const MachineInstr *MI) const { + return Hexagon::getRegForm(MI->getOpcode()); +} + + +// Return the number of bytes required to encode the instruction. +// Hexagon instructions are fixed length, 4 bytes, unless they +// use a constant extender, which requires another 4 bytes. +// For debug instructions and prolog labels, return 0. +unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const { + if (MI->isDebugValue() || MI->isPosition()) + return 0; + + unsigned Size = MI->getDesc().getSize(); + if (!Size) + // Assume the default insn size in case it cannot be determined + // for whatever reason. + Size = HEXAGON_INSTR_SIZE; + + if (isConstExtended(MI) || isExtended(MI)) + Size += HEXAGON_INSTR_SIZE; + + // Try and compute number of instructions in asm. + if (BranchRelaxAsmLarge && MI->getOpcode() == Hexagon::INLINEASM) { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + // Count the number of register definitions to find the asm string. + unsigned NumDefs = 0; + for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef(); + ++NumDefs) + assert(NumDefs != MI->getNumOperands()-2 && "No asm string?"); + + assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?"); + // Disassemble the AsmStr and approximate number of instructions. 
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName(); + Size = getInlineAsmLength(AsmStr, *MAI); + } + + return Size; +} + + +uint64_t HexagonInstrInfo::getType(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::TypePos) & HexagonII::TypeMask; +} + + +unsigned HexagonInstrInfo::getUnits(const MachineInstr* MI) const { + const TargetSubtargetInfo &ST = MI->getParent()->getParent()->getSubtarget(); + const InstrItineraryData &II = *ST.getInstrItineraryData(); + const InstrStage &IS = *II.beginStage(MI->getDesc().getSchedClass()); + + return IS.getUnits(); +} + + +unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask; +} + + +// Calculate size of the basic block without debug instructions. +unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const { + return nonDbgMICount(BB->instr_begin(), BB->instr_end()); +} + + +unsigned HexagonInstrInfo::nonDbgBundleSize( + MachineBasicBlock::const_iterator BundleHead) const { + assert(BundleHead->isBundle() && "Not a bundle header"); + auto MII = BundleHead.getInstrIterator(); + // Skip the bundle header. + return nonDbgMICount(++MII, getBundleEnd(BundleHead)); +} + + +/// immediateExtend - Changes the instruction in place to one using an immediate +/// extender. +void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const { + assert((isExtendable(MI)||isConstExtended(MI)) && + "Instruction must be extendable"); + // Find which operand is extendable. + short ExtOpNum = getCExtOpNum(MI); + MachineOperand &MO = MI->getOperand(ExtOpNum); + // This needs to be something we understand. + assert((MO.isMBB() || MO.isImm()) && + "Branch with unknown extendable field type"); + // Mark given operand as extended. + MO.addTargetFlag(HexagonII::HMOTF_ConstExtended); +} + + +bool HexagonInstrInfo::invertAndChangeJumpTarget( + MachineInstr* MI, MachineBasicBlock* NewTarget) const { + DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to BB#" + << NewTarget->getNumber(); MI->dump();); + assert(MI->isBranch()); + unsigned NewOpcode = getInvertedPredicatedOpcode(MI->getOpcode()); + int TargetPos = MI->getNumOperands() - 1; + // In general branch target is the last operand, + // but some implicit defs added at the end might change it. + while ((TargetPos > -1) && !MI->getOperand(TargetPos).isMBB()) + --TargetPos; + assert((TargetPos >= 0) && MI->getOperand(TargetPos).isMBB()); + MI->getOperand(TargetPos).setMBB(NewTarget); + if (EnableBranchPrediction && isPredicatedNew(MI)) { + NewOpcode = reversePrediction(NewOpcode); + } + MI->setDesc(get(NewOpcode)); + return true; +} + + +void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const { + /* +++ The code below is used to generate complete set of Hexagon Insn +++ */ + MachineFunction::iterator A = MF.begin(); + MachineBasicBlock &B = *A; + MachineBasicBlock::iterator I = B.begin(); + MachineInstr *MI = &*I; + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *NewMI; + + for (unsigned insn = TargetOpcode::GENERIC_OP_END+1; + insn < Hexagon::INSTRUCTION_LIST_END; ++insn) { + NewMI = BuildMI(B, MI, DL, get(insn)); + DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) << + " Class: " << NewMI->getDesc().getSchedClass()); + NewMI->eraseFromParent(); + } + /* --- The code above is used to generate complete set of Hexagon Insn --- */ +} + + +// inverts the predication logic. 
+// p -> NotP +// NotP -> P +bool HexagonInstrInfo::reversePredSense(MachineInstr* MI) const { + DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI->dump()); + MI->setDesc(get(getInvertedPredicatedOpcode(MI->getOpcode()))); + return true; +} + + +// Reverse the branch prediction. +unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const { + int PredRevOpcode = -1; + if (isPredictedTaken(Opcode)) + PredRevOpcode = Hexagon::notTakenBranchPrediction(Opcode); + else + PredRevOpcode = Hexagon::takenBranchPrediction(Opcode); + assert(PredRevOpcode > 0); + return PredRevOpcode; +} + + +// TODO: Add more rigorous validation. +bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond) + const { + return Cond.empty() || (Cond[0].isImm() && (Cond.size() != 1)); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index d0b8a46..9530d9f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -1,4 +1,3 @@ - //===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===// // // The LLVM Compiler Infrastructure @@ -28,23 +27,18 @@ namespace llvm { struct EVT; class HexagonSubtarget; + class HexagonInstrInfo : public HexagonGenInstrInfo { virtual void anchor(); const HexagonRegisterInfo RI; - const HexagonSubtarget &Subtarget; public: - typedef unsigned Opcode_t; - explicit HexagonInstrInfo(HexagonSubtarget &ST); - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). + /// TargetInstrInfo overrides. /// - const HexagonRegisterInfo &getRegisterInfo() const { return RI; } - /// isLoadFromStackSlot - If the specified machine instruction is a direct + /// If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has @@ -52,7 +46,7 @@ public: unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - /// isStoreToStackSlot - If the specified machine instruction is a direct + /// If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has @@ -60,50 +54,118 @@ public: unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - + /// Analyze the branching code at the end of MBB, returning + /// true if it cannot be understood (e.g. it's a switch dispatch or isn't + /// implemented for a target). Upon success, this returns false and returns + /// with the following information in various cases: + /// + /// 1. If this block ends with no branches (it just falls through to its succ) + /// just return false, leaving TBB/FBB null. + /// 2. If this block ends with only an unconditional branch, it sets TBB to be + /// the destination block. + /// 3. If this block ends with a conditional branch and it falls through to a + /// successor block, it sets TBB to be the branch destination block and a + /// list of operands that evaluate the condition. 
These operands can be + /// passed to other TargetInstrInfo methods to create new branches. + /// 4. If this block ends with a conditional branch followed by an + /// unconditional branch, it returns the 'true' destination in TBB, the + /// 'false' destination in FBB, and a list of operands that evaluate the + /// condition. These operands can be passed to other TargetInstrInfo + /// methods to create new branches. + /// + /// Note that RemoveBranch and InsertBranch must be implemented to support + /// cases where this method returns success. + /// + /// If AllowModify is true, then this routine is allowed to modify the basic + /// block (e.g. delete instructions after the unconditional branch). + /// bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + /// Remove the branching code at the end of the specific MBB. + /// This is only invoked in cases where AnalyzeBranch returns success. It + /// returns the number of instructions that were removed. unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + /// Insert branch code into the end of the specified MachineBasicBlock. + /// The operands to this method are the same as those + /// returned by AnalyzeBranch. This is only invoked in cases where + /// AnalyzeBranch returns success. It returns the number of instructions + /// inserted. + /// + /// It is also invoked by tail merging to add unconditional branches in + /// cases where AnalyzeBranch doesn't apply because there was no original + /// branch to analyze. At least this much must be implemented, else tail + /// merging needs to be disabled. unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; - bool analyzeCompare(const MachineInstr *MI, - unsigned &SrcReg, unsigned &SrcReg2, - int &Mask, int &Value) const override; + /// Return true if it's profitable to predicate + /// instructions with accumulated instruction latency of "NumCycles" + /// of the specified basic block, where the probability of the instructions + /// being executed is given by Probability, and Confidence is a measure + /// of our confidence that it will be properly predicted. + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + unsigned ExtraPredCycles, + BranchProbability Probability) const override; + + /// Second variant of isProfitableToIfCvt. This one + /// checks for the case where two basic blocks from true and false path + /// of an if-then-else (diamond) are predicated on mutually exclusive + /// predicates, where the probability of the true path being taken is given + /// by Probability, and Confidence is a measure of our confidence that it + /// will be properly predicted. + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + BranchProbability Probability) const override; + + /// Return true if it's profitable for if-converter to duplicate instructions + /// of specified accumulated instruction latencies in the specified MBB to + /// enable if-conversion. + /// The probability of the instructions being executed is given by + /// Probability, and Confidence is a measure of our confidence that it + /// will be properly predicted.
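// --- Editorial aside (not part of this commit) -------------------------------
// A hedged sketch of how generic code typically drives the branch hooks
// documented above (illustrative only; both hooks return false on success):
//   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
//   SmallVector<MachineOperand, 4> Cond;
//   if (!TII.AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false) &&
//       FBB && !TII.ReverseBranchCondition(Cond)) {
//     TII.RemoveBranch(MBB);                             // strip old terminators
//     TII.InsertBranch(MBB, FBB, TBB, Cond, DebugLoc()); // swapped targets
//   }
// ------------------------------------------------------------------------------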
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override; + /// Emit instructions to copy a pair of physical registers. + /// + /// This function should support copies within any legal register class as + /// well as any cross-class copies created during instruction selection. + /// + /// The source and destination registers may overlap, which may require a + /// careful implementation when multiple copy instructions are required for + /// large registers. See for example the ARM target. void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + /// Store the specified register of the given register class to the specified + /// stack frame index. The store instruction is to be added to the given + /// machine basic block before the specified machine instruction. If isKill + /// is true, the register operand is the last use and must be marked kill. void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - + /// Load the specified register of the given register class from the specified + /// stack frame index. The load instruction is to be added to the given + /// machine basic block before the specified machine instruction. void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - - /// expandPostRAPseudo - This function is called for all pseudo instructions + /// This function is called for all pseudo instructions /// that remain after register allocation. Many pseudo instructions are /// created to help register allocation. This is the place to convert them /// into real instructions. The target can edit MI in place, or it can insert @@ -111,122 +173,228 @@ public: /// anything was changed. bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; + /// Reverses the branch condition of the specified condition list, + /// returning false on success and true if it cannot be reversed. + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override { - return nullptr; - } + /// Insert a noop into the instruction stream at the specified point. + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; - unsigned createVR(MachineFunction* MF, MVT VT) const; + /// Returns true if the instruction is already predicated. 
+ bool isPredicated(const MachineInstr *MI) const override; - bool isBranch(const MachineInstr *MI) const; - bool isPredicable(MachineInstr *MI) const override; + /// Convert the instruction into a predicated instruction. + /// It returns true if the operation was successful. bool PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Cond) const override; - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; - - bool isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + /// Returns true if the first specified predicate + /// subsumes the second, e.g. GE subsumes GT. + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool isPredicated(unsigned Opcode) const; - bool isPredicatedTrue(const MachineInstr *MI) const; - bool isPredicatedTrue(unsigned Opcode) const; - bool isPredicatedNew(const MachineInstr *MI) const; - bool isPredicatedNew(unsigned Opcode) const; + /// If the specified instruction defines any predicate + /// or condition code register(s) used for predication, returns true as well + /// as the definition predicate(s) by reference. bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; - bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const override; - bool - ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + /// Return true if the specified instruction can be predicated. + /// By default, this returns true for every instruction with a + /// PredicateOperand. + bool isPredicable(MachineInstr *MI) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - const BranchProbability &Probability) const override; + /// Test if the given instruction should be considered a scheduling boundary. + /// This primarily includes labels and terminators. + bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + /// Measure the specified inline asm to determine an approximation of its + /// length. + unsigned getInlineAsmLength(const char *Str, + const MCAsmInfo &MAI) const override; + + /// Allocate and return a hazard recognizer to use for this target when + /// scheduling the machine instructions after register allocation. + ScheduleHazardRecognizer* + CreateTargetPostRAHazardRecognizer(const InstrItineraryData*, + const ScheduleDAG *DAG) const override; + + /// For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. + bool analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const override; + + /// Compute the instruction latency of a given instruction. + /// If the instruction has higher cost when predicated, it's returned via + /// PredCost. + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const override; + + /// Create machine specific model for scheduling. 
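// --- Editorial aside (not part of this commit) -------------------------------
// Hedged usage sketch for the latency hook declared above: callers that care
// about the extra cost of predicated execution pass the optional out-param:
//   unsigned PredCost = 0;
//   unsigned Latency =
//       TII.getInstrLatency(STI.getInstrItineraryData(), MI, &PredCost);
// ------------------------------------------------------------------------------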
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override; - bool isSchedulingBoundary(const MachineInstr *MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const override; - bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const; - bool isValidAutoIncImm(const EVT VT, const int Offset) const; - bool isMemOp(const MachineInstr *MI) const; - bool isSpillPredRegOp(const MachineInstr *MI) const; - bool isU6_3Immediate(const int value) const; - bool isU6_2Immediate(const int value) const; - bool isU6_1Immediate(const int value) const; - bool isU6_0Immediate(const int value) const; - bool isS4_3Immediate(const int value) const; - bool isS4_2Immediate(const int value) const; - bool isS4_1Immediate(const int value) const; - bool isS4_0Immediate(const int value) const; - bool isS12_Immediate(const int value) const; - bool isU6_Immediate(const int value) const; - bool isS8_Immediate(const int value) const; - bool isS6_Immediate(const int value) const; - - bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const; - bool isConditionalTransfer(const MachineInstr* MI) const; + // Sometimes, it is possible for the target + // to tell, even without aliasing information, that two MIs access different + // memory addresses. This function returns true if two MIs access different + // memory addresses and false otherwise. + bool areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) + const override; + + + /// HexagonInstrInfo specifics. + /// + + const HexagonRegisterInfo &getRegisterInfo() const { return RI; } + + unsigned createVR(MachineFunction* MF, MVT VT) const; + + bool isAbsoluteSet(const MachineInstr* MI) const; + bool isAccumulator(const MachineInstr *MI) const; + bool isComplex(const MachineInstr *MI) const; + bool isCompoundBranchInstr(const MachineInstr *MI) const; + bool isCondInst(const MachineInstr *MI) const; bool isConditionalALU32 (const MachineInstr* MI) const; - bool isConditionalLoad (const MachineInstr* MI) const; + bool isConditionalLoad(const MachineInstr* MI) const; bool isConditionalStore(const MachineInstr* MI) const; - bool isNewValueInst(const MachineInstr* MI) const; - bool isNewValue(const MachineInstr* MI) const; - bool isNewValue(Opcode_t Opcode) const; - bool isDotNewInst(const MachineInstr* MI) const; - int GetDotOldOp(const int opc) const; - int GetDotNewOp(const MachineInstr* MI) const; - int GetDotNewPredOp(MachineInstr *MI, - const MachineBranchProbabilityInfo - *MBPI) const; - bool mayBeNewStore(const MachineInstr* MI) const; + bool isConditionalTransfer(const MachineInstr* MI) const; + bool isConstExtended(const MachineInstr *MI) const; bool isDeallocRet(const MachineInstr *MI) const; - unsigned getInvertedPredicatedOpcode(const int Opc) const; + bool isDependent(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + bool isDotCurInst(const MachineInstr* MI) const; + bool isDotNewInst(const MachineInstr* MI) const; + bool isDuplexPair(const MachineInstr *MIa, const MachineInstr *MIb) const; + bool isEarlySourceInstr(const MachineInstr *MI) const; + bool isEndLoopN(unsigned Opcode) const; + bool isExpr(unsigned OpType) const; bool isExtendable(const MachineInstr* MI) const; bool isExtended(const MachineInstr* MI) const; - bool isPostIncrement(const MachineInstr* MI) const; + bool isFloat(const MachineInstr *MI) const; + bool isHVXMemWithAIndirect(const MachineInstr *I, + const MachineInstr *J) const; + bool isIndirectCall(const MachineInstr 
*MI) const; + bool isIndirectL4Return(const MachineInstr *MI) const; + bool isJumpR(const MachineInstr *MI) const; + bool isJumpWithinBranchRange(const MachineInstr *MI, unsigned offset) const; + bool isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI, + const MachineInstr *ESMI) const; + bool isLateResultInstr(const MachineInstr *MI) const; + bool isLateSourceInstr(const MachineInstr *MI) const; + bool isLoopN(const MachineInstr *MI) const; + bool isMemOp(const MachineInstr *MI) const; + bool isNewValue(const MachineInstr* MI) const; + bool isNewValue(unsigned Opcode) const; + bool isNewValueInst(const MachineInstr* MI) const; + bool isNewValueJump(const MachineInstr* MI) const; + bool isNewValueJump(unsigned Opcode) const; bool isNewValueStore(const MachineInstr* MI) const; bool isNewValueStore(unsigned Opcode) const; - bool isNewValueJump(const MachineInstr* MI) const; - bool isNewValueJump(Opcode_t Opcode) const; - bool isNewValueJumpCandidate(const MachineInstr *MI) const; + bool isOperandExtended(const MachineInstr *MI, unsigned OperandNum) const; + bool isPostIncrement(const MachineInstr* MI) const; + bool isPredicatedNew(const MachineInstr *MI) const; + bool isPredicatedNew(unsigned Opcode) const; + bool isPredicatedTrue(const MachineInstr *MI) const; + bool isPredicatedTrue(unsigned Opcode) const; + bool isPredicated(unsigned Opcode) const; + bool isPredicateLate(unsigned Opcode) const; + bool isPredictedTaken(unsigned Opcode) const; + bool isSaveCalleeSavedRegsCall(const MachineInstr *MI) const; + bool isSolo(const MachineInstr* MI) const; + bool isSpillPredRegOp(const MachineInstr *MI) const; + bool isTC1(const MachineInstr *MI) const; + bool isTC2(const MachineInstr *MI) const; + bool isTC2Early(const MachineInstr *MI) const; + bool isTC4x(const MachineInstr *MI) const; + bool isV60VectorInstruction(const MachineInstr *MI) const; + bool isValidAutoIncImm(const EVT VT, const int Offset) const; + bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const; + bool isVecAcc(const MachineInstr *MI) const; + bool isVecALU(const MachineInstr *MI) const; + bool isVecUsableNextPacket(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + + + bool canExecuteInBundle(const MachineInstr *First, + const MachineInstr *Second) const; + bool hasEHLabel(const MachineBasicBlock *B) const; + bool hasNonExtEquivalent(const MachineInstr *MI) const; + bool hasPseudoInstrPair(const MachineInstr *MI) const; + bool hasUncondBranch(const MachineBasicBlock *B) const; + bool mayBeCurLoad(const MachineInstr* MI) const; + bool mayBeNewStore(const MachineInstr* MI) const; + bool producesStall(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + bool producesStall(const MachineInstr *MI, + MachineBasicBlock::const_instr_iterator MII) const; + bool predCanBeUsedAsDotNew(const MachineInstr *MI, unsigned PredReg) const; + bool PredOpcodeHasJMP_c(unsigned Opcode) const; + bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; - void immediateExtend(MachineInstr *MI) const; - bool isConstExtended(const MachineInstr *MI) const; - unsigned getSize(const MachineInstr *MI) const; - int getDotNewPredJumpOp(MachineInstr *MI, - const MachineBranchProbabilityInfo *MBPI) const; unsigned getAddrMode(const MachineInstr* MI) const; - bool isOperandExtended(const MachineInstr *MI, - unsigned short OperandNum) const; - unsigned short getCExtOpNum(const MachineInstr *MI) const; - int getMinValue(const MachineInstr *MI) const; + unsigned getBaseAndOffset(const MachineInstr *MI, int 
&Offset, + unsigned &AccessSize) const; + bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos, + unsigned &OffsetPos) const; + SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const; + unsigned getCExtOpNum(const MachineInstr *MI) const; + HexagonII::CompoundGroup + getCompoundCandidateGroup(const MachineInstr *MI) const; + unsigned getCompoundOpcode(const MachineInstr *GA, + const MachineInstr *GB) const; + int getCondOpcode(int Opc, bool sense) const; + int getDotCurOp(const MachineInstr* MI) const; + int getDotNewOp(const MachineInstr* MI) const; + int getDotNewPredJumpOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const; + int getDotNewPredOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const; + int getDotOldOp(const int opc) const; + HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr *MI) + const; + short getEquivalentHWInstr(const MachineInstr *MI) const; + MachineInstr *getFirstNonDbgInst(MachineBasicBlock *BB) const; + /// getInstrTimingClassLatency - Compute the instruction latency of a given + /// instruction using Timing Class information, if available. + unsigned getInstrTimingClassLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI) const; + bool getInvertedPredSense(SmallVectorImpl<MachineOperand> &Cond) const; + unsigned getInvertedPredicatedOpcode(const int Opc) const; int getMaxValue(const MachineInstr *MI) const; - bool NonExtEquivalentExists (const MachineInstr *MI) const; + unsigned getMemAccessSize(const MachineInstr* MI) const; + int getMinValue(const MachineInstr *MI) const; short getNonExtOpcode(const MachineInstr *MI) const; - bool PredOpcodeHasJMP_c(Opcode_t Opcode) const; - bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; - bool isEndLoopN(Opcode_t Opcode) const; bool getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const; - int getCondOpcode(int Opc, bool sense) const; + short getPseudoInstrPair(const MachineInstr *MI) const; + short getRegForm(const MachineInstr *MI) const; + unsigned getSize(const MachineInstr *MI) const; + uint64_t getType(const MachineInstr* MI) const; + unsigned getUnits(const MachineInstr* MI) const; + unsigned getValidSubTargets(const unsigned Opcode) const; + + unsigned nonDbgBBSize(const MachineBasicBlock *BB) const; + unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const; + + + void immediateExtend(MachineInstr *MI) const; + bool invertAndChangeJumpTarget(MachineInstr* MI, + MachineBasicBlock* NewTarget) const; + void genAllInsnTimingClasses(MachineFunction &MF) const; + bool reversePredSense(MachineInstr* MI) const; + unsigned reversePrediction(unsigned Opcode) const; + bool validateBranchCond(const ArrayRef<MachineOperand> &Cond) const; }; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td index 3b32c10..421403f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td @@ -13,7 +13,7 @@ include "HexagonInstrFormats.td" include "HexagonOperands.td" - +include "HexagonInstrEnc.td" // Pattern fragment that combines the value type and the register class // into a single parameter.
// The pat frags in the definitions below need to have a named register, @@ -1426,9 +1426,6 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>; -def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; -def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; - class CondStr<string CReg, bit True, bit New> { string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") "; } @@ -1606,8 +1603,6 @@ def EH_RETURN_JMPR : T_JMPr; def: Pat<(eh_return), (EH_RETURN_JMPR (i32 R31))>; -def: Pat<(HexagonBR_JT (i32 IntRegs:$dst)), - (J2_jumpr IntRegs:$dst)>; def: Pat<(brind (i32 IntRegs:$dst)), (J2_jumpr IntRegs:$dst)>; @@ -2825,7 +2820,7 @@ let CextOpcode = "ADD_acc" in { let isExtentSigned = 1 in def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8Ext, [(set (i32 IntRegs:$dst), - (add (add (i32 IntRegs:$src2), s16_16ImmPred:$src3), + (add (add (i32 IntRegs:$src2), s32ImmPred:$src3), (i32 IntRegs:$src1)))]>, ImmRegRel; def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0, @@ -2859,7 +2854,7 @@ class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp> def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>; def : T_MType_acc_pat1 <M2_macsin, mul, sub, u32ImmPred>; -def : T_MType_acc_pat1 <M2_naccii, add, sub, s16_16ImmPred>; +def : T_MType_acc_pat1 <M2_naccii, add, sub, s32ImmPred>; def : T_MType_acc_pat2 <M2_nacci, add, sub>; //===----------------------------------------------------------------------===// @@ -3303,7 +3298,8 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2}, !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1}, /* s4_0Imm */ offset{3-0}))); - let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1)); let IClass = 0b1010; @@ -3322,7 +3318,7 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp, //===----------------------------------------------------------------------===// let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew > + bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew> : STInst <(outs IntRegs:$_dst_), (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3), !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", @@ -3341,7 +3337,8 @@ class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1}, /* s4_0Imm */ offset{3-0}))); - let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1)); let isPredicatedNew = isPredNew; let isPredicatedFalse = isPredNot; @@ -3404,7 +3401,6 @@ def: Storepi_pat<post_store, I64, s4_3ImmPred, S2_storerd_pi>; //===----------------------------------------------------------------------===// // Template class for post increment stores with register offset. 
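// --- Editorial aside (not part of this commit) -------------------------------
// The nested !if pattern this patch threads through the store templates
// encodes a single rule: a store may take the new-value (NV) form unless it
// is a double-word store (memd) or an upper-half store (.h). As plain C++,
// the predicate would read (hypothetical helper):
//   bool isNVStorable(StringRef Mnemonic, bool IsHalf) {
//     return Mnemonic != "memd" && !IsHalf;
//   }
// ------------------------------------------------------------------------------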
//===----------------------------------------------------------------------===// -let isNVStorable = 1 in class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst <(outs IntRegs:$_dst_), @@ -3416,6 +3412,9 @@ class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp, bits<5> src3; let accessSize = AccessSz; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} = 0b1101; @@ -3430,12 +3429,11 @@ def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>; def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>; def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>; def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>; - def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>; let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<3>MajOp, bit isH = 0> + bits<3> MajOp, bit isH = 0> : STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>, @@ -3455,6 +3453,8 @@ class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2}, !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1}, /* s11_0Ext */ src2{10-0}))); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); let IClass = 0b1010; let Inst{27} = 0b0; @@ -3494,7 +3494,10 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2}, !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1}, /* u6_0Ext */ src3{5-0}))); - let IClass = 0b0100; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); + + let IClass = 0b0100; let Inst{27} = 0b0; let Inst{26} = PredNot; @@ -3508,7 +3511,7 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp, let Inst{1-0} = src1; } -let isExtendable = 1, isNVStorable = 1, hasSideEffects = 0 in +let isExtendable = 1, hasSideEffects = 0 in multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC, Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> { let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in { @@ -3665,7 +3668,7 @@ def S2_allocframe: ST0Inst < // S2_storer[bhwdf]_pci: Store byte/half/word/double. 
// S2_storer[bhwdf]_pci -> S2_storerbnew_pci -let Uses = [CS], isNVStorable = 1 in +let Uses = [CS] in class T_store_pci <string mnemonic, RegisterClass RC, Operand Imm, bits<4>MajOp, MemAccessSize AlignSize, string RegSrc = "Rt"> @@ -3679,6 +3682,8 @@ class T_store_pci <string mnemonic, RegisterClass RC, bits<1> Mu; bits<5> Rt; let accessSize = AlignSize; + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, + !if(!eq(RegSrc,"Rt.h"), 0, 1)); let IClass = 0b1010; let Inst{27-25} = 0b100; @@ -3696,15 +3701,15 @@ class T_store_pci <string mnemonic, RegisterClass RC, } def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000, - ByteAccess>; + ByteAccess>; def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010, - HalfWordAccess>; + HalfWordAccess>; def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011, - HalfWordAccess, "Rt.h">; + HalfWordAccess, "Rt.h">; def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100, - WordAccess>; + WordAccess>; def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110, - DoubleWordAccess>; + DoubleWordAccess>; let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4 in class T_storenew_pci <string mnemonic, Operand Imm, @@ -3762,7 +3767,7 @@ def S2_storerd_pci_pseudo : T_store_pci_pseudo <"memd", DoubleRegs>; //===----------------------------------------------------------------------===// // Circular stores with auto-increment register //===----------------------------------------------------------------------===// -let Uses = [CS], isNVStorable = 1 in +let Uses = [CS] in class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp, MemAccessSize AlignSize, string RegSrc = "Rt"> : STInst <(outs IntRegs:$_dst_), @@ -3775,6 +3780,8 @@ class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp, bits<5> Rt; let accessSize = AlignSize; + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, + !if(!eq(RegSrc,"Rt.h"), 0, 1)); let IClass = 0b1010; let Inst{27-25} = 0b100; @@ -5784,7 +5791,21 @@ include "HexagonInstrInfoV5.td" //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// V60 Instructions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrInfoV60.td" + +//===----------------------------------------------------------------------===// +// V60 Instructions - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// // ALU32/64/Vector + //===----------------------------------------------------------------------===/// include "HexagonInstrInfoVector.td" + +include "HexagonInstrAlias.td" +include "HexagonSystemInst.td" + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td index 65b0f49..37c2042 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -684,7 +684,7 @@ def: Pat<(i64 (zext (i32 IntRegs:$src1))), // Template class for store instructions with Absolute set addressing mode. 
//===----------------------------------------------------------------------===// let isExtended = 1, opExtendable = 1, opExtentBits = 6, - addrMode = AbsoluteSet, isNVStorable = 1 in + addrMode = AbsoluteSet in class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst<(outs IntRegs:$dst), @@ -696,6 +696,9 @@ class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC, let accessSize = AccessSz; let BaseOpcode = BaseOp#"_AbsSet"; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} = 0b1011; @@ -750,7 +753,7 @@ let mayStore = 1, addrMode = AbsoluteSet in { } let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm", -addrMode = BaseLongOffset, AddedComplexity = 40 in + addrMode = BaseLongOffset, AddedComplexity = 40 in class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst<(outs), @@ -766,6 +769,10 @@ class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC, let accessSize = AccessSz; let CextOpcode = CextOp; let BaseOpcode = CextOp#"_shl"; + + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} =0b1101; @@ -856,6 +863,9 @@ class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH> bits<2> u2; bits<5> Rt; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); + let IClass = 0b0011; let Inst{27-24} = 0b1011; @@ -888,6 +898,8 @@ class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, let isPredicatedFalse = isNot; let isPredicatedNew = isPredNew; + // Store upper-half and store doubleword cannot be NV. 
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); let IClass = 0b0011; @@ -1826,43 +1838,22 @@ def: LogLogNot_pat<or, or, C4_or_orn>; // below are needed to support code generation for PIC //===----------------------------------------------------------------------===// -def SDT_HexagonPICAdd +def SDT_HexagonAtGot + : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; +def SDT_HexagonAtPcrel : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDT_HexagonGOTAdd - : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -def SDT_HexagonGOTAddInternal : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; -def SDT_HexagonGOTAddInternalJT : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; -def SDT_HexagonGOTAddInternalBA : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; - -def Hexagonpic_add : SDNode<"HexagonISD::PIC_ADD", SDT_HexagonPICAdd>; -def Hexagonat_got : SDNode<"HexagonISD::AT_GOT", SDT_HexagonGOTAdd>; -def Hexagongat_pcrel : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternal>; -def Hexagongat_pcrel_jt : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternalJT>; -def Hexagongat_pcrel_ba : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternalBA>; - -// PIC: Map from a block address computation to a PC-relative add -def: Pat<(Hexagongat_pcrel_ba tblockaddress:$src1), - (C4_addipc u32ImmPred:$src1)>; - -// PIC: Map from the computation to generate a GOT pointer to a PC-relative add -def: Pat<(Hexagonpic_add texternalsym:$src1), - (C4_addipc u32ImmPred:$src1)>; - -// PIC: Map from a jump table address computation to a PC-relative add -def: Pat<(Hexagongat_pcrel_jt tjumptable:$src1), - (C4_addipc u32ImmPred:$src1)>; -// PIC: Map from a GOT-relative symbol reference to a load -def: Pat<(Hexagonat_got (i32 IntRegs:$src1), tglobaladdr:$src2), - (L2_loadri_io IntRegs:$src1, s30_2ImmPred:$src2)>; +// AT_GOT address-of-GOT, address-of-global, offset-in-global +def HexagonAtGot : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>; +// AT_PCREL address-of-global +def HexagonAtPcrel : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>; -// PIC: Map from a static symbol reference to a PC-relative add -def: Pat<(Hexagongat_pcrel tglobaladdr:$src1), - (C4_addipc u32ImmPred:$src1)>; +def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)), + (L2_loadri_io I32:$got, imm:$addr)>; +def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off), + (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>; +def: Pat<(HexagonAtPcrel I32:$addr), + (C4_addipc imm:$addr)>; //===----------------------------------------------------------------------===// // CR - @@ -1903,7 +1894,7 @@ def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Ru, s6Ext:$s6), "$Rd = add($Rs, add($Ru, #$s6))" , [(set (i32 IntRegs:$Rd), (add (i32 IntRegs:$Rs), - (add (i32 IntRegs:$Ru), s16_16ImmPred:$s6)))], + (add (i32 IntRegs:$Ru), s32ImmPred:$s6)))], "", ALU64_tc_2_SLOT23> { bits<5> Rd; bits<5> Rs; @@ -3290,27 +3281,33 @@ defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel; let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in { def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">; + let isExtended = 1, opExtendable = 0 in + def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">; } // Restore registers and dealloc frame before a tail call. 
let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in { def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel; + let isExtended = 1, opExtendable = 0 in + def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel; } // Save registers function call. let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in { def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel; + let isExtended = 1, opExtendable = 0 in + def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel; } //===----------------------------------------------------------------------===// // Template class for non predicated store instructions with // GP-Relative or absolute addressing. //===----------------------------------------------------------------------===// -let hasSideEffects = 0, isPredicable = 1, isNVStorable = 1 in +let hasSideEffects = 0, isPredicable = 1 in class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<2>MajOp, Operand AddrOp, bit isAbs, bit isHalf> - : STInst<(outs), (ins AddrOp:$addr, RC:$src), - mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src"#!if(isHalf, ".h",""), + bits<2>MajOp, bit isAbs, bit isHalf> + : STInst<(outs), (ins ImmOp:$addr, RC:$src), + mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""), [], "", V2LDST_tc_st_SLOT01> { bits<19> addr; bits<5> src; @@ -3321,6 +3318,10 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2}, !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1}, /* u16_0Imm */ addr{15-0}))); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let Uses = !if (isAbs, [], [GP]); + let IClass = 0b0100; let Inst{27} = 1; let Inst{26-25} = offsetBits{15-14}; @@ -3337,11 +3338,10 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, // Template class for predicated store instructions with // GP-Relative or absolute addressing. //===----------------------------------------------------------------------===// -let hasSideEffects = 0, isPredicated = 1, isNVStorable = 1, opExtentBits = 6, - opExtendable = 1 in +let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, bit isHalf, bit isNot, bit isNew> - : STInst<(outs), (ins PredRegs:$src1, u6Ext:$absaddr, RC: $src2), + : STInst<(outs), (ins PredRegs:$src1, u32MustExt:$absaddr, RC: $src2), !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ", ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""), [], "", ST_tc_st_SLOT01>, AddrModeRel { @@ -3351,6 +3351,8 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, let isPredicatedNew = isNew; let isPredicatedFalse = isNot; + // Store upper-half and store doubleword cannot be NV. 
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); let IClass = 0b1010; @@ -3371,7 +3373,7 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, //===----------------------------------------------------------------------===// class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<2> MajOp, bit isHalf> - : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1, isHalf>, + : T_StoreAbsGP <mnemonic, RC, u32MustExt, MajOp, 1, isHalf>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3424,6 +3426,7 @@ class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs> !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2}, !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1}, /* u16_0Imm */ addr{15-0}))); + let Uses = !if (isAbs, [], [GP]); let IClass = 0b0100; let Inst{27} = 1; @@ -3538,7 +3541,7 @@ defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>; let isAsmParserOnly = 1 in class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<2> MajOp, bit isHalf = 0> - : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0, isHalf> { + : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> { // Set BaseOpcode same as absolute addressing instructions so that // non-predicated GP-Rel instructions can have relate with predicated // Absolute instruction. @@ -3553,7 +3556,7 @@ multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp, // Absolute instruction. let BaseOpcode = BaseOp#_abs in { def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp, - globaladdress, 0, isHalf>; + 0, isHalf>; // New-value store def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 0> ; } @@ -3615,9 +3618,9 @@ let AddedComplexity = 100 in { //===----------------------------------------------------------------------===// let isPredicable = 1, hasSideEffects = 0 in class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<3> MajOp, Operand AddrOp, bit isAbs> - : LDInst <(outs RC:$dst), (ins AddrOp:$addr), - "$dst = "#mnemonic# !if(isAbs, "(##", "(#")#"$addr)", + bits<3> MajOp> + : LDInst <(outs RC:$dst), (ins ImmOp:$addr), + "$dst = "#mnemonic# "(#$addr)", [], "", V2LDST_tc_ld_SLOT01> { bits<5> dst; bits<19> addr; @@ -3642,7 +3645,7 @@ class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<3> MajOp> - : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1>, AddrModeRel { + : T_LoadAbsGP <mnemonic, RC, u32MustExt, MajOp>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3660,10 +3663,11 @@ class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp, // Template class for predicated load instructions with // absolute addressing mode. 
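// --- Editorial aside (not part of this commit) -------------------------------
// The isAbs bit threaded through the surrounding templates selects between
// two forms that share an encoding class: absolute addressing (the effective
// address is the constant-extended ##imm itself) and GP-relative addressing
// (the effective address is GP + #imm, which is why this patch adds a GP use
// to the GP-relative variants). As arithmetic: EA = isAbs ? imm : GP + imm.
// ------------------------------------------------------------------------------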
//===----------------------------------------------------------------------===// -let isPredicated = 1, opExtentBits = 6, opExtendable = 2 in +let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6, + opExtendable = 2 in class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isPredNot, bit isPredNew> - : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u6Ext:$absaddr), + : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32MustExt:$absaddr), !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel { bits<5> dst; @@ -3734,10 +3738,10 @@ defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>; // if ([!]Pv[.new]) Rx=mem[bhwd](##global) //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in +let isAsmParserOnly = 1, Uses = [GP] in class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<3> MajOp> - : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0>, PredNewRel { + : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel { let BaseOpcode = BaseOp#_abs; } @@ -3841,26 +3845,6 @@ let AddedComplexity = 100 in { def: Stoream_pat<truncstorei32, I64, addrga, LoReg, S2_storeriabs>; } -// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd -let AddedComplexity = 100 in -def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), - (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>; - -// Transfer global address into a register -let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1, -isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in -def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1), - "$dst = #$src1", - [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>; - -// Transfer a block address into a register -def : Pat<(HexagonCONST32_GP tblockaddress:$src1), - (TFRI_V4 tblockaddress:$src1)>; - -let AddedComplexity = 50 in -def : Pat<(HexagonCONST32_GP tglobaladdr:$src1), - (TFRI_V4 tglobaladdr:$src1)>; - // i8/i16/i32 -> i64 loads // We need a complexity of 120 here to override preceding handling of // zextload. diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td index 337f4ea..823961f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td @@ -98,21 +98,21 @@ def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1), // HexagonInstrInfo.td patterns. 
let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1, isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT, - isCodeGenOnly = 1 in + isCodeGenOnly = 1, isPseudo = 1 in def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1), "$dst = #$src1", [(set F32:$dst, fpimm:$src1)]>, Requires<[HasV5T]>; -let isExtended = 1, opExtendable = 2, isPredicated = 1, - hasSideEffects = 0, validSubTargets = HasV5SubT, isCodeGenOnly = 1 in +let isExtended = 1, opExtendable = 2, isPredicated = 1, hasSideEffects = 0, + validSubTargets = HasV5SubT, isCodeGenOnly = 1, isPseudo = 1 in def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, f32Ext:$src2), "if ($src1) $dst = #$src2", []>, Requires<[HasV5T]>; -let isPseudo = 1, isExtended = 1, opExtendable = 2, isPredicated = 1, - isPredicatedFalse = 1, hasSideEffects = 0, validSubTargets = HasV5SubT in +let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1, + hasSideEffects = 0, validSubTargets = HasV5SubT, isPseudo = 1 in def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, f32Ext:$src2), "if (!$src1) $dst = #$src2", []>, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td new file mode 100644 index 0000000..897ada0 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td @@ -0,0 +1,2241 @@ +//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V60 instructions in TableGen format. 
+// +//===----------------------------------------------------------------------===// + + +// Vector store +let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in +{ + class VSTInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_ST, + IType type = TypeCVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>, OpcodeHexagon; + +} + +// Vector load +let Predicates = [HasV60T, UseHVX] in +let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in + class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_LD, + IType type = TypeCVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +let Predicates = [HasV60T, UseHVX] in +let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in +class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_ST, + IType type = TypeCVI_VM_ST> +: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +//===----------------------------------------------------------------------===// +// Vector loads with base + immediate offset +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, accessSize = Vector64Access in +class T_vload_ai<string asmStr> + : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2), + asmStr>; + +let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in +class T_vload_ai_128B<string asmStr> + : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2), + asmStr>; + +let isCVLoadable = 1, hasNewValue = 1 in { + def V6_vL32b_ai : T_vload_ai <"$dst = vmem($src1+#$src2)">, + V6_vL32b_ai_enc; + def V6_vL32b_nt_ai : T_vload_ai <"$dst = vmem($src1+#$src2):nt">, + V6_vL32b_nt_ai_enc; + // 128B + def V6_vL32b_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">, + V6_vL32b_ai_128B_enc; + def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">, + V6_vL32b_nt_ai_128B_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in { + def V6_vL32Ub_ai : T_vload_ai <"$dst = vmemu($src1+#$src2)">, + V6_vL32Ub_ai_enc; + def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">, + V6_vL32Ub_ai_128B_enc; +} + +let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1, + hasNewValue = 1 in { + def V6_vL32b_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">, + V6_vL32b_cur_ai_enc; + def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">, + V6_vL32b_nt_cur_ai_enc; + // 128B + def V6_vL32b_cur_ai_128B : T_vload_ai_128B + <"$dst.cur = vmem($src1+#$src2)">, + V6_vL32b_cur_ai_128B_enc; + def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B + <"$dst.cur = vmem($src1+#$src2):nt">, + V6_vL32b_nt_cur_ai_128B_enc; +} + + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in { + def V6_vL32b_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_tmp_ai_enc; + def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">, + V6_vL32b_nt_tmp_ai_enc; + // 128B + def V6_vL32b_tmp_ai_128B : T_vload_ai_128B + <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_tmp_ai_128B_enc; + def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B + <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_nt_tmp_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with 
base + immediate offset - unconditional +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, accessSize = Vector64Access in +class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isNT> + : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_ai : T_vstore_ai_64B <"vmem", "vS32b_ai">, + V6_vS32b_ai_enc; + def V6_vS32b_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai">, + V6_vS32b_ai_128B_enc; +} + +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_ai : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>, + V6_vS32b_nt_ai_enc; + def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>, + V6_vS32b_nt_ai_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vs32Ub_ai">, + V6_vS32Ub_ai_enc; + def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vs32Ub_ai">, + V6_vS32Ub_ai_128B_enc; +} +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - unconditional new +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1, + Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in +class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT> + : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_ai_64B <string baseOp, bit isNT = 0> + : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_ai_128B <string baseOp, bit isNT = 0> + : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>; + +def V6_vS32b_new_ai : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc; +def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">, + V6_vS32b_new_ai_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_ai : T_vstore_new_ai_64B<"vS32b_ai", 1>, + V6_vS32b_nt_new_ai_enc; + def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>, + V6_vS32b_nt_new_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - conditional +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isPredicated = 1 in +class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) " + #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access 
in +class T_vstore_pred_ai_64B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pred_ai_128B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, + isPredNot, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">, + V6_vS32b_pred_ai_enc; + def V6_vS32b_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>, + V6_vS32b_npred_ai_enc; + // 128B + def V6_vS32b_pred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">, + V6_vS32b_pred_ai_128B_enc; + def V6_vS32b_npred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>, + V6_vS32b_npred_ai_128B_enc; +} +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>, + V6_vS32b_nt_pred_ai_enc; + def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>, + V6_vS32b_nt_npred_ai_enc; + // 128B + def V6_vS32b_nt_pred_ai_128B : T_vstore_pred_ai_128B + <"vmem", "vS32b_ai", 0, 1>, + V6_vS32b_nt_pred_ai_128B_enc; + def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B + <"vmem", "vS32b_ai", 1, 1>, + V6_vS32b_nt_npred_ai_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">, + V6_vS32Ub_pred_ai_enc; + def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>, + V6_vS32Ub_npred_ai_enc; + // 128B + def V6_vS32Ub_pred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">, + V6_vS32Ub_pred_ai_128B_enc; + def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>, + V6_vS32Ub_npred_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset in +class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC, + bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs), + (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)" + #!if(isNT, ":nt", "")#" = $src4"> { + let isPredicatedFalse = isPredNot; +} + +let accessSize = Vector64Access in +class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>; + +def V6_vS32b_qpred_ai : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc; +def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>, + V6_vS32b_nqpred_ai_enc; +def V6_vS32b_nt_qpred_ai : T_vstore_qpred_ai_64B <0, 1>, + V6_vS32b_nt_qpred_ai_enc; +def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>, + V6_vS32b_nt_nqpred_ai_enc; +// 128B +def V6_vS32b_qpred_ai_128B : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc; +def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>, + V6_vS32b_nqpred_ai_128B_enc; +def V6_vS32b_nt_qpred_ai_128B : T_vstore_qpred_ai_128B<0, 1>, + V6_vS32b_nt_qpred_ai_128B_enc; +def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>, + V6_vS32b_nt_nqpred_ai_128B_enc; + + 
+//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - conditional new +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3, + isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in +class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC, + bit isPredNot, bit isNT> + : V6_STInst <(outs), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)" + #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, + isPredNot, isNT>; + + +def V6_vS32b_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai">, + V6_vS32b_new_pred_ai_enc; +def V6_vS32b_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>, + V6_vS32b_new_npred_ai_enc; +// 128B +def V6_vS32b_new_pred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai">, + V6_vS32b_new_pred_ai_128B_enc; +def V6_vS32b_new_npred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>, + V6_vS32b_new_npred_ai_128B_enc; +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>, + V6_vS32b_nt_new_pred_ai_enc; + def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>, + V6_vS32b_nt_new_npred_ai_enc; + // 128B + def V6_vS32b_nt_new_pred_ai_128B : T_vstore_new_pred_ai_128B + <"vS32b_ai", 0, 1>, + V6_vS32b_nt_new_pred_ai_128B_enc; + def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B + <"vS32b_ai", 1, 1>, + V6_vS32b_nt_new_npred_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector loads with immediate offset. 
+//===----------------------------------------------------------------------===// +let addrMode = PostInc, hasNewValue = 1 in +class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC> + : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2), asmStr, [], + "$src1 = $_dst_">; + +let accessSize = Vector64Access in +class T_vload_pi_64B <string asmStr> + : T_vload_pi <asmStr, s3_6Imm, VectorRegs>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vload_pi_128B <string asmStr> + : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>; + +let isCVLoadable = 1 in { + def V6_vL32b_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">, + V6_vL32b_pi_enc; + def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">, + V6_vL32b_nt_pi_enc; + // 128B + def V6_vL32b_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">, + V6_vL32b_pi_128B_enc; + def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">, + V6_vL32b_nt_pi_128B_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in { + def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">, + V6_vL32Ub_pi_enc; + // 128B + def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">, + V6_vL32Ub_pi_128B_enc; +} + +let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in { + def V6_vL32b_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">, + V6_vL32b_cur_pi_enc; + def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">, + V6_vL32b_nt_cur_pi_enc; + // 128B + def V6_vL32b_cur_pi_128B : T_vload_pi_128B + <"$dst.cur = vmem($src1++#$src2)">, + V6_vL32b_cur_pi_128B_enc; + def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B + <"$dst.cur = vmem($src1++#$src2):nt">, + V6_vL32b_nt_cur_pi_128B_enc; +} + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { + def V6_vL32b_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">, + V6_vL32b_tmp_pi_enc; + def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">, + V6_vL32b_nt_tmp_pi_enc; + //128B + def V6_vL32b_tmp_pi_128B : T_vload_pi_128B + <"$dst.tmp = vmem($src1++#$src2)">, + V6_vL32b_tmp_pi_128B_enc; + def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B + <"$dst.tmp = vmem($src1++#$src2):nt">, + V6_vL32b_nt_tmp_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with immediate offset. 
+//===----------------------------------------------------------------------===// +let addrMode = PostInc in +class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [], + "$src1 = $_dst_">, NewValueRel; + +let accessSize = Vector64Access in +class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_pi <mnemonic, baseOp, s3_7Imm, VectorRegs128B, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc; + def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">, + V6_vS32b_pi_128B_enc; +} + +let isNVStorable = 1 , isNonTemporal = 1 in { + def V6_vS32b_nt_pi : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>, + V6_vS32b_nt_pi_enc; + def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>, + V6_vS32b_nt_pi_128B_enc; +} + + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pi : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pi_enc; + def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment unconditional .new vector stores with immediate offset. +//===----------------------------------------------------------------------===// +let addrMode = PostInc, isNVStore = 1 in +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1, + opNewValue = 3, isNVStore = 1 in +class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [], + "$src1 = $_dst_">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pi_64B <string baseOp, bit isNT = 0> + : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pi_128B <string baseOp, bit isNT = 0> + : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>; + + +def V6_vS32b_new_pi : T_vstore_new_pi_64B <"vS32b_pi">, + V6_vS32b_new_pi_enc; +def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">, + V6_vS32b_new_pi_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pi : T_vstore_new_pi_64B <"vS32b_pi", 1>, + V6_vS32b_nt_new_pi_enc; + def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>, + V6_vS32b_nt_new_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment conditional vector stores with immediate offset +//===----------------------------------------------------------------------===// +let isPredicated = 1, addrMode = PostInc in +class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isPredNot, bit isNT> + : V6_STInst<(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + 
+let accessSize = Vector64Access in +class T_vstore_pred_pi_64B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pred_pi_128B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, + isPredNot, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">, + V6_vS32b_pred_pi_enc; + def V6_vS32b_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>, + V6_vS32b_npred_pi_enc; + // 128B + def V6_vS32b_pred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">, + V6_vS32b_pred_pi_128B_enc; + def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>, + V6_vS32b_npred_pi_128B_enc; +} +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>, + V6_vS32b_nt_pred_pi_enc; + def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>, + V6_vS32b_nt_npred_pi_enc; + // 128B + def V6_vS32b_nt_pred_pi_128B : T_vstore_pred_pi_128B + <"vmem", "vS32b_pi", 0, 1>, + V6_vS32b_nt_pred_pi_128B_enc; + def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B + <"vmem", "vS32b_pi", 1, 1>, + V6_vS32b_nt_npred_pi_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pred_pi_enc; + def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>, + V6_vS32Ub_npred_pi_enc; + // 128B + def V6_vS32Ub_pred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pred_pi_128B_enc; + def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>, + V6_vS32Ub_npred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with immediate offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +let addrMode = PostInc in +class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0, + bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">; + +let accessSize = Vector64Access in +class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>; + +def V6_vS32b_qpred_pi : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc; +def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc; +// 128B +def V6_vS32b_qpred_pi_128B : T_vstore_qpred_pi_128B, + V6_vS32b_qpred_pi_128B_enc; +def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>, + V6_vS32b_nqpred_pi_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_qpred_pi : T_vstore_qpred_pi_64B <0, 1>, + V6_vS32b_nt_qpred_pi_enc; + def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>, + V6_vS32b_nt_nqpred_pi_enc; + // 128B + def V6_vS32b_nt_qpred_pi_128B : T_vstore_qpred_pi_128B<0, 1>, + V6_vS32b_nt_qpred_pi_128B_enc; + def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>, + 
V6_vS32b_nt_nqpred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with immediate offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1, + isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in +class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC, + bit isPredNot, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4.new", [], + "$src2 = $_dst_"> , NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, + isPredNot, isNT>; + +def V6_vS32b_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi">, + V6_vS32b_new_pred_pi_enc; +def V6_vS32b_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>, + V6_vS32b_new_npred_pi_enc; +// 128B +def V6_vS32b_new_pred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi">, + V6_vS32b_new_pred_pi_128B_enc; +def V6_vS32b_new_npred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>, + V6_vS32b_new_npred_pi_128B_enc; +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>, + V6_vS32b_nt_new_pred_pi_enc; + def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>, + V6_vS32b_nt_new_npred_pi_enc; + // 128B + def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B + <"vS32b_pi", 0, 1>, + V6_vS32b_nt_new_pred_pi_128B_enc; + def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B + <"vS32b_pi", 1, 1>, + V6_vS32b_nt_new_npred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector loads with register offset +//===----------------------------------------------------------------------===// +let hasNewValue = 1 in +class T_vload_ppu<string asmStr> + : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2), asmStr, [], + "$src1 = $_dst_">, NewValueRel; + +let isCVLoadable = 1 in { + def V6_vL32b_ppu : T_vload_ppu <"$dst = vmem($src1++$src2)">, + V6_vL32b_ppu_enc; + def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">, + V6_vL32b_nt_ppu_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in +def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">, + V6_vL32Ub_ppu_enc; + +let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in { + def V6_vL32b_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">, + V6_vL32b_cur_ppu_enc; + def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">, + V6_vL32b_nt_cur_ppu_enc; +} + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { + def V6_vL32b_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">, + V6_vL32b_tmp_ppu_enc; + def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">, + V6_vL32b_nt_tmp_ppu_enc; +} + 
+//===----------------------------------------------------------------------===// +// Post increment vector stores with register offset +//===----------------------------------------------------------------------===// +class T_vstore_ppu <string mnemonic, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3), + mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [], + "$src1 = $_dst_">, NewValueRel; + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_ppu : T_vstore_ppu <"vmem">, + V6_vS32b_ppu_enc; + let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in + def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>, + V6_vS32b_nt_ppu_enc; +} + +let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in +def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment .new vector stores with register offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1, + opNewValue = 3, isNVStore = 1 in +class T_vstore_new_ppu <bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3), + "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [], + "$src1 = $_dst_">, NewValueRel; + +let BaseOpcode = "vS32b_ppu" in +def V6_vS32b_new_ppu : T_vstore_new_ppu, V6_vS32b_new_ppu_enc; + +let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in +def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with register offset +//===----------------------------------------------------------------------===// +let isPredicated = 1 in +class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0> + : V6_STInst<(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; +} + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc; + def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc; +} + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { + def V6_vS32b_nt_pred_ppu : T_vstore_pred_ppu <"vmem", 0, 1>, + V6_vS32b_nt_pred_ppu_enc; + def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>, + V6_vS32b_nt_npred_ppu_enc; +} + +let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, + Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_ppu : T_vstore_pred_ppu <"vmemu">, + V6_vS32Ub_pred_ppu_enc; + def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>, + V6_vS32Ub_npred_ppu_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with register offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, 
NewValueRel; + +def V6_vS32b_qpred_ppu : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc; +def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc; +def V6_vS32b_nt_qpred_ppu : T_vstore_qpred_ppu<0, 1>, + V6_vS32b_nt_qpred_ppu_enc; +def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>, + V6_vS32b_nt_nqpred_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with register offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1, + isNewValue = 1, opNewValue = 4, isNVStore = 1 in +class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4.new", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; +} + +let BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_new_pred_ppu : T_vstore_new_pred_ppu, + V6_vS32b_new_pred_ppu_enc; + def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>, + V6_vS32b_new_npred_ppu_enc; +} + +let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { +def V6_vS32b_nt_new_pred_ppu : T_vstore_new_pred_ppu<0, 1>, + V6_vS32b_nt_new_pred_ppu_enc; +def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>, + V6_vS32b_nt_new_npred_ppu_enc; +} + +let isPseudo = 1, validSubTargets = HasV60SubT in +class STrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC>: + VSTInst<(outs), (ins IntRegs:$addr, ImmOp:$off, RC:$src), + #mnemonic#"($addr+#$off) = $src", []>; + +def STrivv_indexed: STrivv_template<"vvmem", s4_6Imm, VecDblRegs>, + Requires<[HasV60T, UseHVXSgl]>; +def STrivv_indexed_128B: STrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>, + Requires<[HasV60T, UseHVXDbl]>; + +multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat<(store (VTSgl VecDblRegs:$src1), IntRegs:$addr), + (STrivv_indexed IntRegs:$addr, #0, (VTSgl VecDblRegs:$src1))>, + Requires<[UseHVXSgl]>; + + def : Pat<(store (VTDbl VecDblRegs128B:$src1), IntRegs:$addr), + (STrivv_indexed_128B IntRegs:$addr, #0, + (VTDbl VecDblRegs128B:$src1))>, + Requires<[UseHVXDbl]>; +} + +defm : STrivv_pats <v128i8, v256i8>; +defm : STrivv_pats <v64i16, v128i16>; +defm : STrivv_pats <v32i32, v64i32>; +defm : STrivv_pats <v16i64, v32i64>; + + +multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { + // Aligned stores + def : Pat<(store (VTSgl VectorRegs:$src1), IntRegs:$addr), + (V6_vS32b_ai IntRegs:$addr, #0, (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; + + // 128B Aligned stores + def : Pat<(store (VTDbl VectorRegs128B:$src1), IntRegs:$addr), + (V6_vS32b_ai_128B IntRegs:$addr, #0, (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; + + // Fold Add R+IFF into vector store. + let AddedComplexity = 10 in + def : Pat<(store (VTSgl VectorRegs:$src1), + (add IntRegs:$src2, s4_6ImmPred:$offset)), + (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset, + (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; + + // Fold Add R+IFF into vector store 128B. 
+ let AddedComplexity = 10 in + def : Pat<(store (VTDbl VectorRegs128B:$src1), + (add IntRegs:$src2, s4_7ImmPred:$offset)), + (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset, + (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; +} + +defm : vS32b_ai_pats <v64i8, v128i8>; +defm : vS32b_ai_pats <v32i16, v64i16>; +defm : vS32b_ai_pats <v16i32, v32i32>; +defm : vS32b_ai_pats <v8i64, v16i64>; + +let isPseudo = 1, validSubTargets = HasV60SubT in +class LDrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC> + : V6_LDInst <(outs RC:$dst), (ins IntRegs:$addr, ImmOp:$off), + "$dst="#mnemonic#"($addr+#$off)", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def LDrivv_indexed: LDrivv_template<"vvmem", s4_6Imm, VecDblRegs>; +def LDrivv_indexed_128B: LDrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>; + +multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat < (VTSgl (load IntRegs:$addr)), + (LDrivv_indexed IntRegs:$addr, #0) >, + Requires<[UseHVXSgl]>; + + def : Pat < (VTDbl (load IntRegs:$addr)), + (LDrivv_indexed_128B IntRegs:$addr, #0) >, + Requires<[UseHVXDbl]>; +} + +defm : LDrivv_pats <v128i8, v256i8>; +defm : LDrivv_pats <v64i16, v128i16>; +defm : LDrivv_pats <v32i32, v64i32>; +defm : LDrivv_pats <v16i64, v32i64>; + +multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { + // Aligned loads + def : Pat < (VTSgl (load IntRegs:$addr)), + (V6_vL32b_ai IntRegs:$addr, #0) >, + Requires<[UseHVXSgl]>; + + // 128B Load + def : Pat < (VTDbl (load IntRegs:$addr)), + (V6_vL32b_ai_128B IntRegs:$addr, #0) >, + Requires<[UseHVXDbl]>; + + // Fold Add R+IFF into vector load. + let AddedComplexity = 10 in + def : Pat<(VTDbl (load (add IntRegs:$src2, s4_7ImmPred:$offset))), + (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>, + Requires<[UseHVXDbl]>; + + let AddedComplexity = 10 in + def : Pat<(VTSgl (load (add IntRegs:$src2, s4_6ImmPred:$offset))), + (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>, + Requires<[UseHVXSgl]>; +} + +defm : vL32b_ai_pats <v64i8, v128i8>; +defm : vL32b_ai_pats <v32i16, v64i16>; +defm : vL32b_ai_pats <v16i32, v32i32>; +defm : vL32b_ai_pats <v8i64, v16i64>; + +// Store vector predicate pseudo. +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STriq_pred_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecPredRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def STriq_pred_vec_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def STriq_pred_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecPredRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; + +def STriq_pred_vec_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Load vector predicate pseudo. 
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDriq_pred_V6 : LDInst<(outs VecPredRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriq_pred_vec_V6 : LDInst<(outs VectorRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriq_pred_V6_128B : LDInst<(outs VecPredRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +def LDriq_pred_vec_V6_128B : LDInst<(outs VectorRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Store vector pseudo. +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STriv_pseudo_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def STriv_pseudo_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STrivv_pseudo_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecDblRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def STrivv_pseudo_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecDblRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Load vector pseudo. 
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDriv_pseudo_V6 : LDInst<(outs VectorRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriv_pseudo_V6_128B : LDInst<(outs VectorRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDrivv_pseudo_V6 : LDInst<(outs VecDblRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDrivv_pseudo_V6_128B : LDInst<(outs VecDblRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VA_DV, + IType type = TypeCVI_VA_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def VSelectPseudo_V6 : VSELInst<(outs VectorRegs:$dst), + (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def VSelectDblPseudo_V6 : VSELInst<(outs VecDblRegs:$dst), + (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +} + +def : Pat <(v16i32 (selectcc (i32 IntRegs:$lhs), (i32 IntRegs:$rhs), + (v16i32 VectorRegs:$tval), + (v16i32 VectorRegs:$fval), SETEQ)), + (v16i32 (VSelectPseudo_V6 (i32 (C2_cmpeq (i32 IntRegs:$lhs), + (i32 IntRegs:$rhs))), + (v16i32 VectorRegs:$tval), + (v16i32 VectorRegs:$fval)))>; + + +let hasNewValue = 1 in +class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2), + asmString >; + +multiclass T_vmpy <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_vmpy <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_vmpy_VV <string asmString>: + T_vmpy <asmString, VectorRegs, VectorRegs>; + +multiclass T_vmpy_WW <string asmString>: + T_vmpy <asmString, VecDblRegs, VecDblRegs>; + +multiclass T_vmpy_VW <string asmString>: + T_vmpy <asmString, VectorRegs, VecDblRegs>; + +multiclass T_vmpy_WV <string asmString>: + T_vmpy <asmString, VecDblRegs, VectorRegs>; + +defm V6_vtmpyb :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc; +defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc; +defm V6_vdsaduh :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc; +defm V6_vmpybus :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc; +defm V6_vmpabus :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc; +defm V6_vmpahb :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc; +defm V6_vmpyh :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc; +defm V6_vmpyuh :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc; +defm V6_vmpyiwh :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc; +defm V6_vtmpyhb :T_vmpy_WW<"$dst.w = 
vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc; +defm V6_vmpyub :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc; + +let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in +defm V6_vmpyihb :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc; + +defm V6_vdmpybus_dv : + T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc; +defm V6_vdmpyhsusat : + T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc; +defm V6_vdmpyhsuisat : + T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc; +defm V6_vdmpyhsat : + T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc; +defm V6_vdmpyhisat : + T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc; +defm V6_vdmpyhb_dv : + T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc; +defm V6_vmpyhss : + T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc; +defm V6_vmpyhsrs : + T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in +defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc; + +let Itinerary = CVI_VX, Type = TypeCVI_VX in { +defm V6_vdmpyhb : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc; +defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc; +defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc; +defm V6_vmpyiwb : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc; +defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vasrw : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc; +defm V6_vasrh : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc; +defm V6_vaslw : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc; +defm V6_vaslh : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc; +defm V6_vlsrw : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc; +defm V6_vlsrh : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc; +} + +let hasNewValue = 1 in +class T_HVX_alu <string asmString, InstrItinClass itin, + RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2), + asmString >{ + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_alu <string asmString, RegisterClass RCout, + RegisterClass RCin, InstrItinClass itin> { + def NAME : T_HVX_alu <asmString, itin, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_alu <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_alu_VV <string asmString>: + T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>; + +multiclass T_HVX_alu_WW <string asmString>: + T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>; + +multiclass T_HVX_alu_WV <string asmString>: + T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>; + + +let Itinerary = CVI_VX, Type = TypeCVI_VX in { +defm V6_vrmpyubv : + T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc; +defm V6_vrmpybv : + T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc; +defm V6_vrmpybusv : + T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc; +defm V6_vabsdiffub : + T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc; +defm V6_vabsdiffh : + T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc; +defm V6_vabsdiffuh 
: + T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc; +defm V6_vabsdiffw : + T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc; +} + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vdmpyhvsat : + T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc; +defm V6_vmpyhvsrs : + T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc; +defm V6_vmpyih : + T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc; +} + +defm V6_vand : + T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc; +defm V6_vor : + T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc; +defm V6_vxor : + T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc; +defm V6_vaddw : + T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc; +defm V6_vaddubsat : + T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc; +defm V6_vadduhsat : + T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc; +defm V6_vaddhsat : + T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc; +defm V6_vaddwsat : + T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc; +defm V6_vsubb : + T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc; +defm V6_vsubh : + T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc; +defm V6_vsubw : + T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc; +defm V6_vsububsat : + T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc; +defm V6_vsubuhsat : + T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc; +defm V6_vsubhsat : + T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc; +defm V6_vsubwsat : + T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc; +defm V6_vavgub : + T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc; +defm V6_vavguh : + T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc; +defm V6_vavgh : + T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc; +defm V6_vavgw : + T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc; +defm V6_vnavgub : + T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc; +defm V6_vnavgh : + T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc; +defm V6_vnavgw : + T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc; +defm V6_vavgubrnd : + T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc; +defm V6_vavguhrnd : + T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc; +defm V6_vavghrnd : + T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc; +defm V6_vavgwrnd : + T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc; + +defm V6_vmpybv : + T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc; +defm V6_vmpyubv : + T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc; +defm V6_vmpybusv : + T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc; +defm V6_vmpyhv : + T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc; +defm V6_vmpyuhv : + T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc; +defm V6_vmpyhus : + T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc; +defm V6_vaddubh : + T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc; +defm V6_vadduhw : + T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc; +defm V6_vaddhw : + T_HVX_alu_WV <"$dst.w = 
vadd($src1.h,$src2.h)">, V6_vaddhw_enc; +defm V6_vsububh : + T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc; +defm V6_vsubuhw : + T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc; +defm V6_vsubhw : + T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc; + +defm V6_vaddb_dv : + T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc; +defm V6_vaddh_dv : + T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc; +defm V6_vaddw_dv : + T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc; +defm V6_vaddubsat_dv : + T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc; +defm V6_vadduhsat_dv : + T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc; +defm V6_vaddhsat_dv : + T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc; +defm V6_vaddwsat_dv : + T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc; +defm V6_vsubb_dv : + T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc; +defm V6_vsubh_dv : + T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc; +defm V6_vsubw_dv : + T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc; +defm V6_vsububsat_dv : + T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc; +defm V6_vsubuhsat_dv : + T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc; +defm V6_vsubhsat_dv : + T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc; +defm V6_vsubwsat_dv : + T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc; + +let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in { +defm V6_vmpabusv : + T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc; +defm V6_vmpabuuv : + T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc; +} + +let isAccumulator = 1, hasNewValue = 1 in +class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout, + RegisterClass RCin1, RegisterClass RCin2> + : CVI_VA_Resource1 <(outs RCout:$dst), + (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout, + RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > { + def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vmpyacc <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin1#"128B"), + !cast<RegisterClass>(RCin2# + !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>; +} + +multiclass T_HVX_vmpyacc_VVR <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>; + +multiclass T_HVX_vmpyacc_VWR <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WVR <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WWR <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_VVV <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WVV <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>; + + +defm V6_vtmpyb_acc : + T_HVX_vmpyacc_WWR <"$dst.h += 
vtmpy($src1.b,$src2.b)">, + V6_vtmpyb_acc_enc; +defm V6_vtmpybus_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">, + V6_vtmpybus_acc_enc; +defm V6_vtmpyhb_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">, + V6_vtmpyhb_acc_enc; +defm V6_vdmpyhb_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">, + V6_vdmpyhb_acc_enc; +defm V6_vrmpyub_acc : + T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">, + V6_vrmpyub_acc_enc; +defm V6_vrmpybus_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">, + V6_vrmpybus_acc_enc; +defm V6_vdmpybus_acc : + T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">, + V6_vdmpybus_acc_enc; +defm V6_vdmpybus_dv_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">, + V6_vdmpybus_dv_acc_enc; +defm V6_vdmpyhsuisat_acc : + T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">, + V6_vdmpyhsuisat_acc_enc; +defm V6_vdmpyhisat_acc : + T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhisat_acc_enc; +defm V6_vdmpyhb_dv_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">, + V6_vdmpyhb_dv_acc_enc; +defm V6_vmpybus_acc : + T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">, + V6_vmpybus_acc_enc; +defm V6_vmpabus_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">, + V6_vmpabus_acc_enc; +defm V6_vmpahb_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">, + V6_vmpahb_acc_enc; +defm V6_vmpyhsat_acc : + T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">, + V6_vmpyhsat_acc_enc; +defm V6_vmpyuh_acc : + T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">, + V6_vmpyuh_acc_enc; +defm V6_vmpyiwb_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">, + V6_vmpyiwb_acc_enc; +defm V6_vdsaduh_acc : + T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">, + V6_vdsaduh_acc_enc; +defm V6_vmpyihb_acc : + T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">, + V6_vmpyihb_acc_enc; +defm V6_vmpyub_acc : + T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">, + V6_vmpyub_acc_enc; + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vdmpyhsusat_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">, + V6_vdmpyhsusat_acc_enc; +defm V6_vdmpyhsat_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhsat_acc_enc; +defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR + <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vaslw_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc; +defm V6_vasrw_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc; +} + +defm V6_vdmpyhvsat_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhvsat_acc_enc; +defm V6_vmpybusv_acc : + T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">, + V6_vmpybusv_acc_enc; +defm V6_vmpybv_acc : + T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc; +defm V6_vmpyhus_acc : + T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc; +defm V6_vmpyhv_acc : + T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc; +defm V6_vmpyiewh_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">, + V6_vmpyiewh_acc_enc; +defm V6_vmpyiewuh_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">, + V6_vmpyiewuh_acc_enc; +defm V6_vmpyih_acc : + T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc; +defm V6_vmpyowh_rnd_sacc : + 
T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">, + V6_vmpyowh_rnd_sacc_enc; +defm V6_vmpyowh_sacc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">, + V6_vmpyowh_sacc_enc; +defm V6_vmpyubv_acc : + T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">, + V6_vmpyubv_acc_enc; +defm V6_vmpyuhv_acc : + T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">, + V6_vmpyuhv_acc_enc; +defm V6_vrmpybusv_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">, + V6_vrmpybusv_acc_enc; +defm V6_vrmpybv_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc; +defm V6_vrmpyubv_acc : + T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">, + V6_vrmpyubv_acc_enc; + + +class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = CVI_VA; + let Type = TypeCVI_VA; +} + +multiclass T_HVX_vcmp <string asmString> { + def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_veqb_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc; +defm V6_veqh_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc; +defm V6_veqw_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc; +defm V6_vgtb_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc; +defm V6_vgth_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc; +defm V6_vgtw_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc; +defm V6_vgtub_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc; +defm V6_vgtuh_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc; +defm V6_vgtuw_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc; +defm V6_veqb_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc; +defm V6_veqh_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc; +defm V6_veqw_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc; +defm V6_vgtb_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc; +defm V6_vgth_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc; +defm V6_vgtw_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc; +defm V6_vgtub_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc; +defm V6_vgtuh_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc; +defm V6_vgtuw_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc; +defm V6_veqb_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc; +defm V6_veqh_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc; +defm V6_veqw_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc; +defm V6_vgtb_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc; +defm V6_vgth_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc; +defm V6_vgtw_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc; +defm V6_vgtub_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc; +defm V6_vgtuh_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc; +defm V6_vgtuw_xor : + 
T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc; + +defm V6_vminub : + T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc; +defm V6_vminuh : + T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc; +defm V6_vminh : + T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc; +defm V6_vminw : + T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc; +defm V6_vmaxub : + T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc; +defm V6_vmaxuh : + T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc; +defm V6_vmaxh : + T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc; +defm V6_vmaxw : + T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc; +defm V6_vshuffeb : + T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc; +defm V6_vshuffob : + T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc; +defm V6_vshufeh : + T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc; +defm V6_vshufoh : + T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc; + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vmpyowh_rnd : + T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">, + V6_vmpyowh_rnd_enc; +defm V6_vmpyiewuh : + T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc; +defm V6_vmpyewuh : + T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc; +defm V6_vmpyowh : + T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc; +defm V6_vmpyiowh : + T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc; +} +let Itinerary = CVI_VX, Type = TypeCVI_VX in +defm V6_vmpyieoh : + T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc; + +let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in { +defm V6_vshufoeh : + T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc; +defm V6_vshufoeb : + T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc; +} + +let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in +defm V6_vcombine : + T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc; + +def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, + SDTCisSubVecOfVec<1, 0>]>; + +def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>; + +def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 VectorRegs:$Vs), + (v16i32 VectorRegs:$Vt))), + (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>, + Requires<[UseHVXSgl]>; +def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), + (v32i32 VecDblRegs:$Vt))), + (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, + Requires<[UseHVXDbl]>; + +let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in { +defm V6_vsathub : + T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc; +defm V6_vsatwh : + T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vroundwh : + T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc; +defm V6_vroundwuh : + T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc; +defm V6_vroundhb : + T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc; +defm V6_vroundhub : + T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc; +defm V6_vasrwv : + T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc; +defm V6_vlsrwv : + T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc; +defm V6_vlsrhv : + 
T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc; +defm V6_vasrhv : + T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc; +defm V6_vaslwv : + T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc; +defm V6_vaslhv : + T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc; +} + +defm V6_vaddb : + T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc; +defm V6_vaddh : + T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in { +defm V6_vdelta : + T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc; +defm V6_vrdelta : + T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc; +defm V6_vdealb4w : + T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc; +defm V6_vpackeb : + T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc; +defm V6_vpackeh : + T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc; +defm V6_vpackhub_sat : + T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc; +defm V6_vpackhb_sat : + T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc; +defm V6_vpackwuh_sat : + T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc; +defm V6_vpackwh_sat : + T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc; +defm V6_vpackob : + T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc; +defm V6_vpackoh : + T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc; +} + +let hasNewValue = 1, hasSideEffects = 0 in +class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2> + : CVI_VA_Resource1 <(outs RC2:$dst), + (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = CVI_VA; + let Type = TypeCVI_VA; +} + +multiclass T_HVX_condALU <string asmString> { + def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_vaddbq : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">, + V6_vaddbq_enc; +defm V6_vaddhq : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">, + V6_vaddhq_enc; +defm V6_vaddwq : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">, + V6_vaddwq_enc; +defm V6_vsubbq : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">, + V6_vsubbq_enc; +defm V6_vsubhq : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">, + V6_vsubhq_enc; +defm V6_vsubwq : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">, + V6_vsubwq_enc; +defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">, + V6_vaddbnq_enc; +defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">, + V6_vaddhnq_enc; +defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">, + V6_vaddwnq_enc; +defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">, + V6_vsubbnq_enc; +defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">, + V6_vsubhnq_enc; +defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">, + V6_vsubwnq_enc; + +let hasNewValue = 1 in +class T_HVX_alu_2op <string asmString, InstrItinClass itin, + RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1), + asmString >{ + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout, + RegisterClass RCin, InstrItinClass itin> { + def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>; + let isCodeGenOnly = 1 in + 
def NAME#_128B : T_HVX_alu_2op <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +let hasNewValue = 1 in +multiclass T_HVX_alu_2op_VV <string asmString>: + T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>; + +multiclass T_HVX_alu_2op_WV <string asmString>: + T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>; + + +defm V6_vabsh : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">, + V6_vabsh_enc; +defm V6_vabsw : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">, + V6_vabsw_enc; +defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">, + V6_vabsh_sat_enc; +defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">, + V6_vabsw_sat_enc; +defm V6_vnot : T_HVX_alu_2op_VV <"$dst = vnot($src1)">, + V6_vnot_enc; +defm V6_vassign : T_HVX_alu_2op_VV <"$dst = $src1">, + V6_vassign_enc; + +defm V6_vzb : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">, + V6_vzb_enc; +defm V6_vzh : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">, + V6_vzh_enc; +defm V6_vsb : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">, + V6_vsb_enc; +defm V6_vsh : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">, + V6_vsh_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in { +defm V6_vdealh : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">, + V6_vdealh_enc; +defm V6_vdealb : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">, + V6_vdealb_enc; +defm V6_vshuffh : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">, + V6_vshuffh_enc; +defm V6_vshuffb : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">, + V6_vshuffb_enc; +} + +let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in { +defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">, + V6_vunpackub_enc; +defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">, + V6_vunpackuh_enc; +defm V6_vunpackb : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">, + V6_vunpackb_enc; +defm V6_vunpackh : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">, + V6_vunpackh_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vcl0w : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">, + V6_vcl0w_enc; +defm V6_vcl0h : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">, + V6_vcl0h_enc; +defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">, + V6_vnormamtw_enc; +defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">, + V6_vnormamth_enc; +defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">, + V6_vpopcounth_enc; +} + +let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG, + Type = TypeCVI_VX_DV in +class T_HVX_vmpyacc2 <string asmString, RegisterClass RC> + : CVI_VA_Resource1 <(outs RC:$dst), + (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1Imm:$src3), + asmString, [], "$dst = $_src_" > ; + + +multiclass T_HVX_vmpyacc2 <string asmString> { + def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>; +} + +defm V6_vrmpybusi_acc : + T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">, + V6_vrmpybusi_acc_enc; +defm V6_vrsadubi_acc : + T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">, + V6_vrsadubi_acc_enc; +defm V6_vrmpyubi_acc : + T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">, + V6_vrmpyubi_acc_enc; + + +let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in +class T_HVX_vmpy2 <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1Imm:$src3), + asmString>; + + +multiclass T_HVX_vmpy2 <string asmString> { + 
def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi :
+  T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
+defm V6_vrsadubi :
+  T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
+defm V6_vrmpyubi :
+  T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
+
+
+let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
+    hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
+class T_HVX_perm <string asmString, RegisterClass RC>
+  : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
+    (ins RC:$src1, RC:$src2, IntRegs:$src3),
+    asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >;
+
+multiclass T_HVX_perm <string asmString> {
+  def NAME : T_HVX_perm <asmString, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
+}
+
+let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
+  defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
+  defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
+}
+
+// Conditional vector move.
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_cmov <bit isPredNot, RegisterClass RC>
+  : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
+    "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> {
+  let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_cmov <bit isPredNot = 0> {
+  def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
+}
+
+defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
+defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
+
+// Conditional vector combine.
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1, + hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in +class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 < (outs RCout:$dst), + (ins PredRegs:$src1, RCin:$src2, RCin:$src3), + "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> { + let isPredicatedFalse = isPredNot; +} + +multiclass T_HVX_ccombine <bit isPredNot = 0> { + def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>; +} + +defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc; +defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc; + +let hasNewValue = 1 in +class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), + (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), + asmString >; + +multiclass T_HVX_shift <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_HVX_shift <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_shift_VV <string asmString>: + T_HVX_shift <asmString, VectorRegs, VectorRegs>; + +multiclass T_HVX_shift_WV <string asmString>: + T_HVX_shift <asmString, VecDblRegs, VectorRegs>; + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in { +defm V6_valignb : + T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc; +defm V6_vlalignb : + T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vasrwh : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc; +defm V6_vasrwhsat : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">, + V6_vasrwhsat_enc; +defm V6_vasrwhrndsat : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">, + V6_vasrwhrndsat_enc; +defm V6_vasrwuhsat : + T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">, + V6_vasrwuhsat_enc; +defm V6_vasrhubsat : + T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">, + V6_vasrhubsat_enc; +defm V6_vasrhubrndsat : + T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">, + V6_vasrhubrndsat_enc; +defm V6_vasrhbrndsat : + T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">, + V6_vasrhbrndsat_enc; +} + +// Assembler mapped -- alias? 
+//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc; +let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in { +defm V6_vshuffvdd : + T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc; +defm V6_vdealvdd : + T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc; +} + +let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in +class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1), + asmString, [], "$dst = $_src_">; + +multiclass T_HVX_unpack <string asmString> { + def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>; +} + +defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc; +defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc; + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1, + hasSideEffects = 0 in +class T_HVX_valign <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3Imm:$src3), + asmString>; + +multiclass T_HVX_valign <string asmString> { + def NAME : T_HVX_valign <asmString, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>; +} + +defm V6_valignbi : + T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc; +defm V6_vlalignbi : + T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc; + +let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in +class T_HVX_predAlu <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2), + asmString>; + +multiclass T_HVX_predAlu <string asmString> { + def NAME : T_HVX_predAlu <asmString, VecPredRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>; +} + +defm V6_pred_and : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc; +defm V6_pred_or : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc; +defm V6_pred_xor : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc; +defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc; +defm V6_pred_and_n : + T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc; + +let Itinerary = CVI_VA, Type = TypeCVI_VA in +class T_HVX_prednot <RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1), + "$dst = not($src1)">, V6_pred_not_enc; + +def V6_pred_not : T_HVX_prednot <VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>; + +let Itinerary = CVI_VA, Type = TypeCVI_VA in +class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2), + asmString >; + +multiclass T_HVX_vcmp2 <string asmString> { + def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_veqb : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc; +defm V6_veqh : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc; +defm V6_veqw : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc; +defm V6_vgtb : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc; +defm V6_vgth : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc; +defm V6_vgtw : T_HVX_vcmp2 <"$dst = 
vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc; +defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc; +defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc; +defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc; + +let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in +class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, IntRegs:$src2), + "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc; + +def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>; + +let isAccumulator = 1 in +class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, IntRegs:$src2), + "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc; + +def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>; + +let hasNewValue = 1, hasSideEffects = 0 in +class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCin:$src1, IntRegs:$src2), + "$dst = vand($src1,$src2)" >, V6_vandqrt_enc; + +def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>; + +let hasNewValue = 1, hasSideEffects = 0 in +class T_V6_lvsplatw <RegisterClass RC> + : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1), + "$dst = vsplat($src1)" >, V6_lvsplatw_enc; + +def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>; +let isCodeGenOnly = 1 in +def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>; + + +let hasNewValue = 1 in +class T_V6_vinsertwr <RegisterClass RC> + : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1), + "$dst.w = vinsert($src1)", [], "$dst = $_src_">, + V6_vinsertwr_enc; + +def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>; + + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in +class T_V6_pred_scalar2 <RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1), + "$dst = vsetq($src1)">, V6_pred_scalar2_enc; + +def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>; + +class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2), + "$dst = vand($src1,$src2)">, V6_vandvrt_enc; + +def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>; + +let validSubTargets = HasV60SubT in +class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp > + : SInst2 <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2), asmString>; + +class T_HVX_rol_R <string asmString> + : T_HVX_rol <asmString, IntRegs, u5Imm>; +class T_HVX_rol_P <string asmString> + : T_HVX_rol <asmString, DoubleRegs, u6Imm>; + +def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc; +let hasNewValue = 1, opNewValue = 0 in +def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc; + +let validSubTargets = HasV60SubT in +class T_HVX_rol_acc 
<string asmString, RegisterClass RC, Operand ImmOp>
+  : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
+    asmString, [], "$dst = $_src_" >;
+
+class T_HVX_rol_acc_P <string asmString>
+  : T_HVX_rol_acc <asmString, DoubleRegs, u6Imm>;
+
+class T_HVX_rol_acc_R <string asmString>
+  : T_HVX_rol_acc <asmString, IntRegs, u5Imm>;
+
+def S6_rol_i_p_nac :
+  T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
+def S6_rol_i_p_acc :
+  T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
+def S6_rol_i_p_and :
+  T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
+def S6_rol_i_p_or :
+  T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
+def S6_rol_i_p_xacc :
+  T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
+
+let hasNewValue = 1, opNewValue = 0 in {
+def S6_rol_i_r_nac :
+  T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
+def S6_rol_i_r_acc :
+  T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
+def S6_rol_i_r_and :
+  T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
+def S6_rol_i_r_or :
+  T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
+def S6_rol_i_r_xacc :
+  T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
+}
+
+let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
+class T_V6_extractw <RegisterClass RC>
+  : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
+    "$dst = vextract($src1,$src2)">, V6_extractw_enc;
+
+def V6_extractw : T_V6_extractw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
+
+let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT in
+class T_sys0op <string asmString>
+  : ST1Inst <(outs), (ins), asmString>;
+
+let isSolo = 1, validSubTargets = HasV55SubT in {
+def Y5_l2gunlock : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
+def Y5_l2gclean : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
+def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
+}
+
+class T_sys1op <string asmString, RegisterClass RC>
+  : ST1Inst <(outs), (ins RC:$src1), asmString>;
+
+class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
+class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
+
+let isSoloAX = 1, validSubTargets = HasV55SubT in
+def Y5_l2unlocka : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
+
+let isSolo = 1, validSubTargets = HasV60SubT in {
+def Y6_l2gcleanpa : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
+def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
+}
+
+let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
+    validSubTargets = HasV55SubT in
+def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
+    "$dst = l2locka($src1)">, Y5_l2locka_enc;
+
+// Not defined on the etc side. Why?
+// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
+
+let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1,
+    hasSideEffects = 0,
+validSubTargets = HasV55SubT in
+def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2),
+    (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
+    "$dst1,$dst2 = vacsh($src1,$src2)", [],
+    "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
+    hasSideEffects = 0 in
+class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
+                  RegisterClass RCin2>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
+
+multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
+  def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
+                               VecPredRegs128B, VectorRegs128B>;
+}
+
+multiclass T_HVX_alu2_V <string asmString> :
+  T_HVX_alu2 <asmString, VectorRegs>;
+
+multiclass T_HVX_alu2_W <string asmString> :
+  T_HVX_alu2 <asmString, VecDblRegs>;
+
+defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
+    hasSideEffects = 0 in
+defm V6_vmux : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
+
+class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
+
+multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
+                        RegisterClass RCin> {
+  def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
+                                !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_V <string asmString> :
+  T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_W <string asmString> :
+  T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
+class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+                       RegisterClass RCin>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+    asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+                            RegisterClass RCin> {
+  def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vlutb_acc<asmString,
+                                   !cast<RegisterClass>(RCout#"128B"),
+                                   !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_acc_V <string asmString> :
+  T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_acc_W <string asmString> :
+  T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
+defm V6_vlutvvb:
+  T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
+defm V6_vlutvwh:
+  T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
+
+let hasNewValue = 1 in {
+  defm V6_vlutvvb_oracc:
+    T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
+    V6_vlutvvb_oracc_enc;
+  defm V6_vlutvwh_oracc:
+    T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
+    V6_vlutvwh_oracc_enc;
+}
+
+// This is a fake instruction; should it be defined at all?
+def S2_cabacencbin
+  : SInst2<(outs DoubleRegs:$dst),
+    (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+    "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
+
+// Vhist instructions
+def V6_vhistq
+  : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
+    "vhist($src1)">, V6_vhistq_enc;
+
+def V6_vhist
+  : CVI_HIST_Resource1 <(outs), (ins),
+    "vhist" >, V6_vhist_enc;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
index f4fb946..96dd531 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -35,6 +35,34 @@ multiclass bitconvert_64<ValueType a, ValueType b> {
             (a DoubleRegs:$src)>;
 }
 
+multiclass bitconvert_vec<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VectorRegs:$src))),
+             (b VectorRegs:$src)>;
+  def : Pat <(a (bitconvert (b VectorRegs:$src))),
+             (a VectorRegs:$src)>;
+}
+
+multiclass bitconvert_dblvec<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VecDblRegs:$src))),
+             (b VecDblRegs:$src)>;
+  def : Pat <(a (bitconvert (b VecDblRegs:$src))),
+             (a VecDblRegs:$src)>;
+}
+
+multiclass bitconvert_predvec<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VecPredRegs:$src))),
+             (b VectorRegs:$src)>;
+  def : Pat <(a (bitconvert (b VectorRegs:$src))),
+             (a VecPredRegs:$src)>;
+}
+
+multiclass bitconvert_dblvec128B<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a VecDblRegs128B:$src))),
+             (b VecDblRegs128B:$src)>;
+  def : Pat <(a (bitconvert (b VecDblRegs128B:$src))),
+             (a VecDblRegs128B:$src)>;
+}
+
 // Bit convert vector types.
 defm : bitconvert_32<v4i8, i32>;
 defm : bitconvert_32<v2i16, i32>;
@@ -47,6 +75,21 @@ defm : bitconvert_64<v8i8, v4i16>;
 defm : bitconvert_64<v8i8, v2i32>;
 defm : bitconvert_64<v4i16, v2i32>;
 
+defm : bitconvert_vec<v64i8, v16i32>;
+defm : bitconvert_vec<v8i64, v16i32>;
+defm : bitconvert_vec<v32i16, v16i32>;
+
+defm : bitconvert_dblvec<v16i64, v128i8>;
+defm : bitconvert_dblvec<v32i32, v128i8>;
+defm : bitconvert_dblvec<v64i16, v128i8>;
+
+defm : bitconvert_dblvec128B<v64i32, v128i16>;
+defm : bitconvert_dblvec128B<v256i8, v128i16>;
+defm : bitconvert_dblvec128B<v32i64, v128i16>;
+
+defm : bitconvert_dblvec128B<v64i32, v256i8>;
+defm : bitconvert_dblvec128B<v32i64, v256i8>;
+defm : bitconvert_dblvec128B<v128i16, v256i8>;
 
 // Vector shift support. Vector shifting in Hexagon is rather different
 // from LLVM's internal representation.
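For readers following the bitconvert hunks above: each of those `defm` lines expands, by ordinary TableGen multiclass instantiation, into a pair of selection patterns that treat a same-width vector bitcast as a plain register reuse. A minimal sketch of the expansion, assuming only the multiclass definitions shown in the hunk (the spelled-out records below are illustrative, not lines of the patch):

  // What `defm : bitconvert_vec<v64i8, v16i32>;` instantiates: two Pat
  // records that select the bitconvert to the unchanged source register.
  def : Pat <(v16i32 (bitconvert (v64i8 VectorRegs:$src))),
             (v16i32 VectorRegs:$src)>;
  def : Pat <(v64i8 (bitconvert (v16i32 VectorRegs:$src))),
             (v64i8 VectorRegs:$src)>;

No instruction is emitted for either direction: both value types occupy the same HVX register class, so the cast is free at selection time.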
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td index 1d0d015..b207aaf 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td @@ -691,15 +691,15 @@ def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>; def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>; def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>; -def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s8ExtPred, s8ImmPred>; +def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32ImmPred, s8ImmPred>; def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), (I32:$Rt))), (i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>; // Mux -def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s8ExtPred>; -def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s8ExtPred>; -def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s8ExtPred, s8ImmPred>; +def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32ImmPred>; +def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32ImmPred>; +def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32ImmPred, s8ImmPred>; // Shift halfword def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>; @@ -720,17 +720,17 @@ def : T_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>; def : T_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>; def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>; -def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s10ExtPred>; -def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s10ExtPred>; -def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u9ExtPred>; +def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>; +def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>; +def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>; -def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s32ImmPred:$src2)), (i32 (C2_cmpgti (I32:$src1), - (DEC_CONST_SIGNED s8ExtPred:$src2)))>; + (DEC_CONST_SIGNED s32ImmPred:$src2)))>; -def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u32ImmPred:$src2)), (i32 (C2_cmpgtui (I32:$src1), - (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>; + (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>; // The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0. def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)), @@ -1289,3 +1289,5 @@ def: T_stc_pat<S2_storerf_pci_pseudo, int_hexagon_circ_sthhi, s4_1ImmPred, I32>; include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" +include "HexagonIntrinsicsV60.td" + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td new file mode 100644 index 0000000..24a3e4d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td @@ -0,0 +1,836 @@ +//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format. 
+// +//===----------------------------------------------------------------------===// + +let isCodeGenOnly = 1 in { +def HEXAGON_V6_vd0_pseudo : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins ), + "$dst=#0", + [(set VectorRegs:$dst, (int_hexagon_V6_vd0 ))]>; + +def HEXAGON_V6_vd0_pseudo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins ), + "$dst=#0", + [(set VectorRegs128B:$dst, (int_hexagon_V6_vd0_128B ))]>; +} +let isPseudo = 1 in +def HEXAGON_V6_vassignp : CVI_VA_Resource<(outs VecDblRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=vassignp_W($src1)", + [(set VecDblRegs:$dst, (int_hexagon_V6_vassignp VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=vassignp_W_128B($src1)", + [(set VecDblRegs128B:$dst, (int_hexagon_V6_vassignp_128B + VecDblRegs128B:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_lo : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=lo_W($src1)", + [(set VectorRegs:$dst, (int_hexagon_V6_lo VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_hi : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=hi_W($src1)", + [(set VectorRegs:$dst, (int_hexagon_V6_hi VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_lo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=lo_W($src1)", + [(set VectorRegs128B:$dst, (int_hexagon_V6_lo_128B VecDblRegs128B:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_hi_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=hi_W($src1)", + [(set VectorRegs128B:$dst, (int_hexagon_V6_hi_128B VecDblRegs128B:$src1))]>; + +let AddedComplexity = 100 in { +def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 VecDblRegs:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_loreg)) >, + Requires<[UseHVXSgl]>; + +def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 VecDblRegs:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_hireg)) >, + Requires<[UseHVXSgl]>; + +def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 VecDblRegs128B:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), + subreg_loreg)) >, + Requires<[UseHVXDbl]>; + +def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 VecDblRegs128B:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), + subreg_hireg)) >, + Requires<[UseHVXDbl]>; +} + +def : Pat <(v512i1 (bitconvert (v16i32 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v16i32 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v32i16 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v32i16 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v64i8 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v64i8 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v8i64 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v8i64 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v16i32 (bitconvert (v512i1 VecPredRegs:$src1))), + (v16i32 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v32i16 (bitconvert (v512i1 VecPredRegs:$src1))), + (v32i16 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v64i8 (bitconvert (v512i1 VecPredRegs:$src1))), + (v64i8 (V6_vandqrt(v512i1 
VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v8i64 (bitconvert (v512i1 VecPredRegs:$src1))), + (v8i64 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v1024i1 (bitconvert (v32i32 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v32i32 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v64i16 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v64i16 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v128i8 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v128i8 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v16i64 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v16i64 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v32i32 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v32i32 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v64i16 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v64i16 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v128i8 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v128i8 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v16i64 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v16i64 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +let AddedComplexity = 140 in { +def : Pat <(store (v512i1 VecPredRegs:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai IntRegs:$addr, 0, + (v16i32 (V6_vandqrt (v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101))))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (load (i32 IntRegs:$addr))), + (v512i1 (V6_vandvrt + (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(store (v1024i1 VecPredRegs128B:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai_128B IntRegs:$addr, 0, + (v32i32 (V6_vandqrt_128B (v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101))))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (load (i32 IntRegs:$addr))), + (v1024i1 (V6_vandvrt_128B + (v32i32 (V6_vL32b_ai_128B IntRegs:$addr, 0)), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; +} + +multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>, + Requires<[UseHVXSgl]>; + def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1), + (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1), + (MI VectorRegs:$src1)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1), + (MI VecPredRegs:$src1)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2), + (MI VecDblRegs:$src1, IntRegs:$src2)>, + 
Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B")VecDblRegs128B:$src1, IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B")VecDblRegs128B:$src1, IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2), + (MI VectorRegs:$src1, IntRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B")VectorRegs128B:$src1, IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B")VectorRegs128B:$src1, IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2), + (MI VecDblRegs:$src1, VectorRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2), + (MI VecDblRegs:$src1, VecDblRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2), + (MI VectorRegs:$src1, VectorRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2), + (MI VecPredRegs:$src1, IntRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2), + (MI VecPredRegs:$src1, VecPredRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VecPredRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VecPredRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3), + (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VecDblRegs:$src1, 
VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VecPredRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VecPredRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + + +multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, imm:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, imm:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WRI_pat <InstHexagon MI, 
Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2, imm:$src3), + (MI VecDblRegs:$src1, IntRegs:$src2, imm:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + IntRegs:$src2, imm:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + IntRegs:$src2, imm:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4), + (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3, imm:$src4), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3, imm:$src4)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4), + (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4), + (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4)>, + Requires<[UseHVXDbl]>; +} + +defm : T_WR_pat<V6_vtmpyb, int_hexagon_V6_vtmpyb>; +defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>; +defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>; +defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>; +defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>; +defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>; +defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>; +defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>; +defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>; +defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>; +defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>; +defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>; +defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>; +defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>; +defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>; +defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>; +defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>; +defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>; +defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>; +defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>; +defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>; +defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>; +defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>; +defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>; +defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>; +defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>; +defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>; +defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>; +defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>; 
+defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>; +defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>; +defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>; + +defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>; +defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>; +defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>; +defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>; +defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>; +defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>; +defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>; +defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>; +defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>; +defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>; +defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>; +defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>; +defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>; +defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>; +defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>; +defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>; +defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>; +defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>; +defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>; +defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>; +defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>; +defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>; +defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>; +defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>; +defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>; +defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>; +defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>; +defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>; +defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>; +defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>; +defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>; +defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>; +defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>; +defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>; +defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>; +defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>; +defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>; +defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>; +defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>; +defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>; +defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>; +defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>; +defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>; +defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>; +defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>; +defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>; +defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>; +defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>; +defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>; +defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>; +defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>; +defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>; +defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>; +defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>; +defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>; +defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>; +defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>; +defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>; +defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>; +defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>; 
+defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>; +defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>; +defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>; +defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>; + +defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>; +defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>; +defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>; +defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>; +defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>; +defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>; +defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>; +defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>; +defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>; +defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>; +defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>; + +defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>; +defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>; + +defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>; +defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>; +defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>; +defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>; + +defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>; +defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>; +defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>; +defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>; +defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>; +defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>; +defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>; +defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>; + +defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>; +defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>; +defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>; +defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>; +defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>; +defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>; +defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>; +defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>; +defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>; +defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>; +defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>; +defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>; +defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>; +defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>; +defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>; + +// Compare instructions +defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>; +defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>; +defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>; +defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>; +defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>; +defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>; +defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>; +defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>; +defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>; +defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>; +defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>; +defm : T_QVV_pat 
<V6_veqw_or, int_hexagon_V6_veqw_or>; +defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>; +defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>; +defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>; +defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>; +defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>; +defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>; +defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>; +defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>; +defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>; +defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>; +defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>; +defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>; +defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>; +defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>; +defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>; + +defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>; +defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>; +defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>; +defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>; +defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>; +defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>; +defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>; +defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>; +defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>; +defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>; +defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>; +defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>; +defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>; +defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>; +defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>; +defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>; +defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>; +defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>; +defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>; +defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>; +defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>; +defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>; +defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>; +defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>; +defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>; +defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>; +defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>; +defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>; +defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>; +defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>; +defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>; +defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>; +defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>; +defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>; +defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>; +defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>; +defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>; +defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>; +defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>; +defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>; +defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>; +defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>; +defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>; +defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>; +defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>; +defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>; + +defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>; +defm : T_QVV_pat 
<V6_vaddhq, int_hexagon_V6_vaddhq>; +defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>; +defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>; +defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>; +defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>; +defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>; +defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>; +defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>; +defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>; +defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>; +defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>; + +defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>; +defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>; +defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>; +defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>; +defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>; +defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>; +defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>; +defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>; +defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>; +defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>; +defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>; +defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>; +defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>; +defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>; +defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>; +defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>; +defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>; +defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>; +defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>; +defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>; +defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>; +defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>; +defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>; + +defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>; +defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>; +defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>; + +defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>; +defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>; +defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>; + +// Assembler mapped. +//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>; +// Not present earlier; the intrinsic needs to be added. +defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>; +defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>; +defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>; +defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>; +defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>; +defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>; +defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>; +defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>; +defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>; + +defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>; +defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>; + +defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>; +defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>; +defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>; +defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>; + +defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>; +defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>; +defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>; +defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>; +defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>; +defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>; +defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>; +defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>; +defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>; +defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>; +defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>; +defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>; +defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>; +defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>; +defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>; +defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>; +defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>; + +defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>; +defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>; +defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>; +defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>; +defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>; +defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>; + +defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>; +defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>; +defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>; +defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>; + +def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>; +def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>; +def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>; +def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>; +def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>; +def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>; +def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>; +def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>; +def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>; +def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>; +def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>; +def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>; + +defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>; +defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>; + +def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>; + +def: Pat<(v64i16 (trunc v64i32:$Vdd)), + (v64i16 (V6_vpackwh_sat_128B + (v32i32 (HEXAGON_V6_hi_128B VecDblRegs128B:$Vdd)), + 
(v32i32 (HEXAGON_V6_lo_128B VecDblRegs128B:$Vdd))))>, + Requires<[UseHVXDbl]>; + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 75189b6..624c0f6 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -26,39 +26,71 @@ using namespace llvm; -static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol, - HexagonAsmPrinter& Printer) { +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + HexagonAsmPrinter &Printer) { MCContext &MC = Printer.OutContext; const MCExpr *ME; - ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC); + // Populate the relocation type based on Hexagon target flags + // set on an operand + MCSymbolRefExpr::VariantKind RelocationType; + switch (MO.getTargetFlags()) { + default: + RelocationType = MCSymbolRefExpr::VK_None; + break; + case HexagonII::MO_PCREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL; + break; + case HexagonII::MO_GOT: + RelocationType = MCSymbolRefExpr::VK_GOT; + break; + case HexagonII::MO_LO16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16; + break; + case HexagonII::MO_HI16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16; + break; + case HexagonII::MO_GPREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL; + break; + } + + ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC); if (!MO.isJTI() && MO.getOffset()) ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC), MC); - return (MCOperand::createExpr(ME)); + return MCOperand::createExpr(ME); } // Create an MCInst from a MachineInstr -void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, - HexagonAsmPrinter& AP) { - if(MI->getOpcode() == Hexagon::ENDLOOP0){ +void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP) { + if (MI->getOpcode() == Hexagon::ENDLOOP0) { HexagonMCInstrInfo::setInnerLoop(MCB); return; } - if(MI->getOpcode() == Hexagon::ENDLOOP1){ + if (MI->getOpcode() == Hexagon::ENDLOOP1) { HexagonMCInstrInfo::setOuterLoop(MCB); return; } - MCInst* MCI = new (AP.OutContext) MCInst; + MCInst *MCI = new (AP.OutContext) MCInst; MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) && "MCI opcode should have been set on construction"); + bool MustExtend = false; for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCO; + if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended) + MustExtend = true; switch (MO.getType()) { default: @@ -73,11 +105,14 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, APFloat Val = MO.getFPImm()->getValueAPF(); // FP immediates are used only when setting GPRs, so they may be dealt // with like regular immediates from this point on. 
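// A minimal standalone sketch, assuming IEEE-754 single precision, of the
// bit reinterpretation this relies on (plain C++ with <cstdint> and
// <cstring>, not the LLVM API):
//
//   uint32_t floatBits(float F) {            // raw bit pattern of F
//     uint32_t Bits;
//     std::memcpy(&Bits, &F, sizeof(Bits));  // well-defined type punning
//     return Bits;                           // 1.0f -> 0x3f800000
//   }
//
// Val.bitcastToAPInt().getRawData() below performs the same
// reinterpretation for an arbitrary-width APFloat.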
- MCO = MCOperand::createImm(*Val.bitcastToAPInt().getRawData()); + MCO = MCOperand::createExpr( + MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(), + AP.OutContext)); break; } case MachineOperand::MO_Immediate: - MCO = MCOperand::createImm(MO.getImm()); + MCO = MCOperand::createExpr( + MCConstantExpr::create(MO.getImm(), AP.OutContext)); break; case MachineOperand::MO_MachineBasicBlock: MCO = MCOperand::createExpr @@ -104,5 +139,8 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, MCI->addOperand(MCO); } + AP.HexagonProcessInstruction(*MCI, *MI); + HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI, + MustExtend); MCB.addOperand(MCOperand::createInst(MCI)); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 35f732c..7a52d68 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -179,7 +179,11 @@ void VLIWMachineScheduler::schedule() { initQueues(TopRoots, BotRoots); bool IsTopNode = false; - while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + while (true) { + DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) break; + if (!checkSchedLimit()) break; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp index 707bfdb..20c4ab1 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -92,6 +92,7 @@ namespace { /// \brief A handle to the branch probability pass. const MachineBranchProbabilityInfo *MBPI; + bool isNewValueJumpCandidate(const MachineInstr *MI) const; }; } // end of anonymous namespace @@ -280,9 +281,9 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII, return true; } -// Given a compare operator, return a matching New Value Jump -// compare operator. Make sure that MI here is included in -// HexagonInstrInfo.cpp::isNewValueJumpCandidate + +// Given a compare operator, return a matching New Value Jump compare operator. +// Make sure that MI here is included in isNewValueJumpCandidate. static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, bool secondRegNewified, MachineBasicBlock *jmpTarget, @@ -341,6 +342,24 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t : Hexagon::J4_cmpgtui_t_jumpnv_nt; + case Hexagon::C4_cmpneq: + return taken ? Hexagon::J4_cmpeq_f_jumpnv_t + : Hexagon::J4_cmpeq_f_jumpnv_nt; + + case Hexagon::C4_cmplte: + if (secondRegNewified) + return taken ? Hexagon::J4_cmplt_f_jumpnv_t + : Hexagon::J4_cmplt_f_jumpnv_nt; + return taken ? Hexagon::J4_cmpgt_f_jumpnv_t + : Hexagon::J4_cmpgt_f_jumpnv_nt; + + case Hexagon::C4_cmplteu: + if (secondRegNewified) + return taken ? Hexagon::J4_cmpltu_f_jumpnv_t + : Hexagon::J4_cmpltu_f_jumpnv_nt; + return taken ? 
Hexagon::J4_cmpgtu_f_jumpnv_t + : Hexagon::J4_cmpgtu_f_jumpnv_nt; + default: llvm_unreachable("Could not find matching New Value Jump instruction."); } @@ -348,6 +367,26 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return 0; } +bool HexagonNewValueJump::isNewValueJumpCandidate(const MachineInstr *MI) + const { + switch (MI->getOpcode()) { + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgtu: + case Hexagon::C2_cmpgtui: + case Hexagon::C4_cmpneq: + case Hexagon::C4_cmplte: + case Hexagon::C4_cmplteu: + return true; + + default: + return false; + } +} + + bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n" @@ -372,7 +411,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { // Loop through all the bb's of the function for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n"); @@ -468,7 +507,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { MI->getOperand(0).getReg() == predReg) { // Not all compares can be new value compare. Arch Spec: 7.6.1.1 - if (QII->isNewValueJumpCandidate(MI)) { + if (isNewValueJumpCandidate(MI)) { assert((MI->getDesc().isCompare()) && "Only compare instruction can be collapsed into New Value Jump"); @@ -591,8 +630,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { DebugLoc dl = MI->getDebugLoc(); MachineInstr *NewMI; - assert((QII->isNewValueJumpCandidate(cmpInstr)) && - "This compare is not a New Value Jump candidate."); + assert((isNewValueJumpCandidate(cmpInstr)) && + "This compare is not a New Value Jump candidate."); unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2, isSecondOpNewified, jmpTarget, MBPI); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td index 2bece8f..fbd29cd 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td @@ -1,4 +1,4 @@ -//===- HexagonOperands.td - Hexagon immediate processing -*- tablegen -*-===// +//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -7,59 +7,114 @@ // //===----------------------------------------------------------------------===// +def s32ImmOperand : AsmOperandClass { let Name = "s32Imm"; } +def s8ImmOperand : AsmOperandClass { let Name = "s8Imm"; } +def s8Imm64Operand : AsmOperandClass { let Name = "s8Imm64"; } +def s6ImmOperand : AsmOperandClass { let Name = "s6Imm"; } +def s4ImmOperand : AsmOperandClass { let Name = "s4Imm"; } def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; } def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; } def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; } def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; } - +def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; } +def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; } +def u64ImmOperand : AsmOperandClass { let Name = "u64Imm"; } +def u32ImmOperand : AsmOperandClass { let Name = "u32Imm"; } +def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; } +def u16ImmOperand : AsmOperandClass { let Name = "u16Imm"; } +def u16_0ImmOperand : AsmOperandClass { let 
Name = "u16_0Imm"; } +def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; } +def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; } +def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; } +def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; } +def u10ImmOperand : AsmOperandClass { let Name = "u10Imm"; } +def u9ImmOperand : AsmOperandClass { let Name = "u9Imm"; } +def u8ImmOperand : AsmOperandClass { let Name = "u8Imm"; } +def u7ImmOperand : AsmOperandClass { let Name = "u7Imm"; } +def u6ImmOperand : AsmOperandClass { let Name = "u6Imm"; } +def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; } +def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; } +def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; } +def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; } +def u5ImmOperand : AsmOperandClass { let Name = "u5Imm"; } +def u4ImmOperand : AsmOperandClass { let Name = "u4Imm"; } +def u3ImmOperand : AsmOperandClass { let Name = "u3Imm"; } +def u2ImmOperand : AsmOperandClass { let Name = "u2Imm"; } +def u1ImmOperand : AsmOperandClass { let Name = "u1Imm"; } +def n8ImmOperand : AsmOperandClass { let Name = "n8Imm"; } // Immediate operands. -let PrintMethod = "printImmOperand" in { - def s32Imm : Operand<i32>; - def s8Imm : Operand<i32>; - def s8Imm64 : Operand<i64>; - def s6Imm : Operand<i32>; +let OperandType = "OPERAND_IMMEDIATE", + DecoderMethod = "unsignedImmDecoder" in { + def s32Imm : Operand<i32> { let ParserMatchClass = s32ImmOperand; + let DecoderMethod = "s32ImmDecoder"; } + def s8Imm : Operand<i32> { let ParserMatchClass = s8ImmOperand; + let DecoderMethod = "s8ImmDecoder"; } + def s8Imm64 : Operand<i64> { let ParserMatchClass = s8Imm64Operand; + let DecoderMethod = "s8ImmDecoder"; } + def s6Imm : Operand<i32> { let ParserMatchClass = s6ImmOperand; + let DecoderMethod = "s6_0ImmDecoder"; } def s6_3Imm : Operand<i32>; - def s4Imm : Operand<i32>; - def s4_0Imm : Operand<i32> { let DecoderMethod = "s4_0ImmDecoder"; } - def s4_1Imm : Operand<i32> { let DecoderMethod = "s4_1ImmDecoder"; } - def s4_2Imm : Operand<i32> { let DecoderMethod = "s4_2ImmDecoder"; } - def s4_3Imm : Operand<i32> { let DecoderMethod = "s4_3ImmDecoder"; } - def u64Imm : Operand<i64>; - def u32Imm : Operand<i32>; - def u26_6Imm : Operand<i32>; - def u16Imm : Operand<i32>; - def u16_0Imm : Operand<i32>; - def u16_1Imm : Operand<i32>; - def u16_2Imm : Operand<i32>; - def u16_3Imm : Operand<i32>; - def u11_3Imm : Operand<i32>; - def u10Imm : Operand<i32>; - def u9Imm : Operand<i32>; - def u8Imm : Operand<i32>; - def u7Imm : Operand<i32>; - def u6Imm : Operand<i32>; - def u6_0Imm : Operand<i32>; - def u6_1Imm : Operand<i32>; - def u6_2Imm : Operand<i32>; - def u6_3Imm : Operand<i32>; - def u5Imm : Operand<i32>; + def s4Imm : Operand<i32> { let ParserMatchClass = s4ImmOperand; + let DecoderMethod = "s4_0ImmDecoder"; } + def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; + let DecoderMethod = "s4_0ImmDecoder"; } + def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; + let DecoderMethod = "s4_1ImmDecoder"; } + def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; + let DecoderMethod = "s4_2ImmDecoder"; } + def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; + let DecoderMethod = "s4_3ImmDecoder"; } + def u64Imm : Operand<i64> { let ParserMatchClass = u64ImmOperand; } + def u32Imm : Operand<i32> { let ParserMatchClass = u32ImmOperand; } + def u26_6Imm : Operand<i32> { let ParserMatchClass = 
u26_6ImmOperand; } + def u16Imm : Operand<i32> { let ParserMatchClass = u16ImmOperand; } + def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; } + def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; } + def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; } + def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; } + def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; } + def u10Imm : Operand<i32> { let ParserMatchClass = u10ImmOperand; } + def u9Imm : Operand<i32> { let ParserMatchClass = u9ImmOperand; } + def u8Imm : Operand<i32> { let ParserMatchClass = u8ImmOperand; } + def u7Imm : Operand<i32> { let ParserMatchClass = u7ImmOperand; } + def u6Imm : Operand<i32> { let ParserMatchClass = u6ImmOperand; } + def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; } + def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; } + def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; } + def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; } + def u5Imm : Operand<i32> { let ParserMatchClass = u5ImmOperand; } + def u5_0Imm : Operand<i32>; + def u5_1Imm : Operand<i32>; def u5_2Imm : Operand<i32>; def u5_3Imm : Operand<i32>; - def u4Imm : Operand<i32>; + def u4Imm : Operand<i32> { let ParserMatchClass = u4ImmOperand; } def u4_0Imm : Operand<i32>; + def u4_1Imm : Operand<i32>; def u4_2Imm : Operand<i32>; - def u3Imm : Operand<i32>; + def u4_3Imm : Operand<i32>; + def u3Imm : Operand<i32> { let ParserMatchClass = u3ImmOperand; } def u3_0Imm : Operand<i32>; def u3_1Imm : Operand<i32>; - def u2Imm : Operand<i32>; - def u1Imm : Operand<i32>; - def n8Imm : Operand<i32>; - def m6Imm : Operand<i32>; + def u3_2Imm : Operand<i32>; + def u3_3Imm : Operand<i32>; + def u2Imm : Operand<i32> { let ParserMatchClass = u2ImmOperand; } + def u1Imm : Operand<i32> { let ParserMatchClass = u1ImmOperand; } + def n8Imm : Operand<i32> { let ParserMatchClass = n8ImmOperand; } } -let PrintMethod = "printNOneImmOperand" in -def nOneImm : Operand<i32>; +let OperandType = "OPERAND_IMMEDIATE" in { + def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand; + let PrintMethod = "prints4_6ImmOperand"; + let DecoderMethod = "s4_6ImmDecoder";} + def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand"; + let DecoderMethod = "s4_6ImmDecoder";} + def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand; + let PrintMethod = "prints3_6ImmOperand"; + let DecoderMethod = "s3_6ImmDecoder";} + def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand"; + let DecoderMethod = "s3_6ImmDecoder";} +} // // Immediate predicates @@ -81,32 +136,12 @@ def s31_1ImmPred : PatLeaf<(i32 imm), [{ def s30_2ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<31,1>(v); + return isShiftedInt<30,2>(v); }]>; def s29_3ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<31,1>(v); -}]>; - -def s22_10ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<22,10>(v); -}]>; - -def s8_24ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<8,24>(v); -}]>; - -def s16_16ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<16,16>(v); -}]>; - -def s26_6ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<26,6>(v); + return isShiftedInt<29,3>(v); 
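// isShiftedInt<N, S>(x) is equivalent to isInt<N + S>(x) combined with
// x % (1 << S) == 0, i.e. an S-bit-aligned signed immediate with N
// significant bits above the alignment; here, a multiple of 8 that fits
// in 32 signed bits.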
}]>; def s16ImmPred : PatLeaf<(i32 imm), [{ @@ -114,16 +149,6 @@ def s16ImmPred : PatLeaf<(i32 imm), [{ return isInt<16>(v); }]>; -def s13ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<13>(v); -}]>; - -def s12ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<12>(v); -}]>; - def s11_0ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isInt<11>(v); @@ -149,16 +174,6 @@ def s10ImmPred : PatLeaf<(i32 imm), [{ return isInt<10>(v); }]>; -def s9ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<9>(v); -}]>; - -def m9ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<9>(v) && (v != -256); -}]>; - def s8ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isInt<8>(v); @@ -194,7 +209,6 @@ def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4,3>(v); }]>; - def u64ImmPred : PatLeaf<(i64 imm), [{ // Adding "N ||" to suppress gcc unused warning. return (N || true); @@ -230,19 +244,19 @@ def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26,6>(v); }]>; -def u16ImmPred : PatLeaf<(i32 imm), [{ +def u16_0ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isUInt<16>(v); }]>; -def u16_s8ImmPred : PatLeaf<(i32 imm), [{ +def u16_1ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedUInt<16,8>(v); + return isShiftedUInt<16,1>(v); }]>; -def u16_0ImmPred : PatLeaf<(i32 imm), [{ +def u16_2ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isUInt<16>(v); + return isShiftedUInt<16,2>(v); }]>; def u11_3ImmPred : PatLeaf<(i32 imm), [{ @@ -250,6 +264,11 @@ def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11,3>(v); }]>; +def u10ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<10>(v); +}]>; + def u9ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isUInt<9>(v); @@ -321,6 +340,11 @@ def u1ImmPred : PatLeaf<(i1 imm), [{ return isUInt<1>(v); }]>; +def u1ImmPred32 : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<1>(v); +}]>; + def m5BImmPred : PatLeaf<(i32 imm), [{ // m5BImmPred predicate - True if the (char) number is in range -1 .. -31 // and will fit in a 5 bit field when made positive, for use in memops. @@ -379,7 +403,7 @@ def Clr5ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr5ImmPred : PatLeaf<(i32 imm), [{ - // SetClr5ImmPred predicate - True if the immediate is in range 0..31. + // True if the immediate is in range 0..31. int32_t v = (int32_t)N->getSExtValue(); return (v >= 0 && v <= 31); }]>; @@ -404,14 +428,13 @@ def Clr4ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr4ImmPred : PatLeaf<(i32 imm), [{ - // SetClr4ImmPred predicate - True if the immediate is in the range 0..15. + // True if the immediate is in the range 0..15. int16_t v = (int16_t)N->getSExtValue(); return (v >= 0 && v <= 15); }]>; def Set3ImmPred : PatLeaf<(i32 imm), [{ - // Set3ImmPred predicate - True if the number is in the series of values: - // [ 2^0, 2^1, ... 2^7 ]. + // True if the number is in the series of values: [ 2^0, 2^1, ... 2^7 ]. // For use in setbit immediate. uint8_t v = (int8_t)N->getSExtValue(); // Constrain to 8 bits, and then check for single bit. 
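The single-bit test behind these Set*/Clr* predicates is the usual
power-of-two check. A self-contained C++ sketch of the same logic (the
function name is illustrative, not part of the patch):

    #include <cstdint>

    // True iff exactly one bit of V is set, i.e. V is a power of two.
    // Mirrors "constrain to 8 bits, then check for single bit" above.
    static bool isSingleBit(uint8_t V) {
      return V != 0 && (V & (V - 1)) == 0;
    }

    int main() {
      return (isSingleBit(0x40) && !isSingleBit(0x41)) ? 0 : 1;
    }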
@@ -419,9 +442,7 @@ def Set3ImmPred : PatLeaf<(i32 imm), [{ }]>; def Clr3ImmPred : PatLeaf<(i32 imm), [{ - // Clr3ImmPred predicate - True if the number is in the series of - // bit negated values: - // [ 2^0, 2^1, ... 2^7 ]. + // True if the number is in the series of bit negated values: [ 2^0, 2^1, ... 2^7 ]. // For use in setbit and clrbit immediate. uint8_t v = ~ (int8_t)N->getSExtValue(); // Constrain to 8 bits, and then check for single bit. @@ -429,76 +450,109 @@ def Clr3ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr3ImmPred : PatLeaf<(i32 imm), [{ - // SetClr3ImmPred predicate - True if the immediate is in the range 0..7. + // True if the immediate is in the range 0..7. int8_t v = (int8_t)N->getSExtValue(); return (v >= 0 && v <= 7); }]>; // Extendable immediate operands. - -let PrintMethod = "printExtOperand" in { - def f32Ext : Operand<f32>; - def s16Ext : Operand<i32> { let DecoderMethod = "s16ImmDecoder"; } - def s12Ext : Operand<i32> { let DecoderMethod = "s12ImmDecoder"; } - def s11_0Ext : Operand<i32> { let DecoderMethod = "s11_0ImmDecoder"; } - def s11_1Ext : Operand<i32> { let DecoderMethod = "s11_1ImmDecoder"; } - def s11_2Ext : Operand<i32> { let DecoderMethod = "s11_2ImmDecoder"; } - def s11_3Ext : Operand<i32> { let DecoderMethod = "s11_3ImmDecoder"; } - def s10Ext : Operand<i32> { let DecoderMethod = "s10ImmDecoder"; } - def s9Ext : Operand<i32> { let DecoderMethod = "s90ImmDecoder"; } - def s8Ext : Operand<i32> { let DecoderMethod = "s8ImmDecoder"; } - def s7Ext : Operand<i32>; - def s6Ext : Operand<i32> { let DecoderMethod = "s6_0ImmDecoder"; } - def u6Ext : Operand<i32>; - def u7Ext : Operand<i32>; - def u8Ext : Operand<i32>; - def u9Ext : Operand<i32>; - def u10Ext : Operand<i32>; - def u6_0Ext : Operand<i32>; - def u6_1Ext : Operand<i32>; - def u6_2Ext : Operand<i32>; - def u6_3Ext : Operand<i32>; +def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; } +def s16ExtOperand : AsmOperandClass { let Name = "s16Ext"; } +def s12ExtOperand : AsmOperandClass { let Name = "s12Ext"; } +def s10ExtOperand : AsmOperandClass { let Name = "s10Ext"; } +def s9ExtOperand : AsmOperandClass { let Name = "s9Ext"; } +def s8ExtOperand : AsmOperandClass { let Name = "s8Ext"; } +def s7ExtOperand : AsmOperandClass { let Name = "s7Ext"; } +def s6ExtOperand : AsmOperandClass { let Name = "s6Ext"; } +def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; } +def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; } +def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; } +def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; } +def u6ExtOperand : AsmOperandClass { let Name = "u6Ext"; } +def u7ExtOperand : AsmOperandClass { let Name = "u7Ext"; } +def u8ExtOperand : AsmOperandClass { let Name = "u8Ext"; } +def u9ExtOperand : AsmOperandClass { let Name = "u9Ext"; } +def u10ExtOperand : AsmOperandClass { let Name = "u10Ext"; } +def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; } +def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; } +def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; } +def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; } +def u32MustExtOperand : AsmOperandClass { let Name = "u32MustExt"; } + + + +let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand", + DecoderMethod = "unsignedImmDecoder" in { + def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; } + def s16Ext : Operand<i32> { let ParserMatchClass = s16ExtOperand; + let DecoderMethod = "s16ImmDecoder"; } + def s12Ext : 
Operand<i32> { let ParserMatchClass = s12ExtOperand; + let DecoderMethod = "s12ImmDecoder"; } + def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand; + let DecoderMethod = "s11_0ImmDecoder"; } + def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand; + let DecoderMethod = "s11_1ImmDecoder"; } + def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand; + let DecoderMethod = "s11_2ImmDecoder"; } + def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand; + let DecoderMethod = "s11_3ImmDecoder"; } + def s10Ext : Operand<i32> { let ParserMatchClass = s10ExtOperand; + let DecoderMethod = "s10ImmDecoder"; } + def s9Ext : Operand<i32> { let ParserMatchClass = s9ExtOperand; + let DecoderMethod = "s90ImmDecoder"; } + def s8Ext : Operand<i32> { let ParserMatchClass = s8ExtOperand; + let DecoderMethod = "s8ImmDecoder"; } + def s7Ext : Operand<i32> { let ParserMatchClass = s7ExtOperand; } + def s6Ext : Operand<i32> { let ParserMatchClass = s6ExtOperand; + let DecoderMethod = "s6_0ImmDecoder"; } + def u6Ext : Operand<i32> { let ParserMatchClass = u6ExtOperand; } + def u7Ext : Operand<i32> { let ParserMatchClass = u7ExtOperand; } + def u8Ext : Operand<i32> { let ParserMatchClass = u8ExtOperand; } + def u9Ext : Operand<i32> { let ParserMatchClass = u9ExtOperand; } + def u10Ext : Operand<i32> { let ParserMatchClass = u10ExtOperand; } + def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; } + def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; } + def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; } + def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; } + def u32MustExt : Operand<i32> { let ParserMatchClass = u32MustExtOperand; } } -def s10ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<10>(v)) - return true; - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); +def s4_7ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in an 11-bit sign extended field and + // is 128-byte aligned. + return isShiftedInt<4,7>(v); + return false; }]>; -def s8ExtPred : PatLeaf<(i32 imm), [{ +def s3_7ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 10-bit sign extended field and + // is 128-byte aligned. + return isShiftedInt<3,7>(v); + return false; }]>; -def u8ExtPred : PatLeaf<(i32 imm), [{ +def s4_6ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 10-bit sign extended field and + // is 64-byte aligned. 
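// For instance, isShiftedInt<4,6> accepts exactly the multiples of 64 in
// [-512, 448]: v = -512 and v = 448 pass, while v = 96 is rejected
// because it is not 64-byte aligned.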
+ return isShiftedInt<4,6>(v); + return false; }]>; -def u9ExtPred : PatLeaf<(i32 imm), [{ +def s3_6ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<9>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 9-bit sign extended field and + // is 64-byte aligned. + return isShiftedInt<3,6>(v); + return false; }]>; @@ -523,21 +577,21 @@ let PrintMethod = "printGlobalOperand" in { let PrintMethod = "printJumpTable" in def jumptablebase : Operand<i32>; -def brtarget : Operand<OtherVT>; +def brtarget : Operand<OtherVT> { + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; +} def brtargetExt : Operand<OtherVT> { - let PrintMethod = "printExtBrtarget"; + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; +} +def calltarget : Operand<i32> { + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; } -def calltarget : Operand<i32>; def bblabel : Operand<i32>; -def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf , [], "BasicBlockSDNode">; - -def symbolHi32 : Operand<i32> { - let PrintMethod = "printSymbolHi"; -} -def symbolLo32 : Operand<i32> { - let PrintMethod = "printSymbolLo"; -} +def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">; // Return true if for a 32 to 64-bit sign-extended load. def is_sext_i32 : PatLeaf<(i64 DoubleRegs:$src1), [{ diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp new file mode 100644 index 0000000..1723771 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp @@ -0,0 +1,150 @@ +//===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Pass that removes sign extends for function parameters. 
These parameters +// are already sign extended by the caller per Hexagon's ABI. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" + +#include "Hexagon.h" + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonOptimizeSZextends(); + void initializeHexagonOptimizeSZextendsPass(PassRegistry&); +} + +namespace { + struct HexagonOptimizeSZextends : public FunctionPass { + public: + static char ID; + HexagonOptimizeSZextends() : FunctionPass(ID) { + initializeHexagonOptimizeSZextendsPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "Remove sign extends"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineFunctionAnalysis>(); + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addPreserved<StackProtector>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool intrinsicAlreadySextended(Intrinsic::ID IntID); + }; +} + +char HexagonOptimizeSZextends::ID = 0; + +INITIALIZE_PASS(HexagonOptimizeSZextends, "reargs", + "Remove Sign and Zero Extends for Args", false, false) + +bool HexagonOptimizeSZextends::intrinsicAlreadySextended(Intrinsic::ID IntID) { + switch(IntID) { + case llvm::Intrinsic::hexagon_A2_addh_l16_sat_ll: + return true; + default: + break; + } + return false; +} + +bool HexagonOptimizeSZextends::runOnFunction(Function &F) { + unsigned Idx = 1; + // Try to optimize sign extends in formal parameters. This relies on the + // caller having already sign extended the values, as the Hexagon ABI + // requires for arguments marked 'sext'. + for (auto &Arg : F.args()) { + if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) { + if (!isa<PointerType>(Arg.getType())) { + for (auto UI = Arg.use_begin(); UI != Arg.use_end();) { + if (isa<SExtInst>(*UI)) { + Instruction* Use = cast<Instruction>(*UI); + SExtInst* SI = new SExtInst(&Arg, Use->getType()); + assert (EVT::getEVT(SI->getType()) == + (EVT::getEVT(Use->getType()))); + ++UI; + Use->replaceAllUsesWith(SI); + Instruction* First = &F.getEntryBlock().front(); + SI->insertBefore(First); + Use->eraseFromParent(); + } else { + ++UI; + } + } + } + } + ++Idx; + } + + // Try to remove redundant sext operations on Hexagon. The hardware + // already sign extends many 16 bit intrinsic operations to 32 bits. + // For example: + // %34 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %x, i32 %y) + // %sext233 = shl i32 %34, 16 + // %conv52 = ashr exact i32 %sext233, 16 + for (auto &B : F) { + for (auto &I : B) { + // Look for arithmetic shift right by 16. + BinaryOperator *Ashr = dyn_cast<BinaryOperator>(&I); + if (!(Ashr && Ashr->getOpcode() == Instruction::AShr)) + continue; + Value *AshrOp1 = Ashr->getOperand(1); + ConstantInt *C = dyn_cast<ConstantInt>(AshrOp1); + // Right shifted by 16. + if (!(C && C->getSExtValue() == 16)) + continue; + + // The first operand of Ashr comes from logical shift left. + Instruction *Shl = dyn_cast<Instruction>(Ashr->getOperand(0)); + if (!(Shl && Shl->getOpcode() == Instruction::Shl)) + continue; + Value *Intr = Shl->getOperand(0); + Value *ShlOp1 = Shl->getOperand(1); + C = dyn_cast<ConstantInt>(ShlOp1); + // Left shifted by 16. 
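// Together with the right-shift check above, this matches the canonical
// sext-of-truncate idiom: (x << 16) >> 16 with an arithmetic right shift
// sign extends the low 16 bits of x, so the shift pair is redundant when
// x already comes from an intrinsic that sign-extends its 16-bit result.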
+ if (!(C && C->getSExtValue() == 16)) + continue; + + // The first operand of Shl comes from an intrinsic. + if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(Intr)) { + if (!intrinsicAlreadySextended(I->getIntrinsicID())) + continue; + // All is well. Replace all uses of AShr with I. + for (auto UI = Ashr->user_begin(), UE = Ashr->user_end(); + UI != UE; ++UI) { + const Use &TheUse = UI.getUse(); + if (Instruction *J = dyn_cast<Instruction>(TheUse.getUser())) { + J->replaceUsesOfWith(Ashr, I); + } + } + } + } + } + + return true; +} + + +FunctionPass *llvm::createHexagonOptimizeSZextends() { + return new HexagonOptimizeSZextends(); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp index 93dcbe2..e68ff85 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp @@ -124,7 +124,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; PeepholeMap.clear(); PeepholeDoubleRegsMap.clear(); @@ -180,7 +180,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { unsigned DstReg = Dst.getReg(); unsigned SrcReg = Src1.getReg(); PeepholeDoubleRegsMap[DstReg] = - std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/); + std::make_pair(*&SrcReg, Hexagon::subreg_hireg); } // Look for P=NOT(P). diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp new file mode 100644 index 0000000..06719cd --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp @@ -0,0 +1,60 @@ +//===--- HexagonRDF.cpp ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonRDF.h" +#include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" + +#include "llvm/CodeGen/MachineInstr.h" + +using namespace llvm; +using namespace rdf; + +bool HexagonRegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + + if (TargetRegisterInfo::isVirtualRegister(RA.Reg) && + TargetRegisterInfo::isVirtualRegister(RB.Reg)) { + // Hexagon-specific cases. + if (RA.Reg == RB.Reg) { + if (RA.Sub == 0) + return true; + if (RB.Sub == 0) + return false; + } + } + + return RegisterAliasInfo::covers(RA, RB); +} + +bool HexagonRegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) + const { + if (RRs.count(RR)) + return true; + + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + // Check if both covering subregisters are present. + bool HasLo = RRs.count({RR.Reg, Hexagon::subreg_loreg}); + bool HasHi = RRs.count({RR.Reg, Hexagon::subreg_hireg}); + if (HasLo && HasHi) + return true; + } + + if (RR.Sub == 0) { + // Check if both covering subregisters are present. 
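// "Covers" means the halves jointly define the whole register: for a
// register pair such as D0 = R1:R0, having both R0 and R1 in RRs covers
// a reference to D0 that carries no subregister index.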
+ unsigned Lo = TRI.getSubReg(RR.Reg, Hexagon::subreg_loreg); + unsigned Hi = TRI.getSubReg(RR.Reg, Hexagon::subreg_hireg); + if (RRs.count({Lo, 0}) && RRs.count({Hi, 0})) + return true; + } + + return RegisterAliasInfo::covers(RRs, RR); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h new file mode 100644 index 0000000..00c1889 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h @@ -0,0 +1,28 @@ +//===--- HexagonRDF.h -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGON_RDF_H +#define HEXAGON_RDF_H +#include "RDFGraph.h" + +namespace llvm { + class TargetRegisterInfo; +} + +namespace rdf { + struct HexagonRegisterAliasInfo : public RegisterAliasInfo { + HexagonRegisterAliasInfo(const TargetRegisterInfo &TRI) + : RegisterAliasInfo(TRI) {} + bool covers(RegisterRef RA, RegisterRef RR) const override; + bool covers(const RegisterSet &RRs, RegisterRef RR) const override; + }; +} + +#endif + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp new file mode 100644 index 0000000..3fcda984 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -0,0 +1,272 @@ +//===--- HexagonRDFOpt.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonInstrInfo.h" +#include "HexagonRDF.h" +#include "HexagonSubtarget.h" +#include "RDFCopy.h" +#include "RDFDeadCode.h" +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace llvm { + void initializeHexagonRDFOptPass(PassRegistry&); + FunctionPass *createHexagonRDFOpt(); +} + +namespace { + cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX)); + unsigned RDFCount = 0; + cl::opt<bool> RDFDump("rdf-dump", cl::init(false)); + + class HexagonRDFOpt : public MachineFunctionPass { + public: + HexagonRDFOpt() : MachineFunctionPass(ID) { + initializeHexagonRDFOptPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominanceFrontier>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + const char *getPassName() const override { + return "Hexagon RDF optimizations"; + } + bool runOnMachineFunction(MachineFunction &MF) override; + + static char ID; + + private: + MachineDominatorTree *MDT; + MachineRegisterInfo *MRI; + }; + + char HexagonRDFOpt::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, 
false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) +INITIALIZE_PASS_END(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false) + + +struct HexagonDCE : public DeadCodeElimination { + HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI) + : DeadCodeElimination(G, MRI) {} + bool rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove); + void removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum); + + bool run(); +}; + + +bool HexagonDCE::run() { + bool Collected = collect(); + if (!Collected) + return false; + + const SetVector<NodeId> &DeadNodes = getDeadNodes(); + const SetVector<NodeId> &DeadInstrs = getDeadInstrs(); + + typedef DenseMap<NodeId,NodeId> RefToInstrMap; + RefToInstrMap R2I; + SetVector<NodeId> PartlyDead; + DataFlowGraph &DFG = getDFG(); + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (auto TA : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Stmt>, DFG)) { + NodeAddr<StmtNode*> SA = TA; + for (NodeAddr<RefNode*> RA : SA.Addr->members(DFG)) { + R2I.insert(std::make_pair(RA.Id, SA.Id)); + if (DFG.IsDef(RA) && DeadNodes.count(RA.Id)) + if (!DeadInstrs.count(SA.Id)) + PartlyDead.insert(SA.Id); + } + } + } + + // Nodes to remove. + SetVector<NodeId> Remove = DeadInstrs; + + bool Changed = false; + for (NodeId N : PartlyDead) { + auto SA = DFG.addr<StmtNode*>(N); + if (trace()) + dbgs() << "Partly dead: " << *SA.Addr->getCode(); + Changed |= rewrite(SA, Remove); + } + + return erase(Remove) || Changed; +} + + +void HexagonDCE::removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum) { + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + + auto getOpNum = [MI] (MachineOperand &Op) -> unsigned { + for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i) + if (&MI->getOperand(i) == &Op) + return i; + llvm_unreachable("Invalid operand"); + }; + DenseMap<NodeId,unsigned> OpMap; + NodeList Refs = IA.Addr->members(getDFG()); + for (NodeAddr<RefNode*> RA : Refs) + OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp()))); + + MI->RemoveOperand(OpNum); + + for (NodeAddr<RefNode*> RA : Refs) { + unsigned N = OpMap[RA.Id]; + if (N < OpNum) + RA.Addr->setRegRef(&MI->getOperand(N)); + else if (N > OpNum) + RA.Addr->setRegRef(&MI->getOperand(N-1)); + } +} + + +bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) { + if (!getDFG().IsCode<NodeAttrs::Stmt>(IA)) + return false; + DataFlowGraph &DFG = getDFG(); + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + auto &HII = static_cast<const HexagonInstrInfo&>(DFG.getTII()); + if (HII.getAddrMode(MI) != HexagonII::PostInc) + return false; + unsigned Opc = MI->getOpcode(); + unsigned OpNum, NewOpc; + switch (Opc) { + case Hexagon::L2_loadri_pi: + NewOpc = Hexagon::L2_loadri_io; + OpNum = 1; + break; + case Hexagon::L2_loadrd_pi: + NewOpc = Hexagon::L2_loadrd_io; + OpNum = 1; + break; + case Hexagon::V6_vL32b_pi: + NewOpc = Hexagon::V6_vL32b_ai; + OpNum = 1; + break; + case Hexagon::S2_storeri_pi: + NewOpc = Hexagon::S2_storeri_io; + OpNum = 0; + break; + case Hexagon::S2_storerd_pi: + NewOpc = Hexagon::S2_storerd_io; + OpNum = 0; + break; + case Hexagon::V6_vS32b_pi: + NewOpc = Hexagon::V6_vS32b_ai; + OpNum = 0; + break; + default: + return false; + } + auto IsDead = [this] (NodeAddr<DefNode*> DA) -> bool { + return getDeadNodes().count(DA.Id); + }; + NodeList Defs; + MachineOperand &Op = MI->getOperand(OpNum); + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) { + if (&DA.Addr->getOp() != 
&Op) + continue; + Defs = DFG.getRelatedRefs(IA, DA); + if (!std::all_of(Defs.begin(), Defs.end(), IsDead)) + return false; + break; + } + + // Mark all nodes in Defs for removal. + for (auto D : Defs) + Remove.insert(D.Id); + + if (trace()) + dbgs() << "Rewriting: " << *MI; + MI->setDesc(HII.get(NewOpc)); + MI->getOperand(OpNum+2).setImm(0); + removeOperand(IA, OpNum); + if (trace()) + dbgs() << " to: " << *MI; + + return true; +} + + +bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { + if (RDFLimit.getPosition()) { + if (RDFCount >= RDFLimit) + return false; + RDFCount++; + } + + MDT = &getAnalysis<MachineDominatorTree>(); + const auto &MDF = getAnalysis<MachineDominanceFrontier>(); + const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + HexagonRegisterAliasInfo HAI(HRI); + TargetOperandInfo TOI(HII); + + if (RDFDump) + MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr); + DataFlowGraph G(MF, HII, HRI, *MDT, MDF, HAI, TOI); + G.build(); + if (RDFDump) { + dbgs() << PrintNode<FuncNode*>(G.getFunc(), G) << '\n'; + dbgs() << MF.getName() << '\n'; + } + + bool Changed; + CopyPropagation CP(G); + CP.trace(RDFDump); + Changed = CP.run(); + if (Changed) + G.build(); + + HexagonDCE DCE(G, *MRI); + DCE.trace(RDFDump); + Changed |= DCE.run(); + + if (Changed) { + Liveness LV(*MRI, G); + LV.trace(RDFDump); + LV.computeLiveIns(); + LV.resetLiveIns(); + LV.resetKills(); + } + + if (RDFDump) + MF.print(dbgs() << "After " << getPassName() << "\n", nullptr); + return false; +} + + +FunctionPass *llvm::createHexagonRDFOpt() { + return new HexagonRDFOpt(); +} + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index f6bb4a0..6e5f732 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -66,6 +66,8 @@ HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF) const { switch (HST.getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: + case HexagonSubtarget::V55: + case HexagonSubtarget::V60: return CallerSavedRegsV4; } llvm_unreachable( @@ -84,6 +86,8 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: + case HexagonSubtarget::V55: + case HexagonSubtarget::V60: return CalleeSavedRegsV3; } llvm_unreachable("Callee saved registers requested for unknown architecture " @@ -98,6 +102,8 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) Reserved.set(Hexagon::R29); Reserved.set(Hexagon::R30); Reserved.set(Hexagon::R31); + Reserved.set(Hexagon::PC); + Reserved.set(Hexagon::GP); Reserved.set(Hexagon::D14); Reserved.set(Hexagon::D15); Reserved.set(Hexagon::LC0); @@ -116,62 +122,21 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(SPAdj == 0 && "Unexpected"); MachineInstr &MI = *II; - MachineBasicBlock &MB = *MI.getParent(); MachineFunction &MF = *MB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HFI = *HST.getFrameLowering(); + unsigned BP = 0; int FI = MI.getOperand(FIOp).getIndex(); - int Offset 
= MFI.getObjectOffset(FI) + MI.getOperand(FIOp+1).getImm(); - bool HasAlloca = MFI.hasVarSizedObjects(); - bool HasAlign = needsStackRealignment(MF); - - // XXX: Fixed objects cannot be accessed through SP if there are aligned - // objects in the local frame, or if there are dynamically allocated objects. - // In such cases, there has to be FP available. - if (!HFI.hasFP(MF)) { - assert(!HasAlloca && !HasAlign && "This function must have frame pointer"); - // We will not reserve space on the stack for the lr and fp registers. - Offset -= 8; - } - - unsigned SP = getStackRegister(), FP = getFrameRegister(); - unsigned AP = 0; - if (MachineInstr *AI = HFI.getAlignaInstr(MF)) - AP = AI->getOperand(0).getReg(); - unsigned FrameSize = MFI.getStackSize(); - - // Special handling of dbg_value instructions and INLINEASM. - if (MI.isDebugValue() || MI.isInlineAsm()) { - MI.getOperand(FIOp).ChangeToRegister(SP, false /*isDef*/); - MI.getOperand(FIOp+1).ChangeToImmediate(Offset+FrameSize); - return; - } - - bool UseFP = false, UseAP = false; // Default: use SP. - if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) { - UseFP = HasAlloca || HasAlign; - } else { - if (HasAlloca) { - if (HasAlign) - UseAP = true; - else - UseFP = true; - } - } + // Select the base pointer (BP) and calculate the actual offset from BP + // to the beginning of the object at index FI. + int Offset = HFI.getFrameIndexReference(MF, FI, BP); + // Add the offset from the instruction. + int RealOffset = Offset + MI.getOperand(FIOp+1).getImm(); unsigned Opc = MI.getOpcode(); - bool ValidSP = HII.isValidOffset(Opc, FrameSize+Offset); - bool ValidFP = HII.isValidOffset(Opc, Offset); - - // Calculate the actual offset in the instruction. - int64_t RealOffset = Offset; - if (!UseFP && !UseAP) - RealOffset = FrameSize+Offset; - switch (Opc) { case Hexagon::TFR_FIA: MI.setDesc(HII.get(Hexagon::A2_addi)); @@ -184,20 +149,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, break; } - unsigned BP = 0; - bool Valid = false; - if (UseFP) { - BP = FP; - Valid = ValidFP; - } else if (UseAP) { - BP = AP; - Valid = ValidFP; - } else { - BP = SP; - Valid = ValidSP; - } - - if (Valid) { + if (HII.isValidOffset(Opc, RealOffset)) { MI.getOperand(FIOp).ChangeToRegister(BP, false); MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset); return; @@ -223,8 +175,8 @@ unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const HexagonFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) - return Hexagon::R30; - return Hexagon::R29; + return getFrameRegister(); + return getStackRegister(); } @@ -238,17 +190,9 @@ unsigned HexagonRegisterInfo::getStackRegister() const { } -bool -HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - const HexagonFrameLowering *TFI = getFrameLowering(MF); - return TFI->hasFP(MF); -} - - -bool -HexagonRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getMaxAlignment() > 8; +bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) + const { + return MF.getSubtarget<HexagonSubtarget>().getFrameLowering()->hasFP(MF); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h index 7edefee..db7e0f2 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -63,8 +63,6 @@ public: return 
true; } - bool needsStackRealignment(const MachineFunction &MF) const override; - /// Returns true if the frame pointer is valid. bool useFPForScavengingIndex(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td index edf1c25..81629dc 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -53,6 +53,12 @@ let Namespace = "Hexagon" in { let Num = num; } + + // Rq - vector predicate registers + class Rq<bits<3> num, string n> : Register<n, []> { + let HWEncoding{2-0} = num; + } + // Rc - control registers class Rc<bits<5> num, string n, list<string> alt = [], list<Register> alias = []> : @@ -131,20 +137,21 @@ let Namespace = "Hexagon" in { def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>; def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>, DwarfRegNum<[71]>; - def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[72]>; - def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[73]>; + def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use + def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>; + def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>; - def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[74]> { + def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> { let SubRegIndices = [subreg_overflow]; let SubRegs = [USR_OVF]; } - def PC : Rc<9, "pc">, DwarfRegNum<[75]>; - def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[76]>; - def GP : Rc<11, "gp">, DwarfRegNum<[77]>; - def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[78]>; - def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[79]>; - def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[80]>; - def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[81]>; + def PC : Rc<9, "pc">, DwarfRegNum<[76]>; + def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>; + def GP : Rc<11, "gp">, DwarfRegNum<[78]>; + def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>; + def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>; + def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>; + def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>; } // Control registers pairs. @@ -158,6 +165,36 @@ let Namespace = "Hexagon" in { def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>; } + foreach i = 0-31 in { + def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>; + } + + // Aliases of the V* registers used to hold double vec values. + let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in { + def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>; + def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>; + def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>; + def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>; + def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>; + def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>; + def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>; + def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>; + def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>; + def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>; + def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>; + def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>; + def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>; + def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>; + def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>; + def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>; + } + + // Vector Predicate registers. 
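+  // (Encoding sketch, for illustration: Rq<n, name> above stores n in
+  //  HWEncoding{2-0}, so q0..q3 encode as 0..3. The VecPredRegs and
+  //  VecPredRegs128B classes further below model these registers as
+  //  v512i1 and v1024i1 for the 64-byte and 128-byte HVX modes.)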
+ def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>; + def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>; + def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>; + def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>; + // Register classes. // // FIXME: the register order should be defined in terms of the preferred @@ -169,10 +206,34 @@ def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, R10, R11, R29, R30, R31)> { } +// Registers are listed in reverse order for allocation preference reasons. +def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32, + (add R7, R6, R5, R4, R3, R2, R1, R0)> ; + def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>; +def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512, + (add (sequence "V%u", 0, 31))>; + +def VecDblRegs : RegisterClass<"Hexagon", + [v128i8, v64i16, v32i32, v16i64], 1024, + (add (sequence "W%u", 0, 15))>; + +def VectorRegs128B : RegisterClass<"Hexagon", + [v128i8, v64i16, v32i32, v16i64], 1024, + (add (sequence "V%u", 0, 31))>; + +def VecDblRegs128B : RegisterClass<"Hexagon", + [v256i8,v128i16,v64i32,v32i64], 2048, + (add (sequence "W%u", 0, 15))>; + +def VecPredRegs : RegisterClass<"Hexagon", [v512i1], 512, + (add (sequence "Q%u", 0, 3))>; + +def VecPredRegs128B : RegisterClass<"Hexagon", [v1024i1], 1024, + (add (sequence "Q%u", 0, 3))>; def PredRegs : RegisterClass<"Hexagon", [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp deleted file mode 100644 index 7069ad3..0000000 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp +++ /dev/null @@ -1,91 +0,0 @@ -//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends // -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Pass that removes sign extends for function parameters. 
These parameters
-// are already sign extended by the caller per Hexagon's ABI
-//
-//===----------------------------------------------------------------------===//
-
-#include "Hexagon.h"
-#include "HexagonTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/StackProtector.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-
-using namespace llvm;
-
-namespace llvm {
-  FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM);
-  void initializeHexagonRemoveExtendArgsPass(PassRegistry&);
-}
-
-namespace {
-  struct HexagonRemoveExtendArgs : public FunctionPass {
-  public:
-    static char ID;
-    HexagonRemoveExtendArgs() : FunctionPass(ID) {
-      initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry());
-    }
-    bool runOnFunction(Function &F) override;
-
-    const char *getPassName() const override {
-      return "Remove sign extends";
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineFunctionAnalysis>();
-      AU.addPreserved<MachineFunctionAnalysis>();
-      AU.addPreserved<StackProtector>();
-      FunctionPass::getAnalysisUsage(AU);
-    }
-  };
-}
-
-char HexagonRemoveExtendArgs::ID = 0;
-
-INITIALIZE_PASS(HexagonRemoveExtendArgs, "reargs",
-                "Remove Sign and Zero Extends for Args", false, false)
-
-bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
-  unsigned Idx = 1;
-  for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
-       ++AI, ++Idx) {
-    if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
-      Argument* Arg = AI;
-      if (!isa<PointerType>(Arg->getType())) {
-        for (auto UI = Arg->user_begin(); UI != Arg->user_end();) {
-          if (isa<SExtInst>(*UI)) {
-            Instruction* I = cast<Instruction>(*UI);
-            SExtInst* SI = new SExtInst(Arg, I->getType());
-            assert (EVT::getEVT(SI->getType()) ==
-                    (EVT::getEVT(I->getType())));
-            ++UI;
-            I->replaceAllUsesWith(SI);
-            Instruction* First = F.getEntryBlock().begin();
-            SI->insertBefore(First);
-            I->eraseFromParent();
-          } else {
-            ++UI;
-          }
-        }
-      }
-    }
-  }
-  return true;
-}
-
-
-
-FunctionPass*
-llvm::createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM) {
-  return new HexagonRemoveExtendArgs();
-}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
index 528cafc..6e4987b 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -13,6 +13,12 @@
 include "HexagonScheduleV4.td"
 
+// V55 Machine Info
+
+include "HexagonScheduleV55.td"
+
 //===----------------------------------------------------------------------===//
-// V4 Machine Info -
+// V60 Machine Info -
 //===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV60.td"
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
index a7d2d47..67af147 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -35,10 +35,11 @@ def SLOT_ENDLOOP: FuncUnit;
 
 // Itinerary classes.
 def PSEUDO : InstrItinClass;
-def PSEUDOM : InstrItinClass;
+def PSEUDOM : InstrItinClass; // ALU64/M/S Instruction classes of V2 are collectively known as XTYPE in V4.
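+// (Reading aid: an InstrItinClass is only a named handle. A
+// ProcessorItineraries definition later binds each class to concrete
+// pipeline stages, e.g.
+//   InstrItinData<COMPOUND, [InstrStage<1, [SLOT2, SLOT3]>]>
+// says a COMPOUND instruction occupies slot 2 or 3 for one cycle.)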
 def DUPLEX : InstrItinClass;
 def PREFIX : InstrItinClass;
+def COMPOUND_CJ_ARCHDEPSLOT : InstrItinClass;
 def COMPOUND : InstrItinClass;
 
 def ALU32_2op_tc_1_SLOT0123 : InstrItinClass;
@@ -58,6 +59,7 @@ def CR_tc_2early_SLOT3 : InstrItinClass;
 def CR_tc_3x_SLOT23 : InstrItinClass;
 def CR_tc_3x_SLOT3 : InstrItinClass;
 def J_tc_2early_SLOT23 : InstrItinClass;
+def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT : InstrItinClass;
 def J_tc_2early_SLOT2 : InstrItinClass;
 def LD_tc_ld_SLOT01 : InstrItinClass;
 def LD_tc_ld_SLOT0 : InstrItinClass;
@@ -91,6 +93,7 @@ def V4LDST_tc_st_SLOT0 : InstrItinClass;
 def V4LDST_tc_st_SLOT01 : InstrItinClass;
 def J_tc_2early_SLOT0123 : InstrItinClass;
 def EXTENDER_tc_1_SLOT0123 : InstrItinClass;
+def S_3op_tc_3stall_SLOT23 : InstrItinClass;
 
 def HexagonItinerariesV4 :
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
new file mode 100644
index 0000000..d9ad25d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -0,0 +1,170 @@
+//=-HexagonScheduleV55.td - HexagonV55 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in the Hexagon V55 machine.
+// This file describes that machine information.
+
+//
+// |===========|==================================================|
+// | PIPELINE  | Instruction Classes                              |
+// |===========|==================================================|
+// | SLOT0     | LD  ST  ALU32  MEMOP  NV  SYSTEM                 |
+// |-----------|--------------------------------------------------|
+// | SLOT1     | LD  ST  ALU32                                    |
+// |-----------|--------------------------------------------------|
+// | SLOT2     | XTYPE  ALU32  J  JR                              |
+// |-----------|--------------------------------------------------|
+// | SLOT3     | XTYPE  ALU32  J  CR                              |
+// |===========|==================================================|
+
+def CJ_tc_1_SLOT23 : InstrItinClass;
+def CJ_tc_2early_SLOT23 : InstrItinClass;
+def COPROC_VMEM_vtc_long_SLOT01 : InstrItinClass;
+def COPROC_VX_vtc_long_SLOT23 : InstrItinClass;
+def COPROC_VX_vtc_SLOT23 : InstrItinClass;
+def J_tc_3stall_SLOT2 : InstrItinClass;
+def MAPPING_tc_1_SLOT0123 : InstrItinClass;
+def M_tc_3stall_SLOT23 : InstrItinClass;
+def SUBINSN_tc_1_SLOT01 : InstrItinClass;
+def SUBINSN_tc_2early_SLOT0 : InstrItinClass;
+def SUBINSN_tc_2early_SLOT01 : InstrItinClass;
+def SUBINSN_tc_3stall_SLOT0 : InstrItinClass;
+def SUBINSN_tc_ld_SLOT0 : InstrItinClass;
+def SUBINSN_tc_ld_SLOT01 : InstrItinClass;
+def SUBINSN_tc_st_SLOT01 : InstrItinClass;
+
+def HexagonItinerariesV55 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+        // ALU32
+        InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // ALU64
+        InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // CR -> System + InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>, + + // Jump (conditional/unconditional/return etc) + InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + + // JR + InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>, + InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>, + + // Extender + InstrItinData<EXTENDER_tc_1_SLOT0123, + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Load + InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + + // M + InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + + // Store + InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + + // Subinsn + InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_2early_SLOT01, + [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // S + InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // New Value Compare Jump + InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + + // Mem ops + InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<V2LDST_tc_st_SLOT01 , 
[InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+        // Endloop
+        InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
+
+        // Vector
+        InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+                      [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
+                      [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<COPROC_VX_vtc_SLOT23 ,
+                      [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<MAPPING_tc_1_SLOT0123 ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // Misc
+        InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                InstrStage<1, [SLOT2, SLOT3]>]>
+
+      ]>;
+
+def HexagonModelV55 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV55;
+  let LoadLatency = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V55 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
new file mode 100644
index 0000000..2ccff82
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -0,0 +1,310 @@
+//=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec".
+def CVI_ST : FuncUnit;
+def CVI_XLANE : FuncUnit;
+def CVI_SHIFT : FuncUnit;
+def CVI_MPY0 : FuncUnit;
+def CVI_MPY1 : FuncUnit;
+def CVI_LD : FuncUnit;
+
+// Combined functional units.
+def CVI_XLSHF : FuncUnit;
+def CVI_MPY01 : FuncUnit;
+def CVI_ALL : FuncUnit;
+
+// Combined functional unit data.
+def HexagonComboFuncsV60 :
+    ComboFuncUnits<[
+      ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>,
+      ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>,
+      ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT,
+                               CVI_MPY0, CVI_MPY1, CVI_LD]>
+    ]>;
+
+// Note: When adding additional vector scheduling classes, add the
+// corresponding methods to the class HexagonInstrInfo.
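+// (Naming sketch, inferred from the units these classes map to below:
+// VA = vector ALU, VX = vector multiply, VP = permute/lane-crossing
+// (XLANE), VS = shift, VM = vector memory; the _DV suffix marks
+// double-vector operations that occupy a combined unit such as
+// CVI_MPY01 or CVI_XLSHF.)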
+def CVI_VA : InstrItinClass; +def CVI_VA_DV : InstrItinClass; +def CVI_VX_LONG : InstrItinClass; +def CVI_VX_LATE : InstrItinClass; +def CVI_VX : InstrItinClass; +def CVI_VX_DV_LONG : InstrItinClass; +def CVI_VX_DV : InstrItinClass; +def CVI_VX_DV_SLOT2 : InstrItinClass; +def CVI_VP : InstrItinClass; +def CVI_VP_LONG : InstrItinClass; +def CVI_VP_VS_EARLY : InstrItinClass; +def CVI_VP_VS_LONG_EARLY : InstrItinClass; +def CVI_VP_VS_LONG : InstrItinClass; +def CVI_VP_VS : InstrItinClass; +def CVI_VP_DV : InstrItinClass; +def CVI_VS : InstrItinClass; +def CVI_VINLANESAT : InstrItinClass; +def CVI_VM_LD : InstrItinClass; +def CVI_VM_TMP_LD : InstrItinClass; +def CVI_VM_CUR_LD : InstrItinClass; +def CVI_VM_VP_LDU : InstrItinClass; +def CVI_VM_ST : InstrItinClass; +def CVI_VM_NEW_ST : InstrItinClass; +def CVI_VM_STU : InstrItinClass; +def CVI_HIST : InstrItinClass; +def CVI_VA_EXT : InstrItinClass; + +// There are four SLOTS (four parallel pipelines) in Hexagon V60 machine. +// This file describes that machine information. +// +// |===========|==================================================| +// | PIPELINE | Instruction Classes | +// |===========|==================================================| +// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM | +// |-----------|--------------------------------------------------| +// | SLOT1 | LD ST ALU32 | +// |-----------|--------------------------------------------------| +// | SLOT2 | XTYPE ALU32 J JR | +// |-----------|--------------------------------------------------| +// | SLOT3 | XTYPE ALU32 J CR | +// |===========|==================================================| +// +// +// In addition to using the above SLOTS, there are also six vector pipelines +// in the CVI co-processor in the Hexagon V60 machine. +// +// |=========| |=========| |=========| |=========| |=========| |=========| +// SLOT | CVI_LD | |CVI_MPY3 | |CVI_MPY2 | |CVI_SHIFT| |CVI_XLANE| | CVI_ST | +// ==== |=========| |=========| |=========| |=========| |=========| |=========| +// S0-3 | | | CVI_VA | | CVI_VA | | CVI_VA | | CVI_VA | | | +// S2-3 | | | CVI_VX | | CVI_VX | | | | | | | +// S0-3 | | | | | | | | | CVI_VP | | | +// S0-3 | | | | | | | CVI_VS | | | | | +// S0-1 |(CVI_LD) | | CVI_LD | | CVI_LD | | CVI_LD | | CVI_LD | | | +// S0-1 |(C*TMP_LD) | | | | | | | | | | +// S01 |(C*_LDU) | | | | | | | | C*_LDU | | | +// S0 | | | CVI_ST | | CVI_ST | | CVI_ST | | CVI_ST | |(CVI_ST) | +// S0 | | | | | | | | | | |(C*TMP_ST) +// S01 | | | | | | | | | VSTU | |(C*_STU) | +// |=========| |=========| |=========| |=========| |=========| |=========| +// |=====================| |=====================| +// | CVI_MPY2 & CVI_MPY3 | |CVI_XLANE & CVI_SHIFT| +// |=====================| |=====================| +// S0-3 | CVI_VA_DV | | CVI_VA_DV | +// S0-3 | | | CVI_VP_DV | +// S2-3 | CVI_VX_DV | | | +// |=====================| |=====================| +// |=====================================================================| +// S0-3 | CVI_HIST Histogram | +// S0123| CVI_VA_EXT Extract | +// |=====================================================================| + +def HexagonItinerariesV60 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, + CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, + CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL], [], [ + // ALU32 + InstrItinData<ALU32_2op_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_2op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_1_SLOT0123 , + [InstrStage<1, 
[SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2_SLOT0123 , + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_ADDI_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // ALU64 + InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // CR -> System + InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>, + + // Jump (conditional/unconditional/return etc) + InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + + // JR + InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>, + InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>, + + // Extender + InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1, + [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Load + InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>, + InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + + // M + InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + + // Store + InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + + // Subinsn + InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_2early_SLOT01, + [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // S + InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60. 
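+  // (Concretely: the entry below uses InstrStage<4, [SLOT2, SLOT3]>,
+  //  where the V55 itinerary earlier in this patch models the same
+  //  class with a 3-cycle stage.)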
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // New Value Compare Jump + InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>, + + // Mem ops + InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // Endloop + InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>, + + // Vector + InstrItinData<COPROC_VMEM_vtc_long_SLOT01, + [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<COPROC_VX_vtc_long_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<COPROC_VX_vtc_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<MAPPING_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Duplex and Compound + InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>, + InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>, + // Misc + InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDOM , [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [SLOT2, SLOT3]>]>, + + // Latest CVI spec definitions. 
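+  // (Reading note on the entries below: each lists an InstrStage that
+  //  reserves a packet slot with a cycle increment of 0, so the next
+  //  InstrStage claims a CVI functional unit in the same cycle. CVI_VA,
+  //  for example, may issue from any slot and run on any of the four
+  //  VA-capable vector pipes.)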
+ InstrItinData<CVI_VA,[InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE,CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VA_DV, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF, CVI_MPY01]>]>, + InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX,[InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX_DV_LONG, + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VX_DV, + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VX_DV_SLOT2, + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VP, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VP_VS_EARLY, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS_LONG, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS_LONG_EARLY, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_DV , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VS, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>]>, + InstrItinData<CVI_VINLANESAT, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>]>, + InstrItinData<CVI_VM_LD , [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_TMP_LD,[InstrStage<1,[SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>]>, + InstrItinData<CVI_VM_CUR_LD,[InstrStage<1,[SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_VP_LDU,[InstrStage<1,[SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VM_ST , [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_NEW_ST,[InstrStage<1,[SLOT0], 0>, + InstrStage<1, [CVI_ST]>]>, + InstrItinData<CVI_VM_STU , [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_HIST , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_ALL]>]> + ]>; + +def HexagonModelV60 : SchedMachineModel { + // Max issue per cycle == bundle width. 
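+  // (These settings match the V55 model above: four packet slots per
+  //  cycle, with per-class latencies carried by the itinerary stages
+  //  rather than by LoadLatency.)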
+ let IssueWidth = 4; + let Itineraries = HexagonItinerariesV60; + let LoadLatency = 1; +} + +//===----------------------------------------------------------------------===// +// Hexagon V60 Resource Definitions - +//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 276cc69..239dbda 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -12,12 +12,11 @@ //===----------------------------------------------------------------------===// #include "HexagonTargetMachine.h" +#include "llvm/CodeGen/SelectionDAG.h" using namespace llvm; #define DEBUG_TYPE "hexagon-selectiondag-info" -bool llvm::flag_aligned_memcpy; - SDValue HexagonSelectionDAGInfo:: EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, @@ -25,15 +24,40 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - flag_aligned_memcpy = false; - if ((Align & 0x3) == 0) { - ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - if (ConstantSize) { - uint64_t SizeVal = ConstantSize->getZExtValue(); - if ((SizeVal > 32) && ((SizeVal % 8) == 0)) - flag_aligned_memcpy = true; - } - } - - return SDValue(); + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize) + return SDValue(); + + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (SizeVal < 32 || (SizeVal % 8) != 0) + return SDValue(); + + // Special case aligned memcpys with size >= 32 bytes and a multiple of 8. 
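+  // (What follows builds a libcall instead of expanding the copy
+  //  inline: the memcpy operands are packed into an ArgListTy and
+  //  lowered with TLI.LowerCallTo as a call to the runtime routine
+  //  named below, which presumably has a memcpy-like C signature along
+  //  the lines of
+  //    void __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes(
+  //        void *dest, const void *src, unsigned long n);
+  //  The call result is discarded, so only the output chain is used.)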
+ // + const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; + Args.push_back(Entry); + Entry.Node = Src; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + + const char *SpecialMemcpyName = + "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY), + Type::getVoidTy(*DAG.getContext()), + DAG.getTargetExternalSymbol( + SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args), 0) + .setDiscardResult(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index d3eb56f..10fe606 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -81,7 +81,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block MachineBasicBlock::iterator MII = MBB->begin(); MachineBasicBlock::iterator MIE = MBB->end (); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp new file mode 100644 index 0000000..d4e95b0d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -0,0 +1,1209 @@ +//===--- HexagonSplitDouble.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hsdr" + +#include "HexagonRegisterInfo.h" +#include "HexagonTargetMachine.h" + +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include <map> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonSplitDoubleRegs(); + void initializeHexagonSplitDoubleRegsPass(PassRegistry&); +} + +namespace { + static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1), + cl::desc("Maximum number of split partitions")); + static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true), + cl::desc("Do not split loads or stores")); + + class HexagonSplitDoubleRegs : public MachineFunctionPass { + public: + static char ID; + HexagonSplitDoubleRegs() : MachineFunctionPass(ID), TRI(nullptr), + TII(nullptr) { + initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon Split Double Registers"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + static const TargetRegisterClass *const DoubleRC; + + const HexagonRegisterInfo *TRI; + const HexagonInstrInfo *TII; + const MachineLoopInfo *MLI; + MachineRegisterInfo *MRI; + + typedef std::set<unsigned> USet; + typedef std::map<unsigned,USet> UUSetMap; + typedef std::pair<unsigned,unsigned> UUPair; + typedef std::map<unsigned,UUPair> UUPairMap; + typedef std::map<const MachineLoop*,USet> LoopRegMap; + + bool isInduction(unsigned Reg, LoopRegMap &IRM) const; + bool isVolatileInstr(const MachineInstr *MI) const; + bool isFixedInstr(const MachineInstr *MI) const; + void partitionRegisters(UUSetMap &P2Rs); + int32_t profit(const MachineInstr *MI) const; + bool isProfitable(const USet &Part, LoopRegMap &IRM) const; + + void collectIndRegsForLoop(const MachineLoop *L, USet &Rs); + void collectIndRegs(LoopRegMap &IRM); + + void createHalfInstr(unsigned Opc, MachineInstr *MI, + const UUPairMap &PairMap, unsigned SubR); + void splitMemRef(MachineInstr *MI, const UUPairMap &PairMap); + void splitImmediate(MachineInstr *MI, const UUPairMap &PairMap); + void splitCombine(MachineInstr *MI, const UUPairMap &PairMap); + void splitExt(MachineInstr *MI, const UUPairMap &PairMap); + void splitShift(MachineInstr *MI, const UUPairMap &PairMap); + void splitAslOr(MachineInstr *MI, const UUPairMap &PairMap); + bool splitInstr(MachineInstr *MI, const UUPairMap &PairMap); + void replaceSubregUses(MachineInstr *MI, const UUPairMap &PairMap); + void collapseRegPairs(MachineInstr *MI, const UUPairMap &PairMap); + bool splitPartition(const USet &Part); + + static int Counter; + static void dump_partition(raw_ostream&, const USet&, + const TargetRegisterInfo&); + }; + char HexagonSplitDoubleRegs::ID; + int HexagonSplitDoubleRegs::Counter = 0; + const TargetRegisterClass *const HexagonSplitDoubleRegs::DoubleRC + = &Hexagon::DoubleRegsRegClass; +} + 
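+// (High-level flow, as a reading aid: collectIndRegs finds 64-bit loop
+// induction registers, partitionRegisters groups double vregs into
+// connected partitions, profit/isProfitable score each partition, and
+// splitPartition rewrites profitable partitions into 32-bit halves via
+// the split* helpers, charging for any REG_SEQUENCE needed at the
+// remaining fixed uses.)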
+INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double", + "Hexagon Split Double Registers", false, false) + + +static inline uint32_t getRegState(const MachineOperand &R) { + assert(R.isReg()); + return getDefRegState(R.isDef()) | + getImplRegState(R.isImplicit()) | + getKillRegState(R.isKill()) | + getDeadRegState(R.isDead()) | + getUndefRegState(R.isUndef()) | + getInternalReadRegState(R.isInternalRead()) | + (R.isDebug() ? RegState::Debug : 0); +} + + +void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os, + const USet &Part, const TargetRegisterInfo &TRI) { + dbgs() << '{'; + for (auto I : Part) + dbgs() << ' ' << PrintReg(I, &TRI); + dbgs() << " }"; +} + + +bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const { + for (auto I : IRM) { + const USet &Rs = I.second; + if (Rs.find(Reg) != Rs.end()) + return true; + } + return false; +} + + +bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const { + for (auto &I : MI->memoperands()) + if (I->isVolatile()) + return true; + return false; +} + + +bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const { + if (MI->mayLoad() || MI->mayStore()) + if (MemRefsFixed || isVolatileInstr(MI)) + return true; + if (MI->isDebugValue()) + return false; + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + return true; + + case TargetOpcode::PHI: + case TargetOpcode::COPY: + break; + + case Hexagon::L2_loadrd_io: + // Not handling stack stores (only reg-based addresses). + if (MI->getOperand(1).isReg()) + break; + return true; + case Hexagon::S2_storerd_io: + // Not handling stack stores (only reg-based addresses). + if (MI->getOperand(0).isReg()) + break; + return true; + case Hexagon::L2_loadrd_pi: + case Hexagon::S2_storerd_pi: + + case Hexagon::A2_tfrpi: + case Hexagon::A2_combineii: + case Hexagon::A4_combineir: + case Hexagon::A4_combineii: + case Hexagon::A4_combineri: + case Hexagon::A2_combinew: + case Hexagon::CONST64_Int_Real: + + case Hexagon::A2_sxtw: + + case Hexagon::A2_andp: + case Hexagon::A2_orp: + case Hexagon::A2_xorp: + case Hexagon::S2_asl_i_p_or: + case Hexagon::S2_asl_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_lsr_i_p: + break; + } + + for (auto &Op : MI->operands()) { + if (!Op.isReg()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + return true; + } + return false; +} + + +void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { + typedef std::map<unsigned,unsigned> UUMap; + typedef std::vector<unsigned> UVect; + + unsigned NumRegs = MRI->getNumVirtRegs(); + BitVector DoubleRegs(NumRegs); + for (unsigned i = 0; i < NumRegs; ++i) { + unsigned R = TargetRegisterInfo::index2VirtReg(i); + if (MRI->getRegClass(R) == DoubleRC) + DoubleRegs.set(i); + } + + BitVector FixedRegs(NumRegs); + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + unsigned R = TargetRegisterInfo::index2VirtReg(x); + MachineInstr *DefI = MRI->getVRegDef(R); + // In some cases a register may exist, but never be defined or used. + // It should never appear anywhere, but mark it as "fixed", just to be + // safe. 
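+    // ("Fixed" here and below means the register is defined or used by
+    //  an instruction this pass will not rewrite; fixed registers land
+    //  in partition 0, which is never considered for splitting.)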
+ if (!DefI || isFixedInstr(DefI)) + FixedRegs.set(x); + } + + UUSetMap AssocMap; + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + if (FixedRegs[x]) + continue; + unsigned R = TargetRegisterInfo::index2VirtReg(x); + DEBUG(dbgs() << PrintReg(R, TRI) << " ~~"); + USet &Asc = AssocMap[R]; + for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end(); + U != Z; ++U) { + MachineOperand &Op = *U; + MachineInstr *UseI = Op.getParent(); + if (isFixedInstr(UseI)) + continue; + for (unsigned i = 0, n = UseI->getNumOperands(); i < n; ++i) { + MachineOperand &MO = UseI->getOperand(i); + // Skip non-registers or registers with subregisters. + if (&MO == &Op || !MO.isReg() || MO.getSubReg()) + continue; + unsigned T = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(T)) { + FixedRegs.set(x); + continue; + } + if (MRI->getRegClass(T) != DoubleRC) + continue; + unsigned u = TargetRegisterInfo::virtReg2Index(T); + if (FixedRegs[u]) + continue; + DEBUG(dbgs() << ' ' << PrintReg(T, TRI)); + Asc.insert(T); + // Make it symmetric. + AssocMap[T].insert(R); + } + } + DEBUG(dbgs() << '\n'); + } + + UUMap R2P; + unsigned NextP = 1; + USet Visited; + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + unsigned R = TargetRegisterInfo::index2VirtReg(x); + if (Visited.count(R)) + continue; + // Create a new partition for R. + unsigned ThisP = FixedRegs[x] ? 0 : NextP++; + UVect WorkQ; + WorkQ.push_back(R); + for (unsigned i = 0; i < WorkQ.size(); ++i) { + unsigned T = WorkQ[i]; + if (Visited.count(T)) + continue; + R2P[T] = ThisP; + Visited.insert(T); + // Add all registers associated with T. + USet &Asc = AssocMap[T]; + for (USet::iterator J = Asc.begin(), F = Asc.end(); J != F; ++J) + WorkQ.push_back(*J); + } + } + + for (auto I : R2P) + P2Rs[I.second].insert(I.first); +} + + +static inline int32_t profitImm(unsigned Lo, unsigned Hi) { + int32_t P = 0; + bool LoZ1 = false, HiZ1 = false; + if (Lo == 0 || Lo == 0xFFFFFFFF) + P += 10, LoZ1 = true; + if (Hi == 0 || Hi == 0xFFFFFFFF) + P += 10, HiZ1 = true; + if (!LoZ1 && !HiZ1 && Lo == Hi) + P += 3; + return P; +} + + +int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { + unsigned ImmX = 0; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case TargetOpcode::PHI: + for (const auto &Op : MI->operands()) + if (!Op.getSubReg()) + return 0; + return 10; + case TargetOpcode::COPY: + if (MI->getOperand(1).getSubReg() != 0) + return 10; + return 0; + + case Hexagon::L2_loadrd_io: + case Hexagon::S2_storerd_io: + return -1; + case Hexagon::L2_loadrd_pi: + case Hexagon::S2_storerd_pi: + return 2; + + case Hexagon::A2_tfrpi: + case Hexagon::CONST64_Int_Real: { + uint64_t D = MI->getOperand(1).getImm(); + unsigned Lo = D & 0xFFFFFFFFULL; + unsigned Hi = D >> 32; + return profitImm(Lo, Hi); + } + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + return profitImm(MI->getOperand(1).getImm(), + MI->getOperand(2).getImm()); + case Hexagon::A4_combineri: + ImmX++; + case Hexagon::A4_combineir: { + ImmX++; + int64_t V = MI->getOperand(ImmX).getImm(); + if (V == 0 || V == -1) + return 10; + // Fall through into A2_combinew. 
+ } + case Hexagon::A2_combinew: + return 2; + + case Hexagon::A2_sxtw: + return 3; + + case Hexagon::A2_andp: + case Hexagon::A2_orp: + case Hexagon::A2_xorp: + return 1; + + case Hexagon::S2_asl_i_p_or: { + unsigned S = MI->getOperand(3).getImm(); + if (S == 0 || S == 32) + return 10; + return -1; + } + case Hexagon::S2_asl_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_lsr_i_p: + unsigned S = MI->getOperand(2).getImm(); + if (S == 0 || S == 32) + return 10; + if (S == 16) + return 5; + if (S == 48) + return 7; + return -10; + } + + return 0; +} + + +bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM) + const { + unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0; + int32_t TotalP = 0; + + for (unsigned DR : Part) { + MachineInstr *DefI = MRI->getVRegDef(DR); + int32_t P = profit(DefI); + if (P == INT_MIN) + return false; + TotalP += P; + // Reduce the profitability of splitting induction registers. + if (isInduction(DR, IRM)) + TotalP -= 30; + + for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) { + MachineInstr *UseI = U->getParent(); + if (isFixedInstr(UseI)) { + FixedNum++; + // Calculate the cost of generating REG_SEQUENCE instructions. + for (auto &Op : UseI->operands()) { + if (Op.isReg() && Part.count(Op.getReg())) + if (Op.getSubReg()) + TotalP -= 2; + } + continue; + } + // If a register from this partition is used in a fixed instruction, + // and there is also a register in this partition that is used in + // a loop phi node, then decrease the splitting profit as this can + // confuse the modulo scheduler. + if (UseI->isPHI()) { + const MachineBasicBlock *PB = UseI->getParent(); + const MachineLoop *L = MLI->getLoopFor(PB); + if (L && L->getHeader() == PB) + LoopPhiNum++; + } + // Splittable instruction. + SplitNum++; + int32_t P = profit(UseI); + if (P == INT_MIN) + return false; + TotalP += P; + } + } + + if (FixedNum > 0 && LoopPhiNum > 0) + TotalP -= 20*LoopPhiNum; + + DEBUG(dbgs() << "Partition profit: " << TotalP << '\n'); + return TotalP > 0; +} + + +void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, + USet &Rs) { + const MachineBasicBlock *HB = L->getHeader(); + const MachineBasicBlock *LB = L->getLoopLatch(); + if (!HB || !LB) + return; + + // Examine the latch branch. Expect it to be a conditional branch to + // the header (either "br-cond header" or "br-cond exit; br header"). + MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock *TmpLB = const_cast<MachineBasicBlock*>(LB); + SmallVector<MachineOperand,2> Cond; + bool BadLB = TII->AnalyzeBranch(*TmpLB, TB, FB, Cond, false); + // Only analyzable conditional branches. HII::AnalyzeBranch will put + // the branch opcode as the first element of Cond, and the predicate + // operand as the second. + if (BadLB || Cond.size() != 2) + return; + // Only simple jump-conditional (with or without negation). + if (!TII->PredOpcodeHasJMP_c(Cond[0].getImm())) + return; + // Must go to the header. + if (TB != HB && FB != HB) + return; + assert(Cond[1].isReg() && "Unexpected Cond vector from AnalyzeBranch"); + // Expect a predicate register. + unsigned PR = Cond[1].getReg(); + assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass); + + // Get the registers on which the loop controlling compare instruction + // depends. 
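+  // (The expected shape, roughly:
+  //     vN = A2_addp phi, ...     ; 64-bit induction update
+  //     pM = cmp(vN, ...)         ; loop-controlling compare
+  //     if (pM) jump header
+  //  CmpR1/CmpR2 below receive the compare's register operands.)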
+ unsigned CmpR1 = 0, CmpR2 = 0; + const MachineInstr *CmpI = MRI->getVRegDef(PR); + while (CmpI->getOpcode() == Hexagon::C2_not) + CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg()); + + int Mask = 0, Val = 0; + bool OkCI = TII->analyzeCompare(CmpI, CmpR1, CmpR2, Mask, Val); + if (!OkCI) + return; + // Eliminate non-double input registers. + if (CmpR1 && MRI->getRegClass(CmpR1) != DoubleRC) + CmpR1 = 0; + if (CmpR2 && MRI->getRegClass(CmpR2) != DoubleRC) + CmpR2 = 0; + if (!CmpR1 && !CmpR2) + return; + + // Now examine the top of the loop: the phi nodes that could poten- + // tially define loop induction registers. The registers defined by + // such a phi node would be used in a 64-bit add, which then would + // be used in the loop compare instruction. + + // Get the set of all double registers defined by phi nodes in the + // loop header. + typedef std::vector<unsigned> UVect; + UVect DP; + for (auto &MI : *HB) { + if (!MI.isPHI()) + break; + const MachineOperand &MD = MI.getOperand(0); + unsigned R = MD.getReg(); + if (MRI->getRegClass(R) == DoubleRC) + DP.push_back(R); + } + if (DP.empty()) + return; + + auto NoIndOp = [this, CmpR1, CmpR2] (unsigned R) -> bool { + for (auto I = MRI->use_nodbg_begin(R), E = MRI->use_nodbg_end(); + I != E; ++I) { + const MachineInstr *UseI = I->getParent(); + if (UseI->getOpcode() != Hexagon::A2_addp) + continue; + // Get the output from the add. If it is one of the inputs to the + // loop-controlling compare instruction, then R is likely an induc- + // tion register. + unsigned T = UseI->getOperand(0).getReg(); + if (T == CmpR1 || T == CmpR2) + return false; + } + return true; + }; + UVect::iterator End = std::remove_if(DP.begin(), DP.end(), NoIndOp); + Rs.insert(DP.begin(), End); + Rs.insert(CmpR1); + Rs.insert(CmpR2); + + DEBUG({ + dbgs() << "For loop at BB#" << HB->getNumber() << " ind regs: "; + dump_partition(dbgs(), Rs, *TRI); + dbgs() << '\n'; + }); +} + + +void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) { + typedef std::vector<MachineLoop*> LoopVector; + LoopVector WorkQ; + + for (auto I : *MLI) + WorkQ.push_back(I); + for (unsigned i = 0; i < WorkQ.size(); ++i) { + for (auto I : *WorkQ[i]) + WorkQ.push_back(I); + } + + USet Rs; + for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) { + MachineLoop *L = WorkQ[i]; + Rs.clear(); + collectIndRegsForLoop(L, Rs); + if (!Rs.empty()) + IRM.insert(std::make_pair(L, Rs)); + } +} + + +void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI, + const UUPairMap &PairMap, unsigned SubR) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *NewI = BuildMI(B, MI, DL, TII->get(Opc)); + + for (auto &Op : MI->operands()) { + if (!Op.isReg()) { + NewI->addOperand(Op); + continue; + } + // For register operands, set the subregister. + unsigned R = Op.getReg(); + unsigned SR = Op.getSubReg(); + bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R); + bool isKill = Op.isKill(); + if (isVirtReg && MRI->getRegClass(R) == DoubleRC) { + isKill = false; + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) { + SR = SubR; + } else { + const UUPair &P = F->second; + R = (SubR == Hexagon::subreg_loreg) ? 
P.first : P.second; + SR = 0; + } + } + auto CO = MachineOperand::CreateReg(R, Op.isDef(), Op.isImplicit(), isKill, + Op.isDead(), Op.isUndef(), Op.isEarlyClobber(), SR, Op.isDebug(), + Op.isInternalRead()); + NewI->addOperand(CO); + } +} + + +void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI, + const UUPairMap &PairMap) { + bool Load = MI->mayLoad(); + unsigned OrigOpc = MI->getOpcode(); + bool PostInc = (OrigOpc == Hexagon::L2_loadrd_pi || + OrigOpc == Hexagon::S2_storerd_pi); + MachineInstr *LowI, *HighI; + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // Index of the base-address-register operand. + unsigned AdrX = PostInc ? (Load ? 2 : 1) + : (Load ? 1 : 0); + MachineOperand &AdrOp = MI->getOperand(AdrX); + unsigned RSA = getRegState(AdrOp); + MachineOperand &ValOp = Load ? MI->getOperand(0) + : (PostInc ? MI->getOperand(3) + : MI->getOperand(2)); + UUPairMap::const_iterator F = PairMap.find(ValOp.getReg()); + assert(F != PairMap.end()); + + if (Load) { + const UUPair &P = F->second; + int64_t Off = PostInc ? 0 : MI->getOperand(2).getImm(); + LowI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.first) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off); + HighI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.second) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off+4); + } else { + const UUPair &P = F->second; + int64_t Off = PostInc ? 0 : MI->getOperand(1).getImm(); + LowI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io)) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off) + .addReg(P.first); + HighI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io)) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off+4) + .addReg(P.second); + } + + if (PostInc) { + // Create the increment of the address register. + int64_t Inc = Load ? MI->getOperand(3).getImm() + : MI->getOperand(2).getImm(); + MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0); + const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg()); + unsigned NewR = MRI->createVirtualRegister(RC); + assert(!UpdOp.getSubReg() && "Def operand with subreg"); + BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR) + .addReg(AdrOp.getReg(), RSA) + .addImm(Inc); + MRI->replaceRegWith(UpdOp.getReg(), NewR); + // The original instruction will be deleted later. + } + + // Generate a new pair of memory-operands. + MachineFunction &MF = *B.getParent(); + for (auto &MO : MI->memoperands()) { + const MachinePointerInfo &Ptr = MO->getPointerInfo(); + unsigned F = MO->getFlags(); + int A = MO->getAlignment(); + + auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, A); + LowI->addMemOperand(MF, Tmp1); + auto *Tmp2 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, std::min(A, 4)); + HighI->addMemOperand(MF, Tmp2); + } +} + + +void HexagonSplitDoubleRegs::splitImmediate(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + assert(Op0.isReg() && Op1.isImm()); + uint64_t V = Op1.getImm(); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + + // The operand to A2_tfrsi can only have 32 significant bits. 
Immediate + // values in MachineOperand are stored as 64-bit integers, and so the + // value -1 may be represented either as 64-bit -1, or 4294967295. Both + // will have the 32 higher bits truncated in the end, but -1 will remain + // as -1, while the latter may appear to be a large unsigned value + // requiring a constant extender. The casting to int32_t will select the + // former representation. (The same reasoning applies to all 32-bit + // values.) + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first) + .addImm(int32_t(V & 0xFFFFFFFFULL)); + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second) + .addImm(int32_t(V >> 32)); +} + + +void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + assert(Op0.isReg()); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + + if (Op1.isImm()) { + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second) + .addImm(Op1.getImm()); + } else if (Op1.isReg()) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.second) + .addReg(Op1.getReg(), getRegState(Op1), Op1.getSubReg()); + } else + llvm_unreachable("Unexpected operand"); + + if (Op2.isImm()) { + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first) + .addImm(Op2.getImm()); + } else if (Op2.isReg()) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first) + .addReg(Op2.getReg(), getRegState(Op2), Op2.getSubReg()); + } else + llvm_unreachable("Unexpected operand"); +} + + +void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + assert(Op0.isReg() && Op1.isReg()); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned RS = getRegState(Op1); + + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first) + .addReg(Op1.getReg(), RS & ~RegState::Kill, Op1.getSubReg()); + BuildMI(B, MI, DL, TII->get(Hexagon::S2_asr_i_r), P.second) + .addReg(Op1.getReg(), RS, Op1.getSubReg()) + .addImm(31); +} + + +void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + assert(Op0.isReg() && Op1.isReg() && Op2.isImm()); + int64_t Sh64 = Op2.getImm(); + assert(Sh64 >= 0 && Sh64 < 64); + unsigned S = Sh64; + + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned LoR = P.first; + unsigned HiR = P.second; + using namespace Hexagon; + + unsigned Opc = MI->getOpcode(); + bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p); + bool Left = !Right; + bool Signed = (Opc == S2_asr_i_p); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RS = getRegState(Op1); + unsigned ShiftOpc = Left ? S2_asl_i_r + : (Signed ? S2_asr_i_r : S2_lsr_i_r); + unsigned LoSR = subreg_loreg; + unsigned HiSR = subreg_hireg; + + if (S == 0) { + // No shift, subregister copy. 
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), HiR) + .addReg(Op1.getReg(), RS, HiSR); + } else if (S < 32) { + const TargetRegisterClass *IntRC = &IntRegsRegClass; + unsigned TmpR = MRI->createVirtualRegister(IntRC); + // Expansion: + // Shift left: DR = shl R, #s + // LoR = shl R.lo, #s + // TmpR = extractu R.lo, #s, #32-s + // HiR = or (TmpR, asl(R.hi, #s)) + // Shift right: DR = shr R, #s + // HiR = shr R.hi, #s + // TmpR = shr R.lo, #s + // LoR = insert TmpR, R.hi, #s, #32-s + + // Shift left: + // LoR = shl R.lo, #s + // Shift right: + // TmpR = shr R.lo, #s + + // Make a special case for A2_aslh and A2_asrh (they are predicable as + // opposed to S2_asl_i_r/S2_asr_i_r). + if (S == 16 && Left) + BuildMI(B, MI, DL, TII->get(A2_aslh), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else if (S == 16 && Signed) + BuildMI(B, MI, DL, TII->get(A2_asrh), TmpR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else + BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? LoR : TmpR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR) + .addImm(S); + + if (Left) { + // TmpR = extractu R.lo, #s, #32-s + BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR) + .addImm(S) + .addImm(32-S); + // HiR = or (TmpR, asl(R.hi, #s)) + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(TmpR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(S); + } else { + // HiR = shr R.hi, #s + BuildMI(B, MI, DL, TII->get(ShiftOpc), HiR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR) + .addImm(S); + // LoR = insert TmpR, R.hi, #s, #32-s + BuildMI(B, MI, DL, TII->get(S2_insert), LoR) + .addReg(TmpR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(S) + .addImm(32-S); + } + } else if (S == 32) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), (Left ? HiR : LoR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR)); + if (!Signed) + BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR)) + .addImm(0); + else // Must be right shift. + BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(31); + } else if (S < 64) { + S -= 32; + if (S == 16 && Left) + BuildMI(B, MI, DL, TII->get(A2_aslh), HiR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else if (S == 16 && Signed) + BuildMI(B, MI, DL, TII->get(A2_asrh), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR); + else + BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? HiR : LoR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR)) + .addImm(S); + + if (Signed) + BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(31); + else + BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? 
LoR : HiR)) + .addImm(0); + } +} + + +void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + MachineOperand &Op3 = MI->getOperand(3); + assert(Op0.isReg() && Op1.isReg() && Op2.isReg() && Op3.isImm()); + int64_t Sh64 = Op3.getImm(); + assert(Sh64 >= 0 && Sh64 < 64); + unsigned S = Sh64; + + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned LoR = P.first; + unsigned HiR = P.second; + using namespace Hexagon; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RS1 = getRegState(Op1); + unsigned RS2 = getRegState(Op2); + const TargetRegisterClass *IntRC = &IntRegsRegClass; + + unsigned LoSR = subreg_loreg; + unsigned HiSR = subreg_hireg; + + // Op0 = S2_asl_i_p_or Op1, Op2, Op3 + // means: Op0 = or (Op1, asl(Op2, Op3)) + + // Expansion of + // DR = or (R1, asl(R2, #s)) + // + // LoR = or (R1.lo, asl(R2.lo, #s)) + // Tmp1 = extractu R2.lo, #s, #32-s + // Tmp2 = or R1.hi, Tmp1 + // HiR = or (Tmp2, asl(R2.hi, #s)) + + if (S == 0) { + // DR = or (R1, asl(R2, #0)) + // -> or (R1, R2) + // i.e. LoR = or R1.lo, R2.lo + // HiR = or R1.hi, R2.hi + BuildMI(B, MI, DL, TII->get(A2_or), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(A2_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, HiSR); + } else if (S < 32) { + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) + .addImm(S); + unsigned TmpR1 = MRI->createVirtualRegister(IntRC); + BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) + .addImm(S) + .addImm(32-S); + unsigned TmpR2 = MRI->createVirtualRegister(IntRC); + BuildMI(B, MI, DL, TII->get(A2_or), TmpR2) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(TmpR1); + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(TmpR2) + .addReg(Op2.getReg(), RS2, HiSR) + .addImm(S); + } else if (S == 32) { + // DR = or (R1, asl(R2, #32)) + // -> or R1, R2.lo + // LoR = R1.lo + // HiR = or R1.hi, R2.lo + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(A2_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, LoSR); + } else if (S < 64) { + // DR = or (R1, asl(R2, #s)) + // + // LoR = R1:lo + // HiR = or (R1:hi, asl(R2:lo, #s-32)) + S -= 32; + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, LoSR) + .addImm(S); + } +} + + +bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI, + const UUPairMap &PairMap) { + DEBUG(dbgs() << "Splitting: " << *MI); + bool Split = false; + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + + switch (Opc) { + case TargetOpcode::PHI: + case TargetOpcode::COPY: { + unsigned DstR = MI->getOperand(0).getReg(); + if (MRI->getRegClass(DstR) == DoubleRC) { + createHalfInstr(Opc, MI, PairMap, subreg_loreg); + createHalfInstr(Opc, MI, PairMap, subreg_hireg); + Split = true; + } + break; + } + case A2_andp: + createHalfInstr(A2_and, MI, PairMap, 
subreg_loreg); + createHalfInstr(A2_and, MI, PairMap, subreg_hireg); + Split = true; + break; + case A2_orp: + createHalfInstr(A2_or, MI, PairMap, subreg_loreg); + createHalfInstr(A2_or, MI, PairMap, subreg_hireg); + Split = true; + break; + case A2_xorp: + createHalfInstr(A2_xor, MI, PairMap, subreg_loreg); + createHalfInstr(A2_xor, MI, PairMap, subreg_hireg); + Split = true; + break; + + case L2_loadrd_io: + case L2_loadrd_pi: + case S2_storerd_io: + case S2_storerd_pi: + splitMemRef(MI, PairMap); + Split = true; + break; + + case A2_tfrpi: + case CONST64_Int_Real: + splitImmediate(MI, PairMap); + Split = true; + break; + + case A2_combineii: + case A4_combineir: + case A4_combineii: + case A4_combineri: + case A2_combinew: + splitCombine(MI, PairMap); + Split = true; + break; + + case A2_sxtw: + splitExt(MI, PairMap); + Split = true; + break; + + case S2_asl_i_p: + case S2_asr_i_p: + case S2_lsr_i_p: + splitShift(MI, PairMap); + Split = true; + break; + + case S2_asl_i_p_or: + splitAslOr(MI, PairMap); + Split = true; + break; + + default: + llvm_unreachable("Instruction not splitable"); + return false; + } + + return Split; +} + + +void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI, + const UUPairMap &PairMap) { + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse() || !Op.getSubReg()) + continue; + unsigned R = Op.getReg(); + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) + continue; + const UUPair &P = F->second; + switch (Op.getSubReg()) { + case Hexagon::subreg_loreg: + Op.setReg(P.first); + break; + case Hexagon::subreg_hireg: + Op.setReg(P.second); + break; + } + Op.setSubReg(0); + } +} + + +void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg()) + continue; + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) + continue; + const UUPair &Pr = F->second; + unsigned NewDR = MRI->createVirtualRegister(DoubleRC); + BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR) + .addReg(Pr.first) + .addImm(Hexagon::subreg_loreg) + .addReg(Pr.second) + .addImm(Hexagon::subreg_hireg); + Op.setReg(NewDR); + } +} + + +bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) { + const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass; + typedef std::set<MachineInstr*> MISet; + bool Changed = false; + + DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI); + dbgs() << '\n'); + + UUPairMap PairMap; + + MISet SplitIns; + for (unsigned DR : Part) { + MachineInstr *DefI = MRI->getVRegDef(DR); + SplitIns.insert(DefI); + + // Collect all instructions, including fixed ones. We won't split them, + // but we need to visit them again to insert the REG_SEQUENCE instructions. 
+ for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) + SplitIns.insert(U->getParent()); + + unsigned LoR = MRI->createVirtualRegister(IntRC); + unsigned HiR = MRI->createVirtualRegister(IntRC); + DEBUG(dbgs() << "Created mapping: " << PrintReg(DR, TRI) << " -> " + << PrintReg(HiR, TRI) << ':' << PrintReg(LoR, TRI) << '\n'); + PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR))); + } + + MISet Erase; + for (auto MI : SplitIns) { + if (isFixedInstr(MI)) { + collapseRegPairs(MI, PairMap); + } else { + bool Done = splitInstr(MI, PairMap); + if (Done) + Erase.insert(MI); + Changed |= Done; + } + } + + for (unsigned DR : Part) { + // Before erasing "double" instructions, revisit all uses of the double + // registers in this partition, and replace all uses of them with subre- + // gisters, with the corresponding single registers. + MISet Uses; + for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) + Uses.insert(U->getParent()); + for (auto M : Uses) + replaceSubregUses(M, PairMap); + } + + for (auto MI : Erase) { + MachineBasicBlock *B = MI->getParent(); + B->erase(MI); + } + + return Changed; +} + + +bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "Splitting double registers in function: " + << MF.getName() << '\n'); + + auto &ST = MF.getSubtarget<HexagonSubtarget>(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + MLI = &getAnalysis<MachineLoopInfo>(); + + UUSetMap P2Rs; + LoopRegMap IRM; + + collectIndRegs(IRM); + partitionRegisters(P2Rs); + + DEBUG({ + dbgs() << "Register partitioning: (partition #0 is fixed)\n"; + for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) { + dbgs() << '#' << I->first << " -> "; + dump_partition(dbgs(), I->second, *TRI); + dbgs() << '\n'; + } + }); + + bool Changed = false; + int Limit = MaxHSDR; + + for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) { + if (I->first == 0) + continue; + if (Limit >= 0 && Counter >= Limit) + break; + USet &Part = I->second; + DEBUG(dbgs() << "Calculating profit for partition #" << I->first << '\n'); + if (!isProfitable(Part, IRM)) + continue; + Counter++; + Changed |= splitPartition(Part); + } + + return Changed; +} + +FunctionPass *llvm::createHexagonSplitDoubleRegs() { + return new HexagonSplitDoubleRegs(); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp new file mode 100644 index 0000000..b5339ff --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -0,0 +1,616 @@ +//===--- HexagonStoreWidening.cpp------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Replace sequences of "narrow" stores to adjacent memory locations with +// a fewer "wide" stores that have the same effect. +// For example, replace: +// S4_storeirb_io %vreg100, 0, 0 ; store-immediate-byte +// S4_storeirb_io %vreg100, 1, 0 ; store-immediate-byte +// with +// S4_storeirh_io %vreg100, 0, 0 ; store-immediate-halfword +// The above is the general idea. The actual cases handled by the code +// may be a bit more complex. +// The purpose of this pass is to reduce the number of outstanding stores, +// or as one could say, "reduce store queue pressure". 
Also, wide stores +// mean fewer stores, and since there are only two memory instructions allowed +// per packet, it also means fewer packets, and ultimately fewer cycles. +//===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexagon-widen-stores" + +#include "HexagonTargetMachine.h" + +#include "llvm/PassSupport.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#include <algorithm> + + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonStoreWidening(); + void initializeHexagonStoreWideningPass(PassRegistry&); +} + +namespace { + struct HexagonStoreWidening : public MachineFunctionPass { + const HexagonInstrInfo *TII; + const HexagonRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + AliasAnalysis *AA; + MachineFunction *MF; + + public: + static char ID; + HexagonStoreWidening() : MachineFunctionPass(ID) { + initializeHexagonStoreWideningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "Hexagon Store Widening"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + static bool handledStoreType(const MachineInstr *MI); + + private: + static const int MaxWideSize = 4; + + typedef std::vector<MachineInstr*> InstrGroup; + typedef std::vector<InstrGroup> InstrGroupList; + + bool instrAliased(InstrGroup &Stores, const MachineMemOperand &MMO); + bool instrAliased(InstrGroup &Stores, const MachineInstr *MI); + void createStoreGroup(MachineInstr *BaseStore, InstrGroup::iterator Begin, + InstrGroup::iterator End, InstrGroup &Group); + void createStoreGroups(MachineBasicBlock &MBB, + InstrGroupList &StoreGroups); + bool processBasicBlock(MachineBasicBlock &MBB); + bool processStoreGroup(InstrGroup &Group); + bool selectStores(InstrGroup::iterator Begin, InstrGroup::iterator End, + InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize); + bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize); + bool replaceStores(InstrGroup &OG, InstrGroup &NG); + bool storesAreAdjacent(const MachineInstr *S1, const MachineInstr *S2); + }; + +} // namespace + + +namespace { + +// Some local helper functions... 
+unsigned getBaseAddressRegister(const MachineInstr *MI) { + const MachineOperand &MO = MI->getOperand(0); + assert(MO.isReg() && "Expecting register operand"); + return MO.getReg(); +} + +int64_t getStoreOffset(const MachineInstr *MI) { + unsigned OpC = MI->getOpcode(); + assert(HexagonStoreWidening::handledStoreType(MI) && "Unhandled opcode"); + + switch (OpC) { + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeiri_io: { + const MachineOperand &MO = MI->getOperand(1); + assert(MO.isImm() && "Expecting immediate offset"); + return MO.getImm(); + } + } + dbgs() << *MI; + llvm_unreachable("Store offset calculation missing for a handled opcode"); + return 0; +} + +const MachineMemOperand &getStoreTarget(const MachineInstr *MI) { + assert(!MI->memoperands_empty() && "Expecting memory operands"); + return **MI->memoperands_begin(); +} + +} // namespace + + +char HexagonStoreWidening::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores", + "Hexason Store Widening", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores", + "Hexagon Store Widening", false, false) + + +// Filtering function: any stores whose opcodes are not "approved" of by +// this function will not be subjected to widening. +inline bool HexagonStoreWidening::handledStoreType(const MachineInstr *MI) { + // For now, only handle stores of immediate values. + // Also, reject stores to stack slots. + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeiri_io: + // Base address must be a register. (Implement FI later.) + return MI->getOperand(0).isReg(); + default: + return false; + } +} + + +// Check if the machine memory operand MMO is aliased with any of the +// stores in the store group Stores. +bool HexagonStoreWidening::instrAliased(InstrGroup &Stores, + const MachineMemOperand &MMO) { + if (!MMO.getValue()) + return true; + + MemoryLocation L(MMO.getValue(), MMO.getSize(), MMO.getAAInfo()); + + for (auto SI : Stores) { + const MachineMemOperand &SMO = getStoreTarget(SI); + if (!SMO.getValue()) + return true; + + MemoryLocation SL(SMO.getValue(), SMO.getSize(), SMO.getAAInfo()); + if (AA->alias(L, SL)) + return true; + } + + return false; +} + + +// Check if the machine instruction MI accesses any storage aliased with +// any store in the group Stores. +bool HexagonStoreWidening::instrAliased(InstrGroup &Stores, + const MachineInstr *MI) { + for (auto &I : MI->memoperands()) + if (instrAliased(Stores, *I)) + return true; + return false; +} + + +// Inspect a machine basic block, and generate store groups out of stores +// encountered in the block. +// +// A store group is a group of stores that use the same base register, +// and which can be reordered within that group without altering the +// semantics of the program. A single store group could be widened as +// a whole, if there existed a single store instruction with the same +// semantics as the entire group. In many cases, a single store group +// may need more than one wide store. +void HexagonStoreWidening::createStoreGroups(MachineBasicBlock &MBB, + InstrGroupList &StoreGroups) { + InstrGroup AllInsns; + + // Copy all instruction pointers from the basic block to a temporary + // list. This will allow operating on the list, and modifying its + // elements without affecting the basic block. 
+ for (auto &I : MBB) + AllInsns.push_back(&I); + + // Traverse all instructions in the AllInsns list, and if we encounter + // a store, then try to create a store group starting at that instruction + // i.e. a sequence of independent stores that can be widened. + for (auto I = AllInsns.begin(), E = AllInsns.end(); I != E; ++I) { + MachineInstr *MI = *I; + // Skip null pointers (processed instructions). + if (!MI || !handledStoreType(MI)) + continue; + + // Found a store. Try to create a store group. + InstrGroup G; + createStoreGroup(MI, I+1, E, G); + if (G.size() > 1) + StoreGroups.push_back(G); + } +} + + +// Create a single store group. The stores need to be independent between +// themselves, and also there cannot be other instructions between them +// that could read or modify storage being stored into. +void HexagonStoreWidening::createStoreGroup(MachineInstr *BaseStore, + InstrGroup::iterator Begin, InstrGroup::iterator End, InstrGroup &Group) { + assert(handledStoreType(BaseStore) && "Unexpected instruction"); + unsigned BaseReg = getBaseAddressRegister(BaseStore); + InstrGroup Other; + + Group.push_back(BaseStore); + + for (auto I = Begin; I != End; ++I) { + MachineInstr *MI = *I; + if (!MI) + continue; + + if (handledStoreType(MI)) { + // If this store instruction is aliased with anything already in the + // group, terminate the group now. + if (instrAliased(Group, getStoreTarget(MI))) + return; + // If this store is aliased to any of the memory instructions we have + // seen so far (that are not a part of this group), terminate the group. + if (instrAliased(Other, getStoreTarget(MI))) + return; + + unsigned BR = getBaseAddressRegister(MI); + if (BR == BaseReg) { + Group.push_back(MI); + *I = 0; + continue; + } + } + + // Assume calls are aliased to everything. + if (MI->isCall() || MI->hasUnmodeledSideEffects()) + return; + + if (MI->mayLoad() || MI->mayStore()) { + if (MI->hasOrderedMemoryRef() || instrAliased(Group, MI)) + return; + Other.push_back(MI); + } + } // for +} + + +// Check if store instructions S1 and S2 are adjacent. More precisely, +// S2 has to access memory immediately following that accessed by S1. +bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1, + const MachineInstr *S2) { + if (!handledStoreType(S1) || !handledStoreType(S2)) + return false; + + const MachineMemOperand &S1MO = getStoreTarget(S1); + + // Currently only handling immediate stores. + int Off1 = S1->getOperand(1).getImm(); + int Off2 = S2->getOperand(1).getImm(); + + return (Off1 >= 0) ? Off1+S1MO.getSize() == unsigned(Off2) + : int(Off1+S1MO.getSize()) == Off2; +} + + +/// Given a sequence of adjacent stores, and a maximum size of a single wide +/// store, pick a group of stores that can be replaced by a single store +/// of size not exceeding MaxSize. The selected sequence will be recorded +/// in OG ("old group" of instructions). +/// OG should be empty on entry, and should be left empty if the function +/// fails. 
+bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin, + InstrGroup::iterator End, InstrGroup &OG, unsigned &TotalSize, + unsigned MaxSize) { + assert(Begin != End && "No instructions to analyze"); + assert(OG.empty() && "Old group not empty on entry"); + + if (std::distance(Begin, End) <= 1) + return false; + + MachineInstr *FirstMI = *Begin; + assert(!FirstMI->memoperands_empty() && "Expecting some memory operands"); + const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI); + unsigned Alignment = FirstMMO.getAlignment(); + unsigned SizeAccum = FirstMMO.getSize(); + unsigned FirstOffset = getStoreOffset(FirstMI); + + // The initial value of SizeAccum should always be a power of 2. + assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2"); + + // If the size of the first store equals to or exceeds the limit, do nothing. + if (SizeAccum >= MaxSize) + return false; + + // If the size of the first store is greater than or equal to the address + // stored to, then the store cannot be made any wider. + if (SizeAccum >= Alignment) + return false; + + // The offset of a store will put restrictions on how wide the store can be. + // Offsets in stores of size 2^n bytes need to have the n lowest bits be 0. + // If the first store already exhausts the offset limits, quit. Test this + // by checking if the next wider size would exceed the limit. + if ((2*SizeAccum-1) & FirstOffset) + return false; + + OG.push_back(FirstMI); + MachineInstr *S1 = FirstMI, *S2 = *(Begin+1); + InstrGroup::iterator I = Begin+1; + + // Pow2Num will be the largest number of elements in OG such that the sum + // of sizes of stores 0...Pow2Num-1 will be a power of 2. + unsigned Pow2Num = 1; + unsigned Pow2Size = SizeAccum; + + // Be greedy: keep accumulating stores as long as they are to adjacent + // memory locations, and as long as the total number of bytes stored + // does not exceed the limit (MaxSize). + // Keep track of when the total size covered is a power of 2, since + // this is a size a single store can cover. + while (I != End) { + S2 = *I; + // Stores are sorted, so if S1 and S2 are not adjacent, there won't be + // any other store to fill the "hole". + if (!storesAreAdjacent(S1, S2)) + break; + + unsigned S2Size = getStoreTarget(S2).getSize(); + if (SizeAccum + S2Size > std::min(MaxSize, Alignment)) + break; + + OG.push_back(S2); + SizeAccum += S2Size; + if (isPowerOf2_32(SizeAccum)) { + Pow2Num = OG.size(); + Pow2Size = SizeAccum; + } + if ((2*Pow2Size-1) & FirstOffset) + break; + + S1 = S2; + ++I; + } + + // The stores don't add up to anything that can be widened. Clean up. + if (Pow2Num <= 1) { + OG.clear(); + return false; + } + + // Only leave the stored being widened. + OG.resize(Pow2Num); + TotalSize = Pow2Size; + return true; +} + + +/// Given an "old group" OG of stores, create a "new group" NG of instructions +/// to replace them. Ideally, NG would only have a single instruction in it, +/// but that may only be possible for store-immediate. +bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, + unsigned TotalSize) { + // XXX Current limitations: + // - only expect stores of immediate values in OG, + // - only handle a TotalSize of up to 4. + + if (TotalSize > 4) + return false; + + unsigned Acc = 0; // Value accumulator. + unsigned Shift = 0; + + for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) { + MachineInstr *MI = *I; + const MachineMemOperand &MMO = getStoreTarget(MI); + MachineOperand &SO = MI->getOperand(2); // Source. 
+ assert(SO.isImm() && "Expecting an immediate operand"); + + unsigned NBits = MMO.getSize()*8; + unsigned Mask = (0xFFFFFFFFU >> (32-NBits)); + unsigned Val = (SO.getImm() & Mask) << Shift; + Acc |= Val; + Shift += NBits; + } + + + MachineInstr *FirstSt = OG.front(); + DebugLoc DL = OG.back()->getDebugLoc(); + const MachineMemOperand &OldM = getStoreTarget(FirstSt); + MachineMemOperand *NewM = + MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(), + TotalSize, OldM.getAlignment(), + OldM.getAAInfo()); + + if (Acc < 0x10000) { + // Create mem[hw] = #Acc + unsigned WOpc = (TotalSize == 2) ? Hexagon::S4_storeirh_io : + (TotalSize == 4) ? Hexagon::S4_storeiri_io : 0; + assert(WOpc && "Unexpected size"); + + int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc); + const MCInstrDesc &StD = TII->get(WOpc); + MachineOperand &MR = FirstSt->getOperand(0); + int64_t Off = FirstSt->getOperand(1).getImm(); + MachineInstr *StI = BuildMI(*MF, DL, StD) + .addReg(MR.getReg(), getKillRegState(MR.isKill())) + .addImm(Off) + .addImm(Val); + StI->addMemOperand(*MF, NewM); + NG.push_back(StI); + } else { + // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg + const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF); + unsigned VReg = MF->getRegInfo().createVirtualRegister(RC); + MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg) + .addImm(int(Acc)); + NG.push_back(TfrI); + + unsigned WOpc = (TotalSize == 2) ? Hexagon::S2_storerh_io : + (TotalSize == 4) ? Hexagon::S2_storeri_io : 0; + assert(WOpc && "Unexpected size"); + + const MCInstrDesc &StD = TII->get(WOpc); + MachineOperand &MR = FirstSt->getOperand(0); + int64_t Off = FirstSt->getOperand(1).getImm(); + MachineInstr *StI = BuildMI(*MF, DL, StD) + .addReg(MR.getReg(), getKillRegState(MR.isKill())) + .addImm(Off) + .addReg(VReg, RegState::Kill); + StI->addMemOperand(*MF, NewM); + NG.push_back(StI); + } + + return true; +} + + +// Replace instructions from the old group OG with instructions from the +// new group NG. Conceptually, remove all instructions in OG, and then +// insert all instructions in NG, starting at where the first instruction +// from OG was (in the order in which they appeared in the basic block). +// (The ordering in OG does not have to match the order in the basic block.) +bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) { + DEBUG({ + dbgs() << "Replacing:\n"; + for (auto I : OG) + dbgs() << " " << *I; + dbgs() << "with\n"; + for (auto I : NG) + dbgs() << " " << *I; + }); + + MachineBasicBlock *MBB = OG.back()->getParent(); + MachineBasicBlock::iterator InsertAt = MBB->end(); + + // Need to establish the insertion point. The best one is right before + // the first store in the OG, but in the order in which the stores occur + // in the program list. Since the ordering in OG does not correspond + // to the order in the program list, we need to do some work to find + // the insertion point. + + // Create a set of all instructions in OG (for quick lookup). + SmallPtrSet<MachineInstr*, 4> InstrSet; + for (auto I : OG) + InstrSet.insert(I); + + // Traverse the block, until we hit an instruction from OG. + for (auto &I : *MBB) { + if (InstrSet.count(&I)) { + InsertAt = I; + break; + } + } + + assert((InsertAt != MBB->end()) && "Cannot locate any store from the group"); + + bool AtBBStart = false; + + // InsertAt points at the first instruction that will be removed. 
We need + // to move it out of the way, so it remains valid after removing all the + // old stores, and so we are able to recover it back to the proper insertion + // position. + if (InsertAt != MBB->begin()) + --InsertAt; + else + AtBBStart = true; + + for (auto I : OG) + I->eraseFromParent(); + + if (!AtBBStart) + ++InsertAt; + else + InsertAt = MBB->begin(); + + for (auto I : NG) + MBB->insert(InsertAt, I); + + return true; +} + + +// Break up the group into smaller groups, each of which can be replaced by +// a single wide store. Widen each such smaller group and replace the old +// instructions with the widened ones. +bool HexagonStoreWidening::processStoreGroup(InstrGroup &Group) { + bool Changed = false; + InstrGroup::iterator I = Group.begin(), E = Group.end(); + InstrGroup OG, NG; // Old and new groups. + unsigned CollectedSize; + + while (I != E) { + OG.clear(); + NG.clear(); + + bool Succ = selectStores(I++, E, OG, CollectedSize, MaxWideSize) && + createWideStores(OG, NG, CollectedSize) && + replaceStores(OG, NG); + if (!Succ) + continue; + + assert(OG.size() > 1 && "Created invalid group"); + assert(distance(I, E)+1 >= int(OG.size()) && "Too many elements"); + I += OG.size()-1; + + Changed = true; + } + + return Changed; +} + + +// Process a single basic block: create the store groups, and replace them +// with the widened stores, if possible. Processing of each basic block +// is independent from processing of any other basic block. This transfor- +// mation could be stopped after having processed any basic block without +// any ill effects (other than not having performed widening in the unpro- +// cessed blocks). Also, the basic blocks can be processed in any order. +bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) { + InstrGroupList SGs; + bool Changed = false; + + createStoreGroups(MBB, SGs); + + auto Less = [] (const MachineInstr *A, const MachineInstr *B) -> bool { + return getStoreOffset(A) < getStoreOffset(B); + }; + for (auto &G : SGs) { + assert(G.size() > 1 && "Store group with fewer than 2 elements"); + std::sort(G.begin(), G.end(), Less); + + Changed |= processStoreGroup(G); + } + + return Changed; +} + + +bool HexagonStoreWidening::runOnMachineFunction(MachineFunction &MFn) { + MF = &MFn; + auto &ST = MFn.getSubtarget<HexagonSubtarget>(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &MFn.getRegInfo(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + + bool Changed = false; + + for (auto &B : MFn) + Changed |= processBasicBlock(B); + + return Changed; +} + + +FunctionPass *llvm::createHexagonStoreWidening() { + return new HexagonStoreWidening(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index cd482b3..aa0efd4 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -16,6 +16,8 @@ #include "HexagonRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include <map> + using namespace llvm; #define DEBUG_TYPE "hexagon-subtarget" @@ -24,49 +26,65 @@ using namespace llvm; #define GET_SUBTARGETINFO_TARGET_DESC #include "HexagonGenSubtargetInfo.inc" -static cl::opt<bool> -EnableV3("enable-hexagon-v3", cl::Hidden, - cl::desc("Enable Hexagon V3 instructions.")); - -static cl::opt<bool> -EnableMemOps( - "enable-hexagon-memops", - cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true), - cl::desc( - "Generate V4 MEMOP in 
code generation for Hexagon target")); - -static cl::opt<bool> -DisableMemOps( - "disable-hexagon-memops", - cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false), - cl::desc( - "Do not generate V4 MEMOP in code generation for Hexagon target")); - -static cl::opt<bool> -EnableIEEERndNear( - "enable-hexagon-ieee-rnd-near", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Generate non-chopped conversion from fp to int.")); +static cl::opt<bool> EnableMemOps("enable-hexagon-memops", + cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true), + cl::desc("Generate V4 MEMOP in code generation for Hexagon target")); + +static cl::opt<bool> DisableMemOps("disable-hexagon-memops", + cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false), + cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target")); + +static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Generate non-chopped conversion from fp to int.")); + +static cl::opt<bool> EnableBSBSched("enable-bsb-sched", + cl::Hidden, cl::ZeroOrMore, cl::init(true)); + +static cl::opt<bool> EnableHexagonHVXDouble("enable-hexagon-hvx-double", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable Hexagon Double Vector eXtensions")); + +static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable Hexagon Vector eXtensions")); static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon MI Scheduling")); + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon MI Scheduling")); + +void HexagonSubtarget::initializeEnvironment() { + UseMemOps = false; + ModeIEEERndNear = false; + UseBSBScheduling = false; +} HexagonSubtarget & HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - // If the programmer has not specified a Hexagon version, default to -mv4. - if (CPUString.empty()) - CPUString = "hexagonv4"; - - if (CPUString == "hexagonv4") { - HexagonArchVersion = V4; - } else if (CPUString == "hexagonv5") { - HexagonArchVersion = V5; - } else { + CPUString = HEXAGON_MC::selectHexagonCPU(getTargetTriple(), CPU); + + static std::map<StringRef, HexagonArchEnum> CpuTable { + { "hexagonv4", V4 }, + { "hexagonv5", V5 }, + { "hexagonv55", V55 }, + { "hexagonv60", V60 }, + }; + + auto foundIt = CpuTable.find(CPUString); + if (foundIt != CpuTable.end()) + HexagonArchVersion = foundIt->second; + else llvm_unreachable("Unrecognized Hexagon processor version"); - } + UseHVXOps = false; + UseHVXDblOps = false; ParseSubtargetFeatures(CPUString, FS); + + if (EnableHexagonHVX.getPosition()) + UseHVXOps = EnableHexagonHVX; + if (EnableHexagonHVXDouble.getPosition()) + UseHVXDblOps = EnableHexagonHVXDouble; + return *this; } @@ -76,6 +94,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), FrameLowering() { + initializeEnvironment(); + // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUString); @@ -91,6 +111,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, ModeIEEERndNear = true; else ModeIEEERndNear = false; + + UseBSBScheduling = hasV60TOps() && EnableBSBSched; } // Pin the vtable to this file. 
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 34cdad7..c7ae139 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -34,15 +34,19 @@ namespace llvm { class HexagonSubtarget : public HexagonGenSubtargetInfo { virtual void anchor(); - bool UseMemOps; + bool UseMemOps, UseHVXOps, UseHVXDblOps; bool ModeIEEERndNear; public: enum HexagonArchEnum { - V4, V5 + V4, V5, V55, V60 }; HexagonArchEnum HexagonArchVersion; + /// True if the target should use Back-Skip-Back scheduling. This is the + /// default for V60. + bool UseBSBScheduling; + private: std::string CPUString; HexagonInstrInfo InstrInfo; @@ -50,6 +54,7 @@ private: HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; InstrItineraryData InstrItins; + void initializeEnvironment(); public: HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, @@ -84,7 +89,16 @@ public: bool useMemOps() const { return UseMemOps; } bool hasV5TOps() const { return getHexagonArchVersion() >= V5; } bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; } + bool hasV55TOps() const { return getHexagonArchVersion() >= V55; } + bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; } + bool hasV60TOps() const { return getHexagonArchVersion() >= V60; } + bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; } bool modeIEEERndNear() const { return ModeIEEERndNear; } + bool useHVXOps() const { return UseHVXOps; } + bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; } + bool useHVXSglOps() const { return UseHVXOps && !UseHVXDblOps; } + + bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; // Always use the TargetLowering default scheduler. // FIXME: This will use the vliw scheduler which is probably just hurting @@ -98,7 +112,7 @@ public: return Hexagon_SMALL_DATA_THRESHOLD; } const HexagonArchEnum &getHexagonArchVersion() const { - return HexagonArchVersion; + return HexagonArchVersion; } }; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td new file mode 100644 index 0000000..784686a --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td @@ -0,0 +1,113 @@ +//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Cache manipulation instructions. 
+//===----------------------------------------------------------------------===// +let mayStore = 1 in +class ST_MISC_CACHEOP<dag outs, dag ins, + string asmstr, list<dag> pattern = [], + bits<3> amode, bits<3> type, bits<1> un> + : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> { + + bits<5> Rs; + bits<5> Rt; + bits<5> Rd; + let Inst{31-28} = 0b1010; + let Inst{27-25} = amode; + let Inst{24-22} = type; + let Inst{21} = un; + let Inst{20-16} = Rs; + let Inst{12-8} = Rt; + let Inst{4-0} = Rd; +} + +let mayStore = 1 in +class ST_MISC_CACHEOP_SYS<dag outs, dag ins, + string asmstr, list<dag> pattern = [], + bits<3> amode, bits<3> type, bits<1> un> + : SYSInst<outs, ins, asmstr, pattern, ""> { + + bits<5> Rs; + bits<5> Rt; + bits<5> Rd; + let Inst{31-28} = 0b1010; + let Inst{27-25} = amode; + let Inst{24-22} = type; + let Inst{21} = un; + let Inst{20-16} = Rs; + let Inst{12-8} = Rt; + let Inst{4-0} = Rd; +} + + +let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in { +def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins), + "syncht" , [], 0b100, 0b001, 0b0>; +} + +let Rt = 0, Rd = 0 in { +let isSoloAin1 = 1 in { + def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dccleana($Rs)", [], 0b000, 0b000, 0b0>; + def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dcinva($Rs)", [], 0b000, 0b000, 0b1>; + def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs), + "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>; + } +} + +let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in { + def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt), + "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>; + def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt), + "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>; +} + +let hasSideEffects = 0, isSolo = 1 in +class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp> + : JRInst < + (outs), (ins IntRegs:$Rs), + #mnemonic#"($Rs)" > { + bits<5> Rs; + + let IClass = 0b0101; + let Inst{27-21} = 0b0110110; + let Inst{20-16} = Rs; + let Inst{13-12} = 0b00; + let Inst{11} = MajOp; + } +// Instruction cache invalidate +def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>; + +// Zero an aligned 32-byte cacheline. +let isSoloAin1 = 1 in +def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs), + "dczeroa($Rs)"> { + bits<5> Rs; + let IClass = 0b1010; + let Inst{27-21} = 0b0000110; + let Inst{13} = 0b0; + let Inst{20-16} = Rs; + } + +// Memory synchronization. 
+let hasSideEffects = 0, isSolo = 1 in +def Y2_isync: JRInst <(outs), (ins), + "isync"> { + let IClass = 0b0101; + let Inst{27-16} = 0b011111000000; + let Inst{13} = 0b0; + let Inst{9-0} = 0b0000000010; + } + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index b504429..34b03fb 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -16,27 +16,37 @@ #include "HexagonISelLowering.h" #include "HexagonMachineScheduler.h" #include "HexagonTargetObjectFile.h" +#include "HexagonTargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; -static cl:: opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", + +static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore, + cl::init(true), cl::desc("Enable RDF-based optimizations")); + +static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Disable Hexagon CFG Optimization")); +static cl::opt<bool> DisableStoreWidening("disable-store-widen", + cl::Hidden, cl::init(false), cl::desc("Disable store widening")); + static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Early expansion of MUX")); +static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, + cl::ZeroOrMore, cl::desc("Enable early if-conversion")); + static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true), cl::Hidden, cl::desc("Generate \"insert\" instructions")); @@ -46,10 +56,22 @@ static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true), static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true), cl::Hidden, cl::desc("Generate \"extract\" instructions")); +static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden, + cl::desc("Enable converting conditional transfers into MUX instructions")); + static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true), cl::Hidden, cl::desc("Enable conversion of arithmetic operations to " "predicate instructions")); +static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden, + cl::desc("Disable splitting double registers")); + +static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true), + cl::Hidden, cl::desc("Bit simplification")); + +static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true), + cl::Hidden, cl::desc("Loop rescheduling")); + /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library. 
In particular, it seems that it is not possible to get @@ -72,23 +94,31 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", createVLIWMachineSched); namespace llvm { + FunctionPass *createHexagonBitSimplify(); + FunctionPass *createHexagonCallFrameInformation(); FunctionPass *createHexagonCFGOptimizer(); FunctionPass *createHexagonCommonGEP(); FunctionPass *createHexagonCopyToCombine(); + FunctionPass *createHexagonEarlyIfConversion(); FunctionPass *createHexagonExpandCondsets(); FunctionPass *createHexagonExpandPredSpillCode(); FunctionPass *createHexagonFixupHwLoops(); FunctionPass *createHexagonGenExtract(); FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, CodeGenOpt::Level OptLevel); + FunctionPass *createHexagonLoopRescheduling(); FunctionPass *createHexagonNewValueJump(); + FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(); FunctionPass *createHexagonPeephole(); - FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); + FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); + FunctionPass *createHexagonSplitDoubleRegs(); + FunctionPass *createHexagonStoreWidening(); } // end namespace llvm; /// HexagonTargetMachine ctor - Create an ILP32 architecture model. @@ -101,13 +131,46 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, "e-m:e-p:32:32-i1:32-i64:64-a:0-n32", TT, CPU, FS, - Options, RM, CM, OL), - TLOF(make_unique<HexagonTargetObjectFile>()), - Subtarget(TT, CPU, FS, *this) { - initAsmInfo(); + : LLVMTargetMachine(T, "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-" + "i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-" + "n16:32", TT, CPU, FS, Options, RM, CM, OL), + TLOF(make_unique<HexagonTargetObjectFile>()) { + initAsmInfo(); +} + +const HexagonSubtarget * +HexagonTargetMachine::getSubtargetImpl(const Function &F) const { + AttributeSet FnAttrs = F.getAttributes(); + Attribute CPUAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); + Attribute FSAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. 
+ resetTargetOptions(F); + I = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this); + } + return I.get(); +} + +TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(HexagonTTIImpl(this, F)); + }); } + HexagonTargetMachine::~HexagonTargetMachine() {} namespace { @@ -166,7 +229,7 @@ bool HexagonPassConfig::addInstSelector() { bool NoOpt = (getOptLevel() == CodeGenOpt::None); if (!NoOpt) - addPass(createHexagonRemoveExtendArgs(TM)); + addPass(createHexagonOptimizeSZextends()); addPass(createHexagonISelDag(TM, getOptLevel())); @@ -174,25 +237,42 @@ bool HexagonPassConfig::addInstSelector() { // Create logical operations on predicate registers. if (EnableGenPred) addPass(createHexagonGenPredicate(), false); + // Rotate loops to expose bit-simplification opportunities. + if (EnableLoopResched) + addPass(createHexagonLoopRescheduling(), false); + // Split double registers. + if (!DisableHSDR) + addPass(createHexagonSplitDoubleRegs()); + // Bit simplification. + if (EnableBitSimplify) + addPass(createHexagonBitSimplify(), false); addPass(createHexagonPeephole()); printAndVerify("After hexagon peephole pass"); if (EnableGenInsert) addPass(createHexagonGenInsert(), false); + if (EnableEarlyIf) + addPass(createHexagonEarlyIfConversion(), false); } return false; } void HexagonPassConfig::addPreRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { + if (!DisableStoreWidening) + addPass(createHexagonStoreWidening(), false); if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops(), false); + } } void HexagonPassConfig::addPostRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { + if (EnableRDFOpt) + addPass(createHexagonRDFOpt()); if (!DisableHexagonCFGOpt) addPass(createHexagonCFGOptimizer(), false); + } } void HexagonPassConfig::addPreSched2() { @@ -215,6 +295,13 @@ void HexagonPassConfig::addPreEmitPass() { if (!NoOpt) { if (!DisableHardwareLoops) addPass(createHexagonFixupHwLoops(), false); + // Generate MUX from pairs of conditional transfers. + if (EnableGenMux) + addPass(createHexagonGenMux(), false); + addPass(createHexagonPacketizer(), false); } + + // Add CFI instructions if necessary. 
+ addPass(createHexagonCallFrameInformation(), false); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index 115eadb..968814b 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -16,6 +16,7 @@ #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" +#include "HexagonTargetObjectFile.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -24,7 +25,7 @@ class Module; class HexagonTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - HexagonSubtarget Subtarget; + mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap; public: HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU, @@ -32,20 +33,18 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~HexagonTargetMachine() override; - const HexagonSubtarget *getSubtargetImpl(const Function &) const override { - return &Subtarget; - } + const HexagonSubtarget *getSubtargetImpl(const Function &F) const override; + static unsigned getModuleMatchQuality(const Module &M); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetIRAnalysis getTargetIRAnalysis() override; - TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF.get(); + HexagonTargetObjectFile *getObjFileLowering() const override { + return static_cast<HexagonTargetObjectFile*>(TLOF.get()); } }; -extern bool flag_aligned_memcpy; - } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 4ea0e0d..ccca620 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -73,9 +73,10 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, if (!GVA) return false; - if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) { + if (Kind.isBSS() || Kind.isData() || Kind.isCommon()) { Type *Ty = GV->getType()->getElementType(); - return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty)); + return IsInSmallSection( + GV->getParent()->getDataLayout().getTypeAllocSize(Ty)); } return false; @@ -89,7 +90,7 @@ HexagonTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, // Handle Small Section classification here. if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallBSSSection; - if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind)) + if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallDataSection; // Otherwise, we work the same as ELF. diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp new file mode 100644 index 0000000..a05443e --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -0,0 +1,38 @@ +//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// Hexagon target machine. 
It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetTransformInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "hexagontti" + +TargetTransformInfo::PopcntSupportKind +HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { + // Return Fast Hardware support as every input < 64 bits will be promoted + // to 64 bits. + return TargetTransformInfo::PSK_FastHardware; +} + +// The Hexagon target can unroll loops with run-time trip counts. +void HexagonTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + UP.Runtime = UP.Partial = true; +} + +unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const { + return vector ? 0 : 32; +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h new file mode 100644 index 0000000..71ae17a --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -0,0 +1,70 @@ +//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// Hexagon target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H + +#include "Hexagon.h" +#include "HexagonTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> { + typedef BasicTTIImplBase<HexagonTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const HexagonSubtarget *ST; + const HexagonTargetLowering *TLI; + + const HexagonSubtarget *getST() const { return ST; } + const HexagonTargetLowering *getTLI() const { return TLI; } + +public: + explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + HexagonTTIImpl(const HexagonTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + HexagonTTIImpl(HexagonTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + /// \name Scalar TTI Implementations + /// @{ + + TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const; + + // The Hexagon target can unroll loops with run-time trip counts. 
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool vector) const; + + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index b91a3f6..8185054 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -16,35 +16,19 @@ // prune the dependence. // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/DFAPacketizer.h" -#include "Hexagon.h" -#include "HexagonMachineFunctionInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" #include "HexagonTargetMachine.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LatencyPriorityQueue.h" +#include "HexagonVLIWPacketizer.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/SchedulerRegistry.h" -#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <map> #include <vector> @@ -52,9 +36,22 @@ using namespace llvm; #define DEBUG_TYPE "packets" +static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden, + cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon packetizer pass")); + static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles", - cl::ZeroOrMore, cl::Hidden, cl::init(true), - cl::desc("Allow non-solo packetization of volatile memory references")); + cl::ZeroOrMore, cl::Hidden, cl::init(true), + cl::desc("Allow non-solo packetization of volatile memory references")); + +static cl::opt<bool> EnableGenAllInsnClass("enable-gen-insn", cl::init(false), + cl::Hidden, cl::ZeroOrMore, cl::desc("Generate all instruction with TC")); + +static cl::opt<bool> DisableVecDblNVStores("disable-vecdbl-nv-stores", + cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Disable vector double new-value-stores")); + +extern cl::opt<bool> ScheduleInlineAsm; namespace llvm { FunctionPass *createHexagonPacketizer(); @@ -64,7 +61,6 @@ namespace llvm { namespace { class HexagonPacketizer : public MachineFunctionPass { - public: static char ID; HexagonPacketizer() : MachineFunctionPass(ID) { @@ -73,103 +69,25 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MachineBranchProbabilityInfo>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); AU.addPreserved<MachineLoopInfo>(); 
MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const override { return "Hexagon Packetizer"; } - bool runOnMachineFunction(MachineFunction &Fn) override; - }; - char HexagonPacketizer::ID = 0; - - class HexagonPacketizerList : public VLIWPacketizerList { private: - - // Has the instruction been promoted to a dot-new instruction. - bool PromotedToDotNew; - - // Has the instruction been glued to allocframe. - bool GlueAllocframeStore; - - // Has the feeder instruction been glued to new value jump. - bool GlueToNewValueJump; - - // Check if there is a dependence between some instruction already in this - // packet and this instruction. - bool Dependence; - - // Only check for dependence if there are resources available to - // schedule this instruction. - bool FoundSequentialDependence; - - /// \brief A handle to the branch probability pass. - const MachineBranchProbabilityInfo *MBPI; - - // Track MIs with ignored dependece. - std::vector<MachineInstr*> IgnoreDepMIs; - - public: - // Ctor. - HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - const MachineBranchProbabilityInfo *MBPI); - - // initPacketizerState - initialize some internal flags. - void initPacketizerState() override; - - // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override; - - // isSoloInstruction - return true if instruction MI can not be packetized - // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override; - - // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ - // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override; - - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI - // and SUJ. 
- bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override; - - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override; - private: - bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg); - bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType, - MachineBasicBlock::iterator &MII, - const TargetRegisterClass* RC); - bool CanPromoteToDotNew(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII, - const TargetRegisterClass *RC); - bool - CanPromoteToNewValue(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII); - bool CanPromoteToNewValueStore( - MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit); - bool DemoteToDotOld(MachineInstr *MI); - bool ArePredicatesComplements( - MachineInstr *MI1, MachineInstr *MI2, - const std::map<MachineInstr *, SUnit *> &MIToSUnit); - bool RestrictingDepExistInPacket(MachineInstr *, unsigned, - const std::map<MachineInstr *, SUnit *> &); - bool isNewifiable(MachineInstr* MI); - bool isCondInst(MachineInstr* MI); - bool tryAllocateResourcesForConstExt(MachineInstr* MI); - bool canReserveResourcesForConstExt(MachineInstr *MI); - void reserveResourcesForConstExt(MachineInstr* MI); - bool isNewValueInst(MachineInstr* MI); + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; }; + + char HexagonPacketizer::ID = 0; } INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", @@ -177,26 +95,93 @@ INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer", false, false) -// HexagonPacketizerList Ctor. -HexagonPacketizerList::HexagonPacketizerList( - MachineFunction &MF, MachineLoopInfo &MLI, - const MachineBranchProbabilityInfo *MBPI) - : VLIWPacketizerList(MF, MLI, true) { - this->MBPI = MBPI; +HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF, + MachineLoopInfo &MLI, AliasAnalysis *AA, + const MachineBranchProbabilityInfo *MBPI) + : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI) { + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); } -bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - const MachineBranchProbabilityInfo *MBPI = - &getAnalysis<MachineBranchProbabilityInfo>(); +// Check if FirstI modifies a register that SecondI reads. 
+static bool hasWriteToReadDep(const MachineInstr *FirstI, + const MachineInstr *SecondI, const TargetRegisterInfo *TRI) { + for (auto &MO : FirstI->operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned R = MO.getReg(); + if (SecondI->readsRegister(R, TRI)) + return true; + } + return false; +} + + +static MachineBasicBlock::iterator moveInstrOut(MachineInstr *MI, + MachineBasicBlock::iterator BundleIt, bool Before) { + MachineBasicBlock::instr_iterator InsertPt; + if (Before) + InsertPt = BundleIt.getInstrIterator(); + else + InsertPt = std::next(BundleIt).getInstrIterator(); + + MachineBasicBlock &B = *MI->getParent(); + // The instruction should at least be bundled with the preceding instruction + // (there will always be one, i.e. BUNDLE, if nothing else). + assert(MI->isBundledWithPred()); + if (MI->isBundledWithSucc()) { + MI->clearFlag(MachineInstr::BundledSucc); + MI->clearFlag(MachineInstr::BundledPred); + } else { + // If it's not bundled with the successor (i.e. it is the last one + // in the bundle), then we can simply unbundle it from the predecessor, + // which will take care of updating the predecessor's flag. + MI->unbundleFromPred(); + } + B.splice(InsertPt, &B, MI); + + // Get the size of the bundle without asserting. + MachineBasicBlock::const_instr_iterator I(BundleIt); + MachineBasicBlock::const_instr_iterator E = B.instr_end(); + unsigned Size = 0; + for (++I; I != E && I->isBundledWithPred(); ++I) + ++Size; + + // If there are still two or more instructions, then there is nothing + // else to be done. + if (Size > 1) + return BundleIt; + + // Otherwise, extract the single instruction out and delete the bundle. + MachineBasicBlock::iterator NextIt = std::next(BundleIt); + MachineInstr *SingleI = BundleIt->getNextNode(); + SingleI->unbundleFromPred(); + assert(!SingleI->isBundledWithSucc()); + BundleIt->eraseFromParent(); + return NextIt; +} + + +bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) { + if (DisablePacketizer) + return false; + + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + auto &MLI = getAnalysis<MachineLoopInfo>(); + auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); + + if (EnableGenAllInsnClass) + HII->genAllInsnTimingClasses(MF); + // Instantiate the packetizer. - HexagonPacketizerList Packetizer(Fn, MLI, MBPI); + HexagonPacketizerList Packetizer(MF, MLI, AA, MBPI); // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); @@ -211,162 +196,107 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) { // dependence between Insn 0 and Insn 2. This can lead to incorrect // packetization // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); + for (auto &MB : MF) { + auto End = MB.end(); + auto MI = MB.begin(); while (MI != End) { + auto NextI = std::next(MI); if (MI->isKill()) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; + MB.erase(MI); + End = MB.end(); } - ++MI; + MI = NextI; } } // Loop over all of the basic blocks. - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - // Find scheduling regions and schedule / packetize each region. 
- unsigned RemainingCount = MBB->size(); - for(MachineBasicBlock::iterator RegionEnd = MBB->end(); - RegionEnd != MBB->begin();) { - // The next region starts above the previous region. Look backward in the - // instruction stream until we find the nearest boundary. - MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) - break; - } - I = MBB->begin(); - - // Skip empty scheduling regions. - if (I == RegionEnd) { - RegionEnd = std::prev(RegionEnd); - --RemainingCount; - continue; - } - // Skip regions with one instruction. - if (I == std::prev(RegionEnd)) { - RegionEnd = std::prev(RegionEnd); - continue; - } - - Packetizer.PacketizeMIs(MBB, I, RegionEnd); - RegionEnd = I; + for (auto &MB : MF) { + auto Begin = MB.begin(), End = MB.end(); + while (Begin != End) { + // First the first non-boundary starting from the end of the last + // scheduling region. + MachineBasicBlock::iterator RB = Begin; + while (RB != End && HII->isSchedulingBoundary(RB, &MB, MF)) + ++RB; + // First the first boundary starting from the beginning of the new + // region. + MachineBasicBlock::iterator RE = RB; + while (RE != End && !HII->isSchedulingBoundary(RE, &MB, MF)) + ++RE; + // Add the scheduling boundary if it's not block end. + if (RE != End) + ++RE; + // If RB == End, then RE == End. + if (RB != End) + Packetizer.PacketizeMIs(&MB, RB, RE); + + Begin = RE; } } + Packetizer.unpacketizeSoloInstrs(MF); return true; } -static bool IsIndirectCall(MachineInstr* MI) { - return MI->getOpcode() == Hexagon::J2_callr; +// Reserve resources for a constant extender. Trigger an assertion if the +// reservation fails. +void HexagonPacketizerList::reserveResourcesForConstExt() { + if (!tryAllocateResourcesForConstExt(true)) + llvm_unreachable("Resources not available"); } -// Reserve resources for constant extender. Trigure an assertion if -// reservation fail. -void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - - if (ResourceTracker->canReserveResources(PseudoMI)) { - ResourceTracker->reserveResources(PseudoMI); - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - } else { - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - llvm_unreachable("can not reserve resources for constant extender."); - } - return; +bool HexagonPacketizerList::canReserveResourcesForConstExt() { + return tryAllocateResourcesForConstExt(false); } -bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - assert((QII->isExtended(MI) || QII->isConstExtended(MI)) && - "Should only be called for constant extended instructions"); - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - bool CanReserve = ResourceTracker->canReserveResources(PseudoMI); - MF.DeleteMachineInstr(PseudoMI); - return CanReserve; +// Allocate resources (i.e. 4 bytes) for constant extender. If succeeded, +// return true, otherwise, return false. 
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) { + auto *ExtMI = MF.CreateMachineInstr(HII->get(Hexagon::A4_ext), DebugLoc()); + bool Avail = ResourceTracker->canReserveResources(ExtMI); + if (Reserve && Avail) + ResourceTracker->reserveResources(ExtMI); + MF.DeleteMachineInstr(ExtMI); + return Avail; } -// Allocate resources (i.e. 4 bytes) for constant extender. If succeed, return -// true, otherwise, return false. -bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - if (ResourceTracker->canReserveResources(PseudoMI)) { - ResourceTracker->reserveResources(PseudoMI); - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); +bool HexagonPacketizerList::isCallDependent(const MachineInstr* MI, + SDep::Kind DepType, unsigned DepReg) { + // Check for LR dependence. + if (DepReg == HRI->getRARegister()) return true; - } else { - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - return false; - } -} - - -bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI, - SDep::Kind DepType, - unsigned DepReg) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); - - // Check for lr dependence - if (DepReg == QRI->getRARegister()) { - return true; - } - if (QII->isDeallocRet(MI)) { - if (DepReg == QRI->getFrameRegister() || - DepReg == QRI->getStackRegister()) + if (HII->isDeallocRet(MI)) + if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister()) return true; - } - // Check if this is a predicate dependence - const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg); - if (RC == &Hexagon::PredRegsRegClass) { + // Check if this is a predicate dependence. + const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg); + if (RC == &Hexagon::PredRegsRegClass) return true; - } - // - // Lastly check for an operand used in an indirect call - // If we had an attribute for checking if an instruction is an indirect call, - // then we could have avoided this relatively brittle implementation of - // IsIndirectCall() - // - // Assumes that the first operand of the CALLr is the function address - // - if (IsIndirectCall(MI) && (DepType == SDep::Data)) { + // Assumes that the first operand of the CALLr is the function address. 
+ if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) { MachineOperand MO = MI->getOperand(0); - if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) { + if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) return true; - } } return false; } -static bool IsRegDependence(const SDep::Kind DepType) { - return (DepType == SDep::Data || DepType == SDep::Anti || - DepType == SDep::Output); +static bool isRegDependence(const SDep::Kind DepType) { + return DepType == SDep::Data || DepType == SDep::Anti || + DepType == SDep::Output; } -static bool IsDirectJump(MachineInstr* MI) { - return (MI->getOpcode() == Hexagon::J2_jump); +static bool isDirectJump(const MachineInstr* MI) { + return MI->getOpcode() == Hexagon::J2_jump; } -static bool IsSchedBarrier(MachineInstr* MI) { +static bool isSchedBarrier(const MachineInstr* MI) { switch (MI->getOpcode()) { case Hexagon::Y2_barrier: return true; @@ -374,76 +304,127 @@ static bool IsSchedBarrier(MachineInstr* MI) { return false; } -static bool IsControlFlow(MachineInstr* MI) { +static bool isControlFlow(const MachineInstr* MI) { return (MI->getDesc().isTerminator() || MI->getDesc().isCall()); } -static bool IsLoopN(MachineInstr *MI) { - return (MI->getOpcode() == Hexagon::J2_loop0i || - MI->getOpcode() == Hexagon::J2_loop0r); -} -/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a -/// callee-saved register. -static bool DoesModifyCalleeSavedReg(MachineInstr *MI, +/// Returns true if the instruction modifies a callee-saved register. +static bool doesModifyCalleeSavedReg(const MachineInstr *MI, const TargetRegisterInfo *TRI) { - for (const MCPhysReg *CSR = - TRI->getCalleeSavedRegs(MI->getParent()->getParent()); - *CSR; ++CSR) { - unsigned CalleeSavedReg = *CSR; - if (MI->modifiesRegister(CalleeSavedReg, TRI)) + const MachineFunction &MF = *MI->getParent()->getParent(); + for (auto *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) + if (MI->modifiesRegister(*CSR, TRI)) return true; - } return false; } -// Returns true if an instruction can be promoted to .new predicate -// or new-value store. -bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - return isCondInst(MI) || QII->mayBeNewStore(MI); +// TODO: MI->isIndirectBranch() and IsRegisterJump(MI) +// Returns true if an instruction can be promoted to .new predicate or +// new-value store. +bool HexagonPacketizerList::isNewifiable(const MachineInstr* MI) { + return HII->isCondInst(MI) || MI->isReturn() || HII->mayBeNewStore(MI); } -bool HexagonPacketizerList::isCondInst (MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - const MCInstrDesc& TID = MI->getDesc(); - // bug 5670: until that is fixed, - // this portion is disabled. - if ( TID.isConditionalBranch() // && !IsRegisterJump(MI)) || - || QII->isConditionalTransfer(MI) - || QII->isConditionalALU32(MI) - || QII->isConditionalLoad(MI) - || QII->isConditionalStore(MI)) { - return true; +// Promote an instructiont to its .cur form. +// At this time, we have already made a call to canPromoteToDotCur and made +// sure that it can *indeed* be promoted. 
+bool HexagonPacketizerList::promoteToDotCur(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + assert(DepType == SDep::Data); + int CurOpcode = HII->getDotCurOp(MI); + MI->setDesc(HII->get(CurOpcode)); + return true; +} + +void HexagonPacketizerList::cleanUpDotCur() { + MachineInstr *MI = NULL; + for (auto BI : CurrentPacketMIs) { + DEBUG(dbgs() << "Cleanup packet has "; BI->dump();); + if (BI->getOpcode() == Hexagon::V6_vL32b_cur_ai) { + MI = BI; + continue; + } + if (MI) { + for (auto &MO : BI->operands()) + if (MO.isReg() && MO.getReg() == MI->getOperand(0).getReg()) + return; + } } - return false; + if (!MI) + return; + // We did not find a use of the CUR, so de-cur it. + MI->setDesc(HII->get(Hexagon::V6_vL32b_ai)); + DEBUG(dbgs() << "Demoted CUR "; MI->dump();); } +// Check to see if an instruction can be dot cur. +bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass *RC) { + if (!HII->isV60VectorInstruction(MI)) + return false; + if (!HII->isV60VectorInstruction(MII)) + return false; -// Promote an instructiont to its .new form. -// At this time, we have already made a call to CanPromoteToDotNew -// and made sure that it can *indeed* be promoted. -bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI, - SDep::Kind DepType, MachineBasicBlock::iterator &MII, - const TargetRegisterClass* RC) { + // Already a dot new instruction. + if (HII->isDotCurInst(MI) && !HII->mayBeCurLoad(MI)) + return false; - assert (DepType == SDep::Data); - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + if (!HII->mayBeCurLoad(MI)) + return false; + + // The "cur value" cannot come from inline asm. + if (PacketSU->getInstr()->isInlineAsm()) + return false; + + // Make sure candidate instruction uses cur. + DEBUG(dbgs() << "Can we DOT Cur Vector MI\n"; + MI->dump(); + dbgs() << "in packet\n";); + MachineInstr *MJ = MII; + DEBUG(dbgs() << "Checking CUR against "; MJ->dump();); + unsigned DestReg = MI->getOperand(0).getReg(); + bool FoundMatch = false; + for (auto &MO : MJ->operands()) + if (MO.isReg() && MO.getReg() == DestReg) + FoundMatch = true; + if (!FoundMatch) + return false; + + // Check for existing uses of a vector register within the packet which + // would be affected by converting a vector load into .cur formt. + for (auto BI : CurrentPacketMIs) { + DEBUG(dbgs() << "packet has "; BI->dump();); + if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo())) + return false; + } + + DEBUG(dbgs() << "Can Dot CUR MI\n"; MI->dump();); + // We can convert the opcode into a .cur. + return true; +} +// Promote an instruction to its .new form. At this time, we have already +// made a call to canPromoteToDotNew and made sure that it can *indeed* be +// promoted. 
+bool HexagonPacketizerList::promoteToDotNew(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + assert (DepType == SDep::Data); int NewOpcode; if (RC == &Hexagon::PredRegsRegClass) - NewOpcode = QII->GetDotNewPredOp(MI, MBPI); + NewOpcode = HII->getDotNewPredOp(MI, MBPI); else - NewOpcode = QII->GetDotNewOp(MI); - MI->setDesc(QII->get(NewOpcode)); - + NewOpcode = HII->getDotNewOp(MI); + MI->setDesc(HII->get(NewOpcode)); return true; } -bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - int NewOpcode = QII->GetDotOldOp(MI->getOpcode()); - MI->setDesc(QII->get(NewOpcode)); +bool HexagonPacketizerList::demoteToDotOld(MachineInstr* MI) { + int NewOpcode = HII->getDotOldOp(MI->getOpcode()); + MI->setDesc(HII->get(NewOpcode)); return true; } @@ -455,175 +436,173 @@ enum PredicateKind { /// Returns true if an instruction is predicated on p0 and false if it's /// predicated on !p0. -static PredicateKind getPredicateSense(MachineInstr* MI, - const HexagonInstrInfo *QII) { - if (!QII->isPredicated(MI)) +static PredicateKind getPredicateSense(const MachineInstr *MI, + const HexagonInstrInfo *HII) { + if (!HII->isPredicated(MI)) return PK_Unknown; - - if (QII->isPredicatedTrue(MI)) + if (HII->isPredicatedTrue(MI)) return PK_True; - return PK_False; } -static MachineOperand& GetPostIncrementOperand(MachineInstr *MI, - const HexagonInstrInfo *QII) { - assert(QII->isPostIncrement(MI) && "Not a post increment operation."); +static const MachineOperand &getPostIncrementOperand(const MachineInstr *MI, + const HexagonInstrInfo *HII) { + assert(HII->isPostIncrement(MI) && "Not a post increment operation."); #ifndef NDEBUG // Post Increment means duplicates. Use dense map to find duplicates in the // list. Caution: Densemap initializes with the minimum of 64 buckets, // whereas there are at most 5 operands in the post increment. - DenseMap<unsigned, unsigned> DefRegsSet; - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).isDef()) { - DefRegsSet[MI->getOperand(opNum).getReg()] = 1; - } - - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).isUse()) { - if (DefRegsSet[MI->getOperand(opNum).getReg()]) { - return MI->getOperand(opNum); - } - } + DenseSet<unsigned> DefRegsSet; + for (auto &MO : MI->operands()) + if (MO.isReg() && MO.isDef()) + DefRegsSet.insert(MO.getReg()); + + for (auto &MO : MI->operands()) + if (MO.isReg() && MO.isUse() && DefRegsSet.count(MO.getReg())) + return MO; #else - if (MI->getDesc().mayLoad()) { + if (MI->mayLoad()) { + const MachineOperand &Op1 = MI->getOperand(1); // The 2nd operand is always the post increment operand in load. - assert(MI->getOperand(1).isReg() && - "Post increment operand has be to a register."); - return (MI->getOperand(1)); + assert(Op1.isReg() && "Post increment operand has be to a register."); + return Op1; } if (MI->getDesc().mayStore()) { + const MachineOperand &Op0 = MI->getOperand(0); // The 1st operand is always the post increment operand in store. - assert(MI->getOperand(0).isReg() && - "Post increment operand has be to a register."); - return (MI->getOperand(0)); + assert(Op0.isReg() && "Post increment operand has be to a register."); + return Op0; } #endif // we should never come here. 
llvm_unreachable("mayLoad or mayStore not set for Post Increment operation"); } -// get the value being stored -static MachineOperand& GetStoreValueOperand(MachineInstr *MI) { +// Get the value being stored. +static const MachineOperand& getStoreValueOperand(const MachineInstr *MI) { // value being stored is always the last operand. - return (MI->getOperand(MI->getNumOperands()-1)); + return MI->getOperand(MI->getNumOperands()-1); +} + +static bool isLoadAbsSet(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::L4_loadrd_ap: + case Hexagon::L4_loadrb_ap: + case Hexagon::L4_loadrh_ap: + case Hexagon::L4_loadrub_ap: + case Hexagon::L4_loadruh_ap: + case Hexagon::L4_loadri_ap: + return true; + } + return false; } -// can be new value store? +static const MachineOperand &getAbsSetOperand(const MachineInstr *MI) { + assert(isLoadAbsSet(MI)); + return MI->getOperand(1); +} + + +// Can be new value store? // Following restrictions are to be respected in convert a store into // a new value store. // 1. If an instruction uses auto-increment, its address register cannot // be a new-value register. Arch Spec 5.4.2.1 -// 2. If an instruction uses absolute-set addressing mode, -// its address register cannot be a new-value register. -// Arch Spec 5.4.2.1.TODO: This is not enabled as -// as absolute-set address mode patters are not implemented. +// 2. If an instruction uses absolute-set addressing mode, its address +// register cannot be a new-value register. Arch Spec 5.4.2.1. // 3. If an instruction produces a 64-bit result, its registers cannot be used // as new-value registers. Arch Spec 5.4.2.2. -// 4. If the instruction that sets a new-value register is conditional, then +// 4. If the instruction that sets the new-value register is conditional, then // the instruction that uses the new-value register must also be conditional, // and both must always have their predicates evaluate identically. // Arch Spec 5.4.2.3. -// 5. There is an implied restriction of a packet can not have another store, -// if there is a new value store in the packet. Corollary, if there is +// 5. There is an implied restriction that a packet cannot have another store, +// if there is a new value store in the packet. Corollary: if there is // already a store in a packet, there can not be a new value store. // Arch Spec: 3.4.4.2 -bool HexagonPacketizerList::CanPromoteToNewValueStore( - MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr *MI, + const MachineInstr *PacketMI, unsigned DepReg) { // Make sure we are looking at the store, that can be promoted. - if (!QII->mayBeNewStore(MI)) + if (!HII->mayBeNewStore(MI)) return false; - // Make sure there is dependency and can be new value'ed - if (GetStoreValueOperand(MI).isReg() && - GetStoreValueOperand(MI).getReg() != DepReg) + // Make sure there is dependency and can be new value'd. 
+ const MachineOperand &Val = getStoreValueOperand(MI); + if (Val.isReg() && Val.getReg() != DepReg) return false; - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); const MCInstrDesc& MCID = PacketMI->getDesc(); - // first operand is always the result - - const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF); - - // if there is already an store in the packet, no can do new value store - // Arch Spec 3.4.4.2. - for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), - VE = CurrentPacketMIs.end(); - (VI != VE); ++VI) { - SUnit *PacketSU = MIToSUnit.find(*VI)->second; - if (PacketSU->getInstr()->getDesc().mayStore() || - // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME, - // then we don't need this - PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe || - PacketSU->getInstr()->getOpcode() == Hexagon::L2_deallocframe) - return false; - } - if (PacketRC == &Hexagon::DoubleRegsRegClass) { - // new value store constraint: double regs can not feed into new value store - // arch spec section: 5.4.2.2 + // First operand is always the result. + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI, MF); + // Double regs can not feed into new value store: PRM section: 5.4.2.2. + if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; + + // New-value stores are of class NV (slot 0), dual stores require class ST + // in slot 0 (PRM 5.5). + for (auto I : CurrentPacketMIs) { + SUnit *PacketSU = MIToSUnit.find(I)->second; + if (PacketSU->getInstr()->mayStore()) + return false; } // Make sure it's NOT the post increment register that we are going to // new value. - if (QII->isPostIncrement(MI) && - MI->getDesc().mayStore() && - GetPostIncrementOperand(MI, QII).getReg() == DepReg) { + if (HII->isPostIncrement(MI) && + getPostIncrementOperand(MI, HII).getReg() == DepReg) { return false; } - if (QII->isPostIncrement(PacketMI) && - PacketMI->getDesc().mayLoad() && - GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) { - // if source is post_inc, or absolute-set addressing, - // it can not feed into new value store - // r3 = memw(r2++#4) - // memw(r30 + #-1404) = r2.new -> can not be new value store - // arch spec section: 5.4.2.1 + if (HII->isPostIncrement(PacketMI) && PacketMI->mayLoad() && + getPostIncrementOperand(PacketMI, HII).getReg() == DepReg) { + // If source is post_inc, or absolute-set addressing, it can not feed + // into new value store + // r3 = memw(r2++#4) + // memw(r30 + #-1404) = r2.new -> can not be new value store + // arch spec section: 5.4.2.1. return false; } + if (isLoadAbsSet(PacketMI) && getAbsSetOperand(PacketMI).getReg() == DepReg) + return false; + // If the source that feeds the store is predicated, new value store must // also be predicated. - if (QII->isPredicated(PacketMI)) { - if (!QII->isPredicated(MI)) + if (HII->isPredicated(PacketMI)) { + if (!HII->isPredicated(MI)) return false; // Check to make sure that they both will have their predicates - // evaluate identically + // evaluate identically. 
unsigned predRegNumSrc = 0; unsigned predRegNumDst = 0; const TargetRegisterClass* predRegClass = nullptr; - // Get predicate register used in the source instruction - for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { - if ( PacketMI->getOperand(opNum).isReg()) - predRegNumSrc = PacketMI->getOperand(opNum).getReg(); - predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc); - if (predRegClass == &Hexagon::PredRegsRegClass) { + // Get predicate register used in the source instruction. + for (auto &MO : PacketMI->operands()) { + if (!MO.isReg()) + continue; + predRegNumSrc = MO.getReg(); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc); + if (predRegClass == &Hexagon::PredRegsRegClass) break; - } } - assert ((predRegClass == &Hexagon::PredRegsRegClass ) && - ("predicate register not found in a predicated PacketMI instruction")); - - // Get predicate register used in new-value store instruction - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { - if ( MI->getOperand(opNum).isReg()) - predRegNumDst = MI->getOperand(opNum).getReg(); - predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst); - if (predRegClass == &Hexagon::PredRegsRegClass) { + assert((predRegClass == &Hexagon::PredRegsRegClass) && + "predicate register not found in a predicated PacketMI instruction"); + + // Get predicate register used in new-value store instruction. + for (auto &MO : MI->operands()) { + if (!MO.isReg()) + continue; + predRegNumDst = MO.getReg(); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst); + if (predRegClass == &Hexagon::PredRegsRegClass) break; - } } - assert ((predRegClass == &Hexagon::PredRegsRegClass ) && - ("predicate register not found in a predicated MI instruction")); + assert((predRegClass == &Hexagon::PredRegsRegClass) && + "predicate register not found in a predicated MI instruction"); // New-value register producer and user (store) need to satisfy these // constraints: @@ -632,13 +611,11 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( // should also be .new predicated and if producer is not .new predicated // then store should not be .new predicated. // 3) Both new-value register producer and user should have same predicate - // sense, i.e, either both should be negated or both should be none negated. - - if (( predRegNumDst != predRegNumSrc) || - QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI) || - getPredicateSense(MI, QII) != getPredicateSense(PacketMI, QII)) { + // sense, i.e, either both should be negated or both should be non-negated. + if (predRegNumDst != predRegNumSrc || + HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) || + getPredicateSense(MI, HII) != getPredicateSense(PacketMI, HII)) return false; - } } // Make sure that other than the new-value register no other store instruction @@ -649,81 +626,77 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( // including PacketMI. Howerver, we need to perform the check for the // remaining instructions in the packet. - std::vector<MachineInstr*>::iterator VI; - std::vector<MachineInstr*>::iterator VE; unsigned StartCheck = 0; - for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); - (VI != VE); ++VI) { - SUnit *TempSU = MIToSUnit.find(*VI)->second; + for (auto I : CurrentPacketMIs) { + SUnit *TempSU = MIToSUnit.find(I)->second; MachineInstr* TempMI = TempSU->getInstr(); // Following condition is true for all the instructions until PacketMI is // reached (StartCheck is set to 0 before the for loop). 
// StartCheck flag is 1 for all the instructions after PacketMI. - if (TempMI != PacketMI && !StartCheck) // start processing only after - continue; // encountering PacketMI + if (TempMI != PacketMI && !StartCheck) // Start processing only after + continue; // encountering PacketMI. StartCheck = 1; - if (TempMI == PacketMI) // We don't want to check PacketMI for dependence + if (TempMI == PacketMI) // We don't want to check PacketMI for dependence. continue; - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { - if (MI->getOperand(opNum).isReg() && - TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(), - QRI)) + for (auto &MO : MI->operands()) + if (MO.isReg() && TempSU->getInstr()->modifiesRegister(MO.getReg(), HRI)) return false; - } } // Make sure that for non-POST_INC stores: // 1. The only use of reg is DepReg and no other registers. // This handles V4 base+index registers. // The following store can not be dot new. - // Eg. r0 = add(r0, #3)a + // Eg. r0 = add(r0, #3) // memw(r1+r0<<#2) = r0 - if (!QII->isPostIncrement(MI) && - GetStoreValueOperand(MI).isReg() && - GetStoreValueOperand(MI).getReg() == DepReg) { - for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).getReg() == DepReg) { - return false; - } - } - // 2. If data definition is because of implicit definition of the register, - // do not newify the store. Eg. - // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> - // STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343] - for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { - if (PacketMI->getOperand(opNum).isReg() && - PacketMI->getOperand(opNum).getReg() == DepReg && - PacketMI->getOperand(opNum).isDef() && - PacketMI->getOperand(opNum).isImplicit()) { + if (!HII->isPostIncrement(MI)) { + for (unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.isReg() && MO.getReg() == DepReg) return false; - } } } + // If data definition is because of implicit definition of the register, + // do not newify the store. Eg. + // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> + // S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343] + for (auto &MO : PacketMI->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isImplicit()) + continue; + unsigned R = MO.getReg(); + if (R == DepReg || HRI->isSuperRegister(DepReg, R)) + return false; + } + + // Handle imp-use of super reg case. There is a target independent side + // change that should prevent this situation but I am handling it for + // just-in-case. For example, we cannot newify R2 in the following case: + // %R3<def> = A2_tfrsi 0; + // S2_storeri_io %R0<kill>, 0, %R2<kill>, %D1<imp-use,kill>; + for (auto &MO : MI->operands()) { + if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == DepReg) + return false; + } + // Can be dot new store. return true; } -// can this MI to promoted to either -// new value store or new value jump -bool HexagonPacketizerList::CanPromoteToNewValue( - MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - if (!QII->mayBeNewStore(MI)) +// Can this MI to promoted to either new value store or new value jump. 
+bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, + MachineBasicBlock::iterator &MII) { + if (!HII->mayBeNewStore(MI)) return false; - MachineInstr *PacketMI = PacketSU->getInstr(); - // Check to see the store can be new value'ed. - if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit)) + MachineInstr *PacketMI = PacketSU->getInstr(); + if (canPromoteToNewValueStore(MI, PacketMI, DepReg)) return true; // Check to see the compare/jump can be new value'ed. @@ -731,93 +704,110 @@ bool HexagonPacketizerList::CanPromoteToNewValue( return false; } +static bool isImplicitDependency(const MachineInstr *I, unsigned DepReg) { + for (auto &MO : I->operands()) + if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit()) + return true; + return false; +} + // Check to see if an instruction can be dot new // There are three kinds. // 1. dot new on predicate - V2/V3/V4 // 2. dot new on stores NV/ST - V4 // 3. dot new on jump NV/J - V4 -- This is generated in a pass. -bool HexagonPacketizerList::CanPromoteToDotNew( - MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII, const TargetRegisterClass *RC) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { // Already a dot new instruction. - if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI)) + if (HII->isDotNewInst(MI) && !HII->mayBeNewStore(MI)) return false; if (!isNewifiable(MI)) return false; + const MachineInstr *PI = PacketSU->getInstr(); + + // The "new value" cannot come from inline asm. + if (PI->isInlineAsm()) + return false; + + // IMPLICIT_DEFs won't materialize as real instructions, so .new makes no + // sense. + if (PI->isImplicitDef()) + return false; + + // If dependency is trough an implicitly defined register, we should not + // newify the use. + if (isImplicitDependency(PI, DepReg)) + return false; + + const MCInstrDesc& MCID = PI->getDesc(); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI, MF); + if (DisableVecDblNVStores && VecRC == &Hexagon::VecDblRegsRegClass) + return false; + // predicate .new - if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI)) - return true; - else if (RC != &Hexagon::PredRegsRegClass && - !QII->mayBeNewStore(MI)) // MI is not a new-value store + // bug 5670: until that is fixed + // TODO: MI->isIndirectBranch() and IsRegisterJump(MI) + if (RC == &Hexagon::PredRegsRegClass) + if (HII->isCondInst(MI) || MI->isReturn()) + return HII->predCanBeUsedAsDotNew(PI, DepReg); + + if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI)) + return false; + + // Create a dot new machine instruction to see if resources can be + // allocated. If not, bail out now. + int NewOpcode = HII->getDotNewOp(MI); + const MCInstrDesc &D = HII->get(NewOpcode); + MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc()); + bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI); + MF.DeleteMachineInstr(NewMI); + if (!ResourcesAvailable) + return false; + + // New Value Store only. New Value Jump generated as a separate pass. + if (!canPromoteToNewValue(MI, PacketSU, DepReg, MII)) return false; - else { - // Create a dot new machine instruction to see if resources can be - // allocated. If not, bail out now. 
- int NewOpcode = QII->GetDotNewOp(MI); - const MCInstrDesc &desc = QII->get(NewOpcode); - DebugLoc dl; - MachineInstr *NewMI = - MI->getParent()->getParent()->CreateMachineInstr(desc, dl); - bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI); - MI->getParent()->getParent()->DeleteMachineInstr(NewMI); - - if (!ResourcesAvailable) - return false; - // new value store only - // new new value jump generated as a passes - if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) { - return false; - } - } return true; } -// Go through the packet instructions and search for anti dependency -// between them and DepReg from MI -// Consider this case: +// Go through the packet instructions and search for an anti dependency between +// them and DepReg from MI. Consider this case: // Trying to add // a) %R1<def> = TFRI_cdNotPt %P3, 2 // to this packet: // { -// b) %P0<def> = OR_pp %P3<kill>, %P0<kill> -// c) %P3<def> = TFR_PdRs %R23 -// d) %R1<def> = TFRI_cdnPt %P3, 4 +// b) %P0<def> = C2_or %P3<kill>, %P0<kill> +// c) %P3<def> = C2_tfrrp %R23 +// d) %R1<def> = C2_cmovenewit %P3, 4 // } // The P3 from a) and d) will be complements after // a)'s P3 is converted to .new form -// Anti Dep between c) and b) is irrelevant for this case -bool HexagonPacketizerList::RestrictingDepExistInPacket( - MachineInstr *MI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +// Anti-dep between c) and b) is irrelevant for this case +bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr* MI, + unsigned DepReg) { SUnit *PacketSUDep = MIToSUnit.find(MI)->second; - for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(), - VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) { - + for (auto I : CurrentPacketMIs) { // We only care for dependencies to predicated instructions - if(!QII->isPredicated(*VIN)) continue; + if (!HII->isPredicated(I)) + continue; // Scheduling Unit for current insn in the packet - SUnit *PacketSU = MIToSUnit.find(*VIN)->second; + SUnit *PacketSU = MIToSUnit.find(I)->second; - // Look at dependencies between current members of the packet - // and predicate defining instruction MI. - // Make sure that dependency is on the exact register - // we care about. + // Look at dependencies between current members of the packet and + // predicate defining instruction MI. Make sure that dependency is + // on the exact register we care about. if (PacketSU->isSucc(PacketSUDep)) { for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) { - if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) && - (PacketSU->Succs[i].getKind() == SDep::Anti) && - (PacketSU->Succs[i].getReg() == DepReg)) { + auto &Dep = PacketSU->Succs[i]; + if (Dep.getSUnit() == PacketSUDep && Dep.getKind() == SDep::Anti && + Dep.getReg() == DepReg) return true; - } } } } @@ -831,276 +821,362 @@ static unsigned getPredicatedRegister(MachineInstr *MI, const HexagonInstrInfo *QII) { /// We use the following rule: The first predicate register that is a use is /// the predicate register of a predicated instruction. 
- assert(QII->isPredicated(MI) && "Must be predicated instruction"); - for (MachineInstr::mop_iterator OI = MI->operands_begin(), - OE = MI->operands_end(); OI != OE; ++OI) { - MachineOperand &Op = *OI; + for (auto &Op : MI->operands()) { if (Op.isReg() && Op.getReg() && Op.isUse() && Hexagon::PredRegsRegClass.contains(Op.getReg())) return Op.getReg(); } llvm_unreachable("Unknown instruction operand layout"); - return 0; } // Given two predicated instructions, this function detects whether -// the predicates are complements -bool HexagonPacketizerList::ArePredicatesComplements( - MachineInstr *MI1, MachineInstr *MI2, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - +// the predicates are complements. +bool HexagonPacketizerList::arePredicatesComplements(MachineInstr *MI1, + MachineInstr *MI2) { // If we don't know the predicate sense of the instructions bail out early, we // need it later. - if (getPredicateSense(MI1, QII) == PK_Unknown || - getPredicateSense(MI2, QII) == PK_Unknown) + if (getPredicateSense(MI1, HII) == PK_Unknown || + getPredicateSense(MI2, HII) == PK_Unknown) return false; - // Scheduling unit for candidate - SUnit *SU = MIToSUnit.find(MI1)->second; + // Scheduling unit for candidate. + SUnit *SU = MIToSUnit[MI1]; // One corner case deals with the following scenario: // Trying to add - // a) %R24<def> = TFR_cPt %P0, %R25 + // a) %R24<def> = A2_tfrt %P0, %R25 // to this packet: - // // { - // b) %R25<def> = TFR_cNotPt %P0, %R24 - // c) %P0<def> = CMPEQri %R26, 1 + // b) %R25<def> = A2_tfrf %P0, %R24 + // c) %P0<def> = C2_cmpeqi %R26, 1 // } // - // On general check a) and b) are complements, but - // presence of c) will convert a) to .new form, and - // then it is not a complement - // We attempt to detect it by analyzing existing - // dependencies in the packet + // On general check a) and b) are complements, but presence of c) will + // convert a) to .new form, and then it is not a complement. + // We attempt to detect it by analyzing existing dependencies in the packet. // Analyze relationships between all existing members of the packet. - // Look for Anti dependecy on the same predicate reg - // as used in the candidate - for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(), - VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) { - - // Scheduling Unit for current insn in the packet - SUnit *PacketSU = MIToSUnit.find(*VIN)->second; + // Look for Anti dependecy on the same predicate reg as used in the + // candidate. + for (auto I : CurrentPacketMIs) { + // Scheduling Unit for current insn in the packet. + SUnit *PacketSU = MIToSUnit.find(I)->second; // If this instruction in the packet is succeeded by the candidate... if (PacketSU->isSucc(SU)) { for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) { - // The corner case exist when there is true data - // dependency between candidate and one of current - // packet members, this dep is on predicate reg, and - // there already exist anti dep on the same pred in + auto Dep = PacketSU->Succs[i]; + // The corner case exist when there is true data dependency between + // candidate and one of current packet members, this dep is on + // predicate reg, and there already exist anti dep on the same pred in // the packet. 
- if (PacketSU->Succs[i].getSUnit() == SU && - PacketSU->Succs[i].getKind() == SDep::Data && - Hexagon::PredRegsRegClass.contains( - PacketSU->Succs[i].getReg()) && - // Here I know that *VIN is predicate setting instruction - // with true data dep to candidate on the register - // we care about - c) in the above example. - // Now I need to see if there is an anti dependency - // from c) to any other instruction in the - // same packet on the pred reg of interest - RestrictingDepExistInPacket(*VIN,PacketSU->Succs[i].getReg(), - MIToSUnit)) { - return false; + if (Dep.getSUnit() == SU && Dep.getKind() == SDep::Data && + Hexagon::PredRegsRegClass.contains(Dep.getReg())) { + // Here I know that I is predicate setting instruction with true + // data dep to candidate on the register we care about - c) in the + // above example. Now I need to see if there is an anti dependency + // from c) to any other instruction in the same packet on the pred + // reg of interest. + if (restrictingDepExistInPacket(I, Dep.getReg())) + return false; } } } } - // If the above case does not apply, check regular - // complement condition. - // Check that the predicate register is the same and - // that the predicate sense is different - // We also need to differentiate .old vs. .new: - // !p0 is not complimentary to p0.new - unsigned PReg1 = getPredicatedRegister(MI1, QII); - unsigned PReg2 = getPredicatedRegister(MI2, QII); - return ((PReg1 == PReg2) && - Hexagon::PredRegsRegClass.contains(PReg1) && - Hexagon::PredRegsRegClass.contains(PReg2) && - (getPredicateSense(MI1, QII) != getPredicateSense(MI2, QII)) && - (QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2))); + // If the above case does not apply, check regular complement condition. + // Check that the predicate register is the same and that the predicate + // sense is different We also need to differentiate .old vs. .new: !p0 + // is not complementary to p0.new. + unsigned PReg1 = getPredicatedRegister(MI1, HII); + unsigned PReg2 = getPredicatedRegister(MI2, HII); + return PReg1 == PReg2 && + Hexagon::PredRegsRegClass.contains(PReg1) && + Hexagon::PredRegsRegClass.contains(PReg2) && + getPredicateSense(MI1, HII) != getPredicateSense(MI2, HII) && + HII->isDotNewInst(MI1) == HII->isDotNewInst(MI2); } -// initPacketizerState - Initialize packetizer flags +// Initialize packetizer flags. void HexagonPacketizerList::initPacketizerState() { - Dependence = false; PromotedToDotNew = false; GlueToNewValueJump = false; GlueAllocframeStore = false; FoundSequentialDependence = false; - - return; } -// ignorePseudoInstruction - Ignore bundling of pseudo instructions. -bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) { +// Ignore bundling of pseudo instructions. +bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock*) { if (MI->isDebugValue()) return true; if (MI->isCFIInstruction()) return false; - // We must print out inline assembly + // We must print out inline assembly. if (MI->isInlineAsm()) return false; - // We check if MI has any functional units mapped to it. - // If it doesn't, we ignore the instruction. + if (MI->isImplicitDef()) + return false; + + // We check if MI has any functional units mapped to it. If it doesn't, + // we ignore the instruction. 
const MCInstrDesc& TID = MI->getDesc(); - unsigned SchedClass = TID.getSchedClass(); - const InstrStage* IS = - ResourceTracker->getInstrItins()->beginStage(SchedClass); + auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass()); unsigned FuncUnits = IS->getUnits(); return !FuncUnits; } -// isSoloInstruction: - Returns true for instructions that must be -// scheduled in their own packet. -bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) { +bool HexagonPacketizerList::isSoloInstruction(const MachineInstr *MI) { if (MI->isEHLabel() || MI->isCFIInstruction()) return true; - if (MI->isInlineAsm()) + // Consider inline asm to not be a solo instruction by default. + // Inline asm will be put in a packet temporarily, but then it will be + // removed, and placed outside of the packet (before or after, depending + // on dependencies). This is to reduce the impact of inline asm as a + // "packet splitting" instruction. + if (MI->isInlineAsm() && !ScheduleInlineAsm) return true; // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints: // trap, pause, barrier, icinva, isync, and syncht are solo instructions. // They must not be grouped with other instructions in a packet. - if (IsSchedBarrier(MI)) + if (isSchedBarrier(MI)) + return true; + + if (HII->isSolo(MI)) + return true; + + if (MI->getOpcode() == Hexagon::A2_nop) return true; return false; } -// isLegalToPacketizeTogether: -// SUI is the current instruction that is out side of the current packet. -// SUJ is the current instruction inside the current packet against which that -// SUI will be packetized. -bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { - MachineInstr *I = SUI->getInstr(); - MachineInstr *J = SUJ->getInstr(); - assert(I && J && "Unable to packetize null instruction!"); - const MCInstrDesc &MCIDI = I->getDesc(); - const MCInstrDesc &MCIDJ = J->getDesc(); +// Quick check if instructions MI and MJ cannot coexist in the same packet. +// Limit the tests to be "one-way", e.g. "if MI->isBranch and MJ->isInlineAsm", +// but not the symmetric case: "if MJ->isBranch and MI->isInlineAsm". +// For full test call this function twice: +// cannotCoexistAsymm(MI, MJ) || cannotCoexistAsymm(MJ, MI) +// Doing the test only one way saves the amount of code in this function, +// since every test would need to be repeated with the MI and MJ reversed. +static bool cannotCoexistAsymm(const MachineInstr *MI, const MachineInstr *MJ, + const HexagonInstrInfo &HII) { + const MachineFunction *MF = MI->getParent()->getParent(); + if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() && + HII.isHVXMemWithAIndirect(MI, MJ)) + return true; - MachineBasicBlock::iterator II = I; + // An inline asm cannot be together with a branch, because we may not be + // able to remove the asm out after packetizing (i.e. if the asm must be + // moved past the bundle). Similarly, two asms cannot be together to avoid + // complications when determining their relative order outside of a bundle. + if (MI->isInlineAsm()) + return MJ->isInlineAsm() || MJ->isBranch() || MJ->isBarrier() || + MJ->isCall() || MJ->isTerminator(); - const unsigned FrameSize = MF.getFrameInfo()->getStackSize(); - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + // "False" really means that the quick check failed to determine if + // I and J cannot coexist. 
+ return false; +} - // Inline asm cannot go in the packet. - if (I->getOpcode() == Hexagon::INLINEASM) - llvm_unreachable("Should not meet inline asm here!"); - if (isSoloInstruction(I)) - llvm_unreachable("Should not meet solo instr here!"); +// Full, symmetric check. +bool HexagonPacketizerList::cannotCoexist(const MachineInstr *MI, + const MachineInstr *MJ) { + return cannotCoexistAsymm(MI, MJ, *HII) || cannotCoexistAsymm(MJ, MI, *HII); +} - // A save callee-save register function call can only be in a packet - // with instructions that don't write to the callee-save registers. - if ((QII->isSaveCalleeSavedRegsCall(I) && - DoesModifyCalleeSavedReg(J, QRI)) || - (QII->isSaveCalleeSavedRegsCall(J) && - DoesModifyCalleeSavedReg(I, QRI))) { - Dependence = true; - return false; +void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) { + for (auto &B : MF) { + MachineBasicBlock::iterator BundleIt; + MachineBasicBlock::instr_iterator NextI; + for (auto I = B.instr_begin(), E = B.instr_end(); I != E; I = NextI) { + NextI = std::next(I); + MachineInstr *MI = &*I; + if (MI->isBundle()) + BundleIt = I; + if (!MI->isInsideBundle()) + continue; + + // Decide on where to insert the instruction that we are pulling out. + // Debug instructions always go before the bundle, but the placement of + // INLINE_ASM depends on potential dependencies. By default, try to + // put it before the bundle, but if the asm writes to a register that + // other instructions in the bundle read, then we need to place it + // after the bundle (to preserve the bundle semantics). + bool InsertBeforeBundle; + if (MI->isInlineAsm()) + InsertBeforeBundle = !hasWriteToReadDep(MI, BundleIt, HRI); + else if (MI->isDebugValue()) + InsertBeforeBundle = true; + else + continue; + + BundleIt = moveInstrOut(MI, BundleIt, InsertBeforeBundle); + } } +} - // Two control flow instructions cannot go in the same packet. - if (IsControlFlow(I) && IsControlFlow(J)) { - Dependence = true; - return false; +// Check if a given instruction is of class "system". +static bool isSystemInstr(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::Y2_barrier: + case Hexagon::Y2_dcfetchbo: + return true; } + return false; +} - // A LoopN instruction cannot appear in the same packet as a jump or call. - if (IsLoopN(I) && - (IsDirectJump(J) || MCIDJ.isCall() || QII->isDeallocRet(J))) { - Dependence = true; +bool HexagonPacketizerList::hasDeadDependence(const MachineInstr *I, + const MachineInstr *J) { + // The dependence graph may not include edges between dead definitions, + // so without extra checks, we could end up packetizing two instruction + // defining the same (dead) register. 
+ if (I->isCall() || J->isCall()) return false; - } - if (IsLoopN(J) && - (IsDirectJump(I) || MCIDI.isCall() || QII->isDeallocRet(I))) { - Dependence = true; + if (HII->isPredicated(I) || HII->isPredicated(J)) return false; + + BitVector DeadDefs(Hexagon::NUM_TARGET_REGS); + for (auto &MO : I->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isDead()) + continue; + DeadDefs[MO.getReg()] = true; } + for (auto &MO : J->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isDead()) + continue; + unsigned R = MO.getReg(); + if (R != Hexagon::USR_OVF && DeadDefs[R]) + return true; + } + return false; +} + +bool HexagonPacketizerList::hasControlDependence(const MachineInstr *I, + const MachineInstr *J) { + // A save callee-save register function call can only be in a packet + // with instructions that don't write to the callee-save registers. + if ((HII->isSaveCalleeSavedRegsCall(I) && + doesModifyCalleeSavedReg(J, HRI)) || + (HII->isSaveCalleeSavedRegsCall(J) && + doesModifyCalleeSavedReg(I, HRI))) + return true; + + // Two control flow instructions cannot go in the same packet. + if (isControlFlow(I) && isControlFlow(J)) + return true; + + // \ref-manual (7.3.4) A loop setup packet in loopN or spNloop0 cannot + // contain a speculative indirect jump, + // a new-value compare jump or a dealloc_return. + auto isBadForLoopN = [this] (const MachineInstr *MI) -> bool { + if (MI->isCall() || HII->isDeallocRet(MI) || HII->isNewValueJump(MI)) + return true; + if (HII->isPredicated(MI) && HII->isPredicatedNew(MI) && HII->isJumpR(MI)) + return true; + return false; + }; + + if (HII->isLoopN(I) && isBadForLoopN(J)) + return true; + if (HII->isLoopN(J) && isBadForLoopN(I)) + return true; + // dealloc_return cannot appear in the same packet as a conditional or // unconditional jump. - if (QII->isDeallocRet(I) && - (MCIDJ.isBranch() || MCIDJ.isCall() || MCIDJ.isBarrier())) { - Dependence = true; - return false; + return HII->isDeallocRet(I) && + (J->isBranch() || J->isCall() || J->isBarrier()); +} + +bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr *I, + const MachineInstr *J) { + bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J); + bool StoreI = I->mayStore(), StoreJ = J->mayStore(); + if ((SysI && StoreJ) || (SysJ && StoreI)) + return true; + + if (StoreI && StoreJ) { + if (HII->isNewValueInst(J) || HII->isMemOp(J) || HII->isMemOp(I)) + return true; + } else { + // A memop cannot be in the same packet with another memop or a store. + // Two stores can be together, but here I and J cannot both be stores. + bool MopStI = HII->isMemOp(I) || StoreI; + bool MopStJ = HII->isMemOp(J) || StoreJ; + if (MopStI && MopStJ) + return true; } + return (StoreJ && HII->isDeallocRet(I)) || (StoreI && HII->isDeallocRet(J)); +} - // V4 allows dual store. But does not allow second store, if the - // first store is not in SLOT0. New value store, new value jump, - // dealloc_return and memop always take SLOT0. - // Arch spec 3.4.4.2 - if (MCIDI.mayStore() && MCIDJ.mayStore() && - (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) { - Dependence = true; +// SUI is the current instruction that is out side of the current packet. +// SUJ is the current instruction inside the current packet against which that +// SUI will be packetized. 
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");
+
+  // Clear IgnoreDepMIs when Packet starts.
+  if (CurrentPacketMIs.size() == 1)
+    IgnoreDepMIs.clear();
+
+  MachineBasicBlock::iterator II = I;
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+  // Solo instructions cannot go in the packet.
+  assert(!isSoloInstruction(I) && "Unexpected solo instr!");
+
+  if (cannotCoexist(I, J))
     return false;
-  }

-  if ((QII->isMemOp(J) && MCIDI.mayStore())
-      || (MCIDJ.mayStore() && QII->isMemOp(I))
-      || (QII->isMemOp(J) && QII->isMemOp(I))) {
-    Dependence = true;
+  Dependence = hasDeadDependence(I, J) || hasControlDependence(I, J);
+  if (Dependence)
     return false;
-  }

-  //if dealloc_return
-  if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
-    Dependence = true;
+  // V4 allows dual stores. It does not allow a second store if the first
+  // store is not in SLOT0. New value store, new value jump, dealloc_return
+  // and memop always take SLOT0. Arch spec 3.4.4.2.
+  Dependence = hasV4SpecificDependence(I, J);
+  if (Dependence)
     return false;
-  }

   // If an instruction feeds a new-value jump, glue it.
   MachineBasicBlock::iterator NextMII = I;
   ++NextMII;
-  if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) {
+  if (NextMII != I->getParent()->end() && HII->isNewValueJump(NextMII)) {
     MachineInstr *NextMI = NextMII;

     bool secondRegMatch = false;
-    bool maintainNewValueJump = false;
+    const MachineOperand &NOp0 = NextMI->getOperand(0);
+    const MachineOperand &NOp1 = NextMI->getOperand(1);

-    if (NextMI->getOperand(1).isReg() &&
-        I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+    if (NOp1.isReg() && I->getOperand(0).getReg() == NOp1.getReg())
       secondRegMatch = true;
-      maintainNewValueJump = true;
-    }
-
-    if (!secondRegMatch &&
-          I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
-      maintainNewValueJump = true;
-    }

-    for (std::vector<MachineInstr*>::iterator
-          VI = CurrentPacketMIs.begin(),
-           VE = CurrentPacketMIs.end();
-         (VI != VE && maintainNewValueJump); ++VI) {
-      SUnit *PacketSU = MIToSUnit.find(*VI)->second;
-
-      // NVJ can not be part of the dual jump - Arch Spec: section 7.8
-      if (PacketSU->getInstr()->getDesc().isCall()) {
+    for (auto I : CurrentPacketMIs) {
+      SUnit *PacketSU = MIToSUnit.find(I)->second;
+      MachineInstr *PI = PacketSU->getInstr();
+      // NVJ cannot be part of the dual jump - Arch Spec: section 7.8.
+      if (PI->isCall()) {
         Dependence = true;
         break;
       }
-      // Validate
+      // Validate:
       // 1. Packet does not have a store in it.
       // 2. If the first operand of the nvj is newified, and the second
       //    operand is also a reg, it (second reg) is not defined in
@@ -1108,302 +1184,413 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       // 3. If the second operand of the nvj is newified, (which means
       //    first operand is also a reg), first reg is not defined in
       //    the same packet.
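Rules #1-#3 above amount to one linear scan over the current packet. A
compact model of that scan, with hypothetical flags standing in for the
MachineInstr queries (store/allocframe detection and register defs):

    #include <vector>

    struct PacketMI { bool MayStore; bool IsAllocframe; unsigned DefReg; };

    // StillLiveReg is the nvj operand that must not be defined in-packet
    // (rule #2 or #3, depending on which operand was newified).
    bool canGlueNewValueJump(const std::vector<PacketMI> &Packet,
                             unsigned StillLiveReg) {
      for (const PacketMI &P : Packet) {
        if (P.MayStore || P.IsAllocframe)   // rule #1
          return false;
        if (P.DefReg == StillLiveReg)       // rules #2/#3
          return false;
      }
      return true;
    }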
-      if (PacketSU->getInstr()->getDesc().mayStore() ||
-          PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe ||
-          // Check #2.
-          (!secondRegMatch && NextMI->getOperand(1).isReg() &&
-           PacketSU->getInstr()->modifiesRegister(
-                               NextMI->getOperand(1).getReg(), QRI)) ||
-          // Check #3.
-          (secondRegMatch &&
-           PacketSU->getInstr()->modifiesRegister(
-                               NextMI->getOperand(0).getReg(), QRI))) {
+      if (PI->getOpcode() == Hexagon::S2_allocframe || PI->mayStore() ||
+          HII->isLoopN(PI)) {
+        Dependence = true;
+        break;
+      }
+      // Check #2/#3.
+      const MachineOperand &OpR = secondRegMatch ? NOp0 : NOp1;
+      if (OpR.isReg() && PI->modifiesRegister(OpR.getReg(), HRI)) {
         Dependence = true;
         break;
       }
     }
-    if (!Dependence)
-      GlueToNewValueJump = true;
-    else
+
+    if (Dependence)
       return false;
+    GlueToNewValueJump = true;
   }

-  if (SUJ->isSucc(SUI)) {
-    for (unsigned i = 0;
-         (i < SUJ->Succs.size()) && !FoundSequentialDependence;
-         ++i) {
+  // There is no dependency between a prolog instruction and its successor.
+  if (!SUJ->isSucc(SUI))
+    return true;

-      if (SUJ->Succs[i].getSUnit() != SUI) {
-        continue;
-      }
+  for (unsigned i = 0; i < SUJ->Succs.size(); ++i) {
+    if (FoundSequentialDependence)
+      break;

-      SDep::Kind DepType = SUJ->Succs[i].getKind();
+    if (SUJ->Succs[i].getSUnit() != SUI)
+      continue;

-      // For direct calls:
-      // Ignore register dependences for call instructions for
-      // packetization purposes except for those due to r31 and
-      // predicate registers.
-      //
-      // For indirect calls:
-      // Same as direct calls + check for true dependences to the register
-      // used in the indirect call.
-      //
-      // We completely ignore Order dependences for call instructions
-      //
-      // For returns:
-      // Ignore register dependences for return instructions like jumpr,
-      // dealloc return unless we have dependencies on the explicit uses
-      // of the registers used by jumpr (like r31) or dealloc return
-      // (like r29 or r30).
-      //
-      // TODO: Currently, jumpr is handling only return of r31. So, the
-      // following logic (specificaly IsCallDependent) is working fine.
-      // We need to enable jumpr for register other than r31 and then,
-      // we need to rework the last part, where it handles indirect call
-      // of that (IsCallDependent) function. Bug 6216 is opened for this.
-      //
-      unsigned DepReg = 0;
-      const TargetRegisterClass* RC = nullptr;
-      if (DepType == SDep::Data) {
-        DepReg = SUJ->Succs[i].getReg();
-        RC = QRI->getMinimalPhysRegClass(DepReg);
-      }

-      if ((MCIDI.isCall() || MCIDI.isReturn()) &&
-          (!IsRegDependence(DepType) ||
-            !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
-        /* do nothing */
-      }
+    SDep::Kind DepType = SUJ->Succs[i].getKind();
+    // For direct calls:
+    // Ignore register dependences for call instructions for packetization
+    // purposes except for those due to r31 and predicate registers.
+    //
+    // For indirect calls:
+    // Same as direct calls + check for true dependences to the register
+    // used in the indirect call.
+    //
+    // We completely ignore Order dependences for call instructions.
+    //
+    // For returns:
+    // Ignore register dependences for return instructions like jumpr,
+    // dealloc return unless we have dependencies on the explicit uses
+    // of the registers used by jumpr (like r31) or dealloc return
+    // (like r29 or r30).
+    //
+    // TODO: Currently, jumpr handles only the return of r31. So, the
+    // following logic (specifically isCallDependent) is working fine.
+    // We need to enable jumpr for registers other than r31 and then,
+    // we need to rework the last part, where it handles the indirect call
+    // of that (isCallDependent) function. Bug 6216 is opened for this.
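As a rough model of the filter just described: for calls and returns only
register dependences matter, and of those only the ones on the link
register or a predicate register. The register ids below are placeholders,
not Hexagon's actual register numbering:

    enum DepKind { Data, Anti, Output, Order };

    bool callDepMatters(DepKind K, unsigned Reg) {
      if (K == Order)                  // order deps on calls are ignored
        return false;
      const unsigned LinkReg = 31;     // r31 (placeholder id)
      const unsigned FirstPred = 60;   // p0..p3 (placeholder ids)
      const unsigned LastPred = 63;
      return Reg == LinkReg || (Reg >= FirstPred && Reg <= LastPred);
    }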
+    unsigned DepReg = 0;
+    const TargetRegisterClass *RC = nullptr;
+    if (DepType == SDep::Data) {
+      DepReg = SUJ->Succs[i].getReg();
+      RC = HRI->getMinimalPhysRegClass(DepReg);
+    }

-      // For instructions that can be promoted to dot-new, try to promote.
-      else if ((DepType == SDep::Data) &&
-               CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
-               PromoteToDotNew(I, DepType, II, RC)) {
-        PromotedToDotNew = true;
-        /* do nothing */
-      }
+    if (I->isCall() || I->isReturn()) {
+      if (!isRegDependence(DepType))
+        continue;
+      if (!isCallDependent(I, DepType, SUJ->Succs[i].getReg()))
+        continue;
+    }

-      else if ((DepType == SDep::Data) &&
-               (QII->isNewValueJump(I))) {
-        /* do nothing */
-      }
+    if (DepType == SDep::Data) {
+      if (canPromoteToDotCur(J, SUJ, DepReg, II, RC))
+        if (promoteToDotCur(J, DepType, II, RC))
+          continue;
+    }

-      // For predicated instructions, if the predicates are complements
-      // then there can be no dependence.
-      else if (QII->isPredicated(I) &&
-               QII->isPredicated(J) &&
-               ArePredicatesComplements(I, J, MIToSUnit)) {
-        /* do nothing */
+    // Data dependence is fine if we have a load.cur.
+    if (DepType == SDep::Data && HII->isDotCurInst(J)) {
+      if (HII->isV60VectorInstruction(I))
+        continue;
+    }

+    // For instructions that can be promoted to dot-new, try to promote.
+    if (DepType == SDep::Data) {
+      if (canPromoteToDotNew(I, SUJ, DepReg, II, RC)) {
+        if (promoteToDotNew(I, DepType, II, RC)) {
+          PromotedToDotNew = true;
+          continue;
+        }
       }
-      else if (IsDirectJump(I) &&
-               !MCIDJ.isBranch() &&
-               !MCIDJ.isCall() &&
-               (DepType == SDep::Order)) {
-        // Ignore Order dependences between unconditional direct branches
-        // and non-control-flow instructions
-        /* do nothing */
-      }
-      else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
-               (DepType != SDep::Output)) {
-        // Ignore all dependences for jumps except for true and output
-        // dependences
-        /* do nothing */
-      }
-
-      // Ignore output dependences due to superregs. We can
-      // write to two different subregisters of R1:0 for instance
-      // in the same cycle
-      //
+      if (HII->isNewValueJump(I))
+        continue;
+    }

+    // For predicated instructions, if the predicates are complements then
+    // there can be no dependence.
+    if (HII->isPredicated(I) && HII->isPredicated(J) &&
+        arePredicatesComplements(I, J)) {
+      // Not always safe to do this translation.
+      // DAG Builder attempts to reduce dependence edges using transitive
+      // nature of dependencies. Here is an example:
       //
-      // Let the
-      // If neither I nor J defines DepReg, then this is a
-      // superfluous output dependence. The dependence must be of the
-      // form:
-      //  R0 = ...
-      //  R1 = ...
-      // and there is an output dependence between the two instructions
-      // with
-      // DepReg = D0
-      // We want to ignore these dependences.
-      // Ideally, the dependence constructor should annotate such
-      // dependences. We can then avoid this relatively expensive check.
+      //   r0 = tfr_pt ... (1)
+      //   r0 = tfr_pf ... (2)
+      //   r0 = tfr_pt ... (3)
       //
-      else if (DepType == SDep::Output) {
-        // DepReg is the register that's responsible for the dependence.
-        unsigned DepReg = SUJ->Succs[i].getReg();
+      // There will be an output dependence between (1)->(2) and (2)->(3).
+      // However, there is no dependence edge between (1)->(3). This results
+      // in all 3 instructions going in the same packet. We ignore the
+      // dependence only once to avoid this situation.
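The ignore-once policy is what the IgnoreDepMIs list implements in the code
that follows: the first waiver records I, and finding J already on the list
forces a real dependence. A minimal sketch of just that bookkeeping, with
opaque pointers standing in for MachineInstr:

    #include <algorithm>
    #include <vector>

    struct WaiverList {
      std::vector<const void *> IgnoreDepMIs;

      // Returns true if the complement-predicate waiver may be applied to
      // the pair (I, J); false means the dependence must be kept.
      bool waiveOnce(const void *I, const void *J) {
        if (std::find(IgnoreDepMIs.begin(), IgnoreDepMIs.end(), J) !=
            IgnoreDepMIs.end())
          return false;
        IgnoreDepMIs.push_back(I);
        return true;
      }
    };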
+      auto Itr = std::find(IgnoreDepMIs.begin(), IgnoreDepMIs.end(), J);
+      if (Itr != IgnoreDepMIs.end()) {
+        Dependence = true;
+        return false;
+      }
+      IgnoreDepMIs.push_back(I);
+      continue;
+    }
+
+    // Ignore Order dependences between unconditional direct branches
+    // and non-control-flow instructions.
+    if (isDirectJump(I) && !J->isBranch() && !J->isCall() &&
+        DepType == SDep::Order)
+      continue;
+
+    // Ignore all dependences for jumps except for true and output
+    // dependences.
+    if (I->isConditionalBranch() && DepType != SDep::Data &&
+        DepType != SDep::Output)
+      continue;
+
+    // Ignore output dependences due to superregs. We can write to two
+    // different subregisters of R1:0 for instance in the same cycle.
+
+    // If neither I nor J defines DepReg, then this is a superfluous output
+    // dependence. The dependence must be of the form:
+    //   R0 = ...
+    //   R1 = ...
+    // and there is an output dependence between the two instructions with
+    //   DepReg = D0.
+    // We want to ignore these dependences. Ideally, the dependence
+    // constructor should annotate such dependences. We can then avoid this
+    // relatively expensive check.
+    //
+    if (DepType == SDep::Output) {
+      // DepReg is the register that's responsible for the dependence.
+      unsigned DepReg = SUJ->Succs[i].getReg();
+
+      // Check if I and J really define DepReg.
+      if (!I->definesRegister(DepReg) && !J->definesRegister(DepReg))
+        continue;
+      FoundSequentialDependence = true;
+      break;
+    }

-      // Check if I and J really defines DepReg.
-      if (I->definesRegister(DepReg) ||
-          J->definesRegister(DepReg)) {
+    // For Order dependences:
+    // 1. On V4 or later, volatile loads/stores can be packetized together,
+    //    unless other rules prevent it.
+    // 2. Store followed by a load is not allowed.
+    // 3. Store followed by a store is only valid on V4 or later.
+    // 4. Load followed by any memory operation is allowed.
+    if (DepType == SDep::Order) {
+      if (!PacketizeVolatiles) {
+        bool OrdRefs = I->hasOrderedMemoryRef() || J->hasOrderedMemoryRef();
+        if (OrdRefs) {
          FoundSequentialDependence = true;
          break;
        }
      }
-
-      // We ignore Order dependences for
-      // 1. Two loads unless they are volatile.
-      // 2. Two stores in V4 unless they are volatile.
-      else if ((DepType == SDep::Order) &&
-               !I->hasOrderedMemoryRef() &&
-               !J->hasOrderedMemoryRef()) {
-        if (MCIDI.mayStore() && MCIDJ.mayStore()) {
-          /* do nothing */
-        }
-        // store followed by store-- not OK on V2
-        // store followed by load -- not OK on all (OK if addresses
-        // are not aliased)
-        // load followed by store -- OK on all
-        // load followed by load  -- OK on all
-        else if ( !MCIDJ.mayStore()) {
-          /* do nothing */
-        }
-        else {
+      // J is first, I is second.
+      bool LoadJ = J->mayLoad(), StoreJ = J->mayStore();
+      bool LoadI = I->mayLoad(), StoreI = I->mayStore();
+      if (StoreJ) {
+        // Two stores are only allowed on V4+. Load following store is never
+        // allowed.
+        if (LoadI) {
          FoundSequentialDependence = true;
          break;
        }
-      }
-
-      // For V4, special case ALLOCFRAME. Even though there is dependency
-      // between ALLOCFRAME and subsequent store, allow it to be
-      // packetized in a same packet. This implies that the store is using
-      // caller's SP. Hence, offset needs to be updated accordingly.
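The load/store ordering rules earlier in this hunk collapse into a small
predicate over the (J, I) pair, J being the instruction already in the
packet. A sketch of that table, assuming volatile/ordered references were
already rejected as shown above:

    // Returns true if the order dependence must be respected.
    bool orderDepBlocks(bool LoadJ, bool StoreJ, bool LoadI, bool StoreI) {
      if (StoreJ)
        return LoadI;              // store -> load is never allowed
      if (!LoadJ)
        return true;               // J neither load nor store: assume dep
      return !LoadI && !StoreI;    // load -> non-memory op: assume dep
    }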
-      else if (DepType == SDep::Data
-               && J->getOpcode() == Hexagon::S2_allocframe
-               && (I->getOpcode() == Hexagon::S2_storerd_io
-                   || I->getOpcode() == Hexagon::S2_storeri_io
-                   || I->getOpcode() == Hexagon::S2_storerb_io)
-               && I->getOperand(0).getReg() == QRI->getStackRegister()
-               && QII->isValidOffset(I->getOpcode(),
-                                     I->getOperand(1).getImm() -
-                                     (FrameSize + HEXAGON_LRFP_SIZE)))
-      {
-        GlueAllocframeStore = true;
-        // Since this store is to be glued with allocframe in the same
-        // packet, it will use SP of the previous stack frame, i.e
-        // caller's SP. Therefore, we need to recalculate offset according
-        // to this change.
-        I->getOperand(1).setImm(I->getOperand(1).getImm() -
-                                (FrameSize + HEXAGON_LRFP_SIZE));
-      }
-
-      //
-      // Skip over anti-dependences. Two instructions that are
-      // anti-dependent can share a packet
-      //
-      else if (DepType != SDep::Anti) {
+      } else if (!LoadJ || (!LoadI && !StoreI)) {
+        // If J is neither load nor store, assume a dependency.
+        // If J is a load, but I is neither, also assume a dependency.
        FoundSequentialDependence = true;
        break;
      }
+      // Store followed by store: not OK on V2.
+      // Store followed by load: not OK on all.
+      // Load followed by store: OK on all.
+      // Load followed by load: OK on all.
+      continue;
    }

-    if (FoundSequentialDependence) {
-      Dependence = true;
-      return false;
+    // For V4, special case ALLOCFRAME. Even though there is dependency
+    // between ALLOCFRAME and subsequent store, allow it to be packetized
+    // in the same packet. This implies that the store is using the caller's
+    // SP. Hence, offset needs to be updated accordingly.
+    if (DepType == SDep::Data && J->getOpcode() == Hexagon::S2_allocframe) {
+      unsigned Opc = I->getOpcode();
+      switch (Opc) {
+      case Hexagon::S2_storerd_io:
+      case Hexagon::S2_storeri_io:
+      case Hexagon::S2_storerh_io:
+      case Hexagon::S2_storerb_io:
+        if (I->getOperand(0).getReg() == HRI->getStackRegister()) {
+          int64_t Imm = I->getOperand(1).getImm();
+          int64_t NewOff = Imm - (FrameSize + HEXAGON_LRFP_SIZE);
+          if (HII->isValidOffset(Opc, NewOff)) {
+            GlueAllocframeStore = true;
+            // Since this store is to be glued with allocframe in the same
+            // packet, it will use SP of the previous stack frame, i.e.
+            // caller's SP. Therefore, we need to recalculate offset
+            // according to this change.
+            I->getOperand(1).setImm(NewOff);
+            continue;
+          }
+        }
+      default:
+        break;
+      }
+    }
+
+    // Skip over anti-dependences. Two instructions that are anti-dependent
+    // can share a packet.
+    if (DepType != SDep::Anti) {
+      FoundSequentialDependence = true;
+      break;
    }
  }

+  if (FoundSequentialDependence) {
+    Dependence = true;
+    return false;
+  }
+
  return true;
}

-// isLegalToPruneDependencies
 bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI,
                                                        SUnit *SUJ) {
   MachineInstr *I = SUI->getInstr();
-  assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
-
-  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
-
-  if (Dependence) {
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");

-    // Check if the instruction was promoted to a dot-new. If so, demote it
-    // back into a dot-old.
-    if (PromotedToDotNew) {
-      DemoteToDotOld(I);
-    }
+  if (cannotCoexist(I, J))
+    return false;

-    // Check if the instruction (must be a store) was glued with an Allocframe
-    // instruction. If so, restore its offset to its original value, i.e. use
-    // curent SP instead of caller's SP.
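The gluing rewrite above and the restore path being reworked here are exact
inverses: a store glued with allocframe executes against the caller's SP, so
its offset is rebased by the new frame size plus the saved LR/FP pair, and
rebased back if the gluing is abandoned. A minimal sketch of that
arithmetic (LRFPSize stands in for HEXAGON_LRFP_SIZE, assumed to be 8
here, one 4-byte LR plus one 4-byte FP):

    #include <cstdint>

    constexpr int64_t LRFPSize = 8;

    int64_t glueToAllocframe(int64_t Off, int64_t FrameSize) {
      return Off - (FrameSize + LRFPSize);   // now relative to caller's SP
    }

    int64_t unglueFromAllocframe(int64_t Off, int64_t FrameSize) {
      return Off + (FrameSize + LRFPSize);   // back to callee-frame offset
    }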
- if (GlueAllocframeStore) { - I->getOperand(1).setImm(I->getOperand(1).getImm() + - FrameSize + HEXAGON_LRFP_SIZE); - } + if (!Dependence) + return true; - return false; + // Check if the instruction was promoted to a dot-new. If so, demote it + // back into a dot-old. + if (PromotedToDotNew) + demoteToDotOld(I); + + cleanUpDotCur(); + // Check if the instruction (must be a store) was glued with an allocframe + // instruction. If so, restore its offset to its original value, i.e. use + // current SP instead of caller's SP. + if (GlueAllocframeStore) { + unsigned FrameSize = MF.getFrameInfo()->getStackSize(); + MachineOperand &MOff = I->getOperand(1); + MOff.setImm(MOff.getImm() + FrameSize + HEXAGON_LRFP_SIZE); } - return true; + return false; } + MachineBasicBlock::iterator HexagonPacketizerList::addToPacket(MachineInstr *MI) { + MachineBasicBlock::iterator MII = MI; + MachineBasicBlock *MBB = MI->getParent(); + if (MI->isImplicitDef()) { + unsigned R = MI->getOperand(0).getReg(); + if (Hexagon::IntRegsRegClass.contains(R)) { + MCSuperRegIterator S(R, HRI, false); + MI->addOperand(MachineOperand::CreateReg(*S, true, true)); + } + return MII; + } + assert(ResourceTracker->canReserveResources(MI)); + + bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI); + bool Good = true; + + if (GlueToNewValueJump) { + MachineInstr *NvjMI = ++MII; + // We need to put both instructions in the same packet: MI and NvjMI. + // Either of them can require a constant extender. Try to add both to + // the current packet, and if that fails, end the packet and start a + // new one. + ResourceTracker->reserveResources(MI); + if (ExtMI) + Good = tryAllocateResourcesForConstExt(true); + + bool ExtNvjMI = HII->isExtended(NvjMI) || HII->isConstExtended(NvjMI); + if (Good) { + if (ResourceTracker->canReserveResources(NvjMI)) + ResourceTracker->reserveResources(NvjMI); + else + Good = false; + } + if (Good && ExtNvjMI) + Good = tryAllocateResourcesForConstExt(true); - MachineBasicBlock::iterator MII = MI; - MachineBasicBlock *MBB = MI->getParent(); - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - - if (GlueToNewValueJump) { - - ++MII; - MachineInstr *nvjMI = MII; + if (!Good) { + endPacket(MBB, MI); assert(ResourceTracker->canReserveResources(MI)); ResourceTracker->reserveResources(MI); - if ((QII->isExtended(MI) || QII->isConstExtended(MI)) && - !tryAllocateResourcesForConstExt(MI)) { - endPacket(MBB, MI); - ResourceTracker->reserveResources(MI); - assert(canReserveResourcesForConstExt(MI) && - "Ensure that there is a slot"); - reserveResourcesForConstExt(MI); - // Reserve resources for new value jump constant extender. - assert(canReserveResourcesForConstExt(MI) && - "Ensure that there is a slot"); - reserveResourcesForConstExt(nvjMI); - assert(ResourceTracker->canReserveResources(nvjMI) && - "Ensure that there is a slot"); - - } else if ( // Extended instruction takes two slots in the packet. - // Try reserve and allocate 4-byte in the current packet first. - (QII->isExtended(nvjMI) - && (!tryAllocateResourcesForConstExt(nvjMI) - || !ResourceTracker->canReserveResources(nvjMI))) - || // For non-extended instruction, no need to allocate extra 4 bytes. - (!QII->isExtended(nvjMI) && - !ResourceTracker->canReserveResources(nvjMI))) - { - endPacket(MBB, MI); - // A new and empty packet starts. - // We are sure that the resources requirements can be satisfied. - // Therefore, do not need to call "canReserveResources" anymore. 
- ResourceTracker->reserveResources(MI); - if (QII->isExtended(nvjMI)) - reserveResourcesForConstExt(nvjMI); + if (ExtMI) { + assert(canReserveResourcesForConstExt()); + tryAllocateResourcesForConstExt(true); } - // Here, we are sure that "reserveResources" would succeed. - ResourceTracker->reserveResources(nvjMI); - CurrentPacketMIs.push_back(MI); - CurrentPacketMIs.push_back(nvjMI); - } else { - if ( (QII->isExtended(MI) || QII->isConstExtended(MI)) - && ( !tryAllocateResourcesForConstExt(MI) - || !ResourceTracker->canReserveResources(MI))) - { - endPacket(MBB, MI); - // Check if the instruction was promoted to a dot-new. If so, demote it - // back into a dot-old - if (PromotedToDotNew) { - DemoteToDotOld(MI); - } - reserveResourcesForConstExt(MI); + assert(ResourceTracker->canReserveResources(NvjMI)); + ResourceTracker->reserveResources(NvjMI); + if (ExtNvjMI) { + assert(canReserveResourcesForConstExt()); + reserveResourcesForConstExt(); } - // In case that "MI" is not an extended insn, - // the resource availability has already been checked. - ResourceTracker->reserveResources(MI); - CurrentPacketMIs.push_back(MI); } + CurrentPacketMIs.push_back(MI); + CurrentPacketMIs.push_back(NvjMI); return MII; + } + + ResourceTracker->reserveResources(MI); + if (ExtMI && !tryAllocateResourcesForConstExt(true)) { + endPacket(MBB, MI); + if (PromotedToDotNew) + demoteToDotOld(MI); + ResourceTracker->reserveResources(MI); + reserveResourcesForConstExt(); + } + + CurrentPacketMIs.push_back(MI); + return MII; +} + +void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB, + MachineInstr *MI) { + OldPacketMIs = CurrentPacketMIs; + VLIWPacketizerList::endPacket(MBB, MI); +} + +bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr *MI) { + return !producesStall(MI); +} + + +// Return true when ConsMI uses a register defined by ProdMI. +static bool isDependent(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) { + if (!ProdMI->getOperand(0).isReg()) + return false; + unsigned DstReg = ProdMI->getOperand(0).getReg(); + + for (auto &Op : ConsMI->operands()) + if (Op.isReg() && Op.isUse() && Op.getReg() == DstReg) + // The MIs depend on each other. + return true; + + return false; } +// V60 forward scheduling. +bool HexagonPacketizerList::producesStall(const MachineInstr *I) { + // Check whether the previous packet is in a different loop. If this is the + // case, there is little point in trying to avoid a stall because that would + // favor the rare case (loop entry) over the common case (loop iteration). + // + // TODO: We should really be able to check all the incoming edges if this is + // the first packet in a basic block, so we can avoid stalls from the loop + // backedge. + if (!OldPacketMIs.empty()) { + auto *OldBB = OldPacketMIs.front()->getParent(); + auto *ThisBB = I->getParent(); + if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB)) + return false; + } + + // Check for stall between two vector instructions. + if (HII->isV60VectorInstruction(I)) { + for (auto J : OldPacketMIs) { + if (!HII->isV60VectorInstruction(J)) + continue; + if (isDependent(J, I) && !HII->isVecUsableNextPacket(J, I)) + return true; + } + return false; + } + + // Check for stall between two scalar instructions. First, check that + // there is no definition of a use in the current packet, because it + // may be a candidate for .new. 
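That ordering is the point of the scalar half of producesStall(): a def
inside the current packet suppresses the stall report, since the use may
still become a .new form, and only then is the previous packet consulted.
A compact model of the loop below, with caller-supplied predicates standing
in for the HexagonInstrInfo queries:

    #include <vector>

    template <typename Instr, typename DepFn, typename BundleFn>
    bool scalarStall(const Instr &I, const std::vector<Instr> &CurPacket,
                     const std::vector<Instr> &OldPacket, DepFn isDependent,
                     BundleFn canExecuteInBundle) {
      for (const Instr &J : CurPacket)
        if (isDependent(J, I))
          return false;          // potential .new use: no stall reported
      for (const Instr &J : OldPacket)
        if (isDependent(J, I) && !canExecuteInBundle(J, I))
          return true;           // cross-packet feed that would stall
      return false;
    }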
+  for (auto J : CurrentPacketMIs)
+    if (!HII->isV60VectorInstruction(J) && isDependent(J, I))
+      return false;
+
+  // Check for stall between I and instructions in the previous packet.
+  if (MF.getSubtarget<HexagonSubtarget>().useBSBScheduling()) {
+    for (auto J : OldPacketMIs) {
+      if (HII->isV60VectorInstruction(J))
+        continue;
+      if (!HII->isLateInstrFeedsEarlyInstr(J, I))
+        continue;
+      if (isDependent(J, I) && !HII->canExecuteInBundle(J, I))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+
 //===----------------------------------------------------------------------===//
 //                         Public Constructor Functions
 //===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
new file mode 100644
index 0000000..960cf6c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -0,0 +1,114 @@
+#ifndef HEXAGONVLIWPACKETIZER_H
+#define HEXAGONVLIWPACKETIZER_H
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+namespace llvm {
+class HexagonPacketizerList : public VLIWPacketizerList {
+  // Vector of instructions assigned to the packet that has just been created.
+  std::vector<MachineInstr*> OldPacketMIs;
+
+  // Has the instruction been promoted to a dot-new instruction.
+  bool PromotedToDotNew;
+
+  // Has the instruction been glued to allocframe.
+  bool GlueAllocframeStore;
+
+  // Has the feeder instruction been glued to new value jump.
+  bool GlueToNewValueJump;
+
+  // Check if there is a dependence between some instruction already in this
+  // packet and this instruction.
+  bool Dependence;
+
+  // Only check for dependence if there are resources available to
+  // schedule this instruction.
+  bool FoundSequentialDependence;
+
+  // Track MIs with ignored dependence.
+  std::vector<MachineInstr*> IgnoreDepMIs;
+
+protected:
+  /// \brief A handle to the branch probability pass.
+  const MachineBranchProbabilityInfo *MBPI;
+  const MachineLoopInfo *MLI;
+
+private:
+  const HexagonInstrInfo *HII;
+  const HexagonRegisterInfo *HRI;
+
+public:
+  // Ctor.
+  HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                        AliasAnalysis *AA,
+                        const MachineBranchProbabilityInfo *MBPI);
+
+  // initPacketizerState - initialize some internal flags.
+  void initPacketizerState() override;
+
+  // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+  bool ignorePseudoInstruction(const MachineInstr *MI,
+                               const MachineBasicBlock *MBB) override;
+
+  // isSoloInstruction - return true if instruction MI cannot be packetized
+  // with any other instruction, which means that MI itself is a packet.
+  bool isSoloInstruction(const MachineInstr *MI) override;
+
+  // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+  // together.
+  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
+
+  // isLegalToPruneDependencies - Is it legal to prune dependence between SUI
+  // and SUJ.
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override; + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override; + void endPacket(MachineBasicBlock *MBB, MachineInstr *MI) override; + bool shouldAddToPacket(const MachineInstr *MI) override; + + void unpacketizeSoloInstrs(MachineFunction &MF); + +protected: + bool isCallDependent(const MachineInstr* MI, SDep::Kind DepType, + unsigned DepReg); + bool promoteToDotCur(MachineInstr* MI, SDep::Kind DepType, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToDotCur(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + void cleanUpDotCur(); + + bool promoteToDotNew(MachineInstr* MI, SDep::Kind DepType, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToDotNew(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToNewValue(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII); + bool canPromoteToNewValueStore(const MachineInstr* MI, + const MachineInstr* PacketMI, unsigned DepReg); + bool demoteToDotOld(MachineInstr* MI); + bool arePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2); + bool restrictingDepExistInPacket(MachineInstr*, unsigned); + bool isNewifiable(const MachineInstr *MI); + bool isCurifiable(MachineInstr* MI); + bool cannotCoexist(const MachineInstr *MI, const MachineInstr *MJ); + inline bool isPromotedToDotNew() const { + return PromotedToDotNew; + } + bool tryAllocateResourcesForConstExt(bool Reserve); + bool canReserveResourcesForConstExt(); + void reserveResourcesForConstExt(); + bool hasDeadDependence(const MachineInstr *I, const MachineInstr *J); + bool hasControlDependence(const MachineInstr *I, const MachineInstr *J); + bool hasV4SpecificDependence(const MachineInstr *I, const MachineInstr *J); + bool producesStall(const MachineInstr *MI); +}; +} // namespace llvm +#endif // HEXAGONVLIWPACKETIZER_H + diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 99ea2fa..b73af82 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -13,7 +13,9 @@ #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInstrInfo.h" @@ -33,14 +35,28 @@ class HexagonAsmBackend : public MCAsmBackend { mutable uint64_t relaxedCnt; std::unique_ptr <MCInstrInfo> MCII; std::unique_ptr <MCInst *> RelaxTarget; + MCInst * Extender; public: HexagonAsmBackend(Target const &T, uint8_t OSABI, StringRef CPU) : - OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *){} + OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *), + Extender(nullptr) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createHexagonELFObjectWriter(OS, OSABI, CPU); } + void setExtender(MCContext &Context) const { + if (Extender == nullptr) + const_cast<HexagonAsmBackend *>(this)->Extender = new (Context) MCInst; + } + + MCInst *takeExtender() 
const {
+    assert(Extender != nullptr);
+    MCInst * Result = Extender;
+    const_cast<HexagonAsmBackend *>(this)->Extender = nullptr;
+    return Result;
+  }
+
   unsigned getNumFixupKinds() const override {
     return Hexagon::NumTargetFixupKinds;
   }
@@ -222,6 +238,7 @@ public:
         if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
           ++relaxedCnt;
           *RelaxTarget = &MCI;
+          setExtender(Layout.getAssembler().getContext());
           return true;
         } else {
           return false;
@@ -262,6 +279,7 @@ public:
           if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
             ++relaxedCnt;
             *RelaxTarget = &MCI;
+            setExtender(Layout.getAssembler().getContext());
             return true;
           }
         }
@@ -276,9 +294,35 @@ public:
     llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
   }

-  void relaxInstruction(MCInst const & /*Inst*/,
-                        MCInst & /*Res*/) const override {
-    llvm_unreachable("relaxInstruction() unimplemented");
+  void relaxInstruction(MCInst const & Inst,
+                        MCInst & Res) const override {
+    assert(HexagonMCInstrInfo::isBundle(Inst) &&
+           "Hexagon relaxInstruction only works on bundles");
+
+    Res = HexagonMCInstrInfo::createBundle();
+    // Copy the results into the bundle.
+    bool Update = false;
+    for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+      MCInst &CrntHMI = const_cast<MCInst &>(*I.getInst());
+
+      // If an immediate extender is needed, add it in.
+      if (*RelaxTarget == &CrntHMI) {
+        Update = true;
+        assert((HexagonMCInstrInfo::bundleSize(Res) < HEXAGON_PACKET_SIZE) &&
+               "No room to insert extender for relaxation");
+
+        MCInst *HMIx = takeExtender();
+        *HMIx = HexagonMCInstrInfo::deriveExtender(
+            *MCII, CrntHMI,
+            HexagonMCInstrInfo::getExtendableOperand(*MCII, CrntHMI));
+        Res.addOperand(MCOperand::createInst(HMIx));
+        *RelaxTarget = nullptr;
+      }
+      // Now copy over the original instruction (the one we may have
+      // extended).
+      Res.addOperand(MCOperand::createInst(I.getInst()));
+    }
+    (void)Update;
+    assert(Update && "Didn't find relaxation target");
   }

   bool writeNopData(uint64_t Count,
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index f4d162c..47a6f86 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -44,6 +44,25 @@ namespace HexagonII {
     TypeMEMOP = 9,
     TypeNV = 10,
     TypeDUPLEX = 11,
+    TypeCOMPOUND = 12,
+    TypeCVI_FIRST = 13,
+    TypeCVI_VA = TypeCVI_FIRST,
+    TypeCVI_VA_DV = 14,
+    TypeCVI_VX = 15,
+    TypeCVI_VX_DV = 16,
+    TypeCVI_VP = 17,
+    TypeCVI_VP_VS = 18,
+    TypeCVI_VS = 19,
+    TypeCVI_VINLANESAT= 20,
+    TypeCVI_VM_LD = 21,
+    TypeCVI_VM_TMP_LD = 22,
+    TypeCVI_VM_CUR_LD = 23,
+    TypeCVI_VM_VP_LDU = 24,
+    TypeCVI_VM_ST = 25,
+    TypeCVI_VM_NEW_ST = 26,
+    TypeCVI_VM_STU = 27,
+    TypeCVI_HIST = 28,
+    TypeCVI_LAST = TypeCVI_HIST,
     TypePREFIX = 30, // Such as extenders.
     TypeENDLOOP = 31 // Such as end of a HW loop.
   };
@@ -71,12 +90,16 @@ namespace HexagonII {
     PostInc = 6 // Post increment addressing mode
   };

+  // MemAccessSize is represented as 1+log2(N) where N is the size in bytes.
   enum class MemAccessSize {
     NoMemAccess = 0, // Not a memory access instruction.
     ByteAccess = 1, // Byte access instruction (memb).
     HalfWordAccess = 2, // Half word access instruction (memh).
     WordAccess = 3, // Word access instruction (memw).
-    DoubleWordAccess = 4 // Double word access instruction (memd)
+    DoubleWordAccess = 4, // Double word access instruction (memd)
+    // 5, // We do not have a 16 byte vector access.
+    Vector64Access = 7, // 64 Byte vector access instruction (vmem).
+    Vector128Access = 8 // 128 Byte vector access instruction (vmem).
   };

   // MCInstrDesc TSFlags
@@ -156,7 +179,7 @@ namespace HexagonII {
     AddrModeMask = 0x7,
     // Access size for load/store instructions.
     MemAccessSizePos = 43,
-    MemAccesSizeMask = 0x7,
+    MemAccesSizeMask = 0xf,

     // Branch predicted taken.
     TakenPos = 47,
@@ -164,7 +187,23 @@ namespace HexagonII {

     // Floating-point instructions.
     FPPos = 48,
-    FPMask = 0x1
+    FPMask = 0x1,
+
+    // New-Value producer-2 instructions.
+    hasNewValuePos2 = 50,
+    hasNewValueMask2 = 0x1,
+
+    // Which operand consumes or produces a new value.
+    NewValueOpPos2 = 51,
+    NewValueOpMask2 = 0x7,
+
+    // Accumulator instructions.
+    AccumulatorPos = 54,
+    AccumulatorMask = 0x1,
+
+    // Complex XU, prevent xu competition by preferring slot3
+    PrefersSlot3Pos = 55,
+    PrefersSlot3Mask = 0x1,
   };

   // *** The code above must match HexagonInstrFormat*.td *** //
@@ -219,6 +258,26 @@ namespace HexagonII {
     INST_PARSE_EXTENDER = 0x00000000
   };

+  enum InstIClassBits : unsigned {
+    INST_ICLASS_MASK     = 0xf0000000,
+    INST_ICLASS_EXTENDER = 0x00000000,
+    INST_ICLASS_J_1      = 0x10000000,
+    INST_ICLASS_J_2      = 0x20000000,
+    INST_ICLASS_LD_ST_1  = 0x30000000,
+    INST_ICLASS_LD_ST_2  = 0x40000000,
+    INST_ICLASS_J_3      = 0x50000000,
+    INST_ICLASS_CR       = 0x60000000,
+    INST_ICLASS_ALU32_1  = 0x70000000,
+    INST_ICLASS_XTYPE_1  = 0x80000000,
+    INST_ICLASS_LD       = 0x90000000,
+    INST_ICLASS_ST       = 0xa0000000,
+    INST_ICLASS_ALU32_2  = 0xb0000000,
+    INST_ICLASS_XTYPE_2  = 0xc0000000,
+    INST_ICLASS_XTYPE_3  = 0xd0000000,
+    INST_ICLASS_XTYPE_4  = 0xe0000000,
+    INST_ICLASS_ALU32_3  = 0xf0000000
+  };
+
 } // End namespace HexagonII.

 } // End namespace llvm.
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 36f8146..06ccec5 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -12,13 +12,13 @@
 //===----------------------------------------------------------------------===//

 #include "HexagonAsmPrinter.h"
-#include "Hexagon.h"
 #include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"

 using namespace llvm;

@@ -28,104 +28,33 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "HexagonGenAsmWriter.inc"

-HexagonAsmInstPrinter::HexagonAsmInstPrinter(MCInstPrinter *RawPrinter)
-    : MCInstPrinter(*RawPrinter), RawPrinter(RawPrinter) {}
-
-void HexagonAsmInstPrinter::printInst(MCInst const *MI, raw_ostream &O,
-                                      StringRef Annot,
-                                      MCSubtargetInfo const &STI) {
-  assert(HexagonMCInstrInfo::isBundle(*MI));
-  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
-  std::string Buffer;
-  {
-    raw_string_ostream TempStream(Buffer);
-    RawPrinter->printInst(MI, TempStream, "", STI);
-  }
-  StringRef Contents(Buffer);
-  auto PacketBundle = Contents.rsplit('\n');
-  auto HeadTail = PacketBundle.first.split('\n');
-  auto Preamble = "\t{\n\t\t";
-  auto Separator = "";
-  while(!HeadTail.first.empty()) {
-    O << Separator;
-    StringRef Inst;
-    auto Duplex = HeadTail.first.split('\v');
-    if(!Duplex.second.empty()){
-      O << Duplex.first << "\n";
-      Inst = Duplex.second;
-    }
-    else
-      Inst = Duplex.first;
-    O << 
Preamble; - O << Inst; - HeadTail = HeadTail.second.split('\n'); - Preamble = ""; - Separator = "\n\t\t"; - } - O << "\n\t}" << PacketBundle.second; -} - -void HexagonAsmInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { - RawPrinter->printRegName(O, RegNo); -} - -// Return the minimum value that a constant extendable operand can have -// without being extended. -static int getMinValue(uint64_t TSFlags) { - unsigned isSigned = - (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; - unsigned bits = - (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask; - - if (isSigned) - return -1U << (bits - 1); - - return 0; -} - -// Return the maximum value that a constant extendable operand can have -// without being extended. -static int getMaxValue(uint64_t TSFlags) { - unsigned isSigned = - (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; - unsigned bits = - (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask; - - if (isSigned) - return ~(-1U << (bits - 1)); - - return ~(-1U << bits); -} - -// Return true if the instruction must be extended. -static bool isExtended(uint64_t TSFlags) { - return (TSFlags >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; -} - -// Currently just used in an assert statement -static bool isExtendable(uint64_t TSFlags) LLVM_ATTRIBUTE_UNUSED; -// Return true if the instruction may be extended based on the operand value. -static bool isExtendable(uint64_t TSFlags) { - return (TSFlags >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; +HexagonInstPrinter::HexagonInstPrinter(MCAsmInfo const &MAI, + MCInstrInfo const &MII, + MCRegisterInfo const &MRI) + : MCInstPrinter(MAI, MII, MRI), MII(MII), HasExtender(false) { } StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const { return MII.getName(Opcode); } -void HexagonInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << getRegisterName(RegNo); +void HexagonInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { + O << getRegName(RegNo); +} + +StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); } void HexagonInstPrinter::setExtender(MCInst const &MCI) { HasExtender = HexagonMCInstrInfo::isImmext(MCI); } -void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS, - StringRef Annot, - MCSubtargetInfo const &STI) { +void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { assert(HexagonMCInstrInfo::isBundle(*MI)); assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE); + assert(HexagonMCInstrInfo::bundleSize(*MI) > 0); HasExtender = false; for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) { MCInst const &MCI = *I.getInst(); @@ -157,145 +86,148 @@ void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS, } } -void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - + if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo && + (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI))) + O << "#"; + MCOperand const &MO = MI->getOperand(OpNo); if (MO.isReg()) { - printRegName(O, MO.getReg()); - } else if(MO.isExpr()) { - MO.getExpr()->print(O, &MAI); - } else if(MO.isImm()) { - printImmOperand(MI, OpNo, O); - } else { - llvm_unreachable("Unknown operand"); - } -} - -void 
HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - - if(MO.isExpr()) { - MO.getExpr()->print(O, &MAI); - } else if(MO.isImm()) { - O << MI->getOperand(OpNo).getImm(); + O << getRegisterName(MO.getReg()); + } else if (MO.isExpr()) { + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) + O << formatImm(Value); + else + O << *MO.getExpr(); } else { llvm_unreachable("Unknown operand"); } } -void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand &MO = MI->getOperand(OpNo); - const MCInstrDesc &MII = getMII().get(MI->getOpcode()); - - assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) && - "Expecting an extendable operand"); - - if (MO.isExpr() || isExtended(MII.TSFlags)) { - O << "#"; - } else if (MO.isImm()) { - int ImmValue = MO.getImm(); - if (ImmValue < getMinValue(MII.TSFlags) || - ImmValue > getMaxValue(MII.TSFlags)) - O << "#"; - } printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI, - unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printUnsignedImmOperand(MCInst const *MI, + unsigned OpNo, + raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printNegImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { O << -MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { O << -1; } -void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO0 = MI->getOperand(OpNo); - const MCOperand& MO1 = MI->getOperand(OpNo + 1); +void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<9>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO."); + O << formatImm(Imm/64); +} - printRegName(O, MO0.getReg()); - O << " + #" << MO1.getImm(); +void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<10>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO."); + O << formatImm(Imm/128); } -void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO0 = MI->getOperand(OpNo); - const MCOperand& MO1 = MI->getOperand(OpNo + 1); +void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<10>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO."); + O << formatImm(Imm/64); +} - printRegName(O, MO0.getReg()); - O << ", #" << MO1.getImm(); +void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = 
MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<11>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO."); + O << formatImm(Imm/128); } -void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); - printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printBranchOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { // Branches can take an immediate operand. This is used by the branch // selection pass to print $+8, an eight byte displacement from the PC. llvm_unreachable("Unknown branch operand."); } -void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printCallOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printAbsAddrOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printPredicateOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O, bool hi) const { - assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand"); + MCOperand const &MO = MI->getOperand(OpNo); - O << '#' << (hi ? "HI" : "LO") << "(#"; - printOperand(MI, OpNo, O); + O << '#' << (hi ? 
"HI" : "LO") << '('; + if (MO.isImm()) { + O << '#'; + printOperand(MI, OpNo, O); + } else { + printOperand(MI, OpNo, O); + assert("Unknown symbol operand"); + } O << ')'; } -void HexagonInstPrinter::printExtBrtarget(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand &MO = MI->getOperand(OpNo); - const MCInstrDesc &MII = getMII().get(MI->getOpcode()); - - assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) && - "Expecting an extendable operand"); - - if (MO.isExpr() || isExtended(MII.TSFlags)) { - O << "##"; +void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + MCOperand const &MO = MI->getOperand(OpNo); + assert (MO.isExpr()); + MCExpr const &Expr = *MO.getExpr(); + int64_t Value; + if (Expr.evaluateAsAbsolute(Value)) + O << format("0x%" PRIx64, Value); + else { + if (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI)) + if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo) + O << "##"; + O << Expr; } - printOperand(MI, OpNo, O); } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h index 534ac23..5f42118 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// // -// This class prints an Hexagon MCInst to a .s file. // //===----------------------------------------------------------------------===// @@ -15,17 +14,8 @@ #define LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCInstrInfo.h" namespace llvm { -class HexagonAsmInstPrinter : public MCInstPrinter { -public: - HexagonAsmInstPrinter(MCInstPrinter *RawPrinter); - void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, - MCSubtargetInfo const &STI) override; - void printRegName(raw_ostream &O, unsigned RegNo) const override; - std::unique_ptr<MCInstPrinter> RawPrinter; -}; /// Prints bundles as a newline separated list of individual instructions /// Duplexes are separated by a vertical tab \v character /// A trailing line includes bundle properties such as endloop0/1 @@ -33,68 +23,69 @@ public: /// r0 = add(r1, r2) /// r0 = #0 \v jump 0x0 /// :endloop0 :endloop1 - class HexagonInstPrinter : public MCInstPrinter { - public: - explicit HexagonInstPrinter(MCAsmInfo const &MAI, - MCInstrInfo const &MII, - MCRegisterInfo const &MRI) - : MCInstPrinter(MAI, MII, MRI), MII(MII) {} - - void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - virtual StringRef getOpcodeName(unsigned Opcode) const; - void printInstruction(const MCInst *MI, raw_ostream &O); - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - static const char *getRegisterName(unsigned RegNo); +class HexagonInstPrinter : public MCInstPrinter { +public: + explicit HexagonInstPrinter(MCAsmInfo const &MAI, MCInstrInfo const &MII, + MCRegisterInfo const &MRI); + void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + virtual StringRef getOpcodeName(unsigned Opcode) const; + void printInstruction(MCInst const *MI, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printExtOperand(const 
MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const; - void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const; - void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printExtBrtarget(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + StringRef getRegName(unsigned RegNo) const; + static char const *getRegisterName(unsigned RegNo); + void printRegName(raw_ostream &O, unsigned RegNo) const override; - void printConstantPool(const MCInst *MI, unsigned OpNo, + void printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printUnsignedImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printNegImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printNOneImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printBranchOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printAbsAddrOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printPredicateOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printGlobalOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printBrtarget(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + + void printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; - void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const - { printSymbol(MI, OpNo, O, true); } - void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const - { printSymbol(MI, OpNo, O, false); } + void printSymbolHi(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { + printSymbol(MI, OpNo, O, true); + } + void printSymbolLo(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { + printSymbol(MI, OpNo, O, false); + } - const MCInstrInfo &getMII() const { - return MII; - } + MCAsmInfo const &getMAI() const { return MAI; } + MCInstrInfo const &getMII() const { return MII; } - protected: - void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi) - const; +protected: + void printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O, + bool hi) const; - private: - const MCInstrInfo &MII; 
+private:
+  MCInstrInfo const &MII;

-    bool HasExtender;
-    void setExtender(MCInst const &MCI);
-  };
+  bool HasExtender;
+  void setExtender(MCInst const &MCI);
+};

 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index dc07069..a8456b4 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -18,13 +18,14 @@
 #include "llvm/MC/MCAsmInfoELF.h"

 namespace llvm {
-  class Triple;
+class Triple;

-  class HexagonMCAsmInfo : public MCAsmInfoELF {
-    void anchor() override;
-  public:
-    explicit HexagonMCAsmInfo(const Triple &TT);
-  };
+class HexagonMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit HexagonMCAsmInfo(const Triple &TT);
+};

 } // namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
new file mode 100644
index 0000000..46b7b41
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -0,0 +1,581 @@
+//===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCChecker.h"
+
+#include "HexagonBaseInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> RelaxNVChecks("relax-nv-checks", cl::init(false),
+  cl::ZeroOrMore, cl::Hidden, cl::desc("Relax checks of new-value validity"));
+
+const HexagonMCChecker::PredSense
+  HexagonMCChecker::Unconditional(Hexagon::NoRegister, false);
+
+void HexagonMCChecker::init() {
+  // Initialize read-only registers set.
+  ReadOnly.insert(Hexagon::PC);
+
+  // Figure out the loop-registers definitions.
+  if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
+    Defs[Hexagon::SA0].insert(Unconditional); // FIXME: define or change SA0?
+    Defs[Hexagon::LC0].insert(Unconditional);
+  }
+  if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
+    Defs[Hexagon::SA1].insert(Unconditional); // FIXME: define or change SA1?
+    Defs[Hexagon::LC1].insert(Unconditional);
+  }
+
+  if (HexagonMCInstrInfo::isBundle(MCB))
+    // Unfurl a bundle.
+    for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      init(*I.getInst());
+    }
+  else
+    init(MCB);
+}
+
+void HexagonMCChecker::init(MCInst const& MCI) {
+  const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
+  unsigned PredReg = Hexagon::NoRegister;
+  bool isTrue = false;
+
+  // Get used registers.
+  for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+    if (MCI.getOperand(i).isReg()) {
+      unsigned R = MCI.getOperand(i).getReg();
+
+      if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+        // Note a used predicate register.
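The Defs map populated throughout init() keys every definition by a
(predicate register, sense) pair, so "def under p0" and "def under !p0"
stay distinguishable from an unconditional def. A small stand-in for that
bookkeeping, with plain unsigned register ids replacing Hexagon's:

    #include <map>
    #include <set>
    #include <utility>

    using PredSense = std::pair<unsigned, bool>; // (pred reg, true sense)
    const PredSense Unconditional{0, false};     // NoRegister placeholder

    std::map<unsigned, std::set<PredSense>> Defs;

    void noteDef(unsigned Reg, unsigned PredReg, bool IfTrue) {
      Defs[Reg].insert(PredSense(PredReg, IfTrue));
    }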
+        PredReg = R;
+        isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+        // Note use of new predicate register.
+        if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+          NewPreds.insert(PredReg);
+      }
+      else
+        // Note register use. Super-registers are not tracked directly;
+        // only their components are.
+        for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+             SRI.isValid();
+             ++SRI)
+          if (!MCSubRegIterator(*SRI, &RI).isValid())
+            // Skip super-registers used indirectly.
+            Uses.insert(*SRI);
+    }
+
+  // Get implicit register definitions.
+  if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
+    for (; *ImpDef; ++ImpDef) {
+      unsigned R = *ImpDef;
+
+      if (Hexagon::R31 != R && MCID.isCall())
+        // Any register other than the LR and the PC is actually a volatile
+        // one as defined by the ABI; it is not modified implicitly by the
+        // call insn.
+        continue;
+      if (Hexagon::PC == R)
+        // Branches are the only insns that can change the PC,
+        // otherwise a read-only register.
+        continue;
+
+      if (Hexagon::USR_OVF == R)
+        // Many insns change the USR implicitly, but only one or another flag.
+        // The instruction table models the USR.OVF flag, which can be
+        // implicitly modified more than once, but cannot be modified in the
+        // same packet with an instruction that modifies it explicitly. Deal
+        // with such situations individually.
+        SoftDefs.insert(R);
+      else if (isPredicateRegister(R) &&
+               HexagonMCInstrInfo::isPredicateLate(MCII, MCI))
+        // Include implicit late predicates.
+        LatePreds.insert(R);
+      else
+        Defs[R].insert(PredSense(PredReg, isTrue));
+    }
+
+  // Figure out explicit register definitions.
+  for (unsigned i = 0; i < MCID.getNumDefs(); ++i) {
+    unsigned R = MCI.getOperand(i).getReg(),
+             S = Hexagon::NoRegister;
+
+    // Note register definitions, direct ones as well as indirect side-effects.
+    // Super-registers are not tracked directly, but their components.
+    for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+         SRI.isValid();
+         ++SRI) {
+      if (MCSubRegIterator(*SRI, &RI).isValid())
+        // Skip super-registers defined indirectly.
+        continue;
+
+      if (R == *SRI) {
+        if (S == R)
+          // Avoid scoring the defined register multiple times.
+          continue;
+        else
+          // Note that the defined register has already been scored.
+          S = R;
+      }
+
+      if (Hexagon::P3_0 != R && Hexagon::P3_0 == *SRI)
+        // P3:0 is a special case, since multiple predicate register
+        // definitions in a packet are allowed as the equivalent of their
+        // logical "and".
+        // Only an explicit definition of P3:0 is noted as such; if a
+        // side-effect, then note as a soft definition.
+        SoftDefs.insert(*SRI);
+      else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) &&
+               isPredicateRegister(*SRI))
+        // Some insns produce predicates too late to be used in the same packet.
+        LatePreds.insert(*SRI);
+      else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) ==
+                             HexagonII::TypeCVI_VM_CUR_LD)
+        // Current loads should be used in the same packet.
+        // TODO: relies on the impossibility of current and temporary loads
+        // in the same packet.
+        CurDefs.insert(*SRI), Defs[*SRI].insert(PredSense(PredReg, isTrue));
+      else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) ==
+                             HexagonII::TypeCVI_VM_TMP_LD)
+        // Temporary loads should be used in the same packet, but don't commit
+        // results, so the definition should be disregarded if another insn
+        // changes the same register.
+        // TODO: relies on the impossibility of current and temporary loads
+        // in the same packet.
+        TmpDefs.insert(*SRI);
+      else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI))
+        // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and
+        // destination registers with this instruction. Same for
+        // vdeal(Vx, Vy, Rx).
+        Uses.insert(*SRI);
+      else
+        Defs[*SRI].insert(PredSense(PredReg, isTrue));
+    }
+  }
+
+  // Figure out register definitions that produce new values.
+  if (HexagonMCInstrInfo::hasNewValue(MCII, MCI)) {
+    unsigned R = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+    if (HexagonMCInstrInfo::isCompound(MCII, MCI))
+      compoundRegisterMap(R); // Compound insns have a limited register range.
+
+    for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+         SRI.isValid();
+         ++SRI)
+      if (!MCSubRegIterator(*SRI, &RI).isValid())
+        // No super-registers defined indirectly.
+        NewDefs[*SRI].push_back(NewSense::Def(
+            PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+            HexagonMCInstrInfo::isFloat(MCII, MCI)));
+
+    // For fairly unique 2-dot-new producers, for example
+    // vdeal(V1, V9, R0), both V1.new and V9.new can be used by consumers.
+    if (HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) {
+      unsigned R2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, MCI).getReg();
+
+      for (MCRegAliasIterator SRI(R2, &RI, !MCSubRegIterator(R2, &RI).isValid());
+           SRI.isValid();
+           ++SRI)
+        if (!MCSubRegIterator(*SRI, &RI).isValid())
+          NewDefs[*SRI].push_back(NewSense::Def(
+              PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+              HexagonMCInstrInfo::isFloat(MCII, MCI)));
+    }
+  }
+
+  // Figure out definitions of new predicate registers.
+  if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+    for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+      if (MCI.getOperand(i).isReg()) {
+        unsigned P = MCI.getOperand(i).getReg();
+
+        if (isPredicateRegister(P))
+          NewPreds.insert(P);
+      }
+
+  // Figure out uses of new values.
+  if (HexagonMCInstrInfo::isNewValue(MCII, MCI)) {
+    unsigned N = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+    if (!MCSubRegIterator(N, &RI).isValid()) {
+      // Super-registers cannot use new values.
+      if (MCID.isBranch())
+        NewUses[N] = NewSense::Jmp(
+            llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+      else
+        NewUses[N] = NewSense::Use(
+            PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+    }
+  }
+}
+
+HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII,
+                                   MCSubtargetInfo const &STI, MCInst &mcb,
+                                   MCInst &mcbdx, MCRegisterInfo const &ri)
+    : MCB(mcb), MCBDX(mcbdx), RI(ri), MCII(MCII), STI(STI),
+      bLoadErrInfo(false) {
+  init();
+}
+
+bool HexagonMCChecker::check() {
+  bool chkB = checkBranches();
+  bool chkP = checkPredicates();
+  bool chkNV = checkNewValues();
+  bool chkR = checkRegisters();
+  bool chkS = checkSolo();
+  bool chkSh = checkShuffle();
+  bool chkSl = checkSlots();
+  bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl;
+
+  return chk;
+}
+
+bool HexagonMCChecker::checkSlots() {
+  unsigned slotsUsed = 0;
+  for (auto HMI : HexagonMCInstrInfo::bundleInstructions(MCBDX)) {
+    MCInst const& MCI = *HMI.getInst();
+    if (HexagonMCInstrInfo::isImmext(MCI))
+      continue;
+    if (HexagonMCInstrInfo::isDuplex(MCII, MCI))
+      slotsUsed += 2;
+    else
+      ++slotsUsed;
+  }
+
+  if (slotsUsed > HEXAGON_PACKET_SIZE) {
+    HexagonMCErrInfo errInfo;
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NOSLOTS);
+    addErrInfo(errInfo);
+    return false;
+  }
+  return true;
+}
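// A dependency-free sketch, not part of the patch, of the slot-accounting
// rule that checkSlots() above enforces; the enum and container here are
// illustrative stand-ins for the MCInst bundle the real checker walks.
#include <cstddef>
#include <vector>

enum class InsnKind { Normal, Duplex, Immext };

// True when a bundle fits the four slots of a Hexagon packet: an immediate
// extender rides along with its consumer, a duplex fills two slots, and
// every other insn fills one. (Illustrative only; mirrors the loop above.)
static bool fitsInPacket(const std::vector<InsnKind> &Bundle) {
  const std::size_t PacketSize = 4; // HEXAGON_PACKET_SIZE
  std::size_t SlotsUsed = 0;
  for (InsnKind K : Bundle) {
    if (K == InsnKind::Immext)
      continue;
    SlotsUsed += (K == InsnKind::Duplex) ? 2 : 1;
  }
  return SlotsUsed <= PacketSize;
}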
+// Check legal use of branches.
+bool HexagonMCChecker::checkBranches() {
+  HexagonMCErrInfo errInfo;
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    bool hasConditional = false;
+    unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0,
+             NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
+             Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE;
+
+    for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
+         i < MCB.size(); ++i) {
+      MCInst const &MCI = *MCB.begin()[i].getInst();
+
+      if (HexagonMCInstrInfo::isImmext(MCI))
+        continue;
+      if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() ||
+          HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) {
+        ++Branches;
+        if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() &&
+            HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+          ++NewIndirectBranches;
+        if (HexagonMCInstrInfo::isNewValue(MCII, MCI))
+          ++NewValueBranches;
+
+        if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
+            HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
+          hasConditional = true;
+          Conditional = i; // Record the position of the conditional branch.
+        } else {
+          Unconditional = i; // Record the position of the unconditional branch.
+        }
+      }
+      if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() &&
+          HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad())
+        ++Returns;
+    }
+
+    if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too?
+      if (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+          HexagonMCInstrInfo::isOuterLoop(MCB)) {
+        // Error out if there's any branch in a loop-end packet.
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_ENDLOOP, Hexagon::PC);
+        addErrInfo(errInfo);
+        return false;
+      }
+    if (Branches > 1)
+      if (!hasConditional || Conditional > Unconditional) {
+        // Error out if there is more than one unconditional branch or
+        // the conditional branch appears after the unconditional one.
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_BRANCHES);
+        addErrInfo(errInfo);
+        return false;
+      }
+  }
+
+  return true;
+}
+
+// Check legal use of predicate registers.
+bool HexagonMCChecker::checkPredicates() {
+  HexagonMCErrInfo errInfo;
+  // Check for proper use of new predicate registers.
+  for (const auto& I : NewPreds) {
+    unsigned P = I;
+
+    if (!Defs.count(P) || LatePreds.count(P)) {
+      // Error out if the new predicate register is not defined,
+      // or defined "late"
+      // (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }").
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWP, P);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  // Check for proper use of auto-anded predicate registers.
+  for (const auto& I : LatePreds) {
+    unsigned P = I;
+
+    if (LatePreds.count(P) > 1 || Defs.count(P)) {
+      // Error out if a predicate register is defined "late" multiple times,
+      // or defined late and regularly defined
+      // (e.g., "{ p3 = sp1loop0(...); p3 = cmp.eq(...) }").
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, P);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check legal use of new values.
+bool HexagonMCChecker::checkNewValues() {
+  HexagonMCErrInfo errInfo;
+  memset(&errInfo, 0, sizeof(errInfo));
+  for (auto& I : NewUses) {
+    unsigned R = I.first;
+    NewSense &US = I.second;
+
+    if (!hasValidNewValueDef(US, NewDefs[R])) {
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWV, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check for legal register uses and definitions.
+bool HexagonMCChecker::checkRegisters() {
+  HexagonMCErrInfo errInfo;
+  // Check for proper register definitions.
+  for (const auto& I : Defs) {
+    unsigned R = I.first;
+
+    if (ReadOnly.count(R)) {
+      // Error out for definitions of read-only registers.
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_READONLY, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (isLoopRegister(R) && Defs.count(R) > 1 &&
+        (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+         HexagonMCInstrInfo::isOuterLoop(MCB))) {
+      // Error out for definitions of loop registers at the end of a loop.
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_LOOP, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (SoftDefs.count(R)) {
+      // Error out for explicit changes to registers also weakly defined
+      // (e.g., "{ usr = r0; r0 = sfadd(...) }").
+      unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:.
+      unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (!isPredicateRegister(R) && Defs[R].size() > 1) {
+      // Check for multiple register definitions.
+      PredSet &PM = Defs[R];
+
+      // Check for multiple unconditional register definitions.
+      if (PM.count(Unconditional)) {
+        // Error out on an unconditional change when there are any other
+        // changes, conditional or not.
+        unsigned UsrR = Hexagon::USR;
+        unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+        addErrInfo(errInfo);
+        return false;
+      }
+      // Check for multiple conditional register definitions.
+      for (const auto& J : PM) {
+        PredSense P = J;
+
+        // Check for multiple uses of the same condition.
+        if (PM.count(P) > 1) {
+          // Error out on conditional changes based on the same predicate
+          // (e.g., "{ if (!p0) r0 =...; if (!p0) r0 =... }").
+          errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+          addErrInfo(errInfo);
+          return false;
+        }
+        // Check for the use of the complementary condition.
+        P.second = !P.second;
+        if (PM.count(P) && PM.size() > 2) {
+          // Error out on conditional changes based on the same predicate
+          // multiple times
+          // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =...; if (!p0) r0 =... }").
+          errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+          addErrInfo(errInfo);
+          return false;
+        }
+      }
+    }
+  }
+
+  // Check for use of current definitions.
+  for (const auto& I : CurDefs) {
+    unsigned R = I;
+
+    if (!Uses.count(R)) {
+      // Warn on an unused current definition.
+      errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_CURRENT, R);
+      addErrInfo(errInfo);
+      return true;
+    }
+  }
+
+  // Check for use of temporary definitions.
+  for (const auto& I : TmpDefs) {
+    unsigned R = I;
+
+    if (!Uses.count(R)) {
+      // Special case for vhist.
+      bool vHistFound = false;
+      for (auto const &HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+        if (llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) ==
+            HexagonII::TypeCVI_HIST) {
+          vHistFound = true; // vhist() implicitly uses ALL REGxx.tmp
+          break;
+        }
+      }
+      // Warn on an unused temporary definition.
+      if (!vHistFound) {
+        errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_TEMPORARY, R);
+        addErrInfo(errInfo);
+        return true;
+      }
+    }
+  }
+
+  return true;
+}
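// A standalone sketch of the decision procedure that checkRegisters() above
// applies to the (predicate register, sense) pairs recorded per defined
// register. The types below stand in for the checker's PredSet and are not
// LLVM API; they only illustrate the packet rule.
#include <set>
#include <utility>

using PredSensePair = std::pair<unsigned, bool>; // (pred register, sense)
static const PredSensePair Uncond(0, false);     // stand-in for "no predicate"

// True when the multiset of definitions of one register is legal in a packet.
static bool defsAreLegal(const std::multiset<PredSensePair> &PM) {
  if (PM.size() <= 1)
    return true;               // A single definition is always fine.
  if (PM.count(Uncond))
    return false;              // An unconditional def plus any other def.
  for (PredSensePair P : PM) {
    if (PM.count(P) > 1)
      return false;            // Same predicate and sense used twice.
    P.second = !P.second;      // The complementary sense is legal only as
    if (PM.count(P) && PM.size() > 2)
      return false;            // an exact if(p)/if(!p) pair of defs.
  }
  return true;
}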
+// Check for legal use of solo insns.
+bool HexagonMCChecker::checkSolo() {
+  HexagonMCErrInfo errInfo;
+  if (HexagonMCInstrInfo::isBundle(MCB) &&
+      HexagonMCInstrInfo::bundleSize(MCB) > 1) {
+    for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      if (llvm::HexagonMCInstrInfo::isSolo(MCII, *I.getInst())) {
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SOLO);
+        addErrInfo(errInfo);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool HexagonMCChecker::checkShuffle() {
+  HexagonMCErrInfo errInfo;
+  // Branch info is lost when duplexing. The unduplexed insns must be
+  // checked, and only branch errors matter for this case.
+  HexagonMCShuffler MCS(MCII, STI, MCB);
+  if (!MCS.check()) {
+    if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+      errInfo.setShuffleError(MCS.getError());
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+  HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
+  if (!MCSDX.check()) {
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+    errInfo.setShuffleError(MCSDX.getError());
+    addErrInfo(errInfo);
+    return false;
+  }
+  return true;
+}
+
+void HexagonMCChecker::compoundRegisterMap(unsigned& Register) {
+  switch (Register) {
+  default:
+    break;
+  case Hexagon::R15:
+    Register = Hexagon::R23;
+    break;
+  case Hexagon::R14:
+    Register = Hexagon::R22;
+    break;
+  case Hexagon::R13:
+    Register = Hexagon::R21;
+    break;
+  case Hexagon::R12:
+    Register = Hexagon::R20;
+    break;
+  case Hexagon::R11:
+    Register = Hexagon::R19;
+    break;
+  case Hexagon::R10:
+    Register = Hexagon::R18;
+    break;
+  case Hexagon::R9:
+    Register = Hexagon::R17;
+    break;
+  case Hexagon::R8:
+    Register = Hexagon::R16;
+    break;
+  }
+}
+
+bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use,
+                                           const NewSenseList &Defs) const {
+  bool Strict = !RelaxNVChecks;
+
+  for (unsigned i = 0, n = Defs.size(); i < n; ++i) {
+    const NewSense &Def = Defs[i];
+    // NVJ cannot use a new FP value [7.6.1].
+    if (Use.IsNVJ && (Def.IsFloat || Def.PredReg != 0))
+      continue;
+    // If the definition was not predicated, then it does not matter if
+    // the use is.
+    if (Def.PredReg == 0)
+      return true;
+    // With the strict checks, both the definition and the use must be
+    // predicated on the same register and condition.
+    if (Strict) {
+      if (Def.PredReg == Use.PredReg && Def.Cond == Use.Cond)
+        return true;
+    } else {
+      // With the relaxed checks, if the definition was predicated, the only
+      // detectable violation is if the use is predicated on the opposing
+      // condition; otherwise, it's OK.
+      if (Def.PredReg != Use.PredReg || Def.Cond == Use.Cond)
+        return true;
+    }
+  }
+  return false;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
new file mode 100644
index 0000000..5fc0bde
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -0,0 +1,218 @@
+//===----- HexagonMCChecker.h - Instruction bundle checking ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCCHECKER_H +#define HEXAGONMCCHECKER_H + +#include <map> +#include <set> +#include <queue> +#include "MCTargetDesc/HexagonMCShuffler.h" + +using namespace llvm; + +namespace llvm { +class MCOperandInfo; + +typedef struct { + unsigned Error, Warning, ShuffleError; + unsigned Register; +} ErrInfo_T; + +class HexagonMCErrInfo { +public: + enum { + CHECK_SUCCESS = 0, + // Errors. + CHECK_ERROR_BRANCHES = 0x00001, + CHECK_ERROR_NEWP = 0x00002, + CHECK_ERROR_NEWV = 0x00004, + CHECK_ERROR_REGISTERS = 0x00008, + CHECK_ERROR_READONLY = 0x00010, + CHECK_ERROR_LOOP = 0x00020, + CHECK_ERROR_ENDLOOP = 0x00040, + CHECK_ERROR_SOLO = 0x00080, + CHECK_ERROR_SHUFFLE = 0x00100, + CHECK_ERROR_NOSLOTS = 0x00200, + CHECK_ERROR_UNKNOWN = 0x00400, + // Warnings. + CHECK_WARN_CURRENT = 0x10000, + CHECK_WARN_TEMPORARY = 0x20000 + }; + ErrInfo_T s; + + void reset() { + s.Error = CHECK_SUCCESS; + s.Warning = CHECK_SUCCESS; + s.ShuffleError = HexagonShuffler::SHUFFLE_SUCCESS; + s.Register = Hexagon::NoRegister; + }; + HexagonMCErrInfo() { + reset(); + }; + + void setError(unsigned e, unsigned r = Hexagon::NoRegister) + { s.Error = e; s.Register = r; }; + void setWarning(unsigned w, unsigned r = Hexagon::NoRegister) + { s.Warning = w; s.Register = r; }; + void setShuffleError(unsigned e) { s.ShuffleError = e; }; +}; + +/// Check for a valid bundle. +class HexagonMCChecker { + /// Insn bundle. + MCInst& MCB; + MCInst& MCBDX; + const MCRegisterInfo& RI; + MCInstrInfo const &MCII; + MCSubtargetInfo const &STI; + bool bLoadErrInfo; + + /// Set of definitions: register #, if predicated, if predicated true. + typedef std::pair<unsigned, bool> PredSense; + static const PredSense Unconditional; + typedef std::multiset<PredSense> PredSet; + typedef std::multiset<PredSense>::iterator PredSetIterator; + + typedef llvm::DenseMap<unsigned, PredSet>::iterator DefsIterator; + llvm::DenseMap<unsigned, PredSet> Defs; + + /// Information about how a new-value register is defined or used: + /// PredReg = predicate register, 0 if use/def not predicated, + /// Cond = true/false for if(PredReg)/if(!PredReg) respectively, + /// IsFloat = true if definition produces a floating point value + /// (not valid for uses), + /// IsNVJ = true if the use is a new-value branch (not valid for + /// definitions). + struct NewSense { + unsigned PredReg; + bool IsFloat, IsNVJ, Cond; + // The special-case "constructors": + static NewSense Jmp(bool isNVJ) { + NewSense NS = { /*PredReg=*/ 0, /*IsFloat=*/ false, /*IsNVJ=*/ isNVJ, + /*Cond=*/ false }; + return NS; + } + static NewSense Use(unsigned PR, bool True) { + NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ false, /*IsNVJ=*/ false, + /*Cond=*/ True }; + return NS; + } + static NewSense Def(unsigned PR, bool True, bool Float) { + NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ Float, /*IsNVJ=*/ false, + /*Cond=*/ True }; + return NS; + } + }; + /// Set of definitions that produce new register: + typedef llvm::SmallVector<NewSense,2> NewSenseList; + typedef llvm::DenseMap<unsigned, NewSenseList>::iterator NewDefsIterator; + llvm::DenseMap<unsigned, NewSenseList> NewDefs; + + /// Set of weak definitions whose clashes should be enforced selectively. + typedef std::set<unsigned>::iterator SoftDefsIterator; + std::set<unsigned> SoftDefs; + + /// Set of current definitions committed to the register file. 
+ typedef std::set<unsigned>::iterator CurDefsIterator; + std::set<unsigned> CurDefs; + + /// Set of temporary definitions not committed to the register file. + typedef std::set<unsigned>::iterator TmpDefsIterator; + std::set<unsigned> TmpDefs; + + /// Set of new predicates used. + typedef std::set<unsigned>::iterator NewPredsIterator; + std::set<unsigned> NewPreds; + + /// Set of predicates defined late. + typedef std::multiset<unsigned>::iterator LatePredsIterator; + std::multiset<unsigned> LatePreds; + + /// Set of uses. + typedef std::set<unsigned>::iterator UsesIterator; + std::set<unsigned> Uses; + + /// Set of new values used: new register, if new-value jump. + typedef llvm::DenseMap<unsigned, NewSense>::iterator NewUsesIterator; + llvm::DenseMap<unsigned, NewSense> NewUses; + + /// Pre-defined set of read-only registers. + typedef std::set<unsigned>::iterator ReadOnlyIterator; + std::set<unsigned> ReadOnly; + + std::queue<ErrInfo_T> ErrInfoQ; + HexagonMCErrInfo CrntErrInfo; + + void getErrInfo() { + if (bLoadErrInfo == true) { + if (ErrInfoQ.empty()) { + CrntErrInfo.reset(); + } else { + CrntErrInfo.s = ErrInfoQ.front(); + ErrInfoQ.pop(); + } + } + bLoadErrInfo = false; + } + + void init(); + void init(MCInst const&); + + // Checks performed. + bool checkBranches(); + bool checkPredicates(); + bool checkNewValues(); + bool checkRegisters(); + bool checkSolo(); + bool checkShuffle(); + bool checkSlots(); + + static void compoundRegisterMap(unsigned&); + + bool isPredicateRegister(unsigned R) const { + return (Hexagon::P0 == R || Hexagon::P1 == R || + Hexagon::P2 == R || Hexagon::P3 == R); + }; + bool isLoopRegister(unsigned R) const { + return (Hexagon::SA0 == R || Hexagon::LC0 == R || + Hexagon::SA1 == R || Hexagon::LC1 == R); + }; + + bool hasValidNewValueDef(const NewSense &Use, + const NewSenseList &Defs) const; + + public: + explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx, + const MCRegisterInfo& ri); + + bool check(); + + /// add a new error/warning + void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); }; + + /// Return the error code for the last operation in the insn bundle. + unsigned getError() { getErrInfo(); return CrntErrInfo.s.Error; }; + unsigned getWarning() { getErrInfo(); return CrntErrInfo.s.Warning; }; + unsigned getShuffleError() { getErrInfo(); return CrntErrInfo.s.ShuffleError; }; + unsigned getErrRegister() { getErrInfo(); return CrntErrInfo.s.Register; }; + bool getNextErrInfo() { + bLoadErrInfo = true; + return (ErrInfoQ.empty()) ? false : (getErrInfo(), true); + } +}; + +} + +#endif // HEXAGONMCCHECKER_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 9fc4e2a..4b07ca7 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -96,6 +96,12 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction( assert(!HexagonMCInstrInfo::isBundle(HMB)); uint64_t Binary; + // Compound instructions are limited to using registers 0-7 and 16-23 + // and here we make a map 16-23 to 8-15 so they can be correctly encoded. + static unsigned RegMap[8] = {Hexagon::R8, Hexagon::R9, Hexagon::R10, + Hexagon::R11, Hexagon::R12, Hexagon::R13, + Hexagon::R14, Hexagon::R15}; + // Pseudo instructions don't get encoded and shouldn't be here // in the first place! 
assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() && @@ -104,6 +110,16 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction( " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'" "\n"); + if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) { + for (unsigned i = 0; i < HMB.getNumOperands(); ++i) + if (HMB.getOperand(i).isReg()) { + unsigned Reg = + MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg()); + if ((Reg <= 23) && (Reg >= 16)) + HMB.getOperand(i).setReg(RegMap[Reg - 16]); + } + } + if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) { // Calculate the new value distance to the associated producer MCOperand &MCO = @@ -318,21 +334,21 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, // The only relocs left should be GP relative: default: if (MCID.mayStore() || MCID.mayLoad()) { - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; - ++ImpUses) { - if (*ImpUses == Hexagon::GP) { - switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { - case HexagonII::MemAccessSize::ByteAccess: - return fixup_Hexagon_GPREL16_0; - case HexagonII::MemAccessSize::HalfWordAccess: - return fixup_Hexagon_GPREL16_1; - case HexagonII::MemAccessSize::WordAccess: - return fixup_Hexagon_GPREL16_2; - case HexagonII::MemAccessSize::DoubleWordAccess: - return fixup_Hexagon_GPREL16_3; - default: - llvm_unreachable("unhandled fixup"); - } + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); + ImpUses && *ImpUses; ++ImpUses) { + if (*ImpUses != Hexagon::GP) + continue; + switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { + case HexagonII::MemAccessSize::ByteAccess: + return fixup_Hexagon_GPREL16_0; + case HexagonII::MemAccessSize::HalfWordAccess: + return fixup_Hexagon_GPREL16_1; + case HexagonII::MemAccessSize::WordAccess: + return fixup_Hexagon_GPREL16_2; + case HexagonII::MemAccessSize::DoubleWordAccess: + return fixup_Hexagon_GPREL16_3; + default: + llvm_unreachable("unhandled fixup"); } } } else @@ -389,10 +405,8 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI, return cast<MCConstantExpr>(ME)->getValue(); } if (MK == MCExpr::Binary) { - unsigned Res; - Res = getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI); - Res += - getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI); + getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI); + getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI); return 0; } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index 886f8db..d194bea 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -115,8 +115,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { SrcReg = MI.getOperand(1).getReg(); if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MI.getOperand(2).isImm() && ((isUInt<5>(MI.getOperand(2).getImm())) || - (MI.getOperand(2).getImm() == -1))) + (HexagonMCInstrInfo::inRange<5>(MI, 2) || + HexagonMCInstrInfo::minConstant(MI, 2) == -1)) return HexagonII::HCG_A; break; case Hexagon::A2_tfr: @@ -134,8 +134,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { return false; // Rd = #u6 DstReg = MI.getOperand(0).getReg(); - if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() <= 63 && - 
MI.getOperand(1).getImm() >= 0 && + if (HexagonMCInstrInfo::minConstant(MI, 1) <= 63 && + HexagonMCInstrInfo::minConstant(MI, 1) >= 0 && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) return HexagonII::HCG_A; break; @@ -145,9 +145,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { DstReg = MI.getOperand(0).getReg(); Src1Reg = MI.getOperand(1).getReg(); if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && - MI.getOperand(2).isImm() && HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - (MI.getOperand(2).getImm() == 0)) + HexagonMCInstrInfo::minConstant(MI, 2) == 0) return HexagonII::HCG_A; break; // The fact that .new form is used pretty much guarantees @@ -206,6 +205,8 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { MCInst *CompoundInsn = 0; unsigned compoundOpcode; MCOperand Rs, Rt; + int64_t Value; + bool Success; switch (L.getOpcode()) { default: @@ -277,7 +278,10 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { case Hexagon::C2_cmpeqi: DEBUG(dbgs() << "CX: C2_cmpeqi\n"); - if (L.getOperand(2).getImm() == -1) + Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success); + if (Value == -1) compoundOpcode = cmpeqn1BitOpcode[getCompoundOp(R)]; else compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)]; @@ -286,14 +290,17 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { CompoundInsn = new (Context) MCInst; CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); - if (L.getOperand(2).getImm() != -1) + if (Value != -1) CompoundInsn->addOperand(L.getOperand(2)); CompoundInsn->addOperand(R.getOperand(1)); break; case Hexagon::C2_cmpgti: DEBUG(dbgs() << "CX: C2_cmpgti\n"); - if (L.getOperand(2).getImm() == -1) + Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success); + if (Value == -1) compoundOpcode = cmpgtn1BitOpcode[getCompoundOp(R)]; else compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)]; @@ -302,7 +309,7 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { CompoundInsn = new (Context) MCInst; CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); - if (L.getOperand(2).getImm() != -1) + if (Value != -1) CompoundInsn->addOperand(L.getOperand(2)); CompoundInsn->addOperand(R.getOperand(1)); break; @@ -404,7 +411,7 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { /// additional slot. void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { - assert(MCI.getOpcode() == Hexagon::BUNDLE && + assert(HexagonMCInstrInfo::isBundle(MCI) && "Non-Bundle where Bundle expected"); // By definition a compound must have 2 insn. 
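// The hunks above and below all follow one pattern introduced by this change:
// immediate operands may now arrive as MCExprs rather than plain immediates,
// so consumers fold them back to integers with evaluateAsAbsolute() instead
// of calling getImm() directly. A sketch of that idiom against the MC API in
// this tree; the helper name is illustrative and not part of the patch.
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include <cstdint>

using namespace llvm;

// Fold operand OpNo of MI to a constant. Returns false for symbolic
// expressions (globals, labels) that only relocation processing can resolve.
// (Sketch only; the patch uses minConstant()/inRange() wrappers instead.)
static bool foldOperand(MCInst const &MI, unsigned OpNo, int64_t &Value) {
  MCOperand const &MO = MI.getOperand(OpNo);
  if (MO.isImm()) { // Plain immediates still occur on some paths.
    Value = MO.getImm();
    return true;
  }
  if (MO.isExpr())
    return MO.getExpr()->evaluateAsAbsolute(Value);
  return false;
}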
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index 7e9247c..e6194f6 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -26,7 +26,7 @@ using namespace Hexagon; #define DEBUG_TYPE "hexagon-mcduplex-info" // pair table of subInstructions with opcodes -static std::pair<unsigned, unsigned> opcodeData[] = { +static const std::pair<unsigned, unsigned> opcodeData[] = { std::make_pair((unsigned)V4_SA1_addi, 0), std::make_pair((unsigned)V4_SA1_addrx, 6144), std::make_pair((unsigned)V4_SA1_addsp, 3072), @@ -81,8 +81,7 @@ static std::pair<unsigned, unsigned> opcodeData[] = { std::make_pair((unsigned)V4_SS2_storewi1, 4352)}; static std::map<unsigned, unsigned> - subinstOpcodeMap(opcodeData, - opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0])); + subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData)); bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) { switch (Ga) { @@ -195,15 +194,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Special case this one from Group L2. // Rd = memw(r29+#u5:2) if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<5, 2>(MCI.getOperand(2).getImm())) { + if (HexagonMCInstrInfo::isIntReg(SrcReg) && + Hexagon::R29 == SrcReg && inRange<5, 2>(MCI, 2)) { return HexagonII::HSIG_L2; } // Rd = memw(Rs+#u4:2) if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(2).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(2).getImm()))) { + inRange<4, 2>(MCI, 2)) { return HexagonII::HSIG_L1; } } @@ -214,7 +211,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<4>(MCI.getOperand(2).getImm())) { + inRange<4>(MCI, 2)) { return HexagonII::HSIG_L1; } break; @@ -235,8 +232,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && - isShiftedUInt<3, 1>(MCI.getOperand(2).getImm())) { + inRange<3, 1>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -246,7 +242,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<3>(MCI.getOperand(2).getImm())) { + inRange<3>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -256,8 +252,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<5, 3>(MCI.getOperand(2).getImm())) { + inRange<5, 3>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -326,15 +321,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntReg(Src1Reg) && 
HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - Hexagon::R29 == Src1Reg && MCI.getOperand(1).isImm() && - isShiftedUInt<5, 2>(MCI.getOperand(1).getImm())) { + Hexagon::R29 == Src1Reg && inRange<5, 2>(MCI, 1)) { return HexagonII::HSIG_S2; } // memw(Rs+#u4:2) = Rt if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(1).getImm())) { + inRange<4, 2>(MCI, 1)) { return HexagonII::HSIG_S1; } break; @@ -344,7 +337,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm())) { + inRange<4>(MCI, 1)) { return HexagonII::HSIG_S1; } break; @@ -363,8 +356,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<3, 1>(MCI.getOperand(1).getImm())) { + inRange<3, 1>(MCI, 1)) { return HexagonII::HSIG_S2; } break; @@ -374,8 +366,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(Src2Reg) && HexagonMCInstrInfo::isIntReg(Src1Reg) && Hexagon::R29 == Src1Reg && - MCI.getOperand(1).isImm() && - isShiftedInt<6, 3>(MCI.getOperand(1).getImm())) { + inSRange<6, 3>(MCI, 1)) { return HexagonII::HSIG_S2; } break; @@ -383,9 +374,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // memw(Rs+#u4:2) = #U1 Src1Reg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(1).getImm()) && - MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) { + inRange<4, 2>(MCI, 1) && inRange<1>(MCI, 2)) { return HexagonII::HSIG_S2; } break; @@ -393,16 +382,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // memb(Rs+#u4) = #U1 Src1Reg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm()) && - MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) { + inRange<4>(MCI, 1) && inRange<1>(MCI, 2)) { return HexagonII::HSIG_S2; } break; case Hexagon::S2_allocframe: - if (MCI.getOperand(0).isImm() && - isShiftedUInt<5, 3>(MCI.getOperand(0).getImm())) { + if (inRange<5, 3>(MCI, 0)) return HexagonII::HSIG_S2; - } break; // // Group A: @@ -428,8 +414,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { // Rd = add(r29,#u6:2) if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<6, 2>(MCI.getOperand(2).getImm())) { + inRange<6, 2>(MCI, 2)) { return HexagonII::HSIG_A; } // Rx = add(Rx,#s7) @@ -439,8 +424,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Rd = add(Rs,#1) // Rd = add(Rs,#-1) if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) || - (MCI.getOperand(2).getImm() == -1))) { + (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == -1)) { return HexagonII::HSIG_A; } } 
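// The numeric rule behind the inRange<Bits, Shift> predicates used in these
// hunks (after the operand has been folded to a constant as sketched above):
// a #uB:S immediate is an unsigned Bits-bit value scaled by 2^Shift, i.e. a
// multiple of 2^Shift in [0, (2^Bits - 1) << Shift]. A dependency-free sketch
// with the same semantics as llvm::isShiftedUInt; the template name here is
// illustrative, not the patch's API.
#include <cstdint>

template <unsigned Bits, unsigned Shift = 0>
static bool isShiftedUImm(int64_t V) {
  static_assert(Bits + Shift <= 63, "field must fit in a signed 64-bit value");
  if (V < 0 || (V & ((INT64_C(1) << Shift) - 1)) != 0)
    return false;                 // Negative, or not a multiple of 2^Shift.
  return (V >> Shift) < (INT64_C(1) << Bits); // Fits in Bits after scaling.
}

// For example, the memw(Rs + #u4:2) offsets accepted above are the multiples
// of 4 from 0 to 60: isShiftedUImm<4, 2>(60) is true, while
// isShiftedUImm<4, 2>(62) and isShiftedUImm<4, 2>(64) are false.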
@@ -460,8 +444,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) || - (MCI.getOperand(2).getImm() == 255))) { + (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == 255)) { return HexagonII::HSIG_A; } break; @@ -491,8 +474,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { DstReg = MCI.getOperand(0).getReg(); // Rd PredReg = MCI.getOperand(1).getReg(); // P0 if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && - Hexagon::P0 == PredReg && MCI.getOperand(2).isImm() && - MCI.getOperand(2).getImm() == 0) { + Hexagon::P0 == PredReg && minConstant(MCI, 2) == 0) { return HexagonII::HSIG_A; } break; @@ -502,7 +484,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (Hexagon::P0 == DstReg && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<2>(MCI.getOperand(2).getImm())) { + inRange<2>(MCI, 2)) { return HexagonII::HSIG_A; } break; @@ -511,10 +493,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Rdd = combine(#u2,#U2) DstReg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && - // TODO: Handle Globals/Symbols - (MCI.getOperand(1).isImm() && isUInt<2>(MCI.getOperand(1).getImm())) && - ((MCI.getOperand(2).isImm() && - isUInt<2>(MCI.getOperand(2).getImm())))) { + inRange<2>(MCI, 1) && inRange<2>(MCI, 2)) { return HexagonII::HSIG_A; } break; @@ -524,7 +503,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(2).isImm() && MCI.getOperand(2).getImm() == 0)) { + minConstant(MCI, 2) == 0) { return HexagonII::HSIG_A; } break; @@ -534,7 +513,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(1).isImm() && MCI.getOperand(1).getImm() == 0)) { + minConstant(MCI, 1) == 0) { return HexagonII::HSIG_A; } break; @@ -556,19 +535,17 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { } bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) { - unsigned DstReg, SrcReg; - switch (potentialDuplex.getOpcode()) { case Hexagon::A2_addi: // testing for case of: Rx = add(Rx,#s7) DstReg = potentialDuplex.getOperand(0).getReg(); SrcReg = potentialDuplex.getOperand(1).getReg(); if (DstReg == SrcReg && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (potentialDuplex.getOperand(2).isExpr()) + int64_t Value; + if (!potentialDuplex.getOperand(2).getExpr()->evaluateAsAbsolute(Value)) return true; - if (potentialDuplex.getOperand(2).isImm() && - !(isShiftedInt<7, 0>(potentialDuplex.getOperand(2).getImm()))) + if (!isShiftedInt<7, 0>(Value)) return true; } break; @@ -576,15 +553,14 @@ bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) { DstReg = potentialDuplex.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (potentialDuplex.getOperand(1).isExpr()) + int64_t Value; + if 
(!potentialDuplex.getOperand(1).getExpr()->evaluateAsAbsolute(Value)) return true; // Check for case of Rd = #-1. - if (potentialDuplex.getOperand(1).isImm() && - (potentialDuplex.getOperand(1).getImm() == -1)) + if (Value == -1) return false; // Check for case of Rd = #u6. - if (potentialDuplex.getOperand(1).isImm() && - !isShiftedUInt<6, 0>(potentialDuplex.getOperand(1).getImm())) + if (!isShiftedUInt<6, 0>(Value)) return true; } break; @@ -712,19 +688,23 @@ inline static void addOps(MCInst &subInstPtr, MCInst const &Inst, MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { MCInst Result; + bool Absolute; + int64_t Value; switch (Inst.getOpcode()) { default: // dbgs() << "opcode: "<< Inst->getOpcode() << "\n"; llvm_unreachable("Unimplemented subinstruction \n"); break; case Hexagon::A2_addi: - if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == 1) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 1) { Result.setOpcode(Hexagon::V4_SA1_inc); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; } // 1,2 SUBInst $Rd = add($Rs, #1) - else if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == -1) { + else if (Value == -1) { Result.setOpcode(Hexagon::V4_SA1_dec); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -754,7 +734,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 0); break; // 1 SUBInst allocframe(#$u5_3) case Hexagon::A2_andir: - if (Inst.getOperand(2).getImm() == 255) { + if (minConstant(Inst, 2) == 255) { Result.setOpcode(Hexagon::V4_SA1_zxtb); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -772,26 +752,27 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; // 2,3 SUBInst p0 = cmp.eq($Rs, #$u2) case Hexagon::A4_combineii: case Hexagon::A2_combineii: - if (Inst.getOperand(1).getImm() == 1) { + Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 1) { Result.setOpcode(Hexagon::V4_SA1_combine1i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#1, #$u2) } - - if (Inst.getOperand(1).getImm() == 3) { + if (Value == 3) { Result.setOpcode(Hexagon::V4_SA1_combine3i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#3, #$u2) } - if (Inst.getOperand(1).getImm() == 0) { + if (Value == 0) { Result.setOpcode(Hexagon::V4_SA1_combine0i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#0, #$u2) } - if (Inst.getOperand(1).getImm() == 2) { + if (Value == 2) { Result.setOpcode(Hexagon::V4_SA1_combine2i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); @@ -894,12 +875,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; // 1,2,3 SUBInst $Rd = memw($Rs + #$u4_2) } case Hexagon::S4_storeirb_io: - if (Inst.getOperand(2).getImm() == 0) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 0) { Result.setOpcode(Hexagon::V4_SS2_storebi0); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; // 1,2 SUBInst memb($Rs + #$u4_0)=#0 - } else if (Inst.getOperand(2).getImm() == 1) { + } else if (Value == 1) { Result.setOpcode(Hexagon::V4_SS2_storebi1); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -923,12 +906,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 2); break; // 1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt case 
Hexagon::S4_storeiri_io: - if (Inst.getOperand(2).getImm() == 0) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 0) { Result.setOpcode(Hexagon::V4_SS2_storewi0); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; // 3 1,2 SUBInst memw($Rs + #$u4_2)=#0 - } else if (Inst.getOperand(2).getImm() == 1) { + } else if (Value == 1) { Result.setOpcode(Hexagon::V4_SS2_storewi1); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -983,7 +968,8 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 0); break; // 2 SUBInst if (p0) $Rd = #0 case Hexagon::A2_tfrsi: - if (Inst.getOperand(1).isImm() && Inst.getOperand(1).getImm() == -1) { + Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value); + if (Absolute && Value == -1) { Result.setOpcode(Hexagon::V4_SA1_setin1); addOps(Result, Inst, 0); break; // 2 1 SUBInst $Rd = #-1 @@ -1044,6 +1030,8 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII, << "\n"); bisReversable = false; } + if (HexagonMCInstrInfo::isMemReorderDisabled(MCB)) // }:mem_noshuf + bisReversable = false; // Try in order. if (isOrderedDuplexPair( diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index bf51c35..eaa3550 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -37,9 +37,7 @@ static cl::opt<unsigned> void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, const MCSubtargetInfo &STI) { - MCInst HMI; - HMI.setOpcode(Hexagon::BUNDLE); - HMI.addOperand(MCOperand::createImm(0)); + MCInst HMI = HexagonMCInstrInfo::createBundle(); MCInst *MCB; if (MCK.getOpcode() != Hexagon::BUNDLE) { @@ -50,7 +48,7 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, // Examines packet and pad the packet, if needed, when an // end-loop is in the bundle. 
-  HexagonMCInstrInfo::padEndloop(*MCB);
+  HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
   HexagonMCShuffle(*MCII, STI, *MCB);
 
   assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
@@ -60,9 +58,9 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
     if (Extended) {
       if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
         MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst());
-        HexagonMCInstrInfo::clampExtended(*MCII, *SubInst);
+        HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *SubInst);
       } else {
-        HexagonMCInstrInfo::clampExtended(*MCII, *MCI);
+        HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *MCI);
       }
       Extended = false;
     } else {
@@ -114,7 +112,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
   MCSection *Section = getAssembler().getContext().getELFSection(
       SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
   SwitchSection(Section);
-  AssignSection(Symbol, Section);
+  AssignFragment(Symbol, getCurrentFragment());
 
   MCELFStreamer::EmitCommonSymbol(Symbol, Size, ByteAlignment);
   SwitchSection(CrntSection);
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
new file mode 100644
index 0000000..fc62626
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -0,0 +1,49 @@
+//===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-mcexpr"
+
+HexagonNoExtendOperand *HexagonNoExtendOperand::Create(MCExpr const *Expr,
+                                                       MCContext &Ctx) {
+  return new (Ctx) HexagonNoExtendOperand(Expr);
+}
+
+bool HexagonNoExtendOperand::evaluateAsRelocatableImpl(
+    MCValue &Res, MCAsmLayout const *Layout, MCFixup const *Fixup) const {
+  return Expr->evaluateAsRelocatable(Res, Layout, Fixup);
+}
+
+void HexagonNoExtendOperand::visitUsedExpr(MCStreamer &Streamer) const {}
+
+MCFragment *llvm::HexagonNoExtendOperand::findAssociatedFragment() const {
+  return Expr->findAssociatedFragment();
+}
+
+void HexagonNoExtendOperand::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+MCExpr const *HexagonNoExtendOperand::getExpr() const { return Expr; }
+
+bool HexagonNoExtendOperand::classof(MCExpr const *E) {
+  return E->getKind() == MCExpr::Target;
+}
+
+HexagonNoExtendOperand::HexagonNoExtendOperand(MCExpr const *Expr)
+    : Expr(Expr) {}
+
+void HexagonNoExtendOperand::printImpl(raw_ostream &OS,
+                                       const MCAsmInfo *MAI) const {
+  Expr->print(OS, MAI);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
new file mode 100644
index 0000000..60f180f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -0,0 +1,35 @@
+//==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H +#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H + +#include "llvm/MC/MCExpr.h" + +namespace llvm { +class MCInst; +class HexagonNoExtendOperand : public MCTargetExpr { +public: + static HexagonNoExtendOperand *Create(MCExpr const *Expr, MCContext &Ctx); + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; + MCFragment *findAssociatedFragment() const override; + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; + static bool classof(MCExpr const *E); + MCExpr const *getExpr() const; + +private: + HexagonNoExtendOperand(MCExpr const *Expr); + MCExpr const *Expr; +}; +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 48b15f8..e684207 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -15,17 +15,37 @@ #include "Hexagon.h" #include "HexagonBaseInfo.h" +#include "HexagonMCChecker.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" namespace llvm { +void HexagonMCInstrInfo::addConstant(MCInst &MI, uint64_t Value, + MCContext &Context) { + MI.addOperand(MCOperand::createExpr(MCConstantExpr::create(Value, Context))); +} + +void HexagonMCInstrInfo::addConstExtender(MCContext &Context, + MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI) { + assert(HexagonMCInstrInfo::isBundle(MCB)); + MCOperand const &exOp = + MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI)); + + // Create the extender. + MCInst *XMCI = + new (Context) MCInst(HexagonMCInstrInfo::deriveExtender(MCII, MCI, exOp)); + + MCB.addOperand(MCOperand::createInst(XMCI)); +} + iterator_range<MCInst::const_iterator> HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) { assert(isBundle(MCI)); - return iterator_range<MCInst::const_iterator>( - MCI.begin() + bundleInstructionsOffset, MCI.end()); + return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end()); } size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { @@ -35,7 +55,40 @@ size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { return (1); } -void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) { +bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII, + MCSubtargetInfo const &STI, + MCContext &Context, MCInst &MCB, + HexagonMCChecker *Check) { + // Examine the packet and convert pairs of instructions to compound + // instructions when possible. + if (!HexagonDisableCompound) + HexagonMCInstrInfo::tryCompound(MCII, Context, MCB); + // Check the bundle for errors. + bool CheckOk = Check ? Check->check() : true; + if (!CheckOk) + return false; + HexagonMCShuffle(MCII, STI, MCB); + // Examine the packet and convert pairs of instructions to duplex + // instructions when possible. 
+ MCInst InstBundlePreDuplex = MCInst(MCB); + if (!HexagonDisableDuplex) { + SmallVector<DuplexCandidate, 8> possibleDuplexes; + possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB); + HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes); + } + // Examines packet and pad the packet, if needed, when an + // end-loop is in the bundle. + HexagonMCInstrInfo::padEndloop(Context, MCB); + // If compounding and duplexing didn't reduce the size below + // 4 or less we have a packet that is too big. + if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) + return false; + HexagonMCShuffle(MCII, STI, MCB); + return true; +} + +void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, + MCContext &Context, MCInst &MCI) { assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) || HexagonMCInstrInfo::isExtended(MCII, MCI)); MCOperand &exOp = @@ -43,13 +96,20 @@ void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) { // If the extended value is a constant, then use it for the extended and // for the extender instructions, masking off the lower 6 bits and // including the assumed bits. - if (exOp.isImm()) { + int64_t Value; + if (exOp.getExpr()->evaluateAsAbsolute(Value)) { unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI); - int64_t Bits = exOp.getImm(); - exOp.setImm((Bits & 0x3f) << Shift); + exOp.setExpr(MCConstantExpr::create((Value & 0x3f) << Shift, Context)); } } +MCInst HexagonMCInstrInfo::createBundle() { + MCInst Result; + Result.setOpcode(Hexagon::BUNDLE); + Result.addOperand(MCOperand::createImm(0)); + return Result; +} + MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, MCInst const &inst1) { @@ -64,6 +124,27 @@ MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass, return duplexInst; } +MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII, + MCInst const &Inst, + MCOperand const &MO) { + assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) || + HexagonMCInstrInfo::isExtended(MCII, Inst)); + + MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst); + MCInst XMI; + XMI.setOpcode((Desc.isBranch() || Desc.isCall() || + HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR) + ? Hexagon::A4_ext_b + : Hexagon::A4_ext); + if (MO.isImm()) + XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f))); + else if (MO.isExpr()) + XMI.addOperand(MCOperand::createExpr(MO.getExpr())); + else + llvm_unreachable("invalid extendable operand"); + return XMI; +} + MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB, size_t Index) { assert(Index <= bundleSize(MCB)); @@ -76,6 +157,13 @@ MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB, return nullptr; } +void HexagonMCInstrInfo::extendIfNeeded(MCContext &Context, + MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI, bool MustExtend) { + if (isConstExtended(MCII, MCI) || MustExtend) + addConstExtender(Context, MCII, MCB, MCI); +} + HexagonII::MemAccessSize HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -186,6 +274,25 @@ MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII, return (MCO); } +/// Return the new value or the newly produced value. 
+unsigned short HexagonMCInstrInfo::getNewValueOp2(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::NewValueOpPos2) & HexagonII::NewValueOpMask2); +} + +MCOperand const & +HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII, + MCInst const &MCI) { + unsigned O = HexagonMCInstrInfo::getNewValueOp2(MCII, MCI); + MCOperand const &MCO = MCI.getOperand(O); + + assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) || + HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) && + MCO.isReg()); + return (MCO); +} + int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -242,6 +349,13 @@ bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII, return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask); } +/// Return whether the insn produces a second value. +bool HexagonMCInstrInfo::hasNewValue2(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::hasNewValuePos2) & HexagonII::hasNewValueMask2); +} + MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) { assert(isBundle(MCB)); assert(Index < HEXAGON_PACKET_SIZE); @@ -261,6 +375,11 @@ bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) { HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP); } +bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII, + MCInst const &MCI) { + return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND); +} + bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) { return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) || (Reg >= Hexagon::D8 && Reg <= Hexagon::D11)); @@ -282,14 +401,21 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI) { if (HexagonMCInstrInfo::isExtended(MCII, MCI)) return true; - - if (!HexagonMCInstrInfo::isExtendable(MCII, MCI)) + // Branch insns are handled as necessary by relaxation. + if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) || + (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND && + HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) || + (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV && + HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch())) + return false; + // Otherwise loop instructions and other CR insts are handled by relaxation + else if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR) && + (MCI.getOpcode() != Hexagon::C4_addipc)) + return false; + else if (!HexagonMCInstrInfo::isExtendable(MCII, MCI)) return false; - short ExtOpNum = HexagonMCInstrInfo::getCExtOpNum(MCII, MCI); - int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI); - int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI); - MCOperand const &MO = MCI.getOperand(ExtOpNum); + MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI); // We could be using an instruction with an extendable immediate and shoehorn // a global address into it. If it is a global address it will be constant @@ -297,15 +423,13 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII, // We currently only handle isGlobal() because it is the only kind of // object we are going to end up with here for now. // In the future we probably should add isSymbol(), etc. 
- if (MO.isExpr()) + assert(!MO.isImm()); + int64_t Value; + if (!MO.getExpr()->evaluateAsAbsolute(Value)) return true; - - // If the extendable operand is not 'Immediate' type, the instruction should - // have 'isExtended' flag set. - assert(MO.isImm() && "Extendable operand must be Immediate type"); - - int ImmValue = MO.getImm(); - return (ImmValue < MinValue || ImmValue > MaxValue); + int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI); + int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI); + return (MinValue > Value || Value > MaxValue); } bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII, @@ -374,6 +498,19 @@ bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII, return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); } +bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return (F >> HexagonII::PredicateLatePos & HexagonII::PredicateLateMask); +} + +/// Return whether the insn is newly predicated. +bool HexagonMCInstrInfo::isPredicatedNew(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); +} + bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -394,6 +531,18 @@ bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) { return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask); } +bool HexagonMCInstrInfo::isMemReorderDisabled(MCInst const &MCI) { + assert(isBundle(MCI)); + auto Flags = MCI.getOperand(0).getImm(); + return (Flags & memReorderDisabledMask) != 0; +} + +bool HexagonMCInstrInfo::isMemStoreReorderEnabled(MCInst const &MCI) { + assert(isBundle(MCI)); + auto Flags = MCI.getOperand(0).getImm(); + return (Flags & memStoreReorderEnabledMask) != 0; +} + bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask); @@ -405,7 +554,28 @@ bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII, return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask); } -void HexagonMCInstrInfo::padEndloop(MCInst &MCB) { +bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) { + if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) && + (getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST)) + return true; + return false; +} + +int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) { + auto Sentinal = static_cast<int64_t>(std::numeric_limits<uint32_t>::max()) + << 8; + if (MCI.size() <= Index) + return Sentinal; + MCOperand const &MCO = MCI.getOperand(Index); + if (!MCO.isExpr()) + return Sentinal; + int64_t Value; + if (!MCO.getExpr()->evaluateAsAbsolute(Value)) + return Sentinal; + return Value; +} + +void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) { MCInst Nop; Nop.setOpcode(Hexagon::A2_nop); assert(isBundle(MCB)); @@ -413,7 +583,7 @@ void HexagonMCInstrInfo::padEndloop(MCInst &MCB) { (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) || ((HexagonMCInstrInfo::isOuterLoop(MCB) && (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE)))) - MCB.addOperand(MCOperand::createInst(new MCInst(Nop))); + MCB.addOperand(MCOperand::createInst(new (Context) 
MCInst(Nop))); } bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII, @@ -456,6 +626,20 @@ void HexagonMCInstrInfo::setInnerLoop(MCInst &MCI) { Operand.setImm(Operand.getImm() | innerLoopMask); } +void HexagonMCInstrInfo::setMemReorderDisabled(MCInst &MCI) { + assert(isBundle(MCI)); + MCOperand &Operand = MCI.getOperand(0); + Operand.setImm(Operand.getImm() | memReorderDisabledMask); + assert(isMemReorderDisabled(MCI)); +} + +void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) { + assert(isBundle(MCI)); + MCOperand &Operand = MCI.getOperand(0); + Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask); + assert(isMemStoreReorderEnabled(MCI)); +} + void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) { assert(isBundle(MCI)); MCOperand &Operand = MCI.getOperand(0); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 32d61a4..0237b28 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -14,9 +14,11 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H +#include "HexagonMCExpr.h" #include "llvm/MC/MCInst.h" namespace llvm { +class HexagonMCChecker; class MCContext; class MCInstrDesc; class MCInstrInfo; @@ -39,20 +41,47 @@ int64_t const innerLoopMask = 1 << innerLoopOffset; size_t const outerLoopOffset = 1; int64_t const outerLoopMask = 1 << outerLoopOffset; +// do not reorder memory load/stores by default load/stores are re-ordered +// and by default loads can be re-ordered +size_t const memReorderDisabledOffset = 2; +int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset; + +// allow re-ordering of memory stores by default stores cannot be re-ordered +size_t const memStoreReorderEnabledOffset = 3; +int64_t const memStoreReorderEnabledMask = 1 << memStoreReorderEnabledOffset; + size_t const bundleInstructionsOffset = 1; +void addConstant(MCInst &MI, uint64_t Value, MCContext &Context); +void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI); + // Returns a iterator range of instructions in this bundle iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI); // Returns the number of instructions in the bundle size_t bundleSize(MCInst const &MCI); +// Put the packet in to canonical form, compound, duplex, pad, and shuffle +bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCContext &Context, MCInst &MCB, + HexagonMCChecker *Checker); + // Clamp off upper 26 bits of extendable operand for emission -void clampExtended(MCInstrInfo const &MCII, MCInst &MCI); +void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI); + +MCInst createBundle(); + +// Return the extender for instruction at Index or nullptr if none +MCInst const *extenderForIndex(MCInst const &MCB, size_t Index); +void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI, bool MustExtend); // Create a duplex instruction given the two subinsts MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, MCInst const &inst1); +MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst, + MCOperand const &MO); // Convert this instruction in to a duplex subinst MCInst deriveSubInst(MCInst const &Inst); @@ -108,6 +137,9 @@ unsigned short 
getNewValueOp(MCInstrInfo const &MCII, MCInst const &MCI); // Return the operand that consumes or produces a new value. MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI); +unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI); +MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII, + MCInst const &MCI); int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI); @@ -125,6 +157,7 @@ bool hasImmExt(MCInst const &MCI); // Return whether the instruction is a legal new-value producer. bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI); +bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI); // Return the instruction at Index MCInst const &instruction(MCInst const &MCB, size_t Index); @@ -134,10 +167,24 @@ bool isBundle(MCInst const &MCI); // Return whether the insn is an actual insn. bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI); +bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI); // Return the duplex iclass given the two duplex classes unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb); +int64_t minConstant(MCInst const &MCI, size_t Index); +template <unsigned N, unsigned S> +bool inRange(MCInst const &MCI, size_t Index) { + return isShiftedUInt<N, S>(minConstant(MCI, Index)); +} +template <unsigned N, unsigned S> +bool inSRange(MCInst const &MCI, size_t Index) { + return isShiftedInt<N, S>(minConstant(MCI, Index)); +} +template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) { + return isUInt<N>(minConstant(MCI, Index)); +} + // Return whether the instruction needs to be constant extended. bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI); @@ -173,6 +220,8 @@ bool isIntReg(unsigned Reg); // Is this register suitable for use in a duplex subinst bool isIntRegForSubInst(unsigned Reg); +bool isMemReorderDisabled(MCInst const &MCI); +bool isMemStoreReorderEnabled(MCInst const &MCI); // Return whether the insn is a new-value consumer. bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI); @@ -191,6 +240,8 @@ bool isOuterLoop(MCInst const &MCI); // Return whether this instruction is predicated bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI); +bool isPredicateLate(MCInstrInfo const &MCII, MCInst const &MCI); +bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI); // Return whether the predicate sense is true bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI); @@ -209,9 +260,10 @@ bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI); /// Return whether the insn can be packaged only with an A-type insn in slot #1. 
bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI); +bool isVector(MCInstrInfo const &MCII, MCInst const &MCI); // Pad the bundle with nops to satisfy endloop requirements -void padEndloop(MCInst &MCI); +void padEndloop(MCContext &Context, MCInst &MCI); bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI); @@ -220,6 +272,8 @@ void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate); // Marks a bundle as endloop0 void setInnerLoop(MCInst &MCI); +void setMemReorderDisabled(MCInst &MCI); +void setMemStoreReorderEnabled(MCInst &MCI); // Marks a bundle as endloop1 void setOuterLoop(MCInst &MCI); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 53305d8..9a29257 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -40,6 +40,20 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "HexagonGenRegisterInfo.inc" +cl::opt<bool> llvm::HexagonDisableCompound + ("mno-compound", + cl::desc("Disable looking for compound instructions for Hexagon")); + +cl::opt<bool> llvm::HexagonDisableDuplex + ("mno-pairing", + cl::desc("Disable looking for duplex instructions for Hexagon")); + +StringRef HEXAGON_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) { + if (CPU.empty()) + CPU = "hexagonv60"; + return CPU; +} + MCInstrInfo *llvm::createHexagonMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitHexagonMCInstrInfo(X); @@ -54,6 +68,7 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) { static MCSubtargetInfo * createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + CPU = HEXAGON_MC::selectHexagonCPU(TT, CPU); return createHexagonMCSubtargetInfoImpl(TT, CPU, FS); } @@ -76,28 +91,23 @@ public: StringRef Contents(Buffer); auto PacketBundle = Contents.rsplit('\n'); auto HeadTail = PacketBundle.first.split('\n'); - auto Preamble = "\t{\n\t\t"; - auto Separator = ""; - while(!HeadTail.first.empty()) { - OS << Separator; - StringRef Inst; + StringRef Separator = "\n"; + StringRef Indent = "\t\t"; + OS << "\t{\n"; + while (!HeadTail.first.empty()) { + StringRef InstTxt; auto Duplex = HeadTail.first.split('\v'); - if(!Duplex.second.empty()){ - OS << Duplex.first << "\n"; - Inst = Duplex.second; - } - else { - if(!HeadTail.first.startswith("immext")) - Inst = Duplex.first; + if (!Duplex.second.empty()) { + OS << Indent << Duplex.first << Separator; + InstTxt = Duplex.second; + } else if (!HeadTail.first.trim().startswith("immext")) { + InstTxt = Duplex.first; } - OS << Preamble; - OS << Inst; + if (!InstTxt.empty()) + OS << Indent << InstTxt << Separator; HeadTail = HeadTail.second.split('\n'); - Preamble = ""; - Separator = "\n\t\t"; } - if(HexagonMCInstrInfo::bundleSize(Inst) != 0) - OS << "\n\t}" << PacketBundle.second; + OS << "\t}" << PacketBundle.second; } }; } @@ -154,9 +164,9 @@ static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - // For the time being, use static relocations, since there's really no - // support for PIC yet. 
- X->initMCCodeGenInfo(Reloc::Static, CM, OL); + if (RM == Reloc::Default) + RM = Reloc::Static; + X->initMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index cb62650..a005a01 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -16,6 +16,8 @@ #include <cstdint> +#include "llvm/Support/CommandLine.h" + namespace llvm { struct InstrItinerary; struct InstrStage; @@ -33,22 +35,27 @@ class raw_ostream; class raw_pwrite_stream; extern Target TheHexagonTarget; - +extern cl::opt<bool> HexagonDisableCompound; +extern cl::opt<bool> HexagonDisableDuplex; extern const InstrStage HexagonStages[]; MCInstrInfo *createHexagonMCInstrInfo(); -MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII, - MCRegisterInfo const &MRI, +MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, MCContext &MCT); -MCAsmBackend *createHexagonAsmBackend(Target const &T, - MCRegisterInfo const &MRI, +MCAsmBackend *createHexagonAsmBackend(const Target &T, + const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU); MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, StringRef CPU); +namespace HEXAGON_MC { + StringRef selectHexagonCPU(const Triple &TT, StringRef CPU); +} + } // End llvm namespace // Define symbolic names for Hexagon registers. This defines a mapping from diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 41112ac..4e1cce3 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -27,6 +27,7 @@ using namespace llvm; +namespace { // Insn shuffling priority. 
class HexagonBid { // The priority is directly proportional to how restricted the insn is based @@ -75,6 +76,7 @@ public: return false; }; }; +} // end anonymous namespace unsigned HexagonResource::setWeight(unsigned s) { const unsigned SlotWeight = 8; @@ -93,10 +95,57 @@ unsigned HexagonResource::setWeight(unsigned s) { return (Weight); } +void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) { + (*TUL)[HexagonII::TypeCVI_VA] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2); + (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VM_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VM_ST] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4); +} + +HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL, + MCInstrInfo const &MCII, unsigned s, + MCInst const *id) + : HexagonResource(s), TUL(TUL) { + unsigned T = HexagonMCInstrInfo::getType(MCII, *id); + + if (TUL->count(T)) { + // For an HVX insn. + Valid = true; + setUnits((*TUL)[T].first); + setLanes((*TUL)[T].second); + setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad()); + setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore()); + } else { + // For core insns. + Valid = false; + setUnits(0); + setLanes(0); + setLoad(false); + setStore(false); + } +} + HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI) : MCII(MCII), STI(STI) { reset(); + HexagonCVIResource::SetupTUL(&TUL, STI.getCPU()); } void HexagonShuffler::reset() { @@ -107,7 +156,7 @@ void HexagonShuffler::reset() { void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender, unsigned S, bool X) { - HexagonInstr PI(ID, Extender, S, X); + HexagonInstr PI(&TUL, MCII, ID, Extender, S, X); Packet.push_back(PI); } @@ -126,6 +175,8 @@ bool HexagonShuffler::check() { // Number of memory operations, loads, solo loads, stores, solo stores, single // stores. unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0; + // Number of HVX loads, HVX stores. + unsigned CVIloads = 0, CVIstores = 0; // Number of duplex insns, solo insns. 
unsigned duplex = 0, solo = 0; // Number of insns restricting other insns in the packet to A and X types, @@ -168,6 +219,12 @@ bool HexagonShuffler::check() { case HexagonII::TypeJ: ++jumps; break; + case HexagonII::TypeCVI_VM_VP_LDU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_LD: + case HexagonII::TypeCVI_VM_TMP_LD: + case HexagonII::TypeCVI_VM_CUR_LD: + ++CVIloads; case HexagonII::TypeLD: ++loads; ++memory; @@ -176,6 +233,11 @@ bool HexagonShuffler::check() { if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn()) ++jumps, ++jump1; // DEALLOC_RETURN is of type LD. break; + case HexagonII::TypeCVI_VM_STU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_ST: + case HexagonII::TypeCVI_VM_NEW_ST: + ++CVIstores; case HexagonII::TypeST: ++stores; ++memory; @@ -203,9 +265,9 @@ bool HexagonShuffler::check() { } // Check if the packet is legal. - if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory)) || - (solo && size() > 1) || (onlyAX && neitherAnorX > 1) || - (onlyAX && xtypeFloat)) { + if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) || + (duplex > 1 || (duplex && memory)) || (solo && size() > 1) || + (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) { Error = SHUFFLE_ERROR_INVALID; return false; } @@ -336,6 +398,19 @@ bool HexagonShuffler::check() { return false; } } + // Verify the CVI slot subscriptions. + { + HexagonUnitAuction AuctionCVI; + + std::sort(begin(), end(), HexagonInstr::lessCVI); + + for (iterator I = begin(); I != end(); ++I) + for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid? + if (!AuctionCVI.bid(I->CVI.getUnits() << i)) { + Error = SHUFFLE_ERROR_SLOTS; + return false; + } + } Error = SHUFFLE_SUCCESS; return true; diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 8b6c72e..a093f85 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" using namespace llvm; @@ -51,6 +52,46 @@ public: }; }; +// HVX insn resources. +class HexagonCVIResource : public HexagonResource { +public: + typedef std::pair<unsigned, unsigned> UnitsAndLanes; + typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes; + +private: + // Available HVX slots. + enum { + CVI_NONE = 0, + CVI_XLANE = 1 << 0, + CVI_SHIFT = 1 << 1, + CVI_MPY0 = 1 << 2, + CVI_MPY1 = 1 << 3 + }; + + TypeUnitsAndLanes *TUL; + + // Count of adjacent slots that the insn requires to be executed. + unsigned Lanes; + // Flag whether the insn is a load or a store. + bool Load, Store; + // Flag whether the HVX resources are valid. + bool Valid; + + void setLanes(unsigned l) { Lanes = l; }; + void setLoad(bool f = true) { Load = f; }; + void setStore(bool f = true) { Store = f; }; + +public: + HexagonCVIResource(TypeUnitsAndLanes *TUL, MCInstrInfo const &MCII, + unsigned s, MCInst const *id); + static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU); + + bool isValid() const { return (Valid); }; + unsigned getLanes() const { return (Lanes); }; + bool mayLoad() const { return (Load); }; + bool mayStore() const { return (Store); }; +}; + // Handle to an insn used by the shuffling algorithm. 
class HexagonInstr { friend class HexagonShuffler; @@ -58,12 +99,15 @@ class HexagonInstr { MCInst const *ID; MCInst const *Extender; HexagonResource Core; + HexagonCVIResource CVI; bool SoloException; public: - HexagonInstr(MCInst const *id, MCInst const *Extender, unsigned s, - bool x = false) - : ID(id), Extender(Extender), Core(s), SoloException(x){}; + HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T, + MCInstrInfo const &MCII, MCInst const *id, + MCInst const *Extender, unsigned s, bool x = false) + : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id), + SoloException(x) {}; MCInst const *getDesc() const { return (ID); }; @@ -79,6 +123,10 @@ public: static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) { return (HexagonResource::lessUnits(A.Core, B.Core)); }; + // Check if the handles are in ascending order by HVX slots. + static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) { + return (HexagonResource::lessUnits(A.CVI, B.CVI)); + }; }; // Bundle shuffler. @@ -92,6 +140,8 @@ class HexagonShuffler { // Shuffling error code. unsigned Error; + HexagonCVIResource::TypeUnitsAndLanes TUL; + protected: int64_t BundleFlags; MCInstrInfo const &MCII; @@ -108,6 +158,8 @@ public: SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns. SHUFFLE_ERROR_NOSLOTS, ///< No free slots for other insns. SHUFFLE_ERROR_SLOTS, ///< Over-subscribed slots. + SHUFFLE_ERROR_ERRATA2, ///< Errata violation (v60). + SHUFFLE_ERROR_STORE_LOAD_CONFLICT, ///< store/load conflict SHUFFLE_ERROR_UNKNOWN ///< Unknown error. }; diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp new file mode 100644 index 0000000..c547c71 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -0,0 +1,180 @@ +//===--- RDFCopy.cpp ------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Simplistic RDF-based copy propagation. + +#include "RDFCopy.h" +#include "RDFGraph.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/CommandLine.h" + +#include <atomic> + +#ifndef NDEBUG +static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden); +static unsigned CpCount = 0; +#endif + +using namespace llvm; +using namespace rdf; + +void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI) { + assert(MI->getOpcode() == TargetOpcode::COPY); + const MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + RegisterRef DstR = { Op0.getReg(), Op0.getSubReg() }; + RegisterRef SrcR = { Op1.getReg(), Op1.getSubReg() }; + auto FS = DefM.find(SrcR); + if (FS == DefM.end() || FS->second.empty()) + return; + Copies.push_back(SA.Id); + RDefMap[SrcR][SA.Id] = FS->second.top()->Id; + // Insert DstR into the map. 
+ RDefMap[DstR]; +} + + +void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) { + RegisterSet RRs; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + RRs.insert(RA.Addr->getRegRef()); + bool Common = false; + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + Common = true; + break; + } + if (!Common) + return; + + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + auto F = DefM.find(R.first); + if (F == DefM.end() || F->second.empty()) + continue; + R.second[IA.Id] = F->second.top()->Id; + } +} + + +bool CopyPropagation::scanBlock(MachineBasicBlock *B) { + bool Changed = false; + auto BA = DFG.getFunc().Addr->findBlock(B, DFG); + DFG.markBlock(BA.Id, DefM); + + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) { + NodeAddr<StmtNode*> SA = IA; + MachineInstr *MI = SA.Addr->getCode(); + if (MI->isCopy()) + recordCopy(SA, MI); + } + + updateMap(IA); + DFG.pushDefs(IA, DefM); + } + + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) + Changed |= scanBlock(I->getBlock()); + + DFG.releaseBlock(BA.Id, DefM); + return Changed; +} + + +bool CopyPropagation::run() { + scanBlock(&DFG.getMF().front()); + + if (trace()) { + dbgs() << "Copies:\n"; + for (auto I : Copies) + dbgs() << *DFG.addr<StmtNode*>(I).Addr->getCode(); + dbgs() << "\nRDef map:\n"; + for (auto R : RDefMap) { + dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {"; + for (auto &M : R.second) + dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':' + << Print<NodeId>(M.second, DFG); + dbgs() << " }\n"; + } + } + + bool Changed = false; + NodeSet Deleted; +#ifndef NDEBUG + bool HasLimit = CpLimit.getNumOccurrences() > 0; +#endif + + for (auto I : Copies) { +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; +#endif + if (Deleted.count(I)) + continue; + auto SA = DFG.addr<InstrNode*>(I); + NodeList Ds = SA.Addr->members_if(DFG.IsDef, DFG); + if (Ds.size() != 1) + continue; + NodeAddr<DefNode*> DA = Ds[0]; + RegisterRef DR0 = DA.Addr->getRegRef(); + NodeList Us = SA.Addr->members_if(DFG.IsUse, DFG); + if (Us.size() != 1) + continue; + NodeAddr<UseNode*> UA0 = Us[0]; + RegisterRef UR0 = UA0.Addr->getRegRef(); + NodeId RD0 = UA0.Addr->getReachingDef(); + + for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) { + auto UA = DFG.addr<UseNode*>(N); + NextN = UA.Addr->getSibling(); + uint16_t F = UA.Addr->getFlags(); + if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed)) + continue; + if (UA.Addr->getRegRef() != DR0) + continue; + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG); + assert(DFG.IsCode<NodeAttrs::Stmt>(IA)); + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (RDefMap[UR0][IA.Id] != RD0) + continue; + MachineOperand &Op = UA.Addr->getOp(); + if (Op.isTied()) + continue; + if (trace()) { + dbgs() << "can replace " << Print<RegisterRef>(DR0, DFG) + << " with " << Print<RegisterRef>(UR0, DFG) << " in " + << *NodeAddr<StmtNode*>(IA).Addr->getCode(); + } + + Op.setReg(UR0.Reg); + Op.setSubReg(UR0.Sub); + Changed = true; +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; + CpCount++; +#endif + + if (MI->isCopy()) { + MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + if (Op0.getReg() == Op1.getReg() && Op0.getSubReg() == Op1.getSubReg()) + MI->eraseFromParent(); + Deleted.insert(IA.Id); + } + } + } + + return Changed; +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h new file mode 100644 index 0000000..02531b9 --- /dev/null 
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h @@ -0,0 +1,48 @@ +//===--- RDFCopy.h --------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef RDF_COPY_H +#define RDF_COPY_H + +#include "RDFGraph.h" +#include <map> +#include <vector> + +namespace llvm { + class MachineBasicBlock; + class MachineDominatorTree; + class MachineInstr; +} + +namespace rdf { + struct CopyPropagation { + CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg), + Trace(false) {} + + bool run(); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + private: + const MachineDominatorTree &MDT; + DataFlowGraph &DFG; + DataFlowGraph::DefStackMap DefM; + bool Trace; + + // map: register -> (map: stmt -> reaching def) + std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap; + std::vector<NodeId> Copies; + + void recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI); + void updateMap(NodeAddr<InstrNode*> IA); + bool scanBlock(MachineBasicBlock *B); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp new file mode 100644 index 0000000..9566857 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp @@ -0,0 +1,204 @@ +//===--- RDFDeadCode.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "RDFDeadCode.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Check if the given instruction has observable side-effects, i.e. if +// it should be considered "live". It is safe for this function to be +// overly conservative (i.e. return "true" for all instructions), but it +// is not safe to return "false" for an instruction that should not be +// considered removable. 
+bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const { + if (MI->mayStore() || MI->isBranch() || MI->isCall() || MI->isReturn()) + return true; + if (MI->hasOrderedMemoryRef() || MI->hasUnmodeledSideEffects()) + return true; + if (MI->isPHI()) + return false; + for (auto &Op : MI->operands()) + if (Op.isReg() && MRI.isReserved(Op.getReg())) + return true; + return false; +} + +void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA, + SetVector<NodeId> &WorkQ) { + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + return; + if (!isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + return; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) { + if (!LiveNodes.count(RA.Id)) + WorkQ.insert(RA.Id); + } +} + +void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA, + SetVector<NodeId> &WorkQ) { + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + if (!LiveNodes.count(UA.Id)) + WorkQ.insert(UA.Id); + } + for (NodeAddr<DefNode*> TA : DFG.getRelatedRefs(IA, DA)) + LiveNodes.insert(TA.Id); +} + +void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA, + SetVector<NodeId> &WorkQ) { + for (NodeAddr<DefNode*> DA : LV.getAllReachingDefs(UA)) { + if (!LiveNodes.count(DA.Id)) + WorkQ.insert(DA.Id); + } +} + +// Traverse the DFG and collect the set dead RefNodes and the set of +// dead instructions. Return "true" if any of these sets is non-empty, +// "false" otherwise. +bool DeadCodeElimination::collect() { + // This function works by first finding all live nodes. The dead nodes + // are then the complement of the set of live nodes. + // + // Assume that all nodes are dead. Identify instructions which must be + // considered live, i.e. instructions with observable side-effects, such + // as calls and stores. All arguments of such instructions are considered + // live. For each live def, all operands used in the corresponding + // instruction are considered live. For each live use, all its reaching + // defs are considered live. + LiveNodes.clear(); + SetVector<NodeId> WorkQ; + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) + scanInstr(IA, WorkQ); + + while (!WorkQ.empty()) { + NodeId N = *WorkQ.begin(); + WorkQ.remove(N); + LiveNodes.insert(N); + auto RA = DFG.addr<RefNode*>(N); + if (DFG.IsDef(RA)) + processDef(RA, WorkQ); + else + processUse(RA, WorkQ); + } + + if (trace()) { + dbgs() << "Live nodes:\n"; + for (NodeId N : LiveNodes) { + auto RA = DFG.addr<RefNode*>(N); + dbgs() << PrintNode<RefNode*>(RA, DFG) << "\n"; + } + } + + auto IsDead = [this] (NodeAddr<InstrNode*> IA) -> bool { + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) + if (LiveNodes.count(DA.Id)) + return false; + return true; + }; + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + if (!LiveNodes.count(RA.Id)) + DeadNodes.insert(RA.Id); + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) + if (isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + continue; + if (IsDead(IA)) { + DeadInstrs.insert(IA.Id); + if (trace()) + dbgs() << "Dead instr: " << PrintNode<InstrNode*>(IA, DFG) << "\n"; + } + } + } + + return !DeadNodes.empty(); +} + +// Erase the nodes given in the Nodes set from DFG. 
In addition to removing +// them from the DFG, if a node corresponds to a statement, the corresponding +// machine instruction is erased from the function. +bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) { + if (Nodes.empty()) + return false; + + // Prepare the actual set of ref nodes to remove: ref nodes from Nodes + // are included directly, for each InstrNode in Nodes, include the set + // of all RefNodes from it. + NodeList DRNs, DINs; + for (auto I : Nodes) { + auto BA = DFG.addr<NodeBase*>(I); + uint16_t Type = BA.Addr->getType(); + if (Type == NodeAttrs::Ref) { + DRNs.push_back(DFG.addr<RefNode*>(I)); + continue; + } + + // If it's a code node, add all ref nodes from it. + uint16_t Kind = BA.Addr->getKind(); + if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) { + for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG)) + DRNs.push_back(N); + DINs.push_back(DFG.addr<InstrNode*>(I)); + } else { + llvm_unreachable("Unexpected code node"); + return false; + } + } + + // Sort the list so that use nodes are removed first. This makes the + // "unlink" functions a bit faster. + auto UsesFirst = [] (NodeAddr<RefNode*> A, NodeAddr<RefNode*> B) -> bool { + uint16_t KindA = A.Addr->getKind(), KindB = B.Addr->getKind(); + if (KindA == NodeAttrs::Use && KindB == NodeAttrs::Def) + return true; + if (KindA == NodeAttrs::Def && KindB == NodeAttrs::Use) + return false; + return A.Id < B.Id; + }; + std::sort(DRNs.begin(), DRNs.end(), UsesFirst); + + if (trace()) + dbgs() << "Removing dead ref nodes:\n"; + for (NodeAddr<RefNode*> RA : DRNs) { + if (trace()) + dbgs() << " " << PrintNode<RefNode*>(RA, DFG) << '\n'; + if (DFG.IsUse(RA)) + DFG.unlinkUse(RA); + else if (DFG.IsDef(RA)) + DFG.unlinkDef(RA); + } + + // Now, remove all dead instruction nodes. + for (NodeAddr<InstrNode*> IA : DINs) { + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + BA.Addr->removeMember(IA, DFG); + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + continue; + + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (trace()) + dbgs() << "erasing: " << *MI; + MI->eraseFromParent(); + } + return true; +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h new file mode 100644 index 0000000..f4373fb --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h @@ -0,0 +1,65 @@ +//===--- RDFDeadCode.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. +// +// The main interface of this class are functions "collect" and "erase". +// This allows custom processing of the function being optimized by a +// particular consumer. The simplest way to use this class would be to +// instantiate an object, and then simply call "collect" and "erase", +// passing the result of "getDeadInstrs()" to it. +// A more complex scenario would be to call "collect" first, then visit +// all post-increment instructions to see if the address update is dead +// or not, and if it is, convert the instruction to a non-updating form. +// After that "erase" can be called with the set of nodes including both, +// dead defs from the updating instructions and the nodes corresponding +// to the dead instructions. 
+ +#ifndef RDF_DEADCODE_H +#define RDF_DEADCODE_H + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" + +namespace llvm { + class MachineRegisterInfo; +} + +namespace rdf { + struct DeadCodeElimination { + DeadCodeElimination(DataFlowGraph &dfg, MachineRegisterInfo &mri) + : Trace(false), DFG(dfg), MRI(mri), LV(mri, dfg) {} + + bool collect(); + bool erase(const SetVector<NodeId> &Nodes); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + SetVector<NodeId> getDeadNodes() { return DeadNodes; } + SetVector<NodeId> getDeadInstrs() { return DeadInstrs; } + DataFlowGraph &getDFG() { return DFG; } + + private: + bool Trace; + SetVector<NodeId> LiveNodes; + SetVector<NodeId> DeadNodes; + SetVector<NodeId> DeadInstrs; + DataFlowGraph &DFG; + MachineRegisterInfo &MRI; + Liveness LV; + + bool isLiveInstr(const MachineInstr *MI) const; + void scanInstr(NodeAddr<InstrNode*> IA, SetVector<NodeId> &WorkQ); + void processDef(NodeAddr<DefNode*> DA, SetVector<NodeId> &WorkQ); + void processUse(NodeAddr<UseNode*> UA, SetVector<NodeId> &WorkQ); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp new file mode 100644 index 0000000..9b47422 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp @@ -0,0 +1,1716 @@ +//===--- RDFGraph.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF). +// +#include "RDFGraph.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Printing functions. Have them here first, so that the rest of the code +// can use them. 
+namespace rdf { + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) { + auto &TRI = P.G.getTRI(); + if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs()) + OS << TRI.getName(P.Obj.Reg); + else + OS << '#' << P.Obj.Reg; + if (P.Obj.Sub > 0) { + OS << ':'; + if (P.Obj.Sub < TRI.getNumSubRegIndices()) + OS << TRI.getSubRegIndexName(P.Obj.Sub); + else + OS << '#' << P.Obj.Sub; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) { + auto NA = P.G.addr<NodeBase*>(P.Obj); + uint16_t Attrs = NA.Addr->getAttrs(); + uint16_t Kind = NodeAttrs::kind(Attrs); + uint16_t Flags = NodeAttrs::flags(Attrs); + switch (NodeAttrs::type(Attrs)) { + case NodeAttrs::Code: + switch (Kind) { + case NodeAttrs::Func: OS << 'f'; break; + case NodeAttrs::Block: OS << 'b'; break; + case NodeAttrs::Stmt: OS << 's'; break; + case NodeAttrs::Phi: OS << 'p'; break; + default: OS << "c?"; break; + } + break; + case NodeAttrs::Ref: + if (Flags & NodeAttrs::Preserving) + OS << '+'; + if (Flags & NodeAttrs::Clobbering) + OS << '~'; + switch (Kind) { + case NodeAttrs::Use: OS << 'u'; break; + case NodeAttrs::Def: OS << 'd'; break; + case NodeAttrs::Block: OS << 'b'; break; + default: OS << "r?"; break; + } + break; + default: + OS << '?'; + break; + } + OS << P.Obj; + if (Flags & NodeAttrs::Shadow) + OS << '"'; + return OS; +} + +namespace { + void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA, + const DataFlowGraph &G) { + OS << Print<NodeId>(RA.Id, G) << '<' + << Print<RegisterRef>(RA.Addr->getRegRef(), G) << '>'; + if (RA.Addr->getFlags() & NodeAttrs::Fixed) + OS << '!'; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedUse()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<PhiUseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getPredecessor()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Def: + OS << PrintNode<DefNode*>(P.Obj, P.G); + break; + case NodeAttrs::Use: + if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef) + OS << PrintNode<PhiUseNode*>(P.Obj, P.G); + else + OS << PrintNode<UseNode*>(P.Obj, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I.Id, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +template<> +raw_ostream 
&operator<< (raw_ostream &OS, const Print<NodeSet> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +namespace { + template <typename T> + struct PrintListV { + PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {} + typedef T Type; + const NodeList &List; + const DataFlowGraph &G; + }; + + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) { + unsigned N = P.List.size(); + for (NodeAddr<T> A : P.List) { + OS << PrintNode<T>(A, P.G); + if (--N) + OS << ", "; + } + return OS; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) { + OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi [" + << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<StmtNode*>> &P) { + unsigned Opc = P.Obj.Addr->getCode()->getOpcode(); + OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc) + << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<InstrNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Phi: + OS << PrintNode<PhiNode*>(P.Obj, P.G); + break; + case NodeAttrs::Stmt: + OS << PrintNode<StmtNode*>(P.Obj, P.G); + break; + default: + OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<BlockNode*>> &P) { + auto *BB = P.Obj.Addr->getCode(); + unsigned NP = BB->pred_size(); + std::vector<int> Ns; + auto PrintBBs = [&OS,&P] (std::vector<int> Ns) -> void { + unsigned N = Ns.size(); + for (auto I : Ns) { + OS << "BB#" << I; + if (--N) + OS << ", "; + } + }; + + OS << Print<NodeId>(P.Obj.Id, P.G) << ": === BB#" << BB->getNumber() + << " === preds(" << NP << "): "; + for (auto I : BB->predecessors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + + unsigned NS = BB->succ_size(); + OS << " succs(" << NS << "): "; + Ns.clear(); + for (auto I : BB->successors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + OS << '\n'; + + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<InstrNode*>(I, P.G) << '\n'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<FuncNode*>> &P) { + OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: " + << P.Obj.Addr->getCode()->getName() << '\n'; + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<BlockNode*>(I, P.G) << '\n'; + OS << "]\n"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) { + OS << '{'; + for (auto I : P.Obj) + OS << ' ' << Print<RegisterRef>(I, P.G); + OS << " }"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<DataFlowGraph::DefStack> &P) { + for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) { + OS << Print<NodeId>(I->Id, P.G) + << '<' << Print<RegisterRef>(I->Addr->getRegRef(), P.G) << '>'; + I.down(); + if (I != E) + OS << ' '; + } + return OS; +} + +} // namespace rdf + +// Node allocation functions. +// +// Node allocator is like a slab memory allocator: it allocates blocks of +// memory in sizes that are multiples of the size of a node. Each block has +// the same size. 
Nodes are allocated from the currently active block, and +// when it becomes full, a new one is created. +// There is a mapping scheme between node id and its location in a block, +// and within that block is described in the header file. +// +void NodeAllocator::startNewBlock() { + void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize); + char *P = static_cast<char*>(T); + Blocks.push_back(P); + // Check if the block index is still within the allowed range, i.e. less + // than 2^N, where N is the number of bits in NodeId for the block index. + // BitsPerIndex is the number of bits per node index. + assert((Blocks.size() < (1U << (8*sizeof(NodeId)-BitsPerIndex))) && + "Out of bits for block index"); + ActiveEnd = P; +} + +bool NodeAllocator::needNewBlock() { + if (Blocks.empty()) + return true; + + char *ActiveBegin = Blocks.back(); + uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize; + return Index >= NodesPerBlock; +} + +NodeAddr<NodeBase*> NodeAllocator::New() { + if (needNewBlock()) + startNewBlock(); + + uint32_t ActiveB = Blocks.size()-1; + uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize; + NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd), + makeId(ActiveB, Index) }; + ActiveEnd += NodeMemSize; + return NA; +} + +NodeId NodeAllocator::id(const NodeBase *P) const { + uintptr_t A = reinterpret_cast<uintptr_t>(P); + for (unsigned i = 0, n = Blocks.size(); i != n; ++i) { + uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]); + if (A < B || A >= B + NodesPerBlock*NodeMemSize) + continue; + uint32_t Idx = (A-B)/NodeMemSize; + return makeId(i, Idx); + } + llvm_unreachable("Invalid node address"); +} + +void NodeAllocator::clear() { + MemPool.Reset(); + Blocks.clear(); + ActiveEnd = nullptr; +} + + +// Insert node NA after "this" in the circular chain. +void NodeBase::append(NodeAddr<NodeBase*> NA) { + NodeId Nx = Next; + // If NA is already "next", do nothing. + if (Next != NA.Id) { + Next = NA.Id; + NA.Addr->Next = Nx; + } +} + + +// Fundamental node manipulator functions. + +// Obtain the register reference from a reference node. +RegisterRef RefNode::getRegRef() const { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef) + return Ref.RR; + assert(Ref.Op != nullptr); + return { Ref.Op->getReg(), Ref.Op->getSubReg() }; +} + +// Set the register reference in the reference node directly (for references +// in phi nodes). +void RefNode::setRegRef(RegisterRef RR) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef); + Ref.RR = RR; +} + +// Set the register reference in the reference node based on a machine +// operand (for references in statement nodes). +void RefNode::setRegRef(MachineOperand *Op) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)); + Ref.Op = Op; +} + +// Get the owner of a given reference node. +NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Code) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Connect the def node to the reaching def node. +void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedDef(); + DA.Addr->setReachedDef(Self); +} + +// Connect the use node to the reaching def node. 
+void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedUse(); + DA.Addr->setReachedUse(Self); +} + +// Get the first member of the code node. +NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const { + if (Code.FirstM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.FirstM); +} + +// Get the last member of the code node. +NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const { + if (Code.LastM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.LastM); +} + +// Add node NA at the end of the member list of the given code node. +void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto ML = getLastMember(G); + if (ML.Id != 0) { + ML.Addr->append(NA); + } else { + Code.FirstM = NA.Id; + NodeId Self = G.id(this); + NA.Addr->setNext(Self); + } + Code.LastM = NA.Id; +} + +// Add node NA after member node MA in the given code node. +void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G) { + MA.Addr->append(NA); + if (Code.LastM == MA.Id) + Code.LastM = NA.Id; +} + +// Remove member node NA from the given code node. +void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto MA = getFirstMember(G); + assert(MA.Id != 0); + + // Special handling if the member to remove is the first member. + if (MA.Id == NA.Id) { + if (Code.LastM == MA.Id) { + // If it is the only member, set both first and last to 0. + Code.FirstM = Code.LastM = 0; + } else { + // Otherwise, advance the first member. + Code.FirstM = MA.Addr->getNext(); + } + return; + } + + while (MA.Addr != this) { + NodeId MX = MA.Addr->getNext(); + if (MX == NA.Id) { + MA.Addr->setNext(NA.Addr->getNext()); + // If the member to remove happens to be the last one, update the + // LastM indicator. + if (Code.LastM == NA.Id) + Code.LastM = MA.Id; + return; + } + MA = G.addr<NodeBase*>(MX); + } + llvm_unreachable("No such member"); +} + +// Return the list of all members of the code node. +NodeList CodeNode::members(const DataFlowGraph &G) const { + static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; }; + return members_if(True, G); +} + +// Return the owner of the given instr node. +NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + assert(NA.Addr->getType() == NodeAttrs::Code); + if (NA.Addr->getKind() == NodeAttrs::Block) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Add the phi node PA to the given block node. +void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) { + auto M = getFirstMember(G); + if (M.Id == 0) { + addMember(PA, G); + return; + } + + assert(M.Addr->getType() == NodeAttrs::Code); + if (M.Addr->getKind() == NodeAttrs::Stmt) { + // If the first member of the block is a statement, insert the phi as + // the first member. + Code.FirstM = PA.Id; + PA.Addr->setNext(M.Id); + } else { + // If the first member is a phi, find the last phi, and append PA to it. + assert(M.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<NodeBase*> MN = M; + do { + M = MN; + MN = G.addr<NodeBase*>(M.Addr->getNext()); + assert(MN.Addr->getType() == NodeAttrs::Code); + } while (MN.Addr->getKind() == NodeAttrs::Phi); + + // M is the last phi. 
+ addMemberAfter(M, PA, G); + } +} + +// Find the block node corresponding to the machine basic block BB in the +// given func node. +NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const { + auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool { + return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB; + }; + NodeList Ms = members_if(EqBB, G); + if (!Ms.empty()) + return Ms[0]; + return NodeAddr<BlockNode*>(); +} + +// Get the block node for the entry block in the given function. +NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) { + MachineBasicBlock *EntryB = &getCode()->front(); + return findBlock(EntryB, G); +} + + +// Register aliasing information. +// +// In theory, the lane information could be used to determine register +// covering (and aliasing), but depending on the sub-register structure, +// the lane mask information may be missing. The covering information +// must be available for this framework to work, so relying solely on +// the lane data is not sufficient. + +// Determine whether RA covers RB. +bool RegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + if (TargetRegisterInfo::isVirtualRegister(RA.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RB.Reg)); + if (RA.Reg != RB.Reg) + return false; + if (RA.Sub == 0) + return true; + return TRI.composeSubRegIndices(RA.Sub, RB.Sub) == RA.Sub; + } + + assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg) && + TargetRegisterInfo::isPhysicalRegister(RB.Reg)); + unsigned A = RA.Sub != 0 ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub != 0 ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + return TRI.isSubRegister(A, B); +} + +// Determine whether RR is covered by the set of references RRs. +bool RegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) const { + if (RRs.count(RR)) + return true; + + // For virtual registers, we cannot accurately determine covering based + // on subregisters. If RR itself is not present in RRs, but it has a sub- + // register reference, check for the super-register alone. Otherwise, + // assume non-covering. + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + if (RR.Sub != 0) + return RRs.count({RR.Reg, 0}); + return false; + } + + // If any super-register of RR is present, then RR is covered. + unsigned Reg = RR.Sub == 0 ? RR.Reg : TRI.getSubReg(RR.Reg, RR.Sub); + for (MCSuperRegIterator SR(Reg, &TRI); SR.isValid(); ++SR) + if (RRs.count({*SR, 0})) + return true; + + return false; +} + +// Get the list of references aliased to RR. +std::vector<RegisterRef> RegisterAliasInfo::getAliasSet(RegisterRef RR) const { + // Do not include RR in the alias set. For virtual registers return an + // empty set. + std::vector<RegisterRef> AS; + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) + return AS; + assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); + unsigned R = RR.Reg; + if (RR.Sub) + R = TRI.getSubReg(RR.Reg, RR.Sub); + + for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI) + AS.push_back(RegisterRef({*AI, 0})); + return AS; +} + +// Check whether RA and RB are aliased. 
+bool RegisterAliasInfo::alias(RegisterRef RA, RegisterRef RB) const { + bool VirtA = TargetRegisterInfo::isVirtualRegister(RA.Reg); + bool VirtB = TargetRegisterInfo::isVirtualRegister(RB.Reg); + bool PhysA = TargetRegisterInfo::isPhysicalRegister(RA.Reg); + bool PhysB = TargetRegisterInfo::isPhysicalRegister(RB.Reg); + + if (VirtA != VirtB) + return false; + + if (VirtA) { + if (RA.Reg != RB.Reg) + return false; + // RA and RB refer to the same register. If any of them refer to the + // whole register, they must be aliased. + if (RA.Sub == 0 || RB.Sub == 0) + return true; + unsigned SA = TRI.getSubRegIdxSize(RA.Sub); + unsigned OA = TRI.getSubRegIdxOffset(RA.Sub); + unsigned SB = TRI.getSubRegIdxSize(RB.Sub); + unsigned OB = TRI.getSubRegIdxOffset(RB.Sub); + if (OA <= OB && OA+SA > OB) + return true; + if (OB <= OA && OB+SB > OA) + return true; + return false; + } + + assert(PhysA && PhysB); + (void)PhysA, (void)PhysB; + unsigned A = RA.Sub ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + for (MCRegAliasIterator I(A, &TRI, true); I.isValid(); ++I) + if (B == *I) + return true; + return false; +} + + +// Target operand information. +// + +// For a given instruction, check if there are any bits of RR that can remain +// unchanged across this def. +bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum) + const { + return TII.isPredicated(&In); +} + +// Check if the definition of RR produces an unspecified value. +bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall()) + if (In.getOperand(OpNum).isImplicit()) + return true; + return false; +} + +// Check if the given instruction specifically requires +bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall() || In.isReturn()) + return true; + const MCInstrDesc &D = In.getDesc(); + if (!D.getImplicitDefs() && !D.getImplicitUses()) + return false; + const MachineOperand &Op = In.getOperand(OpNum); + // If there is a sub-register, treat the operand as non-fixed. Currently, + // fixed registers are those that are listed in the descriptor as implicit + // uses or defs, and those lists do not allow sub-registers. + if (Op.getSubReg() != 0) + return false; + unsigned Reg = Op.getReg(); + const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs() + : D.getImplicitUses(); + if (!ImpR) + return false; + while (*ImpR) + if (*ImpR++ == Reg) + return true; + return false; +} + + +// +// The data flow graph construction. +// + +DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi) + : TimeG("rdf"), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), RAI(rai), + TOI(toi) { +} + + +// The implementation of the definition stack. +// Each register reference has its own definition stack. In particular, +// for a register references "Reg" and "Reg:subreg" will each have their +// own definition stacks. + +// Construct a stack iterator. +DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S, + bool Top) : DS(S) { + if (!Top) { + // Initialize to bottom. + Pos = 0; + return; + } + // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty). 
+
+// The implementation of the definition stack.
+// Each register reference has its own definition stack. In particular,
+// the references "Reg" and "Reg:subreg" will each have their own
+// definition stacks.
+
+// Construct a stack iterator.
+DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S,
+      bool Top) : DS(S) {
+  if (!Top) {
+    // Initialize to bottom.
+    Pos = 0;
+    return;
+  }
+  // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty).
+  Pos = DS.Stack.size();
+  while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1]))
+    Pos--;
+}
+
+// Return the size of the stack, not counting block delimiters (the
+// iteration from top() to bottom() skips them).
+unsigned DataFlowGraph::DefStack::size() const {
+  unsigned S = 0;
+  for (auto I = top(), E = bottom(); I != E; I.down())
+    S++;
+  return S;
+}
+
+// Remove the top entry from the stack. Remove all intervening delimiters
+// so that after this, the stack is either empty, or the top of the stack
+// is a non-delimiter.
+void DataFlowGraph::DefStack::pop() {
+  assert(!empty());
+  unsigned P = nextDown(Stack.size());
+  Stack.resize(P);
+}
+
+// Push a delimiter for block node N on the stack.
+void DataFlowGraph::DefStack::start_block(NodeId N) {
+  assert(N != 0);
+  Stack.push_back(NodeAddr<DefNode*>(nullptr, N));
+}
+
+// Remove all nodes from the top of the stack, until the delimiter for
+// block node N is encountered. Remove the delimiter as well. In effect,
+// this will remove from the stack all definitions from block N.
+void DataFlowGraph::DefStack::clear_block(NodeId N) {
+  assert(N != 0);
+  unsigned P = Stack.size();
+  while (P > 0) {
+    bool Found = isDelimiter(Stack[P-1], N);
+    P--;
+    if (Found)
+      break;
+  }
+  // This will also remove the delimiter, if found.
+  Stack.resize(P);
+}
+
+// Move the stack iterator up by one.
+unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const {
+  // Get the next valid position after P (skipping all delimiters).
+  // The input position P does not have to point to a non-delimiter.
+  unsigned SS = Stack.size();
+  bool IsDelim;
+  assert(P < SS);
+  do {
+    P++;
+    IsDelim = isDelimiter(Stack[P-1]);
+  } while (P < SS && IsDelim);
+  assert(!IsDelim);
+  return P;
+}
+
+// Move the stack iterator down by one.
+unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
+  // Get the preceding valid position before P (skipping all delimiters).
+  // The input position P does not have to point to a non-delimiter.
+  assert(P > 0 && P <= Stack.size());
+  bool IsDelim = isDelimiter(Stack[P-1]);
+  do {
+    if (--P == 0)
+      break;
+    IsDelim = isDelimiter(Stack[P-1]);
+  } while (P > 0 && IsDelim);
+  assert(!IsDelim);
+  return P;
+}
+
+// Node management functions.
+
+// Get the pointer to the node with the id N.
+NodeBase *DataFlowGraph::ptr(NodeId N) const {
+  if (N == 0)
+    return nullptr;
+  return Memory.ptr(N);
+}
+
+// Get the id of the node at the address P.
+NodeId DataFlowGraph::id(const NodeBase *P) const {
+  if (P == nullptr)
+    return 0;
+  return Memory.id(P);
+}
+
+// Allocate a new node and set the attributes to Attrs.
+NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) {
+  NodeAddr<NodeBase*> P = Memory.New();
+  P.Addr->init();
+  P.Addr->setAttrs(Attrs);
+  return P;
+}
+
+// Make a copy of the given node B, except for the data-flow links, which
+// are set to 0.
+NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
+  NodeAddr<NodeBase*> NA = newNode(0);
+  memcpy(NA.Addr, B.Addr, sizeof(NodeBase));
+  // Ref nodes need to have the data-flow links reset.
+  if (NA.Addr->getType() == NodeAttrs::Ref) {
+    NodeAddr<RefNode*> RA = NA;
+    RA.Addr->setReachingDef(0);
+    RA.Addr->setSibling(0);
+    if (NA.Addr->getKind() == NodeAttrs::Def) {
+      NodeAddr<DefNode*> DA = NA;
+      DA.Addr->setReachedDef(0);
+      DA.Addr->setReachedUse(0);
+    }
+  }
+  return NA;
+}
+
+
+// Allocation routines for specific node types/kinds.
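+// The code variants below (newPhi, newStmt, newBlock) also insert the new
+// node into its owner's member list; for the reference variants (newUse,
+// newPhiUse, newDef) the caller is responsible for calling addMember, as
+// buildStmt and buildPhis do.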
+ +NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + UA.Addr->setRegRef(&Op); + return UA; +} + +NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) { + NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + assert(Flags & NodeAttrs::PhiRef); + PUA.Addr->setRegRef(RR); + PUA.Addr->setPredecessor(PredB.Id); + return PUA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + DA.Addr->setRegRef(&Op); + return DA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + assert(Flags & NodeAttrs::PhiRef); + DA.Addr->setRegRef(RR); + return DA; +} + +NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) { + NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi); + Owner.Addr->addPhi(PA, *this); + return PA; +} + +NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI) { + NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt); + SA.Addr->setCode(MI); + Owner.Addr->addMember(SA, *this); + return SA; +} + +NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB) { + NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block); + BA.Addr->setCode(BB); + Owner.Addr->addMember(BA, *this); + return BA; +} + +NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) { + NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func); + FA.Addr->setCode(MF); + return FA; +} + +// Build the data flow graph. +void DataFlowGraph::build() { + reset(); + Func = newFunc(&MF); + + if (MF.empty()) + return; + + for (auto &B : MF) { + auto BA = newBlock(Func, &B); + for (auto &I : B) { + if (I.isDebugValue()) + continue; + buildStmt(BA, I); + } + } + + // Collect information about block references. + NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this); + BlockRefsMap RefM; + buildBlockRefs(EA, RefM); + + // Add function-entry phi nodes. + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) { + NodeAddr<PhiNode*> PA = newPhi(EA); + RegisterRef RR = { I->first, 0 }; + uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; + NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + PA.Addr->addMember(DA, *this); + } + + // Build a map "PhiM" which will contain, for each block, the set + // of references that will require phi definitions in that block. + BlockRefsMap PhiM; + auto Blocks = Func.Addr->members(*this); + for (NodeAddr<BlockNode*> BA : Blocks) + recordDefsForDF(PhiM, RefM, BA); + for (NodeAddr<BlockNode*> BA : Blocks) + buildPhis(PhiM, RefM, BA); + + // Link all the refs. This will recursively traverse the dominator tree. + DefStackMap DM; + linkBlockRefs(DM, EA); + + // Finally, remove all unused phi nodes. + removeUnusedPhis(); +} + +// For each stack in the map DefM, push the delimiter for block B on it. +void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) { + // Push block delimiters. 
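+  // For example, if DefM currently holds stacks for, say, {R0,0} and {R1,0},
+  // entering block B pushes the delimiter (nullptr, B) onto both stacks, so
+  // that releaseBlock(B, DefM) can later pop everything that B contributed.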
+ for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.start_block(B); +} + +// Remove all definitions coming from block B from each stack in DefM. +void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) { + // Pop all defs from this block from the definition stack. Defs that were + // added to the map during the traversal of instructions will not have a + // delimiter, but for those, the whole stack will be emptied. + for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.clear_block(B); + + // Finally, remove empty stacks from the map. + for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) { + NextI = std::next(I); + // This preserves the validity of iterators other than I. + if (I->second.empty()) + DefM.erase(I); + } +} + +// Push all definitions from the instruction node IA to an appropriate +// stack in DefM. +void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { + NodeList Defs = IA.Addr->members_if(IsDef, *this); + NodeSet Visited; +#ifndef NDEBUG + RegisterSet Defined; +#endif + + // The important objectives of this function are: + // - to be able to handle instructions both while the graph is being + // constructed, and after the graph has been constructed, and + // - maintain proper ordering of definitions on the stack for each + // register reference: + // - if there are two or more related defs in IA (i.e. coming from + // the same machine operand), then only push one def on the stack, + // - if there are multiple unrelated defs of non-overlapping + // subregisters of S, then the stack for S will have both (in an + // unspecified order), but the order does not matter from the data- + // -flow perspective. + + for (NodeAddr<DefNode*> DA : Defs) { + if (Visited.count(DA.Id)) + continue; + NodeList Rel = getRelatedRefs(IA, DA); + NodeAddr<DefNode*> PDA = Rel.front(); + // Push the definition on the stack for the register and all aliases. + RegisterRef RR = PDA.Addr->getRegRef(); +#ifndef NDEBUG + // Assert if the register is defined in two or more unrelated defs. + // This could happen if there are two or more def operands defining it. + if (!Defined.insert(RR).second) { + auto *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + dbgs() << "Multiple definitions of register: " + << Print<RegisterRef>(RR, *this) << " in\n " << *MI + << "in BB#" << MI->getParent()->getNumber() << '\n'; + llvm_unreachable(nullptr); + } +#endif + DefM[RR].push(DA); + for (auto A : RAI.getAliasSet(RR)) { + assert(A != RR); + DefM[A].push(DA); + } + // Mark all the related defs as visited. + for (auto T : Rel) + Visited.insert(T.Id); + } +} + +// Return the list of all reference nodes related to RA, including RA itself. +// See "getNextRelated" for the meaning of a "related reference". +NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + + NodeList Refs; + NodeId Start = RA.Id; + do { + Refs.push_back(RA); + RA = getNextRelated(IA, RA); + } while (RA.Id != 0 && RA.Id != Start); + return Refs; +} + + +// Clear all information in the graph. +void DataFlowGraph::reset() { + Memory.clear(); + Func = NodeAddr<FuncNode*>(); +} + + +// Return the next reference node in the instruction node IA that is related +// to RA. Conceptually, two reference nodes are related if they refer to the +// same instance of a register access, but differ in flags or other minor +// characteristics. Specific examples of related nodes are shadow reference +// nodes. 
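+// For example, a def of a register and its shadow defs within the same
+// statement are all related: they have the same kind and register reference,
+// and (for statements) originate from the same machine operand.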
+// Return the equivalent of nullptr if there are no more related references.
+NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  auto Related = [RA](NodeAddr<RefNode*> TA) -> bool {
+    if (TA.Addr->getKind() != RA.Addr->getKind())
+      return false;
+    if (TA.Addr->getRegRef() != RA.Addr->getRegRef())
+      return false;
+    return true;
+  };
+  auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+    return Related(TA) &&
+           &RA.Addr->getOp() == &TA.Addr->getOp();
+  };
+  auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+    if (!Related(TA))
+      return false;
+    if (TA.Addr->getKind() != NodeAttrs::Use)
+      return true;
+    // For phi uses, compare predecessor blocks.
+    const NodeAddr<const PhiUseNode*> TUA = TA;
+    const NodeAddr<const PhiUseNode*> RUA = RA;
+    return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor();
+  };
+
+  RegisterRef RR = RA.Addr->getRegRef();
+  if (IA.Addr->getKind() == NodeAttrs::Stmt)
+    return RA.Addr->getNextRef(RR, RelatedStmt, true, *this);
+  return RA.Addr->getNextRef(RR, RelatedPhi, true, *this);
+}
+
+// Find the next node related to RA in IA that satisfies condition P.
+// If such a node was found, return a pair where the second element is the
+// located node. If such a node does not exist, return a pair where the
+// first element is the element after which such a node should be inserted,
+// and the second element is a null-address.
+template <typename Predicate>
+std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+      Predicate P) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  NodeAddr<RefNode*> NA;
+  NodeId Start = RA.Id;
+  while (true) {
+    NA = getNextRelated(IA, RA);
+    if (NA.Id == 0 || NA.Id == Start)
+      break;
+    if (P(NA))
+      break;
+    RA = NA;
+  }
+
+  if (NA.Id != 0 && NA.Id != Start)
+    return std::make_pair(RA, NA);
+  return std::make_pair(RA, NodeAddr<RefNode*>());
+}
+
+// Get the next shadow node in IA corresponding to RA, and optionally create
+// such a node if it does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA, bool Create) {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == Flags;
+  };
+  auto Loc = locateNextRef(IA, RA, IsShadow);
+  if (Loc.second.Id != 0 || !Create)
+    return Loc.second;
+
+  // Create a copy of RA and mark it as shadow.
+  NodeAddr<RefNode*> NA = cloneNode(RA);
+  NA.Addr->setFlags(Flags | NodeAttrs::Shadow);
+  IA.Addr->addMemberAfter(Loc.first, NA, *this);
+  return NA;
+}
+
+// Get the next shadow node in IA corresponding to RA. Return null-address
+// if such a node does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+  uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == Flags;
+  };
+  return locateNextRef(IA, RA, IsShadow).second;
+}
+
+// Create a new statement node in the block node BA that corresponds to
+// the machine instruction MI.
+void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
+  auto SA = newStmt(BA, &In);
+
+  // Collect a set of registers that this instruction implicitly uses
+  // or defines.
Implicit operands from an instruction will be ignored + // unless they are listed here. + RegisterSet ImpUses, ImpDefs; + if (const uint16_t *ImpD = In.getDesc().getImplicitDefs()) + while (uint16_t R = *ImpD++) + ImpDefs.insert({R, 0}); + if (const uint16_t *ImpU = In.getDesc().getImplicitUses()) + while (uint16_t R = *ImpU++) + ImpUses.insert({R, 0}); + + bool IsCall = In.isCall(), IsReturn = In.isReturn(); + bool IsPredicated = TII.isPredicated(&In); + unsigned NumOps = In.getNumOperands(); + + // Avoid duplicate implicit defs. This will not detect cases of implicit + // defs that define registers that overlap, but it is not clear how to + // interpret that in the absence of explicit defs. Overlapping explicit + // defs are likely illegal already. + RegisterSet DoneDefs; + // Process explicit defs first. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + // Process implicit defs, skipping those that have already been added + // as explicit. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || !Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + if (!IsCall && !ImpDefs.count(RR)) + continue; + if (DoneDefs.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isUse()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + // Add implicit uses on return and call instructions, and on predicated + // instructions regardless of whether or not they appear in the instruction + // descriptor's list. + bool Implicit = Op.isImplicit(); + bool TakeImplicit = IsReturn || IsCall || IsPredicated; + if (Implicit && !TakeImplicit && !ImpUses.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<UseNode*> UA = newUse(SA, Op, Flags); + SA.Addr->addMember(UA, *this); + } +} + +// Build a map that for each block will have the set of all references from +// that block, and from all blocks dominated by it. 
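+// For example, with a dominator tree where B0's children are B1 and B2,
+// RefM[B0] ends up containing every register reference that appears in B0,
+// B1, or B2 (and in anything those blocks dominate).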
+void DataFlowGraph::buildBlockRefs(NodeAddr<BlockNode*> BA,
+      BlockRefsMap &RefM) {
+  auto &Refs = RefM[BA.Id];
+  MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
+  assert(N);
+  for (auto I : *N) {
+    MachineBasicBlock *SB = I->getBlock();
+    auto SBA = Func.Addr->findBlock(SB, *this);
+    buildBlockRefs(SBA, RefM);
+    const auto &SRs = RefM[SBA.Id];
+    Refs.insert(SRs.begin(), SRs.end());
+  }
+
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
+    for (NodeAddr<RefNode*> RA : IA.Addr->members(*this))
+      Refs.insert(RA.Addr->getRegRef());
+}
+
+// Scan all defs in the block node BA and record in PhiM the locations of
+// phi nodes corresponding to these defs.
+void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+      NodeAddr<BlockNode*> BA) {
+  // Check all defs from block BA and record them in each block in BA's
+  // iterated dominance frontier. This information will later be used to
+  // create phi nodes.
+  MachineBasicBlock *BB = BA.Addr->getCode();
+  assert(BB);
+  auto DFLoc = MDF.find(BB);
+  if (DFLoc == MDF.end() || DFLoc->second.empty())
+    return;
+
+  // Traverse all instructions in the block and collect the set of all
+  // defined references. For each reference there will be a phi created
+  // in the block's iterated dominance frontier.
+  // This is done to make sure that each defined reference gets only one
+  // phi node, even if it is defined multiple times.
+  RegisterSet Defs;
+  for (auto I : BA.Addr->members(*this)) {
+    assert(I.Addr->getType() == NodeAttrs::Code);
+    assert(I.Addr->getKind() == NodeAttrs::Phi ||
+           I.Addr->getKind() == NodeAttrs::Stmt);
+    NodeAddr<InstrNode*> IA = I;
+    for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this))
+      Defs.insert(RA.Addr->getRegRef());
+  }
+
+  // Finally, add the set of defs to each block in the iterated dominance
+  // frontier.
+  const MachineDominanceFrontier::DomSetType &DF = DFLoc->second;
+  SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end());
+  for (unsigned i = 0; i < IDF.size(); ++i) {
+    auto F = MDF.find(IDF[i]);
+    if (F != MDF.end())
+      IDF.insert(F->second.begin(), F->second.end());
+  }
+
+  // Get the register references that are reachable from this block.
+  RegisterSet &Refs = RefM[BA.Id];
+  for (auto DB : IDF) {
+    auto DBA = Func.Addr->findBlock(DB, *this);
+    const auto &Rs = RefM[DBA.Id];
+    Refs.insert(Rs.begin(), Rs.end());
+  }
+
+  for (auto DB : IDF) {
+    auto DBA = Func.Addr->findBlock(DB, *this);
+    PhiM[DBA.Id].insert(Defs.begin(), Defs.end());
+  }
+}
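+
+// For example, in a diamond-shaped CFG where B0 branches to B1 and B2, and
+// both fall through to B3, the dominance frontier of B1 and of B2 is {B3}.
+// A def of R0 in B1 therefore inserts R0 into PhiM[B3], and buildPhis (below)
+// will create in B3 a phi with a def of R0 and one phi use per predecessor.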
+
+// Given the locations of phi nodes in the map PhiM, create the phi nodes
+// that are located in the block node BA.
+void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+      NodeAddr<BlockNode*> BA) {
+  // Check if this block has any DF defs, i.e. if there are any defs
+  // that this block is in the iterated dominance frontier of.
+  auto HasDF = PhiM.find(BA.Id);
+  if (HasDF == PhiM.end() || HasDF->second.empty())
+    return;
+
+  // First, remove all R in Refs such that there exists T in Refs
+  // such that T covers R. In other words, only leave those refs that
+  // are not covered by another ref (i.e. maximal with respect to covering).
+
+  auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
+    for (auto I : RRs)
+      if (I != RR && RAI.covers(I, RR))
+        RR = I;
+    return RR;
+  };
+
+  RegisterSet MaxDF;
+  for (auto I : HasDF->second)
+    MaxDF.insert(MaxCoverIn(I, HasDF->second));
+
+  std::vector<RegisterRef> MaxRefs;
+  auto &RefB = RefM[BA.Id];
+  for (auto I : MaxDF)
+    MaxRefs.push_back(MaxCoverIn(I, RefB));
+
+  // Now, for each R in MaxRefs, get the alias closure of R. If the closure
+  // only has R in it, create a phi with a single def for R. Otherwise,
+  // create a phi and add a def for each S in the closure.
+
+  // Sort the refs so that the phis will be created in a deterministic order.
+  std::sort(MaxRefs.begin(), MaxRefs.end());
+  // Remove duplicates.
+  auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
+  MaxRefs.erase(NewEnd, MaxRefs.end());
+
+  auto Aliased = [this,&MaxRefs](RegisterRef RR,
+                                 std::vector<unsigned> &Closure) -> bool {
+    for (auto I : Closure)
+      if (RAI.alias(RR, MaxRefs[I]))
+        return true;
+    return false;
+  };
+
+  // Prepare a list of NodeIds of the block's predecessors.
+  std::vector<NodeId> PredList;
+  const MachineBasicBlock *MBB = BA.Addr->getCode();
+  for (auto PB : MBB->predecessors()) {
+    auto B = Func.Addr->findBlock(PB, *this);
+    PredList.push_back(B.Id);
+  }
+
+  while (!MaxRefs.empty()) {
+    // Put the first element in the closure, and then add all subsequent
+    // elements from MaxRefs to it, if they alias at least one element
+    // already in the closure.
+    // ClosureIdx: vector of indices in MaxRefs of members of the closure.
+    std::vector<unsigned> ClosureIdx = { 0 };
+    for (unsigned i = 1; i != MaxRefs.size(); ++i)
+      if (Aliased(MaxRefs[i], ClosureIdx))
+        ClosureIdx.push_back(i);
+
+    // Build a phi for the closure.
+    unsigned CS = ClosureIdx.size();
+    NodeAddr<PhiNode*> PA = newPhi(BA);
+
+    // Add defs.
+    for (unsigned X = 0; X != CS; ++X) {
+      RegisterRef RR = MaxRefs[ClosureIdx[X]];
+      uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+      NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+      PA.Addr->addMember(DA, *this);
+    }
+    // Add phi uses.
+    for (auto P : PredList) {
+      auto PBA = addr<BlockNode*>(P);
+      for (unsigned X = 0; X != CS; ++X) {
+        RegisterRef RR = MaxRefs[ClosureIdx[X]];
+        NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+        PA.Addr->addMember(PUA, *this);
+      }
+    }
+
+    // Erase from MaxRefs all elements in the closure.
+    auto Begin = MaxRefs.begin();
+    for (unsigned i = ClosureIdx.size(); i != 0; --i)
+      MaxRefs.erase(Begin + ClosureIdx[i-1]);
+  }
+}
+
+// Remove any unneeded phi nodes that were created during the build process.
+void DataFlowGraph::removeUnusedPhis() {
+  // This will remove unused phis, i.e. phis where each def does not reach
+  // any uses or other defs. This will not detect or remove circular phi
+  // chains that are otherwise dead. Unused/dead phis are created during
+  // the build process and this function is intended to remove those cases
+  // that are easily determined to be unnecessary.
+
+  SetVector<NodeId> PhiQ;
+  for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) {
+    for (auto P : BA.Addr->members_if(IsPhi, *this))
+      PhiQ.insert(P.Id);
+  }
+
+  static auto HasUsedDef = [](NodeList &Ms) -> bool {
+    for (auto M : Ms) {
+      if (M.Addr->getKind() != NodeAttrs::Def)
+        continue;
+      NodeAddr<DefNode*> DA = M;
+      if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0)
+        return true;
+    }
+    return false;
+  };
+
+  // Any phi, if it is removed, may affect other phis (make them dead).
+ // For each removed phi, collect the potentially affected phis and add + // them back to the queue. + while (!PhiQ.empty()) { + auto PA = addr<PhiNode*>(PhiQ[0]); + PhiQ.remove(PA.Id); + NodeList Refs = PA.Addr->members(*this); + if (HasUsedDef(Refs)) + continue; + for (NodeAddr<RefNode*> RA : Refs) { + if (NodeId RD = RA.Addr->getReachingDef()) { + auto RDA = addr<DefNode*>(RD); + NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this); + if (IsPhi(OA)) + PhiQ.insert(OA.Id); + } + if (RA.Addr->isDef()) + unlinkDef(RA); + else + unlinkUse(RA); + } + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this); + BA.Addr->removeMember(PA, *this); + } +} + +// For a given reference node TA in an instruction node IA, connect the +// reaching def of TA to the appropriate def node. Create any shadow nodes +// as appropriate. +template <typename T> +void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA, + DefStack &DS) { + if (DS.empty()) + return; + RegisterRef RR = TA.Addr->getRegRef(); + NodeAddr<T> TAP; + + // References from the def stack that have been examined so far. + RegisterSet Defs; + + for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) { + RegisterRef QR = I->Addr->getRegRef(); + auto AliasQR = [QR,this] (RegisterRef RR) -> bool { + return RAI.alias(QR, RR); + }; + bool PrecUp = RAI.covers(QR, RR); + // Skip all defs that are aliased to any of the defs that we have already + // seen. If we encounter a covering def, stop the stack traversal early. + if (std::any_of(Defs.begin(), Defs.end(), AliasQR)) { + if (PrecUp) + break; + continue; + } + // The reaching def. + NodeAddr<DefNode*> RDA = *I; + + // Pick the reached node. + if (TAP.Id == 0) { + TAP = TA; + } else { + // Mark the existing ref as "shadow" and create a new shadow. + TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow); + TAP = getNextShadow(IA, TAP, true); + } + + // Create the link. + TAP.Addr->linkToDef(TAP.Id, RDA); + + if (PrecUp) + break; + Defs.insert(QR); + } +} + +// Create data-flow links for all reference nodes in the statement node SA. +void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) { + RegisterSet Defs; + + // Link all nodes (upwards in the data-flow) with their reaching defs. + for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) { + uint16_t Kind = RA.Addr->getKind(); + assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use); + RegisterRef RR = RA.Addr->getRegRef(); + // Do not process multiple defs of the same reference. + if (Kind == NodeAttrs::Def && Defs.count(RR)) + continue; + Defs.insert(RR); + + auto F = DefM.find(RR); + if (F == DefM.end()) + continue; + DefStack &DS = F->second; + if (Kind == NodeAttrs::Use) + linkRefUp<UseNode*>(SA, RA, DS); + else if (Kind == NodeAttrs::Def) + linkRefUp<DefNode*>(SA, RA, DS); + else + llvm_unreachable("Unexpected node in instruction"); + } +} + +// Create data-flow links for all instructions in the block node BA. This +// will include updating any phi nodes in BA. +void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { + // Push block delimiters. + markBlock(BA.Id, DefM); + + // For each non-phi instruction in the block, link all the defs and uses + // to their reaching defs. For any member of the block (including phis), + // push the defs on the corresponding stacks. + for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) { + // Ignore phi nodes here. They will be linked part by part from the + // predecessors. 
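+    // (A phi use for predecessor P is linked while the traversal is still
+    // inside P, i.e. in the successor loop below, when P's definitions are
+    // still on the stacks in DefM.)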
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) + linkStmtRefs(DefM, IA); + + // Push the definitions on the stack. + pushDefs(IA, DefM); + } + + // Recursively process all children in the dominator tree. + MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode()); + for (auto I : *N) { + MachineBasicBlock *SB = I->getBlock(); + auto SBA = Func.Addr->findBlock(SB, *this); + linkBlockRefs(DefM, SBA); + } + + // Link the phi uses from the successor blocks. + auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool { + if (NA.Addr->getKind() != NodeAttrs::Use) + return false; + assert(NA.Addr->getFlags() & NodeAttrs::PhiRef); + NodeAddr<PhiUseNode*> PUA = NA; + return PUA.Addr->getPredecessor() == BA.Id; + }; + MachineBasicBlock *MBB = BA.Addr->getCode(); + for (auto SB : MBB->successors()) { + auto SBA = Func.Addr->findBlock(SB, *this); + for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) { + // Go over each phi use associated with MBB, and link it. + for (auto U : IA.Addr->members_if(IsUseForBA, *this)) { + NodeAddr<PhiUseNode*> PUA = U; + RegisterRef RR = PUA.Addr->getRegRef(); + linkRefUp<UseNode*>(IA, PUA, DefM[RR]); + } + } + } + + // Pop all defs from this block from the definition stacks. + releaseBlock(BA.Id, DefM); +} + +// Remove the use node UA from any data-flow and structural links. +void DataFlowGraph::unlinkUse(NodeAddr<UseNode*> UA) { + NodeId RD = UA.Addr->getReachingDef(); + NodeId Sib = UA.Addr->getSibling(); + + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(*this); + IA.Addr->removeMember(UA, *this); + + if (RD == 0) { + assert(Sib == 0); + return; + } + + auto RDA = addr<DefNode*>(RD); + auto TA = addr<UseNode*>(RDA.Addr->getReachedUse()); + if (TA.Id == UA.Id) { + RDA.Addr->setReachedUse(Sib); + return; + } + + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == UA.Id) { + TA.Addr->setSibling(UA.Addr->getSibling()); + return; + } + TA = addr<UseNode*>(S); + } +} + +// Remove the def node DA from any data-flow and structural links. +void DataFlowGraph::unlinkDef(NodeAddr<DefNode*> DA) { + // + // RD + // | reached + // | def + // : + // . + // +----+ + // ... -- | DA | -- ... -- 0 : sibling chain of DA + // +----+ + // | | reached + // | : def + // | . + // | ... : Siblings (defs) + // | + // : reached + // . use + // ... : sibling chain of reached uses + + NodeId RD = DA.Addr->getReachingDef(); + + // Visit all siblings of the reached def and reset their reaching defs. + // Also, defs reached by DA are now "promoted" to being reached by RD, + // so all of them will need to be spliced into the sibling chain where + // DA belongs. + auto getAllNodes = [this] (NodeId N) -> NodeList { + NodeList Res; + while (N) { + auto RA = addr<RefNode*>(N); + // Keep the nodes in the exact sibling order. + Res.push_back(RA); + N = RA.Addr->getSibling(); + } + return Res; + }; + NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef()); + NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse()); + + if (RD == 0) { + for (NodeAddr<RefNode*> I : ReachedDefs) + I.Addr->setSibling(0); + for (NodeAddr<RefNode*> I : ReachedUses) + I.Addr->setSibling(0); + } + for (NodeAddr<DefNode*> I : ReachedDefs) + I.Addr->setReachingDef(RD); + for (NodeAddr<UseNode*> I : ReachedUses) + I.Addr->setReachingDef(RD); + + NodeId Sib = DA.Addr->getSibling(); + if (RD == 0) { + assert(Sib == 0); + return; + } + + // Update the reaching def node and remove DA from the sibling list. 
+ auto RDA = addr<DefNode*>(RD); + auto TA = addr<DefNode*>(RDA.Addr->getReachedDef()); + if (TA.Id == DA.Id) { + // If DA is the first reached def, just update the RD's reached def + // to the DA's sibling. + RDA.Addr->setReachedDef(Sib); + } else { + // Otherwise, traverse the sibling list of the reached defs and remove + // DA from it. + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == DA.Id) { + TA.Addr->setSibling(Sib); + break; + } + TA = addr<DefNode*>(S); + } + } + + // Splice the DA's reached defs into the RDA's reached def chain. + if (!ReachedDefs.empty()) { + auto Last = NodeAddr<DefNode*>(ReachedDefs.back()); + Last.Addr->setSibling(RDA.Addr->getReachedDef()); + RDA.Addr->setReachedDef(ReachedDefs.front().Id); + } + // Splice the DA's reached uses into the RDA's reached use chain. + if (!ReachedUses.empty()) { + auto Last = NodeAddr<UseNode*>(ReachedUses.back()); + Last.Addr->setSibling(RDA.Addr->getReachedUse()); + RDA.Addr->setReachedUse(ReachedUses.front().Id); + } + + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(*this); + IA.Addr->removeMember(DA, *this); +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h new file mode 100644 index 0000000..7da7bb5 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h @@ -0,0 +1,841 @@ +//===--- RDFGraph.h -------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF) +// for a non-SSA program representation (e.g. post-RA machine code). +// +// +// *** Introduction +// +// The RDF graph is a collection of nodes, each of which denotes some element +// of the program. There are two main types of such elements: code and refe- +// rences. Conceptually, "code" is something that represents the structure +// of the program, e.g. basic block or a statement, while "reference" is an +// instance of accessing a register, e.g. a definition or a use. Nodes are +// connected with each other based on the structure of the program (such as +// blocks, instructions, etc.), and based on the data flow (e.g. reaching +// definitions, reached uses, etc.). The single-reaching-definition principle +// of SSA is generally observed, although, due to the non-SSA representation +// of the program, there are some differences between the graph and a "pure" +// SSA representation. +// +// +// *** Implementation remarks +// +// Since the graph can contain a large number of nodes, memory consumption +// was one of the major design considerations. As a result, there is a single +// base class NodeBase which defines all members used by all possible derived +// classes. The members are arranged in a union, and a derived class cannot +// add any data members of its own. Each derived class only defines the +// functional interface, i.e. member functions. NodeBase must be a POD, +// which implies that all of its members must also be PODs. +// Since nodes need to be connected with other nodes, pointers have been +// replaced with 32-bit identifiers: each node has an id of type NodeId. +// There are mapping functions in the graph that translate between actual +// memory addresses and the corresponding identifiers. +// A node id of 0 is equivalent to nullptr. 
+// +// +// *** Structure of the graph +// +// A code node is always a collection of other nodes. For example, a code +// node corresponding to a basic block will contain code nodes corresponding +// to instructions. In turn, a code node corresponding to an instruction will +// contain a list of reference nodes that correspond to the definitions and +// uses of registers in that instruction. The members are arranged into a +// circular list, which is yet another consequence of the effort to save +// memory: for each member node it should be possible to obtain its owner, +// and it should be possible to access all other members. There are other +// ways to accomplish that, but the circular list seemed the most natural. +// +// +- CodeNode -+ +// | | <---------------------------------------------------+ +// +-+--------+-+ | +// |FirstM |LastM | +// | +-------------------------------------+ | +// | | | +// V V | +// +----------+ Next +----------+ Next Next +----------+ Next | +// | |----->| |-----> ... ----->| |----->-+ +// +- Member -+ +- Member -+ +- Member -+ +// +// The order of members is such that related reference nodes (see below) +// should be contiguous on the member list. +// +// A reference node is a node that encapsulates an access to a register, +// in other words, data flowing into or out of a register. There are two +// major kinds of reference nodes: defs and uses. A def node will contain +// the id of the first reached use, and the id of the first reached def. +// Each def and use will contain the id of the reaching def, and also the +// id of the next reached def (for def nodes) or use (for use nodes). +// The "next node sharing the same reaching def" is denoted as "sibling". +// In summary: +// - Def node contains: reaching def, sibling, first reached def, and first +// reached use. +// - Use node contains: reaching def and sibling. +// +// +-- DefNode --+ +// | R2 = ... | <---+--------------------+ +// ++---------+--+ | | +// |Reached |Reached | | +// |Def |Use | | +// | | |Reaching |Reaching +// | V |Def |Def +// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib +// | | ... = R2 |----->| ... = R2 |----> ... ----> 0 +// | +-------------+ +-------------+ +// V +// +-- DefNode --+ Sib +// | R2 = ... |----> ... +// ++---------+--+ +// | | +// | | +// ... ... +// +// To get a full picture, the circular lists connecting blocks within a +// function, instructions within a block, etc. should be superimposed with +// the def-def, def-use links shown above. +// To illustrate this, consider a small example in a pseudo-assembly: +// foo: +// add r2, r0, r1 ; r2 = r0+r1 +// addi r0, r2, 1 ; r0 = r2+1 +// ret r0 ; return value in r0 +// +// The graph (in a format used by the debugging functions) would look like: +// +// DFG dump:[ +// f1: Function foo +// b2: === BB#0 === preds(0), succs(0): +// p3: phi [d4<r0>(,d12,u9):] +// p5: phi [d6<r1>(,,u10):] +// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):] +// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):] +// s14: ret [u15<r0>(d12):] +// ] +// +// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the +// kind of the node (i.e. f - function, b - basic block, p - phi, s - state- +// ment, d - def, u - use). +// The format of a def node is: +// dN<R>(rd,d,u):sib, +// where +// N - numeric node id, +// R - register being defined +// rd - reaching def, +// d - reached def, +// u - reached use, +// sib - sibling. 
+// The format of a use node is:
+//   uN<R>[!](rd):sib,
+// where
+//   N   - numeric node id,
+//   R   - register being used,
+//   rd  - reaching def,
+//   sib - sibling.
+// Possible annotations (usually preceding the node id):
+//   +   - preserving def,
+//   ~   - clobbering def,
+//   "   - shadow ref (follows the node id),
+//   !   - fixed register (appears after register name).
+//
+// The circular lists are not explicit in the dump.
+//
+//
+// *** Node attributes
+//
+// NodeBase has a member "Attrs", which is the primary way of determining
+// the node's characteristics. The fields in this member decide whether
+// the node is a code node or a reference node (i.e. node's "type"), then
+// within each type, the "kind" determines what specifically this node
+// represents. The remaining bits, "flags", contain additional information
+// that is even more detailed than the "kind".
+// CodeNode's kinds are:
+// - Phi:   Phi node, members are reference nodes.
+// - Stmt:  Statement, members are reference nodes.
+// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
+// - Func:  The whole function. The members are basic block nodes.
+// RefNode's kinds are:
+// - Use.
+// - Def.
+//
+// Meaning of flags:
+// - Preserving: applies only to defs. A preserving def is one that can
+//   preserve some of the original bits among those that are included in
+//   the register associated with that def. For example, if R0 is a 32-bit
+//   register, but a def can only change the lower 16 bits, then it will
+//   be marked as preserving.
+// - Shadow: a reference that has duplicates holding additional reaching
+//   defs (see more below).
+// - Clobbering: applies only to defs, indicates that the value generated
+//   by this def is unspecified. A typical example would be volatile
+//   registers after function calls.
+//
+//
+// *** Shadow references
+//
+// It may happen that a super-register can have two (or more) non-overlapping
+// sub-registers. When both of these sub-registers are defined and followed
+// by a use of the super-register, the use of the super-register will not
+// have a unique reaching def: both defs of the sub-registers need to be
+// accounted for. In such cases, a duplicate use of the super-register is
+// added and it points to the extra reaching def. Both uses are marked with
+// a flag "shadow". Example:
+// Assume t0 is a super-register of r0 and r1, and that r0 and r1 do not
+// overlap:
+//   set r0, 1        ; r0 = 1
+//   set r1, 1        ; r1 = 1
+//   addi t1, t0, 1   ; t1 = t0+1
+//
+// The DFG:
+//   s1: set [d2<r0>(,,u9):]
+//   s3: set [d4<r1>(,,u10):]
+//   s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
+//
+// The statement s5 has two use nodes for t0: u7" and u8". The quotation
+// mark " indicates that the node is a shadow.
+// +#ifndef RDF_GRAPH_H +#define RDF_GRAPH_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" + +#include <functional> +#include <map> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineInstr; + class MachineOperand; + class MachineDominanceFrontier; + class MachineDominatorTree; + class TargetInstrInfo; + class TargetRegisterInfo; +} + +namespace rdf { + typedef uint32_t NodeId; + + struct NodeAttrs { + enum : uint16_t { + None = 0x0000, // Nothing + + // Types: 2 bits + TypeMask = 0x0003, + Code = 0x0001, // 01, Container + Ref = 0x0002, // 10, Reference + + // Kind: 3 bits + KindMask = 0x0007 << 2, + Def = 0x0001 << 2, // 001 + Use = 0x0002 << 2, // 010 + Phi = 0x0003 << 2, // 011 + Stmt = 0x0004 << 2, // 100 + Block = 0x0005 << 2, // 101 + Func = 0x0006 << 2, // 110 + + // Flags: 5 bits for now + FlagMask = 0x001F << 5, + Shadow = 0x0001 << 5, // 00001, Has extra reaching defs. + Clobbering = 0x0002 << 5, // 00010, Produces unspecified values. + PhiRef = 0x0004 << 5, // 00100, Member of PhiNode. + Preserving = 0x0008 << 5, // 01000, Def can keep original bits. + Fixed = 0x0010 << 5, // 10000, Fixed register. + }; + + static uint16_t type(uint16_t T) { return T & TypeMask; } + static uint16_t kind(uint16_t T) { return T & KindMask; } + static uint16_t flags(uint16_t T) { return T & FlagMask; } + + static uint16_t set_type(uint16_t A, uint16_t T) { + return (A & ~TypeMask) | T; + } + static uint16_t set_kind(uint16_t A, uint16_t K) { + return (A & ~KindMask) | K; + } + static uint16_t set_flags(uint16_t A, uint16_t F) { + return (A & ~FlagMask) | F; + } + + // Test if A contains B. + static bool contains(uint16_t A, uint16_t B) { + if (type(A) != Code) + return false; + uint16_t KB = kind(B); + switch (kind(A)) { + case Func: + return KB == Block; + case Block: + return KB == Phi || KB == Stmt; + case Phi: + case Stmt: + return type(B) == Ref; + } + return false; + } + }; + + template <typename T> struct NodeAddr { + NodeAddr() : Addr(nullptr), Id(0) {} + NodeAddr(T A, NodeId I) : Addr(A), Id(I) {} + NodeAddr(const NodeAddr&) = default; + NodeAddr &operator= (const NodeAddr&) = default; + + bool operator== (const NodeAddr<T> &NA) const { + assert((Addr == NA.Addr) == (Id == NA.Id)); + return Addr == NA.Addr; + } + bool operator!= (const NodeAddr<T> &NA) const { + return !operator==(NA); + } + // Type cast (casting constructor). The reason for having this class + // instead of std::pair. + template <typename S> NodeAddr(const NodeAddr<S> &NA) + : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {} + + T Addr; + NodeId Id; + }; + + struct NodeBase; + + // Fast memory allocation and translation between node id and node address. + // This is really the same idea as the one underlying the "bump pointer + // allocator", the difference being in the translation. A node id is + // composed of two components: the index of the block in which it was + // allocated, and the index within the block. With the default settings, + // where the number of nodes per block is 4096, the node id (minus 1) is: + // + // bit position: 11 0 + // +----------------------------+--------------+ + // | Index of the block |Index in block| + // +----------------------------+--------------+ + // + // The actual node id is the above plus 1, to avoid creating a node id of 0. 
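+  // For example, with the default 4096 nodes per block, BitsPerIndex is 12,
+  // so the node at index 1 of block 1 gets id ((1 << 12) | 1) + 1 = 4098,
+  // and ptr(4098) reverses this: 4097 >> 12 = 1 (block), 4097 & 0xFFF = 1
+  // (index).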
+ // + // This method significantly improved the build time, compared to using maps + // (std::unordered_map or DenseMap) to translate between pointers and ids. + struct NodeAllocator { + // Amount of storage for a single node. + enum { NodeMemSize = 32 }; + NodeAllocator(uint32_t NPB = 4096) + : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)), + IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) { + assert(isPowerOf2_32(NPB)); + } + NodeBase *ptr(NodeId N) const { + uint32_t N1 = N-1; + uint32_t BlockN = N1 >> BitsPerIndex; + uint32_t Offset = (N1 & IndexMask) * NodeMemSize; + return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset); + } + NodeId id(const NodeBase *P) const; + NodeAddr<NodeBase*> New(); + void clear(); + + private: + void startNewBlock(); + bool needNewBlock(); + uint32_t makeId(uint32_t Block, uint32_t Index) const { + // Add 1 to the id, to avoid the id of 0, which is treated as "null". + return ((Block << BitsPerIndex) | Index) + 1; + } + + const uint32_t NodesPerBlock; + const uint32_t BitsPerIndex; + const uint32_t IndexMask; + char *ActiveEnd; + std::vector<char*> Blocks; + typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy; + AllocatorTy MemPool; + }; + + struct RegisterRef { + unsigned Reg, Sub; + + // No non-trivial constructors, since this will be a member of a union. + RegisterRef() = default; + RegisterRef(const RegisterRef &RR) = default; + RegisterRef &operator= (const RegisterRef &RR) = default; + bool operator== (const RegisterRef &RR) const { + return Reg == RR.Reg && Sub == RR.Sub; + } + bool operator!= (const RegisterRef &RR) const { + return !operator==(RR); + } + bool operator< (const RegisterRef &RR) const { + return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub); + } + }; + typedef std::set<RegisterRef> RegisterSet; + + struct RegisterAliasInfo { + RegisterAliasInfo(const TargetRegisterInfo &tri) : TRI(tri) {} + virtual ~RegisterAliasInfo() {} + + virtual std::vector<RegisterRef> getAliasSet(RegisterRef RR) const; + virtual bool alias(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(const RegisterSet &RRs, RegisterRef RR) const; + + const TargetRegisterInfo &TRI; + }; + + struct TargetOperandInfo { + TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {} + virtual ~TargetOperandInfo() {} + virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const; + virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const; + virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const; + + const TargetInstrInfo &TII; + }; + + + struct DataFlowGraph; + + struct NodeBase { + public: + // Make sure this is a POD. + NodeBase() = default; + uint16_t getType() const { return NodeAttrs::type(Attrs); } + uint16_t getKind() const { return NodeAttrs::kind(Attrs); } + uint16_t getFlags() const { return NodeAttrs::flags(Attrs); } + NodeId getNext() const { return Next; } + + uint16_t getAttrs() const { return Attrs; } + void setAttrs(uint16_t A) { Attrs = A; } + void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); } + + // Insert node NA after "this" in the circular chain. + void append(NodeAddr<NodeBase*> NA); + // Initialize all members to 0. + void init() { memset(this, 0, sizeof *this); } + void setNext(NodeId N) { Next = N; } + + protected: + uint16_t Attrs; + uint16_t Reserved; + NodeId Next; // Id of the next node in the circular chain. + // Definitions of nested types. 
Using anonymous nested structs would make + // this class definition clearer, but unnamed structs are not a part of + // the standard. + struct Def_struct { + NodeId DD, DU; // Ids of the first reached def and use. + }; + struct PhiU_struct { + NodeId PredB; // Id of the predecessor block for a phi use. + }; + struct Code_struct { + void *CP; // Pointer to the actual code. + NodeId FirstM, LastM; // Id of the first member and last. + }; + struct Ref_struct { + NodeId RD, Sib; // Ids of the reaching def and the sibling. + union { + Def_struct Def; + PhiU_struct PhiU; + }; + union { + MachineOperand *Op; // Non-phi refs point to a machine operand. + RegisterRef RR; // Phi refs store register info directly. + }; + }; + + // The actual payload. + union { + Ref_struct Ref; + Code_struct Code; + }; + }; + // The allocator allocates chunks of 32 bytes for each node. The fact that + // each node takes 32 bytes in memory is used for fast translation between + // the node id and the node address. + static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize, + "NodeBase must be at most NodeAllocator::NodeMemSize bytes"); + + typedef std::vector<NodeAddr<NodeBase*>> NodeList; + typedef std::set<NodeId> NodeSet; + + struct RefNode : public NodeBase { + RefNode() = default; + RegisterRef getRegRef() const; + MachineOperand &getOp() { + assert(!(getFlags() & NodeAttrs::PhiRef)); + return *Ref.Op; + } + void setRegRef(RegisterRef RR); + void setRegRef(MachineOperand *Op); + NodeId getReachingDef() const { + return Ref.RD; + } + void setReachingDef(NodeId RD) { + Ref.RD = RD; + } + NodeId getSibling() const { + return Ref.Sib; + } + void setSibling(NodeId Sib) { + Ref.Sib = Sib; + } + bool isUse() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Use; + } + bool isDef() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Def; + } + + template <typename Predicate> + NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly, + const DataFlowGraph &G); + NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G); + }; + + struct DefNode : public RefNode { + NodeId getReachedDef() const { + return Ref.Def.DD; + } + void setReachedDef(NodeId D) { + Ref.Def.DD = D; + } + NodeId getReachedUse() const { + return Ref.Def.DU; + } + void setReachedUse(NodeId U) { + Ref.Def.DU = U; + } + + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct UseNode : public RefNode { + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct PhiUseNode : public UseNode { + NodeId getPredecessor() const { + assert(getFlags() & NodeAttrs::PhiRef); + return Ref.PhiU.PredB; + } + void setPredecessor(NodeId B) { + assert(getFlags() & NodeAttrs::PhiRef); + Ref.PhiU.PredB = B; + } + }; + + struct CodeNode : public NodeBase { + template <typename T> T getCode() const { + return static_cast<T>(Code.CP); + } + void setCode(void *C) { + Code.CP = C; + } + + NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const; + NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const; + void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G); + void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + + NodeList members(const DataFlowGraph &G) const; + template <typename Predicate> + NodeList members_if(Predicate P, const DataFlowGraph &G) const; + }; + + struct InstrNode : public CodeNode { + NodeAddr<NodeBase*> getOwner(const 
DataFlowGraph &G); + }; + + struct PhiNode : public InstrNode { + MachineInstr *getCode() const { + return nullptr; + } + }; + + struct StmtNode : public InstrNode { + MachineInstr *getCode() const { + return CodeNode::getCode<MachineInstr*>(); + } + }; + + struct BlockNode : public CodeNode { + MachineBasicBlock *getCode() const { + return CodeNode::getCode<MachineBasicBlock*>(); + } + void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G); + }; + + struct FuncNode : public CodeNode { + MachineFunction *getCode() const { + return CodeNode::getCode<MachineFunction*>(); + } + NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const; + NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G); + }; + + struct DataFlowGraph { + DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi); + + NodeBase *ptr(NodeId N) const; + template <typename T> T ptr(NodeId N) const { + return static_cast<T>(ptr(N)); + } + NodeId id(const NodeBase *P) const; + + template <typename T> NodeAddr<T> addr(NodeId N) const { + return { ptr<T>(N), N }; + } + + NodeAddr<FuncNode*> getFunc() const { + return Func; + } + MachineFunction &getMF() const { + return MF; + } + const TargetInstrInfo &getTII() const { + return TII; + } + const TargetRegisterInfo &getTRI() const { + return TRI; + } + const MachineDominatorTree &getDT() const { + return MDT; + } + const MachineDominanceFrontier &getDF() const { + return MDF; + } + const RegisterAliasInfo &getRAI() const { + return RAI; + } + + struct DefStack { + DefStack() = default; + bool empty() const { return Stack.empty() || top() == bottom(); } + private: + typedef NodeAddr<DefNode*> value_type; + struct Iterator { + typedef DefStack::value_type value_type; + Iterator &up() { Pos = DS.nextUp(Pos); return *this; } + Iterator &down() { Pos = DS.nextDown(Pos); return *this; } + value_type operator*() const { + assert(Pos >= 1); + return DS.Stack[Pos-1]; + } + const value_type *operator->() const { + assert(Pos >= 1); + return &DS.Stack[Pos-1]; + } + bool operator==(const Iterator &It) const { return Pos == It.Pos; } + bool operator!=(const Iterator &It) const { return Pos != It.Pos; } + private: + Iterator(const DefStack &S, bool Top); + // Pos-1 is the index in the StorageType object that corresponds to + // the top of the DefStack. 
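+        // Pos == 0 is the "bottom" sentinel; it never designates an element.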
+ const DefStack &DS; + unsigned Pos; + friend struct DefStack; + }; + public: + typedef Iterator iterator; + iterator top() const { return Iterator(*this, true); } + iterator bottom() const { return Iterator(*this, false); } + unsigned size() const; + + void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); } + void pop(); + void start_block(NodeId N); + void clear_block(NodeId N); + private: + friend struct Iterator; + typedef std::vector<value_type> StorageType; + bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const { + return (P.Addr == nullptr) && (N == 0 || P.Id == N); + } + unsigned nextUp(unsigned P) const; + unsigned nextDown(unsigned P) const; + StorageType Stack; + }; + + typedef std::map<RegisterRef,DefStack> DefStackMap; + + void build(); + void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM); + void markBlock(NodeId B, DefStackMap &DefM); + void releaseBlock(NodeId B, DefStackMap &DefM); + + NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + NodeList getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + void unlinkUse(NodeAddr<UseNode*> UA); + void unlinkDef(NodeAddr<DefNode*> DA); + + // Some useful filters. + template <uint16_t Kind> + static bool IsRef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == Kind; + } + template <uint16_t Kind> + static bool IsCode(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == Kind; + } + static bool IsDef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Def; + } + static bool IsUse(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Use; + } + static bool IsPhi(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == NodeAttrs::Phi; + } + + private: + void reset(); + + NodeAddr<NodeBase*> newNode(uint16_t Attrs); + NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B); + NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, + uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner); + NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI); + NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB); + NodeAddr<FuncNode*> newFunc(MachineFunction *MF); + + template <typename Predicate> + std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> + locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + Predicate P) const; + + typedef std::map<NodeId,RegisterSet> BlockRefsMap; + + void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In); + 
void buildBlockRefs(NodeAddr<BlockNode*> BA, BlockRefsMap &RefM); + void recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void removeUnusedPhis(); + + template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA, + NodeAddr<T> TA, DefStack &DS); + void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA); + void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA); + + TimerGroup TimeG; + NodeAddr<FuncNode*> Func; + NodeAllocator Memory; + + MachineFunction &MF; + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + const TargetOperandInfo &TOI; + }; // struct DataFlowGraph + + template <typename Predicate> + NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P, + bool NextOnly, const DataFlowGraph &G) { + // Get the "Next" reference in the circular list that references RR and + // satisfies predicate "Pred". + auto NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Ref) { + NodeAddr<RefNode*> RA = NA; + if (RA.Addr->getRegRef() == RR && P(NA)) + return NA; + if (NextOnly) + break; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } else { + // We've hit the beginning of the chain. + assert(NA.Addr->getType() == NodeAttrs::Code); + NodeAddr<CodeNode*> CA = NA; + NA = CA.Addr->getFirstMember(G); + } + } + // Return the equivalent of "nullptr" if such a node was not found. + return NodeAddr<RefNode*>(); + } + + template <typename Predicate> + NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const { + NodeList MM; + auto M = getFirstMember(G); + if (M.Id == 0) + return MM; + + while (M.Addr != this) { + if (P(M)) + MM.push_back(M); + M = G.addr<NodeBase*>(M.Addr->getNext()); + } + return MM; + } + + + template <typename T> struct Print; + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const Print<T> &P); + + template <typename T> + struct Print { + Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {} + const T &Obj; + const DataFlowGraph &G; + }; + + template <typename T> + struct PrintNode : Print<NodeAddr<T>> { + PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g) + : Print<NodeAddr<T>>(x, g) {} + }; +} // namespace rdf + +#endif // RDF_GRAPH_H diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp new file mode 100644 index 0000000..1d9bd37 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp @@ -0,0 +1,848 @@ +//===--- RDFLiveness.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Computation of the liveness information from the data-flow graph. +// +// The main functionality of this code is to compute block live-in +// information. With the live-in information in place, the placement +// of kill flags can also be recalculated. +// +// The block live-in calculation is based on the ideas from the following +// publication: +// +// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin. +// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs." 
+// ACM Transactions on Architecture and Code Optimization, Association for +// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance +// and Embedded Architectures and Compilers", 8 (4), +// <10.1145/2086696.2086706>. <hal-00647369> +// +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace rdf { + template<> + raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) { + OS << '{'; + for (auto I : P.Obj) { + OS << ' ' << Print<RegisterRef>(I.first, P.G) << '{'; + for (auto J = I.second.begin(), E = I.second.end(); J != E; ) { + OS << Print<NodeId>(*J, P.G); + if (++J != E) + OS << ','; + } + OS << '}'; + } + OS << " }"; + return OS; + } +} + +// The order in the returned sequence is the order of reaching defs in the +// upward traversal: the first def is the closest to the given reference RefA, +// the next one is further up, and so on. +// The list ends at a reaching phi def, or when the reference from RefA is +// covered by the defs in the list (see FullChain). +// This function provides two modes of operation: +// (1) Returning the sequence of reaching defs for a particular reference +// node. This sequence will terminate at the first phi node [1]. +// (2) Returning a partial sequence of reaching defs, where the final goal +// is to traverse past phi nodes to the actual defs arising from the code +// itself. +// In mode (2), the register reference for which the search was started +// may be different from the reference node RefA, for which this call was +// made, hence the argument RefRR, which holds the original register. +// Also, some definitions may have already been encountered in a previous +// call that will influence register covering. The register references +// already defined are passed in through DefRRs. +// In mode (1), the "continuation" considerations do not apply, and the +// RefRR is the same as the register in RefA, and the set DefRRs is empty. +// +// [1] It is possible for multiple phi nodes to be included in the returned +// sequence: +// SubA = phi ... +// SubB = phi ... +// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB) +// However, these phi nodes are independent from one another in terms of +// the data-flow. + +NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, + NodeAddr<RefNode*> RefA, bool FullChain, const RegisterSet &DefRRs) { + SetVector<NodeId> DefQ; + SetVector<NodeId> Owners; + + // The initial queue should not have reaching defs for shadows. The + // whole point of a shadow is that it will have a reaching def that + // is not aliased to the reaching defs of the related shadows. + NodeId Start = RefA.Id; + auto SNA = DFG.addr<RefNode*>(Start); + if (NodeId RD = SNA.Addr->getReachingDef()) + DefQ.insert(RD); + + // Collect all the reaching defs, going up until a phi node is encountered, + // or there are no more reaching defs. From this set, the actual set of + // reaching defs will be selected. + // The traversal upwards must go on until a covering def is encountered. + // It is possible that a collection of non-covering (individually) defs + // will be sufficient, but keep going until a covering one is found. 
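+  // Note on the traversal: DefQ is a SetVector, so the index-based loop
+  // below is a worklist traversal. Elements inserted during the iteration
+  // extend the queue, duplicate insertions are ignored, and so each def
+  // is visited at most once.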
+ for (unsigned i = 0; i < DefQ.size(); ++i) { + auto TA = DFG.addr<DefNode*>(DefQ[i]); + if (TA.Addr->getFlags() & NodeAttrs::PhiRef) + continue; + // Stop at the covering/overwriting def of the initial register reference. + RegisterRef RR = TA.Addr->getRegRef(); + if (RAI.covers(RR, RefRR)) { + uint16_t Flags = TA.Addr->getFlags(); + if (!(Flags & NodeAttrs::Preserving)) + continue; + } + // Get the next level of reaching defs. This will include multiple + // reaching defs for shadows. + for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA)) + if (auto RD = NodeAddr<RefNode*>(S).Addr->getReachingDef()) + DefQ.insert(RD); + } + + // Remove all non-phi defs that are not aliased to RefRR, and collect + // the owners of the remaining defs. + SetVector<NodeId> Defs; + for (auto N : DefQ) { + auto TA = DFG.addr<DefNode*>(N); + bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef; + if (!IsPhi && !RAI.alias(RefRR, TA.Addr->getRegRef())) + continue; + Defs.insert(TA.Id); + Owners.insert(TA.Addr->getOwner(DFG).Id); + } + + // Return the MachineBasicBlock containing a given instruction. + auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* { + if (IA.Addr->getKind() == NodeAttrs::Stmt) + return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent(); + assert(IA.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<PhiNode*> PA = IA; + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG); + return BA.Addr->getCode(); + }; + // Less(A,B) iff instruction A is further down in the dominator tree than B. + auto Less = [&Block,this] (NodeId A, NodeId B) -> bool { + if (A == B) + return false; + auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B); + MachineBasicBlock *BA = Block(OA), *BB = Block(OB); + if (BA != BB) + return MDT.dominates(BB, BA); + // They are in the same block. + bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt; + bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt; + if (StmtA) { + if (!StmtB) // OB is a phi and phis dominate statements. + return true; + auto CA = NodeAddr<StmtNode*>(OA).Addr->getCode(); + auto CB = NodeAddr<StmtNode*>(OB).Addr->getCode(); + // The order must be linear, so tie-break such equalities. + if (CA == CB) + return A < B; + return MDT.dominates(CB, CA); + } else { + // OA is a phi. + if (StmtB) + return false; + // Both are phis. There is no ordering between phis (in terms of + // the data-flow), so tie-break this via node id comparison. + return A < B; + } + }; + + std::vector<NodeId> Tmp(Owners.begin(), Owners.end()); + std::sort(Tmp.begin(), Tmp.end(), Less); + + // The vector is a list of instructions, so that defs coming from + // the same instruction don't need to be artificially ordered. + // Then, when computing the initial segment, and iterating over an + // instruction, pick the defs that contribute to the covering (i.e. is + // not covered by previously added defs). Check the defs individually, + // i.e. first check each def if is covered or not (without adding them + // to the tracking set), and then add all the selected ones. + + // The reason for this is this example: + // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes). + // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be + // covered if we added A first, and A would be covered + // if we added B first. 
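+  // The code below implements this: for each instruction, the candidate
+  // defs are first collected in Ds (tested against RRs without modifying
+  // it), and only afterwards are their registers added to RRs.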
+ + NodeList RDefs; + RegisterSet RRs = DefRRs; + + auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getKind() == NodeAttrs::Def && + Defs.count(TA.Id); + }; + for (auto T : Tmp) { + if (!FullChain && RAI.covers(RRs, RefRR)) + break; + auto TA = DFG.addr<InstrNode*>(T); + bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA); + NodeList Ds; + for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) { + auto QR = DA.Addr->getRegRef(); + // Add phi defs even if they are covered by subsequent defs. This is + // for cases where the reached use is not covered by any of the defs + // encountered so far: the phi def is needed to expose the liveness + // of that use to the entry of the block. + // Example: + // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2. + // d2<R3>(d1,,u3), ... + // ..., u3<D1>(d2) This use needs to be live on entry. + if (FullChain || IsPhi || !RAI.covers(RRs, QR)) + Ds.push_back(DA); + } + RDefs.insert(RDefs.end(), Ds.begin(), Ds.end()); + for (NodeAddr<DefNode*> DA : Ds) { + // When collecting a full chain of definitions, do not consider phi + // defs to actually define a register. + uint16_t Flags = DA.Addr->getFlags(); + if (!FullChain || !(Flags & NodeAttrs::PhiRef)) + if (!(Flags & NodeAttrs::Preserving)) + RRs.insert(DA.Addr->getRegRef()); + } + } + + return RDefs; +} + + +static const RegisterSet NoRegs; + +NodeList Liveness::getAllReachingDefs(NodeAddr<RefNode*> RefA) { + return getAllReachingDefs(RefA.Addr->getRegRef(), RefA, false, NoRegs); +} + + +void Liveness::computePhiInfo() { + NodeList Phis; + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + Phis.insert(Phis.end(), Ps.begin(), Ps.end()); + } + + // phi use -> (map: reaching phi -> set of registers defined in between) + std::map<NodeId,std::map<NodeId,RegisterSet>> PhiUp; + std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation. + + // Go over all phis. + for (NodeAddr<PhiNode*> PhiA : Phis) { + // Go over all defs and collect the reached uses that are non-phi uses + // (i.e. the "real uses"). + auto &RealUses = RealUseMap[PhiA.Id]; + auto PhiRefs = PhiA.Addr->members(DFG); + + // Have a work queue of defs whose reached uses need to be found. + // For each def, add to the queue all reached (non-phi) defs. + SetVector<NodeId> DefQ; + NodeSet PhiDefs; + for (auto R : PhiRefs) { + if (!DFG.IsRef<NodeAttrs::Def>(R)) + continue; + DefQ.insert(R.Id); + PhiDefs.insert(R.Id); + } + for (unsigned i = 0; i < DefQ.size(); ++i) { + NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]); + NodeId UN = DA.Addr->getReachedUse(); + while (UN != 0) { + NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN); + if (!(A.Addr->getFlags() & NodeAttrs::PhiRef)) + RealUses[getRestrictedRegRef(A)].insert(A.Id); + UN = A.Addr->getSibling(); + } + NodeId DN = DA.Addr->getReachedDef(); + while (DN != 0) { + NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN); + for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) { + uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags(); + // Must traverse the reached-def chain. Consider: + // def(D0) -> def(R0) -> def(R0) -> use(D0) + // The reachable use of D0 passes through a def of R0. + if (!(Flags & NodeAttrs::PhiRef)) + DefQ.insert(T.Id); + } + DN = A.Addr->getSibling(); + } + } + // Filter out these uses that appear to be reachable, but really + // are not. For example: + // + // R1:0 = d1 + // = R1:0 u2 Reached by d1. 
+  //   R0 = d3
+  //      = R1:0   u4   Still reached by d1: indirectly through
+  //                    the def d3.
+  //   R1 = d5
+  //      = R1:0   u6   Not reached by d1 (covered collectively
+  //                    by d3 and d5), but following reached
+  //                    defs and uses from d1 will lead here.
+  auto HasDef = [&PhiDefs] (NodeAddr<DefNode*> DA) -> bool {
+    return PhiDefs.count(DA.Id);
+  };
+  for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) {
+    // For each reached register UI->first, there is a set UI->second of
+    // uses of it. For each such use, check if it is reached by this phi,
+    // i.e. check if the set of its reaching defs intersects the set of
+    // this phi's defs.
+    auto &Uses = UI->second;
+    for (auto I = Uses.begin(), E = Uses.end(); I != E; ) {
+      auto UA = DFG.addr<UseNode*>(*I);
+      NodeList RDs = getAllReachingDefs(UI->first, UA);
+      if (std::any_of(RDs.begin(), RDs.end(), HasDef))
+        ++I;
+      else
+        I = Uses.erase(I);
+    }
+    if (Uses.empty())
+      UI = RealUses.erase(UI);
+    else
+      ++UI;
+  }
+
+  // If this phi reaches some "real" uses, add it to the queue for upward
+  // propagation.
+  if (!RealUses.empty())
+    PhiUQ.push_back(PhiA.Id);
+
+  // Go over all phi uses and check if the reaching def is another phi.
+  // Collect the phis that are among the reaching defs of these uses.
+  // While traversing the list of reaching defs for each phi use, collect
+  // the set of registers defined between this phi (PhiA) and the owner phi
+  // of the reaching def.
+  for (auto I : PhiRefs) {
+    if (!DFG.IsRef<NodeAttrs::Use>(I))
+      continue;
+    NodeAddr<UseNode*> UA = I;
+    auto &UpMap = PhiUp[UA.Id];
+    RegisterSet DefRRs;
+    for (NodeAddr<DefNode*> DA : getAllReachingDefs(UA)) {
+      if (DA.Addr->getFlags() & NodeAttrs::PhiRef)
+        UpMap[DA.Addr->getOwner(DFG).Id] = DefRRs;
+      else
+        DefRRs.insert(DA.Addr->getRegRef());
+    }
+  }
+  }
+
+  if (Trace) {
+    dbgs() << "Phi-up-to-phi map:\n";
+    for (auto I : PhiUp) {
+      dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {";
+      for (auto R : I.second)
+        dbgs() << ' ' << Print<NodeId>(R.first, DFG)
+               << Print<RegisterSet>(R.second, DFG);
+      dbgs() << " }\n";
+    }
+  }
+
+  // Propagate the reached registers up in the phi chain.
+  //
+  // The following type of situation needs careful handling:
+  //
+  //   phi d1<R1:0>  (1)
+  //        |
+  //   ... d2<R1>
+  //        |
+  //   phi u3<R1:0>  (2)
+  //        |
+  //   ... u4<R1>
+  //
+  // The phi node (2) defines a register pair R1:0, and reaches a "real"
+  // use u4 of just R1. The same phi node is also known to reach (upwards)
+  // the phi node (1). However, the use u4 is not reached by phi (1),
+  // because of the intervening definition d2 of R1. The data flow between
+  // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0.
+  //
+  // When propagating uses up the phi chains, get all the reaching defs
+  // for a given phi use, and traverse the list until the propagated ref
+  // is covered, or until reaching the final phi. Only assume that the
+  // reference reaches the phi in the latter case.
+
+  for (unsigned i = 0; i < PhiUQ.size(); ++i) {
+    auto PA = DFG.addr<PhiNode*>(PhiUQ[i]);
+    auto &RealUses = RealUseMap[PA.Id];
+    for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+      NodeAddr<UseNode*> UA = U;
+      auto &UpPhis = PhiUp[UA.Id];
+      for (auto UP : UpPhis) {
+        bool Changed = false;
+        auto &MidDefs = UP.second;
+        // Collect the set UpReached of uses that are reached by the current
+        // phi PA, and are not covered by any intervening def between PA and
+        // the upward phi UP.
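+        // In the example above: for the phi use u3 of phi (2), MidDefs
+        // is { R1 } (collected from d2), so the register R1 of the real
+        // use u4 is covered by MidDefs and is not propagated up to (1).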
+ RegisterSet UpReached; + for (auto T : RealUses) { + if (!isRestricted(PA, UA, T.first)) + continue; + if (!RAI.covers(MidDefs, T.first)) + UpReached.insert(T.first); + } + if (UpReached.empty()) + continue; + // Update the set PRUs of real uses reached by the upward phi UP with + // the actual set of uses (UpReached) that the UP phi reaches. + auto &PRUs = RealUseMap[UP.first]; + for (auto R : UpReached) { + unsigned Z = PRUs[R].size(); + PRUs[R].insert(RealUses[R].begin(), RealUses[R].end()); + Changed |= (PRUs[R].size() != Z); + } + if (Changed) + PhiUQ.push_back(UP.first); + } + } + } + + if (Trace) { + dbgs() << "Real use map:\n"; + for (auto I : RealUseMap) { + dbgs() << "phi " << Print<NodeId>(I.first, DFG); + NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first); + NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG); + if (!Ds.empty()) { + RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(); + dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>'; + } else { + dbgs() << "<noreg>"; + } + dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n'; + } + } +} + + +void Liveness::computeLiveIns() { + // Populate the node-to-block map. This speeds up the calculations + // significantly. + NBMap.clear(); + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + MachineBasicBlock *BB = BA.Addr->getCode(); + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + NBMap.insert(std::make_pair(RA.Id, BB)); + NBMap.insert(std::make_pair(IA.Id, BB)); + } + } + + MachineFunction &MF = DFG.getMF(); + + // Compute IDF first, then the inverse. + decltype(IIDF) IDF; + for (auto &B : MF) { + auto F1 = MDF.find(&B); + if (F1 == MDF.end()) + continue; + SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end()); + for (unsigned i = 0; i < IDFB.size(); ++i) { + auto F2 = MDF.find(IDFB[i]); + if (F2 != MDF.end()) + IDFB.insert(F2->second.begin(), F2->second.end()); + } + // Add B to the IDF(B). This will put B in the IIDF(B). + IDFB.insert(&B); + IDF[&B].insert(IDFB.begin(), IDFB.end()); + } + + for (auto I : IDF) + for (auto S : I.second) + IIDF[S].insert(I.first); + + computePhiInfo(); + + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + + // Build the phi live-on-entry map. + for (NodeAddr<BlockNode*> BA : Blocks) { + MachineBasicBlock *MB = BA.Addr->getCode(); + auto &LON = PhiLON[MB]; + for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG)) + for (auto S : RealUseMap[P.Id]) + LON[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << "Phi live-on-entry map:\n"; + for (auto I : PhiLON) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + // Build the phi live-on-exit map. Each phi node has some set of reached + // "real" uses. Propagate this set backwards into the block predecessors + // through the reaching defs of the corresponding phi uses. + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + for (NodeAddr<PhiNode*> PA : Phis) { + auto &RUs = RealUseMap[PA.Id]; + if (RUs.empty()) + continue; + + for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { + NodeAddr<PhiUseNode*> UA = U; + if (UA.Addr->getReachingDef() == 0) + continue; + + // Mark all reached "real" uses of P as live on exit in the + // predecessor. + // Remap all the RUs so that they have a correct reaching def. 
+ auto PrA = DFG.addr<BlockNode*>(UA.Addr->getPredecessor()); + auto &LOX = PhiLOX[PrA.Addr->getCode()]; + for (auto R : RUs) { + RegisterRef RR = R.first; + if (!isRestricted(PA, UA, RR)) + RR = getRestrictedRegRef(UA); + // The restricted ref may be different from the ref that was + // accessed in the "real use". This means that this phi use + // is not the one that carries this reference, so skip it. + if (!RAI.alias(R.first, RR)) + continue; + for (auto D : getAllReachingDefs(RR, UA)) + LOX[RR].insert(D.Id); + } + } // for U : phi uses + } // for P : Phis + } // for B : Blocks + + if (Trace) { + dbgs() << "Phi live-on-exit map:\n"; + for (auto I : PhiLOX) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + RefMap LiveIn; + traverse(&MF.front(), LiveIn); + + // Add function live-ins to the live-in set of the function entry block. + auto &EntryIn = LiveMap[&MF.front()]; + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) + EntryIn.insert({I->first,0}); + + if (Trace) { + // Dump the liveness map + for (auto &B : MF) { + BitVector LV(TRI.getNumRegs()); + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + LV.set(I->PhysReg); + dbgs() << "BB#" << B.getNumber() << "\t rec = {"; + for (int x = LV.find_first(); x >= 0; x = LV.find_next(x)) + dbgs() << ' ' << Print<RegisterRef>({unsigned(x),0}, DFG); + dbgs() << " }\n"; + dbgs() << "\tcomp = " << Print<RegisterSet>(LiveMap[&B], DFG) << '\n'; + } + } +} + + +void Liveness::resetLiveIns() { + for (auto &B : DFG.getMF()) { + // Remove all live-ins. + std::vector<unsigned> T; + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + T.push_back(I->PhysReg); + for (auto I : T) + B.removeLiveIn(I); + // Add the newly computed live-ins. + auto &LiveIns = LiveMap[&B]; + for (auto I : LiveIns) { + assert(I.Sub == 0); + B.addLiveIn(I.Reg); + } + } +} + + +void Liveness::resetKills() { + for (auto &B : DFG.getMF()) + resetKills(&B); +} + + +void Liveness::resetKills(MachineBasicBlock *B) { + auto CopyLiveIns = [] (MachineBasicBlock *B, BitVector &LV) -> void { + for (auto I = B->livein_begin(), E = B->livein_end(); I != E; ++I) + LV.set(I->PhysReg); + }; + + BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs()); + CopyLiveIns(B, LiveIn); + for (auto SI : B->successors()) + CopyLiveIns(SI, Live); + + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) { + MachineInstr *MI = &*I; + if (MI->isDebugValue()) + continue; + + MI->clearKillInfo(); + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.reset(*SR); + } + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + bool IsLive = false; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) { + if (!Live[*SR]) + continue; + IsLive = true; + break; + } + if (IsLive) + continue; + Op.setIsKill(true); + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.set(*SR); + } + } +} + + +// For shadows, determine if RR is aliased to a reaching def of any other +// shadow associated with RA. If it is not, then RR is "restricted" to RA, +// and so it can be considered a value specific to RA. This is important +// for accurately determining values associated with phi uses. 
+// For non-shadows, this function returns "true". +bool Liveness::isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const { + NodeId Start = RA.Id; + for (NodeAddr<RefNode*> TA = DFG.getNextShadow(IA, RA); + TA.Id != 0 && TA.Id != Start; TA = DFG.getNextShadow(IA, TA)) { + NodeId RD = TA.Addr->getReachingDef(); + if (RD == 0) + continue; + if (RAI.alias(RR, DFG.addr<DefNode*>(RD).Addr->getRegRef())) + return false; + } + return true; +} + + +RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const { + assert(DFG.IsRef<NodeAttrs::Use>(RA)); + if (RA.Addr->getFlags() & NodeAttrs::Shadow) { + NodeId RD = RA.Addr->getReachingDef(); + assert(RD); + RA = DFG.addr<DefNode*>(RD); + } + return RA.Addr->getRegRef(); +} + + +unsigned Liveness::getPhysReg(RegisterRef RR) const { + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + return 0; + return RR.Sub ? TRI.getSubReg(RR.Reg, RR.Sub) : RR.Reg; +} + + +// Helper function to obtain the basic block containing the reaching def +// of the given use. +MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const { + auto F = NBMap.find(RN); + if (F != NBMap.end()) + return F->second; + llvm_unreachable("Node id not in map"); +} + + +void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { + // The LiveIn map, for each (physical) register, contains the set of live + // reaching defs of that register that are live on entry to the associated + // block. + + // The summary of the traversal algorithm: + // + // R is live-in in B, if there exists a U(R), such that rdef(R) dom B + // and (U \in IDF(B) or B dom U). + // + // for (C : children) { + // LU = {} + // traverse(C, LU) + // LiveUses += LU + // } + // + // LiveUses -= Defs(B); + // LiveUses += UpwardExposedUses(B); + // for (C : IIDF[B]) + // for (U : LiveUses) + // if (Rdef(U) dom C) + // C.addLiveIn(U) + // + + // Go up the dominator tree (depth-first). + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) { + RefMap L; + MachineBasicBlock *SB = I->getBlock(); + traverse(SB, L); + + for (auto S : L) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << LLVM_FUNCTION_NAME << " in BB#" << B->getNumber() + << " after recursion into"; + for (auto I : *N) + dbgs() << ' ' << I->getBlock()->getNumber(); + dbgs() << "\n LiveIn: " << Print<RefMap>(LiveIn, DFG); + dbgs() << "\n Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Add phi uses that are live on exit from this block. + RefMap &PUs = PhiLOX[B]; + for (auto S : PUs) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + + if (Trace) { + dbgs() << "after LOX\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Stop tracking all uses defined in this block: erase those records + // where the reaching def is located in B and which cover all reached + // uses. + auto Copy = LiveIn; + LiveIn.clear(); + + for (auto I : Copy) { + auto &Defs = LiveIn[I.first]; + NodeSet Rest; + for (auto R : I.second) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DDR = DA.Addr->getRegRef(); + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Defs from a different block need to be preserved. Defs from this + // block will need to be processed further, except for phi defs, the + // liveness of which is handled through the PhiLON/PhiLOX maps. 
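+      // A non-phi, non-preserving def from B stops the propagation of
+      // I.first only if it covers both the tracked register and every
+      // use it reaches; otherwise it is added to Rest, and its own
+      // reaching defs (from other blocks) are substituted below.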
+ if (B != BA.Addr->getCode()) + Defs.insert(R); + else { + bool IsPreserving = DA.Addr->getFlags() & NodeAttrs::Preserving; + if (IA.Addr->getKind() != NodeAttrs::Phi && !IsPreserving) { + bool Covering = RAI.covers(DDR, I.first); + NodeId U = DA.Addr->getReachedUse(); + while (U && Covering) { + auto DUA = DFG.addr<UseNode*>(U); + RegisterRef Q = DUA.Addr->getRegRef(); + Covering = RAI.covers(DA.Addr->getRegRef(), Q); + U = DUA.Addr->getSibling(); + } + if (!Covering) + Rest.insert(R); + } + } + } + + // Non-covering defs from B. + for (auto R : Rest) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DRR = DA.Addr->getRegRef(); + RegisterSet RRs; + for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) { + NodeAddr<InstrNode*> IA = TA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Preserving defs do not count towards covering. + if (!(TA.Addr->getFlags() & NodeAttrs::Preserving)) + RRs.insert(TA.Addr->getRegRef()); + if (BA.Addr->getCode() == B) + continue; + if (RAI.covers(RRs, DRR)) + break; + Defs.insert(TA.Id); + } + } + } + + emptify(LiveIn); + + if (Trace) { + dbgs() << "after defs in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Scan the block for upward-exposed uses and add them to the tracking set. + for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) { + NodeAddr<InstrNode*> IA = I; + if (IA.Addr->getKind() != NodeAttrs::Stmt) + continue; + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + RegisterRef RR = UA.Addr->getRegRef(); + for (auto D : getAllReachingDefs(UA)) + if (getBlockWithRef(D.Id) != B) + LiveIn[RR].insert(D.Id); + } + } + + if (Trace) { + dbgs() << "after uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Phi uses should not be propagated up the dominator tree, since they + // are not dominated by their corresponding reaching defs. + auto &Local = LiveMap[B]; + auto &LON = PhiLON[B]; + for (auto R : LON) + Local.insert(R.first); + + if (Trace) { + dbgs() << "after phi uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(Local, DFG) << '\n'; + } + + for (auto C : IIDF[B]) { + auto &LiveC = LiveMap[C]; + for (auto S : LiveIn) + for (auto R : S.second) + if (MDT.properlyDominates(getBlockWithRef(R), C)) + LiveC.insert(S.first); + } +} + + +void Liveness::emptify(RefMap &M) { + for (auto I = M.begin(), E = M.end(); I != E; ) + I = I->second.empty() ? M.erase(I) : std::next(I); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h new file mode 100644 index 0000000..4c1e8f3 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h @@ -0,0 +1,106 @@ +//===--- RDFLiveness.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Recalculate the liveness information given a data flow graph. +// This includes block live-ins and kill flags. 
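+//
+// A typical driver sequence (a sketch: it assumes a DataFlowGraph has
+// been constructed for the same function, with the constructor arguments
+// matching the graph's members):
+//
+//   DataFlowGraph G(MF, TII, TRI, MDT, MDF, RAI, TOI);
+//   G.build();
+//   Liveness LV(MF.getRegInfo(), G);
+//   LV.computeLiveIns();  // compute block live-in sets from the graph
+//   LV.resetLiveIns();    // rewrite the MachineBasicBlock live-in lists
+//   LV.resetKills();      // recompute kill flags from the new live-ins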
+ +#ifndef RDF_LIVENESS_H +#define RDF_LIVENESS_H + +#include "RDFGraph.h" +#include "llvm/ADT/DenseMap.h" +#include <map> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineRegisterInfo; + class TargetRegisterInfo; + class MachineDominatorTree; + class MachineDominanceFrontier; +} + +namespace rdf { + struct Liveness { + public: + typedef std::map<MachineBasicBlock*,RegisterSet> LiveMapType; + typedef std::map<RegisterRef,NodeSet> RefMap; + + Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g) + : DFG(g), TRI(g.getTRI()), MDT(g.getDT()), MDF(g.getDF()), + RAI(g.getRAI()), MRI(mri), Empty(), Trace(false) {} + + NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA, + bool FullChain = false, const RegisterSet &DefRRs = RegisterSet()); + NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA); + + LiveMapType &getLiveMap() { return LiveMap; } + const LiveMapType &getLiveMap() const { return LiveMap; } + const RefMap &getRealUses(NodeId P) const { + auto F = RealUseMap.find(P); + return F == RealUseMap.end() ? Empty : F->second; + } + + void computePhiInfo(); + void computeLiveIns(); + void resetLiveIns(); + void resetKills(); + void resetKills(MachineBasicBlock *B); + + void trace(bool T) { Trace = T; } + + private: + const DataFlowGraph &DFG; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + MachineRegisterInfo &MRI; + LiveMapType LiveMap; + const RefMap Empty; + bool Trace; + + // Cache of mapping from node ids (for RefNodes) to the containing + // basic blocks. Not computing it each time for each node reduces + // the liveness calculation time by a large fraction. + typedef DenseMap<NodeId,MachineBasicBlock*> NodeBlockMap; + NodeBlockMap NBMap; + + // Phi information: + // + // map: NodeId -> (map: RegisterRef -> NodeSet) + // phi id -> (map: register -> set of reached non-phi uses) + std::map<NodeId, RefMap> RealUseMap; + + // Inverse iterated dominance frontier. + std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF; + + // Live on entry. + std::map<MachineBasicBlock*,RefMap> PhiLON; + + // Phi uses are considered to be located at the end of the block that + // they are associated with. The reaching def of a phi use dominates the + // block that the use corresponds to, but not the block that contains + // the phi itself. To include these uses in the liveness propagation (up + // the dominator tree), create a map: block -> set of uses live on exit. 
+ std::map<MachineBasicBlock*,RefMap> PhiLOX; + + bool isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const; + RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const; + unsigned getPhysReg(RegisterRef RR) const; + MachineBasicBlock *getBlockWithRef(NodeId RN) const; + void traverse(MachineBasicBlock *B, RefMap &LiveIn); + void emptify(RefMap &M); + }; +} + +#endif // RDF_LIVENESS_H diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index 70141a9..72afec1 100644 --- a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -17,8 +17,6 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; - class MSP430InstPrinter : public MCInstPrinter { public: MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h index ff5b0b6..183dee3 100644 --- a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h @@ -17,13 +17,14 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; +class Triple; - class MSP430MCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit MSP430MCAsmInfo(const Triple &TT); - }; +class MSP430MCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit MSP430MCAsmInfo(const Triple &TT); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp index ffcf222..606abc2 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -64,7 +64,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) { unsigned FuncSize = 0; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; unsigned BlockSize = 0; for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 29bc8b3..18f38b7 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -69,10 +69,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, computeRegisterProperties(STI.getRegisterInfo()); // Provide all sorts of operation actions - - // Division is expensive - setIntDivIsCheap(false); - setStackPointerRegisterToSaveRestore(MSP430::SP); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? 
@@ -508,9 +504,10 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, // Create the SelectionDAG nodes corresponding to a load //from this parameter SDValue FIN = DAG.getFrameIndex(FI, MVT::i16); - InVal = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + InVal = DAG.getLoad( + VA.getLocVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); } InVals.push_back(InVal); @@ -1231,8 +1228,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI, } const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = BB; - ++I; + MachineFunction::iterator I = ++BB->getIterator(); // Create loop block MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -1320,8 +1316,7 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = BB; - ++I; + MachineFunction::iterator I = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 72b1780..d4f82bd 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -44,11 +44,10 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); if (RC == &MSP430::GR16RegClass) BuildMI(MBB, MI, DL, get(MSP430::MOV16mr)) @@ -72,11 +71,10 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); if (RC == &MSP430::GR16RegClass) BuildMI(MBB, MI, DL, get(MSP430::MOV16rm)) diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp index 54154a8..47b0e27 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp @@ -50,9 +50,9 @@ GetExternalSymbolSymbol(const MachineOperand &MO) const { MCSymbol *MSP430MCInstLower:: GetJumpTableSymbol(const MachineOperand &MO) const { - const DataLayout *DL = Printer.TM.getDataLayout(); + const DataLayout &DL = Printer.getDataLayout(); SmallString<256> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI" << Printer.getFunctionNumber() << '_' << MO.getIndex(); @@ -67,9 +67,9 @@ GetJumpTableSymbol(const MachineOperand &MO) const { 
MCSymbol *MSP430MCInstLower:: GetConstantPoolIndexSymbol(const MachineOperand &MO) const { - const DataLayout *DL = Printer.TM.getDataLayout(); + const DataLayout &DL = Printer.getDataLayout(); SmallString<256> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "CPI" << Printer.getFunctionNumber() << '_' << MO.getIndex(); diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp index 0f75399..b442fc0 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- MSP430MachineFuctionInfo.cpp - MSP430 machine function info -------===// +//===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h index fcc5f5b..2d93731 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===- MSP430MachineFuctionInfo.h - MSP430 machine function info -*- C++ -*-==// +//=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 5107d2a..d4e061f 100644 --- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/MipsMCExpr.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MipsRegisterInfo.h" +#include "MipsTargetObjectFile.h" #include "MipsTargetStreamer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" @@ -106,7 +107,6 @@ class MipsAsmParser : public MCTargetAsmParser { return static_cast<MipsTargetStreamer &>(TS); } - MCSubtargetInfo &STI; MipsABIInfo ABI; SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions; MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a @@ -114,6 +114,12 @@ class MipsAsmParser : public MCTargetAsmParser { // selected. This usually happens after an '.end func' // directive. bool IsLittleEndian; + bool IsPicEnabled; + bool IsCpRestoreSet; + int CpRestoreOffset; + unsigned CpSaveLocation; + /// If true, then CpSaveLocation is a register, otherwise it's an offset. + bool CpSaveLocationIsRegister; // Print a warning along with its fix-it message at the given range. 
void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg, @@ -141,50 +147,41 @@ class MipsAsmParser : public MCTargetAsmParser { bool ParseDirective(AsmToken DirectiveID) override; - MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy + OperandMatchResultTy parseMemOperand(OperandVector &Operands); + OperandMatchResultTy matchAnyRegisterNameWithoutDollar(OperandVector &Operands, StringRef Identifier, SMLoc S); - - MipsAsmParser::OperandMatchResultTy - matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S); - - MipsAsmParser::OperandMatchResultTy parseAnyRegister(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseImm(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseJumpTarget(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseLSAImm(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseRegisterPair (OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseMovePRegPair(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseRegisterList (OperandVector &Operands); + OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands, + SMLoc S); + OperandMatchResultTy parseAnyRegister(OperandVector &Operands); + OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseJumpTarget(OperandVector &Operands); + OperandMatchResultTy parseInvNum(OperandVector &Operands); + OperandMatchResultTy parseLSAImm(OperandVector &Operands); + OperandMatchResultTy parseRegisterPair(OperandVector &Operands); + OperandMatchResultTy parseMovePRegPair(OperandVector &Operands); + OperandMatchResultTy parseRegisterList(OperandVector &Operands); bool searchSymbolAlias(OperandVector &Operands); bool parseOperand(OperandVector &, StringRef Mnemonic); - bool needsExpansion(MCInst &Inst); + enum MacroExpanderResultTy { + MER_NotAMacro, + MER_Success, + MER_Fail, + }; // Expands assembly pseudo instructions. - // Returns false on success, true otherwise. 
- bool expandInstruction(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + MacroExpanderResultTy + tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg, - bool Is32BitImm, SMLoc IDLoc, + bool Is32BitImm, bool IsAddress, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg, @@ -194,11 +191,10 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + bool expandLoadAddress(unsigned DstReg, unsigned BaseReg, + const MCOperand &Offset, bool Is32BitAddress, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); @@ -209,24 +205,43 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandUlhu(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + bool expandDiv(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions, const bool IsMips64, + const bool Signed); + + bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); bool expandUlw(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool expandRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandDRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + void createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); void createAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, SmallVectorImpl<MCInst> &Instructions); + void createCpRestoreMemOp(bool IsLoad, int StackOffset, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -239,8 +254,11 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetMips0Directive(); bool parseSetArchDirective(); bool parseSetFeature(uint64_t Feature); + bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup. bool parseDirectiveCpLoad(SMLoc Loc); + bool parseDirectiveCpRestore(SMLoc Loc); bool parseDirectiveCPSetup(); + bool parseDirectiveCPReturn(); bool parseDirectiveNaN(); bool parseDirectiveSet(); bool parseDirectiveOption(); @@ -337,6 +355,7 @@ class MipsAsmParser : public MCTargetAsmParser { // FeatureMipsGP64 | FeatureMips1) // Clearing Mips3 is equivalent to clear (FeatureMips3 | FeatureMips4). 
void selectArch(StringRef ArchFeature) { + MCSubtargetInfo &STI = copySTI(); FeatureBitset FeatureBits = STI.getFeatureBits(); FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask; STI.setFeatureBits(FeatureBits); @@ -346,7 +365,8 @@ class MipsAsmParser : public MCTargetAsmParser { } void setFeatureBits(uint64_t Feature, StringRef FeatureString) { - if (!(STI.getFeatureBits()[Feature])) { + if (!(getSTI().getFeatureBits()[Feature])) { + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); @@ -354,7 +374,8 @@ class MipsAsmParser : public MCTargetAsmParser { } void clearFeatureBits(uint64_t Feature, StringRef FeatureString) { - if (STI.getFeatureBits()[Feature]) { + if (getSTI().getFeatureBits()[Feature]) { + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); @@ -363,26 +384,25 @@ class MipsAsmParser : public MCTargetAsmParser { void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { setFeatureBits(Feature, FeatureString); - AssemblerOptions.front()->setFeatures(STI.getFeatureBits()); + AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { clearFeatureBits(Feature, FeatureString); - AssemblerOptions.front()->setFeatures(STI.getFeatureBits()); + AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } public: enum MipsMatchResultTy { - Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY + Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "MipsGenAsmMatcher.inc" #undef GET_OPERAND_DIAGNOSTIC_TYPES - }; - MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), + : MCTargetAsmParser(Options, sti), ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()), sti.getCPU(), Options)) { MCAsmParserExtension::Initialize(parser); @@ -390,15 +410,15 @@ public: parser.addAliasForDirective(".asciiz", ".asciz"); // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + // Remember the initial assembler options. The user can not modify these. AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits())); - + llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); + // Create an assembler options environment for the user to modify. AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits())); + llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); getTargetStreamer().updateABIInfo(*this); @@ -407,6 +427,12 @@ public: CurrentFn = nullptr; + IsPicEnabled = + (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_); + + IsCpRestoreSet = false; + CpRestoreOffset = -1; + Triple TheTriple(sti.getTargetTriple()); if ((TheTriple.getArch() == Triple::mips) || (TheTriple.getArch() == Triple::mips64)) @@ -418,70 +444,103 @@ public: /// True if all of $fcc0 - $fcc7 exist for the current ISA. 
bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); } - bool isGP64bit() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; } - bool isFP64bit() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; } + bool isGP64bit() const { + return getSTI().getFeatureBits()[Mips::FeatureGP64Bit]; + } + bool isFP64bit() const { + return getSTI().getFeatureBits()[Mips::FeatureFP64Bit]; + } const MipsABIInfo &getABI() const { return ABI; } bool isABI_N32() const { return ABI.IsN32(); } bool isABI_N64() const { return ABI.IsN64(); } bool isABI_O32() const { return ABI.IsO32(); } - bool isABI_FPXX() const { return STI.getFeatureBits()[Mips::FeatureFPXX]; } + bool isABI_FPXX() const { + return getSTI().getFeatureBits()[Mips::FeatureFPXX]; + } bool useOddSPReg() const { - return !(STI.getFeatureBits()[Mips::FeatureNoOddSPReg]); + return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]); } bool inMicroMipsMode() const { - return STI.getFeatureBits()[Mips::FeatureMicroMips]; + return getSTI().getFeatureBits()[Mips::FeatureMicroMips]; + } + bool hasMips1() const { + return getSTI().getFeatureBits()[Mips::FeatureMips1]; + } + bool hasMips2() const { + return getSTI().getFeatureBits()[Mips::FeatureMips2]; + } + bool hasMips3() const { + return getSTI().getFeatureBits()[Mips::FeatureMips3]; + } + bool hasMips4() const { + return getSTI().getFeatureBits()[Mips::FeatureMips4]; + } + bool hasMips5() const { + return getSTI().getFeatureBits()[Mips::FeatureMips5]; } - bool hasMips1() const { return STI.getFeatureBits()[Mips::FeatureMips1]; } - bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; } - bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; } - bool hasMips4() const { return STI.getFeatureBits()[Mips::FeatureMips4]; } - bool hasMips5() const { return STI.getFeatureBits()[Mips::FeatureMips5]; } bool hasMips32() const { - return STI.getFeatureBits()[Mips::FeatureMips32]; + return getSTI().getFeatureBits()[Mips::FeatureMips32]; } bool hasMips64() const { - return STI.getFeatureBits()[Mips::FeatureMips64]; + return getSTI().getFeatureBits()[Mips::FeatureMips64]; } bool hasMips32r2() const { - return STI.getFeatureBits()[Mips::FeatureMips32r2]; + return getSTI().getFeatureBits()[Mips::FeatureMips32r2]; } bool hasMips64r2() const { - return STI.getFeatureBits()[Mips::FeatureMips64r2]; + return getSTI().getFeatureBits()[Mips::FeatureMips64r2]; } bool hasMips32r3() const { - return (STI.getFeatureBits()[Mips::FeatureMips32r3]); + return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]); } bool hasMips64r3() const { - return (STI.getFeatureBits()[Mips::FeatureMips64r3]); + return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]); } bool hasMips32r5() const { - return (STI.getFeatureBits()[Mips::FeatureMips32r5]); + return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]); } bool hasMips64r5() const { - return (STI.getFeatureBits()[Mips::FeatureMips64r5]); + return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]); } bool hasMips32r6() const { - return STI.getFeatureBits()[Mips::FeatureMips32r6]; + return getSTI().getFeatureBits()[Mips::FeatureMips32r6]; } bool hasMips64r6() const { - return STI.getFeatureBits()[Mips::FeatureMips64r6]; + return getSTI().getFeatureBits()[Mips::FeatureMips64r6]; } - bool hasDSP() const { return STI.getFeatureBits()[Mips::FeatureDSP]; } - bool hasDSPR2() const { return STI.getFeatureBits()[Mips::FeatureDSPR2]; } - bool hasMSA() const { return STI.getFeatureBits()[Mips::FeatureMSA]; } + bool hasDSP() const { + 
return getSTI().getFeatureBits()[Mips::FeatureDSP]; + } + bool hasDSPR2() const { + return getSTI().getFeatureBits()[Mips::FeatureDSPR2]; + } + bool hasDSPR3() const { + return getSTI().getFeatureBits()[Mips::FeatureDSPR3]; + } + bool hasMSA() const { + return getSTI().getFeatureBits()[Mips::FeatureMSA]; + } bool hasCnMips() const { - return (STI.getFeatureBits()[Mips::FeatureCnMips]); + return (getSTI().getFeatureBits()[Mips::FeatureCnMips]); + } + + bool inPicMode() { + return IsPicEnabled; } bool inMips16Mode() const { - return STI.getFeatureBits()[Mips::FeatureMips16]; + return getSTI().getFeatureBits()[Mips::FeatureMips16]; + } + + bool useTraps() const { + return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV]; } bool useSoftFloat() const { - return STI.getFeatureBits()[Mips::FeatureSoftFloat]; + return getSTI().getFeatureBits()[Mips::FeatureSoftFloat]; } /// Warn if RegIndex is the same as the current AT. @@ -869,6 +928,16 @@ public: Inst.addOperand(MCOperand::createReg(getHWRegsReg())); } + template <unsigned Bits, int Offset = 0, int AdjustOffset = 0> + void addConstantUImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + uint64_t Imm = getConstantImm() - Offset; + Imm &= (1 << Bits) - 1; + Imm += Offset; + Imm += AdjustOffset; + Inst.addOperand(MCOperand::createImm(Imm)); + } + void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCExpr *Expr = getImm(); @@ -878,7 +947,9 @@ public: void addMemOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(getMemBase()->getGPR32Reg())); + Inst.addOperand(MCOperand::createReg(AsmParser.getABI().ArePtrs64bit() + ? getMemBase()->getGPR64Reg() + : getMemBase()->getGPR32Reg())); const MCExpr *Expr = getMemOff(); addExpr(Inst, Expr); @@ -924,10 +995,16 @@ public: bool isRegIdx() const { return Kind == k_RegisterIndex; } bool isImm() const override { return Kind == k_Immediate; } bool isConstantImm() const { - return isImm() && dyn_cast<MCConstantExpr>(getImm()); + return isImm() && isa<MCConstantExpr>(getImm()); + } + bool isConstantImmz() const { + return isConstantImm() && getConstantImm() == 0; } - template <unsigned Bits> bool isUImm() const { - return isImm() && isConstantImm() && isUInt<Bits>(getConstantImm()); + template <unsigned Bits, int Offset = 0> bool isConstantUImm() const { + return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset); + } + template <unsigned Bits> bool isConstantSImm() const { + return isConstantImm() && isInt<Bits>(getConstantImm()); } bool isToken() const override { // Note: It's not possible to pretend that other operand kinds are tokens. 
@@ -936,10 +1013,15 @@ public: } bool isMem() const override { return Kind == k_Memory; } bool isConstantMemOff() const { - return isMem() && dyn_cast<MCConstantExpr>(getMemOff()); + return isMem() && isa<MCConstantExpr>(getMemOff()); } template <unsigned Bits> bool isMemWithSimmOffset() const { - return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()); + return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) + && getMemBase()->isGPRAsmReg(); + } + template <unsigned Bits> bool isMemWithSimmOffsetGPR() const { + return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) && + getMemBase()->isGPRAsmReg(); } bool isMemWithGRPMM16Base() const { return isMem() && getMemBase()->isMM16AsmReg(); @@ -953,13 +1035,23 @@ public: && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP); } + template <unsigned Bits, unsigned ShiftLeftAmount> + bool isScaledUImm() const { + return isConstantImm() && + isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm()); + } bool isRegList16() const { if (!isRegList()) return false; int Size = RegList.List->size(); - if (Size < 2 || Size > 5 || *RegList.List->begin() != Mips::S0 || - RegList.List->back() != Mips::RA) + if (Size < 2 || Size > 5) + return false; + + unsigned R0 = RegList.List->front(); + unsigned R1 = RegList.List->back(); + if (!((R0 == Mips::S0 && R1 == Mips::RA) || + (R0 == Mips::S0_64 && R1 == Mips::RA_64))) return false; int PrevReg = *RegList.List->begin(); @@ -1304,9 +1396,123 @@ static bool hasShortDelaySlot(unsigned Opcode) { } } +static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) { + if (const MCSymbolRefExpr *SRExpr = dyn_cast<MCSymbolRefExpr>(Expr)) { + return &SRExpr->getSymbol(); + } + + if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) { + const MCSymbol *LHSSym = getSingleMCSymbol(BExpr->getLHS()); + const MCSymbol *RHSSym = getSingleMCSymbol(BExpr->getRHS()); + + if (LHSSym) + return LHSSym; + + if (RHSSym) + return RHSSym; + + return nullptr; + } + + if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr)) + return getSingleMCSymbol(UExpr->getSubExpr()); + + return nullptr; +} + +static unsigned countMCSymbolRefExpr(const MCExpr *Expr) { + if (isa<MCSymbolRefExpr>(Expr)) + return 1; + + if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) + return countMCSymbolRefExpr(BExpr->getLHS()) + + countMCSymbolRefExpr(BExpr->getRHS()); + + if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr)) + return countMCSymbolRefExpr(UExpr->getSubExpr()); + + return 0; +} + +namespace { +void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.addOperand(Op1); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, Instructions); +} + +void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, Instructions); +} + +void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createImm(Imm1)); + tmpInst.addOperand(MCOperand::createImm(Imm2)); 
+ tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.addOperand(MCOperand::createReg(Reg1)); + tmpInst.addOperand(Op2); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, + Instructions); +} + +void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, + Instructions); +} + +void emitAppropriateDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + if (ShiftAmount >= 32) { + emitRRI(Mips::DSLL32, DstReg, SrcReg, ShiftAmount - 32, IDLoc, + Instructions); + return; + } + + emitRRI(Mips::DSLL, DstReg, SrcReg, ShiftAmount, IDLoc, Instructions); +} +} // end anonymous namespace. + bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + bool ExpandedJalSym = false; Inst.setLoc(IDLoc); @@ -1365,12 +1571,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZ16_MM: + case Mips::BEQZC16_MMR6: case Mips::BNEZ16_MM: + case Mips::BNEZC16_MMR6: assert(MCID.getNumOperands() == 2 && "unexpected number of operands"); Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. - if (!isIntN(8, Offset.getImm())) + if (!isInt<8>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 2LL)) return Error(IDLoc, "branch to misaligned address"); @@ -1415,32 +1623,6 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } break; - case Mips::CINS: - case Mips::CINS32: - case Mips::EXTS: - case Mips::EXTS32: - assert(MCID.getNumOperands() == 4 && "unexpected number of operands"); - // Check length - Opnd = Inst.getOperand(3); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (Imm < 0 || Imm > 31) - return Error(IDLoc, "immediate operand value out of range"); - // Check position - Opnd = Inst.getOperand(2); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (Imm < 0 || Imm > (Opcode == Mips::CINS || - Opcode == Mips::EXTS ? 63 : 31)) - return Error(IDLoc, "immediate operand value out of range"); - if (Imm > 31) { - Inst.setOpcode(Opcode == Mips::CINS ? 
Mips::CINS32 : Mips::EXTS32); - Inst.getOperand(2).setImm(Imm - 32); - } - break; - case Mips::SEQi: case Mips::SNEi: assert(MCID.getNumOperands() == 3 && "unexpected number of operands"); @@ -1454,6 +1636,81 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } } + // This expansion is not in a function called by tryExpandInstruction() + // because the pseudo-instruction doesn't have a distinct opcode. + if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) && + inPicMode()) { + warnIfNoMacro(IDLoc); + + const MCExpr *JalExpr = Inst.getOperand(0).getExpr(); + + // We can do this expansion if there's only 1 symbol in the argument + // expression. + if (countMCSymbolRefExpr(JalExpr) > 1) + return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode"); + + // FIXME: This is checking the expression can be handled by the later stages + // of the assembler. We ought to leave it to those later stages but + // we can't do that until we stop evaluateRelocExpr() rewriting the + // expressions into non-equivalent forms. + const MCSymbol *JalSym = getSingleMCSymbol(JalExpr); + + // FIXME: Add support for label+offset operands (currently causes an error). + // FIXME: Add support for forward-declared local symbols. + // FIXME: Add expansion for when the LargeGOT option is enabled. + if (JalSym->isInSection() || JalSym->isTemporary()) { + if (isABI_O32()) { + // If it's a local symbol and the O32 ABI is being used, we expand to: + // lw $25, 0($gp) + // R_(MICRO)MIPS_GOT16 label + // addiu $25, $25, 0 + // R_(MICRO)MIPS_LO16 label + // jalr $25 + const MCExpr *Got16RelocExpr = evaluateRelocExpr(JalExpr, "got"); + const MCExpr *Lo16RelocExpr = evaluateRelocExpr(JalExpr, "lo"); + + emitRRX(Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(Got16RelocExpr), IDLoc, Instructions); + emitRRX(Mips::ADDiu, Mips::T9, Mips::T9, + MCOperand::createExpr(Lo16RelocExpr), IDLoc, Instructions); + } else if (isABI_N32() || isABI_N64()) { + // If it's a local symbol and the N32/N64 ABIs are being used, + // we expand to: + // lw/ld $25, 0($gp) + // R_(MICRO)MIPS_GOT_DISP label + // jalr $25 + const MCExpr *GotDispRelocExpr = evaluateRelocExpr(JalExpr, "got_disp"); + + emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(GotDispRelocExpr), IDLoc, Instructions); + } + } else { + // If it's an external/weak symbol, we expand to: + // lw/ld $25, 0($gp) + // R_(MICRO)MIPS_CALL16 label + // jalr $25 + const MCExpr *Call16RelocExpr = evaluateRelocExpr(JalExpr, "call16"); + + emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(Call16RelocExpr), IDLoc, Instructions); + } + + MCInst JalrInst; + if (IsCpRestoreSet && inMicroMipsMode()) + JalrInst.setOpcode(Mips::JALRS_MM); + else + JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); + JalrInst.addOperand(MCOperand::createReg(Mips::RA)); + JalrInst.addOperand(MCOperand::createReg(Mips::T9)); + + // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR. + // This relocation is supposed to be an optimization hint for the linker + // and is not necessary for correctness. + + Inst = JalrInst; + ExpandedJalSym = true; + } + if (MCID.mayLoad() || MCID.mayStore()) { // Check the offset of memory operand, if it is a symbol // reference or immediate we may have to expand instructions. 
@@ -1500,17 +1757,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, int MemOffset = Op.getImm(); MCOperand &DstReg = Inst.getOperand(0); MCOperand &BaseReg = Inst.getOperand(1); - if (isIntN(9, MemOffset) && (MemOffset % 4 == 0) && + if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) && getContext().getRegisterInfo()->getRegClass( Mips::GPRMM16RegClassID).contains(DstReg.getReg()) && - BaseReg.getReg() == Mips::GP) { - MCInst TmpInst; - TmpInst.setLoc(IDLoc); - TmpInst.setOpcode(Mips::LWGP_MM); - TmpInst.addOperand(MCOperand::createReg(DstReg.getReg())); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); - TmpInst.addOperand(MCOperand::createImm(MemOffset)); - Instructions.push_back(TmpInst); + (BaseReg.getReg() == Mips::GP || + BaseReg.getReg() == Mips::GP_64)) { + + emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset, + IDLoc, Instructions); return false; } } @@ -1597,7 +1851,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, if (Imm < -1 || Imm > 14) return Error(IDLoc, "immediate operand value out of range"); break; + case Mips::TEQ_MM: + case Mips::TGE_MM: + case Mips::TGEU_MM: + case Mips::TLT_MM: + case Mips::TLTU_MM: + case Mips::TNE_MM: case Mips::SB16_MM: + case Mips::SB16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1607,6 +1868,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; case Mips::LHU16_MM: case Mips::SH16_MM: + case Mips::SH16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1616,6 +1878,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; case Mips::LW16_MM: case Mips::SW16_MM: + case Mips::SW16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1623,93 +1886,111 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, if (Imm < 0 || Imm > 60 || (Imm % 4 != 0)) return Error(IDLoc, "immediate operand value out of range"); break; - case Mips::CACHE: - case Mips::PREF: - Opnd = Inst.getOperand(2); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (!isUInt<5>(Imm)) - return Error(IDLoc, "immediate operand value out of range"); - break; case Mips::ADDIUPC_MM: MCOperand Opnd = Inst.getOperand(1); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); int Imm = Opnd.getImm(); - if ((Imm % 4 != 0) || !isIntN(25, Imm)) + if ((Imm % 4 != 0) || !isInt<25>(Imm)) return Error(IDLoc, "immediate operand value out of range"); break; } } - if (needsExpansion(Inst)) { - if (expandInstruction(Inst, IDLoc, Instructions)) - return true; - } else + MacroExpanderResultTy ExpandResult = + tryExpandInstruction(Inst, IDLoc, Instructions); + switch (ExpandResult) { + case MER_NotAMacro: Instructions.push_back(Inst); + break; + case MER_Success: + break; + case MER_Fail: + return true; + } // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions); - return false; -} + if ((Inst.getOpcode() == Mips::JalOneReg || + Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) && + isPicAndNotNxxAbi()) { + if (IsCpRestoreSet) { + // We need a NOP between the JALR and the LW: + // If .set reorder has been used, we've already emitted a NOP. 
+ // If .set noreorder has been used, we need to emit a NOP at this point. + if (!AssemblerOptions.back()->isReorder()) + createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions); -bool MipsAsmParser::needsExpansion(MCInst &Inst) { + // Load the $gp from the stack. + SmallVector<MCInst, 3> LoadInsts; + createCpRestoreMemOp(true /*IsLoad*/, CpRestoreOffset /*StackOffset*/, + IDLoc, LoadInsts); - switch (Inst.getOpcode()) { - case Mips::LoadImm32: - case Mips::LoadImm64: - case Mips::LoadAddrImm32: - case Mips::LoadAddrReg32: - case Mips::B_MM_Pseudo: - case Mips::LWM_MM: - case Mips::SWM_MM: - case Mips::JalOneReg: - case Mips::JalTwoReg: - case Mips::BneImm: - case Mips::BeqImm: - case Mips::BLT: - case Mips::BLE: - case Mips::BGE: - case Mips::BGT: - case Mips::BLTU: - case Mips::BLEU: - case Mips::BGEU: - case Mips::BGTU: - case Mips::Ulhu: - case Mips::Ulw: - return true; - default: - return false; + for (const MCInst &Inst : LoadInsts) + Instructions.push_back(Inst); + + } else + Warning(IDLoc, "no .cprestore used in PIC mode"); } + + return false; } -bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { +MipsAsmParser::MacroExpanderResultTy +MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { switch (Inst.getOpcode()) { - default: llvm_unreachable("unimplemented expansion"); + default: + return MER_NotAMacro; case Mips::LoadImm32: - return expandLoadImm(Inst, true, IDLoc, Instructions); + return expandLoadImm(Inst, true, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::LoadImm64: - return expandLoadImm(Inst, false, IDLoc, Instructions); + return expandLoadImm(Inst, false, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::LoadAddrImm32: - return expandLoadAddressImm(Inst, true, IDLoc, Instructions); + case Mips::LoadAddrImm64: + assert(Inst.getOperand(0).isReg() && "expected register operand kind"); + assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) && + "expected immediate operand kind"); + + return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister, + Inst.getOperand(1), + Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc, + Instructions) + ? MER_Fail + : MER_Success; case Mips::LoadAddrReg32: - return expandLoadAddressReg(Inst, true, IDLoc, Instructions); + case Mips::LoadAddrReg64: + assert(Inst.getOperand(0).isReg() && "expected register operand kind"); + assert(Inst.getOperand(1).isReg() && "expected register operand kind"); + assert((Inst.getOperand(2).isImm() || Inst.getOperand(2).isExpr()) && + "expected immediate operand kind"); + + return expandLoadAddress(Inst.getOperand(0).getReg(), + Inst.getOperand(1).getReg(), Inst.getOperand(2), + Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc, + Instructions) + ? MER_Fail + : MER_Success; case Mips::B_MM_Pseudo: - return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions); + case Mips::B_MMR6_Pseudo: + return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::SWM_MM: case Mips::LWM_MM: - return expandLoadStoreMultiple(Inst, IDLoc, Instructions); + return expandLoadStoreMultiple(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::JalOneReg: case Mips::JalTwoReg: - return expandJalWithRegs(Inst, IDLoc, Instructions); + return expandJalWithRegs(Inst, IDLoc, Instructions) ? 
MER_Fail + : MER_Success; case Mips::BneImm: case Mips::BeqImm: - return expandBranchImm(Inst, IDLoc, Instructions); + return expandBranchImm(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::BLT: case Mips::BLE: case Mips::BGE: @@ -1718,78 +1999,97 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, case Mips::BLEU: case Mips::BGEU: case Mips::BGTU: - return expandCondBranches(Inst, IDLoc, Instructions); + case Mips::BLTL: + case Mips::BLEL: + case Mips::BGEL: + case Mips::BGTL: + case Mips::BLTUL: + case Mips::BLEUL: + case Mips::BGEUL: + case Mips::BGTUL: + case Mips::BLTImmMacro: + case Mips::BLEImmMacro: + case Mips::BGEImmMacro: + case Mips::BGTImmMacro: + case Mips::BLTUImmMacro: + case Mips::BLEUImmMacro: + case Mips::BGEUImmMacro: + case Mips::BGTUImmMacro: + case Mips::BLTLImmMacro: + case Mips::BLELImmMacro: + case Mips::BGELImmMacro: + case Mips::BGTLImmMacro: + case Mips::BLTULImmMacro: + case Mips::BLEULImmMacro: + case Mips::BGEULImmMacro: + case Mips::BGTULImmMacro: + return expandCondBranches(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::SDivMacro: + return expandDiv(Inst, IDLoc, Instructions, false, true) ? MER_Fail + : MER_Success; + case Mips::DSDivMacro: + return expandDiv(Inst, IDLoc, Instructions, true, true) ? MER_Fail + : MER_Success; + case Mips::UDivMacro: + return expandDiv(Inst, IDLoc, Instructions, false, false) ? MER_Fail + : MER_Success; + case Mips::DUDivMacro: + return expandDiv(Inst, IDLoc, Instructions, true, false) ? MER_Fail + : MER_Success; + case Mips::Ulh: + return expandUlh(Inst, true, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::Ulhu: - return expandUlhu(Inst, IDLoc, Instructions); + return expandUlh(Inst, false, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::Ulw: - return expandUlw(Inst, IDLoc, Instructions); + return expandUlw(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success; + case Mips::NORImm: + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::ADDi: + case Mips::ADDiu: + case Mips::SLTi: + case Mips::SLTiu: + if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { + int64_t ImmValue = Inst.getOperand(2).getImm(); + if (isInt<16>(ImmValue)) + return MER_NotAMacro; + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + } + return MER_NotAMacro; + case Mips::ANDi: + case Mips::ORi: + case Mips::XORi: + if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { + int64_t ImmValue = Inst.getOperand(2).getImm(); + if (isUInt<16>(ImmValue)) + return MER_NotAMacro; + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + } + return MER_NotAMacro; + case Mips::ROL: + case Mips::ROR: + return expandRotation(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::ROLImm: + case Mips::RORImm: + return expandRotationImm(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::DROL: + case Mips::DROR: + return expandDRotation(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::DROLImm: + case Mips::DRORImm: + return expandDRotationImm(Inst, IDLoc, Instructions) ? 
MER_Fail + : MER_Success; } } -namespace { -void emitRX(unsigned Opcode, unsigned DstReg, MCOperand Imm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - tmpInst.setOpcode(Opcode); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(Imm); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); -} - -void emitRI(unsigned Opcode, unsigned DstReg, int16_t Imm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - emitRX(Opcode, DstReg, MCOperand::createImm(Imm), IDLoc, Instructions); -} - - -void emitRRX(unsigned Opcode, unsigned DstReg, unsigned SrcReg, MCOperand Imm, - SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - tmpInst.setOpcode(Opcode); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(MCOperand::createReg(SrcReg)); - tmpInst.addOperand(Imm); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); -} - -void emitRRR(unsigned Opcode, unsigned DstReg, unsigned SrcReg, - unsigned SrcReg2, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - emitRRX(Opcode, DstReg, SrcReg, MCOperand::createReg(SrcReg2), IDLoc, - Instructions); -} - -void emitRRI(unsigned Opcode, unsigned DstReg, unsigned SrcReg, int16_t Imm, - SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - emitRRX(Opcode, DstReg, SrcReg, MCOperand::createImm(Imm), IDLoc, - Instructions); -} - -template <int16_t ShiftAmount> -void createLShiftOri(MCOperand Operand, unsigned RegNo, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - if (ShiftAmount >= 32) - emitRRI(Mips::DSLL32, RegNo, RegNo, ShiftAmount - 32, IDLoc, Instructions); - else if (ShiftAmount > 0) - emitRRI(Mips::DSLL, RegNo, RegNo, ShiftAmount, IDLoc, Instructions); - - // There's no need for an ORi if the immediate is 0. - if (Operand.isImm() && Operand.getImm() == 0) - return; - - emitRRX(Mips::ORi, RegNo, RegNo, Operand, IDLoc, Instructions); -} - -template <unsigned ShiftAmount> -void createLShiftOri(int64_t Value, unsigned RegNo, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - createLShiftOri<ShiftAmount>(MCOperand::createImm(Value), RegNo, IDLoc, - Instructions); -} -} - bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { // Create a JALR instruction which is going to replace the pseudo-JAL. @@ -1800,8 +2100,11 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, if (Opcode == Mips::JalOneReg) { // jal $rs => jalr $rs - if (inMicroMipsMode()) { - JalrInst.setOpcode(Mips::JALR16_MM); + if (IsCpRestoreSet && inMicroMipsMode()) { + JalrInst.setOpcode(Mips::JALRS16_MM); + JalrInst.addOperand(FirstRegOp); + } else if (inMicroMipsMode()) { + JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM); JalrInst.addOperand(FirstRegOp); } else { JalrInst.setOpcode(Mips::JALR); @@ -1810,30 +2113,47 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, } } else if (Opcode == Mips::JalTwoReg) { // jal $rd, $rs => jalr $rd, $rs - JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); + if (IsCpRestoreSet && inMicroMipsMode()) + JalrInst.setOpcode(Mips::JALRS_MM); + else + JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); JalrInst.addOperand(FirstRegOp); const MCOperand SecondRegOp = Inst.getOperand(1); JalrInst.addOperand(SecondRegOp); } Instructions.push_back(JalrInst); - // If .set reorder is active, emit a NOP after it. 
- if (AssemblerOptions.back()->isReorder()) { - // This is a 32-bit NOP because these 2 pseudo-instructions - // do not have a short delay slot. - MCInst NopInst; - NopInst.setOpcode(Mips::SLL); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createImm(0)); - Instructions.push_back(NopInst); + // If .set reorder is active and branch instruction has a delay slot, + // emit a NOP after it. + const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode()); + if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) { + createNop(hasShortDelaySlot(JalrInst.getOpcode()), IDLoc, Instructions); } return false; } +/// Can the value be represented by a unsigned N-bit value and a shift left? +template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) { + unsigned BitNum = findFirstSet(x); + + return (x == x >> BitNum << BitNum) && isUInt<N>(x >> BitNum); +} + +/// Load (or add) an immediate into a register. +/// +/// @param ImmValue The immediate to load. +/// @param DstReg The register that will hold the immediate. +/// @param SrcReg A register to add to the immediate or Mips::NoRegister +/// for a simple initialization. +/// @param Is32BitImm Is ImmValue 32-bit or 64-bit? +/// @param IsAddress True if the immediate represents an address. False if it +/// is an integer. +/// @param IDLoc Location of the immediate in the source file. +/// @param Instructions The instructions emitted by this expansion. bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, - unsigned SrcReg, bool Is32BitImm, SMLoc IDLoc, + unsigned SrcReg, bool Is32BitImm, + bool IsAddress, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { if (!Is32BitImm && !isGP64bit()) { Error(IDLoc, "instruction requires a 64-bit architecture"); @@ -1852,6 +2172,9 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, } } + unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg(); + unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu; + bool UseSrcReg = false; if (SrcReg != Mips::NoRegister) UseSrcReg = true; @@ -1866,111 +2189,129 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, TmpReg = ATReg; } - // FIXME: gas has a special case for values that are 000...1111, which - // becomes a li -1 and then a dsrl if (isInt<16>(ImmValue)) { - // li d,j => addiu d,$zero,j if (!UseSrcReg) - SrcReg = Mips::ZERO; + SrcReg = ZeroReg; + + // This doesn't quite follow the usual ABI expectations for N32 but matches + // traditional assembler behaviour. N32 would normally use addiu for both + // integers and addresses. 
+ if (IsAddress && !Is32BitImm) { + emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); + return false; + } + emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); - } else if (isUInt<16>(ImmValue)) { - // li d,j => ori d,$zero,j + return false; + } + + if (isUInt<16>(ImmValue)) { unsigned TmpReg = DstReg; if (SrcReg == DstReg) { - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) + TmpReg = getATReg(IDLoc); + if (!TmpReg) return true; - TmpReg = ATReg; } - emitRRI(Mips::ORi, TmpReg, Mips::ZERO, ImmValue, IDLoc, Instructions); + emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, Instructions); if (UseSrcReg) - emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); - } else if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { + emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + + if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { warnIfNoMacro(IDLoc); - // For all other values which are representable as a 32-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,lo16(j) uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; if (!Is32BitImm && !isInt<32>(ImmValue)) { - // For DLI, expand to an ORi instead of a LUi to avoid sign-extending the + // Traditional behaviour seems to special case this particular value. It's + // not clear why other masks are handled differently. + if (ImmValue == 0xffffffff) { + emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, Instructions); + emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + + // Expand to an ORi instead of a LUi to avoid sign-extending into the // upper 32 bits. - emitRRI(Mips::ORi, TmpReg, Mips::ZERO, Bits31To16, IDLoc, Instructions); + emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, Instructions); emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions); - } else - emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); - createLShiftOri<0>(Bits15To0, TmpReg, IDLoc, Instructions); + if (Bits15To0) + emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); + if (Bits15To0) + emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); - - } else if ((ImmValue & (0xffffLL << 48)) == 0) { - warnIfNoMacro(IDLoc); + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } - // <------- lo32 ------> - // <------- hi32 ------> - // <- hi16 -> <- lo16 -> - // _________________________________ - // | | | | - // | 16-bits | 16-bits | 16-bits | - // |__________|__________|__________| - // - // For any 64-bit value that is representable as a 48-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,hi16(lo32(j)) - // dsll d,d,16 - // ori d,d,lo16(lo32(j)) - uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff; - uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; - uint16_t Bits15To0 = ImmValue & 0xffff; + if (isShiftedUIntAtAnyPosition<16>(ImmValue)) { + if (Is32BitImm) { + Error(IDLoc, "instruction requires a 32-bit immediate"); + return true; + } - emitRI(Mips::LUi, TmpReg, Bits47To32, IDLoc, Instructions); - createLShiftOri<0>(Bits31To16, TmpReg, IDLoc, Instructions); - createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); + // 
Traditionally, these immediates are shifted as little as possible and as + // such we align the most significant bit to bit 15 of our temporary. + unsigned FirstSet = findFirstSet((uint64_t)ImmValue); + unsigned LastSet = findLastSet((uint64_t)ImmValue); + unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet)); + uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff; + emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, Instructions); + emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); - } else { - warnIfNoMacro(IDLoc); + return false; + } - // <------- hi32 ------> <------- lo32 ------> - // <- hi16 -> <- lo16 -> - // ___________________________________________ - // | | | | | - // | 16-bits | 16-bits | 16-bits | 16-bits | - // |__________|__________|__________|__________| - // - // For all other values which are representable as a 64-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,lo16(hi32(j)) - // dsll d,d,16 - // ori d,d,hi16(lo32(j)) - // dsll d,d,16 - // ori d,d,lo16(lo32(j)) - uint16_t Bits63To48 = (ImmValue >> 48) & 0xffff; - uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff; - uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; - uint16_t Bits15To0 = ImmValue & 0xffff; + warnIfNoMacro(IDLoc); - emitRI(Mips::LUi, TmpReg, Bits63To48, IDLoc, Instructions); - createLShiftOri<0>(Bits47To32, TmpReg, IDLoc, Instructions); + // The remaining case is packed with a sequence of dsll and ori with zeros + // being omitted and any neighbouring dsll's being coalesced. + // The highest 32-bit's are equivalent to a 32-bit immediate load. - // When Bits31To16 is 0, do a left shift of 32 bits instead of doing - // two left shifts of 16 bits. - if (Bits31To16 == 0) { - createLShiftOri<32>(Bits15To0, TmpReg, IDLoc, Instructions); - } else { - createLShiftOri<16>(Bits31To16, TmpReg, IDLoc, Instructions); - createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); + // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register. + if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false, + IDLoc, Instructions)) + return false; + + // Shift and accumulate into the register. If a 16-bit chunk is zero, then + // skip it and defer the shift to the next chunk. + unsigned ShiftCarriedForwards = 16; + for (int BitNum = 16; BitNum >= 0; BitNum -= 16) { + uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff; + + if (ImmChunk != 0) { + emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, + Instructions); + emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, Instructions); + ShiftCarriedForwards = 0; } - if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); + ShiftCarriedForwards += 16; } + ShiftCarriedForwards -= 16; + + // Finish any remaining shifts left by trailing zeros. 
+ if (ShiftCarriedForwards) + emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, + Instructions); + + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; } @@ -1982,63 +2323,38 @@ bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, assert(DstRegOp.isReg() && "expected register operand kind"); if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister, - Is32BitImm, IDLoc, Instructions)) + Is32BitImm, false, IDLoc, Instructions)) return true; return false; } -bool -MipsAsmParser::expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - const MCOperand &DstRegOp = Inst.getOperand(0); - assert(DstRegOp.isReg() && "expected register operand kind"); - - const MCOperand &SrcRegOp = Inst.getOperand(1); - assert(SrcRegOp.isReg() && "expected register operand kind"); - - const MCOperand &ImmOp = Inst.getOperand(2); - assert((ImmOp.isImm() || ImmOp.isExpr()) && - "expected immediate operand kind"); - if (!ImmOp.isImm()) { - if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), - SrcRegOp.getReg(), Is32BitImm, IDLoc, - Instructions)) - return true; - - return false; - } - - if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), SrcRegOp.getReg(), - Is32BitImm, IDLoc, Instructions)) +bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg, + const MCOperand &Offset, + bool Is32BitAddress, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + // la can't produce a usable address when addresses are 64-bit. + if (Is32BitAddress && ABI.ArePtrs64bit()) { + // FIXME: Demote this to a warning and continue as if we had 'dla' instead. + // We currently can't do this because we depend on the equality + // operator and N64 can end up with a GPR32/GPR64 mismatch. + Error(IDLoc, "la used to load 64-bit address"); + // Continue as if we had 'dla' instead. + Is32BitAddress = false; + } + + // dla requires 64-bit addresses. 
+ if (!Is32BitAddress && !ABI.ArePtrs64bit()) { + Error(IDLoc, "instruction requires a 64-bit architecture"); return true; - - return false; -} - -bool -MipsAsmParser::expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - const MCOperand &DstRegOp = Inst.getOperand(0); - assert(DstRegOp.isReg() && "expected register operand kind"); - - const MCOperand &ImmOp = Inst.getOperand(1); - assert((ImmOp.isImm() || ImmOp.isExpr()) && - "expected immediate operand kind"); - if (!ImmOp.isImm()) { - if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), - Mips::NoRegister, Is32BitImm, IDLoc, - Instructions)) - return true; - - return false; } - if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister, - Is32BitImm, IDLoc, Instructions)) - return true; + if (!Offset.isImm()) + return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg, + Is32BitAddress, IDLoc, Instructions); - return false; + return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true, + IDLoc, Instructions); } bool MipsAsmParser::loadAndAddSymbolAddress( @@ -2046,67 +2362,102 @@ bool MipsAsmParser::loadAndAddSymbolAddress( SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { warnIfNoMacro(IDLoc); - if (Is32BitSym && isABI_N64()) - Warning(IDLoc, "instruction loads the 32-bit address of a 64-bit symbol"); - - MCInst tmpInst; - const MCSymbolRefExpr *Symbol = cast<MCSymbolRefExpr>(SymExpr); - const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_HI, getContext()); - const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_LO, getContext()); + const MCExpr *Symbol = cast<MCExpr>(SymExpr); + const MipsMCExpr *HiExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_ABS_HI, Symbol, getContext()); + const MipsMCExpr *LoExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_ABS_LO, Symbol, getContext()); bool UseSrcReg = SrcReg != Mips::NoRegister; + // This is the 64-bit symbol address expansion. + if (ABI.ArePtrs64bit() && isGP64bit()) { + // We always need AT for the 64-bit expansion. + // If it is not available we exit. 
+ unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + const MipsMCExpr *HighestExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_HIGHEST, Symbol, getContext()); + const MipsMCExpr *HigherExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_HIGHER, Symbol, getContext()); + + if (UseSrcReg && (DstReg == SrcReg)) { + // If $rs is the same as $rd: + // (d)la $rd, sym($rd) => lui $at, %highest(sym) + // daddiu $at, $at, %higher(sym) + // dsll $at, $at, 16 + // daddiu $at, $at, %hi(sym) + // dsll $at, $at, 16 + // daddiu $at, $at, %lo(sym) + // daddu $rd, $at, $rd + emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc, + Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HigherExpr), + IDLoc, Instructions); + emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr), IDLoc, + Instructions); + emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); + emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, Instructions); + + return false; + } + + // Otherwise, if the $rs is different from $rd or if $rs isn't specified: + // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym) + // lui $at, %hi(sym) + // daddiu $rd, $rd, %higher(sym) + // daddiu $at, $at, %lo(sym) + // dsll32 $rd, $rd, 0 + // daddu $rd, $rd, $at + // (daddu $rd, $rd, $rs) + emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc, + Instructions); + emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, + Instructions); + emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HigherExpr), + IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); + emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, Instructions); + emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, Instructions); + + return false; + } + + // And now, the 32-bit symbol address expansion: + // If $rs is the same as $rd: + // (d)la $rd, sym($rd) => lui $at, %hi(sym) + // ori $at, $at, %lo(sym) + // addu $rd, $at, $rd + // Otherwise, if the $rs is different from $rd or if $rs isn't specified: + // (d)la $rd, sym/sym($rs) => lui $rd, %hi(sym) + // ori $rd, $rd, %lo(sym) + // (addu $rd, $rd, $rs) unsigned TmpReg = DstReg; if (UseSrcReg && (DstReg == SrcReg)) { - // At this point we need AT to perform the expansions and we exit if it is - // not available. + // If $rs is the same as $rd, we need to use AT. + // If it is not available we exit. 
unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TmpReg = ATReg; } - if (!Is32BitSym) { - // If it's a 64-bit architecture, expand to: - // la d,sym => lui d,highest(sym) - // ori d,d,higher(sym) - // dsll d,d,16 - // ori d,d,hi16(sym) - // dsll d,d,16 - // ori d,d,lo16(sym) - const MCSymbolRefExpr *HighestExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHEST, getContext()); - const MCSymbolRefExpr *HigherExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHER, getContext()); - - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createExpr(HighestExpr)); - Instructions.push_back(tmpInst); - - createLShiftOri<0>(MCOperand::createExpr(HigherExpr), TmpReg, SMLoc(), - Instructions); - createLShiftOri<16>(MCOperand::createExpr(HiExpr), TmpReg, SMLoc(), - Instructions); - createLShiftOri<16>(MCOperand::createExpr(LoExpr), TmpReg, SMLoc(), - Instructions); - } else { - // Otherwise, expand to: - // la d,sym => lui d,hi16(sym) - // ori d,d,lo16(sym) - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createExpr(HiExpr)); - Instructions.push_back(tmpInst); - - emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), SMLoc(), - Instructions); - } + emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, Instructions); + emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitSym, Instructions); + emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + else + assert(DstReg == TmpReg); return false; } @@ -2125,12 +2476,13 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( Inst.addOperand(MCOperand::createExpr(Offset.getExpr())); } else { assert(Offset.isImm() && "expected immediate operand kind"); - if (isIntN(11, Offset.getImm())) { + if (isInt<11>(Offset.getImm())) { // If offset fits into 11 bits then this instruction becomes microMIPS // 16-bit unconditional branch instruction. - Inst.setOpcode(Mips::B16_MM); + if (inMicroMipsMode()) + Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM); } else { - if (!isIntN(17, Offset.getImm())) + if (!isInt<17>(Offset.getImm())) Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << 1)) Error(IDLoc, "branch to misaligned address"); @@ -2143,8 +2495,10 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( } Instructions.push_back(Inst); - // If .set reorder is active, emit a NOP after the branch instruction. - if (AssemblerOptions.back()->isReorder()) + // If .set reorder is active and branch instruction has a delay slot, + // emit a NOP after it. 
+ const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) createNop(true, IDLoc, Instructions); return false; @@ -2175,30 +2529,21 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, } int64_t ImmValue = ImmOp.getImm(); - if (ImmValue == 0) { - MCInst BranchInst; - BranchInst.setOpcode(OpCode); - BranchInst.addOperand(DstRegOp); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MemOffsetOp); - Instructions.push_back(BranchInst); - } else { + if (ImmValue == 0) + emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc, + Instructions); + else { warnIfNoMacro(IDLoc); unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; - if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), IDLoc, - Instructions)) + if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true, + IDLoc, Instructions)) return true; - MCInst BranchInst; - BranchInst.setOpcode(OpCode); - BranchInst.addOperand(DstRegOp); - BranchInst.addOperand(MCOperand::createReg(ATReg)); - BranchInst.addOperand(MemOffsetOp); - Instructions.push_back(BranchInst); + emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, Instructions); } return false; } @@ -2206,7 +2551,6 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd) { - MCInst TempInst; unsigned ImmOffset, HiOffset, LoOffset; const MCExpr *ExprOffset; unsigned TmpRegNum; @@ -2227,8 +2571,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, HiOffset++; } else ExprOffset = Inst.getOperand(2).getExpr(); - // All instructions will have the same location. - TempInst.setLoc(IDLoc); // These are some of the types of expansions we perform here: // 1) lw $8, sym => lui $8, %hi(sym) // lw $8, %lo(sym)($8) @@ -2267,40 +2609,20 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, return; } - TempInst.setOpcode(Mips::LUi); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - if (isImmOpnd) - TempInst.addOperand(MCOperand::createImm(HiOffset)); - else { - const MCExpr *HiExpr = evaluateRelocExpr(ExprOffset, "hi"); - TempInst.addOperand(MCOperand::createExpr(HiExpr)); - } - // Add the instruction to the list. - Instructions.push_back(TempInst); - // Prepare TempInst for next instruction. - TempInst.clear(); + emitRX(Mips::LUi, TmpRegNum, + isImmOpnd ? MCOperand::createImm(HiOffset) + : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "hi")), + IDLoc, Instructions); // Add temp register to base. - if (BaseRegNum != Mips::ZERO) { - TempInst.setOpcode(Mips::ADDu); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - TempInst.addOperand(MCOperand::createReg(BaseRegNum)); - Instructions.push_back(TempInst); - TempInst.clear(); - } + if (BaseRegNum != Mips::ZERO) + emitRRR(Mips::ADDu, TmpRegNum, TmpRegNum, BaseRegNum, IDLoc, Instructions); // And finally, create original instruction with low part // of offset and new base. 
- TempInst.setOpcode(Inst.getOpcode()); - TempInst.addOperand(MCOperand::createReg(RegOpNum)); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - if (isImmOpnd) - TempInst.addOperand(MCOperand::createImm(LoOffset)); - else { - const MCExpr *LoExpr = evaluateRelocExpr(ExprOffset, "lo"); - TempInst.addOperand(MCOperand::createExpr(LoExpr)); - } - Instructions.push_back(TempInst); - TempInst.clear(); + emitRRX(Inst.getOpcode(), RegOpNum, TmpRegNum, + isImmOpnd + ? MCOperand::createImm(LoOffset) + : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "lo")), + IDLoc, Instructions); } bool @@ -2316,10 +2638,16 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 && Inst.getOperand(OpNum - 1).getImm() >= 0 && - Inst.getOperand(OpNum - 2).getReg() == Mips::SP && - Inst.getOperand(OpNum - 3).getReg() == Mips::RA) + (Inst.getOperand(OpNum - 2).getReg() == Mips::SP || + Inst.getOperand(OpNum - 2).getReg() == Mips::SP_64) && + (Inst.getOperand(OpNum - 3).getReg() == Mips::RA || + Inst.getOperand(OpNum - 3).getReg() == Mips::RA_64)) { // It can be implemented as SWM16 or LWM16 instruction. - NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM; + if (inMicroMipsMode() && hasMips32r6()) + NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6; + else + NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM; + } Inst.setOpcode(NewOpcode); Instructions.push_back(Inst); @@ -2328,44 +2656,126 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + bool EmittedNoMacroWarning = false; unsigned PseudoOpcode = Inst.getOpcode(); unsigned SrcReg = Inst.getOperand(0).getReg(); - unsigned TrgReg = Inst.getOperand(1).getReg(); + const MCOperand &TrgOp = Inst.getOperand(1); const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr(); unsigned ZeroSrcOpcode, ZeroTrgOpcode; - bool ReverseOrderSLT, IsUnsigned, AcceptsEquality; + bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality; + + unsigned TrgReg; + if (TrgOp.isReg()) + TrgReg = TrgOp.getReg(); + else if (TrgOp.isImm()) { + warnIfNoMacro(IDLoc); + EmittedNoMacroWarning = true; + + TrgReg = getATReg(IDLoc); + if (!TrgReg) + return true; + + switch(PseudoOpcode) { + default: + llvm_unreachable("unknown opcode for branch pseudo-instruction"); + case Mips::BLTImmMacro: + PseudoOpcode = Mips::BLT; + break; + case Mips::BLEImmMacro: + PseudoOpcode = Mips::BLE; + break; + case Mips::BGEImmMacro: + PseudoOpcode = Mips::BGE; + break; + case Mips::BGTImmMacro: + PseudoOpcode = Mips::BGT; + break; + case Mips::BLTUImmMacro: + PseudoOpcode = Mips::BLTU; + break; + case Mips::BLEUImmMacro: + PseudoOpcode = Mips::BLEU; + break; + case Mips::BGEUImmMacro: + PseudoOpcode = Mips::BGEU; + break; + case Mips::BGTUImmMacro: + PseudoOpcode = Mips::BGTU; + break; + case Mips::BLTLImmMacro: + PseudoOpcode = Mips::BLTL; + break; + case Mips::BLELImmMacro: + PseudoOpcode = Mips::BLEL; + break; + case Mips::BGELImmMacro: + PseudoOpcode = Mips::BGEL; + break; + case Mips::BGTLImmMacro: + PseudoOpcode = Mips::BGTL; + break; + case Mips::BLTULImmMacro: + PseudoOpcode = Mips::BLTUL; + break; + case Mips::BLEULImmMacro: + PseudoOpcode = Mips::BLEUL; + break; + case Mips::BGEULImmMacro: + PseudoOpcode = Mips::BGEUL; + break; + case Mips::BGTULImmMacro: + PseudoOpcode = Mips::BGTUL; + break; + } + + if (loadImmediate(TrgOp.getImm(), TrgReg, 
Mips::NoRegister, !isGP64bit(), + false, IDLoc, Instructions)) + return true; + } switch (PseudoOpcode) { case Mips::BLT: case Mips::BLTU: + case Mips::BLTL: + case Mips::BLTUL: AcceptsEquality = false; ReverseOrderSLT = false; - IsUnsigned = (PseudoOpcode == Mips::BLTU); + IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL)); + IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL)); ZeroSrcOpcode = Mips::BGTZ; ZeroTrgOpcode = Mips::BLTZ; break; case Mips::BLE: case Mips::BLEU: + case Mips::BLEL: + case Mips::BLEUL: AcceptsEquality = true; ReverseOrderSLT = true; - IsUnsigned = (PseudoOpcode == Mips::BLEU); + IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL)); + IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL)); ZeroSrcOpcode = Mips::BGEZ; ZeroTrgOpcode = Mips::BLEZ; break; case Mips::BGE: case Mips::BGEU: + case Mips::BGEL: + case Mips::BGEUL: AcceptsEquality = true; ReverseOrderSLT = false; - IsUnsigned = (PseudoOpcode == Mips::BGEU); + IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL)); + IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL)); ZeroSrcOpcode = Mips::BLEZ; ZeroTrgOpcode = Mips::BGEZ; break; case Mips::BGT: case Mips::BGTU: + case Mips::BGTL: + case Mips::BGTUL: AcceptsEquality = false; ReverseOrderSLT = true; - IsUnsigned = (PseudoOpcode == Mips::BGTU); + IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL)); + IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL)); ZeroSrcOpcode = Mips::BLTZ; ZeroTrgOpcode = Mips::BGTZ; break; @@ -2373,7 +2783,6 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, llvm_unreachable("unknown opcode for branch pseudo-instruction"); } - MCInst BranchInst; bool IsTrgRegZero = (TrgReg == Mips::ZERO); bool IsSrcRegZero = (SrcReg == Mips::ZERO); if (IsSrcRegZero && IsTrgRegZero) { @@ -2381,51 +2790,37 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // with GAS' behaviour. However, they may not generate the most efficient // code in some circumstances. 
if (PseudoOpcode == Mips::BLT) { - BranchInst.setOpcode(Mips::BLTZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } if (PseudoOpcode == Mips::BLE) { - BranchInst.setOpcode(Mips::BLEZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGE) { - BranchInst.setOpcode(Mips::BGEZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGT) { - BranchInst.setOpcode(Mips::BGTZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } if (PseudoOpcode == Mips::BGTU) { - BranchInst.setOpcode(Mips::BNE); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); return false; } if (AcceptsEquality) { // If both registers are $0 and the pseudo-branch accepts equality, it // will always be taken, so we emit an unconditional branch. - BranchInst.setOpcode(Mips::BEQ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); Warning(IDLoc, "branch is always taken"); return false; } @@ -2449,11 +2844,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // the pseudo-branch will always be taken, so we emit an unconditional // branch. // This only applies to unsigned pseudo-branches. - BranchInst.setOpcode(Mips::BEQ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); Warning(IDLoc, "branch is always taken"); return false; } @@ -2470,21 +2862,17 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // // Because only BLEU and BGEU branch on equality, we can use the // AcceptsEquality variable to decide when to emit the BEQZ. - BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); - BranchInst.addOperand( - MCOperand::createReg(IsSrcRegZero ? 
TrgReg : SrcReg)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE, + IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); return false; } // If we have a signed pseudo-branch and one of the registers is $0, // we can use an appropriate compare-to-zero branch. We select which one // to use in the switch statement above. - BranchInst.setOpcode(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode); - BranchInst.addOperand(MCOperand::createReg(IsSrcRegZero ? TrgReg : SrcReg)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode, + IsSrcRegZero ? TrgReg : SrcReg, MCOperand::createExpr(OffsetExpr), + IDLoc, Instructions); return false; } @@ -2494,7 +2882,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, if (!ATRegNum) return true; - warnIfNoMacro(IDLoc); + if (!EmittedNoMacroWarning) + warnIfNoMacro(IDLoc); // SLT fits well with 2 of our 4 pseudo-branches: // BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and @@ -2511,23 +2900,135 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // // The same applies to the unsigned variants, except that SLTu is used // instead of SLT. - MCInst SetInst; - SetInst.setOpcode(IsUnsigned ? Mips::SLTu : Mips::SLT); - SetInst.addOperand(MCOperand::createReg(ATRegNum)); - SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? TrgReg : SrcReg)); - SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? SrcReg : TrgReg)); - Instructions.push_back(SetInst); - - BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); - BranchInst.addOperand(MCOperand::createReg(ATRegNum)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum, + ReverseOrderSLT ? TrgReg : SrcReg, ReverseOrderSLT ? SrcReg : TrgReg, + IDLoc, Instructions); + + emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL) + : (AcceptsEquality ? Mips::BEQ : Mips::BNE), + ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } -bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { +bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions, + const bool IsMips64, const bool Signed) { + if (hasMips32r6()) { + Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); + return false; + } + + warnIfNoMacro(IDLoc); + + const MCOperand &RsRegOp = Inst.getOperand(0); + assert(RsRegOp.isReg() && "expected register operand kind"); + unsigned RsReg = RsRegOp.getReg(); + + const MCOperand &RtRegOp = Inst.getOperand(1); + assert(RtRegOp.isReg() && "expected register operand kind"); + unsigned RtReg = RtRegOp.getReg(); + unsigned DivOp; + unsigned ZeroReg; + + if (IsMips64) { + DivOp = Signed ? Mips::DSDIV : Mips::DUDIV; + ZeroReg = Mips::ZERO_64; + } else { + DivOp = Signed ? 
Mips::SDIV : Mips::UDIV; + ZeroReg = Mips::ZERO; + } + + bool UseTraps = useTraps(); + + if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) { + if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) + Warning(IDLoc, "dividing zero by zero"); + if (IsMips64) { + if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) { + if (UseTraps) { + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + return false; + } + + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + return false; + } + } else { + emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions); + return false; + } + } + + if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) { + Warning(IDLoc, "division by zero"); + if (Signed) { + if (UseTraps) { + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + return false; + } + + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + return false; + } + } + + // FIXME: The values for these two BranchTarget variables may be different in + // micromips. These magic numbers need to be removed. + unsigned BranchTargetNoTraps; + unsigned BranchTarget; + + if (UseTraps) { + BranchTarget = IsMips64 ? 12 : 8; + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + } else { + BranchTarget = IsMips64 ? 20 : 16; + BranchTargetNoTraps = 8; + // Branch to the li instruction. + emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, + Instructions); + } + + emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions); + + if (!UseTraps) + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + + if (!Signed) { + emitR(Mips::MFLO, RsReg, IDLoc, Instructions); + return false; + } + + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, Instructions); + if (IsMips64) { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions); + emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, Instructions); + emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, Instructions); + } else { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions); + emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, Instructions); + } + + if (UseTraps) + emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, Instructions); + else { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, Instructions); + emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, Instructions); + emitII(Mips::BREAK, 0x6, 0, IDLoc, Instructions); + } + emitR(Mips::MFLO, RsReg, IDLoc, Instructions); + return false; +} + +bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { if (hasMips32r6() || hasMips64r6()) { Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); return false; @@ -2562,7 +3063,7 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, LoadedOffsetInAT = true; if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(), - IDLoc, Instructions)) + true, IDLoc, Instructions)) return true; // NOTE: We do this (D)ADDu here instead of doing it in loadImmediate() @@ -2590,33 +3091,15 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, unsigned SllReg = LoadedOffsetInAT ? 
DstReg : ATReg; - MCInst TmpInst; - TmpInst.setOpcode(Mips::LBu); - TmpInst.addOperand(MCOperand::createReg(FirstLbuDstReg)); - TmpInst.addOperand(MCOperand::createReg(LbuSrcReg)); - TmpInst.addOperand(MCOperand::createImm(FirstLbuOffset)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::LBu); - TmpInst.addOperand(MCOperand::createReg(SecondLbuDstReg)); - TmpInst.addOperand(MCOperand::createReg(LbuSrcReg)); - TmpInst.addOperand(MCOperand::createImm(SecondLbuOffset)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::SLL); - TmpInst.addOperand(MCOperand::createReg(SllReg)); - TmpInst.addOperand(MCOperand::createReg(SllReg)); - TmpInst.addOperand(MCOperand::createImm(8)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::OR); - TmpInst.addOperand(MCOperand::createReg(DstReg)); - TmpInst.addOperand(MCOperand::createReg(DstReg)); - TmpInst.addOperand(MCOperand::createReg(ATReg)); - Instructions.push_back(TmpInst); + emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg, + FirstLbuOffset, IDLoc, Instructions); + + emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondLbuOffset, IDLoc, + Instructions); + + emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, Instructions); + + emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, Instructions); return false; } @@ -2654,7 +3137,7 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc, warnIfNoMacro(IDLoc); if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(), - IDLoc, Instructions)) + true, IDLoc, Instructions)) return true; // NOTE: We do this (D)ADDu here instead of doing it in loadImmediate() @@ -2677,37 +3160,373 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc, RightLoadOffset = LoadedOffsetInAT ? 
3 : (OffsetValue + 3); } - MCInst LeftLoadInst; - LeftLoadInst.setOpcode(Mips::LWL); - LeftLoadInst.addOperand(DstRegOp); - LeftLoadInst.addOperand(MCOperand::createReg(FinalSrcReg)); - LeftLoadInst.addOperand(MCOperand::createImm(LeftLoadOffset)); - Instructions.push_back(LeftLoadInst); + emitRRI(Mips::LWL, DstRegOp.getReg(), FinalSrcReg, LeftLoadOffset, IDLoc, + Instructions); - MCInst RightLoadInst; - RightLoadInst.setOpcode(Mips::LWR); - RightLoadInst.addOperand(DstRegOp); - RightLoadInst.addOperand(MCOperand::createReg(FinalSrcReg)); - RightLoadInst.addOperand(MCOperand::createImm(RightLoadOffset )); - Instructions.push_back(RightLoadInst); + emitRRI(Mips::LWR, DstRegOp.getReg(), FinalSrcReg, RightLoadOffset, IDLoc, + Instructions); return false; } +bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + assert (Inst.getNumOperands() == 3 && "Invalid operand count"); + assert (Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned ATReg = Mips::NoRegister; + unsigned FinalDstReg = Mips::NoRegister; + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue); + + unsigned FinalOpcode = Inst.getOpcode(); + + if (DstReg == SrcReg) { + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + FinalDstReg = DstReg; + DstReg = ATReg; + } + + if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Instructions)) { + switch (FinalOpcode) { + default: + llvm_unreachable("unimplemented expansion"); + case (Mips::ADDi): + FinalOpcode = Mips::ADD; + break; + case (Mips::ADDiu): + FinalOpcode = Mips::ADDu; + break; + case (Mips::ANDi): + FinalOpcode = Mips::AND; + break; + case (Mips::NORImm): + FinalOpcode = Mips::NOR; + break; + case (Mips::ORi): + FinalOpcode = Mips::OR; + break; + case (Mips::SLTi): + FinalOpcode = Mips::SLT; + break; + case (Mips::SLTiu): + FinalOpcode = Mips::SLTu; + break; + case (Mips::XORi): + FinalOpcode = Mips::XOR; + break; + } + + if (FinalDstReg == Mips::NoRegister) + emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, Instructions); + else + emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, + Instructions); + return false; + } + return true; +} + +bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + unsigned TReg = Inst.getOperand(2).getReg(); + unsigned TmpReg = DReg; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips32r2()) { + + if (DReg == SReg) { + TmpReg = getATReg(Inst.getLoc()); + if (!TmpReg) + return true; + } + + if (Inst.getOpcode() == Mips::ROL) { + emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::ROR) { + emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips32()) { + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::ROL: + FirstShift = Mips::SRLV; + SecondShift = Mips::SLLV; + break; + case Mips::ROR: + FirstShift = 
Mips::SLLV; + SecondShift = Mips::SRLV; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions); + emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + +bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips32r2()) { + + if (Inst.getOpcode() == Mips::ROLImm) { + uint64_t MaxShift = 32; + uint64_t ShiftValue = ImmValue; + if (ImmValue != 0) + ShiftValue = MaxShift - ImmValue; + emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::RORImm) { + emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips32()) { + + if (ImmValue == 0) { + emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), Instructions); + return false; + } + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::ROLImm: + FirstShift = Mips::SLL; + SecondShift = Mips::SRL; + break; + case Mips::RORImm: + FirstShift = Mips::SRL; + SecondShift = Mips::SLL; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), Instructions); + emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + +bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + unsigned TReg = Inst.getOperand(2).getReg(); + unsigned TmpReg = DReg; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips64r2()) { + + if (TmpReg == SReg) { + TmpReg = getATReg(Inst.getLoc()); + if (!TmpReg) + return true; + } + + if (Inst.getOpcode() == Mips::DROL) { + emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::DROR) { + emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips64()) { + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::DROL: + FirstShift = Mips::DSRLV; + SecondShift = Mips::DSLLV; + break; + case Mips::DROR: + FirstShift = Mips::DSLLV; + SecondShift = Mips::DSRLV; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions); + emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + 
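+  // Worked example for the plain-MIPS64 path above (register choices are
+  // illustrative, not taken from the patch): "drol $4, $5, $6" expands to
+  //   dsubu $at, $zero, $6   // negate the rotate amount; DSRLV sees 64 - n
+  //   dsrlv $at, $5, $at     // bits rotated in from the top
+  //   dsllv $4, $5, $6       // bits shifted up by n
+  //   or    $4, $4, $at      // combine the two halves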
return true; +} + +bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm() % 64; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + MCInst TmpInst; + + if (hasMips64r2()) { + + unsigned FinalOpcode = Mips::NOP; + if (ImmValue == 0) + FinalOpcode = Mips::DROTR; + else if (ImmValue % 32 == 0) + FinalOpcode = Mips::DROTR32; + else if ((ImmValue >= 1) && (ImmValue <= 32)) { + if (Inst.getOpcode() == Mips::DROLImm) + FinalOpcode = Mips::DROTR32; + else + FinalOpcode = Mips::DROTR; + } else if (ImmValue >= 33) { + if (Inst.getOpcode() == Mips::DROLImm) + FinalOpcode = Mips::DROTR; + else + FinalOpcode = Mips::DROTR32; + } + + uint64_t ShiftValue = ImmValue % 32; + if (Inst.getOpcode() == Mips::DROLImm) + ShiftValue = (32 - ImmValue % 32) % 32; + + emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions); + + return false; + } + + if (hasMips64()) { + + if (ImmValue == 0) { + emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), Instructions); + return false; + } + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::DROLImm: + if ((ImmValue >= 1) && (ImmValue <= 31)) { + FirstShift = Mips::DSLL; + SecondShift = Mips::DSRL32; + } + if (ImmValue == 32) { + FirstShift = Mips::DSLL32; + SecondShift = Mips::DSRL32; + } + if ((ImmValue >= 33) && (ImmValue <= 63)) { + FirstShift = Mips::DSLL32; + SecondShift = Mips::DSRL; + } + break; + case Mips::DRORImm: + if ((ImmValue >= 1) && (ImmValue <= 31)) { + FirstShift = Mips::DSRL; + SecondShift = Mips::DSLL32; + } + if (ImmValue == 32) { + FirstShift = Mips::DSRL32; + SecondShift = Mips::DSLL32; + } + if ((ImmValue >= 33) && (ImmValue <= 63)) { + FirstShift = Mips::DSRL32; + SecondShift = Mips::DSLL; + } + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), Instructions); + emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst NopInst; - if (hasShortDelaySlot) { - NopInst.setOpcode(Mips::MOVE16_MM); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - } else { - NopInst.setOpcode(Mips::SLL); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createImm(0)); - } - Instructions.push_back(NopInst); + if (hasShortDelaySlot) + emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, Instructions); + else + emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, Instructions); } void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, @@ -2717,6 +3536,24 @@ void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, Instructions); } +void MipsAsmParser::createCpRestoreMemOp( + bool IsLoad, int StackOffset, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + // If the offset can not fit into 16 bits, we need to expand. + if (!isInt<16>(StackOffset)) { + MCInst MemInst; + MemInst.setOpcode(IsLoad ? 
Mips::LW : Mips::SW); + MemInst.addOperand(MCOperand::createReg(Mips::GP)); + MemInst.addOperand(MCOperand::createReg(Mips::SP)); + MemInst.addOperand(MCOperand::createImm(StackOffset)); + expandMemInst(MemInst, IDLoc, Instructions, IsLoad, true /*HasImmOpnd*/); + return; + } + + emitRRI(IsLoad ? Mips::LW : Mips::SW, Mips::GP, Mips::SP, StackOffset, IDLoc, + Instructions); +} + unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { // As described by the Mips32r2 spec, the registers Rd and Rs for // jalr.hb must be different. @@ -2729,6 +3566,17 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } +static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands, + uint64_t ErrorInfo) { + if (ErrorInfo != ~0ULL && ErrorInfo < Operands.size()) { + SMLoc ErrorLoc = Operands[ErrorInfo]->getStartLoc(); + if (ErrorLoc == SMLoc()) + return Loc; + return ErrorLoc; + } + return Loc; +} + bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2745,7 +3593,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (processInstruction(Inst, IDLoc, Instructions)) return true; for (unsigned i = 0; i < Instructions.size(); i++) - Out.EmitInstruction(Instructions[i], STI); + Out.EmitInstruction(Instructions[i], getSTI()); return false; } case Match_MissingFeature: @@ -2757,7 +3605,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc(); + ErrorLoc = Operands[ErrorInfo]->getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -2768,6 +3616,58 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "invalid instruction"); case Match_RequiresDifferentSrcAndDst: return Error(IDLoc, "source and destination must be different"); + case Match_Immz: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'"); + case Match_UImm1_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 1-bit unsigned immediate"); + case Match_UImm2_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 2-bit unsigned immediate"); + case Match_UImm2_1: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 1 .. 4"); + case Match_UImm3_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 3-bit unsigned immediate"); + case Match_UImm4_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 4-bit unsigned immediate"); + case Match_UImm5_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 5-bit unsigned immediate"); + case Match_UImm5_1: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 1 .. 32"); + case Match_UImm5_32: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 32 .. 63"); + case Match_UImm5_33: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 33 .. 64"); + case Match_UImm5_0_Report_UImm6: + // This is used on UImm5 operands that have a corresponding UImm5_32 + // operand to avoid confusing the user. 
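+    // (For instance, when a 0..31 operand and its 32..63 twin both exist,
+    //  the combined range 0..63 is what "6-bit unsigned immediate" conveys.)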
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); + case Match_UImm5_Lsl2: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected both 7-bit unsigned immediate and multiple of 4"); + case Match_UImm6_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); + case Match_SImm6: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit signed immediate"); + case Match_UImm7_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 7-bit unsigned immediate"); + case Match_UImm8_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 8-bit unsigned immediate"); + case Match_UImm10_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 10-bit unsigned immediate"); } llvm_unreachable("Implement any new match types added!"); @@ -3264,7 +4164,7 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); // Get the next token. if (Tok.isNot(AsmToken::LParen)) { MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]); - if (Mnemonic.getToken() == "la") { + if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") { SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this)); @@ -3598,12 +4498,15 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) { if (RegRange) { // Remove last register operand because registers from register range // should be inserted first. - if (RegNo == Mips::RA) { + if ((isGP64bit() && RegNo == Mips::RA_64) || + (!isGP64bit() && RegNo == Mips::RA)) { Regs.push_back(RegNo); } else { unsigned TmpReg = PrevReg + 1; while (TmpReg <= RegNo) { - if ((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) { + if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) || + (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) && + isGP64bit())) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } @@ -3615,16 +4518,23 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) { RegRange = false; } else { - if ((PrevReg == Mips::NoRegister) && (RegNo != Mips::S0) && - (RegNo != Mips::RA)) { + if ((PrevReg == Mips::NoRegister) && + ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) || + (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) { Error(E, "$16 or $31 expected"); return MatchOperand_ParseFail; - } else if (((RegNo < Mips::S0) || (RegNo > Mips::S7)) && - (RegNo != Mips::FP) && (RegNo != Mips::RA)) { + } else if (!(((RegNo == Mips::FP || RegNo == Mips::RA || + (RegNo >= Mips::S0 && RegNo <= Mips::S7)) && + !isGP64bit()) || + ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 || + (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) && + isGP64bit()))) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) && - (RegNo != Mips::FP) && (RegNo != Mips::RA)) { + ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) || + (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 && + isGP64bit()))) { Error(E, "consecutive register numbers expected"); return MatchOperand_ParseFail; } @@ -4152,6 +5062,7 @@ bool MipsAsmParser::parseSetPopDirective() { if (AssemblerOptions.size() == 2) return reportParseError(Loc, ".set pop with no .set push"); + MCSubtargetInfo &STI = copySTI(); AssemblerOptions.pop_back(); setAvailableFeatures( 
ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures())); @@ -4225,6 +5136,7 @@ bool MipsAsmParser::parseSetMips0Directive() { return reportParseError("unexpected token, expected end of statement"); // Reset assembler options to their initial values. + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures())); STI.setFeatureBits(AssemblerOptions.front()->getFeatures()); @@ -4366,6 +5278,14 @@ bool MipsAsmParser::eatComma(StringRef ErrorStr) { return true; } +// Used to determine if .cpload, .cprestore, and .cpsetup have any effect. +// In this class, it is only used for .cprestore. +// FIXME: Only keep track of IsPicEnabled in one place, instead of in both +// MipsTargetELFStreamer and MipsAsmParser. +bool MipsAsmParser::isPicAndNotNxxAbi() { + return inPicMode() && !(isABI_N32() || isABI_N64()); +} + bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { if (AssemblerOptions.back()->isReorder()) Warning(Loc, ".cpload should be inside a noreorder section"); @@ -4398,6 +5318,54 @@ bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { return false; } +bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) { + MCAsmParser &Parser = getParser(); + + // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it + // is used in non-PIC mode. + + if (inMips16Mode()) { + reportParseError(".cprestore is not supported in Mips16 mode"); + return false; + } + + // Get the stack offset value. + const MCExpr *StackOffset; + int64_t StackOffsetVal; + if (Parser.parseExpression(StackOffset)) { + reportParseError("expected stack offset value"); + return false; + } + + if (!StackOffset->evaluateAsAbsolute(StackOffsetVal)) { + reportParseError("stack offset is not an absolute expression"); + return false; + } + + if (StackOffsetVal < 0) { + Warning(Loc, ".cprestore with negative stack offset has no effect"); + IsCpRestoreSet = false; + } else { + IsCpRestoreSet = true; + CpRestoreOffset = StackOffsetVal; + } + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + // Store the $gp on the stack. + SmallVector<MCInst, 3> StoreInsts; + createCpRestoreMemOp(false /*IsLoad*/, CpRestoreOffset /*StackOffset*/, Loc, + StoreInsts); + + getTargetStreamer().emitDirectiveCpRestore(StoreInsts, CpRestoreOffset); + Parser.Lex(); // Consume the EndOfStatement. 
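+  // A hypothetical use, assuming O32 PIC code (not taken from the patch):
+  //   .cprestore 16   // emits "sw $gp, 16($sp)" at this point; the
+  //                   // directive's contract is that $gp is also reloaded
+  //                   // from that slot after PIC calls, which is handled
+  //                   // during instruction expansion elsewhere in the parser.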
+ return false; +} + bool MipsAsmParser::parseDirectiveCPSetup() { MCAsmParser &Parser = getParser(); unsigned FuncReg; @@ -4427,16 +5395,19 @@ bool MipsAsmParser::parseDirectiveCPSetup() { ResTy = parseAnyRegister(TmpReg); if (ResTy == MatchOperand_NoMatch) { - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Integer)) { - Save = Tok.getIntVal(); - SaveIsReg = false; - Parser.Lex(); - } else { - reportParseError("expected save register or stack offset"); + const MCExpr *OffsetExpr; + int64_t OffsetVal; + SMLoc ExprLoc = getLexer().getLoc(); + + if (Parser.parseExpression(OffsetExpr) || + !OffsetExpr->evaluateAsAbsolute(OffsetVal)) { + reportParseError(ExprLoc, "expected save register or stack offset"); Parser.eatToEndOfStatement(); return false; } + + Save = OffsetVal; + SaveIsReg = false; } else { MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]); if (!SaveOpnd.isGPRAsmReg()) { @@ -4462,11 +5433,20 @@ bool MipsAsmParser::parseDirectiveCPSetup() { } const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr); + CpSaveLocation = Save; + CpSaveLocationIsRegister = SaveIsReg; + getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(), SaveIsReg); return false; } +bool MipsAsmParser::parseDirectiveCPReturn() { + getTargetStreamer().emitDirectiveCpreturn(CpSaveLocation, + CpSaveLocationIsRegister); + return false; +} + bool MipsAsmParser::parseDirectiveNaN() { MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::EndOfStatement)) { @@ -4655,6 +5635,9 @@ bool MipsAsmParser::parseDirectiveOption() { StringRef Option = Tok.getIdentifier(); if (Option == "pic0") { + // MipsAsmParser needs to know if the current PIC mode changes. + IsPicEnabled = false; + getTargetStreamer().emitDirectiveOptionPic0(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { @@ -4666,6 +5649,9 @@ bool MipsAsmParser::parseDirectiveOption() { } if (Option == "pic2") { + // MipsAsmParser needs to know if the current PIC mode changes. 
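+      // (This is the PIC flag that isPicAndNotNxxAbi() reads when deciding
+      //  whether .cprestore applies; see the FIXME near that helper about
+      //  keeping the PIC state in one place.)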
+ IsPicEnabled = true; + getTargetStreamer().emitDirectiveOptionPic2(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { @@ -4924,6 +5910,8 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpload") return parseDirectiveCpLoad(DirectiveID.getLoc()); + if (IDVal == ".cprestore") + return parseDirectiveCpRestore(DirectiveID.getLoc()); if (IDVal == ".dword") { parseDataDirective(8, DirectiveID.getLoc()); return false; @@ -4974,6 +5962,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitDirectiveEnt(*Sym); CurrentFn = Sym; + IsCpRestoreSet = false; return false; } @@ -5002,6 +5991,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitDirectiveEnd(SymbolName); CurrentFn = nullptr; + IsCpRestoreSet = false; return false; } @@ -5073,6 +6063,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitFrame(StackReg, FrameSizeVal, ReturnRegOpnd.getGPR32Reg()); + IsCpRestoreSet = false; return false; } @@ -5173,6 +6164,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpsetup") return parseDirectiveCPSetup(); + if (IDVal == ".cpreturn") + return parseDirectiveCPReturn(); + if (IDVal == ".module") return parseDirectiveModule(); diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index a34ba3b..3c1a771 100644 --- a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, uint64_t Address, const void *Decoder); +// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is +// shifted left by 1 bit. +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + // DecodeJumpTargetMM - Decode microMIPS jump target, which is // shifted left by 1 bit. 
static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, @@ -241,17 +248,42 @@ static DecodeStatus DecodeMem(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemEVA(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLoadByte9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLoadByte15(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCacheOpR6(MCInst &Inst, +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCacheOpMM(MCInst &Inst, +static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -261,6 +293,11 @@ static DecodeStatus DecodeSyncI(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSynciR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -284,6 +321,11 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -330,6 +372,11 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, + unsigned Value, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeSimm4(MCInst &Inst, unsigned Value, uint64_t Address, @@ -340,23 +387,15 @@ static DecodeStatus DecodeSimm16(MCInst &Inst, uint64_t Address, const void *Decoder); -// Decode the immediate field of an LSA instruction which -// is off by one. -static DecodeStatus DecodeLSAImm(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +template <unsigned Bits, int Offset> +static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeExtSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); - static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -830,9 +869,24 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, if (IsMicroMips) { Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + if (hasMips32r6()) { + DEBUG(dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n"); + // Calling the auto-generated decoder function for microMIPS32R6 + // (and microMIPS64R6) 16-bit instructions. 
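+      // (If this R6 table rejects the halfword, control falls through to the
+      //  generic MicroMips16 table below, and after that to the 32-bit
+      //  tables; only when everything fails is Size set to the minimum
+      //  instruction width so the disassembler can resynchronize.)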
+ Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + return Result; + } + } DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n"); - // Calling the auto-generated decoder function. + // Calling the auto-generated decoder function for microMIPS 16-bit + // instructions. Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -847,24 +901,33 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, if (hasMips32r6()) { DEBUG(dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n"); // Calling the auto-generated decoder function. - Result = decodeInstruction(DecoderTableMicroMips32r632, Instr, Insn, Address, - this, STI); - } else { - DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); - // Calling the auto-generated decoder function. - Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, + Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } } + + DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); + // Calling the auto-generated decoder function. + Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, + this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } + // This is an invalid instruction. Let the disassembler move forward by the + // minimum instruction size. + Size = 2; return MCDisassembler::Fail; } Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false); - if (Result == MCDisassembler::Fail) + if (Result == MCDisassembler::Fail) { + Size = 4; return MCDisassembler::Fail; + } if (hasCOP3()) { DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n"); @@ -925,6 +988,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return Result; } + Size = 4; return MCDisassembler::Fail; } @@ -1079,10 +1143,66 @@ static DecodeStatus DecodeMem(MCInst &Inst, Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); Base = getReg(Decoder, Mips::GPR32RegClassID, Base); - if(Inst.getOpcode() == Mips::SC || - Inst.getOpcode() == Mips::SCD){ + if (Inst.getOpcode() == Mips::SC || + Inst.getOpcode() == Mips::SCD) Inst.addOperand(MCOperand::createReg(Reg)); - } + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMemEVA(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn >> 7); + unsigned Reg = fieldFromInstruction(Insn, 16, 5); + unsigned Base = fieldFromInstruction(Insn, 21, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if (Inst.getOpcode() == Mips::SCE) + Inst.addOperand(MCOperand::createReg(Reg)); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeLoadByte9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, 
Mips::GPR32RegClassID, Base); + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeLoadByte15(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<16>(Insn & 0xffff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1125,11 +1245,28 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCacheOpR6(MCInst &Inst, +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { - int Offset = fieldFromInstruction(Insn, 7, 9); + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Hint = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + Inst.addOperand(MCOperand::createImm(Hint)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn >> 7); unsigned Hint = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1142,6 +1279,24 @@ static DecodeStatus DecodeCacheOpR6(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1157,6 +1312,21 @@ static DecodeStatus DecodeSyncI(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeSynciR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Immediate = SignExtend32<16>(Insn & 0xffff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Immediate)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10)); @@ -1220,8 +1390,11 @@ static DecodeStatus DecodeMemMMImm4(MCInst &Inst, return MCDisassembler::Fail; break; case Mips::SB16_MM: + case Mips::SB16_MMR6: case Mips::SH16_MM: + case Mips::SH16_MMR6: case Mips::SW16_MM: + case Mips::SW16_MMR6: if (DecodeGPRMM16ZeroRegisterClass(Inst, Reg, Address, Decoder) == MCDisassembler::Fail) return MCDisassembler::Fail; @@ -1240,14 +1413,17 @@ static DecodeStatus 
DecodeMemMMImm4(MCInst &Inst, Inst.addOperand(MCOperand::createImm(Offset)); break; case Mips::SB16_MM: + case Mips::SB16_MMR6: Inst.addOperand(MCOperand::createImm(Offset)); break; case Mips::LHU16_MM: case Mips::SH16_MM: + case Mips::SH16_MMR6: Inst.addOperand(MCOperand::createImm(Offset << 1)); break; case Mips::LW16_MM: case Mips::SW16_MM: + case Mips::SW16_MMR6: Inst.addOperand(MCOperand::createImm(Offset << 2)); break; } @@ -1291,7 +1467,16 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { - int Offset = SignExtend32<4>(Insn & 0xf); + int Offset; + switch (Inst.getOpcode()) { + case Mips::LWM16_MMR6: + case Mips::SWM16_MMR6: + Offset = fieldFromInstruction(Insn, 4, 4); + break; + default: + Offset = SignExtend32<4>(Insn & 0xf); + break; + } if (DecodeRegListOperand16(Inst, Insn, Address, Decoder) == MCDisassembler::Fail) @@ -1303,6 +1488,27 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if (Inst.getOpcode() == Mips::SCE_MM) + Inst.addOperand(MCOperand::createReg(Reg)); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1659,6 +1865,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<26>(Offset) << 1; + + Inst.addOperand(MCOperand::createImm(BranchOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1700,6 +1916,14 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, + unsigned Value, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm(Value == 0x0 ? 8 : Value)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeSimm4(MCInst &Inst, unsigned Value, uint64_t Address, @@ -1716,12 +1940,12 @@ static DecodeStatus DecodeSimm16(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeLSAImm(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { - // We add one to the immediate field as it was encoded as 'imm - 1'. 
- Inst.addOperand(MCOperand::createImm(Insn + 1)); +template <unsigned Bits, int Offset> +static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, + uint64_t Address, + const void *Decoder) { + Value &= ((1 << Bits) - 1); + Inst.addOperand(MCOperand::createImm(Value + Offset)); return MCDisassembler::Success; } @@ -1736,15 +1960,6 @@ static DecodeStatus DecodeInsSize(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeExtSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { - int Size = (int) Insn + 1; - Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size))); - return MCDisassembler::Success; -} - static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::createImm(SignExtend32<19>(Insn) * 4)); @@ -1792,15 +2007,21 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, uint64_t Address, const void *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, - Mips::S6, Mips::FP}; + Mips::S6, Mips::S7, Mips::FP}; unsigned RegNum; unsigned RegLst = fieldFromInstruction(Insn, 21, 5); + // Empty register lists are not allowed. if (RegLst == 0) return MCDisassembler::Fail; RegNum = RegLst & 0xf; + + // RegLst values 10-15, and 26-31 are reserved. + if (RegNum > 9) + return MCDisassembler::Fail; + for (unsigned i = 0; i < RegNum; i++) Inst.addOperand(MCOperand::createReg(Regs[i])); @@ -1814,7 +2035,16 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3}; - unsigned RegLst = fieldFromInstruction(Insn, 4, 2); + unsigned RegLst; + switch(Inst.getOpcode()) { + default: + RegLst = fieldFromInstruction(Insn, 4, 2); + break; + case Mips::LWM16_MMR6: + case Mips::SWM16_MMR6: + RegLst = fieldFromInstruction(Insn, 8, 2); + break; + } unsigned RegNum = RegLst & 0x3; for (unsigned i = 0; i <= RegNum; i++) diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index a5637b1..a7b7d2e 100644 --- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -235,7 +235,9 @@ printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) { case Mips::SWM32_MM: case Mips::LWM32_MM: case Mips::SWM16_MM: + case Mips::SWM16_MMR6: case Mips::LWM16_MM: + case Mips::LWM16_MMR6: opNum = MI->getNumOperands() - 2; break; } diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index 713f35c..0e61ea6 100644 --- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -73,8 +73,6 @@ enum CondCode { const char *MipsFCCToString(Mips::CondCode CC); } // end namespace Mips -class TargetMachine; - class MipsInstPrinter : public MCInstPrinter { public: MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index 8e6c9e6..cdcc392 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -23,7 +23,7 @@ static const MCPhysReg Mips64IntRegs[8] = { Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64}; } -const ArrayRef<MCPhysReg> 
MipsABIInfo::GetByValArgRegs() const { +ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const { if (IsO32()) return makeArrayRef(O32IntRegs); if (IsN32() || IsN64()) @@ -31,7 +31,7 @@ const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const { llvm_unreachable("Unhandled ABI"); } -const ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const { +ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const { if (IsO32()) return makeArrayRef(O32IntRegs); if (IsN32() || IsN64()) @@ -78,7 +78,6 @@ MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU, .Case("mips32r3", MipsABIInfo::O32()) .Case("mips32r5", MipsABIInfo::O32()) .Case("mips32r6", MipsABIInfo::O32()) - .Case("mips16", MipsABIInfo::O32()) .Case("mips3", MipsABIInfo::N64()) .Case("mips4", MipsABIInfo::N64()) .Case("mips5", MipsABIInfo::N64()) @@ -107,6 +106,10 @@ unsigned MipsABIInfo::GetNullPtr() const { return ArePtrs64bit() ? Mips::ZERO_64 : Mips::ZERO; } +unsigned MipsABIInfo::GetZeroReg() const { + return AreGprs64bit() ? Mips::ZERO_64 : Mips::ZERO; +} + unsigned MipsABIInfo::GetPtrAdduOp() const { return ArePtrs64bit() ? Mips::DADDu : Mips::ADDu; } @@ -115,6 +118,10 @@ unsigned MipsABIInfo::GetPtrAddiuOp() const { return ArePtrs64bit() ? Mips::DADDiu : Mips::ADDiu; } +unsigned MipsABIInfo::GetGPRMoveOp() const { + return ArePtrs64bit() ? Mips::OR64 : Mips::OR; +} + unsigned MipsABIInfo::GetEhDataReg(unsigned I) const { static const unsigned EhDataReg[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h index 40c5681..ffa2c76 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h @@ -47,10 +47,10 @@ public: ABI GetEnumValue() const { return ThisABI; } /// The registers to use for byval arguments. - const ArrayRef<MCPhysReg> GetByValArgRegs() const; + ArrayRef<MCPhysReg> GetByValArgRegs() const; /// The registers to use for the variable argument list. - const ArrayRef<MCPhysReg> GetVarArgRegs() const; + ArrayRef<MCPhysReg> GetVarArgRegs() const; /// Obtain the size of the area allocated by the callee for arguments. /// CallingConv::FastCall affects the value for O32. @@ -67,9 +67,12 @@ public: unsigned GetFramePtr() const; unsigned GetBasePtr() const; unsigned GetNullPtr() const; + unsigned GetZeroReg() const; unsigned GetPtrAdduOp() const; unsigned GetPtrAddiuOp() const; + unsigned GetGPRMoveOp() const; inline bool ArePtrs64bit() const { return IsN64(); } + inline bool AreGprs64bit() const { return IsN32() || IsN64(); } unsigned GetEhDataReg(unsigned I) const; }; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 328e717..e4865e2 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -63,15 +63,19 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // address range. Forcing a signed division because Value can be negative. Value = (int64_t)Value / 4; // We now check if Value can be encoded as a 16-bit signed immediate. 
- if (!isIntN(16, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup"); + if (!isInt<16>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC19_S2: // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 4; // We now check if Value can be encoded as a 19-bit signed immediate. - if (!isIntN(19, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC19 fixup"); + if (!isInt<19>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup"); + return 0; + } break; case Mips::fixup_Mips_26: // So far we are only using this type for jumps. @@ -104,45 +108,57 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 2; // We now check if Value can be encoded as a 7-bit signed immediate. - if (!isIntN(7, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC7 fixup"); + if (!isInt<7>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup"); + return 0; + } break; case Mips::fixup_MICROMIPS_PC10_S1: Value -= 2; // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 2; // We now check if Value can be encoded as a 10-bit signed immediate. - if (!isIntN(10, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC10 fixup"); + if (!isInt<10>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup"); + return 0; + } break; case Mips::fixup_MICROMIPS_PC16_S1: Value -= 4; // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 2; // We now check if Value can be encoded as a 16-bit signed immediate. - if (!isIntN(16, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup"); + if (!isInt<16>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC18_S3: // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 8; // We now check if Value can be encoded as a 18-bit signed immediate. - if (!isIntN(18, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC18 fixup"); + if (!isInt<18>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC21_S2: // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 4; // We now check if Value can be encoded as a 21-bit signed immediate. - if (!isIntN(21, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC21 fixup"); + if (!isInt<21>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC26_S2: // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 4; // We now check if Value can be encoded as a 26-bit signed immediate. 
- if (!isIntN(26, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup"); + if (!isInt<26>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup"); + return 0; + } break; } @@ -232,6 +248,18 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } +bool MipsAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const { + if (Name == "R_MIPS_NONE") { + MappedKind = (MCFixupKind)Mips::fixup_Mips_NONE; + return true; + } + if (Name == "R_MIPS_32") { + MappedKind = FK_Data_4; + return true; + } + return MCAsmBackend::getFixupKind(Name, MappedKind); +} + const MCFixupKindInfo &MipsAsmBackend:: getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = { @@ -239,6 +267,7 @@ getFixupKindInfo(MCFixupKind Kind) const { // MipsFixupKinds.h. // // name offset bits flags + { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 0, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, @@ -304,6 +333,7 @@ getFixupKindInfo(MCFixupKind Kind) const { // MipsFixupKinds.h. // // name offset bits flags + { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 16, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index b3d5a49..1c9af92 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -41,6 +41,7 @@ public: void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; + bool getFixupKind(StringRef Name, MCFixupKind &MappedKind) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; unsigned getNumFixupKinds() const override { diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 9b29527..5b9f02b 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -68,6 +68,8 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, unsigned Kind = (unsigned)Fixup.getKind(); switch (Kind) { + case Mips::fixup_Mips_NONE: + return ELF::R_MIPS_NONE; case Mips::fixup_Mips_16: case FK_Data_2: return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16; @@ -325,13 +327,24 @@ static void setMatch(MipsRelocationEntry &Hi, MipsRelocationEntry &Lo) { // matching LO; // - prefer LOs without a pair; // - prefer LOs with higher offset; + +static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) { + const ELFRelocationEntry &A = *AP; + const ELFRelocationEntry &B = *BP; + if (A.Offset != B.Offset) + return B.Offset - A.Offset; + if (B.Type != A.Type) + return A.Type - B.Type; + return 0; +} + void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) { if (Relocs.size() < 2) return; - // The default function sorts entries by Offset in descending order. - MCELFObjectTargetWriter::sortRelocs(Asm, Relocs); + // Sorts entries by Offset in descending order. + array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel); // Init MipsRelocs from Relocs. 
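+  // (For in-range offsets, cmpRel sorts descending by Offset and then
+  //  ascending by Type: e.g. entries at offsets {4, 8, 8} come out as
+  //  8, 8, 4, with the two equal-offset entries ordered by Type.)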
std::vector<MipsRelocationEntry> MipsRelocs; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index b45d9cf..e7d687e 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -63,7 +63,7 @@ void MipsELFStreamer::SwitchSection(MCSection *Section, } void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { MCELFStreamer::EmitValueImpl(Value, Size, Loc); Labels.clear(); } diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index af9311f..a241cde 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -60,8 +60,7 @@ public: /// Overriding this function allows us to dismiss all labels that are /// candidates for marking as microMIPS when .word directive is emitted. - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override; + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override; /// Emits all the option records stored up until the point it's called. void EmitMipsOptionRecords(); diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index e601963..3652f4b 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -23,8 +23,11 @@ namespace Mips { // in MipsAsmBackend.cpp. // enum Fixups { + // Branch fixups resulting in R_MIPS_NONE. + fixup_Mips_NONE = FirstTargetFixupKind, + // Branch fixups resulting in R_MIPS_16. - fixup_Mips_16 = FirstTargetFixupKind, + fixup_Mips_16, // Pure 32 bit data fixup resulting in - R_MIPS_32. fixup_Mips_32, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h index 5d23fcb..d4ccf03 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h @@ -17,13 +17,14 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; +class Triple; - class MipsMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit MipsMCAsmInfo(const Triple &TheTriple); - }; +class MipsMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit MipsMCAsmInfo(const Triple &TheTriple); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index e36263d..4b030eb 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -190,6 +190,10 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, else NewOpcode = Mips::Std2MicroMips(Opcode, Mips::Arch_micromips); + // Check whether it is Dsp instruction. + if (NewOpcode == -1) + NewOpcode = Mips::Dsp2MicroMips(Opcode, Mips::Arch_mmdsp); + if (NewOpcode != -1) { if (Fixups.size() > N) Fixups.pop_back(); @@ -346,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, return 0; } +/// getBranchTarget26OpValueMM - Return binary encoding of the branch +/// target operand. 
If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM( + const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 2. + if (MO.isImm()) + return MO.getImm() >> 1; + + // TODO: Push 26 PC fixup. + return 0; +} + /// getJumpOffset16OpValue - Return binary encoding of the jump /// target operand. If the machine operand requires relocation, /// record the relocation and return zero. @@ -745,7 +766,8 @@ getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo, const MCSubtargetInfo &STI) const { // Register is encoded in bits 9-5, offset is encoded in bits 4-0. assert(MI.getOperand(OpNo).isReg() && - MI.getOperand(OpNo).getReg() == Mips::SP && + (MI.getOperand(OpNo).getReg() == Mips::SP || + MI.getOperand(OpNo).getReg() == Mips::SP_64) && "Unexpected base register!"); unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) >> 2; @@ -769,6 +791,19 @@ getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo, } unsigned MipsMCCodeEmitter:: +getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Base register is encoded in bits 20-16, offset is encoded in bits 8-0. + assert(MI.getOperand(OpNo).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, + STI) << 16; + unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI); + + return (OffBits & 0x1FF) | RegBits; +} + +unsigned MipsMCCodeEmitter:: getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -792,6 +827,19 @@ getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, } unsigned MipsMCCodeEmitter:: +getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Base register is encoded in bits 20-16, offset is encoded in bits 15-0. 
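+  // (Packing sketch with assumed values: base register encoding 5,
+  //  offset 0x1234 -> (5 << 16) | 0x1234 == 0x51234.)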
+ assert(MI.getOperand(OpNo).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, + STI) << 16; + unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI); + + return (OffBits & 0xFFFF) | RegBits; +} + +unsigned MipsMCCodeEmitter:: getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -801,7 +849,9 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, default: break; case Mips::SWM16_MM: + case Mips::SWM16_MMR6: case Mips::LWM16_MM: + case Mips::LWM16_MMR6: OpNo = MI.getNumOperands() - 2; break; } @@ -815,15 +865,6 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, return ((OffBits >> 2) & 0x0F); } -unsigned -MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - assert(MI.getOperand(OpNo).isImm()); - unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); - return SizeEncoding - 1; -} - // FIXME: should be called getMSBEncoding // unsigned @@ -838,13 +879,15 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo, return Position + Size - 1; } +template <unsigned Bits, int Offset> unsigned -MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +MipsMCCodeEmitter::getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { assert(MI.getOperand(OpNo).isImm()); - // The immediate is encoded as 'immediate - 1'. - return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1; + unsigned Value = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); + Value -= Offset; + return Value; } unsigned diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index 911cc2f..fdacd17 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -137,6 +137,13 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + // getBranchTarget26OpValueMM - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + // getJumpOffset16OpValue - Return binary encoding of the jump // offset operand. If the machine operand requires relocation, // record the relocation and return zero. 
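(Editorial aside, not part of the diff: the new getMemEncodingMMImm9 and getMemEncodingMMImm16 helpers above both place the encoded 5-bit base register in bits 20-16 and truncate the offset into the low bits, 9 bits for mem_mm_9 operands and 16 bits for mem_mm_16. A minimal standalone C++ sketch of the 9-bit packing follows; encodeMemMMImm9 is a hypothetical name for illustration, not an LLVM API.)

#include <cassert>
#include <cstdint>

// Mirrors the diff's (OffBits & 0x1FF) | RegBits, where RegBits is the
// encoded base-register field already shifted into bits 20-16.
uint32_t encodeMemMMImm9(uint32_t BaseRegField, int32_t Offset) {
  return (static_cast<uint32_t>(Offset) & 0x1FF) | (BaseRegField << 16);
}

int main() {
  uint32_t Enc = encodeMemMMImm9(/*BaseRegField=*/29, /*Offset=*/-4);
  assert(((Enc >> 16) & 0x1F) == 29); // base register lands in bits 20-16
  assert((Enc & 0x1FF) == 0x1FC);     // -4 truncated to a 9-bit field
}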
@@ -172,23 +179,27 @@ public: unsigned getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - // getLSAImmEncoding - Return binary encoding of LSA immediate. - unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + /// Subtract Offset then encode as a N-bit unsigned integer. + template <unsigned Bits, int Offset> + unsigned getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h index fd2ed17..e889972 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h @@ -51,8 +51,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS MipsMCExprs at the moment. 
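(Editorial aside, not part of the diff: the getUImmWithOffsetEncoding<Bits, Offset> template declared above generalizes the retired getLSAImmEncoding, which hard-coded "immediate - 1". The assembly-level operand is biased down by Offset before being stored in an N-bit field, and a disassembler adds the bias back; for LSA, whose operand is now typed uimm2_plus1 in the .td changes, the operand range 1..4 is stored as 0..3. A standalone C++ sketch of the round trip under those assumptions; both function names are hypothetical.)

#include <cassert>
#include <cstdint>

template <unsigned Bits, int Offset>
uint32_t encodeUImmWithOffset(uint32_t Value) {
  uint32_t Field = Value - Offset; // mirrors "Value -= Offset" in the diff
  assert(Field < (1u << Bits) && "operand out of range for the field");
  return Field;
}

template <unsigned Bits, int Offset>
uint32_t decodeUImmWithOffset(uint32_t Field) {
  return (Field & ((1u << Bits) - 1)) + Offset;
}

int main() {
  assert((encodeUImmWithOffset<2, 1>(4)) == 3); // lsa shift 4 -> field 3
  assert((decodeUImmWithOffset<2, 1>(3)) == 4);
}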
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index e4da2df..e5fa755 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -89,9 +89,15 @@ void MipsTargetStreamer::emitDirectiveSetHardFloat() { void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} +void MipsTargetStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + forbidModuleDirective(); +} void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { } +void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) {} void MipsTargetStreamer::emitDirectiveModuleFP() {} @@ -358,6 +364,12 @@ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { forbidModuleDirective(); } +void MipsTargetAsmStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset); + OS << "\t.cprestore\t" << Offset << "\n"; +} + void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, @@ -373,7 +385,13 @@ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, OS << ", "; - OS << Sym.getName() << "\n"; + OS << Sym.getName(); + forbidModuleDirective(); +} + +void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) { + OS << "\t.cpreturn"; forbidModuleDirective(); } @@ -595,8 +613,9 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHT_REL); + MCSymbol *Sym = Context.getOrCreateSymbol(Name); const MCSymbolRefExpr *ExprRef = - MCSymbolRefExpr::create(Name, MCSymbolRefExpr::VK_None, Context); + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context); MCA.registerSection(*Sec); Sec->setAlignment(4); @@ -622,10 +641,25 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; OS.PopSection(); + + // .end also implicitly sets the size. 
+ MCSymbol *CurPCSym = Context.createTempSymbol(); + OS.EmitLabel(CurPCSym); + const MCExpr *Size = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context), + ExprRef, Context); + int64_t AbsSize; + if (!Size->evaluateAsAbsolute(AbsSize, MCA)) + llvm_unreachable("Function size must be evaluatable as absolute"); + Size = MCConstantExpr::create(AbsSize, Context); + static_cast<MCSymbolELF *>(Sym)->setSize(Size); } void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; + + // .ent also acts like an implicit '.type symbol, STT_FUNC' + static_cast<const MCSymbolELF &>(Symbol).setType(ELF::STT_FUNC); } void MipsTargetELFStreamer::emitDirectiveAbiCalls() { @@ -752,6 +786,24 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { forbidModuleDirective(); } +void MipsTargetELFStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset); + // .cprestore offset + // When PIC mode is enabled and the O32 ABI is used, this directive expands + // to: + // sw $gp, offset($sp) + // and adds a corresponding LW after every JAL. + + // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it + // is used in non-PIC mode. + if (!Pic || (getABI().IsN32() || getABI().IsN64())) + return; + + for (const MCInst &Inst : StoreInsts) + getStreamer().EmitInstruction(Inst, STI); +} + void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, @@ -766,7 +818,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, // Either store the old $gp in a register or on the stack if (IsReg) { // move $save, $gpreg - Inst.setOpcode(Mips::DADDu); + Inst.setOpcode(Mips::OR64); Inst.addOperand(MCOperand::createReg(RegOrOffset)); Inst.addOperand(MCOperand::createReg(Mips::GP)); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); @@ -810,6 +862,30 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, forbidModuleDirective(); } +void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) { + // The .cpreturn directive only emits anything for the N32 and N64 ABIs, and + // only when PIC is enabled. + if (!Pic || !(getABI().IsN32() || getABI().IsN64())) + return; + + MCInst Inst; + // Restore the old $gp either from a register or from the stack + if (SaveLocationIsRegister) { + Inst.setOpcode(Mips::OR); + Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(SaveLocation)); + Inst.addOperand(MCOperand::createReg(Mips::ZERO)); + } else { + Inst.setOpcode(Mips::LD); + Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(Mips::SP)); + Inst.addOperand(MCOperand::createImm(SaveLocation)); + } + getStreamer().EmitInstruction(Inst, STI); + + forbidModuleDirective(); +} + void MipsTargetELFStreamer::emitMipsAbiFlags() { MCAssembler &MCA = getStreamer().getAssembler(); MCContext &Context = MCA.getContext(); diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td index 187a022..400f6eef 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -16,6 +16,64 @@ class MMR6Arch<string opstr> { string BaseOpcode = opstr; } +// Class used for microMIPS32r6 and microMIPS64r6 instructions.
+class MicroMipsR6Inst16 : PredicateControl { + string DecoderNamespace = "MicroMipsR6"; + let InsnPredicates = [HasMicroMips32r6]; +} + +class BC16_FM_MM16R6 { + bits<10> offset; + + bits<16> Inst; + + let Inst{15-10} = 0x33; + let Inst{9-0} = offset; +} + +class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 { + bits<3> rs; + bits<7> offset; + + bits<16> Inst; + + let Inst{15-10} = op; + let Inst{9-7} = rs; + let Inst{6-0} = offset; +} + +class POOL16C_JALRC_FM_MM16R6<bits<5> op> { + bits<5> rs; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-5} = rs; + let Inst{4-0} = op; +} + +class POOL16C_JRCADDIUSP_FM_MM16R6<bits<5> op> { + bits<5> imm; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-5} = imm; + let Inst{4-0} = op; +} + +class POOL16C_LWM_SWM_FM_MM16R6<bits<4> funct> { + bits<2> rt; + bits<4> addr; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-8} = rt; + let Inst{7-4} = addr; + let Inst{3-0} = funct; +} + class POOL32A_BITSWAP_FM_MMR6<bits<6> funct> : MipsR6Inst { bits<5> rd; bits<5> rt; @@ -71,6 +129,64 @@ class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> { let Inst{15-0} = imm16; } +class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = hint; + let Inst{20-16} = base; + let Inst{15-12} = 0b1010; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LB32_FM_MMR6 : MipsR6Inst { + bits<21> addr; + bits<5> rt; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b000111; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class LBU32_FM_MMR6 : MipsR6Inst { + bits<21> addr; + bits<5> rt; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b000101; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst { + bits<21> addr; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = addr{20-16}; + let Inst{15-12} = 0b0110; + let Inst{11-9} = funct; + let Inst{8-0} = addr{8-0}; +} + class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> { bits<5> rd; @@ -124,6 +240,69 @@ class POOL32A_FM_MMR6<bits<10> funct> : MipsR6Inst { let Inst{9-0} = funct; } +class POOL32A_PAUSE_FM_MMR6<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> { + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = 0; + let Inst{15-11} = op; + let Inst{10-6} = 0; + let Inst{5-0} = 0; +} + +class POOL32A_RDPGPR_FM_MMR6<bits<10> funct> { + bits<5> rt; + bits<5> rd; + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-16} = rd; + let Inst{15-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_RDHWR_FM_MMR6 { + bits<5> rt; + bits<5> rs; + bits<3> sel; + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-14} = 0; + let Inst{13-11} = sel; + let Inst{10} = 0; + let Inst{9-0} = 0b0111000000; +} + +class POOL32A_SYNC_FM_MMR6 { + bits<5> stype; + + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = stype; + let Inst{15-6} = 0b0110101101; + let Inst{5-0} = 0b111100; +} + +class POOL32I_SYNCI_FM_MMR6 { + bits<21> addr; + bits<5> base = addr{20-16}; + 
bits<16> immediate = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b010000; + let Inst{25-21} = 0b01100; + let Inst{20-16} = base; + let Inst{15-0} = immediate; +} + class POOL32A_2R_FM_MMR6<bits<10> funct> : MipsR6Inst { bits<5> rs; bits<5> rt; @@ -198,6 +377,78 @@ class POOL32A_LSA_FM<bits<6> funct> : MipsR6Inst { let Inst{5-0} = funct; } +class SB32_SH32_STORE_FM_MMR6<bits<6> op> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = 0b1010; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = 0b0110; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LOAD_WORD_FM_MMR6 { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b111111; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class LOAD_UPPER_IMM_FM_MMR6 { + bits<5> rt; + bits<16> imm16; + + bits<32> Inst; + + let Inst{31-26} = 0b000100; + let Inst{25-21} = rt; + let Inst{20-16} = 0; + let Inst{15-0} = imm16; +} + class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst { bits<5> rt; bits<16> offset; @@ -222,12 +473,13 @@ class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst { let Inst{15-0} = offset; } -class ERET_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> { +class POOL32A_ERET_FM_MMR6<string instr_asm, bits<10> funct> + : MMR6Arch<instr_asm> { bits<32> Inst; let Inst{31-26} = 0x00; let Inst{25-16} = 0x00; - let Inst{15-6} = 0x3cd; + let Inst{15-6} = funct; let Inst{5-0} = 0x3c; } @@ -262,7 +514,8 @@ class BARRIER_MMR6_ENC<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> { let Inst{5-0} = 0x0; } -class EIDI_MMR6_ENC<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> { +class POOL32A_EIDI_MMR6_ENC<string instr_asm, bits<10> funct> + : MMR6Arch<instr_asm> { bits<32> Inst; bits<5> rt; // Actually rs but we're sharing code with the standard encodings which call it rt @@ -287,3 +540,323 @@ class SHIFT_MMR6_ENC<string instr_asm, bits<10> funct, bit rotate> : MMR6Arch<in let Inst{10} = rotate; let Inst{9-0} = funct; } + +class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> { + bits<5> rt; + bits<21> addr; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = addr{20-16}; + let Inst{15-0} = addr{15-0}; +} + +class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt, + bits<3> funct> : MMR6Arch<instr_asm> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = fmt; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + 
bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10} = 0; + let Inst{9-8} = fmt; + let Inst{7-0} = funct; +} + +class POOL32F_ARITHF_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_MOV_NEG_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_MINMAX_FM<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CMP_FM<string instr_asm, bits<6> format, FIELD_CMP_COND Cond> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-6} = Cond.Value; + let Inst{5-0} = format; +} + +class POOL32F_CVT_LW_FM<string instr_asm, bit fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_CVT_DS_FM<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_ABS_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rs; + bits<3> rt; + bits<3> rd; + + bits<16> Inst; + + let Inst{15-10} = 0b000001; + let Inst{9-7} = rs; + let Inst{6-4} = rt; + let Inst{3-1} = rd; + let Inst{0} = 0; +} + +class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = 0b0001; +} + +class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = 0b0000; +} + +class 
POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = op; +} + +class POOL16C_BREAKPOINT_FM_MMR6<bits<6> op> { + bits<4> code_; + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-6} = code_; + let Inst{5-0} = op; +} + +class POOL16A_SUBU16_FM_MMR6 { + bits<3> rs; + bits<3> rt; + bits<3> rd; + + bits<16> Inst; + + let Inst{15-10} = 0b000001; + let Inst{9-7} = rs; + let Inst{6-4} = rt; + let Inst{3-1} = rd; + let Inst{0} = 0b1; +} + +class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst { + bits<5> rt; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0x00; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-6} = funct; + let Inst{5-0} = 0x3c; +} + +class POOL32F_RECIP_ROUND_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_RINT_FM_MMR6<string instr_asm, bits<2> fmt> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0; + let Inst{10-9} = fmt; + let Inst{8-0} = 0b000100000; +} + +class POOL32F_SEL_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0b00000; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td index 53bde13..31b5db0 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +def brtarget26_mm : Operand<OtherVT> { + let EncoderMethod = "getBranchTarget26OpValueMM"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget26MM"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + //===----------------------------------------------------------------------===// // // Instruction Encodings @@ -28,6 +35,9 @@ class ALIGN_MMR6_ENC : POOL32A_ALIGN_FM_MMR6<0b011111>; class AUI_MMR6_ENC : AUI_FM_MMR6; class BALC_MMR6_ENC : BRANCH_OFF26_FM<0b101101>; class BC_MMR6_ENC : BRANCH_OFF26_FM<0b100101>; +class BC16_MMR6_ENC : BC16_FM_MM16R6; +class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>; +class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>; class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>; class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">; class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b011101>; @@ -42,13 +52,19 @@ class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>; class DIV_MMR6_ENC : ARITH_FM_MMR6<"div", 0x118>; class DIVU_MMR6_ENC : ARITH_FM_MMR6<"divu", 0x198>; class 
EHB_MMR6_ENC : BARRIER_MMR6_ENC<"ehb", 0x3>; -class EI_MMR6_ENC : EIDI_MMR6_ENC<"ei", 0x15d>; -class ERET_MMR6_ENC : ERET_FM_MMR6<"eret">; +class EI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"ei", 0x15d>; +class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>; +class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>; +class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"deret", 0b1110001101>; class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">; +class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>; class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>; class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>; +class JRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0x3>; +class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>; class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>; class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>; +class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>; class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>; class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>; class MUL_MMR6_ENC : ARITH_FM_MMR6<"mul", 0x18>; @@ -59,15 +75,99 @@ class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>; class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>; class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>; class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>; +class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>; class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>; class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>; class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>; class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>; +class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>; class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>; class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>; class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>; +class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>; +class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>; +class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>; +class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>; +class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>; +class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>; +class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>; +class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>; +class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>; +class LB_MMR6_ENC : LB32_FM_MMR6; +class LBU_MMR6_ENC : LBU32_FM_MMR6; +class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>; +class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>; +class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>; +class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6; +class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">; +class SSNOP_MMR6_ENC : BARRIER_FM_MM<0x1>, MMR6Arch<"ssnop">; +class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6; +class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">; +class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>; +class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">; class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>; class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>; +class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>; +class ABS_D_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.d", 1, 0b0001101>; +class FLOOR_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.s", 0, 0b00001100>; +class FLOOR_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.d", 1, 0b00001100>; +class FLOOR_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.s", 0, 0b00101100>; +class FLOOR_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.d", 1, 0b00101100>; +class CEIL_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.s", 0, 0b01001100>; +class CEIL_L_D_MMR6_ENC :
POOL32F_MATH_FM_MMR6<"ceil.l.d", 1, 0b01001100>; +class CEIL_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.s", 0, 0b01101100>; +class CEIL_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.d", 1, 0b01101100>; +class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>; +class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>; +class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>; +class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>; +class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>; +class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>; +class RSQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.s", 0, 0b00001000>; +class RSQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.d", 1, 0b00001000>; +class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>; +class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>; +class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>; +class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>; +class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>; +class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>; +class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>; +class LW_MMR6_ENC : LOAD_WORD_FM_MMR6; +class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6; +class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>; +class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>; +class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>; +class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>; +class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0, + 0b11001100>; +class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1, + 0b11001100>; +class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0, + 0b11101100>; +class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1, + 0b11101100>; +class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>; +class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>; +class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>; +class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>; +class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>; +class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>; +class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>; +class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>; + +class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6; +class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6; +class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16; +class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6; +class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>; +class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16; +class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16; +class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>; +class LI16_MMR6_ENC : LI_FM_MM16; +class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>; +class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>; +class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6; +class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>; class CMP_CBR_RT_Z_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd, RegisterOperand GPROpnd> @@ -108,6 +208,43 @@ class BNEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bnezalc", brtarget_mm, list<Register> Defs = [RA]; } +/// Floating Point Instructions +class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>; +class 
FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>; +class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>; +class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>; +class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>; +class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>; +class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>; +class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>; +class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>; +class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>; +class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>; +class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>; +class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>; +class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>; +class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>; +class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>; +class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>; +class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>; +class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>; +class MAXA_D_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.d", 1, 0b000101011>; +class MIN_S_MMR6_ENC : POOL32F_MINMAX_FM<"min.s", 0, 0b000000011>; +class MIN_D_MMR6_ENC : POOL32F_MINMAX_FM<"min.d", 1, 0b000000011>; +class MINA_S_MMR6_ENC : POOL32F_MINMAX_FM<"mina.s", 0, 0b000100011>; +class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>; + +class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>; +class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>; +class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>; +class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>; +class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>; +class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>; +class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>; +class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>; +class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>; +class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>; + //===----------------------------------------------------------------------===// // // Instruction Descriptions @@ -130,11 +267,34 @@ class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd> bit isBarrier = 1; } -class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> { +class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> { bit isCall = 1; list<Register> Defs = [RA]; } -class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>; +class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>; + +class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), + !strconcat("bc16", "\t$offset"), [], + II_BC, FrmI>, + MMR6Arch<"bc16">, MicroMipsR6Inst16 { + let isBranch = 1; + let isTerminator = 1; + let isBarrier = 1; + let hasDelaySlot = 0; + let AdditionalPredicates = [RelocPIC]; + let Defs = [AT]; +} + +class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm> + : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> { + let isBranch = 1; + let isTerminator = 1; + let hasDelaySlot = 0; + let Defs = [AT]; +} +class BEQZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"beqzc16">; +class BNEZC16_MMR6_DESC : 
BEQZC_BNEZC_MM16R6_DESC_BASE<"bnezc16">; + class SUB_MMR6_DESC : ArithLogicR<"sub", GPR32Opnd>; class SUBU_MMR6_DESC : ArithLogicR<"subu", GPR32Opnd>; @@ -162,6 +322,35 @@ class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd, class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd>; class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd>; +class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> : + CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd, + GPROpnd> { + string DecoderMethod = "DecodePrefeOpMM"; +} + +class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9, GPR32Opnd>; +class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9, GPR32Opnd>; + +class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> : MMR6Arch<instr_asm> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins MemOpnd:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeLoadByte15"; + bit mayLoad = 1; +} +class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd>; +class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd>; + +class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> + : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd> { + let DecoderMethod = "DecodeLoadByte9"; +} +class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd>; +class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd>; + class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> : MMR6Arch<instr_asm> { dag OutOperandList = (outs GPROpnd:$rt); @@ -174,10 +363,22 @@ class CLZ_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clz", GPR32Opnd>; class EHB_MMR6_DESC : Barrier<"ehb">; class EI_MMR6_DESC : DEI_FT<"ei", GPR32Opnd>; +class DI_MMR6_DESC : DEI_FT<"di", GPR32Opnd>; class ERET_MMR6_DESC : ER_FT<"eret">; +class DERET_MMR6_DESC : ER_FT<"deret">; class ERETNC_MMR6_DESC : ER_FT<"eretnc">; +class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> + : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), + [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let isCall = 1; + let hasDelaySlot = 0; + let Defs = [RA]; +} +class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>; + class JMP_MMR6_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd, RegisterOperand GPROpnd> : MMR6Arch<opstr> { @@ -200,6 +401,27 @@ class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, list<Register> Defs = [AT]; } +class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> + : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), + [], II_JR, FrmR>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let hasDelaySlot = 0; + let isBranch = 1; + let isIndirectBranch = 1; +} +class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>; + +class JRCADDIUSP_MMR6_DESC + : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm", + [], II_JRADDIUSP, FrmR>, + MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 { + let hasDelaySlot = 0; + let isTerminator = 1; + let isBarrier = 1; + let isBranch = 1; + let isIndirectBranch = 1; +} + class ALIGN_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, Operand ImmOpnd> : MMR6Arch<instr_asm> { dag OutOperandList = (outs GPROpnd:$rd); @@ -241,7 +463,7 @@ class LSA_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, list<dag> Pattern = []; } -class 
LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; +class LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>; class PCREL_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, Operand ImmOpnd> : MMR6Arch<instr_asm> { @@ -264,6 +486,18 @@ class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> class SELEQZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"seleqz", GPR32Opnd>; class SELNEZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"selnez", GPR32Opnd>; +class PAUSE_MMR6_DESC : Barrier<"pause">; +class RDHWR_MMR6_DESC : MMR6Arch<"rdhwr">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins HWRegsOpnd:$rs, uimm3:$sel); + string AsmString = !strconcat("rdhwr", "\t$rt, $rs, $sel"); + list<dag> Pattern = []; + InstrItinClass Itinerary = II_RDHWR; + Format Form = FrmR; +} + +class WAIT_MMR6_DESC : WaitMM<"wait">; +class SSNOP_MMR6_DESC : Barrier<"ssnop">; class SLL_MMR6_DESC : shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>; class DIV_MMR6_DESC : ArithLogicR<"div", GPR32Opnd>; class DIVU_MMR6_DESC : ArithLogicR<"divu", GPR32Opnd>; @@ -277,13 +511,426 @@ class ORI_MMR6_DESC : ArithLogicI<"ori", simm16, GPR32Opnd>; class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd>; class XORI_MMR6_DESC : ArithLogicI<"xori", simm16, GPR32Opnd>; +class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO, + SDPatternOperator OpNode = null_frag, + InstrItinClass Itin = NoItinerary, + ComplexPattern Addr = addr> : + InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"), + [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> { + let DecoderMethod = "DecodeMem"; + let mayStore = 1; +} +class SW_MMR6_DESC : Store<"sw", GPR32Opnd>; +class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9>; + +class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> + : MMR6Arch<instr_asm> { + dag InOperandList = (ins RO:$rs); + dag OutOperandList = (outs RO:$rt); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs"); + list<dag> Pattern = []; + Format f = FrmR; + string BaseOpcode = instr_asm; + bit hasSideEffects = 0; +} +class WRPGPR_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wrpgpr", GPR32Opnd>; +class WSBH_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wsbh", GPR32Opnd>; + +/// Floating Point Instructions +class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC, + InstrItinClass Itin, bit isComm, + SDPatternOperator OpNode = null_frag> : HARDFLOAT { + dag OutOperandList = (outs RC:$fd); + dag InOperandList = (ins RC:$ft, RC:$fs); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list<dag> Pattern = [(set RC:$fd, (OpNode RC:$fs, RC:$ft))]; + InstrItinClass Itinerary = Itin; + bit isCommutable = isComm; +} +class FADD_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>; +class FADD_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>; +class FSUB_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>; +class FSUB_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>; +class FMUL_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>; +class FMUL_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>; +class FDIV_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>; +class FDIV_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>; +class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>, HARDFLOAT; +class 
MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>, HARDFLOAT; +class MSUBF_S_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>, HARDFLOAT; +class MSUBF_D_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>, HARDFLOAT; + +class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC, + RegisterOperand SrcRC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; +} +class FMOV_S_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>; +class FMOV_D_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>; +class FNEG_S_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>; +class FNEG_D_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>; + +class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd>, HARDFLOAT; +class MAX_D_MMR6_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd>, HARDFLOAT; +class MIN_S_MMR6_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd>, HARDFLOAT; +class MIN_D_MMR6_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd>, HARDFLOAT; + +class MAXA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd>, HARDFLOAT; +class MAXA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd>, HARDFLOAT; +class MINA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd>, HARDFLOAT; +class MINA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd>, HARDFLOAT; + +class CVT_MMR6_DESC_BASE< + string instr_asm, RegisterOperand DstRC, RegisterOperand SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; +} + +class CVT_L_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.s", FGR64Opnd, FGR32Opnd, + II_CVT>; +class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd, + II_CVT>; +class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd, + II_CVT>; +class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd, + II_CVT>, FGR_64; +class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", AFGR64Opnd, FGR32Opnd, + II_CVT>; +class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd, + II_CVT>; +class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd, + II_CVT>, FGR_64; + +multiclass CMP_CC_MMR6<bits<6> format, string Typestr, + RegisterOperand FGROpnd> { + def CMP_AF_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.af.", Typestr), format, FIELD_CMP_COND_AF>, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_UN_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.un.", Typestr), format, FIELD_CMP_COND_UN>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_EQ_#NAME : 
POOL32F_CMP_FM< + !strconcat("cmp.eq.", Typestr), format, FIELD_CMP_COND_EQ>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_UEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ueq.", Typestr), format, FIELD_CMP_COND_UEQ>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_LT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.lt.", Typestr), format, FIELD_CMP_COND_LT>, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_ULT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ult.", Typestr), format, FIELD_CMP_COND_ULT>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_LE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.le.", Typestr), format, FIELD_CMP_COND_LE>, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_ULE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ule.", Typestr), format, FIELD_CMP_COND_ULE>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SAF_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.saf.", Typestr), format, FIELD_CMP_COND_SAF>, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SUN_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sun.", Typestr), format, FIELD_CMP_COND_SUN>, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.seq.", Typestr), format, FIELD_CMP_COND_SEQ>, + CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SUEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sueq.", Typestr), format, FIELD_CMP_COND_SUEQ>, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SLT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.slt.", Typestr), format, FIELD_CMP_COND_SLT>, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SULT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sult.", Typestr), format, FIELD_CMP_COND_SULT>, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SLE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sle.", Typestr), format, FIELD_CMP_COND_SLE>, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SULE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sule.", Typestr), format, FIELD_CMP_COND_SULE>, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; +} + +class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC, + RegisterOperand SrcRC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; + list<Predicate> EncodingPredicates = [HasStdEnc]; +} + +class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd, + II_ABS, fabs>; +class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd, + II_ABS, fabs>; +class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd, + FGR32Opnd, II_FLOOR>; +class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", 
FGR64Opnd, + FGR64Opnd, II_FLOOR>; +class FLOOR_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.s", FGR32Opnd, + FGR32Opnd, II_FLOOR>; +class FLOOR_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.d", FGR32Opnd, + AFGR64Opnd, II_FLOOR>; +class CEIL_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.s", FGR64Opnd, + FGR32Opnd, II_CEIL>; +class CEIL_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.d", FGR64Opnd, + FGR64Opnd, II_CEIL>; +class CEIL_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.s", FGR32Opnd, + FGR32Opnd, II_CEIL>; +class CEIL_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.d", FGR32Opnd, + AFGR64Opnd, II_CEIL>; +class TRUNC_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.s", FGR64Opnd, + FGR32Opnd, II_TRUNC>; +class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd, + FGR64Opnd, II_TRUNC>; +class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>; +class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd, + AFGR64Opnd, II_TRUNC>; +class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd, + II_SQRT_S, fsqrt>; +class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd, + II_SQRT_D, fsqrt>; +class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>; +class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd, + AFGR64Opnd, II_TRUNC>; +class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd, + II_ROUND>; +class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; +class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; + +class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} + +class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>; +class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>; +class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>; +class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>; +class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>; +class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; +class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; +class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; + +class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO> + : Store<opstr, RO>, MMR6Arch<opstr> { + let DecoderMethod = "DecodeMemMMImm16"; +} +class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd>; + +class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins RO:$rt, mem_mm_9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeStoreEvaOpMM"; + bit mayStore = 1; +} +class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd>; +class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd>; +class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd>; +class SHE_MMR6_DESC : 
STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd>; +class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> : + MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins mem_mm_12:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeMemMMImm9"; + bit mayLoad = 1; +} +class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd>; +class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd>; +class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>, + MMR6Arch<"addu16">; +class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>, + MMR6Arch<"and16">; +class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, + MMR6Arch<"andi16">; +class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16">; +class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, + MMR6Arch<"or16">; +class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>, + MMR6Arch<"sll16">; +class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>, + MMR6Arch<"srl16">; +class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16">, MMR6Arch<"srl16">, + MicroMipsR6Inst16; +class LI16_MMR6_DESC : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, + MMR6Arch<"srl16">, MicroMipsR6Inst16, IsAsCheapAsAMove; +class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"srl16">, + MicroMipsR6Inst16; +class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16">, MMR6Arch<"sdbbp16">, + MicroMipsR6Inst16; +class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, + MMR6Arch<"sdbbp16">, MicroMipsR6Inst16; +class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, + MMR6Arch<"sdbbp16">, MicroMipsR6Inst16; + +class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins mem:$addr); + string AsmString = "lw\t$rt, $addr"; + let DecoderMethod = "DecodeMemMMImm16"; + let canFoldAsLoad = 1; + let mayLoad = 1; + list<dag> Pattern = [(set GPR32Opnd:$rt, (load addrDefault:$addr))]; + InstrItinClass Itinerary = II_LW; +} + +class LUI_MMR6_DESC : IsAsCheapAsAMove, MMR6Arch<"lui">, MipsR6Inst{ + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$imm16); + string AsmString = "lui\t$rt, $imm16"; + list<dag> Pattern = []; + bit hasSideEffects = 0; + bit isReMaterializable = 1; + InstrItinClass Itinerary = II_LUI; + Format Form = FrmI; +} + +class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins i32imm:$stype); + string AsmString = !strconcat("sync", "\t$stype"); + list<dag> Pattern = [(MipsSync imm:$stype)]; + InstrItinClass Itinerary = NoItinerary; + bit HasSideEffects = 1; +} + +class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> { + let DecoderMethod = "DecodeSynciR6"; +} + +class RDPGPR_MMR6_DESC : MMR6Arch<"rdpgpr">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins GPR32Opnd:$rd); + string AsmString = !strconcat("rdpgpr", "\t$rt, $rd"); +} + +class SDBBP_MMR6_DESC : MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins uimm20:$code_); + string AsmString = !strconcat("sdbbp", "\t$code_"); + list<dag> Pattern = []; +} + +class LWM16_MMR6_DESC + : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr), + !strconcat("lwm16", "\t$rt, $addr"), [], + NoItinerary, FrmI>, + MMR6Arch<"lwm16">, MicroMipsR6Inst16 { + let DecoderMethod = 
"DecodeMemMMReglistImm4Lsl2"; + let mayLoad = 1; + InstrItinClass Itin = NoItinerary; + ComplexPattern Addr = addr; +} + +class SWM16_MMR6_DESC + : MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr), + !strconcat("swm16", "\t$rt, $addr"), [], + NoItinerary, FrmI>, + MMR6Arch<"swm16">, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMReglistImm4Lsl2"; + let mayStore = 1; + InstrItinClass Itin = NoItinerary; + ComplexPattern Addr = addr; +} + +class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO, + SDPatternOperator OpNode, InstrItinClass Itin, + Operand MemOpnd> + : MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMImm4"; + let mayStore = 1; +} +class SB16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, GPRMM16Opnd, + truncstorei8, II_SB, mem_mm_4>; +class SH16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, GPRMM16Opnd, + truncstorei16, II_SH, mem_mm_4_lsl1>; +class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd, + store, II_SW, mem_mm_4_lsl2>; + +class SWSP_MMR6_DESC + : MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset), + !strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>, + MMR6Arch<"sw">, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMSPImm5Lsl2"; + let mayStore = 1; +} + //===----------------------------------------------------------------------===// // // Instruction Definitions // //===----------------------------------------------------------------------===// -let DecoderNamespace = "MicroMips32r6" in { +let DecoderNamespace = "MicroMipsR6" in { def ADD_MMR6 : StdMMR6Rel, ADD_MMR6_DESC, ADD_MMR6_ENC, ISA_MICROMIPS32R6; def ADDIU_MMR6 : StdMMR6Rel, ADDIU_MMR6_DESC, ADDIU_MMR6_ENC, ISA_MICROMIPS32R6; def ADDU_MMR6 : StdMMR6Rel, ADDU_MMR6_DESC, ADDU_MMR6_ENC, ISA_MICROMIPS32R6; @@ -298,6 +945,11 @@ def ALIGN_MMR6 : R6MMR6Rel, ALIGN_MMR6_ENC, ALIGN_MMR6_DESC, ISA_MICROMIPS32R6; def AUI_MMR6 : R6MMR6Rel, AUI_MMR6_ENC, AUI_MMR6_DESC, ISA_MICROMIPS32R6; def BALC_MMR6 : R6MMR6Rel, BALC_MMR6_ENC, BALC_MMR6_DESC, ISA_MICROMIPS32R6; def BC_MMR6 : R6MMR6Rel, BC_MMR6_ENC, BC_MMR6_DESC, ISA_MICROMIPS32R6; +def BC16_MMR6 : StdMMR6Rel, BC16_MMR6_DESC, BC16_MMR6_ENC, ISA_MICROMIPS32R6; +def BEQZC16_MMR6 : StdMMR6Rel, BEQZC16_MMR6_DESC, BEQZC16_MMR6_ENC, + ISA_MICROMIPS32R6; +def BNEZC16_MMR6 : StdMMR6Rel, BNEZC16_MMR6_DESC, BNEZC16_MMR6_ENC, + ISA_MICROMIPS32R6; def BITSWAP_MMR6 : R6MMR6Rel, BITSWAP_MMR6_ENC, BITSWAP_MMR6_DESC, ISA_MICROMIPS32R6; def BEQZALC_MMR6 : R6MMR6Rel, BEQZALC_MMR6_ENC, BEQZALC_MMR6_DESC, @@ -320,13 +972,21 @@ def DIV_MMR6 : R6MMR6Rel, DIV_MMR6_DESC, DIV_MMR6_ENC, ISA_MICROMIPS32R6; def DIVU_MMR6 : R6MMR6Rel, DIVU_MMR6_DESC, DIVU_MMR6_ENC, ISA_MICROMIPS32R6; def EHB_MMR6 : StdMMR6Rel, EHB_MMR6_DESC, EHB_MMR6_ENC, ISA_MICROMIPS32R6; def EI_MMR6 : StdMMR6Rel, EI_MMR6_DESC, EI_MMR6_ENC, ISA_MICROMIPS32R6; -def ERET_MMR6 : R6MMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6; +def DI_MMR6 : StdMMR6Rel, DI_MMR6_DESC, DI_MMR6_ENC, ISA_MICROMIPS32R6; +def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6; +def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6; def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC, ISA_MICROMIPS32R6; +def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC, + ISA_MICROMIPS32R6; def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, 
JIALC_MMR6_DESC, ISA_MICROMIPS32R6; def JIC_MMR6 : R6MMR6Rel, JIC_MMR6_ENC, JIC_MMR6_DESC, ISA_MICROMIPS32R6; +def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6; +def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC, + ISA_MICROMIPS32R6; def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6; def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6; +def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6; def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6; def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6; def MUL_MMR6 : R6MMR6Rel, MUL_MMR6_DESC, MUL_MMR6_ENC, ISA_MICROMIPS32R6; @@ -337,17 +997,211 @@ def NOR_MMR6 : StdMMR6Rel, NOR_MMR6_DESC, NOR_MMR6_ENC, ISA_MICROMIPS32R6; def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6; def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6; def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6; +def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6; def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6; def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6; def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC, ISA_MICROMIPS32R6; def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC, ISA_MICROMIPS32R6; +def SH16_MMR6 : StdMMR6Rel, SH16_MMR6_DESC, SH16_MMR6_ENC, ISA_MICROMIPS32R6; def SLL_MMR6 : StdMMR6Rel, SLL_MMR6_DESC, SLL_MMR6_ENC, ISA_MICROMIPS32R6; def SUB_MMR6 : StdMMR6Rel, SUB_MMR6_DESC, SUB_MMR6_ENC, ISA_MICROMIPS32R6; def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6; +def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6; +def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6; +def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6; +def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6; +def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC, + ISA_MICROMIPS32R6; +def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC, + ISA_MICROMIPS32R6; +def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6; +def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6; +def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6; +def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6; +def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6; +def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6; +def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6; +def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6; +def SSNOP_MMR6 : StdMMR6Rel, SSNOP_MMR6_DESC, SSNOP_MMR6_ENC, ISA_MICROMIPS32R6; +def SYNC_MMR6 : StdMMR6Rel, SYNC_MMR6_DESC, SYNC_MMR6_ENC, ISA_MICROMIPS32R6; +def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6; +def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC, + ISA_MICROMIPS32R6; +def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6; def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6; def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6; +let DecoderMethod = "DecodeMemMMImm16" in { + def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, 
ISA_MICROMIPS32R6; +} +let DecoderMethod = "DecodeMemMMImm9" in { + def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6; +} +/// Floating Point Instructions +def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MSUBF_S_MMR6 : R6MMR6Rel, MSUBF_S_MMR6_ENC, MSUBF_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6; +def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6; +def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6; +def MIN_D_MMR6 : R6MMR6Rel, MIN_D_MMR6_ENC, MIN_D_MMR6_DESC, ISA_MICROMIPS32R6; +def MAXA_S_MMR6 : R6MMR6Rel, MAXA_S_MMR6_ENC, MAXA_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MAXA_D_MMR6 : R6MMR6Rel, MAXA_D_MMR6_ENC, MAXA_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MINA_S_MMR6 : R6MMR6Rel, MINA_S_MMR6_ENC, MINA_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MINA_D_MMR6 : R6MMR6Rel, MINA_D_MMR6_ENC, MINA_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_L_S_MMR6 : StdMMR6Rel, CVT_L_S_MMR6_ENC, CVT_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC, + ISA_MICROMIPS32R6; +defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd>; +defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd>; +def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6; +def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6; +def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_L_D_MMR6 : StdMMR6Rel, 
FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_W_S_MMR6 : StdMMR6Rel, FLOOR_W_S_MMR6_ENC, FLOOR_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_W_D_MMR6 : StdMMR6Rel, FLOOR_W_D_MMR6_ENC, FLOOR_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_L_S_MMR6 : StdMMR6Rel, CEIL_L_S_MMR6_ENC, CEIL_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_L_D_MMR6 : StdMMR6Rel, CEIL_L_D_MMR6_ENC, CEIL_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_W_S_MMR6 : StdMMR6Rel, CEIL_W_S_MMR6_ENC, CEIL_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_W_D_MMR6 : StdMMR6Rel, CEIL_W_D_MMR6_ENC, CEIL_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_L_S_MMR6 : StdMMR6Rel, TRUNC_L_S_MMR6_ENC, TRUNC_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_L_D_MMR6 : StdMMR6Rel, TRUNC_L_D_MMR6_ENC, TRUNC_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def RSQRT_S_MMR6 : StdMMR6Rel, RSQRT_S_MMR6_ENC, RSQRT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RSQRT_D_MMR6 : StdMMR6Rel, RSQRT_D_MMR6_ENC, RSQRT_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6; +def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6; +def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6; +def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6; +def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6; +def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6; +def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6; +def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6; +def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6; +def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC, + ISA_MICROMIPS32R6; +def AND16_MMR6 : StdMMR6Rel, AND16_MMR6_DESC, AND16_MMR6_ENC, + ISA_MICROMIPS32R6; +def ANDI16_MMR6 : StdMMR6Rel, ANDI16_MMR6_DESC, ANDI16_MMR6_ENC, + ISA_MICROMIPS32R6; +def NOT16_MMR6 : StdMMR6Rel, NOT16_MMR6_DESC, NOT16_MMR6_ENC, + ISA_MICROMIPS32R6; +def OR16_MMR6 : StdMMR6Rel, OR16_MMR6_DESC, OR16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SLL16_MMR6 : StdMMR6Rel, SLL16_MMR6_DESC, SLL16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SRL16_MMR6 : StdMMR6Rel, SRL16_MMR6_DESC, SRL16_MMR6_ENC, + ISA_MICROMIPS32R6; +def BREAK16_MMR6 : StdMMR6Rel, BREAK16_MMR6_DESC, BREAK16_MMR6_ENC, + ISA_MICROMIPS32R6; +def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC, + ISA_MICROMIPS32R6; +def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC, + ISA_MICROMIPS32R6; +def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC, + ISA_MICROMIPS32R6; +def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, ISA_MICROMIPS32R6; +def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6; +def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, 
ROUND_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6; +def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6; +def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC, + ISA_MICROMIPS32R6; } //===----------------------------------------------------------------------===// @@ -357,4 +1211,23 @@ def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6; //===----------------------------------------------------------------------===// def : MipsInstAlias<"ei", (EI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"di", (DI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; def : MipsInstAlias<"nop", (SLL_MMR6 ZERO, ZERO, 0), 1>, ISA_MICROMIPS32R6; +def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset), + !strconcat("b", "\t$offset")> { + string DecoderNamespace = "MicroMipsR6"; +} +def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"rdhwr $rt, $rs", + (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, + ISA_MICROMIPS32R6; + +//===----------------------------------------------------------------------===// +// +// MicroMips arbitrary patterns that map to one or more instructions +// +//===----------------------------------------------------------------------===// + +def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr), + (SW16_MMR6 GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS32R6; diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td new file mode 100644 index 0000000..da305a2 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td @@ -0,0 +1,86 @@ +//=- MicroMips64r6InstrFormats.td - Instruction Formats -*- tablegen -* -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes microMIPS64r6 instruction formats. 
+// +//===----------------------------------------------------------------------===// + +class DAUI_FM_MMR6 { + bits<5> rt; + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = 0b111100; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-0} = imm; +} + +class POOL32I_ADD_IMM_FM_MMR6<bits<5> funct> { + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = 0b010000; + let Inst{25-21} = funct; + let Inst{20-16} = rs; + let Inst{15-0} = imm; +} + +class POOL32S_EXTBITS_FM_MMR6<bits<6> funct> { + bits<5> rt; + bits<5> rs; + bits<5> size; + bits<5> pos; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = size; + let Inst{10-6} = pos; + let Inst{5-0} = funct; +} + +class POOL32S_DALIGN_FM_MMR6 { + bits<5> rs; + bits<5> rt; + bits<5> rd; + bits<3> bp; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-8} = bp; + let Inst{7-6} = 0b00; + let Inst{5-0} = 0b011100; +} + +class POOL32A_DIVMOD_FM_MMR6<string instr_asm, bits<9> funct> + : MMR6Arch<instr_asm> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rd; + let Inst{20-16} = rs; + let Inst{15-11} = rt; + let Inst{10-9} = 0b00; + let Inst{8-0} = funct; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td new file mode 100644 index 0000000..ec1aef8 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td @@ -0,0 +1,119 @@ +//=- MicroMips64r6InstrInfo.td - Instruction Information -*- tablegen -*- -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes MicroMips64r6 instructions. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Instruction Encodings +// +//===----------------------------------------------------------------------===// + +class DAUI_MMR6_ENC : DAUI_FM_MMR6; +class DAHI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10001>; +class DATI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10000>; +class DEXT_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b101100>; +class DEXTM_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b100100>; +class DEXTU_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b010100>; +class DALIGN_MMR6_ENC : POOL32S_DALIGN_FM_MMR6; +class DDIV_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddiv", 0b100011000>; +class DMOD_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmod", 0b101011000>; +class DDIVU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddivu", 0b110011000>; +class DMODU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmodu", 0b111011000>; + +//===----------------------------------------------------------------------===// +// +// Instruction Descriptions +// +//===----------------------------------------------------------------------===// + +class DAUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins GPROpnd:$rs, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm"); + list<dag> Pattern = []; +} +class DAUI_MMR6_DESC : DAUI_MMR6_DESC_BASE<"daui", GPR64Opnd>; + +class DAHI_DATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins GPROpnd:$rt, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $imm"); + string Constraints = "$rs = $rt"; +} +class DAHI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dahi", GPR64Opnd>; +class DATI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dati", GPR64Opnd>; + +class EXTBITS_DESC_BASE<string instr_asm, RegisterOperand RO, Operand PosOpnd, + Operand SizeOpnd, SDPatternOperator Op = null_frag> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $pos, $size"); + list<dag> Pattern = [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))]; + InstrItinClass Itinerary = II_EXT; + Format Form = FrmR; + string BaseOpcode = instr_asm; +} +// TODO: Add 'pos + size' constraint check to dext* instructions +// DEXT: 0 < pos + size <= 63 +// DEXTM, DEXTU: 32 < pos + size <= 64 +class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5, + uimm5_plus1, MipsExt>; +class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5, + uimm5_plus33, MipsExt>; +class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32, + uimm5_plus1, MipsExt>; + +class DALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + Operand ImmOpnd> : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp"); + list<dag> Pattern = []; +} + +class DALIGN_MMR6_DESC : DALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>; + +class DDIV_MM64R6_DESC : ArithLogicR<"ddiv", GPR32Opnd>; +class DMOD_MM64R6_DESC : ArithLogicR<"dmod", GPR32Opnd>; +class DDIVU_MM64R6_DESC : ArithLogicR<"ddivu", GPR32Opnd>; +class DMODU_MM64R6_DESC : ArithLogicR<"dmodu", GPR32Opnd>; + 
+//===----------------------------------------------------------------------===// +// +// Instruction Definitions +// +//===----------------------------------------------------------------------===// + +let DecoderNamespace = "MicroMipsR6" in { + def DAUI_MM64R6 : StdMMR6Rel, DAUI_MMR6_DESC, DAUI_MMR6_ENC, ISA_MICROMIPS64R6; + def DAHI_MM64R6 : StdMMR6Rel, DAHI_MMR6_DESC, DAHI_MMR6_ENC, ISA_MICROMIPS64R6; + def DATI_MM64R6 : StdMMR6Rel, DATI_MMR6_DESC, DATI_MMR6_ENC, ISA_MICROMIPS64R6; + def DEXT_MM64R6 : StdMMR6Rel, DEXT_MMR6_DESC, DEXT_MMR6_ENC, + ISA_MICROMIPS64R6; + def DEXTM_MM64R6 : StdMMR6Rel, DEXTM_MMR6_DESC, DEXTM_MMR6_ENC, + ISA_MICROMIPS64R6; + def DEXTU_MM64R6 : StdMMR6Rel, DEXTU_MMR6_DESC, DEXTU_MMR6_ENC, + ISA_MICROMIPS64R6; + def DALIGN_MM64R6 : StdMMR6Rel, DALIGN_MMR6_DESC, DALIGN_MMR6_ENC, + ISA_MICROMIPS64R6; + def DDIV_MM64R6 : R6MMR6Rel, DDIV_MM64R6_DESC, DDIV_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DMOD_MM64R6 : R6MMR6Rel, DMOD_MM64R6_DESC, DMOD_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DDIVU_MM64R6 : R6MMR6Rel, DDIVU_MM64R6_DESC, DDIVU_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DMODU_MM64R6 : R6MMR6Rel, DMODU_MM64R6_DESC, DMODU_MM64R6_ENC, + ISA_MICROMIPS64R6; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td new file mode 100644 index 0000000..f11c09a --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -0,0 +1,244 @@ +//===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +class MMDSPInst<string opstr = ""> + : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl { + let InsnPredicates = [HasDSP]; + let AdditionalPredicates = [InMicroMips]; + string BaseOpcode = opstr; + string Arch = "mmdsp"; + let DecoderNamespace = "MicroMips"; +} + +class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, PredicateControl { + let InsnPredicates = [HasDSP]; + let AdditionalPredicates = [InMicroMips]; +} + +class POOL32A_3R_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = rd; + let Inst{10-0} = op; +} + +class POOL32A_2R_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = op; +} + +class POOL32A_2RSA4_FMT<string opstr, bits<12> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11-0} = op; +} + +class 
POOL32A_2RSA3_FMT<string opstr, bits<7> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<3> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-13} = sa; + let Inst{12-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RSA5B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10} = 0b0; + let Inst{9-0} = op; +} + +class POOL32A_2RSA4B0_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11} = 0b0; + let Inst{10-0} = op; +} + +class POOL32A_2RSA4OP6_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM5AC_FMT<string opstr, bits<8> funct> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> imm; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = imm; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10-0} = op; +} + +class POOL32A_1RMEMB0_FMT<string opstr, bits<10> funct> : MMDSPInst<opstr> { + bits<5> index; + bits<5> base; + bits<5> rd; + + let Inst{31-26} = 0; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = funct; +} + +class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RMASK7_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<7> mask; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-14} = mask; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM10_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<10> imm; + + let Inst{31-26} = 0; + let Inst{25-16} = imm; + let Inst{15-11} = rd; + let Inst{10} = 0; + let Inst{9-0} = op; +} + +class POOL32A_1RIMM8_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<8> imm; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-13} = imm; + let Inst{12} = 0; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_4B0SHIFT6AC4B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<6> shift; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-22} = 0b0000; + let Inst{21-16} = shift; + let Inst{15-14} = ac; + let Inst{13-10} = 0b0000; + let Inst{9-0} = op; +} + +class POOL32A_5B01RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = 0b00000; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td new 
file mode 100644 index 0000000..b342e23 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -0,0 +1,528 @@ +//===- MicroMipsDSPInstrInfo.td - Micromips DSP instructions -*- tablegen *-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes MicroMips DSP instructions. +// +//===----------------------------------------------------------------------===// + +// Instruction encoding. +class ADDQ_PH_MM_ENC : POOL32A_3R_FMT<"addq.ph", 0b00000001101>; +class ADDQ_S_PH_MM_ENC : POOL32A_3R_FMT<"addq_s.ph", 0b10000001101>; +class ADDQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"addq_s.w", 0b1100000101>; +class ADDQH_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh.ph", 0b00001001101>; +class ADDQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.ph", 0b10001001101>; +class ADDQH_W_MMR2_ENC: POOL32A_3R_FMT<"addqh.w", 0b00010001101>; +class ADDQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.w", 0b10010001101>; +class ADDU_PH_MMR2_ENC : POOL32A_3R_FMT<"addu.ph", 0b00100001101>; +class ADDU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"addu_s.ph", 0b10100001101>; +class ADDU_QB_MM_ENC : POOL32A_3R_FMT<"addu.qb", 0b00011001101>; +class ADDU_S_QB_MM_ENC : POOL32A_3R_FMT<"addu_s.qb", 0b10011001101>; +class ADDUH_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh.qb", 0b00101001101>; +class ADDUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh_r.qb", 0b10101001101>; +class ADDSC_MM_ENC : POOL32A_3RB0_FMT<"addsc", 0b1110000101>; +class ADDWC_MM_ENC : POOL32A_3RB0_FMT<"addwc", 0b1111000101>; +class DPA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpa.w.ph", 0b00000010>; +class DPAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpaq_s.w.ph", 0b00001010>; +class DPAQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpaq_sa.l.w", 0b01001010>; +class DPAQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_s.w.ph", 0b10001010>; +class DPAQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_sa.w.ph", 0b11001010>; +class DPAU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbl", 0b10000010>; +class DPAU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbr", 0b11000010>; +class DPAX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpax.w.ph", 0b01000010>; +class ABSQ_S_PH_MM_ENC : POOL32A_2R_FMT<"absq_s.ph", 0b0001000100>; +class ABSQ_S_W_MM_ENC : POOL32A_2R_FMT<"absq_s.w", 0b0010000100>; +class ABSQ_S_QB_MMR2_ENC : POOL32A_2R_FMT<"absq_s.qb", 0b0000000100>; +class INSV_MM_ENC : POOL32A_2R_FMT<"insv", 0b0100000100>; +class MADD_DSP_MM_ENC : POOL32A_2RAC_FMT<"madd", 0b00101010>; +class MADDU_DSP_MM_ENC : POOL32A_2RAC_FMT<"maddu", 0b01101010>; +class MSUB_DSP_MM_ENC : POOL32A_2RAC_FMT<"msub", 0b10101010>; +class MSUBU_DSP_MM_ENC : POOL32A_2RAC_FMT<"msubu", 0b11101010>; +class MULT_DSP_MM_ENC : POOL32A_2RAC_FMT<"mult", 0b00110010>; +class MULTU_DSP_MM_ENC : POOL32A_2RAC_FMT<"multu", 0b01110010>; +class SHLL_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll.ph", 0b001110110101>; +class SHLL_S_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll_s.ph", 0b101110110101>; +class SHLL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shll.qb", 0b0100001>; +class SHLLV_PH_MM_ENC : POOL32A_3R_FMT<"shllv.ph", 0b00000001110>; +class SHLLV_S_PH_MM_ENC : POOL32A_3R_FMT<"shllv_s.ph", 0b10000001110>; +class SHLLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shllv.qb", 0b1110010101>; +class SHLLV_S_W_MM_ENC : POOL32A_3RB0_FMT<"shllv_s.w", 0b1111010101>; +class SHLL_S_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shll_s.w", 0b1111110101>; +class SHRA_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra.qb", 0b0000111>; +class SHRA_R_QB_MMR2_ENC : 
POOL32A_2RSA3_FMT<"shra_r.qb", 0b1000111>; +class SHRA_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra.ph", 0b01100110101>; +class SHRA_R_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra_r.ph", 0b11100110101>; +class SHRAV_PH_MM_ENC : POOL32A_3R_FMT<"shrav.ph", 0b00110001101>; +class SHRAV_R_PH_MM_ENC : POOL32A_3R_FMT<"shrav_r.ph", 0b10110001101>; +class SHRAV_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav.qb", 0b00111001101>; +class SHRAV_R_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav_r.qb", 0b10111001101>; +class SHRAV_R_W_MM_ENC : POOL32A_3RB0_FMT<"shrav_r.w", 0b1011010101>; +class SHRA_R_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shra_r.w", 0b1011110101>; +class SHRL_PH_MMR2_ENC : POOL32A_2RSA4OP6_FMT<"shrl.ph", 0b001111>; +class SHRL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shrl.qb", 0b1100001>; +class SHRLV_PH_MMR2_ENC : POOL32A_3RB0_FMT<"shrlv.ph", 0b1100010101>; +class SHRLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shrlv.qb", 0b1101010101>; +class PRECEQ_W_PHL_MM_ENC : POOL32A_2R_FMT<"preceq.w.phl", 0b0101000100>; +class PRECEQ_W_PHR_MM_ENC : POOL32A_2R_FMT<"preceq.w.phr", 0b0110000100>; +class PRECEQU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbl", 0b0111000100>; +class PRECEQU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbla", 0b0111001100>; +class PRECEQU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbr", 0b1001000100>; +class PRECEQU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbra", 0b1001001100>; +class PRECEU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbl", 0b1011000100>; +class PRECEU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbla", 0b1011001100>; +class PRECEU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbr", 0b1101000100>; +class PRECEU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbra", 0b1101001100>; +class SUBQ_PH_MM_ENC : POOL32A_3R_FMT<"subq.ph", 0b01000001101>; +class SUBQ_S_PH_MM_ENC : POOL32A_3R_FMT<"subq_s.ph", 0b11000001101>; +class SUBQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"subq_s.w", 0b1101000101>; +class SUBQH_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh.ph", 0b01001001101>; +class SUBQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.ph", 0b11001001101>; +class SUBQH_W_MMR2_ENC : POOL32A_3R_FMT<"subqh.w", 0b01010001101>; +class SUBQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.w", 0b11010001101>; +class SUBU_PH_MMR2_ENC : POOL32A_3R_FMT<"subu.ph", 0b01100001101>; +class SUBU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"subu_s.ph", 0b11100001101>; +class SUBU_QB_MM_ENC : POOL32A_3R_FMT<"subu.qb", 0b01011001101>; +class SUBU_S_QB_MM_ENC : POOL32A_3R_FMT<"subu_s.qb", 0b11011001101>; +class SUBUH_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh.qb", 0b01101001101>; +class SUBUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh_r.qb", 0b11101001101>; +class EXTP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extp", 0b10011001>; +class EXTPDP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extpdp", 0b11011001>; +class EXTPDPV_MM_ENC : POOL32A_2RAC_FMT<"extpdpv", 0b11100010>; +class EXTPV_MM_ENC : POOL32A_2RAC_FMT<"extpv", 0b10100010>; +class EXTR_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr.w", 0b00111001>; +class EXTR_R_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_r.w", 0b01111001>; +class EXTR_RS_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_rs.w", 0b10111001>; +class EXTR_S_H_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_s.h", 0b11111001>; +class EXTRV_W_MM_ENC : POOL32A_2RAC_FMT<"extrv.w", 0b00111010>; +class EXTRV_R_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_r.w", 0b01111010>; +class EXTRV_RS_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_rs.w", 0b10111010>; +class EXTRV_S_H_MM_ENC : POOL32A_2RAC_FMT<"extrv_s.h", 0b11111010>; +class DPS_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dps.w.ph", 0b00010010>; +class DPSQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpsq_s.w.ph", 0b00011010>; +class 
DPSQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpsq_sa.l.w", 0b01011010>;
+class DPSQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_s.w.ph", 0b10011010>;
+class DPSQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_sa.w.ph", 0b11011010>;
+class DPSU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbl", 0b10010010>;
+class DPSU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbr", 0b11010010>;
+class DPSX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsx.w.ph", 0b01010010>;
+class MUL_PH_MMR2_ENC : POOL32A_3R_FMT<"mul.ph", 0b00000101101>;
+class MUL_S_PH_MMR2_ENC : POOL32A_3R_FMT<"mul_s.ph", 0b10000101101>;
+class MULEQ_S_W_PHL_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phl", 0b0000100101>;
+class MULEQ_S_W_PHR_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phr", 0b0001100101>;
+class MULEU_S_PH_QBL_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbl", 0b0010010101>;
+class MULEU_S_PH_QBR_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbr", 0b0011010101>;
+class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>;
+class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>;
+class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>;
+class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>;
+class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>;
+class PRECR_SRA_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>;
+class PRECR_SRA_R_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>;
+class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>;
+class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>;
+class PRECRQU_S_QB_PH_MM_ENC
+    : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>;
+class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>;
+class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>;
+class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>;
+class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>;
+class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>;
+class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>;
+class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>;
+class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>;
+class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>;
+class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>;
+class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>;
+class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mtlo", 0b11000001>;
+class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>;
+class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>;
+class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>;
+class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>;
+class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>;
+class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>;
+class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>;
+class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>;
+class PACKRL_PH_MM_ENC : POOL32A_3RB0_FMT<"packrl.ph", 0b0110101101>;
+class PICK_PH_MM_ENC : POOL32A_3RB0_FMT<"pick.ph", 0b1000101101>;
+class PICK_QB_MM_ENC : POOL32A_3RB0_FMT<"pick.qb", 0b0111101101>;
+class SHILO_MM_ENC : POOL32A_4B0SHIFT6AC4B0_FMT<"shilo", 0b0000011101>;
+class SHILOV_MM_ENC : POOL32A_5B01RAC_FMT<"shilov", 0b01001001>;
+class WRDSP_MM_ENC : POOL32A_1RMASK7_FMT<"wrdsp", 0b01011001>;
+
+// Instruction desc.
+class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode, + InstrItinClass itin, RegisterOperand ROD, + RegisterOperand ROS = ROD> { + dag OutOperandList = (outs ROD:$rt); + dag InOperandList = (ins ROS:$rs); + string AsmString = !strconcat(opstr, "\t$rt, $rs"); + list<dag> Pattern = [(set ROD:$rt, (OpNode ROS:$rs))]; + InstrItinClass Itinerary = itin; +} +class ABSQ_S_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.ph", int_mips_absq_s_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>; +class ABSQ_S_W_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.w", int_mips_absq_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag20]>; +class ABSQ_S_QB_MMR2_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.qb", int_mips_absq_s_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>; +class PRECEQ_W_PHL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceq.w.phl", int_mips_preceq_w_phl, NoItinerary, GPR32Opnd, DSPROpnd>; +class PRECEQ_W_PHR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceq.w.phr", int_mips_preceq_w_phr, NoItinerary, GPR32Opnd, DSPROpnd>; +class PRECEQU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbl", int_mips_precequ_ph_qbl, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbla", int_mips_precequ_ph_qbla, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbr", int_mips_precequ_ph_qbr, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbra", int_mips_precequ_ph_qbra, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbl", int_mips_preceu_ph_qbl, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbla", int_mips_preceu_ph_qbla, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbr", int_mips_preceu_ph_qbr, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbra", int_mips_preceu_ph_qbra, NoItinerary, DSPROpnd>; + +class SHLL_R2_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + SDPatternOperator ImmPat, InstrItinClass itin, + RegisterOperand RO, Operand ImmOpnd> { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins RO:$rs, ImmOpnd:$sa); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); + list<dag> Pattern = [(set RO:$rt, (OpNode RO:$rs, ImmPat:$sa))]; + InstrItinClass Itinerary = itin; + bit hasSideEffects = 1; +} +class SHLL_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>, + Defs<[DSPOutFlag22]>; +class SHLL_S_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll_s.ph", int_mips_shll_s_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>, + Defs<[DSPOutFlag22]>; +class SHLL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>, + Defs<[DSPOutFlag22]>; +class SHLL_S_W_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll_s.w", int_mips_shll_s_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>, + Defs<[DSPOutFlag22]>; +class SHRA_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shra.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRA_R_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.qb", int_mips_shra_r_qb, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRA_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>; +class SHRA_R_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.ph", int_mips_shra_r_ph, immZExt4, NoItinerary, DSPROpnd, 
uimm4>; +class SHRA_R_W_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.w", int_mips_shra_r_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>; +class SHRL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shrl.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRL_PH_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shrl.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>; + +class SHLLV_R3_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin, RegisterOperand RO> { + dag OutOperandList = (outs RO:$rd); + dag InOperandList = (ins RO:$rt, GPR32Opnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs"); + list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))]; + InstrItinClass Itinerary = itin; +} +class SHLLV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv.ph", int_mips_shll_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>; +class SHLLV_S_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv_s.ph", int_mips_shll_s_ph, NoItinerary, DSPROpnd>, + Defs<[DSPOutFlag22]>; +class SHLLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv.qb", int_mips_shll_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>; +class SHLLV_S_W_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv_s.w", int_mips_shll_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag22]>; +class SHRAV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>; +class SHRAV_R_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>; +class SHRAV_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>; +class SHRAV_R_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>; +class SHRAV_R_W_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>; +class SHRLV_PH_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; +class SHRLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>; + +class EXT_MM_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $rs"); + InstrItinClass Itinerary = itin; +} +class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm"); + InstrItinClass Itinerary = itin; +} + +class EXTP_MM_DESC + : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPEFI]>; +class EXTPDP_MM_DESC + : EXT_MM_1R_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>; +class EXTPDPV_MM_DESC + : EXT_MM_2R_DESC_BASE<"extpdpv", MipsEXTPDP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>; +class EXTPV_MM_DESC + : EXT_MM_2R_DESC_BASE<"extpv", MipsEXTP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPEFI]>; +class EXTR_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_R_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_r.w", MipsEXTR_R_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_RS_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_S_H_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_s.h", MipsEXTR_S_H, NoItinerary>, + Defs<[DSPOutFlag23]>; +class 
EXTRV_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv.w", MipsEXTR_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_R_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_r.w", MipsEXTR_R_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_RS_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_S_H_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>, + Defs<[DSPOutFlag23]>; + +class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rs); + dag InOperandList = (ins RO:$ac); + string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); + list<dag> Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))]; + InstrItinClass Itinerary = itin; +} + +class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, + NoItinerary>; +class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, + NoItinerary>; + +class RADDU_W_QB_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins DSPROpnd:$rs); + string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs"); + list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))]; + InstrItinClass Itinerary = NoItinerary; + string BaseOpcode = "raddu.w.qb"; +} + +class RDDSP_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$mask); + string AsmString = !strconcat("rddsp", "\t$rt, $mask"); + list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPL_QB_MM_DESC { + dag OutOperandList = (outs DSPROpnd:$rt); + dag InOperandList = (ins uimm16:$imm); + string AsmString = !strconcat("repl.qb", "\t$rt, $imm"); + list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph, + NoItinerary, DSPROpnd, + GPR32Opnd>; +class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb, + NoItinerary, DSPROpnd, + GPR32Opnd>; + +class WRDSP_MM_DESC { + dag OutOperandList = (outs); + dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask); + string AsmString = !strconcat("wrdsp", "\t$rt, $mask"); + list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)]; + InstrItinClass Itinerary = NoItinerary; +} + +// Instruction defs. 
+// microMIPS DSP Rev 1 +def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC; +def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC; +def ADDQ_S_W_MM : DspMMRel, ADDQ_S_W_MM_ENC, ADDQ_S_W_DESC; +def ADDU_QB_MM : DspMMRel, ADDU_QB_MM_ENC, ADDU_QB_DESC; +def ADDU_S_QB_MM : DspMMRel, ADDU_S_QB_MM_ENC, ADDU_S_QB_DESC; +def ADDSC_MM : DspMMRel, ADDSC_MM_ENC, ADDSC_DESC; +def ADDWC_MM : DspMMRel, ADDWC_MM_ENC, ADDWC_DESC; +def DPAQ_S_W_PH_MM : DspMMRel, DPAQ_S_W_PH_MM_ENC, DPAQ_S_W_PH_DESC; +def DPAQ_SA_L_W_MM : DspMMRel, DPAQ_SA_L_W_MM_ENC, DPAQ_SA_L_W_DESC; +def DPAU_H_QBL_MM : DspMMRel, DPAU_H_QBL_MM_ENC, DPAU_H_QBL_DESC; +def DPAU_H_QBR_MM : DspMMRel, DPAU_H_QBR_MM_ENC, DPAU_H_QBR_DESC; +def ABSQ_S_PH_MM : DspMMRel, ABSQ_S_PH_MM_ENC, ABSQ_S_PH_MM_DESC; +def ABSQ_S_W_MM : DspMMRel, ABSQ_S_W_MM_ENC, ABSQ_S_W_MM_DESC; +def INSV_MM : DspMMRel, INSV_MM_ENC, INSV_DESC; +def MADD_DSP_MM : DspMMRel, MADD_DSP_MM_ENC, MADD_DSP_DESC; +def MADDU_DSP_MM : DspMMRel, MADDU_DSP_MM_ENC, MADDU_DSP_DESC; +def MSUB_DSP_MM : DspMMRel, MSUB_DSP_MM_ENC, MSUB_DSP_DESC; +def MSUBU_DSP_MM : DspMMRel, MSUBU_DSP_MM_ENC, MSUBU_DSP_DESC; +def MULT_DSP_MM : DspMMRel, MULT_DSP_MM_ENC, MULT_DSP_DESC; +def MULTU_DSP_MM : DspMMRel, MULTU_DSP_MM_ENC, MULTU_DSP_DESC; +def SHLL_PH_MM : DspMMRel, SHLL_PH_MM_ENC, SHLL_PH_MM_DESC; +def SHLL_S_PH_MM : DspMMRel, SHLL_S_PH_MM_ENC, SHLL_S_PH_MM_DESC; +def SHLL_QB_MM : DspMMRel, SHLL_QB_MM_ENC, SHLL_QB_MM_DESC; +def SHLLV_PH_MM : DspMMRel, SHLLV_PH_MM_ENC, SHLLV_PH_MM_DESC; +def SHLLV_S_PH_MM : DspMMRel, SHLLV_S_PH_MM_ENC, SHLLV_S_PH_MM_DESC; +def SHLLV_QB_MM : DspMMRel, SHLLV_QB_MM_ENC, SHLLV_QB_MM_DESC; +def SHLLV_S_W_MM : DspMMRel, SHLLV_S_W_MM_ENC, SHLLV_S_W_MM_DESC; +def SHLL_S_W_MM : DspMMRel, SHLL_S_W_MM_ENC, SHLL_S_W_MM_DESC; +def SHRA_PH_MM : DspMMRel, SHRA_PH_MM_ENC, SHRA_PH_MM_DESC; +def SHRA_R_PH_MM : DspMMRel, SHRA_R_PH_MM_ENC, SHRA_R_PH_MM_DESC; +def SHRAV_PH_MM : DspMMRel, SHRAV_PH_MM_ENC, SHRAV_PH_MM_DESC; +def SHRAV_R_PH_MM : DspMMRel, SHRAV_R_PH_MM_ENC, SHRAV_R_PH_MM_DESC; +def SHRAV_R_W_MM : DspMMRel, SHRAV_R_W_MM_ENC, SHRAV_R_W_MM_DESC; +def SHRA_R_W_MM : DspMMRel, SHRA_R_W_MM_ENC, SHRA_R_W_MM_DESC; +def SHRL_QB_MM : DspMMRel, SHRL_QB_MM_ENC, SHRL_QB_MM_DESC; +def SHRLV_QB_MM : DspMMRel, SHRLV_QB_MM_ENC, SHRLV_QB_MM_DESC; +def PRECEQ_W_PHL_MM : DspMMRel, PRECEQ_W_PHL_MM_ENC, PRECEQ_W_PHL_MM_DESC; +def PRECEQ_W_PHR_MM : DspMMRel, PRECEQ_W_PHR_MM_ENC, PRECEQ_W_PHR_MM_DESC; +def PRECEQU_PH_QBL_MM : DspMMRel, PRECEQU_PH_QBL_MM_ENC, PRECEQU_PH_QBL_MM_DESC; +def PRECEQU_PH_QBLA_MM : DspMMRel, PRECEQU_PH_QBLA_MM_ENC, + PRECEQU_PH_QBLA_MM_DESC; +def PRECEQU_PH_QBR_MM : DspMMRel, PRECEQU_PH_QBR_MM_ENC, PRECEQU_PH_QBR_MM_DESC; +def PRECEQU_PH_QBRA_MM : DspMMRel, PRECEQU_PH_QBRA_MM_ENC, + PRECEQU_PH_QBRA_MM_DESC; +def PRECEU_PH_QBL_MM : DspMMRel, PRECEU_PH_QBL_MM_ENC, PRECEU_PH_QBL_MM_DESC; +def PRECEU_PH_QBLA_MM : DspMMRel, PRECEU_PH_QBLA_MM_ENC, PRECEU_PH_QBLA_MM_DESC; +def PRECEU_PH_QBR_MM : DspMMRel, PRECEU_PH_QBR_MM_ENC, PRECEU_PH_QBR_MM_DESC; +def PRECEU_PH_QBRA_MM : DspMMRel, PRECEU_PH_QBRA_MM_ENC, PRECEU_PH_QBRA_MM_DESC; +def SUBQ_PH_MM : DspMMRel, SUBQ_PH_MM_ENC, SUBQ_PH_DESC; +def SUBQ_S_PH_MM : DspMMRel, SUBQ_S_PH_MM_ENC, SUBQ_S_PH_DESC; +def SUBQ_S_W_MM : DspMMRel, SUBQ_S_W_MM_ENC, SUBQ_S_W_DESC; +def SUBU_QB_MM : DspMMRel, SUBU_QB_MM_ENC, SUBU_QB_DESC; +def SUBU_S_QB_MM : DspMMRel, SUBU_S_QB_MM_ENC, SUBU_S_QB_DESC; +def EXTP_MM : DspMMRel, EXTP_MM_ENC, EXTP_MM_DESC; +def EXTPDP_MM : DspMMRel, EXTPDP_MM_ENC, EXTPDP_MM_DESC; +def EXTPDPV_MM : DspMMRel, 
EXTPDPV_MM_ENC, EXTPDPV_MM_DESC; +def EXTPV_MM : DspMMRel, EXTPV_MM_ENC, EXTPV_MM_DESC; +def EXTR_W_MM : DspMMRel, EXTR_W_MM_ENC, EXTR_W_MM_DESC; +def EXTR_R_W_MM : DspMMRel, EXTR_R_W_MM_ENC, EXTR_R_W_MM_DESC; +def EXTR_RS_W_MM : DspMMRel, EXTR_RS_W_MM_ENC, EXTR_RS_W_MM_DESC; +def EXTR_S_H_MM : DspMMRel, EXTR_S_H_MM_ENC, EXTR_S_H_MM_DESC; +def EXTRV_W_MM : DspMMRel, EXTRV_W_MM_ENC, EXTRV_W_MM_DESC; +def EXTRV_R_W_MM : DspMMRel, EXTRV_R_W_MM_ENC, EXTRV_R_W_MM_DESC; +def EXTRV_RS_W_MM : DspMMRel, EXTRV_RS_W_MM_ENC, EXTRV_RS_W_MM_DESC; +def EXTRV_S_H_MM : DspMMRel, EXTRV_S_H_MM_ENC, EXTRV_S_H_MM_DESC; +def DPSQ_S_W_PH_MM : DspMMRel, DPSQ_S_W_PH_MM_ENC, DPSQ_S_W_PH_DESC; +def DPSQ_SA_L_W_MM : DspMMRel, DPSQ_SA_L_W_MM_ENC, DPSQ_SA_L_W_DESC; +def DPSU_H_QBL_MM : DspMMRel, DPSU_H_QBL_MM_ENC, DPSU_H_QBL_DESC; +def DPSU_H_QBR_MM : DspMMRel, DPSU_H_QBR_MM_ENC, DPSU_H_QBR_DESC; +def MULEQ_S_W_PHL_MM : DspMMRel, MULEQ_S_W_PHL_MM_ENC, MULEQ_S_W_PHL_DESC; +def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC; +def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC; +def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC; +def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC; +def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC; +def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC; +def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC; +def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC; +def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC; +def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC; +def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC; +def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC; +def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC; +def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC; +def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC; +def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC; +def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC; +def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC; +def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC; +def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC; +def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC; +def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC; +def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC; +def PACKRL_PH_MM : DspMMRel, PACKRL_PH_MM_ENC, PACKRL_PH_DESC; +def PICK_PH_MM : DspMMRel, PICK_PH_MM_ENC, PICK_PH_DESC; +def PICK_QB_MM : DspMMRel, PICK_QB_MM_ENC, PICK_QB_DESC; +def SHILO_MM : DspMMRel, SHILO_MM_ENC, SHILO_DESC; +def SHILOV_MM : DspMMRel, SHILOV_MM_ENC, SHILOV_DESC; +def WRDSP_MM : DspMMRel, WRDSP_MM_ENC, WRDSP_MM_DESC; +// microMIPS DSP Rev 2 +def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC, + ISA_DSPR2; +def ADDQH_PH_MMR2 : DspMMRel, ADDQH_PH_MMR2_ENC, ADDQH_PH_DESC, ISA_DSPR2; +def ADDQH_R_PH_MMR2 : DspMMRel, ADDQH_R_PH_MMR2_ENC, ADDQH_R_PH_DESC, ISA_DSPR2; +def ADDQH_W_MMR2 : DspMMRel, ADDQH_W_MMR2_ENC, ADDQH_W_DESC, ISA_DSPR2; +def ADDQH_R_W_MMR2 : DspMMRel, ADDQH_R_W_MMR2_ENC, ADDQH_R_W_DESC, ISA_DSPR2; +def ADDU_PH_MMR2 : DspMMRel, ADDU_PH_MMR2_ENC, ADDU_PH_DESC, ISA_DSPR2; +def ADDU_S_PH_MMR2 : DspMMRel, ADDU_S_PH_MMR2_ENC, ADDU_S_PH_DESC, ISA_DSPR2; +def 
ADDUH_QB_MMR2 : DspMMRel, ADDUH_QB_MMR2_ENC, ADDUH_QB_DESC, ISA_DSPR2; +def ADDUH_R_QB_MMR2 : DspMMRel, ADDUH_R_QB_MMR2_ENC, ADDUH_R_QB_DESC, ISA_DSPR2; +def DPA_W_PH_MMR2 : DspMMRel, DPA_W_PH_MMR2_ENC, DPA_W_PH_DESC, ISA_DSPR2; +def DPAQX_S_W_PH_MMR2 : DspMMRel, DPAQX_S_W_PH_MMR2_ENC, DPAQX_S_W_PH_DESC, + ISA_DSPR2; +def DPAQX_SA_W_PH_MMR2 : DspMMRel, DPAQX_SA_W_PH_MMR2_ENC, DPAQX_SA_W_PH_DESC, + ISA_DSPR2; +def DPAX_W_PH_MMR2 : DspMMRel, DPAX_W_PH_MMR2_ENC, DPAX_W_PH_DESC, ISA_DSPR2; +def SHRA_QB_MMR2 : DspMMRel, SHRA_QB_MMR2_ENC, SHRA_QB_MMR2_DESC, ISA_DSPR2; +def SHRA_R_QB_MMR2 : DspMMRel, SHRA_R_QB_MMR2_ENC, SHRA_R_QB_MMR2_DESC, + ISA_DSPR2; +def SHRAV_QB_MMR2 : DspMMRel, SHRAV_QB_MMR2_ENC, SHRAV_QB_MMR2_DESC, ISA_DSPR2; +def SHRAV_R_QB_MMR2 : DspMMRel, SHRAV_R_QB_MMR2_ENC, SHRAV_R_QB_MMR2_DESC, + ISA_DSPR2; +def SHRL_PH_MMR2 : DspMMRel, SHRL_PH_MMR2_ENC, SHRL_PH_MMR2_DESC, ISA_DSPR2; +def SHRLV_PH_MMR2 : DspMMRel, SHRLV_PH_MMR2_ENC, SHRLV_PH_MMR2_DESC, ISA_DSPR2; +def SUBQH_PH_MMR2 : DspMMRel, SUBQH_PH_MMR2_ENC, SUBQH_PH_DESC, ISA_DSPR2; +def SUBQH_R_PH_MMR2 : DspMMRel, SUBQH_R_PH_MMR2_ENC, SUBQH_R_PH_DESC, ISA_DSPR2; +def SUBQH_W_MMR2 : DspMMRel, SUBQH_W_MMR2_ENC, SUBQH_W_DESC, ISA_DSPR2; +def SUBQH_R_W_MMR2 : DspMMRel, SUBQH_R_W_MMR2_ENC, SUBQH_R_W_DESC, ISA_DSPR2; +def SUBU_PH_MMR2 : DspMMRel, SUBU_PH_MMR2_ENC, SUBU_PH_DESC, ISA_DSPR2; +def SUBU_S_PH_MMR2 : DspMMRel, SUBU_S_PH_MMR2_ENC, SUBU_S_PH_DESC, ISA_DSPR2; +def SUBUH_QB_MMR2 : DspMMRel, SUBUH_QB_MMR2_ENC, SUBUH_QB_DESC, ISA_DSPR2; +def SUBUH_R_QB_MMR2 : DspMMRel, SUBUH_R_QB_MMR2_ENC, SUBUH_R_QB_DESC, ISA_DSPR2; +def DPS_W_PH_MMR2 : DspMMRel, DPS_W_PH_MMR2_ENC, DPS_W_PH_DESC, ISA_DSPR2; +def DPSQX_S_W_PH_MMR2 : DspMMRel, DPSQX_S_W_PH_MMR2_ENC, DPSQX_S_W_PH_DESC, + ISA_DSPR2; +def DPSQX_SA_W_PH_MMR2 : DspMMRel, DPSQX_SA_W_PH_MMR2_ENC, DPSQX_SA_W_PH_DESC, + ISA_DSPR2; +def DPSX_W_PH_MMR2 : DspMMRel, DPSX_W_PH_MMR2_ENC, DPSX_W_PH_DESC, ISA_DSPR2; +def MUL_PH_MMR2 : DspMMRel, MUL_PH_MMR2_ENC, MUL_PH_DESC, ISA_DSPR2; +def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2; +def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2; +def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2; +def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC, + ISA_DSPR2; +def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC, + PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC, + PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2; + +// Instruction alias. 
+def : MMDSPInstAlias<"wrdsp $rt", (WRDSP_MM GPR32Opnd:$rt, 0x1F), 1>; diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td index 004b0d5..756e6c9 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td @@ -37,23 +37,14 @@ def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM_MM<1>; -def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>, +def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, II_BC1F, MIPS_BRANCH_F>, BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6; -def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>, +def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>, BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; - -def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, - ROUND_W_FM_MM<0, 0x6c>; def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ROUND_W_FM_MM<0, 0x24>; -def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, - ROUND_W_FM_MM<0, 0x2c>; -def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ROUND_W_FM_MM<0, 0xec>; -def TRUNC_W_S_MM : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, - ROUND_W_FM_MM<0, 0xac>; -def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, - fsqrt>, ROUND_W_FM_MM<0, 0x28>; def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>, ROUND_W_FM_MM<1, 0x6c>; @@ -61,7 +52,7 @@ def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ROUND_W_FM_MM<1, 0x24>; def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>, ROUND_W_FM_MM<1, 0x2c>; -def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, +def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, ROUND_W_FM_MM<1, 0xec>; def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>, ROUND_W_FM_MM<1, 0xac>; @@ -146,3 +137,14 @@ def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>, def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>, MADDS_FM_MM<0x2a>; } + +let AdditionalPredicates = [InMicroMips] in { + def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, + II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>; + def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>, ROUND_W_FM_MM<0, 0xac>; + def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, + ROUND_W_FM_MM<0, 0x6c>; + def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, + fsqrt>, ROUND_W_FM_MM<0, 0x28>; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td index 560afa4..b736367 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td @@ -389,6 +389,22 @@ class LW_FM_MM<bits<6> op> : MMArch { let Inst{15-0} = addr{15-0}; } +class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; 
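// Worked illustration (using the parameters LBE_MM passes to this class
// later in this patch: op = 0x18, fmt = 0x6, funct = 0x4) of how the full
// word packs, once the remaining field assignments below are applied:
//   Inst{31-26} = 0b011000, Inst{25-21} = rt, Inst{20-16} = base,
//   Inst{15-12} = 0b0110, Inst{11-9} = 0b100, Inst{8-0} = offset
// This leaves only a 9-bit offset for these EVA forms, versus the 12 bits
// available to the mem_mm_12-based encodings elsewhere in this file.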
+ let Inst{15-12} = fmt; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + class LWL_FM_MM<bits<4> funct> { bits<5> rt; bits<21> addr; @@ -402,6 +418,22 @@ class LWL_FM_MM<bits<4> funct> { let Inst{11-0} = addr{11-0}; } +class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0x18; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = type; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + class CMov_F_I_FM_MM<bits<7> func> : MMArch { bits<5> rd; bits<5> rs; @@ -655,6 +687,22 @@ class LL_FM_MM<bits<4> funct> { let Inst{11-0} = addr{11-0}; } +class LLE_FM_MM<bits<4> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0x18; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = funct; + let Inst{11-9} = 0x6; + let Inst{8-0} = offset; +} + class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch { bits<5> ft; bits<5> fs; @@ -895,7 +943,7 @@ class LWM_FM_MM<bits<4> funct> : MMArch { let Inst{11-0} = addr{11-0}; } -class LWM_FM_MM16<bits<4> funct> : MMArch { +class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl { bits<2> rt; bits<4> addr; @@ -922,6 +970,37 @@ class CACHE_PREF_FM_MM<bits<6> op, bits<4> funct> : MMArch { let Inst{11-0} = offset; } +class CACHE_PREFE_FM_MM<bits<6> op, bits<3> funct> : MMArch { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = hint; + let Inst{20-16} = base; + let Inst{15-12} = 0xA; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class POOL32F_PREFX_FM_MM<bits<6> op, bits<9> funct> : MMArch { + bits<5> index; + bits<5> base; + bits<5> hint; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = hint; + let Inst{10-9} = 0x0; + let Inst{8-0} = funct; +} + class BARRIER_FM_MM<bits<5> op> : MMArch { bits<32> Inst; diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td index 3939384..99f0f44 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td @@ -13,11 +13,6 @@ def simm12 : Operand<i32> { let DecoderMethod = "DecodeSimm12"; } -def uimm5_lsl2 : Operand<OtherVT> { - let EncoderMethod = "getUImm5Lsl2Encoding"; - let DecoderMethod = "DecodeUImm5lsl2"; -} - def uimm6_lsl2 : Operand<i32> { let EncoderMethod = "getUImm6Lsl2Encoding"; let DecoderMethod = "DecodeUImm6Lsl2"; @@ -30,6 +25,7 @@ def simm9_addiusp : Operand<i32> { def uimm3_shift : Operand<i32> { let EncoderMethod = "getUImm3Mod8Encoding"; + let DecoderMethod = "DecodePOOL16BEncodedField"; } def simm3_lsa2 : Operand<i32> { @@ -105,6 +101,14 @@ def mem_mm_gp_imm7_lsl2 : Operand<i32> { let EncoderMethod = "getMemEncodingMMGPImm7Lsl2"; } +def mem_mm_9 : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops GPR32, simm9); + let EncoderMethod = "getMemEncodingMMImm9"; + let ParserMatchClass = MipsMemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + def mem_mm_12 : Operand<i32> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops GPR32, simm12); @@ -113,6 +117,14 @@ def mem_mm_12 : Operand<i32> { let OperandType = "OPERAND_MEMORY"; } +def mem_mm_16 : Operand<i32> { + let PrintMethod = "printMemOperand"; 
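// Like mem_mm_9 above, mem_mm_16 pairs a GPR32 base register with a signed
// immediate of the width its name advertises and names the matching custom
// encoder; only the immediate operand and the EncoderMethod differ between
// the mem_mm_9, mem_mm_12 and mem_mm_16 variants.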
+ let MIOperandInfo = (ops GPR32, simm16); + let EncoderMethod = "getMemEncodingMMImm16"; + let ParserMatchClass = MipsMemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + def MipsMemUimm4AsmOperand : AsmOperandClass { let Name = "MemOffsetUimm4"; let SuperClasses = [MipsMemAsmOperand]; @@ -166,7 +178,7 @@ def simm23_lsl2 : Operand<i32> { class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZC, FrmI> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 0; @@ -251,6 +263,13 @@ class LLBaseMM<string opstr, RegisterOperand RO> : let mayLoad = 1; } +class LLEBaseMM<string opstr, RegisterOperand RO> : + InstSE<(outs RO:$rt), (ins mem_mm_12:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { + let DecoderMethod = "DecodeMemMMImm9"; + let mayLoad = 1; +} + class SCBaseMM<string opstr, RegisterOperand RO> : InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr), !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { @@ -259,6 +278,14 @@ class SCBaseMM<string opstr, RegisterOperand RO> : let Constraints = "$rt = $dst"; } +class SCEBaseMM<string opstr, RegisterOperand RO> : + InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { + let DecoderMethod = "DecodeMemMMImm9"; + let mayStore = 1; + let Constraints = "$rt = $dst"; +} + class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, InstrItinClass Itin = NoItinerary> : InstSE<(outs RO:$rt), (ins mem_mm_12:$addr), @@ -392,7 +419,7 @@ class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> : // 16-bit Jump and Link (Call) class JumpLinkRegMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [(MipsJmpLink RO:$rs)], IIBranch, FrmR> { + [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl { let isCall = 1; let hasDelaySlot = 1; let Defs = [RA]; @@ -401,7 +428,7 @@ class JumpLinkRegMM16<string opstr, RegisterOperand RO> : // 16-bit Jump Reg class JumpRegMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JR, FrmR> { let hasDelaySlot = 1; let isBranch = 1; let isIndirectBranch = 1; @@ -410,7 +437,7 @@ class JumpRegMM16<string opstr, RegisterOperand RO> : // Base class for JRADDIUSP instruction. 
class JumpRAddiuStackMM16 : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jraddiusp\t$imm", - [], IIBranch, FrmR> { + [], II_JRADDIUSP, FrmR> { let isTerminator = 1; let isBarrier = 1; let isBranch = 1; @@ -420,7 +447,7 @@ class JumpRAddiuStackMM16 : // 16-bit Jump and Link (Call) - Short Delay Slot class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JALRS, FrmR> { let isCall = 1; let hasDelaySlot = 1; let Defs = [RA]; @@ -429,7 +456,7 @@ class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : // 16-bit Jump Register Compact - No delay slot class JumpRegCMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JRC, FrmR> { let isTerminator = 1; let isBarrier = 1; let isBranch = 1; @@ -444,7 +471,7 @@ class BrkSdbbp16MM<string opstr> : class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZ, FrmI> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; @@ -455,18 +482,18 @@ class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> : let isCall = 1, hasDelaySlot = 1, Defs = [RA] in { class JumpLinkMM<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [], IIBranch, FrmJ, opstr> { + [], II_JALS, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTargetMM"; } class JumpLinkRegMM<string opstr, RegisterOperand RO>: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [], IIBranch, FrmR>; + [], II_JALRS, FrmR>; class BranchCompareToZeroLinkMM<string opstr, DAGOperand opnd, RegisterOperand RO> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>; + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZALS, FrmI, opstr>; } class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, @@ -475,6 +502,10 @@ class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index), !strconcat(opstr, "\t$rd, ${index}(${base})"), [], Itin, FrmFI>; +class PrefetchIndexed<string opstr> : + InstSE<(outs), (ins PtrRC:$base, PtrRC:$index, uimm5:$hint), + !strconcat(opstr, "\t$hint, ${index}(${base})"), [], NoItinerary, FrmOther>; + class AddImmUPC<string opstr, RegisterOperand RO> : InstSE<(outs RO:$rs), (ins simm23_lsl2:$imm), !strconcat(opstr, "\t$rs, $imm"), [], NoItinerary, FrmR>; @@ -543,7 +574,7 @@ class LoadMultMM16<string opstr, class UncondBranchMM16<string opstr> : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), !strconcat(opstr, "\t$offset"), - [], IIBranch, FrmI> { + [], II_B, FrmI> { let isBranch = 1; let isTerminator = 1; let isBarrier = 1; @@ -553,21 +584,24 @@ class UncondBranchMM16<string opstr> : } def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>, - ARITH_FM_MM16<0>; -def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, - ARITH_FM_MM16<1>; -def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>; + ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6; def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>, - LOGIC_FM_MM16<0x2>; -def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, - LOGIC_FM_MM16<0x3>; -def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, - 
LOGIC_FM_MM16<0x1>; -def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>; + LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6_64R6; +def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>, + ISA_MICROMIPS_NOT_32R6_64R6; +def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>, + ISA_MICROMIPS_NOT_32R6_64R6; +def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>, + ISA_MICROMIPS_NOT_32R6_64R6; def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>, - SHIFT_FM_MM16<0>; + SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6; def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>, - SHIFT_FM_MM16<1>; + SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6; + +def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, + ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6; +def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, + LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6_64R6; def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU, mem_mm_4>, LOAD_STORE_FM_MM16<0x02>; def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU, @@ -597,7 +631,8 @@ def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>; def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16; def LI16_MM : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, LI_FM_MM16, IsAsCheapAsAMove; -def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>; +def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>, + ISA_MICROMIPS32_NOT_MIPS32R6; def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>; def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>; def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>; @@ -607,8 +642,18 @@ def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>, def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>, BEQNEZ_FM_MM16<0x2b>; def B16_MM : UncondBranchMM16<"b16">, B16_FM; -def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>; -def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>; +def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>, + ISA_MICROMIPS_NOT_32R6_64R6; +def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>, + ISA_MICROMIPS_NOT_32R6_64R6; + +let DecoderNamespace = "MicroMips" in { + /// Load and Store Instructions - multiple + def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>, + ISA_MICROMIPS32_NOT_MIPS32R6; + def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>, + ISA_MICROMIPS32_NOT_MIPS32R6; +} class WaitMM<string opstr> : InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [], @@ -701,6 +746,18 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def SW_MM : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>; } + let DecoderMethod = "DecodeMemMMImm9" in { + def LBE_MM : Load<"lbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>; + def LBuE_MM : Load<"lbue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>; + def LHE_MM : Load<"lhe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>; + def LHuE_MM : Load<"lhue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>; + def LWE_MM : Load<"lwe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>; + def SBE_MM : Store<"sbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>; + def SHE_MM : Store<"she", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>; + def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9gpr>, + POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>; + } + def LWXS_MM : 
LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>; def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>; @@ -714,12 +771,20 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { LWL_FM_MM<0x8>; def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>, LWL_FM_MM<0x9>; + let DecoderMethod = "DecodeMemMMImm9" in { + def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>; + def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>; + def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>; + def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6; + } /// Load and Store Instructions - multiple def SWM32_MM : StoreMultMM<"swm32">, LWM_FM_MM<0xd>; def LWM32_MM : LoadMultMM<"lwm32">, LWM_FM_MM<0x5>; - def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>; - def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>; /// Load and Store Pair Instructions def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>; @@ -777,11 +842,11 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { SEB_FM_MM<0x0ec>, ISA_MIPS32R2; /// Word Swap Bytes Within Halfwords - def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>, - ISA_MIPS32R2; - - def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, - EXT_FM_MM<0x2c>; + def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, + SEB_FM_MM<0x1ec>, ISA_MIPS32R2; + // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction + def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, + MipsExt>, EXT_FM_MM<0x2c>; def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM_MM<0x0c>; @@ -854,12 +919,22 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>; def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>; + def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>; + def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>; + let DecoderMethod = "DecodeCacheOpMM" in { def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12>, CACHE_PREF_FM_MM<0x08, 0x6>; def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12>, CACHE_PREF_FM_MM<0x18, 0x2>; } + + let DecoderMethod = "DecodePrefeOpMM" in { + def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9>, + CACHE_PREFE_FM_MM<0x18, 0x2>; + def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9>, + CACHE_PREFE_FM_MM<0x18, 0x3>; + } def SSNOP_MM : MMRel, Barrier<"ssnop">, BARRIER_FM_MM<0x1>; def EHB_MM : MMRel, Barrier<"ehb">, BARRIER_FM_MM<0x3>; def PAUSE_MM : MMRel, Barrier<"pause">, BARRIER_FM_MM<0x5>; @@ -870,7 +945,13 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def TLBWR_MM : MMRel, TLB<"tlbwr">, COP0_TLB_FM_MM<0xcd>; def SDBBP_MM : MMRel, SYS_FT<"sdbbp">, SDBBP_FM_MM; - def RDHWR_MM : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM_MM; + + def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>; +} + +let DecoderNamespace = "MicroMips" in { + def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>, + RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6; } let Predicates = [InMicroMips] in { @@ -928,7 +1009,7 @@ class UncondBranchMMPseudo<string opstr> : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset), !strconcat(opstr, "\t$offset")>; - def B_MM_Pseudo : UncondBranchMMPseudo<"b">; 
+def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS; def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>; def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>; @@ -937,4 +1018,17 @@ class UncondBranchMMPseudo<string opstr> : let Predicates = [InMicroMips] in { def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2; +def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2; +def : MipsInstAlias<"teq $rs, $rt", + (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tge $rs, $rt", + (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tgeu $rs, $rt", + (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tlt $rs, $rt", + (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tltu $rs, $rt", + (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tne $rs, $rt", + (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; } diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td index dbb5f7b..35352b6 100644 --- a/contrib/llvm/lib/Target/Mips/Mips.td +++ b/contrib/llvm/lib/Target/Mips/Mips.td @@ -154,9 +154,14 @@ def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true", def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">; def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true", "Mips DSP-R2 ASE", [FeatureDSP]>; +def FeatureDSPR3 + : SubtargetFeature<"dspr3", "HasDSPR3", "true", "Mips DSP-R3 ASE", + [ FeatureDSP, FeatureDSPR2 ]>; def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">; +def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">; + def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true", "microMips mode">; @@ -164,10 +169,19 @@ def FeatureCnMips : SubtargetFeature<"cnmips", "HasCnMips", "true", "Octeon cnMIPS Support", [FeatureMips64r2]>; +def FeatureUseTCCInDIV : SubtargetFeature< + "use-tcc-in-div", + "UseTCCInDIV", "false", + "Force the assembler to use trapping">; + //===----------------------------------------------------------------------===// // Mips processors supported. //===----------------------------------------------------------------------===// +def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl", + "MipsSubtarget::CPU::P5600", + "The P5600 Processor", [FeatureMips32r5]>; + class Proc<string Name, list<SubtargetFeature> Features> : Processor<Name, MipsGenericItineraries, Features>; @@ -187,12 +201,11 @@ def : Proc<"mips64r2", [FeatureMips64r2]>; def : Proc<"mips64r3", [FeatureMips64r3]>; def : Proc<"mips64r5", [FeatureMips64r5]>; def : Proc<"mips64r6", [FeatureMips64r6]>; -def : Proc<"mips16", [FeatureMips16]>; def : Proc<"octeon", [FeatureMips64r2, FeatureCnMips]>; +def : ProcessorModel<"p5600", MipsP5600Model, [ImplP5600]>; def MipsAsmParser : AsmParser { let ShouldEmitMatchRegisterName = 0; - let MnemonicContainsDot = 1; } def MipsAsmParserVariant : AsmParserVariant { diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp index 46cc99c..26426c0 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp @@ -39,7 +39,11 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, const Mips16InstrInfo &TII = *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + uint64_t StackSize = MFI->getStackSize(); // No need to allocate space on the stack. @@ -107,7 +111,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *EntryBlock = MF->begin(); + MachineBasicBlock *EntryBlock = &MF->front(); // // Registers RA, S0,S1 are the callee saved registers and they diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp index 893fc7c..b2bc7e7 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp @@ -40,26 +40,17 @@ namespace { const MipsTargetMachine &TM; }; - class InlineAsmHelper { - LLVMContext &C; - BasicBlock *BB; - public: - InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) : - C(C_), BB(BB_) { - } - - void Out(StringRef AsmString) { - std::vector<llvm::Type *> AsmArgTypes; - std::vector<llvm::Value*> AsmArgs; - - llvm::FunctionType *AsmFTy = llvm::FunctionType::get(Type::getVoidTy(C), - AsmArgTypes, false); - llvm::InlineAsm *IA = llvm::InlineAsm::get(AsmFTy, AsmString, "", true, - /* IsAlignStack */ false, - llvm::InlineAsm::AD_ATT); - CallInst::Create(IA, AsmArgs, "", BB); - } - }; + static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) { + std::vector<llvm::Type *> AsmArgTypes; + std::vector<llvm::Value *> AsmArgs; + + llvm::FunctionType *AsmFTy = + llvm::FunctionType::get(Type::getVoidTy(C), AsmArgTypes, false); + llvm::InlineAsm *IA = + llvm::InlineAsm::get(AsmFTy, AsmText, "", true, + /* IsAlignStack */ false, llvm::InlineAsm::AD_ATT); + CallInst::Create(IA, AsmArgs, "", BB); + } char Mips16HardFloat::ID = 0; } @@ -182,7 +173,7 @@ static bool needsFPReturnHelper(Function &F) { return whichFPReturnVariant(RetType) != NoFPRet; } -static bool needsFPReturnHelper(const FunctionType &FT) { +static bool needsFPReturnHelper(FunctionType &FT) { Type* RetType = FT.getReturnType(); return whichFPReturnVariant(RetType) != NoFPRet; } @@ -195,63 +186,72 @@ static bool needsFPHelperFromSig(Function &F) { // We swap between FP and Integer registers to allow Mips16 and Mips32 to // interoperate // -static void swapFPIntParams(FPParamVariant PV, Module *M, InlineAsmHelper &IAH, - bool LE, bool ToFP) { - //LLVMContext &Context = M->getContext(); - std::string MI = ToFP? "mtc1 ": "mfc1 "; +static std::string swapFPIntParams(FPParamVariant PV, Module *M, bool LE, + bool ToFP) { + std::string MI = ToFP ? 
"mtc1 ": "mfc1 "; + std::string AsmText; + switch (PV) { case FSig: - IAH.Out(MI + "$$4,$$f12"); + AsmText += MI + "$$4, $$f12\n"; break; + case FFSig: - IAH.Out(MI +"$$4,$$f12"); - IAH.Out(MI + "$$5,$$f14"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f14\n"; break; + case FDSig: - IAH.Out(MI + "$$4,$$f12"); + AsmText += MI + "$$4, $$f12\n"; if (LE) { - IAH.Out(MI + "$$6,$$f14"); - IAH.Out(MI + "$$7,$$f15"); + AsmText += MI + "$$6, $$f14\n"; + AsmText += MI + "$$7, $$f15\n"; } else { - IAH.Out(MI + "$$7,$$f14"); - IAH.Out(MI + "$$6,$$f15"); + AsmText += MI + "$$7, $$f14\n"; + AsmText += MI + "$$6, $$f15\n"; } break; + case DSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; } break; + case DDSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); - IAH.Out(MI + "$$6,$$f14"); - IAH.Out(MI + "$$7,$$f15"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; + AsmText += MI + "$$6, $$f14\n"; + AsmText += MI + "$$7, $$f15\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); - IAH.Out(MI + "$$7,$$f14"); - IAH.Out(MI + "$$6,$$f15"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; + AsmText += MI + "$$7, $$f14\n"; + AsmText += MI + "$$6, $$f15\n"; } break; + case DFSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; } - IAH.Out(MI + "$$6,$$f14"); + AsmText += MI + "$$6, $$f14\n"; break; + case NoSig: - return; + break; } + + return AsmText; } // @@ -282,68 +282,77 @@ static void assureFPCallStub(Function &F, Module *M, FStub->addFnAttr("nomips16"); FStub->setSection(SectionName); BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub); - InlineAsmHelper IAH(Context, BB); - IAH.Out(".set reorder"); FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType()); FPParamVariant PV = whichFPParamVariantNeeded(F); - swapFPIntParams(PV, M, IAH, LE, true); + + std::string AsmText; + AsmText += ".set reorder\n"; + AsmText += swapFPIntParams(PV, M, LE, true); if (RV != NoFPRet) { - IAH.Out("move $$18, $$31"); - IAH.Out("jal " + Name); + AsmText += "move $$18, $$31\n"; + AsmText += "jal " + Name + "\n"; } else { - IAH.Out("lui $$25,%hi(" + Name + ")"); - IAH.Out("addiu $$25,$$25,%lo(" + Name + ")" ); + AsmText += "lui $$25, %hi(" + Name + ")\n"; + AsmText += "addiu $$25, $$25, %lo(" + Name + ")\n"; } + switch (RV) { case FRet: - IAH.Out("mfc1 $$2,$$f0"); + AsmText += "mfc1 $$2, $$f0\n"; break; + case DRet: if (LE) { - IAH.Out("mfc1 $$2,$$f0"); - IAH.Out("mfc1 $$3,$$f1"); + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f1\n"; } else { - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$2,$$f1"); + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$2, $$f1\n"; } break; + case CFRet: if (LE) { - IAH.Out("mfc1 $$2,$$f0"); - IAH.Out("mfc1 $$3,$$f2"); + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f2\n"; } else { - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$3,$$f2"); + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$3, $$f2\n"; } break; + case CDRet: if (LE) { - IAH.Out("mfc1 $$4,$$f2"); - IAH.Out("mfc1 $$5,$$f3"); - IAH.Out("mfc1 $$2,$$f0"); - 
IAH.Out("mfc1 $$3,$$f1"); + AsmText += "mfc1 $$4, $$f2\n"; + AsmText += "mfc1 $$5, $$f3\n"; + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f1\n"; } else { - IAH.Out("mfc1 $$5,$$f2"); - IAH.Out("mfc1 $$4,$$f3"); - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$2,$$f1"); + AsmText += "mfc1 $$5, $$f2\n"; + AsmText += "mfc1 $$4, $$f3\n"; + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$2, $$f1\n"; } break; + case NoFPRet: break; } + if (RV != NoFPRet) - IAH.Out("jr $$18"); + AsmText += "jr $$18\n"; else - IAH.Out("jr $$25"); + AsmText += "jr $$25\n"; + EmitInlineAsm(Context, BB, AsmText); + new UnreachableInst(Context, BB); } // // Functions that are llvm intrinsics and don't need helpers. // -static const char *IntrinsicInline[] = { +static const char *const IntrinsicInline[] = { "fabs", "fabsf", "llvm.ceil.f32", "llvm.ceil.f64", "llvm.copysign.f32", "llvm.copysign.f64", @@ -395,7 +404,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, Type *T = RVal->getType(); FPReturnVariant RV = whichFPReturnVariant(T); if (RV == NoFPRet) continue; - static const char* Helper[NoFPRet] = { + static const char *const Helper[NoFPRet] = { "__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc", "__mips16_ret_dc" }; @@ -419,11 +428,11 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, CallInst::Create(F, Params, "", &Inst ); } else if (const CallInst *CI = dyn_cast<CallInst>(I)) { const Value* V = CI->getCalledValue(); - const Type* T = nullptr; + Type* T = nullptr; if (V) T = V->getType(); - const PointerType *PFT=nullptr; + PointerType *PFT = nullptr; if (T) PFT = dyn_cast<PointerType>(T); - const FunctionType *FT=nullptr; + FunctionType *FT = nullptr; if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType()); Function *F_ = CI->getCalledFunction(); if (FT && needsFPReturnHelper(*FT) && @@ -469,20 +478,21 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV, FStub->addFnAttr("nomips16"); FStub->setSection(SectionName); BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub); - InlineAsmHelper IAH(Context, BB); + + std::string AsmText; if (PicMode) { - IAH.Out(".set noreorder"); - IAH.Out(".cpload $$25"); - IAH.Out(".set reorder"); - IAH.Out(".reloc 0,R_MIPS_NONE," + Name); - IAH.Out("la $$25," + LocalName); - } - else { - IAH.Out("la $$25," + Name); - } - swapFPIntParams(PV, M, IAH, LE, false); - IAH.Out("jr $$25"); - IAH.Out(LocalName + " = " + Name); + AsmText += ".set noreorder\n"; + AsmText += ".cpload $$25\n"; + AsmText += ".set reorder\n"; + AsmText += ".reloc 0, R_MIPS_NONE, " + Name + "\n"; + AsmText += "la $$25, " + LocalName + "\n"; + } else + AsmText += "la $$25, " + Name + "\n"; + AsmText += swapFPIntParams(PV, M, LE, false); + AsmText += "jr $$25\n"; + AsmText += LocalName + " = " + Name + "\n"; + EmitInlineAsm(Context, BB, AsmText); + new UnreachableInst(FStub->getContext(), BB); } @@ -535,7 +545,7 @@ bool Mips16HardFloat::runOnModule(Module &M) { FPParamVariant V = whichFPParamVariantNeeded(*F); if (V != NoSig) { Modified = true; - createFPFnStub(F, &M, V, TM); + createFPFnStub(&*F, &M, V, TM); } } return Modified; diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index bce2c1e..5a1c2c67 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -73,7 +73,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = 
MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass; diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp index 3522cbb..e748325 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp @@ -530,8 +530,7 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const { // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -592,8 +591,7 @@ Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -657,8 +655,7 @@ Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index a49572e..da8ada4 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -196,7 +196,7 @@ static void addSaveRestoreRegs(MachineInstrBuilder &MIB, void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); const BitVector Reserved = RI.getReservedRegs(MF); @@ -263,7 +263,7 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Reg1, unsigned Reg2) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; // // li reg1, constant // move reg2, sp @@ -446,7 +446,7 @@ const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const { void Mips16InstrInfo::BuildAddiuSpImm (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm); } diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td index 10fff03..dad6ea4 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td @@ -530,19 +530,19 @@ class MayStore { // Purpose: Add Immediate Unsigned Word (2-Operand, Extended) // To add a constant to a 32-bit integer. 
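// IIM16Alu, which replaces the generic IIAlu throughout the defs below, is
// an instruction itinerary class, i.e. a scheduling key only. For example,
//   def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>;
// selects the MIPS16 ALU scheduling model for the instruction; the
// itinerary has no effect on its encoding or on instruction selection.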
// -def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>; +def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>; -def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIAlu>, +def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIM16Alu>, ArithLogic16Defs<0> { let AddedComplexity = 5; } -def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>, +def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIM16Alu>, ArithLogic16Defs<0> { let isCodeGenOnly = 1; } def AddiuRxRyOffMemX16: - FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>; + FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIM16Alu>; // @@ -550,7 +550,7 @@ def AddiuRxRyOffMemX16: // Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended) // To add a constant to the program counter. // -def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; +def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIM16Alu>; // // Format: ADDIU sp, immediate MIPS16e @@ -558,14 +558,14 @@ def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; // To add a constant to the stack pointer. // def AddiuSpImm16 - : FI816_SP_ins<0b011, "addiu", IIAlu> { + : FI816_SP_ins<0b011, "addiu", IIM16Alu> { let Defs = [SP]; let Uses = [SP]; let AddedComplexity = 5; } def AddiuSpImmX16 - : FEXT_I816_SP_ins<0b011, "addiu", IIAlu> { + : FEXT_I816_SP_ins<0b011, "addiu", IIM16Alu> { let Defs = [SP]; let Uses = [SP]; } @@ -576,14 +576,14 @@ def AddiuSpImmX16 // To add 32-bit integers. // -def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>; +def AdduRxRyRz16: FRRR16_ins<01, "addu", IIM16Alu>, ArithLogic16Defs<1>; // // Format: AND rx, ry MIPS16e // Purpose: AND // To do a bitwise logical AND. -def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; +def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIM16Alu>, ArithLogic16Defs<1>; // @@ -591,7 +591,7 @@ def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; // Purpose: Branch on Equal to Zero // To test a GPR then do a PC-relative conditional branch. // -def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; +def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16; // @@ -599,7 +599,7 @@ def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; // Purpose: Branch on Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. // -def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; +def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16; // // Format: B offset MIPS16e @@ -607,27 +607,27 @@ def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; // To do an unconditional PC-relative branch. // -def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16; +def Bimm16: FI16_ins<0b00010, "b", IIM16Alu>, branch16; // Format: B offset MIPS16e // Purpose: Unconditional Branch // To do an unconditional PC-relative branch. // -def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16; +def BimmX16: FEXT_I16_ins<0b00010, "b", IIM16Alu>, branch16; // // Format: BNEZ rx, offset MIPS16e // Purpose: Branch on Not Equal to Zero // To test a GPR then do a PC-relative conditional branch. // -def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; +def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16; // // Format: BNEZ rx, offset MIPS16e // Purpose: Branch on Not Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. 
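// As with BEQZ above, each branch comes in a 16-bit form and an
// "(Extended)" X16 form: the FEXT_* format classes model the MIPS16e
// EXTEND-prefixed 32-bit encoding, which widens the immediate field of the
// underlying 16-bit instruction (here, a larger PC-relative offset).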
// -def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; +def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16; // @@ -641,11 +641,11 @@ def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>; // Purpose: Branch on T Equal to Zero (Extended) // To test special register T then do a PC-relative conditional branch. // -def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 { +def Bteqz16: FI816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 { let Uses = [T8]; } -def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 { +def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 { let Uses = [T8]; } @@ -669,11 +669,11 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">, // To test special register T then do a PC-relative conditional branch. // -def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 { +def Btnez16: FI816_ins<0b001, "btnez", IIM16Alu>, cbranch16 { let Uses = [T8]; } -def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 { +def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIM16Alu> ,cbranch16 { let Uses = [T8]; } @@ -695,7 +695,7 @@ def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">, // Purpose: Compare // To compare the contents of two GPRs. // -def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> { +def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIM16Alu> { let Defs = [T8]; } @@ -704,7 +704,7 @@ def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> { // Purpose: Compare Immediate // To compare a constant with the contents of a GPR. // -def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> { +def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIM16Alu> { let Defs = [T8]; } @@ -713,7 +713,7 @@ def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> { // Purpose: Compare Immediate (Extended) // To compare a constant with the contents of a GPR. // -def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> { +def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIM16Alu> { let Defs = [T8]; } @@ -723,7 +723,7 @@ def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> { // Purpose: Divide Word // To divide 32-bit signed integers. // -def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> { +def DivRxRy16: FRR16_div_ins<0b11010, "div", IIM16Alu> { let Defs = [HI0, LO0]; } @@ -732,7 +732,7 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> { // Purpose: Divide Unsigned Word // To divide 32-bit unsigned integers. // -def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> { +def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIM16Alu> { let Defs = [HI0, LO0]; } // @@ -742,13 +742,13 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> { // region and preserve the current ISA. // -def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> { +def Jal16 : FJAL16_ins<0b0, "jal", IIM16Alu> { let hasDelaySlot = 0; // not true, but we add the nop for now let isCall=1; let Defs = [RA]; } -def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 { +def JalB16 : FJALB16_ins<0b0, "jal", IIM16Alu>, branch16 { let hasDelaySlot = 0; // not true, but we add the nop for now let isBranch=1; let Defs = [RA]; @@ -761,7 +761,7 @@ def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 { // address register. 
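// The flag block on JrRa16 below reads as a unit: isBranch and
// isIndirectBranch describe the control flow to the branch analyses,
// isTerminator and isBarrier tell block layout that nothing falls through,
// and hasDelaySlot asks the delay-slot filler to run; the compact
// JrcRa16/JrcRx16 variants drop hasDelaySlot.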
// -def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> { +def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let hasDelaySlot = 1; @@ -769,14 +769,14 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> { let isBarrier=1; } -def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIAlu> { +def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; let isBarrier=1; } -def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> { +def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; @@ -825,16 +825,16 @@ def LhuRxRyOffMemX16: // Purpose: Load Immediate // To load a constant into a GPR. // -def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>; +def LiRxImm16: FRI16_ins<0b01101, "li", IIM16Alu>; // // Format: LI rx, immediate MIPS16e // Purpose: Load Immediate (Extended) // To load a constant into a GPR. // -def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>; +def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIM16Alu>; -def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> { +def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIM16Alu> { let isCodeGenOnly = 1; } @@ -863,21 +863,21 @@ def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad; // Purpose: Move // To move the contents of a GPR to a GPR. // -def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>; +def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>; // // Format: MOVE ry, r32 MIPS16e //Purpose: Move // To move the contents of a GPR to a GPR. // -def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>; +def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>; // // Format: MFHI rx MIPS16e // Purpose: Move From HI Register // To copy the special purpose HI register to a GPR. // -def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> { +def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> { let Uses = [HI0]; let hasSideEffects = 0; } @@ -887,7 +887,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> { // Purpose: Move From LO Register // To copy the special purpose LO register to a GPR. // -def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> { +def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> { let Uses = [LO0]; let hasSideEffects = 0; } @@ -895,13 +895,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> { // // Pseudo Instruction for mult // -def MultRxRy16: FMULT16_ins<"mult", IIAlu> { +def MultRxRy16: FMULT16_ins<"mult", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; } -def MultuRxRy16: FMULT16_ins<"multu", IIAlu> { +def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -912,7 +912,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> { // Purpose: Multiply Word // To multiply 32-bit signed integers. // -def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> { +def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -923,7 +923,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> { // Purpose: Multiply Unsigned Word // To multiply 32-bit unsigned integers. // -def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> { +def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -934,21 +934,21 @@ def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> { // Purpose: Negate // To negate an integer value. 
// -def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIAlu>; +def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIM16Alu>; // // Format: NOT rx, ry MIPS16e // Purpose: Not // To complement an integer value // -def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIAlu>; +def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIM16Alu>; // // Format: OR rx, ry MIPS16e // Purpose: Or // To do a bitwise logical OR. // -def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>; +def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIM16Alu>, ArithLogic16Defs<1>; // // Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize} @@ -1012,7 +1012,7 @@ def SbRxRyOffMemX16: // Sign-extend least significant byte in register rx. // def SebRx16 - : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>; + : FRR_SF16_ins<0b10001, 0b100, "seb", IIM16Alu>; // // Format: SEH rx MIPS16e @@ -1020,7 +1020,7 @@ def SebRx16 // Sign-extend least significant word in register rx. // def SehRx16 - : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>; + : FRR_SF16_ins<0b10001, 0b101, "seh", IIM16Alu>; // // The Sel(T) instructions are pseudos @@ -1149,21 +1149,21 @@ def ShRxRyOffMemX16: // Purpose: Shift Word Left Logical (Extended) // To execute a left-shift of a word by a fixed number of bits-0 to 31 bits. // -def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>; +def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIM16Alu>; // // Format: SLLV ry, rx MIPS16e // Purpose: Shift Word Left Logical Variable // To execute a left-shift of a word by a variable number of bits. // -def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>; +def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIM16Alu>; // Format: SLTI rx, immediate MIPS16e // Purpose: Set on Less Than Immediate // To record the result of a less-than comparison with a constant. // // -def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> { +def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIM16Alu> { let Defs = [T8]; } @@ -1173,7 +1173,7 @@ def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> { // To record the result of a less-than comparison with a constant. // // -def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIAlu> { +def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIM16Alu> { let Defs = [T8]; } @@ -1184,7 +1184,7 @@ def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">; // To record the result of a less-than comparison with a constant. // // -def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> { +def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIM16Alu> { let Defs = [T8]; } @@ -1194,7 +1194,7 @@ def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> { // To record the result of a less-than comparison with a constant. // // -def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIAlu> { +def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIM16Alu> { let Defs = [T8]; } // @@ -1209,7 +1209,7 @@ def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">; // Purpose: Set on Less Than // To record the result of a less-than comparison. // -def SltRxRy16: FRR16R_ins<0b00010, "slt", IIAlu>{ +def SltRxRy16: FRR16R_ins<0b00010, "slt", IIM16Alu>{ let Defs = [T8]; } @@ -1219,7 +1219,7 @@ def SltCCRxRy16: FCCRR16_ins<"slt">; // Purpose: Set on Less Than Unsigned // To record the result of an unsigned less-than comparison. // -def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIAlu>{ +def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIM16Alu>{ let Defs = [T8]; } @@ -1236,7 +1236,7 @@ def SltuCCRxRy16: FCCRR16_ins<"sltu">; // To execute an arithmetic right-shift of a word by a variable // number of bits. 
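// A quick worked example of the arithmetic/logical split among the shift
// defs below: shifting 0xFFFFFFF0 right by 2, SRAV sign-extends and yields
// 0xFFFFFFFC, while SRLV shifts in zeros and yields 0x3FFFFFFC.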
// -def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>; +def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIM16Alu>; // @@ -1245,7 +1245,7 @@ def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>; // To execute an arithmetic right-shift of a word by a fixed // number of bits-1 to 8 bits. // -def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>; +def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIM16Alu>; // @@ -1254,7 +1254,7 @@ def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>; // To execute a logical right-shift of a word by a variable // number of bits. // -def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>; +def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIM16Alu>; // @@ -1263,14 +1263,14 @@ def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>; // To execute a logical right-shift of a word by a fixed // number of bits-1 to 31 bits. // -def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>; +def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIM16Alu>; // // Format: SUBU rz, rx, ry MIPS16e // Purpose: Subtract Unsigned Word // To subtract 32-bit integers // -def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>; +def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIM16Alu>, ArithLogic16Defs<0>; // // Format: SW ry, offset(rx) MIPS16e @@ -1294,7 +1294,7 @@ def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins // Purpose: Xor // To do a bitwise logical XOR. // -def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIAlu>, ArithLogic16Defs<1>; +def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIM16Alu>, ArithLogic16Defs<1>; class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> { let Predicates = [InMips16Mode]; @@ -1380,7 +1380,7 @@ def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> { let isCall=1, hasDelaySlot=0 in def JumpLinkReg16: FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs), - "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch> { + "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], II_JALRC> { let Defs = [RA]; } diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index d6ab8a6..82d2c8e 100644 --- a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -186,54 +186,56 @@ class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr, RegisterOperand FGROpnd>{ - def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>, - CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>, - CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>, - CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>, - CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, + let AdditionalPredicates = [NotInMicroMips] in { + def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, ISA_MIPS32R6, HARDFLOAT; - def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, - CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>, - CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, - CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, - 
ISA_MIPS32R6, HARDFLOAT; - def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>, - CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>, - CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>, - CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>, - CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>, - CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, ISA_MIPS32R6, HARDFLOAT; - def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>, - CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>, - CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, ISA_MIPS32R6, HARDFLOAT; - def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>, - CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>, - CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, ISA_MIPS32R6, HARDFLOAT; + def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>, + CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + } } //===----------------------------------------------------------------------===// @@ 
-557,7 +559,7 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd, dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); list<dag> Pattern = []; - string DecoderMethod = "DecodeCacheOpR6"; + string DecoderMethod = "DecodeCacheeOp_CacheOpR6"; } class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>; @@ -595,7 +597,7 @@ class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, list<dag> Pattern = []; } -class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; +class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>; class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { dag OutOperandList = (outs GPROpnd:$rt); @@ -685,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; -def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>; @@ -702,39 +706,51 @@ def LSA_R6 : R6MMR6Rel, LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6; def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6; def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; -def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6; def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6; -def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT; +} def MUH : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6; def MUHU : R6MMR6Rel, 
MUHU_ENC, MUHU_DESC, ISA_MIPS32R6; def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6; -def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; +let AdditionalPredicates = [NotInMicroMips] in { def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; +} def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; -def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; -def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; //===----------------------------------------------------------------------===// @@ -743,7 +759,9 @@ def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; // //===----------------------------------------------------------------------===// +let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6; +} def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6; //===----------------------------------------------------------------------===// @@ -752,84 +770,78 @@ def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6; // //===----------------------------------------------------------------------===// -// f32 comparisons supported via another comparison -def : MipsPat<(setone f32:$lhs, f32:$rhs), - (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f32:$lhs, f32:$rhs), - (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; - -// f64 comparisons supported via another comparison -def : 
MipsPat<(setone f64:$lhs, f64:$rhs), - (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f64:$lhs, f64:$rhs), - (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; +// comparisons supported via another comparison +multiclass Cmp_Pats<ValueType VT, Instruction NOROp, Register ZEROReg> { +def : MipsPat<(setone VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_UEQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seto VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_UN_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(setune VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seteq VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setgt VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LE_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setge VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LT_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setlt VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LT_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setle VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LE_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setne VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +} + +defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6; +defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6; // i32 selects +multiclass SelectInt_Pats<ValueType RC, Instruction OROp, Instruction XORiOp, + Instruction SLTiOp, Instruction SLTiuOp, + Instruction SELEQZOp, Instruction SELNEZOp, + SDPatternOperator imm_type, ValueType Opg> { +// reg, immz +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, RC:$cond), (SELNEZOp RC:$f, RC:$cond))>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, RC:$cond), (SELEQZOp RC:$f, RC:$cond))>; + +// reg, immZExt16[_64] +def : MipsPat<(select (Opg (seteq RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELNEZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; +def : MipsPat<(select (Opg (setne RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELEQZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; + +// reg, immSExt16Plus1 +def : MipsPat<(select (Opg (setgt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (SLTiOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiOp RC:$cond, (Plus1 imm:$imm))))>; +def : MipsPat<(select (Opg (setugt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (SLTiuOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiuOp RC:$cond, (Plus1 imm:$imm))))>; + +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, immz), + (SELEQZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (setne 
RC:$cond, immz)), RC:$t, immz), + (SELNEZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), immz, RC:$f), + (SELNEZOp RC:$f, RC:$cond)>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f), + (SELEQZOp RC:$f, RC:$cond)>; +} + +defm : SelectInt_Pats<i32, OR, XORi, SLTi, SLTiu, SELEQZ, SELNEZ, + immZExt16, i32>, ISA_MIPS32R6; + def : MipsPat<(select i32:$cond, i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, + (OR (SELNEZ i32:$t, i32:$cond), + (SELEQZ i32:$f, i32:$cond))>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t, - i32:$f), - (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)), - i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; - def : MipsPat<(select i32:$cond, i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz), - (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6; + (SELNEZ i32:$t, i32:$cond)>, + ISA_MIPS32R6; def : MipsPat<(select i32:$cond, immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f), - (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6; + (SELEQZ i32:$f, i32:$cond)>, + ISA_MIPS32R6; diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td index f917eca..cbdcdd7 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -16,10 +16,6 @@ //===----------------------------------------------------------------------===// // Unsigned Operand -def uimm5_64 : Operand<i64> { - let PrintMethod = "printUnsignedImm"; -} - def uimm16_64 : Operand<i64> { let PrintMethod = "printUnsignedImm"; } @@ -276,12 +272,20 @@ def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>; let isCodeGenOnly = 1 in def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM; -def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>; -def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>; -def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>; +let AdditionalPredicates = [NotInMicroMips] in { + // TODO: Add 'pos + size' constraint check to dext* instructions + // DEXT: 0 < pos + size <= 63 + // DEXTM, DEXTU: 32 < pos + size <= 64 + def DEXT : 
ExtBase<"dext", GPR64Opnd, uimm5, uimm5_plus1, MipsExt>, + EXT_FM<3>; + def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, MipsExt>, + EXT_FM<1>; + def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32, uimm5_plus1, + MipsExt>, EXT_FM<2>; +} def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>; -def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>; +def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32>, EXT_FM<6>; def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>; let isCodeGenOnly = 1, rs = 0, shamt = 0 in { @@ -341,11 +345,11 @@ class SetCC64_I<string opstr, PatFrag cond_op>: } class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op, - RegisterOperand RO, bits<64> shift = 1> : - InstSE<(outs), (ins RO:$rs, uimm5_64:$p, opnd:$offset), + RegisterOperand RO, Operand ImmOp, bits<64> shift = 1> : + InstSE<(outs), (ins RO:$rs, ImmOp:$p, opnd:$offset), !strconcat(opstr, "\t$rs, $p, $offset"), [(brcond (i32 (cond_op (and RO:$rs, (shl shift, immZExt5_64:$p)), 0)), - bb:$offset)], IIBranch, FrmI, opstr> { + bb:$offset)], II_BBIT, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; @@ -363,14 +367,17 @@ def BADDu : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>, ADD_FM<0x1c, 0x28>; // Branch on Bit Clear /+32 -def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd>, BBIT_FM<0x32>; -def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, 0x100000000>, +def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd, + uimm5_64_report_uimm6>, BBIT_FM<0x32>; +def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, uimm5_64, + 0x100000000>, BBIT_FM<0x36>; // Branch on Bit Set /+32 -def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd>, BBIT_FM<0x3a>; -def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, 0x100000000>, - BBIT_FM<0x3e>; +def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd, + uimm5_64_report_uimm6>, BBIT_FM<0x3a>; +def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, uimm5_64, + 0x100000000>, BBIT_FM<0x3e>; // Multiply Doubleword to GPR let Defs = [HI0, LO0, P0, P1, P2] in @@ -544,10 +551,25 @@ def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>; } +// Atomic load patterns. +def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>; +def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>; +def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>; +def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>; + +// Atomic store patterns. +def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// def : MipsInstAlias<"move $dst, $src", + (OR64 GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, + GPR_64; +def : MipsInstAlias<"move $dst, $src", (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, GPR_64; def : MipsInstAlias<"daddu $rs, $rt, $imm", @@ -617,6 +639,38 @@ def : MipsInstAlias<"syncw", (SYNC 0x4), 0>; def : MipsInstAlias<"syncws", (SYNC 0x5), 0>; } +// cnMIPS Aliases. 
+ +// bbit* with $p 32-63 converted to bbit*32 with $p 0-31 +def : MipsInstAlias<"bbit0 $rs, $p, $offset", + (BBIT032 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p, + brtarget:$offset), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"bbit1 $rs, $p, $offset", + (BBIT132 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p, + brtarget:$offset), 0>, + ASE_CNMIPS; + +// exts with $pos 32-63 is converted to exts32 with $pos 0-31 +def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1", + (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"exts $rt, $pos, $lenm1", + (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; + +// cins with $pos 32-63 is converted to cins32 with $pos 0-31 +def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1", + (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"cins $rt, $pos, $lenm1", + (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; + //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// @@ -625,3 +679,8 @@ class LoadImmediate64<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64), !strconcat(instr_asm, "\t$rt, $imm64")> ; def LoadImm64 : LoadImmediate64<"dli", imm64, GPR64Opnd>; + +def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr), + "dla\t$rt, $addr">; +def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64), + "dla\t$rt, $imm64">; diff --git a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td index 6b546e8..6f34dbe 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td @@ -62,7 +62,7 @@ class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>; class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>; class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>; class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>; -class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>; +class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1>; class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>; class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>; class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>; @@ -81,10 +81,12 @@ class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>; // //===----------------------------------------------------------------------===// -def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6; -def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; -def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; -def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; +let AdditionalPredicates = [NotInMicroMips] in { + def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; + def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6; + def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; + def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; +} def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6; def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6; def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6; diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index fdba064..9575293 100644 --- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ 
b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -169,12 +169,12 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (MCPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else - EmitGlobalConstant(MCPE.Val.ConstVal); + EmitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal); return; } - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { @@ -202,7 +202,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { llvm_unreachable("Pseudo opcode found in EmitInstruction()"); MCInst TmpInst0; - MCInstLowering.Lower(I, TmpInst0); + MCInstLowering.Lower(&*I, TmpInst0); EmitToStreamer(*OutStreamer, TmpInst0); } while ((++I != E) && I->isInsideBundle()); // Delay slot check } @@ -405,7 +405,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* // If this is a landing pad, it isn't a fall through. If it has no preds, // then nothing falls through to it. - if (MBB->isLandingPad() || MBB->pred_empty()) + if (MBB->isEHPad() || MBB->pred_empty()) return false; // If there isn't exactly one predecessor, it can't be a fall through. @@ -559,7 +559,6 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); const MachineOperand &MO = MI->getOperand(opNum); bool closeP = false; @@ -608,7 +607,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, } case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" + O << getDataLayout().getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" << MO.getIndex(); if (MO.getOffset()) O << "+" << MO.getOffset(); @@ -1009,7 +1008,7 @@ void MipsAsmPrinter::EmitFPCallStub( // // Mov $18, $31 - EmitInstrRegRegReg(*STI, Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO); + EmitInstrRegRegReg(*STI, Mips::OR, Mips::S2, Mips::RA, Mips::ZERO); EmitSwapFPIntParams(*STI, Signature->ParamSig, LE, true); diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp index b808129..d82063e 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp @@ -29,22 +29,16 @@ static bool isF128SoftLibCall(const char *CallSym) { "powl", "rintl", "sinl", "sqrtl", "truncl"}; - const char *const *End = LibCalls + array_lengthof(LibCalls); - // Check that LibCalls is sorted alphabetically. - MipsTargetLowering::LTStr Comp; - -#ifndef NDEBUG - for (const char *const *I = LibCalls; I < End - 1; ++I) - assert(Comp(*I, *(I + 1))); -#endif - - return std::binary_search(LibCalls, End, CallSym, Comp); + auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; }; + assert(std::is_sorted(std::begin(LibCalls), std::end(LibCalls), Comp)); + return std::binary_search(std::begin(LibCalls), std::end(LibCalls), + CallSym, Comp); } /// This function returns true if Ty is fp128, {f128} or i128 which was /// originally a fp128. 
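The isF128SoftLibCall hunk above replaces a hand-rolled NDEBUG sortedness loop with std::is_sorted plus std::binary_search over the static name table. A self-contained sketch of that idiom, with a trimmed illustrative table rather than the full LibCalls list:

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iterator>

static bool isInSortedTable(const char *Sym) {
  // Must stay alphabetically sorted or binary_search's result is undefined.
  static const char *const Table[] = {"powl", "rintl", "truncl"};
  auto Cmp = [](const char *A, const char *B) { return strcmp(A, B) < 0; };
  assert(std::is_sorted(std::begin(Table), std::end(Table), Cmp));
  return std::binary_search(std::begin(Table), std::end(Table), Sym, Cmp);
}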
-static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) { +static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { if (Ty->isFP128Ty()) return true; diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td index 93e1908..0b4b778 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td @@ -427,3 +427,28 @@ def CSR_Mips16RetHelper : CalleeSavedRegs<(add V0, V1, FP, (sequence "A%u", 3, 0), (sequence "S%u", 7, 0), (sequence "D%u", 15, 10))>; + +def CSR_Interrupt_32R6 : CalleeSavedRegs<(add (sequence "A%u", 3, 0), + (sequence "S%u", 7, 0), + (sequence "V%u", 1, 0), + (sequence "T%u", 9, 0), + RA, FP, GP, AT)>; + +def CSR_Interrupt_32 : CalleeSavedRegs<(add (sequence "A%u", 3, 0), + (sequence "S%u", 7, 0), + (sequence "V%u", 1, 0), + (sequence "T%u", 9, 0), + RA, FP, GP, AT, LO0, HI0)>; + +def CSR_Interrupt_64R6 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0), + (sequence "V%u_64", 1, 0), + (sequence "S%u_64", 7, 0), + (sequence "T%u_64", 9, 0), + RA_64, FP_64, GP_64, AT_64)>; + +def CSR_Interrupt_64 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0), + (sequence "S%u_64", 7, 0), + (sequence "T%u_64", 9, 0), + (sequence "V%u_64", 1, 0), + RA_64, FP_64, GP_64, AT_64, + LO0_64, HI0_64)>; diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 96553d2..ea8c587 100644 --- a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -560,7 +560,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // identity mapping of CPI's to CPE's. const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); - const DataLayout &TD = *MF->getTarget().getDataLayout(); + const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); assert(Size >= 4 && "Too small constant pool entry"); @@ -598,12 +598,12 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { /// into the block immediately after it. static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI = MBB->getIterator(); // Can't fall off end of function. if (std::next(MBBI) == MBB->getParent()->end()) return false; - MachineBasicBlock *NextBB = std::next(MBBI); + MachineBasicBlock *NextBB = &*std::next(MBBI); for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) if (*I == NextBB) @@ -656,11 +656,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); + computeBlockSize(&*I); // Compute block offsets. - adjustBBOffsetsAfter(MF->begin()); + adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); @@ -879,7 +879,7 @@ MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr // Create a new MBB for the code after the OrigBB. 
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); MF->insert(MBBI, NewBB); // Splice the instructions starting with MI over to NewBB. @@ -967,8 +967,8 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset, unsigned CPELogAlign = getCPELogAlign(U.CPEMI); unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); unsigned NextBlockOffset, NextBlockAlignment; - MachineFunction::const_iterator NextBlock = Water; - if (++NextBlock == MF->end()) { + MachineFunction::const_iterator NextBlock = ++Water->getIterator(); + if (NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); NextBlockAlignment = 0; } else { @@ -1261,7 +1261,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() << format(", expected CPE offset %#x\n", CPEOffset)); - NewMBB = std::next(MachineFunction::iterator(UserMBB)); + NewMBB = &*++UserMBB->getIterator(); // Add an unconditional branch from UserMBB to fallthrough block. Record // it for branch lengthening; this new branch will not get out of range, // but if the preceding conditional branch is out of range, the targets @@ -1371,8 +1371,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { NewWaterList.insert(NewIsland); // The new CPE goes before the following block (NewMBB). - NewMBB = std::next(MachineFunction::iterator(WaterBB)); - + NewMBB = &*++WaterBB->getIterator(); } else { // No water found. // we first see if a longer form of the instruction could have reached @@ -1389,7 +1388,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // next iteration for constant pools, but in this context, we don't want // it. Check for this so it will be removed from the WaterList. // Also remove any entry from NewWaterList. - MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB)); + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); if (IP != WaterList.end()) NewWaterList.erase(WaterBB); @@ -1406,7 +1405,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { WaterList.erase(IP); // Okay, we know we can put an island before NewMBB now, do it! - MF->insert(NewMBB, NewIsland); + MF->insert(NewMBB->getIterator(), NewIsland); // Update internal data structures to account for the newly inserted MBB. updateForInsertedWaterBlock(NewIsland); @@ -1431,9 +1430,7 @@ // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; - adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland))); - - + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1645,7 +1642,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { MBB->back().eraseFromParent(); // BBInfo[SplitBB].Offset is wrong temporarily, fixed below } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*++MBB->getIterator(); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << " also invert condition and change dest. 
to BB#" diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td index b5d52ce..f959bd4 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td @@ -7,10 +7,30 @@ // //===----------------------------------------------------------------------===// +class DspMMRel; + +def Dsp2MicroMips : InstrMapping { + let FilterClass = "DspMMRel"; + // Instructions with the same BaseOpcode and isNVStore values form a row. + let RowFields = ["BaseOpcode"]; + // Instructions with the same predicate sense form a column. + let ColFields = ["Arch"]; + // The key column is the unpredicated instructions. + let KeyCol = ["dsp"]; + // Value columns are PredSense=true and PredSense=false + let ValueCols = [["dsp"], ["mmdsp"]]; +} + def HasDSP : Predicate<"Subtarget->hasDSP()">, AssemblerPredicate<"FeatureDSP">; def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">, AssemblerPredicate<"FeatureDSPR2">; +def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">, + AssemblerPredicate<"FeatureDSPR3">; + +class ISA_DSPR2 { + list<Predicate> InsnPredicates = [HasDSPR2]; +} // Fields. class Field6<bits<6> val> { @@ -20,14 +40,22 @@ class Field6<bits<6> val> { def SPECIAL3_OPCODE : Field6<0b011111>; def REGIMM_OPCODE : Field6<0b000001>; -class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { - let Predicates = [HasDSP]; +class DSPInst<string opstr = ""> + : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl { + let InsnPredicates = [HasDSP]; + string BaseOpcode = opstr; + string Arch = "dsp"; } class PseudoDSP<dag outs, dag ins, list<dag> pattern, - InstrItinClass itin = IIPseudo>: - MipsPseudo<outs, ins, pattern, itin> { - let Predicates = [HasDSP]; + InstrItinClass itin = IIPseudo> + : MipsPseudo<outs, ins, pattern, itin>, PredicateControl { + let InsnPredicates = [HasDSP]; +} + +class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, PredicateControl { + let InsnPredicates = [HasDSP]; } // ADDU.QB sub-class format. 
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td index d268384..da6f174 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td @@ -12,9 +12,11 @@ //===----------------------------------------------------------------------===// // ImmLeaf +def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>; def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>; def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>; def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>; +def immZExt7 : ImmLeaf<i32, [{return isUInt<7>(Imm);}]>; def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>; def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>; def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>; @@ -263,6 +265,7 @@ class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -273,6 +276,7 @@ class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -293,6 +297,7 @@ class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -304,6 +309,7 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -314,6 +320,7 @@ class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -323,6 +330,7 @@ class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $imm"); list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -332,17 +340,19 @@ class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa"); list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, SDPatternOperator ImmPat, InstrItinClass itin, - RegisterOperand RO> { + RegisterOperand RO, Operand ImmOpnd> { dag OutOperandList = (outs RO:$rd); - dag InOperandList = (ins RO:$rt, uimm16:$rs_sa); + dag InOperandList = (ins RO:$rt, ImmOpnd:$rs_sa); string AsmString = 
!strconcat(instr_asm, "\t$rd, $rt, $rs_sa"); list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))]; InstrItinClass Itinerary = itin; bit hasSideEffects = 1; + string BaseOpcode = instr_asm; } class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -353,6 +363,7 @@ class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))]; InstrItinClass Itinerary = itin; bit mayLoad = 1; + string BaseOpcode = instr_asm; } class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -363,17 +374,19 @@ class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - SDPatternOperator ImmOp, InstrItinClass itin> { + Operand ImmOp, SDPatternOperator Imm, InstrItinClass itin> { dag OutOperandList = (outs GPR32Opnd:$rt); - dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src); + dag InOperandList = (ins GPR32Opnd:$rs, ImmOp:$sa, GPR32Opnd:$src); string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); list<dag> Pattern = [(set GPR32Opnd:$rt, - (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))]; + (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, Imm:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -382,6 +395,7 @@ class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs); string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -390,15 +404,17 @@ class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs); string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { dag OutOperandList = (outs ACC64DSPOpnd:$ac); - dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin); + dag InOperandList = (ins simm6:$shift, ACC64DSPOpnd:$acin); string AsmString = !strconcat(instr_asm, "\t$ac, $shift"); list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -408,6 +424,7 @@ class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -417,6 +434,7 @@ class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -426,15 +444,17 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString 
= !strconcat(instr_asm, "\t$rd, $mask"); list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, InstrItinClass itin> { dag OutOperandList = (outs); - dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask); + dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask); string AsmString = !strconcat(instr_asm, "\t$rs, $mask"); list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -444,6 +464,7 @@ class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -454,6 +475,7 @@ class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))]; InstrItinClass Itinerary = itin; bit isCommutable = 1; + string BaseOpcode = instr_asm; } class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -465,6 +487,7 @@ class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))]; InstrItinClass Itinerary = itin; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, @@ -474,6 +497,7 @@ class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $ac"); list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> { @@ -481,6 +505,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> dag InOperandList = (ins GPR32Opnd:$rs); string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> : @@ -506,6 +531,7 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } //===----------------------------------------------------------------------===// @@ -639,7 +665,7 @@ class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra", // Shift class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>, + NoItinerary, DSPROpnd, uimm3>, Defs<[DSPOutFlag22]>; class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb, @@ -647,13 +673,13 @@ class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb, Defs<[DSPOutFlag22]>; class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm3>; class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>; class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>, + NoItinerary, DSPROpnd, uimm4>, Defs<[DSPOutFlag22]>; class 
SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph, @@ -661,7 +687,8 @@ class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph, Defs<[DSPOutFlag22]>; class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph, - immZExt4, NoItinerary, DSPROpnd>, + immZExt4, NoItinerary, DSPROpnd, + uimm4>, Defs<[DSPOutFlag22]>; class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph, @@ -669,19 +696,21 @@ class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph, Defs<[DSPOutFlag22]>; class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm4>; class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>; class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph, - immZExt4, NoItinerary, DSPROpnd>; + immZExt4, NoItinerary, DSPROpnd, + uimm4>; class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>; class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w, - immZExt5, NoItinerary, GPR32Opnd>, + immZExt5, NoItinerary, GPR32Opnd, + uimm5>, Defs<[DSPOutFlag22]>; class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w, @@ -689,7 +718,8 @@ class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w, Defs<[DSPOutFlag22]>; class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w, - immZExt5, NoItinerary, GPR32Opnd>; + immZExt5, NoItinerary, GPR32Opnd, + uimm5>; class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>; @@ -1039,32 +1069,33 @@ class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w", // Shift class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm3>; class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>; class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb, - immZExt3, NoItinerary, DSPROpnd>; + immZExt3, NoItinerary, DSPROpnd, + uimm3>; class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>; class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm4>; class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; // Misc -class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5, +class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5, NoItinerary>; -class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2, +class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2, NoItinerary>; -class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5, - NoItinerary>; +class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5, + immZExt5, NoItinerary>; // Pseudos. def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, @@ -1072,80 +1103,80 @@ def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, // Instruction defs. 
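The SHLL_QB_R2_DESC_BASE changes above give each fixed-shift DSP instruction an assembler operand sized to its lane width: uimm3 for .qb (bytes), uimm4 for .ph (halfwords), uimm5 for .w (words), matching the immZExt3/immZExt4/immZExt5 selection predicates. Those ImmLeaf checks reduce to plain range tests; a stand-alone sketch (not LLVM's isUInt, though it behaves the same for small N):

#include <cstdint>

constexpr bool isUIntN(unsigned N, uint64_t X) {
  return X < (UINT64_C(1) << N); // only valid for N < 64
}
constexpr bool isValidQBShift(uint64_t I) { return isUIntN(3, I); } // shll.qb: 0-7
constexpr bool isValidPHShift(uint64_t I) { return isUIntN(4, I); } // shll.ph: 0-15
constexpr bool isValidWShift(uint64_t I)  { return isUIntN(5, I); } // shll_s.w: 0-31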
// MIPS DSP Rev 1 -def ADDU_QB : ADDU_QB_ENC, ADDU_QB_DESC; -def ADDU_S_QB : ADDU_S_QB_ENC, ADDU_S_QB_DESC; -def SUBU_QB : SUBU_QB_ENC, SUBU_QB_DESC; -def SUBU_S_QB : SUBU_S_QB_ENC, SUBU_S_QB_DESC; -def ADDQ_PH : ADDQ_PH_ENC, ADDQ_PH_DESC; -def ADDQ_S_PH : ADDQ_S_PH_ENC, ADDQ_S_PH_DESC; -def SUBQ_PH : SUBQ_PH_ENC, SUBQ_PH_DESC; -def SUBQ_S_PH : SUBQ_S_PH_ENC, SUBQ_S_PH_DESC; -def ADDQ_S_W : ADDQ_S_W_ENC, ADDQ_S_W_DESC; -def SUBQ_S_W : SUBQ_S_W_ENC, SUBQ_S_W_DESC; -def ADDSC : ADDSC_ENC, ADDSC_DESC; -def ADDWC : ADDWC_ENC, ADDWC_DESC; +def ADDU_QB : DspMMRel, ADDU_QB_ENC, ADDU_QB_DESC; +def ADDU_S_QB : DspMMRel, ADDU_S_QB_ENC, ADDU_S_QB_DESC; +def SUBU_QB : DspMMRel, SUBU_QB_ENC, SUBU_QB_DESC; +def SUBU_S_QB : DspMMRel, SUBU_S_QB_ENC, SUBU_S_QB_DESC; +def ADDQ_PH : DspMMRel, ADDQ_PH_ENC, ADDQ_PH_DESC; +def ADDQ_S_PH : DspMMRel, ADDQ_S_PH_ENC, ADDQ_S_PH_DESC; +def SUBQ_PH : DspMMRel, SUBQ_PH_ENC, SUBQ_PH_DESC; +def SUBQ_S_PH : DspMMRel, SUBQ_S_PH_ENC, SUBQ_S_PH_DESC; +def ADDQ_S_W : DspMMRel, ADDQ_S_W_ENC, ADDQ_S_W_DESC; +def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC; +def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC; +def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC; def MODSUB : MODSUB_ENC, MODSUB_DESC; -def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC; -def ABSQ_S_PH : ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; -def ABSQ_S_W : ABSQ_S_W_ENC, ABSQ_S_W_DESC; -def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; -def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC; -def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; -def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; -def PRECEQ_W_PHL : PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; -def PRECEQ_W_PHR : PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; -def PRECEQU_PH_QBL : PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; -def PRECEQU_PH_QBR : PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC; -def PRECEQU_PH_QBLA : PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC; -def PRECEQU_PH_QBRA : PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC; -def PRECEU_PH_QBL : PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC; -def PRECEU_PH_QBR : PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC; -def PRECEU_PH_QBLA : PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC; -def PRECEU_PH_QBRA : PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC; -def SHLL_QB : SHLL_QB_ENC, SHLL_QB_DESC; -def SHLLV_QB : SHLLV_QB_ENC, SHLLV_QB_DESC; -def SHRL_QB : SHRL_QB_ENC, SHRL_QB_DESC; -def SHRLV_QB : SHRLV_QB_ENC, SHRLV_QB_DESC; -def SHLL_PH : SHLL_PH_ENC, SHLL_PH_DESC; -def SHLLV_PH : SHLLV_PH_ENC, SHLLV_PH_DESC; -def SHLL_S_PH : SHLL_S_PH_ENC, SHLL_S_PH_DESC; -def SHLLV_S_PH : SHLLV_S_PH_ENC, SHLLV_S_PH_DESC; -def SHRA_PH : SHRA_PH_ENC, SHRA_PH_DESC; -def SHRAV_PH : SHRAV_PH_ENC, SHRAV_PH_DESC; -def SHRA_R_PH : SHRA_R_PH_ENC, SHRA_R_PH_DESC; -def SHRAV_R_PH : SHRAV_R_PH_ENC, SHRAV_R_PH_DESC; -def SHLL_S_W : SHLL_S_W_ENC, SHLL_S_W_DESC; -def SHLLV_S_W : SHLLV_S_W_ENC, SHLLV_S_W_DESC; -def SHRA_R_W : SHRA_R_W_ENC, SHRA_R_W_DESC; -def SHRAV_R_W : SHRAV_R_W_ENC, SHRAV_R_W_DESC; -def MULEU_S_PH_QBL : MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC; -def MULEU_S_PH_QBR : MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC; -def MULEQ_S_W_PHL : MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; -def MULEQ_S_W_PHR : MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; -def MULQ_RS_PH : MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; +def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC; +def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; +def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC; +def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; +def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, 
PRECRQ_PH_W_DESC; +def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; +def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; +def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; +def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; +def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; +def PRECEQU_PH_QBR : DspMMRel, PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC; +def PRECEQU_PH_QBLA : DspMMRel, PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC; +def PRECEQU_PH_QBRA : DspMMRel, PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC; +def PRECEU_PH_QBL : DspMMRel, PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC; +def PRECEU_PH_QBR : DspMMRel, PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC; +def PRECEU_PH_QBLA : DspMMRel, PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC; +def PRECEU_PH_QBRA : DspMMRel, PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC; +def SHLL_QB : DspMMRel, SHLL_QB_ENC, SHLL_QB_DESC; +def SHLLV_QB : DspMMRel, SHLLV_QB_ENC, SHLLV_QB_DESC; +def SHRL_QB : DspMMRel, SHRL_QB_ENC, SHRL_QB_DESC; +def SHRLV_QB : DspMMRel, SHRLV_QB_ENC, SHRLV_QB_DESC; +def SHLL_PH : DspMMRel, SHLL_PH_ENC, SHLL_PH_DESC; +def SHLLV_PH : DspMMRel, SHLLV_PH_ENC, SHLLV_PH_DESC; +def SHLL_S_PH : DspMMRel, SHLL_S_PH_ENC, SHLL_S_PH_DESC; +def SHLLV_S_PH : DspMMRel, SHLLV_S_PH_ENC, SHLLV_S_PH_DESC; +def SHRA_PH : DspMMRel, SHRA_PH_ENC, SHRA_PH_DESC; +def SHRAV_PH : DspMMRel, SHRAV_PH_ENC, SHRAV_PH_DESC; +def SHRA_R_PH : DspMMRel, SHRA_R_PH_ENC, SHRA_R_PH_DESC; +def SHRAV_R_PH : DspMMRel, SHRAV_R_PH_ENC, SHRAV_R_PH_DESC; +def SHLL_S_W : DspMMRel, SHLL_S_W_ENC, SHLL_S_W_DESC; +def SHLLV_S_W : DspMMRel, SHLLV_S_W_ENC, SHLLV_S_W_DESC; +def SHRA_R_W : DspMMRel, SHRA_R_W_ENC, SHRA_R_W_DESC; +def SHRAV_R_W : DspMMRel, SHRAV_R_W_ENC, SHRAV_R_W_DESC; +def MULEU_S_PH_QBL : DspMMRel, MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC; +def MULEU_S_PH_QBR : DspMMRel, MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC; +def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; +def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; +def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC; -def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; -def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; -def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; -def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; -def MFHI_DSP : MFHI_ENC, MFHI_DESC; -def MFLO_DSP : MFLO_ENC, MFLO_DESC; -def MTHI_DSP : MTHI_ENC, MTHI_DESC; -def MTLO_DSP : MTLO_ENC, MTLO_DESC; -def DPAU_H_QBL : DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; -def DPAU_H_QBR : DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; -def DPSU_H_QBL : DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; -def DPSU_H_QBR : DPSU_H_QBR_ENC, DPSU_H_QBR_DESC; -def DPAQ_S_W_PH : DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC; -def DPSQ_S_W_PH : DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC; -def DPAQ_SA_L_W : DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC; -def DPSQ_SA_L_W : DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC; -def MULT_DSP : MULT_DSP_ENC, MULT_DSP_DESC; -def MULTU_DSP : MULTU_DSP_ENC, MULTU_DSP_DESC; -def MADD_DSP : MADD_DSP_ENC, MADD_DSP_DESC; -def MADDU_DSP : MADDU_DSP_ENC, MADDU_DSP_DESC; -def MSUB_DSP : MSUB_DSP_ENC, MSUB_DSP_DESC; -def MSUBU_DSP : MSUBU_DSP_ENC, MSUBU_DSP_DESC; +def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; +def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP : DspMMRel, MFHI_ENC, 
MFHI_DESC; +def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC; +def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC; +def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC; +def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; +def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; +def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; +def DPSU_H_QBR : DspMMRel, DPSU_H_QBR_ENC, DPSU_H_QBR_DESC; +def DPAQ_S_W_PH : DspMMRel, DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC; +def DPSQ_S_W_PH : DspMMRel, DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC; +def DPAQ_SA_L_W : DspMMRel, DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC; +def DPSQ_SA_L_W : DspMMRel, DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC; +def MULT_DSP : DspMMRel, MULT_DSP_ENC, MULT_DSP_DESC; +def MULTU_DSP : DspMMRel, MULTU_DSP_ENC, MULTU_DSP_DESC; +def MADD_DSP : DspMMRel, MADD_DSP_ENC, MADD_DSP_DESC; +def MADDU_DSP : DspMMRel, MADDU_DSP_ENC, MADDU_DSP_DESC; +def MSUB_DSP : DspMMRel, MSUB_DSP_ENC, MSUB_DSP_DESC; +def MSUBU_DSP : DspMMRel, MSUBU_DSP_ENC, MSUBU_DSP_DESC; def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC; def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC; def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC; @@ -1156,87 +1187,85 @@ def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC; def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC; def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC; def BITREV : BITREV_ENC, BITREV_DESC; -def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC; -def REPL_QB : REPL_QB_ENC, REPL_QB_DESC; -def REPL_PH : REPL_PH_ENC, REPL_PH_DESC; -def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC; -def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC; -def PICK_QB : PICK_QB_ENC, PICK_QB_DESC; -def PICK_PH : PICK_PH_ENC, PICK_PH_DESC; -def LWX : LWX_ENC, LWX_DESC; -def LHX : LHX_ENC, LHX_DESC; -def LBUX : LBUX_ENC, LBUX_DESC; +def PACKRL_PH : DspMMRel, PACKRL_PH_ENC, PACKRL_PH_DESC; +def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC; +def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC; +def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC; +def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC; +def PICK_QB : DspMMRel, PICK_QB_ENC, PICK_QB_DESC; +def PICK_PH : DspMMRel, PICK_PH_ENC, PICK_PH_DESC; +def LWX : DspMMRel, LWX_ENC, LWX_DESC; +def LHX : DspMMRel, LHX_ENC, LHX_DESC; +def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC; def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC; -def INSV : INSV_ENC, INSV_DESC; -def EXTP : EXTP_ENC, EXTP_DESC; -def EXTPV : EXTPV_ENC, EXTPV_DESC; -def EXTPDP : EXTPDP_ENC, EXTPDP_DESC; -def EXTPDPV : EXTPDPV_ENC, EXTPDPV_DESC; -def EXTR_W : EXTR_W_ENC, EXTR_W_DESC; -def EXTRV_W : EXTRV_W_ENC, EXTRV_W_DESC; -def EXTR_R_W : EXTR_R_W_ENC, EXTR_R_W_DESC; -def EXTRV_R_W : EXTRV_R_W_ENC, EXTRV_R_W_DESC; -def EXTR_RS_W : EXTR_RS_W_ENC, EXTR_RS_W_DESC; -def EXTRV_RS_W : EXTRV_RS_W_ENC, EXTRV_RS_W_DESC; -def EXTR_S_H : EXTR_S_H_ENC, EXTR_S_H_DESC; -def EXTRV_S_H : EXTRV_S_H_ENC, EXTRV_S_H_DESC; -def SHILO : SHILO_ENC, SHILO_DESC; -def SHILOV : SHILOV_ENC, SHILOV_DESC; -def MTHLIP : MTHLIP_ENC, MTHLIP_DESC; -def RDDSP : RDDSP_ENC, RDDSP_DESC; -def WRDSP : WRDSP_ENC, WRDSP_DESC; +def INSV : DspMMRel, INSV_ENC, INSV_DESC; +def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC; +def EXTPV : DspMMRel, EXTPV_ENC, EXTPV_DESC; +def EXTPDP : DspMMRel, EXTPDP_ENC, EXTPDP_DESC; +def EXTPDPV : DspMMRel, EXTPDPV_ENC, EXTPDPV_DESC; +def EXTR_W : DspMMRel, EXTR_W_ENC, EXTR_W_DESC; +def EXTRV_W : DspMMRel, EXTRV_W_ENC, EXTRV_W_DESC; +def EXTR_R_W : DspMMRel, EXTR_R_W_ENC, EXTR_R_W_DESC; +def EXTRV_R_W : DspMMRel, EXTRV_R_W_ENC, EXTRV_R_W_DESC; +def EXTR_RS_W : DspMMRel, EXTR_RS_W_ENC, EXTR_RS_W_DESC; +def EXTRV_RS_W : DspMMRel, 
EXTRV_RS_W_ENC, EXTRV_RS_W_DESC; +def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC; +def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC; +def SHILO : DspMMRel, SHILO_ENC, SHILO_DESC; +def SHILOV : DspMMRel, SHILOV_ENC, SHILOV_DESC; +def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC; +def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC; +let AdditionalPredicates = [NotInMicroMips] in { + def WRDSP : WRDSP_ENC, WRDSP_DESC; +} // MIPS DSP Rev 2 -let Predicates = [HasDSPR2] in { - -def ADDU_PH : ADDU_PH_ENC, ADDU_PH_DESC; -def ADDU_S_PH : ADDU_S_PH_ENC, ADDU_S_PH_DESC; -def SUBU_PH : SUBU_PH_ENC, SUBU_PH_DESC; -def SUBU_S_PH : SUBU_S_PH_ENC, SUBU_S_PH_DESC; -def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC; -def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC; -def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC; -def ABSQ_S_QB : ABSQ_S_QB_ENC, ABSQ_S_QB_DESC; -def ADDUH_QB : ADDUH_QB_ENC, ADDUH_QB_DESC; -def ADDUH_R_QB : ADDUH_R_QB_ENC, ADDUH_R_QB_DESC; -def SUBUH_QB : SUBUH_QB_ENC, SUBUH_QB_DESC; -def SUBUH_R_QB : SUBUH_R_QB_ENC, SUBUH_R_QB_DESC; -def ADDQH_PH : ADDQH_PH_ENC, ADDQH_PH_DESC; -def ADDQH_R_PH : ADDQH_R_PH_ENC, ADDQH_R_PH_DESC; -def SUBQH_PH : SUBQH_PH_ENC, SUBQH_PH_DESC; -def SUBQH_R_PH : SUBQH_R_PH_ENC, SUBQH_R_PH_DESC; -def ADDQH_W : ADDQH_W_ENC, ADDQH_W_DESC; -def ADDQH_R_W : ADDQH_R_W_ENC, ADDQH_R_W_DESC; -def SUBQH_W : SUBQH_W_ENC, SUBQH_W_DESC; -def SUBQH_R_W : SUBQH_R_W_ENC, SUBQH_R_W_DESC; -def MUL_PH : MUL_PH_ENC, MUL_PH_DESC; -def MUL_S_PH : MUL_S_PH_ENC, MUL_S_PH_DESC; -def MULQ_S_W : MULQ_S_W_ENC, MULQ_S_W_DESC; -def MULQ_RS_W : MULQ_RS_W_ENC, MULQ_RS_W_DESC; -def MULQ_S_PH : MULQ_S_PH_ENC, MULQ_S_PH_DESC; -def DPA_W_PH : DPA_W_PH_ENC, DPA_W_PH_DESC; -def DPS_W_PH : DPS_W_PH_ENC, DPS_W_PH_DESC; -def DPAQX_S_W_PH : DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC; -def DPAQX_SA_W_PH : DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC; -def DPAX_W_PH : DPAX_W_PH_ENC, DPAX_W_PH_DESC; -def DPSX_W_PH : DPSX_W_PH_ENC, DPSX_W_PH_DESC; -def DPSQX_S_W_PH : DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC; -def DPSQX_SA_W_PH : DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC; -def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC; -def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC; -def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC; -def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC; -def SHRA_QB : SHRA_QB_ENC, SHRA_QB_DESC; -def SHRAV_QB : SHRAV_QB_ENC, SHRAV_QB_DESC; -def SHRA_R_QB : SHRA_R_QB_ENC, SHRA_R_QB_DESC; -def SHRAV_R_QB : SHRAV_R_QB_ENC, SHRAV_R_QB_DESC; -def SHRL_PH : SHRL_PH_ENC, SHRL_PH_DESC; -def SHRLV_PH : SHRLV_PH_ENC, SHRLV_PH_DESC; -def APPEND : APPEND_ENC, APPEND_DESC; -def BALIGN : BALIGN_ENC, BALIGN_DESC; -def PREPEND : PREPEND_ENC, PREPEND_DESC; - -} +def ADDU_PH : DspMMRel, ADDU_PH_ENC, ADDU_PH_DESC, ISA_DSPR2; +def ADDU_S_PH : DspMMRel, ADDU_S_PH_ENC, ADDU_S_PH_DESC, ISA_DSPR2; +def SUBU_PH : DspMMRel, SUBU_PH_ENC, SUBU_PH_DESC, ISA_DSPR2; +def SUBU_S_PH : DspMMRel, SUBU_S_PH_ENC, SUBU_S_PH_DESC, ISA_DSPR2; +def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2; +def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2; +def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2; +def ABSQ_S_QB : DspMMRel, ABSQ_S_QB_ENC, ABSQ_S_QB_DESC, ISA_DSPR2; +def ADDUH_QB : DspMMRel, ADDUH_QB_ENC, ADDUH_QB_DESC, ISA_DSPR2; +def ADDUH_R_QB : DspMMRel, ADDUH_R_QB_ENC, ADDUH_R_QB_DESC, ISA_DSPR2; +def SUBUH_QB : DspMMRel, SUBUH_QB_ENC, SUBUH_QB_DESC, ISA_DSPR2; +def SUBUH_R_QB : DspMMRel, SUBUH_R_QB_ENC, SUBUH_R_QB_DESC, ISA_DSPR2; +def ADDQH_PH : 
DspMMRel, ADDQH_PH_ENC, ADDQH_PH_DESC, ISA_DSPR2; +def ADDQH_R_PH : DspMMRel, ADDQH_R_PH_ENC, ADDQH_R_PH_DESC, ISA_DSPR2; +def SUBQH_PH : DspMMRel, SUBQH_PH_ENC, SUBQH_PH_DESC, ISA_DSPR2; +def SUBQH_R_PH : DspMMRel, SUBQH_R_PH_ENC, SUBQH_R_PH_DESC, ISA_DSPR2; +def ADDQH_W : DspMMRel, ADDQH_W_ENC, ADDQH_W_DESC, ISA_DSPR2; +def ADDQH_R_W : DspMMRel, ADDQH_R_W_ENC, ADDQH_R_W_DESC, ISA_DSPR2; +def SUBQH_W : DspMMRel, SUBQH_W_ENC, SUBQH_W_DESC, ISA_DSPR2; +def SUBQH_R_W : DspMMRel, SUBQH_R_W_ENC, SUBQH_R_W_DESC, ISA_DSPR2; +def MUL_PH : DspMMRel, MUL_PH_ENC, MUL_PH_DESC, ISA_DSPR2; +def MUL_S_PH : DspMMRel, MUL_S_PH_ENC, MUL_S_PH_DESC, ISA_DSPR2; +def MULQ_S_W : DspMMRel, MULQ_S_W_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def MULQ_RS_W : DspMMRel, MULQ_RS_W_ENC, MULQ_RS_W_DESC, ISA_DSPR2; +def MULQ_S_PH : DspMMRel, MULQ_S_PH_ENC, MULQ_S_PH_DESC, ISA_DSPR2; +def DPA_W_PH : DspMMRel, DPA_W_PH_ENC, DPA_W_PH_DESC, ISA_DSPR2; +def DPS_W_PH : DspMMRel, DPS_W_PH_ENC, DPS_W_PH_DESC, ISA_DSPR2; +def DPAQX_S_W_PH : DspMMRel, DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC, ISA_DSPR2; +def DPAQX_SA_W_PH : DspMMRel, DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC, ISA_DSPR2; +def DPAX_W_PH : DspMMRel, DPAX_W_PH_ENC, DPAX_W_PH_DESC, ISA_DSPR2; +def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC, ISA_DSPR2; +def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC, ISA_DSPR2; +def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC, ISA_DSPR2; +def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2; +def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC, ISA_DSPR2; +def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC, ISA_DSPR2; +def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC, ISA_DSPR2; +def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC, ISA_DSPR2; +def SHRAV_R_QB : DspMMRel, SHRAV_R_QB_ENC, SHRAV_R_QB_DESC, ISA_DSPR2; +def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC, ISA_DSPR2; +def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC, ISA_DSPR2; +def APPEND : APPEND_ENC, APPEND_DESC, ISA_DSPR2; +def BALIGN : BALIGN_ENC, BALIGN_DESC, ISA_DSPR2; +def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC, ISA_DSPR2; // Pseudos. let isPseudo = 1, isCodeGenOnly = 1 in { @@ -1415,3 +1444,8 @@ let AddedComplexity = 20 in { def : IndexedLoadPat<sextloadi16, LHX>; def : IndexedLoadPat<load, LWX>; } + +// Instruction alias. 
+let AdditionalPredicates = [NotInMicroMips] in { + def : DSPInstAlias<"wrdsp $rt", (WRDSP GPR32Opnd:$rt, 0x1F), 1>; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 4faeb33..8313d90 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -355,9 +355,8 @@ void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB, for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) if (*SI != &SuccBB) - for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(), - LE = (*SI)->livein_end(); LI != LE; ++LI) - Uses.set(*LI); + for (const auto &LI : (*SI)->liveins()) + Uses.set(LI.PhysReg); } bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) { @@ -431,7 +430,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) { (*MI.memoperands_begin())->getPseudoValue()) { if (isa<FixedStackPseudoSourceValue>(PSV)) return false; - return !PSV->isConstant(nullptr) && PSV != PseudoSourceValue::getStack(); + return !PSV->isConstant(nullptr) && !PSV->isStack(); } return true; @@ -598,7 +597,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { // Get instruction with delay slot. MachineBasicBlock::instr_iterator DSI(I); - if (InMicroMipsMode && TII->GetInstSizeInBytes(std::next(DSI)) == 2 && + if (InMicroMipsMode && TII->GetInstSizeInBytes(&*std::next(DSI)) == 2 && DSI->isCall()) { // If instruction in delay slot is 16b change opcode to // corresponding instruction with short delay slot. @@ -713,8 +712,9 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const { if (DisableBackwardSearch) return false; - RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo()); - MemDefsUses MemDU(*TM.getDataLayout(), MBB.getParent()->getFrameInfo()); + auto *Fn = MBB.getParent(); + RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo()); + MemDefsUses MemDU(Fn->getDataLayout(), Fn->getFrameInfo()); ReverseIter Filler; RegDU.init(*Slot); @@ -763,6 +763,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { BB2BrMap BrMap; std::unique_ptr<InspectMemInstr> IM; Iter Filler; + auto *Fn = MBB.getParent(); // Iterate over SuccBB's predecessor list. for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(), @@ -772,15 +773,15 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { // Do not allow moving instructions which have unallocatable register operands // across basic block boundaries. - RegDU.setUnallocatableRegs(*MBB.getParent()); + RegDU.setUnallocatableRegs(*Fn); // Only allow moving loads from stack or constants if any of the SuccBB's // predecessors have multiple successors. if (HasMultipleSuccs) { IM.reset(new LoadFromStackOrConst()); } else { - const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo(); - IM.reset(new MemDefsUses(*TM.getDataLayout(), MFI)); + const MachineFrameInfo *MFI = Fn->getFrameInfo(); + IM.reset(new MemDefsUses(Fn->getDataLayout(), MFI)); } if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot, @@ -800,12 +801,13 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const { // Select the successor with the largest edge weight.
auto &Prob = getAnalysis<MachineBranchProbabilityInfo>(); - MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(), - [&](const MachineBasicBlock *Dst0, - const MachineBasicBlock *Dst1) { - return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1); - }); - return S->isLandingPad() ? nullptr : S; + MachineBasicBlock *S = *std::max_element( + B.succ_begin(), B.succ_end(), + [&](const MachineBasicBlock *Dst0, const MachineBasicBlock *Dst1) { + return Prob.getEdgeProbability(&B, Dst0) < + Prob.getEdgeProbability(&B, Dst1); + }); + return S->isEHPad() ? nullptr : S; } std::pair<MipsInstrInfo::BranchType, MachineInstr *> diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td new file mode 100644 index 0000000..11e191a --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td @@ -0,0 +1,84 @@ +//===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips EVA instruction formats. +// +//===----------------------------------------------------------------------===// + +class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl, StdArch { + let DecoderNamespace = "Mips"; + let EncodingPredicates = [HasStdEnc]; +} + +//===----------------------------------------------------------------------===// +// +// Field Values +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA +def OPCODE6_LBE : OPCODE6<0b101100>; +def OPCODE6_LBuE : OPCODE6<0b101000>; +def OPCODE6_LHE : OPCODE6<0b101101>; +def OPCODE6_LHuE : OPCODE6<0b101001>; +def OPCODE6_LWE : OPCODE6<0b101111>; + +def OPCODE6_SBE : OPCODE6<0b011100>; +def OPCODE6_SHE : OPCODE6<0b011101>; +def OPCODE6_SWE : OPCODE6<0b011111>; + +// load/store left/right EVA +def OPCODE6_LWLE : OPCODE6<0b011001>; +def OPCODE6_LWRE : OPCODE6<0b011010>; +def OPCODE6_SWLE : OPCODE6<0b100001>; +def OPCODE6_SWRE : OPCODE6<0b100010>; + +// Load-linked EVA, Store-conditional EVA +def OPCODE6_LLE : OPCODE6<0b101110>; +def OPCODE6_SCE : OPCODE6<0b011110>; + +def OPCODE6_TLBINV : OPCODE6<0b000011>; +def OPCODE6_TLBINVF : OPCODE6<0b000100>; + +def OPCODE6_CACHEE : OPCODE6<0b011011>; +def OPCODE6_PREFE : OPCODE6<0b100011>; + +def OPGROUP_COP0 : OPGROUP<0b010000>; + +//===----------------------------------------------------------------------===// +// +// Encoding Formats +// +//===----------------------------------------------------------------------===// + +class SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6 Operation> : MipsEVAInst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = base; + let Inst{20-16} = hint; + let Inst{15-7} = offset; + let Inst{6} = 0; + let Inst{5-0} = Operation.Value; +} + +class TLB_FM<OPCODE6 Operation> : MipsEVAInst { + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP0.Value; + let Inst{25} = 1; // CO + let Inst{24-6} = 0; + let Inst{5-0} = Operation.Value; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td new file mode 100644 index 0000000..36c9694 --- /dev/null +++
b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td @@ -0,0 +1,192 @@ +//===- MipsEVAInstrInfo.td - EVA ASE instructions ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips EVA ASE instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Instruction encodings +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA encodings +class LBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBE>; +class LBuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBuE>; +class LHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHE>; +class LHuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHuE>; +class LWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWE>; + +class SBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SBE>; +class SHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SHE>; +class SWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWE>; + +// load/store left/right EVA encodings +class LWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWLE>; +class LWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWRE>; +class SWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWLE>; +class SWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWRE>; + +// Load-linked EVA, Store-conditional EVA encodings +class LLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LLE>; +class SCE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SCE>; + +class TLBINV_ENC : TLB_FM<OPCODE6_TLBINV>; +class TLBINVF_ENC : TLB_FM<OPCODE6_TLBINVF>; + +class CACHEE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_CACHEE>; +class PREFE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_PREFE>; + +//===----------------------------------------------------------------------===// +// +// Instruction descriptions +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA descriptions +class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + bit canFoldAsLoad = 1; + bit mayLoad = 1; +} + +class LBE_DESC : LOAD_EVA_DESC_BASE<"lbe", GPR32Opnd>; +class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd>; +class LHE_DESC : LOAD_EVA_DESC_BASE<"lhe", GPR32Opnd>; +class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd>; +class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd>; + +class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + SDPatternOperator OpNode = null_frag> { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + bit mayStore = 1; +} + +class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd>; +class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd>; +class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd>; + +// Load/Store Left/Right EVA descriptions +class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr, GPROpnd:$src); + string
AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + string Constraints = "$src = $rt"; + bit canFoldAsLoad = 1; +} + +class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd>; +class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd>; + +class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; +} + +class SWLE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd>; +class SWRE_DESC : STORE_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd>; + +// Load-linked EVA, Store-conditional EVA descriptions +class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayLoad = 1; + string DecoderMethod = "DecodeMemEVA"; +} + +class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd>; + +class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$dst); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayStore = 1; + string Constraints = "$rt = $dst"; + string DecoderMethod = "DecodeMemEVA"; +} + +class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd>; + +class TLB_DESC_BASE<string instr_asm> { + dag OutOperandList = (outs); + dag InOperandList = (ins); + string AsmString = instr_asm; + list<dag> Pattern = []; +} + +class TLBINV_DESC : TLB_DESC_BASE<"tlbinv">; +class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf">; + +class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); + string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeCacheeOp_CacheOpR6"; +} + +class CACHEE_DESC : CACHEE_DESC_BASE<"cachee", mem>; +class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem>; + +//===----------------------------------------------------------------------===// +// +// Instruction definitions +// +//===----------------------------------------------------------------------===// + +/// Load and Store EVA Instructions +def LBE : LBE_ENC, LBE_DESC, INSN_EVA; +def LBuE : LBuE_ENC, LBuE_DESC, INSN_EVA; +def LHE : LHE_ENC, LHE_DESC, INSN_EVA; +def LHuE : LHuE_ENC, LHuE_DESC, INSN_EVA; +let AdditionalPredicates = [NotInMicroMips] in { +def LWE : LWE_ENC, LWE_DESC, INSN_EVA; +} +def SBE : SBE_ENC, SBE_DESC, INSN_EVA; +def SHE : SHE_ENC, SHE_DESC, INSN_EVA; +let AdditionalPredicates = [NotInMicroMips] in { +def SWE : SWE_ENC, SWE_DESC, INSN_EVA; +} + +/// load/store left/right EVA +let AdditionalPredicates = [NotInMicroMips] in { +def LWLE : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6; +def LWRE : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6; +def SWLE : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6; +def SWRE : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6; +} + +/// Load-linked EVA, Store-conditional EVA +let AdditionalPredicates = [NotInMicroMips] in { +def LLE : LLE_ENC, LLE_DESC, INSN_EVA; +def SCE : SCE_ENC, SCE_DESC, INSN_EVA; +} + +def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA; +def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA; + +def CACHEE : CACHEE_ENC,
CACHEE_DESC, INSN_EVA; +def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA; diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp index 5152a07..e9eaf81 100644 --- a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -192,10 +192,10 @@ public: TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) { MFI = funcInfo.MF->getInfo<MipsFunctionInfo>(); Context = &funcInfo.Fn->getContext(); + bool ISASupported = !Subtarget->hasMips32r6() && Subtarget->hasMips32(); TargetSupported = - ((TM.getRelocationModel() == Reloc::PIC_) && - ((Subtarget->hasMips32r2() || Subtarget->hasMips32()) && - (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32()))); + ISASupported && (TM.getRelocationModel() == Reloc::PIC_) && + (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32()); UnsupportedFPMode = Subtarget->isFP64bit(); } @@ -236,32 +236,36 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, std::swap(LHS, RHS); unsigned Opc; - if (ISDOpc == ISD::AND) { + switch (ISDOpc) { + case ISD::AND: Opc = Mips::AND; - } else if (ISDOpc == ISD::OR) { + break; + case ISD::OR: Opc = Mips::OR; - } else if (ISDOpc == ISD::XOR) { + break; + case ISD::XOR: Opc = Mips::XOR; - } else + break; + default: llvm_unreachable("unexpected opcode"); + } unsigned LHSReg = getRegForValue(LHS); - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); - if (!ResultReg) - return 0; - - unsigned RHSReg; if (!LHSReg) return 0; + unsigned RHSReg; if (const auto *C = dyn_cast<ConstantInt>(RHS)) RHSReg = materializeInt(C, MVT::i32); else RHSReg = getRegForValue(RHS); - if (!RHSReg) return 0; + unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + if (!ResultReg) + return 0; + emitInst(Opc, ResultReg).addReg(LHSReg).addReg(RHSReg); return ResultReg; } @@ -747,7 +751,7 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr, unsigned Offset = Addr.getOffset(); MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addFrameIndex(FI) @@ -798,7 +802,7 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr, unsigned Offset = Addr.getOffset(); MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) .addReg(SrcReg) @@ -912,8 +916,7 @@ bool MipsFastISel::selectBranch(const Instruction *I) { BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ)) .addReg(CondReg) .addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } return false; @@ -1057,22 +1060,16 @@ bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) { // entirely within FPRs. unsigned DestReg = createResultReg(&Mips::GPR32RegClass); unsigned TempReg = createResultReg(&Mips::FGR32RegClass); - unsigned Opc; - - if (SrcVT == MVT::f32) - Opc = Mips::TRUNC_W_S; - else - Opc = Mips::TRUNC_W_D32; + unsigned Opc = (SrcVT == MVT::f32) ? 
Mips::TRUNC_W_S : Mips::TRUNC_W_D32; // Generate the convert. emitInst(Opc, TempReg).addReg(SrcReg); - emitInst(Mips::MFC1, DestReg).addReg(TempReg); updateValueMap(I, DestReg); return true; } -// + bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &OutVTs, unsigned &NumBytes) { @@ -1196,7 +1193,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(Addr.getOffset()), + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); (void)(MMO); // if (!emitStore(ArgVT, ArgReg, Addr, MMO)) @@ -1607,19 +1604,23 @@ bool MipsFastISel::emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool MipsFastISel::emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg) { + int64_t Imm; + switch (SrcVT.SimpleTy) { default: return false; case MVT::i1: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(1); + Imm = 1; break; case MVT::i8: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xff); + Imm = 0xff; break; case MVT::i16: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xffff); + Imm = 0xffff; break; } + + emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(Imm); return true; } diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp index fab2fdf..5680130 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -117,6 +117,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::GPRel: return "MipsISD::GPRel"; case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer"; case MipsISD::Ret: return "MipsISD::Ret"; + case MipsISD::ERet: return "MipsISD::ERet"; case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN"; case MipsISD::FPBrcond: return "MipsISD::FPBrcond"; case MipsISD::FPCmp: return "MipsISD::FPCmp"; @@ -276,8 +277,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); @@ -326,6 +325,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); @@ -390,10 +391,10 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + if 
(!Subtarget.isGP64bit()) { + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + } setInsertFencesForAtomic(true); @@ -437,9 +438,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP); - setExceptionPointerRegister(ABI.IsN64() ? Mips::A0_64 : Mips::A0); - setExceptionSelectorRegister(ABI.IsN64() ? Mips::A1_64 : Mips::A1); - MaxStoresPerMemcpy = 16; isMicroMips = Subtarget.inMicroMipsMode(); @@ -836,6 +834,14 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return SDValue(); } +bool MipsTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget.hasMips32(); +} + +bool MipsTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget.hasMips32(); +} + void MipsTargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -866,7 +872,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::GlobalTLSAddress: return lowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return lowerJumpTable(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); - case ISD::SELECT_CC: return lowerSELECT_CC(Op, DAG); case ISD::SETCC: return lowerSETCC(Op, DAG); case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::VAARG: return lowerVAARG(Op, DAG); @@ -1092,8 +1097,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loopMBB); MF->insert(It, exitMBB); @@ -1204,8 +1208,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loopMBB); MF->insert(It, sinkMBB); MF->insert(It, exitMBB); @@ -1330,15 +1333,20 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI, DebugLoc DL = MI->getDebugLoc(); unsigned LL, SC, ZERO, BNE, BEQ; - if (Size == 4) { - LL = isMicroMips ? Mips::LL_MM : Mips::LL; - SC = isMicroMips ? Mips::SC_MM : Mips::SC; + if (Size == 4) { + if (isMicroMips) { + LL = Mips::LL_MM; + SC = Mips::SC_MM; + } else { + LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL; + SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC; + } ZERO = Mips::ZERO; BNE = Mips::BNE; BEQ = Mips::BEQ; } else { - LL = Mips::LLD; - SC = Mips::SCD; + LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD; + SC = Subtarget.hasMips64r6() ? 
Mips::SCD_R6 : Mips::SCD; ZERO = Mips::ZERO_64; BNE = Mips::BNE64; BEQ = Mips::BEQ64; @@ -1356,8 +1364,7 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, exitMBB); @@ -1440,8 +1447,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI, MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, sinkMBB); @@ -1586,9 +1592,10 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); - Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr, - MachinePointerInfo::getJumpTable(), MemVT, false, false, - false, 0); + Addr = + DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + MemVT, false, false, false, 0); Chain = Addr.getValue(1); if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || ABI.IsN64()) { @@ -1640,20 +1647,6 @@ lowerSELECT(SDValue Op, SelectionDAG &DAG) const SDLoc(Op)); } -SDValue MipsTargetLowering:: -lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const -{ - SDLoc DL(Op); - EVT Ty = Op.getOperand(0).getValueType(); - SDValue Cond = - DAG.getNode(ISD::SETCC, DL, getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), Ty), - Op.getOperand(0), Op.getOperand(1), Op.getOperand(4)); - - return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), - Op.getOperand(3)); -} - SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op); @@ -1690,14 +1683,15 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64()); if (LargeGOT) - return getAddrGlobalLargeGOT(N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, - MipsII::MO_GOT_LO16, DAG.getEntryNode(), - MachinePointerInfo::getGOT()); + return getAddrGlobalLargeGOT( + N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16, + DAG.getEntryNode(), + MachinePointerInfo::getGOT(DAG.getMachineFunction())); - return getAddrGlobal(N, SDLoc(N), Ty, DAG, - (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP - : MipsII::MO_GOT16, - DAG.getEntryNode(), MachinePointerInfo::getGOT()); + return getAddrGlobal( + N, SDLoc(N), Ty, DAG, + (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16, + DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction())); } SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op, @@ -1719,6 +1713,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const // Local Exec TLS Model. 
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -1813,7 +1810,8 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const static_cast<const MipsTargetObjectFile *>( getTargetMachine().getObjFileLowering()); - if (TLOF->IsConstantInSmallSection(N->getConstVal(), getTargetMachine())) + if (TLOF->IsConstantInSmallSection(DAG.getDataLayout(), N->getConstVal(), + getTargetMachine())) // %gp_rel relocation return getAddrGPRel(N, SDLoc(N), Ty, DAG); @@ -2946,8 +2944,12 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - Function::const_arg_iterator FuncArg = - DAG.getMachineFunction().getFunction()->arg_begin(); + const Function *Func = DAG.getMachineFunction().getFunction(); + Function::const_arg_iterator FuncArg = Func->arg_begin(); + + if (Func->hasFnAttribute("interrupt") && !Func->arg_empty()) + report_fatal_error( + "Functions with the interrupt attribute cannot have arguments!"); CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg); MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(), @@ -3019,7 +3021,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // We ought to be able to use LocVT directly but O32 sets it to i32 // when allocating floating point values to integer registers. // This shouldn't influence how we load the value into registers unless - // we are targetting softfloat. + // we are targeting softfloat. if (VA.getValVT().isFloatingPoint() && !Subtarget.useSoftFloat()) LocVT = VA.getValVT(); } @@ -3033,9 +3035,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue ArgValue = DAG.getLoad( + LocVT, DL, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); OutChains.push_back(ArgValue.getValue(1)); ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG); @@ -3098,8 +3101,20 @@ MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const } SDValue -MipsTargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool IsVarArg, +MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, + SDLoc DL, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + + MipsFI->setISR(); + + return DAG.getNode(MipsISD::ERet, DL, MVT::Other, RetOps); +} + +SDValue +MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, SelectionDAG &DAG) const { @@ -3192,7 +3207,11 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - // Return on Mips is always a "jr $ra" + // ISRs must use "eret". 
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt")) + return LowerInterruptReturn(RetOps, DL, DAG); + + // Standard return on Mips is a "jr $ra" return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps); } @@ -3300,7 +3319,7 @@ static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix, // Search for the first numeric character. StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1; - I = std::find_if(B, E, std::ptr_fun(isdigit)); + I = std::find_if(B, E, isdigit); Prefix = StringRef(B, I - B); @@ -3669,7 +3688,7 @@ void MipsTargetLowering::passByValArg( unsigned NumRegs = LastReg - FirstReg; if (NumRegs) { - const ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs(); + ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs(); bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes); unsigned I = 0; @@ -3755,7 +3774,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains, SDValue Chain, SDLoc DL, SelectionDAG &DAG, CCState &State) const { - const ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs(); + ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs(); unsigned Idx = State.getFirstUnallocated(ArgRegs); unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); @@ -3812,7 +3831,7 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size, if (State->getCallingConv() != CallingConv::Fast) { unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); - const ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs(); + ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs(); // FIXME: The O32 case actually describes no shadow registers. const MCPhysReg *ShadowRegs = ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs; @@ -3860,8 +3879,7 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h index b3d861d..0dc683e 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h @@ -67,6 +67,10 @@ namespace llvm { // Return Ret, + // Interrupt, exception, error trap Return + ERet, + + // Software Exception Return. EH_RETURN, // Node used to extract integer from accumulator. @@ -231,6 +235,9 @@ namespace llvm { return MVT::i32; } + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; @@ -258,17 +265,25 @@ namespace llvm { EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; - struct LTStr { - bool operator()(const char *S1, const char *S2) const { - return strcmp(S1, S2) < 0; - } - }; - void HandleByVal(CCState *, unsigned &, unsigned) const override; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return ABI.IsN64() ? 
Mips::A0_64 : Mips::A0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return ABI.IsN64() ? Mips::A1_64 : Mips::A1; + } + /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Mips doesn't have any special address spaces so we just reserve @@ -290,9 +305,10 @@ namespace llvm { unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty), getTargetNode(N, Ty, DAG, GOTFlag)); - SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT, - MachinePointerInfo::getGOT(), false, false, - false, 0); + SDValue Load = + DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty, getTargetNode(N, Ty, DAG, LoFlag)); @@ -414,7 +430,6 @@ namespace llvm { SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const; @@ -487,6 +502,9 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, SDLoc DL, + SelectionDAG &DAG) const; + bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; // Inline asm support diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td index cb91225..377260f 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td @@ -136,7 +136,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin, multiclass ROUND_M<string opstr, InstrItinClass Itin> { def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>, FGR_32; - def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 { + def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 { let DecoderNamespace = "Mips64"; } } @@ -267,24 +267,25 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// -def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0xc, 16>, ISA_MIPS2; -def TRUNC_W_S : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, +defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; +def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0xd, 16>, ISA_MIPS2; -def CEIL_W_S : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, +def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ABSS_FM<0xe, 16>, ISA_MIPS2; -def FLOOR_W_S : MMRel, ABSS_FT<"floor.w.s", 
FGR32Opnd, FGR32Opnd, II_FLOOR>, +def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, ABSS_FM<0xf, 16>, ISA_MIPS2; def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x24, 16>; -defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2; defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2; defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2; defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>; let DecoderNamespace = "Mips64" in { + let AdditionalPredicates = [NotInMicroMips] in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0x8, 16>, FGR_64; def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>, @@ -301,14 +302,17 @@ let DecoderNamespace = "Mips64" in { ABSS_FM<0xb, 16>, FGR_64; def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>, ABSS_FM<0xb, 17>, FGR_64; + } } def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x20, 20>; -def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x25, 16>, INSN_MIPS3_32R2; -def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x25, 17>, INSN_MIPS3_32R2; +let AdditionalPredicates = [NotInMicroMips] in { + def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, + ABSS_FM<0x25, 16>, INSN_MIPS3_32R2; + def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, + ABSS_FM<0x25, 17>, INSN_MIPS3_32R2; +} def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ABSS_FM<0x20, 17>, FGR_32; @@ -320,8 +324,10 @@ def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>, let DecoderNamespace = "Mips64" in { def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>, ABSS_FM<0x20, 17>, FGR_64; - def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x20, 21>, FGR_64; + let AdditionalPredicates = [NotInMicroMips] in { + def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>, + ABSS_FM<0x20, 21>, FGR_64; + } def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x21, 20>, FGR_64; def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>, @@ -345,8 +351,8 @@ def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>, defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>; defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>; -def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>, - ABSS_FM<0x4, 16>, ISA_MIPS2; +def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, + II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2; defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2; // The odd-numbered registers are only referenced when doing loads, @@ -503,13 +509,13 @@ let AdditionalPredicates = [NoNaNsFPMath], def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; -def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>, +def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, II_BC1F, MIPS_BRANCH_F>, BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6; -def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, IIBranch, MIPS_BRANCH_F, 0>, +def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, II_BC1FL, MIPS_BRANCH_F, 0>, BC1F_FM<1, 0>, ISA_MIPS2_NOT_32R6_64R6; -def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>, +def
BC1T : MMRel, BC1F_FT<"bc1t", brtarget, II_BC1T, MIPS_BRANCH_T>, BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6; -def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, IIBranch, MIPS_BRANCH_T, 0>, +def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, II_BC1TL, MIPS_BRANCH_T, 0>, BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6; /// Floating Point Compare diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td index 5f4fcc3..45baf27 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td @@ -132,7 +132,7 @@ class PseudoSE<dag outs, dag ins, list<dag> pattern, // These are aliases that require C++ handling to convert to the target // instruction, while InstAliases can be handled directly by tblgen. class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>: - MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> { + MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl { let isPseudo = 1; let Pattern = []; } @@ -644,16 +644,16 @@ class BRK_FM<bits<6> funct> : StdArch // Exception return format <Cop0|1|0|funct> //===----------------------------------------------------------------------===// -class ER_FM<bits<6> funct> : StdArch +class ER_FM<bits<6> funct, bit LLBit> : StdArch { bits<32> Inst; let Inst{31-26} = 0x10; let Inst{25} = 1; - let Inst{24-6} = 0; + let Inst{24-7} = 0; + let Inst{6} = LLBit; let Inst{5-0} = funct; } - //===----------------------------------------------------------------------===// // Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0> //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bb23cc0..b1d6950 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -60,8 +60,8 @@ MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI, MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), Flag, - MFI.getObjectSize(FI), Align); + return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI), + Flag, MFI.getObjectSize(FI), Align); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td index ab98c90..ffda491 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -77,6 +77,9 @@ def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>; def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def MipsERet : SDNode<"MipsISD::ERet", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPSideEffect]>; + // These are target-independent nodes, but have target-specific formats. 
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; @@ -157,7 +160,7 @@ def HasMips3 : Predicate<"Subtarget->hasMips3()">, def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">, AssemblerPredicate<"FeatureMips4_32">; def NotMips4_32 : Predicate<"!Subtarget->hasMips4_32()">, - AssemblerPredicate<"FeatureMips4_32">; + AssemblerPredicate<"!FeatureMips4_32">; def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">, AssemblerPredicate<"FeatureMips4_32r2">; def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">, @@ -166,6 +169,8 @@ def HasMips32 : Predicate<"Subtarget->hasMips32()">, AssemblerPredicate<"FeatureMips32">; def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">, AssemblerPredicate<"FeatureMips32r2">; +def HasMips32r5 : Predicate<"Subtarget->hasMips32r5()">, + AssemblerPredicate<"FeatureMips32r5">; def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">, AssemblerPredicate<"FeatureMips32r6">; def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">, @@ -176,6 +181,8 @@ def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">, AssemblerPredicate<"!FeatureGP64Bit">; def HasMips64 : Predicate<"Subtarget->hasMips64()">, AssemblerPredicate<"FeatureMips64">; +def NotMips64 : Predicate<"!Subtarget->hasMips64()">, + AssemblerPredicate<"!FeatureMips64">; def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">, AssemblerPredicate<"FeatureMips64r2">; def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">, @@ -184,6 +191,8 @@ def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">, AssemblerPredicate<"!FeatureMips64r6">; def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">, AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">; +def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">, + AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">; def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">, AssemblerPredicate<"FeatureMips16">; def HasCnMips : Predicate<"Subtarget->hasCnMips()">, @@ -201,6 +210,12 @@ def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">, def IsLE : Predicate<"Subtarget->isLittle()">; def IsBE : Predicate<"!Subtarget->isLittle()">; def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">; +def HasEVA : Predicate<"Subtarget->hasEVA()">, + AssemblerPredicate<"FeatureEVA,FeatureMips32r2">; +def HasMSA : Predicate<"Subtarget->hasMSA()">, + AssemblerPredicate<"FeatureMSA">; + //===----------------------------------------------------------------------===// // Mips GPR size adjectives. 
@@ -242,6 +257,7 @@ class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; } class ISA_MIPS32R2_NOT_32R6_64R6 { list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6]; } +class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; } class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; } class ISA_MIPS64_NOT_64R6 { list<Predicate> InsnPredicates = [HasMips64, NotMips64r6]; @@ -249,9 +265,21 @@ class ISA_MIPS64_NOT_64R6 { class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; } class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; } class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; } +class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; } class ISA_MICROMIPS32R6 { list<Predicate> InsnPredicates = [HasMicroMips32r6]; } +class ISA_MICROMIPS64R6 { + list<Predicate> InsnPredicates = [HasMicroMips64r6]; +} +class ISA_MICROMIPS32_NOT_MIPS32R6 { + list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6]; +} + +class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; } +class INSN_EVA_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA]; +} // The portions of MIPS-III that were also added to MIPS32 class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; } @@ -283,6 +311,28 @@ class INSN_MIPS5_32R2_NOT_32R6_64R6 { list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6]; } +class ASE_CNMIPS { + list<Predicate> InsnPredicates = [HasCnMips]; +} + +class ASE_MSA { + list<Predicate> InsnPredicates = [HasMSA]; +} + +class ASE_MSA_NOT_MSA64 { + list<Predicate> InsnPredicates = [HasMSA, NotMips64]; +} + +class ASE_MSA64 { + list<Predicate> InsnPredicates = [HasMSA, HasMips64]; +} + +// Class used for separating microMIPSr6 and microMIPS (r3) instructions. +// It can be used only on instructions that don't inherit PredicateControl. +class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl { + let InsnPredicates = [InMicroMips, NotMips32r6, NotMips64r6]; +} + +//===----------------------------------------------------------------------===// class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl { @@ -335,6 +385,81 @@ include "MipsInstrFormats.td" // Mips Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===// +class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []> + : AsmOperandClass { + let Name = "ConstantSImm" # Bits; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isConstantSImm<" # Bits # ">"; + let SuperClasses = Supers; + let DiagnosticType = "SImm" # Bits; +} + +class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [], + int Offset = 0> : AsmOperandClass { + let Name = "ConstantUImm" # Bits # "_" # Offset; + let RenderMethod = "addConstantUImmOperands<" # Bits # ", " # Offset # ">"; + let PredicateMethod = "isConstantUImm<" # Bits # ", " # Offset # ">"; + let SuperClasses = Supers; + let DiagnosticType = "UImm" # Bits # "_" # Offset; +} + +def ConstantUImm10AsmOperandClass + : ConstantUImmAsmOperandClass<10, []>; +def ConstantUImm8AsmOperandClass + : ConstantUImmAsmOperandClass<8, [ConstantUImm10AsmOperandClass]>; +def ConstantUImm7AsmOperandClass + : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass]>; +def ConstantUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>; +def ConstantSImm6AsmOperandClass + : ConstantSImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>; +def ConstantUImm5Plus1AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 1>; +def ConstantUImm5Plus32AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>; +def ConstantUImm5Plus33AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 33>; +def ConstantUImm5Plus32NormalizeAsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32> { + let Name = "ConstantUImm5_32_Norm"; + // We must also subtract 32 when we render the operand. 
+ let RenderMethod = "addConstantUImmOperands<5, 32, -32>"; +} +def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass { + let Name = "UImm5Lsl2"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isScaledUImm<5, 2>"; + let SuperClasses = [ConstantUImm6AsmOperandClass]; + let DiagnosticType = "UImm5_Lsl2"; +} +def ConstantUImm5ReportUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]> { + let Name = "ConstantUImm5_0_Report_UImm6"; + let DiagnosticType = "UImm5_0_Report_UImm6"; +} +def ConstantUImm5AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]>; +def ConstantUImm4AsmOperandClass + : ConstantUImmAsmOperandClass< + 4, [ConstantUImm5AsmOperandClass, + ConstantUImm5Plus32AsmOperandClass, + ConstantUImm5Plus32NormalizeAsmOperandClass]>; +def ConstantUImm3AsmOperandClass + : ConstantUImmAsmOperandClass<3, [ConstantUImm4AsmOperandClass]>; +def ConstantUImm2Plus1AsmOperandClass + : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass], 1>; +def ConstantUImm2AsmOperandClass + : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass]>; +def ConstantUImm1AsmOperandClass + : ConstantUImmAsmOperandClass<1, [ConstantUImm2AsmOperandClass]>; +def ConstantImmzAsmOperandClass : AsmOperandClass { + let Name = "ConstantImmz"; + let RenderMethod = "addConstantUImmOperands<1>"; + let PredicateMethod = "isConstantImmz"; + let SuperClasses = [ConstantUImm1AsmOperandClass]; + let DiagnosticType = "Immz"; +} + def MipsJumpTargetAsmOperand : AsmOperandClass { let Name = "JumpTarget"; let ParserMethod = "parseJumpTarget"; @@ -360,6 +485,10 @@ def calltarget : Operand<iPTR> { def imm64: Operand<i64>; +def simm6 : Operand<i32> { + let ParserMatchClass = ConstantSImm6AsmOperandClass; + let OperandType = "OPERAND_IMMEDIATE"; +} def simm9 : Operand<i32>; def simm10 : Operand<i32>; def simm11 : Operand<i32>; @@ -380,23 +509,12 @@ def simm18_lsl3 : Operand<i32> { let ParserMatchClass = MipsJumpTargetAsmOperand; } -def simm20 : Operand<i32> { -} +def simm20 : Operand<i32>; +def simm32 : Operand<i32>; def uimm20 : Operand<i32> { } -def MipsUImm10AsmOperand : AsmOperandClass { - let Name = "UImm10"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseImm"; - let PredicateMethod = "isUImm<10>"; -} - -def uimm10 : Operand<i32> { - let ParserMatchClass = MipsUImm10AsmOperand; -} - def simm16_64 : Operand<i64> { let DecoderMethod = "DecodeSimm16"; } @@ -404,23 +522,71 @@ def simm16_64 : Operand<i64> { // Zero def uimmz : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantImmzAsmOperandClass; +} + +// Unsigned Operands +foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10} in + def uimm # I : Operand<i32> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = + !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass"); + } + +def uimm2_plus1 : Operand<i32> { + let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<2, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<2, 1>"; + let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass; } -// Unsigned Operand -def uimm2 : Operand<i32> { +def uimm5_plus1 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<5, 1>"; + let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass; } -def uimm3 : Operand<i32> { +def uimm5_plus32 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = 
ConstantUImm5Plus32AsmOperandClass; } -def uimm5 : Operand<i32> { +def uimm5_plus33 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<5, 1>"; + let ParserMatchClass = ConstantUImm5Plus33AsmOperandClass; } -def uimm6 : Operand<i32> { +def uimm5_plus32_normalize : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; +} + +def uimm5_lsl2 : Operand<OtherVT> { + let EncoderMethod = "getUImm5Lsl2Encoding"; + let DecoderMethod = "DecodeUImm5lsl2"; + let ParserMatchClass = ConstantUImm5Lsl2AsmOperandClass; +} + +def uimm5_plus32_normalize_64 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; +} + +foreach I = {5} in + def uimm # I # _64 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = + !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass"); + } + +// Like uimm5_64 but reports a less confusing error for 32-63 when +// an instruction alias permits that. +def uimm5_64_report_uimm6 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass; } def uimm16 : Operand<i32> { @@ -435,6 +601,22 @@ def MipsMemAsmOperand : AsmOperandClass { let ParserMethod = "parseMemOperand"; } +def MipsMemSimm9AsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm9"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffset<9>"; +} + +def MipsMemSimm9GPRAsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm9GPR"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffsetGPR<9>"; +} + def MipsMemSimm11AsmOperand : AsmOperandClass { let Name = "MemOffsetSimm11"; let SuperClasses = [MipsMemAsmOperand]; @@ -485,6 +667,13 @@ def mem_msa : mem_generic { def mem_simm9 : mem_generic { let MIOperandInfo = (ops ptr_rc, simm9); let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm9AsmOperand; +} + +def mem_simm9gpr : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm9); + let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm9GPRAsmOperand; } def mem_simm11 : mem_generic { @@ -512,12 +701,6 @@ def PtrRC : Operand<iPTR> { let ParserMatchClass = GPR32AsmOperand; } -// size operand of ext instruction -def size_ext : Operand<i32> { - let EncoderMethod = "getSizeExtEncoding"; - let DecoderMethod = "DecodeExtSize"; -} - // size operand of ins instruction def size_ins : Operand<i32> { let EncoderMethod = "getSizeInsEncoding"; @@ -657,7 +840,7 @@ class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin, [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR, opstr>; -// Load Upper Imediate +// Load Upper Immediate class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>: InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"), [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove { @@ -675,14 +858,19 @@ class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, let mayLoad = 1; } -class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, +class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO, + SDPatternOperator OpNode = null_frag, InstrItinClass 
Itin = NoItinerary, ComplexPattern Addr = addr> : - InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"), + InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"), [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> { let DecoderMethod = "DecodeMem"; let mayStore = 1; } +class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, + InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> : + StoreMemory<opstr, RO, mem, OpNode, Itin, Addr>; + // Load/Store Left/Right let canFoldAsLoad = 1 in class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO, @@ -740,7 +928,7 @@ class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset), !strconcat(opstr, "\t$rs, $rt, $offset"), - [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch, + [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], II_BCC, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; @@ -752,7 +940,7 @@ class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, opnd:$offset), !strconcat(opstr, "\t$rs, $offset"), - [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch, + [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], II_BCCZ, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; @@ -778,7 +966,7 @@ class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type, class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator, SDPatternOperator targetoperator, string bopstr> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> { + [(operator targetoperator:$target)], II_J, FrmJ, bopstr> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; @@ -788,7 +976,7 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator, // Unconditional branch class UncondBranch<Instruction BEQInst> : - PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>, + PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], II_B>, PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> { let isBranch = 1; let isTerminator = 1; @@ -802,7 +990,7 @@ class UncondBranch<Instruction BEQInst> : let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in class JumpFR<string opstr, RegisterOperand RO, SDPatternOperator operator = null_frag>: - InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch, + InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], II_JR, FrmR, opstr>; // Indirect branch @@ -815,23 +1003,23 @@ class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> { let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> { + [(MipsJmpLink tglobaladdr:$target)], II_JAL, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTarget"; } class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst, Register RetReg, RegisterOperand ResRO = RO>: - PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], IIBranch>, + PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>, PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>; class JumpLinkReg<string opstr, RegisterOperand RO>: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [], IIBranch, FrmR>; + [], II_JALR, FrmR, 
opstr>; class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZAL, FrmI, opstr> { let hasDelaySlot = DelaySlot; } @@ -840,17 +1028,17 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1, hasExtraSrcRegAllocReq = 1, Defs = [AT] in { class TailCall<Instruction JumpInst> : - PseudoSE<(outs), (ins calltarget:$target), [], IIBranch>, + PseudoSE<(outs), (ins calltarget:$target), [], II_J>, PseudoInstExpansion<(JumpInst jmptarget:$target)>; class TailCallReg<RegisterOperand RO, Instruction JRInst, RegisterOperand ResRO = RO> : - PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], IIBranch>, + PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>, PseudoInstExpansion<(JRInst ResRO:$rs)>; } class BAL_BR_Pseudo<Instruction RealInst> : - PseudoSE<(outs), (ins brtarget:$offset), [], IIBranch>, + PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>, PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> { let isBranch = 1; let isTerminator = 1; @@ -997,9 +1185,10 @@ class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO, [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>; // Subword Swap -class SubwordSwap<string opstr, RegisterOperand RO>: - InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], - NoItinerary, FrmR, opstr> { +class SubwordSwap<string opstr, RegisterOperand RO, + InstrItinClass itin = NoItinerary>: + InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], itin, + FrmR, opstr> { let hasSideEffects = 0; } @@ -1010,8 +1199,8 @@ class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> : // Ext and Ins class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd, - SDPatternOperator Op = null_frag>: - InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size), + Operand SizeOpnd, SDPatternOperator Op = null_frag> : + InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size), !strconcat(opstr, " $rt, $rs, $pos, $size"), [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], II_EXT, FrmR, opstr>, ISA_MIPS32R2; @@ -1074,6 +1263,9 @@ class TrapBase<Instruction RealInst> let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>; +let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, hasSideEffects=1 in +def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>; + let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt), [(callseq_start timm:$amt)]>; @@ -1215,10 +1407,11 @@ def LH : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel, LW_FM<0x21>; def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>; let AdditionalPredicates = [NotInMicroMips] in { -def LW : Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel, +def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel, LW_FM<0x23>; } -def SB : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, LW_FM<0x28>; +def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, + LW_FM<0x28>; def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>; let AdditionalPredicates = [NotInMicroMips] in { def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>; @@ -1259,15 +1452,17 @@ let 
DecoderNamespace = "COP3_" in { } } -def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; -def SYNCI : MMRel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2; +def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; +def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2; -def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2; -def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2; -def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2; -def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2; -def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2; -def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2; +let AdditionalPredicates = [NotInMicroMips] in { + def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2; + def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2; + def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2; + def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2; + def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2; + def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2; +} def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>, ISA_MIPS2_NOT_32R6_64R6; @@ -1290,14 +1485,15 @@ def TRAP : TrapBase<BREAK>; def SDBBP : MMRel, SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6; let AdditionalPredicates = [NotInMicroMips] in { -def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32; + def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18, 0x0>, INSN_MIPS3_32; + def ERETNC : MMRel, ER_FT<"eretnc">, ER_FM<0x18, 0x1>, ISA_MIPS32R5; + def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f, 0x0>, ISA_MIPS32; } -def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32; let AdditionalPredicates = [NotInMicroMips] in { -def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; + def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; + def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2; } -def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2; let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug AdditionalPredicates = [NotInMicroMips] in { @@ -1359,7 +1555,8 @@ def TAILCALL_R : TailCallReg<GPR32Opnd, JR>; // Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64 // then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA. class PseudoIndirectBranchBase<RegisterOperand RO> : - MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> { + MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], + II_IndirectBranchPseudo> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; @@ -1369,12 +1566,12 @@ class PseudoIndirectBranchBase<RegisterOperand RO> : def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>; -// Return instructions are matched as a RetRA instruction, then ar expanded +// Return instructions are matched as a RetRA instruction, then are expanded // into PseudoReturn/PseudoReturn64 after register allocation. Finally, // MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the // ISA. 
class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs), - [], IIBranch> { + [], II_ReturnPseudo> { let isTerminator = 1; let isBarrier = 1; let hasDelaySlot = 1; @@ -1441,8 +1638,11 @@ def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32_NOT_32R6_64R6; -/// Word Swap Bytes Within Halfwords -def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2; +let AdditionalPredicates = [NotInMicroMips] in { + /// Word Swap Bytes Within Halfwords + def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>, + ISA_MIPS32R2; +} /// No operation. def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; @@ -1485,10 +1685,12 @@ def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; - +let AdditionalPredicates = [NotInMicroMips] in { def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM; - -def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>; +} +// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction +def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, MipsExt>, + EXT_FM<0>; def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>; /// Move Control Registers From/To CPU Registers @@ -1499,9 +1701,9 @@ def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd>, MFC3OP_FM<0x12, 4>; class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary, FrmOther, asmstr>; -def SSNOP : MMRel, Barrier<"ssnop">, BARRIER_FM<1>; +def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop">, BARRIER_FM<1>; def EHB : MMRel, Barrier<"ehb">, BARRIER_FM<3>; -def PAUSE : MMRel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; +def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; // JR_HB and JALR_HB are defined here using the new style naming // scheme because some of this code is shared with Mips32r6InstrInfo.td @@ -1562,11 +1764,60 @@ def CACHE : MMRel, CacheOp<"cache", mem>, CACHEOP_FM<0b101111>, def PREF : MMRel, CacheOp<"pref", mem>, CACHEOP_FM<0b110011>, INSN_MIPS3_32_NOT_32R6_64R6; +def ROL : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "rol\t$rs, $rt, $rd">; +def ROLImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "rol\t$rs, $rt, $imm">; +def : MipsInstAlias<"rol $rd, $rs", + (ROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"rol $rd, $imm", + (ROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>; + +def ROR : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "ror\t$rs, $rt, $rd">; +def RORImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "ror\t$rs, $rt, $imm">; +def : MipsInstAlias<"ror $rd, $rs", + (ROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"ror $rd, $imm", + (RORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>; + +def DROL : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "drol\t$rs, $rt, $rd">, ISA_MIPS64; +def DROLImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "drol\t$rs, $rt, $imm">, ISA_MIPS64; +def : MipsInstAlias<"drol $rd, $rs", + (DROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64; +def : MipsInstAlias<"drol $rd, $imm", 
+ (DROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64; + +def DROR : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "dror\t$rs, $rt, $rd">, ISA_MIPS64; +def DRORImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "dror\t$rs, $rt, $imm">, ISA_MIPS64; +def : MipsInstAlias<"dror $rd, $rs", + (DROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64; +def : MipsInstAlias<"dror $rd, $imm", + (DRORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// def : MipsInstAlias<"move $dst, $src", - (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>, + (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>, + GPR_32 { + let AdditionalPredicates = [NotInMicroMips]; +} +def : MipsInstAlias<"move $dst, $src", + (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>, GPR_32 { let AdditionalPredicates = [NotInMicroMips]; } @@ -1630,27 +1881,27 @@ def : MipsInstAlias<"beqz $rs,$offset", def : MipsInstAlias<"beqzl $rs,$offset", (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>; def : MipsInstAlias<"syscall", (SYSCALL 0), 1>; - + def : MipsInstAlias<"break", (BREAK 0, 0), 1>; def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>; let AdditionalPredicates = [NotInMicroMips] in { -def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2; -} -def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2; - -def : MipsInstAlias<"teq $rs, $rt", - (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tge $rs, $rt", - (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tgeu $rs, $rt", - (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tlt $rs, $rt", - (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tltu $rs, $rt", - (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tne $rs, $rt", - (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; - + def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2; + def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2; +} +let AdditionalPredicates = [NotInMicroMips] in { + def : MipsInstAlias<"teq $rs, $rt", + (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tge $rs, $rt", + (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tgeu $rs, $rt", + (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tlt $rs, $rt", + (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tltu $rs, $rt", + (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tne $rs, $rt", + (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; +} def : MipsInstAlias<"sll $rd, $rt, $rs", (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; def : MipsInstAlias<"sub, $rd, $rs, $imm", @@ -1678,7 +1929,7 @@ def : MipsInstAlias<"sync", class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadImm32 : LoadImmediate32<"li", uimm5, GPR32Opnd>; +def LoadImm32 : LoadImmediate32<"li", simm32, GPR32Opnd>; class LoadAddressFromReg32<string instr_asm, Operand MemOpnd, RegisterOperand RO> : @@ -1689,13 +1940,16 @@ def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>; class LoadAddressFromImm32<string 
instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadAddrImm32 : LoadAddressFromImm32<"la", uimm5, GPR32Opnd>; +def LoadAddrImm32 : LoadAddressFromImm32<"la", simm32, GPR32Opnd>; def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs), "jal\t$rd, $rs"> ; def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs), "jal\t$rs"> ; +def NORImm : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "nor\t$rs, $rt, $imm"> ; + let hasDelaySlot = 1 in { def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins imm64:$imm64, brtarget:$offset), @@ -1718,12 +1972,62 @@ def BLTU : CondBranchPseudo<"bltu">; def BLEU : CondBranchPseudo<"bleu">; def BGEU : CondBranchPseudo<"bgeu">; def BGTU : CondBranchPseudo<"bgtu">; +def BLTL : CondBranchPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6; +def BLEL : CondBranchPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6; +def BGEL : CondBranchPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6; +def BGTL : CondBranchPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6; +def BLTUL: CondBranchPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6; +def BLEUL: CondBranchPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6; +def BGEUL: CondBranchPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6; +def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; + +class CondBranchImmPseudo<string instr_asm> : + MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset), + !strconcat(instr_asm, "\t$rs, $imm, $offset")>; + +def BLTImmMacro : CondBranchImmPseudo<"blt">; +def BLEImmMacro : CondBranchImmPseudo<"ble">; +def BGEImmMacro : CondBranchImmPseudo<"bge">; +def BGTImmMacro : CondBranchImmPseudo<"bgt">; +def BLTUImmMacro : CondBranchImmPseudo<"bltu">; +def BLEUImmMacro : CondBranchImmPseudo<"bleu">; +def BGEUImmMacro : CondBranchImmPseudo<"bgeu">; +def BGTUImmMacro : CondBranchImmPseudo<"bgtu">; +def BLTLImmMacro : CondBranchImmPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6; +def BLELImmMacro : CondBranchImmPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6; +def BGELImmMacro : CondBranchImmPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6; +def BGTLImmMacro : CondBranchImmPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6; +def BLTULImmMacro : CondBranchImmPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6; +def BLEULImmMacro : CondBranchImmPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6; +def BGEULImmMacro : CondBranchImmPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6; +def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; + +// FIXME: Predicates are removed because instructions are matched regardless of +// predicates, because PredicateControl was not in the hierarchy. This was +// done to emit more precise error message from expansion function. +// Once the tablegen-erated errors are made better, this needs to be fixed and +// predicates needs to be restored. 
+ +def SDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "div\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6; + +def UDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "divu\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6; + +def DSDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "ddiv\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6; + +def DUDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "ddivu\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6; + +def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), + "ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; def Ulhu : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), - "ulhu\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6; + "ulhu\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; def Ulw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), - "ulw\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6; + "ulw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions @@ -1771,8 +2075,6 @@ def : MipsPat<(MipsSync (i32 immz)), (SYNC 0)>, ISA_MIPS2; // Call -def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)), - (JAL tglobaladdr:$dst)>; def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), (JAL texternalsym:$dst)>; //def : MipsPat<(MipsJmpLink GPR32:$dst), @@ -1939,6 +2241,16 @@ let AddedComplexity = 40 in { } } +// Atomic load patterns. +def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>; +def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>; +def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>; + +// Atomic store patterns. +def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>; +def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>; +def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>; + //===----------------------------------------------------------------------===// // Floating Point Support //===----------------------------------------------------------------------===// @@ -1964,6 +2276,10 @@ include "MipsDSPInstrInfo.td" include "MipsMSAInstrFormats.td" include "MipsMSAInstrInfo.td" +// EVA +include "MipsEVAInstrFormats.td" +include "MipsEVAInstrInfo.td" + // Micromips include "MicroMipsInstrFormats.td" include "MicroMipsInstrInfo.td" @@ -1972,3 +2288,11 @@ include "MicroMipsInstrFPU.td" // Micromips r6 include "MicroMips32r6InstrFormats.td" include "MicroMips32r6InstrInfo.td" + +// Micromips64 r6 +include "MicroMips64r6InstrFormats.td" +include "MicroMips64r6InstrInfo.td" + +// Micromips DSP +include "MicroMipsDSPInstrFormats.td" +include "MicroMipsDSPInstrInfo.td" diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp index 90f8cc0..49fb99a 100644 --- a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp @@ -148,7 +148,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) { // Insert NewMBB and fix control flow. MachineBasicBlock *Tgt = getTargetMBB(*FirstBr); NewMBB->transferSuccessors(MBB); - NewMBB->removeSuccessor(Tgt); + NewMBB->removeSuccessor(Tgt, true); MBB->addSuccessor(NewMBB); MBB->addSuccessor(Tgt); MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB); @@ -161,7 +161,7 @@ void MipsLongBranch::initMBBInfo() { // Split the MBBs if they have two branches. Each basic block should have at // most one branch after this loop is executed. 
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;) - splitMBB(I++); + splitMBB(&*I++); MF->RenumberBlocks(); MBBInfos.clear(); @@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->removeSuccessor(TgtMBB); - MBB->addSuccessor(LongBrMBB); + MBB->replaceSuccessor(TgtMBB, LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); @@ -434,7 +433,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB)); } else // Change branch destination and reverse condition. - replaceBranch(*MBB, I.Br, DL, FallThroughMBB); + replaceBranch(*MBB, I.Br, DL, &*FallThroughMBB); } static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td index bff2d0f..7d25ea5 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td @@ -7,18 +7,12 @@ // //===----------------------------------------------------------------------===// -def HasMSA : Predicate<"Subtarget->hasMSA()">, - AssemblerPredicate<"FeatureMSA">; - -class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { - let Predicates = [HasMSA]; +class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl, ASE_MSA { + let EncodingPredicates = [HasStdEnc]; let Inst{31-26} = 0b011110; } -class MSA64Inst : MSAInst { - let Predicates = [HasMSA, HasMips64]; -} - class MSACBranch : MSAInst { let Inst{31-26} = 0b010001; } @@ -27,10 +21,6 @@ class MSASpecial : MSAInst { let Inst{31-26} = 0b000000; } -class MSA64Special : MSA64Inst { - let Inst{31-26} = 0b000000; -} - class MSAPseudo<dag outs, dag ins, list<dag> pattern, InstrItinClass itin = IIPseudo>: MipsPseudo<outs, ins, pattern, itin> { @@ -100,7 +90,7 @@ class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSA64Inst { +class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { bits<5> rs; bits<5> wd; @@ -293,7 +283,7 @@ class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst { +class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSAInst { bits<4> n; bits<5> ws; bits<5> rd; @@ -345,7 +335,7 @@ class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst { +class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSAInst { bits<6> n; bits<5> rs; bits<5> wd; @@ -450,7 +440,7 @@ class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial { let Inst{5-0} = minor; } -class SPECIAL_DLSA_FMT<bits<6> minor>: MSA64Special { +class SPECIAL_DLSA_FMT<bits<6> minor>: MSASpecial { bits<5> rs; bits<5> rt; bits<5> rd; diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td index 970e98e..eacfcec 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td @@ -63,30 +63,13 @@ def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT", def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT", SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>; 
+def immZExt1Ptr : ImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>; +def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>; def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>; def immZExt6Ptr : ImmLeaf<iPTR, [{return isUInt<6>(Imm);}]>; // Operands -// The immediate of an LSA instruction needs special handling -// as the encoded value should be subtracted by one. -def uimm2LSAAsmOperand : AsmOperandClass { - let Name = "LSAImm"; - let ParserMethod = "parseLSAImm"; - let RenderMethod = "addImmOperands"; -} - -def LSAImm : Operand<i32> { - let PrintMethod = "printUnsignedImm"; - let EncoderMethod = "getLSAImmEncoding"; - let DecoderMethod = "DecodeLSAImm"; - let ParserMatchClass = uimm2LSAAsmOperand; -} - -def uimm4 : Operand<i32> { - let PrintMethod = "printUnsignedImm8"; -} - def uimm4_ptr : Operand<iPTR> { let PrintMethod = "printUnsignedImm8"; } @@ -95,10 +78,6 @@ def uimm6_ptr : Operand<iPTR> { let PrintMethod = "printUnsignedImm8"; } -def uimm8 : Operand<i32> { - let PrintMethod = "printUnsignedImm8"; -} - def simm5 : Operand<i32>; def vsplat_uimm1 : Operand<vAny> { @@ -639,7 +618,6 @@ class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>; class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>; class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>; class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>; -class COPY_U_D_ENC : MSA_ELM_COPY_D_FMT<0b0011, 0b011001>; class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>; @@ -1195,47 +1173,14 @@ class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode, InstrItinClass Itinerary = itin; } -// This class is deprecated and will be removed soon. -class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm3:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm4:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm5:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. 
-class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { +class MSA_BIT_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD, + RegisterOperand ROWS = ROWD, + InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm6:$m); + dag InOperandList = (ins ROWS:$ws, ImmOp:$m); string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))]; + list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))]; InstrItinClass Itinerary = itin; } @@ -1291,13 +1236,14 @@ class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode, } class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, + RegisterOperand ROWD, RegisterOperand ROWS, + Operand ImmOp, ImmLeaf Imm, InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, uimm4:$n); + dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ImmOp:$n); string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]"); list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws, - immZExt4:$n))]; + Imm:$n))]; string Constraints = "$wd = $wd_in"; InstrItinClass Itinerary = itin; } @@ -1479,7 +1425,7 @@ class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> { dag InOperandList = (ins ROWD:$wt, brtarget:$offset); string AsmString = !strconcat(instr_asm, "\t$wt, $offset"); list<dag> Pattern = []; - InstrItinClass Itinerary = IIBranch; + InstrItinClass Itinerary = NoItinerary; bit isBranch = 1; bit isTerminator = 1; bit hasDelaySlot = 1; @@ -1519,13 +1465,14 @@ class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty, } class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, + Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD, + RegisterOperand ROWS = ROWD, InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2); + dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2); string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]"); list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, - immZExt6:$n, + Imm:$n, ROWS:$ws, immz:$n2))]; InstrItinClass Itinerary = itin; @@ -1934,8 +1881,6 @@ class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16, GPR32Opnd, MSA128HOpnd>; class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32, GPR32Opnd, MSA128WOpnd>; -class COPY_U_D_DESC : MSA_COPY_DESC_BASE<"copy_u.d", vextract_zext_i64, v2i64, - GPR64Opnd, MSA128DOpnd>; class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32, MSA128W>; @@ -2346,13 +2291,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC : class INSERT_FD_VIDX64_PSEUDO_DESC : MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>; -class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, +class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4, MSA128BOpnd>; -class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, +class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3, MSA128HOpnd>; -class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, 
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2, MSA128WOpnd>; -class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, +class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1, MSA128DOpnd>; class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -2381,7 +2326,7 @@ class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD, RegisterOperand RORS = RORD, RegisterOperand RORT = RORD, InstrItinClass itin = NoItinerary > { dag OutOperandList = (outs RORD:$rd); - dag InOperandList = (ins RORS:$rs, RORT:$rt, LSAImm:$sa); + dag InOperandList = (ins RORS:$rs, RORT:$rt, uimm2_plus1:$sa); string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa"); list<dag> Pattern = [(set RORD:$rd, (add RORT:$rt, (shl RORS:$rs, @@ -2561,23 +2506,23 @@ class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>; class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>; class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>; -class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, - MSA128BOpnd>; -class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, - MSA128HOpnd>; -class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, - MSA128WOpnd>; -class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, - MSA128DOpnd>; - -class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, - MSA128BOpnd>; -class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, - MSA128HOpnd>; -class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, - MSA128WOpnd>; -class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, - MSA128DOpnd>; +class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6, + immZExt6, MSA128DOpnd>; + +class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6, + immZExt6, MSA128DOpnd>; class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>; class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>; @@ -2589,13 +2534,17 @@ class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>; class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>; class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b, - MSA128BOpnd>; + MSA128BOpnd, MSA128BOpnd, uimm4, + immZExt4>; class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h, - MSA128HOpnd>; + MSA128HOpnd, MSA128HOpnd, uimm3, + immZExt3>; class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w, - MSA128WOpnd>; + MSA128WOpnd, MSA128WOpnd, uimm2, + immZExt2>; class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d, - MSA128DOpnd>; + MSA128DOpnd, MSA128DOpnd, uimm1, + immZExt1>; class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>; class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>; @@ -2648,14 
+2597,14 @@ class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>; class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>; class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>; -class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b, - MSA128BOpnd>; -class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h, - MSA128HOpnd>; -class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w, - MSA128WOpnd>; -class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d, - MSA128DOpnd>; +class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6, + immZExt6, MSA128DOpnd>; class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>; class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>; @@ -2676,14 +2625,14 @@ class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>; class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>; class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>; -class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b, - MSA128BOpnd>; -class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h, - MSA128HOpnd>; -class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w, - MSA128WOpnd>; -class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d, - MSA128DOpnd>; +class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6, + immZExt6, MSA128DOpnd>; class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode, ValueType TyNode, RegisterOperand ROWD, @@ -2991,12 +2940,11 @@ def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC; def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC; def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC; def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC; -def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC; +def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC, ASE_MSA64; def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC; def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC; -def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC; -def COPY_U_D : COPY_U_D_ENC, COPY_U_D_DESC; +def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC, ASE_MSA64; def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC; def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC; @@ -3108,7 +3056,7 @@ def FFQR_D : FFQR_D_ENC, FFQR_D_DESC; def FILL_B : FILL_B_ENC, FILL_B_DESC; def FILL_H : FILL_H_ENC, FILL_H_DESC; def FILL_W : FILL_W_ENC, FILL_W_DESC; -def FILL_D : FILL_D_ENC, FILL_D_DESC; +def FILL_D : FILL_D_ENC, FILL_D_DESC, ASE_MSA64; def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC; def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC; @@ -3238,7 +3186,7 @@ def ILVR_D : ILVR_D_ENC, ILVR_D_DESC; def INSERT_B : INSERT_B_ENC, INSERT_B_DESC; def INSERT_H : INSERT_H_ENC, INSERT_H_DESC; def INSERT_W : INSERT_W_ENC, INSERT_W_DESC; -def INSERT_D : INSERT_D_ENC, INSERT_D_DESC; +def INSERT_D : INSERT_D_ENC, INSERT_D_DESC, ASE_MSA64; // 
INSERT_FW_PSEUDO defined after INSVE_W // INSERT_FD_PSEUDO defined after INSVE_D @@ -3280,7 +3228,7 @@ def LDI_W : LDI_W_ENC, LDI_W_DESC; def LDI_D : LDI_D_ENC, LDI_D_DESC; def LSA : LSA_ENC, LSA_DESC; -def DLSA : DLSA_ENC, DLSA_DESC; +def DLSA : DLSA_ENC, DLSA_DESC, ASE_MSA64; def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC; def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC; @@ -3787,6 +3735,28 @@ def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64, def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8, MSA128B, NoItinerary>; +// Vector extraction with fixed index. +// +// Extracting 32-bit values on MSA32 should always use COPY_S_W rather than +// COPY_U_W, even for the zero-extended case. This is because our forward +// compatibility strategy is to consider registers to be infinitely +// sign-extended so that a MIPS64 can execute MIPS32 code without getting +// different register values. +def : MSAPat<(vextract_zext_i32 (v4i32 MSA128W:$ws), immZExt2Ptr:$idx), + (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64; +def : MSAPat<(vextract_zext_i32 (v4f32 MSA128W:$ws), immZExt2Ptr:$idx), + (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64; + +// Extracting 64-bit values on MSA64 should always use COPY_S_D rather than +// COPY_U_D, even for the zero-extended case. This is because our forward +// compatibility strategy is to consider registers to be infinitely +// sign-extended so that a hypothetical MIPS128 would be able to execute MIPS64 +// code without getting different register values. +def : MSAPat<(vextract_zext_i64 (v2i64 MSA128D:$ws), immZExt1Ptr:$idx), + (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64; +def : MSAPat<(vextract_zext_i64 (v2f64 MSA128D:$ws), immZExt1Ptr:$idx), + (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64; + // Vector extraction with variable index def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)), (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws, diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp index 0d1ee04..c7d2738 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp @@ -24,42 +24,6 @@ static cl::opt<bool> FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true), cl::desc("Always use $gp as the global base register.")); -// class MipsCallEntry. -MipsCallEntry::MipsCallEntry(StringRef N) { -#ifndef NDEBUG - Name = N; - Val = nullptr; -#endif -} - -MipsCallEntry::MipsCallEntry(const GlobalValue *V) { -#ifndef NDEBUG - Val = V; -#endif -} - -bool MipsCallEntry::isConstant(const MachineFrameInfo *) const { - return false; -} - -bool MipsCallEntry::isAliased(const MachineFrameInfo *) const { - return false; -} - -bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const { - return false; -} - -void MipsCallEntry::printCustom(raw_ostream &O) const { - O << "MipsCallEntry: "; -#ifndef NDEBUG - if (Val) - O << Val->getName(); - else - O << Name; -#endif -} - MipsFunctionInfo::~MipsFunctionInfo() {} bool MipsFunctionInfo::globalBaseRegSet() const { @@ -111,27 +75,32 @@ void MipsFunctionInfo::createEhDataRegsFI() { } } +void MipsFunctionInfo::createISRRegFI() { + // ISRs require spill slots for Status & ErrorPC Coprocessor 0 registers. + // The current implementation only supports Mips32r2+ not Mips64rX. Status + // is always 32 bits, ErrorPC is 32 or 64 bits dependant on architecture, + // however Mips32r2+ is the supported architecture. 
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass; + + for (int I = 0; I < 2; ++I) + ISRDataRegFI[I] = MF.getFrameInfo()->CreateStackObject( + RC->getSize(), RC->getAlignment(), false); +} + bool MipsFunctionInfo::isEhDataRegFI(int FI) const { return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1] || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]); } -MachinePointerInfo MipsFunctionInfo::callPtrInfo(StringRef Name) { - std::unique_ptr<const MipsCallEntry> &E = ExternalCallEntries[Name]; - - if (!E) - E = llvm::make_unique<MipsCallEntry>(Name); - - return MachinePointerInfo(E.get()); +bool MipsFunctionInfo::isISRRegFI(int FI) const { + return IsISR && (FI == ISRDataRegFI[0] || FI == ISRDataRegFI[1]); +} +MachinePointerInfo MipsFunctionInfo::callPtrInfo(const char *ES) { + return MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)); } -MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) { - std::unique_ptr<const MipsCallEntry> &E = GlobalCallEntries[Val]; - - if (!E) - E = llvm::make_unique<MipsCallEntry>(Val); - - return MachinePointerInfo(E.get()); +MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) { + return MachinePointerInfo(MF.getPSVManager().getGlobalValueCallEntry(GV)); } int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) { diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h index 32436ef..a2f6ee0 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h +++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h @@ -15,12 +15,10 @@ #define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H #include "Mips16HardFloatInfo.h" -#include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/ValueMap.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -30,31 +28,13 @@ namespace llvm { -/// \brief A class derived from PseudoSourceValue that represents a GOT entry -/// resolved by lazy-binding. -class MipsCallEntry : public PseudoSourceValue { -public: - explicit MipsCallEntry(StringRef N); - explicit MipsCallEntry(const GlobalValue *V); - bool isConstant(const MachineFrameInfo *) const override; - bool isAliased(const MachineFrameInfo *) const override; - bool mayAlias(const MachineFrameInfo *) const override; - -private: - void printCustom(raw_ostream &O) const override; -#ifndef NDEBUG - std::string Name; - const GlobalValue *Val; -#endif -}; - /// MipsFunctionInfo - This class is derived from MachineFunction private /// Mips target-specific information for each MachineFunction. class MipsFunctionInfo : public MachineFunctionInfo { public: MipsFunctionInfo(MachineFunction &MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0), - VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false), + VarArgsFrameIndex(0), CallsEhReturn(false), IsISR(false), SaveS2(false), MoveF64ViaSpillFI(-1) {} ~MipsFunctionInfo(); @@ -86,13 +66,21 @@ public: int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; } bool isEhDataRegFI(int FI) const; - /// \brief Create a MachinePointerInfo that has a MipsCallEntr object - /// representing a GOT entry for an external function. 
- MachinePointerInfo callPtrInfo(StringRef Name); + /// Create a MachinePointerInfo that has an ExternalSymbolPseudoSourceValue + /// object representing a GOT entry for an external function. + MachinePointerInfo callPtrInfo(const char *ES); + + // Functions with the "interrupt" attribute require special prologues, + // epilogues and additional spill slots. + bool isISR() const { return IsISR; } + void setISR() { IsISR = true; } + void createISRRegFI(); + int getISRRegFI(unsigned Reg) const { return ISRDataRegFI[Reg]; } + bool isISRRegFI(int FI) const; - /// \brief Create a MachinePointerInfo that has a MipsCallEntr object + /// Create a MachinePointerInfo that has a GlobalValuePseudoSourceValue object /// representing a GOT entry for a global function. - MachinePointerInfo callPtrInfo(const GlobalValue *Val); + MachinePointerInfo callPtrInfo(const GlobalValue *GV); void setSaveS2() { SaveS2 = true; } bool hasSaveS2() const { return SaveS2; } @@ -136,17 +124,18 @@ private: /// Frame objects for spilling eh data registers. int EhDataRegFI[4]; + /// ISR - Whether the function is an Interrupt Service Routine. + bool IsISR; + + /// Frame objects for spilling C0_STATUS, C0_EPC + int ISRDataRegFI[2]; + // saveS2 bool SaveS2; /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the /// O32 FPXX ABI is enabled. -1 is used to denote invalid index. int MoveF64ViaSpillFI; - - /// MipsCallEntry maps. - StringMap<std::unique_ptr<const MipsCallEntry>> ExternalCallEntries; - ValueMap<const GlobalValue *, std::unique_ptr<const MipsCallEntry>> - GlobalCallEntries; }; } // end of namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index f6647e6..28e5a42 100644 --- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -84,6 +84,16 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const MipsSubtarget &Subtarget = MF->getSubtarget<MipsSubtarget>(); + const Function *F = MF->getFunction(); + if (F->hasFnAttribute("interrupt")) { + if (Subtarget.hasMips64()) + return Subtarget.hasMips64r6() ? CSR_Interrupt_64R6_SaveList + : CSR_Interrupt_64_SaveList; + else + return Subtarget.hasMips32r6() ? CSR_Interrupt_32R6_SaveList + : CSR_Interrupt_32_SaveList; + } + if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_SaveList; @@ -284,6 +294,16 @@ getFrameRegister(const MachineFunction &MF) const { } bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const { + // Avoid realigning functions that explicitly do not want to be realigned. + // Normally, we should report an error when a function should be dynamically + // realigned but also has the attribute no-realign-stack. Unfortunately, + // with this attribute, MachineFrameInfo clamps each new object's alignment + // to that of the stack's alignment as specified by the ABI. As a result, + // the information of whether we have objects with larger alignment + // requirement than the stack's alignment is already lost at this point. + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64; unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64; @@ -306,42 +326,3 @@ bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const { // sized objects. 
return MF.getRegInfo().canReserveReg(BP); } - -bool MipsRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - bool CanRealign = canRealignStack(MF); - - // Avoid realigning functions that explicitly do not want to be realigned. - // Normally, we should report an error when a function should be dynamically - // realigned but also has the attribute no-realign-stack. Unfortunately, - // with this attribute, MachineFrameInfo clamps each new object's alignment - // to that of the stack's alignment as specified by the ABI. As a result, - // the information of whether we have objects with larger alignment - // requirement than the stack's alignment is already lost at this point. - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - const Function *F = MF.getFunction(); - if (F->hasFnAttribute(Attribute::StackAlignment)) { -#ifdef DEBUG - if (!CanRealign) - DEBUG(dbgs() << "It's not possible to realign the stack of the function: " - << F->getName() << "\n"); -#endif - return CanRealign; - } - - unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment(); - if (MFI->getMaxAlignment() > StackAlignment) { -#ifdef DEBUG - if (!CanRealign) - DEBUG(dbgs() << "It's not possible to realign the stack of the function: " - << F->getName() << "\n"); -#endif - return CanRealign; - } - - return false; -} diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h index ee1f6bc..5de68a2 100644 --- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h @@ -61,9 +61,7 @@ public: RegScavenger *RS = nullptr) const; // Stack realignment queries. - bool canRealignStack(const MachineFunction &MF) const; - - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; /// Debug information queries. 
unsigned getFrameRegister(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 096b3be..a4abd62 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -17,6 +17,7 @@ #include "MipsMachineFunction.h" #include "MipsSEInstrInfo.h" #include "MipsSubtarget.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -319,6 +320,15 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool FP64) const { + const MachineOperand &Op1 = I->getOperand(1); + const MachineOperand &Op2 = I->getOperand(2); + + if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) { + unsigned DstReg = I->getOperand(0).getReg(); + BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg); + return true; + } + // For fpxx and when mfhc1 is not available, use: // spill + reload via ldc1 // @@ -335,8 +345,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) || (FP64 && !Subtarget.useOddSPReg())) { unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = I->getOperand(1).getReg(); - unsigned N = I->getOperand(2).getImm(); + unsigned SrcReg = Op1.getReg(); + unsigned N = Op2.getImm(); int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N)); // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are @@ -352,8 +362,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // We re-use the same spill slot each time so that the stack frame doesn't // grow too much in functions with a large number of moves. int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC); - TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC, - &RegInfo, 0); + TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0); TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset); return true; } @@ -376,12 +385,12 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc dl; MipsABIInfo ABI = STI.getABI(); unsigned SP = ABI.GetStackPtr(); unsigned FP = ABI.GetFramePtr(); unsigned ZERO = ABI.GetNullPtr(); - unsigned ADDu = ABI.GetPtrAdduOp(); + unsigned MOVE = ABI.GetGPRMoveOp(); unsigned ADDiu = ABI.GetPtrAddiuOp(); unsigned AND = ABI.IsN64() ? Mips::AND64 : Mips::AND; @@ -407,6 +416,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + if (MF.getFunction()->hasFnAttribute("interrupt")) + emitInterruptPrologueStub(MF, MBB); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); if (CSI.size()) { @@ -491,7 +503,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, // if framepointer enabled, set it to point to the stack pointer. if (hasFP(MF)) { // Insert instruction "move $fp, $sp" at this location. 
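    // ("move" here is the ABI's canonical GPR-to-GPR move; this change
    // fetches it via ABI.GetGPRMoveOp() -- an OR with $zero -- instead of
    // hard-coding ADDu, matching the copyPhysReg change further below.)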
-    BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO)
+    BuildMI(MBB, MBBI, dl, TII.get(MOVE), FP).addReg(SP).addReg(ZERO)
       .setMIFlag(MachineInstr::FrameSetup);
 
     // emit ".cfi_def_cfa_register $fp"
@@ -514,7 +526,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
   if (hasBP(MF)) {
     // move $s7, $sp
     unsigned BP = STI.isABI_N64() ? Mips::S7_64 : Mips::S7;
-    BuildMI(MBB, MBBI, dl, TII.get(ADDu), BP)
+    BuildMI(MBB, MBBI, dl, TII.get(MOVE), BP)
       .addReg(SP)
       .addReg(ZERO);
   }
@@ -522,6 +534,135 @@
   }
 }
 
+void MipsSEFrameLowering::emitInterruptPrologueStub(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Report an error if the target doesn't support MIPS32r2 or later.
+  // The epilogue relies on the use of the "ehb" to clear execution
+  // hazards. Pre-R2 MIPS relies on an implementation-defined number
+  // of "ssnop"s to clear the execution hazard. Support for ssnop hazard
+  // clearing is not provided, so reject that configuration.
+  if (!STI.hasMips32r2())
+    report_fatal_error(
+        "\"interrupt\" attribute is not supported on pre-MIPS32R2 or "
+        "MIPS16 targets.");
+
+  // The GP register contains the "user" value, so we cannot perform
+  // any gp-relative loads until we restore the "kernel" or "system" gp
+  // value. Until support is written we shall only accept the static
+  // relocation model.
+  if (STI.getRelocationModel() != Reloc::Static)
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "static relocation model on MIPS at the present time.");
+
+  if (!STI.isABI_O32() || STI.hasMips64())
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "O32 ABI on MIPS32R2+ at the present time.");
+
+  // Perform ISR handling like GCC
+  StringRef IntKind =
+      MF.getFunction()->getFnAttribute("interrupt").getValueAsString();
+  const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass;
+
+  // EIC interrupt handling needs to read the Cause register to disable
+  // interrupts.
+  if (IntKind == "eic") {
+    // Coprocessor registers are always live per se.
+    MBB.addLiveIn(Mips::COP013);
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K0)
+        .addReg(Mips::COP013)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EXT), Mips::K0)
+        .addReg(Mips::K0)
+        .addImm(10)
+        .addImm(6)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Fetch and spill EPC
+  MBB.addLiveIn(Mips::COP014);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP014)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(0), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Fetch and spill Status
+  MBB.addLiveIn(Mips::COP012);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP012)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(1), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Build the configuration for disabling lower-priority interrupts. Non-EIC
+  // interrupts need to be masked off with zero, EIC from the Cause register.
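+  // For reference: MIPS32r2 "ins rt, rs, pos, size" replaces the bit field
+  // rt[pos .. pos+size-1] with the low 'size' bits of rs, leaving the rest
+  // of rt unchanged; "ins $k1, $zero, 1, 4" further below, for example,
+  // clears Status.EXL, Status.ERL and both Status.KSU bits at once.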
+ unsigned InsPosition = 8; + unsigned InsSize = 0; + unsigned SrcReg = Mips::ZERO; + + // If the interrupt we're tied to is the EIC, switch the source for the + // masking off interrupts to the cause register. + if (IntKind == "eic") { + SrcReg = Mips::K0; + InsPosition = 10; + InsSize = 6; + } else + InsSize = StringSwitch<unsigned>(IntKind) + .Case("sw0", 1) + .Case("sw1", 2) + .Case("hw0", 3) + .Case("hw1", 4) + .Case("hw2", 5) + .Case("hw3", 6) + .Case("hw4", 7) + .Case("hw5", 8) + .Default(0); + assert(InsSize != 0 && "Unknown interrupt type!"); + + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(SrcReg) + .addImm(InsPosition) + .addImm(InsSize) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Mask off KSU, ERL, EXL + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(Mips::ZERO) + .addImm(1) + .addImm(4) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Disable the FPU as we are not spilling those register sets. + if (!STI.useSoftFloat()) + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(Mips::ZERO) + .addImm(29) + .addImm(1) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Set the new status + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) + .addReg(Mips::K1) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); +} + void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -533,12 +674,12 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); - DebugLoc dl = MBBI->getDebugLoc(); + DebugLoc DL = MBBI->getDebugLoc(); MipsABIInfo ABI = STI.getABI(); unsigned SP = ABI.GetStackPtr(); unsigned FP = ABI.GetFramePtr(); unsigned ZERO = ABI.GetNullPtr(); - unsigned ADDu = ABI.GetPtrAdduOp(); + unsigned MOVE = ABI.GetGPRMoveOp(); // if framepointer enabled, restore the stack pointer. if (hasFP(MF)) { @@ -549,7 +690,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, --I; // Insert instruction "move $sp, $fp" at this location. - BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO); + BuildMI(MBB, I, DL, TII.get(MOVE), SP).addReg(FP).addReg(ZERO); } if (MipsFI->callsEhReturn()) { @@ -568,6 +709,9 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, } } + if (MF.getFunction()->hasFnAttribute("interrupt")) + emitInterruptEpilogueStub(MF, MBB); + // Get the number of bytes from FrameInfo uint64_t StackSize = MFI->getStackSize(); @@ -578,13 +722,59 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, TII.adjustStackPtr(SP, StackSize, MBB, MBBI); } +void MipsSEFrameLowering::emitInterruptEpilogueStub( + MachineFunction &MF, MachineBasicBlock &MBB) const { + + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Perform ISR handling like GCC + const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass; + + // Disable Interrupts. 
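+  // (DI clears Status.IE; the EHB that follows is the MIPS32r2
+  // execution-hazard barrier, guaranteeing the disable has taken effect
+  // before EPC and Status are rewritten below.)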
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::DI), Mips::ZERO); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB)); + + // Restore EPC + STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, + MipsFI->getISRRegFI(0), PtrRC, + STI.getRegisterInfo()); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014) + .addReg(Mips::K1) + .addImm(0); + + // Restore Status + STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, + MipsFI->getISRRegFI(1), PtrRC, + STI.getRegisterInfo()); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) + .addReg(Mips::K1) + .addImm(0); +} + +int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsABIInfo ABI = STI.getABI(); + + if (MFI->isFixedObjectIndex(FI)) + FrameReg = hasFP(MF) ? ABI.GetFramePtr() : ABI.GetStackPtr(); + else + FrameReg = hasBP(MF) ? ABI.GetBasePtr() : ABI.GetStackPtr(); + + return MFI->getObjectOffset(FI) + MFI->getStackSize() - + getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); +} + bool MipsSEFrameLowering:: spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *EntryBlock = MF->begin(); + MachineBasicBlock *EntryBlock = &MF->front(); const TargetInstrInfo &TII = *STI.getInstrInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -599,6 +789,26 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, if (!IsRAAndRetAddrIsTaken) EntryBlock->addLiveIn(Reg); + // ISRs require HI/LO to be spilled into kernel registers to be then + // spilled to the stack frame. + bool IsLOHI = (Reg == Mips::LO0 || Reg == Mips::LO0_64 || + Reg == Mips::HI0 || Reg == Mips::HI0_64); + const Function *Func = MBB.getParent()->getFunction(); + if (IsLOHI && Func->hasFnAttribute("interrupt")) { + DebugLoc DL = MI->getDebugLoc(); + + unsigned Op = 0; + if (!STI.getABI().ArePtrs64bit()) { + Op = (Reg == Mips::HI0) ? Mips::MFHI : Mips::MFLO; + Reg = Mips::K0; + } else { + Op = (Reg == Mips::HI0) ? Mips::MFHI64 : Mips::MFLO64; + Reg = Mips::K0_64; + } + BuildMI(MBB, MI, DL, TII.get(Op), Mips::K0) + .setMIFlag(MachineInstr::FrameSetup); + } + // Insert the spill to the stack frame. bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); @@ -622,7 +832,8 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { } /// Mark \p Reg and all registers aliasing it in the bitset. -void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, unsigned Reg) { +static void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, + unsigned Reg) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) SavedRegs.set(*AI); @@ -648,6 +859,10 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MipsFI->callsEhReturn()) MipsFI->createEhDataRegsFI(); + // Create spill slots for Coprocessor 0 registers if function is an ISR. + if (MipsFI->isISR()) + MipsFI->createISRRegFI(); + // Expand pseudo instructions which load, store or copy accumulators. // Add an emergency spill slot if a pseudo was expanded. 
if (ExpandPseudo(MF).expand()) { diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h index 9cb32e6..63cd3ce 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h @@ -27,6 +27,9 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -37,8 +40,13 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; unsigned ehDataReg(unsigned I) const; -}; +private: + void emitInterruptEpilogueStub(MachineFunction &MF, + MachineBasicBlock &MBB) const; + void emitInterruptPrologueStub(MachineFunction &MF, + MachineBasicBlock &MBB) const; +}; } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 2ebfbd1..6f001ea 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -136,7 +136,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC; const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI(); diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index b319fd0..efe22fb 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1181,6 +1181,10 @@ bool MipsSETargetLowering::isEligibleForTailCallOptimization( if (!EnableMipsTailCalls) return false; + // Exception has to be cleared with eret. + if (FI.isISR()) + return false; + // Return false if either the callee or caller has a byval argument. if (CCInfo.getInRegsParamsCount() > 0 || FI.hasByvalArg()) return false; @@ -1786,9 +1790,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); case Intrinsic::mips_fadd_w: - case Intrinsic::mips_fadd_d: + case Intrinsic::mips_fadd_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away case Intrinsic::mips_fceq_w: case Intrinsic::mips_fceq_d: @@ -1831,9 +1837,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), ISD::SETUNE); case Intrinsic::mips_fdiv_w: - case Intrinsic::mips_fdiv_d: + case Intrinsic::mips_fdiv_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. 
return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_ffint_u_w: case Intrinsic::mips_ffint_u_d: return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0), @@ -1856,6 +1864,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::mips_fexp2_w: case Intrinsic::mips_fexp2_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. EVT ResTy = Op->getValueType(0); return DAG.getNode( ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1), @@ -1869,11 +1878,14 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), Op->getOperand(3)); case Intrinsic::mips_fmul_w: - case Intrinsic::mips_fmul_d: + case Intrinsic::mips_fmul_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_fmsub_w: case Intrinsic::mips_fmsub_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. EVT ResTy = Op->getValueType(0); return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1), DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy, @@ -1886,9 +1898,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_fsqrt_d: return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1)); case Intrinsic::mips_fsub_w: - case Intrinsic::mips_fsub_d: + case Intrinsic::mips_fsub_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_ftrunc_u_w: case Intrinsic::mips_ftrunc_u_d: return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0), diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index 786307b..d4aeaf9 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -88,7 +88,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (isMicroMips) Opc = Mips::MOVE16_MM; else - Opc = Mips::ADDu, ZeroReg = Mips::ZERO; + Opc = Mips::OR, ZeroReg = Mips::ZERO; } else if (Mips::CCRRegClass.contains(SrcReg)) Opc = Mips::CFC1; else if (Mips::FGR32RegClass.contains(SrcReg)) @@ -141,7 +141,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = Mips::FMOV_D64; else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg. 
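  // (As in the 32-bit case above, register moves are now emitted as an OR
  // with $zero rather than a DADDu with $zero; both encodings implement a
  // plain move.)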
if (Mips::GPR64RegClass.contains(SrcReg)) - Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64; + Opc = Mips::OR64, ZeroReg = Mips::ZERO_64; else if (Mips::HI64RegClass.contains(SrcReg)) Opc = Mips::MFHI64, SrcReg = 0; else if (Mips::LO64RegClass.contains(SrcReg)) @@ -182,7 +182,6 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, int64_t Offset) const { DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore); unsigned Opc = 0; @@ -213,6 +212,33 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::ST_W; else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) Opc = Mips::ST_D; + else if (Mips::LO32RegClass.hasSubClassEq(RC)) + Opc = Mips::SW; + else if (Mips::LO64RegClass.hasSubClassEq(RC)) + Opc = Mips::SD; + else if (Mips::HI32RegClass.hasSubClassEq(RC)) + Opc = Mips::SW; + else if (Mips::HI64RegClass.hasSubClassEq(RC)) + Opc = Mips::SD; + + // Hi, Lo are normally caller save but they are callee save + // for interrupt handling. + const Function *Func = MBB.getParent()->getFunction(); + if (Func->hasFnAttribute("interrupt")) { + if (Mips::HI32RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFHI), Mips::K0); + SrcReg = Mips::K0; + } else if (Mips::HI64RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFHI64), Mips::K0_64); + SrcReg = Mips::K0_64; + } else if (Mips::LO32RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFLO), Mips::K0); + SrcReg = Mips::K0; + } else if (Mips::LO64RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFLO64), Mips::K0_64); + SrcReg = Mips::K0_64; + } + } assert(Opc && "Register class not handled!"); BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)) @@ -228,6 +254,11 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); unsigned Opc = 0; + const Function *Func = MBB.getParent()->getFunction(); + bool ReqIndirectLoad = Func->hasFnAttribute("interrupt") && + (DestReg == Mips::LO0 || DestReg == Mips::LO0_64 || + DestReg == Mips::HI0 || DestReg == Mips::HI0_64); + if (Mips::GPR32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; else if (Mips::GPR64RegClass.hasSubClassEq(RC)) @@ -254,10 +285,44 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::LD_W; else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) Opc = Mips::LD_D; + else if (Mips::HI32RegClass.hasSubClassEq(RC)) + Opc = Mips::LW; + else if (Mips::HI64RegClass.hasSubClassEq(RC)) + Opc = Mips::LD; + else if (Mips::LO32RegClass.hasSubClassEq(RC)) + Opc = Mips::LW; + else if (Mips::LO64RegClass.hasSubClassEq(RC)) + Opc = Mips::LD; assert(Opc && "Register class not handled!"); - BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset) - .addMemOperand(MMO); + + if (!ReqIndirectLoad) + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addFrameIndex(FI) + .addImm(Offset) + .addMemOperand(MMO); + else { + // Load HI/LO through K0. Notably the DestReg is encoded into the + // instruction itself. 
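+    // (HI and LO have no load path of their own, so the value is reloaded
+    // into $k0/$k0_64 first and then transferred with mthi/mtlo; picking
+    // MTHI versus MTLO below is what encodes the destination register.)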
+ unsigned Reg = Mips::K0; + unsigned LdOp = Mips::MTLO; + if (DestReg == Mips::HI0) + LdOp = Mips::MTHI; + + if (Subtarget.getABI().ArePtrs64bit()) { + Reg = Mips::K0_64; + if (DestReg == Mips::HI0_64) + LdOp = Mips::MTHI64; + else + LdOp = Mips::MTLO64; + } + + BuildMI(MBB, I, DL, get(Opc), Reg) + .addFrameIndex(FI) + .addImm(Offset) + .addMemOperand(MMO); + BuildMI(MBB, I, DL, get(LdOp)).addReg(Reg); + } } bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { @@ -271,6 +336,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case Mips::RetRA: expandRetRA(MBB, MI); break; + case Mips::ERet: + expandERet(MBB, MI); + break; case Mips::PseudoMFHI: Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI; expandPseudoMFHiLo(MBB, MI, Opc); @@ -360,7 +428,7 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { MipsABIInfo ABI = Subtarget.getABI(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned ADDu = ABI.GetPtrAdduOp(); unsigned ADDiu = ABI.GetPtrAddiuOp(); @@ -438,6 +506,11 @@ void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB, BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA); } +void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + BuildMI(MBB, I, I->getDebugLoc(), get(Mips::ERET)); +} + std::pair<bool, bool> MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MachineFunction &MF) const { @@ -471,8 +544,6 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2); MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc)); MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc)); - LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); - HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); // Add lo/hi registers if the mtlo/hi instructions created have explicit // def registers. @@ -483,6 +554,9 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, LoInst.addReg(DstLo, RegState::Define); HiInst.addReg(DstHi, RegState::Define); } + + LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); + HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); } void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h index bebbabf..5d73545 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -82,6 +82,8 @@ private: void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + void expandERet(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + std::pair<bool, bool> compareOpndSize(unsigned Opc, const MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp index 132c3a1..b1e2885 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -126,17 +126,19 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, } bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex); - + bool IsISRRegFI = MipsFI->isISRRegFI(FrameIndex); // The following stack frame objects are always referenced relative to $sp: // 1. Outgoing arguments. // 2. 
Pointer to dynamically allocated stack space. // 3. Locations for callee-saved registers. // 4. Locations for eh data registers. + // 5. Locations for ISR saved Coprocessor 0 registers 12 & 14. // Everything else is referenced relative to whatever register // getFrameRegister() returns. unsigned FrameReg; - if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI) + if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI || + IsISRRegFI) FrameReg = ABI.GetStackPtr(); else if (RegInfo->needsStackRealignment(MF)) { if (MFI->hasVarSizedObjects() && !MFI->isFixedObjectIndex(FrameIndex)) diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td index 54b5d28..37f9e49 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSchedule.td +++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td @@ -16,8 +16,8 @@ def IMULDIV : FuncUnit; //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for Mips //===----------------------------------------------------------------------===// -def IIAlu : InstrItinClass; -def IIBranch : InstrItinClass; +// IIM16Alu is a placeholder class for most MIPS16 instructions. +def IIM16Alu : InstrItinClass; def IIPseudo : InstrItinClass; def II_ABS : InstrItinClass; @@ -28,7 +28,19 @@ def II_ADD_D : InstrItinClass; def II_ADD_S : InstrItinClass; def II_AND : InstrItinClass; def II_ANDI : InstrItinClass; +def II_B : InstrItinClass; def II_BADDU : InstrItinClass; +def II_BBIT : InstrItinClass; // bbit[01], bbit[01]32 +def II_BC : InstrItinClass; +def II_BC1F : InstrItinClass; +def II_BC1FL : InstrItinClass; +def II_BC1T : InstrItinClass; +def II_BC1TL : InstrItinClass; +def II_BCC : InstrItinClass; // beq and bne +def II_BCCZ : InstrItinClass; // b[gl][et]z +def II_BCCZAL : InstrItinClass; // bgezal and bltzal +def II_BCCZALS : InstrItinClass; // bgezals and bltzals +def II_BCCZC : InstrItinClass; // beqzc, bnezc def II_CEIL : InstrItinClass; def II_CFC1 : InstrItinClass; def II_CLO : InstrItinClass; @@ -68,21 +80,39 @@ def II_DSUB : InstrItinClass; def II_EXT : InstrItinClass; // Any EXT instruction def II_FLOOR : InstrItinClass; def II_INS : InstrItinClass; // Any INS instruction +def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo. +def II_J : InstrItinClass; +def II_JAL : InstrItinClass; +def II_JALR : InstrItinClass; +def II_JALRC : InstrItinClass; +def II_JALRS : InstrItinClass; +def II_JALS : InstrItinClass; +def II_JR : InstrItinClass; +def II_JRADDIUSP : InstrItinClass; +def II_JRC : InstrItinClass; +def II_ReturnPseudo : InstrItinClass; // Return pseudo. 
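+// Load/store itinerary classes. The II_*E classes below are the EVA
+// (Enhanced Virtual Addressing) variants, e.g. II_LBE for lbe, matching the
+// HasEVA subtarget feature introduced by this change.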
def II_LB : InstrItinClass; +def II_LBE : InstrItinClass; def II_LBU : InstrItinClass; +def II_LBUE : InstrItinClass; def II_LD : InstrItinClass; def II_LDC1 : InstrItinClass; def II_LDL : InstrItinClass; def II_LDR : InstrItinClass; def II_LDXC1 : InstrItinClass; def II_LH : InstrItinClass; +def II_LHE : InstrItinClass; def II_LHU : InstrItinClass; +def II_LHUE : InstrItinClass; def II_LUI : InstrItinClass; def II_LUXC1 : InstrItinClass; def II_LW : InstrItinClass; +def II_LWE : InstrItinClass; def II_LWC1 : InstrItinClass; def II_LWL : InstrItinClass; +def II_LWLE : InstrItinClass; def II_LWR : InstrItinClass; +def II_LWRE : InstrItinClass; def II_LWU : InstrItinClass; def II_LWXC1 : InstrItinClass; def II_MADD : InstrItinClass; @@ -134,6 +164,7 @@ def II_ROTRV : InstrItinClass; def II_ROUND : InstrItinClass; def II_SAVE : InstrItinClass; def II_SB : InstrItinClass; +def II_SBE : InstrItinClass; def II_SD : InstrItinClass; def II_SDC1 : InstrItinClass; def II_SDL : InstrItinClass; @@ -144,6 +175,7 @@ def II_SEH : InstrItinClass; def II_SEQ_SNE : InstrItinClass; // seq and sne def II_SEQI_SNEI : InstrItinClass; // seqi and snei def II_SH : InstrItinClass; +def II_SHE : InstrItinClass; def II_SLL : InstrItinClass; def II_SLLV : InstrItinClass; def II_SLTI_SLTIU : InstrItinClass; // slti and sltiu @@ -159,11 +191,15 @@ def II_SUB_D : InstrItinClass; def II_SUB_S : InstrItinClass; def II_SUXC1 : InstrItinClass; def II_SW : InstrItinClass; +def II_SWE : InstrItinClass; def II_SWC1 : InstrItinClass; def II_SWL : InstrItinClass; +def II_SWLE : InstrItinClass; def II_SWR : InstrItinClass; +def II_SWRE : InstrItinClass; def II_SWXC1 : InstrItinClass; def II_TRUNC : InstrItinClass; +def II_WSBH : InstrItinClass; def II_XOR : InstrItinClass; def II_XORI : InstrItinClass; @@ -171,7 +207,7 @@ def II_XORI : InstrItinClass; // Mips Generic instruction itineraries. 
//===----------------------------------------------------------------------===// def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ - InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>, + InstrItinData<IIM16Alu , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDI , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDIU , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDU , [InstrStage<1, [ALU]>]>, @@ -240,7 +276,29 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_SAVE , [InstrStage<1, [ALU]>]>, InstrItinData<II_SEQ_SNE , [InstrStage<1, [ALU]>]>, InstrItinData<II_SEQI_SNEI , [InstrStage<1, [ALU]>]>, - InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>, + InstrItinData<II_B , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BBIT , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1F , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1FL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1T , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1TL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZ , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZAL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZALS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_IndirectBranchPseudo, [InstrStage<1, [ALU]>]>, + InstrItinData<II_J , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JAL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALR , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALRC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALRS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JR , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JRADDIUSP , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JRC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_ReturnPseudo , [InstrStage<1, [ALU]>]>, InstrItinData<II_DMUL , [InstrStage<17, [IMULDIV]>]>, InstrItinData<II_DMULT , [InstrStage<17, [IMULDIV]>]>, InstrItinData<II_DMULTU , [InstrStage<17, [IMULDIV]>]>, @@ -313,3 +371,5 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]> ]>; + +include "MipsScheduleP5600.td" diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td new file mode 100644 index 0000000..d32ae4f --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td @@ -0,0 +1,392 @@ +//==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +def MipsP5600Model : SchedMachineModel { + int IssueWidth = 2; // 2x dispatched per cycle + int MicroOpBufferSize = 48; // min(48, 48, 64) + int LoadLatency = 4; + int MispredictPenalty = 8; // TODO: Estimated + + let CompleteModel = 1; +} + +let SchedModel = MipsP5600Model in { + +// ALQ Pipelines +// ============= + +def P5600ALQ : ProcResource<1> { let BufferSize = 16; } +def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; } + +// ALU Pipeline +// ------------ + +def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>; + +// and, lui, nor, or, slti, sltiu, sub, subu, xor +def : ItinRW<[P5600WriteALU], + [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUBU, II_XOR]>; + +// AGQ Pipelines +// ============= + +def P5600AGQ : ProcResource<3> { let BufferSize = 16; } +def P5600IssueAL2 : ProcResource<1> { let Super = P5600AGQ; } +def P5600IssueCTISTD : ProcResource<1> { let Super = P5600AGQ; } +def P5600IssueLDST : ProcResource<1> { let Super = P5600AGQ; } + +def P5600AL2Div : ProcResource<1>; +// Pseudo-resource used to block CTISTD when handling multi-pipeline splits. +def P5600CTISTD : ProcResource<1>; + +// CTISTD Pipeline +// --------------- + +def P5600WriteJump : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>; +def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> { + let Latency = 2; +} + +// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx, +// jalr, jr.hb, jr +def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR]>; +def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR]>; + +// LDST Pipeline +// ------------- + +def P5600WriteLoad : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 4; +} + +def P5600WriteLoadShifted : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> { + let Latency = 4; +} + +def P5600WritePref : SchedWriteRes<[P5600IssueLDST]>; + +def P5600WriteStore : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> { + // FIXME: This is a bit pessimistic. P5600CTISTD is only used during cycle 2 + // not during 0, 1, and 2. + let ResourceCycles = [ 1, 3 ]; +} + +def P5600WriteGPRFromBypass : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 2; +} + +def P5600WriteStoreFromOtherUnits : SchedWriteRes<[P5600IssueLDST]>; +def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 0; +} + +// l[bhw], l[bh]u, ll +def : ItinRW<[P5600WriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LWU]>; + +// lw[lr] +def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWR]>; + +// s[bhw], sw[lr] +def : ItinRW<[P5600WriteStore], [II_SB, II_SH, II_SW, II_SWL, II_SWR]>; + +// pref +// (this instruction does not exist in the backend yet) +def : ItinRW<[P5600WritePref], []>; + +// sc +// (this instruction does not exist in the backend yet) +def : ItinRW<[P5600WriteStore], []>; + +// LDST is also used in moves from general purpose registers to floating point +// and MSA. 
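+// (The zero latency on the def below presumably models only the GPR-side
+// uop of the transfer; the visible latency is charged to the consuming
+// FPU/MSA write via the WriteSequences under 'Tricky Instructions'.)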
+def P5600WriteMoveGPRToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+  let Latency = 0;
+}
+
+// AL2 Pipeline
+// ------------
+
+def P5600WriteAL2 : SchedWriteRes<[P5600IssueAL2]>;
+def P5600WriteAL2BitExt : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2ShadowMov : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2CondMov : SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+  let Latency = 2;
+}
+def P5600WriteAL2Div : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+  // Estimated worst case
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2DivU : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+  // Estimated worst case
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2Mul : SchedWriteRes<[P5600IssueAL2]> { let Latency = 3; }
+def P5600WriteAL2Mult: SchedWriteRes<[P5600IssueAL2]> { let Latency = 5; }
+def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+  let Latency = 5;
+}
+
+// clo, clz, di, mfhi, mflo
+def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_MFHI_MFLO]>;
+
+// ehb, rdhwr, rdpgpr, wrpgpr, wsbh
+def : ItinRW<[P5600WriteAL2ShadowMov], [II_RDHWR]>;
+
+// mov[nz]
+def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>;
+
+// divu?
+def : ItinRW<[P5600WriteAL2Div], [II_DIV]>;
+def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>;
+
+// mul
+def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>;
+// multu?
+def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>;
+// maddu?, msubu?, mthi, mtlo
+def : ItinRW<[P5600WriteAL2MAdd],
+             [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>;
+
+// ext, ins
+def : ItinRW<[P5600WriteAL2BitExt],
+             [II_EXT, II_INS]>;
+
+// Either ALU or AL2 Pipelines
+// ---------------------------
+//
+// Some instructions can choose between ALU and AL2, but once dispatched to
+// ALQ or AGQ respectively, they are committed to that path.
+// The decision is based on the outcome of the most recent selection when the
+// choice was last available. For now, we assume ALU is always chosen.
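+// The constant-true SchedPredicate [{1}] below hard-wires that assumption:
+// the variant always resolves to P5600WriteALU, while the AL2 alternative
+// is kept in place for when a real selection predicate is implemented.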
+ +def P5600WriteEitherALU : SchedWriteVariant< + // FIXME: Implement selection predicate + [SchedVar<SchedPredicate<[{1}]>, [P5600WriteALU]>, + SchedVar<SchedPredicate<[{0}]>, [P5600WriteAL2]> + ]>; + +// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu, +// xori +def : ItinRW<[P5600WriteEitherALU], + [II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH, + II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV, + II_SRAV, II_SRLV]>; + +// FPU Pipelines +// ============= + +def P5600FPQ : ProcResource<3> { let BufferSize = 16; } +def P5600IssueFPUS : ProcResource<1> { let Super = P5600FPQ; } +def P5600IssueFPUL : ProcResource<1> { let Super = P5600FPQ; } +def P5600IssueFPULoad : ProcResource<1> { let Super = P5600FPQ; } + +def P5600FPUDivSqrt : ProcResource<2>; + +def P5600WriteFPUS : SchedWriteRes<[P5600IssueFPUS]>; +def P5600WriteFPUL : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 4; } +def P5600WriteFPUL_MADDSUB : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 6; } +def P5600WriteFPUDivS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 23 / 27 + let Latency = 23; // Using common case + let ResourceCycles = [ 1, 23 ]; +} +def P5600WriteFPUDivD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 31 / 35 + let Latency = 31; // Using common case + let ResourceCycles = [ 1, 31 ]; +} +def P5600WriteFPURcpS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 19 / 23 + let Latency = 19; // Using common case + let ResourceCycles = [ 1, 19 ]; +} +def P5600WriteFPURcpD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPURsqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 27 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPURsqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPUSqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPUSqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 35 / 39 + let Latency = 35; // Using common case + let ResourceCycles = [ 1, 35 ]; +} +def P5600WriteMSAShortLogic : SchedWriteRes<[P5600IssueFPUS]>; +def P5600WriteMSAShortInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 2; } +def P5600WriteMoveOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPUS]>; + +// FPUS is also used in moves from floating point and MSA registers to general +// purpose registers. +def P5600WriteMoveFPUSToOtherUnits : SchedWriteRes<[P5600IssueFPUS]> { + let Latency = 0; +} + +// FPUL is also used in moves from floating point and MSA registers to general +// purpose registers. 
+def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>;
+
+// Short Pipe
+// ----------
+//
+// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1,
+// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b,
+// sdxc1, sdc1, st.[bhwd], swc1, swxc1
+def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D,
+                                II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>;
+
+// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
+// aver?_[us].[bhwd]
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>;
+// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it.
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>;
+
+// and.v, andi.b, move.v, ldi.[bhwd]
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+
+// Long Pipe
+// ----------
+//
+// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
+// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
+// trunc.w.[ds], trunc.w.ps
+def : ItinRW<[P5600WriteFPUL],
+             [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D,
+              II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>;
+
+// div.[ds], div.ps
+def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>;
+def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>;
+
+// sqrt.[ds], sqrt.ps
+def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>;
+def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>;
+
+// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds],
+// Operand 0 is read on cycle 5. All other operands are read on cycle 0.
+def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
+             [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D,
+              II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>;
+
+// madd.ps, msub.ps, nmadd.ps, nmsub.ps
+// Operands 0 and 1 are read on cycle 5. All others are read on cycle 0.
+// (none of these instructions exist in the backend yet)
+
+// Load Pipe
+// ---------
+//
+// This is typically used in conjunction with the load pipeline under the AGQ.
+// All the instructions are in the 'Tricky Instructions' section.
+
+def P5600WriteLoadOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPULoad]> {
+  let Latency = 4;
+}
+
+// Tricky Instructions
+// ===================
+//
+// These instructions are split across multiple uops (in different pipelines)
+// that must cooperate to complete the operation.
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveGPRToFPU : WriteSequence<[P5600WriteMoveGPRToOtherUnits,
+                                            P5600WriteMoveOtherUnitsToFPU]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteMoveFPUToGPR : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+                                            P5600WriteGPRFromBypass]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
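+// (In practice the aggregation means a sequence such as P5600WriteStoreFPUS
+// below reserves both of its issue ports as though they were busy in the
+// same cycle, which is likely somewhat pessimistic.)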
+def P5600WriteStoreFPUS : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+                                         P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteStoreFPUL : WriteSequence<[P5600WriteMoveFPULToOtherUnits,
+                                         P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+// currently aggregates the resources and ignores the exact cycle they are
+// used.
+def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits,
+                                       P5600WriteLoadOtherUnitsToFPU]>;
+
+// ctc1, mtc1, mthc1
+def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>;
+
+// bc1[ft], cfc1, mfc1, mfhc1, movf, movt
+def : ItinRW<[P5600WriteMoveFPUToGPR],
+             [II_BC1F, II_BC1T, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>;
+
+// swc1, swxc1, st.[bhwd]
+def : ItinRW<[P5600WriteStoreFPUS], [II_SWC1, II_SWXC1]>;
+def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
+
+// movn.[ds], movz.[ds]
+def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>;
+
+// l[dw]x?c1, ld.[bhwd]
+def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1]>;
+def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
+
+// Unsupported Instructions
+// ========================
+//
+// The following instruction classes are never valid on P5600.
+// II_DADDIU, II_DADDU, II_DMFC1, II_DMTC1, II_DMULT, II_DMULTU, II_DROTR,
+// II_DROTR32, II_DROTRV, II_DDIV, II_DSLL, II_DSLL32, II_DSLLV, II_DSRA,
+// II_DSRA32, II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV, II_DSUBU, II_DDIVU,
+// II_JALRC, II_LD, II_LD[LR], II_LUXC1, II_RESTORE, II_SAVE, II_SD, II_SDC1,
+// II_SDL, II_SDR, II_SDXC1
+//
+// The following instructions are never valid on P5600.
+// addq.ph, rdhwr, repl.ph, repl.qb, subq.ph, subu_s.qb
+//
+// Guesswork
+// =========
+//
+// This section is largely temporary guesswork.
+ +// ceil.[lw].[ds], floor.[lw].[ds] +// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL +def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>; + +// rotrv +// Reason behind guess: rotr is in the same category and the two register forms +// generally follow the immediate forms in this category +def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp index 471b6e1..8a18b51 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp @@ -69,8 +69,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false), - HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), - HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(), + HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), + Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM), + TargetTriple(TT), TSInfo(), InstrInfo( MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h index 1db8881..fbb01fe 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h @@ -42,9 +42,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo { Mips3, Mips4, Mips5, Mips64, Mips64r2, Mips64r3, Mips64r5, Mips64r6 }; + enum class CPU { P5600 }; + // Mips architecture version MipsArchEnum MipsArchVersion; + // Processor implementation (unused but required to exist by + // tablegen-erated code). + CPU ProcImpl; + // IsLittle - The target is Little Endian bool IsLittle; @@ -116,8 +122,8 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // InMicroMips -- can process MicroMips instructions bool InMicroMipsMode; - // HasDSP, HasDSPR2 -- supports DSP ASE. - bool HasDSP, HasDSPR2; + // HasDSP, HasDSPR2, HasDSPR3 -- supports DSP ASE. + bool HasDSP, HasDSPR2, HasDSPR3; // Allow mixed Mips16 and Mips32 in one source file bool AllowMixed16_32; @@ -130,6 +136,12 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMSA -- supports MSA ASE. bool HasMSA; + // UseTCCInDIV -- Enables the use of trapping in the assembler. + bool UseTCCInDIV; + + // HasEVA -- supports EVA ASE. 
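+  // (EVA supplies the kernel-mode loads and stores that access the user
+  // virtual address space -- lbe, lhe, lwe, sbe, she, swe and friends --
+  // which the II_*E itinerary classes in MipsSchedule.td describe.)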
+ bool HasEVA; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -189,7 +201,7 @@ public: } bool hasMips32r5() const { return (MipsArchVersion >= Mips32r5 && MipsArchVersion < Mips32Max) || - hasMips64r2(); + hasMips64r5(); } bool hasMips32r6() const { return (MipsArchVersion >= Mips32r6 && MipsArchVersion < Mips32Max) || @@ -228,9 +240,12 @@ public: } bool inMicroMipsMode() const { return InMicroMipsMode; } bool inMicroMips32r6Mode() const { return InMicroMipsMode && hasMips32r6(); } + bool inMicroMips64r6Mode() const { return InMicroMipsMode && hasMips64r6(); } bool hasDSP() const { return HasDSP; } bool hasDSPR2() const { return HasDSPR2; } + bool hasDSPR3() const { return HasDSPR3; } bool hasMSA() const { return HasMSA; } + bool hasEVA() const { return HasEVA; } bool useSmallSection() const { return UseSmallSection; } bool hasStandardEncoding() const { return !inMips16Mode(); } diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 1c77745..3e63872 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -233,7 +233,7 @@ void MipsPassConfig::addPreRegAlloc() { } TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { if (Subtarget->allowMixed16_32()) { DEBUG(errs() << "No Target Transform Info Pass Added\n"); // FIXME: This is no longer necessary as the TTI returned is per-function. diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp index 0f2db60..146f33b 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -76,7 +76,7 @@ bool MipsTargetObjectFile:: IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, SectionKind Kind) const { return (IsGlobalInSmallSectionImpl(GV, TM) && - (Kind.isDataRel() || Kind.isBSS() || Kind.isCommon())); + (Kind.isData() || Kind.isBSS() || Kind.isCommon())); } /// Return true if this global address should be placed into small data/bss @@ -107,7 +107,8 @@ IsGlobalInSmallSectionImpl(const GlobalValue *GV, return false; Type *Ty = GV->getType()->getElementType(); - return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty)); + return IsInSmallSection( + GV->getParent()->getDataLayout().getTypeAllocSize(Ty)); } MCSection * @@ -120,7 +121,7 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, // Handle Small Section classification here. if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallBSSSection; - if (Kind.isDataRel() && IsGlobalInSmallSection(GV, TM, Kind)) + if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallDataSection; // Otherwise, we work the same as ELF. @@ -128,21 +129,20 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, } /// Return true if this constant should be placed into small data section. 
-bool MipsTargetObjectFile::
-IsConstantInSmallSection(const Constant *CN, const TargetMachine &TM) const {
+bool MipsTargetObjectFile::IsConstantInSmallSection(
+    const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const {
   return (static_cast<const MipsTargetMachine &>(TM)
               .getSubtargetImpl()
               ->useSmallSection() &&
-          LocalSData && IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(
-                            CN->getType())));
+          LocalSData && IsInSmallSection(DL.getTypeAllocSize(CN->getType())));
 }
 
-MCSection *
-MipsTargetObjectFile::getSectionForConstant(SectionKind Kind,
-                                            const Constant *C) const {
-  if (IsConstantInSmallSection(C, *TM))
+/// Return the section into which this constant should be placed, using the
+/// small data section when appropriate.
+MCSection *MipsTargetObjectFile::getSectionForConstant(
+    const DataLayout &DL, SectionKind Kind, const Constant *C) const {
+  if (IsConstantInSmallSection(DL, C, *TM))
     return SmallDataSection;
 
   // Otherwise, we work the same as ELF.
-  return TargetLoweringObjectFileELF::getSectionForConstant(Kind, C);
+  return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C);
 }
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
index 725f2ff..ba04343 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
@@ -36,10 +36,10 @@ class MipsTargetMachine;
                               const TargetMachine &TM) const override;
 
   /// Return true if this constant should be placed into small data section.
-  bool IsConstantInSmallSection(const Constant *CN,
+  bool IsConstantInSmallSection(const DataLayout &DL, const Constant *CN,
                                 const TargetMachine &TM) const;
 
-  MCSection *getSectionForConstant(SectionKind Kind,
+  MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
                                    const Constant *C) const override;
 };
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
index 6ce1be7..b3222f5 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -12,6 +12,7 @@
 #include "MCTargetDesc/MipsABIFlagsSection.h"
 #include "MCTargetDesc/MipsABIInfo.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
@@ -77,8 +78,12 @@ public:
   // PIC support
   virtual void emitDirectiveCpLoad(unsigned RegNo);
+  virtual void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts,
+                                      int Offset);
   virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
                                     const MCSymbol &Sym, bool IsReg);
+  virtual void emitDirectiveCpreturn(unsigned SaveLocation,
+                                     bool SaveLocationIsRegister);
 
   // FP abiflags directives
   virtual void emitDirectiveModuleFP();
@@ -97,18 +102,18 @@ public:
   // structure values.
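  // (With this change the streamer holds the ABI by value in an
  // llvm::Optional instead of pointing into the predicate library, so
  // updateABIInfo() copies P.getABI() and getABI() asserts hasValue().)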
template <class PredicateLibrary> void updateABIInfo(const PredicateLibrary &P) { - ABI = &P.getABI(); + ABI = P.getABI(); ABIFlagsSection.setAllFromPredicates(P); } MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; } const MipsABIInfo &getABI() const { - assert(ABI && "ABI hasn't been set!"); + assert(ABI.hasValue() && "ABI hasn't been set!"); return *ABI; } protected: - const MipsABIInfo *ABI; + llvm::Optional<MipsABIInfo> ABI; MipsABIFlagsSection ABIFlagsSection; bool GPRInfoSet; @@ -188,8 +193,12 @@ public: // PIC support void emitDirectiveCpLoad(unsigned RegNo) override; + void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts, + int Offset) override; void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + void emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) override; // FP abiflags directives void emitDirectiveModuleFP() override; @@ -237,8 +246,12 @@ public: // PIC support void emitDirectiveCpLoad(unsigned RegNo) override; + void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts, + int Offset) override; void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + void emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) override; void emitMipsAbiFlags(); }; diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h index 02c5a21..f0f223a 100644 --- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h @@ -15,11 +15,9 @@ #define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { -class MCOperand; class MCSubtargetInfo; class NVPTXInstPrinter : public MCInstPrinter { diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index b432e06..9ac3c88 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -22,6 +22,7 @@ class Triple; class NVPTXMCAsmInfo : public MCAsmInfo { virtual void anchor(); + public: explicit NVPTXMCAsmInfo(const Triple &TheTriple); }; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h index fe28214..e5fae85 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTX.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h @@ -41,24 +41,6 @@ enum CondCodes { }; } -inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { - switch (CC) { - case NVPTXCC::NE: - return "ne"; - case NVPTXCC::EQ: - return "eq"; - case NVPTXCC::LT: - return "lt"; - case NVPTXCC::LE: - return "le"; - case NVPTXCC::GT: - return "gt"; - case NVPTXCC::GE: - return "ge"; - } - llvm_unreachable("Unknown condition code"); -} - FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); ModulePass *createNVPTXAssignValidGlobalNamesPass(); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ecb0f0a..e8c3608 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -355,7 +355,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { if (isABI) { if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) { unsigned 
size = 0; - if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) { + if (auto *ITy = dyn_cast<IntegerType>(Ty)) { size = ITy->getBitWidth(); if (size < 32) size = 32; @@ -635,9 +635,7 @@ static bool usedInGlobalVarDef(const Constant *C) { return false; if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { - if (GV->getName() == "llvm.used") - return false; - return true; + return GV->getName() != "llvm.used"; } for (const User *U : C->users()) @@ -682,7 +680,7 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) { static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (!gv->hasInternalLinkage()) return false; - const PointerType *Pty = gv->getType(); + PointerType *Pty = gv->getType(); if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) return false; @@ -720,7 +718,7 @@ static bool useFuncSeen(const Constant *C, void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { llvm::DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { - const Function *F = FI; + const Function *F = &*FI; if (F->isDeclaration()) { if (F->use_empty()) @@ -870,9 +868,8 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) { DenseSet<const GlobalVariable *> GVVisiting; // Visit each global variable, in order - for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting); + for (const GlobalVariable &I : M.globals()) + VisitGlobalVariableForEmission(&I, Globals, GVVisited, GVVisiting); assert(GVVisited.size() == M.getGlobalList().size() && "Missed a global variable"); @@ -1029,10 +1026,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, GVar->getName().startswith("nvvm.")) return; - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. - const PointerType *PTy = GVar->getType(); + PointerType *PTy = GVar->getType(); Type *ETy = PTy->getElementType(); if (GVar->hasExternalLinkage()) { @@ -1159,7 +1156,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } if (GVar->getAlignment() == 0) - O << " .align " << (int) TD->getPrefTypeAlignment(ETy); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); @@ -1185,9 +1182,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, printScalarConstant(Initializer, O); } } else { - // The frontend adds zero-initializer to variables that don't have an - // initial value, so skip warning for this case. - if (!GVar->getInitializer()->isNullValue()) { + // The frontend adds zero-initializer to device and constant variables + // that don't have an initial value, and UndefValue to shared + // variables, so skip warning for this case. + if (!GVar->getInitializer()->isNullValue() && + !isa<UndefValue>(GVar->getInitializer())) { report_fatal_error("initial value of '" + GVar->getName() + "' is not allowed in addrspace(" + Twine(PTy->getAddressSpace()) + ")"); @@ -1205,7 +1204,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, case Type::StructTyID: case Type::ArrayTyID: case Type::VectorTyID: - ElementSize = TD->getTypeStoreSize(ETy); + ElementSize = DL.getTypeStoreSize(ETy); // Ptx allows variable initilization only for constant and // global state spaces. 
if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || @@ -1299,7 +1298,7 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, } std::string -NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { +NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const { switch (Ty->getTypeID()) { default: llvm_unreachable("unexpected type"); @@ -1339,16 +1338,16 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. - const PointerType *PTy = GVar->getType(); + PointerType *PTy = GVar->getType(); Type *ETy = PTy->getElementType(); O << "."; emitPTXAddressSpace(PTy->getAddressSpace(), O); if (GVar->getAlignment() == 0) - O << " .align " << (int) TD->getPrefTypeAlignment(ETy); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); @@ -1370,7 +1369,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, case Type::StructTyID: case Type::ArrayTyID: case Type::VectorTyID: - ElementSize = TD->getTypeStoreSize(ETy); + ElementSize = DL.getTypeStoreSize(ETy); O << " .b8 "; getSymbol(GVar)->print(O, MAI); O << "["; @@ -1385,32 +1384,32 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, return; } -static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) { +static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { if (Ty->isSingleValueType()) - return TD->getPrefTypeAlignment(Ty); + return DL.getPrefTypeAlignment(Ty); - const ArrayType *ATy = dyn_cast<ArrayType>(Ty); + auto *ATy = dyn_cast<ArrayType>(Ty); if (ATy) - return getOpenCLAlignment(TD, ATy->getElementType()); + return getOpenCLAlignment(DL, ATy->getElementType()); - const StructType *STy = dyn_cast<StructType>(Ty); + auto *STy = dyn_cast<StructType>(Ty); if (STy) { unsigned int alignStruct = 1; // Go through each element of the struct and find the // largest alignment. 
for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) { Type *ETy = STy->getElementType(i); - unsigned int align = getOpenCLAlignment(TD, ETy); + unsigned int align = getOpenCLAlignment(DL, ETy); if (align > alignStruct) alignStruct = align; } return alignStruct; } - const FunctionType *FTy = dyn_cast<FunctionType>(Ty); + auto *FTy = dyn_cast<FunctionType>(Ty); if (FTy) - return TD->getPointerPrefAlignment(); - return TD->getPrefTypeAlignment(Ty); + return DL.getPointerPrefAlignment(); + return DL.getPrefTypeAlignment(Ty); } void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, @@ -1419,13 +1418,8 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, O << "_param_" << paramIndex; } -void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) { - CurrentFnSym->print(O, MAI); - O << "_param_" << paramIndex; -} - void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const AttributeSet &PAL = F->getAttributes(); const TargetLowering *TLI = nvptxSubtarget->getTargetLowering(); Function::const_arg_iterator I, E; @@ -1433,7 +1427,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { bool first = true; bool isKernelFunc = llvm::isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); - MVT thePointerTy = TLI->getPointerTy(*TD); + MVT thePointerTy = TLI->getPointerTy(DL); O << "(\n"; @@ -1485,9 +1479,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); if (align == 0) - align = TD->getABITypeAlignment(Ty); + align = DL.getABITypeAlignment(Ty); - unsigned sz = TD->getTypeAllocSize(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); O << "\t.param .align " << align << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1495,7 +1489,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { continue; } // Just a scalar - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); if (isKernelFunc) { if (PTy) { // Special handling for pointer arguments to kernel @@ -1519,7 +1513,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << ".ptr .global "; break; } - O << ".align " << (int) getOpenCLAlignment(TD, ETy) << " "; + O << ".align " << (int)getOpenCLAlignment(DL, ETy) << " "; } printParamName(I, paramIndex, O); continue; @@ -1556,7 +1550,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } // param has byVal attribute. 
So should be a pointer - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); Type *ETy = PTy->getElementType(); @@ -1566,9 +1560,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); if (align == 0) - align = TD->getABITypeAlignment(ETy); + align = DL.getABITypeAlignment(ETy); - unsigned sz = TD->getTypeAllocSize(ETy); + unsigned sz = DL.getTypeAllocSize(ETy); O << "\t.param .align " << align << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1579,7 +1573,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Further, if a part is vector, print the above for // each vector element. SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, getDataLayout(), ETy, vtparts); + ComputeValueVTs(*TLI, DL, ETy, vtparts); for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { unsigned elems = 1; EVT elemtype = vtparts[i]; @@ -1786,10 +1780,10 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) { void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); if (isa<UndefValue>(CPV) || CPV->isNullValue()) { - int s = TD->getTypeAllocSize(CPV->getType()); + int s = DL.getTypeAllocSize(CPV->getType()); if (s < Bytes) s = Bytes; aggBuffer->addZeros(s); @@ -1800,7 +1794,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, switch (CPV->getType()->getTypeID()) { case Type::IntegerTyID: { - const Type *ETy = CPV->getType(); + Type *ETy = CPV->getType(); if (ETy == Type::getInt8Ty(CPV->getContext())) { unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue(); ConvertIntToBytes<>(ptr, c); @@ -1817,7 +1811,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, *TD))) { + ConstantFoldConstantExpression(Cexpr, DL))) { int int32 = (int)(constInt->getZExtValue()); ConvertIntToBytes<>(ptr, int32); aggBuffer->addBytes(ptr, 4, Bytes); @@ -1839,7 +1833,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, *TD))) { + ConstantFoldConstantExpression(Cexpr, DL))) { long long int64 = (long long)(constInt->getZExtValue()); ConvertIntToBytes<>(ptr, int64); aggBuffer->addBytes(ptr, 8, Bytes); @@ -1860,7 +1854,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::FloatTyID: case Type::DoubleTyID: { const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); - const Type *Ty = CFP->getType(); + Type *Ty = CFP->getType(); if (Ty == Type::getFloatTy(CPV->getContext())) { float float32 = (float) CFP->getValueAPF().convertToFloat(); ConvertFloatToBytes(ptr, float32); @@ -1881,7 +1875,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, const Value *v = Cexpr->stripPointerCasts(); aggBuffer->addSymbol(v, Cexpr); } - unsigned int s = TD->getTypeAllocSize(CPV->getType()); + unsigned int s = DL.getTypeAllocSize(CPV->getType()); aggBuffer->addZeros(s); break; } @@ -1891,7 
+1885,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::StructTyID: { if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) || isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) { - int ElementSize = TD->getTypeAllocSize(CPV->getType()); + int ElementSize = DL.getTypeAllocSize(CPV->getType()); bufferAggregateConstant(CPV, aggBuffer); if (Bytes > ElementSize) aggBuffer->addZeros(Bytes - ElementSize); @@ -1909,7 +1903,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, AggBuffer *aggBuffer) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); int Bytes; // Old constants @@ -1934,12 +1928,12 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, StructType *ST = cast<StructType>(CPV->getType()); for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) { if (i == (e - 1)) - Bytes = TD->getStructLayout(ST)->getElementOffset(0) + - TD->getTypeAllocSize(ST) - - TD->getStructLayout(ST)->getElementOffset(i); + Bytes = DL.getStructLayout(ST)->getElementOffset(0) + + DL.getTypeAllocSize(ST) - + DL.getStructLayout(ST)->getElementOffset(i); else - Bytes = TD->getStructLayout(ST)->getElementOffset(i + 1) - - TD->getStructLayout(ST)->getElementOffset(i); + Bytes = DL.getStructLayout(ST)->getElementOffset(i + 1) - + DL.getStructLayout(ST)->getElementOffset(i); bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer); } } @@ -1951,18 +1945,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, // buildTypeNameMap - Run through symbol table looking for type names. // -bool NVPTXAsmPrinter::isImageType(const Type *Ty) { - - std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty); - - if (PI != TypeNameMap.end() && (!PI->second.compare("struct._image1d_t") || - !PI->second.compare("struct._image2d_t") || - !PI->second.compare("struct._image3d_t"))) - return true; - - return false; -} - bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { switch (MI.getOpcode()) { @@ -2054,7 +2036,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) // If the code isn't optimized, there may be outstanding folding // opportunities. Attempt to fold the expression using DataLayout as a // last resort before giving up. - if (Constant *C = ConstantFoldConstantExpression(CE, *TM.getDataLayout())) + if (Constant *C = ConstantFoldConstantExpression(CE, getDataLayout())) if (C != CE) return lowerConstantForGV(C, ProcessingGeneric); @@ -2083,7 +2065,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) } case Instruction::GetElementPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Generate a symbolic expression for the byte address APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0); @@ -2109,7 +2091,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) return lowerConstantForGV(CE->getOperand(0), ProcessingGeneric); case Instruction::IntToPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Handle casts to pointers by changing them into casts to the appropriate // integer type. This promotes constant folding and simplifies this code. 
@@ -2120,7 +2102,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) } case Instruction::PtrToInt: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Support only foldable casts to/from pointers that can be eliminated by // changing the pointer to the appropriately sized integer type. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index f6f7685..76bf179 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -212,28 +212,21 @@ private: MCOperand GetSymbolRef(const MCSymbol *Symbol); unsigned encodeVirtualRegister(unsigned Reg); - void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {} - void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier, raw_ostream &O); void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = nullptr); - void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool = false); - void printParamName(int paramIndex, raw_ostream &O); void printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O); void emitGlobals(const Module &M); void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, raw_ostream &); - void emitFunctionExternParamList(const MachineFunction &MF); void emitFunctionParamList(const Function *, raw_ostream &O); void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O); void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF); - void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize); - bool isImageType(const Type *Ty); void printReturnValStr(const Function *, raw_ostream &O); void printReturnValStr(const MachineFunction &MF, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, @@ -271,7 +264,7 @@ private: // Build the map between type name and ID based on module's type // symbol table. - std::map<const Type *, std::string> TypeNameMap; + std::map<Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. 
std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; @@ -282,19 +275,15 @@ private: void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O); void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const; - std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const; + std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const; void printScalarConstant(const Constant *CPV, raw_ostream &O); void printFPConstant(const ConstantFP *Fp, raw_ostream &O); void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer); void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer); - void printOperandProper(const MachineOperand &MO); - void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); - - static const char *getRegisterName(unsigned RegNo); void emitDemotedVars(const Function *, raw_ostream &); bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp index 69a229e..95813c8 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -98,7 +98,7 @@ private: /// This reordering exposes to optimizeMemoryInstruction more /// optimization opportunities on loads and stores. /// - /// If this function succesfully hoists an eliminable addrspacecast or V is + /// If this function successfully hoists an eliminable addrspacecast or V is /// already such an addrspacecast, it returns the transformed value (which is /// guaranteed to be an addrspacecast); otherwise, it returns nullptr. 
Value *hoistAddrSpaceCastFrom(Value *V, int Depth = 0); @@ -267,14 +267,14 @@ bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) { return false; bool Changed = false; - for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) { + for (BasicBlock &B : F) { + for (Instruction &I : B) { if (isa<LoadInst>(I)) { // V = load P - Changed |= optimizeMemoryInstruction(I, 0); + Changed |= optimizeMemoryInstruction(&I, 0); } else if (isa<StoreInst>(I)) { // store V, P - Changed |= optimizeMemoryInstruction(I, 1); + Changed |= optimizeMemoryInstruction(&I, 1); } } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 6fd09c4..62ca5e9 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -81,7 +81,7 @@ bool GenericToNVVM::runOnModule(Module &M) { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;) { - GlobalVariable *GV = I++; + GlobalVariable *GV = &*I++; if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { @@ -117,7 +117,7 @@ bool GenericToNVVM::runOnModule(Module &M) { Value *Operand = II->getOperand(i); if (isa<Constant>(Operand)) { II->setOperand( - i, remapConstant(&M, I, cast<Constant>(Operand), Builder)); + i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder)); } } } @@ -132,10 +132,8 @@ bool GenericToNVVM::runOnModule(Module &M) { // Walk through the metadata section and update the debug information // associated with the global variables in the default address space. 
- for (Module::named_metadata_iterator I = M.named_metadata_begin(), - E = M.named_metadata_end(); - I != E; I++) { - remapNamedMDNode(VM, I); + for (NamedMDNode &I : M.named_metadata()) { + remapNamedMDNode(VM, &I); } // Walk through the global variable initializers, and replace any use of @@ -318,9 +316,8 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, NewOperands[0], NewOperands[1]); case Instruction::FCmp: // CompareConstantExpr (fcmp) - assert(false && "Address space conversion should have no effect " - "on float point CompareConstantExpr (fcmp)!"); - return C; + llvm_unreachable("Address space conversion should have no effect " + "on float point CompareConstantExpr (fcmp)!"); case Instruction::ExtractElement: // ExtractElementConstantExpr return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]); @@ -364,8 +361,7 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, return Builder.CreateCast(Instruction::CastOps(C->getOpcode()), NewOperands[0], C->getType()); } - assert(false && "GenericToNVVM encountered an unsupported ConstantExpr"); - return C; + llvm_unreachable("GenericToNVVM encountered an unsupported ConstantExpr"); } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 232a611..2d0098b 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "NVPTXISelDAGToDAG.h" +#include "NVPTXUtilities.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" @@ -530,7 +532,7 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { if (!Src) return NVPTX::PTXLdStInstCode::GENERIC; - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) { + if (auto *PT = dyn_cast<PointerType>(Src->getType())) { switch (PT->getAddressSpace()) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; @@ -544,6 +546,39 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } +static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, + unsigned CodeAddrSpace, MachineFunction *F) { + // To use non-coherent caching, the load has to be from global + // memory and we have to prove that the memory area is not written + // to anywhere for the duration of the kernel call, not even after + // the load. + // + // To ensure that there are no writes to the memory, we require the + // underlying pointer to be a noalias (__restrict) kernel parameter + // that is never used for a write. We can only do this for kernel + // functions since from within a device function, we cannot know if + // there were or will be writes to the memory from the caller - or we + // could, but then we would have to do inter-procedural analysis. + if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL || + !isKernelFunction(*F->getFunction())) { + return false; + } + + // We use GetUnderlyingObjects() here instead of + // GetUnderlyingObject() mainly because the former looks through phi + // nodes while the latter does not. We need to look through phi + // nodes to handle pointer induction variables. 
+ SmallVector<Value *, 8> Objs; + GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()), + Objs, F->getDataLayout()); + for (Value *Obj : Objs) { + auto *A = dyn_cast<const Argument>(Obj); + if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false; + } + + return true; +} + SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IID) { @@ -638,6 +673,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(LD); + if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) { + return SelectLDGLDU(N); + } + // Volatile Setting // - .volatile is only availalble for .global and .shared bool isVolatile = LD->isVolatile(); @@ -872,6 +911,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); + if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { + return SelectLDGLDU(N); + } + // Volatile Setting // - .volatile is only availalble for .global and .shared bool IsVolatile = MemSD->isVolatile(); @@ -1425,6 +1468,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1474,6 +1518,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1522,6 +1567,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1563,6 +1609,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1612,6 +1659,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1660,6 +1708,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1707,6 +1756,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1756,6 +1806,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1804,6 +1855,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1845,6 +1897,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1894,6 +1947,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1942,6 +1996,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: 
@@ -5039,7 +5094,7 @@ bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, } if (!Src) return false; - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + if (auto *PT = dyn_cast<PointerType>(Src->getType())) return (PT->getAddressSpace() == spN); return false; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index b75cf40..be735f6 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -124,6 +124,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // condition branches. setJumpIsExpensive(true); + // Wide divides are _very_ slow. Try to reduce the width of the divide if + // possible. + addBypassSlowDiv(64, 32); + // By default, use the Source scheduling if (sched4reg) setSchedulingPreference(Sched::RegPressure); @@ -275,6 +279,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SELECT); // Now deduce the information based on the above mentioned // actions @@ -910,7 +915,7 @@ std::string NVPTXTargetLowering::getPrototype( O << "("; if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { unsigned size = 0; - if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { + if (auto *ITy = dyn_cast<IntegerType>(retTy)) { size = ITy->getBitWidth(); if (size < 32) size = 32; @@ -981,7 +986,7 @@ std::string NVPTXTargetLowering::getPrototype( O << "_"; continue; } - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); Type *ETy = PTy->getElementType(); @@ -1318,7 +1323,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // struct or vector SmallVector<EVT, 16> vtparts; SmallVector<uint64_t, 16> Offsets; - const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); + auto *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(), vtparts, &Offsets, 0); @@ -2007,15 +2012,6 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, - int idx, EVT v) const { - std::string *name = nvTM->getManagedStrPool()->getManagedString(inname); - std::stringstream suffix; - suffix << idx; - *name += suffix.str(); - return DAG.getTargetExternalSymbol(name->c_str(), v); -} - SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { std::string ParamSym; @@ -2029,10 +2025,6 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); } -SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) { - return getExtSymb(DAG, ".HLPPARAM", idx); -} - // Check to see if the kernel argument is image*_t or sampler_t bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { @@ -2040,8 +2032,8 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { "struct._image3d_t", "struct._sampler_t" }; - const Type *Ty = arg->getType(); - const PointerType *PTy = dyn_cast<PointerType>(Ty); + Type *Ty = arg->getType(); + auto *PTy = dyn_cast<PointerType>(Ty); if (!PTy) return 
false; @@ -2049,14 +2041,11 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { if (!context) return false; - const StructType *STy = dyn_cast<StructType>(PTy->getElementType()); + auto *STy = dyn_cast<StructType>(PTy->getElementType()); const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : ""; - for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i) - if (TypeName == specialTypes[i]) - return true; - - return false; + return std::find(std::begin(specialTypes), std::end(specialTypes), + TypeName) != std::end(specialTypes); } SDValue NVPTXTargetLowering::LowerFormalArguments( @@ -2082,10 +2071,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( std::vector<Type *> argTypes; std::vector<const Argument *> theArgs; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) { - theArgs.push_back(I); - argTypes.push_back(I->getType()); + for (const Argument &I : F->args()) { + theArgs.push_back(&I); + argTypes.push_back(I.getType()); } // argTypes.size() (or theArgs.size()) and Ins.size() need not match. // Ins.size() will be larger @@ -2545,20 +2533,6 @@ void NVPTXTargetLowering::LowerAsmOperandForConstraint( TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -// NVPTX suuport vector of legal types of any length in Intrinsics because the -// NVPTX specific type legalizer -// will legalize them to the PTX supported length. -bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { - if (isTypeLegal(VT)) - return true; - if (VT.isVector()) { - MVT eVT = VT.getVectorElementType(); - if (isTypeLegal(eVT)) - return true; - } - return false; -} - static unsigned getOpcForTextureInstr(unsigned Intrinsic) { switch (Intrinsic) { default: @@ -3747,9 +3721,7 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, // - [immAddr] if (AM.BaseGV) { - if (AM.BaseOffs || AM.HasBaseReg || AM.Scale) - return false; - return true; + return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; } switch (AM.Scale) { @@ -3820,11 +3792,6 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { - return 4; -} - //===----------------------------------------------------------------------===// // NVPTX DAG Combining //===----------------------------------------------------------------------===// @@ -4057,6 +4024,67 @@ static SDValue PerformANDCombine(SDNode *N, return SDValue(); } +static SDValue PerformSELECTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Currently this detects patterns for integer min and max and + // lowers them to PTX-specific intrinsics that enable hardware + // support. + + const SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) return SDValue(); + + const SDValue LHS = Cond.getOperand(0); + const SDValue RHS = Cond.getOperand(1); + const SDValue True = N->getOperand(1); + const SDValue False = N->getOperand(2); + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + const EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + + const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue Larger; // The larger of LHS and RHS when condition is true. 
+ switch (CC) { + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + Larger = RHS; + break; + + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + Larger = LHS; + break; + + default: + return SDValue(); + } + const bool IsMax = (Larger == True); + const bool IsSigned = ISD::isSignedIntSetCC(CC); + + unsigned IntrinsicId; + if (VT == MVT::i32) { + if (IsSigned) + IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i; + else + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui; + } else { + assert(VT == MVT::i64); + if (IsSigned) + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll; + else + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull; + } + + SDLoc DL(N); + return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -4113,25 +4141,16 @@ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { APInt Val = CI->getAPIntValue(); if (LHSSign == Unsigned) { - if (Val.isIntN(OptSize)) { - return true; - } - return false; + return Val.isIntN(OptSize); } else { - if (Val.isSignedIntN(OptSize)) { - return true; - } - return false; + return Val.isSignedIntN(OptSize); } } else { OperandSignedness RHSSign; if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) return false; - if (LHSSign != RHSSign) - return false; - - return true; + return LHSSign == RHSSign; } } @@ -4247,6 +4266,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformSHLCombine(N, DCI, OptLevel); case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::SELECT: + return PerformSELECTCombine(N, DCI); } return SDValue(); } @@ -4509,25 +4530,26 @@ void NVPTXTargetLowering::ReplaceNodeResults( void NVPTXSection::anchor() {} NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { - delete TextSection; - delete DataSection; - delete BSSSection; - delete ReadOnlySection; - - delete StaticCtorSection; - delete StaticDtorSection; - delete LSDASection; - delete EHFrameSection; - delete DwarfAbbrevSection; - delete DwarfInfoSection; - delete DwarfLineSection; - delete DwarfFrameSection; - delete DwarfPubTypesSection; - delete DwarfDebugInlineSection; - delete DwarfStrSection; - delete DwarfLocSection; - delete DwarfARangesSection; - delete DwarfRangesSection; + delete static_cast<NVPTXSection *>(TextSection); + delete static_cast<NVPTXSection *>(DataSection); + delete static_cast<NVPTXSection *>(BSSSection); + delete static_cast<NVPTXSection *>(ReadOnlySection); + + delete static_cast<NVPTXSection *>(StaticCtorSection); + delete static_cast<NVPTXSection *>(StaticDtorSection); + delete static_cast<NVPTXSection *>(LSDASection); + delete static_cast<NVPTXSection *>(EHFrameSection); + delete static_cast<NVPTXSection *>(DwarfAbbrevSection); + delete static_cast<NVPTXSection *>(DwarfInfoSection); + delete static_cast<NVPTXSection *>(DwarfLineSection); + delete static_cast<NVPTXSection *>(DwarfFrameSection); + delete static_cast<NVPTXSection *>(DwarfPubTypesSection); + delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection); + delete static_cast<NVPTXSection *>(DwarfStrSection); + delete static_cast<NVPTXSection *>(DwarfLocSection); + delete static_cast<NVPTXSection *>(DwarfARangesSection); + delete static_cast<NVPTXSection *>(DwarfRangesSection); + delete static_cast<NVPTXSection *>(DwarfMacinfoSection); } MCSection * diff 
--git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index e5c3732..60914c1 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -441,13 +441,9 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset, - SelectionDAG &DAG) const; const char *getTargetNodeName(unsigned Opcode) const override; - bool isTypeSupportedInIntrinsic(MVT VT) const; - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const override; @@ -459,8 +455,13 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; + bool isTruncateFree(Type *SrcTy, Type *DstTy) const override { + // Truncating 64-bit to 32-bit is free in SASS. + if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) + return false; + return SrcTy->getPrimitiveSizeInBits() == 64 && + DstTy->getPrimitiveSizeInBits() == 32; + } EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override { @@ -515,11 +516,7 @@ public: private: const NVPTXSubtarget &STI; // cache the subtarget here - - SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx, - EVT = MVT::i32) const; SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; - SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx); SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 76d6597..9f3cf45 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -37,30 +37,31 @@ void NVPTXInstrInfo::copyPhysReg( const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - if (DestRC != SrcRC) - report_fatal_error("Attempted to created cross-class register copy"); - - if (DestRC == &NVPTX::Int32RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int1RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Float32RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int16RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int64RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Float64RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else { + if (DestRC->getSize() != SrcRC->getSize()) + report_fatal_error("Copy one register into another with a different width"); + + unsigned Op; + if (DestRC == &NVPTX::Int1RegsRegClass) { + Op = NVPTX::IMOV1rr; + } else if (DestRC == &NVPTX::Int16RegsRegClass) { + Op = NVPTX::IMOV16rr; + } else if (DestRC == &NVPTX::Int32RegsRegClass) { + Op = (SrcRC == &NVPTX::Int32RegsRegClass ? 
NVPTX::IMOV32rr + : NVPTX::BITCONVERT_32_F2I); + } else if (DestRC == &NVPTX::Int64RegsRegClass) { + Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr + : NVPTX::BITCONVERT_64_F2I); + } else if (DestRC == &NVPTX::Float32RegsRegClass) { + Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr + : NVPTX::BITCONVERT_32_I2F); + } else if (DestRC == &NVPTX::Float64RegsRegClass) { + Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64rr + : NVPTX::BITCONVERT_64_I2F); + } else { llvm_unreachable("Bad register copy"); } + BuildMI(MBB, I, DL, get(Op), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, @@ -86,27 +87,6 @@ bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, return false; } -bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const { - switch (MI.getOpcode()) { - default: - return false; - case NVPTX::INT_PTX_SREG_NTID_X: - case NVPTX::INT_PTX_SREG_NTID_Y: - case NVPTX::INT_PTX_SREG_NTID_Z: - case NVPTX::INT_PTX_SREG_TID_X: - case NVPTX::INT_PTX_SREG_TID_Y: - case NVPTX::INT_PTX_SREG_TID_Z: - case NVPTX::INT_PTX_SREG_CTAID_X: - case NVPTX::INT_PTX_SREG_CTAID_Y: - case NVPTX::INT_PTX_SREG_CTAID_Z: - case NVPTX::INT_PTX_SREG_NCTAID_X: - case NVPTX::INT_PTX_SREG_NCTAID_Y: - case NVPTX::INT_PTX_SREG_NCTAID_Z: - case NVPTX::INT_PTX_SREG_WARPSIZE: - return true; - } -} - bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const { bool isLoad = false; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 179c068..3e40722 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -56,7 +56,6 @@ public: unsigned &DestReg) const; bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const; - bool isReadSpecialReg(MachineInstr &MI) const; virtual bool CanTailMerge(const MachineInstr *MI) const; // Branch analysis. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0bf72fe..f770c2a 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -6,6 +6,8 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// +// \file // Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when // the size is large or is not a compile-time constant. 
// @@ -18,19 +20,20 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #define DEBUG_TYPE "nvptx" using namespace llvm; namespace { + // actual analysis class, which is a functionpass struct NVPTXLowerAggrCopies : public FunctionPass { static char ID; @@ -50,179 +53,299 @@ struct NVPTXLowerAggrCopies : public FunctionPass { return "Lower aggregate copies/intrinsics into loops"; } }; -} // namespace char NVPTXLowerAggrCopies::ID = 0; -// Lower MemTransferInst or load-store pair to loop -static void convertTransferToLoop( - Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len, - bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) { - Type *indType = len->getType(); +// Lower memcpy to loop. +void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr, + Value *DstAddr, Value *CopyLen, bool SrcIsVolatile, + bool DstIsVolatile, LLVMContext &Context, + Function &F) { + Type *TypeOfCopyLen = CopyLen->getType(); - BasicBlock *origBB = splitAt->getParent(); - BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); - BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); + BasicBlock *OrigBB = ConvertedInst->getParent(); + BasicBlock *NewBB = + ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split"); + BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB); - origBB->getTerminator()->setSuccessor(0, loopBB); - IRBuilder<> builder(origBB, origBB->getTerminator()); + OrigBB->getTerminator()->setSuccessor(0, LoopBB); + IRBuilder<> Builder(OrigBB->getTerminator()); - // srcAddr and dstAddr are expected to be pointer types, + // SrcAddr and DstAddr are expected to be pointer types, // so no check is made here. - unsigned srcAS = cast<PointerType>(srcAddr->getType())->getAddressSpace(); - unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace(); + unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); + unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); // Cast pointers to (char *) - srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS)); - dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS)); + SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS)); + DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS)); - IRBuilder<> loop(loopBB); - // The loop index (ind) is a phi node. - PHINode *ind = loop.CreatePHI(indType, 0); - // Incoming value for ind is 0 - ind->addIncoming(ConstantInt::get(indType, 0), origBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); - // load from srcAddr+ind + // load from SrcAddr+LoopIndex // TODO: we can leverage the align parameter of llvm.memcpy for more efficient // word-sized loads and stores. 
- Value *val = loop.CreateLoad(loop.CreateGEP(loop.getInt8Ty(), srcAddr, ind), - srcVolatile); - // store at dstAddr+ind - loop.CreateStore(val, loop.CreateGEP(loop.getInt8Ty(), dstAddr, ind), - dstVolatile); - - // The value for ind coming from backedge is (ind + 1) - Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1)); - ind->addIncoming(newind, loopBB); - - loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB); + Value *Element = + LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP( + LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex), + SrcIsVolatile); + // store at DstAddr+LoopIndex + LoopBuilder.CreateStore(Element, + LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(), + DstAddr, LoopIndex), + DstIsVolatile); + + // The value for LoopIndex coming from backedge is (LoopIndex + 1) + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1)); + LoopIndex->addIncoming(NewIndex, LoopBB); + + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, + NewBB); } -// Lower MemSetInst to loop -static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr, - Value *len, Value *val, LLVMContext &Context, - Function &F) { - BasicBlock *origBB = splitAt->getParent(); - BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); - BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); +// Lower memmove to IR. memmove is required to correctly copy overlapping memory +// regions; therefore, it has to check the relative positions of the source and +// destination pointers and choose the copy direction accordingly. +// +// The code below is an IR rendition of this C function: +// +// void* memmove(void* dst, const void* src, size_t n) { +// unsigned char* d = dst; +// const unsigned char* s = src; +// if (s < d) { +// // copy backwards +// while (n--) { +// d[n] = s[n]; +// } +// } else { +// // copy forward +// for (size_t i = 0; i < n; ++i) { +// d[i] = s[i]; +// } +// } +// return dst; +// } +void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr, + Value *DstAddr, Value *CopyLen, bool SrcIsVolatile, + bool DstIsVolatile, LLVMContext &Context, + Function &F) { + Type *TypeOfCopyLen = CopyLen->getType(); + BasicBlock *OrigBB = ConvertedInst->getParent(); + + // Create the a comparison of src and dst, based on which we jump to either + // the forward-copy part of the function (if src >= dst) or the backwards-copy + // part (if src < dst). + // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else + // structure. Its block terminators (unconditional branches) are replaced by + // the appropriate conditional branches when the loop is built. 
+ ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT, + SrcAddr, DstAddr, "compare_src_dst"); + TerminatorInst *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm, + &ElseTerm); + + // Each part of the function consists of two blocks: + // copy_backwards: used to skip the loop when n == 0 + // copy_backwards_loop: the actual backwards loop BB + // copy_forward: used to skip the loop when n == 0 + // copy_forward_loop: the actual forward loop BB + BasicBlock *CopyBackwardsBB = ThenTerm->getParent(); + CopyBackwardsBB->setName("copy_backwards"); + BasicBlock *CopyForwardBB = ElseTerm->getParent(); + CopyForwardBB->setName("copy_forward"); + BasicBlock *ExitBB = ConvertedInst->getParent(); + ExitBB->setName("memmove_done"); + + // Initial comparison of n == 0 that lets us skip the loops altogether. Shared + // between both backwards and forward copy clauses. + ICmpInst *CompareN = + new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen, + ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0"); + + // Copying backwards. + BasicBlock *LoopBB = + BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + Value *IndexPtr = LoopBuilder.CreateSub( + LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr"); + Value *Element = LoopBuilder.CreateLoad( + LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element"); + LoopBuilder.CreateStore(Element, + LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr)); + LoopBuilder.CreateCondBr( + LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)), + ExitBB, LoopBB); + LoopPhi->addIncoming(IndexPtr, LoopBB); + LoopPhi->addIncoming(CopyLen, CopyBackwardsBB); + BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm); + ThenTerm->eraseFromParent(); + + // Copying forward. + BasicBlock *FwdLoopBB = + BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB); + IRBuilder<> FwdLoopBuilder(FwdLoopBB); + PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr"); + Value *FwdElement = FwdLoopBuilder.CreateLoad( + FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element"); + FwdLoopBuilder.CreateStore( + FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi)); + Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd( + FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment"); + FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen), + ExitBB, FwdLoopBB); + FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB); + FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB); + + BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm); + ElseTerm->eraseFromParent(); +} - origBB->getTerminator()->setSuccessor(0, loopBB); - IRBuilder<> builder(origBB, origBB->getTerminator()); +// Lower memset to loop. 
+void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr, + Value *CopyLen, Value *SetValue, LLVMContext &Context, + Function &F) { + BasicBlock *OrigBB = ConvertedInst->getParent(); + BasicBlock *NewBB = + ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split"); + BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB); - unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace(); + OrigBB->getTerminator()->setSuccessor(0, LoopBB); + IRBuilder<> Builder(OrigBB->getTerminator()); // Cast pointer to the type of value getting stored - dstAddr = - builder.CreateBitCast(dstAddr, PointerType::get(val->getType(), dstAS)); + unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); + DstAddr = Builder.CreateBitCast(DstAddr, + PointerType::get(SetValue->getType(), dstAS)); - IRBuilder<> loop(loopBB); - PHINode *ind = loop.CreatePHI(len->getType(), 0); - ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0); + LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB); - loop.CreateStore(val, loop.CreateGEP(val->getType(), dstAddr, ind), false); + LoopBuilder.CreateStore( + SetValue, + LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex), + false); - Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1)); - ind->addIncoming(newind, loopBB); + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1)); + LoopIndex->addIncoming(NewIndex, LoopBB); - loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB); + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, + NewBB); } bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { - SmallVector<LoadInst *, 4> aggrLoads; - SmallVector<MemTransferInst *, 4> aggrMemcpys; - SmallVector<MemSetInst *, 4> aggrMemsets; + SmallVector<LoadInst *, 4> AggrLoads; + SmallVector<MemIntrinsic *, 4> MemCalls; const DataLayout &DL = F.getParent()->getDataLayout(); LLVMContext &Context = F.getParent()->getContext(); - // - // Collect all the aggrLoads, aggrMemcpys and addrMemsets. - // + // Collect all aggregate loads and mem* calls. 
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ++II) { - if (LoadInst *load = dyn_cast<LoadInst>(II)) { - if (!load->hasOneUse()) + if (LoadInst *LI = dyn_cast<LoadInst>(II)) { + if (!LI->hasOneUse()) continue; - if (DL.getTypeStoreSize(load->getType()) < MaxAggrCopySize) + if (DL.getTypeStoreSize(LI->getType()) < MaxAggrCopySize) continue; - User *use = load->user_back(); - if (StoreInst *store = dyn_cast<StoreInst>(use)) { - if (store->getOperand(0) != load) + if (StoreInst *SI = dyn_cast<StoreInst>(LI->user_back())) { + if (SI->getOperand(0) != LI) continue; - aggrLoads.push_back(load); - } - } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) { - Value *len = intr->getLength(); - // If the number of elements being copied is greater - // than MaxAggrCopySize, lower it to a loop - if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) { - if (len_int->getZExtValue() >= MaxAggrCopySize) { - aggrMemcpys.push_back(intr); - } - } else { - // turn variable length memcpy/memmov into loop - aggrMemcpys.push_back(intr); + AggrLoads.push_back(LI); } - } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) { - Value *len = memsetintr->getLength(); - if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) { - if (len_int->getZExtValue() >= MaxAggrCopySize) { - aggrMemsets.push_back(memsetintr); + } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) { + // Convert intrinsic calls with variable size or with constant size + // larger than the MaxAggrCopySize threshold. + if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) { + if (LenCI->getZExtValue() >= MaxAggrCopySize) { + MemCalls.push_back(IntrCall); } } else { - // turn variable length memset into loop - aggrMemsets.push_back(memsetintr); + MemCalls.push_back(IntrCall); } } } } - if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) && - (aggrMemsets.size() == 0)) + + if (AggrLoads.size() == 0 && MemCalls.size() == 0) { return false; + } // // Do the transformation of an aggr load/copy/set to a loop // - for (LoadInst *load : aggrLoads) { - StoreInst *store = dyn_cast<StoreInst>(*load->user_begin()); - Value *srcAddr = load->getOperand(0); - Value *dstAddr = store->getOperand(1); - unsigned numLoads = DL.getTypeStoreSize(load->getType()); - Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads); - - convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(), - store->isVolatile(), Context, F); - - store->eraseFromParent(); - load->eraseFromParent(); + for (LoadInst *LI : AggrLoads) { + StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin()); + Value *SrcAddr = LI->getOperand(0); + Value *DstAddr = SI->getOperand(1); + unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); + Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads); + + convertMemCpyToLoop(/* ConvertedInst */ SI, + /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, + /* CopyLen */ CopyLen, + /* SrcIsVolatile */ LI->isVolatile(), + /* DstIsVolatile */ SI->isVolatile(), + /* Context */ Context, + /* Function F */ F); + + SI->eraseFromParent(); + LI->eraseFromParent(); } - for (MemTransferInst *cpy : aggrMemcpys) { - convertTransferToLoop(/* splitAt */ cpy, - /* srcAddr */ cpy->getSource(), - /* dstAddr */ cpy->getDest(), - /* len */ cpy->getLength(), - /* srcVolatile */ cpy->isVolatile(), - /* dstVolatile */ cpy->isVolatile(), + // Transform mem* intrinsic calls. 
+ for (MemIntrinsic *MemCall : MemCalls) { + if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) { + convertMemCpyToLoop(/* ConvertedInst */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ Memcpy->getLength(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), /* Context */ Context, /* Function F */ F); - cpy->eraseFromParent(); - } - - for (MemSetInst *memsetinst : aggrMemsets) { - Value *len = memsetinst->getLength(); - Value *val = memsetinst->getValue(); - convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context, - F); - memsetinst->eraseFromParent(); + } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) { + convertMemMoveToLoop(/* ConvertedInst */ Memmove, + /* SrcAddr */ Memmove->getRawSource(), + /* DstAddr */ Memmove->getRawDest(), + /* CopyLen */ Memmove->getLength(), + /* SrcIsVolatile */ Memmove->isVolatile(), + /* DstIsVolatile */ Memmove->isVolatile(), + /* Context */ Context, + /* Function F */ F); + + } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) { + convertMemSetToLoop(/* ConvertedInst */ Memset, + /* DstAddr */ Memset->getRawDest(), + /* CopyLen */ Memset->getLength(), + /* SetValue */ Memset->getValue(), + /* Context */ Context, + /* Function F */ F); + } + MemCall->eraseFromParent(); } return true; } +} // namespace + +namespace llvm { +void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); +} + +INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies", + "Lower aggregate copies, and llvm.mem* intrinsics into loops", + false, false) + FunctionPass *llvm::createLowerAggrCopies() { return new NVPTXLowerAggrCopies(); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 93d0025..624052e 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -81,7 +81,7 @@ bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { // Check Load, Store, GEP, and BitCast Uses on alloca and make them // use the converted generic address, in order to expose non-generic // addrspacecast to NVPTXFavorNonGenericAddrSpace. For other types - // of instructions this is unecessary and may introduce redudant + // of instructions this is unnecessary and may introduce redundant // address cast. const auto &AllocaUse = *UI++; auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp index b533f31..6656077 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -47,6 +47,36 @@ // ... // } // +// 3. Convert pointers in a byval kernel parameter to pointers in the global +// address space. As #2, it allows NVPTX to emit more ld/st.global. E.g., +// +// struct S { +// int *x; +// int *y; +// }; +// __global__ void foo(S s) { +// int *b = s.y; +// // use b +// } +// +// "b" points to the global address space. 
In the IR level, +// +// define void @foo({i32*, i32*}* byval %input) { +// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 +// %b = load i32*, i32** %b_ptr +// ; use %b +// } +// +// becomes +// +// define void @foo({i32*, i32*}* byval %input) { +// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 +// %b = load i32*, i32** %b_ptr +// %b_global = addrspacecast i32* %b to i32 addrspace(1)* +// %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32* +// ; use %b_generic +// } +// // TODO: merge this pass with NVPTXFavorNonGenericAddrSpace so that other passes // don't cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// @@ -54,6 +84,7 @@ #include "NVPTX.h" #include "NVPTXUtilities.h" #include "NVPTXTargetMachine.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -71,9 +102,12 @@ class NVPTXLowerKernelArgs : public FunctionPass { bool runOnFunction(Function &F) override; // handle byval parameters - void handleByValParam(Argument *); - // handle non-byval pointer parameters - void handlePointerParam(Argument *); + void handleByValParam(Argument *Arg); + // Knowing Ptr must point to the global address space, this function + // addrspacecasts Ptr to global and then back to generic. This allows + // NVPTXFavorNonGenericAddrSpace to fold the global-to-generic cast into + // loads/stores that appear later. + void markPointerAsGlobal(Value *Ptr); public: static char ID; // Pass identification, replacement for typeid @@ -104,7 +138,7 @@ INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args", // // The above code allocates some space in the stack and copies the incoming // struct from param space to local space. -// Then replace all occurences of %d by %temp. +// Then replace all occurrences of %d by %temp. // ============================================================================= void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) { Function *Func = Arg->getParent(); @@ -128,27 +162,33 @@ void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) { new StoreInst(LI, AllocA, FirstInst); } -void NVPTXLowerKernelArgs::handlePointerParam(Argument *Arg) { - assert(!Arg->hasByValAttr() && - "byval params should be handled by handleByValParam"); - - // Do nothing if the argument already points to the global address space. - if (Arg->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) +void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) { + if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) return; - Instruction *FirstInst = Arg->getParent()->getEntryBlock().begin(); - Instruction *ArgInGlobal = new AddrSpaceCastInst( - Arg, PointerType::get(Arg->getType()->getPointerElementType(), + // Deciding where to emit the addrspacecast pair. + BasicBlock::iterator InsertPt; + if (Argument *Arg = dyn_cast<Argument>(Ptr)) { + // Insert at the functon entry if Ptr is an argument. + InsertPt = Arg->getParent()->getEntryBlock().begin(); + } else { + // Insert right after Ptr if Ptr is an instruction. 
+ InsertPt = ++cast<Instruction>(Ptr)->getIterator(); + assert(InsertPt != InsertPt->getParent()->end() && + "We don't call this function with Ptr being a terminator."); + } + + Instruction *PtrInGlobal = new AddrSpaceCastInst( + Ptr, PointerType::get(Ptr->getType()->getPointerElementType(), ADDRESS_SPACE_GLOBAL), - Arg->getName(), FirstInst); - Value *ArgInGeneric = new AddrSpaceCastInst(ArgInGlobal, Arg->getType(), - Arg->getName(), FirstInst); - // Replace with ArgInGeneric all uses of Args except ArgInGlobal. - Arg->replaceAllUsesWith(ArgInGeneric); - ArgInGlobal->setOperand(0, Arg); + Ptr->getName(), &*InsertPt); + Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(), + Ptr->getName(), &*InsertPt); + // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal. + Ptr->replaceAllUsesWith(PtrInGeneric); + PtrInGlobal->setOperand(0, Ptr); } - // ============================================================================= // Main function for this pass. // ============================================================================= @@ -157,12 +197,32 @@ bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { if (!isKernelFunction(F)) return false; + if (TM && TM->getDrvInterface() == NVPTX::CUDA) { + // Mark pointers in byval structs as global. + for (auto &B : F) { + for (auto &I : B) { + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { + if (LI->getType()->isPointerTy()) { + Value *UO = GetUnderlyingObject(LI->getPointerOperand(), + F.getParent()->getDataLayout()); + if (Argument *Arg = dyn_cast<Argument>(UO)) { + if (Arg->hasByValAttr()) { + // LI is a load from a pointer within a byval kernel parameter. + markPointerAsGlobal(LI); + } + } + } + } + } + } + } + for (Argument &Arg : F.args()) { if (Arg.getType()->isPointerTy()) { if (Arg.hasByValAttr()) handleByValParam(&Arg); else if (TM && TM->getDrvInterface() == NVPTX::CUDA) - handlePointerParam(&Arg); + markPointerAsGlobal(&Arg); } } return true; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h index 46b4b33..81a606d 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h @@ -68,7 +68,7 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override {}; - MCSection *findAssociatedSection() const override { return nullptr; } + MCFragment *findAssociatedFragment() const override { return nullptr; } // There are no TLS NVPTXMCExprs at the moment. void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} @@ -110,7 +110,7 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override {}; - MCSection *findAssociatedSection() const override { return nullptr; } + MCFragment *findAssociatedFragment() const override { return nullptr; } // There are no TLS NVPTXMCExprs at the moment. 
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 5fd69a6..17019d7 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -72,7 +72,7 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue - if (!I->empty() && I->back().isReturn()) + if (I->isReturnBlock()) TFI.emitEpilogue(MF, *I); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h index 0d2627d..45a7309 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h @@ -19,15 +19,14 @@ #include <vector> namespace llvm { -/// NVPTXSection - Represents a section in PTX -/// PTX does not have sections. We create this class in order to use -/// the ASMPrint interface. +/// Represents a section in PTX PTX does not have sections. We create this class +/// in order to use the ASMPrint interface. /// -class NVPTXSection : public MCSection { +class NVPTXSection final : public MCSection { virtual void anchor(); public: NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {} - virtual ~NVPTXSection() {} + ~NVPTXSection() {} /// Override this as NVPTX has its own way of printing switching /// to a section. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 248f9e1..aa931b1 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -53,6 +53,7 @@ void initializeGenericToNVVMPass(PassRegistry&); void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); +void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerKernelArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); } @@ -64,14 +65,15 @@ extern "C" void LLVMInitializeNVPTXTarget() { // FIXME: This pass is really intended to be invoked during IR optimization, // but it's very NVPTX-specific. 
- initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); - initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); - initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry()); - initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry()); - initializeNVPTXFavorNonGenericAddrSpacesPass( - *PassRegistry::getPassRegistry()); - initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry()); - initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry()); + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeNVVMReflectPass(PR); + initializeGenericToNVVMPass(PR); + initializeNVPTXAllocaHoistingPass(PR); + initializeNVPTXAssignValidGlobalNamesPass(PR); + initializeNVPTXFavorNonGenericAddrSpacesPass(PR); + initializeNVPTXLowerKernelArgsPass(PR); + initializeNVPTXLowerAllocaPass(PR); + initializeNVPTXLowerAggrCopiesPass(PR); } static std::string computeDataLayout(bool is64Bit) { @@ -139,6 +141,10 @@ public: FunctionPass *createTargetRegisterAllocator(bool) override; void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; + +private: + // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE. + void addEarlyCSEOrGVNPass(); }; } // end anonymous namespace @@ -148,11 +154,18 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(NVPTXTTIImpl(this, F)); }); } +void NVPTXPassConfig::addEarlyCSEOrGVNPass() { + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(createGVNPass()); + else + addPass(createEarlyCSEPass()); +} + void NVPTXPassConfig::addIRPasses() { // The following passes are known to not play well with virtual regs hanging // around after register allocation (which in our case, is *all* registers). @@ -161,13 +174,14 @@ void NVPTXPassConfig::addIRPasses() { // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). disablePass(&PrologEpilogCodeInserterID); disablePass(&MachineCopyPropagationID); - disablePass(&BranchFolderPassID); disablePass(&TailDuplicateID); + addPass(createNVVMReflectPass()); addPass(createNVPTXImageOptimizerPass()); - TargetPassConfig::addIRPasses(); addPass(createNVPTXAssignValidGlobalNamesPass()); addPass(createGenericToNVVMPass()); + + // === Propagate special address spaces === addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine())); // NVPTXLowerKernelArgs emits alloca for byval parameters which can often // be eliminated by SROA. @@ -178,22 +192,38 @@ void NVPTXPassConfig::addIRPasses() { // them unused. We could remove dead code in an ad-hoc manner, but that // requires manual work and might be error-prone. addPass(createDeadCodeEliminationPass()); + + // === Straight-line scalar optimizations === addPass(createSeparateConstOffsetFromGEPPass()); + addPass(createSpeculativeExecutionPass()); // ReassociateGEPs exposes more opportunites for SLSR. See // the example in reassociate-geps-and-slsr.ll. addPass(createStraightLineStrengthReducePass()); // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE // for some of our benchmarks. 
- if (getOptLevel() == CodeGenOpt::Aggressive) - addPass(createGVNPass()); - else - addPass(createEarlyCSEPass()); + addEarlyCSEOrGVNPass(); // Run NaryReassociate after EarlyCSE/GVN to be more effective. addPass(createNaryReassociatePass()); // NaryReassociate on GEPs creates redundant common expressions, so run // EarlyCSE after it. addPass(createEarlyCSEPass()); + + // === LSR and other generic IR passes === + TargetPassConfig::addIRPasses(); + // EarlyCSE is not always strong enough to clean up what LSR produces. For + // example, GVN can combine + // + // %0 = add %a, %b + // %1 = add %b, %a + // + // and + // + // %0 = shl nsw %a, 2 + // %1 = shl %a, 2 + // + // but EarlyCSE can do neither of them. + addEarlyCSEOrGVNPass(); } bool NVPTXPassConfig::addInstSelector() { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 5ecdc87..683b9a3 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -41,6 +41,7 @@ public: DwarfLocSection = nullptr; DwarfARangesSection = nullptr; DwarfRangesSection = nullptr; + DwarfMacinfoSection = nullptr; } virtual ~NVPTXTargetObjectFile(); @@ -48,8 +49,7 @@ public: void Initialize(MCContext &ctx, const TargetMachine &TM) override { TargetLoweringObjectFile::Initialize(ctx, TM); TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText()); - DataSection = - new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel()); + DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData()); BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS()); ReadOnlySection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly()); @@ -82,9 +82,11 @@ public: new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); + DwarfMacinfoSection = + new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override { return ReadOnlySection; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index e7250cd..6e679dd 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -89,12 +89,12 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { return false; } -unsigned NVPTXTTIImpl::getArithmeticInstrCost( +int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. 
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 5bcd1e2..0946a32 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -52,7 +52,7 @@ public: bool isSourceOfDivergence(const Value *V); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 1f178af..578b466 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -335,106 +335,7 @@ bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) { return false; } -bool llvm::isBarrierIntrinsic(Intrinsic::ID id) { - if ((id == Intrinsic::nvvm_barrier0) || - (id == Intrinsic::nvvm_barrier0_popc) || - (id == Intrinsic::nvvm_barrier0_and) || - (id == Intrinsic::nvvm_barrier0_or) || - (id == Intrinsic::cuda_syncthreads)) - return true; - return false; -} - -// Interface for checking all memory space transfer related intrinsics -bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) { - if (id == Intrinsic::nvvm_ptr_local_to_gen || - id == Intrinsic::nvvm_ptr_shared_to_gen || - id == Intrinsic::nvvm_ptr_global_to_gen || - id == Intrinsic::nvvm_ptr_constant_to_gen || - id == Intrinsic::nvvm_ptr_gen_to_global || - id == Intrinsic::nvvm_ptr_gen_to_shared || - id == Intrinsic::nvvm_ptr_gen_to_local || - id == Intrinsic::nvvm_ptr_gen_to_constant || - id == Intrinsic::nvvm_ptr_gen_to_param) { - return true; - } - - return false; -} - -// consider several special intrinsics in striping pointer casts, and -// provide an option to ignore GEP indicies for find out the base address only -// which could be used in simple alias disambigurate. -const Value * -llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) { - V = V->stripPointerCasts(); - while (true) { - if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) { - if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) { - V = IS->getArgOperand(0)->stripPointerCasts(); - continue; - } - } else if (ignore_GEP_indices) - if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - V = GEP->getPointerOperand()->stripPointerCasts(); - continue; - } - break; - } - return V; -} - -// consider several special intrinsics in striping pointer casts, and -// - ignore GEP indicies for find out the base address only, and -// - tracking PHINode -// which could be used in simple alias disambigurate. 
-const Value * -llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) { - if (processed.find(V) != processed.end()) - return nullptr; - processed.insert(V); - - const Value *V2 = V->stripPointerCasts(); - if (V2 != V && processed.find(V2) != processed.end()) - return nullptr; - processed.insert(V2); - - V = V2; - - while (true) { - if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) { - if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) { - V = IS->getArgOperand(0)->stripPointerCasts(); - continue; - } - } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - V = GEP->getPointerOperand()->stripPointerCasts(); - continue; - } else if (const PHINode *PN = dyn_cast<PHINode>(V)) { - if (V != V2 && processed.find(V) != processed.end()) - return nullptr; - processed.insert(PN); - const Value *common = nullptr; - for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - const Value *pv = PN->getIncomingValue(i); - const Value *base = skipPointerTransfer(pv, processed); - if (base) { - if (!common) - common = base; - else if (common != base) - return PN; - } - } - if (!common) - return PN; - V = common; - } - break; - } - return V; -} - -// The following are some useful utilities for debuggung +// The following are some useful utilities for debugging BasicBlock *llvm::getParentBlock(Value *v) { if (BasicBlock *B = dyn_cast<BasicBlock>(v)) @@ -466,7 +367,7 @@ void llvm::dumpBlock(Value *v, char *blockName) { return; for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) { - BasicBlock *B = it; + BasicBlock *B = &*it; if (strcmp(B->getName().data(), blockName) == 0) { B->dump(); return; @@ -490,7 +391,7 @@ Instruction *llvm::getInst(Value *base, char *instName) { return nullptr; } -// Dump an instruction by nane +// Dump an instruction by name void llvm::dumpInst(Value *base, char *instName) { Instruction *I = getInst(base, instName); if (I) diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 7e2ce73..a5262cb 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -61,27 +61,6 @@ bool isKernelFunction(const llvm::Function &); bool getAlign(const llvm::Function &, unsigned index, unsigned &); bool getAlign(const llvm::CallInst &, unsigned index, unsigned &); -bool isBarrierIntrinsic(llvm::Intrinsic::ID); - -/// make_vector - Helper function which is useful for building temporary vectors -/// to pass into type construction of CallInst ctors. This turns a null -/// terminated list of pointers (or other value types) into a real live vector. -/// -template <typename T> inline std::vector<T> make_vector(T A, ...) 
{ - va_list Args; - va_start(Args, A); - std::vector<T> Result; - Result.push_back(A); - while (T Val = va_arg(Args, T)) - Result.push_back(Val); - va_end(Args); - return Result; -} - -bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id); -const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices); -const Value * -skipPointerTransfer(const Value *V, std::set<const Value *> &processed); BasicBlock *getParentBlock(Value *v); Function *getParentFunction(Value *v); void dumpBlock(Value *v, char *blockName); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td index a237247..e69bbba 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td @@ -26,7 +26,7 @@ let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in { def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), (ins V2I16Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int16Regs:$dst, (vector_extract + [(set Int16Regs:$dst, (extractelt (v2i16 V2I16Regs:$src), imm:$c))], IMOV16rr>; @@ -34,7 +34,7 @@ def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), (ins V4I16Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int16Regs:$dst, (vector_extract + [(set Int16Regs:$dst, (extractelt (v4i16 V4I16Regs:$src), imm:$c))], IMOV16rr>; @@ -42,7 +42,7 @@ def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), (ins V2I8Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int8Regs:$dst, (vector_extract + [(set Int8Regs:$dst, (extractelt (v2i8 V2I8Regs:$src), imm:$c))], IMOV8rr>; @@ -50,7 +50,7 @@ def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), (ins V4I8Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int8Regs:$dst, (vector_extract + [(set Int8Regs:$dst, (extractelt (v4i8 V4I8Regs:$src), imm:$c))], IMOV8rr>; @@ -58,7 +58,7 @@ def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), (ins V2I32Regs:$src, i8imm:$c), "mov.u32 \t$dst, $src${c:vecelem};", - [(set Int32Regs:$dst, (vector_extract + [(set Int32Regs:$dst, (extractelt (v2i32 V2I32Regs:$src), imm:$c))], IMOV32rr>; @@ -66,7 +66,7 @@ def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), (ins V2F32Regs:$src, i8imm:$c), "mov.f32 \t$dst, $src${c:vecelem};", - [(set Float32Regs:$dst, (vector_extract + [(set Float32Regs:$dst, (extractelt (v2f32 V2F32Regs:$src), imm:$c))], FMOV32rr>; @@ -74,7 +74,7 @@ def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), (ins V2I64Regs:$src, i8imm:$c), "mov.u64 \t$dst, $src${c:vecelem};", - [(set Int64Regs:$dst, (vector_extract + [(set Int64Regs:$dst, (extractelt (v2i64 V2I64Regs:$src), imm:$c))], IMOV64rr>; @@ -82,7 +82,7 @@ def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), (ins V2F64Regs:$src, i8imm:$c), "mov.f64 \t$dst, $src${c:vecelem};", - [(set Float64Regs:$dst, (vector_extract + [(set Float64Regs:$dst, (extractelt (v2f64 V2F64Regs:$src), imm:$c))], FMOV64rr>; @@ -90,7 +90,7 @@ def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), (ins V4I32Regs:$src, i8imm:$c), "mov.u32 \t$dst, $src${c:vecelem};", - [(set Int32Regs:$dst, (vector_extract + 
[(set Int32Regs:$dst, (extractelt (v4i32 V4I32Regs:$src), imm:$c))], IMOV32rr>; @@ -98,7 +98,7 @@ def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), (ins V4F32Regs:$src, i8imm:$c), "mov.f32 \t$dst, $src${c:vecelem};", - [(set Float32Regs:$dst, (vector_extract + [(set Float32Regs:$dst, (extractelt (v4f32 V4F32Regs:$src), imm:$c))], FMOV32rr>; } @@ -110,8 +110,7 @@ def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst), "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V2I8Regs:$dst, - (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))], - IMOV8rr>; + (insertelt V2I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>; // Insert v4i8 def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), @@ -119,8 +118,7 @@ def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V4I8Regs:$dst, - (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))], - IMOV8rr>; + (insertelt V4I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>; // Insert v2i16 def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), @@ -128,8 +126,8 @@ def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V2I16Regs:$dst, - (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; + (insertelt V2I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; // Insert v4i16 def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), @@ -137,8 +135,8 @@ def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V4I16Regs:$dst, - (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; + (insertelt V4I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; // Insert v2i32 def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), @@ -146,8 +144,8 @@ def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u32 \t$dst${c:vecelem}, $val;", [(set V2I32Regs:$dst, - (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; + (insertelt V2I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; // Insert v2f32 def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), @@ -155,8 +153,8 @@ def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f32 \t$dst${c:vecelem}, $val;", [(set V2F32Regs:$dst, - (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; + (insertelt V2F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; // Insert v2i64 def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), @@ -164,8 +162,8 @@ def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u64 \t$dst${c:vecelem}, $val;", [(set V2I64Regs:$dst, - (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))], - IMOV64rr>; + (insertelt V2I64Regs:$src, Int64Regs:$val, imm:$c))], + IMOV64rr>; // Insert v2f64 def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), @@ -173,8 +171,8 @@ def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f64 \t$dst${c:vecelem}, $val;", [(set V2F64Regs:$dst, - (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))], - FMOV64rr>; + (insertelt V2F64Regs:$src, Float64Regs:$val, imm:$c))], + FMOV64rr>; // Insert v4i32 def V4i32Insert : NVPTXVecInst<(outs 
V4I32Regs:$dst), @@ -182,8 +180,8 @@ def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst), "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u32 \t$dst${c:vecelem}, $val;", [(set V4I32Regs:$dst, - (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; + (insertelt V4I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; // Insert v4f32 def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), @@ -191,8 +189,8 @@ def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f32 \t$dst${c:vecelem}, $val;", [(set V4F32Regs:$dst, - (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; + (insertelt V4F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; } class BinOpAsmString<string c> { diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 5e375b7..20ab5db 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -109,10 +109,10 @@ void NVVMReflect::setVarMap() { for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) { DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n"); SmallVector<StringRef, 4> NameValList; - StringRef(ReflectList[i]).split(NameValList, ","); + StringRef(ReflectList[i]).split(NameValList, ','); for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) { SmallVector<StringRef, 2> NameValPair; - NameValList[j].split(NameValPair, "="); + NameValList[j].split(NameValPair, '='); assert(NameValPair.size() == 2 && "name=val expected"); std::stringstream ValStream(NameValPair[1]); int Val; diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index a699a55..220c70a 100644 --- a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -243,7 +243,6 @@ namespace { struct PPCOperand; class PPCAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; bool IsPPC64; bool IsDarwin; @@ -291,9 +290,9 @@ class PPCAsmParser : public MCTargetAsmParser { public: - PPCAsmParser(MCSubtargetInfo &STI, MCAsmParser &, const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII) { + PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI), MII(MII) { // Check for 64-bit vs. 32-bit pointer mode. Triple TheTriple(STI.getTargetTriple()); IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || @@ -1185,7 +1184,7 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::MFTB: { - if (STI.getFeatureBits()[PPC::FeatureMFTB]) { + if (getSTI().getFeatureBits()[PPC::FeatureMFTB]) { assert(Inst.getNumOperands() == 2 && "Expecting two operands"); Inst.setOpcode(PPC::MFSPR); } @@ -1205,7 +1204,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // Post-process instructions (typically extended mnemonics) ProcessInstruction(Inst, Operands); Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction use requires an option to be enabled"); @@ -1690,7 +1689,7 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // where th can be omitted when it is 0. dcbtst is the same. 
We take the // server form to be the default, so swap the operands if we're parsing for // an embedded core (they'll be swapped again upon printing). - if (STI.getFeatureBits()[PPC::FeatureBookE] && + if (getSTI().getFeatureBits()[PPC::FeatureBookE] && Operands.size() == 4 && (Name == "dcbt" || Name == "dcbtst")) { std::swap(Operands[1], Operands[3]); @@ -1730,10 +1729,19 @@ bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); if (getParser().parseExpression(Value)) return false; - getParser().getStreamer().EmitValue(Value, Size); + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 93a503c..1fc84fb 100644 --- a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -401,8 +401,6 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; - - MI.clear(); } return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI); diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 8e18783..53eb727 100644 --- a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -18,8 +18,6 @@ namespace llvm { -class MCOperand; - class PPCInstPrinter : public MCInstPrinter { bool IsDarwin; public: diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 992be5b..dd99495 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -113,6 +113,10 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; } break; + case PPC::fixup_ppc_half16ds: + Target.print(errs()); + errs() << '\n'; + report_fatal_error("Invalid PC-relative half16ds relocation"); case FK_Data_4: case FK_PCRel_4: Type = ELF::R_PPC_REL32; @@ -305,13 +309,13 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_GOT: Type = ELF::R_PPC64_GOT16_DS; - break; + break; case MCSymbolRefExpr::VK_PPC_GOT_LO: Type = ELF::R_PPC64_GOT16_LO_DS; break; case MCSymbolRefExpr::VK_PPC_TOC: Type = ELF::R_PPC64_TOC16_DS; - break; + break; case MCSymbolRefExpr::VK_PPC_TOC_LO: Type = ELF::R_PPC64_TOC16_LO_DS; break; @@ -372,16 +376,16 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_None: Type = ELF::R_PPC64_ADDR64; - break; + break; case MCSymbolRefExpr::VK_PPC_DTPMOD: Type = ELF::R_PPC64_DTPMOD64; - break; + break; case MCSymbolRefExpr::VK_PPC_TPREL: Type = ELF::R_PPC64_TPREL64; - break; + break; case 
MCSymbolRefExpr::VK_PPC_DTPREL: Type = ELF::R_PPC64_DTPREL64; - break; + break; } break; case FK_Data_4: diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index 86ad385..e252ac9 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -20,18 +20,19 @@ namespace llvm { class Triple; - class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&); - }; - - class PPCELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit PPCELFMCAsmInfo(bool is64Bit, const Triple&); - }; +class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &); +}; + +class PPCELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index a641780..d42a111 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -82,8 +82,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS PPCMCExprs at the moment. diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 9d72896..b54a0e1 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -241,12 +241,12 @@ bool PPCMachObjectWriter::recordScatteredRelocation( if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " "relocation entry."); - llvm_unreachable("fatal error returned?!"); + return false; } // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()? diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index 6075631..acea600 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -56,6 +56,14 @@ namespace PPC { PRED_BIT_UNSET = 1025 }; + // Bit for branch taken (plus) or not-taken (minus) hint + enum BranchHintBit { + BR_NO_HINT = 0x0, + BR_NONTAKEN_HINT = 0x2, + BR_TAKEN_HINT = 0x3, + BR_HINT_MASK = 0X3 + }; + /// Invert the specified predicate. != -> ==, < -> >=. 
Predicate InvertPredicate(Predicate Opcode); diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h index ae8d8b4..a259ed3 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.h +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -41,13 +41,16 @@ namespace llvm { FunctionPass *createPPCVSXCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); + FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCTLSDynamicCallPass(); + FunctionPass *createPPCBoolRetToIntPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); void initializePPCVSXFMAMutatePass(PassRegistry&); + void initializePPCBoolRetToIntPass(PassRegistry&); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td index 641b237..b03be12 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.td +++ b/contrib/llvm/lib/Target/PowerPC/PPC.td @@ -50,6 +50,8 @@ def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; +def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software emulation for floating point">; def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", "Enable 64-bit registers usage for ppc32 [beta]">; def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true", @@ -137,6 +139,12 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", "Enable Hardware Transactional Memory instructions">; def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true", "Implement mftb using the mfspr instruction">; +def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true", + "Target supports add/load integer fusion.">; +def FeatureFloat128 : + SubtargetFeature<"float128", "HasFloat128", "true", + "Enable the __float128 data type for IEEE-754R Binary128.", + [FeatureVSX]>; def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true", "Treat vector data stream cache control instructions as deprecated">; @@ -168,7 +176,8 @@ def ProcessorFeatures { FeatureMFTB, DeprecatedDST]; list<SubtargetFeature> Power8SpecificFeatures = [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto, - FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic]; + FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic, + FeatureFusion]; list<SubtargetFeature> Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); } @@ -309,7 +318,7 @@ def : ProcessorModel<"g5", G5Model, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"e500mc", PPCE500mcModel, - [DirectiveE500mc, FeatureMFOCRF, + [DirectiveE500mc, FeatureSTFIWX, FeatureICBT, FeatureBookE, FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"e5500", PPCE5500Model, @@ -403,6 +412,7 @@ def PPCAsmParserVariant : AsmParserVariant { // InstAlias definitions use immediate literals. Set RegisterPrefix // so that those are not misinterpreted as registers. 
string RegisterPrefix = "%"; + string BreakCharacters = "."; } def PPC : Target { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 8e118ec..ec354c2 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -65,19 +65,20 @@ using namespace llvm; #define DEBUG_TYPE "asmprinter" namespace { - class PPCAsmPrinter : public AsmPrinter { - protected: - MapVector<MCSymbol*, MCSymbol*> TOC; - const PPCSubtarget *Subtarget; - StackMaps SM; - public: - explicit PPCAsmPrinter(TargetMachine &TM, - std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} - - const char *getPassName() const override { - return "PowerPC Assembly Printer"; - } +class PPCAsmPrinter : public AsmPrinter { +protected: + MapVector<MCSymbol *, MCSymbol *> TOC; + const PPCSubtarget *Subtarget; + StackMaps SM; + +public: + explicit PPCAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} + + const char *getPassName() const override { + return "PowerPC Assembly Printer"; + } MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); @@ -94,10 +95,8 @@ namespace { void EmitEndOfAsmFile(Module &M) override; - void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); + void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI); + void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget<PPCSubtarget>(); @@ -157,15 +156,15 @@ static const char *stripRegisterPrefix(const char *RegName) { return RegName + 1; case 'c': if (RegName[1] == 'r') return RegName + 2; } - + return RegName; } void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand(OpNo); - + switch (MO.getType()) { case MachineOperand::MO_Register: { const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg()); @@ -184,8 +183,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, MO.getMBB()->getSymbol()->print(O, MAI); return; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() - << '_' << MO.getIndex(); + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' + << MO.getIndex(); return; case MachineOperand::MO_BlockAddress: GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); @@ -200,19 +199,19 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, !GV->isStrongDefinitionForLinker()) { if (!GV->hasHiddenVisibility()) { SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>() - .getGVStubEntry(SymToPrint); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry( + SymToPrint); if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } else if (GV->isDeclaration() || GV->hasCommonLinkage() || GV->hasAvailableExternallyLinkage()) { SymToPrint = getSymbolWithGlobalValueBase(GV, 
"$non_lazy_ptr"); - - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>(). - getHiddenGVStubEntry(SymToPrint); + + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry( + SymToPrint); if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); @@ -295,16 +294,16 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, } case 'U': // Print 'u' for update form. case 'X': // Print 'x' for indexed form. - { - // FIXME: Currently for PowerPC memory operands are always loaded - // into a register, so we never get an update or indexed form. - // This is bad even for offset forms, since even if we know we - // have a value in -16(r1), we will generate a load into r<n> - // and then load from 0(r<n>). Until that issue is fixed, - // tolerate 'U' and 'X' but don't output anything. - assert(MI->getOperand(OpNo).isReg()); - return false; - } + { + // FIXME: Currently for PowerPC memory operands are always loaded + // into a register, so we never get an update or indexed form. + // This is bad even for offset forms, since even if we know we + // have a value in -16(r1), we will generate a load into r<n> + // and then load from 0(r<n>). Until that issue is fixed, + // tolerate 'U' and 'X' but don't output anything. + assert(MI->getOperand(OpNo).isReg()); + return false; + } } } @@ -315,7 +314,6 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, return false; } - /// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry /// exists for it. If not, create one. Then return a symbol that references /// the TOC entry. @@ -330,8 +328,7 @@ void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) { SM.serializeToStackMapSection(); } -void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { +void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) { unsigned NumNOPBytes = MI.getOperand(1).getImm(); SM.recordStackMap(MI); @@ -353,13 +350,12 @@ void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, // Emit nops. 
for (unsigned i = 0; i < NumNOPBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); } // Lower a patchpoint of the form: // [<def>], <id>, <numBytes>, <target>, <numArgs> -void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { +void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) { SM.recordPatchPoint(MI); PatchPointOpers Opers(&MI); @@ -375,60 +371,59 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 0; // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(32).addImm(16)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF)); // Save the current TOC pointer before the remote call. int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD) .addReg(PPC::X2) .addImm(TOCSaveOffset) .addReg(PPC::X1)); ++EncodedBytes; - // If we're on ELFv1, then we need to load the actual function pointer // from the function descriptor. if (!Subtarget->isELFv2ABI()) { - // Load the new TOC pointer and the function address, but not r11 - // (needing this is rare, and loading it here would prevent passing it - // via a 'nest' parameter. - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + // Load the new TOC pointer and the function address, but not r11 + // (needing this is rare, and loading it here would prevent passing it + // via a 'nest' parameter. + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(PPC::X2) .addImm(8) .addReg(ScratchReg)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(ScratchReg) .addImm(0) .addReg(ScratchReg)); ++EncodedBytes; } - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8) .addReg(ScratchReg)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8)); ++EncodedBytes; // Restore the TOC pointer after the call. 
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(PPC::X2) .addImm(TOCSaveOffset) .addReg(PPC::X1)); @@ -439,7 +434,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext); - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP) .addExpr(SymVar)); EncodedBytes += 2; } @@ -454,7 +449,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, assert((NumBytes - EncodedBytes) % 4 == 0 && "Invalid number of NOP bytes requested!"); for (unsigned i = EncodedBytes; i < NumBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); } /// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a @@ -499,16 +494,16 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { bool isDarwin = TM.getTargetTriple().isOSDarwin(); const Module *M = MF->getFunction()->getParent(); PICLevel::Level PL = M->getPICLevel(); - + // Lower multi-instruction pseudo operations. switch (MI->getOpcode()) { default: break; case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); case TargetOpcode::STACKMAP: - return LowerSTACKMAP(*OutStreamer, SM, *MI); + return LowerSTACKMAP(SM, *MI); case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(*OutStreamer, SM, *MI); + return LowerPATCHPOINT(SM, *MI); case PPC::MoveGOTtoLR: { // Transform %LR = MoveGOTtoLR @@ -533,17 +528,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::MovePCtoLR: case PPC::MovePCtoLR8: { // Transform %LR = MovePCtoLR - // Into this, where the label is the PIC base: + // Into this, where the label is the PIC base: // bl L1$pb // L1$pb: MCSymbol *PICBase = MF->getPICBaseSymbol(); - + // Emit the 'bl'. - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL) - // FIXME: We would like an efficient form for this, so we don't have to do - // a lot of extra uniquing. - .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); - + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL) + // FIXME: We would like an efficient form for this, so we + // don't have to do a lot of extra uniquing. + .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + // Emit the label. 
OutStreamer->EmitLabel(PICBase); return; @@ -654,7 +650,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } - + case PPC::ADDIStocHA: { // Transform %Xd = ADDIStocHA %X2, <ga:@sym> LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); @@ -669,28 +665,22 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MO.isBlockAddress()) && "Invalid operand for ADDIStocHA!"); MCSymbol *MOSymbol = nullptr; - bool IsExternal = false; - bool IsNonLocalFunction = false; - bool IsCommon = false; - bool IsAvailExt = false; + bool GlobalToc = false; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); MOSymbol = getSymbol(GV); - IsExternal = GV->isDeclaration(); - IsCommon = GV->hasCommonLinkage(); - IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker(); - IsAvailExt = GV->hasAvailableExternallyLinkage(); - } else if (MO.isCPI()) + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG); + } else if (MO.isCPI()) { MOSymbol = GetCPISymbol(MO.getIndex()); - else if (MO.isJTI()) + } else if (MO.isJTI()) { MOSymbol = GetJTISymbol(MO.getIndex()); - else if (MO.isBlockAddress()) + } else if (MO.isBlockAddress()) { MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + } - if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt || - MO.isJTI() || MO.isBlockAddress() || + if (GlobalToc || MO.isJTI() || MO.isBlockAddress() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); @@ -727,13 +717,14 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } else if (MO.isGlobal()) { - const GlobalValue *GValue = MO.getGlobal(); - MOSymbol = getSymbol(GValue); - if (GValue->getType()->getElementType()->isFunctionTy() || - GValue->isDeclaration() || GValue->hasCommonLinkage() || - GValue->hasAvailableExternallyLinkage() || - TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert((GVFlags & PPCII::MO_NLP_FLAG) && + "LDtocL used on symbol that could be accessed directly is " + "invalid. 
Must match ADDIStocHA.")); + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } const MCExpr *Exp = @@ -754,21 +745,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(2); assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); MCSymbol *MOSymbol = nullptr; - bool IsExternal = false; - bool IsNonLocalFunction = false; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert ( + !(GVFlags & PPCII::MO_NLP_FLAG) && + "Interposable definitions must use indirect access.")); MOSymbol = getSymbol(GV); - IsExternal = GV->isDeclaration(); - IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker(); - } else if (MO.isCPI()) + } else if (MO.isCPI()) { MOSymbol = GetCPISymbol(MO.getIndex()); - - if (IsNonLocalFunction || IsExternal || - TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } const MCExpr *Exp = MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, @@ -840,13 +828,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::PPC32GOT: { - MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); - const MCExpr *SymGotTlsL = - MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, - OutContext); - const MCExpr *SymGotTlsHA = - MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, - OutContext); + MCSymbol *GOTSymbol = + OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + const MCExpr *SymGotTlsL = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext); + const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI) .addReg(MI->getOperand(0).getReg()) .addExpr(SymGotTlsL)); @@ -1079,14 +1066,14 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { // linux/ppc32 - Normal entry label. - if (!Subtarget->isPPC64() && - (TM.getRelocationModel() != Reloc::PIC_ || + if (!Subtarget->isPPC64() && + (TM.getRelocationModel() != Reloc::PIC_ || MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small)) return AsmPrinter::EmitFunctionEntryLabel(); if (!Subtarget->isPPC64()) { const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); - if (PPCFI->usesPICBase()) { + if (PPCFI->usesPICBase()) { MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol(); MCSymbol *PICBase = MF->getPICBaseSymbol(); OutStreamer->EmitLabel(RelocSymbol); @@ -1105,8 +1092,28 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { } // ELFv2 ABI - Normal entry label. - if (Subtarget->isELFv2ABI()) + if (Subtarget->isELFv2ABI()) { + // In the Large code model, we allow arbitrary displacements between + // the text section and its associated TOC section. We place the + // full 8-byte offset to the TOC in memory immediately preceding + // the function global entry point. 
+ if (TM.getCodeModel() == CodeModel::Large + && !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); + + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol(); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + MCSymbolRefExpr::create(GlobalEPSymbol, + OutContext), + OutContext); + + OutStreamer->EmitLabel(PPCFI->getTOCOffsetSymbol()); + OutStreamer->EmitValue(TOCDeltaExpr, 8); + } return AsmPrinter::EmitFunctionEntryLabel(); + } // Emit an official procedure descriptor. MCSectionSubPair Current = OutStreamer->getCurrentSection(); @@ -1130,11 +1137,10 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { OutStreamer->SwitchSection(Current.first, Current.second); } - bool PPCLinuxAsmPrinter::doFinalization(Module &M) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); - bool isPPC64 = TD->getPointerSizeInBits() == 64; + bool isPPC64 = DL.getPointerSizeInBits() == 64; PPCTargetStreamer &TS = static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer()); @@ -1174,10 +1180,25 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { // thus emit a prefix sequence along the following lines: // // func: + // .Lfunc_gepNN: + // # global entry point + // addis r2,r12,(.TOC.-.Lfunc_gepNN)@ha + // addi r2,r2,(.TOC.-.Lfunc_gepNN)@l + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN + // # local entry point, followed by function body + // + // For the Large code model, we create + // + // .Lfunc_tocNN: + // .quad .TOC.-.Lfunc_gepNN # done by EmitFunctionEntryLabel + // func: + // .Lfunc_gepNN: // # global entry point - // addis r2,r12,(.TOC.-func)@ha - // addi r2,r2,(.TOC.-func)@l - // .localentry func, .-func + // ld r2,.Lfunc_tocNN-.Lfunc_gepNN(r12) + // add r2,r2,r12 + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN // # local entry point, followed by function body // // This ensures we have r2 set up correctly while executing the function @@ -1185,32 +1206,49 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { if (Subtarget->isELFv2ABI() // Only do all that if the function uses r2 in the first place. 
&& !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); - MCSymbol *GlobalEntryLabel = OutContext.createTempSymbol(); + MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol(); OutStreamer->EmitLabel(GlobalEntryLabel); const MCSymbolRefExpr *GlobalEntryLabelExp = MCSymbolRefExpr::create(GlobalEntryLabel, OutContext); - MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); - const MCExpr *TOCDeltaExpr = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), - GlobalEntryLabelExp, OutContext); + if (TM.getCodeModel() != CodeModel::Large) { + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + GlobalEntryLabelExp, OutContext); - const MCExpr *TOCDeltaHi = - PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) - .addReg(PPC::X2) - .addReg(PPC::X12) - .addExpr(TOCDeltaHi)); - - const MCExpr *TOCDeltaLo = - PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) - .addReg(PPC::X2) - .addReg(PPC::X2) - .addExpr(TOCDeltaLo)); - - MCSymbol *LocalEntryLabel = OutContext.createTempSymbol(); + const MCExpr *TOCDeltaHi = + PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::X2) + .addReg(PPC::X12) + .addExpr(TOCDeltaHi)); + + const MCExpr *TOCDeltaLo = + PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addExpr(TOCDeltaLo)); + } else { + MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol(); + const MCExpr *TOCOffsetDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCOffset, OutContext), + GlobalEntryLabelExp, OutContext); + + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addExpr(TOCOffsetDeltaExpr) + .addReg(PPC::X12)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD8) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addReg(PPC::X12)); + } + + MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol(); OutStreamer->EmitLabel(LocalEntryLabel); const MCSymbolRefExpr *LocalEntryLabelExp = MCSymbolRefExpr::create(LocalEntryLabel, OutContext); @@ -1293,8 +1331,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { // Prime text sections so they are adjacent. This reduces the likelihood a // large data or debug section causes a branch to exceed 16M limit. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection()); if (TM.getRelocationModel() == Reloc::PIC_) { OutStreamer->SwitchSection( @@ -1325,7 +1363,7 @@ static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) { void PPCDarwinAsmPrinter:: EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { - bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; // Construct a local MCSubtargetInfo and shadow EmitToStreamer here. 
// This is because the MachineFunction won't exist (but have not yet been @@ -1338,8 +1376,8 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { S.EmitInstruction(Inst, *STI); }; - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); // .lazy_symbol_pointer MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection(); @@ -1353,12 +1391,12 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer->SwitchSection(StubSection); EmitAlignment(4); - + MCSymbol *Stub = Stubs[i].first; MCSymbol *RawSym = Stubs[i].second.getPointer(); MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext); MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext); - + OutStreamer->EmitLabel(Stub); OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); @@ -1463,20 +1501,19 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4); } } - + OutStreamer->AddBlankLine(); } - bool PPCDarwinAsmPrinter::doFinalization(Module &M) { - bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; // Darwin/PPC always uses mach-o. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); MachineModuleInfoMachO &MMIMacho = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); - + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList(); if (!Stubs.empty()) EmitFunctionStubs(Stubs); @@ -1484,27 +1521,27 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (MAI->doesSupportExceptionHandling() && MMI) { // Add the (possibly multiple) personalities to the set of global values. // Only referenced functions get into the Personalities list. - const std::vector<const Function*> &Personalities = MMI->getPersonalities(); - for (std::vector<const Function*>::const_iterator I = Personalities.begin(), - E = Personalities.end(); I != E; ++I) { - if (*I) { - MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr"); + for (const Function *Personality : MMI->getPersonalities()) { + if (Personality) { + MCSymbol *NLPSym = + getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = - MMIMacho.getGVStubEntry(NLPSym); - StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true); + MMIMacho.getGVStubEntry(NLPSym); + StubSym = + MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true); } } } // Output stubs for dynamically-linked functions. Stubs = MMIMacho.GetGVStubList(); - + // Output macho stubs for external and common global variables. if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); EmitAlignment(isPPC64 ? 
3 : 2); - + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: OutStreamer->EmitLabel(Stubs[i].first); @@ -1535,7 +1572,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (!Stubs.empty()) { OutStreamer->SwitchSection(getObjFileLowering().getDataSection()); EmitAlignment(isPPC64 ? 3 : 2); - + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: OutStreamer->EmitLabel(Stubs[i].first); @@ -1573,7 +1610,7 @@ createPPCAsmPrinterPass(TargetMachine &tm, } // Force static initialization. -extern "C" void LLVMInitializePowerPCAsmPrinter() { +extern "C" void LLVMInitializePowerPCAsmPrinter() { TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp new file mode 100644 index 0000000..7920240 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -0,0 +1,253 @@ +//===- PPCBoolRetToInt.cpp - Convert bool literals to i32 if they are returned ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements converting i1 values to i32 if they could be more +// profitably allocated as GPRs rather than CRs. This pass will become totally +// unnecessary if Register Bank Allocation and Global Instruction Selection ever +// go upstream. +// +// Presently, the pass converts i1 Constants and Arguments to i32 if the +// transitive closure of their uses includes only PHINodes, CallInsts, and +// ReturnInsts. The rationale is that arguments are generally passed and returned +// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will +// actually save casts at the Machine Instruction level. +// +// It might be useful to expand this pass to add bit-wise operations to the list +// of safe transitive closure types. Also, we miss some opportunities when LLVM +// represents logical AND and OR operations with control flow rather than data +// flow. For example, by lowering the expression: return (A && B && C) +// +// as: return A ? B && C : false. +// +// There's code in SimplifyCFG that could be used to turn control flow into data +// flow using SelectInsts. Selects are slow on some architectures (P7/P8), so +// this probably isn't good in general, but for the special case of i1, the +// Selects could be further lowered to bit operations that are fast everywhere. 
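The promotion is exposed through the createPPCBoolRetToIntPass() factory that this new file defines (declared in PPC.h). As a rough sketch of how the pass gets run, it can be scheduled through the legacy pass manager; the standalone driver below is an assumption for illustration, not how the in-tree pipeline wires it up:

    #include "PPC.h"                        // declares createPPCBoolRetToIntPass()
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"

    // Hypothetical driver: run the i1-to-i32 promotion over every function
    // definition in a module, the way a target pipeline would schedule it.
    static void runBoolRetToInt(llvm::Module &M) {
      llvm::legacy::FunctionPassManager FPM(&M);
      FPM.add(llvm::createPPCBoolRetToIntPass()); // added by this patch
      FPM.doInitialization();
      for (llvm::Function &F : M)
        if (!F.isDeclaration())
          FPM.run(F);
      FPM.doFinalization();
    }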
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +#define DEBUG_TYPE "bool-ret-to-int" + +STATISTIC(NumBoolRetPromotion, + "Number of times a bool feeding a RetInst was promoted to an int"); +STATISTIC(NumBoolCallPromotion, + "Number of times a bool feeding a CallInst was promoted to an int"); +STATISTIC(NumBoolToIntPromotion, + "Total number of times a bool was promoted to an int"); + +class PPCBoolRetToInt : public FunctionPass { + + static SmallPtrSet<Value *, 8> findAllDefs(Value *V) { + SmallPtrSet<Value *, 8> Defs; + SmallVector<Value *, 8> WorkList; + WorkList.push_back(V); + Defs.insert(V); + while (!WorkList.empty()) { + Value *Curr = WorkList.back(); + WorkList.pop_back(); + if (User *CurrUser = dyn_cast<User>(Curr)) + for (auto &Op : CurrUser->operands()) + if (Defs.insert(Op).second) + WorkList.push_back(Op); + } + return Defs; + } + + // Translate an i1 value to an equivalent i32 value: + static Value *translate(Value *V) { + Type *Int32Ty = Type::getInt32Ty(V->getContext()); + if (Constant *C = dyn_cast<Constant>(V)) + return ConstantExpr::getZExt(C, Int32Ty); + if (PHINode *P = dyn_cast<PHINode>(V)) { + // Temporarily set the operands to 0. We'll fix this later in + // runOnUse. + Value *Zero = Constant::getNullValue(Int32Ty); + PHINode *Q = + PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P); + for (unsigned i = 0; i < P->getNumOperands(); ++i) + Q->addIncoming(Zero, P->getIncomingBlock(i)); + return Q; + } + + Argument *A = dyn_cast<Argument>(V); + Instruction *I = dyn_cast<Instruction>(V); + assert((A || I) && "Unknown value type"); + + auto InstPt = + A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode(); + return new ZExtInst(V, Int32Ty, "", InstPt); + } + + typedef SmallPtrSet<const PHINode *, 8> PHINodeSet; + + // A PHINode is Promotable if: + // 1. Its type is i1 AND + // 2. All of its uses are ReturnInst, CallInst, PHINode, or DbgInfoIntrinsic + // AND + // 3. All of its operands are Constant or Argument or + // CallInst or PHINode AND + // 4. All of its PHINode uses are Promotable AND + // 5. 
All of its PHINode operands are Promotable + static PHINodeSet getPromotablePHINodes(const Function &F) { + PHINodeSet Promotable; + // Condition 1 + for (auto &BB : F) + for (auto &I : BB) + if (const PHINode *P = dyn_cast<PHINode>(&I)) + if (P->getType()->isIntegerTy(1)) + Promotable.insert(P); + + SmallVector<const PHINode *, 8> ToRemove; + for (const auto &P : Promotable) { + // Conditions 2 and 3 + auto IsValidUser = [] (const Value *V) -> bool { + return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) || + isa<DbgInfoIntrinsic>(V); + }; + auto IsValidOperand = [] (const Value *V) -> bool { + return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) || + isa<PHINode>(V); + }; + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsValidUser) || + !std::all_of(Operands.begin(), Operands.end(), IsValidOperand)) + ToRemove.push_back(P); + } + + // Iterate to convergence + auto IsPromotable = [&Promotable] (const Value *V) -> bool { + const PHINode *Phi = dyn_cast<PHINode>(V); + return !Phi || Promotable.count(Phi); + }; + while (!ToRemove.empty()) { + for (auto &User : ToRemove) + Promotable.erase(User); + ToRemove.clear(); + + for (const auto &P : Promotable) { + // Conditions 4 and 5 + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsPromotable) || + !std::all_of(Operands.begin(), Operands.end(), IsPromotable)) + ToRemove.push_back(P); + } + } + + return Promotable; + } + + typedef DenseMap<Value *, Value *> B2IMap; + + public: + static char ID; + PPCBoolRetToInt() : FunctionPass(ID) { + initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) { + PHINodeSet PromotablePHINodes = getPromotablePHINodes(F); + B2IMap Bool2IntMap; + bool Changed = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (ReturnInst *R = dyn_cast<ReturnInst>(&I)) + if (F.getReturnType()->isIntegerTy(1)) + Changed |= + runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap); + + if (CallInst *CI = dyn_cast<CallInst>(&I)) + for (auto &U : CI->operands()) + if (U->getType()->isIntegerTy(1)) + Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap); + } + } + + return Changed; + } + + static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, + B2IMap &BoolToIntMap) { + auto Defs = findAllDefs(U); + + // If the values are all Constants or Arguments, don't bother + if (!std::any_of(Defs.begin(), Defs.end(), isa<Instruction, Value *>)) + return false; + + // Presently, we only know how to handle PHINode, Constant, and Arguments. + // Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension + // could also be handled in the future. + for (const auto &V : Defs) + if (!isa<PHINode>(V) && !isa<Constant>(V) && !isa<Argument>(V)) + return false; + + for (const auto &V : Defs) + if (const PHINode *P = dyn_cast<PHINode>(V)) + if (!PromotablePHINodes.count(P)) + return false; + + if (isa<ReturnInst>(U.getUser())) + ++NumBoolRetPromotion; + if (isa<CallInst>(U.getUser())) + ++NumBoolCallPromotion; + ++NumBoolToIntPromotion; + + for (const auto &V : Defs) + if (!BoolToIntMap.count(V)) + BoolToIntMap[V] = translate(V); + + // Replace the operands of the translated instructions. They were set to + // zero in the translate function. 
+ for (auto &Pair : BoolToIntMap) { + User *First = dyn_cast<User>(Pair.first); + User *Second = dyn_cast<User>(Pair.second); + assert((!First || Second) && "translated from user to non-user!?"); + if (First) + for (unsigned i = 0; i < First->getNumOperands(); ++i) + Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); + } + + Value *IntRetVal = BoolToIntMap[U]; + Type *Int1Ty = Type::getInt1Ty(U->getContext()); + Instruction *I = cast<Instruction>(U.getUser()); + Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I); + U.set(BackToBool); + + return true; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } +}; +} + +char PPCBoolRetToInt::ID = 0; +INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", + "Convert i1 constants to i32 if they are returned", + false, false) + +FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index 940d55a..73a5305 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -91,7 +91,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { unsigned FuncSize = 0; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; // The end of the previous block may have extra nops if this block has an // alignment requirement. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index fd150be..b6ac4d5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -98,7 +98,7 @@ namespace { AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } private: @@ -112,6 +112,7 @@ namespace { const DataLayout *DL; DominatorTree *DT; const TargetLibraryInfo *LibInfo; + bool PreserveLCSSA; }; char PPCCTRLoops::ID = 0; @@ -147,7 +148,7 @@ INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) @@ -169,11 +170,12 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() { bool PPCCTRLoops::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DL = &F.getParent()->getDataLayout(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); bool MadeChange = false; @@ -250,8 +252,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr // we're definitely using CTR. 
case Intrinsic::ppc_is_decremented_ctr_nonzero: - case Intrinsic::ppc_mtctr: - return true; + case Intrinsic::ppc_mtctr: + return true; // VisualStudio defines setjmp as _setjmp #if defined(_MSC_VER) && defined(setjmp) && \ @@ -369,7 +371,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { true); if (VTy == MVT::Other) return true; - + if (TLI->isOperationLegalOrCustom(Opcode, VTy)) continue; else if (VTy.isVector() && @@ -537,7 +539,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // the CTR register because some such uses might be reordered by the // selection DAG after the mtctr instruction). if (!Preheader || mightUseCTR(TT, Preheader)) - Preheader = InsertPreheaderForLoop(L, this); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (!Preheader) return MadeChange; @@ -554,10 +556,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(CountType, 1)); - Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType, - Preheader->getTerminator()); + ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); + Value *ECValue = + SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator()); IRBuilder<> CountBuilder(Preheader->getTerminator()); Module *M = Preheader->getParent()->getParent(); @@ -677,7 +678,7 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { // any other instructions that might clobber the ctr register. for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); I != IE; ++I) { - MachineBasicBlock *MBB = I; + MachineBasicBlock *MBB = &*I; if (!MDT->isReachableFromEntry(MBB)) continue; @@ -694,4 +695,3 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { return false; } #endif // NDEBUG - diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index fc89753..7cb1bb5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -71,15 +71,20 @@ protected: for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(), PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) { bool OtherReference = false, BlockChanged = false; + + if ((*PI)->empty()) + continue; + for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) { - MachineInstrBuilder MIB; + if (J == (*PI)->end()) + break; + if (J->getOpcode() == PPC::B) { if (J->getOperand(0).getMBB() == &ReturnMBB) { // This is an unconditional branch to the return. Replace the // branch with a blr. - MIB = - BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode())); - MIB.copyImplicitOps(I); + BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode())) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -90,10 +95,10 @@ protected: if (J->getOperand(2).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. 
- MIB = BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) - .addImm(J->getOperand(0).getImm()) - .addReg(J->getOperand(1).getReg()); - MIB.copyImplicitOps(I); + BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) + .addImm(J->getOperand(0).getImm()) + .addReg(J->getOperand(1).getReg()) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -104,11 +109,11 @@ protected: if (J->getOperand(1).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. - MIB = BuildMI(**PI, J, J->getDebugLoc(), - TII->get(J->getOpcode() == PPC::BC ? - PPC::BCLR : PPC::BCLRn)) - .addReg(J->getOperand(0).getReg()); - MIB.copyImplicitOps(I); + BuildMI( + **PI, J, J->getDebugLoc(), + TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn)) + .addReg(J->getOperand(0).getReg()) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -146,7 +151,7 @@ protected: } for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i) - PredToRemove[i]->removeSuccessor(&ReturnMBB); + PredToRemove[i]->removeSuccessor(&ReturnMBB, true); if (Changed && !ReturnMBB.hasAddressTaken()) { // We now might be able to merge this blr-only block into its @@ -156,7 +161,7 @@ protected: if (PrevMBB.isLayoutSuccessor(&ReturnMBB) && PrevMBB.canFallThrough()) { // Move the blr into the preceding block. PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I); - PrevMBB.removeSuccessor(&ReturnMBB); + PrevMBB.removeSuccessor(&ReturnMBB, true); } } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 5f236f7..b451ebf 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -164,7 +164,8 @@ class PPCFastISel final : public FastISel { unsigned DestReg, bool IsZExt); unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT); unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT); - unsigned PPCMaterializeInt(const Constant *C, MVT VT, bool UseSExt = true); + unsigned PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt = true); unsigned PPCMaterialize32BitInt(int64_t Imm, const TargetRegisterClass *RC); unsigned PPCMaterialize64BitInt(int64_t Imm, @@ -292,10 +293,7 @@ bool PPCFastISel::isValueAvailable(const Value *V) const { return true; const auto *I = cast<Instruction>(V); - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - return true; - - return false; + return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB; } // Given a value Obj, create an Address object Addr that represents its @@ -527,9 +525,9 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, // VSX only provides an indexed load. if (Is32VSXLoad || Is64VSXLoad) return false; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset), + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI, + Addr.Offset), MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI), MFI.getObjectAlignment(Addr.Base.FI)); @@ -660,9 +658,9 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { // VSX only provides an indexed store. 
if (Is32VSXStore || Is64VSXStore) return false; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset), + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI, + Addr.Offset), MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI), MFI.getObjectAlignment(Addr.Base.FI)); @@ -774,8 +772,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) { BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC)) .addImm(PPCPred).addReg(CondReg).addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const ConstantInt *CI = @@ -1607,21 +1604,18 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (ValLocs.size() > 1) return false; - // Special case for returning a constant integer of any size. - // Materialize the constant as an i64 and copy it to the return - // register. We still need to worry about properly extending the sign. E.g: - // If the constant has only one bit, it means it is a boolean. Therefore - // we can't use PPCMaterializeInt because it extends the sign which will - // cause negations of the returned value to be incorrect as they are - // implemented as the flip of the least significant bit. - if (isa<ConstantInt>(*RV)) { - const Constant *C = cast<Constant>(RV); - + // Special case for returning a constant integer of any size - materialize + // the constant as an i64 and copy it to the return register. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) { CCValAssign &VA = ValLocs[0]; unsigned RetReg = VA.getLocReg(); - unsigned SrcReg = PPCMaterializeInt(C, MVT::i64, - VA.getLocInfo() == CCValAssign::SExt); + // We still need to worry about properly extending the sign. For example, + // we could have only a single bit or a constant that needs zero + // extension rather than sign extension. Make sure we pass the return + // value extension property to integer materialization. + unsigned SrcReg = + PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); @@ -1761,8 +1755,8 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8)); const IndirectBrInst *IB = cast<IndirectBrInst>(I); - for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]); + for (const BasicBlock *SuccBB : IB->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]); return true; } @@ -1898,10 +1892,9 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); CodeModel::Model CModel = TM.getCodeModel(); - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - (VT == MVT::f32) ? 4 : 8, Align); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align); unsigned Opc = (VT == MVT::f32) ? 
PPC::LFS : PPC::LFD; unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); @@ -1976,19 +1969,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); - // If/when switches are implemented, jump tables should be handled - // on the "if" path here. - if (CModel == CodeModel::Large || - (GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker()) || - GV->isDeclaration() || GV->hasCommonLinkage() || - GV->hasAvailableExternallyLinkage()) + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), DestReg).addGlobalAddress(GV).addReg(HighPartReg); - else + } else { // Otherwise generate the ADDItocL. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL), DestReg).addReg(HighPartReg).addGlobalAddress(GV); + } } return DestReg; @@ -2085,12 +2074,11 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, // Materialize an integer constant into a register, and return // the register number (or zero if we failed to handle it). -unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, - bool UseSExt) { +unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt) { // If we're using CR bit registers for i1 values, handle that as a special // case first. if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { - const ConstantInt *CI = cast<ConstantInt>(C); unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg); @@ -2105,12 +2093,17 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, &PPC::GPRCRegClass); // If the constant is in range, use a load-immediate. - const ConstantInt *CI = cast<ConstantInt>(C); - if (isInt<16>(CI->getSExtValue())) { + if (UseSExt && isInt<16>(CI->getSExtValue())) { + unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; + unsigned ImmReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) + .addImm(CI->getSExtValue()); + return ImmReg; + } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) { unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; unsigned ImmReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm( (UseSExt) ? CI->getSExtValue() : CI->getZExtValue() ); + .addImm(CI->getZExtValue()); return ImmReg; } @@ -2138,8 +2131,8 @@ unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) { return PPCMaterializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) return PPCMaterializeGV(GV, VT); - else if (isa<ConstantInt>(C)) - return PPCMaterializeInt(C, VT, VT != MVT::i1); + else if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) + return PPCMaterializeInt(CI, VT, VT != MVT::i1); return 0; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 08ae717..beab844 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -30,7 +30,7 @@ using namespace llvm; /// VRRegNo - Map from a numbered VR register to its enum value. 
/// -static const uint16_t VRRegNo[] = { +static const MCPhysReg VRRegNo[] = { PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, @@ -270,7 +270,7 @@ static void RemoveVRSaveCode(MachineInstr *MI) { // epilog blocks. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue - if (!I->empty() && I->back().isReturn()) { + if (I->isReturnBlock()) { bool FoundIt = false; for (MBBI = I->end(); MBBI != I->begin(); ) { --MBBI; @@ -306,9 +306,10 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DebugLoc dl = MI->getDebugLoc(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UsedRegMask = 0; for (unsigned i = 0; i != 32; ++i) - if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) + if (MRI.isPhysRegModified(VRRegNo[i])) UsedRegMask |= 1 << (31-i); // Live in and live out values already must be in the mask, so don't bother @@ -325,7 +326,7 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end(); UsedRegMask != 0 && BI != BE; ++BI) { const MachineBasicBlock &MBB = *BI; - if (MBB.empty() || !MBB.back().isReturn()) + if (!MBB.isReturnBlock()) continue; const MachineInstr &Ret = MBB.back(); for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) { @@ -555,9 +556,67 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { } } +bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + unsigned *ScratchRegister) const { + RegScavenger RS; + unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0; + + if (ScratchRegister) + *ScratchRegister = R0; + + // If MBB is an entry or exit block, use R0 as the scratch register + if ((UseAtEnd && MBB->isReturnBlock()) || + (!UseAtEnd && (&MBB->getParent()->front() == MBB))) + return true; + + RS.enterBasicBlock(MBB); + + if (UseAtEnd && !MBB->empty()) { + // The scratch register will be used at the end of the block, so must consider + // all registers used within the block + + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + // If no terminator, back iterator up to previous instruction. + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } + + if (!RS.isRegUsed(R0)) + return true; + + unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? 
&PPC::G8RCRegClass + : &PPC::GPRCRegClass); + + // Make sure the register scavenger was able to find an available register + // If not, use R0 but return false to indicate no register was available and + // R0 must be used (as recommended by the ABI) + if (Reg == 0) + return false; + + if (ScratchRegister) + *ScratchRegister = Reg; + + return true; +} + +bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, false, nullptr); +} + +bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, true, nullptr); +} + void PPCFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const PPCInstrInfo &TII = @@ -589,7 +648,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } } - // Move MBBI back to the beginning of the function. + // Move MBBI back to the beginning of the prologue block. MBBI = MBB.begin(); // Work out frame sizes. @@ -613,7 +672,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; - unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.) const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8 @@ -642,6 +701,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) && "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); + findScratchRegister(&MBB, false, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + int LROffset = getReturnSaveOffset(); int FPOffset = 0; @@ -916,27 +978,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } void PPCFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI != MBB.end() && "Returning block has no terminator"); + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl; + + if (MBBI != MBB.end()) + dl = MBBI->getDebugLoc(); + const PPCInstrInfo &TII = *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); const PPCRegisterInfo *RegInfo = static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc dl; - - assert((RetOpcode == PPC::BLR || - RetOpcode == PPC::BLR8 || - RetOpcode == PPC::TCRETURNri || - RetOpcode == PPC::TCRETURNdi || - RetOpcode == PPC::TCRETURNai || - RetOpcode == PPC::TCRETURNri8 || - RetOpcode == PPC::TCRETURNdi8 || - RetOpcode == PPC::TCRETURNai8) && - "Can only insert epilog into returning blocks"); - // Get alignment info so we know how to restore the SP. const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -959,7 +1012,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? 
PPC::X31 : PPC::R31; - unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8 : PPC::MTLR ); @@ -973,10 +1026,14 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, : PPC::ADDI ); const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8 : PPC::ADD4 ); - + int LROffset = getReturnSaveOffset(); int FPOffset = 0; + + findScratchRegister(&MBB, true, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + if (HasFP) { if (isSVR4ABI) { MachineFrameInfo *FFI = MF.getFrameInfo(); @@ -1008,25 +1065,30 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, PBPOffset = FFI->getObjectOffset(PBPIndex); } - bool UsesTCRet = RetOpcode == PPC::TCRETURNri || - RetOpcode == PPC::TCRETURNdi || - RetOpcode == PPC::TCRETURNai || - RetOpcode == PPC::TCRETURNri8 || - RetOpcode == PPC::TCRETURNdi8 || - RetOpcode == PPC::TCRETURNai8; - - if (UsesTCRet) { - int MaxTCRetDelta = FI->getTailCallSPDelta(); - MachineOperand &StackAdjust = MBBI->getOperand(1); - assert(StackAdjust.isImm() && "Expecting immediate value."); - // Adjust stack pointer. - int StackAdj = StackAdjust.getImm(); - int Delta = StackAdj - MaxTCRetDelta; - assert((Delta >= 0) && "Delta must be positive"); - if (MaxTCRetDelta>0) - FrameSize += (StackAdj +Delta); - else - FrameSize += StackAdj; + bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn()); + + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + bool UsesTCRet = RetOpcode == PPC::TCRETURNri || + RetOpcode == PPC::TCRETURNdi || + RetOpcode == PPC::TCRETURNai || + RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNdi8 || + RetOpcode == PPC::TCRETURNai8; + + if (UsesTCRet) { + int MaxTCRetDelta = FI->getTailCallSPDelta(); + MachineOperand &StackAdjust = MBBI->getOperand(1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + int Delta = StackAdj - MaxTCRetDelta; + assert((Delta >= 0) && "Delta must be positive"); + if (MaxTCRetDelta>0) + FrameSize += (StackAdj +Delta); + else + FrameSize += StackAdj; + } } // Frames of 32KB & larger require special handling because they cannot be @@ -1066,7 +1128,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, .addImm(0) .addReg(SPReg); } - } if (MustSaveLR) @@ -1109,52 +1170,55 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // Callee pop calling convention. Pop parameter/linkage area. 
Used for tail // call optimization - if (MF.getTarget().Options.GuaranteedTailCallOpt && - (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && - MF.getFunction()->getCallingConv() == CallingConv::Fast) { - PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); - unsigned CallerAllocatedAmt = FI->getMinReservedArea(); - - if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { - BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) - .addReg(SPReg).addImm(CallerAllocatedAmt); - } else { - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && + MF.getFunction()->getCallingConv() == CallingConv::Fast) { + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned CallerAllocatedAmt = FI->getMinReservedArea(); + + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(SPReg).addImm(CallerAllocatedAmt); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) .addImm(CallerAllocatedAmt >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) .addReg(ScratchReg, RegState::Kill) .addImm(CallerAllocatedAmt & 0xFFFF); - BuildMI(MBB, MBBI, dl, AddInst) + BuildMI(MBB, MBBI, dl, AddInst) .addReg(SPReg) .addReg(FPReg) .addReg(ScratchReg); - } - } else if (RetOpcode == PPC::TCRETURNdi) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); - } else if (RetOpcode == PPC::TCRETURNri) { - MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); - } else if (RetOpcode == PPC::TCRETURNai) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); - } else if (RetOpcode == PPC::TCRETURNdi8) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); - } else if (RetOpcode == PPC::TCRETURNri8) { - MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); - } else if (RetOpcode == PPC::TCRETURNai8) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } + } else if (RetOpcode == PPC::TCRETURNdi) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). 
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); + } else if (RetOpcode == PPC::TCRETURNai) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); + } else if (RetOpcode == PPC::TCRETURNdi8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri8) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); + } else if (RetOpcode == PPC::TCRETURNai8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } } } @@ -1200,8 +1264,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // Reserve stack space for the PIC Base register (R30). // Only used in SVR4 32-bit. if (FI->usesPICBase()) { - int PBPSI = FI->getPICBasePointerSaveIndex(); - PBPSI = MFI->CreateFixedObject(4, -8, true); + int PBPSI = MFI->CreateFixedObject(4, -8, true); FI->setPICBasePointerSaveIndex(PBPSI); } @@ -1710,3 +1773,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } + +bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() && + MF.getSubtarget<PPCSubtarget>().isPPC64()); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h index d6a389b..bbe1329 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -29,6 +29,30 @@ class PPCFrameLowering: public TargetFrameLowering { const unsigned LinkageSize; const unsigned BasePointerSaveOffset; + /** + * \brief Find a register that can be used in function prologue and epilogue + * + * Find a register that can be used as the scratch register in function + * prologue and epilogue to save various registers (Link Register, Base + * Pointer, etc.). Prefer R0, if it is available. If it is not available, + * then choose a different register. + * + * This method will return true if an available register was found (including + * R0). If no available registers are found, the method returns false and sets + * ScratchRegister to R0, as per the recommendation in the ABI. + * + * \param[in] MBB The machine basic block to find an available register for + * \param[in] UseAtEnd Specify whether the scratch register will be used at + * the end of the basic block (i.e., will the scratch + * register kill a register defined in the basic block) + * \param[out] ScratchRegister The scratch register to use + * \return true if a scratch register was found; false if a scratch register + * was not found and R0 is being used as the default. 
+ */ + bool findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + unsigned *ScratchRegister) const; + public: PPCFrameLowering(const PPCSubtarget &STI); @@ -92,6 +116,13 @@ public: const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + + bool enableShrinkWrapping(const MachineFunction &MF) const override; + + /// Methods used by shrink wrapping to determine if MBB can be used for the + /// function prologue/epilogue. + bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; + bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9322268..1eaa811 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -16,6 +16,8 @@ #include "MCTargetDesc/PPCPredicates.h" #include "PPCMachineFunctionInfo.h" #include "PPCTargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -52,6 +54,11 @@ static cl::opt<bool> BPermRewriterNoMasking( "bit permutations"), cl::Hidden); +static cl::opt<bool> EnableBranchHint( + "ppc-use-branch-hint", cl::init(true), + cl::desc("Enable static hinting of branches on ppc"), + cl::Hidden); + namespace llvm { void initializePPCDAGToDAGISelPass(PassRegistry&); } @@ -286,7 +293,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // Find all return blocks, outputting a restore in each epilog. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - if (!BB->empty() && BB->back().isReturn()) { + if (BB->isReturnBlock()) { IP = BB->end(); --IP; // Skip over all terminator instructions, which are part of the return @@ -393,6 +400,55 @@ static bool isInt32Immediate(SDValue N, unsigned &Imm) { return isInt32Immediate(N.getNode(), Imm); } +static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, + const SDValue &DestMBB) { + assert(isa<BasicBlockSDNode>(DestMBB)); + + if (!FuncInfo->BPI) return PPC::BR_NO_HINT; + + const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + const TerminatorInst *BBTerm = BB->getTerminator(); + + if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT; + + const BasicBlock *TBB = BBTerm->getSuccessor(0); + const BasicBlock *FBB = BBTerm->getSuccessor(1); + + auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB); + auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB); + + // We only want to handle cases that are easy to predict statically, e.g. a + // C++ throw statement, which is very likely not taken, or a call to a + // function that never returns, e.g. stdlib exit(). So we set Threshold to + // filter out the unwanted cases. + // + // Below is the LLVM branch weight table; we only want to handle cases 1 and 2 + // + // Case Taken:Nontaken Example + // 1. Unreachable 1048575:1 C++ throw, stdlib exit(), + // 2. Invoke-terminating 1:1048575 + // 3. Coldblock 4:64 __builtin_expect + // 4. Loop Branch 124:4 For loop + // 5.
PH/ZH/FPH 20:12 + const uint32_t Threshold = 10000; + + if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb)) + return PPC::BR_NO_HINT; + + DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::" + << BB->getName() << "'\n" + << " -> " << TBB->getName() << ": " << TProb << "\n" + << " -> " << FBB->getName() << ": " << FProb << "\n"); + + const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB); + + // If Dest BasicBlock is False-BasicBlock (FBB), swap branch probabilities, + // because we want 'TProb' to stand for the branch probability to the Dest + // BasicBlock + if (BBDN->getBasicBlock()->getBasicBlock() != TBB) + std::swap(TProb, FProb); + + return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT; +} // isOpcWithIntImmediate - This method tests to see if the node is a specific // opcode and that it has an immediate integer right operand. @@ -564,7 +620,6 @@ static unsigned SelectInt64CountDirect(int64_t Imm) { // Handle first 32 bits. unsigned Lo = Imm & 0xFFFF; - unsigned Hi = (Imm >> 16) & 0xFFFF; // Simple value. if (isInt<16>(Imm)) { @@ -586,9 +641,9 @@ static unsigned SelectInt64CountDirect(int64_t Imm) { ++Result; // Add in the last bits as required. - if ((Hi = (Remainder >> 16) & 0xFFFF)) + if ((Remainder >> 16) & 0xFFFF) ++Result; - if ((Lo = Remainder & 0xFFFF)) + if (Remainder & 0xFFFF) ++Result; return Result; @@ -1028,7 +1083,7 @@ class BitPermutationSelector { BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 && BitGroups[0].V == BitGroups[BitGroups.size()-1].V && BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) { - DEBUG(dbgs() << "\tcombining final bit group with inital one\n"); + DEBUG(dbgs() << "\tcombining final bit group with initial one\n"); BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx; BitGroups.erase(BitGroups.begin()); } @@ -1557,10 +1612,7 @@ class BitPermutationSelector { return false; } - if (VRI.RLAmt != EffRLAmt) - return false; - - return true; + return VRI.RLAmt == EffRLAmt; }; for (auto &BG : BitGroups) { @@ -2781,7 +2833,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); - + SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1), Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); unsigned DM[2]; @@ -2798,7 +2850,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0)); SDValue Base, Offset; - if (LD->isUnindexed() && + if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() && (LD->getMemoryVT() == MVT::f64 || LD->getMemoryVT() == MVT::i64) && SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { @@ -2841,8 +2893,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { // Op #3 is the Dest MBB // Op #4 is the Flag. // Prevent PPC::PRED_* from being selected into LI.
- SDValue Pred = - getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(), dl); + unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3)); + + SDValue Pred = getI32Imm(PCC, dl); SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), N->getOperand(0), N->getOperand(4) }; return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); @@ -2871,6 +2926,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { BitComp, N->getOperand(4), N->getOperand(0)); } + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4)); + SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); SDValue Ops[] = { getI32Imm(PCC, dl), CondCode, N->getOperand(4), N->getOperand(0) }; @@ -2903,9 +2961,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { break; // The first source operand is a TargetGlobalAddress or a TargetJumpTable. - // If it is an externally defined symbol, a symbol with common linkage, - // a non-local function address, or a jump table address, or if we are - // generating code for large code model, we generate: + // If it must be toc-referenced according to PPCSubTarget, we generate: // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>)) // Otherwise we generate: // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>) @@ -2920,13 +2976,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { MVT::i64, GA, SDValue(Tmp, 0))); if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { - const GlobalValue *GValue = G->getGlobal(); - if ((GValue->getType()->getElementType()->isFunctionTy() && - !GValue->isStrongDefinitionForLinker()) || - GValue->isDeclaration() || GValue->hasCommonLinkage() || - GValue->hasAvailableExternallyLinkage()) + const GlobalValue *GV = G->getGlobal(); + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, SDValue(Tmp, 0))); + } } return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, @@ -3110,7 +3165,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { if (!CurDAG->MaskedValueIsZero(Op0, APInt::getHighBitsSet(Bits, Bits - (b+1)*8))) return false; - + LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); return true; @@ -3305,7 +3360,7 @@ void PPCDAGToDAGISel::PreprocessISelDAG() { bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; if (N->use_empty()) continue; @@ -3989,7 +4044,7 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() { bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; @@ -4145,7 +4200,7 @@ void PPCDAGToDAGISel::PeepholePPC64() { ++Position; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; @@ -4184,16 +4239,24 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } - // If this is a load or store with a zero offset, we may be able to - // fold an add-immediate into the memory operation. 
- if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) || - N->getConstantOperandVal(FirstOp) != 0) + // If this is a load or store with a zero offset, or within the alignment, + // we may be able to fold an add-immediate into the memory operation. + // The check against alignment is below, as it can't occur until we check + // the arguments to N + if (!isa<ConstantSDNode>(N->getOperand(FirstOp))) continue; SDValue Base = N->getOperand(FirstOp + 1); if (!Base.isMachineOpcode()) continue; + // On targets with fusion, we don't want this to fire and remove a fusion + // opportunity, unless a) it results in another fusion opportunity or + // b) optimizing for size. + if (PPCSubTarget->hasFusion() && + (!MF->getFunction()->optForSize() && !Base.hasOneUse())) + continue; + unsigned Flags = 0; bool ReplaceFlags = true; @@ -4237,6 +4300,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } + SDValue ImmOpnd = Base.getOperand(1); + int MaxDisplacement = 0; + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { + const GlobalValue *GV = GA->getGlobal(); + MaxDisplacement = GV->getAlignment() - 1; + } + + int Offset = N->getConstantOperandVal(FirstOp); + if (Offset < 0 || Offset > MaxDisplacement) + continue; + // We found an opportunity. Reverse the operands from the add // immediate and substitute them into the load or store. If // needed, update the target flags for the immediate operand to @@ -4247,8 +4321,6 @@ void PPCDAGToDAGISel::PeepholePPC64() { DEBUG(N->dump(CurDAG)); DEBUG(dbgs() << "\n"); - SDValue ImmOpnd = Base.getOperand(1); - // If the relocation information isn't already present on the // immediate operand, add it now. if (ReplaceFlags) { @@ -4259,17 +4331,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { // is insufficient for the instruction encoding. if (GV->getAlignment() < 4 && (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD || - StorageOpcode == PPC::LWA)) { + StorageOpcode == PPC::LWA || (Offset % 4) != 0)) { DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n"); continue; } - ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags); + ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags); } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(ImmOpnd)) { const Constant *C = CP->getConstVal(); ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, CP->getAlignment(), - 0, Flags); + Offset, Flags); } } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 1b8f8fb..af9ad07 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -42,10 +42,6 @@ using namespace llvm; -// FIXME: Remove this once soft-float is supported. -static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic", -cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden); - static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); @@ -72,8 +68,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Set up the register classes. 
addRegisterClass(MVT::i32, &PPC::GPRCRegClass); - addRegisterClass(MVT::f32, &PPC::F4RCRegClass); - addRegisterClass(MVT::f64, &PPC::F8RCRegClass); + if (!Subtarget.useSoftFloat()) { + addRegisterClass(MVT::f32, &PPC::F4RCRegClass); + addRegisterClass(MVT::f64, &PPC::F8RCRegClass); + } // PowerPC has an i16 but no i8 (or i1) SEXTLOAD for (MVT VT : MVT::integer_valuetypes()) { @@ -107,8 +105,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); - AddPromotedToType (ISD::UINT_TO_FP, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); } else { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); @@ -257,10 +255,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - setOperationAction(ISD::BITCAST, MVT::f32, Expand); - setOperationAction(ISD::BITCAST, MVT::i32, Expand); - setOperationAction(ISD::BITCAST, MVT::i64, Expand); - setOperationAction(ISD::BITCAST, MVT::f64, Expand); + if (Subtarget.hasDirectMove()) { + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); + } else { + setOperationAction(ISD::BITCAST, MVT::f32, Expand); + setOperationAction(ISD::BITCAST, MVT::i32, Expand); + setOperationAction(ISD::BITCAST, MVT::i64, Expand); + setOperationAction(ISD::BITCAST, MVT::f64, Expand); + } // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -329,6 +334,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -403,9 +410,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // will selectively turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { // add/sub are legal for all supported vector VT's. 
- setOperationAction(ISD::ADD , VT, Legal); - setOperationAction(ISD::SUB , VT, Legal); - + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); + // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { setOperationAction(ISD::CTPOP, VT, Legal); @@ -477,6 +484,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -519,12 +528,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } - - if (Subtarget.hasP8Altivec()) + if (Subtarget.hasP8Altivec()) setOperationAction(ISD::MUL, MVT::v4i32, Legal); else setOperationAction(ISD::MUL, MVT::v4i32, Custom); - + setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -545,6 +553,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + if (Subtarget.hasP8Vector()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + } + if (Subtarget.hasDirectMove()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + } + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); @@ -813,15 +836,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLibcallName(RTLIB::SRA_I128, nullptr); } - if (isPPC64) { - setStackPointerRegisterToSaveRestore(PPC::X1); - setExceptionPointerRegister(PPC::X3); - setExceptionSelectorRegister(PPC::X4); - } else { - setStackPointerRegisterToSaveRestore(PPC::R1); - setExceptionPointerRegister(PPC::R3); - setExceptionSelectorRegister(PPC::R4); - } + setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SINT_TO_FP); @@ -942,9 +957,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign); + getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == MaxMaxAlign) @@ -969,6 +984,10 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, return Align; } +bool PPCTargetLowering::useSoftFloat() const { + return Subtarget.useSoftFloat(); +} + const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; @@ -992,6 +1011,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; + case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; @@ -1236,7 +1256,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). -/// The ShuffleKind distinguishes between big-endian merges with two +/// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). @@ -1261,7 +1281,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). -/// The ShuffleKind distinguishes between big-endian merges with two +/// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). @@ -1353,7 +1373,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, * - 2 = little-endian merge with two different inputs (inputs are swapped for * little-endian merges). * \param[in] DAG The current SelectionDAG - * \return true iff this shuffle mask + * \return true iff this shuffle mask */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { @@ -1380,7 +1400,7 @@ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -/// The ShuffleKind distinguishes between big-endian operations with two +/// The ShuffleKind distinguishes between big-endian operations with two /// different inputs (0), either-endian operations with two identical inputs /// (1), and little-endian operations with two different inputs (2). 
For the /// latter, the input operands are swapped (see PPCInstrAltivec.td). @@ -1513,8 +1533,8 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { for (unsigned i = 0; i != Multiple-1; ++i) { if (!UniquedVals[i].getNode()) continue; // Must have been undefs. - LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue(); - LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue(); + LeadingZero &= isNullConstant(UniquedVals[i]); + LeadingOnes &= isAllOnesConstant(UniquedVals[i]); } // Finally, check the least significant entry. if (LeadingZero) { @@ -1629,7 +1649,6 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } - /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented with [r+imm]. @@ -1998,10 +2017,10 @@ static SDValue getTOCEntry(SelectionDAG &DAG, SDLoc dl, bool Is64Bit, DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; - return DAG.getMemIntrinsicNode(PPCISD::TOC_ENTRY, dl, - DAG.getVTList(VT, MVT::Other), Ops, VT, - MachinePointerInfo::getGOT(), 0, false, true, - false, 0); + return DAG.getMemIntrinsicNode( + PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, + false, 0); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, @@ -2092,6 +2111,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, // large models could be added if users need it, at the cost of // additional complexity. GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2480,7 +2502,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, // */ // } va_list[1]; - SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); @@ -2536,7 +2557,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, #include "PPCGenCallingConv.inc" -// Function whose sole purpose is to kill compiler warnings +// Function whose sole purpose is to kill compiler warnings // stemming from unused functions included from PPCGenCallingConv.inc. CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; @@ -2933,8 +2954,9 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( PPC::F8 }; unsigned NumFPArgRegs = array_lengthof(FPArgRegs); - if (DisablePPCFloatInVariadic) - NumFPArgRegs = 0; + + if (Subtarget.useSoftFloat()) + NumFPArgRegs = 0; FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); @@ -3177,15 +3199,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, - MachinePointerInfo(FuncArg), - ObjType, false, false, 0); + MachinePointerInfo(&*FuncArg), ObjType, + false, false, 0); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area // slot. 
- Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg), - false, false, 0); + Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg), false, false, 0); } MemOps.push_back(Store); @@ -3212,9 +3234,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Off = DAG.getConstant(j, dl, PtrVT); Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, - MachinePointerInfo(FuncArg, j), - false, false, 0); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(&*FuncArg, j), false, false, 0); MemOps.push_back(Store); ++GPR_idx; } @@ -3592,7 +3614,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg), + MachinePointerInfo(&*FuncArg), ObjType, false, false, 0); MemOps.push_back(Store); ++GPR_idx; @@ -3615,9 +3637,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin( int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, j), - false, false, 0); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg, j), false, false, 0); MemOps.push_back(Store); ++GPR_idx; ArgOffset += PtrByteSize; @@ -3880,7 +3902,6 @@ struct TailCallArgumentInfo { TailCallArgumentInfo() : FrameIdx(0) {} }; - } /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. @@ -3895,9 +3916,10 @@ StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG, SDValue FIN = TailCallArgs[i].FrameIdxOp; int FI = TailCallArgs[i].FrameIdx; // Store relative to framepointer. - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, 0)); } } @@ -3922,9 +3944,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, NewRetAddrLoc, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); - Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewRetAddr), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, OldRetAddr, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewRetAddr), + false, false, 0); // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack // slot as the FP is never overwritten. 
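The hunks above and below all apply one mechanical migration: MachinePointerInfo::getFixedStack() now takes the owning MachineFunction as its first argument. A minimal sketch of the new call shape, assuming the LLVM 3.8-era SelectionDAG API; storeToFixedSlot is a hypothetical helper, not code from this patch:

    #include "llvm/CodeGen/MachineMemOperand.h"
    #include "llvm/CodeGen/SelectionDAG.h"

    using namespace llvm;

    // Hypothetical helper showing the migrated getFixedStack() call shape.
    static SDValue storeToFixedSlot(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
                                    SDValue Val, SDValue FIN, int FI) {
      // Before: MachinePointerInfo::getFixedStack(FI)
      // After:  MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)
      return DAG.getStore(
          Chain, dl, Val, FIN,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
          /*isVolatile=*/false, /*isNonTemporal=*/false, /*Alignment=*/0);
    }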
@@ -3933,9 +3956,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc, true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); - Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, - MachinePointerInfo::getFixedStack(NewFPIdx), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, OldFP, NewFramePtrIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewFPIdx), + false, false, 0); } } return Chain; } @@ -4812,8 +4836,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, continue; break; case MVT::v4f32: - // When using QPX, this is handled like a FP register, otherwise, it - // is an Altivec register. + // When using QPX, this is handled like a FP register, otherwise, it + // is an Altivec register. if (Subtarget.hasQPX()) { if (++NumFPRsUsed <= NumFPRs) continue; @@ -5318,9 +5342,10 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); - Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, - MachinePointerInfo::getStack(TOCSaveOffset), - false, false, 0); + Chain = DAG.getStore( + Val.getValue(1), dl, Val, AddPtr, + MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset), + false, false, 0); // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. @@ -5341,9 +5366,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, - hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, - Callee, SPDiff, NumBytes, Ins, InVals, CS); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, hasNest, + DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, + SPDiff, NumBytes, Ins, InVals, CS); } SDValue @@ -5798,6 +5823,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain, return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } +SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET( + SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { + SDLoc dl(Op); + + // Get the correct type for integers. + EVT IntVT = Op.getValueType(); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue FPSIdx = getFramePointerFrameIndex(DAG); + // Build a DYNAREAOFFSET node. + SDValue Ops[2] = {Chain, FPSIdx}; + SDVTList VTs = DAG.getVTList(IntVT); + return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); +} + SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { // When we pop the dynamic allocation we need to restore the SP link.
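For context on the new LowerGET_DYNAMIC_AREA_OFFSET hook just above: PPCISD::DYNAREAOFFSET is the target node behind the llvm.get.dynamic.area.offset intrinsic, which yields the offset from the native stack pointer to the most recent dynamic alloca. A hedged IR-level sketch of how that intrinsic is emitted; emitDynAreaOffset is an illustrative helper, not part of the patch:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Illustrative only: emit @llvm.get.dynamic.area.offset.i64, the intrinsic
    // that the DYNAREAOFFSET lowering above ultimately services. The intrinsic
    // is overloaded on its result width and takes no operands.
    static Value *emitDynAreaOffset(Module &M, IRBuilder<> &B) {
      Function *F = Intrinsic::getDeclaration(
          &M, Intrinsic::get_dynamic_area_offset, B.getInt64Ty());
      return B.CreateCall(F);
    }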
@@ -5828,10 +5869,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, false, false, 0); } - - -SDValue -PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { +SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); @@ -5983,6 +6021,10 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { if (!DAG.getTarget().Options.NoInfsFPMath || !DAG.getTarget().Options.NoNaNsFPMath) return Op; + // TODO: Propagate flags from the select rather than global settings. + SDNodeFlags Flags; + Flags.setNoInfs(true); + Flags.setNoNaNs(true); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); @@ -6033,7 +6075,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETNE: std::swap(TV, FV); case ISD::SETEQ: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6043,25 +6085,25 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6101,7 +6143,8 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store to the stack slot. SDValue Chain; @@ -6291,11 +6334,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64); - FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, - FPHalfs, FPHalfs, FPHalfs, FPHalfs); - + FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, + FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); if (Op.getValueType() != MVT::v4f64) @@ -6421,17 +6464,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; - RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; MachineMemOperand *MMO = @@ -6472,16 +6516,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; - RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; } @@ -6506,14 +6552,16 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Op.getOperand(0)); // STD the extended value into the stack slot. - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Ext64, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); // Load the value as a double. - Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, false, 0); + Ld = DAG.getLoad( + MVT::f64, dl, Store, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, false, 0); } // FCFID it and return it. @@ -6735,7 +6783,6 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); } - /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified /// amount. The result has the specified value type. static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, @@ -6768,7 +6815,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // to a zero vector to get the boolean result. 
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -6794,8 +6842,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); - else if (cast<ConstantSDNode>(BVN->getOperand(i))-> - getConstantIntValue()->isZero()) + else if (isNullConstant(BVN->getOperand(i))) continue; else CV[i] = One; @@ -6814,9 +6861,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, ValueVTs.push_back(MVT::Other); // chain SDVTList VTs = DAG.getVTList(ValueVTs); - return DAG.getMemIntrinsicNode(PPCISD::QVLFSb, - dl, VTs, Ops, MVT::v4f32, - MachinePointerInfo::getConstantPool()); + return DAG.getMemIntrinsicNode( + PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SmallVector<SDValue, 4> Stores; @@ -6915,7 +6962,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (SextVal >= -16 && SextVal <= 15) return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); - // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: @@ -7304,11 +7350,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, V1, V2, VPermMask); } -/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an -/// altivec comparison. If it is, return true and fill in Opc/isDot with +/// getVectorCompareInfo - Given an intrinsic, return false if it is not a +/// vector comparison. If it is, return true and fill in Opc/isDot with /// information about the intrinsic. 
-static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, - bool &isDot, const PPCSubtarget &Subtarget) { +static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, + bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; @@ -7321,12 +7367,11 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequd_p: + case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 199; - isDot = 1; - } - else + CompareOpc = 199; + isDot = 1; + } else return false; break; @@ -7335,28 +7380,48 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsd_p: + case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 967; - isDot = 1; - } - else + CompareOpc = 967; + isDot = 1; + } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtud_p: + case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 711; - isDot = 1; + CompareOpc = 711; + isDot = 1; + } else + return false; + + break; + // VSX predicate comparisons use the same infrastructure + case Intrinsic::ppc_vsx_xvcmpeqdp_p: + case Intrinsic::ppc_vsx_xvcmpgedp_p: + case Intrinsic::ppc_vsx_xvcmpgtdp_p: + case Intrinsic::ppc_vsx_xvcmpeqsp_p: + case Intrinsic::ppc_vsx_xvcmpgesp_p: + case Intrinsic::ppc_vsx_xvcmpgtsp_p: + if (Subtarget.hasVSX()) { + switch (IntrinsicID) { + case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; + case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; + case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; + case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; + case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; + case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; + } + isDot = 1; } - else + else return false; break; - + // Normal Comparisons. 
case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; @@ -7365,10 +7430,9 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequd: if (Subtarget.hasP8Altivec()) { - CompareOpc = 199; - isDot = 0; - } - else + CompareOpc = 199; + isDot = 0; + } else return false; break; @@ -7377,24 +7441,22 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsd: + case Intrinsic::ppc_altivec_vcmpgtsd: if (Subtarget.hasP8Altivec()) { - CompareOpc = 967; - isDot = 0; - } - else + CompareOpc = 967; + isDot = 0; + } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtud: + case Intrinsic::ppc_altivec_vcmpgtud: if (Subtarget.hasP8Altivec()) { - CompareOpc = 711; - isDot = 0; - } - else + CompareOpc = 711; + isDot = 0; + } else return false; break; @@ -7411,7 +7473,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); int CompareOpc; bool isDot; - if (!getAltivecCompareInfo(Op, CompareOpc, isDot, Subtarget)) + if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) return SDValue(); // Don't custom lower most intrinsics. // If this is a non-dot comparison, make the VCMP node and we are done. @@ -7536,7 +7598,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, FPHalfs, FPHalfs); - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, @@ -7545,7 +7607,8 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -7752,7 +7815,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, FPHalfs, FPHalfs); - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. 
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, @@ -7761,7 +7824,8 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -7798,11 +7862,10 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - Stores.push_back(DAG.getTruncStore(StoreChain, dl, Loads[i], Idx, - SN->getPointerInfo().getWithOffset(i), - MVT::i8 /* memory type */, - SN->isNonTemporal(), SN->isVolatile(), - 1 /* alignment */, SN->getAAInfo())); + Stores.push_back(DAG.getTruncStore( + StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), + MVT::i8 /* memory type */, SN->isNonTemporal(), SN->isVolatile(), + 1 /* alignment */, SN->getAAInfo())); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); @@ -7906,6 +7969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); @@ -7971,7 +8035,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), - N->getOperand(1)); + N->getOperand(1)); Results.push_back(NewInt); Results.push_back(NewInt.getValue(1)); @@ -8020,7 +8084,6 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, } } - //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// @@ -8089,8 +8152,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned ptrA = MI->getOperand(1).getReg(); @@ -8160,8 +8222,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned ptrA = MI->getOperand(1).getReg(); @@ -8283,8 +8344,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; + MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); @@ -8384,8 +8444,8 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addMBB(mainMBB); MIB = BuildMI(*thisMBB, MI, DL, 
TII->get(PPC::B)).addMBB(sinkMBB); - thisMBB->addSuccessor(mainMBB, /* weight */ 0); - thisMBB->addSuccessor(sinkMBB, /* weight */ 1); + thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); + thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); // mainMBB: // mainDstReg = 0 @@ -8562,8 +8622,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // To "insert" these instructions we actually have to insert their // control-flow patterns. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); @@ -8675,7 +8734,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // mfspr Rx,TBU # load from TBU // mfspr Ry,TB # load from TB // mfspr Rz,TBU # load from TBU - // cmpw crX,Rx,Rz # check if ‘old’=’new’ + // cmpw crX,Rx,Rz # check if 'old'='new' // bne readLoop # branch if they're not equal // ... @@ -9137,7 +9196,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, return SDValue(); } -bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { // Note: This functionality is used only when unsafe-fp-math is enabled, and // on cores with reciprocal estimates (which are used when unsafe-fp-math is // enabled for division), this functionality is redundant with the default @@ -9150,12 +9209,26 @@ bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { // one FP pipeline) for three or more FDIVs (for generic OOO cores). switch (Subtarget.getDarwinDirective()) { default: - return NumUsers > 2; + return 3; case PPC::DIR_440: case PPC::DIR_A2: case PPC::DIR_E500mc: case PPC::DIR_E5500: - return NumUsers > 1; + return 2; + } +} + +// isConsecutiveLSLoc needs to work even if all adds have not yet been +// collapsed, and so we need to look through chains of them. +static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, + int64_t& Offset, SelectionDAG &DAG) { + if (DAG.isBaseWithConstantOffset(Loc)) { + Base = Loc.getOperand(0); + Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue(); + + // The base might itself be a base plus an offset, and if so, accumulate + // that as well. 
+ getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); } } @@ -9178,16 +9251,18 @@ static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } - // Handle X+C - if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc && - cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes) + SDValue Base1 = Loc, Base2 = BaseLoc; + int64_t Offset1 = 0, Offset2 = 0; + getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); + getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); + if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) return true; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; - int64_t Offset1 = 0; - int64_t Offset2 = 0; + Offset1 = 0; + Offset2 = 0; bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) @@ -9343,7 +9418,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), IE = LoadRoots.end(); I != IE; ++I) { Queue.push_back(*I); - + while (!Queue.empty()) { SDNode *LoadRoot = Queue.pop_back_val(); if (!Visited.insert(LoadRoot).second) @@ -9470,7 +9545,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, } // Visit all inputs, collect all binary operations (and, or, xor and - // select) that are all fed by extensions. + // select) that are all fed by extensions. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); @@ -9492,7 +9567,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa<ConstantSDNode>(BinOp.getOperand(i))) { - Inputs.push_back(BinOp.getOperand(i)); + Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || @@ -9572,7 +9647,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, if (isa<ConstantSDNode>(Inputs[i])) continue; else - DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); + DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); } // Replace all operations (these are all the same, but have a different @@ -9682,7 +9757,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, SmallPtrSet<SDNode *, 16> Visited; // Visit all inputs, collect all binary operations (and, or, xor and - // select) that are all fed by truncations. + // select) that are all fed by truncations. 
while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); @@ -9701,7 +9776,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || isa<ConstantSDNode>(BinOp.getOperand(i))) { - Inputs.push_back(BinOp.getOperand(i)); + Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || @@ -9915,10 +9990,11 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, "Invalid extension type"); EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); SDValue ShiftCst = - DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); - return DAG.getNode(ISD::SRA, dl, N->getValueType(0), - DAG.getNode(ISD::SHL, dl, N->getValueType(0), - N->getOperand(0), ShiftCst), ShiftCst); + DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); + return DAG.getNode( + ISD::SRA, dl, N->getValueType(0), + DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), + ShiftCst); } SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, @@ -10102,16 +10178,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case PPCISD::SHL: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->isNullValue()) // 0 << V -> 0. + if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); - } break; case PPCISD::SRL: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->isNullValue()) // 0 >>u V -> 0. + if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. return N->getOperand(0); - } break; case PPCISD::SRA: if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { @@ -10122,7 +10194,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND: return DAGCombineExtBoolTrunc(N, DCI); case ISD::TRUNCATE: case ISD::SETCC: @@ -10277,7 +10349,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMMO = - MF.getMachineMemOperand(LD->getMemOperand(), -MemVT.getStoreSize()+1, + MF.getMachineMemOperand(LD->getMemOperand(), + -(long)MemVT.getStoreSize()+1, 2*MemVT.getStoreSize()-1); // Create the new base load. 
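The "-(long)MemVT.getStoreSize()+1" change in the hunk above fixes a signedness trap: getStoreSize() returns an unsigned value, so negating it wraps around instead of producing the intended negative MMO offset. A small standalone C++ demonstration (assumes 64-bit long, as on the LP64 hosts this code targets):

    #include <cstdint>
    #include <iostream>

    int main() {
      unsigned StoreSize = 16;              // stand-in for MemVT.getStoreSize()
      int64_t Wrong = -StoreSize + 1;       // unsigned negation wraps first
      int64_t Right = -(long)StoreSize + 1; // cast to signed, then negate
      std::cout << Wrong << " vs " << Right << "\n"; // 4294967281 vs -15
      return 0;
    }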
@@ -10527,7 +10600,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); - + if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero) { @@ -10558,8 +10631,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa<ConstantSDNode>(LHS.getOperand(1)) && - !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()-> - isZero()) + !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && @@ -10588,7 +10660,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && - getAltivecCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { + getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { assert(isDot && "Can't compare against a vector result!"); // If this is a comparison against something other than 0/1, then we know @@ -10739,8 +10811,11 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // boundary so that the entire loop fits in one instruction-cache line. uint64_t LoopSize = 0; for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) - for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) + for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { LoopSize += TII->GetInstSizeInBytes(J); + if (LoopSize > 32) + break; + } if (LoopSize > 16 && LoopSize <= 32) return 5; @@ -10868,17 +10943,19 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); - return std::make_pair(0U, &PPC::VRRCRegClass); + if (Subtarget.hasAltivec()) + return std::make_pair(0U, &PPC::VRRCRegClass); case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); } - } else if (Constraint == "wc") { // an individual CR bit. + } else if (Constraint == "wc" && Subtarget.useCRBits()) { + // An individual CR bit. return std::make_pair(0U, &PPC::CRBITRCRegClass); - } else if (Constraint == "wa" || Constraint == "wd" || - Constraint == "wf") { + } else if ((Constraint == "wa" || Constraint == "wd" || + Constraint == "wf") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); - } else if (Constraint == "ws") { - if (VT == MVT::f32) + } else if (Constraint == "ws" && Subtarget.hasVSX()) { + if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else return std::make_pair(0U, &PPC::VSFRCRegClass); @@ -10908,7 +10985,6 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return R; } - /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. 
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, @@ -11358,9 +11434,7 @@ bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0 || BitSize > 64) - return false; - return true; + return !(BitSize == 0 || BitSize > 64); } bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { @@ -11477,11 +11551,21 @@ PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { return ScratchRegs; } +unsigned PPCTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; +} + +unsigned PPCTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; +} + bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles( EVT VT , unsigned DefinedValues) const { if (VT == MVT::v2i64) - return false; + return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves if (Subtarget.hasQPX()) { if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h index 6e13533..44bcb89 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -79,6 +79,11 @@ namespace llvm { /// compute an allocation on the stack. DYNALLOC, + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an offset from native SP to the address of the most recent + /// dynamic alloca. + DYNAREAOFFSET, + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr /// at function entry, used for PIC code. GlobalBaseReg, @@ -423,6 +428,8 @@ namespace llvm { /// DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + bool useSoftFloat() const override; + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; } @@ -655,8 +662,17 @@ namespace llvm { return Ty->isArrayTy(); } - private: + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + private: struct ReuseLoadInfo { SDValue Ptr; SDValue Chain; @@ -719,6 +735,8 @@ namespace llvm { const PPCSubtarget &Subtarget) const; SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; + SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -853,7 +871,7 @@ namespace llvm { bool &UseOneConstNR) const override; SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const override; - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; CCAssignFn *useFastISelCCs(unsigned Flag) const; }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index d628330..79e4fe3 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -299,22 +299,35 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm), // 64-bit CR instructions let Interpretation64Bit = 1, isCodeGenOnly = 1 in { let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraSrcRegAllocReq = 1 in { def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. 
def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { @@ -369,6 +382,8 @@ let Defs = [X1], Uses = [X1] in def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", [(set i64:$result, (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", + [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; let Defs = [LR8] in { def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index d4e666c..dcff6ad 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -144,6 +144,9 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (!DefMI->getParent()) + return Latency; + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); unsigned Reg = DefMO.getReg(); @@ -186,6 +189,60 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; } +// This function does not list all associative and commutative operations, but +// only those worth feeding through the machine combiner in an attempt to +// reduce the critical path. Mostly, this means floating-point operations, +// because they have high latencies (compared to other operations, such as +// and/or, which are also associative and commutative, but have low latencies). +bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + // FP Add: + case PPC::FADD: + case PPC::FADDS: + // FP Multiply: + case PPC::FMUL: + case PPC::FMULS: + // Altivec Add: + case PPC::VADDFP: + // VSX Add: + case PPC::XSADDDP: + case PPC::XVADDDP: + case PPC::XVADDSP: + case PPC::XSADDSP: + // VSX Multiply: + case PPC::XSMULDP: + case PPC::XVMULDP: + case PPC::XVMULSP: + case PPC::XSMULSP: + // QPX Add: + case PPC::QVFADD: + case PPC::QVFADDS: + case PPC::QVFADDSs: + // QPX Multiply: + case PPC::QVFMUL: + case PPC::QVFMULS: + case PPC::QVFMULSs: + return true; + default: + return false; + } +} + +bool PPCInstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Using the machine combiner in this way is potentially expensive, so + // restrict to when aggressive optimizations are desired. + if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive) + return false; + + // FP reassociation is only legal when we don't need strict IEEE semantics. + if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) + return false; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + // Detect 32 -> 64-bit extensions where we may reuse the low sub-register. bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, @@ -259,16 +316,16 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -// commuteInstruction - We can commute rlwimi instructions, but only if the -// rotate amt is zero. We also have to munge the immediates a bit.
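The comment on isAssociativeAndCommutative above is about critical-path length: reassociating a chain of high-latency FP operations lets independent halves of the expression execute in parallel, which is exactly what the machine combiner exploits. A scalar illustration of the idea (not the combiner's actual rewrite; it is only legal under UnsafeFPMath because it changes rounding):

// Chained form: each add waits on the previous result, so the expression
// costs roughly 3x the FP-add latency.
double chained(double a, double b, double c, double d) {
  return ((a + b) + c) + d;
}

// Reassociated form: (a + b) and (c + d) are independent and can issue in
// parallel, cutting the critical path to roughly 2x the FP-add latency.
double reassociated(double a, double b, double c, double d) {
  return (a + b) + (c + d);
}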
-MachineInstr * -PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { MachineFunction &MF = *MI->getParent()->getParent(); // Normal instructions can be commuted the obvious way. if (MI->getOpcode() != PPC::RLWIMI && MI->getOpcode() != PPC::RLWIMIo) - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because // changing the relative order of the mask operands might change what happens @@ -286,6 +343,8 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Op0 = (Op2 & ~M) | (Op1 & M) // Swap op1/op2 + assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) && + "Only operands 1 and 2 can be swapped in RLWIMI/RLWIMIo."); unsigned Reg0 = MI->getOperand(0).getReg(); unsigned Reg1 = MI->getOperand(1).getReg(); unsigned Reg2 = MI->getOperand(2).getReg(); @@ -353,9 +412,9 @@ bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, if (AltOpc == -1) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; + // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1 + // and SrcOpIdx2. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); } void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -685,20 +744,43 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, "isel is for regular integer GPRs only"); unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL; - unsigned SelectPred = Cond[0].getImm(); + auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm()); unsigned SubIdx; bool SwapOps; switch (SelectPred) { - default: llvm_unreachable("invalid predicate for isel"); - case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; - case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; - case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; - case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; - case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; - case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; - case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; - case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; + case PPC::PRED_EQ: + case PPC::PRED_EQ_MINUS: + case PPC::PRED_EQ_PLUS: + SubIdx = PPC::sub_eq; SwapOps = false; break; + case PPC::PRED_NE: + case PPC::PRED_NE_MINUS: + case PPC::PRED_NE_PLUS: + SubIdx = PPC::sub_eq; SwapOps = true; break; + case PPC::PRED_LT: + case PPC::PRED_LT_MINUS: + case PPC::PRED_LT_PLUS: + SubIdx = PPC::sub_lt; SwapOps = false; break; + case PPC::PRED_GE: + case PPC::PRED_GE_MINUS: + case PPC::PRED_GE_PLUS: + SubIdx = PPC::sub_lt; SwapOps = true; break; + case PPC::PRED_GT: + case PPC::PRED_GT_MINUS: + case PPC::PRED_GT_PLUS: + SubIdx = PPC::sub_gt; SwapOps = false; break; + case PPC::PRED_LE: + case PPC::PRED_LE_MINUS: + case PPC::PRED_LE_PLUS: + SubIdx = PPC::sub_gt; SwapOps = true; break; + case PPC::PRED_UN: + case PPC::PRED_UN_MINUS: + case PPC::PRED_UN_PLUS: + SubIdx = PPC::sub_un; SwapOps = false; break; + case PPC::PRED_NU: + case PPC::PRED_NU_MINUS: + case PPC::PRED_NU_PLUS: + SubIdx = PPC::sub_un; SwapOps = true; break; case PPC::PRED_BIT_SET: SubIdx = 0;
SwapOps = false; break; case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break; } @@ -996,11 +1078,10 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MBB.insert(MI, NewMIs[i]); const MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); NewMIs.back()->addMemOperand(MF, MMO); } @@ -1109,11 +1190,10 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MBB.insert(MI, NewMIs[i]); const MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); NewMIs.back()->addMemOperand(MF, MMO); } @@ -1214,7 +1294,7 @@ bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB)); } @@ -1691,13 +1771,13 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, MI->setDesc(NewDesc); if (NewDesc.ImplicitDefs) - for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs(); + for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs(); *ImpDefs; ++ImpDefs) if (!MI->definesRegister(*ImpDefs)) MI->addOperand(*MI->getParent()->getParent(), MachineOperand::CreateReg(*ImpDefs, true, true)); if (NewDesc.ImplicitUses) - for (const uint16_t *ImpUses = NewDesc.getImplicitUses(); + for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses(); *ImpUses; ++ImpUses) if (!MI->readsRegister(*ImpUses)) MI->addOperand(*MI->getParent()->getParent(), @@ -1737,3 +1817,35 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { } } +std::pair<unsigned, unsigned> +PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = PPCII::MO_ACCESS_MASK; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_LO, "ppc-lo"}, + {MO_HA, "ppc-ha"}, + {MO_TPREL_LO, "ppc-tprel-lo"}, + {MO_TPREL_HA, "ppc-tprel-ha"}, + {MO_DTPREL_LO, "ppc-dtprel-lo"}, + {MO_TLSLD_LO, "ppc-tlsld-lo"}, + {MO_TOC_LO, "ppc-toc-lo"}, + {MO_TLS, "ppc-tls"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PLT_OR_STUB, "ppc-plt-or-stub"}, + {MO_PIC_FLAG, "ppc-pic"}, + {MO_NLP_FLAG, "ppc-nlp"}, + {MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}}; + return makeArrayRef(TargetFlags); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 
40badae..c3c3a48 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -79,6 +79,23 @@ class PPCInstrInfo : public PPCGenInstrInfo { SmallVectorImpl<MachineInstr*> &NewMIs, bool &NonRI, bool &SpillsVRS) const; virtual void anchor(); + +protected: + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for a + /// non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even if the instruction is commutable, the method may still + /// fail to commute the operands; a null pointer is returned in such cases. + /// + /// For example, we can commute rlwimi instructions, but only if the + /// rotate amt is zero. We also have to munge the immediates a bit. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + public: explicit PPCInstrInfo(PPCSubtarget &STI); @@ -119,6 +136,19 @@ public: return false; } + bool useMachineCombiner() const override { + return true; + } + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. All potential patterns are + /// output in the <Pattern> array. + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &P) const override; + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; @@ -127,10 +157,6 @@ public: unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - // commuteInstruction - We can commute rlwimi instructions, but only if the - // rotate amt is zero. We also have to munge the immediates a bit. - MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; - bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; @@ -183,7 +209,7 @@ public: // profitable to use the predicated branches.
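The doc comment above spells out the contract callers must honor: commutation can fail even for a commutable instruction, and failure is signaled by a null return. A hedged sketch of the resulting call-site pattern (the public commuteInstruction wrapper dispatches to commuteInstructionImpl; MI and TII are assumed to be in scope):

// NewMI == false requests in-place commutation; a null result means the
// target refused (for RLWIMI, any nonzero rotate amount).
if (MachineInstr *Commuted = TII->commuteInstruction(MI, /*NewMI=*/false)) {
  // Commutation succeeded; for RLWIMI the mask immediates were re-munged
  // so semantics are preserved with the operands swapped.
  (void)Commuted;
}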
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override { + BranchProbability Probability) const override { return true; } @@ -191,12 +217,10 @@ public: unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, - const BranchProbability - &Probability) const override { + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override { return true; } @@ -239,6 +263,15 @@ public: unsigned GetInstSizeInBytes(const MachineInstr *MI) const; void getNoopForMachoTarget(MCInst &NopInst) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; }; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 24fd9bd..ce0f9e6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; +def SDTDynAreaOp : SDTypeProfile<1, 1, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; +def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. @@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", [(set i32:$result, (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", + [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. @@ -2295,22 +2299,35 @@ def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), "#RESTORE_VRSAVE", []>; let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. 
+let hasExtraSrcRegAllocReq = 1 in { def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 // Pseudo instruction to perform FADD in round-to-zero mode. @@ -3883,8 +3900,11 @@ def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; -def : InstAlias<"cntlz $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; -def : InstAlias<"cntlz. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +// The POWER variant +def : MnemonicAlias<"cntlz", "cntlzw">; +def : MnemonicAlias<"cntlz.", "cntlzw.">; def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td index 0a044c5..4312007 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td @@ -839,31 +839,31 @@ def : Pat<(v4f64 (scalar_to_vector f64:$A)), def : Pat<(v4f32 (scalar_to_vector f32:$A)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 0)), +def : Pat<(f64 (extractelt v4f64:$S, 0)), (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 0)), +def : Pat<(f32 (extractelt v4f32:$S, 0)), (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 1)), +def : Pat<(f64 (extractelt v4f64:$S, 1)), (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 2)), +def : Pat<(f64 (extractelt v4f64:$S, 2)), (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 3)), +def : Pat<(f64 (extractelt v4f64:$S, 3)), (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 1)), +def : Pat<(f32 (extractelt v4f32:$S, 1)), (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 2)), +def : Pat<(f32 (extractelt v4f32:$S, 2)), (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 3)), +def : Pat<(f32 (extractelt v4f32:$S, 3)), (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, i64:$F)), +def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), (EXTRACT_SUBREG (QVFPERM $S, $S, (QVLPCLSXint (RLDICR $F, 2, /* 63-2 = */ 61))), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, i64:$F)), +def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), (EXTRACT_SUBREG (QVFPERMs $S, $S, (QVLPCLSXint (RLDICR $F, 2, /* 63-2 = */ 61))), diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td index ce63c22..df1142c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -67,17 +67,19 @@ def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; 
-multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, - string asmbase, string asmstr, InstrItinClass itin, - list<dag> pattern> { +multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, + string asmstr, InstrItinClass itin, Intrinsic Int, + ValueType OutTy, ValueType InTy> { let BaseName = asmbase in { - def NAME : XX3Form_Rc<opcode, xo, OOL, IOL, + def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), !strconcat(asmbase, !strconcat(" ", asmstr)), itin, - pattern>; + [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>; let Defs = [CR6] in - def o : XX3Form_Rc<opcode, xo, OOL, IOL, + def o : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), !strconcat(asmbase, !strconcat(". ", asmstr)), itin, - []>, isDOT; + [(set InTy:$XT, + (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>, + isDOT; } } @@ -456,35 +458,23 @@ let Uses = [RM] in { "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>; defm XVCMPEQDP : XX3Form_Rcr<60, 99, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpeqdp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>; defm XVCMPEQSP : XX3Form_Rcr<60, 67, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpeqsp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>; defm XVCMPGEDP : XX3Form_Rcr<60, 115, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpgedp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpgedp, v2i64, v2f64>; defm XVCMPGESP : XX3Form_Rcr<60, 83, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpgesp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpgesp, v4i32, v4f32>; defm XVCMPGTDP : XX3Form_Rcr<60, 107, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpgtdp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>; defm XVCMPGTSP : XX3Form_Rcr<60, 75, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpgtsp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>; // Move Instructions def XSABSDP : XX2Form<60, 345, @@ -845,9 +835,9 @@ let Predicates = [IsBigEndian] in { def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 0)), +def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG $S, sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 1)), +def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; } @@ -856,9 +846,9 @@ def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64), (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>; -def : Pat<(f64 (vector_extract v2f64:$S, 0)), +def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 1)), +def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG $S, sub_64))>; } @@ -1206,6 +1196,23 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } + + // Single Precision Conversions (FP <-> INT) + def XSCVSXDSP : XX2Form<60, 312, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvsxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfids f64:$XB))]>; + def XSCVUXDSP : XX2Form<60, 296, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvuxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfidus f64:$XB))]>; + + // Conversions between vector and scalar single precision + def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB), + "xscvdpspn $XT, $XB", IIC_VecFP, []>; + def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB), + "xscvspdpn $XT, $XB", IIC_VecFP, []>; + } // AddedComplexity = 400 } // HasP8Vector @@ -1229,3 +1236,550 @@ let Predicates = [HasDirectMove, HasVSX] in { "mtvsrwz $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsrz i32:$rA))]>; } // HasDirectMove, HasVSX + +/* Direct moves of various widths from GPR's into VSR's. Each move lines + the value up into element 0 (both BE and LE). Namely, entities smaller than + a doubleword are shifted left and moved for BE. For LE, they're moved, then + swapped to go into the least significant element of the VSR. +*/ +def MovesToVSR { + dag BE_BYTE_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7)); + dag BE_HALF_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15)); + dag BE_WORD_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31)); + dag BE_DWORD_0 = (MTVSRD $A); + + dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32)); + dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + LE_MTVSRW, sub_64)); + dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2); + dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + BE_DWORD_0, sub_64)); + dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2); +} + +/* Patterns for extracting elements out of vectors. Integer elements are + extracted using direct move operations. Patterns for extracting elements + whose indices are not available at compile time are also provided with + various _VARIABLE_ patterns. + The numbering for the DAG's is for LE, but when used on BE, the correct + LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13). 
+*/ +def VectorExtractions { + // Doubleword extraction + dag LE_DWORD_0 = + (MFVSRD + (EXTRACT_SUBREG + (XXPERMDI (COPY_TO_REGCLASS $S, VSRC), + (COPY_TO_REGCLASS $S, VSRC), 2), sub_64)); + dag LE_DWORD_1 = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + + // Word extraction + dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 2), sub_64)); + dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64)); + dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64)); + + // Halfword extraction + dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32)); + dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32)); + dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32)); + dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32)); + dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32)); + dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32)); + dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32)); + dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32)); + + // Byte extraction + dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32)); + dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32)); + dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32)); + dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32)); + dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32)); + dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32)); + dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32)); + dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32)); + dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32)); + dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32)); + dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32)); + dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32)); + dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32)); + dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32)); + dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32)); + dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32)); + + /* Variable element number (BE and LE patterns must be specified separately) + This is a rather involved process. + + Conceptually, this is how the move is accomplished: + 1. Identify which doubleword contains the element + 2. Shift in the VMX register so that the correct doubleword is correctly + lined up for the MFVSRD + 3. Perform the move so that the element (along with some extra stuff) + is in the GPR + 4. Right shift within the GPR so that the element is right-justified + + Of course, the index is an element number which has a different meaning + on LE/BE so the patterns have to be specified separately. + + Note: The final result will be the element right-justified with high + order bits being arbitrarily defined (namely, whatever was in the + vector register to the left of the value originally). + */ + + /* LE variable byte + Number 1. 
above: + - For elements 0-7, we shift left by 8 bytes since they're on the right + - For elements 8-15, we need not shift (shift left by zero bytes) + This is accomplished by inverting the bits of the index and AND-ing + with 0x8 (i.e. clearing all bits of the index and inverting bit 60). + */ + dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-7 (8-15 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 8 as we need to shift right by the number of bits, not bytes + - Shift right in the GPR by the calculated value + */ + dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60), + sub_32); + dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT), + sub_32); + + /* LE variable halfword + Number 1. above: + - For elements 0-3, we shift left by 8 since they're on the right + - For elements 4-7, we need not shift (shift left by zero bytes) + Similarly to the byte pattern, we invert the bits of the index, but we + AND with 0x4 (i.e. clear all bits of the index and invert bit 61). + Of course, the shift is still by 8 bytes, so we must multiply by 2. + */ + dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-3 (4-7 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 16 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59), + sub_32); + dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT), + sub_32); + + /* LE variable word + Number 1. above: + - For elements 0-1, we shift left by 8 since they're on the right + - For elements 2-3, we need not shift + */ + dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-1 (2-3 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 32 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58), + sub_32); + dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT), + sub_32); + + /* LE variable doubleword + Number 1. 
above: + - For element 0, we shift left by 8 since it's on the right + - For element 1, we need not shift + */ + dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + // - Number 4. is not needed for the doubleword as the value is 64-bits + dag LE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* LE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61)); + dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC); + dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE); + + /* LE variable double + Same as the LE doubleword except there is no move. + */ + dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + LE_VDWORD_PERM_VEC); + dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC); + + /* BE variable byte + The algorithm here is the same as the LE variable byte except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x8 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-7 + */ + dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8)); + dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC); + dag BE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)), + sub_64)); + dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60), + sub_32); + dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT), + sub_32); + + /* BE variable halfword + The algorithm here is the same as the LE variable halfword except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x4 and multiply by 2 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-3 + */ + dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62)); + dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC); + dag BE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)), + sub_64)); + dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59), + sub_32); + dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT), + sub_32); + + /* BE variable word + The algorithm is the same as the LE variable word except: + - The shift in the VMX register happens for opposite element numbers + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-1 + */ + dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61)); + dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC); + dag BE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)), + sub_64)); + dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58), + sub_32); + dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT), + sub_32); + + /* BE variable doubleword + 
Same as the LE doubleword except we shift in the VMX register for opposite + element indices. + */ + dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60)); + dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* BE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61)); + dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC); + dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE); + + /* BE variable double + Same as the BE doubleword except there is no move. + */ + dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); +} + +// v4f32 scalar <-> vector conversions (BE) +let Predicates = [IsBigEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XSCVDPSPN $A))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.BE_VARIABLE_FLOAT)>; +} // IsBigEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsBigEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>; + +let Predicates = [IsBigEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (BE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 
13)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_1)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (BE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.BE_VARIABLE_DWORD)>; +} // IsBigEndian, HasDirectMove + +// v4f32 scalar <-> vector conversions (LE) +let Predicates = [IsLittleEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.LE_VARIABLE_FLOAT)>; +} // IsLittleEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsLittleEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>; + +let Predicates = [IsLittleEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (LE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 MovesToVSR.LE_WORD_0)>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 MovesToVSR.LE_DWORD_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_1)>; + def 
: Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (LE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.LE_VARIABLE_DWORD)>; +} // IsLittleEndian, HasDirectMove + +let Predicates = [HasDirectMove, HasVSX] in { +// bitconvert f32 -> i32 +// (convert to 32-bit fp single, shift right 1 word, move to GPR) +def : Pat<(i32 (bitconvert f32:$S)), + (i32 (MFVSRWZ (EXTRACT_SUBREG + (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3), + sub_64)))>; +// bitconvert i32 -> f32 +// (move to FPR, shift left 1 word, convert to 64-bit fp single) +def : Pat<(f32 (bitconvert i32:$A)), + (f32 (XSCVSPDPN + (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>; + +// bitconvert f64 -> i64 +// (move to GPR, nothing else needed) +def : 
Pat<(i64 (bitconvert f64:$S)), + (i64 (MFVSRD $S))>; + +// bitconvert i64 -> f64 +// (move to FPR, nothing else needed) +def : Pat<(f64 (bitconvert i64:$S)), + (f64 (MTVSRD $S))>; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp index b4e1c09..e3a35d5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -71,10 +72,10 @@ namespace { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); // FIXME: For some reason, preserving SE here breaks LSR (even if // this pass changes nothing). - // AU.addPreserved<ScalarEvolution>(); + // AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } @@ -96,7 +97,7 @@ INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", "PPC Loop Data Prefetch", false, false) @@ -104,7 +105,7 @@ FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPref bool PPCLoopDataPrefetch::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DL = &F.getParent()->getDataLayout(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index b6e7799..5e18826 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -73,7 +73,7 @@ namespace { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } bool runOnFunction(Function &F) override; @@ -84,8 +84,10 @@ namespace { private: PPCTargetMachine *TM; + DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; + bool PreserveLCSSA; }; } @@ -93,7 +95,7 @@ char PPCLoopPreIncPrep::ID = 0; static const char *name = "Prepare loop for pre-inc. 
addressing modes"; INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { @@ -101,17 +103,20 @@ FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { } namespace { - struct SCEVLess : std::binary_function<const SCEV *, const SCEV *, bool> - { - SCEVLess(ScalarEvolution *SE) : SE(SE) {} + struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - bool operator() (const SCEV *X, const SCEV *Y) const { - const SCEV *Diff = SE->getMinusSCEV(X, Y); - return cast<SCEVConstant>(Diff)->getValue()->getSExtValue() < 0; - } + const SCEVConstant *Offset; + Instruction *Instr; + }; - protected: - ScalarEvolution *SE; + struct Bucket { + Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), + Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector<BucketElement, 16> Elements; }; } @@ -140,7 +145,10 @@ static Value *GetPointerOperand(Value *MemI) { bool PPCLoopPreIncPrep::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); bool MadeChange = false; @@ -169,7 +177,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { std::distance(pred_begin(Header), pred_end(Header)); // Collect buckets of comparable addresses used by loads and stores. - typedef std::multimap<const SCEV *, Instruction *, SCEVLess> Bucket; SmallVector<Bucket, 16> Buckets; for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) { @@ -212,25 +219,24 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { } bool FoundBucket = false; - for (unsigned i = 0, e = Buckets.size(); i != e; ++i) - for (Bucket::iterator K = Buckets[i].begin(), KE = Buckets[i].end(); - K != KE; ++K) { - const SCEV *Diff = SE->getMinusSCEV(K->first, LSCEV); - if (isa<SCEVConstant>(Diff)) { - Buckets[i].insert(std::make_pair(LSCEV, MemI)); - FoundBucket = true; - break; - } + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; } + } if (!FoundBucket) { - Buckets.push_back(Bucket(SCEVLess(SE))); - Buckets[Buckets.size()-1].insert(std::make_pair(LSCEV, MemI)); + if (Buckets.size() == MaxVars) + return MadeChange; + Buckets.push_back(Bucket(LSCEV, MemI)); } } } - if (Buckets.empty() || Buckets.size() > MaxVars) + if (Buckets.empty()) return MadeChange; BasicBlock *LoopPredecessor = L->getLoopPredecessor(); @@ -239,7 +245,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // iteration space), insert a new preheader for the loop. 
if (!LoopPredecessor || !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { - LoopPredecessor = InsertPreheaderForLoop(L, this); + LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (LoopPredecessor) MadeChange = true; } @@ -253,8 +259,45 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // The base address of each bucket is transformed into a phi and the others // are rewritten as offsets of that variable. + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. + for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do. + if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. + if (!Buckets[i].Elements[j].Offset || + Buckets[i].Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = Buckets[i].Elements[j].Offset; + Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); + for (auto &E : Buckets[i].Elements) { + if (E.Offset) + E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); + } + + std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); + break; + } + const SCEVAddRecExpr *BasePtrSCEV = - cast<SCEVAddRecExpr>(Buckets[i].begin()->first); + cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV); if (!BasePtrSCEV->isAffine()) continue; @@ -262,7 +305,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); - Instruction *MemI = Buckets[i].begin()->second; + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. + Instruction *MemI = Buckets[i].Elements.begin()->Instr; Value *BasePtr = GetPointerOperand(MemI); assert(BasePtr && "No pointer operand"); @@ -302,7 +347,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { NewPHI->addIncoming(BasePtrStart, LoopPredecessor); } - Instruction *InsPoint = Header->getFirstInsertionPt(); + Instruction *InsPoint = &*Header->getFirstInsertionPt(); GetElementPtrInst *PtrInc = GetElementPtrInst::Create( I8Ty, NewPHI, BasePtrIncSCEV->getValue(), MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); @@ -327,18 +372,20 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { BasePtr->replaceAllUsesWith(NewBasePtr); RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - Value *LastNewPtr = NewBasePtr; - for (Bucket::iterator I = std::next(Buckets[i].begin()), - IE = Buckets[i].end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->second); + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. 
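The base-swap logic above reduces to simple offset arithmetic: when the element at offset D becomes the new base, the base advances by D and every member offset decreases by D. A standalone sketch (rebaseOnElement and the plain integer offsets are illustrative stand-ins for the SCEV manipulation):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

struct Bucket {
  long Base;                 // stand-in for BaseSCEV
  std::vector<long> Offsets; // Offsets[0] belongs to the base element
};

// Promote the element at index J to be the new base, as the loop above does:
// fold its offset into the base, re-express all offsets relative to it, and
// move it to the front.
void rebaseOnElement(Bucket &B, std::size_t J) {
  long D = B.Offsets[J];
  B.Base += D;
  for (long &Off : B.Offsets)
    Off -= D;
  std::swap(B.Offsets[J], B.Offsets[0]);
}

int main() {
  Bucket B{100, {0, 8, 16}};
  rebaseOnElement(B, 2);     // make the +16 element the new base
  assert(B.Base == 116);
  assert(B.Offsets[0] == 0); // the new base element has offset zero
  assert(B.Offsets[1] == -8 && B.Offsets[2] == -16);
}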
+ SmallPtrSet<Value *, 16> NewPtrs; + NewPtrs.insert( NewBasePtr); + + for (auto I = std::next(Buckets[i].Elements.begin()), + IE = Buckets[i].Elements.end(); I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); assert(Ptr && "No pointer operand"); - if (Ptr == LastNewPtr) + if (NewPtrs.count(Ptr)) continue; Instruction *RealNewPtr; - const SCEVConstant *Diff = - cast<SCEVConstant>(SE->getMinusSCEV(I->first, BasePtrSCEV)); - if (Diff->isZero()) { + if (!I->Offset || I->Offset->getValue()->isZero()) { RealNewPtr = NewBasePtr; } else { Instruction *PtrIP = dyn_cast<Instruction>(Ptr); @@ -346,13 +393,13 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent()) PtrIP = 0; else if (isa<PHINode>(PtrIP)) - PtrIP = PtrIP->getParent()->getFirstInsertionPt(); + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); else if (!PtrIP) - PtrIP = I->second; + PtrIP = I->Instr; GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, Diff->getValue(), - I->second->hasName() ? I->second->getName() + ".off" : "", PtrIP); + I8Ty, PtrInc, I->Offset->getValue(), + I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP); if (!PtrIP) NewPtr->insertAfter(cast<Instruction>(PtrInc)); NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); @@ -373,7 +420,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { Ptr->replaceAllUsesWith(ReplNewPtr); RecursivelyDeleteTriviallyDeadInstructions(Ptr); - LastNewPtr = RealNewPtr; + NewPtrs.insert(RealNewPtr); } MadeChange = true; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 76837ec..44a692d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -38,7 +38,7 @@ static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) { static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ const TargetMachine &TM = AP.TM; Mangler *Mang = AP.Mang; - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = AP.getDataLayout(); MCContext &Ctx = AP.OutContext; bool isDarwin = TM.getTargetTriple().isOSDarwin(); @@ -51,13 +51,13 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ Suffix = "$non_lazy_ptr"; if (!Suffix.empty()) - Name += DL->getPrivateGlobalPrefix(); + Name += DL.getPrivateGlobalPrefix(); unsigned PrefixLen = Name.size(); if (!MO.isGlobal()) { assert(MO.isSymbol() && "Isn't a symbol reference"); - Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else { const GlobalValue *GV = MO.getGlobal(); TM.getNameWithPrefix(Name, GV, *Mang); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp new file mode 100644 index 0000000..fe339d7 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -0,0 +1,230 @@ +//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass performs peephole optimizations to clean up ugly code +// sequences at the MachineInstruction layer. It runs at the end of +// the SSA phases, following VSX swap removal. 
A pass of dead code +// elimination follows this one for quick clean-up of any dead +// instructions introduced here. Although we could do this as callbacks +// from the generic peephole pass, this would have a couple of bad +// effects: it might remove optimization opportunities for VSX swap +// removal, and it would miss cleanups made possible following VSX +// swap removal. +// +//===---------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-mi-peepholes" + +namespace llvm { + void initializePPCMIPeepholePass(PassRegistry&); +} + +namespace { + +struct PPCMIPeephole : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + PPCMIPeephole() : MachineFunctionPass(ID) { + initializePPCMIPeepholePass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + // Perform peepholes. + bool simplifyCode(void); + + // Find the "true" register represented by SrcReg (following chains + // of copies and subreg_to_reg operations). + unsigned lookThruCopyLike(unsigned SrcReg); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + initialize(MF); + return simplifyCode(); + } +}; + +// Initialize class variables. +void PPCMIPeephole::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); + DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); + DEBUG(MF->dump()); +} + +// Perform peephole optimizations. +bool PPCMIPeephole::simplifyCode(void) { + bool Simplified = false; + MachineInstr* ToErase = nullptr; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + + // If the previous instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // Ignore debug instructions. + if (MI.isDebugValue()) + continue; + + // Per-opcode peepholes. + switch (MI.getOpcode()) { + + default: + break; + + case PPC::XXPERMDI: { + // Perform simplifications of 2x64 vector swaps and splats. + // A swap is identified by an immediate value of 2, and a splat + // is identified by an immediate value of 0 or 3. + int Immed = MI.getOperand(3).getImm(); + + if (Immed != 1) { + + // For each of these simplifications, we need the two source + // regs to match. Unfortunately, MachineCSE ignores COPY and + // SUBREG_TO_REG, so for example we can see + // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. + // We have to look through chains of COPY and SUBREG_TO_REG + // to find the real source values for comparison. + unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg()); + unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg()); + + if (TrueReg1 == TrueReg2 + && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { + MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); + + // If this is a splat or a swap fed by another splat, we + // can replace it with a copy. 
+ if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { + unsigned FeedImmed = DefMI->getOperand(3).getImm(); + unsigned FeedReg1 + = lookThruCopyLike(DefMI->getOperand(1).getReg()); + unsigned FeedReg2 + = lookThruCopyLike(DefMI->getOperand(2).getReg()); + + if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { + DEBUG(dbgs() + << "Optimizing splat/swap or splat/splat " + "to splat/copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)); + ToErase = &MI; + Simplified = true; + } + + // If this is a splat fed by a swap, we can modify + // the splat to splat the other value from the swap's input + // parameter. + else if ((Immed == 0 || Immed == 3) + && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/splat => splat: "); + DEBUG(MI.dump()); + MI.getOperand(1).setReg(DefMI->getOperand(1).getReg()); + MI.getOperand(2).setReg(DefMI->getOperand(2).getReg()); + MI.getOperand(3).setImm(3 - Immed); + Simplified = true; + } + + // If this is a swap fed by a swap, we can replace it + // with a copy from the first swap's input. + else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/swap => copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(DefMI->getOperand(1)); + ToErase = &MI; + Simplified = true; + } + } + } + } + break; + } + } + } + + // If the last instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + } + + return Simplified; +} + +// This is used to find the "true" source register for an +// XXPERMDI instruction, since MachineCSE does not handle the +// "copy-like" operations (Copy and SubregToReg). Returns +// the original SrcReg unless it is the target of a copy-like +// operation, in which case we chain backwards through all +// such operations to the ultimate source register. If a +// physical register is encountered, we stop the search.
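The three XXPERMDI rewrites above follow from the instruction's selector algebra when both sources are the same register: with the two doublewords modeled as array elements, the result is { a[(imm >> 1) & 1], a[imm & 1] }, so imm 2 swaps the doublewords and imm 0/3 splat one of them, as the comments state. A standalone check of the rules (xxpermdi here is a two-element model, not the LLVM API):

#include <array>
#include <cassert>

// Model of XXPERMDI with identical source registers.
static std::array<int, 2> xxpermdi(std::array<int, 2> A, unsigned Imm) {
  return {A[(Imm >> 1) & 1], A[Imm & 1]};
}

int main() {
  std::array<int, 2> A = {10, 20};

  // Rule 1: a splat or swap fed by a splat is a copy of the splat.
  for (unsigned Outer : {0u, 2u, 3u})
    assert(xxpermdi(xxpermdi(A, 0), Outer) == xxpermdi(A, 0));

  // Rule 2: a splat fed by a swap splats the other element: imm' = 3 - imm.
  assert(xxpermdi(xxpermdi(A, 2), 0) == xxpermdi(A, 3 - 0));
  assert(xxpermdi(xxpermdi(A, 2), 3) == xxpermdi(A, 3 - 3));

  // Rule 3: a swap fed by a swap is a copy of the original value.
  assert(xxpermdi(xxpermdi(A, 2), 2) == A);
}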
+unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) { + + while (true) { + + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + return CopySrcReg; + + SrcReg = CopySrcReg; + } +} + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) +INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) + +char PPCMIPeephole::ID = 0; +FunctionPass* +llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); } + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index ec4e0a5..9d91e31 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -18,8 +18,29 @@ using namespace llvm; void PPCFunctionInfo::anchor() { } MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { - const DataLayout *DL = MF.getTarget().getDataLayout(); - return MF.getContext().getOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) + + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + Twine(MF.getFunctionNumber()) + "$poff"); } + +MCSymbol *PPCFunctionInfo::getGlobalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_gep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getLocalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_lep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_toc" + + Twine(MF.getFunctionNumber())); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 607cdf6..10a8ce0 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -197,6 +197,10 @@ public: bool usesPICBase() const { return UsesPICBase; } MCSymbol *getPICOffsetSymbol() const; + + MCSymbol *getGlobalEPSymbol() const; + MCSymbol *getLocalEPSymbol() const; + MCSymbol *getTOCOffsetSymbol() const; }; } // end of namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2b09b2f..934bdf6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -200,7 +200,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(PPC::R2); // System-reserved register Reserved.set(PPC::R13); // Small Data Area pointer register } - + // On PPC64, r13 is the thread pointer. Never allocate this register.
if (TM.isPPC64()) { Reserved.set(PPC::R13); @@ -262,7 +262,7 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, default: return 0; case PPC::G8RC_NOX0RegClassID: - case PPC::GPRC_NOR0RegClassID: + case PPC::GPRC_NOR0RegClassID: case PPC::G8RCRegClassID: case PPC::GPRCRegClassID: { unsigned FP = TFI->hasFP(MF) ? 1 : 0; @@ -311,7 +311,7 @@ PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, //===----------------------------------------------------------------------===// /// lowerDynamicAlloc - Generate the code for allocating an object in the -/// current frame. The sequence of code with be in the general form +/// current frame. The sequence of code will be in the general form /// /// addi R0, SP, \#frameSize ; get the address of the previous frame /// stwxu R0, SP, Rnegsize ; add and update the SP with the negated size @@ -337,7 +337,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); // Get the total frame size. unsigned FrameSize = MFI->getStackSize(); - + // Get stack alignments. const PPCFrameLowering *TFI = getFrameLowering(MF); unsigned TargetAlign = TFI->getStackAlignment(); @@ -347,14 +347,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Determine the previous frame's address. If FrameSize can't be // represented as 16 bits or we need special alignment, then we load the - // previous frame's address from 0(SP). Why not do an addis of the hi? - // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. - // Constructing the constant and adding would take 3 instructions. + // previous frame's address from 0(SP). Why not do an addis of the hi? + // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. + // Constructing the constant and adding would take 3 instructions. // Fortunately, a frame greater than 32K is rare. const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - + if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) .addReg(PPC::R31) @@ -425,11 +425,32 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { .addReg(PPC::R1) .addImm(maxCallFrameSize); } - + // Discard the DYNALLOC instruction. MBB.erase(II); } +void PPCRegisterInfo::lowerDynamicAreaOffset( + MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + // Get the instruction info. + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + DebugLoc dl = MI.getDebugLoc(); + BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + .addImm(maxCallFrameSize); + MBB.erase(II); +} + /// lowerCRSpilling - Generate the code for spilling a CR register. Instead of /// reserving a whole register (R0), we scrounge for one here. 
This generates /// code like this: @@ -459,8 +480,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, // We need to store the CR in the low 4-bits of the saved value. First, issue // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg. BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) - .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); - + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + // If the saved register wasn't CR0, shift the bits left so that they are in // CR0's slot. if (SrcReg != PPC::CR0) { @@ -549,8 +570,8 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) - .addReg(getCRFromCRBit(SrcReg)); - + .addReg(getCRFromCRBit(SrcReg)); + // If the saved register wasn't CR0LT, shift the bits left so that the bit to // store is the first one. Mask all but that bit. unsigned Reg1 = Reg; @@ -602,17 +623,19 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, unsigned ShiftBits = getEncodingValue(DestReg); // rlwimi r11, r10, 32-ShiftBits, ..., ... BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO) - .addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill) - .addImm(ShiftBits ? 32-ShiftBits : 0) - .addImm(ShiftBits).addImm(ShiftBits); - + .addReg(RegO, RegState::Kill) + .addReg(Reg, RegState::Kill) + .addImm(ShiftBits ? 32 - ShiftBits : 0) + .addImm(ShiftBits) + .addImm(ShiftBits); + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), getCRFromCRBit(DestReg)) - .addReg(RegO, RegState::Kill) - // Make sure we have a use dependency all the way through this - // sequence of instructions. We can't have the other bits in the CR - // modified in between the mfocrf and the mtocrf. - .addReg(getCRFromCRBit(DestReg), RegState::Implicit); + .addReg(RegO, RegState::Kill) + // Make sure we have a use dependency all the way through this + // sequence of instructions. We can't have the other bits in the CR + // modified in between the mfocrf and the mtocrf. + .addReg(getCRFromCRBit(DestReg), RegState::Implicit); // Discard the pseudo instruction. MBB.erase(II); @@ -634,11 +657,11 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, unsigned SrcReg = MI.getOperand(0).getReg(); BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) - .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); - - addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW)) - .addReg(Reg, RegState::Kill), - FrameIndex); + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + addFrameReference( + BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill), + FrameIndex); // Discard the pseudo instruction. MBB.erase(II); @@ -671,9 +694,8 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, MBB.erase(II); } -bool -PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, - unsigned Reg, int &FrameIdx) const { +bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4 // ABI, return true to prevent allocating an additional frame slot. 
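The 32 - ShiftBits rotate amounts in the CR spill and restore lowerings above are inverse rotations: rotating left by 32 - N undoes a left rotation by N, and the ShiftBits ? ... : 0 guard keeps the amount in the valid 0..31 range when N is 0. A standalone check (rotl32 is a local helper, not an LLVM API):

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t X, unsigned N) {
  N &= 31; // rlwinm/rlwimi rotate amounts are 0..31; 32 behaves as 0
  return N ? (X << N) | (X >> (32 - N)) : X;
}

int main() {
  uint32_t X = 0x12345678u;
  for (unsigned N = 0; N < 32; ++N) {
    uint32_t Saved = rotl32(X, N);             // spill: rotate into slot 0
    assert(rotl32(Saved, (32 - N) & 31) == X); // restore: rotate back
  }
}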
@@ -752,7 +774,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FPSI = FI->getFramePointerSaveIndex(); // Get the instruction opcode. unsigned OpC = MI.getOpcode(); - + + if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) { + lowerDynamicAreaOffset(II); + return; + } + // Special case for dynamic alloca. if (FPSI && FrameIndex == FPSI && (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { @@ -800,8 +827,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we're not using a Frame Pointer that has been set to the value of the // SP before having the stack size subtracted from it, then add the stack size // to Offset to get the correct offset. - // Naked functions have stack size 0, although getStackSize may not reflect that - // because we didn't call all the pieces that compute it for naked functions. + // Naked functions have stack size 0, although getStackSize may not reflect + // that because we didn't call all the pieces that compute it for naked + // functions. if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) { if (!(hasBasePointer(MF) && FrameIndex < 0)) Offset += MFI->getStackSize(); @@ -840,7 +868,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, .addImm(Offset); // Convert into indexed form of the instruction: - // + // // sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0 // addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0 unsigned OperandBase; @@ -898,24 +926,6 @@ bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const { return needsStackRealignment(MF); } -bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const { - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - return true; -} - -bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const PPCFrameLowering *TFI = getFrameLowering(MF); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - /// Returns true if the instruction's frame index /// reference would be better served by a base register other than FP /// or SP. Used by LocalStackFrameAllocation to determine which frame index diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index d304e1d..b15fde8 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -54,13 +54,13 @@ inline static unsigned getCRFromCRBit(unsigned SrcReg) { return Reg; } - class PPCRegisterInfo : public PPCGenRegisterInfo { DenseMap<unsigned, unsigned> ImmToIdxMap; const PPCTargetMachine &TM; + public: PPCRegisterInfo(const PPCTargetMachine &TM); - + /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. 
const TargetRegisterClass * @@ -77,7 +77,7 @@ public: const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; void adjustStackMapLiveOutMask(uint32_t *Mask) const override; @@ -101,6 +101,7 @@ public: } void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerCRRestore(MachineBasicBlock::iterator II, @@ -115,9 +116,9 @@ public: unsigned FrameIndex) const; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, - int &FrameIdx) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, + int &FrameIdx) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; // Support for virtual base registers. @@ -136,8 +137,6 @@ public: // Base pointer (stack realignment) support. unsigned getBaseRegister(const MachineFunction &MF) const; bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 58dacca..c0fcb6c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -62,6 +62,7 @@ void PPCSubtarget::initializeEnvironment() { Has64BitSupport = false; Use64BitRegs = false; UseCRBits = false; + UseSoftFloat = false; HasAltivec = false; HasSPE = false; HasQPX = false; @@ -100,6 +101,8 @@ void PPCSubtarget::initializeEnvironment() { HasDirectMove = false; IsQPXStackUnaligned = false; HasHTM = false; + HasFusion = false; + HasFloat128 = false; } void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { @@ -210,5 +213,33 @@ bool PPCSubtarget::enableSubRegLiveness() const { return UseSubRegLiveness; } +unsigned char PPCSubtarget::classifyGlobalReference( + const GlobalValue *GV) const { + // Note that currently we don't generate non-pic references. + // If a caller wants that, this will have to be updated. + + // Large code model always uses the TOC even for local symbols. + if (TM.getCodeModel() == CodeModel::Large) + return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; + + unsigned char flags = PPCII::MO_PIC_FLAG; + + // Only if the relocation mode is PIC do we have to worry about + // interposition. In all other cases we can use a slightly looser standard to + // decide how to access the symbol. + if (TM.getRelocationModel() == Reloc::PIC_) { + // If it's local, or it's non-default, it can't be interposed. 
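The classification being built here (the non-PIC cases follow just below) amounts to a small decision table. A sketch with simplified inputs; Flags, GVInfo, and classify are illustrative stand-ins, not the patch's types:

#include <cassert>

enum Flags { PIC = 1, NLP = 2 }; // stand-ins for MO_PIC_FLAG / MO_NLP_FLAG
struct GVInfo { bool LocalLinkage, DefaultVisibility, StrongDef; };

// Large code model: always indirect through the TOC. PIC relocation: only a
// preemptible (non-local, default-visibility) symbol needs the TOC. Static:
// anything that is not a strong definition for the linker needs the TOC.
unsigned classify(bool LargeModel, bool PICReloc, GVInfo GV) {
  if (LargeModel)
    return PIC | NLP;
  if (PICReloc)
    return (!GV.LocalLinkage && GV.DefaultVisibility) ? (PIC | NLP) : PIC;
  return GV.StrongDef ? PIC : (PIC | NLP);
}

int main() {
  // A local or hidden symbol under PIC can be referenced directly.
  assert(classify(false, true, {true, false, true}) == PIC);
  // A preemptible symbol always takes the indirect (TOC) path.
  assert(classify(false, true, {false, true, true}) == (PIC | NLP));
  // Without PIC, a strong definition can still be referenced directly.
  assert(classify(false, false, {false, true, true}) == PIC);
}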
+ if (!GV->hasLocalLinkage() && + GV->hasDefaultVisibility()) { + flags |= PPCII::MO_NLP_FLAG; + } + return flags; + } + + if (GV->isStrongDefinitionForLinker()) + return flags; + return flags | PPCII::MO_NLP_FLAG; +} + bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h index 0616c1f..4f5c95c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -83,6 +83,7 @@ protected: bool Has64BitSupport; bool Use64BitRegs; bool UseCRBits; + bool UseSoftFloat; bool IsPPC64; bool HasAltivec; bool HasSPE; @@ -119,6 +120,8 @@ protected: bool HasPartwordAtomics; bool HasDirectMove; bool HasHTM; + bool HasFusion; + bool HasFloat128; /// When targeting QPX running a stock PPC64 Linux kernel where the stack /// alignment has not been changed, we need to keep the 16-byte alignment @@ -188,6 +191,8 @@ public: /// has64BitSupport - Return true if the selected CPU supports 64-bit /// instructions, regardless of whether we are in 32-bit or 64-bit mode. bool has64BitSupport() const { return Has64BitSupport; } + // useSoftFloat - Return true if soft-float option is turned on. + bool useSoftFloat() const { return UseSoftFloat; } /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit /// registers in 32-bit mode when possible. This can only be true if @@ -254,6 +259,8 @@ public: return 16; } bool hasHTM() const { return HasHTM; } + bool hasFusion() const { return HasFusion; } + bool hasFloat128() const { return HasFloat128; } const Triple &getTargetTriple() const { return TargetTriple; } @@ -285,6 +292,10 @@ public: bool useAA() const override; bool enableSubRegLiveness() const override; + + /// classifyGlobalReference - Classify a global variable reference for the + /// current subtarget according to how we should reference it. + unsigned char classifyGlobalReference(const GlobalValue *GV) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 2dc0d82..a9d2e88 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -99,6 +99,11 @@ protected: break; } + // Don't really need to save data to the stack - the clobbered + // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) + // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + // Expand into two ops built prior to the existing instruction.
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) .addReg(InReg); @@ -113,6 +118,8 @@ protected: .addReg(GPR3)); Call->addOperand(MI->getOperand(3)); + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); + BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 1daf244..d24b590 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -42,6 +42,10 @@ static cl:: opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); +static cl:: +opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden, + cl::desc("Disable machine peepholes for PPC")); + static cl::opt<bool> EnableGEPOpt("ppc-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), @@ -57,11 +61,19 @@ EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps", cl::desc("Add extra TOC register dependencies"), cl::init(true), cl::Hidden); +static cl::opt<bool> +EnableMachineCombinerPass("ppc-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target); RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target); RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializePPCBoolRetToIntPass(PR); } /// Return the datalayout string of a subtarget. @@ -118,7 +130,7 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, } if (OL != CodeGenOpt::None) { - if (!FullFS.empty()) + if (!FullFS.empty()) FullFS = "+invariant-function-descriptors," + FullFS; else FullFS = "+invariant-function-descriptors"; @@ -144,7 +156,7 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, return PPCTargetMachine::PPC_ABI_ELFv2; assert(Options.MCOptions.getABIName().empty() && - "Unknown target-abi option!"); + "Unknown target-abi option!"); if (!TT.isMacOSX()) { switch (TT.getArch()) { @@ -160,9 +172,9 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, return PPCTargetMachine::PPC_ABI_UNKNOWN; } -// The FeatureString here is a little subtle. We are modifying the feature string -// with what are (currently) non-function specific overrides as it goes into the -// LLVMTargetMachine constructor and then using the stored value in the +// The FeatureString here is a little subtle. We are modifying the feature +// string with what are (currently) non-function specific overrides as it goes +// into the LLVMTargetMachine constructor and then using the stored value in the // Subtarget constructor below it. PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -227,6 +239,19 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { ? FSAttr.getValueAsString().str() : TargetFS; + // FIXME: This is related to the code below to reset the target options; + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions.
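A sketch of the caching scheme this FIXME describes: the function's soft-float attribute is folded into the feature string, which then forms part of the subtarget cache key, so two functions differing only in that attribute get distinct subtargets. Subtarget and getSubtarget are illustrative stand-ins:

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Subtarget { std::string CPU, FS; }; // stand-in for PPCSubtarget

std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

Subtarget *getSubtarget(std::string CPU, std::string FS, bool SoftFloat) {
  if (SoftFloat) // mirror of the attribute handling below
    FS += FS.empty() ? "+soft-float" : ",+soft-float";
  auto &I = SubtargetMap[CPU + FS]; // key includes the derived feature
  if (!I)
    I = std::make_unique<Subtarget>(Subtarget{CPU, FS});
  return I.get();
}

int main() {
  // Same CPU and base features, but the soft-float function gets its own
  // subtarget because the derived feature changes the key.
  assert(getSubtarget("pwr8", "", false) != getSubtarget("pwr8", "", true));
}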
+ bool SoftFloat = + F.hasFnAttribute("use-soft-float") && + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + FS += FS.empty() ? "+soft-float" : ",+soft-float"; + auto &I = SubtargetMap[CPU + FS]; if (!I) { // This needs to be done before we create a new subtarget since any @@ -277,6 +302,8 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { } void PPCPassConfig::addIRPasses() { + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createPPCBoolRetToIntPass()); addPass(createAtomicExpandPass(&getPPCTargetMachine())); // For the BG/Q (or if explicitly requested), add explicit data prefetch @@ -316,6 +343,10 @@ bool PPCPassConfig::addPreISel() { bool PPCPassConfig::addILPOpts() { addPass(&EarlyIfConverterID); + + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); + return true; } @@ -339,6 +370,12 @@ void PPCPassConfig::addMachineSSAOptimization() { if (TM->getTargetTriple().getArch() == Triple::ppc64le && !DisableVSXSwapRemoval) addPass(createPPCVSXSwapRemovalPass()); + // Target-specific peephole cleanups performed after instruction + // selection. + if (!DisableMIPeephole) { + addPass(createPPCMIPeepholePass()); + addPass(&DeadMachineInstructionElimID); + } } void PPCPassConfig::addPreRegAlloc() { @@ -364,6 +401,7 @@ void PPCPassConfig::addPreEmitPass() { } TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(PPCTTIImpl(this, F)); + }); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp index 9ee5db9..798bb9d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp @@ -42,9 +42,7 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal( if (Kind.isReadOnly()) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); - if (GVar && GVar->isConstant() && - (GVar->getInitializer()->getRelocationInfo() == - Constant::GlobalRelocations)) + if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation()) Kind = SectionKind::getReadOnlyWithRel(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index e21c2b7..cd86dab 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -35,7 +35,7 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Imm, Ty); @@ -64,8 +64,8 @@ unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(IID, Idx, Imm, Ty); @@ -98,8 +98,8 @@ unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return PPCTTIImpl::getIntImmCost(Imm, Ty); } -unsigned 
PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty); @@ -197,9 +197,20 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { + // On the A2, always unroll aggressively. For QPX unaligned loads, we depend + // on combining the loads generated for consecutive accesses, and failure to + // do so is particularly expensive. Aggressive interleaving makes such + // combining much more likely (compared to only using concatenation + // unrolling). + if (ST->getDarwinDirective() == PPC::DIR_A2) + return true; + return LoopHasReductions; } +bool PPCTTIImpl::enableInterleavedAccessVectorization() { + return true; +} + unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasAltivec() && !ST->hasQPX()) return 0; @@ -246,7 +257,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned PPCTTIImpl::getArithmeticInstrCost( +int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { @@ -257,24 +268,30 @@ unsigned PPCTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } -unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). We need one such shuffle instruction for each actual + // register (this is not true for arbitrary shuffles, but is true for the + // structured types of shuffles covered by TTI::ShuffleKind). + return LT.first; } -unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -313,41 +330,83 @@ unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Legalize the type.
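The shuffle-cost rationale above boils down to one permute per legalized register. A rough standalone rendering (shuffleCost and the fixed 128-bit register width are illustrative; real legalization may promote or split types differently, which is what LT.first captures):

#include <algorithm>
#include <cassert>

// Stand-in for LT.first: how many 128-bit registers the vector occupies.
unsigned shuffleCost(unsigned VecBits, unsigned RegBits = 128) {
  return std::max(1u, VecBits / RegBits);
}

int main() {
  assert(shuffleCost(128) == 1); // v4i32: a single vperm/xxpermdi
  assert(shuffleCost(256) == 2); // v8i32 splits into two registers
}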
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); - unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - - // VSX loads/stores support unaligned access. - if (ST->hasVSX()) { - if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64) - return Cost; - } + int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - bool UnalignedAltivec = - Src->isVectorTy() && - Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() && - LT.second.getSizeInBits() == 128 && - Opcode == Instruction::Load; + // Aligned loads and stores are easy. + unsigned SrcBytes = LT.second.getStoreSize(); + if (!SrcBytes || !Alignment || Alignment >= SrcBytes) + return Cost; + + bool IsAltivecType = ST->hasAltivec() && + (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || + LT.second == MVT::v4i32 || LT.second == MVT::v4f32); + bool IsVSXType = ST->hasVSX() && + (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); + bool IsQPXType = ST->hasQPX() && + (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); + + // If we can use the permutation-based load sequence, then this is also + // relatively cheap (not counting loop-invariant instructions): one load plus + // one permute (the last load in a series has extra cost, but we're + // neglecting that here). Note that on the P7, we should do unaligned loads + // for Altivec types using the VSX instructions, but that's more expensive + // than using the permutation-based load sequence. On the P8, that's no + // longer true. + if (Opcode == Instruction::Load && + ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + Alignment >= LT.second.getScalarType().getStoreSize()) + return Cost + LT.first; // Add the cost of the permutations. + + // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the + // P7, unaligned vector loads are more expensive than the permutation-based + // load sequence, so that might be used instead, but regardless, the net cost + // is about the same (not counting loop-invariant instructions). + if (IsVSXType || (ST->hasVSX() && IsAltivecType)) + return Cost; // PPC in general does not support unaligned loads and stores. They'll need // to be decomposed based on the alignment factor. - unsigned SrcBytes = LT.second.getStoreSize(); - if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) { - Cost += LT.first*(SrcBytes/Alignment-1); - - // For a vector type, there is also scalarization overhead (only for - // stores, loads are expanded using the vector-load + permutation sequence, - // which is much less expensive). - if (Src->isVectorTy() && Opcode == Instruction::Store) - for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) - Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); - } + + // Add the cost of each scalar load or store. + Cost += LT.first*(SrcBytes/Alignment-1); + + // For a vector type, there is also scalarization overhead (only for + // stores, loads are expanded using the vector-load + permutation sequence, + // which is much less expensive). 
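The decomposition just described is a simple count: an access of SrcBytes bytes with only Alignment-byte alignment is modeled as SrcBytes / Alignment pieces, and a misaligned vector store additionally pays one extract per element. A sketch treating each piece and each extract as unit cost (unalignedExtraOps is illustrative; the real code also scales by the legalized register count):

#include <cassert>

unsigned unalignedExtraOps(unsigned SrcBytes, unsigned Alignment,
                           bool IsStore, unsigned NumElts) {
  unsigned Extra = SrcBytes / Alignment - 1; // pieces beyond the baseline
  if (IsStore)
    Extra += NumElts; // scalarization overhead applies to stores only
  return Extra;
}

int main() {
  assert(unalignedExtraOps(16, 4, false, 4) == 3);  // 16B load, 4B aligned
  assert(unalignedExtraOps(16, 4, true, 4) == 7);   // same store: +4 extracts
  assert(unalignedExtraOps(16, 16, false, 4) == 0); // aligned: no penalty
}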
+ if (Src->isVectorTy() && Opcode == Instruction::Store) + for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + + return Cost; +} + +int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + assert(isa<VectorType>(VecTy) && + "Expect a vector type for interleaved memory op"); + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy); + + // First, the cost of the load/store operation. + int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + + // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). For each result vector, we need one shuffle per incoming + // vector (except that the first shuffle can take two incoming vectors + // because it does not need to take itself). + Cost += Factor*(LT.first-1); return Cost; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 368bef9..04c1b02 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -37,7 +37,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> { const PPCTargetLowering *getTLI() const { return TLI; } public: - explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F) + explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -52,12 +52,11 @@ public: /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -68,22 +67,27 @@ public: /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type
*ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp index 5e3ae2a..782583c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp @@ -77,6 +77,14 @@ namespace { return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI); } + bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI); + } + + bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI); + } + protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; @@ -100,7 +108,9 @@ protected: IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass : &PPC::VSLRCRegClass; assert((IsF8Reg(SrcMO.getReg(), MRI) || - IsVRReg(SrcMO.getReg(), MRI)) && + IsVRReg(SrcMO.getReg(), MRI) || + IsVSSReg(SrcMO.getReg(), MRI) || + IsVSFReg(SrcMO.getReg(), MRI)) && "Unknown source for a VSX copy"); unsigned NewVReg = MRI.createVirtualRegister(SrcRC); @@ -123,6 +133,8 @@ protected: IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass : &PPC::VSLRCRegClass; assert((IsF8Reg(DstMO.getReg(), MRI) || + IsVSFReg(DstMO.getReg(), MRI) || + IsVSSReg(DstMO.getReg(), MRI) || IsVRReg(DstMO.getReg(), MRI)) && "Unknown destination for a VSX copy"); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 46b8d13..6b19a2f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -103,10 +103,10 @@ protected: VNInfo *AddendValNo = LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn(); - if (!AddendValNo) { - // This can be null if the register is undef. + + // This can be null if the register is undef. + if (!AddendValNo) continue; - } MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def); @@ -186,18 +186,17 @@ protected: if (!KilledProdOp) continue; - // If the addend copy is used only by this MI, then the addend source - // register is likely not live here. This could be fixed (based on the - // legality checks above, the live range for the addend source register - // could be extended), but it seems likely that such a trivial copy can - // be coalesced away later, and thus is not worth the effort. - if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && + // If the addend copy is used only by this MI, then the addend source + // register is likely not live here. This could be fixed (based on the + // legality checks above, the live range for the addend source register + // could be extended), but it seems likely that such a trivial copy can + // be coalesced away later, and thus is not worth the effort. + if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) continue; // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3. 
- unsigned AddReg = AddendMI->getOperand(1).getReg(); unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg(); unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg(); @@ -221,6 +220,14 @@ protected: if (OldFMAReg == KilledProdReg) continue; + // If there isn't a class that fits, we can't perform the transform. + // This is needed for correctness with a mixture of VSX and Altivec + // instructions to make sure that a low VSX register is not assigned to + // the Altivec instruction. + if (!MRI.constrainRegClass(KilledProdReg, + MRI.getRegClass(OldFMAReg))) + continue; + assert(OldFMAReg == AddendMI->getOperand(0).getReg() && "Addend copy not tied to old FMA output!"); @@ -228,7 +235,7 @@ protected: MI->getOperand(0).setReg(KilledProdReg); MI->getOperand(1).setReg(KilledProdReg); - MI->getOperand(3).setReg(AddReg); + MI->getOperand(3).setReg(AddendSrcReg); MI->getOperand(2).setReg(OtherProdReg); MI->getOperand(0).setSubReg(KilledProdSubReg); @@ -263,8 +270,7 @@ protected: if (UseMI == AddendMI) continue; - UseMO.setReg(KilledProdReg); - UseMO.setSubReg(KilledProdSubReg); + UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI); } // Extend the live intervals of the killed product operand to hold the @@ -286,6 +292,20 @@ protected: } DEBUG(dbgs() << " extended: " << NewFMAInt << '\n'); + // Extend the live interval of the addend source (it might end at the + // copy to be removed, or somewhere in between there and here). This + // is necessary only if it is a physical register. + if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) + for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid(); + ++Units) { + unsigned Unit = *Units; + + LiveRange &AddendSrcRange = LIS->getRegUnit(Unit); + AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB), + FMAIdx.getRegSlot()); + DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n'); + } + FMAInt.removeValNo(FMAValNo); DEBUG(dbgs() << " trimmed: " << FMAInt << '\n'); @@ -347,7 +367,6 @@ INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE, char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID; char PPCVSXFMAMutate::ID = 0; -FunctionPass* -llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); } - - +FunctionPass *llvm::createPPCVSXFMAMutatePass() { + return new PPCVSXFMAMutate(); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index d7132d5..27c540f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -94,7 +94,7 @@ enum SHValues { SH_NOSWAP_ST, SH_SPLAT, SH_XXPERMDI, - SH_COPYSCALAR + SH_COPYWIDEN }; struct PPCVSXSwapRemoval : public MachineFunctionPass { @@ -149,6 +149,11 @@ private: // handling. Return true iff any changes are made. bool removeSwaps(); + // Insert a swap instruction from SrcReg to DstReg at the given + // InsertPoint. + void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg); + // Update instructions requiring special handling. void handleSpecialSwappables(int EntryIdx); @@ -159,9 +164,7 @@ private: bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) { if (TargetRegisterInfo::isVirtualRegister(Reg)) return RC->hasSubClassEq(MRI->getRegClass(Reg)); - if (RC->contains(Reg)) - return true; - return false; + return RC->contains(Reg); } // Return true iff the given register is a full vector register. 
@@ -215,7 +218,7 @@ public: void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); - TII = static_cast<const PPCInstrInfo*>(MF->getSubtarget().getInstrInfo()); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); // An initial vector size of 256 appears to work well in practice. // Small/medium functions with vector content tend not to incur a @@ -343,6 +346,15 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { SwapVector[VecIdx].IsLoad = 1; SwapVector[VecIdx].IsSwap = 1; break; + case PPC::LXSDX: + case PPC::LXSSPX: + // A load of a floating-point value into the high-order half of + // a vector register is safe, provided that we introduce a swap + // following the load, which will be done by the SUBREG_TO_REG + // support. So just mark these as safe. + SwapVector[VecIdx].IsLoad = 1; + SwapVector[VecIdx].IsSwappable = 1; + break; case PPC::STVX: // Non-permuting stores are currently unsafe. We can use special // handling for this in the future. By not marking these as @@ -385,7 +397,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { else if (isVecReg(MI.getOperand(0).getReg()) && isScalarVecReg(MI.getOperand(2).getReg())) { SwapVector[VecIdx].IsSwappable = 1; - SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN; } break; } @@ -420,7 +432,14 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { case PPC::STVEHX: case PPC::STVEWX: case PPC::STVXL: + // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX, + // by adding special handling for narrowing copies as well as + // widening ones. However, I've experimented with this, and in + // practice we currently do not appear to use STXSDX fed by + // a narrowing copy from a full vector register. Since I can't + // generate any useful test cases, I've left this alone for now. case PPC::STXSDX: + case PPC::STXSSPX: case PPC::VCIPHER: case PPC::VCIPHERLAST: case PPC::VMRGHB: @@ -543,7 +562,8 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, } if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { - SwapVector[VecIdx].MentionsPhysVR = 1; + if (!isScalarVecReg(CopySrcReg)) + SwapVector[VecIdx].MentionsPhysVR = 1; return CopySrcReg; } @@ -629,8 +649,8 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { SwapVector[Repr].WebRejected = 1; DEBUG(dbgs() << - format("Web %d rejected for physreg, partial reg, or not swap[pable]\n", - Repr)); + format("Web %d rejected for physreg, partial reg, or not " + "swap[pable]\n", Repr)); DEBUG(dbgs() << " in " << EntryIdx << ": "); DEBUG(SwapVector[EntryIdx].VSEMI->dump()); DEBUG(dbgs() << "\n"); @@ -743,6 +763,21 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { } } +// Create an xxswapd instruction and insert it prior to the given point. +// MI is used to determine basic block and debug loc information. +// FIXME: When inserting a swap, we should check whether SrcReg is +// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so, +// then instead we should generate a copy from Reg to DstReg. +void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI, + MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg) { + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), DstReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addImm(2); +} + // The identified swap entry requires special handling to allow its // containing computation to be optimized. Perform that handling // here. 
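One of the special cases handled below re-targets lane-indexed splats: removing the swaps exchanges the two doubleword halves of the register, so the element that was at index i is found at (i + N/2) mod N for an N-element vector. A quick check of that index arithmetic:

#include <cassert>

unsigned swappedSplatIndex(unsigned Index, unsigned NElts) {
  return (Index + NElts / 2) % NElts; // add N/2 modulo N, as described below
}

int main() {
  assert(swappedSplatIndex(0, 4) == 2);  // v4i32 (vspltw): lane 0 -> lane 2
  assert(swappedSplatIndex(3, 4) == 1);
  assert(swappedSplatIndex(5, 8) == 1);  // v8i16 (vsplth)
  assert(swappedSplatIndex(1, 16) == 9); // v16i8 (vspltb)
}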
@@ -752,8 +787,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (SwapVector[EntryIdx].SpecialHandling) { default: - assert(false && "Unexpected special handling type"); - break; + llvm_unreachable("Unexpected special handling type"); // For splats based on an index into a vector, add N/2 modulo N // to the index, where N is the number of vector elements. @@ -766,7 +800,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (MI->getOpcode()) { default: - assert(false && "Unexpected splat opcode"); + llvm_unreachable("Unexpected splat opcode"); case PPC::VSPLTB: NElts = 16; break; case PPC::VSPLTH: NElts = 8; break; case PPC::VSPLTW: NElts = 4; break; @@ -811,7 +845,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { // For a copy from a scalar floating-point register to a vector // register, removing swaps will leave the copied value in the // wrong lane. Insert a swap following the copy to fix this. - case SHValues::SH_COPYSCALAR: { + case SHValues::SH_COPYWIDEN: { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); @@ -825,14 +859,13 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { DEBUG(dbgs() << " Into: "); DEBUG(MI->dump()); - MachineBasicBlock::iterator InsertPoint = MI->getNextNode(); + auto InsertPoint = ++MachineBasicBlock::iterator(MI); // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG // is copying to a VRRC, we need to be careful to avoid a register // assignment problem. In this case we must copy from VRRC to VSRC // prior to the swap, and from VSRC to VRRC following the swap. // Coalescing will usually remove all this mess. - if (DstRC == &PPC::VRRCRegClass) { unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); @@ -840,29 +873,19 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), VSRCTmp1) .addReg(NewVReg); - DEBUG(MI->getNextNode()->dump()); + DEBUG(std::prev(InsertPoint)->dump()); - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), VSRCTmp2) - .addReg(VSRCTmp1) - .addReg(VSRCTmp1) - .addImm(2); - DEBUG(MI->getNextNode()->getNextNode()->dump()); + insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1); + DEBUG(std::prev(InsertPoint)->dump()); BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), DstReg) .addReg(VSRCTmp2); - DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump()); + DEBUG(std::prev(InsertPoint)->dump()); } else { - - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), DstReg) - .addReg(NewVReg) - .addReg(NewVReg) - .addImm(2); - - DEBUG(MI->getNextNode()->dump()); + insertSwap(MI, InsertPoint, DstReg, NewVReg); + DEBUG(std::prev(InsertPoint)->dump()); } break; } @@ -947,8 +970,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() { case SH_XXPERMDI: DEBUG(dbgs() << "special:xxpermdi "); break; - case SH_COPYSCALAR: - DEBUG(dbgs() << "special:copyscalar "); + case SH_COPYWIDEN: + DEBUG(dbgs() << "special:copywiden "); break; } } diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 1c4e486..a552747 100644 --- a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCInst.h" 
#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -34,7 +35,6 @@ namespace { class SparcOperand; class SparcAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; MCAsmParser &Parser; /// @name Auto-generated Match Functions @@ -69,6 +69,10 @@ class SparcAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseBranchModifiers(OperandVector &Operands); + // Helper function for dealing with %lo / %hi in PIC mode. + const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK, + const MCExpr *subExpr); + // returns true if Tok is matched to a register and returns register in RegNo. bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo, unsigned &RegKind); @@ -77,24 +81,24 @@ class SparcAsmParser : public MCTargetAsmParser { bool parseDirectiveWord(unsigned Size, SMLoc L); bool is64Bit() const { - return STI.getTargetTriple().getArch() == Triple::sparcv9; + return getSTI().getTargetTriple().getArch() == Triple::sparcv9; } void expandSET(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); public: - SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + SparcAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser) { + : MCTargetAsmParser(Options, sti), Parser(parser) { // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } }; - static unsigned IntRegs[32] = { + static const MCPhysReg IntRegs[32] = { Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3, Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7, Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3, @@ -104,7 +108,7 @@ public: Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3, Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 }; - static unsigned FloatRegs[32] = { + static const MCPhysReg FloatRegs[32] = { Sparc::F0, Sparc::F1, Sparc::F2, Sparc::F3, Sparc::F4, Sparc::F5, Sparc::F6, Sparc::F7, Sparc::F8, Sparc::F9, Sparc::F10, Sparc::F11, @@ -114,7 +118,7 @@ public: Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27, Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 }; - static unsigned DoubleRegs[32] = { + static const MCPhysReg DoubleRegs[32] = { Sparc::D0, Sparc::D1, Sparc::D2, Sparc::D3, Sparc::D4, Sparc::D5, Sparc::D6, Sparc::D7, Sparc::D8, Sparc::D7, Sparc::D8, Sparc::D9, @@ -124,13 +128,13 @@ public: Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27, Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 }; - static unsigned QuadFPRegs[32] = { + static const MCPhysReg QuadFPRegs[32] = { Sparc::Q0, Sparc::Q1, Sparc::Q2, Sparc::Q3, Sparc::Q4, Sparc::Q5, Sparc::Q6, Sparc::Q7, Sparc::Q8, Sparc::Q9, Sparc::Q10, Sparc::Q11, Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 }; - static unsigned ASRRegs[32] = { + static const MCPhysReg ASRRegs[32] = { SP::Y, SP::ASR1, SP::ASR2, SP::ASR3, SP::ASR4, SP::ASR5, SP::ASR6, SP::ASR7, SP::ASR8, SP::ASR9, SP::ASR10, SP::ASR11, @@ -140,6 +144,12 @@ public: SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27, SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31}; + static const MCPhysReg IntPairRegs[] = { + Sparc::G0_G1, Sparc::G2_G3, Sparc::G4_G5, Sparc::G6_G7, + Sparc::O0_O1, Sparc::O2_O3, Sparc::O4_O5, Sparc::O6_O7, + Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7, + Sparc::I0_I1, Sparc::I2_I3, 
Sparc::I4_I5, Sparc::I6_I7}; + /// SparcOperand - Instances of this class represent a parsed Sparc machine /// instruction. class SparcOperand : public MCParsedAsmOperand { @@ -147,6 +157,7 @@ public: enum RegisterKind { rk_None, rk_IntReg, + rk_IntPairReg, rk_FloatReg, rk_DoubleReg, rk_QuadReg, @@ -200,6 +211,10 @@ public: bool isMEMrr() const { return Kind == k_MemoryReg; } bool isMEMri() const { return Kind == k_MemoryImm; } + bool isIntReg() const { + return (Kind == k_Register && Reg.Kind == rk_IntReg); + } + bool isFloatReg() const { return (Kind == k_Register && Reg.Kind == rk_FloatReg); } @@ -330,6 +345,25 @@ public: return Op; } + static bool MorphToIntPairReg(SparcOperand &Op) { + unsigned Reg = Op.getReg(); + assert(Op.Reg.Kind == rk_IntReg); + unsigned regIdx = 32; + if (Reg >= Sparc::G0 && Reg <= Sparc::G7) + regIdx = Reg - Sparc::G0; + else if (Reg >= Sparc::O0 && Reg <= Sparc::O7) + regIdx = Reg - Sparc::O0 + 8; + else if (Reg >= Sparc::L0 && Reg <= Sparc::L7) + regIdx = Reg - Sparc::L0 + 16; + else if (Reg >= Sparc::I0 && Reg <= Sparc::I7) + regIdx = Reg - Sparc::I0 + 24; + if (regIdx % 2 || regIdx > 31) + return false; + Op.Reg.RegNum = IntPairRegs[regIdx / 2]; + Op.Reg.Kind = rk_IntPairReg; + return true; + } + static bool MorphToDoubleReg(SparcOperand &Op) { unsigned Reg = Op.getReg(); assert(Op.Reg.Kind == rk_FloatReg); @@ -407,7 +441,22 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, // the imm operand can be either an expression or an immediate. bool IsImm = Inst.getOperand(1).isImm(); - uint64_t ImmValue = IsImm ? MCValOp.getImm() : 0; + int64_t RawImmValue = IsImm ? MCValOp.getImm() : 0; + + // Allow either a signed or unsigned 32-bit immediate. + if (RawImmValue < -2147483648LL || RawImmValue > 4294967295LL) { + Error(IDLoc, "set: argument must be between -2147483648 and 4294967295"); + return; + } + + // If the value was expressed as a large unsigned number, that's ok. + // We want to see if it "looks like" a small signed number. + int32_t ImmValue = RawImmValue; + // For 'set' you can't use 'or' with a negative operand on V9 because + // that would splat the sign bit across the upper half of the destination + // register, whereas 'set' is defined to zero the high 32 bits. + bool IsEffectivelyImm13 = + IsImm && ((is64Bit() ? 0 : -4096) <= ImmValue && ImmValue < 4096); const MCExpr *ValExpr; if (IsImm) ValExpr = MCConstantExpr::create(ImmValue, getContext()); @@ -416,10 +465,12 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, MCOperand PrevReg = MCOperand::createReg(Sparc::G0); - if (!IsImm || (ImmValue & ~0x1fff)) { + // If not just a signed imm13 value, then either we use a 'sethi' with a + // following 'or', or a 'sethi' by itself if there are no more 1 bits. + // In either case, start with the 'sethi'. + if (!IsEffectivelyImm13) { MCInst TmpInst; - const MCExpr *Expr = - SparcMCExpr::create(SparcMCExpr::VK_Sparc_HI, ValExpr, getContext()); + const MCExpr *Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_HI, ValExpr); TmpInst.setLoc(IDLoc); TmpInst.setOpcode(SP::SETHIi); TmpInst.addOperand(MCRegOp); @@ -428,10 +479,23 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, PrevReg = MCRegOp; } - if (!IsImm || ((ImmValue & 0x1fff) != 0 || ImmValue == 0)) { + // The low bits require touching in 3 cases: + // * A non-immediate value will always require both instructions. + // * An effectively imm13 value needs only an 'or' instruction. 
+ // * Otherwise, an immediate that is not effectively imm13 requires the + // 'or' only if bits remain after clearing the 22 bits that 'sethi' set. + // If the low bits are known zeros, there's nothing to do. + // In the second case, and only in that case, must we NOT clear + // bits of the immediate value via the %lo() assembler function. + // Note also, the 'or' instruction doesn't mind a large value in the case + // where the operand to 'set' was 0xFFFFFzzz - it does exactly what you mean. + if (!IsImm || IsEffectivelyImm13 || (ImmValue & 0x3ff)) { MCInst TmpInst; - const MCExpr *Expr = - SparcMCExpr::create(SparcMCExpr::VK_Sparc_LO, ValExpr, getContext()); + const MCExpr *Expr; + if (IsEffectivelyImm13) + Expr = ValExpr; + else + Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_LO, ValExpr); TmpInst.setLoc(IDLoc); TmpInst.setOpcode(SP::ORri); TmpInst.addOperand(MCRegOp); @@ -463,7 +527,7 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } for (const MCInst &I : Instructions) { - Out.EmitInstruction(I, STI); + Out.EmitInstruction(I, getSTI()); } return false; } @@ -742,6 +806,9 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op, case Sparc::PSR: Op = SparcOperand::CreateToken("%psr", S); break; + case Sparc::FSR: + Op = SparcOperand::CreateToken("%fsr", S); + break; case Sparc::WIM: Op = SparcOperand::CreateToken("%wim", S); break; @@ -766,6 +833,7 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op, case AsmToken::Minus: case AsmToken::Integer: case AsmToken::LParen: + case AsmToken::Dot: if (!getParser().parseExpression(EVal, E)) Op = SparcOperand::CreateImm(EVal, S, E); break; @@ -848,6 +916,13 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, return true; } + // %fprs is an alias of %asr6. 
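A worked illustration of the expandSET rules spelled out in the comments above (a hypothetical standalone helper that only prints assembler text; not part of the parser): a signed 13-bit immediate needs a single 'or' against %g0 (a mov), anything else starts with sethi %hi(v) to materialize the high 22 bits, and the trailing 'or' with %lo(v) is emitted only when the low 10 bits are non-zero.

#include <cstdint>
#include <cstdio>

void expandSet(int64_t v, bool is64Bit) {
  // On V9, 'or' with a negative imm13 would sign-extend into the upper
  // half, so only 0..4095 counts as effectively imm13 there.
  bool isImm13 = (is64Bit ? 0 : -4096) <= v && v < 4096;
  if (isImm13) {
    std::printf("or %%g0, %lld, %%o0\n", (long long)v); // plain mov
    return;
  }
  uint32_t u = (uint32_t)v;
  std::printf("sethi %%hi(0x%x), %%o0\n", (unsigned)u); // bits 31..10
  if (u & 0x3ff)                                        // low 10 bits left?
    std::printf("or %%o0, %%lo(0x%x), %%o0\n", (unsigned)u);
}

int main() {
  expandSet(123, false);        // or %g0, 123, %o0
  expandSet(0x12345678, false); // sethi + or
  expandSet(0x12345400, false); // sethi only: low 10 bits are zero
}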
+ if (name.equals("fprs")) { + RegNo = ASRRegs[6]; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("icc")) { RegNo = Sparc::ICC; RegKind = SparcOperand::rk_Special; @@ -860,6 +935,12 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, return true; } + if (name.equals("fsr")) { + RegNo = Sparc::FSR; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("wim")) { RegNo = Sparc::WIM; RegKind = SparcOperand::rk_Special; @@ -943,6 +1024,82 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, RegKind = SparcOperand::rk_IntReg; return true; } + + if (name.equals("tpc")) { + RegNo = Sparc::TPC; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tnpc")) { + RegNo = Sparc::TNPC; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tstate")) { + RegNo = Sparc::TSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tt")) { + RegNo = Sparc::TT; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tick")) { + RegNo = Sparc::TICK; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tba")) { + RegNo = Sparc::TBA; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("pstate")) { + RegNo = Sparc::PSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tl")) { + RegNo = Sparc::TL; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("pil")) { + RegNo = Sparc::PIL; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cwp")) { + RegNo = Sparc::CWP; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cansave")) { + RegNo = Sparc::CANSAVE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("canrestore")) { + RegNo = Sparc::CANRESTORE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cleanwin")) { + RegNo = Sparc::CLEANWIN; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("otherwin")) { + RegNo = Sparc::OTHERWIN; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("wstate")) { + RegNo = Sparc::WSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } } return false; } @@ -975,6 +1132,32 @@ static bool hasGOTReference(const MCExpr *Expr) { return false; } +const SparcMCExpr * +SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK, + const MCExpr *subExpr) +{ + // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently. + // If the expression refers contains _GLOBAL_OFFSETE_TABLE, it is + // actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted + // as %got10 or %got22 relocation. + + if (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) { + switch(VK) { + default: break; + case SparcMCExpr::VK_Sparc_LO: + VK = (hasGOTReference(subExpr) ? SparcMCExpr::VK_Sparc_PC10 + : SparcMCExpr::VK_Sparc_GOT10); + break; + case SparcMCExpr::VK_Sparc_HI: + VK = (hasGOTReference(subExpr) ? 
SparcMCExpr::VK_Sparc_PC22 + : SparcMCExpr::VK_Sparc_GOT22); + break; + } + } + + return SparcMCExpr::create(VK, subExpr, getContext()); +} + bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc) { @@ -998,30 +1181,7 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, if (Parser.parseParenExpression(subExpr, EndLoc)) return false; - bool isPIC = getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_; - - // Ugly: if a sparc assembly expression says "%hi(...)" but the - // expression within contains _GLOBAL_OFFSET_TABLE_, it REALLY means - // %pc22. Same with %lo -> %pc10. Worse, if it doesn't contain that, - // the meaning depends on whether the assembler was invoked with - // -KPIC or not: if so, it really means %got22/%got10; if not, it - // actually means what it said! Sigh, historical mistakes... - - switch(VK) { - default: break; - case SparcMCExpr::VK_Sparc_LO: - VK = (hasGOTReference(subExpr) - ? SparcMCExpr::VK_Sparc_PC10 - : (isPIC ? SparcMCExpr::VK_Sparc_GOT10 : VK)); - break; - case SparcMCExpr::VK_Sparc_HI: - VK = (hasGOTReference(subExpr) - ? SparcMCExpr::VK_Sparc_PC22 - : (isPIC ? SparcMCExpr::VK_Sparc_GOT22 : VK)); - break; - } - - EVal = SparcMCExpr::create(VK, subExpr, getContext()); + EVal = adjustPICRelocation(VK, subExpr); return true; } @@ -1051,5 +1211,9 @@ unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp, break; } } + if (Op.isIntReg() && Kind == MCK_IntPair) { + if (SparcOperand::MorphToIntPairReg(Op)) + return MCTargetAsmParser::Match_Success; + } return Match_InvalidOperand; } diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp index 38bff44..c689b7f 100644 --- a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp @@ -122,6 +122,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { continue; } + // TODO: If we ever want to support v7, this needs to be extended + // to cover all floating point operations. 
if (!Subtarget->isV9() && (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD || MI->getOpcode() == SP::FCMPQ)) { diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 3e56b9e..51751ec 100644 --- a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -117,6 +117,19 @@ static const unsigned ASRRegDecoderTable[] = { SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27, SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31}; +static const unsigned PRRegDecoderTable[] = { + SP::TPC, SP::TNPC, SP::TSTATE, SP::TT, SP::TICK, SP::TBA, SP::PSTATE, + SP::TL, SP::PIL, SP::CWP, SP::CANSAVE, SP::CANRESTORE, SP::CLEANWIN, + SP::OTHERWIN, SP::WSTATE +}; + +static const uint16_t IntPairDecoderTable[] = { + SP::G0_G1, SP::G2_G3, SP::G4_G5, SP::G6_G7, + SP::O0_O1, SP::O2_O3, SP::O4_O5, SP::O6_O7, + SP::L0_L1, SP::L2_L3, SP::L4_L5, SP::L6_L7, + SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7, +}; + static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -196,9 +209,34 @@ static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo >= array_lengthof(PRRegDecoderTable)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(PRRegDecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo > 31) + return MCDisassembler::Fail; + + if ((RegNo & 1)) + S = MCDisassembler::SoftFail; + + unsigned RegisterPair = IntPairDecoderTable[RegNo/2]; + Inst.addOperand(MCOperand::createReg(RegisterPair)); + return S; +} static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, + const void *Decoder); static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address, @@ -207,6 +245,8 @@ static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn, @@ -326,6 +366,12 @@ static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, DecodeIntRegsRegisterClass); } +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, + const void *Decoder) { + return DecodeMem(Inst, insn, Address, Decoder, true, + DecodeIntPairRegisterClass); +} + static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, @@ -350,6 +396,12 @@ static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, DecodeIntRegsRegisterClass); } +static DecodeStatus DecodeStoreIntPair(MCInst &Inst, 
unsigned insn, + uint64_t Address, const void *Decoder) { + return DecodeMem(Inst, insn, Address, Decoder, false, + DecodeIntPairRegisterClass); +} + static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h index 0b01b88..6f06d1d 100644 --- a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h +++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class SparcInstPrinter : public MCInstPrinter { public: SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h index 12386f1..ad44122 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h @@ -21,6 +21,7 @@ class Triple; class SparcELFMCAsmInfo : public MCAsmInfoELF { void anchor() override; + public: explicit SparcELFMCAsmInfo(const Triple &TheTriple); const MCExpr* diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index d08ad86..13f0819 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -90,8 +90,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index c5f046b..e3b0f52 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -267,11 +267,11 @@ void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI) LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo()); return; } - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { MCInst TmpInst; - LowerSparcMachineInstrToMCInst(I, TmpInst, *this); + LowerSparcMachineInstrToMCInst(&*I, TmpInst, *this); EmitToStreamer(*OutStreamer, TmpInst); } while ((++I != E) && I->isInsideBundle()); // Delay slot check. 
} @@ -296,7 +296,7 @@ void SparcAsmPrinter::EmitFunctionBodyStart() { void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand (opNum); SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags(); @@ -373,7 +373,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, O << MO.getSymbolName(); break; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" << MO.getIndex(); break; default: diff --git a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td index dfaaabf..0aa29d1 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td +++ b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td @@ -21,7 +21,11 @@ def CC_Sparc32 : CallingConv<[ // i32 f32 arguments get passed in integer registers if there is space. CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>, // f64 arguments are split and passed through registers or through stack. - CCIfType<[f64], CCCustom<"CC_Sparc_Assign_f64">>, + CCIfType<[f64], CCCustom<"CC_Sparc_Assign_Split_64">>, + // As are v2i32 arguments (this would be the default behavior for + // v2i32 if it wasn't allocated to the IntPair register-class) + CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Split_64">>, + // Alternatively, they are assigned to the stack in 4-byte aligned units. CCAssignToStack<4, 4> @@ -30,7 +34,8 @@ def CC_Sparc32 : CallingConv<[ def RetCC_Sparc32 : CallingConv<[ CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>, CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>, - CCIfType<[f64], CCAssignToReg<[D0, D1]>> + CCIfType<[f64], CCAssignToReg<[D0, D1]>>, + CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Ret_Split_64">> ]>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index c0279da..39b5e80 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -44,7 +44,7 @@ void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF, unsigned ADDrr, unsigned ADDri) const { - DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc dl; const SparcInstrInfo &TII = *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); @@ -90,8 +90,23 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, MachineFrameInfo *MFI = MF.getFrameInfo(); const SparcInstrInfo &TII = *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SparcRegisterInfo &RegInfo = + *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF); + + // FIXME: unfortunately, returning false from canRealignStack + // actually just causes needsStackRealignment to return false, + // rather than reporting an error, as would be sensible. This is + // poor, but fixing that bogosity is going to be a large project. + // For now, just see if it's lied, and report an error here. 
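Restating the FIXME's point as a tiny standalone helper (illustrative names, with a thrown exception standing in for report_fatal_error): when realignment was refused but some stack object demands more than the ABI stack alignment, the only safe response is a hard error, which is exactly what the check that follows does.

#include <stdexcept>
#include <string>

// NeedsRealign is what needsStackRealignment() reported; MaxObjAlign is
// the largest alignment requested by any stack object; StackAlign is the
// ABI stack alignment.
void checkStackRealignment(bool NeedsRealign, unsigned MaxObjAlign,
                           unsigned StackAlign, const std::string &Fn) {
  if (!NeedsRealign && MaxObjAlign > StackAlign)
    throw std::runtime_error("Function \"" + Fn + "\" required stack "
                             "re-alignment, but it could not be handled");
}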
+ if (!NeedsStackRealignment && MFI->getMaxAlignment() > getStackAlignment()) + report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required " + "stack re-alignment, but LLVM couldn't handle it " + "(probably because it has a dynamic alloca)."); // Get the number of bytes to allocate from the FrameInfo int NumBytes = (int) MFI->getStackSize(); @@ -104,12 +119,43 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, SAVEri = SP::ADDri; SAVErr = SP::ADDrr; } - NumBytes = -MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); - emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri); + + // The SPARC ABI is a bit odd in that it requires a reserved 92-byte + // (128 in v9) area in the user's stack, starting at %sp. Thus, the + // first part of the stack that can actually be used is located at + // %sp + 92. + // + // We therefore need to add that offset to the total stack size + // after all the stack objects are placed by + // PrologEpilogInserter calculateFrameObjectOffsets. However, since the stack needs to be + // aligned *after* the extra size is added, we need to disable + // calculateFrameObjectOffsets's built-in stack alignment, by having + // targetHandlesStackFrameRounding return true. + + + // Add the extra call frame stack size, if needed. (This is the same + // code as in PrologEpilogInserter, but also gets disabled by + // targetHandlesStackFrameRounding) + if (MFI->adjustsStack() && hasReservedCallFrame(MF)) + NumBytes += MFI->getMaxCallFrameSize(); + + // Adds the SPARC subtarget-specific spill area to the stack + // size. Also ensures target-required alignment. + NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); + + // Finally, ensure that the size is sufficiently aligned for the + // data on the stack. + if (MFI->getMaxAlignment() > 0) { + NumBytes = RoundUpToAlignment(NumBytes, MFI->getMaxAlignment()); + } + + // Update stack size with corrected value. + MFI->setStackSize(NumBytes); + + emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri); MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - unsigned regFP = MRI->getDwarfRegNum(SP::I6, true); + unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true); // Emit ".cfi_def_cfa_register 30". unsigned CFIIndex = @@ -122,13 +168,19 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); - unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true); - unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true); + unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true); + unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true); // Emit ".cfi_register 15, 31". CFIIndex = MMI.addFrameInst( MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + + if (NeedsStackRealignment) { + // andn %o6, MaxAlign-1, %o6 + int MaxAlign = MFI->getMaxAlignment(); + BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1); + } } void SparcFrameLowering:: @@ -167,7 +219,6 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes == 0) return; - NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri); } @@ -180,21 +231,69 @@ bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { // pointer register. 
This is true if the function has variable sized allocas or // if frame pointer elimination is disabled. bool SparcFrameLowering::hasFP(const MachineFunction &MF) const { + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); return MF.getTarget().Options.DisableFramePointerElim(MF) || - MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); + RegInfo->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken(); } +int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + // Addressable stack objects are accessed using neg. offsets from + // %fp, or positive offsets from %sp. + bool UseFP; + + // Sparc uses FP-based references in general, even when "hasFP" is + // false. That function is rather a misnomer, because %fp is + // actually always available, unless isLeafProc. + if (FuncInfo->isLeafProc()) { + // If there's a leaf proc, all offsets need to be %sp-based, + // because we haven't caused %fp to actually point to our frame. + UseFP = false; + } else if (isFixed) { + // Otherwise, argument access should always use %fp. + UseFP = true; + } else if (RegInfo->needsStackRealignment(MF)) { + // If there is dynamic stack realignment, all local object + // references need to be via %sp, to take account of the + // re-alignment. + UseFP = false; + } else { + // Finally, default to using %fp. + UseFP = true; + } + + int64_t FrameOffset = MF.getFrameInfo()->getObjectOffset(FI) + + Subtarget.getStackPointerBias(); + + if (UseFP) { + FrameReg = RegInfo->getFrameRegister(MF); + return FrameOffset; + } else { + FrameReg = SP::O6; // %sp + return FrameOffset + MF.getFrameInfo()->getStackSize(); + } +} + static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; for (unsigned reg = SP::L0; reg <= SP::L7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; return true; @@ -206,33 +305,42 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - return !(MFI->hasCalls() // has calls - || MRI.isPhysRegUsed(SP::L0) // Too many registers needed - || MRI.isPhysRegUsed(SP::O6) // %SP is used - || hasFP(MF)); // need %FP + return !(MFI->hasCalls() // has calls + || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed + || !MRI.reg_nodbg_empty(SP::O6) // %SP is used + || hasFP(MF)); // need %FP } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - // Remap %i[0-7] to %o[0-7]. for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (!MRI.isPhysRegUsed(reg)) + if (MRI.reg_nodbg_empty(reg)) continue; - unsigned mapped_reg = (reg - SP::I0 + SP::O0); - assert(!MRI.isPhysRegUsed(mapped_reg)); + + unsigned mapped_reg = reg - SP::I0 + SP::O0; + assert(MRI.reg_nodbg_empty(mapped_reg)); // Replace I register with O register. MRI.replaceRegWith(reg, mapped_reg); - // Mark the reg unused. 
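The %fp-versus-%sp choice coded in getFrameIndexReference above condenses to a small decision table. This standalone sketch (illustrative signature and types, not LLVM's) keeps the same order of tests and the same offset arithmetic: leaf procedures never establish %fp, fixed (argument) objects are addressed off %fp, and realigned frames force locals onto %sp because only %sp reflects the realignment.

#include <cstdint>

struct FrameRef { bool UseFP; int64_t Offset; }; // base register + offset

FrameRef frameIndexReference(bool IsLeafProc, bool IsFixedObject,
                             bool NeedsRealignment, int64_t ObjectOffset,
                             int64_t StackPointerBias, int64_t StackSize) {
  bool UseFP;
  if (IsLeafProc)
    UseFP = false;          // leaf procs never establish %fp
  else if (IsFixedObject)
    UseFP = true;           // incoming arguments live relative to %fp
  else if (NeedsRealignment)
    UseFP = false;          // only %sp accounts for the realignment
  else
    UseFP = true;           // the default case
  int64_t Offset = ObjectOffset + StackPointerBias;
  if (UseFP)
    return {true, Offset};            // negative offsets from %fp
  return {false, Offset + StackSize}; // positive offsets from %sp
}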
- MRI.setPhysRegUnused(reg); + // Also replace register pair super-registers. + if ((reg - SP::I0) % 2 == 0) { + unsigned preg = (reg - SP::I0) / 2 + SP::I0_I1; + unsigned mapped_preg = preg - SP::I0_I1 + SP::O0_O1; + MRI.replaceRegWith(preg, mapped_preg); + } } // Rewrite MBB's Live-ins. for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { + for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) { + if (!MBB->isLiveIn(reg)) + continue; + MBB->removeLiveIn(reg); + MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1); + } for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { if (!MBB->isLiveIn(reg)) continue; diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h index 29fc7b7..cbb4dc0 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h @@ -39,6 +39,14 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + /// targetHandlesStackFrameRounding - Returns true if the target is + /// responsible for rounding up the stack frame (probably at emitPrologue + /// time). + bool targetHandlesStackFrameRounding() const override { return true; } + private: // Remap input registers to output registers for leaf procedure. void remapRegsForLeafProc(MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 340b72e..c4c6416 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "SparcTargetMachine.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Compiler.h" @@ -62,6 +63,7 @@ public: private: SDNode* getGlobalBaseReg(); + SDNode *SelectInlineAsm(SDNode *N); }; } // end anonymous namespace @@ -141,6 +143,181 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) { return true; } + +// Re-assemble i64 arguments split up in SelectionDAGBuilder's +// visitInlineAsm / GetRegistersForValue functions. +// +// Note: This function was copied from, and is essentially identical +// to ARMISelDAGToDAG::SelectInlineAsm. It is very unfortunate that +// such hacking-up is necessary; a rethink of how inline asm operands +// are handled may be in order to make doing this more sane. +// +// TODO: fix inline asm support so I can simply tell it that 'i64' +// inputs to asm need to be allocated to the IntPair register type, +// and have that work. Then, delete this function. +SDNode *SparcDAGToDAGISel::SelectInlineAsm(SDNode *N){ + std::vector<SDValue> AsmNodeOperands; + unsigned Flag, Kind; + bool Changed = false; + unsigned NumOps = N->getNumOperands(); + + // Normally, i64 data is bound to two arbitrary GPRs for "%r" + // constraint. However, some instructions (e.g. ldd/std) require + // (even/even+1) GPRs. + + // So, here, we check for this case, and mutate the inline asm to use + // a single IntPair register instead, which guarantees such even/odd + // placement. + + SDLoc dl(N); + SDValue Glue = N->getGluedNode() ?
N->getOperand(NumOps-1) + : SDValue(nullptr,0); + + SmallVector<bool, 8> OpChanged; + // Glue node will be appended late. + for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) { + SDValue op = N->getOperand(i); + AsmNodeOperands.push_back(op); + + if (i < InlineAsm::Op_FirstOperand) + continue; + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) { + Flag = C->getZExtValue(); + Kind = InlineAsm::getKind(Flag); + } + else + continue; + + // Immediate operands to inline asm in the SelectionDAG are modeled with + // two operands. The first is a constant of value InlineAsm::Kind_Imm, and + // the second is a constant with the value of the immediate. If we get here + // and we have a Kind_Imm, skip the next operand, and continue. + if (Kind == InlineAsm::Kind_Imm) { + SDValue op = N->getOperand(++i); + AsmNodeOperands.push_back(op); + continue; + } + + unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag); + if (NumRegs) + OpChanged.push_back(false); + + unsigned DefIdx = 0; + bool IsTiedToChangedOp = false; + // If it's a use that is tied with a previous def, it has no + // reg class constraint. + if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx)) + IsTiedToChangedOp = OpChanged[DefIdx]; + + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef + && Kind != InlineAsm::Kind_RegDefEarlyClobber) + continue; + + unsigned RC; + bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); + if ((!IsTiedToChangedOp && (!HasRC || RC != SP::IntRegsRegClassID)) + || NumRegs != 2) + continue; + + assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); + SDValue V0 = N->getOperand(i+1); + SDValue V1 = N->getOperand(i+2); + unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg(); + unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg(); + SDValue PairedReg; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + if (Kind == InlineAsm::Kind_RegDef || + Kind == InlineAsm::Kind_RegDefEarlyClobber) { + // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to + // the original GPRs. + + unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); + SDValue Chain = SDValue(N,0); + + SDNode *GU = N->getGluedUser(); + SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::v2i32, + Chain.getValue(1)); + + // Extract values from a GPRPair reg and copy to the original GPR reg. + SDValue Sub0 = CurDAG->getTargetExtractSubreg(SP::sub_even, dl, MVT::i32, + RegCopy); + SDValue Sub1 = CurDAG->getTargetExtractSubreg(SP::sub_odd, dl, MVT::i32, + RegCopy); + SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0, + RegCopy.getValue(1)); + SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1)); + + // Update the original glue user. + std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1); + Ops.push_back(T1.getValue(1)); + CurDAG->UpdateNodeOperands(GU, Ops); + } + else { + // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a + // GPRPair and then pass the GPRPair to the inline asm. + SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain]; + + // As REG_SEQ doesn't take RegisterSDNode, we copy them first. 
+ SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32, + Chain.getValue(1)); + SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32, + T0.getValue(1)); + SDValue Pair = SDValue( + CurDAG->getMachineNode( + TargetOpcode::REG_SEQUENCE, dl, MVT::v2i32, + { + CurDAG->getTargetConstant(SP::IntPairRegClassID, dl, + MVT::i32), + T0, + CurDAG->getTargetConstant(SP::sub_even, dl, MVT::i32), + T1, + CurDAG->getTargetConstant(SP::sub_odd, dl, MVT::i32), + }), + 0); + + // Copy REG_SEQ into a GPRPair-typed VR and replace the original two + // i32 VRs of inline asm with it. + unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); + Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); + + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; + Glue = Chain.getValue(1); + } + + Changed = true; + + if(PairedReg.getNode()) { + OpChanged[OpChanged.size() -1 ] = true; + Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); + if (IsTiedToChangedOp) + Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx); + else + Flag = InlineAsm::getFlagWordForRegClass(Flag, SP::IntPairRegClassID); + // Replace the current flag. + AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant( + Flag, dl, MVT::i32); + // Add the new register node and skip the original two GPRs. + AsmNodeOperands.push_back(PairedReg); + // Skip the next two GPRs. + i += 2; + } + } + + if (Glue.getNode()) + AsmNodeOperands.push_back(Glue); + if (!Changed) + return nullptr; + + SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), + CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); + New->setNodeId(-1); + return New.getNode(); +} + SDNode *SparcDAGToDAGISel::Select(SDNode *N) { SDLoc dl(N); if (N->isMachineOpcode()) { @@ -150,6 +327,12 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { default: break; + case ISD::INLINEASM: { + SDNode *ResNode = SelectInlineAsm(N); + if (ResNode) + return ResNode; + break; + } case SPISD::GLOBAL_BASE_REG: return getGlobalBaseReg(); diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 4879d4e..5e70ffe 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -49,9 +49,9 @@ static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT, return true; } -static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) +static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg RegList[] = { SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 @@ -77,6 +77,29 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, return true; } +static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) +{ + static const MCPhysReg RegList[] = { + SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 + }; + + // Try to get first reg. + if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + return false; + + // Try to get second reg. 
+ if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + return false; + + return true; +} + // Allocate a full-sized argument for the 64-bit ABI. static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, @@ -202,12 +225,34 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain, RetOps.push_back(SDValue()); // Copy the result values into the output registers. - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), - OutVals[i], Flag); + SDValue Arg = OutVals[realRVLocIdx]; + + if (VA.needsCustom()) { + assert(VA.getLocVT() == MVT::v2i32); + // Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would + // happen by default if this wasn't a legal type) + + SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Arg, + DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Arg, + DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout()))); + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1, + Flag); + } else + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are stuck together with flags. Flag = Chain.getValue(1); @@ -355,6 +400,7 @@ LowerFormalArguments_32(SDValue Chain, CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32); const unsigned StackOffset = 92; + bool IsLittleEndian = DAG.getDataLayout().isLittleEndian(); unsigned InIdx = 0; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) { @@ -375,7 +421,8 @@ LowerFormalArguments_32(SDValue Chain, if (VA.isRegLoc()) { if (VA.needsCustom()) { - assert(VA.getLocVT() == MVT::f64); + assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); + unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi); SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32); @@ -396,9 +443,13 @@ LowerFormalArguments_32(SDValue Chain, &SP::IntRegsRegClass); LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32); } + + if (IsLittleEndian) + std::swap(LoVal, HiVal); + SDValue WholeValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal); - WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue); + WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue); InVals.push_back(WholeValue); continue; } @@ -422,7 +473,7 @@ LowerFormalArguments_32(SDValue Chain, auto PtrVT = getPointerTy(DAG.getDataLayout()); if (VA.needsCustom()) { - assert(VA.getValVT() == MVT::f64); + assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32); // If it is double-word aligned, just load.
if (Offset % 8 == 0) { int FI = MF.getFrameInfo()->CreateFixedObject(8, @@ -452,9 +503,12 @@ LowerFormalArguments_32(SDValue Chain, MachinePointerInfo(), false, false, false, 0); + if (IsLittleEndian) + std::swap(LoVal, HiVal); + SDValue WholeValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal); - WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue); + WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue); InVals.push_back(WholeValue); continue; } @@ -468,16 +522,12 @@ LowerFormalArguments_32(SDValue Chain, Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo(), false, false, false, 0); + } else if (VA.getValVT() == MVT::f128) { + report_fatal_error("SPARCv8 does not handle f128 in calls; " + "pass indirectly"); } else { - ISD::LoadExtType LoadOp = ISD::SEXTLOAD; - // Sparc is big endian, so add an offset based on the ObjectVT. - unsigned Offset = 4-std::max(1U, VA.getValVT().getSizeInBits()/8); - FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr, - DAG.getConstant(Offset, dl, MVT::i32)); - Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr, - MachinePointerInfo(), - VA.getValVT(), false, false, false,0); - Load = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Load); + // We shouldn't see any other value types here. + llvm_unreachable("Unexpected ValVT encountered in frame lowering."); } InVals.push_back(Load); } @@ -612,7 +662,7 @@ LowerFormalArguments_64(SDValue Chain, InVals.push_back(DAG.getLoad( VA.getValVT(), DL, Chain, DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())), - MachinePointerInfo::getFixedStack(FI), false, false, false, 0)); + MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0)); } if (!IsVarArg) @@ -640,9 +690,9 @@ LowerFormalArguments_64(SDValue Chain, SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true); auto PtrVT = getPointerTy(MF.getDataLayout()); - OutChains.push_back( - DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), - MachinePointerInfo::getFixedStack(FI), false, false, 0)); + OutChains.push_back(DAG.getStore( + Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0)); } if (!OutChains.empty()) @@ -788,7 +838,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } if (VA.needsCustom()) { - assert(VA.getLocVT() == MVT::f64); + assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); if (VA.isMemLoc()) { unsigned Offset = VA.getLocMemOffset() + StackOffset; @@ -804,49 +854,53 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } } - SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, - Arg, StackPtr, MachinePointerInfo(), - false, false, 0); - // Sparc is big-endian, so the high part comes first. - SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr, - MachinePointerInfo(), false, false, false, 0); - // Increment the pointer to the other half. - StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, - DAG.getIntPtrConstant(4, dl)); - // Load the low part. - SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr, - MachinePointerInfo(), false, false, false, 0); + if (VA.getLocVT() == MVT::f64) { + // Move the float value from the float registers into the + // integer registers.
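The EXTRACT_VECTOR_ELT pair that follows hands the two 32-bit halves of the f64 to integer argument locations, most-significant word first on this big-endian target. A standalone bit-level sketch of that split (illustrative only; the uint64_t view makes it endian-neutral, so it runs on any host):

#include <cassert>
#include <cstdint>
#include <cstring>

// Split an f64 into the two i32 parts passed in integer registers or
// stack slots: Part0 is the most-significant word, as on big-endian SPARC.
void splitF64(double D, uint32_t &Part0, uint32_t &Part1) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof Bits); // the ISD::BITCAST step
  Part0 = (uint32_t)(Bits >> 32);      // high word -> first location
  Part1 = (uint32_t)Bits;              // low word  -> second location
}

int main() {
  uint32_t Hi, Lo;
  splitF64(1.0, Hi, Lo); // IEEE-754 double 1.0 is 0x3FF0000000000000
  assert(Hi == 0x3FF00000u && Lo == 0u);
}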
+ + // TODO: The f64 -> v2i32 conversion is super-inefficient for + // constants: it sticks them in the constant pool, then loads + // to a fp register, then stores to temp memory, then loads to + // integer registers. + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg); + } + + SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + Arg, + DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + Arg, + DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout()))); if (VA.isRegLoc()) { - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Hi)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0)); assert(i+1 != e); CCValAssign &NextVA = ArgLocs[++i]; if (NextVA.isRegLoc()) { - RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo)); + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1)); } else { - // Store the low part in stack. + // Store the second part in stack. unsigned Offset = NextVA.getLocMemOffset() + StackOffset; SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32); SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo(), false, false, 0)); } } else { unsigned Offset = VA.getLocMemOffset() + StackOffset; - // Store the high part. + // Store the first part. SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32); SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Hi, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo(), false, false, 0)); - // Store the low part. + // Store the second part. PtrOff = DAG.getIntPtrConstant(Offset + 4, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo(), false, false, 0)); } @@ -990,8 +1044,8 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const if (!CalleeFn) return 0; - assert(CalleeFn->hasStructRetAttr() && - "Callee does not have the StructRet attribute."); + // It would be nice to check for the sret attribute on CalleeFn here, + // but since it is not part of the function type, any check will misfire. PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType()); Type *ElementTy = Ty->getElementType(); @@ -1370,15 +1424,60 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) { SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, const SparcSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - auto &DL = *TM.getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); + + // Instructions which use registers as conditionals examine all the + // bits (as does the pseudo SELECT_CC expansion). I don't think it + // matters much whether it's ZeroOrOneBooleanContent, or + // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the + // former. + setBooleanContents(ZeroOrOneBooleanContent); + setBooleanVectorContents(ZeroOrOneBooleanContent); // Set up the register classes. 
addRegisterClass(MVT::i32, &SP::IntRegsRegClass); addRegisterClass(MVT::f32, &SP::FPRegsRegClass); addRegisterClass(MVT::f64, &SP::DFPRegsRegClass); addRegisterClass(MVT::f128, &SP::QFPRegsRegClass); - if (Subtarget->is64Bit()) + if (Subtarget->is64Bit()) { addRegisterClass(MVT::i64, &SP::I64RegsRegClass); + } else { + // On 32bit sparc, we define a double-register 32bit register + // class, as well. This is modeled in LLVM as a 2-vector of i32. + addRegisterClass(MVT::v2i32, &SP::IntPairRegClass); + + // ...but almost all operations must be expanded, so set that as + // the default. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + setOperationAction(Op, MVT::v2i32, Expand); + } + // Truncating/extending stores/loads are also not supported. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand); + + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand); + + setTruncStoreAction(VT, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v2i32, VT, Expand); + } + // However, load and store *are* legal. + setOperationAction(ISD::LOAD, MVT::v2i32, Legal); + setOperationAction(ISD::STORE, MVT::v2i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal); + + // And we need to promote i64 loads/stores into vector load/store + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + + // Sadly, this doesn't work: + // AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); + // AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + } // Turn FP extload into load/fextend for (MVT VT : MVT::fp_valuetypes()) { @@ -1396,10 +1495,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f64, Expand); // Custom legalize GlobalAddress nodes into LO/HI parts. - setOperationAction(ISD::GlobalAddress, getPointerTy(DL), Custom); - setOperationAction(ISD::GlobalTLSAddress, getPointerTy(DL), Custom); - setOperationAction(ISD::ConstantPool, getPointerTy(DL), Custom); - setOperationAction(ISD::BlockAddress, getPointerTy(DL), Custom); + setOperationAction(ISD::GlobalAddress, PtrVT, Custom); + setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom); + setOperationAction(ISD::ConstantPool, PtrVT, Custom); + setOperationAction(ISD::BlockAddress, PtrVT, Custom); // Sparc doesn't have sext_inreg, replace them with shl/sra setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -1579,9 +1678,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); - setExceptionPointerRegister(SP::I0); - setExceptionSelectorRegister(SP::I1); - setStackPointerRegisterToSaveRestore(SP::O6); setOperationAction(ISD::CTPOP, MVT::i32, @@ -1744,18 +1840,15 @@ void SparcTargetLowering::computeKnownBitsForTargetNode // set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition. 
static void LookThroughSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, unsigned &SPCC) { - if (isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->isNullValue() && + if (isNullConstant(RHS) && CC == ISD::SETNE && (((LHS.getOpcode() == SPISD::SELECT_ICC || LHS.getOpcode() == SPISD::SELECT_XCC) && LHS.getOperand(3).getOpcode() == SPISD::CMPICC) || (LHS.getOpcode() == SPISD::SELECT_FCC && LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) && - isa<ConstantSDNode>(LHS.getOperand(0)) && - isa<ConstantSDNode>(LHS.getOperand(1)) && - cast<ConstantSDNode>(LHS.getOperand(0))->isOne() && - cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) { + isOneConstant(LHS.getOperand(0)) && + isNullConstant(LHS.getOperand(1))) { SDValue CMPCC = LHS.getOperand(3); SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue(); LHS = CMPCC.getOperand(0); @@ -1821,7 +1914,8 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setHasCalls(true); return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // This is one of the absolute code models. @@ -1872,6 +1966,9 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2601,6 +2698,17 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG) return DAG.getMergeValues(Ops, dl); } +static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) +{ + LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode()); + + EVT MemVT = LdNode->getMemoryVT(); + if (MemVT == MVT::f128) + return LowerF128Load(Op, DAG); + + return Op; +} + // Lower a f128 store into two f64 stores. static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -2645,6 +2753,29 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } +static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) +{ + SDLoc dl(Op); + StoreSDNode *St = cast<StoreSDNode>(Op.getNode()); + + EVT MemVT = St->getMemoryVT(); + if (MemVT == MVT::f128) + return LowerF128Store(Op, DAG); + + if (MemVT == MVT::i64) { + // Custom handling for i64 stores: turn it into a bitcast and a + // v2i32 store. 
+ SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue()); + SDValue Chain = DAG.getStore( + St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), St->getAlignment(), + St->getAAInfo()); + return Chain; + } + + return SDValue(); +} + static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) { assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS) && "invalid opcode"); @@ -2752,7 +2883,7 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG, SDValue MulResult = TLI.makeLibCall(DAG, RTLIB::MUL_I128, WideVT, - Args, 4, isSigned, dl).first; + Args, isSigned, dl).first; SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, MulResult, DAG.getIntPtrConstant(0, dl)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, @@ -2783,7 +2914,6 @@ static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) { return SDValue(); } - SDValue SparcTargetLowering:: LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -2818,8 +2948,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); - case ISD::LOAD: return LowerF128Load(Op, DAG); - case ISD::STORE: return LowerF128Store(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::FADD: return LowerF128Op(Op, DAG, getLibcallName(RTLIB::ADD_F128), 2); case ISD::FSUB: return LowerF128Op(Op, DAG, @@ -2921,8 +3051,7 @@ SparcTargetLowering::expandSelectCC(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -3007,7 +3136,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI, .addReg(AddrReg).addImm(0); // Split the basic block MBB before MI and insert the loop block in the hole. - MachineFunction::iterator MFI = MBB; + MachineFunction::iterator MFI = MBB->getIterator(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -3149,9 +3278,12 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - return std::make_pair(0U, &SP::IntRegsRegClass); + if (VT == MVT::v2i32) + return std::make_pair(0U, &SP::IntPairRegClass); + else + return std::make_pair(0U, &SP::IntRegsRegClass); } - } else if (!Constraint.empty() && Constraint.size() <= 5 + } else if (!Constraint.empty() && Constraint.size() <= 5 && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { // constraint = '{r<d>}' // Remove the braces from around the name. @@ -3227,5 +3359,24 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N, getLibcallName(libCall), 1)); return; + case ISD::LOAD: { + LoadSDNode *Ld = cast<LoadSDNode>(N); + // Custom handling only for i64: turn i64 load into a v2i32 load, + // and a bitcast. 
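The bitcast-and-store rewrite in LowerSTORE above is behavior-preserving only because SPARC is big-endian: storing the two v2i32 words in order, word 0 at the lower address, reproduces the byte image of a direct 64-bit store. A quick standalone check of that equivalence (plain C++):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t V = 0x0123456789abcdefULL;

  // Byte image of a direct big-endian 64-bit store.
  unsigned char Direct[8];
  for (int I = 0; I < 8; ++I)
    Direct[I] = (unsigned char)(V >> (56 - 8 * I));

  // Byte image of storing the two 32-bit words in order, word 0
  // (the most-significant word) at the lower address.
  uint32_t W0 = uint32_t(V >> 32), W1 = uint32_t(V);
  unsigned char Split[8];
  for (int I = 0; I < 4; ++I) {
    Split[I]     = (unsigned char)(W0 >> (24 - 8 * I));
    Split[4 + I] = (unsigned char)(W1 >> (24 - 8 * I));
  }

  assert(std::memcmp(Direct, Split, sizeof Direct) == 0);
  return 0;
}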
+ if (Ld->getValueType(0) != MVT::i64 || Ld->getMemoryVT() != MVT::i64) + return; + + SDLoc dl(N); + SDValue LoadRes = DAG.getExtLoad( + Ld->getExtensionType(), dl, MVT::v2i32, + Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + MVT::v2i32, Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment(), Ld->getAAInfo()); + + SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes); + Results.push_back(Res); + Results.push_back(LoadRes.getValue(1)); + return; + } } } diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h index bbc91a4..4e46709 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -89,6 +89,20 @@ namespace llvm { return MVT::i32; } + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return SP::I0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return SP::I1; + } + /// getSetCCResultType - Return the ISD::SETCC ValueType EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -167,8 +181,8 @@ namespace llvm { } void ReplaceNodeResults(SDNode *N, - SmallVectorImpl<SDValue>& Results, - SelectionDAG &DAG) const override; + SmallVectorImpl<SDValue>& Results, + SelectionDAG &DAG) const override; MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB, unsigned BROpcode) const; diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td index 25cc652..d51e2cc 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td @@ -250,6 +250,7 @@ defm : int_cond_alias<"n", 0b0000>; defm : int_cond_alias<"ne", 0b1001>; defm : int_cond_alias<"nz", 0b1001>; // same as ne defm : int_cond_alias<"e", 0b0001>; +defm : int_cond_alias<"eq", 0b0001>; // same as e defm : int_cond_alias<"z", 0b0001>; // same as e defm : int_cond_alias<"g", 0b1010>; defm : int_cond_alias<"le", 0b0010>; @@ -429,6 +430,9 @@ def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; def : InstAlias<"flush", (FLUSH), 0>; +def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>; +def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>; + def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>; def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>; @@ -450,3 +454,8 @@ def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1, QFPRegs:$rs2)>, Requires<[HasHardQuad]>; +// signx rd -> sra rd, %g0, rd +def : InstAlias<"signx $rd", (SRArr IntRegs:$rd, IntRegs:$rd, G0), 0>, Requires<[HasV9]>; + +// signx reg, rd -> sra reg, %g0, rd +def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Requires<[HasV9]>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 6167c53..05006ac 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -83,7 +83,6 @@ static bool IsIntegerCC(unsigned CC) return (CC <= SPCC::ICC_VC); } - static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) { switch(CC) { @@ -124,106 
+123,103 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) llvm_unreachable("Invalid cond code"); } -bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const -{ +static bool isUncondBranchOpcode(int Opc) { return Opc == SP::BA; } - MachineBasicBlock::iterator I = MBB.end(); - MachineBasicBlock::iterator UnCondBrIter = MBB.end(); - while (I != MBB.begin()) { - --I; - - if (I->isDebugValue()) - continue; +static bool isCondBranchOpcode(int Opc) { + return Opc == SP::FBCOND || Opc == SP::BCOND; +} - // When we see a non-terminator, we are done. - if (!isUnpredicatedTerminator(I)) - break; +static bool isIndirectBranchOpcode(int Opc) { + return Opc == SP::BINDrr || Opc == SP::BINDri; +} - // Terminator is not a branch. - if (!I->isBranch()) - return true; +static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, + SmallVectorImpl<MachineOperand> &Cond) { + Cond.push_back(MachineOperand::CreateImm(LastInst->getOperand(1).getImm())); + Target = LastInst->getOperand(0).getMBB(); +} - // Handle Unconditional branches. - if (I->getOpcode() == SP::BA) { - UnCondBrIter = I; +bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return false; + + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + unsigned LastOpc = LastInst->getOpcode(); + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } - if (!AllowModify) { - TBB = I->getOperand(0).getMBB(); - continue; + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); } + } + } - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); - - Cond.clear(); - FBB = nullptr; + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; - if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = nullptr; - I->eraseFromParent(); - I = MBB.end(); - UnCondBrIter = MBB.end(); - continue; - } + // If the block ends with a B and a Bcc, handle it. 
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } - TBB = I->getOperand(0).getMBB(); - continue; - } + // If the block ends with two unconditional branches, handle it. The second + // one is not executed. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + return false; + } - unsigned Opcode = I->getOpcode(); - if (Opcode != SP::BCOND && Opcode != SP::FBCOND) - return true; // Unknown Opcode. - - SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm(); - - if (Cond.empty()) { - MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); - if (AllowModify && UnCondBrIter != MBB.end() && - MBB.isLayoutSuccessor(TargetBB)) { - - // Transform the code - // - // brCC L1 - // ba L2 - // L1: - // .. - // L2: - // - // into - // - // brnCC L2 - // L1: - // ... - // L2: - // - BranchCode = GetOppositeBranchCondition(BranchCode); - MachineBasicBlock::iterator OldInst = I; - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(Opcode)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()).addImm(BranchCode); - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(SP::BA)) - .addMBB(TargetBB); - - OldInst->eraseFromParent(); - UnCondBrIter->eraseFromParent(); - - UnCondBrIter = MBB.end(); - I = MBB.end(); - continue; - } - FBB = TBB; - TBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); - continue; - } - // FIXME: Handle subsequent conditional branches. - // For now, we can't handle multiple conditional branches. + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. + if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); return true; } - return false; + + // Otherwise, can't handle this. 
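The rewritten AnalyzeBranch reduces to a small decision table over the last one or two terminators. A toy model of that contract (plain C++; the opcode names and int-valued block handles are illustrative, and returning true means the pattern is not understood and the block is left alone):

#include <vector>

enum Opc { BA, BCOND, BIND, OTHER };
struct Term { Opc Op; int Target; int Cond; };

// TBB/FBB/Cond mirror the AnalyzeBranch outputs; -1 means "not set".
static bool analyze(const std::vector<Term> &Terms,
                    int &TBB, int &FBB, int &Cond) {
  TBB = FBB = Cond = -1;
  if (Terms.empty())
    return false;                             // block falls through
  const Term &Last = Terms.back();
  if (Terms.size() == 1) {
    if (Last.Op == BA)    { TBB = Last.Target; return false; }
    if (Last.Op == BCOND) { TBB = Last.Target; Cond = Last.Cond;
                            return false; }   // condbranch + fall-through
    return true;                              // indirect branch: give up
  }
  const Term &Prev = Terms[Terms.size() - 2];
  if (Prev.Op == BCOND && Last.Op == BA) {    // Bcc TBB; BA FBB
    TBB = Prev.Target; Cond = Prev.Cond; FBB = Last.Target;
    return false;
  }
  if (Prev.Op == BA && Last.Op == BA) {       // second BA never executes
    TBB = Prev.Target;
    return false;
  }
  return true;                                // anything else: unanalyzable
}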
+ return true; } unsigned @@ -277,6 +273,14 @@ unsigned SparcInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const return Count; } +bool SparcInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1); + SPCC::CondCodes CC = static_cast<SPCC::CondCodes>(Cond[0].getImm()); + Cond[0].setImm(GetOppositeBranchCondition(CC)); + return false; +} + void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -284,7 +288,9 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned numSubRegs = 0; unsigned movOpc = 0; const unsigned *subRegIdx = nullptr; + bool ExtraG0 = false; + const unsigned DW_SubRegsIdx[] = { SP::sub_even, SP::sub_odd }; const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd }; const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 }; const unsigned QFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd, @@ -294,7 +300,12 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SP::IntRegsRegClass.contains(DestReg, SrcReg)) BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0) .addReg(SrcReg, getKillRegState(KillSrc)); - else if (SP::FPRegsRegClass.contains(DestReg, SrcReg)) + else if (SP::IntPairRegClass.contains(DestReg, SrcReg)) { + subRegIdx = DW_SubRegsIdx; + numSubRegs = 2; + movOpc = SP::ORrr; + ExtraG0 = true; + } else if (SP::FPRegsRegClass.contains(DestReg, SrcReg)) BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) { @@ -347,7 +358,11 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]); assert(Dst && Src && "Bad sub-register"); - MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst); + if (ExtraG0) + MIB.addReg(SP::G0); + MIB.addReg(Src); + MovMI = MIB.getInstr(); } // Add implicit super-register defs and kills to the last MovMI. MovMI->addRegisterDefined(DestReg, TRI); @@ -365,19 +380,20 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // On the order of operands here: think "[FrameIdx + 0] = SrcReg". 
- if (RC == &SP::I64RegsRegClass) + if (RC == &SP::I64RegsRegClass) BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); else if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); + else if (RC == &SP::IntPairRegClass) + BuildMI(MBB, I, DL, get(SP::STDri)).addFrameIndex(FI).addImm(0) + .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); @@ -403,11 +419,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); if (RC == &SP::I64RegsRegClass) BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0) @@ -415,6 +429,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, else if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0) .addMemOperand(MMO); + else if (RC == &SP::IntPairRegClass) + BuildMI(MBB, I, DL, get(SP::LDDri), DestReg).addFrameIndex(FI).addImm(0) + .addMemOperand(MMO); else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0) .addMemOperand(MMO); diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h index 15673f1..9de624c 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -76,6 +76,9 @@ public: MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td index 3b9e048..ec37c22 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -283,17 +283,32 @@ multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, [(set Ty:$dst, (OpNode ADDRri:$addr))]>; } +// TODO: Instructions of the LoadASI class are currently asm only; hooking up +// CodeGen's address spaces to use these is a future task. +class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, + RegisterClass RC, ValueType Ty> : + F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi), + !strconcat(OpcStr, "a [$addr] $asi, $dst"), + []>; + // LoadA multiclass - As above, but also define alternate address space variant multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : Load<OpcStr, Op3Val, OpNode, RC, Ty> { - // TODO: The LD*Arr instructions are currently asm only; hooking up - // CodeGen's address spaces to use these is a future task. 
- def Arr : F3_1_asi<3, LoadAOp3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi), - !strconcat(OpcStr, "a [$addr] $asi, $dst"), - []>; + def Arr : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty>; } +// The LDSTUB instruction is supported for asm only. +// It is unlikely that general-purpose code could make use of it. +// CAS is preferred for sparc v9. +def LDSTUBrr : F3_1<3, 0b001101, (outs IntRegs:$dst), (ins MEMrr:$addr), + "ldstub [$addr], $dst", []>; +def LDSTUBri : F3_2<3, 0b001101, (outs IntRegs:$dst), (ins MEMri:$addr), + "ldstub [$addr], $dst", []>; +def LDSTUBArr : F3_1_asi<3, 0b011101, (outs IntRegs:$dst), + (ins MEMrr:$addr, i8imm:$asi), + "ldstuba [$addr] $asi, $dst", []>; + // Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot. multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> { @@ -307,14 +322,18 @@ multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, [(OpNode Ty:$rd, ADDRri:$addr)]>; } -multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val, +// TODO: Instructions of the StoreASI class are currently asm only; hooking up +// CodeGen's address spaces to use these is a future task. +class StoreASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : - Store<OpcStr, Op3Val, OpNode, RC, Ty> { - // TODO: The ST*Arr instructions are currently asm only; hooking up - // CodeGen's address spaces to use these is a future task. - def Arr : F3_1_asi<3, StoreAOp3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi), + F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi), !strconcat(OpcStr, "a $rd, [$addr] $asi"), []>; + +multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val, + SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : + Store<OpcStr, Op3Val, OpNode, RC, Ty> { + def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty>; } //===----------------------------------------------------------------------===// @@ -408,15 +427,40 @@ let DecoderMethod = "DecodeLoadInt" in { defm LD : LoadA<"ld", 0b000000, 0b010000, load, IntRegs, i32>; } +let DecoderMethod = "DecodeLoadIntPair" in + defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32>; + // Section B.2 - Load Floating-point Instructions, p. 
92 -let DecoderMethod = "DecodeLoadFP" in - defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>; -let DecoderMethod = "DecodeLoadDFP" in - defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>; +let DecoderMethod = "DecodeLoadFP" in { + defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>; + def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32>, + Requires<[HasV9]>; +} +let DecoderMethod = "DecodeLoadDFP" in { + defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>; + def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>, + Requires<[HasV9]>; +} let DecoderMethod = "DecodeLoadQFP" in - defm LDQF : Load<"ldq", 0b100010, load, QFPRegs, f128>, + defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; +let DecoderMethod = "DecodeLoadFP" in + let Defs = [FSR] in { + let rd = 0 in { + def LDFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr), + "ld [$addr], %fsr", []>; + def LDFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr), + "ld [$addr], %fsr", []>; + } + let rd = 1 in { + def LDXFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr), + "ldx [$addr], %fsr", []>, Requires<[HasV9]>; + def LDXFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr), + "ldx [$addr], %fsr", []>, Requires<[HasV9]>; + } + } + // Section B.4 - Store Integer Instructions, p. 95 let DecoderMethod = "DecodeStoreInt" in { defm STB : StoreA<"stb", 0b000101, 0b010101, truncstorei8, IntRegs, i32>; @@ -424,15 +468,40 @@ let DecoderMethod = "DecodeStoreInt" in { defm ST : StoreA<"st", 0b000100, 0b010100, store, IntRegs, i32>; } +let DecoderMethod = "DecodeStoreIntPair" in + defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32>; + // Section B.5 - Store Floating-point Instructions, p. 97 -let DecoderMethod = "DecodeStoreFP" in +let DecoderMethod = "DecodeStoreFP" in { defm STF : Store<"st", 0b100100, store, FPRegs, f32>; -let DecoderMethod = "DecodeStoreDFP" in - defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>; + def STFArr : StoreASI<"st", 0b110100, store, FPRegs, f32>, + Requires<[HasV9]>; +} +let DecoderMethod = "DecodeStoreDFP" in { + defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>; + def STDFArr : StoreASI<"std", 0b110111, store, DFPRegs, f64>, + Requires<[HasV9]>; +} let DecoderMethod = "DecodeStoreQFP" in - defm STQF : Store<"stq", 0b100110, store, QFPRegs, f128>, + defm STQF : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; +let DecoderMethod = "DecodeStoreFP" in + let Defs = [FSR] in { + let rd = 0 in { + def STFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins), + "st %fsr, [$addr]", []>; + def STFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins), + "st %fsr, [$addr]", []>; + } + let rd = 1 in { + def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins), + "stx %fsr, [$addr]", []>, Requires<[HasV9]>; + def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins), + "stx %fsr, [$addr]", []>, Requires<[HasV9]>; + } + } + // Section B.8 - SWAP Register with Memory Instruction // (Atomic swap) let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in { @@ -559,6 +628,10 @@ let Defs = [Y, ICC] in { defm SMULCC : F3_12np<"smulcc", 0b011011>; } +let Defs = [Y, ICC], Uses = [Y, ICC] in { + defm MULSCC : F3_12np<"mulscc", 0b100100>; +} + // Section B.19 - Divide Instructions, p. 115 let Uses = [Y], Defs = [Y] in { defm UDIV : F3_12np<"udiv", 0b001110>; @@ -1221,8 +1294,8 @@ let Predicates = [HasV9] in { // the top 32-bits before using it. To do this clearing, we use a SRLri X,0. 
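A standalone illustration of why that SRLri $src, 0 is needed (plain C++; popc64 stands in for the 64-bit POPC instruction): a negative i32 sitting sign-extended in a 64-bit register carries 32 extra copies of the sign bit, and a V9 32-bit logical shift by zero is the cheapest way to clear them.

#include <cassert>
#include <cstdint>

static int popc64(uint64_t X) {            // stands in for POPC
  int N = 0;
  for (; X; X &= X - 1)
    ++N;
  return N;
}

static int ctpop32(int32_t Src) {
  uint64_t Wide = (uint64_t)(int64_t)Src;  // sign-extended register image
  uint64_t Zext = Wide & 0xffffffffULL;    // effect of srl %src, 0 on V9
  return popc64(Zext);
}

int main() {
  assert(ctpop32(-1) == 32);  // without the clearing it would count 64
  return 0;
}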
let rs1 = 0 in def POPCrr : F3_1<2, 0b101110, - (outs IntRegs:$dst), (ins IntRegs:$src), - "popc $src, $dst", []>, Requires<[HasV9]>; + (outs IntRegs:$rd), (ins IntRegs:$rs2), + "popc $rs2, $rd", []>, Requires<[HasV9]>; def : Pat<(ctpop i32:$src), (POPCrr (SRLri $src, 0))>; @@ -1254,6 +1327,25 @@ let hasSideEffects = 1 in { } } + +// Section A.43 - Read Privileged Register Instructions +let Predicates = [HasV9] in { +let rs2 = 0 in + def RDPR : F3_1<2, 0b101010, + (outs IntRegs:$rd), (ins PRRegs:$rs1), + "rdpr $rs1, $rd", []>; +} + +// Section A.62 - Write Privileged Register Instructions +let Predicates = [HasV9] in { + def WRPRrr : F3_1<2, 0b110010, + (outs PRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2), + "wrpr $rs1, $rs2, $rd", []>; + def WRPRri : F3_2<2, 0b110010, + (outs PRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13), + "wrpr $rs1, $simm13, $rd", []>; +} + //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// @@ -1327,6 +1419,18 @@ def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>; def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>; def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>; +// extract_vector +def : Pat<(extractelt (v2i32 IntPair:$Rn), 0), + (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_even))>; +def : Pat<(extractelt (v2i32 IntPair:$Rn), 1), + (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_odd))>; + +// build_vector +def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)), + (INSERT_SUBREG + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 IntRegs:$a1), sub_even), + (i32 IntRegs:$a2), sub_odd)>; + include "SparcInstr64Bit.td" include "SparcInstrVIS.td" diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index 9667bc0..da31783 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -75,6 +75,18 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(SP::G6); Reserved.set(SP::G7); + // Also reserve the register pair aliases covering the above + // registers, with the same conditions. + Reserved.set(SP::G0_G1); + if (ReserveAppRegisters) + Reserved.set(SP::G2_G3); + if (ReserveAppRegisters || !Subtarget.is64Bit()) + Reserved.set(SP::G4_G5); + + Reserved.set(SP::O6_O7); + Reserved.set(SP::I6_I7); + Reserved.set(SP::G6_G7); + // Unaliased double registers are not available in non-V9 targets. if (!Subtarget.isV9()) { for (unsigned n = 0; n != 16; ++n) { @@ -158,21 +170,15 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; DebugLoc dl = MI.getDebugLoc(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - - // Addressable stack objects are accessed using neg. offsets from %fp MachineFunction &MF = *MI.getParent()->getParent(); const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); - int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MI.getOperand(FIOperandNum + 1).getImm() + - Subtarget.getStackPointerBias(); - SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); - unsigned FramePtr = SP::I6; - if (FuncInfo->isLeafProc()) { - // Use %sp and adjust offset if needed. - FramePtr = SP::O6; - int stackSize = MF.getFrameInfo()->getStackSize(); - Offset += (stackSize) ? 
Subtarget.getAdjustedFrameSize(stackSize) : 0 ; - } + const SparcFrameLowering *TFI = getFrameLowering(MF); + + unsigned FrameReg; + int Offset; + Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg); + + Offset += MI.getOperand(FIOperandNum + 1).getImm(); if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) { if (MI.getOpcode() == SP::STQFri) { @@ -182,8 +188,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri)) - .addReg(FramePtr).addImm(0).addReg(SrcEvenReg); - replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr); + .addReg(FrameReg).addImm(0).addReg(SrcEvenReg); + replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg); MI.setDesc(TII.get(SP::STDFri)); MI.getOperand(2).setReg(SrcOddReg); Offset += 8; @@ -194,8 +200,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg) - .addReg(FramePtr).addImm(0); - replaceFI(MF, II, *StMI, dl, 1, Offset, FramePtr); + .addReg(FrameReg).addImm(0); + replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg); MI.setDesc(TII.get(SP::LDDFri)); MI.getOperand(0).setReg(DestOddReg); @@ -203,7 +209,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } - replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr); + replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg); } @@ -211,3 +217,25 @@ unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return SP::I6; } +// Sparc has no architectural need for stack realignment support, +// except that LLVM unfortunately currently implements overaligned +// stack objects by depending upon stack realignment support. +// If that ever changes, this can probably be deleted. +bool SparcRegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + + // Sparc always has a fixed frame pointer register, so don't need to + // worry about needing to reserve it. [even if we don't have a frame + // pointer for our frame, it still cannot be used for other things, + // or register window traps will be SADNESS.] + + // If there's a reserved call frame, we can use SP to access locals. + if (getFrameLowering(MF)->hasReservedCallFrame(MF)) + return true; + + // Otherwise, we'd need a base pointer, but those aren't implemented + // for SPARC at the moment. + + return false; +} diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h index 764a894..32075b1 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -42,8 +42,10 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const; - // Debug information queries. 
unsigned getFrameRegister(const MachineFunction &MF) const override; + + bool canRealignStack(const MachineFunction &MF) const override; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td index db8a7e8..cca9463 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td @@ -32,6 +32,12 @@ def sub_odd64 : SubRegIndex<64, 64>; // Ri - 32-bit integer registers class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>; +// Rdi - pairs of 32-bit integer registers +class Rdi<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> { + let SubRegs = subregs; + let SubRegIndices = [sub_even, sub_odd]; + let CoveredBySubRegs = 1; +} // Rf - 32-bit floating-point registers class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>; @@ -54,6 +60,8 @@ def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code. foreach I = 0-3 in def FCC#I : SparcCtrlReg<I, "FCC"#I>; +def FSR : SparcCtrlReg<0, "FSR">; // Floating-point state register. + // Y register def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>; // Ancillary state registers (implementation defined) @@ -94,6 +102,22 @@ def PSR : SparcCtrlReg<0, "PSR">; def WIM : SparcCtrlReg<0, "WIM">; def TBR : SparcCtrlReg<0, "TBR">; +def TPC : SparcCtrlReg<0, "TPC">; +def TNPC : SparcCtrlReg<1, "TNPC">; +def TSTATE : SparcCtrlReg<2, "TSTATE">; +def TT : SparcCtrlReg<3, "TT">; +def TICK : SparcCtrlReg<4, "TICK">; +def TBA : SparcCtrlReg<5, "TBA">; +def PSTATE : SparcCtrlReg<6, "PSTATE">; +def TL : SparcCtrlReg<7, "TL">; +def PIL : SparcCtrlReg<8, "PIL">; +def CWP : SparcCtrlReg<9, "CWP">; +def CANSAVE : SparcCtrlReg<10, "CANSAVE">; +def CANRESTORE : SparcCtrlReg<11, "CANRESTORE">; +def CLEANWIN : SparcCtrlReg<12, "CLEANWIN">; +def OTHERWIN : SparcCtrlReg<13, "OTHERWIN">; +def WSTATE : SparcCtrlReg<14, "WSTATE">; + // Integer registers def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>; def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>; @@ -217,6 +241,24 @@ def Q13 : Rq<21, "F52", [D26, D27]>; def Q14 : Rq<25, "F56", [D28, D29]>; def Q15 : Rq<29, "F60", [D30, D31]>; +// Aliases of the integer registers used for LDD/STD double-word operations +def G0_G1 : Rdi<0, "G0", [G0, G1]>; +def G2_G3 : Rdi<2, "G2", [G2, G3]>; +def G4_G5 : Rdi<4, "G4", [G4, G5]>; +def G6_G7 : Rdi<6, "G6", [G6, G7]>; +def O0_O1 : Rdi<8, "O0", [O0, O1]>; +def O2_O3 : Rdi<10, "O2", [O2, O3]>; +def O4_O5 : Rdi<12, "O4", [O4, O5]>; +def O6_O7 : Rdi<14, "O6", [O6, O7]>; +def L0_L1 : Rdi<16, "L0", [L0, L1]>; +def L2_L3 : Rdi<18, "L2", [L2, L3]>; +def L4_L5 : Rdi<20, "L4", [L4, L5]>; +def L6_L7 : Rdi<22, "L6", [L6, L7]>; +def I0_I1 : Rdi<24, "I0", [I0, I1]>; +def I2_I3 : Rdi<26, "I2", [I2, I3]>; +def I4_I5 : Rdi<28, "I4", [I4, I5]>; +def I6_I7 : Rdi<30, "I6", [I6, I7]>; + // Register classes. // // FIXME: the register order should be defined in terms of the preferred @@ -231,6 +273,13 @@ def IntRegs : RegisterClass<"SP", [i32, i64], 32, (sequence "L%u", 0, 7), (sequence "O%u", 0, 7))>; +// Should be in the same order as IntRegs. +def IntPair : RegisterClass<"SP", [v2i32], 64, + (add I0_I1, I2_I3, I4_I5, I6_I7, + G0_G1, G2_G3, G4_G5, G6_G7, + L0_L1, L2_L3, L4_L5, L6_L7, + O0_O1, O2_O3, O4_O5, O6_O7)>; + // Register class for 64-bit mode, with a 64-bit spill slot size. // These are the same as the 32-bit registers, so TableGen will consider this // to be a sub-class of IntRegs. 
That works out because requiring a 64-bit @@ -252,3 +301,8 @@ def ASRRegs : RegisterClass<"SP", [i32], 32, (add Y, (sequence "ASR%u", 1, 31))> { let isAllocatable = 0; } + +// Privileged Registers +def PRRegs : RegisterClass<"SP", [i64], 64, + (add TPC, TNPC, TSTATE, TT, TICK, TBA, PSTATE, TL, PIL, CWP, + CANSAVE, CANRESTORE, CLEANWIN, OTHERWIN, WSTATE)>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp index d69da40..d701594 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -64,7 +64,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { frameSize += 128; // Frames with calls must also reserve space for 6 outgoing arguments // whether they are used or not. LowerCall_64 takes care of that. - assert(frameSize % 16 == 0 && "Stack size not 16-byte aligned"); + frameSize = RoundUpToAlignment(frameSize, 16); } else { // Emit the correct save instruction based on the number of bytes in // the frame. Minimum stack frame size according to V8 ABI is: @@ -81,3 +81,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { } return frameSize; } + +bool SparcSubtarget::enableMachineScheduler() const { + return true; +} diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h index 9d21911..e2fd2f0 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -60,6 +60,8 @@ public: return &TSInfo; } + bool enableMachineScheduler() const override; + bool isV9() const { return IsV9; } bool isVIS() const { return IsVIS; } bool isVIS2() const { return IsVIS2; } @@ -85,7 +87,6 @@ public: /// returns adjusted framesize which includes space for register window /// spills and arguments. int getAdjustedFrameSize(int stackSize) const; - }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 3aa4c6b..9c995bf 100644 --- a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -349,7 +349,6 @@ class SystemZAsmParser : public MCTargetAsmParser { #include "SystemZGenAsmMatcher.inc" private: - MCSubtargetInfo &STI; MCAsmParser &Parser; enum RegisterGroup { RegGR, @@ -386,14 +385,14 @@ private: bool parseOperand(OperandVector &Operands, StringRef Mnemonic); public: - SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser) { + : MCTargetAsmParser(Options, sti), Parser(parser) { MCAsmParserExtension::Initialize(Parser); // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } // Override MCTargetAsmParser. @@ -533,14 +532,16 @@ bool SystemZAsmParser::parseRegister(Register &Reg) { } // Parse a register of group Group. If Regs is nonnull, use it to map -// the raw register number to LLVM numbering, with zero entries indicating -// an invalid register. IsAddress says whether the register appears in an -// address context. +// the raw register number to LLVM numbering, with zero entries +// indicating an invalid register. 
IsAddress says whether the +// register appears in an address context. Allow FP Group if expecting +// RegV Group, since the f-prefix yields the FP group even while used +// with vector instructions. bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs, bool IsAddress) { if (parseRegister(Reg)) return true; - if (Reg.Group != Group) + if (Reg.Group != Group && !(Reg.Group == RegFP && Group == RegV)) return Error(Reg.StartLoc, "invalid operand for instruction"); if (Regs && Regs[Reg.Num] == 0) return Error(Reg.StartLoc, "invalid register pair"); @@ -791,7 +792,7 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchResult) { case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: { diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp index 059ae3f..6444cf8 100644 --- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp @@ -60,15 +60,15 @@ void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { O << '%' << getRegisterName(RegNo); } -template<unsigned N> -void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { +template <unsigned N> +static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { int64_t Value = MI->getOperand(OpNum).getImm(); assert(isUInt<N>(Value) && "Invalid uimm argument"); O << Value; } -template<unsigned N> -void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { +template <unsigned N> +static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { int64_t Value = MI->getOperand(OpNum).getImm(); assert(isInt<N>(Value) && "Invalid simm argument"); O << Value; diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h index ba55e68..7ca386f 100644 --- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h +++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h @@ -15,7 +15,6 @@ #define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/Compiler.h" namespace llvm { class MCOperand; diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 5fefa31..2115d44 100644 --- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -226,7 +226,7 @@ extern "C" void LLVMInitializeSystemZTargetMC() { // Register the MCCodeEmitter. TargetRegistry::RegisterMCCodeEmitter(TheSystemZTarget, - createSystemZMCCodeEmitter); + createSystemZMCCodeEmitter); // Register the MCInstrInfo. TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget, diff --git a/contrib/llvm/lib/Target/SystemZ/README.txt b/contrib/llvm/lib/Target/SystemZ/README.txt index e089047..cd367d6 100644 --- a/contrib/llvm/lib/Target/SystemZ/README.txt +++ b/contrib/llvm/lib/Target/SystemZ/README.txt @@ -52,12 +52,6 @@ We don't use the TEST DATA CLASS instructions. -- -We could use the generic floating-point forms of LOAD COMPLEMENT, -LOAD NEGATIVE and LOAD POSITIVE in cases where we don't need the -condition codes. 
For example, we could use LCDFR instead of LCDBR. - --- - We only use MVC, XC and CLC for constant-length block operations. We could extend them to variable-length operations too, using EXECUTE RELATIVE LONG. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 3dca7bd..7527311 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -288,7 +288,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { MCSymbolRefExpr::create(getSymbol(ZCPV->getGlobalValue()), getModifierVariantKind(ZCPV->getModifier()), OutContext); - uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType()); + uint64_t Size = getDataLayout().getTypeAllocSize(ZCPV->getType()); OutStreamer->EmitValue(Expr, Size); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp index 44ea1d2..4a6beb6 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp @@ -26,21 +26,6 @@ SystemZConstantPoolValue::Create(const GlobalValue *GV, return new SystemZConstantPoolValue(GV, Modifier); } -unsigned SystemZConstantPoolValue::getRelocationInfo() const { - switch (Modifier) { - case SystemZCP::TLSGD: - case SystemZCP::TLSLDM: - case SystemZCP::DTPOFF: - // May require a dynamic relocation. - return 2; - case SystemZCP::NTPOFF: - // May require a relocation, but the relocations are always resolved - // by the static linker. - return 1; - } - llvm_unreachable("Unknown modifier"); -} - int SystemZConstantPoolValue:: getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) { unsigned AlignMask = Alignment - 1; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h index e5f1bb1..a71b595 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h @@ -43,7 +43,6 @@ public: Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier); // Override MachineConstantPoolValue. - unsigned getRelocationInfo() const override; int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) override; void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index 16f9adc..4818ed0 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -37,13 +37,11 @@ namespace { // instructions. struct Reference { Reference() - : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {} + : Def(false), Use(false) {} Reference &operator|=(const Reference &Other) { Def |= Other.Def; - IndirectDef |= Other.IndirectDef; Use |= Other.Use; - IndirectUse |= Other.IndirectUse; return *this; } @@ -53,11 +51,6 @@ struct Reference { // via a sub- or super-register. bool Def; bool Use; - - // True if the register is defined or used indirectly, by a sub- or - // super-register. - bool IndirectDef; - bool IndirectUse; }; class SystemZElimCompare : public MachineFunctionPass { @@ -104,14 +97,12 @@ static bool isCCLiveOut(MachineBasicBlock &MBB) { return false; } -// Return true if any CC result of MI would reflect the value of subreg -// SubReg of Reg. 
-static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) { +// Return true if any CC result of MI would reflect the value of Reg. +static bool resultTests(MachineInstr *MI, unsigned Reg) { if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && MI->getOperand(0).isDef() && - MI->getOperand(0).getReg() == Reg && - MI->getOperand(0).getSubReg() == SubReg) + MI->getOperand(0).getReg() == Reg) return true; switch (MI->getOpcode()) { @@ -127,30 +118,25 @@ static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) { case SystemZ::LTEBR: case SystemZ::LTDBR: case SystemZ::LTXBR: - if (MI->getOperand(1).getReg() == Reg && - MI->getOperand(1).getSubReg() == SubReg) + if (MI->getOperand(1).getReg() == Reg) return true; } return false; } -// Describe the references to Reg in MI, including sub- and super-registers. +// Describe the references to Reg or any of its aliases in MI. Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) { Reference Ref; for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI->getOperand(I); if (MO.isReg()) { if (unsigned MOReg = MO.getReg()) { - if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) { - if (MO.isUse()) { + if (TRI->regsOverlap(MOReg, Reg)) { + if (MO.isUse()) Ref.Use = true; - Ref.IndirectUse |= (MOReg != Reg); - } - if (MO.isDef()) { + else if (MO.isDef()) Ref.Def = true; - Ref.IndirectDef |= (MOReg != Reg); - } } } } @@ -158,6 +144,30 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) { return Ref; } +// Return true if this is a load and test which can be optimized the +// same way as compare instruction. +static bool isLoadAndTestAsCmp(MachineInstr *MI) { + // If we during isel used a load-and-test as a compare with 0, the + // def operand is dead. + return ((MI->getOpcode() == SystemZ::LTEBR || + MI->getOpcode() == SystemZ::LTDBR || + MI->getOpcode() == SystemZ::LTXBR) && + MI->getOperand(0).isDead()); +} + +// Return the source register of Compare, which is the unknown value +// being tested. +static unsigned getCompareSourceReg(MachineInstr *Compare) { + unsigned reg = 0; + if (Compare->isCompare()) + reg = Compare->getOperand(0).getReg(); + else if (isLoadAndTestAsCmp(Compare)) + reg = Compare->getOperand(1).getReg(); + assert (reg); + + return reg; +} + // Compare compares the result of MI against zero. If MI is an addition // of -1 and if CCUsers is a single branch on nonzero, eliminate the addition // and convert the branch to a BRCT(G). Return true on success. @@ -188,7 +198,7 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare, // We already know that there are no references to the register between // MI and Compare. Make sure that there are also no references between // Compare and Branch. - unsigned SrcReg = Compare->getOperand(0).getReg(); + unsigned SrcReg = getCompareSourceReg(Compare); MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; for (++MBBI; MBBI != MBBE; ++MBBI) if (getRegReferences(MBBI, SrcReg)) @@ -196,16 +206,15 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare, // The transformation is OK. Rebuild Branch as a BRCT(G). 
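For context on the BRCT(G) rebuild below (a sketch; the assembly in the comments is illustrative): BRCT is SystemZ's branch-relative-on-count, which decrements a register and branches while the result is nonzero, so the add of -1, the compare with zero and the branch-on-nonzero fuse into a single instruction.

// Roughly:
//
//   before:  ahi  %r2, -1        after:  brct %r2, loop
//            chi  %r2, 0
//            jne  loop
//
// C++ model of the fused semantics:
static bool brct(int &R) {
  R -= 1;          // the addition of -1 that convertToBRCT absorbs
  return R != 0;   // the branch is taken while the new value is nonzero
}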
MachineOperand Target(Branch->getOperand(2)); - Branch->RemoveOperand(2); - Branch->RemoveOperand(1); - Branch->RemoveOperand(0); + while (Branch->getNumOperands()) + Branch->RemoveOperand(0); Branch->setDesc(TII->get(BRCT)); MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) .addOperand(Target) .addReg(SystemZ::CC, RegState::ImplicitDefine); - MI->removeFromParent(); + MI->eraseFromParent(); return true; } @@ -308,6 +317,10 @@ static bool isCompareZero(MachineInstr *Compare) { return true; default: + + if (isLoadAndTestAsCmp(Compare)) + return true; + return (Compare->getNumExplicitOperands() == 2 && Compare->getOperand(1).isImm() && Compare->getOperand(1).getImm() == 0); @@ -325,8 +338,7 @@ optimizeCompareZero(MachineInstr *Compare, return false; // Search back for CC results that are based on the first operand. - unsigned SrcReg = Compare->getOperand(0).getReg(); - unsigned SrcSubReg = Compare->getOperand(0).getSubReg(); + unsigned SrcReg = getCompareSourceReg(Compare); MachineBasicBlock &MBB = *Compare->getParent(); MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin(); Reference CCRefs; @@ -334,7 +346,7 @@ optimizeCompareZero(MachineInstr *Compare, while (MBBI != MBBE) { --MBBI; MachineInstr *MI = MBBI; - if (resultTests(MI, SrcReg, SrcSubReg)) { + if (resultTests(MI, SrcReg)) { // Try to remove both MI and Compare by converting a branch to BRCT(G). // We don't care in this case whether CC is modified between MI and // Compare. @@ -435,23 +447,21 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) { while (MBBI != MBB.begin()) { MachineInstr *MI = --MBBI; if (CompleteCCUsers && - MI->isCompare() && + (MI->isCompare() || isLoadAndTestAsCmp(MI)) && (optimizeCompareZero(MI, CCUsers) || fuseCompareAndBranch(MI, CCUsers))) { ++MBBI; - MI->removeFromParent(); + MI->eraseFromParent(); Changed = true; CCUsers.clear(); - CompleteCCUsers = true; continue; } - Reference CCRefs(getRegReferences(MI, SystemZ::CC)); - if (CCRefs.Def) { + if (MI->definesRegister(SystemZ::CC)) { CCUsers.clear(); - CompleteCCUsers = !CCRefs.IndirectDef; + CompleteCCUsers = true; } - if (CompleteCCUsers && CCRefs.Use) + if (MI->readsRegister(SystemZ::CC) && CompleteCCUsers) CCUsers.push_back(MI); } return Changed; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 397de47..e1b20d0 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -48,7 +48,8 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = { SystemZFrameLowering::SystemZFrameLowering() : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, - -SystemZMC::CallFrameSize, 8) { + -SystemZMC::CallFrameSize, 8, + false /* StackRealignable */) { // Create a mapping from register number to save slot offset. RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS); for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I) @@ -133,7 +134,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); bool IsVarArg = MF.getFunction()->isVarArg(); - DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc DL; // Scan the call-saved GPRs and find the bounds of the register spill area. 
unsigned LowGPR = 0; @@ -322,7 +323,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF, const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo(); bool HasFP = hasFP(MF); - DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; // The current offset of the stack pointer from the CFA. int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP; @@ -394,7 +398,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF, // Add CFI for the this save. unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx()); + unsigned IgnoredFrameReg; + int64_t Offset = + getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, DwarfReg, SPOffsetFromCFA + Offset)); CFIIndexes.push_back(CFIIndex); @@ -455,9 +462,14 @@ bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const { MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP()); } -int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFFrame = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); // Start with the offset of FI from the top of the caller-allocated frame // (i.e. the top of the 160 bytes allocated by the caller). This initial diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 5ade757..46bb6b7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -43,7 +43,8 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; bool hasFP(const MachineFunction &MF) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 75fd37f..a909309 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -585,7 +585,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr, static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) { if (N.getNode()->getNodeId() == -1 || N.getNode()->getNodeId() > Pos->getNodeId()) { - DAG->RepositionNode(Pos, N.getNode()); + DAG->RepositionNode(Pos->getIterator(), N.getNode()); N.getNode()->setNodeId(Pos->getNodeId()); } } @@ -801,7 +801,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { RxSBG.Input = N.getOperand(0); return true; } - + case ISD::ANY_EXTEND: // Bits above the extended operand are don't-care. 
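Context for the tryRISBGZero change just below (a sketch; risbgZero models only the zeroing form of RISBG): with a zero rotate, RISBG degenerates to a plain AND, and masks of 0xff or 0xffff are exactly the byte/halfword zero-extensions the LLC/LLH family already provides.

#include <cassert>
#include <cstdint>

// Zeroing form of RISBG: rotate left, then clear the bits outside Mask.
static uint64_t risbgZero(uint64_t In, unsigned Rotate, uint64_t Mask) {
  uint64_t Rot = Rotate ? (In << Rotate) | (In >> (64 - Rotate)) : In;
  return Rot & Mask;
}

int main() {
  uint64_t X = 0x1234567890abcdefULL;
  assert(risbgZero(X, 0, 0xff)   == (uint8_t)X);   // -> LLC/LLGCR
  assert(risbgZero(X, 0, 0xffff) == (uint16_t)X);  // -> LLH/LLGHR
  return 0;
}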
RxSBG.Input = N.getOperand(0); @@ -818,7 +818,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { return true; } // Fall through. - + case ISD::SIGN_EXTEND: { // Check that the extension bits are don't-care (i.e. are masked out // by the final mask). @@ -938,7 +938,23 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { } return nullptr; } - } + } + + // If the RISBG operands require no rotation and just masks the bottom + // 8/16 bits, attempt to convert this to a LLC zero extension. + if (RISBG.Rotate == 0 && (RISBG.Mask == 0xff || RISBG.Mask == 0xffff)) { + unsigned OpCode = (RISBG.Mask == 0xff ? SystemZ::LLGCR : SystemZ::LLGHR); + if (VT == MVT::i32) { + if (Subtarget->hasHighWord()) + OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCRMux : SystemZ::LLHRMux); + else + OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCR : SystemZ::LLHR); + } + + SDValue In = convertTo(DL, VT, RISBG.Input); + N = CurDAG->getMachineNode(OpCode, DL, VT, In); + return convertTo(DL, VT, SDValue(N, 0)).getNode(); + } unsigned Opcode = SystemZ::RISBG; // Prefer RISBGN if available, since it does not clobber CC. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 9a753c8..ee73267 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -84,8 +84,7 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, const SystemZSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { - auto &DL = *TM.getDataLayout(); - MVT PtrVT = getPointerTy(DL); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the register classes. if (Subtarget.hasHighWord()) @@ -115,8 +114,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, computeRegisterProperties(Subtarget.getRegisterInfo()); // Set up special registers. - setExceptionPointerRegister(SystemZ::R6D); - setExceptionSelectorRegister(SystemZ::R7D); setStackPointerRegisterToSaveRestore(SystemZ::R15D); // TODO: It may be better to default to latency-oriented scheduling, however @@ -370,7 +367,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // No special instructions for these. setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); } } @@ -776,9 +775,7 @@ bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType, } bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - return true; + return CI->isTailCall(); } // We do not yet support 128-bit single-element vector types. 
If the user @@ -939,8 +936,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, + false, false, 0); } // Convert the value of the argument register into the value that's @@ -976,9 +973,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, &SystemZ::FP64BitRegClass); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64); MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN, - MachinePointerInfo::getFixedStack(FI), + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0); - } // Join the stores, which are independent of one another. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, @@ -1060,9 +1056,9 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Store the argument in a stack slot and pass its address. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains.push_back(DAG.getStore( + Chain, DL, ArgValue, SpillSlot, + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0)); ArgValue = SpillSlot; } else ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue); @@ -1607,8 +1603,8 @@ static void adjustSubwordCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) { } else if (Load->getExtensionType() == ISD::ZEXTLOAD) { if (Value > Mask) return; - assert(C.ICmpType == SystemZICMP::Any && - "Signedness shouldn't matter here."); + // If the constant is in range, we can use any comparison. + C.ICmpType = SystemZICMP::Any; } else return; @@ -2439,7 +2435,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT); Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // If there was a non-zero offset that we didn't fold, create an explicit @@ -2499,7 +2496,9 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node, } SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(Node, DAG); SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2529,9 +2528,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); // Call __tls_get_offset to retrieve the offset. 
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset); @@ -2544,9 +2544,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); // Call __tls_get_offset to retrieve the module base offset. Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset); @@ -2562,9 +2563,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF); SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8); - DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - DTPOffset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + DTPOffset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), DTPOffset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset); break; @@ -2575,8 +2577,8 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_INDNTPOFF); Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getGOT(), + Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); break; } @@ -2587,9 +2589,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); break; } } @@ -2628,10 +2631,10 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SDValue Result; if (CP->isMachineConstantPoolEntry()) Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, - CP->getAlignment()); + CP->getAlignment()); else Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, - CP->getAlignment(), CP->getOffset()); + CP->getAlignment(), CP->getOffset()); // Use LARL to load the address of the constant pool entry. return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); @@ -2736,17 +2739,37 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, SDValue SystemZTargetLowering:: lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + bool RealignOpt = !DAG.getMachineFunction().getFunction()-> + hasFnAttribute("no-realign-stack"); + SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); + SDValue Align = Op.getOperand(2); SDLoc DL(Op); + // If user has set the no alignment function attribute, ignore + // alloca alignments. + uint64_t AlignVal = (RealignOpt ? 
+ dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0); + + uint64_t StackAlign = TFI->getStackAlignment(); + uint64_t RequiredAlign = std::max(AlignVal, StackAlign); + uint64_t ExtraAlignSpace = RequiredAlign - StackAlign; + unsigned SPReg = getStackPointerRegisterToSaveRestore(); + SDValue NeededSpace = Size; // Get a reference to the stack pointer. SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64); + // Add extra space for alignment if needed. + if (ExtraAlignSpace) + NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace, + DAG.getConstant(ExtraAlignSpace, DL, MVT::i64)); + // Get the new stack pointer value. - SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, Size); + SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace); // Copy the new stack pointer back. Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP); @@ -2757,6 +2780,16 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64); SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust); + // Dynamically realign if needed. + if (RequiredAlign > StackAlign) { + Result = + DAG.getNode(ISD::ADD, DL, MVT::i64, Result, + DAG.getConstant(ExtraAlignSpace, DL, MVT::i64)); + Result = + DAG.getNode(ISD::AND, DL, MVT::i64, Result, + DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64)); + } + SDValue Ops[2] = { Result, Chain }; return DAG.getMergeValues(Ops, DL); } @@ -2837,7 +2870,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op, } else if (DAG.ComputeNumSignBits(Op1) > 32) { Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1); Opcode = SystemZISD::SDIVREM32; - } else + } else Opcode = SystemZISD::SDIVREM64; // DSG(F) takes a 64-bit dividend, so the even register in the GR128 @@ -3247,8 +3280,8 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, if (Op->getNumValues() == 1) return CC; assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result"); - return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), - Glued, CC); + return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued, + CC); } unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -3890,7 +3923,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, GS.addUndef(); } else { GS.add(SDValue(), ResidueOps.size()); - ResidueOps.push_back(Op); + ResidueOps.push_back(BVN->getOperand(I)); } } @@ -3901,7 +3934,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, // Create the BUILD_VECTOR for the remaining elements, if any. 
if (!ResidueOps.empty()) { while (ResidueOps.size() < NumElements) - ResidueOps.push_back(DAG.getUNDEF(VT.getVectorElementType())); + ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType())); for (auto &Op : GS.Ops) { if (!Op.getNode()) { Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps); @@ -4204,7 +4237,7 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, - unsigned UnpackHigh) const { + unsigned UnpackHigh) const { SDValue PackedOp = Op.getOperand(0); EVT OutVT = Op.getValueType(); EVT InVT = PackedOp.getValueType(); @@ -4566,9 +4599,9 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, } return Op; } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || - Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || - Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && - canTreatAsByteVector(Op.getValueType()) && + Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || + Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && + canTreatAsByteVector(Op.getValueType()) && canTreatAsByteVector(Op.getOperand(0).getValueType())) { // Make sure that only the unextended bits are significant. EVT ExtVT = Op.getValueType(); @@ -4579,14 +4612,14 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, unsigned SubByte = Byte % ExtBytesPerElement; unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement; if (SubByte < MinSubByte || - SubByte + BytesPerElement > ExtBytesPerElement) - break; + SubByte + BytesPerElement > ExtBytesPerElement) + break; // Get the byte offset of the unextended element Byte = Byte / ExtBytesPerElement * OpBytesPerElement; // ...then add the byte offset relative to that element. Byte += SubByte - MinSubByte; if (Byte % BytesPerElement != 0) - break; + break; Op = Op.getOperand(0); Index = Byte / BytesPerElement; Force = true; @@ -5611,6 +5644,31 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI, return MBB; } +MachineBasicBlock * +SystemZTargetLowering::emitLoadAndTestCmp0(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned Opcode) const { + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + unsigned SrcReg = MI->getOperand(0).getReg(); + + // Create new virtual register of the same class as source. + const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); + unsigned DstReg = MRI->createVirtualRegister(RC); + + // Replace pseudo with a normal load-and-test that models the def as + // well. 
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg) + .addReg(SrcReg); + MI->eraseFromParent(); + + return MBB; +} + MachineBasicBlock *SystemZTargetLowering:: EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { switch (MI->getOpcode()) { @@ -5858,6 +5916,13 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true); case SystemZ::TBEGINC: return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true); + case SystemZ::LTEBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR); + case SystemZ::LTDBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR); + case SystemZ::LTXBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR); + default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 07ff251..391636e 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -409,6 +409,20 @@ public: return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return SystemZ::R6D; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return SystemZ::R7D; + } + MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const override; @@ -481,7 +495,7 @@ private: SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, - unsigned UnpackHigh) const; + unsigned UnpackHigh) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, @@ -530,6 +544,10 @@ private: MachineBasicBlock *MBB, unsigned Opcode, bool NoFloat) const; + MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned Opcode) const; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h index 464f79a..5a1c874 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h @@ -35,11 +35,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) { if (MCID.mayStore()) Flags |= MachineMemOperand::MOStore; int64_t Offset = 0; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo( - PseudoSourceValue::getFixedStack(FI), Offset), - Flags, MFFrame->getObjectSize(FI), - MFFrame->getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFFrame->getObjectSize(FI), MFFrame->getObjectAlignment(FI)); return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td index 27fbd7d..0cb2672 100644 --- 
a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td @@ -46,15 +46,28 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>; defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>; } -// Note that the comparison against zero operation is not available if we -// have vector support, since load-and-test instructions will partially -// clobber the target (vector) register. +// Note that LTxBRCompare is not available if we have vector support, +// since load-and-test instructions will partially clobber the target +// (vector) register. let Predicates = [FeatureNoVector] in { defm : CompareZeroFP<LTEBRCompare, FP32>; defm : CompareZeroFP<LTDBRCompare, FP64>; defm : CompareZeroFP<LTXBRCompare, FP128>; } +// Use a normal load-and-test for compare against zero in case of +// vector support (via a pseudo to simplify instruction selection). +let Defs = [CC], usesCustomInserter = 1 in { + def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>; + def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>; + def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>; +} +let Predicates = [FeatureVector] in { + defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>; + defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>; + defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>; +} + // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>; def LDGR : UnaryRRE<"ldg", 0xB3C1, bitconvert, FP64, GR64>; @@ -238,26 +251,46 @@ let Predicates = [FeatureFPExtension] in { // Unary arithmetic //===----------------------------------------------------------------------===// +// We prefer generic instructions during isel, because they do not +// clobber CC and therefore give the scheduler more freedom. In cases +// the CC is actually useful, the SystemZElimCompare pass will try to +// convert generic instructions into opcodes that also set CC. Note +// that lcdf / lpdf / lndf only affect the sign bit, and can therefore +// be used with fp32 as well. This could be done for fp128, in which +// case the operands would have to be tied. + // Negation (Load Complement). let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LCEBR : UnaryRRE<"lceb", 0xB303, fneg, FP32, FP32>; - def LCDBR : UnaryRRE<"lcdb", 0xB313, fneg, FP64, FP64>; + def LCEBR : UnaryRRE<"lceb", 0xB303, null_frag, FP32, FP32>; + def LCDBR : UnaryRRE<"lcdb", 0xB313, null_frag, FP64, FP64>; def LCXBR : UnaryRRE<"lcxb", 0xB343, fneg, FP128, FP128>; } +// Generic form, which does not set CC. +def LCDFR : UnaryRRE<"lcdf", 0xB373, fneg, FP64, FP64>; +let isCodeGenOnly = 1 in + def LCDFR_32 : UnaryRRE<"lcdf", 0xB373, fneg, FP32, FP32>; // Absolute value (Load Positive). let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LPEBR : UnaryRRE<"lpeb", 0xB300, fabs, FP32, FP32>; - def LPDBR : UnaryRRE<"lpdb", 0xB310, fabs, FP64, FP64>; + def LPEBR : UnaryRRE<"lpeb", 0xB300, null_frag, FP32, FP32>; + def LPDBR : UnaryRRE<"lpdb", 0xB310, null_frag, FP64, FP64>; def LPXBR : UnaryRRE<"lpxb", 0xB340, fabs, FP128, FP128>; } +// Generic form, which does not set CC. +def LPDFR : UnaryRRE<"lpdf", 0xB370, fabs, FP64, FP64>; +let isCodeGenOnly = 1 in + def LPDFR_32 : UnaryRRE<"lpdf", 0xB370, fabs, FP32, FP32>; // Negative absolute value (Load Negative). 
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LNEBR : UnaryRRE<"lneb", 0xB301, fnabs, FP32, FP32>; - def LNDBR : UnaryRRE<"lndb", 0xB311, fnabs, FP64, FP64>; + def LNEBR : UnaryRRE<"lneb", 0xB301, null_frag, FP32, FP32>; + def LNDBR : UnaryRRE<"lndb", 0xB311, null_frag, FP64, FP64>; def LNXBR : UnaryRRE<"lnxb", 0xB341, fnabs, FP128, FP128>; } +// Generic form, which does not set CC. +def LNDFR : UnaryRRE<"lndf", 0xB371, fnabs, FP64, FP64>; +let isCodeGenOnly = 1 in + def LNDFR_32 : UnaryRRE<"lndf", 0xB371, fnabs, FP32, FP32>; // Square root. def SQEBR : UnaryRRE<"sqeb", 0xB314, fsqrt, FP32, FP32>; @@ -414,6 +447,6 @@ let Defs = [CC], CCValues = 0xF in { // Peepholes //===----------------------------------------------------------------------===// -def : Pat<(f32 fpimmneg0), (LCEBR (LZER))>; -def : Pat<(f64 fpimmneg0), (LCDBR (LZDR))>; +def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>; +def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>; def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 71eb998..01f4cde 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2381,6 +2381,7 @@ multiclass StringRRE<string mnemonic, bits<16> opcode, def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2), (ins GR64:$R1src, GR64:$R2src), mnemonic#"\t$R1, $R2", []> { + let Uses = [R0L]; let Constraints = "$R1 = $R1src, $R2 = $R2src"; let DisableEncoding = "$R1src, $R2src"; } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 5d4a34f..e6b5fc8 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -69,6 +69,11 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, MachineOperand &LowOffsetOp = MI->getOperand(2); LowOffsetOp.setImm(LowOffsetOp.getImm() + 8); + // Clear the kill flags for the base and index registers in the first + // instruction. + EarlierMI->getOperand(1).setIsKill(false); + EarlierMI->getOperand(3).setIsKill(false); + // Set the opcodes. unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm()); unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm()); @@ -111,7 +116,7 @@ void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode, } // MI is a three-operand RIE-style pseudo instruction. Replace it with -// LowOpcode3 if the registers are both low GR32s, otherwise use a move +// LowOpcodeK if the registers are both low GR32s, otherwise use a move // followed by HighOpcode or LowOpcode, depending on whether the target // is a high or low GR32. void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode, @@ -129,6 +134,7 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode, MI->getOperand(1).isKill()); MI->setDesc(get(DestIsHigh ? 
HighOpcode : LowOpcode)); MI->getOperand(1).setReg(DestReg); + MI->tieOperands(0, 1); } } @@ -486,11 +492,8 @@ SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare, const MachineRegisterInfo *MRI) const { assert(!SrcReg2 && "Only optimizing constant comparisons so far"); bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0; - if (Value == 0 && - !IsLogical && - removeIPMBasedCompare(Compare, SrcReg, MRI, &RI)) - return true; - return false; + return Value == 0 && !IsLogical && + removeIPMBasedCompare(Compare, SrcReg, MRI, &RI); } // If Opcode is a move that has a conditional variant, return that variant, @@ -505,16 +508,13 @@ static unsigned getConditionalMove(unsigned Opcode) { bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const { unsigned Opcode = MI->getOpcode(); - if (STI.hasLoadStoreOnCond() && - getConditionalMove(Opcode)) - return true; - return false; + return STI.hasLoadStoreOnCond() && getConditionalMove(Opcode); } bool SystemZInstrInfo:: isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { // For now only convert single instructions. return NumCycles == 1; } @@ -524,7 +524,7 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumCyclesT, unsigned ExtraPredCyclesT, MachineBasicBlock &FMBB, unsigned NumCyclesF, unsigned ExtraPredCyclesF, - const BranchProbability &Probability) const { + BranchProbability Probability) const { // For now avoid converting mutually-exclusive cases. return false; } @@ -548,11 +548,10 @@ PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const { return false; } -void -SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { +void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { // Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too. if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) { copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64), @@ -590,13 +589,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); } -void -SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, - int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void SystemZInstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { DebugLoc DL = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -604,15 +600,14 @@ SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned LoadOpcode, StoreOpcode; getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode); addFrameReference(BuildMI(MBB, MBBI, DL, get(StoreOpcode)) - .addReg(SrcReg, getKillRegState(isKill)), FrameIdx); + .addReg(SrcReg, getKillRegState(isKill)), + FrameIdx); } -void -SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void SystemZInstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIdx, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -681,7 +676,8 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, LiveVariables *LV) const { MachineInstr *MI = MBBI; MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned Opcode = MI->getOpcode(); unsigned NumOps = MI->getNumOperands(); @@ -708,14 +704,19 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode); if (ThreeOperandOpcode >= 0) { - MachineInstrBuilder MIB = - BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode)) - .addOperand(Dest); + // Create three address instruction without adding the implicit + // operands. Those will instead be copied over from the original + // instruction by the loop below. + MachineInstrBuilder MIB(*MF, + MF->CreateMachineInstr(get(ThreeOperandOpcode), + MI->getDebugLoc(), /*NoImplicit=*/true)); + MIB.addOperand(Dest); // Keep the kill state, but drop the tied flag. MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()); // Keep the remaining operands as-is. for (unsigned I = 2; I < NumOps; ++I) MIB.addOperand(MI->getOperand(I)); + MBB->insert(MI, MIB); return finishConvertToThreeAddress(MI, MIB, LV); } } @@ -1191,6 +1192,12 @@ unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const { case SystemZ::LER: return SystemZ::LTEBR; case SystemZ::LDR: return SystemZ::LTDBR; case SystemZ::LXR: return SystemZ::LTXBR; + case SystemZ::LCDFR: return SystemZ::LCDBR; + case SystemZ::LPDFR: return SystemZ::LPDBR; + case SystemZ::LNDFR: return SystemZ::LNDBR; + case SystemZ::LCDFR_32: return SystemZ::LCEBR; + case SystemZ::LPDFR_32: return SystemZ::LPEBR; + case SystemZ::LNDFR_32: return SystemZ::LNEBR; // On zEC12 we prefer to use RISBGN. But if there is a chance to // actually use the condition code, we may turn it back into RISGB. 
// Note that RISBG is not really a "load-and-test" instruction, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 31c9db2..d9094ba 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -159,12 +159,12 @@ public: bool isPredicable(MachineInstr *MI) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumCyclesT, unsigned ExtraPredCyclesT, MachineBasicBlock &FMBB, unsigned NumCyclesF, unsigned ExtraPredCyclesF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 820f30b..d5dabc2 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -397,7 +397,7 @@ let mayLoad = 1, mayStore = 1 in defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; // String moves. -let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, mayStore = 1, Defs = [CC] in defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>; //===----------------------------------------------------------------------===// @@ -424,7 +424,7 @@ let hasSideEffects = 0 in { def LGFR : UnaryRRE<"lgf", 0xB914, sext32, GR64, GR32>; } let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in - def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR64>; + def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR32>; // Match 32-to-64-bit sign extensions in which the source is already // in a 64-bit register. @@ -490,7 +490,7 @@ def : Pat<(and GR64:$src, 0xffffffff), def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>, Requires<[FeatureHighWord]>; def LLC : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>; -def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>, +def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>, Requires<[FeatureHighWord]>; // 32-bit extensions from 16-bit memory. LLHMux expands to LLH or LLHH, @@ -498,7 +498,7 @@ def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>, def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>, Requires<[FeatureHighWord]>; def LLH : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>; -def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>, +def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>, Requires<[FeatureHighWord]>; def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>; @@ -1147,7 +1147,7 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>, Requires<[FeatureHighWord]>; def CLFI : CompareRIL<"clfi", 0xC2F, z_ucmp, GR32, uimm32>; - def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GR32, uimm32>, + def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GRH32, uimm32>, Requires<[FeatureHighWord]>; def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>; @@ -1185,7 +1185,7 @@ let mayLoad = 1, Defs = [CC] in defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>; // String comparison. 
-let mayLoad = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, Defs = [CC] in defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>; // Test under mask. @@ -1219,6 +1219,9 @@ def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>; // Atomic operations //===----------------------------------------------------------------------===// +// A serialization instruction that acts as a barrier for all memory +// accesses, which expands to "bcr 14, 0". +let hasSideEffects = 1 in def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { @@ -1459,9 +1462,29 @@ let usesCustomInserter = 1 in { } // Search a block of memory for a character. -let mayLoad = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, Defs = [CC] in defm SRST : StringRRE<"srst", 0xb25e, z_search_string>; +// Other instructions for inline assembly +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCK : InstS<0xB205, (outs), (ins bdaddr12only:$BD2), + "stck\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCKF : InstS<0xB27C, (outs), (ins bdaddr12only:$BD2), + "stckf\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCKE : InstS<0xB278, (outs), (ins bdaddr12only:$BD2), + "stcke\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STFLE : InstS<0xB2B0, (outs), (ins bdaddr12only:$BD2), + "stfle\t$BD2", + []>; + + + //===----------------------------------------------------------------------===// // Peepholes. //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp index 00572d0..1a7c0d7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//== SystemZMachineFuctionInfo.cpp - SystemZ machine function info-*- C++ -*-=// +//=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h index 34fc36d..f4a517b 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//==- SystemZMachineFuctionInfo.h - SystemZ machine function info -*- C++ -*-=// +//=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp index dc7bd25..6fd24e3 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -69,8 +69,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // Decompose the frame index into a base and offset. int FrameIndex = MI->getOperand(FIOperandNum).getIndex(); - unsigned BasePtr = getFrameRegister(MF); - int64_t Offset = (TFI->getFrameIndexOffset(MF, FrameIndex) + + unsigned BasePtr; + int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) + MI->getOperand(FIOperandNum + 1).getImm()); // Special handling of dbg_value instructions. 
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td index 85aa0a6..0d8b08b 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -282,4 +282,5 @@ def v128any : TypedReg<untyped, VR128>; // The 2-bit condition code field of the PSW. Every register named in an // inline asm needs a class associated with it. def CC : SystemZReg<"cc">; -def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>; +let isAllocatable = 0 in + def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp index d1a17c5..846edd5 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -16,6 +16,8 @@ #include "SystemZTargetMachine.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; @@ -35,19 +37,16 @@ public: bool runOnMachineFunction(MachineFunction &F) override; private: - bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther, - unsigned LLIxL, unsigned LLIxH); + bool shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH); bool shortenOn0(MachineInstr &MI, unsigned Opcode); bool shortenOn01(MachineInstr &MI, unsigned Opcode); bool shortenOn001(MachineInstr &MI, unsigned Opcode); + bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode); bool shortenFPConv(MachineInstr &MI, unsigned Opcode); const SystemZInstrInfo *TII; - - // LowGPRs[I] has bit N set if LLVM register I includes the low - // word of GPR N. HighGPRs is the same for the high word. - unsigned LowGPRs[SystemZ::NUM_TARGET_REGS]; - unsigned HighGPRs[SystemZ::NUM_TARGET_REGS]; + const TargetRegisterInfo *TRI; + LivePhysRegs LiveRegs; }; char SystemZShortenInst::ID = 0; @@ -58,33 +57,31 @@ FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) { } SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), LowGPRs(), HighGPRs() { - // Set up LowGPRs and HighGPRs. - for (unsigned I = 0; I < 16; ++I) { - LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I; - LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I; - HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I; - HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I; - if (unsigned GR128 = SystemZMC::GR128Regs[I]) { - LowGPRs[GR128] |= 3 << I; - HighGPRs[GR128] |= 3 << I; - } - } + : MachineFunctionPass(ID), TII(nullptr) {} + +// Tie operands if MI has become a two-address instruction. +static void tieOpsIfNeeded(MachineInstr &MI) { + if (MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) && + !MI.getOperand(0).isTied()) + MI.tieOperands(0, 1); } // MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH // are the halfword immediate loads for the same word. Try to use one of them -// instead of IIxF. If MI loads the high word, GPRMap[X] is the set of high -// words referenced by LLVM register X while LiveOther is the mask of low -// words that are currently live, and vice versa. -bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap, - unsigned LiveOther, unsigned LLIxL, - unsigned LLIxH) { +// instead of IIxF. 
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, + unsigned LLIxL, unsigned LLIxH) { unsigned Reg = MI.getOperand(0).getReg(); - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - unsigned GPRs = GPRMap[Reg]; - assert(GPRs != 0 && "Register must be a GPR"); - if (GPRs & LiveOther) + // The new opcode will clear the other half of the GR64 reg, so + // cancel if that is live. + unsigned thisSubRegIdx = (SystemZ::GRH32BitRegClass.contains(Reg) ? + SystemZ::subreg_h32 : SystemZ::subreg_l32); + unsigned otherSubRegIdx = (thisSubRegIdx == SystemZ::subreg_l32 ? + SystemZ::subreg_h32 : SystemZ::subreg_l32); + unsigned GR64BitReg = TRI->getMatchingSuperReg(Reg, thisSubRegIdx, + &SystemZ::GR64BitRegClass); + unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx); + if (LiveRegs.contains(OtherReg)) return false; uint64_t Imm = MI.getOperand(1).getImm(); @@ -123,12 +120,26 @@ bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) { } // Change MI's opcode to Opcode if register operands 0, 1 and 2 have a -// 4-bit encoding and if operands 0 and 1 are tied. +// 4-bit encoding and if operands 0 and 1 are tied. Also ties op 0 +// with op 1, if MI becomes 2-address. bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) { if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 && MI.getOperand(1).getReg() == MI.getOperand(0).getReg() && SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) { MI.setDesc(TII->get(Opcode)); + tieOpsIfNeeded(MI); + return true; + } + return false; +} + +// Calls shortenOn001 if CCLive is false. CC def operand is added in +// case of success. +bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI, + unsigned Opcode) { + if (!LiveRegs.contains(SystemZ::CC) && shortenOn001(MI, Opcode)) { + MachineInstrBuilder(*MI.getParent()->getParent(), &MI) + .addReg(SystemZ::CC, RegState::ImplicitDefine); return true; } return false; @@ -164,35 +175,24 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) { bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { bool Changed = false; - // Work out which words are live on exit from the block. - unsigned LiveLow = 0; - unsigned LiveHigh = 0; - for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) { - for (auto LI = (*SI)->livein_begin(), LE = (*SI)->livein_end(); - LI != LE; ++LI) { - unsigned Reg = *LI; - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - LiveLow |= LowGPRs[Reg]; - LiveHigh |= HighGPRs[Reg]; - } - } + // Set up the set of live registers at the end of MBB (live out) + LiveRegs.clear(); + LiveRegs.addLiveOuts(&MBB); // Iterate backwards through the block looking for instructions to change. 
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) { MachineInstr &MI = *MBBI; switch (MI.getOpcode()) { case SystemZ::IILF: - Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL, - SystemZ::LLILH); + Changed |= shortenIIF(MI, SystemZ::LLILL, SystemZ::LLILH); break; case SystemZ::IIHF: - Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL, - SystemZ::LLIHH); + Changed |= shortenIIF(MI, SystemZ::LLIHL, SystemZ::LLIHH); break; case SystemZ::WFADB: - Changed |= shortenOn001(MI, SystemZ::ADBR); + Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; case SystemZ::WFDDB: @@ -216,15 +216,15 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; case SystemZ::WFLCDB: - Changed |= shortenOn01(MI, SystemZ::LCDBR); + Changed |= shortenOn01(MI, SystemZ::LCDFR); break; case SystemZ::WFLNDB: - Changed |= shortenOn01(MI, SystemZ::LNDBR); + Changed |= shortenOn01(MI, SystemZ::LNDFR); break; case SystemZ::WFLPDB: - Changed |= shortenOn01(MI, SystemZ::LPDBR); + Changed |= shortenOn01(MI, SystemZ::LPDFR); break; case SystemZ::WFSQDB: @@ -232,7 +232,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; case SystemZ::WFSDB: - Changed |= shortenOn001(MI, SystemZ::SDBR); + Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; case SystemZ::WFCDB: @@ -257,33 +257,17 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; } - unsigned UsedLow = 0; - unsigned UsedHigh = 0; - for (auto MOI = MI.operands_begin(), MOE = MI.operands_end(); - MOI != MOE; ++MOI) { - MachineOperand &MO = *MOI; - if (MO.isReg()) { - if (unsigned Reg = MO.getReg()) { - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - if (MO.isDef()) { - LiveLow &= ~LowGPRs[Reg]; - LiveHigh &= ~HighGPRs[Reg]; - } else if (!MO.isUndef()) { - UsedLow |= LowGPRs[Reg]; - UsedHigh |= HighGPRs[Reg]; - } - } - } - } - LiveLow |= UsedLow; - LiveHigh |= UsedHigh; + LiveRegs.stepBackward(MI); } return Changed; } bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) { - TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo()); + const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + LiveRegs.init(TRI); bool Changed = false; for (auto &MBB : F) diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 00cbbd1..f305e85 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -16,6 +16,7 @@ using namespace llvm; +extern cl::opt<bool> MISchedPostRA; extern "C" void LLVMInitializeSystemZTarget() { // Register the target. RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget); @@ -32,7 +33,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) { VectorABI = false; SmallVector<StringRef, 3> Features; - FS.split(Features, ",", -1, false /* KeepEmpty */); + FS.split(Features, ',', -1, false /* KeepEmpty */); for (auto &Feature : Features) { if (Feature == "vector" || Feature == "+vector") VectorABI = true; @@ -130,6 +131,13 @@ void SystemZPassConfig::addPreSched2() { } void SystemZPassConfig::addPreEmitPass() { + + // Do instruction shortening before compare elimination because some + // vector instructions will be shortened into opcodes that compare + // elimination recognizes. 
+ if (getOptLevel() != CodeGenOpt::None) + addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false); + // We eliminate comparisons here rather than earlier because some // transformations can change the set of available CC values and we // generally want those transformations to have priority. This is @@ -155,9 +163,17 @@ void SystemZPassConfig::addPreEmitPass() { // preventing that would be a win or not. if (getOptLevel() != CodeGenOpt::None) addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false); - if (getOptLevel() != CodeGenOpt::None) - addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false); addPass(createSystemZLongBranchPass(getSystemZTargetMachine())); + + // Do final scheduling after all other optimizations, to get an + // optimal input for the decoder (branch relaxation must happen + // after block placement). + if (getOptLevel() != CodeGenOpt::None) { + if (MISchedPostRA) + addPass(&PostMachineSchedulerID); + else + addPass(&PostRASchedulerID); + } } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { @@ -165,7 +181,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(SystemZTTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h index 0a81e1f..1a8f1f7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h @@ -43,6 +43,9 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool targetSchedulesPostRAScheduling() const override { return true; }; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 5a87df1..5ff5b21 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; // //===----------------------------------------------------------------------===// -unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -63,8 +63,8 @@ unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -181,8 +181,8 @@ unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, return SystemZTTIImpl::getIntImmCost(Imm, Ty); } -unsigned SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 4b80973..9ae736d 100644 --- 
a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -28,7 +28,7 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> { const SystemZTargetLowering *getTLI() const { return TLI; } public: - explicit SystemZTTIImpl(const SystemZTargetMachine *TM, Function &F) + explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -42,12 +42,11 @@ public: /// \name Scalar TTI Implementations /// @{ - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp index 19b5e2a..a0b0d8f 100644 --- a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -43,7 +43,6 @@ using namespace llvm; void TargetLoweringObjectFile::Initialize(MCContext &ctx, const TargetMachine &TM) { Ctx = &ctx; - DL = TM.getDataLayout(); InitMCObjectFileInfo(TM.getTargetTriple(), TM.getRelocationModel(), TM.getCodeModel(), *Ctx); } @@ -107,7 +106,7 @@ MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase( assert(!Suffix.empty()); SmallString<60> NameStr; - NameStr += DL->getPrivateGlobalPrefix(); + NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix(); TM.getNameWithPrefix(NameStr, GV, Mang); NameStr.append(Suffix.begin(), Suffix.end()); return Ctx->getOrCreateSymbol(NameStr); @@ -120,7 +119,7 @@ MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol( } void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer, - const TargetMachine &TM, + const DataLayout &, const MCSymbol *Sym) const { } @@ -170,14 +169,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // If the initializer for the global contains something that requires a // relocation, then we may have to drop this into a writable data section // even though it is marked const. - switch (C->getRelocationInfo()) { - case Constant::NoRelocation: + if (!C->needsRelocation()) { // If the global is required to have a unique address, it can't be put // into a mergable section: just drop it into the general read-only // section instead. if (!GVar->hasUnnamedAddr()) return SectionKind::getReadOnly(); - + // If initializer is a null-terminated string, put it in a "cstring" // section of the right width. if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) { @@ -200,7 +198,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // Otherwise, just drop it into a mergable constant section. If we have // a section for this size, use it, otherwise use the arbitrary sized // mergable section. 
- switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) { + switch (GV->getParent()->getDataLayout().getTypeAllocSize(C->getType())) { case 4: return SectionKind::getMergeableConst4(); case 8: return SectionKind::getMergeableConst8(); case 16: return SectionKind::getMergeableConst16(); @@ -208,20 +206,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, return SectionKind::getReadOnly(); } - case Constant::LocalRelocation: - // In static relocation model, the linker will resolve all addresses, so - // the relocation entries will actually be constants by the time the app - // starts up. However, we can't put this into a mergable section, because - // the linker doesn't take relocations into consideration when it tries to - // merge entries in the section. - if (ReloModel == Reloc::Static) - return SectionKind::getReadOnly(); - - // Otherwise, the dynamic linker needs to fix it up, put it in the - // writable data.rel.local section. - return SectionKind::getReadOnlyWithRelLocal(); - - case Constant::GlobalRelocations: + } else { // In static relocation model, the linker will resolve all addresses, so // the relocation entries will actually be constants by the time the app // starts up. However, we can't put this into a mergable section, because @@ -242,17 +227,11 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // globals together onto fewer pages, improving the locality of the dynamic // linker. if (ReloModel == Reloc::Static) - return SectionKind::getDataNoRel(); - - switch (C->getRelocationInfo()) { - case Constant::NoRelocation: - return SectionKind::getDataNoRel(); - case Constant::LocalRelocation: - return SectionKind::getDataRelLocal(); - case Constant::GlobalRelocations: - return SectionKind::getDataRel(); - } - llvm_unreachable("Invalid relocation"); + return SectionKind::getData(); + + if (C->needsRelocation()) + return SectionKind::getData(); + return SectionKind::getData(); } /// This method computes the appropriate section to emit the specified global @@ -273,7 +252,8 @@ TargetLoweringObjectFile::SectionForGlobal(const GlobalValue *GV, MCSection *TargetLoweringObjectFile::getSectionForJumpTable( const Function &F, Mangler &Mang, const TargetMachine &TM) const { - return getSectionForConstant(SectionKind::getReadOnly(), /*C=*/nullptr); + return getSectionForConstant(F.getParent()->getDataLayout(), + SectionKind::getReadOnly(), /*C=*/nullptr); } bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection( @@ -296,9 +276,8 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection( /// Given a mergable constant with the specified size and relocation /// information, return a section that it should be placed in. 
-MCSection * -TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *TargetLoweringObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isReadOnly() && ReadOnlySection != nullptr) return ReadOnlySection; @@ -345,7 +324,7 @@ const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol } void TargetLoweringObjectFile::getNameWithPrefix( - SmallVectorImpl<char> &OutName, const GlobalValue *GV, - bool CannotUsePrivateLabel, Mangler &Mang, const TargetMachine &TM) const { - Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel); + SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM) const { + Mang.getNameWithPrefix(OutName, GV, /*CannotUsePrivateLabel=*/false); } diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp index 83174c2..850c93c 100644 --- a/contrib/llvm/lib/Target/TargetMachine.cpp +++ b/contrib/llvm/lib/Target/TargetMachine.cpp @@ -150,24 +150,11 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const { } TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(F.getParent()->getDataLayout()); }); } -static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo, - const MCSection &Section) { - if (!AsmInfo.isSectionAtomizableBySymbols(Section)) - return true; - - // If it is not dead stripped, it is safe to use private labels. - const MCSectionMachO &SMO = cast<MCSectionMachO>(Section); - if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP)) - return true; - - return false; -} - void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, const GlobalValue *GV, Mangler &Mang, bool MayAlwaysUsePrivate) const { @@ -177,11 +164,8 @@ void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, Mang.getNameWithPrefix(Name, GV, false); return; } - SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this); const TargetLoweringObjectFile *TLOF = getObjFileLowering(); - const MCSection *TheSection = TLOF->SectionForGlobal(GV, GVKind, Mang, *this); - bool CannotUsePrivateLabel = !canUsePrivateLabel(*AsmInfo, *TheSection); - TLOF->getNameWithPrefix(Name, GV, CannotUsePrivateLabel, Mang, *this); + TLOF->getNameWithPrefix(Name, GV, Mang, *this); } MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const { diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp index 7199235..f82566c 100644 --- a/contrib/llvm/lib/Target/TargetMachineC.cpp +++ b/contrib/llvm/lib/Target/TargetMachineC.cpp @@ -32,17 +32,25 @@ using namespace llvm; -inline TargetMachine *unwrap(LLVMTargetMachineRef P) { - return reinterpret_cast<TargetMachine*>(P); +namespace llvm { +// Friend to the TargetMachine, access legacy API that are made private in C++ +struct C_API_PRIVATE_ACCESS { + static const DataLayout &getDataLayout(const TargetMachine &T) { + return T.getDataLayout(); + } +}; +} + +static TargetMachine *unwrap(LLVMTargetMachineRef P) { + return reinterpret_cast<TargetMachine *>(P); } -inline Target *unwrap(LLVMTargetRef P) { +static Target *unwrap(LLVMTargetRef P) { return reinterpret_cast<Target*>(P); } -inline LLVMTargetMachineRef wrap(const TargetMachine *P) { - return - reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine*>(P)); +static LLVMTargetMachineRef 
wrap(const TargetMachine *P) { + return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine *>(P)); } -inline LLVMTargetRef wrap(const Target * P) { +static LLVMTargetRef wrap(const Target * P) { return reinterpret_cast<LLVMTargetRef>(const_cast<Target*>(P)); } @@ -69,16 +77,16 @@ LLVMTargetRef LLVMGetTargetFromName(const char *Name) { LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T, char **ErrorMessage) { std::string Error; - + *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error)); - + if (!*T) { if (ErrorMessage) *ErrorMessage = strdup(Error.c_str()); return 1; } - + return 0; } @@ -145,10 +153,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, CM, OL)); } - -void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { - delete unwrap(T); -} +void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); } LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) { const Target* target = &(unwrap(T)->getTarget()); @@ -170,8 +175,9 @@ char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) { return strdup(StringRep.c_str()); } +/** Deprecated: use LLVMGetDataLayout(LLVMModuleRef M) instead. */ LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) { - return wrap(unwrap(T)->getDataLayout()); + return wrap(&C_API_PRIVATE_ACCESS::getDataLayout(*unwrap(T))); } void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T, @@ -190,14 +196,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M, std::string error; - const DataLayout *td = TM->getDataLayout(); - - if (!td) { - error = "No DataLayout in TargetMachine"; - *ErrorMessage = strdup(error.c_str()); - return true; - } - Mod->setDataLayout(*td); + Mod->setDataLayout(TM->createDataLayout()); TargetMachine::CodeGenFileType ft; switch (codegen) { @@ -239,7 +238,6 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T, SmallString<0> CodeString; raw_svector_ostream OStream(CodeString); bool Result = LLVMTargetMachineEmit(T, M, OStream, codegen, ErrorMessage); - OStream.flush(); StringRef Data = OStream.str(); *OutMemBuf = diff --git a/contrib/llvm/lib/Target/TargetRecip.cpp b/contrib/llvm/lib/Target/TargetRecip.cpp index 42bc487..d41b643 100644 --- a/contrib/llvm/lib/Target/TargetRecip.cpp +++ b/contrib/llvm/lib/Target/TargetRecip.cpp @@ -26,7 +26,7 @@ using namespace llvm; // the key strings for queries and command-line inputs. // In addition, the command-line interface recognizes the global parameters // "all", "none", and "default". 
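// As an illustration of the accepted syntax (the exact option spelling
// depends on the front end driving this code): a comma-separated list such
// as
//
//   divf:2,vec-divd
//
// enables the scalar float division estimate with two refinement steps and
// the vector double division estimate with the target's default step count;
// parseRefinementStep() below is what splits the optional ":N" suffix off
// each key.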
-static const char *RecipOps[] = { +static const char *const RecipOps[] = { "divd", "divf", "vec-divd", @@ -46,7 +46,7 @@ TargetRecip::TargetRecip() { RecipMap.insert(std::make_pair(RecipOps[i], RecipParams())); } -static bool parseRefinementStep(const StringRef &In, size_t &Position, +static bool parseRefinementStep(StringRef In, size_t &Position, uint8_t &Value) { const char RefStepToken = ':'; Position = In.find(RefStepToken); @@ -175,7 +175,7 @@ TargetRecip::TargetRecip(const std::vector<std::string> &Args) : parseIndividualParams(Args); } -bool TargetRecip::isEnabled(const StringRef &Key) const { +bool TargetRecip::isEnabled(StringRef Key) const { ConstRecipIter Iter = RecipMap.find(Key); assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); assert(Iter->second.Enabled != Uninitialized && @@ -183,7 +183,7 @@ bool TargetRecip::isEnabled(const StringRef &Key) const { return Iter->second.Enabled; } -unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { +unsigned TargetRecip::getRefinementSteps(StringRef Key) const { ConstRecipIter Iter = RecipMap.find(Key); assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); assert(Iter->second.RefinementSteps != Uninitialized && @@ -192,7 +192,7 @@ unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { } /// Custom settings (previously initialized values) override target defaults. -void TargetRecip::setDefaults(const StringRef &Key, bool Enable, +void TargetRecip::setDefaults(StringRef Key, bool Enable, unsigned RefSteps) { if (Key == "all") { for (auto &KV : RecipMap) { @@ -213,7 +213,7 @@ void TargetRecip::setDefaults(const StringRef &Key, bool Enable, bool TargetRecip::operator==(const TargetRecip &Other) const { for (const auto &KV : RecipMap) { - const StringRef &Op = KV.first; + StringRef Op = KV.first; const RecipParams &RP = KV.second; const RecipParams &OtherRP = Other.RecipMap.find(Op)->second; if (RP.RefinementSteps != OtherRP.RefinementSteps) diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt new file mode 100644 index 0000000..5e55e29 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMWebAssemblyDisassembler + WebAssemblyDisassembler.cpp + ) diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt new file mode 100644 index 0000000..a452ca1 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===-- ./lib/Target/WebAssembly/Disassembler/LLVMBuild.txt -----*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = WebAssemblyDisassembler +parent = WebAssembly +required_libraries = MCDisassembler WebAssemblyInfo Support +add_to_library_groups = WebAssembly diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile new file mode 100644 index 0000000..bcd36ba --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===-- lib/Target/WebAssembly/Disassembler/Makefile -------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMWebAssemblyDisassembler + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp new file mode 100644 index 0000000..0143b10 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -0,0 +1,148 @@ +//==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file is part of the WebAssembly Disassembler. +/// +/// It contains code to translate the data produced by the decoder into +/// MCInsts. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-disassembler" + +namespace { +class WebAssemblyDisassembler final : public MCDisassembler { + std::unique_ptr<const MCInstrInfo> MCII; + + DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, + raw_ostream &CStream) const override; + +public: + WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MCII) + : MCDisassembler(STI, Ctx), MCII(std::move(MCII)) {} +}; +} // end anonymous namespace + +static MCDisassembler *createWebAssemblyDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + std::unique_ptr<const MCInstrInfo> MCII(T.createMCInstrInfo()); + return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII)); +} + +extern "C" void LLVMInitializeWebAssemblyDisassembler() { + // Register the disassembler for each target. 
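// Both the wasm32 and wasm64 targets share one factory function, since the
// prototype encoding handled below does not vary with the pointer width.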
+ TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget32, + createWebAssemblyDisassembler); + TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget64, + createWebAssemblyDisassembler); +} + +MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( + MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/, + raw_ostream &OS, raw_ostream &CS) const { + Size = 0; + uint64_t Pos = 0; + + // Read the opcode. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + + if (Opcode >= WebAssembly::INSTRUCTION_LIST_END) + return MCDisassembler::Fail; + + MI.setOpcode(Opcode); + const MCInstrDesc &Desc = MCII->get(Opcode); + unsigned NumFixedOperands = Desc.NumOperands; + + // If it's variadic, read the number of extra operands. + unsigned NumExtraOperands = 0; + if (Desc.isVariadic()) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + NumExtraOperands = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + } + + // Read the fixed operands. These are described by the MCInstrDesc. + for (unsigned i = 0; i < NumFixedOperands; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + switch (Info.OperandType) { + case MCOI::OPERAND_IMMEDIATE: + case WebAssembly::OPERAND_BASIC_BLOCK: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createImm(Imm)); + break; + } + case MCOI::OPERAND_REGISTER: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createReg(Reg)); + break; + } + case WebAssembly::OPERAND_FPIMM: { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Bits = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + double Imm; + memcpy(&Imm, &Bits, sizeof(Imm)); + MI.addOperand(MCOperand::createFPImm(Imm)); + break; + } + default: + llvm_unreachable("unimplemented operand kind"); + } + } + + // Read the extra operands. + assert(NumExtraOperands == 0 || Desc.isVariadic()); + for (unsigned i = 0; i < NumExtraOperands; ++i) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) { + // Decode extra immediate operands. + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createImm(Imm)); + } else { + // Decode extra register operands. 
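// As with every other field in this prototype encoding, each extra
// register number occupies a full little-endian uint64 word.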
+ uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createReg(Reg)); + } + Pos += sizeof(uint64_t); + } + + Size = Pos; + return MCDisassembler::Success; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index fbb985a..9a95150 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -13,7 +13,11 @@ //===----------------------------------------------------------------------===// #include "InstPrinter/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -21,23 +25,164 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include <cctype> +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" +#include "WebAssemblyGenAsmWriter.inc" + WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {} void WebAssemblyInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - llvm_unreachable("TODO: implement printRegName"); + assert(RegNo != WebAssemblyFunctionInfo::UnusedReg); + // Note that there's an implicit get_local/set_local here! + OS << "$" << RegNo; } void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) { - llvm_unreachable("TODO: implement printInst"); + const MCSubtargetInfo & /*STI*/) { + // Print the instruction (this uses the AsmStrings from the .td files). + printInstruction(MI, OS); + + // Print any additional variadic operands. + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (Desc.isVariadic()) + for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) { + if (i != 0) + OS << ", "; + printOperand(MI, i, OS); + } + + // Print any added annotation. + printAnnotation(OS, Annot); + + if (CommentStream) { + // Observe any effects on the control flow stack, for use in annotating + // control flow label references. + switch (MI->getOpcode()) { + default: + break; + case WebAssembly::LOOP: { + // Grab the TopLabel value first so that labels print in numeric order. + uint64_t TopLabel = ControlFlowCounter++; + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + printAnnotation(OS, "label" + utostr(TopLabel) + ':'); + ControlFlowStack.push_back(std::make_pair(TopLabel, true)); + break; + } + case WebAssembly::BLOCK: + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + break; + case WebAssembly::END_LOOP: + ControlFlowStack.pop_back(); + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + case WebAssembly::END_BLOCK: + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + } + + // Annotate any control flow label references. 
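// Branch operands name their targets by relative depth: 0 is the innermost
// enclosing construct, which is why rbegin()[Depth] below indexes
// ControlFlowStack from the top.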
+ unsigned NumFixedOperands = Desc.NumOperands; + SmallSet<uint64_t, 8> Printed; + for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + if (!(i < NumFixedOperands + ? (Info.OperandType == WebAssembly::OPERAND_BASIC_BLOCK) + : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel))) + continue; + uint64_t Depth = MI->getOperand(i).getImm(); + if (!Printed.insert(Depth).second) + continue; + const auto &Pair = ControlFlowStack.rbegin()[Depth]; + printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? "up" : "down") + + " to label" + utostr(Pair.first)); + } + } +} + +static std::string toString(const APFloat &FP) { + static const size_t BufBytes = 128; + char buf[BufBytes]; + if (FP.isNaN()) + assert((FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) || + FP.bitwiseIsEqual( + APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) && + "convertToHexString handles neither SNaN nor NaN payloads"); + // Use C99's hexadecimal floating-point representation. + auto Written = FP.convertToHexString( + buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven); + (void)Written; + assert(Written != 0); + assert(Written < BufBytes); + return buf; +} + +void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops register ops don't use TSFlags"); + unsigned WAReg = Op.getReg(); + if (int(WAReg) >= 0) + printRegName(O, WAReg); + else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs()) + O << "$pop" << (WAReg & INT32_MAX); + else if (WAReg != WebAssemblyFunctionInfo::UnusedReg) + O << "$push" << (WAReg & INT32_MAX); + else + O << "$discard"; + // Add a '=' suffix if this is a def. + if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) + O << '='; + } else if (Op.isImm()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops immediate ops"); + // TODO: (MII.get(MI->getOpcode()).TSFlags & + // WebAssemblyII::VariableOpImmediateIsLabel) + // can tell us whether this is an immediate referencing a label in the + // control flow stack, and it may be nice to pretty-print. 
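// An illustrative sketch of what such pretty-printing could build on: for a
// branch whose depth immediate is 1, the annotation logic in printInst()
// above already emits a comment of the form
//
//   # 1: up to label0
//
// where "up" means the target is a loop top and "down" a block end.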
+ O << Op.getImm(); + } else if (Op.isFPImm()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops floating point ops don't use TSFlags"); + O << toString(APFloat(Op.getFPImm())); + } else { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops expr ops"); + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +const char *llvm::WebAssembly::TypeToString(MVT Ty) { + switch (Ty.SimpleTy) { + case MVT::i32: + return "i32"; + case MVT::i64: + return "i64"; + case MVT::f32: + return "f32"; + case MVT::f64: + return "f64"; + default: + llvm_unreachable("unsupported type"); + } } diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 70fcef2..cd6c59a 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -16,14 +16,16 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/MachineValueType.h" namespace llvm { -class MCOperand; class MCSubtargetInfo; -class WebAssemblyInstPrinter : public MCInstPrinter { +class WebAssemblyInstPrinter final : public MCInstPrinter { + uint64_t ControlFlowCounter; + SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack; + public: WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI); @@ -31,8 +33,21 @@ public: void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, const MCSubtargetInfo &STI) override; + + // Used by tblegen code. + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); }; +namespace WebAssembly { + +const char *TypeToString(MVT Ty); + +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp new file mode 100644 index 0000000..bba06f6 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -0,0 +1,100 @@ +//===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements the WebAssemblyAsmBackend class. 
+/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { +class WebAssemblyAsmBackend final : public MCAsmBackend { + bool Is64Bit; + +public: + explicit WebAssemblyAsmBackend(bool Is64Bit) + : MCAsmBackend(), Is64Bit(Is64Bit) {} + ~WebAssemblyAsmBackend() override {} + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; + + // No instruction requires relaxation + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + + unsigned getNumFixupKinds() const override { + // We currently just use the generic fixups in MCFixup.h and don't have any + // target-specific fixups. + return 0; + } + + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {} + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; +}; + +bool WebAssemblyAsmBackend::writeNopData(uint64_t Count, + MCObjectWriter *OW) const { + if (Count == 0) + return true; + + // FIXME: Do something. + return false; +} + +void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); + unsigned NumBytes = RoundUpToAlignment(Info.TargetSize, 8); + if (!Value) + return; // Doesn't change encoding. + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +MCObjectWriter * +WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { + return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0); +} +} // end anonymous namespace + +MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) { + return new WebAssemblyAsmBackend(TT.isArch64Bit()); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp new file mode 100644 index 0000000..2bb58b3 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp @@ -0,0 +1,66 @@ +//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file handles ELF-specific object emission, converting LLVM's +/// internal fixups into the appropriate relocations. +/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +namespace { +class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter { +public: + WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI); + +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; +}; +} // end anonymous namespace + +WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit, + uint8_t OSABI) + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY, + /*HasRelocationAddend=*/false) {} + +unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + // WebAssembly functions are not allocated in the address space. To resolve a + // pointer to a function, we must use a special relocation type. + if (const MCSymbolRefExpr *SyExp = + dyn_cast<MCSymbolRefExpr>(Fixup.getValue())) + if (SyExp->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION) + return ELF::R_WEBASSEMBLY_FUNCTION; + + switch (Fixup.getKind()) { + case FK_Data_4: + assert(!is64Bit() && "4-byte relocations only supported on wasm32"); + return ELF::R_WEBASSEMBLY_DATA; + case FK_Data_8: + assert(is64Bit() && "8-byte relocations only supported on wasm64"); + return ELF::R_WEBASSEMBLY_DATA; + default: + llvm_unreachable("unimplemented fixup kind"); + } +} + +MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + uint8_t OSABI) { + MCELFObjectTargetWriter *MOTW = + new WebAssemblyELFObjectWriter(Is64Bit, OSABI); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index 55346f7..02c717a 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -23,15 +23,16 @@ using namespace llvm; WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {} WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { - PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit(); + PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; // TODO: What should MaxInstLength be? - PrivateGlobalPrefix = ""; - PrivateLabelPrefix = ""; - UseDataRegionDirectives = true; + // Use .skip instead of .zero because .zero is confusing when used with two + // arguments (it doesn't actually zero things out). + ZeroDirective = "\t.skip\t"; + Data8bitsDirective = "\t.int8\t"; Data16bitsDirective = "\t.int16\t"; Data32bitsDirective = "\t.int32\t"; @@ -41,9 +42,6 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { COMMDirectiveAlignmentIsInBytes = false; LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment; - HasDotTypeDotSizeDirective = false; - HasSingleParameterDotFile = false; - SupportsDebugInformation = true; // For now, WebAssembly does not support exceptions. 
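With the directive overrides above, zero fill is emitted as ".skip N" rather than the two-argument-prone ".zero", and sized data uses the explicit .intN spellings, so a data fragment prints along these lines (a sketch; exact output depends on the AsmPrinter):

	.int32	1000000
	.skip	8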
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h index d2b8fb7..2dcf2cd 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h @@ -15,13 +15,13 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { class Triple; -class WebAssemblyMCAsmInfo final : public MCAsmInfo { +class WebAssemblyMCAsmInfo final : public MCAsmInfoELF { public: explicit WebAssemblyMCAsmInfo(const Triple &T); ~WebAssemblyMCAsmInfo() override; diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp new file mode 100644 index 0000000..f409bd7 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -0,0 +1,92 @@ +//=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements the WebAssemblyMCCodeEmitter class. +/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + +namespace { +class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { + const MCInstrInfo &MCII; + const MCContext &Ctx; + + // Implementation generated by tablegen. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + +public: + WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), Ctx(ctx) {} +}; +} // end anonymous namespace + +MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx) { + return new WebAssemblyMCCodeEmitter(MCII, Ctx); +} + +void WebAssemblyMCCodeEmitter::encodeInstruction( + const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // FIXME: This is not the real binary encoding. This is an extremely + // over-simplified encoding where we just use uint64_t for everything. This + // is a temporary measure. 
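// Concretely: an instruction with opcode K and two register operands is
// written as three little-endian 64-bit words,
//
//   [ K ][ reg0 ][ reg1 ]
//
// and a variadic instruction inserts one extra word right after the opcode
// holding the count of operands beyond Desc.NumOperands. The disassembler
// in WebAssemblyDisassembler.cpp reads this same layout back, and the fixup
// offsets created below are computed against it.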
+ support::endian::Writer<support::little>(OS).write<uint64_t>(MI.getOpcode()); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (Desc.isVariadic()) + support::endian::Writer<support::little>(OS).write<uint64_t>( + MI.getNumOperands() - Desc.NumOperands); + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getReg()); + } else if (MO.isImm()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getImm()); + } else if (MO.isFPImm()) { + support::endian::Writer<support::little>(OS).write<double>(MO.getFPImm()); + } else if (MO.isExpr()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(0); + Fixups.push_back(MCFixup::create( + (1 + MCII.get(MI.getOpcode()).isVariadic() + i) * sizeof(uint64_t), + MO.getExpr(), STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4, + MI.getLoc())); + ++MCNumFixups; + } else { + llvm_unreachable("unexpected operand kind"); + } + } + + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +#include "WebAssemblyGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 224aa77..37000f1 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -15,10 +15,10 @@ #include "WebAssemblyMCTargetDesc.h" #include "InstPrinter/WebAssemblyInstPrinter.h" #include "WebAssemblyMCAsmInfo.h" +#include "WebAssemblyTargetStreamer.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -26,34 +26,98 @@ using namespace llvm; #define DEBUG_TYPE "wasm-mc-target-desc" +#define GET_INSTRINFO_MC_DESC +#include "WebAssemblyGenInstrInfo.inc" + #define GET_SUBTARGETINFO_MC_DESC #include "WebAssemblyGenSubtargetInfo.inc" #define GET_REGINFO_MC_DESC #include "WebAssemblyGenRegisterInfo.inc" -static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo &MRI, - const Triple &TT) { - MCAsmInfo *MAI = new WebAssemblyMCAsmInfo(TT); - return MAI; +static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, + const Triple &TT) { + return new WebAssemblyMCAsmInfo(TT); +} + +static MCInstrInfo *createMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitWebAssemblyMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitWebAssemblyMCRegisterInfo(X, 0); + return X; +} + +static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + assert(SyntaxVariant == 0); + return new WebAssemblyInstPrinter(MAI, MII, MRI); +} + +static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo & /*MRI*/, + MCContext &Ctx) { + return createWebAssemblyMCCodeEmitter(MCII, Ctx); +} + +static MCAsmBackend *createAsmBackend(const Target & /*T*/, + const MCRegisterInfo & /*MRI*/, + const Triple &TT, StringRef /*CPU*/) { + return createWebAssemblyAsmBackend(TT); } -static MCInstPrinter * -createWebAssemblyMCInstPrinter(const Triple &T, unsigned 
SyntaxVariant, - const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) { - if (SyntaxVariant == 0 || SyntaxVariant == 1) - return new WebAssemblyInstPrinter(MAI, MII, MRI); - return nullptr; +static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS) { + return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, FS); +} + +static MCTargetStreamer * +createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) { + return new WebAssemblyTargetELFStreamer(S); +} + +static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter * /*InstPrint*/, + bool /*isVerboseAsm*/) { + return new WebAssemblyTargetAsmStreamer(S, OS); } // Force static initialization. extern "C" void LLVMInitializeWebAssemblyTargetMC() { for (Target *T : {&TheWebAssemblyTarget32, &TheWebAssemblyTarget64}) { // Register the MC asm info. - RegisterMCAsmInfoFn X(*T, createWebAssemblyMCAsmInfo); + RegisterMCAsmInfoFn X(*T, createMCAsmInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo); // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(*T, createWebAssemblyMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(*T, createMCInstPrinter); + + // Register the MC code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createCodeEmitter); + + // Register the ASM Backend. + TargetRegistry::RegisterMCAsmBackend(*T, createAsmBackend); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createMCSubtargetInfo); + + // Register the object target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createObjectTargetStreamer); + // Register the asm target streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer); } } diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index eebf5b7..9bac4f8 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -15,32 +15,61 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" -#include <string> namespace llvm { -class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCRegisterInfo; class MCObjectWriter; -class MCStreamer; class MCSubtargetInfo; -class MCTargetStreamer; -class StringRef; class Target; class Triple; -class raw_ostream; +class raw_pwrite_stream; extern Target TheWebAssemblyTarget32; extern Target TheWebAssemblyTarget64; -MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); +MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx); + +MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); + +MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint8_t OSABI); + +namespace WebAssembly { +enum OperandType { + /// Basic block label in a branch construct. + OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET, + /// Floating-point immediate. 
+ OPERAND_FPIMM +}; + +/// WebAssembly-specific directive identifiers. +enum Directive { + // FIXME: This is not the real binary encoding. + DotParam = UINT64_MAX - 0, ///< .param + DotResult = UINT64_MAX - 1, ///< .result + DotLocal = UINT64_MAX - 2, ///< .local + DotEndFunc = UINT64_MAX - 3, ///< .endfunc +}; + +} // end namespace WebAssembly + +namespace WebAssemblyII { +enum { + // For variadic instructions, this flag indicates whether an operand + // in the variable_ops range is an immediate value. + VariableOpIsImmediate = (1 << 0), + // For immediate values in the variable_ops range, this flag indicates + // whether the value represents a control-flow label. + VariableOpImmediateIsLabel = (1 << 1), +}; +} // end namespace WebAssemblyII } // end namespace llvm @@ -50,6 +79,11 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, #define GET_REGINFO_ENUM #include "WebAssemblyGenRegisterInfo.inc" +// Defines symbolic names for the WebAssembly instructions. +// +#define GET_INSTRINFO_ENUM +#include "WebAssemblyGenInstrInfo.inc" + #define GET_SUBTARGETINFO_ENUM #include "WebAssemblyGenSubtargetInfo.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp new file mode 100644 index 0000000..1d28228 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -0,0 +1,94 @@ +//==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyTargetStreamer.h" +#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "WebAssemblyMCTargetDesc.h" +#include "WebAssemblyTargetObjectFile.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S) {} + +WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer( + MCStreamer &S, formatted_raw_ostream &OS) + : WebAssemblyTargetStreamer(S), OS(OS) {} + +WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S) + : WebAssemblyTargetStreamer(S) {} + +static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) { + bool First = true; + for (MVT Type : Types) { + if (First) + First = false; + else + OS << ", "; + OS << WebAssembly::TypeToString(Type); + } + OS << '\n'; +} + +void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) { + OS << "\t.param \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) { + OS << "\t.result \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) { + OS << "\t.local \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } + +// FIXME: What follows is not the real binary encoding. + +static void EncodeTypes(MCStreamer &Streamer, ArrayRef<MVT> Types) { + Streamer.EmitIntValue(Types.size(), sizeof(uint64_t)); + for (MVT Type : Types) + Streamer.EmitIntValue(Type.SimpleTy, sizeof(uint64_t)); +} + +void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotParam, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotResult, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotLocal, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitEndFunc() { + Streamer.EmitIntValue(WebAssembly::DotEndFunc, sizeof(uint64_t)); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h new file mode 100644 index 0000000..c66a515 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -0,0 +1,68 @@ +//==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H +#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H + +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class MCELFStreamer; + +/// WebAssembly-specific streamer interface, to implement support +/// WebAssembly-specific assembly directives. +class WebAssemblyTargetStreamer : public MCTargetStreamer { +public: + explicit WebAssemblyTargetStreamer(MCStreamer &S); + + /// .param + virtual void emitParam(ArrayRef<MVT> Types) = 0; + /// .result + virtual void emitResult(ArrayRef<MVT> Types) = 0; + /// .local + virtual void emitLocal(ArrayRef<MVT> Types) = 0; + /// .endfunc + virtual void emitEndFunc() = 0; +}; + +/// This part is for ascii assembly output +class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer { + formatted_raw_ostream &OS; + +public: + WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +/// This part is for ELF object output +class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer { +public: + explicit WebAssemblyTargetELFStreamer(MCStreamer &S); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/WebAssembly/README.txt b/contrib/llvm/lib/Target/WebAssembly/README.txt index 63e02c4..b97ea45 100644 --- a/contrib/llvm/lib/Target/WebAssembly/README.txt +++ b/contrib/llvm/lib/Target/WebAssembly/README.txt @@ -12,6 +12,16 @@ binary encoding of WebAssembly itself: * https://github.com/WebAssembly/design/blob/master/AstSemantics.md * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md +The backend is built, tested and archived on the following waterfall: + https://build.chromium.org/p/client.wasm.llvm/console + +The backend's bringup is done using the GCC torture test suite first since it +doesn't require C library support. Current known failures are in +known_gcc_test_failures.txt, all other tests should pass. The waterfall will +turn red if not. Once most of these pass, further testing will use LLVM's own +test suite. The tests can be run locally using: + github.com/WebAssembly/experimental/blob/master/buildbot/torture_test.py + Interesting work that remains to be done: * Write a pass to restructurize irreducible control flow. This needs to be done before register allocation to be efficient, because it may duplicate basic @@ -19,8 +29,60 @@ Interesting work that remains to be done: level. Note that LLVM's GPU code has such a pass, but it linearizes control flow (e.g. both sides of branches execute and are masked) which is undesirable for WebAssembly. -* Basic relooper to expose control flow as an AST. -* Figure out how to properly use MC for virtual ISAs. This may require some - refactoring of MC. + +//===---------------------------------------------------------------------===// + +set_local instructions have a return value. We should (a) model this, +and (b) write optimizations which take advantage of it. Keep in mind that +many set_local instructions are implicit! 
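For instance, under the AstSemantics draft cited at the top of this file,
  (set_local 0 (i32.add (get_local 1) (get_local 2)))
evaluates to the stored sum, so a consumer could use that result directly
rather than re-reading local 0 afterwards.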
+ +//===---------------------------------------------------------------------===// + +Br, br_if, and tableswitch instructions can support having a value on the +expression stack across the jump (sometimes). We should (a) model this, and +(b) extend the stackifier to utilize it. + +//===---------------------------------------------------------------------===// + +The min/max operators aren't exactly a<b?a:b because of NaN and negative zero +behavior. The ARM target has the same kind of min/max instructions and has +implemented optimizations for them; we should do similar optimizations for +WebAssembly. + +//===---------------------------------------------------------------------===// + +AArch64 runs SeparateConstOffsetFromGEPPass, followed by EarlyCSE and LICM. +Would these be useful to run for WebAssembly too? Also, it has an option to +run SimplifyCFG after running the AtomicExpand pass. Would this be useful for +us too? + +//===---------------------------------------------------------------------===// + +When is it profitable to set isAsCheapAsAMove on instructions in WebAssembly? + +//===---------------------------------------------------------------------===// + +Register stackification uses the EXPR_STACK physical register to impose +ordering dependencies on instructions with stack operands. This is pessimistic; +we should consider alternate ways to model stack dependencies. + +//===---------------------------------------------------------------------===// + +Lots of things could be done in WebAssemblyTargetTransformInfo.cpp. Similarly, +there are numerous optimization-related hooks that can be overridden in +WebAssemblyTargetLowering. + +//===---------------------------------------------------------------------===// + +Instead of the OptimizeReturned pass, which should consider preserving the +"returned" attribute through to MachineInstrs and extending the StoreResults +pass to do this optimization on calls too. That would also let the +WebAssemblyPeephole pass clean up dead defs for such calls, as it does for +stores. + +//===---------------------------------------------------------------------===// + +Memset/memcpy/memmove should be marked with the "returned" attribute somehow, +even when they are translated through intrinsics. //===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp b/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp new file mode 100644 index 0000000..9b718ef --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp @@ -0,0 +1,984 @@ +//===-- Relooper.cpp - Top-level interface for WebAssembly ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// \brief This implements the Relooper algorithm. This implementation includes +/// optimizations added since the original academic paper [1] was published. +/// +/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In +/// Proceedings of the ACM international conference companion on Object +/// oriented programming systems languages and applications companion +/// (SPLASH '11). ACM, New York, NY, USA, 301-312. 
DOI=10.1145/2048147.2048224 +/// http://doi.acm.org/10.1145/2048147.2048224 +/// +//===-------------------------------------------------------------------===// + +#include "Relooper.h" +#include "WebAssembly.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include <cstring> +#include <cstdlib> +#include <functional> +#include <list> +#include <stack> +#include <string> + +#define DEBUG_TYPE "relooper" + +using namespace llvm; +using namespace Relooper; + +static cl::opt<int> RelooperSplittingFactor( + "relooper-splitting-factor", + cl::desc( + "How much to discount code size when deciding whether to split a node"), + cl::init(5)); + +static cl::opt<unsigned> RelooperMultipleSwitchThreshold( + "relooper-multiple-switch-threshold", + cl::desc( + "How many entries to allow in a multiple before we use a switch"), + cl::init(10)); + +static cl::opt<unsigned> RelooperNestingLimit( + "relooper-nesting-limit", + cl::desc( + "How much nesting is acceptable"), + cl::init(20)); + + +namespace { +/// +/// Implements the relooper algorithm for a function's blocks. +/// +/// Implementation details: The Relooper instance has +/// ownership of the blocks and shapes, and frees them when done. +/// +struct RelooperAlgorithm { + std::deque<Block *> Blocks; + std::deque<Shape *> Shapes; + Shape *Root; + bool MinSize; + int BlockIdCounter; + int ShapeIdCounter; + + RelooperAlgorithm(); + ~RelooperAlgorithm(); + + void AddBlock(Block *New, int Id = -1); + + // Calculates the shapes + void Calculate(Block *Entry); + + // Sets us to try to minimize size + void SetMinSize(bool MinSize_) { MinSize = MinSize_; } +}; + +struct RelooperAnalysis final : public FunctionPass { + static char ID; + RelooperAnalysis() : FunctionPass(ID) {} + const char *getPassName() const override { return "relooper"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + bool runOnFunction(Function &F) override; +}; +} + +// RelooperAnalysis + +char RelooperAnalysis::ID = 0; +FunctionPass *llvm::createWebAssemblyRelooper() { + return new RelooperAnalysis(); +} + +bool RelooperAnalysis::runOnFunction(Function &F) { + DEBUG(dbgs() << "Relooping function '" << F.getName() << "'\n"); + RelooperAlgorithm R; + // FIXME: remove duplication between relooper's and LLVM's BBs. + std::map<const BasicBlock *, Block *> BB2B; + std::map<const Block *, const BasicBlock *> B2BB; + for (const BasicBlock &BB : F) { + // FIXME: getName is wrong here, Code is meant to represent amount of code. + // FIXME: use BranchVarInit for switch. + Block *B = new Block(BB.getName().str().data(), /*BranchVarInit=*/nullptr); + R.AddBlock(B); + assert(BB2B.find(&BB) == BB2B.end() && "Inserting the same block twice"); + assert(B2BB.find(B) == B2BB.end() && "Inserting the same block twice"); + BB2B[&BB] = B; + B2BB[B] = &BB; + } + for (Block *B : R.Blocks) { + const BasicBlock *BB = B2BB[B]; + for (const BasicBlock *Successor : successors(BB)) + // FIXME: add branch's Condition and Code below. + B->AddBranchTo(BB2B[Successor], /*Condition=*/nullptr, /*Code=*/nullptr); + } + R.Calculate(BB2B[&F.getEntryBlock()]); + return false; // Analysis passes don't modify anything. 
+} + +// Helpers + +typedef MapVector<Block *, BlockSet> BlockBlockSetMap; +typedef std::list<Block *> BlockList; + +template <class T, class U> +static bool contains(const T &container, const U &contained) { + return container.count(contained); +} + + +// Branch + +Branch::Branch(const char *ConditionInit, const char *CodeInit) + : Ancestor(nullptr), Labeled(true) { + // FIXME: move from char* to LLVM data structures + Condition = ConditionInit ? strdup(ConditionInit) : nullptr; + Code = CodeInit ? strdup(CodeInit) : nullptr; +} + +Branch::~Branch() { + // FIXME: move from char* to LLVM data structures + free(static_cast<void *>(const_cast<char *>(Condition))); + free(static_cast<void *>(const_cast<char *>(Code))); +} + +// Block + +Block::Block(const char *CodeInit, const char *BranchVarInit) + : Parent(nullptr), Id(-1), IsCheckedMultipleEntry(false) { + // FIXME: move from char* to LLVM data structures + Code = strdup(CodeInit); + BranchVar = BranchVarInit ? strdup(BranchVarInit) : nullptr; +} + +Block::~Block() { + // FIXME: move from char* to LLVM data structures + free(static_cast<void *>(const_cast<char *>(Code))); + free(static_cast<void *>(const_cast<char *>(BranchVar))); +} + +void Block::AddBranchTo(Block *Target, const char *Condition, + const char *Code) { + assert(!contains(BranchesOut, Target) && + "cannot add more than one branch to the same target"); + BranchesOut[Target] = make_unique<Branch>(Condition, Code); +} + +// Relooper + +RelooperAlgorithm::RelooperAlgorithm() + : Root(nullptr), MinSize(false), BlockIdCounter(1), + ShapeIdCounter(0) { // block ID 0 is reserved for clearings +} + +RelooperAlgorithm::~RelooperAlgorithm() { + for (auto Curr : Blocks) + delete Curr; + for (auto Curr : Shapes) + delete Curr; +} + +void RelooperAlgorithm::AddBlock(Block *New, int Id) { + New->Id = Id == -1 ? BlockIdCounter++ : Id; + Blocks.push_back(New); +} + +struct RelooperRecursor { + RelooperAlgorithm *Parent; + RelooperRecursor(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {} +}; + +void RelooperAlgorithm::Calculate(Block *Entry) { + // Scan and optimize the input + struct PreOptimizer : public RelooperRecursor { + PreOptimizer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {} + BlockSet Live; + + void FindLive(Block *Root) { + BlockList ToInvestigate; + ToInvestigate.push_back(Root); + while (!ToInvestigate.empty()) { + Block *Curr = ToInvestigate.front(); + ToInvestigate.pop_front(); + if (contains(Live, Curr)) + continue; + Live.insert(Curr); + for (const auto &iter : Curr->BranchesOut) + ToInvestigate.push_back(iter.first); + } + } + + // If a block has multiple entries but no exits, and it is small enough, it + // is useful to split it. A common example is a C++ function where + // everything ends up at a final exit block and does some RAII cleanup. 
+ // Without splitting, we will be forced to introduce labelled loops to + // allow reaching the final block + void SplitDeadEnds() { + unsigned TotalCodeSize = 0; + for (const auto &Curr : Live) { + TotalCodeSize += strlen(Curr->Code); + } + BlockSet Splits; + BlockSet Removed; + for (const auto &Original : Live) { + if (Original->BranchesIn.size() <= 1 || + !Original->BranchesOut.empty()) + continue; // only dead ends, for now + if (contains(Original->BranchesOut, Original)) + continue; // cannot split a looping node + if (strlen(Original->Code) * (Original->BranchesIn.size() - 1) > + TotalCodeSize / RelooperSplittingFactor) + continue; // if splitting increases raw code size by a significant + // amount, abort + // Split the node (for simplicity, we replace all the blocks, even + // though we could have reused the original) + DEBUG(dbgs() << " Splitting '" << Original->Code << "'\n"); + for (const auto &Prior : Original->BranchesIn) { + Block *Split = new Block(Original->Code, Original->BranchVar); + Parent->AddBlock(Split, Original->Id); + Split->BranchesIn.insert(Prior); + std::unique_ptr<Branch> Details; + Details.swap(Prior->BranchesOut[Original]); + Prior->BranchesOut[Split] = make_unique<Branch>(Details->Condition, + Details->Code); + for (const auto &iter : Original->BranchesOut) { + Block *Post = iter.first; + Branch *Details = iter.second.get(); + Split->BranchesOut[Post] = make_unique<Branch>(Details->Condition, + Details->Code); + Post->BranchesIn.insert(Split); + } + Splits.insert(Split); + Removed.insert(Original); + } + for (const auto &iter : Original->BranchesOut) { + Block *Post = iter.first; + Post->BranchesIn.remove(Original); + } + } + for (const auto &iter : Splits) + Live.insert(iter); + for (const auto &iter : Removed) + Live.remove(iter); + } + }; + PreOptimizer Pre(this); + Pre.FindLive(Entry); + + // Add incoming branches from live blocks, ignoring dead code + for (unsigned i = 0; i < Blocks.size(); i++) { + Block *Curr = Blocks[i]; + if (!contains(Pre.Live, Curr)) + continue; + for (const auto &iter : Curr->BranchesOut) + iter.first->BranchesIn.insert(Curr); + } + + if (!MinSize) + Pre.SplitDeadEnds(); + + // Recursively process the graph + + struct Analyzer : public RelooperRecursor { + Analyzer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {} + + // Add a shape to the list of shapes in this Relooper calculation + void Notice(Shape *New) { + New->Id = Parent->ShapeIdCounter++; + Parent->Shapes.push_back(New); + } + + // Create a list of entries from a block. 
If LimitTo is provided, only + // results in that set will appear + void GetBlocksOut(Block *Source, BlockSet &Entries, + BlockSet *LimitTo = nullptr) { + for (const auto &iter : Source->BranchesOut) + if (!LimitTo || contains(*LimitTo, iter.first)) + Entries.insert(iter.first); + } + + // Converts/processes all branchings to a specific target + void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor, + BlockSet &From) { + DEBUG(dbgs() << " Solipsize '" << Target->Code << "' type " << Type + << "\n"); + for (auto iter = Target->BranchesIn.begin(); + iter != Target->BranchesIn.end();) { + Block *Prior = *iter; + if (!contains(From, Prior)) { + iter++; + continue; + } + std::unique_ptr<Branch> PriorOut; + PriorOut.swap(Prior->BranchesOut[Target]); + PriorOut->Ancestor = Ancestor; + PriorOut->Type = Type; + if (MultipleShape *Multiple = dyn_cast<MultipleShape>(Ancestor)) + Multiple->Breaks++; // We are breaking out of this Multiple, so need a + // loop + iter++; // carefully increment iter before erasing + Target->BranchesIn.remove(Prior); + Target->ProcessedBranchesIn.insert(Prior); + Prior->ProcessedBranchesOut[Target].swap(PriorOut); + } + } + + Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) { + DEBUG(dbgs() << " MakeSimple inner block '" << Inner->Code << "'\n"); + SimpleShape *Simple = new SimpleShape; + Notice(Simple); + Simple->Inner = Inner; + Inner->Parent = Simple; + if (Blocks.size() > 1) { + Blocks.remove(Inner); + GetBlocksOut(Inner, NextEntries, &Blocks); + BlockSet JustInner; + JustInner.insert(Inner); + for (const auto &iter : NextEntries) + Solipsize(iter, Branch::Direct, Simple, JustInner); + } + return Simple; + } + + Shape *MakeLoop(BlockSet &Blocks, BlockSet &Entries, + BlockSet &NextEntries) { + // Find the inner blocks in this loop. Proceed backwards from the entries + // until + // you reach a seen block, collecting as you go. + BlockSet InnerBlocks; + BlockSet Queue = Entries; + while (!Queue.empty()) { + Block *Curr = *(Queue.begin()); + Queue.remove(*Queue.begin()); + if (!contains(InnerBlocks, Curr)) { + // This element is new, mark it as inner and remove from outer + InnerBlocks.insert(Curr); + Blocks.remove(Curr); + // Add the elements prior to it + for (const auto &iter : Curr->BranchesIn) + Queue.insert(iter); + } + } + assert(!InnerBlocks.empty()); + + for (const auto &Curr : InnerBlocks) { + for (const auto &iter : Curr->BranchesOut) { + Block *Possible = iter.first; + if (!contains(InnerBlocks, Possible)) + NextEntries.insert(Possible); + } + } + + LoopShape *Loop = new LoopShape(); + Notice(Loop); + + // Solipsize the loop, replacing with break/continue and marking branches + // as Processed (will not affect later calculations) + // A. Branches to the loop entries become a continue to this shape + for (const auto &iter : Entries) + Solipsize(iter, Branch::Continue, Loop, InnerBlocks); + // B. Branches to outside the loop (a next entry) become breaks on this + // shape + for (const auto &iter : NextEntries) + Solipsize(iter, Branch::Break, Loop, InnerBlocks); + // Finish up + Shape *Inner = Process(InnerBlocks, Entries, nullptr); + Loop->Inner = Inner; + return Loop; + } + + // For each entry, find the independent group reachable by it. The + // independent group is the entry itself, plus all the blocks it can + // reach that cannot be directly reached by another entry. Note that we + // ignore directly reaching the entry itself by another entry. 
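+    //
+    // As an illustrative sketch (block names are hypothetical): with
+    // entries A and B and edges A->C, B->D, C->E, D->E, the independent
+    // groups are {A, C} and {B, D}; E is reachable from both entries and
+    // so belongs to no group.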
+ // @param Ignore - previous blocks that are irrelevant + void FindIndependentGroups(BlockSet &Entries, + BlockBlockSetMap &IndependentGroups, + BlockSet *Ignore = nullptr) { + typedef std::map<Block *, Block *> BlockBlockMap; + + struct HelperClass { + BlockBlockSetMap &IndependentGroups; + BlockBlockMap Ownership; // For each block, which entry it belongs to. + // We have reached it from there. + + HelperClass(BlockBlockSetMap &IndependentGroupsInit) + : IndependentGroups(IndependentGroupsInit) {} + void InvalidateWithChildren(Block *New) { + // Being in the list means you need to be invalidated + BlockList ToInvalidate; + ToInvalidate.push_back(New); + while (!ToInvalidate.empty()) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Block *Owner = Ownership[Invalidatee]; + // Owner may have been invalidated, do not add to + // IndependentGroups! + if (contains(IndependentGroups, Owner)) + IndependentGroups[Owner].remove(Invalidatee); + if (Ownership[Invalidatee]) { // may have been seen before and + // invalidated already + Ownership[Invalidatee] = nullptr; + for (const auto &iter : Invalidatee->BranchesOut) { + Block *Target = iter.first; + BlockBlockMap::iterator Known = Ownership.find(Target); + if (Known != Ownership.end()) { + Block *TargetOwner = Known->second; + if (TargetOwner) + ToInvalidate.push_back(Target); + } + } + } + } + } + }; + HelperClass Helper(IndependentGroups); + + // We flow out from each of the entries, simultaneously. + // When we reach a new block, we add it as belonging to the one we got to + // it from. + // If we reach a new block that is already marked as belonging to someone, + // it is reachable by two entries and is not valid for any of them. + // Remove it and all it can reach that have been visited. + + // Being in the queue means we just added this item, and + // we need to add its children + BlockList Queue; + for (const auto &Entry : Entries) { + Helper.Ownership[Entry] = Entry; + IndependentGroups[Entry].insert(Entry); + Queue.push_back(Entry); + } + while (!Queue.empty()) { + Block *Curr = Queue.front(); + Queue.pop_front(); + Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership + // map if we are in the queue + if (!Owner) + continue; // we have been invalidated meanwhile after being reached + // from two entries + // Add all children + for (const auto &iter : Curr->BranchesOut) { + Block *New = iter.first; + BlockBlockMap::iterator Known = Helper.Ownership.find(New); + if (Known == Helper.Ownership.end()) { + // New node. Add it, and put it in the queue + Helper.Ownership[New] = Owner; + IndependentGroups[Owner].insert(New); + Queue.push_back(New); + continue; + } + Block *NewOwner = Known->second; + if (!NewOwner) + continue; // We reached an invalidated node + if (NewOwner != Owner) + // Invalidate this and all reachable that we have seen - we reached + // this from two locations + Helper.InvalidateWithChildren(New); + // otherwise, we have the same owner, so do nothing + } + } + + // Having processed all the interesting blocks, we remain with just one + // potential issue: + // If a->b, and a was invalidated, but then b was later reached by + // someone else, we must invalidate b. To check for this, we go over all + // elements in the independent groups, if an element has a parent which + // does *not* have the same owner, we/ must remove it and all its + // children. 
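+      // (An illustration with hypothetical blocks: if a->b was recorded
+      // while a still had an owner, and a was invalidated later, b can be
+      // left in that owner's group even though its parent a no longer has
+      // an owner; the scan below finds such children and invalidates them.)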
+ + for (const auto &iter : Entries) { + BlockSet &CurrGroup = IndependentGroups[iter]; + BlockList ToInvalidate; + for (const auto &iter : CurrGroup) { + Block *Child = iter; + for (const auto &iter : Child->BranchesIn) { + Block *Parent = iter; + if (Ignore && contains(*Ignore, Parent)) + continue; + if (Helper.Ownership[Parent] != Helper.Ownership[Child]) + ToInvalidate.push_back(Child); + } + } + while (!ToInvalidate.empty()) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Helper.InvalidateWithChildren(Invalidatee); + } + } + + // Remove empty groups + for (const auto &iter : Entries) + if (IndependentGroups[iter].empty()) + IndependentGroups.erase(iter); + } + + Shape *MakeMultiple(BlockSet &Blocks, BlockSet &Entries, + BlockBlockSetMap &IndependentGroups, Shape *Prev, + BlockSet &NextEntries) { + bool Fused = isa<SimpleShape>(Prev); + MultipleShape *Multiple = new MultipleShape(); + Notice(Multiple); + BlockSet CurrEntries; + for (auto &iter : IndependentGroups) { + Block *CurrEntry = iter.first; + BlockSet &CurrBlocks = iter.second; + // Create inner block + CurrEntries.clear(); + CurrEntries.insert(CurrEntry); + for (const auto &CurrInner : CurrBlocks) { + // Remove the block from the remaining blocks + Blocks.remove(CurrInner); + // Find new next entries and fix branches to them + for (auto iter = CurrInner->BranchesOut.begin(); + iter != CurrInner->BranchesOut.end();) { + Block *CurrTarget = iter->first; + auto Next = iter; + Next++; + if (!contains(CurrBlocks, CurrTarget)) { + NextEntries.insert(CurrTarget); + Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks); + } + iter = Next; // increment carefully because Solipsize can remove us + } + } + Multiple->InnerMap[CurrEntry->Id] = + Process(CurrBlocks, CurrEntries, nullptr); + // If we are not fused, then our entries will actually be checked + if (!Fused) + CurrEntry->IsCheckedMultipleEntry = true; + } + // Add entries not handled as next entries, they are deferred + for (const auto &Entry : Entries) + if (!contains(IndependentGroups, Entry)) + NextEntries.insert(Entry); + // The multiple has been created, we can decide how to implement it + if (Multiple->InnerMap.size() >= RelooperMultipleSwitchThreshold) { + Multiple->UseSwitch = true; + Multiple->Breaks++; // switch captures breaks + } + return Multiple; + } + + // Main function. + // Process a set of blocks with specified entries, returns a shape + // The Make* functions receive a NextEntries. If they fill it with data, + // those are the entries for the ->Next block on them, and the blocks + // are what remains in Blocks (which Make* modify). In this way + // we avoid recursing on Next (imagine a long chain of Simples, if we + // recursed we could blow the stack). 
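+    //
+    // An illustrative sketch: a straight chain A->B->C (hypothetical block
+    // names) becomes three SimpleShapes linked through their Next fields;
+    // each pass through the loop below handles one link via Make, so chain
+    // length costs no extra call-stack depth.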
+ Shape *Process(BlockSet &Blocks, BlockSet &InitialEntries, Shape *Prev) { + BlockSet *Entries = &InitialEntries; + BlockSet TempEntries[2]; + int CurrTempIndex = 0; + BlockSet *NextEntries; + Shape *Ret = nullptr; + + auto Make = [&](Shape *Temp) { + if (Prev) + Prev->Next = Temp; + if (!Ret) + Ret = Temp; + Prev = Temp; + Entries = NextEntries; + }; + + while (1) { + CurrTempIndex = 1 - CurrTempIndex; + NextEntries = &TempEntries[CurrTempIndex]; + NextEntries->clear(); + + if (Entries->empty()) + return Ret; + if (Entries->size() == 1) { + Block *Curr = *(Entries->begin()); + if (Curr->BranchesIn.empty()) { + // One entry, no looping ==> Simple + Make(MakeSimple(Blocks, Curr, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + // One entry, looping ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + + // More than one entry, try to eliminate through a Multiple groups of + // independent blocks from an entry/ies. It is important to remove + // through multiples as opposed to looping since the former is more + // performant. + BlockBlockSetMap IndependentGroups; + FindIndependentGroups(*Entries, IndependentGroups); + + if (!IndependentGroups.empty()) { + // We can handle a group in a multiple if its entry cannot be reached + // by another group. + // Note that it might be reachable by itself - a loop. But that is + // fine, we will create a loop inside the multiple block (which + // is the performant order to do it). + for (auto iter = IndependentGroups.begin(); + iter != IndependentGroups.end();) { + Block *Entry = iter->first; + BlockSet &Group = iter->second; + auto curr = iter++; // iterate carefully, we may delete + for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin(); + iterBranch != Entry->BranchesIn.end(); iterBranch++) { + Block *Origin = *iterBranch; + if (!contains(Group, Origin)) { + // Reached from outside the group, so we cannot handle this + IndependentGroups.erase(curr); + break; + } + } + } + + // As an optimization, if we have 2 independent groups, and one is a + // small dead end, we can handle only that dead end. + // The other then becomes a Next - without nesting in the code and + // recursion in the analysis. + // TODO: if the larger is the only dead end, handle that too + // TODO: handle >2 groups + // TODO: handle not just dead ends, but also that do not branch to the + // NextEntries. However, must be careful there since we create a + // Next, and that Next can prevent eliminating a break (since we no + // longer naturally reach the same place), which may necessitate a + // one-time loop, which makes the unnesting pointless. + if (IndependentGroups.size() == 2) { + // Find the smaller one + auto iter = IndependentGroups.begin(); + Block *SmallEntry = iter->first; + auto SmallSize = iter->second.size(); + iter++; + Block *LargeEntry = iter->first; + auto LargeSize = iter->second.size(); + if (SmallSize != LargeSize) { // ignore the case where they are + // identical - keep things symmetrical + // there + if (SmallSize > LargeSize) { + Block *Temp = SmallEntry; + SmallEntry = LargeEntry; + LargeEntry = Temp; // Note: we did not flip the Sizes too, they + // are now invalid. TODO: use the smaller + // size as a limit? 
+ } + // Check if dead end + bool DeadEnd = true; + BlockSet &SmallGroup = IndependentGroups[SmallEntry]; + for (const auto &Curr : SmallGroup) { + for (const auto &iter : Curr->BranchesOut) { + Block *Target = iter.first; + if (!contains(SmallGroup, Target)) { + DeadEnd = false; + break; + } + } + if (!DeadEnd) + break; + } + if (DeadEnd) + IndependentGroups.erase(LargeEntry); + } + } + + if (!IndependentGroups.empty()) + // Some groups removable ==> Multiple + Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev, + *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + // No independent groups, must be loopable ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + } + }; + + // Main + + BlockSet AllBlocks; + for (const auto &Curr : Pre.Live) { + AllBlocks.insert(Curr); + } + + BlockSet Entries; + Entries.insert(Entry); + Root = Analyzer(this).Process(AllBlocks, Entries, nullptr); + assert(Root); + + /// + /// Relooper post-optimizer + /// + struct PostOptimizer { + RelooperAlgorithm *Parent; + std::stack<Shape *> LoopStack; + + PostOptimizer(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {} + + void ShapeSwitch(Shape* var, + std::function<void (SimpleShape*)> simple, + std::function<void (MultipleShape*)> multiple, + std::function<void (LoopShape*)> loop) { + switch (var->getKind()) { + case Shape::SK_Simple: { + simple(cast<SimpleShape>(var)); + break; + } + case Shape::SK_Multiple: { + multiple(cast<MultipleShape>(var)); + break; + } + case Shape::SK_Loop: { + loop(cast<LoopShape>(var)); + break; + } + } + } + + // Find the blocks that natural control flow can get us directly to, or + // through a multiple that we ignore + void FollowNaturalFlow(Shape *S, BlockSet &Out) { + ShapeSwitch(S, [&](SimpleShape* Simple) { + Out.insert(Simple->Inner); + }, [&](MultipleShape* Multiple) { + for (const auto &iter : Multiple->InnerMap) { + FollowNaturalFlow(iter.second, Out); + } + FollowNaturalFlow(Multiple->Next, Out); + }, [&](LoopShape* Loop) { + FollowNaturalFlow(Loop->Inner, Out); + }); + } + + void FindNaturals(Shape *Root, Shape *Otherwise = nullptr) { + if (Root->Next) { + Root->Natural = Root->Next; + FindNaturals(Root->Next, Otherwise); + } else { + Root->Natural = Otherwise; + } + + ShapeSwitch(Root, [](SimpleShape* Simple) { + }, [&](MultipleShape* Multiple) { + for (const auto &iter : Multiple->InnerMap) { + FindNaturals(iter.second, Root->Natural); + } + }, [&](LoopShape* Loop){ + FindNaturals(Loop->Inner, Loop->Inner); + }); + } + + // Remove unneeded breaks and continues. + // A flow operation is trivially unneeded if the shape we naturally get to + // by normal code execution is the same as the flow forces us to. + void RemoveUnneededFlows(Shape *Root, Shape *Natural = nullptr, + LoopShape *LastLoop = nullptr, + unsigned Depth = 0) { + BlockSet NaturalBlocks; + FollowNaturalFlow(Natural, NaturalBlocks); + Shape *Next = Root; + while (Next) { + Root = Next; + Next = nullptr; + ShapeSwitch( + Root, + [&](SimpleShape* Simple) { + if (Simple->Inner->BranchVar) + LastLoop = + nullptr; // a switch clears out the loop (TODO: only for + // breaks, not continue) + + if (Simple->Next) { + if (!Simple->Inner->BranchVar && + Simple->Inner->ProcessedBranchesOut.size() == 2 && + Depth < RelooperNestingLimit) { + // If there is a next block, we already know at Simple + // creation time to make direct branches, and we can do + // nothing more in general. 
But, we try to optimize the + // case of a break and a direct: This would normally be + // if (break?) { break; } .. + // but if we make sure to nest the else, we can save the + // break, + // if (!break?) { .. } + // This is also better because the more canonical nested + // form is easier to further optimize later. The + // downside is more nesting, which adds to size in builds with + // whitespace. + // Note that we avoid switches, as it complicates control flow + // and is not relevant for the common case we optimize here. + bool Found = false; + bool Abort = false; + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Block *Target = iter.first; + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break) { + Found = true; + if (!contains(NaturalBlocks, Target)) + Abort = true; + } else if (Details->Type != Branch::Direct) + Abort = true; + } + if (Found && !Abort) { + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break) { + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } else { + assert(Details->Type == Branch::Direct); + Details->Type = Branch::Nested; + } + } + } + Depth++; // this optimization increases depth, for us and all + // our next chain (i.e., until this call returns) + } + Next = Simple->Next; + } else { + // If there is no next then Natural is where we will + // go to by doing nothing, so we can potentially optimize some + // branches to direct. + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Block *Target = iter.first; + Branch *Details = iter.second.get(); + if (Details->Type != Branch::Direct && + contains(NaturalBlocks, + Target)) { // note: cannot handle split blocks + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } else if (Details->Type == Branch::Break && LastLoop && + LastLoop->Natural == Details->Ancestor->Natural) { + // it is important to simplify breaks, as simpler breaks + // enable other optimizations + Details->Labeled = false; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } + } + } + }, [&](MultipleShape* Multiple) + { + for (const auto &iter : Multiple->InnerMap) { + RemoveUnneededFlows(iter.second, Multiple->Next, + Multiple->Breaks ? 
nullptr : LastLoop, + Depth + 1); + } + Next = Multiple->Next; + }, [&](LoopShape* Loop) + { + RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth + 1); + Next = Loop->Next; + }); + } + } + + // After we know which loops exist, we can calculate which need to be + // labeled + void FindLabeledLoops(Shape *Root) { + Shape *Next = Root; + while (Next) { + Root = Next; + Next = nullptr; + + ShapeSwitch( + Root, + [&](SimpleShape *Simple) { + MultipleShape *Fused = dyn_cast<MultipleShape>(Root->Next); + // If we are fusing a Multiple with a loop into this Simple, then + // visit it now + if (Fused && Fused->Breaks) + LoopStack.push(Fused); + if (Simple->Inner->BranchVar) + LoopStack.push(nullptr); // a switch means breaks are now useless, + // push a dummy + if (Fused) { + if (Fused->UseSwitch) + LoopStack.push(nullptr); // a switch means breaks are now + // useless, push a dummy + for (const auto &iter : Fused->InnerMap) { + FindLabeledLoops(iter.second); + } + } + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break || + Details->Type == Branch::Continue) { + assert(!LoopStack.empty()); + if (Details->Ancestor != LoopStack.top() && Details->Labeled) { + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) { + Multiple->Labeled = true; + } else { + LoopShape *Loop = cast<LoopShape>(Details->Ancestor); + Loop->Labeled = true; + } + } else { + Details->Labeled = false; + } + } + if (Fused && Fused->UseSwitch) + LoopStack.pop(); + if (Simple->Inner->BranchVar) + LoopStack.pop(); + if (Fused && Fused->Breaks) + LoopStack.pop(); + if (Fused) + Next = Fused->Next; + else + Next = Root->Next; + } + } + , [&](MultipleShape* Multiple) { + if (Multiple->Breaks) + LoopStack.push(Multiple); + for (const auto &iter : Multiple->InnerMap) + FindLabeledLoops(iter.second); + if (Multiple->Breaks) + LoopStack.pop(); + Next = Root->Next; + } + , [&](LoopShape* Loop) { + LoopStack.push(Loop); + FindLabeledLoops(Loop->Inner); + LoopStack.pop(); + Next = Root->Next; + }); + } + } + + void Process(Shape * Root) { + FindNaturals(Root); + RemoveUnneededFlows(Root); + FindLabeledLoops(Root); + } + }; + + PostOptimizer(this).Process(Root); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/Relooper.h b/contrib/llvm/lib/Target/WebAssembly/Relooper.h new file mode 100644 index 0000000..7c564de --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Relooper.h @@ -0,0 +1,186 @@ +//===-- Relooper.h - Top-level interface for WebAssembly ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-------------------------------------------------------------------===// +/// +/// \file +/// \brief This defines an optimized C++ implemention of the Relooper +/// algorithm, originally developed as part of Emscripten, which +/// generates a structured AST from arbitrary control flow. 
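+///
+/// As an illustrative sketch (not a literal transcript of this API), an
+/// if/else diamond
+/// \code
+///   Entry -> Then -> Exit
+///   Entry -> Else -> Exit
+/// \endcode
+/// reloops into a SimpleShape for Entry, followed by a MultipleShape
+/// containing Then and Else, followed by a SimpleShape for Exit, with the
+/// pieces linked through Shape::Next.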
+/// +//===-------------------------------------------------------------------===// + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Support/Casting.h" + +#include <cassert> +#include <cstdarg> +#include <cstdio> +#include <deque> +#include <list> +#include <map> +#include <memory> +#include <set> + +namespace llvm { + +namespace Relooper { + +struct Block; +struct Shape; + +/// +/// Info about a branching from one block to another +/// +struct Branch { + enum FlowType { + Direct = 0, // We will directly reach the right location through other + // means, no need for continue or break + Break = 1, + Continue = 2, + Nested = 3 // This code is directly reached, but we must be careful to + // ensure it is nested in an if - it is not reached + // unconditionally, other code paths exist alongside it that we need to make + // sure do not intertwine + }; + Shape + *Ancestor; // If not nullptr, this shape is the relevant one for purposes + // of getting to the target block. We break or continue on it + Branch::FlowType + Type; // If Ancestor is not nullptr, this says whether to break or + // continue + bool Labeled; // If a break or continue, whether we need to use a label + const char *Condition; // The condition for which we branch. For example, + // "my_var == 1". Conditions are checked one by one. + // One of the conditions should have nullptr as the + // condition, in which case it is the default + // FIXME: move from char* to LLVM data structures + const char *Code; // If provided, code that is run right before the branch is + // taken. This is useful for phis + // FIXME: move from char* to LLVM data structures + + Branch(const char *ConditionInit, const char *CodeInit = nullptr); + ~Branch(); +}; + +typedef SetVector<Block *> BlockSet; +typedef MapVector<Block *, Branch *> BlockBranchMap; +typedef MapVector<Block *, std::unique_ptr<Branch>> OwningBlockBranchMap; + +/// +/// Represents a basic block of code - some instructions that end with a +/// control flow modifier (a branch, return or throw). +/// +struct Block { + // Branches become processed after we finish the shape relevant to them. For + // example, when we recreate a loop, branches to the loop start become + // continues and are now processed. When we calculate what shape to generate + // from a set of blocks, we ignore processed branches. Blocks own the Branch + // objects they use, and destroy them when done. + OwningBlockBranchMap BranchesOut; + BlockSet BranchesIn; + OwningBlockBranchMap ProcessedBranchesOut; + BlockSet ProcessedBranchesIn; + Shape *Parent; // The shape we are directly inside + int Id; // A unique identifier, defined when added to relooper. Note that this + // uniquely identifies a *logical* block - if we split it, the two + // instances have the same content *and* the same Id + const char *Code; // The string representation of the code in this block. 
+ // Owning pointer (we copy the input) + // FIXME: move from char* to LLVM data structures + const char *BranchVar; // A variable whose value determines where we go; if + // this is not nullptr, emit a switch on that variable + // FIXME: move from char* to LLVM data structures + bool IsCheckedMultipleEntry; // If true, we are a multiple entry, so reaching + // us requires setting the label variable + + Block(const char *CodeInit, const char *BranchVarInit); + ~Block(); + + void AddBranchTo(Block *Target, const char *Condition, + const char *Code = nullptr); +}; + +/// +/// Represents a structured control flow shape +/// +struct Shape { + int Id; // A unique identifier. Used to identify loops, labels are Lx where x + // is the Id. Defined when added to relooper + Shape *Next; // The shape that will appear in the code right after this one + Shape *Natural; // The shape that control flow gets to naturally (if there is + // Next, then this is Next) + + /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.) + enum ShapeKind { SK_Simple, SK_Multiple, SK_Loop }; + +private: + ShapeKind Kind; + +public: + ShapeKind getKind() const { return Kind; } + + Shape(ShapeKind KindInit) : Id(-1), Next(nullptr), Kind(KindInit) {} +}; + +/// +/// Simple: No control flow at all, just instructions. +/// +struct SimpleShape : public Shape { + Block *Inner; + + SimpleShape() : Shape(SK_Simple), Inner(nullptr) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Simple; } +}; + +/// +/// A shape that may be implemented with a labeled loop. +/// +struct LabeledShape : public Shape { + bool Labeled; // If we have a loop, whether it needs to be labeled + + LabeledShape(ShapeKind KindInit) : Shape(KindInit), Labeled(false) {} +}; + +// Blocks with the same id were split and are identical, so we just care about +// ids in Multiple entries +typedef std::map<int, Shape *> IdShapeMap; + +/// +/// Multiple: A shape with more than one entry. If the next block to +/// be entered is among them, we run it and continue to +/// the next shape, otherwise we continue immediately to the +/// next shape. +/// +struct MultipleShape : public LabeledShape { + IdShapeMap InnerMap; // entry block ID -> shape + int Breaks; // If we have branches on us, we need a loop (or a switch). This + // is a counter of requirements, + // if we optimize it to 0, the loop is unneeded + bool UseSwitch; // Whether to switch on label as opposed to an if-else chain + + MultipleShape() : LabeledShape(SK_Multiple), Breaks(0), UseSwitch(false) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Multiple; } +}; + +/// +/// Loop: An infinite loop. 
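+/// Exits occur only via Break branches that reference this shape (see
+/// MakeLoop in Relooper.cpp); the body never falls off the end of the loop.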
+/// +struct LoopShape : public LabeledShape { + Shape *Inner; + + LoopShape() : LabeledShape(SK_Loop), Inner(nullptr) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Loop; } +}; + +} // namespace Relooper + +} // namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h index 3ff19d4..e972da5 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h @@ -23,8 +23,22 @@ namespace llvm { class WebAssemblyTargetMachine; class FunctionPass; +FunctionPass *createWebAssemblyOptimizeReturned(); + FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM, CodeGenOpt::Level OptLevel); +FunctionPass *createWebAssemblyArgumentMove(); + +FunctionPass *createWebAssemblyStoreResults(); +FunctionPass *createWebAssemblyRegStackify(); +FunctionPass *createWebAssemblyRegColoring(); +FunctionPass *createWebAssemblyPEI(); +FunctionPass *createWebAssemblyCFGStackify(); +FunctionPass *createWebAssemblyLowerBrUnless(); +FunctionPass *createWebAssemblyRegNumbering(); +FunctionPass *createWebAssemblyPeephole(); + +FunctionPass *createWebAssemblyRelooper(); } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td index a123bf6..551ad93 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -6,10 +6,11 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This is a target description file for the WebAssembly architecture, which is -// also known as "wasm". -// +/// +/// \file +/// \brief This is a target description file for the WebAssembly architecture, +/// which is also known as "wasm". +/// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -50,6 +51,9 @@ def WebAssemblyInstrInfo : InstrInfo; // Minimal Viable Product. def : ProcessorModel<"mvp", NoSchedModel, []>; +// Generic processor: latest stable version. +def : ProcessorModel<"generic", NoSchedModel, []>; + // Latest and greatest experimental version of WebAssembly. Bugs included! def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureSIMD128]>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp new file mode 100644 index 0000000..3893c40 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp @@ -0,0 +1,110 @@ +//===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling. +/// +/// Arguments are really live-in registers, however, since we use virtual +/// registers and LLVM doesn't support live-in virtual registers, we're +/// currently making do with ARGUMENT instructions which are placed at the top +/// of the entry block. The trick is to get them to *stay* at the top of the +/// entry block. 
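+///
+/// As an illustrative sketch (hypothetical MIR, register names invented),
+/// the scheduler may leave the entry block looking like
+/// \code
+///   %v0 = ARGUMENT_I32 0
+///   %v2 = CONST_I32 1
+///   %v1 = ARGUMENT_I32 1
+/// \endcode
+/// and this pass moves the trailing ARGUMENT_I32 up, so both ARGUMENTs
+/// again precede the first non-ARGUMENT instruction.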
+/// +/// The ARGUMENTS physical register keeps these instructions pinned in place +/// during liveness-aware CodeGen passes, however one thing which does not +/// respect this is the ScheduleDAG scheduler. This pass is therefore run +/// immediately after that. +/// +/// This is all hopefully a temporary solution until we find a better solution +/// for describing the live-in nature of arguments. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-argument-move" + +namespace { +class WebAssemblyArgumentMove final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyArgumentMove() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Argument Move"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // end anonymous namespace + +char WebAssemblyArgumentMove::ID = 0; +FunctionPass *llvm::createWebAssemblyArgumentMove() { + return new WebAssemblyArgumentMove(); +} + +/// Test whether the given instruction is an ARGUMENT. +static bool IsArgument(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + return true; + default: + return false; + } +} + +bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Argument Move **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + bool Changed = false; + MachineBasicBlock &EntryMBB = MF.front(); + MachineBasicBlock::iterator InsertPt = EntryMBB.end(); + + // Look for the first NonArg instruction. + for (auto MII = EntryMBB.begin(), MIE = EntryMBB.end(); MII != MIE; ++MII) { + MachineInstr *MI = MII; + if (!IsArgument(MI)) { + InsertPt = MII; + break; + } + } + + // Now move any argument instructions later in the block + // to before our first NonArg instruction. + for (auto I = InsertPt, E = EntryMBB.end(); I != E; ++I) { + MachineInstr *MI = I; + if (IsArgument(MI)) { + EntryMBB.insert(InsertPt, MI->removeFromParent()); + Changed = true; + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp new file mode 100644 index 0000000..45ac99d --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -0,0 +1,294 @@ +//===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains a printer that converts from our internal +/// representation of machine-dependent LLVM code to the WebAssembly assembly +/// language. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "MCTargetDesc/WebAssemblyTargetStreamer.h" +#include "WebAssemblyMCInstLower.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblyRegisterInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +namespace { + +class WebAssemblyAsmPrinter final : public AsmPrinter { + const MachineRegisterInfo *MRI; + const WebAssemblyFunctionInfo *MFI; + +public: + WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {} + +private: + const char *getPassName() const override { + return "WebAssembly Assembly Printer"; + } + + //===------------------------------------------------------------------===// + // MachineFunctionPass Implementation. + //===------------------------------------------------------------------===// + + bool runOnMachineFunction(MachineFunction &MF) override { + MRI = &MF.getRegInfo(); + MFI = MF.getInfo<WebAssemblyFunctionInfo>(); + return AsmPrinter::runOnMachineFunction(MF); + } + + //===------------------------------------------------------------------===// + // AsmPrinter Implementation. + //===------------------------------------------------------------------===// + + void EmitJumpTableInfo() override; + void EmitConstantPool() override; + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + void EmitInstruction(const MachineInstr *MI) override; + const MCExpr *lowerConstant(const Constant *CV) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + + MVT getRegType(unsigned RegNo) const; + const char *toString(MVT VT) const; + std::string regToString(const MachineOperand &MO); + WebAssemblyTargetStreamer *getTargetStreamer(); +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Helpers. +//===----------------------------------------------------------------------===// + +MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { + const TargetRegisterClass *TRC = + TargetRegisterInfo::isVirtualRegister(RegNo) + ? 
MRI->getRegClass(RegNo) + : MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo); + for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) + if (TRC->hasType(T)) + return T; + DEBUG(errs() << "Unknown type for register number: " << RegNo); + llvm_unreachable("Unknown register type"); + return MVT::Other; +} + +const char *WebAssemblyAsmPrinter::toString(MVT VT) const { + return WebAssembly::TypeToString(VT); +} + +std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { + unsigned RegNo = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(RegNo) && + "Unlowered physical register encountered during assembly printing"); + assert(!MFI->isVRegStackified(RegNo)); + unsigned WAReg = MFI->getWAReg(RegNo); + assert(WAReg != WebAssemblyFunctionInfo::UnusedReg); + return '$' + utostr(WAReg); +} + +WebAssemblyTargetStreamer * +WebAssemblyAsmPrinter::getTargetStreamer() { + MCTargetStreamer *TS = OutStreamer->getTargetStreamer(); + return static_cast<WebAssemblyTargetStreamer *>(TS); +} + +//===----------------------------------------------------------------------===// +// WebAssemblyAsmPrinter Implementation. +//===----------------------------------------------------------------------===// + +void WebAssemblyAsmPrinter::EmitConstantPool() { + assert(MF->getConstantPool()->getConstants().empty() && + "WebAssembly disables constant pools"); +} + +void WebAssemblyAsmPrinter::EmitJumpTableInfo() { + // Nothing to do; jump tables are incorporated into the instruction stream. +} + +static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, + Type *Ty, SmallVectorImpl<MVT> &ValueVTs) { + const DataLayout &DL(F.getParent()->getDataLayout()); + const WebAssemblyTargetLowering &TLI = + *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering(); + SmallVector<EVT, 4> VTs; + ComputeValueVTs(TLI, DL, Ty, VTs); + + for (EVT VT : VTs) { + unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT); + MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT); + for (unsigned i = 0; i != NumRegs; ++i) + ValueVTs.push_back(RegisterVT); + } +} + +void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { + if (!MFI->getParams().empty()) + getTargetStreamer()->emitParam(MFI->getParams()); + + SmallVector<MVT, 4> ResultVTs; + const Function &F(*MF->getFunction()); + ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs); + + // If the return type needs to be legalized it will get converted into + // passing a pointer. + if (ResultVTs.size() == 1) + getTargetStreamer()->emitResult(ResultVTs); + + bool AnyWARegs = false; + SmallVector<MVT, 16> LocalTypes; + for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx); + unsigned WAReg = MFI->getWAReg(VReg); + // Don't declare unused registers. + if (WAReg == WebAssemblyFunctionInfo::UnusedReg) + continue; + // Don't redeclare parameters. + if (WAReg < MFI->getParams().size()) + continue; + // Don't declare stackified registers. 
+ if (int(WAReg) < 0) + continue; + LocalTypes.push_back(getRegType(VReg)); + AnyWARegs = true; + } + auto &PhysRegs = MFI->getPhysRegs(); + for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) { + if (PhysRegs[PReg] == -1U) + continue; + LocalTypes.push_back(getRegType(PReg)); + AnyWARegs = true; + } + if (AnyWARegs) + getTargetStreamer()->emitLocal(LocalTypes); + + AsmPrinter::EmitFunctionBodyStart(); +} + +void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() { + getTargetStreamer()->emitEndFunc(); +} + +void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { + DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); + + switch (MI->getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + // These represent values which are live into the function entry, so there's + // no instruction to emit. + break; + default: { + WebAssemblyMCInstLower MCInstLowering(OutContext, *this); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + break; + } + } +} + +const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) { + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) + if (GV->getValueType()->isFunctionTy()) + return MCSymbolRefExpr::create( + getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext); + return AsmPrinter::lowerConstant(CV); +} + +bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &OS) { + if (AsmVariant != 0) + report_fatal_error("There are no defined alternate asm variants"); + + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS)) + return false; + + if (!ExtraCode) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + OS << MO.getImm(); + return false; + case MachineOperand::MO_Register: + OS << regToString(MO); + return false; + case MachineOperand::MO_GlobalAddress: + getSymbol(MO.getGlobal())->print(OS, MAI); + printOffset(MO.getOffset(), OS); + return false; + case MachineOperand::MO_ExternalSymbol: + GetExternalSymbolSymbol(MO.getSymbolName())->print(OS, MAI); + printOffset(MO.getOffset(), OS); + return false; + case MachineOperand::MO_MachineBasicBlock: + MO.getMBB()->getSymbol()->print(OS, MAI); + return false; + default: + break; + } + } + + return true; +} + +bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &OS) { + if (AsmVariant != 0) + report_fatal_error("There are no defined alternate asm variants"); + + if (!ExtraCode) { + // TODO: For now, we just hard-code 0 as the constant offset; teach + // SelectInlineAsmMemoryOperand how to do address mode matching. + OS << "0(" + regToString(MI->getOperand(OpNo)) + ')'; + return false; + } + + return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS); +} + +// Force static initialization. 
+extern "C" void LLVMInitializeWebAssemblyAsmPrinter() { + RegisterAsmPrinter<WebAssemblyAsmPrinter> X(TheWebAssemblyTarget32); + RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(TheWebAssemblyTarget64); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp new file mode 100644 index 0000000..a39349c --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -0,0 +1,493 @@ +//===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a CFG stacking pass. +/// +/// This pass reorders the blocks in a function to put them into a reverse +/// post-order [0], with special care to keep the order as similar as possible +/// to the original order, and to keep loops contiguous even in the case of +/// split backedges. +/// +/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since +/// scope boundaries serve as the labels for WebAssembly's control transfers. +/// +/// This is sufficient to convert arbitrary CFGs into a form that works on +/// WebAssembly, provided that all loops are single-entry. +/// +/// [0] https://en.wikipedia.org/wiki/Depth-first_search#Vertex_orderings +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-cfg-stackify" + +namespace { +class WebAssemblyCFGStackify final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly CFG Stackify"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyCFGStackify() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyCFGStackify::ID = 0; +FunctionPass *llvm::createWebAssemblyCFGStackify() { + return new WebAssemblyCFGStackify(); +} + +static void EliminateMultipleEntryLoops(MachineFunction &MF, + const MachineLoopInfo &MLI) { + SmallPtrSet<MachineBasicBlock *, 8> InSet; + for (scc_iterator<MachineFunction *> I = scc_begin(&MF), E = scc_end(&MF); + I != E; ++I) { + const std::vector<MachineBasicBlock *> &CurrentSCC = *I; + + // Skip trivial SCCs. 
+ if (CurrentSCC.size() == 1) + continue; + + InSet.insert(CurrentSCC.begin(), CurrentSCC.end()); + MachineBasicBlock *Header = nullptr; + for (MachineBasicBlock *MBB : CurrentSCC) { + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (InSet.count(Pred)) + continue; + if (!Header) { + Header = MBB; + break; + } + // TODO: Implement multiple-entry loops. + report_fatal_error("multiple-entry loops are not supported yet"); + } + } + assert(MLI.isLoopHeader(Header)); + + InSet.clear(); + } +} + +namespace { +/// Post-order traversal stack entry. +struct POStackEntry { + MachineBasicBlock *MBB; + SmallVector<MachineBasicBlock *, 0> Succs; + + POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF, + const MachineLoopInfo &MLI); +}; +} // end anonymous namespace + +static bool LoopContains(const MachineLoop *Loop, + const MachineBasicBlock *MBB) { + return Loop ? Loop->contains(MBB) : true; +} + +POStackEntry::POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF, + const MachineLoopInfo &MLI) + : MBB(MBB), Succs(MBB->successors()) { + // RPO is not a unique form, since at every basic block with multiple + // successors, the DFS has to pick which order to visit the successors in. + // Sort them strategically (see below). + MachineLoop *Loop = MLI.getLoopFor(MBB); + MachineFunction::iterator Next = next(MachineFunction::iterator(MBB)); + MachineBasicBlock *LayoutSucc = Next == MF.end() ? nullptr : &*Next; + std::stable_sort( + Succs.begin(), Succs.end(), + [=, &MLI](const MachineBasicBlock *A, const MachineBasicBlock *B) { + if (A == B) + return false; + + // Keep loops contiguous by preferring the block that's in the same + // loop. + bool LoopContainsA = LoopContains(Loop, A); + bool LoopContainsB = LoopContains(Loop, B); + if (LoopContainsA && !LoopContainsB) + return true; + if (!LoopContainsA && LoopContainsB) + return false; + + // Minimize perturbation by preferring the block which is the immediate + // layout successor. + if (A == LayoutSucc) + return true; + if (B == LayoutSucc) + return false; + + // TODO: More sophisticated orderings may be profitable here. + + return false; + }); +} + +/// Return the "bottom" block of a loop. This differs from +/// MachineLoop::getBottomBlock in that it works even if the loop is +/// discontiguous. +static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) { + MachineBasicBlock *Bottom = Loop->getHeader(); + for (MachineBasicBlock *MBB : Loop->blocks()) + if (MBB->getNumber() > Bottom->getNumber()) + Bottom = MBB; + return Bottom; +} + +/// Sort the blocks in RPO, taking special care to make sure that loops are +/// contiguous even in the case of split backedges. +/// +/// TODO: Determine whether RPO is actually worthwhile, or whether we should +/// move to just a stable-topological-sort-based approach that would preserve +/// more of the original order. +static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { + // Note that we do our own RPO rather than using + // "llvm/ADT/PostOrderIterator.h" because we want control over the order that + // successors are visited in (see above). Also, we can sort the blocks in the + // MachineFunction as we go. 
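+  //
+  // A sketch of the scheme below: run an explicit DFS using POStackEntry
+  // frames; when a block's successor list is exhausted, move that block to
+  // the front of the function. Completing blocks in postorder and placing
+  // each at the front yields a reverse post-order overall.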
+ SmallPtrSet<MachineBasicBlock *, 16> Visited; + SmallVector<POStackEntry, 16> Stack; + + MachineBasicBlock *EntryBlock = &*MF.begin(); + Visited.insert(EntryBlock); + Stack.push_back(POStackEntry(EntryBlock, MF, MLI)); + + for (;;) { + POStackEntry &Entry = Stack.back(); + SmallVectorImpl<MachineBasicBlock *> &Succs = Entry.Succs; + if (!Succs.empty()) { + MachineBasicBlock *Succ = Succs.pop_back_val(); + if (Visited.insert(Succ).second) + Stack.push_back(POStackEntry(Succ, MF, MLI)); + continue; + } + + // Put the block in its position in the MachineFunction. + MachineBasicBlock &MBB = *Entry.MBB; + MBB.moveBefore(&*MF.begin()); + + // Branch instructions may utilize a fallthrough, so update them if a + // fallthrough has been added or removed. + if (!MBB.empty() && MBB.back().isTerminator() && !MBB.back().isBranch() && + !MBB.back().isBarrier()) + report_fatal_error( + "Non-branch terminator with fallthrough cannot yet be rewritten"); + if (MBB.empty() || !MBB.back().isTerminator() || MBB.back().isBranch()) + MBB.updateTerminator(); + + Stack.pop_back(); + if (Stack.empty()) + break; + } + + // Now that we've sorted the blocks in RPO, renumber them. + MF.RenumberBlocks(); + +#ifndef NDEBUG + SmallSetVector<MachineLoop *, 8> OnStack; + + // Insert a sentinel representing the degenerate loop that starts at the + // function entry block and includes the entire function as a "loop" that + // executes once. + OnStack.insert(nullptr); + + for (auto &MBB : MF) { + assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative."); + + MachineLoop *Loop = MLI.getLoopFor(&MBB); + if (Loop && &MBB == Loop->getHeader()) { + // Loop header. The loop predecessor should be sorted above, and the other + // predecessors should be backedges below. + for (auto Pred : MBB.predecessors()) + assert( + (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) && + "Loop header predecessors must be loop predecessors or backedges"); + assert(OnStack.insert(Loop) && "Loops should be declared at most once."); + } else { + // Not a loop header. All predecessors should be sorted above. + for (auto Pred : MBB.predecessors()) + assert(Pred->getNumber() < MBB.getNumber() && + "Non-loop-header predecessors should be topologically sorted"); + assert(OnStack.count(MLI.getLoopFor(&MBB)) && + "Blocks must be nested in their loops"); + } + while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back())) + OnStack.pop_back(); + } + assert(OnStack.pop_back_val() == nullptr && + "The function entry block shouldn't actually be a loop header"); + assert(OnStack.empty() && + "Control flow stack pushes and pops should be balanced."); +#endif +} + +/// Test whether Pred has any terminators explicitly branching to MBB, as +/// opposed to falling through. Note that it's possible (eg. in unoptimized +/// code) for a branch instruction to both branch to a block and fallthrough +/// to it, so we check the actual branch operands to see if there are any +/// explicit mentions. +static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, + MachineBasicBlock *MBB) { + for (MachineInstr &MI : Pred->terminators()) + for (MachineOperand &MO : MI.explicit_operands()) + if (MO.isMBB() && MO.getMBB() == MBB) + return true; + return false; +} + +/// Insert a BLOCK marker for branches to MBB (if needed). 
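+/// An illustrative sketch (block numbers are hypothetical): if BB0 branches
+/// forward to BB2 over BB1, a BLOCK is inserted in BB0 and an END_BLOCK at
+/// the top of BB2, so the branch in BB0 can later be rewritten to target the
+/// end of that scope.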
+static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + const WebAssemblyInstrInfo &TII, + const MachineLoopInfo &MLI, + MachineDominatorTree &MDT) { + // First compute the nearest common dominator of all forward non-fallthrough + // predecessors so that we minimize the time that the BLOCK is on the stack, + // which reduces overall stack height. + MachineBasicBlock *Header = nullptr; + bool IsBranchedTo = false; + int MBBNumber = MBB.getNumber(); + for (MachineBasicBlock *Pred : MBB.predecessors()) + if (Pred->getNumber() < MBBNumber) { + Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred; + if (ExplicitlyBranchesTo(Pred, &MBB)) + IsBranchedTo = true; + } + if (!Header) + return; + if (!IsBranchedTo) + return; + + assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors"); + MachineBasicBlock *LayoutPred = &*prev(MachineFunction::iterator(&MBB)); + + // If the nearest common dominator is inside a more deeply nested context, + // walk out to the nearest scope which isn't more deeply nested. + for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) { + if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) { + if (ScopeTop->getNumber() > Header->getNumber()) { + // Skip over an intervening scope. + I = next(MachineFunction::iterator(ScopeTop)); + } else { + // We found a scope level at an appropriate depth. + Header = ScopeTop; + break; + } + } + } + + // If there's a loop which ends just before MBB which contains Header, we can + // reuse its label instead of inserting a new BLOCK. + for (MachineLoop *Loop = MLI.getLoopFor(LayoutPred); + Loop && Loop->contains(LayoutPred); Loop = Loop->getParentLoop()) + if (Loop && LoopBottom(Loop) == LayoutPred && Loop->contains(Header)) + return; + + // Decide where in Header to put the BLOCK. + MachineBasicBlock::iterator InsertPos; + MachineLoop *HeaderLoop = MLI.getLoopFor(Header); + if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) { + // Header is the header of a loop that does not lexically contain MBB, so + // the BLOCK needs to be above the LOOP. + InsertPos = Header->begin(); + } else { + // Otherwise, insert the BLOCK as late in Header as we can, but before the + // beginning of the local expression tree and any nested BLOCKs. + InsertPos = Header->getFirstTerminator(); + while (InsertPos != Header->begin() && + prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) && + prev(InsertPos)->getOpcode() != WebAssembly::LOOP && + prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK && + prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP) + --InsertPos; + } + + // Add the BLOCK. + BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK)); + + // Mark the end of the block. + InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::END_BLOCK)); + + // Track the farthest-spanning scope that ends at this point. + int Number = MBB.getNumber(); + if (!ScopeTops[Number] || + ScopeTops[Number]->getNumber() > Header->getNumber()) + ScopeTops[Number] = Header; +} + +/// Insert a LOOP marker for a loop starting at MBB (if it's a loop header). 
+static void PlaceLoopMarker( + MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + DenseMap<const MachineInstr *, const MachineBasicBlock *> &LoopTops, + const WebAssemblyInstrInfo &TII, const MachineLoopInfo &MLI) { + MachineLoop *Loop = MLI.getLoopFor(&MBB); + if (!Loop || Loop->getHeader() != &MBB) + return; + + // The operand of a LOOP is the first block after the loop. If the loop is the + // bottom of the function, insert a dummy block at the end. + MachineBasicBlock *Bottom = LoopBottom(Loop); + auto Iter = next(MachineFunction::iterator(Bottom)); + if (Iter == MF.end()) { + MachineBasicBlock *Label = MF.CreateMachineBasicBlock(); + // Give it a fake predecessor so that AsmPrinter prints its label. + Label->addSuccessor(Label); + MF.push_back(Label); + Iter = next(MachineFunction::iterator(Bottom)); + } + MachineBasicBlock *AfterLoop = &*Iter; + + // Mark the beginning of the loop (after the end of any existing loop that + // ends here). + auto InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::LOOP)); + + // Mark the end of the loop. + MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(), + TII.get(WebAssembly::END_LOOP)); + LoopTops[End] = &MBB; + + assert((!ScopeTops[AfterLoop->getNumber()] || + ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) && + "With RPO we should visit the outer-most loop for a block first."); + if (!ScopeTops[AfterLoop->getNumber()]) + ScopeTops[AfterLoop->getNumber()] = &MBB; +} + +static unsigned +GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack, + const MachineBasicBlock *MBB) { + unsigned Depth = 0; + for (auto X : reverse(Stack)) { + if (X == MBB) + break; + ++Depth; + } + assert(Depth < Stack.size() && "Branch destination should be in scope"); + return Depth; +} + +/// Insert LOOP and BLOCK markers at appropriate places. +static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI, + const WebAssemblyInstrInfo &TII, + MachineDominatorTree &MDT) { + // For each block whose label represents the end of a scope, record the block + // which holds the beginning of the scope. This will allow us to quickly skip + // over scoped regions when walking blocks. We allocate one more than the + // number of blocks in the function to accommodate for the possible fake block + // we may insert at the end. + SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1); + + // For eacn LOOP_END, the corresponding LOOP. + DenseMap<const MachineInstr *, const MachineBasicBlock *> LoopTops; + + for (auto &MBB : MF) { + // Place the LOOP for MBB if MBB is the header of a loop. + PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI); + + // Place the BLOCK for MBB if MBB is branched to from above. + PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT); + } + + // Now rewrite references to basic blocks to be depth immediates. 
+ SmallVector<const MachineBasicBlock *, 8> Stack; + for (auto &MBB : reverse(MF)) { + for (auto &MI : reverse(MBB)) { + switch (MI.getOpcode()) { + case WebAssembly::BLOCK: + assert(ScopeTops[Stack.back()->getNumber()] == &MBB && + "Block should be balanced"); + Stack.pop_back(); + break; + case WebAssembly::LOOP: + assert(Stack.back() == &MBB && "Loop top should be balanced"); + Stack.pop_back(); + Stack.pop_back(); + break; + case WebAssembly::END_BLOCK: + Stack.push_back(&MBB); + break; + case WebAssembly::END_LOOP: + Stack.push_back(&MBB); + Stack.push_back(LoopTops[&MI]); + break; + default: + if (MI.isTerminator()) { + // Rewrite MBB operands to be depth immediates. + SmallVector<MachineOperand, 4> Ops(MI.operands()); + while (MI.getNumOperands() > 0) + MI.RemoveOperand(MI.getNumOperands() - 1); + for (auto MO : Ops) { + if (MO.isMBB()) + MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB())); + MI.addOperand(MF, MO); + } + } + break; + } + } + } + assert(Stack.empty() && "Control flow should be balanced"); +} + +bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** CFG Stackifying **********\n" + "********** Function: " + << MF.getName() << '\n'); + + const auto &MLI = getAnalysis<MachineLoopInfo>(); + auto &MDT = getAnalysis<MachineDominatorTree>(); + // Liveness is not tracked for EXPR_STACK physreg. + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + MF.getRegInfo().invalidateLiveness(); + + // RPO sorting needs all loops to be single-entry. + EliminateMultipleEntryLoops(MF, MLI); + + // Sort the blocks in RPO, with contiguous loops. + SortBlocks(MF, MLI); + + // Place the BLOCK and LOOP markers to indicate the beginnings of scopes. + PlaceMarkers(MF, MLI, TII, MDT); + + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp new file mode 100644 index 0000000..1b761b1 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -0,0 +1,81 @@ +//===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the WebAssembly-specific support for the FastISel +/// class. Some of the target-specific code is generated by tablegen in the file +/// WebAssemblyGenFastISel.inc, which is #included here. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblySubtarget.h" +#include "WebAssemblyTargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-fastisel" + +namespace { + +class WebAssemblyFastISel final : public FastISel { + /// Keep a pointer to the WebAssemblySubtarget around so that we can make the + /// right decision when generating code for different targets. + const WebAssemblySubtarget *Subtarget; + LLVMContext *Context; + + // Call handling routines. +private: +public: + // Backend specific FastISel code. + WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) + : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { + Subtarget = &FuncInfo.MF->getSubtarget<WebAssemblySubtarget>(); + Context = &FuncInfo.Fn->getContext(); + } + + bool fastSelectInstruction(const Instruction *I) override; + +#include "WebAssemblyGenFastISel.inc" +}; + +} // end anonymous namespace + +bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: + break; + // TODO: add fast-isel selection cases here... + } + + // Fall back to target-independent instruction selection. + return selectOperator(I, I->getOpcode()); +} + +FastISel *WebAssembly::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) { + return new WebAssemblyFastISel(FuncInfo, LibInfo); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index e4ca82e..0eefd57 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -35,11 +35,20 @@ using namespace llvm; #define DEBUG_TYPE "wasm-frame-info" // TODO: Implement a red zone? +// TODO: wasm64 +// TODO: Prolog/epilog should be stackified too. This pass runs after register +// stackification, so we'll have to do it manually. +// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions /// Return true if the specified function should have a dedicated frame pointer /// register. 
bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const { - llvm_unreachable("TODO: implement hasFP"); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const auto *RegInfo = + MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo(); + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// Under normal circumstances, when a frame pointer is not required, we reserve @@ -52,23 +61,115 @@ bool WebAssemblyFrameLowering::hasReservedCallFrame( return !MF.getFrameInfo()->hasVarSizedObjects(); } + +/// Adjust the stack pointer by a constant amount. +static void adjustStackPointer(unsigned StackSize, + bool AdjustUp, + MachineFunction& MF, + MachineBasicBlock& MBB, + const TargetInstrInfo* TII, + MachineBasicBlock::iterator InsertPt, + const DebugLoc& DL) { + auto &MRI = MF.getRegInfo(); + unsigned SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer"); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), SPReg) + .addExternalSymbol(SPSymbol); + // This MachinePointerInfo should reference __stack_pointer as well but + // doesn't because MachinePointerInfo() takes a GV which we don't have for + // __stack_pointer. TODO: check if PseudoSourceValue::ExternalSymbolCallEntry + // is appropriate instead. (likewise for EmitEpologue below) + auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOLoad, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg) + .addImm(0) + .addReg(SPReg) + .addMemOperand(LoadMMO); + // Add/Subtract the frame size + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(StackSize); + BuildMI(MBB, InsertPt, DL, + TII->get(AdjustUp ? WebAssembly::ADD_I32 : WebAssembly::SUB_I32), + WebAssembly::SP32) + .addReg(SPReg) + .addReg(OffsetReg); + // The SP32 register now has the new stacktop. Also write it back to memory. 
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addExternalSymbol(SPSymbol); + auto *MMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32) + .addImm(0) + .addReg(OffsetReg) + .addReg(WebAssembly::SP32) + .addMemOperand(MMO); +} + void WebAssemblyFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - llvm_unreachable("TODO: implement eliminateCallFramePseudoInstr"); + const auto *TII = + static_cast<const WebAssemblyInstrInfo*>(MF.getSubtarget().getInstrInfo()); + DebugLoc DL = I->getDebugLoc(); + unsigned Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); + unsigned Amount = I->getOperand(0).getImm(); + if (Amount) + adjustStackPointer(Amount, IsDestroy, MF, MBB, + TII, I, DL); + MBB.erase(I); } void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - llvm_unreachable("TODO: implement emitPrologue"); + // TODO: Do ".setMIFlag(MachineInstr::FrameSetup)" on emitted instructions + auto *MFI = MF.getFrameInfo(); + assert(MFI->getCalleeSavedInfo().empty() && + "WebAssembly should not have callee-saved registers"); + assert(!hasFP(MF) && "Functions needing frame pointers not yet supported"); + uint64_t StackSize = MFI->getStackSize(); + if (!StackSize && (!MFI->adjustsStack() || MFI->getMaxCallFrameSize() == 0)) + return; + + const auto *TII = MF.getSubtarget().getInstrInfo(); + + auto InsertPt = MBB.begin(); + DebugLoc DL; + + adjustStackPointer(StackSize, false, MF, MBB, TII, InsertPt, DL); } void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - llvm_unreachable("TODO: implement emitEpilogue"); -} + uint64_t StackSize = MF.getFrameInfo()->getStackSize(); + if (!StackSize) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + auto InsertPt = MBB.getFirstTerminator(); + DebugLoc DL; + + if (InsertPt != MBB.end()) { + DL = InsertPt->getDebugLoc(); + } -void WebAssemblyFrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - llvm_unreachable("TODO: implement processFunctionBeforeCalleeSavedScan"); + // Restore the stack pointer. 
Without FP its value is just SP32 - stacksize + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(StackSize); + auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer"); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), WebAssembly::SP32) + .addReg(WebAssembly::SP32) + .addReg(OffsetReg); + // Re-use OffsetReg to hold the address of the stacktop + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addExternalSymbol(SPSymbol); + auto *MMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32) + .addImm(0) + .addReg(OffsetReg) + .addReg(WebAssembly::SP32) + .addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index 0b112d0..5f4708f 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -38,9 +38,6 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def new file mode 100644 index 0000000..3a03fa5 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -0,0 +1,25 @@ +//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file describes the various WebAssembly ISD node types. +/// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! + +HANDLE_NODETYPE(CALL1) +HANDLE_NODETYPE(CALL0) +HANDLE_NODETYPE(RETURN) +HANDLE_NODETYPE(ARGUMENT) +HANDLE_NODETYPE(Wrapper) +HANDLE_NODETYPE(BR_IF) +HANDLE_NODETYPE(TABLESWITCH) + +// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 518ef33..8390f79 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -56,13 +56,68 @@ public: SDNode *Select(SDNode *Node) override; + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; + +// Include the pieces autogenerated from the target description. +#include "WebAssemblyGenDAGISel.inc" + private: // add select functions here... }; } // end anonymous namespace SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) { - llvm_unreachable("TODO: implement Select"); + // Dump information about the Node being selected. + DEBUG(errs() << "Selecting: "); + DEBUG(Node->dump(CurDAG)); + DEBUG(errs() << "\n"); + + // If we have a custom node, we already have selected! 
+ if (Node->isMachineOpcode()) { + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); + Node->setNodeId(-1); + return nullptr; + } + + // Few custom selection stuff. + SDNode *ResNode = nullptr; + EVT VT = Node->getValueType(0); + + switch (Node->getOpcode()) { + default: + break; + // If we need WebAssembly-specific selection, it would go here. + (void)VT; + } + + // Select the default instruction. + ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == nullptr || ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); + + return ResNode; +} + +bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { + switch (ConstraintID) { + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + // We just support simple memory operands that just have a single address + // operand and need no special handling. + OutOps.push_back(Op); + return false; + default: + break; + } + + return true; } /// This pass converts a legalized DAG into a WebAssembly-specific DAG, ready diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4184eb6..e9933b0 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -17,10 +17,13 @@ #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyTargetMachine.h" -#include "WebAssemblyTargetObjectFile.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" @@ -32,14 +35,254 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" +namespace { +// Diagnostic information for unimplemented or unsupported feature reporting. +// TODO: This code is copied from BPF and AMDGPU; consider factoring it out +// and sharing code. +class DiagnosticInfoUnsupported final : public DiagnosticInfo { +private: + // Debug location where this diagnostic is triggered. 
+ DebugLoc DLoc; + const Twine &Description; + const Function &Fn; + SDValue Value; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc, + SDValue Value) + : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()), + Description(Desc), Fn(Fn), Value(Value) {} + + void print(DiagnosticPrinter &DP) const override { + std::string Str; + raw_string_ostream OS(Str); + + if (DLoc) { + auto DIL = DLoc.get(); + StringRef Filename = DIL->getFilename(); + unsigned Line = DIL->getLine(); + unsigned Column = DIL->getColumn(); + OS << Filename << ':' << Line << ':' << Column << ' '; + } + + OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n' + << Description; + if (Value) + Value->print(OS); + OS << '\n'; + OS.flush(); + DP << Str; + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} // end anonymous namespace + WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32; + + // Booleans always contain 0 or 1. + setBooleanContents(ZeroOrOneBooleanContent); // WebAssembly does not produce floating-point exceptions on normal floating // point operations. setHasFloatingPointExceptions(false); // We don't know the microarchitecture here, so just reduce register pressure. setSchedulingPreference(Sched::RegPressure); + // Tell ISel that we have a stack pointer. + setStackPointerRegisterToSaveRestore( + Subtarget->hasAddr64() ? WebAssembly::SP64 : WebAssembly::SP32); + // Set up the register classes. + addRegisterClass(MVT::i32, &WebAssembly::I32RegClass); + addRegisterClass(MVT::i64, &WebAssembly::I64RegClass); + addRegisterClass(MVT::f32, &WebAssembly::F32RegClass); + addRegisterClass(MVT::f64, &WebAssembly::F64RegClass); + // Compute derived properties from the register classes. + computeRegisterProperties(Subtarget->getRegisterInfo()); + + setOperationAction(ISD::GlobalAddress, MVTPtr, Custom); + setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom); + setOperationAction(ISD::JumpTable, MVTPtr, Custom); + + // Take the default expansion for va_arg, va_copy, and va_end. There is no + // default action for va_start, so we do that custom. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + + for (auto T : {MVT::f32, MVT::f64}) { + // Don't expand the floating-point types to constant pools. + setOperationAction(ISD::ConstantFP, T, Legal); + // Expand floating-point comparisons. + for (auto CC : {ISD::SETO, ISD::SETUO, ISD::SETUEQ, ISD::SETONE, + ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE}) + setCondCodeAction(CC, T, Expand); + // Expand floating-point library function operators. + for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW, + ISD::FREM, ISD::FMA}) + setOperationAction(Op, T, Expand); + // Note supported floating-point library function operators that otherwise + // default to expand. 
+ for (auto Op : + {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT}) + setOperationAction(Op, T, Legal); + // Support minnan and maxnan, which otherwise default to expand. + setOperationAction(ISD::FMINNAN, T, Legal); + setOperationAction(ISD::FMAXNAN, T, Legal); + } + + for (auto T : {MVT::i32, MVT::i64}) { + // Expand unavailable integer operations. + for (auto Op : + {ISD::BSWAP, ISD::ROTL, ISD::ROTR, ISD::SMUL_LOHI, ISD::UMUL_LOHI, + ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, + ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, + ISD::SUBE}) { + setOperationAction(Op, T, Expand); + } + } + + // As a special case, these operators use the type to mean the type to + // sign-extend from. + for (auto T : {MVT::i1, MVT::i8, MVT::i16, MVT::i32}) + setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand); + + // Dynamic stack allocation: use the default expansion. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand); + + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // Expand these forms; we pattern-match the forms that we can handle in isel. + for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) + for (auto Op : {ISD::BR_CC, ISD::SELECT_CC}) + setOperationAction(Op, T, Expand); + + // We have custom switch handling. + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + + // WebAssembly doesn't have: + // - Floating-point extending loads. + // - Floating-point truncating stores. + // - i1 extending loads. + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + for (auto T : MVT::integer_valuetypes()) + for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}) + setLoadExtAction(Ext, T, MVT::i1, Promote); + + // Trap lowers to wasm unreachable + setOperationAction(ISD::TRAP, MVT::Other, Legal); +} + +FastISel *WebAssemblyTargetLowering::createFastISel( + FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const { + return WebAssembly::createFastISel(FuncInfo, LibInfo); +} + +bool WebAssemblyTargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode * /*GA*/) const { + // All offsets can be folded. + return true; +} + +MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/, + EVT VT) const { + unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1); + if (BitWidth > 1 && BitWidth < 8) + BitWidth = 8; + + if (BitWidth > 64) { + BitWidth = 64; + assert(BitWidth >= Log2_32_Ceil(VT.getSizeInBits()) && + "64-bit shift counts ought to be enough for anyone"); + } + + MVT Result = MVT::getIntegerVT(BitWidth); + assert(Result != MVT::INVALID_SIMPLE_VALUE_TYPE && + "Unable to represent scalar shift amount type"); + return Result; +} + +const char * +WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { + case WebAssemblyISD::FIRST_NUMBER: + break; +#define HANDLE_NODETYPE(NODE) \ + case WebAssemblyISD::NODE: \ + return "WebAssemblyISD::" #NODE; +#include "WebAssemblyISD.def" +#undef HANDLE_NODETYPE + } + return nullptr; +} + +std::pair<unsigned, const TargetRegisterClass *> +WebAssemblyTargetLowering::getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + // First, see if this is a constraint that directly corresponds to a + // WebAssembly register class. 
+ if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + assert(VT != MVT::iPTR && "Pointer MVT not expected here"); + if (VT.isInteger() && !VT.isVector()) { + if (VT.getSizeInBits() <= 32) + return std::make_pair(0U, &WebAssembly::I32RegClass); + if (VT.getSizeInBits() <= 64) + return std::make_pair(0U, &WebAssembly::I64RegClass); + } + break; + default: + break; + } + } + + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const { + // Assume ctz is a relatively cheap operation. + return true; +} + +bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const { + // Assume clz is a relatively cheap operation. + return true; +} + +bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, + Type *Ty, + unsigned AS) const { + // WebAssembly offsets are added as unsigned without wrapping. The + // isLegalAddressingMode gives us no way to determine if wrapping could be + // happening, so we approximate this by accepting only non-negative offsets. + if (AM.BaseOffs < 0) + return false; + + // WebAssembly has no scale register operands. + if (AM.Scale != 0) + return false; + + // Everything else is legal. + return true; } //===----------------------------------------------------------------------===// @@ -50,16 +293,367 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Lowering Code //===----------------------------------------------------------------------===// +static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue())); +} + +// Test whether the given calling convention is supported. +static bool CallingConvSupported(CallingConv::ID CallConv) { + // We currently support the language-independent target-independent + // conventions. We don't yet have a way to annotate calls with properties like + // "cold", and we don't have any call-clobbered registers, so these are mostly + // all handled the same. + return CallConv == CallingConv::C || CallConv == CallingConv::Fast || + CallConv == CallingConv::Cold || + CallConv == CallingConv::PreserveMost || + CallConv == CallingConv::PreserveAll || + CallConv == CallingConv::CXX_FAST_TLS; +} + +SDValue +WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc DL = CLI.DL; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + MachineFunction &MF = DAG.getMachineFunction(); + + CallingConv::ID CallConv = CLI.CallConv; + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, + "WebAssembly doesn't support language-specific or target-specific " + "calling conventions yet"); + if (CLI.IsPatchPoint) + fail(DL, DAG, "WebAssembly doesn't support patch point yet"); + + // WebAssembly doesn't currently support explicit tail calls. If they are + // required, fail. Otherwise, just disable them. 
+ if ((CallConv == CallingConv::Fast && CLI.IsTailCall && + MF.getTarget().Options.GuaranteedTailCallOpt) || + (CLI.CS && CLI.CS->isMustTailCall())) + fail(DL, DAG, "WebAssembly doesn't support tail call yet"); + CLI.IsTailCall = false; + + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + if (Ins.size() > 1) + fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet"); + + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + for (const ISD::OutputArg &Out : Outs) { + if (Out.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (Out.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + } + + bool IsVarArg = CLI.IsVarArg; + unsigned NumFixedArgs = CLI.NumFixedArgs; + auto PtrVT = getPointerTy(MF.getDataLayout()); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + if (IsVarArg) { + // Outgoing non-fixed arguments are placed at the top of the stack. First + // compute their offsets and the total amount of argument stack space + // needed. + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + EVT VT = Arg.getValueType(); + assert(VT != MVT::iPTR && "Legalized args should be concrete"); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + unsigned Offset = + CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty), + MF.getDataLayout().getABITypeAlignment(Ty)); + CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(), + Offset, VT.getSimpleVT(), + CCValAssign::Full)); + } + } + + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + + SDValue NB; + if (NumBytes) { + NB = DAG.getConstant(NumBytes, DL, PtrVT, true); + Chain = DAG.getCALLSEQ_START(Chain, NB, DL); + } + + if (IsVarArg) { + // For non-fixed arguments, next emit stores to store the argument values + // to the stack at the offsets computed above. + SDValue SP = DAG.getCopyFromReg( + Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT); + unsigned ValNo = 0; + SmallVector<SDValue, 8> Chains; + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + assert(ArgLocs[ValNo].getValNo() == ValNo && + "ArgLocs should remain in order and only hold varargs args"); + unsigned Offset = ArgLocs[ValNo++].getLocMemOffset(); + SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP, + DAG.getConstant(Offset, DL, PtrVT)); + Chains.push_back(DAG.getStore(Chain, DL, Arg, Add, + MachinePointerInfo::getStack(MF, Offset), + false, false, 0)); + } + if (!Chains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + // Compute the operands for the CALLn node. + SmallVector<SDValue, 16> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs + // isn't reliable. + Ops.append(OutVals.begin(), + IsVarArg ? 
OutVals.begin() + NumFixedArgs : OutVals.end()); + + SmallVector<EVT, 8> Tys; + for (const auto &In : Ins) { + assert(!In.Flags.isByVal() && "byval is not valid for return values"); + assert(!In.Flags.isNest() && "nest is not valid for return values"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca return values"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs return values"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, + "WebAssembly hasn't implemented cons regs last return values"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. + Tys.push_back(In.VT); + } + Tys.push_back(MVT::Other); + SDVTList TyList = DAG.getVTList(Tys); + SDValue Res = + DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1, + DL, TyList, Ops); + if (Ins.empty()) { + Chain = Res; + } else { + InVals.push_back(Res); + Chain = Res.getValue(1); + } + + if (NumBytes) { + SDValue Unused = DAG.getTargetConstant(0, DL, PtrVT); + Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL); + } + + return Chain; +} + +bool WebAssemblyTargetLowering::CanLowerReturn( + CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext & /*Context*/) const { + // WebAssembly can't currently handle returning tuples. + return Outs.size() <= 1; +} + +SDValue WebAssemblyTargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, + SelectionDAG &DAG) const { + assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); + + SmallVector<SDValue, 4> RetOps(1, Chain); + RetOps.append(OutVals.begin(), OutVals.end()); + Chain = DAG.getNode(WebAssemblyISD::RETURN, DL, MVT::Other, RetOps); + + // Record the number and types of the return values. + for (const ISD::OutputArg &Out : Outs) { + assert(!Out.Flags.isByVal() && "byval is not valid for return values"); + assert(!Out.Flags.isNest() && "nest is not valid for return values"); + assert(Out.IsFixed && "non-fixed return value is not valid"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results"); + } + + return Chain; +} + +SDValue WebAssemblyTargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); + + // Set up the incoming ARGUMENTS value, which serves to represent the liveness + // of the incoming values before they're represented by virtual registers. 
+ MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS); + + for (const ISD::InputArg &In : Ins) { + if (In.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (In.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. + InVals.push_back( + In.Used + ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT, + DAG.getTargetConstant(InVals.size(), DL, MVT::i32)) + : DAG.getUNDEF(In.VT)); + + // Record the number and types of arguments. + MF.getInfo<WebAssemblyFunctionInfo>()->addParam(In.VT); + } + + // Incoming varargs arguments are on the stack and will be accessed through + // va_arg, so we don't need to do anything for them here. + + return Chain; +} + //===----------------------------------------------------------------------===// -// Other Lowering Code +// Custom lowering hooks. //===----------------------------------------------------------------------===// +SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operation lowering"); + return SDValue(); + case ISD::FrameIndex: + return LowerFrameIndex(Op, DAG); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::ExternalSymbol: + return LowerExternalSymbol(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::BR_JT: + return LowerBR_JT(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + } +} + +SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + int FI = cast<FrameIndexSDNode>(Op)->getIndex(); + return DAG.getTargetFrameIndex(FI, Op.getValueType()); +} + +SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const auto *GA = cast<GlobalAddressSDNode>(Op); + EVT VT = Op.getValueType(); + assert(GA->getTargetFlags() == 0 && + "Unexpected target flags on generic GlobalAddressSDNode"); + if (GA->getAddressSpace() != 0) + fail(DL, DAG, "WebAssembly only expects the 0 address space"); + return DAG.getNode( + WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset())); +} + +SDValue +WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const auto *ES = cast<ExternalSymbolSDNode>(Op); + EVT VT = Op.getValueType(); + assert(ES->getTargetFlags() == 0 && + "Unexpected target flags on generic ExternalSymbolSDNode"); + // Set the TargetFlags to 0x1 which indicates that this is a "function" + // symbol rather than a data symbol. We do this unconditionally even though + // we don't know anything about the symbol other than its name, because all + // external symbols used in target-independent SelectionDAG code are for + // functions. 
+ return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetExternalSymbol(ES->getSymbol(), VT, + /*TargetFlags=*/0x1)); +} + +SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // There's no need for a Wrapper node because we always incorporate a jump + // table operand into a TABLESWITCH instruction, rather than ever + // materializing it in a register. + const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(), + JT->getTargetFlags()); +} + +SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1)); + SDValue Index = Op.getOperand(2); + assert(JT->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(Index); + + MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo(); + const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs; + + // TODO: For now, we just pick something arbitrary for a default case for now. + // We really want to sniff out the guard and put in the real default case (and + // delete the guard). + Ops.push_back(DAG.getBasicBlock(MBBs[0])); + + // Add an operand for each case. + for (auto MBB : MBBs) + Ops.push_back(DAG.getBasicBlock(MBB)); + + return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops); +} + +SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout()); + + // The incoming non-fixed arguments are placed on the top of the stack, with + // natural alignment, at the point of the call, so the base pointer is just + // the current frame pointer. + DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true); + unsigned FP = + Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction()); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + //===----------------------------------------------------------------------===// // WebAssembly Optimization Hooks //===----------------------------------------------------------------------===// - -MCSection *WebAssemblyTargetObjectFile::SelectSectionForGlobal( - const GlobalValue *GV, SectionKind Kind, Mangler &Mang, - const TargetMachine &TM) const { - return getDataSection(); -} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index efd60a7..e7232a0 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -22,10 +22,11 @@ namespace llvm { namespace WebAssemblyISD { -enum { +enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... +#define HANDLE_NODETYPE(NODE) NODE, +#include "WebAssemblyISD.def" +#undef HANDLE_NODETYPE }; } // end namespace WebAssemblyISD @@ -42,8 +43,51 @@ private: /// Keep a pointer to the WebAssemblySubtarget around so that we can make the /// right decision when generating code for different targets. 
const WebAssemblySubtarget *Subtarget; + + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) const override; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; + const char *getTargetNodeName(unsigned Opcode) const override; + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, + unsigned AS) const override; + + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool IsVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + // Custom lowering hooks. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; }; +namespace WebAssembly { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 6b5b6cd..cfa1519 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -12,10 +12,63 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * - * call_direct: call function directly - * call_indirect: call function indirectly - * addressof: obtain a function pointer value for a given function - */ +// TODO: addr64: These currently assume the callee address is 32-bit. + +let Defs = [ARGUMENTS] in { + +// Call sequence markers. These have an immediate which represents the amount of +// stack space to allocate or free, which is used for varargs lowering. 
+let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), + [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2), + [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>; +} // isCodeGenOnly = 1 + +multiclass CALL<WebAssemblyRegClass vt, string prefix> { + def CALL_#vt : I<(outs vt:$dst), (ins i32imm:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))], + !strconcat(prefix, "call\t$dst, $callee")>; + def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 I32:$callee))], + !strconcat(prefix, "call_indirect\t$dst, $callee")>; +} +let Uses = [SP32, SP64], isCall = 1 in { + defm : CALL<I32, "i32.">; + defm : CALL<I64, "i64.">; + defm : CALL<F32, "f32.">; + defm : CALL<F64, "f64.">; + + def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops), + [(WebAssemblycall0 (i32 imm:$callee))], + "call \t$callee">; + def CALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops), + [(WebAssemblycall0 I32:$callee)], + "call_indirect\t$callee">; +} // Uses = [SP32,SP64], isCall = 1 + +} // Defs = [ARGUMENTS] + +// Patterns for matching a direct call to a global address. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I32 tglobaladdr:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I64 tglobaladdr:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F32 tglobaladdr:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F64 tglobaladdr:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)), + (CALL_VOID tglobaladdr:$callee)>; + +// Patterns for matching a direct call to an external symbol. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I32 texternalsym:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I64 texternalsym:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F32 texternalsym:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F64 texternalsym:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)), + (CALL_VOID texternalsym:$callee)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td new file mode 100644 index 0000000..fda9595 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -0,0 +1,87 @@ +//===- WebAssemblyInstrControl.td-WebAssembly control-flow ------*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly control-flow code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +let Defs = [ARGUMENTS] in { + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { +// The condition operand is a boolean value which WebAssembly represents as i32. 
+def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst), + [(brcond I32:$cond, bb:$dst)], + "br_if \t$cond, $dst">; +let isCodeGenOnly = 1 in +def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [], + "br_unless\t$cond, $dst">; +let isBarrier = 1 in { +def BR : I<(outs), (ins bb_op:$dst), + [(br bb:$dst)], + "br \t$dst">; +} // isBarrier = 1 +} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 + +} // Defs = [ARGUMENTS] + +def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), + (BR_IF I32:$cond, bb_op:$dst)>; +def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), + (BR_UNLESS I32:$cond, bb_op:$dst)>; + +let Defs = [ARGUMENTS] in { + +// TODO: SelectionDAG's lowering insists on using a pointer as the index for +// jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode +// currently. +// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates. +// Set TSFlags{1} to 1 to indicate that the immediates represent labels. +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I32:$index, bb:$default)], + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} +def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I64:$index, bb:$default)], + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +// Placemarkers to indicate the start or end of a block or loop scope. These +// use/clobber EXPR_STACK to prevent them from being moved into the middle of +// an expression tree. +let Uses = [EXPR_STACK], Defs = [EXPR_STACK] in { +def BLOCK : I<(outs), (ins), [], "block">; +def LOOP : I<(outs), (ins), [], "loop">; +def END_BLOCK : I<(outs), (ins), [], "end_block">; +def END_LOOP : I<(outs), (ins), [], "end_loop">; +} // Uses = [EXPR_STACK], Defs = [EXPR_STACK] + +multiclass RETURN<WebAssemblyRegClass vt> { + def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)], + "return \t$val">; +} + +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +let isReturn = 1 in { + defm : RETURN<I32>; + defm : RETURN<I64>; + defm : RETURN<F32>; + defm : RETURN<F64>; + def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">; +} // isReturn = 1 + def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable">; +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 3fa2906..931f4a9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -13,32 +13,99 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. 
- * - * int32.wrap[int64]: wrap a 64-bit integer to a 32-bit integer - * int32.trunc_signed[float32]: truncate a 32-bit float to a signed 32-bit integer - * int32.trunc_signed[float64]: truncate a 64-bit float to a signed 32-bit integer - * int32.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 32-bit integer - * int32.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 32-bit integer - * int32.reinterpret[float32]: reinterpret the bits of a 32-bit float as a 32-bit integer - * int64.extend_signed[int32]: extend a signed 32-bit integer to a 64-bit integer - * int64.extend_unsigned[int32]: extend an unsigned 32-bit integer to a 64-bit integer - * int64.trunc_signed[float32]: truncate a 32-bit float to a signed 64-bit integer - * int64.trunc_signed[float64]: truncate a 64-bit float to a signed 64-bit integer - * int64.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 64-bit integer - * int64.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 64-bit integer - * int64.reinterpret[float64]: reinterpret the bits of a 64-bit float as a 64-bit integer - * float32.demote[float64]: demote a 64-bit float to a 32-bit float - * float32.cvt_signed[int32]: convert a signed 32-bit integer to a 32-bit float - * float32.cvt_signed[int64]: convert a signed 64-bit integer to a 32-bit float - * float32.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 32-bit float - * float32.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 32-bit float - * float32.reinterpret[int32]: reinterpret the bits of a 32-bit integer as a 32-bit float - * float64.promote[float32]: promote a 32-bit float to a 64-bit float - * float64.cvt_signed[int32]: convert a signed 32-bit integer to a 64-bit float - * float64.cvt_signed[int64]: convert a signed 64-bit integer to a 64-bit float - * float64.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 64-bit float - * float64.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 64-bit float - * float64.reinterpret[int64]: reinterpret the bits of a 64-bit integer as a 64-bit float - */ +let Defs = [ARGUMENTS] in { + +def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), + [(set I32:$dst, (trunc I64:$src))], + "i32.wrap/i64\t$dst, $src">; + +def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src), + [(set I64:$dst, (sext I32:$src))], + "i64.extend_s/i32\t$dst, $src">; +def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src), + [(set I64:$dst, (zext I32:$src))], + "i64.extend_u/i32\t$dst, $src">; + +} // defs = [ARGUMENTS] + +// Expand a "don't care" extend into zero-extend (chosen over sign-extend +// somewhat arbitrarily, although it favors popular hardware architectures +// and is conceptually a simpler operation). +def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>; + +let Defs = [ARGUMENTS] in { + +// Conversion from floating point to integer traps on overflow and invalid. 
+let hasSideEffects = 1 in { +def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (fp_to_sint F32:$src))], + "i32.trunc_s/f32\t$dst, $src">; +def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (fp_to_uint F32:$src))], + "i32.trunc_u/f32\t$dst, $src">; +def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src), + [(set I64:$dst, (fp_to_sint F32:$src))], + "i64.trunc_s/f32\t$dst, $src">; +def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src), + [(set I64:$dst, (fp_to_uint F32:$src))], + "i64.trunc_u/f32\t$dst, $src">; +def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src), + [(set I32:$dst, (fp_to_sint F64:$src))], + "i32.trunc_s/f64\t$dst, $src">; +def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src), + [(set I32:$dst, (fp_to_uint F64:$src))], + "i32.trunc_u/f64\t$dst, $src">; +def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (fp_to_sint F64:$src))], + "i64.trunc_s/f64\t$dst, $src">; +def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (fp_to_uint F64:$src))], + "i64.trunc_u/f64\t$dst, $src">; +} // hasSideEffects = 1 + +def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (sint_to_fp I32:$src))], + "f32.convert_s/i32\t$dst, $src">; +def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (uint_to_fp I32:$src))], + "f32.convert_u/i32\t$dst, $src">; +def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src), + [(set F64:$dst, (sint_to_fp I32:$src))], + "f64.convert_s/i32\t$dst, $src">; +def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src), + [(set F64:$dst, (uint_to_fp I32:$src))], + "f64.convert_u/i32\t$dst, $src">; +def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src), + [(set F32:$dst, (sint_to_fp I64:$src))], + "f32.convert_s/i64\t$dst, $src">; +def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src), + [(set F32:$dst, (uint_to_fp I64:$src))], + "f32.convert_u/i64\t$dst, $src">; +def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (sint_to_fp I64:$src))], + "f64.convert_s/i64\t$dst, $src">; +def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (uint_to_fp I64:$src))], + "f64.convert_u/i64\t$dst, $src">; + +def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src), + [(set F64:$dst, (fextend F32:$src))], + "f64.promote/f32\t$dst, $src">; +def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src), + [(set F32:$dst, (fround F64:$src))], + "f32.demote/f64\t$dst, $src">; + +def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (bitconvert F32:$src))], + "i32.reinterpret/f32\t$dst, $src">; +def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (bitconvert I32:$src))], + "f32.reinterpret/i32\t$dst, $src">; +def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (bitconvert F64:$src))], + "i64.reinterpret/f64\t$dst, $src">; +def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (bitconvert I64:$src))], + "f64.reinterpret/i64\t$dst, $src">; + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index 30ef633..5520c6d 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -12,33 +12,90 @@ /// //===----------------------------------------------------------------------===// -defm FADD : BinaryFP<fadd>; -defm 
FSUB : BinaryFP<fsub>; -defm FMUL : BinaryFP<fmul>; -defm FDIV : BinaryFP<fdiv>; -defm FABS : UnaryFP<fabs>; -defm FNEG : UnaryFP<fneg>; -defm COPYSIGN : BinaryFP<fcopysign>; -defm CEIL : UnaryFP<fceil>; -defm FLOOR : UnaryFP<ffloor>; -defm TRUNC : UnaryFP<ftrunc>; -defm NEARESTINT : UnaryFP<fnearbyint>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * float32.eq: compare equal - * float32.lt: less than - * float32.le: less than or equal - * float32.gt: greater than - * float32.ge: greater than or equal - */ - -defm SQRT : UnaryFP<fsqrt>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * float32.min: minimum (binary operator); if either operand is NaN, returns NaN - * float32.max: maximum (binary operator); if either operand is NaN, returns NaN - */ +let Defs = [ARGUMENTS] in { + +let isCommutable = 1 in +defm ADD : BinaryFP<fadd, "add ">; +defm SUB : BinaryFP<fsub, "sub ">; +let isCommutable = 1 in +defm MUL : BinaryFP<fmul, "mul ">; +defm DIV : BinaryFP<fdiv, "div ">; +defm SQRT : UnaryFP<fsqrt, "sqrt">; + +defm ABS : UnaryFP<fabs, "abs ">; +defm NEG : UnaryFP<fneg, "neg ">; +defm COPYSIGN : BinaryFP<fcopysign, "copysign">; + +let isCommutable = 1 in { +defm MIN : BinaryFP<fminnan, "min ">; +defm MAX : BinaryFP<fmaxnan, "max ">; +} // isCommutable = 1 + +defm CEIL : UnaryFP<fceil, "ceil">; +defm FLOOR : UnaryFP<ffloor, "floor">; +defm TRUNC : UnaryFP<ftrunc, "trunc">; +defm NEAREST : UnaryFP<fnearbyint, "nearest">; + +} // Defs = [ARGUMENTS] + +// DAGCombine oddly folds casts into the rhs of copysign. Unfold them. +def : Pat<(fcopysign F64:$lhs, F32:$rhs), + (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>; +def : Pat<(fcopysign F32:$lhs, F64:$rhs), + (COPYSIGN_F32 F32:$lhs, (F32_DEMOTE_F64 F64:$rhs))>; + +// WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. +def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>; +def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>; + +let Defs = [ARGUMENTS] in { + +let isCommutable = 1 in { +defm EQ : ComparisonFP<SETOEQ, "eq ">; +defm NE : ComparisonFP<SETUNE, "ne ">; +} // isCommutable = 1 +defm LT : ComparisonFP<SETOLT, "lt ">; +defm LE : ComparisonFP<SETOLE, "le ">; +defm GT : ComparisonFP<SETOGT, "gt ">; +defm GE : ComparisonFP<SETOGE, "ge ">; + +} // Defs = [ARGUMENTS] + +// Don't care floating-point comparisons, supported via other comparisons. 
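+// (Editor's illustrative note, not part of the original patch: the plain
+// condition codes matched below, e.g. (seteq x, y), leave NaN behavior
+// unspecified, so it is legal to map them onto the ordered eq/lt/le/gt/ge
+// and unordered ne instructions defined above; NaN operands simply take
+// whichever result that opcode produces.)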
+def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setlt f32:$lhs, f32:$rhs), (LT_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setle f32:$lhs, f32:$rhs), (LE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setgt f32:$lhs, f32:$rhs), (GT_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setge f32:$lhs, f32:$rhs), (GE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(seteq f64:$lhs, f64:$rhs), (EQ_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setne f64:$lhs, f64:$rhs), (NE_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setlt f64:$lhs, f64:$rhs), (LT_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>; + +let Defs = [ARGUMENTS] in { + +def SELECT_F32 : I<(outs F32:$dst), (ins I32:$cond, F32:$lhs, F32:$rhs), + [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))], + "f32.select\t$dst, $cond, $lhs, $rhs">; +def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs), + [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))], + "f64.select\t$dst, $cond, $lhs, $rhs">; + +} // Defs = [ARGUMENTS] + +// ISD::SELECT requires its operand to conform to getBooleanContents, but +// WebAssembly's select interprets any non-zero value as true, so we can fold +// a setne with 0 into a select. +def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs), + (SELECT_F32 I32:$cond, F32:$lhs, F32:$rhs)>; +def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs), + (SELECT_F64 I32:$cond, F64:$lhs, F64:$rhs)>; + +// And again, this time with seteq instead of setne and the arms reversed. +def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs), + (SELECT_F32 I32:$cond, F32:$rhs, F32:$lhs)>; +def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs), + (SELECT_F64 I32:$cond, F64:$rhs, F64:$lhs)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 513c36f..8008dd3 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -1,4 +1,4 @@ -// WebAssemblyInstrFormats.td - WebAssembly Instruction Formats -*- tblgen -*-// +//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -12,44 +12,68 @@ /// //===----------------------------------------------------------------------===// -// WebAssembly Instruction Format -class WebAssemblyInst<string cstr> : Instruction { +// WebAssembly Instruction Format. +class WebAssemblyInst<string asmstr> : Instruction { field bits<0> Inst; // Instruction encoding. let Namespace = "WebAssembly"; let Pattern = []; - let Constraints = cstr; + let AsmString = asmstr; } -// Normal instructions -class I<dag oops, dag iops, list<dag> pattern, string cstr = ""> - : WebAssemblyInst<cstr> { +// Normal instructions. +class I<dag oops, dag iops, list<dag> pattern, string asmstr = ""> + : WebAssemblyInst<asmstr> { dag OutOperandList = oops; dag InOperandList = iops; let Pattern = pattern; } // Unary and binary instructions, for the local types that WebAssembly supports. 
-multiclass UnaryInt<SDNode node> { - def _I32 : I<(outs Int32:$dst), (ins Int32:$src), - [(set Int32:$dst, (node Int32:$src))]>; - def _I64 : I<(outs Int64:$dst), (ins Int64:$src), - [(set Int64:$dst, (node Int64:$src))]>; -} -multiclass BinaryInt<SDNode node> { - def _I32 : I<(outs Int32:$dst), (ins Int32:$lhs, Int32:$rhs), - [(set Int32:$dst, (node Int32:$lhs, Int32:$rhs))]>; - def _I64 : I<(outs Int64:$dst), (ins Int64:$lhs, Int64:$rhs), - [(set Int64:$dst, (node Int64:$lhs, Int64:$rhs))]>; -} -multiclass UnaryFP<SDNode node> { - def _F32 : I<(outs Float32:$dst), (ins Float32:$src), - [(set Float32:$dst, (node Float32:$src))]>; - def _F64 : I<(outs Float64:$dst), (ins Float64:$src), - [(set Float64:$dst, (node Float64:$src))]>; -} -multiclass BinaryFP<SDNode node> { - def _F32 : I<(outs Float32:$dst), (ins Float32:$lhs, Float32:$rhs), - [(set Float32:$dst, (node Float32:$lhs, Float32:$rhs))]>; - def _F64 : I<(outs Float64:$dst), (ins Float64:$lhs, Float64:$rhs), - [(set Float64:$dst, (node Float64:$lhs, Float64:$rhs))]>; +multiclass UnaryInt<SDNode node, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$src), + [(set I32:$dst, (node I32:$src))], + !strconcat("i32.", !strconcat(name, "\t$dst, $src"))>; + def _I64 : I<(outs I64:$dst), (ins I64:$src), + [(set I64:$dst, (node I64:$src))], + !strconcat("i64.", !strconcat(name, "\t$dst, $src"))>; +} +multiclass BinaryInt<SDNode node, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), + [(set I32:$dst, (node I32:$lhs, I32:$rhs))], + !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs), + [(set I64:$dst, (node I64:$lhs, I64:$rhs))], + !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass UnaryFP<SDNode node, string name> { + def _F32 : I<(outs F32:$dst), (ins F32:$src), + [(set F32:$dst, (node F32:$src))], + !strconcat("f32.", !strconcat(name, "\t$dst, $src"))>; + def _F64 : I<(outs F64:$dst), (ins F64:$src), + [(set F64:$dst, (node F64:$src))], + !strconcat("f64.", !strconcat(name, "\t$dst, $src"))>; +} +multiclass BinaryFP<SDNode node, string name> { + def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs), + [(set F32:$dst, (node F32:$lhs, F32:$rhs))], + !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs), + [(set F64:$dst, (node F64:$lhs, F64:$rhs))], + !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass ComparisonInt<CondCode cond, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), + [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))], + !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs), + [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))], + !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass ComparisonFP<CondCode cond, string name> { + def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs), + [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))], + !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs), + [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))], + !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index ea8937c..028e9af 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ 
b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -24,5 +24,145 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-instr-info"
 
+#define GET_INSTRINFO_CTOR_DTOR
+#include "WebAssemblyGenInstrInfo.inc"
+
 WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
-    : RI(STI.getTargetTriple()) {}
+    : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
+                              WebAssembly::ADJCALLSTACKUP),
+      RI(STI.getTargetTriple()) {}
+
+void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       DebugLoc DL, unsigned DestReg,
+                                       unsigned SrcReg, bool KillSrc) const {
+  // This method is called by post-RA expansion, which expects only physical
+  // registers ("pregs") to exist. However, we need to handle both here.
+  auto &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+      TargetRegisterInfo::isVirtualRegister(DestReg)
+          ? MRI.getRegClass(DestReg)
+          : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(SrcReg);
+
+  unsigned CopyLocalOpcode;
+  if (RC == &WebAssembly::I32RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_I32;
+  else if (RC == &WebAssembly::I64RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_I64;
+  else if (RC == &WebAssembly::F32RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_F32;
+  else if (RC == &WebAssembly::F64RegClass)
+    CopyLocalOpcode = WebAssembly::COPY_LOCAL_F64;
+  else
+    llvm_unreachable("Unexpected register class");
+
+  BuildMI(MBB, I, DL, get(CopyLocalOpcode), DestReg)
+      .addReg(SrcReg, KillSrc ? RegState::Kill : 0);
+}
+
+// Branch analysis.
+bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                         MachineBasicBlock *&TBB,
+                                         MachineBasicBlock *&FBB,
+                                         SmallVectorImpl<MachineOperand> &Cond,
+                                         bool /*AllowModify*/) const {
+  bool HaveCond = false;
+  for (MachineInstr &MI : MBB.terminators()) {
+    switch (MI.getOpcode()) {
+    default:
+      // Unhandled instruction; bail out.
+      return true;
+    case WebAssembly::BR_IF:
+      if (HaveCond)
+        return true;
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(1).isMBB())
+        return true;
+      Cond.push_back(MachineOperand::CreateImm(true));
+      Cond.push_back(MI.getOperand(0));
+      TBB = MI.getOperand(1).getMBB();
+      HaveCond = true;
+      break;
+    case WebAssembly::BR_UNLESS:
+      if (HaveCond)
+        return true;
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(1).isMBB())
+        return true;
+      Cond.push_back(MachineOperand::CreateImm(false));
+      Cond.push_back(MI.getOperand(0));
+      TBB = MI.getOperand(1).getMBB();
+      HaveCond = true;
+      break;
+    case WebAssembly::BR:
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(0).isMBB())
+        return true;
+      if (!HaveCond)
+        TBB = MI.getOperand(0).getMBB();
+      else
+        FBB = MI.getOperand(0).getMBB();
+      break;
+    }
+    if (MI.isBarrier())
+      break;
+  }
+
+  return false;
+}
+
+unsigned WebAssemblyInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::instr_iterator I = MBB.instr_end();
+  unsigned Count = 0;
+
+  while (I != MBB.instr_begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (!I->isTerminator())
+      break;
+    // Remove the branch.
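+    // (Editor's illustrative note, not part of the original patch: erasing
+    // invalidates the iterator, so the scan restarts from instr_end() after
+    // each erase; debug values are skipped above so a DBG_VALUE between
+    // terminators doesn't end branch removal early.)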
+ I->eraseFromParent(); + I = MBB.instr_end(); + ++Count; + } + + return Count; +} + +unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + if (Cond.empty()) { + if (!TBB) + return 0; + + BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(TBB); + return 1; + } + + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + + if (Cond[0].getImm()) { + BuildMI(&MBB, DL, get(WebAssembly::BR_IF)) + .addOperand(Cond[1]) + .addMBB(TBB); + } else { + BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)) + .addOperand(Cond[1]) + .addMBB(TBB); + } + if (!FBB) + return 1; + + BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(FBB); + return 2; +} + +bool WebAssemblyInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm()); + return false; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index 1c4ae22..5ddd9b3 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -19,17 +19,35 @@ #include "WebAssemblyRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" +#define GET_INSTRINFO_HEADER +#include "WebAssemblyGenInstrInfo.inc" + namespace llvm { class WebAssemblySubtarget; -class WebAssemblyInstrInfo final { +class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { const WebAssemblyRegisterInfo RI; public: explicit WebAssemblyInstrInfo(const WebAssemblySubtarget &STI); const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index fe3ca76..2e682a4 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -25,20 +25,58 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific DAG Node Types. 
//===----------------------------------------------------------------------===// +def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>; +def SDT_WebAssemblyCallSeqEnd : + SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; +def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>; +def SDT_WebAssemblyTableswitch : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>; +def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>; +def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<0>]>; + //===----------------------------------------------------------------------===// // WebAssembly-specific DAG Nodes. //===----------------------------------------------------------------------===// +def WebAssemblycallseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_WebAssemblyCallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def WebAssemblycallseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_WebAssemblyCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0", + SDT_WebAssemblyCall0, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1", + SDT_WebAssemblyCall1, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblytableswitch : SDNode<"WebAssemblyISD::TABLESWITCH", + SDT_WebAssemblyTableswitch, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT", + SDT_WebAssemblyArgument>; +def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN", + SDT_WebAssemblyReturn, [SDNPHasChain]>; +def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", + SDT_WebAssemblyWrapper>; + //===----------------------------------------------------------------------===// // WebAssembly-specific Operands. //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * - * get_local: read the current value of a local variable - * set_local: set the current value of a local variable -*/ +let OperandNamespace = "WebAssembly" in { + +let OperandType = "OPERAND_BASIC_BLOCK" in +def bb_op : Operand<OtherVT>; + +let OperandType = "OPERAND_FPIMM" in { +def f32imm_op : Operand<f32>; +def f64imm_op : Operand<f64>; +} // OperandType = "OPERAND_FPIMM" + +} // OperandNamespace = "WebAssembly" //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. @@ -47,13 +85,75 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, include "WebAssemblyInstrFormats.td" //===----------------------------------------------------------------------===// +// Additional instructions. +//===----------------------------------------------------------------------===// + +multiclass ARGUMENT<WebAssemblyRegClass vt> { + let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in + def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno), + [(set vt:$res, (WebAssemblyargument timm:$argno))]>; +} +defm : ARGUMENT<I32>; +defm : ARGUMENT<I64>; +defm : ARGUMENT<F32>; +defm : ARGUMENT<F64>; + +let Defs = [ARGUMENTS] in { + +// get_local and set_local are not generated by instruction selection; they +// are implied by virtual register uses and defs in most contexts. However, +// they are explicitly emitted for special purposes. 
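+// (Editor's illustrative note, not part of the original patch: "implied"
+// means that ordinary defs and uses of virtual registers are understood to
+// write and read wasm locals, so instruction selection does not create
+// explicit get_local/set_local machine instructions for them.)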
+multiclass LOCAL<WebAssemblyRegClass vt> { + def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [], + "get_local\t$res, $regno">; + // TODO: set_local returns its operand value + def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [], + "set_local\t$regno, $src">; + + // COPY_LOCAL is not an actual instruction in wasm, but since we allow + // get_local and set_local to be implicit, we can have a COPY_LOCAL which + // is actually a no-op because all the work is done in the implied + // get_local and set_local. + let isAsCheapAsAMove = 1 in + def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [], + "copy_local\t$res, $src">; +} +defm : LOCAL<I32>; +defm : LOCAL<I64>; +defm : LOCAL<F32>; +defm : LOCAL<F64>; + +let isMoveImm = 1 in { +def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm), + [(set I32:$res, imm:$imm)], + "i32.const\t$res, $imm">; +def CONST_I64 : I<(outs I64:$res), (ins i64imm:$imm), + [(set I64:$res, imm:$imm)], + "i64.const\t$res, $imm">; +def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm), + [(set F32:$res, fpimm:$imm)], + "f32.const\t$res, $imm">; +def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), + [(set F64:$res, fpimm:$imm)], + "f64.const\t$res, $imm">; +} // isMoveImm = 1 + +} // Defs = [ARGUMENTS] + +def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), + (CONST_I32 tglobaladdr:$addr)>; +def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), + (CONST_I32 texternalsym:$addr)>; + +//===----------------------------------------------------------------------===// // Additional sets of instructions. //===----------------------------------------------------------------------===// include "WebAssemblyInstrMemory.td" include "WebAssemblyInstrCall.td" +include "WebAssemblyInstrControl.td" include "WebAssemblyInstrInteger.td" -include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrConv.td" +include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 5f60fe8..09e5eaf 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -12,34 +12,77 @@ /// //===----------------------------------------------------------------------===// -defm ADD : BinaryInt<add>; -defm SUB : BinaryInt<sub>; -defm MUL : BinaryInt<mul>; -defm SDIV : BinaryInt<sdiv>; -defm UDIV : BinaryInt<udiv>; -defm SREM : BinaryInt<srem>; -defm UREM : BinaryInt<urem>; -defm AND : BinaryInt<and>; -defm IOR : BinaryInt<or>; -defm XOR : BinaryInt<xor>; -defm SHL : BinaryInt<shl>; -defm SHR : BinaryInt<srl>; -defm SAR : BinaryInt<sra>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * int32.eq: signed-less compare equal - * int32.slt: signed less than - * int32.sle: signed less than or equal - * int32.ult: unsigned less than - * int32.ule: unsigned less than or equal - * int32.sgt: signed greater than - * int32.sge: signed greater than or equal - * int32.ugt: unsigned greater than - * int32.uge: unsigned greater than or equal - */ - -defm CLZ : UnaryInt<ctlz>; -defm CTZ : UnaryInt<cttz>; -defm POPCNT : UnaryInt<ctpop>; +let Defs = [ARGUMENTS] in { + +// The spaces after the names are for aesthetic purposes only, to make +// operands line up vertically after tab expansion. 
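+// (Editor's illustrative note, not part of the original patch: the padding
+// makes e.g. "i32.add \t$dst, ..." and "i32.div_s\t$dst, ..." reach the same
+// tab stop, so the operand columns line up in the emitted assembly.)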
+let isCommutable = 1 in +defm ADD : BinaryInt<add, "add ">; +defm SUB : BinaryInt<sub, "sub ">; +let isCommutable = 1 in +defm MUL : BinaryInt<mul, "mul ">; +// Divide and remainder trap on a zero denominator. +let hasSideEffects = 1 in { +defm DIV_S : BinaryInt<sdiv, "div_s">; +defm DIV_U : BinaryInt<udiv, "div_u">; +defm REM_S : BinaryInt<srem, "rem_s">; +defm REM_U : BinaryInt<urem, "rem_u">; +} // hasSideEffects = 1 +let isCommutable = 1 in { +defm AND : BinaryInt<and, "and ">; +defm OR : BinaryInt<or, "or ">; +defm XOR : BinaryInt<xor, "xor ">; +} // isCommutable = 1 +defm SHL : BinaryInt<shl, "shl ">; +defm SHR_U : BinaryInt<srl, "shr_u">; +defm SHR_S : BinaryInt<sra, "shr_s">; + +let isCommutable = 1 in { +defm EQ : ComparisonInt<SETEQ, "eq ">; +defm NE : ComparisonInt<SETNE, "ne ">; +} // isCommutable = 1 +defm LT_S : ComparisonInt<SETLT, "lt_s">; +defm LE_S : ComparisonInt<SETLE, "le_s">; +defm LT_U : ComparisonInt<SETULT, "lt_u">; +defm LE_U : ComparisonInt<SETULE, "le_u">; +defm GT_S : ComparisonInt<SETGT, "gt_s">; +defm GE_S : ComparisonInt<SETGE, "ge_s">; +defm GT_U : ComparisonInt<SETUGT, "gt_u">; +defm GE_U : ComparisonInt<SETUGE, "ge_u">; + +defm CLZ : UnaryInt<ctlz, "clz ">; +defm CTZ : UnaryInt<cttz, "ctz ">; +defm POPCNT : UnaryInt<ctpop, "popcnt">; + +} // Defs = [ARGUMENTS] + +// Expand the "don't care" operations to supported operations. +def : Pat<(ctlz_zero_undef I32:$src), (CLZ_I32 I32:$src)>; +def : Pat<(ctlz_zero_undef I64:$src), (CLZ_I64 I64:$src)>; +def : Pat<(cttz_zero_undef I32:$src), (CTZ_I32 I32:$src)>; +def : Pat<(cttz_zero_undef I64:$src), (CTZ_I64 I64:$src)>; + +let Defs = [ARGUMENTS] in { + +def SELECT_I32 : I<(outs I32:$dst), (ins I32:$cond, I32:$lhs, I32:$rhs), + [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], + "i32.select\t$dst, $cond, $lhs, $rhs">; +def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs), + [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))], + "i64.select\t$dst, $cond, $lhs, $rhs">; + +} // Defs = [ARGUMENTS] + +// ISD::SELECT requires its operand to conform to getBooleanContents, but +// WebAssembly's select interprets any non-zero value as true, so we can fold +// a setne with 0 into a select. +def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs), + (SELECT_I32 I32:$cond, I32:$lhs, I32:$rhs)>; +def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs), + (SELECT_I64 I32:$cond, I64:$lhs, I64:$rhs)>; + +// And again, this time with seteq instead of setne and the arms reversed. +def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs), + (SELECT_I32 I32:$cond, I32:$rhs, I32:$lhs)>; +def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs), + (SELECT_I64 I32:$cond, I64:$rhs, I64:$lhs)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 5ab40e8..b39ac52 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -12,35 +12,576 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * Each has optional alignment and immediate byte offset. 
- * - * int32.load_sx[int8]: sign-extend to int32 - * int32.load_sx[int16]: sign-extend to int32 - * int32.load_zx[int8]: zero-extend to int32 - * int32.load_zx[int16]: zero-extend to int32 - * int32.load[int32]: (no conversion) - * int64.load_sx[int8]: sign-extend to int64 - * int64.load_sx[int16]: sign-extend to int64 - * int64.load_sx[int32]: sign-extend to int64 - * int64.load_zx[int8]: zero-extend to int64 - * int64.load_zx[int16]: zero-extend to int64 - * int64.load_zx[int32]: zero-extend to int64 - * int64.load[int64]: (no conversion) - * float32.load[float32]: (no conversion) - * float64.load[float64]: (no conversion) - * - * int32.store[int8]: wrap int32 to int8 - * int32.store[int16]: wrap int32 to int16 - * int32.store[int32]: (no conversion) - * int64.store[int8]: wrap int64 to int8 - * int64.store[int16]: wrap int64 to int16 - * int64.store[int32]: wrap int64 to int32 - * int64.store[int64]: (no conversion) - * float32.store[float32]: (no conversion) - * float64.store[float64]: (no conversion) - * - * load_global: load the value of a given global variable - * store_global: store a given value to a given global variable - */ +// TODO: +// - HasAddr64 +// - WebAssemblyTargetLowering having to do with atomics +// - Each has optional alignment. + +// WebAssembly has i8/i16/i32/i64/f32/f64 memory types, but doesn't have i8/i16 +// local types. These memory-only types instead zero- or sign-extend into local +// types when loading, and truncate when storing. + +// WebAssembly constant offsets are performed as unsigned with infinite +// precision, so we need to check for NoUnsignedWrap so that we don't fold an +// offset for an add that needs wrapping. +def regPlusImm : PatFrag<(ops node:$addr, node:$off), + (add node:$addr, node:$off), + [{ return N->getFlags()->hasNoUnsignedWrap(); }]>; + +// GlobalAddresses are conceptually unsigned values, so we can also fold them +// into immediate values as long as their offsets are non-negative. +def regPlusGA : PatFrag<(ops node:$addr, node:$off), + (add node:$addr, node:$off), + [{ + return N->getFlags()->hasNoUnsignedWrap() || + (N->getOperand(1)->getOpcode() == WebAssemblyISD::Wrapper && + isa<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) && + cast<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) + ->getOffset() >= 0); +}]>; + +// We don't need a regPlusES because external symbols never have constant +// offsets folded into them, so we can just use add. + +let Defs = [ARGUMENTS] in { + +// Basic load. +def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load\t$dst, ${off}(${addr})">; +def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load\t$dst, ${off}(${addr})">; +def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr), [], + "f32.load\t$dst, ${off}(${addr})">; +def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr), [], + "f64.load\t$dst, ${off}(${addr})">; + +} // Defs = [ARGUMENTS] + +// Select loads with no constant offset. +def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>; +def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>; +def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>; +def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>; + +// Select loads with a constant offset. 
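+// (Editor's illustrative note, not part of the original patch: these folds
+// rely on the NoUnsignedWrap check in regPlusImm above. E.g. for $addr =
+// 0xffffffff and $off = 2, a wrapping i32 add yields address 1, while wasm
+// evaluates offset+address with infinite precision and would trap, so a
+// plain wrapping add must not be folded into the offset field.)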
+def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_I32 imm:$off, $addr)>; +def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_I64 imm:$off, $addr)>; +def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_F32 imm:$off, $addr)>; +def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))), + (LOAD_F64 imm:$off, $addr)>; +def : Pat<(i32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_I64 tglobaladdr:$off, $addr)>; +def : Pat<(f32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_F32 tglobaladdr:$off, $addr)>; +def : Pat<(f64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD_F64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_I64 texternalsym:$off, $addr)>; +def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_F32 texternalsym:$off, $addr)>; +def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), + (LOAD_F64 texternalsym:$off, $addr)>; + +// Select loads with just a constant offset. +def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0))>; +def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_F32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_F64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_F32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_F64 texternalsym:$off, (CONST_I32 0))>; + +let Defs = [ARGUMENTS] in { + +// Extending load. 
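+// (Editor's illustrative note, not part of the original patch: a C read like
+//   int f(signed char *p) { return *p; }
+// reaches the DAG as (i32 (sextloadi8 ...)) and selects to i32.load8_s,
+// since i8 is a memory-only type that extends into an i32 local on load.)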
+def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load8_s\t$dst, ${off}(${addr})">; +def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load8_u\t$dst, ${off}(${addr})">; +def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load16_s\t$dst, ${off}(${addr})">; +def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load16_u\t$dst, ${off}(${addr})">; +def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load8_s\t$dst, ${off}(${addr})">; +def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load8_u\t$dst, ${off}(${addr})">; +def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load16_s\t$dst, ${off}(${addr})">; +def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load16_u\t$dst, ${off}(${addr})">; +def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load32_s\t$dst, ${off}(${addr})">; +def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load32_u\t$dst, ${off}(${addr})">; + +} // Defs = [ARGUMENTS] + +// Select extending loads with no constant offset. +def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + +// Select extending loads with a constant offset. 
+def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_S_I32 imm:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I32 imm:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_S_I32 imm:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I32 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))), + (LOAD32_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))), + (LOAD32_U_I64 imm:$off, $addr)>; +def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_S_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_S_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD32_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD32_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_S_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_S_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_S_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_S_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD32_S_I64 texternalsym:$off, 
$addr)>; +def : Pat<(i64 (zextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD32_U_I64 texternalsym:$off, $addr)>; + +// Select extending loads with just a constant offset. +def : Pat<(i32 (sextloadi8 imm:$off)), (LOAD8_S_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 imm:$off)), (LOAD16_S_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 imm:$off)), (LOAD8_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 imm:$off)), (LOAD16_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 imm:$off)), (LOAD32_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))), + (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))), + (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>; + +// Resolve "don't care" extending loads to zero-extending loads. 
This is +// somewhat arbitrary, but zero-extending is conceptually simpler. + +// Select "don't care" extending loads with no constant offset. +def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + +// Select "don't care" extending loads with a constant offset. +def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I32 imm:$off, $addr)>; +def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I32 imm:$off, $addr)>; +def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))), + (LOAD8_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))), + (LOAD16_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))), + (LOAD32_U_I64 imm:$off, $addr)>; +def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD8_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD16_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), + (LOAD32_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD8_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD16_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), + (LOAD32_U_I64 texternalsym:$off, $addr)>; + +// Select "don't care" extending loads with just a constant offset. 
+def : Pat<(i32 (extloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>;
+// Editor's note: the output operand below originally read tglobaladdr:$off,
+// a copy-paste slip; the pattern matches an external symbol, so the selected
+// instruction must use texternalsym:$off as well.
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>;
+
+let Defs = [ARGUMENTS] in {
+
+// Basic store.
+// Note that we split the patterns out of the instruction definitions because
+// WebAssembly's stores return their operand value, and tablegen doesn't like
+// instruction definition patterns that don't reference all of the output
+// operands.
+// Note: WebAssembly inverts SelectionDAG's usual operand order.
+def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [],
+                  "i32.store\t$dst, ${off}(${addr}), $val">;
+def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [],
+                  "i64.store\t$dst, ${off}(${addr}), $val">;
+def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr, F32:$val), [],
+                  "f32.store\t$dst, ${off}(${addr}), $val">;
+def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr, F64:$val), [],
+                  "f64.store\t$dst, ${off}(${addr}), $val">;
+
+} // Defs = [ARGUMENTS]
+
+// Select stores with no constant offset.
+def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>;
+
+// Select stores with a constant offset.
+def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_F32 imm:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE_F64 imm:$off, I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>; + +// Select stores with just a constant offset. +def : Pat<(store I32:$val, imm:$off), + (STORE_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, imm:$off), + (STORE_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, imm:$off), + (STORE_F32 imm:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, imm:$off), + (STORE_F64 imm:$off, (CONST_I32 0), F64:$val)>; +def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_F32 tglobaladdr:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_F64 tglobaladdr:$off, (CONST_I32 0), F64:$val)>; +def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_F32 texternalsym:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_F64 texternalsym:$off, (CONST_I32 0), F64:$val)>; + +let Defs = [ARGUMENTS] in { + +// Truncating store. 
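+// (Editor's illustrative note, not part of the original patch: a C write like
+//   void g(char *p, int v) { *p = v; }
+// reaches the DAG as (truncstorei8 ...) and selects to i32.store8; the store
+// itself truncates the i32 local to the i8 memory type, so no separate wrap
+// instruction is needed.)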
+def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [], + "i32.store8\t$dst, ${off}(${addr}), $val">; +def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [], + "i32.store16\t$dst, ${off}(${addr}), $val">; +def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store8\t$dst, ${off}(${addr}), $val">; +def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store16\t$dst, ${off}(${addr}), $val">; +def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store32\t$dst, ${off}(${addr}), $val">; + +} // Defs = [ARGUMENTS] + +// Select truncating stores with no constant offset. +def : Pat<(truncstorei8 I32:$val, I32:$addr), + (STORE8_I32 0, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, I32:$addr), + (STORE16_I32 0, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, I32:$addr), + (STORE8_I64 0, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, I32:$addr), + (STORE16_I64 0, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, I32:$addr), + (STORE32_I64 0, I32:$addr, I64:$val)>; + +// Select truncating stores with a constant offset. +def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE8_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE16_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE8_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE16_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)), + (STORE32_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei8 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), + (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), + (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>; + +// Select truncating stores with just a constant offset. 
+def : Pat<(truncstorei8 I32:$val, imm:$off), + (STORE8_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, imm:$off), + (STORE16_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, imm:$off), + (STORE8_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, imm:$off), + (STORE16_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, imm:$off), + (STORE32_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE8_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE16_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE8_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE16_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE32_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; + +let Defs = [ARGUMENTS] in { + +// Memory size. +def MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins), + [(set I32:$dst, (int_wasm_memory_size))], + "memory_size\t$dst">, + Requires<[HasAddr32]>; +def MEMORY_SIZE_I64 : I<(outs I64:$dst), (ins), + [(set I64:$dst, (int_wasm_memory_size))], + "memory_size\t$dst">, + Requires<[HasAddr64]>; + +// Grow memory. +def GROW_MEMORY_I32 : I<(outs), (ins I32:$delta), + [(int_wasm_grow_memory I32:$delta)], + "grow_memory\t$delta">, + Requires<[HasAddr32]>; +def GROW_MEMORY_I64 : I<(outs), (ins I64:$delta), + [(int_wasm_grow_memory I64:$delta)], + "grow_memory\t$delta">, + Requires<[HasAddr64]>; + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp new file mode 100644 index 0000000..b009a4e --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -0,0 +1,133 @@ +//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file lowers br_unless into br_if with an inverted condition. +/// +/// br_unless is not currently in the spec, but it's very convenient for LLVM +/// to use. This pass allows LLVM to use it, for now. 
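+/// (Editor's illustrative note, not part of the original patch: semantically,
+/// "br_unless $cond, $dst" branches when $cond is zero, i.e. br_if with an
+/// inverted condition. Keeping it as a pseudo-instruction lets earlier passes
+/// negate branch conditions for free and defers materializing the inversion
+/// to this pass, which can usually fold it into the defining comparison.)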
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-lower-br_unless" + +namespace { +class WebAssemblyLowerBrUnless final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Lower br_unless"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyLowerBrUnless::ID = 0; +FunctionPass *llvm::createWebAssemblyLowerBrUnless() { + return new WebAssemblyLowerBrUnless(); +} + +bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Lowering br_unless **********\n" + "********** Function: " + << MF.getName() << '\n'); + + auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + + for (auto &MBB : MF) { + for (auto MII = MBB.begin(); MII != MBB.end(); ) { + MachineInstr *MI = &*MII++; + if (MI->getOpcode() != WebAssembly::BR_UNLESS) + continue; + + unsigned Cond = MI->getOperand(0).getReg(); + bool Inverted = false; + + // Attempt to invert the condition in place. 
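+      // For example, a stackified condition defined by (GE_S_I32 $a, $b)
+      // can be rewritten in place to (LT_S_I32 $a, $b), letting the
+      // br_unless become a br_if on the same register.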
+      if (MFI.isVRegStackified(Cond)) {
+        assert(MRI.hasOneDef(Cond));
+        MachineInstr *Def = MRI.getVRegDef(Cond);
+        switch (Def->getOpcode()) {
+        using namespace WebAssembly;
+        case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break;
+        case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break;
+        case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break;
+        case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break;
+        case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break;
+        case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break;
+        case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break;
+        case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break;
+        case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break;
+        case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break;
+        case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break;
+        case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break;
+        case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break;
+        case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break;
+        case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break;
+        case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break;
+        case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break;
+        case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break;
+        case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break;
+        case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break;
+        case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break;
+        case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break;
+        case EQ_F64: Def->setDesc(TII.get(NE_F64)); Inverted = true; break;
+        case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break;
+        default: break;
+        }
+      }
+
+      // If we weren't able to invert the condition in place, insert an
+      // expression to invert it.
+      if (!Inverted) {
+        unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+        MFI.stackifyVReg(ZeroReg);
+        BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32),
+                ZeroReg)
+            .addImm(0);
+        unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+        MFI.stackifyVReg(Tmp);
+        BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp)
+            .addReg(Cond)
+            .addReg(ZeroReg);
+        Cond = Tmp;
+        Inverted = true;
+      }
+
+      // The br_unless condition has now been inverted. Insert a br_if and
+      // delete the br_unless.
+      assert(Inverted);
+      BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
+          .addReg(Cond)
+          .addOperand(MI->getOperand(1));
+      MBB.erase(MI);
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
new file mode 100644
index 0000000..022a448
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -0,0 +1,115 @@
+// WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst //
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower WebAssembly MachineInstrs to their
+/// corresponding MCInst records.
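+///
+/// For example, a CONST_I32 MachineInstr with a virtual register def and an
+/// immediate operand is lowered to an MCInst with the same opcode, the
+/// function-local WebAssembly register number, and an immediate MCOperand.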
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyMCInstLower.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Constants.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +MCSymbol * +WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { + return Printer.getSymbol(MO.getGlobal()); +} + +MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( + const MachineOperand &MO) const { + return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); +} + +MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, + int64_t Offset, + bool IsFunc) const { + MCSymbolRefExpr::VariantKind VK = + IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION + : MCSymbolRefExpr::VK_None; + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx); + + if (Offset != 0) { + if (IsFunc) + report_fatal_error("Function addresses with offsets not supported"); + Expr = + MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); + } + + return MCOperand::createExpr(Expr); +} + +void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, + MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_MachineBasicBlock: + MI->dump(); + llvm_unreachable("MachineBasicBlock operand should have been rewritten"); + case MachineOperand::MO_Register: { + // Ignore all implicit register operands. + if (MO.isImplicit()) + continue; + const WebAssemblyFunctionInfo &MFI = + *MI->getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>(); + unsigned WAReg = MFI.getWAReg(MO.getReg()); + MCOp = MCOperand::createReg(WAReg); + break; + } + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_FPImmediate: { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + const ConstantFP *Imm = MO.getFPImm(); + if (Imm->getType()->isFloatTy()) + MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToFloat()); + else if (Imm->getType()->isDoubleTy()) + MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToDouble()); + else + llvm_unreachable("unknown floating point immediate type"); + break; + } + case MachineOperand::MO_GlobalAddress: + assert(MO.getTargetFlags() == 0 && + "WebAssembly does not use target flags on GlobalAddresses"); + MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(), + MO.getGlobal()->getValueType()->isFunctionTy()); + break; + case MachineOperand::MO_ExternalSymbol: + // The target flag indicates whether this is a symbol for a + // variable or a function. 
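+      // Bit 0 carries that flag; the assertion below verifies that no other
+      // target flag bits are set.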
+      assert((MO.getTargetFlags() & -2) == 0 &&
+             "WebAssembly uses only one target flag bit on ExternalSymbols");
+      MCOp = LowerSymbolOperand(GetExternalSymbolSymbol(MO), /*Offset=*/0,
+                                MO.getTargetFlags() & 1);
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
new file mode 100644
index 0000000..ab4ba1c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -0,0 +1,46 @@
+//===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the class to lower WebAssembly MachineInstrs to
+/// their corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCSymbol;
+class MachineInstr;
+class MachineOperand;
+
+/// This class is used to lower a MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset,
+                               bool IsFunc) const;
+
+public:
+  WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+      : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 542d984..225c5d3 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -17,3 +17,9 @@
 using namespace llvm;
 
 WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() {}
+
+void WebAssemblyFunctionInfo::initWARegs() {
+  assert(WARegs.empty());
+  unsigned Reg = UnusedReg;
+  WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index fc5e910..6a60280 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-// WebAssemblyMachineFuctionInfo.h-WebAssembly machine function info -*- C++ -*-
+// WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*-
 //
 // The LLVM Compiler Infrastructure
 //
@@ -16,8 +16,7 @@
 #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
 #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
 
-#include "WebAssemblyRegisterInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
 namespace llvm {
@@ -27,9 +26,70 @@ namespace llvm {
 class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
   MachineFunction &MF;
+  std::vector<MVT> Params;
+
+  /// A mapping from CodeGen vreg index to WebAssembly register number.
+  std::vector<unsigned> WARegs;
+
+  /// A mapping from CodeGen vreg index to a boolean value indicating whether
+  /// the given register is considered to be "stackified", meaning it has been
+  /// determined or made to meet the stack requirements:
+  ///   - single use (per path)
+  ///   - single def (per path)
+  ///   - defined and used in LIFO order with other stack registers
+  BitVector VRegStackified;
+
+  // One entry for each possible target reg. We expect it to be small.
+  std::vector<unsigned> PhysRegs;
+
 public:
-  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
+  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {
+    PhysRegs.resize(WebAssembly::NUM_TARGET_REGS, -1U);
+  }
   ~WebAssemblyFunctionInfo() override;
+
+  void addParam(MVT VT) { Params.push_back(VT); }
+  const std::vector<MVT> &getParams() const { return Params; }
+
+  static const unsigned UnusedReg = -1u;
+
+  void stackifyVReg(unsigned VReg) {
+    if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+      VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1);
+    VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg));
+  }
+  bool isVRegStackified(unsigned VReg) const {
+    if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+      return false;
+    return VRegStackified.test(TargetRegisterInfo::virtReg2Index(VReg));
+  }
+
+  void initWARegs();
+  void setWAReg(unsigned VReg, unsigned WAReg) {
+    assert(WAReg != UnusedReg);
+    assert(TargetRegisterInfo::virtReg2Index(VReg) < WARegs.size());
+    WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
+  }
+  unsigned getWAReg(unsigned Reg) const {
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
+      return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
+    }
+    return PhysRegs[Reg];
+  }
+  // If new virtual registers are created after initWARegs has been called,
+  // this function can be used to add WebAssembly register mappings for them.
+  void addWAReg(unsigned VReg, unsigned WAReg) {
+    assert(TargetRegisterInfo::virtReg2Index(VReg) == WARegs.size());
+    WARegs.push_back(WAReg);
+  }
+
+  void addPReg(unsigned PReg, unsigned WAReg) {
+    assert(PReg < WebAssembly::NUM_TARGET_REGS);
+    assert(WAReg < -1U);
+    PhysRegs[PReg] = WAReg;
+  }
+  const std::vector<unsigned> &getPhysRegs() const { return PhysRegs; }
 };
 
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
new file mode 100644
index 0000000..4dc401a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -0,0 +1,76 @@
+//===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize calls with "returned" attributes for WebAssembly.
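+///
+/// As an illustrative sketch (the callee name is hypothetical), given
+///
+///   %r = call i8* @copy_like(i8* returned %p)
+///
+/// every use of %p dominated by the call can be replaced with %r, making the
+/// equivalence explicit for later passes.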
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-returned"
+
+namespace {
+class OptimizeReturned final : public FunctionPass,
+                               public InstVisitor<OptimizeReturned> {
+  const char *getPassName() const override {
+    return "WebAssembly Optimize Returned";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  DominatorTree *DT;
+
+public:
+  static char ID;
+  OptimizeReturned() : FunctionPass(ID), DT(nullptr) {}
+
+  void visitCallSite(CallSite CS);
+};
+} // end anonymous namespace
+
+char OptimizeReturned::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
+  return new OptimizeReturned();
+}
+
+void OptimizeReturned::visitCallSite(CallSite CS) {
+  for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
+    if (CS.paramHasAttr(1 + i, Attribute::Returned)) {
+      Instruction *Inst = CS.getInstruction();
+      Value *Arg = CS.getArgOperand(i);
+      // Ignore constants, globals, undef, etc.
+      if (isa<Constant>(Arg))
+        continue;
+      // Like replaceDominatedUsesWith but using Instruction/Use dominance.
+      for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) {
+        Use &U = *UI++;
+        if (DT->dominates(Inst, U))
+          U.set(Inst);
+      }
+    }
+}
+
+bool OptimizeReturned::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  visit(F);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp
new file mode 100644
index 0000000..d570d42
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp
@@ -0,0 +1,1066 @@
+//===-- WebAssemblyPEI.cpp - Insert Prolog/Epilog code in function --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation. After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This is a copy of lib/CodeGen/PrologEpilogInserter.cpp except that it does
+// not assert that all virtual registers are gone (because WebAssembly currently
+// uses virtual rather than physical registers), and only runs
+// MRI.clearVirtRegs() if scavenging happened (which it never does). It also
+// uses a different class name so it can be registered via INITIALIZE_PASS.
+// It is otherwise unmodified, so any changes to the target-independent PEI
+// can be easily applied.
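+//
+// As an illustrative example (not from this patch), a load or store that
+// refers to FrameIndex<0> before this pass refers to a concrete SP-relative
+// offset afterwards, as computed by the frame layout code below.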
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/IndexedMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <climits> + +using namespace llvm; + +#define DEBUG_TYPE "pei" +namespace llvm { +void initializeWasmPEIPass(PassRegistry&); +} +namespace { +class WasmPEI : public MachineFunctionPass { +public: + static char ID; + WasmPEI() : MachineFunctionPass(ID) { + initializeWasmPEIPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// runOnMachineFunction - Insert prolog/epilog code and replace abstract + /// frame indexes with appropriate references. + /// + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + RegScavenger *RS; + + // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved + // stack frame indexes. + unsigned MinCSFrameIndex, MaxCSFrameIndex; + + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. + SmallVector<MachineBasicBlock *, 1> SaveBlocks; + SmallVector<MachineBasicBlock *, 4> RestoreBlocks; + + // Flag to control whether to use the register scavenger to resolve + // frame index materialization registers. Set according to + // TRI->requiresFrameIndexScavenging() for the current function. 
+ bool FrameIndexVirtualScavenging; + + void calculateSets(MachineFunction &Fn); + void calculateCallsInformation(MachineFunction &Fn); + void assignCalleeSavedSpillSlots(MachineFunction &Fn, + const BitVector &SavedRegs); + void insertCSRSpillsAndRestores(MachineFunction &Fn); + void calculateFrameObjectOffsets(MachineFunction &Fn); + void replaceFrameIndices(MachineFunction &Fn); + void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, + int &SPAdj); + void scavengeFrameVirtualRegs(MachineFunction &Fn); + void insertPrologEpilogCode(MachineFunction &Fn); +}; +} // namespace + +char WasmPEI::ID = 0; + +namespace llvm { +FunctionPass *createWebAssemblyPEI() { + return new WasmPEI(); +} +} + +static cl::opt<unsigned> +WarnStackSize("wasm-warn-stack-size", cl::Hidden, cl::init((unsigned)-1), + cl::desc("Warn for stack size bigger than the given" + " number")); + +INITIALIZE_PASS_BEGIN(WasmPEI, "wasmprologepilog", + "Wasm Prologue/Epilogue Insertion", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(StackProtector) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(WasmPEI, "wasmprologepilog", + "Wasm Prologue/Epilogue Insertion & Frame Finalization", + false, false) + +STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); +STATISTIC(NumBytesStackSpace, + "Number of bytes used for stack in all functions"); + +void WasmPEI::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addPreserved<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<StackProtector>(); + AU.addRequired<TargetPassConfig>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Compute the set of return blocks +void WasmPEI::calculateSets(MachineFunction &Fn) { + const MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Even when we do not change any CSR, we still want to insert the + // prologue and epilogue of the function. + // So set the save points for those. + + // Use the points found by shrink-wrapping, if any. + if (MFI->getSavePoint()) { + SaveBlocks.push_back(MFI->getSavePoint()); + assert(MFI->getRestorePoint() && "Both restore and save must be set"); + MachineBasicBlock *RestoreBlock = MFI->getRestorePoint(); + // If RestoreBlock does not have any successor and is not a return block + // then the end point is unreachable and we do not need to insert any + // epilogue. + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) + RestoreBlocks.push_back(RestoreBlock); + return; + } + + // Save refs to entry and return blocks. + SaveBlocks.push_back(&Fn.front()); + for (MachineBasicBlock &MBB : Fn) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } +} + +/// StackObjSet - A set of stack object indexes +typedef SmallSetVector<int, 8> StackObjSet; + +/// runOnMachineFunction - Insert prolog/epilog code and replace abstract +/// frame indexes with appropriate references. +/// +bool WasmPEI::runOnMachineFunction(MachineFunction &Fn) { + const Function* F = Fn.getFunction(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + + // LOCALMOD: assert removed from target-independent PEI + //assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs"); + + RS = TRI->requiresRegisterScavenging(Fn) ? 
new RegScavenger() : nullptr;
+  FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
+
+  // Calculate the MaxCallFrameSize and AdjustsStack variables for the
+  // function's frame information. Also eliminates call frame pseudo
+  // instructions.
+  calculateCallsInformation(Fn);
+
+  // Determine which of the registers in the callee save list should be saved.
+  BitVector SavedRegs;
+  TFI->determineCalleeSaves(Fn, SavedRegs, RS);
+
+  // Insert spill code for any callee saved registers that are modified.
+  assignCalleeSavedSpillSlots(Fn, SavedRegs);
+
+  // Determine placement of CSR spill/restore code:
+  // place all spills in the entry block, all restores in return blocks.
+  calculateSets(Fn);
+
+  // Add the code to save and restore the callee saved registers.
+  if (!F->hasFnAttribute(Attribute::Naked))
+    insertCSRSpillsAndRestores(Fn);
+
+  // Allow the target machine to make final modifications to the function
+  // before the frame layout is finalized.
+  TFI->processFunctionBeforeFrameFinalized(Fn, RS);
+
+  // Calculate actual frame offsets for all abstract stack objects...
+  calculateFrameObjectOffsets(Fn);
+
+  // Add prolog and epilog code to the function. This function is required
+  // to align the stack frame as necessary for any stack variables or
+  // called functions. Because of this, calculateCalleeSavedRegisters()
+  // must be called before this function in order to set the AdjustsStack
+  // and MaxCallFrameSize variables.
+  if (!F->hasFnAttribute(Attribute::Naked))
+    insertPrologEpilogCode(Fn);
+
+  // Replace all MO_FrameIndex operands with physical register references
+  // and actual offsets.
+  //
+  replaceFrameIndices(Fn);
+
+  // If register scavenging is needed, as we've enabled doing it as a
+  // post-pass, scavenge the virtual registers that frame index elimination
+  // inserted.
+  if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
+    scavengeFrameVirtualRegs(Fn);
+    // Clear any vregs created by virtual scavenging.
+    // LOCALMOD: made this call conditional with scavengeFrameVirtualRegs()
+    Fn.getRegInfo().clearVirtRegs();
+  }
+
+  // Warn on stack size when it exceeds the given limit.
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  uint64_t StackSize = MFI->getStackSize();
+  if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
+    DiagnosticInfoStackSize DiagStackSize(*F, StackSize);
+    F->getContext().diagnose(DiagStackSize);
+  }
+
+  delete RS;
+  SaveBlocks.clear();
+  RestoreBlocks.clear();
+  return true;
+}
+
+/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
+/// variables for the function's frame information and eliminate call frame
+/// pseudo instructions.
+void WasmPEI::calculateCallsInformation(MachineFunction &Fn) {
+  const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+  const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  unsigned MaxCallFrameSize = 0;
+  bool AdjustsStack = MFI->adjustsStack();
+
+  // Get the function call frame set-up and tear-down instruction opcode
+  unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+  unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+  // Early exit for targets which have no call frame setup/destroy pseudo
+  // instructions.
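+  // (~0u is the sentinel returned by getCallFrameSetupOpcode() and
+  // getCallFrameDestroyOpcode() when a target defines no such pseudos.)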
+  if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
+    return;
+
+  std::vector<MachineBasicBlock::iterator> FrameSDOps;
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+      if (I->getOpcode() == FrameSetupOpcode ||
+          I->getOpcode() == FrameDestroyOpcode) {
+        assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
+               " instructions should have a single immediate argument!");
+        unsigned Size = I->getOperand(0).getImm();
+        if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+        AdjustsStack = true;
+        FrameSDOps.push_back(I);
+      } else if (I->isInlineAsm()) {
+        // Some inline asm's need a stack frame, as indicated by operand 1.
+        unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+        if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+          AdjustsStack = true;
+      }
+
+  MFI->setAdjustsStack(AdjustsStack);
+  MFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+  for (std::vector<MachineBasicBlock::iterator>::iterator
+         i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
+    MachineBasicBlock::iterator I = *i;
+
+    // If call frames are not being included as part of the stack frame, and
+    // the target doesn't indicate otherwise, remove the call frame pseudos
+    // here. The sub/add sp instruction pairs are still inserted, but we don't
+    // need to track the SP adjustment for frame index elimination.
+    if (TFI->canSimplifyCallFramePseudos(Fn))
+      TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+  }
+}
+
+void WasmPEI::assignCalleeSavedSpillSlots(MachineFunction &F,
+                                          const BitVector &SavedRegs) {
+  // These are used to keep track of the callee-save area. Initialize them.
+  MinCSFrameIndex = INT_MAX;
+  MaxCSFrameIndex = 0;
+
+  if (SavedRegs.empty())
+    return;
+
+  const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+
+  std::vector<CalleeSavedInfo> CSI;
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    if (SavedRegs.test(Reg))
+      CSI.push_back(CalleeSavedInfo(Reg));
+  }
+
+  const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
+  MachineFrameInfo *MFI = F.getFrameInfo();
+  if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
+    // If target doesn't implement this, use generic code.
+
+    if (CSI.empty())
+      return; // Early exit if no callee saved registers are modified!
+
+    unsigned NumFixedSpillSlots;
+    const TargetFrameLowering::SpillSlot *FixedSpillSlots =
+        TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+    // Now that we know which registers need to be saved and restored, allocate
+    // stack slots for them.
+    for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
+         I != E; ++I) {
+      unsigned Reg = I->getReg();
+      const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+      int FrameIdx;
+      if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
+        I->setFrameIdx(FrameIdx);
+        continue;
+      }
+
+      // Check to see if this physreg must be spilled to a particular stack slot
+      // on this target.
+      const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
+      while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
+             FixedSlot->Reg != Reg)
+        ++FixedSlot;
+
+      if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+        // Nope, just spill it anywhere convenient.
+ unsigned Align = RC->getAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); + + // We may not be able to satisfy the desired alignment specification of + // the TargetRegisterClass if the stack alignment is smaller. Use the + // min. + Align = std::min(Align, StackAlign); + FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true); + if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; + if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; + } else { + // Spill it to the stack where we must. + FrameIdx = + MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); + } + + I->setFrameIdx(FrameIdx); + } + } + + MFI->setCalleeSavedInfo(CSI); +} + +/// Helper function to update the liveness information for the callee-saved +/// registers. +static void updateLiveness(MachineFunction &MF) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + // Visited will contain all the basic blocks that are in the region + // where the callee saved registers are alive: + // - Anything that is not Save or Restore -> LiveThrough. + // - Save -> LiveIn. + // - Restore -> LiveOut. + // The live-out is not attached to the block, so no need to keep + // Restore in this set. + SmallPtrSet<MachineBasicBlock *, 8> Visited; + SmallVector<MachineBasicBlock *, 8> WorkList; + MachineBasicBlock *Entry = &MF.front(); + MachineBasicBlock *Save = MFI->getSavePoint(); + + if (!Save) + Save = Entry; + + if (Entry != Save) { + WorkList.push_back(Entry); + Visited.insert(Entry); + } + Visited.insert(Save); + + MachineBasicBlock *Restore = MFI->getRestorePoint(); + if (Restore) + // By construction Restore cannot be visited, otherwise it + // means there exists a path to Restore that does not go + // through Save. + WorkList.push_back(Restore); + + while (!WorkList.empty()) { + const MachineBasicBlock *CurBB = WorkList.pop_back_val(); + // By construction, the region that is after the save point is + // dominated by the Save and post-dominated by the Restore. + if (CurBB == Save && Save != Restore) + continue; + // Enqueue all the successors not already visited. + // Those are by construction either before Save or after Restore. + for (MachineBasicBlock *SuccBB : CurBB->successors()) + if (Visited.insert(SuccBB).second) + WorkList.push_back(SuccBB); + } + + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (MachineBasicBlock *MBB : Visited) { + MCPhysReg Reg = CSI[i].getReg(); + // Add the callee-saved register as live-in. + // It's killed at the spill. + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + } + } +} + +/// insertCSRSpillsAndRestores - Insert spill and restore code for +/// callee saved registers used in the function. +/// +void WasmPEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { + // Get callee saved register information. + MachineFrameInfo *MFI = Fn.getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + MFI->setCalleeSavedInfoValid(true); + + // Early exit if no callee saved registers are modified! + if (CSI.empty()) + return; + + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + MachineBasicBlock::iterator I; + + // Spill using target interface. 
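+  // If the target does not implement spillCalleeSavedRegisters, fall back to
+  // emitting a generic storeRegToStackSlot for each CSR below.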
+ for (MachineBasicBlock *SaveBlock : SaveBlocks) { + I = SaveBlock->begin(); + if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // Insert the spill to the stack frame. + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), + RC, TRI); + } + } + // Update the live-in information of all the blocks up to the save point. + updateLiveness(Fn); + } + + // Restore using target interface. + for (MachineBasicBlock *MBB : RestoreBlocks) { + I = MBB->end(); + + // Skip over all terminator instructions, which are part of the return + // sequence. + MachineBasicBlock::iterator I2 = I; + while (I2 != MBB->begin() && (--I2)->isTerminator()) + I = I2; + + bool AtStart = I == MBB->begin(); + MachineBasicBlock::iterator BeforeI = I; + if (!AtStart) + --BeforeI; + + // Restore all registers immediately before the return and any + // terminators that precede it. + if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); + assert(I != MBB->begin() && + "loadRegFromStackSlot didn't insert any code!"); + // Insert in reverse order. loadRegFromStackSlot can insert + // multiple instructions. + if (AtStart) + I = MBB->begin(); + else { + I = BeforeI; + ++I; + } + } + } + } +} + +/// AdjustStackOffset - Helper function used to adjust the stack frame offset. +static inline void +AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, + bool StackGrowsDown, int64_t &Offset, + unsigned &MaxAlign, unsigned Skew) { + // If the stack grows down, add the object size to find the lowest address. + if (StackGrowsDown) + Offset += MFI->getObjectSize(FrameIdx); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + + // If the alignment of this object is greater than that of the stack, then + // increase the stack alignment to match. + MaxAlign = std::max(MaxAlign, Align); + + // Adjust to alignment boundary. + Offset = RoundUpToAlignment(Offset, Align, Skew); + + if (StackGrowsDown) { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset + } else { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, Offset); + Offset += MFI->getObjectSize(FrameIdx); + } +} + +/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e., +/// those required to be close to the Stack Protector) to stack offsets. +static void +AssignProtectedObjSet(const StackObjSet &UnassignedObjs, + SmallSet<int, 16> &ProtectedObjs, + MachineFrameInfo *MFI, bool StackGrowsDown, + int64_t &Offset, unsigned &MaxAlign, unsigned Skew) { + + for (StackObjSet::const_iterator I = UnassignedObjs.begin(), + E = UnassignedObjs.end(); I != E; ++I) { + int i = *I; + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + ProtectedObjs.insert(i); + } +} + +/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the +/// abstract stack objects. 
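+///
+/// As a worked example (illustrative): on a downward-growing stack, an
+/// 8-byte object with 8-byte alignment placed at running offset 4 first
+/// advances the offset past the object to 12, which is then rounded up to
+/// 16, so the object lands at SP[-16].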
+/// +void WasmPEI::calculateFrameObjectOffsets(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + StackProtector *SP = &getAnalysis<StackProtector>(); + + bool StackGrowsDown = + TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + + // Loop over all of the stack objects, assigning sequential addresses... + MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Start at the beginning of the local area. + // The Offset is the distance from the stack top in the direction + // of stack growth -- so it's always nonnegative. + int LocalAreaOffset = TFI.getOffsetOfLocalArea(); + if (StackGrowsDown) + LocalAreaOffset = -LocalAreaOffset; + assert(LocalAreaOffset >= 0 + && "Local area offset should be in direction of stack growth"); + int64_t Offset = LocalAreaOffset; + + // Skew to be applied to alignment. + unsigned Skew = TFI.getStackAlignmentSkew(Fn); + + // If there are fixed sized objects that are preallocated in the local area, + // non-fixed objects can't be allocated right at the start of local area. + // We currently don't support filling in holes in between fixed sized + // objects, so we adjust 'Offset' to point to the end of last fixed sized + // preallocated object. + for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { + int64_t FixedOff; + if (StackGrowsDown) { + // The maximum distance from the stack pointer is at lower address of + // the object -- which is given by offset. For down growing stack + // the offset is negative, so we negate the offset to get the distance. + FixedOff = -MFI->getObjectOffset(i); + } else { + // The maximum distance from the start pointer is at the upper + // address of the object. + FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i); + } + if (FixedOff > Offset) Offset = FixedOff; + } + + // First assign frame offsets to stack objects that are used to spill + // callee saved registers. + if (StackGrowsDown) { + for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) { + // If the stack grows down, we need to add the size to find the lowest + // address of the object. + Offset += MFI->getObjectSize(i); + + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = RoundUpToAlignment(Offset, Align, Skew); + + MFI->setObjectOffset(i, -Offset); // Set the computed offset + } + } else { + int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex; + for (int i = MaxCSFI; i >= MinCSFI ; --i) { + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = RoundUpToAlignment(Offset, Align, Skew); + + MFI->setObjectOffset(i, Offset); + Offset += MFI->getObjectSize(i); + } + } + + unsigned MaxAlign = MFI->getMaxAlignment(); + + // Make sure the special register scavenging spill slot is closest to the + // incoming stack pointer if a frame pointer is required and is closer + // to the incoming rather than the final stack pointer. 
+ const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo(); + bool EarlyScavengingSlots = (TFI.hasFP(Fn) && + TFI.isFPCloseToIncomingSP() && + RegInfo->useFPForScavengingIndex(Fn) && + !RegInfo->needsStackRealignment(Fn)); + if (RS && EarlyScavengingSlots) { + SmallVector<int, 2> SFIs; + RS->getScavengingFrameIndices(SFIs); + for (SmallVectorImpl<int>::iterator I = SFIs.begin(), + IE = SFIs.end(); I != IE; ++I) + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); + } + + // FIXME: Once this is working, then enable flag will change to a target + // check for whether the frame is large enough to want to use virtual + // frame index registers. Functions which don't want/need this optimization + // will continue to use the existing code path. + if (MFI->getUseLocalStackAllocationBlock()) { + unsigned Align = MFI->getLocalFrameMaxAlign(); + + // Adjust to alignment boundary. + Offset = RoundUpToAlignment(Offset, Align, Skew); + + DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); + + // Resolve offsets for objects in the local block. + for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) { + std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i); + int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second; + DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << + FIOffset << "]\n"); + MFI->setObjectOffset(Entry.first, FIOffset); + } + // Allocate the local block + Offset += MFI->getLocalFrameSize(); + + MaxAlign = std::max(Align, MaxAlign); + } + + // Make sure that the stack protector comes before the local variables on the + // stack. + SmallSet<int, 16> ProtectedObjs; + if (MFI->getStackProtectorIndex() >= 0) { + StackObjSet LargeArrayObjs; + StackObjSet SmallArrayObjs; + StackObjSet AddrOfObjs; + + AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown, + Offset, MaxAlign, Skew); + + // Assign large stack objects first. + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) + continue; + if (RS && RS->isScavengingFrameIndex((int)i)) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + + switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) { + case StackProtector::SSPLK_None: + continue; + case StackProtector::SSPLK_SmallArray: + SmallArrayObjs.insert(i); + continue; + case StackProtector::SSPLK_AddrOf: + AddrOfObjs.insert(i); + continue; + case StackProtector::SSPLK_LargeArray: + LargeArrayObjs.insert(i); + continue; + } + llvm_unreachable("Unexpected SSPLayoutKind."); + } + + AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + } + + // Then assign frame offsets to stack objects that are not used to spill + // callee saved registers. 
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) + continue; + if (RS && RS->isScavengingFrameIndex((int)i)) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + if (ProtectedObjs.count(i)) + continue; + + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + } + + // Make sure the special register scavenging spill slot is closest to the + // stack pointer. + if (RS && !EarlyScavengingSlots) { + SmallVector<int, 2> SFIs; + RS->getScavengingFrameIndices(SFIs); + for (SmallVectorImpl<int>::iterator I = SFIs.begin(), + IE = SFIs.end(); I != IE; ++I) + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); + } + + if (!TFI.targetHandlesStackFrameRounding()) { + // If we have reserved argument space for call sites in the function + // immediately on entry to the current function, count it as part of the + // overall stack size. + if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn)) + Offset += MFI->getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || + (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0)) + StackAlign = TFI.getStackAlignment(); + else + StackAlign = TFI.getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. + StackAlign = std::max(StackAlign, MaxAlign); + Offset = RoundUpToAlignment(Offset, StackAlign, Skew); + } + + // Update frame info to pretend that this is part of the stack... + int64_t StackSize = Offset - LocalAreaOffset; + MFI->setStackSize(StackSize); + NumBytesStackSpace += StackSize; +} + +/// insertPrologEpilogCode - Scan the function for modified callee saved +/// registers, insert spill code for these callee saved registers, then add +/// prolog and epilog code to the function. +/// +void WasmPEI::insertPrologEpilogCode(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + + // Add prologue to the function... + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.emitPrologue(Fn, *SaveBlock); + + // Add epilogue to restore the callee-save registers in each exiting block. + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) + TFI.emitEpilogue(Fn, *RestoreBlock); + + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.inlineStackProbe(Fn, *SaveBlock); + + // Emit additional code that is required to support segmented stacks, if + // we've been asked for it. This, when linked with a runtime with support + // for segmented stacks (libgcc is one), will result in allocating stack + // space in small chunks instead of one large contiguous block. + if (Fn.shouldSplitStack()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForSegmentedStacks(Fn, *SaveBlock); + } + + // Emit additional code that is required to explicitly handle the stack in + // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. 
The + // approach is rather similar to that of Segmented Stacks, but it uses a + // different conditional check and another BIF for allocating more stack + // space. + if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE) + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForHiPEPrologue(Fn, *SaveBlock); +} + +/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical +/// register references and actual offsets. +/// +void WasmPEI::replaceFrameIndices(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + if (!TFI.needsFrameIndexResolution(Fn)) return; + + // Store SPAdj at exit of a basic block. + SmallVector<int, 8> SPState; + SPState.resize(Fn.getNumBlockIDs()); + SmallPtrSet<MachineBasicBlock*, 8> Reachable; + + // Iterate over the reachable blocks in DFS order. + for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable); + DFI != DFE; ++DFI) { + int SPAdj = 0; + // Check the exit state of the DFS stack predecessor. + if (DFI.getPathLength() >= 2) { + MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2); + assert(Reachable.count(StackPred) && + "DFS stack predecessor is already visited.\n"); + SPAdj = SPState[StackPred->getNumber()]; + } + MachineBasicBlock *BB = *DFI; + replaceFrameIndices(BB, Fn, SPAdj); + SPState[BB->getNumber()] = SPAdj; + } + + // Handle the unreachable blocks. + for (auto &BB : Fn) { + if (Reachable.count(&BB)) + // Already handled in DFS traversal. + continue; + int SPAdj = 0; + replaceFrameIndices(&BB, Fn, SPAdj); + } +} + +void WasmPEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, + int &SPAdj) { + assert(Fn.getSubtarget().getRegisterInfo() && + "getRegisterInfo() must be implemented!"); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); + + if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB); + + bool InsideCallSequence = false; + + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { + + if (I->getOpcode() == FrameSetupOpcode || + I->getOpcode() == FrameDestroyOpcode) { + InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); + SPAdj += TII.getSPAdjust(I); + + MachineBasicBlock::iterator PrevI = BB->end(); + if (I != BB->begin()) PrevI = std::prev(I); + TFI->eliminateCallFramePseudoInstr(Fn, *BB, I); + + // Visit the instructions created by eliminateCallFramePseudoInstr(). + if (PrevI == BB->end()) + I = BB->begin(); // The replaced instr was the first in the block. + else + I = std::next(PrevI); + continue; + } + + MachineInstr *MI = I; + bool DoIncr = true; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (!MI->getOperand(i).isFI()) + continue; + + // Frame indices in debug values are encoded in a target independent + // way with simply the frame index and offset rather than any + // target-specific addressing mode. 
+      if (MI->isDebugValue()) {
+        assert(i == 0 && "Frame indices can only appear as the first "
+                         "operand of a DBG_VALUE machine instruction");
+        unsigned Reg;
+        MachineOperand &Offset = MI->getOperand(1);
+        Offset.setImm(Offset.getImm() +
+                      TFI->getFrameIndexReference(
+                          Fn, MI->getOperand(0).getIndex(), Reg));
+        MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
+        continue;
+      }
+
+      // TODO: This code should be commoned with the code for
+      // PATCHPOINT. There's no good reason for the difference in
+      // implementation other than historical accident. The only
+      // remaining difference is the unconditional use of the stack
+      // pointer as the base register.
+      if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+        assert((!MI->isDebugValue() || i == 0) &&
+               "Frame indices can only appear as the first operand of a "
+               "DBG_VALUE machine instruction");
+        unsigned Reg;
+        MachineOperand &Offset = MI->getOperand(i + 1);
+        const unsigned refOffset =
+            TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
+                                              Reg);
+
+        Offset.setImm(Offset.getImm() + refOffset);
+        MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
+        continue;
+      }
+
+      // Some instructions (e.g. inline asm instructions) can have
+      // multiple frame indices and/or cause eliminateFrameIndex
+      // to insert more than one instruction. We need the register
+      // scavenger to go through all of these instructions so that
+      // it can update its register information. We keep the
+      // iterator at the point before insertion so that we can
+      // revisit them in full.
+      bool AtBeginning = (I == BB->begin());
+      if (!AtBeginning) --I;
+
+      // If this instruction has a FrameIndex operand, we need to
+      // use that target machine register info object to eliminate
+      // it.
+      TRI.eliminateFrameIndex(MI, SPAdj, i,
+                              FrameIndexVirtualScavenging ? nullptr : RS);
+
+      // Reset the iterator if we were at the beginning of the BB.
+      if (AtBeginning) {
+        I = BB->begin();
+        DoIncr = false;
+      }
+
+      MI = nullptr;
+      break;
+    }
+
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t the SP.
+    // Note that this must come after eliminateFrameIndex, because
+    // if I itself referred to a frame index, we shouldn't count its own
+    // adjustment.
+    if (MI && InsideCallSequence)
+      SPAdj += TII.getSPAdjust(MI);
+
+    if (DoIncr && I != BB->end()) ++I;
+
+    // Update register states.
+    if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
+  }
+}
+
+/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
+/// with physical registers. Use the register scavenger to find an
+/// appropriate register to use.
+///
+/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
+/// iterate over the vreg use list, which at this point only contains machine
+/// operands for which eliminateFrameIndex needs a new scratch reg.
+void
+WasmPEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
+  // Run through the instructions and find any virtual registers.
+  for (MachineFunction::iterator BB = Fn.begin(),
+       E = Fn.end(); BB != E; ++BB) {
+    RS->enterBasicBlock(&*BB);
+
+    int SPAdj = 0;
+
+    // The instruction stream may change in the loop, so check BB->end()
+    // directly.
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+      // We might end up here again with a NULL iterator if we scavenged a
+      // register for which we inserted spill code for definition by what was
+      // originally the first instruction in BB.
+      if (I == MachineBasicBlock::iterator(nullptr))
+        I = BB->begin();
+
+      MachineInstr *MI = I;
+      MachineBasicBlock::iterator J = std::next(I);
+      MachineBasicBlock::iterator P =
+          I == BB->begin() ? MachineBasicBlock::iterator(nullptr)
+                           : std::prev(I);
+
+      // RS should process this instruction before we might scavenge at this
+      // location. This is because we might be replacing a virtual register
+      // defined by this instruction, and if so, registers killed by this
+      // instruction are available, and defined registers are not.
+      RS->forward(I);
+
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (MI->getOperand(i).isReg()) {
+          MachineOperand &MO = MI->getOperand(i);
+          unsigned Reg = MO.getReg();
+          if (Reg == 0)
+            continue;
+          if (!TargetRegisterInfo::isVirtualRegister(Reg))
+            continue;
+
+          // When we first encounter a new virtual register, it
+          // must be a definition.
+          assert(MI->getOperand(i).isDef() &&
+                 "frame index virtual missing def!");
+          // Scavenge a new scratch register
+          const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
+          unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
+
+          ++NumScavengedRegs;
+
+          // Replace this reference to the virtual register with the
+          // scratch register.
+          assert(ScratchReg && "Missing scratch register!");
+          Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
+
+          // Because this instruction was processed by the RS before this
+          // register was allocated, make sure that the RS now records the
+          // register as being used.
+          RS->setRegUsed(ScratchReg);
+        }
+      }
+
+      // If the scavenger needed to use one of its spill slots, the
+      // spill code will have been inserted in between I and J. This is a
+      // problem because we need the spill code before I: Move I to just
+      // prior to J.
+      if (I != std::prev(J)) {
+        BB->splice(J, &*BB, I);
+
+        // Before we move I, we need to prepare the RS to visit I again.
+        // Specifically, RS will assert if it sees uses of registers that
+        // it believes are undefined. Because we have already processed
+        // register kills in I, when it visits I again, it will believe that
+        // those registers are undefined. To avoid this situation, unprocess
+        // the instruction I.
+        assert(RS->getCurrentPosition() == I &&
+               "The register scavenger has an unexpected position");
+        I = P;
+        RS->unprocess(P);
+      } else
+        ++I;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
new file mode 100644
index 0000000..4ad6eed
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -0,0 +1,86 @@
+//===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimizations ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Late peephole optimizations for WebAssembly.
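+///
+/// For example, when a store's result register would be the same virtual
+/// register as its value operand, the result is rewritten to a fresh dead
+/// register so it can be discarded (see the STORE cases below).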
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-peephole" + +namespace { +class WebAssemblyPeephole final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly late peephole optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; + WebAssemblyPeephole() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyPeephole::ID = 0; +FunctionPass *llvm::createWebAssemblyPeephole() { + return new WebAssemblyPeephole(); +} + +bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + + for (auto &MBB : MF) + for (auto &MI : MBB) + switch (MI.getOpcode()) { + default: + break; + case WebAssembly::STORE8_I32: + case WebAssembly::STORE16_I32: + case WebAssembly::STORE8_I64: + case WebAssembly::STORE16_I64: + case WebAssembly::STORE32_I64: + case WebAssembly::STORE_F32: + case WebAssembly::STORE_F64: + case WebAssembly::STORE_I32: + case WebAssembly::STORE_I64: { + // Store instructions return their value operand. If we ended up using + // the same register for both, replace it with a dead def so that it + // can use $discard instead. + MachineOperand &MO = MI.getOperand(0); + unsigned OldReg = MO.getReg(); + // TODO: Handle SP/physregs + if (OldReg == MI.getOperand(3).getReg() + && TargetRegisterInfo::isVirtualRegister(MI.getOperand(3).getReg())) { + Changed = true; + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + MO.setReg(NewReg); + MO.setIsDead(); + MFI.stackifyVReg(NewReg); + MFI.addWAReg(NewReg, WebAssemblyFunctionInfo::UnusedReg); + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp new file mode 100644 index 0000000..9ec6659 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -0,0 +1,175 @@ +//===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a virtual register coloring pass. +/// +/// WebAssembly doesn't have a fixed number of registers, but it is still +/// desirable to minimize the total number of registers used in each function. +/// +/// This code is modeled after lib/CodeGen/StackSlotColoring.cpp. 
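// Aside: a toy model of the rewrite in the STORE cases above: when the
// result operand names the same virtual register as the value operand, give
// the result a fresh register and mark it dead so it can become $discard.
// ToyStore and createVReg are illustrative stand-ins, not LLVM API.
#include <cassert>

struct ToyStore {
  unsigned ResultReg;          // operand 0: the value the store returns
  unsigned ValueReg;           // operand 3: the value being stored
  bool ResultIsDead = false;
};

unsigned NextVReg = 100;
unsigned createVReg() { return NextVReg++; } // stands in for MRI.createVirtualRegister

bool rewriteStoreResult(ToyStore &S) {
  if (S.ResultReg != S.ValueReg)
    return false;             // distinct registers; nothing to do
  S.ResultReg = createVReg(); // fresh register...
  S.ResultIsDead = true;      // ...that nothing reads, so it can be discarded
  return true;
}

int main() {
  ToyStore S{7, 7};
  assert(rewriteStoreResult(S) && S.ResultIsDead && S.ResultReg != 7);
}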
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-coloring" + +namespace { +class WebAssemblyRegColoring final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegColoring() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Register Coloring"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<LiveIntervals>(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: +}; +} // end anonymous namespace + +char WebAssemblyRegColoring::ID = 0; +FunctionPass *llvm::createWebAssemblyRegColoring() { + return new WebAssemblyRegColoring(); +} + +// Compute the total spill weight for VReg. +static float computeWeight(const MachineRegisterInfo *MRI, + const MachineBlockFrequencyInfo *MBFI, + unsigned VReg) { + float weight = 0.0f; + for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg)) + weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI, + MO.getParent()); + return weight; +} + +bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Register Coloring **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + // If there are calls to setjmp or sigsetjmp, don't perform coloring. Virtual + // registers could be modified before the longjmp is executed, resulting in + // the wrong value being used afterwards. (See <rdar://problem/8007500>.) + // TODO: Does WebAssembly need to care about setjmp for register coloring? + if (MF.exposesReturnsTwice()) + return false; + + MachineRegisterInfo *MRI = &MF.getRegInfo(); + LiveIntervals *Liveness = &getAnalysis<LiveIntervals>(); + const MachineBlockFrequencyInfo *MBFI = + &getAnalysis<MachineBlockFrequencyInfo>(); + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + + // Gather all register intervals into a list and sort them. + unsigned NumVRegs = MRI->getNumVirtRegs(); + SmallVector<LiveInterval *, 0> SortedIntervals; + SortedIntervals.reserve(NumVRegs); + + DEBUG(dbgs() << "Interesting register intervals:\n"); + for (unsigned i = 0; i < NumVRegs; ++i) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(i); + if (MFI.isVRegStackified(VReg)) + continue; + // Skip unused registers, which can use $discard. + if (MRI->use_empty(VReg)) + continue; + + LiveInterval *LI = &Liveness->getInterval(VReg); + assert(LI->weight == 0.0f); + LI->weight = computeWeight(MRI, MBFI, VReg); + DEBUG(LI->dump()); + SortedIntervals.push_back(LI); + } + DEBUG(dbgs() << '\n'); + + // Sort them to put arguments first (since we don't want to rename live-in + // registers), by weight next, and then by position. + // TODO: Investigate more intelligent sorting heuristics. For starters, we + // should try to coalesce adjacent live intervals before non-adjacent ones. 
+ std::sort(SortedIntervals.begin(), SortedIntervals.end(), + [MRI](LiveInterval *LHS, LiveInterval *RHS) { + if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg)) + return MRI->isLiveIn(LHS->reg); + if (LHS->weight != RHS->weight) + return LHS->weight > RHS->weight; + if (LHS->empty() || RHS->empty()) + return !LHS->empty() && RHS->empty(); + return *LHS < *RHS; + }); + + DEBUG(dbgs() << "Coloring register intervals:\n"); + SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u); + SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments( + SortedIntervals.size()); + BitVector UsedColors(SortedIntervals.size()); + bool Changed = false; + for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { + LiveInterval *LI = SortedIntervals[i]; + unsigned Old = LI->reg; + size_t Color = i; + const TargetRegisterClass *RC = MRI->getRegClass(Old); + + // Check if it's possible to reuse any of the used colors. + if (!MRI->isLiveIn(Old)) + for (int C(UsedColors.find_first()); C != -1; + C = UsedColors.find_next(C)) { + if (MRI->getRegClass(SortedIntervals[C]->reg) != RC) + continue; + for (LiveInterval *OtherLI : Assignments[C]) + if (!OtherLI->empty() && OtherLI->overlaps(*LI)) + goto continue_outer; + Color = C; + break; + continue_outer:; + } + + unsigned New = SortedIntervals[Color]->reg; + SlotMapping[i] = New; + Changed |= Old != New; + UsedColors.set(Color); + Assignments[Color].push_back(LI); + DEBUG(dbgs() << "Assigning vreg" + << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg" + << TargetRegisterInfo::virtReg2Index(New) << "\n"); + } + if (!Changed) + return false; + + // Rewrite register operands. + for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { + unsigned Old = SortedIntervals[i]->reg; + unsigned New = SlotMapping[i]; + if (Old != New) + MRI->replaceRegWith(Old, New); + } + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp new file mode 100644 index 0000000..f621db0 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp @@ -0,0 +1,109 @@ +//===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a pass which assigns WebAssembly register +/// numbers for CodeGen virtual registers. 
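// Aside: the coloring loop above is a greedy scan over sorted intervals:
// each one takes the first compatible existing color (no overlap with any
// member) or opens a new color. A standalone model over half-open ranges
// (toy Interval type, not LiveInterval):
#include <cassert>
#include <cstddef>
#include <vector>

struct Interval { unsigned Start, End; }; // half-open [Start, End)

bool overlaps(const Interval &A, const Interval &B) {
  return A.Start < B.End && B.Start < A.End;
}

// For each interval, returns the index of the interval whose color it
// shares (itself if it opened a new color), mirroring SlotMapping above.
std::vector<std::size_t> greedyColor(const std::vector<Interval> &Sorted) {
  std::vector<std::size_t> ColorOf(Sorted.size());
  std::vector<std::vector<std::size_t>> Assignments; // members per color
  std::vector<std::size_t> ColorRep;                 // first interval per color
  for (std::size_t i = 0; i != Sorted.size(); ++i) {
    std::size_t Color = Assignments.size();
    for (std::size_t C = 0; C != Assignments.size(); ++C) {
      bool Clash = false;
      for (std::size_t Member : Assignments[C])
        if (overlaps(Sorted[i], Sorted[Member])) { Clash = true; break; }
      if (!Clash) { Color = C; break; }
    }
    if (Color == Assignments.size()) { // no reusable color; open a new one
      Assignments.emplace_back();
      ColorRep.push_back(i);
    }
    Assignments[Color].push_back(i);
    ColorOf[i] = ColorRep[Color];
  }
  return ColorOf;
}

int main() {
  // {10,20} doesn't overlap {0,10} (half-open), so it reuses color 0.
  std::vector<Interval> S = {{0, 10}, {5, 15}, {10, 20}};
  std::vector<std::size_t> C = greedyColor(S);
  assert(C[0] == 0 && C[1] == 1 && C[2] == 0);
}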
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-numbering" + +namespace { +class WebAssemblyRegNumbering final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Register Numbering"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegNumbering() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyRegNumbering::ID = 0; +FunctionPass *llvm::createWebAssemblyRegNumbering() { + return new WebAssemblyRegNumbering(); +} + +bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Register Numbering **********\n" + "********** Function: " + << MF.getName() << '\n'); + + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo &FrameInfo = *MF.getFrameInfo(); + + MFI.initWARegs(); + + // WebAssembly argument registers are in the same index space as local + // variables. Assign the numbers for them first. + MachineBasicBlock &EntryMBB = MF.front(); + for (MachineInstr &MI : EntryMBB) { + switch (MI.getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + MFI.setWAReg(MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); + break; + default: + break; + } + } + + // Then assign regular WebAssembly registers for all remaining used + // virtual registers. TODO: Consider sorting the registers by frequency of + // use, to maximize usage of small immediate fields. + unsigned NumArgRegs = MFI.getParams().size(); + unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs(); + unsigned NumStackRegs = 0; + unsigned CurReg = 0; + for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx); + // Handle stackified registers. + if (MFI.isVRegStackified(VReg)) { + MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++); + continue; + } + // Skip unused registers. 
+ if (MRI.use_empty(VReg)) + continue; + if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) + MFI.setWAReg(VReg, NumArgRegs + CurReg++); + } + // Allocate locals for used physical registers + if (FrameInfo.getStackSize() > 0) + MFI.addPReg(WebAssembly::SP32, CurReg++); + + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp new file mode 100644 index 0000000..537c147 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -0,0 +1,267 @@ +//===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a register stacking pass. +/// +/// This pass reorders instructions to put register uses and defs in an order +/// such that they form single-use expression trees. Registers fitting this form +/// are then marked as "stackified", meaning references to them are replaced by +/// "push" and "pop" from the stack. +/// +/// This is primarily a code size optimization, since temporary values on the +/// expression don't need to be named. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_* +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-stackify" + +namespace { +class WebAssemblyRegStackify final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Register Stackify"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<LiveIntervals>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreservedID(MachineDominatorsID); + AU.addPreservedID(LiveVariablesID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegStackify() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyRegStackify::ID = 0; +FunctionPass *llvm::createWebAssemblyRegStackify() { + return new WebAssemblyRegStackify(); +} + +// Decorate the given instruction with implicit operands that enforce the +// expression stack ordering constraints for an instruction which is on +// the expression stack. +static void ImposeStackOrdering(MachineInstr *MI) { + // Write the opaque EXPR_STACK register. + if (!MI->definesRegister(WebAssembly::EXPR_STACK)) + MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, + /*isDef=*/true, + /*isImp=*/true)); + + // Also read the opaque EXPR_STACK register. 
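// Aside: stepping back to the numbering loop in WebAssemblyRegNumbering
// above, a standalone model of its scheme: argument registers keep their
// argument index, stackified registers get ids tagged with the INT32_MIN
// bit, and the rest are numbered densely after the arguments. VRegInfo and
// numberRegs are toy stand-ins, not LLVM types.
#include <cstddef>
#include <cstdint>
#include <vector>

struct VRegInfo {
  bool IsStackified = false;
  bool IsUsed = true;
  int ArgIndex = -1; // >= 0 if this vreg holds an incoming argument
};

std::vector<uint32_t> numberRegs(const std::vector<VRegInfo> &VRegs,
                                 uint32_t NumArgRegs) {
  std::vector<uint32_t> WAReg(VRegs.size(), UINT32_MAX); // unassigned
  uint32_t NumStackRegs = 0, CurReg = 0;
  for (std::size_t i = 0; i != VRegs.size(); ++i) {
    if (VRegs[i].ArgIndex >= 0)
      WAReg[i] = uint32_t(VRegs[i].ArgIndex);            // args share the local index space
    else if (VRegs[i].IsStackified)
      WAReg[i] = uint32_t(INT32_MIN) | NumStackRegs++;   // tagged, not a real local
    else if (VRegs[i].IsUsed)
      WAReg[i] = NumArgRegs + CurReg++;                  // dense numbering after the args
  }
  return WAReg;
}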
+  if (!MI->readsRegister(WebAssembly::EXPR_STACK))
+    MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK,
+                                             /*isDef=*/false,
+                                             /*isImp=*/true));
+}
+
+// Test whether it's safe to move Def to just before Insert.
+// TODO: Compute memory dependencies in a way that doesn't require always
+// walking the block.
+// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
+// more precise.
+static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
+                         AliasAnalysis &AA, LiveIntervals &LIS,
+                         MachineRegisterInfo &MRI) {
+  assert(Def->getParent() == Insert->getParent());
+  bool SawStore = false, SawSideEffects = false;
+  MachineBasicBlock::const_iterator D(Def), I(Insert);
+
+  // Check for register dependencies.
+  for (const MachineOperand &MO : Def->operands()) {
+    if (!MO.isReg() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+
+    // If the register is dead here and at Insert, ignore it.
+    if (MO.isDead() && Insert->definesRegister(Reg) &&
+        !Insert->readsRegister(Reg))
+      continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // If the physical register is never modified, ignore it.
+      if (!MRI.isPhysRegModified(Reg))
+        continue;
+      // Otherwise, it's a physical register with unknown liveness.
+      return false;
+    }
+
+    // Ask LiveIntervals whether moving this virtual register use or def to
+    // Insert will change which value numbers are seen.
+    const LiveInterval &LI = LIS.getInterval(Reg);
+    VNInfo *DefVNI = MO.isDef() ?
+        LI.getVNInfoAt(LIS.getInstructionIndex(Def).getRegSlot()) :
+        LI.getVNInfoBefore(LIS.getInstructionIndex(Def));
+    assert(DefVNI && "Instruction input missing value number");
+    VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(Insert));
+    if (InsVNI && DefVNI != InsVNI)
+      return false;
+  }
+
+  // Check for memory dependencies and side effects.
+  for (--I; I != D; --I)
+    SawSideEffects |= I->isSafeToMove(&AA, SawStore);
+  return !(SawStore && Def->mayLoad() && !Def->isInvariantLoad(&AA)) &&
+         !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore));
+}
+
+bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Register Stackifying **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+  // Walk the instructions from the bottom up. Currently we don't look past
+  // block boundaries, and the blocks aren't ordered so the block visitation
+  // order isn't significant, but we may want to change this in the future.
+  for (MachineBasicBlock &MBB : MF) {
+    // Don't use a range-based for loop, because we modify the list as we're
+    // iterating over it and the end iterator may change.
+    for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) {
+      MachineInstr *Insert = &*MII;
+      // Don't nest anything inside a phi.
+      if (Insert->getOpcode() == TargetOpcode::PHI)
+        break;
+
+      // Don't nest anything inside an inline asm, because we don't have
+      // constraints for $push inputs.
+      if (Insert->getOpcode() == TargetOpcode::INLINEASM)
+        break;
+
+      // Iterate through the inputs in reverse order, since we'll be pulling
+      // operands off the stack in LIFO order.
+      bool AnyStackified = false;
+      for (MachineOperand &Op : reverse(Insert->uses())) {
+        // We're only interested in explicit virtual register operands.
+        if (!Op.isReg() || Op.isImplicit() || !Op.isUse())
+          continue;
+
+        unsigned Reg = Op.getReg();
+
+        // Only consider registers with a single definition.
+        // TODO: Eventually we may relax this, to stackify phi transfers.
+        MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+        if (!Def)
+          continue;
+
+        // There's no use in nesting implicit defs inside anything.
+        if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+          continue;
+
+        // Don't nest an INLINE_ASM def into anything, because we don't have
+        // constraints for $pop outputs.
+        if (Def->getOpcode() == TargetOpcode::INLINEASM)
+          continue;
+
+        // Don't nest PHIs inside of anything.
+        if (Def->getOpcode() == TargetOpcode::PHI)
+          continue;
+
+        // Argument instructions represent live-in registers and not real
+        // instructions.
+        if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_I64 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_F32 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_F64)
+          continue;
+
+        // Single-use expression trees require defs that have one use.
+        // TODO: Eventually we'll relax this, to take advantage of set_local
+        // returning its result.
+        if (!MRI.hasOneUse(Reg))
+          continue;
+
+        // For now, be conservative and don't look across block boundaries.
+        // TODO: Be more aggressive?
+        if (Def->getParent() != &MBB)
+          continue;
+
+        // Don't move instructions that have side effects or memory dependencies
+        // or other complications.
+        if (!IsSafeToMove(Def, Insert, AA, LIS, MRI))
+          continue;
+
+        Changed = true;
+        AnyStackified = true;
+        // Move the def down and nest it in the current instruction.
+        MBB.splice(Insert, &MBB, Def);
+        LIS.handleMove(Def);
+        MFI.stackifyVReg(Reg);
+        ImposeStackOrdering(Def);
+        Insert = Def;
+      }
+      if (AnyStackified)
+        ImposeStackOrdering(&*MII);
+    }
+  }
+
+  // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere
+  // so that it never looks like a use-before-def.
+  if (Changed) {
+    MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK);
+    for (MachineBasicBlock &MBB : MF)
+      MBB.addLiveIn(WebAssembly::EXPR_STACK);
+  }
+
+#ifndef NDEBUG
+  // Verify that pushes and pops are performed in LIFO order.
+  SmallVector<unsigned, 0> Stack;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : reverse(MI.explicit_operands())) {
+        if (!MO.isReg())
+          continue;
+        unsigned VReg = MO.getReg();
+
+        // Don't stackify physregs like SP or FP.
+        if (!TargetRegisterInfo::isVirtualRegister(VReg))
+          continue;
+
+        if (MFI.isVRegStackified(VReg)) {
+          if (MO.isDef())
+            Stack.push_back(VReg);
+          else
+            assert(Stack.pop_back_val() == VReg);
+        }
+      }
+    }
+    // TODO: Generalize this code to support keeping values on the stack across
+    // basic block boundaries.
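// Aside: the #ifndef NDEBUG check above is a classic stack-discipline
// verification: defs of stackified registers push, uses pop, and each pop
// must match the most recent push. A standalone sketch with toy events:
#include <cassert>
#include <vector>

struct StackEvent { bool IsDef; unsigned VReg; };

// Returns true iff the events respect LIFO order and leave the stack empty,
// mirroring the assertions in the block above.
bool respectsLIFOOrder(const std::vector<StackEvent> &Events) {
  std::vector<unsigned> Stack;
  for (const StackEvent &E : Events) {
    if (E.IsDef) {
      Stack.push_back(E.VReg);
    } else {
      if (Stack.empty() || Stack.back() != E.VReg)
        return false;
      Stack.pop_back();
    }
  }
  return Stack.empty();
}

int main() {
  // push a, push b, pop b, pop a -- valid LIFO nesting.
  assert(respectsLIFOOrder({{true, 1}, {true, 2}, {false, 2}, {false, 1}}));
  // push a, push b, pop a -- violates LIFO.
  assert(!respectsLIFOOrder({{true, 1}, {true, 2}, {false, 1}}));
}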
+ assert(Stack.empty()); + } +#endif + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 385c40b..90d8dda 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -43,7 +43,7 @@ WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const { } BitVector -WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { +WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const { BitVector Reserved(getNumRegs()); for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32, WebAssembly::FP64}) @@ -52,9 +52,43 @@ WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } void WebAssemblyRegisterInfo::eliminateFrameIndex( - MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("WebAssemblyRegisterInfo::eliminateFrameIndex"); // FIXME + MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, RegScavenger * /*RS*/) const { + assert(SPAdj == 0); + MachineInstr &MI = *II; + + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + const MachineFrameInfo& MFI = *MF.getFrameInfo(); + int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex); + + if (MI.mayLoadOrStore()) { + // If this is a load or store, make it relative to SP and fold the frame + // offset directly in. + assert(FrameOffset >= 0 && MI.getOperand(1).getImm() >= 0); + int64_t Offset = MI.getOperand(1).getImm() + FrameOffset; + + if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max()) { + // If this happens the program is invalid, but better to error here than + // generate broken code. + report_fatal_error("Memory offset field overflow"); + } + MI.getOperand(1).setImm(Offset); + MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false); + } else { + // Otherwise create an i32.add SP, offset and make it the operand. + auto &MRI = MF.getRegInfo(); + const auto *TII = MF.getSubtarget().getInstrInfo(); + + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(FrameOffset); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::ADD_I32), OffsetReg) + .addReg(WebAssembly::SP32) + .addReg(OffsetReg); + MI.getOperand(FIOperandNum).ChangeToRegister(OffsetReg, /*IsDef=*/false); + } } unsigned @@ -67,21 +101,11 @@ WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return Regs[TFI->hasFP(MF)][TT.isArch64Bit()]; } -bool WebAssemblyRegisterInfo::canRealignStack(const MachineFunction &MF) const { - return !MF.getFunction()->hasFnAttribute("no-realign-stack"); -} - -// FIXME: share this with other backends with identical implementation? 
-bool WebAssemblyRegisterInfo::needsStackRealignment( - const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); +const TargetRegisterClass * +WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + assert(Kind == 0 && "Only one kind of pointer on WebAssembly"); + if (MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()) + return &WebAssembly::I64RegClass; + return &WebAssembly::I32RegClass; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index dbdb9d0..ad1d71e 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -42,9 +42,9 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; - // Base pointer (stack realignment) support. - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 2ba42eb..80a83fa 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -33,22 +33,26 @@ def FP64 : WebAssemblyReg<"%FP64">; def SP32 : WebAssemblyReg<"%SP32">; def SP64 : WebAssemblyReg<"%SP64">; -// TODO(jfb) The following comes from NVPTX. Is it really needed, or can we do -// away with it? Try deleting once the backend works. -// WebAssembly uses virtual registers, but the backend defines a few physical -// registers here to keep SDAG and the MachineInstr layers happy. -foreach i = 0-4 in { - def I#i : WebAssemblyReg<"%i."#i>; // i32 - def L#i : WebAssemblyReg<"%l."#i>; // i64 - def F#i : WebAssemblyReg<"%f."#i>; // f32 - def D#i : WebAssemblyReg<"%d."#i>; // f64 -} +// The register allocation framework requires register classes have at least +// one register, so we define a few for the floating point register classes +// since we otherwise don't need a physical register in those classes. +def F32_0 : WebAssemblyReg<"%f32.0">; +def F64_0 : WebAssemblyReg<"%f64.0">; + +// The expression stack "register". This is an opaque entity which serves to +// order uses and defs that must remain in LIFO order. +def EXPR_STACK : WebAssemblyReg<"STACK">; + +// The incoming arguments "register". This is an opaque entity which serves to +// order the ARGUMENT instructions that are emulating live-in registers and +// must not be scheduled below other instructions. 
+def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">; //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// -def Int32 : WebAssemblyRegClass<[i32], 32, (add (sequence "I%u", 0, 4), SP32)>; -def Int64 : WebAssemblyRegClass<[i64], 64, (add (sequence "L%u", 0, 4), SP64)>; -def Float32 : WebAssemblyRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; -def Float64 : WebAssemblyRegClass<[f64], 64, (add (sequence "D%u", 0, 4))>; +def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>; +def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>; +def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>; +def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp new file mode 100644 index 0000000..4e08b2b --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -0,0 +1,124 @@ +//===-- WebAssemblyStoreResults.cpp - Optimize using store result values --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements an optimization pass using store result values. +/// +/// WebAssembly's store instructions return the stored value. This is to enable +/// an optimization wherein uses of the stored value can be replaced by uses of +/// the store's result value, making the stored value register more likely to +/// be single-use, thus more likely to be useful to register stackifying, and +/// potentially also exposing the store to register stackifying. These both can +/// reduce get_local/set_local traffic. 
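// Aside: a toy model of the rewrite this pass performs. In this
// single-block sketch, "dominates" degenerates to instruction order: any
// use of FromReg after the store may read the store's result ToReg instead.
// ToyUse and rewriteUses are illustrative stand-ins, not LLVM types.
#include <cassert>
#include <cstddef>
#include <vector>

struct ToyUse { std::size_t InstIndex; unsigned Reg; };

// StoreIndex: position of the store; ToReg: its result register.
unsigned rewriteUses(std::vector<ToyUse> &Uses, std::size_t StoreIndex,
                     unsigned FromReg, unsigned ToReg) {
  unsigned Changed = 0;
  for (ToyUse &U : Uses)
    if (U.Reg == FromReg && U.InstIndex > StoreIndex) { // "dominated" here
      U.Reg = ToReg;
      ++Changed;
    }
  return Changed;
}

int main() {
  std::vector<ToyUse> Uses = {{0, 5}, {3, 5}, {4, 9}};
  assert(rewriteUses(Uses, 1, 5, 8) == 1);
  assert(Uses[0].Reg == 5 && Uses[1].Reg == 8 && Uses[2].Reg == 9);
}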
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-store-results" + +namespace { +class WebAssemblyStoreResults final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyStoreResults() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Store Results"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: +}; +} // end anonymous namespace + +char WebAssemblyStoreResults::ID = 0; +FunctionPass *llvm::createWebAssemblyStoreResults() { + return new WebAssemblyStoreResults(); +} + +bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Store Results **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + bool Changed = false; + + assert(MRI.isSSA() && "StoreResults depends on SSA form"); + + for (auto &MBB : MF) { + DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n'); + for (auto &MI : MBB) + switch (MI.getOpcode()) { + default: + break; + case WebAssembly::STORE8_I32: + case WebAssembly::STORE16_I32: + case WebAssembly::STORE8_I64: + case WebAssembly::STORE16_I64: + case WebAssembly::STORE32_I64: + case WebAssembly::STORE_F32: + case WebAssembly::STORE_F64: + case WebAssembly::STORE_I32: + case WebAssembly::STORE_I64: + unsigned ToReg = MI.getOperand(0).getReg(); + unsigned FromReg = MI.getOperand(3).getReg(); + for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) { + MachineOperand &O = *I++; + MachineInstr *Where = O.getParent(); + if (Where->getOpcode() == TargetOpcode::PHI) { + // PHIs use their operands on their incoming CFG edges rather than + // in their parent blocks. Get the basic block paired with this use + // of FromReg and check that MI's block dominates it. + MachineBasicBlock *Pred = + Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB(); + if (!MDT.dominates(&MBB, Pred)) + continue; + } else { + // For a non-PHI, check that MI dominates the instruction in the + // normal way. + if (&MI == Where || !MDT.dominates(&MI, Where)) + continue; + } + Changed = true; + DEBUG(dbgs() << "Setting operand " << O << " in " << *Where + << " from " << MI << "\n"); + O.setReg(ToReg); + // If the store's def was previously dead, it is no longer. But the + // dead flag shouldn't be set yet. 
+ assert(!MI.getOperand(0).isDead() && "Dead flag set on store result"); + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 3d9e7aa..cb2d5a6 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -46,3 +46,4 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableMachineScheduler() const { return true; } +bool WebAssemblySubtarget::useAA() const { return true; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index 6f17619..f530a29 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -61,9 +61,15 @@ public: const WebAssemblyTargetLowering *getTargetLowering() const override { return &TLInfo; } + const WebAssemblyInstrInfo *getInstrInfo() const override { + return &InstrInfo; + } + const WebAssemblyRegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override; - bool useAA() const override { return true; } + bool useAA() const override; // Predicates used by WebAssemblyInstrInfo.td. bool hasAddr64() const { return TargetTriple.isArch64Bit(); } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 6f93248..b290b4b 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -45,11 +45,17 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT.isArch64Bit() - ? "e-p:64:64-i64:64-v128:8:128-n32:64-S128" - : "e-p:32:32-i64:64-v128:8:128-n32:64-S128", + : LLVMTargetMachine(T, + TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128" + : "e-m:e-p:32:32-i64:64-n32:64-S128", TT, CPU, FS, Options, RM, CM, OL), TLOF(make_unique<WebAssemblyTargetObjectFile>()) { + // WebAssembly type-checks expressions, but a noreturn function with a return + // type that doesn't match the context will cause a check failure. So we lower + // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's + // 'unreachable' expression which is meant for that case. + this->Options.TrapUnreachable = true; + initAsmInfo(); // We need a reducible CFG, so disable some optimizations which tend to @@ -77,7 +83,7 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
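// Aside: a best-effort decoding of the new datalayout strings above, per my
// reading of the LLVM DataLayout grammar (worth double-checking against
// docs/LangRef):
//   e                  little-endian
//   m:e                ELF-style symbol mangling (new in this patch)
//   p:32:32 / p:64:64  pointer size and ABI alignment, in bits
//   i64:64             i64 is 64-bit aligned
//   n32:64             native integer widths are 32 and 64 bits
//   S128               natural stack alignment is 128 bits
// The v128:8:128 vector entry from the old strings is gone.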
resetTargetOptions(F); - I = make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); + I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); } return I.get(); } @@ -94,23 +100,18 @@ public: } FunctionPass *createTargetRegisterAllocator(bool) override; - void addFastRegAlloc(FunctionPass *RegAllocPass) override; - void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addIRPasses() override; - bool addPreISel() override; bool addInstSelector() override; bool addILPOpts() override; void addPreRegAlloc() override; - void addRegAllocPasses(bool Optimized); void addPostRegAlloc() override; - void addPreSched2() override; void addPreEmitPass() override; }; } // end anonymous namespace TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(WebAssemblyTTIImpl(this, F)); }); } @@ -124,50 +125,86 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -void WebAssemblyPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "WebAssembly uses no regalloc!"); - addRegAllocPasses(false); -} - -void WebAssemblyPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "WebAssembly uses no regalloc!"); - addRegAllocPasses(true); -} - //===----------------------------------------------------------------------===// // The following functions are called from lib/CodeGen/Passes.cpp to modify // the CodeGen pass sequence. //===----------------------------------------------------------------------===// void WebAssemblyPassConfig::addIRPasses() { - // FIXME: the default for this option is currently POSIX, whereas - // WebAssembly's MVP should default to Single. if (TM->Options.ThreadModel == ThreadModel::Single) + // In "single" mode, atomics get lowered to non-atomics. addPass(createLowerAtomicPass()); else // Expand some atomic operations. WebAssemblyTargetLowering has hooks which // control specifically what gets lowered. addPass(createAtomicExpandPass(TM)); + // Optimize "returned" function attributes. + addPass(createWebAssemblyOptimizeReturned()); + TargetPassConfig::addIRPasses(); } -bool WebAssemblyPassConfig::addPreISel() { return false; } - bool WebAssemblyPassConfig::addInstSelector() { + (void)TargetPassConfig::addInstSelector(); addPass( createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel())); + // Run the argument-move pass immediately after the ScheduleDAG scheduler + // so that we can fix up the ARGUMENT instructions before anything else + // sees them in the wrong place. + addPass(createWebAssemblyArgumentMove()); return false; } -bool WebAssemblyPassConfig::addILPOpts() { return true; } +bool WebAssemblyPassConfig::addILPOpts() { + (void)TargetPassConfig::addILPOpts(); + return true; +} + +void WebAssemblyPassConfig::addPreRegAlloc() { + TargetPassConfig::addPreRegAlloc(); -void WebAssemblyPassConfig::addPreRegAlloc() {} + // Prepare store instructions for register stackifying. + addPass(createWebAssemblyStoreResults()); +} -void WebAssemblyPassConfig::addRegAllocPasses(bool Optimized) {} +void WebAssemblyPassConfig::addPostRegAlloc() { + // TODO: The following CodeGen passes don't currently support code containing + // virtual registers. Consider removing their restrictions and re-enabling + // them. 
+ // + // We use our own PrologEpilogInserter which is very slightly modified to + // tolerate virtual registers. + disablePass(&PrologEpilogCodeInserterID); + // Fails with: should be run after register allocation. + disablePass(&MachineCopyPropagationID); + + // Mark registers as representing wasm's expression stack. + addPass(createWebAssemblyRegStackify()); + + // Run the register coloring pass to reduce the total number of registers. + addPass(createWebAssemblyRegColoring()); + + TargetPassConfig::addPostRegAlloc(); + + // Run WebAssembly's version of the PrologEpilogInserter. Target-independent + // PEI runs after PostRegAlloc and after ShrinkWrap. Putting it here will run + // PEI before ShrinkWrap but otherwise in the same position in the order. + addPass(createWebAssemblyPEI()); +} -void WebAssemblyPassConfig::addPostRegAlloc() {} +void WebAssemblyPassConfig::addPreEmitPass() { + TargetPassConfig::addPreEmitPass(); -void WebAssemblyPassConfig::addPreSched2() {} + // Put the CFG in structured form; insert BLOCK and LOOP markers. + addPass(createWebAssemblyCFGStackify()); -void WebAssemblyPassConfig::addPreEmitPass() {} + // Lower br_unless into br_if. + addPass(createWebAssemblyLowerBrUnless()); + + // Create a mapping from LLVM CodeGen virtual registers to wasm registers. + addPass(createWebAssemblyRegNumbering()); + + // Perform the very last peephole optimizations on the code. + addPass(createWebAssemblyPeephole()); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp new file mode 100644 index 0000000..74e33b9 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp @@ -0,0 +1,24 @@ +//===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the functions of the WebAssembly-specific subclass +/// of TargetLoweringObjectFile. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyTargetObjectFile.h" +#include "WebAssemblyTargetMachine.h" +using namespace llvm; + +void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h index ee78b94..39e50c9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h @@ -16,50 +16,13 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H -#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" namespace llvm { -class GlobalVariable; - -class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFile { +class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF { public: - WebAssemblyTargetObjectFile() { - TextSection = nullptr; - DataSection = nullptr; - BSSSection = nullptr; - ReadOnlySection = nullptr; - - StaticCtorSection = nullptr; - StaticDtorSection = nullptr; - LSDASection = nullptr; - EHFrameSection = nullptr; - DwarfAbbrevSection = nullptr; - DwarfInfoSection = nullptr; - DwarfLineSection = nullptr; - DwarfFrameSection = nullptr; - DwarfPubTypesSection = nullptr; - DwarfDebugInlineSection = nullptr; - DwarfStrSection = nullptr; - DwarfLocSection = nullptr; - DwarfARangesSection = nullptr; - DwarfRangesSection = nullptr; - } - - MCSection *getSectionForConstant(SectionKind Kind, - const Constant *C) const override { - return ReadOnlySection; - } - - MCSection *getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override { - return DataSection; - } - - MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override; + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index fa88ed5..3566317 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -21,8 +21,7 @@ using namespace llvm; #define DEBUG_TYPE "wasmtti" TargetTransformInfo::PopcntSupportKind -WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) { +WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - // TODO: Make Math.popcount32 happen in WebAssembly. 
- return TTI::PSK_Software; + return TargetTransformInfo::PSK_FastHardware; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 7ffb604..26dc388 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -38,7 +38,7 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { const WebAssemblyTargetLowering *getTLI() const { return TLI; } public: - WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, Function &F) + WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -54,7 +54,7 @@ public: // TODO: Implement more Scalar TTI for WebAssembly - TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; /// @} diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt new file mode 100644 index 0000000..91b3fff --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -0,0 +1,309 @@ +# Tests which are known to fail from the GCC torture test suite. + +# Core dump. +920908-1.c +pr38151.c +va-arg-22.c + +# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed. +struct-ret-1.c +va-arg-11.c +va-arg-21.c +va-arg-24.c +va-arg-trap-1.c + +# WebAssemblyCFGStackify.cpp:211: void SortBlocks(llvm::MachineFunction&, const llvm::MachineLoopInfo&): Assertion `L->contains( MLI.getLoopFor(&*prev(MachineFunction::iterator(&MBB)))) && "Loop isn't contiguous"' failed. +20000815-1.c +20010129-1.c +930628-1.c +980707-1.c + +# WebAssemblyISelLowering.cpp:316: virtual llvm::SDValue llvm::WebAssemblyTargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl<llvm::SDValue>&) const: Assertion `!Out.Flags.isByVal() && "byval is not valid for return values"' failed. +20030914-2.c +20040703-1.c +20081117-1.c +920625-1.c +931004-11.c +931004-13.c +980223.c +bitfld-5.c +complex-7.c +pr38969.c +pr51323.c +pr52129.c +pr57130.c + +# These were previously "Cannot select FrameIndex." Now most of them fail +# because they contain call frame pseudos (e.g. call a vararg func), +# frame pointers, or similar. This list will be updated again soon. 
+20000519-1.c +20000706-4.c +20000706-5.c +20000801-2.c +20000801-4.c +20011126-2.c + +20020529-1.c +20021024-1.c + +20030828-1.c +20030914-1.c + +20040302-1.c +20040625-1.c +20040823-1.c + +20041113-1.c + +20041214-1.c + +20050826-2.c + +20071213-1.c + +20080506-2.c +20080519-1.c + +20081103-1.c +20090113-1.c +20090113-2.c +20090113-3.c + +20090623-1.c + +920501-6.c +920501-8.c +920726-1.c +930518-1.c + +931004-10.c +931004-12.c +931004-14.c +931004-2.c +931004-4.c +931004-6.c +931004-8.c + +980205.c +980608-1.c +980709-1.c +980716-1.c +990127-1.c + +991216-2.c + +#cbrt.c +complex-5.c +complex-6.c + +enum-3.c +fprintf-chk-1.c +frame-address.c +loop-15.c +loop-ivopts-2.c +mayalias-3.c + +multi-ix.c + +pr20466-1.c + + +pr28778.c +pr28982b.c + +pr30778.c +pr31448-2.c +pr31448.c + +pr33870-1.c +pr33870.c + +pr38051.c + +pr39100.c + +pr39339.c + +pr43987.c + +pr44575.c + +pr44942.c +pr46309.c +pr47538.c +pr47925.c + +pr49390.c +pr49419.c + +#pr51877.c + +#pr52979-1.c +#pr52979-2.c +pr53645-2.c +pr53645.c + +pr56205.c + +pr56866.c + +pr57876.c +pr58277-1.c + +pr59643.c + +printf-chk-1.c +pta-field-1.c +pta-field-2.c + +stdarg-1.c +stdarg-2.c +stdarg-3.c +stdarg-4.c +strct-stdarg-1.c +strct-varg-1.c + +va-arg-1.c +va-arg-10.c +va-arg-12.c +va-arg-13.c +va-arg-14.c +va-arg-15.c +va-arg-16.c +va-arg-17.c +va-arg-18.c +va-arg-19.c +va-arg-2.c +va-arg-20.c +va-arg-23.c +va-arg-26.c +va-arg-4.c +va-arg-5.c +va-arg-6.c +va-arg-7.c +va-arg-8.c +va-arg-9.c +va-arg-pack-1.c +vfprintf-1.c +vfprintf-chk-1.c +vprintf-1.c +vprintf-chk-1.c + +# Cannot select callseq_end. +20040811-1.c +pr43220.c +vla-dealloc-1.c + +# Cannot select brind. +20071210-1.c +920501-4.c +920501-5.c + +# Cannot select BlockAddress. +comp-goto-1.c +980526-1.c +990208-1.c + +# WebAssembly hasn't implemented byval arguments. +20000412-3.c +20000419-1.c +20000706-1.c +20000706-2.c +20000707-1.c +20000717-1.c +20000717-5.c +20000808-1.c +20010605-2.c +20011113-1.c +20020215-1.c +20020810-1.c +20021118-1.c +20040707-1.c +20040709-1.c +20040709-2.c +20041201-1.c +20050713-1.c +20070614-1.c +920908-2.c +921112-1.c +921117-1.c +921123-2.c +921204-1.c +930126-1.c +930208-1.c +931004-5.c +931004-9.c +931031-1.c +950607-2.c +960416-1.c +990525-1.c +991118-1.c +bf64-1.c +complex-1.c +complex-2.c +pr15262-2.c +pr20621-1.c +pr23135.c +pr30185.c +pr42248.c + +# unimplemented operation lowering. +20010122-1.c +20030323-1.c +20030811-1.c +pr17377.c + +# Error: invalid output constraint '=t' in asm. +990413-2.c +990826-0.c + +# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target. +built-in-setjmp.c +pr60003.c + +# Error in the program / unsupported by Clang. 
+scal-to-vec1.c +scal-to-vec2.c +scal-to-vec3.c +20000822-1.c +20010209-1.c +20010605-1.c +20030501-1.c +20040520-1.c +20061220-1.c +20090219-1.c +920415-1.c +920428-2.c +920501-7.c +920612-2.c +920721-4.c +921017-1.c +921215-1.c +931002-1.c +comp-goto-2.c +nest-align-1.c +nest-stdar-1.c +nestfunc-1.c +nestfunc-2.c +nestfunc-3.c +nestfunc-5.c +nestfunc-6.c +nestfunc-7.c +pr22061-3.c +pr22061-4.c +pr24135.c +pr51447.c +20020412-1.c +20040308-1.c +20040423-1.c +20041218-2.c +20070919-1.c +align-nest.c +pr41935.c +20050107-1.c +20050119-1.c +20050119-2.c +920302-1.c +920501-3.c +920728-1.c +pr28865.c diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 9eee4a0..09cc53a 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -10,10 +10,8 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "X86AsmInstrumentation.h" #include "X86Operand.h" -#include "X86RegisterInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -118,11 +116,6 @@ bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } -std::string FuncName(unsigned AccessSize, bool IsWrite) { - return std::string("__asan_report_") + (IsWrite ? "store" : "load") + - utostr(AccessSize); -} - class X86AddressSanitizer : public X86AsmInstrumentation { public: struct RegisterContext { @@ -136,26 +129,26 @@ public: public: RegisterContext(unsigned AddressReg, unsigned ShadowReg, unsigned ScratchReg) { - BusyRegs.push_back(convReg(AddressReg, MVT::i64)); - BusyRegs.push_back(convReg(ShadowReg, MVT::i64)); - BusyRegs.push_back(convReg(ScratchReg, MVT::i64)); + BusyRegs.push_back(convReg(AddressReg, 64)); + BusyRegs.push_back(convReg(ShadowReg, 64)); + BusyRegs.push_back(convReg(ScratchReg, 64)); } - unsigned AddressReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT); + unsigned AddressReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size); } - unsigned ShadowReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SHADOW], VT); + unsigned ShadowReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], Size); } - unsigned ScratchReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT); + unsigned ScratchReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size); } void AddBusyReg(unsigned Reg) { if (Reg != X86::NoRegister) - BusyRegs.push_back(convReg(Reg, MVT::i64)); + BusyRegs.push_back(convReg(Reg, 64)); } void AddBusyRegs(const X86Operand &Op) { @@ -163,36 +156,36 @@ public: AddBusyReg(Op.getMemIndexReg()); } - unsigned ChooseFrameReg(MVT::SimpleValueType VT) const { + unsigned ChooseFrameReg(unsigned Size) const { static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, X86::RCX, X86::RDX, X86::RDI, X86::RSI }; for (unsigned Reg : Candidates) { if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) - return convReg(Reg, VT); + return convReg(Reg, Size); } return X86::NoRegister; } private: - unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const { - return Reg == X86::NoRegister ? 
Reg : getX86SubSuperRegister(Reg, VT); + unsigned convReg(unsigned Reg, unsigned Size) const { + return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size); } std::vector<unsigned> BusyRegs; }; - X86AddressSanitizer(const MCSubtargetInfo &STI) + X86AddressSanitizer(const MCSubtargetInfo *&STI) : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} - virtual ~X86AddressSanitizer() {} + ~X86AddressSanitizer() override {} // X86AsmInstrumentation implementation: - virtual void InstrumentAndEmitInstruction(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) override { + void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { InstrumentMOVS(Inst, Operands, Ctx, MII, Out); if (RepPrefix) EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); @@ -240,17 +233,16 @@ public: protected: void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } - void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg, - MCStreamer &Out) { - assert(VT == MVT::i32 || VT == MVT::i64); + void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { + assert(Size == 32 || Size == 64); MCInst Inst; - Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r); - Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, VT))); + Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT, + void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out); // Creates new memory operand with Displacement added to an original @@ -261,13 +253,13 @@ protected: MCContext &Ctx, int64_t *Residue); bool is64BitMode() const { - return STI.getFeatureBits()[X86::Mode64Bit]; + return STI->getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { - return STI.getFeatureBits()[X86::Mode32Bit]; + return STI->getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { - return STI.getFeatureBits()[X86::Mode16Bit]; + return STI->getFeatureBits()[X86::Mode16Bit]; } unsigned getPointerWidth() { @@ -437,7 +429,7 @@ void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, } void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, - MVT::SimpleValueType VT, + unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out) { int64_t Displacement = 0; @@ -450,14 +442,14 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, // Emit Op as is. 
if (Displacement == 0) { - EmitLEA(Op, VT, Reg, Out); + EmitLEA(Op, Size, Reg, Out); return; } int64_t Residue; std::unique_ptr<X86Operand> NewOp = AddDisplacement(Op, Displacement, Ctx, &Residue); - EmitLEA(*NewOp, VT, Reg, Out); + EmitLEA(*NewOp, Size, Reg, Out); while (Residue != 0) { const MCConstantExpr *Disp = @@ -465,7 +457,7 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, std::unique_ptr<X86Operand> DispOp = X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), SMLoc()); - EmitLEA(*DispOp, VT, Reg, Out); + EmitLEA(*DispOp, Size, Reg, Out); Residue -= Disp->getValue(); } } @@ -503,16 +495,16 @@ class X86AddressSanitizer32 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x20000000; - X86AddressSanitizer32(const MCSubtargetInfo &STI) + X86AddressSanitizer32(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer32() {} + ~X86AddressSanitizer32() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i32); + return getX86SubSuperRegister(FrameReg, 32); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -535,10 +527,10 @@ public: OrigSPOffset += 4; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -558,24 +550,24 @@ public: MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); } - SpillReg(Out, RegCtx.AddressReg(MVT::i32)); - SpillReg(Out, RegCtx.ShadowReg(MVT::i32)); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i32)); + SpillReg(Out, RegCtx.AddressReg(32)); + SpillReg(Out, RegCtx.ShadowReg(32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(32)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i32)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i32)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(32)); + RestoreReg(Out, RegCtx.ShadowReg(32)); + RestoreReg(Out, RegCtx.AddressReg(32)); unsigned FrameReg = GetFrameReg(Ctx, Out); if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { @@ -586,18 +578,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned 
AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, @@ -610,10 +602,11 @@ private: .addReg(X86::ESP) .addImm(-16)); EmitInstruction( - Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32))); + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); - const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); @@ -623,14 +616,14 @@ private: void X86AddressSanitizer32::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -673,7 +666,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -698,10 +691,10 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( void X86AddressSanitizer32::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -760,16 +753,16 @@ class X86AddressSanitizer64 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x7fff8000; - X86AddressSanitizer64(const MCSubtargetInfo &STI) + X86AddressSanitizer64(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer64() {} + ~X86AddressSanitizer64() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned 
FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i64); + return getX86SubSuperRegister(FrameReg, 64); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -792,10 +785,10 @@ public: OrigSPOffset += 8; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -816,24 +809,24 @@ public: } EmitAdjustRSP(Ctx, Out, -128); - SpillReg(Out, RegCtx.ShadowReg(MVT::i64)); - SpillReg(Out, RegCtx.AddressReg(MVT::i64)); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i64)); + SpillReg(Out, RegCtx.ShadowReg(64)); + SpillReg(Out, RegCtx.AddressReg(64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(64)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i64)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i64)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(64)); + RestoreReg(Out, RegCtx.AddressReg(64)); + RestoreReg(Out, RegCtx.ShadowReg(64)); EmitAdjustRSP(Ctx, Out, 128); unsigned FrameReg = GetFrameReg(Ctx, Out); @@ -845,18 +838,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { @@ -864,7 +857,7 @@ private: std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i64, X86::RSP, Out); + EmitLEA(*Op, 64, X86::RSP, Out); OrigSPOffset += Offset; } @@ -878,12 +871,13 @@ private: .addReg(X86::RSP) .addImm(-16)); - if (RegCtx.AddressReg(MVT::i64) != X86::RDI) { + if (RegCtx.AddressReg(64) != X86::RDI) { EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( - RegCtx.AddressReg(MVT::i64))); + RegCtx.AddressReg(64))); } 
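// (For orientation, assuming hypothetical argument values: the Twine-built
// symbol below concatenates the access kind and width, so IsWrite == false
// with AccessSize == 8 requests __asan_report_load8, and IsWrite == true
// with AccessSize == 4 requests __asan_report_store4.)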
- const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); @@ -893,16 +887,16 @@ private: void X86AddressSanitizer64::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -944,7 +938,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -969,10 +963,10 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( void X86AddressSanitizer64::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -1030,7 +1024,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, } // End anonymous namespace -X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI) +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) : STI(STI), InitialFrameReg(0) {} X86AsmInstrumentation::~X86AsmInstrumentation() {} @@ -1043,7 +1037,7 @@ void X86AsmInstrumentation::InstrumentAndEmitInstruction( void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst) { - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, *STI); } unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, @@ -1067,17 +1061,17 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI) { - Triple T(STI.getTargetTriple()); + const MCContext &Ctx, const MCSubtargetInfo *&STI) { + Triple 
T(STI->getTargetTriple()); const bool hasCompilerRTSupport = T.isOSLinux(); if (ClAsanInstrumentAssembly && hasCompilerRTSupport && MCOptions.SanitizeAddress) { - if (STI.getFeatureBits()[X86::Mode32Bit] != 0) + if (STI->getFeatureBits()[X86::Mode32Bit] != 0) return new X86AddressSanitizer32(STI); - if (STI.getFeatureBits()[X86::Mode64Bit] != 0) + if (STI->getFeatureBits()[X86::Mode64Bit] != 0) return new X86AddressSanitizer64(STI); } return new X86AsmInstrumentation(STI); } -} // End llvm namespace +} // end llvm namespace diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 19ebcc4..470cead 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -28,7 +28,8 @@ class X86AsmInstrumentation; X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); class X86AsmInstrumentation { public: @@ -48,15 +49,16 @@ public: protected: friend X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); - X86AsmInstrumentation(const MCSubtargetInfo &STI); + X86AsmInstrumentation(const MCSubtargetInfo *&STI); unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst); - const MCSubtargetInfo &STI; + const MCSubtargetInfo *&STI; unsigned InitialFrameReg; }; diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index bca059d..4d8ffac 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,7 +11,6 @@ #include "X86AsmInstrumentation.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" -#include "X86ISelLowering.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" @@ -26,6 +25,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -57,10 +57,10 @@ static const char OpPrecedence[] = { }; class X86AsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; + private: SMLoc consumeToken() { MCAsmParser &Parser = getParser(); @@ -154,6 +154,7 @@ private: // Push the new operator. InfixOperatorStack.push_back(Op); } + int64_t execute() { // Push any remaining operators onto the postfix stack. 
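// (A hypothetical trace of this evaluator: after scanning "2 + 3 * 4" the
// postfix stack holds 2, 3 and 4 while the operator stack still holds '+'
// and '*'; draining the operators below completes the postfix form
// 2 3 4 * +, which the evaluation loop reduces to 14.)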
while (!InfixOperatorStack.empty()) { @@ -268,6 +269,7 @@ private: bool StopOnLBrac, AddImmPrefix; InfixCalculator IC; InlineAsmIdentifierInfo Info; + public: IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), @@ -712,10 +714,10 @@ private: SMLoc End, unsigned Size, StringRef Identifier, InlineAsmIdentifierInfo &Info); + bool parseDirectiveEven(SMLoc L); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); - bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds @@ -758,23 +760,24 @@ private: bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode64Bit]; + return getSTI().getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode32Bit]; + return getSTI().getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode16Bit]; + return getSTI().getFeatureBits()[X86::Mode16Bit]; } void SwitchMode(unsigned mode) { + MCSubtargetInfo &STI = copySTI(); FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; unsigned FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); - + assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes)); } @@ -798,12 +801,12 @@ private: /// } public: - X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser, + X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) { + : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) { // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); Instrumentation.reset( CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); } @@ -912,6 +915,11 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (RegNo == 0) RegNo = MatchRegisterName(Tok.getString().lower()); + // The "flags" register cannot be referenced directly. + // Treat it as an identifier instead. + if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS) + RegNo = 0; + if (!is64BitMode()) { // FIXME: This should be done using Requires<Not64BitMode> and // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also @@ -1042,8 +1050,11 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) .Cases("QWORD", "qword", 64) + .Cases("MMWORD","mmword", 64) .Cases("XWORD", "xword", 80) + .Cases("TBYTE", "tbyte", 80) .Cases("XMMWORD", "xmmword", 128) .Cases("YMMWORD", "ymmword", 256) .Cases("ZMMWORD", "zmmword", 512) @@ -1062,8 +1073,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // Insert an explicit size if the user didn't have one. 
if (!Size) { Size = getPointerWidth(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } // Create an absolute memory reference in order to match against @@ -1082,8 +1093,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( if (!Size) { Size = Info.Type * 8; // Size is in terms of bits in this context. if (Size) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } } @@ -1097,13 +1108,13 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( } static void -RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, StringRef SymName, int64_t ImmDisp, int64_t FinalImmDisp, SMLoc &BracLoc, SMLoc &StartInBrac, SMLoc &End) { // Remove the '[' and ']' from the IR string. - AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1)); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1)); + AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1); + AsmRewrites.emplace_back(AOK_Skip, End, 1); // If ImmDisp is non-zero, then we parsed a displacement before the // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) @@ -1114,15 +1125,14 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have an immediate displacement before the bracketed expression. // Adjust this to match the final immediate displacement. bool Found = false; - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() > BracLoc.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() > BracLoc.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) { + if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) { assert (!Found && "ImmDisp already rewritten."); - (*I).Kind = AOK_Imm; - (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer(); - (*I).Val = FinalImmDisp; + AR.Kind = AOK_Imm; + AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); + AR.Val = FinalImmDisp; Found = true; break; } @@ -1133,28 +1143,27 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have a symbolic and an immediate displacement, but no displacement // before the bracketed expression. Put the immediate displacement // before the bracketed expression. - AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp)); + AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp); } } // Remove all the ImmPrefix rewrites within the brackets. - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() < StartInBrac.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() < StartInBrac.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix) - (*I).Kind = AOK_Delete; + if (AR.Kind == AOK_ImmPrefix) + AR.Kind = AOK_Delete; } const char *SymLocPtr = SymName.data(); // Skip everything before the symbol. if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); + AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len); } // Skip everything after the symbol. 
if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len)); + AsmRewrites.emplace_back(AOK_Skip, Loc, Len); } } @@ -1162,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); + AsmToken::TokenKind PrevTK = AsmToken::Error; bool Done = false; while (!Done) { bool UpdateLocLex = true; @@ -1205,7 +1215,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(Tok.getLoc(), "Unexpected identifier!"); } else { // This is a dot operator, not an adjacent identifier. - if (Identifier.find('.') != StringRef::npos) { + if (Identifier.find('.') != StringRef::npos && + PrevTK == AsmToken::RBrac) { return false; } else { InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); @@ -1223,8 +1234,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::Integer: { StringRef ErrMsg; if (isParsingInlineAsm() && SM.getAddImmPrefix()) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, - Tok.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc()); // Look for 'b' or 'f' following an Integer as a directional label SMLoc Loc = getTok().getLoc(); int64_t IntVal = getTok().getIntVal(); @@ -1237,7 +1247,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b"); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; const MCExpr *Val = - MCSymbolRefExpr::create(Sym, Variant, getContext()); + MCSymbolRefExpr::create(Sym, Variant, getContext()); if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); StringRef Identifier = Sym->getName(); @@ -1275,6 +1285,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (!Done && UpdateLocLex) End = consumeToken(); + + PrevTK = TK; } return false; } @@ -1302,7 +1314,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // A symbolic displacement. Disp = Sym; if (isParsingInlineAsm()) - RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(), + RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(), ImmDisp, SM.getImm(), BracLoc, StartInBrac, End); } @@ -1359,7 +1371,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End) { MCAsmParser &Parser = getParser(); - assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); + assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); @@ -1372,15 +1384,17 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, // Advance the token stream until the end of the current token is // after the end of what the frontend claimed. const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); - while (true) { + do { End = Tok.getEndLoc(); getLexer().Lex(); - - assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); - if (End.getPointer() == EndPtr) break; - } + } while (End.getPointer() < EndPtr); Identifier = LineBuf; + // The frontend should end parsing on an assembler token boundary, unless it + // failed parsing. 
+ assert((End.getPointer() == EndPtr || !Result) && + "frontend claimed part of a token?"); + // If the identifier lookup was unsuccessful, assume that we are dealing with // a label. if (!Result) { @@ -1389,9 +1403,8 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, Loc, false); assert(InternalName.size() && "We should have an internal name here."); // Push a rewrite for replacing the identifier name with the internal name. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc, - Identifier.size(), - InternalName)); + InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), + InternalName); } // Create the symbol reference. @@ -1418,8 +1431,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. if (isParsingInlineAsm()) - InstInfo->AsmRewrites->push_back( - AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc()); if (getLexer().isNot(AsmToken::LBrac)) { // An immediate following a 'segment register', 'colon' token sequence can @@ -1588,8 +1600,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); unsigned Len = DotDispStr.size(); unsigned Val = OrigDispVal + DotDispVal; - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len, - Val)); + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); } NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); @@ -1613,7 +1624,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { return nullptr; // Don't emit the offset operator. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); + InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); // The offset operator will have an 'r' constraint, thus we need to create // register operand to ensure proper matching. Just pick a GPR based on @@ -1664,7 +1675,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { // Rewrite the type operator and the C or C++ type or variable in terms of an // immediate. E.g. TYPE foo -> $$4 unsigned Len = End.getPointer() - TypeLoc.getPointer(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); const MCExpr *Imm = MCConstantExpr::create(CVal, getContext()); return X86Operand::CreateImm(Imm, Start, End); @@ -1688,12 +1699,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { return ParseIntelOperator(IOK_TYPE); } + bool PtrInOperand = false; unsigned Size = getIntelMemOperandSize(Tok.getString()); if (Size) { Parser.Lex(); // Eat operand size (e.g., byte, word). if (Tok.getString() != "PTR" && Tok.getString() != "ptr") return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); Parser.Lex(); // Eat ptr. + PtrInOperand = true; } Start = Tok.getLoc(); @@ -1711,10 +1724,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); if (StartTok.getString().size() == Len) // Just add a prefix if this wasn't a complex immediate expression. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start)); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); else // Otherwise, rewrite the complex expression as a single immediate. 
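// (Hypothetical instance: for "mov eax, 2+2" the span "2+2" is longer than
// the single starting token, so one AOK_Imm rewrite carrying the folded
// value 4 replaces it wholesale.)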
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); } if (getLexer().isNot(AsmToken::LBrac)) { @@ -1740,7 +1753,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { } // rounding mode token - if (STI.getFeatureBits()[X86::FeatureAVX512] && + if (getSTI().getFeatureBits()[X86::FeatureAVX512] && getLexer().is(AsmToken::LCurly)) return ParseRoundingModeOp(Start, End); @@ -1749,9 +1762,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { if (!ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start // of a segment override, otherwise this is a normal register reference. - if (getLexer().isNot(AsmToken::Colon)) + // In case it is a normal register and there is ptr in the operand this + // is an error + if (getLexer().isNot(AsmToken::Colon)){ + if (PtrInOperand){ + return ErrorOperand(Start, "expected memory operand after " + "'ptr', found register operand instead"); + } return X86Operand::CreateReg(RegNo, Start, End); - + } + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); } @@ -1798,7 +1818,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { } case AsmToken::LCurly:{ SMLoc Start = Parser.getTok().getLoc(), End; - if (STI.getFeatureBits()[X86::FeatureAVX512]) + if (getSTI().getFeatureBits()[X86::FeatureAVX512]) return ParseRoundingModeOp(Start, End); return ErrorOperand(Start, "unknown token in expression"); } @@ -1808,7 +1828,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { MCAsmParser &Parser = getParser(); - if(STI.getFeatureBits()[X86::FeatureAVX512]) { + if(getSTI().getFeatureBits()[X86::FeatureAVX512]) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. const SMLoc consumedToken = consumeToken(); @@ -1983,12 +2003,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, } // Validate the scale amount. - if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && ScaleVal != 1) { Error(Loc, "scale factor in 16-bit address must be 1"); return nullptr; - } - if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ + } + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && + ScaleVal != 8) { Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); return nullptr; } @@ -2175,7 +2196,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Name == "repne" || Name == "repnz" || Name == "rex64" || Name == "data16"; - // This does the actual operand parsing. Don't parse any more if we have a // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as @@ -2213,6 +2233,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (isPrefix && getLexer().is(AsmToken::Slash))) Parser.Lex(); + // This is for gas compatibility and cannot be done in td. + // Adding "p" for some floating point with no argument. 
+ // For example: fsub --> fsubp + bool IsFp = + Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr"; + if (IsFp && Operands.size() == 1) { + const char *Repl = StringSwitch<const char *>(Name) + .Case("fsub", "fsubp") + .Case("fdiv", "fdivp") + .Case("fsubr", "fsubrp") + .Case("fdivr", "fdivrp"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); + } + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. @@ -2242,9 +2276,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Append default arguments to "ins[bwld]" if (Name.startswith("ins") && Operands.size() == 1 && - (Name == "insb" || Name == "insw" || Name == "insl" || - Name == "insd" )) { - AddDefaultSrcDestOperands(Operands, + (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) { + AddDefaultSrcDestOperands(Operands, X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), DefaultMemDIOperand(NameLoc)); } @@ -2346,98 +2379,21 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); - if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && - cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) { - Operands.erase(Operands.begin() + 1); - static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); - } + if (Op1.isImm()) + if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm())) + if (CE->getValue() == 3) { + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); + } } return false; } -static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg, - bool isCmp) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - if (!isCmp) - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(Inst.getOperand(0)); - Inst = TmpInst; - return true; -} - -static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti16i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::AX, isCmp); -} - -static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti32i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::EAX, isCmp); -} - -static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti64i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); -} - -bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { - switch (Inst.getOpcode()) { - default: return true; - case X86::INT: - X86Operand &Op = static_cast<X86Operand &>(*Ops[1]); - assert(Op.isImm() && "expected immediate"); - int64_t Res; - if (!Op.getImm()->evaluateAsAbsolute(Res) || Res > 255) { - Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]"); - return false; - } - return true; - } - llvm_unreachable("handle the instruction appropriately"); -} - bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return 
false; - case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); - case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8); - case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8); - case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8); - case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8); - case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8); - case X86::OR16i16: return convert16i16to16ri8(Inst, X86::OR16ri8); - case X86::OR32i32: return convert32i32to32ri8(Inst, X86::OR32ri8); - case X86::OR64i32: return convert64i32to64ri8(Inst, X86::OR64ri8); - case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true); - case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true); - case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true); - case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8); - case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8); - case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8); - case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8); - case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8); - case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8); - case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8); - case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8); - case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8); - case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8); - case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8); - case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8); + case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -2457,18 +2413,19 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } Inst.setOpcode(NewOpc); return true; 
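// (Background, with hypothetical operands: each _REV opcode is the same
// architectural register-register move with the encoding roles of its two
// operands swapped, so "vmovaps %xmm1, %xmm2" can be emitted through either
// VMOVAPSrr or VMOVAPSrr_REV; switching to the _REV form lets the assembler
// select the alternate, sometimes shorter, encoding.)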
@@ -2573,9 +2530,6 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, isParsingIntelSyntax())) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. @@ -2819,9 +2773,6 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned NumSuccessfulMatches = std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the individual // transformations can chain off each other. @@ -2898,10 +2849,29 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { "a '%' prefix in .intel_syntax"); } return false; - } + } else if (IDVal == ".even") + return parseDirectiveEven(DirectiveID.getLoc()); return true; } +/// parseDirectiveEven +/// ::= .even +bool X86AsmParser::parseDirectiveEven(SMLoc L) { + const MCSection *Section = getStreamer().getCurrentSection().first; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in directive"); + return false; + } + if (!Section) { + getStreamer().InitSections(false); + Section = getStreamer().getCurrentSection().first; + } + if (Section->UseCodeAlign()) + getStreamer().EmitCodeAlignment(2, 0); + else + getStreamer().EmitValueToAlignment(2, 0, 1, 0); + return false; +} /// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { @@ -2909,10 +2879,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); if (getParser().parseExpression(Value)) return false; - getParser().getStreamer().EmitValue(Value, Size); + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 7610806..54538c8 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -13,30 +13,25 @@ namespace llvm { inline bool isImmSExti16i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value))); } inline bool isImmSExti32i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<32>(Value) && 
isInt<8>(static_cast<int32_t>(Value))); } inline bool isImmSExti64i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value); } inline bool isImmSExti64i32Value(uint64_t Value) { - return (( Value <= 0x000000007FFFFFFFULL)|| - (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<32>(Value); } inline bool isImmUnsignedi8Value(uint64_t Value) { - return (( Value <= 0x00000000000000FFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isUInt<8>(Value) || isInt<8>(Value); } } // End of namespace llvm diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index cfc3ee2..ce8fcf1 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -95,11 +95,13 @@ X86GenericDisassembler::X86GenericDisassembler( llvm_unreachable("Invalid CPU mode"); } +namespace { struct Region { ArrayRef<uint8_t> Bytes; uint64_t Base; Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} }; +} // end anonymous namespace /// A callback function that wraps the readByte method from Region. /// @@ -831,8 +833,12 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM256: case TYPE_XMM512: case TYPE_VK1: + case TYPE_VK2: + case TYPE_VK4: case TYPE_VK8: case TYPE_VK16: + case TYPE_VK32: + case TYPE_VK64: case TYPE_DEBUGREG: case TYPE_CONTROLREG: case TYPE_BNDR: @@ -962,6 +968,7 @@ static bool translateInstruction(MCInst &mcInst, return true; } + mcInst.clear(); mcInst.setOpcode(insn.instructionID); // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 // prefix bytes should be disassembled as xrelease and xacquire then set the diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index f73fa75..040143b 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -361,7 +361,7 @@ static int readPrefixes(struct InternalInstruction* insn) { * then it should be disassembled as a xacquire/xrelease not repne/rep. */ if ((byte == 0xf2 || byte == 0xf3) && - ((nextByte == 0xf0) | + ((nextByte == 0xf0) || ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) insn->xAcquireRelease = true; /* @@ -980,6 +980,47 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { insn->opcode == 0xE3) attrMask ^= ATTR_ADSIZE; + /* + * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix + * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes + */ + + if (insn->mode == MODE_64BIT && + isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) { + switch (insn->opcode) { + case 0xE8: + case 0xE9: + // Take care of psubsb and other mmx instructions. + if (insn->opcodeType == ONEBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + // Take care of lea and three byte ops. 
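// (In the two-byte 0x0F map these values, 0x82 through 0x8F, are the Jcc
// rel32 forms; the opcodeType guard just below keeps one-byte opcodes that
// share the same values, such as lea at 0x8D, out of this special case.)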
+ if (insn->opcodeType == TWOBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + } + } + if (getIDWithAttrMask(&instructionID, insn, attrMask)) return -1; @@ -1447,8 +1488,12 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_XMM: \ return prefix##_XMM0 + index; \ case TYPE_VK1: \ + case TYPE_VK2: \ + case TYPE_VK4: \ case TYPE_VK8: \ case TYPE_VK16: \ + case TYPE_VK32: \ + case TYPE_VK64: \ if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index a79a923..28a628e 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -572,8 +572,6 @@ struct InternalInstruction { // The last byte of the opcode, not counting any ModR/M extension uint8_t opcode; - // The ModR/M byte of the instruction, if it is an opcode extension - uint8_t modRMExtension; // decode state diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index ea727e6..b4c0bc4 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 62b6b73..bbb3090 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class X86ATTInstPrinter final : public MCInstPrinter { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index 91b144a..73f654c 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -21,6 +21,26 @@ using namespace llvm; +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) + return 512; + if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) + return 256; + if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) + return 128; + if (X86::MM0 <= RegNo && RegNo <= X86::MM7) + return 64; + + llvm_unreachable("Unknown vector reg!"); +} + +static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return MVT::getVectorVT(ScalarVT, + getVectorRegSize(OpReg)/ScalarVT.getSizeInBits()); +} + /// \brief Extracts the src/dst types for a given zero extension instruction. 
/// \note While the number of elements in DstVT type correct, the /// number in the SrcVT type is expanded to fill the src xmm register and the @@ -107,6 +127,75 @@ static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) { } } +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: \ + case X86::V##Inst##Suffix##src##k: \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_UNPCK(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_SHUF(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \ + CASE_AVX_INS_COMMON(Inst, , r##src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \ + CASE_SSE_INS_COMMON(Inst, r##src##i) \ + +#define CASE_VPERM(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, src##i) \ + CASE_AVX_INS_COMMON(Inst, , src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) \ + +#define CASE_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \ + +/// \brief Extracts the types and if it has memory operand for a given +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. +static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) { + HasMemOp = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown VSHUF64x2 family instructions."); + break; + CASE_VSHUF(64X2, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(64X2, r) + VT = getRegOperandVectorVT(MI, MVT::i64, 0); + break; + CASE_VSHUF(32X4, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(32X4, r) + VT = getRegOperandVectorVT(MI, MVT::i32, 0); + break; + } +} + //===----------------------------------------------------------------------===// // Top Level Entrypoint //===----------------------------------------------------------------------===// @@ -127,23 +216,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPDrri: case X86::VBLENDPDrri: + case X86::VBLENDPDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPDrmi: case X86::VBLENDPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
case X86::VBLENDPDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f64, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -152,23 +232,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPSrri: case X86::VBLENDPSrri: + case X86::VBLENDPSYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPSrmi: case X86::VBLENDPSrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPSYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VBLENDPSYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8f32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -177,23 +248,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PBLENDWrri: case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PBLENDWrmi: case X86::VPBLENDWrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDWYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPBLENDWYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v16i16, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -201,23 +263,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::VPBLENDDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPBLENDDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
+ case X86::VPBLENDDrmi: case X86::VPBLENDDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -239,6 +291,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVLHPSrr: case X86::VMOVLHPSrr: + case X86::VMOVLHPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -247,569 +300,327 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVHLPSrr: case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVHLPSMask(2, ShuffleMask); break; - case X86::MOVSLDUPrr: - case X86::VMOVSLDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::MOVSLDUPrm: - case X86::VMOVSLDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); - break; - - case X86::VMOVSHDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VMOVSHDUPYrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); - break; - - case X86::VMOVSLDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVSLDUPYrm: + CASE_MOVDUP(MOVSLDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); + DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::MOVSHDUPrr: - case X86::VMOVSHDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::MOVSHDUPrm: - case X86::VMOVSHDUPrm: + CASE_MOVDUP(MOVSHDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); + DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::VMOVDDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVDDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVDDUPYrm: + CASE_MOVDUP(MOVDDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask); - break; - - case X86::MOVDDUPrr: - case X86::VMOVDDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. 
- case X86::MOVDDUPrm: - case X86::VMOVDDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask); + DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); break; case X86::PSLLDQri: case X86::VPSLLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSLLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v32i8, + DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSRLDQri: case X86::VPSRLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSRLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v32i8, + DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PALIGNR128rr: case X86::VPALIGNR128rr: + case X86::VPALIGNR256rr: Src1Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PALIGNR128rm: case X86::VPALIGNR128rm: - Src2Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPALIGNR256rr: - Src1Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPALIGNR256rm: Src2Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v32i8, + DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFDri: case X86::VPSHUFDri: + case X86::VPSHUFDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFDmi: case X86::VPSHUFDmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFDYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFDYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8i32, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFHWri: case X86::VPSHUFHWri: + case X86::VPSHUFHWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
case X86::PSHUFHWmi: case X86::VPSHUFHWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFHWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFHWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v16i16, + DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; + case X86::PSHUFLWri: case X86::VPSHUFLWri: + case X86::VPSHUFLWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFLWmi: case X86::VPSHUFLWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFLWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFLWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v16i16, + DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; - case X86::PUNPCKHBWrr: - case X86::VPUNPCKHBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHBWrm: - case X86::VPUNPCKHBWrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKHBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKHWDrr: - case X86::VPUNPCKHWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHWDrm: - case X86::VPUNPCKHWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); - break; - case X86::VPUNPCKHWDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHWDYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); - break; - case X86::PUNPCKHDQrr: - case X86::VPUNPCKHDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHDQrm: - case X86::VPUNPCKHDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); - break; - case X86::VPUNPCKHDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHDQYrm: + case X86::MMX_PSHUFWri: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); - break; - case X86::VPUNPCKHDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::VPUNPCKHDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i32, ShuffleMask); - break; - case X86::PUNPCKHQDQrr: - case X86::VPUNPCKHQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHQDQrm: - case X86::VPUNPCKHQDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::MMX_PSHUFWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i64, ShuffleMask); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(MVT::v4i16, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); break; - case X86::PUNPCKLBWrr: - case X86::VPUNPCKLBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLBWrm: - case X86::VPUNPCKLBWrm: + case X86::PSWAPDrr: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKLBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKLWDrr: - case X86::VPUNPCKLWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLWDrm: - case X86::VPUNPCKLWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::PSWAPDrm: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + DecodePSWAPMask(MVT::v2i32, ShuffleMask); break; - case X86::VPUNPCKLWDYrr: + + CASE_UNPCK(PUNPCKHBW, r) + case X86::MMX_PUNPCKHBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLWDYrm: + CASE_UNPCK(PUNPCKHBW, m) + case X86::MMX_PUNPCKHBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::PUNPCKLDQrr: - case X86::VPUNPCKLDQrr: + + CASE_UNPCK(PUNPCKHWD, r) + case X86::MMX_PUNPCKHWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLDQrm: - case X86::VPUNPCKLDQrm: + CASE_UNPCK(PUNPCKHWD, m) + case X86::MMX_PUNPCKHWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLDQYrr: + + CASE_UNPCK(PUNPCKHDQ, r) + case X86::MMX_PUNPCKHDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::VPUNPCKLDQYrm: + CASE_UNPCK(PUNPCKHDQ, m) + case X86::MMX_PUNPCKHDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::VPUNPCKLDQZrr: + + CASE_UNPCK(PUNPCKHQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLDQZrm: + CASE_UNPCK(PUNPCKHQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::PUNPCKLQDQrr: - case X86::VPUNPCKLQDQrr: + + CASE_UNPCK(PUNPCKLBW, r) + case X86::MMX_PUNPCKLBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLQDQrm: - case X86::VPUNPCKLQDQrm: + CASE_UNPCK(PUNPCKLBW, m) + case X86::MMX_PUNPCKLBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQYrr: + + CASE_UNPCK(PUNPCKLWD, r) + case X86::MMX_PUNPCKLWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQYrm: + CASE_UNPCK(PUNPCKLWD, m) + case X86::MMX_PUNPCKLWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQZrr: + + CASE_UNPCK(PUNPCKLDQ, r) + case X86::MMX_PUNPCKLDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQZrm: + CASE_UNPCK(PUNPCKLDQ, m) + case X86::MMX_PUNPCKLDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::SHUFPDrri: - case X86::VSHUFPDrri: + CASE_UNPCK(PUNPCKLQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::SHUFPDrmi: - case X86::VSHUFPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VSHUFPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VSHUFPDYrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); + CASE_UNPCK(PUNPCKLQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::SHUFPSrri: - case X86::VSHUFPSrri: + CASE_SHUF(SHUFPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::SHUFPSrmi: - case X86::VSHUFPSrmi: + CASE_SHUF(SHUFPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VSHUFPSYrri: + + CASE_SHUF(SHUFPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VSHUFPSYrmi: + CASE_SHUF(SHUFPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v8f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKLPDrr: - case X86::VUNPCKLPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPDrm: - case X86::VUNPCKLPDrm: - DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDYrm: - DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDZrm: - DecodeUNPCKLMask(MVT::v8f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKLPSrr: - case X86::VUNPCKLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPSrm: - case X86::VUNPCKLPSrm: - DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSYrm: - DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSZrm: - DecodeUNPCKLMask(MVT::v16f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKHPDrr: - case X86::VUNPCKHPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPDrm: - case X86::VUNPCKHPDrm: - DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKHPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::VUNPCKHPDYrm: - DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_VSHUF(64X2, r) + CASE_VSHUF(64X2, m) + CASE_VSHUF(32X4, r) + CASE_VSHUF(32X4, m) { + MVT VT; + bool HasMemOp; + unsigned NumOp = MI->getNumOperands(); + getVSHUF64x2FamilyInfo(MI, VT, HasMemOp); + decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(), + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); + if (HasMemOp) { + assert((NumOp >= 8) && "Expected at least 8 operands!"); + Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg()); + } else { + assert((NumOp >= 4) && "Expected at least 4 operands!"); + Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg()); + Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg()); + } break; - case X86::VUNPCKHPDZrr: + } + + CASE_UNPCK(UNPCKLPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPDZrm: - DecodeUNPCKHMask(MVT::v8f64, ShuffleMask); + CASE_UNPCK(UNPCKLPD, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKHPSrr: - case X86::VUNPCKHPSrr: + + CASE_UNPCK(UNPCKLPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::UNPCKHPSrm: - case X86::VUNPCKHPSrm: - DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); + CASE_UNPCK(UNPCKLPS, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSYrr: + + CASE_UNPCK(UNPCKHPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSYrm: - DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); + CASE_UNPCK(UNPCKHPD, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSZrr: + + CASE_UNPCK(UNPCKHPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSZrm: - DecodeUNPCKHMask(MVT::v16f32, ShuffleMask); + CASE_UNPCK(UNPCKHPS, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPSri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPSYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSYmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPDri: + + CASE_VPERM(PERMILPS, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
- case X86::VPERMILPDmi: + CASE_VPERM(PERMILPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v2f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPDYri: + + CASE_VPERM(PERMILPD, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. - case X86::VPERMILPDYmi: + CASE_VPERM(PERMILPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERM2F128rr: case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -824,6 +635,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERMQYri: case X86::VPERMPDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -846,6 +658,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVSSrr: case X86::VMOVSSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -861,6 +674,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVZPQILo2PQIrr: case X86::VMOVPQI2QIrr: case X86::VMOVZPQILo2PQIrr: + case X86::VMOVZPQILo2PQIZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::MOVQI2PQIrm: @@ -869,9 +683,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVQI2PQIrm: case X86::VMOVZQI2PQIrm: case X86::VMOVZPQILo2PQIrm: + case X86::VMOVZPQILo2PQIZrm: DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVDI2PDIrm: case X86::VMOVDI2PDIrm: DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask); diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6e371da..20cd7ff 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -19,8 +19,6 @@ namespace llvm { -class MCOperand; - class X86IntelInstPrinter final : public MCInstPrinter { public: X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 629802f..133bd0e 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -69,15 +69,19 @@ public: class X86AsmBackend : public MCAsmBackend { const StringRef CPU; bool HasNopl; - const uint64_t MaxNopLength; + uint64_t MaxNopLength; public: - X86AsmBackend(const Target &T, StringRef CPU) - : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 
7 : 15) { + X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && CPU != "c3" && CPU != "c3-2"; + // The longest true long-nop instruction is 15 bytes; the longest + // alternative (replacement) nop sequence is 7 bytes. Because of Silvermont + // microarchitectural features, the maximum nop length is likewise reduced + // for it to achieve better performance. + MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15; } unsigned getNumFixupKinds() const override { @@ -200,6 +204,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::ADD64ri8: return X86::ADD64ri32; case X86::ADD64mi8: return X86::ADD64mi32; + // ADC + case X86::ADC16ri8: return X86::ADC16ri; + case X86::ADC16mi8: return X86::ADC16mi; + case X86::ADC32ri8: return X86::ADC32ri; + case X86::ADC32mi8: return X86::ADC32mi; + case X86::ADC64ri8: return X86::ADC64ri32; + case X86::ADC64mi8: return X86::ADC64mi32; + // SUB case X86::SUB16ri8: return X86::SUB16ri; case X86::SUB16mi8: return X86::SUB16mi; @@ -208,6 +220,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::SUB64ri8: return X86::SUB64ri32; case X86::SUB64mi8: return X86::SUB64mi32; + // SBB + case X86::SBB16ri8: return X86::SBB16ri; + case X86::SBB16mi8: return X86::SBB16mi; + case X86::SBB32ri8: return X86::SBB32ri; + case X86::SBB32mi8: return X86::SBB32mi; + case X86::SBB64ri8: return X86::SBB64ri32; + case X86::SBB64mi8: return X86::SBB64mi32; + // CMP case X86::CMP16ri8: return X86::CMP16ri; case X86::CMP16mi8: return X86::CMP16mi; @@ -279,7 +299,7 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { /// bytes. /// \return - true on success, false on failure bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - static const uint8_t Nops[10][10] = { + static const uint8_t TrueNops[10][10] = { // nop {0x90}, // xchg %ax,%ax @@ -302,17 +322,31 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, }; - // This CPU doesn't support long nops. If needed add more. - // FIXME: Can we get this from the subtarget somehow? - // FIXME: We could generated something better than plain 0x90. - if (!HasNopl) { - for (uint64_t i = 0; i < Count; ++i) - OW->write8(0x90); - return true; - } + // Alternative nop instructions for CPUs that don't support long nops. + static const uint8_t AltNops[7][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // lea 0x0(%esi),%esi + {0x8d, 0x76, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0x74, 0x26, 0x00}, + // nop + lea 0x0(%esi),%esi + {0x90, 0x8d, 0x74, 0x26, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00}, + }; + + // Select the right NOP table. + // FIXME: Can we determine whether the CPU supports long nops from the + // subtarget somehow? + const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops; + assert(HasNopl || MaxNopLength <= 7); - // 15 is the longest single nop instruction. Emit as many 15-byte nops as - // needed, then emit a nop of the remaining length. + // Emit as many maximum-length nops as needed, then emit a nop of the + // remaining length.
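The comment just above states the strategy the loop below implements: greedily emit the largest nop that fits, then repeat on the remainder. A minimal standalone sketch of just that size computation (an illustration only; the real emitter also indexes the nop tables and adds 0x66 prefixes):

    // Sketch: split a Count-byte padding request (Count > 0) into nop sizes
    // no larger than MaxNopLength, mirroring the do/while loop that follows.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    static std::vector<uint8_t> nopSizes(uint64_t Count, uint64_t MaxNopLength) {
      std::vector<uint8_t> Sizes;
      do {
        const uint8_t This = (uint8_t)std::min(Count, MaxNopLength);
        Sizes.push_back(This);
        Count -= This;
      } while (Count != 0);
      return Sizes;
    }
    // nopSizes(20, 15) -> {15, 5};  nopSizes(20, 7) -> {7, 7, 6}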
do { const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10; @@ -359,6 +393,17 @@ public: } }; +class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend { +public: + ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_IAMCU); + } +}; + class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) @@ -610,13 +655,13 @@ private: /// \brief Get the compact unwind number for a given register. The number /// corresponds to the enum lists in compact_unwind_encoding.h. int getCompactUnwindRegNum(unsigned Reg) const { - static const uint16_t CU32BitRegs[7] = { + static const MCPhysReg CU32BitRegs[7] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const uint16_t CU64BitRegs[] = { + static const MCPhysReg CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -780,6 +825,10 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, return new WindowsX86AsmBackend(T, false, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.isOSIAMCU()) + return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU); + return new ELFX86_32AsmBackend(T, OSABI, CPU); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index f0d00b0..9ff85b9 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -41,6 +41,16 @@ namespace X86 { /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -675,7 +685,7 @@ namespace X86II { case X86II::RawFrmSrc: case X86II::RawFrmDst: case X86II::RawFrmDstSrc: - return -1; + return -1; case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: @@ -696,23 +706,27 @@ namespace X86II { // Start from 0, skip registers encoded in VEX_VVVV or a mask register. 
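Put differently: for these memory-only encodings the memory reference starts at operand 0 unless a VEX.vvvv-encoded register and/or an EVEX mask register (k1-k7) come first, each occupying one operand slot. A hedged restatement of the computation in the return that follows:

    // Sketch of the index computation `0 + HasVEX_4V + HasEVEX_K`.
    unsigned FirstMemOp = 0;
    if (HasVEX_4V) ++FirstMemOp; // skip the register encoded in VEX.vvvv
    if (HasEVEX_K) ++FirstMemOp; // skip the EVEX merge/zero mask register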
return 0 + HasVEX_4V + HasEVEX_K; case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: - case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: - case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: - case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: - case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: - case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: - case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: - case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: - case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: - case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: - case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: - case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: return -1; } } @@ -740,7 +754,7 @@ namespace X86II { case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: - return true; + return true; } return false; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index a33468d..736c39d 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -32,9 +32,11 @@ namespace { X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) - : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, - // Only i386 uses Rel instead of RelA. - /*HasRelocationAddend*/ EMachine != ELF::EM_386) {} + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 and IAMCU use Rel instead of RelA. 
+ /*HasRelocationAddend*/ + (EMachine != ELF::EM_386) && + (EMachine != ELF::EM_IAMCU)) {} X86ELFObjectWriter::~X86ELFObjectWriter() {} @@ -246,7 +248,8 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Modifier, Type, IsPCRel); - assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type."); + assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) && + "Unsupported ELF machine type."); return getRelocType32(Modifier, getType32(Type), IsPCRel); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index deaad2a..30d5c80 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -20,39 +20,42 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; - - class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit X86MCAsmInfoDarwin(const Triple &Triple); - }; - - struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { - explicit X86_64MCAsmInfoDarwin(const Triple &Triple); - const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, - MCStreamer &Streamer) const override; - }; - - class X86ELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit X86ELFMCAsmInfo(const Triple &Triple); - }; - - class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor() override; - public: - explicit X86MCAsmInfoMicrosoft(const Triple &Triple); - }; - - class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { - void anchor() override; - public: - explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); - }; +class Triple; + +class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit X86MCAsmInfoDarwin(const Triple &Triple); +}; + +struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +class X86ELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit X86ELFMCAsmInfo(const Triple &Triple); +}; + +class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); +}; + +class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); +}; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 10c434c..dfab6ec 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -510,8 +510,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, emit the most general non-SIB encoding: [REG+disp32] EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, - Fixups); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); return; } @@ -988,6 +988,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, static unsigned DetermineREXPrefix(const MCInst 
&MI, uint64_t TSFlags, const MCInstrDesc &Desc) { unsigned REX = 0; + bool UsesHighByteReg = false; + if (TSFlags & X86II::REX_W) REX |= 1 << 3; // set REX.W @@ -1004,6 +1006,8 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, const MCOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything // that returns non-zero. @@ -1073,6 +1077,9 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, } break; } + if (REX && UsesHighByteReg) + report_fatal_error("Cannot encode high byte register in REX-prefixed instruction"); + return REX; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 83b4091..53a6550 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -122,7 +122,8 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, } else if (TheTriple.isOSBinFormatELF()) { // Force the use of an ELF container. MAI = new X86ELFMCAsmInfo(TheTriple); - } else if (TheTriple.isWindowsMSVCEnvironment()) { + } else if (TheTriple.isWindowsMSVCEnvironment() || + TheTriple.isWindowsCoreCLREnvironment()) { MAI = new X86MCAsmInfoMicrosoft(TheTriple); } else if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment()) { @@ -267,3 +268,184 @@ extern "C" void LLVMInitializeX86TargetMC() { TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, createX86_64AsmBackend); } + +unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, + bool High) { + switch (Size) { + default: return 0; + case 8: + if (High) { + switch (Reg) { + default: return getX86SubSuperRegisterOrZero(Reg, 64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case 
X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case 16: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case 32: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case 64: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: 
case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } +} + +unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { + unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != 0 && "Unexpected register or VT"); + return Res; +} + + diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 6221bab..2d2836f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -79,7 +79,7 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, /// Takes ownership of \p AB and \p CE. MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, MCCodeEmitter *CE, - bool RelaxAll); + bool RelaxAll, bool IncrementalLinkerCompatible); /// Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, @@ -98,6 +98,17 @@ MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); /// Construct X86-64 ELF relocation info. MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); + +/// Returns the sub or super register of a specific X86 register. +/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. +/// Aborts on error. +unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); + +/// Returns the sub or super register of a specific X86 register. +/// Like getX86SubSuperRegister() but returns 0 on error. +unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, + bool High = false); + } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 9e801fc..191ebea 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -149,14 +149,19 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Neither symbol can be modified. 
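A brief aside on the register-width helpers added to X86MCTargetDesc above: getX86SubSuperRegisterOrZero walks those alias tables to find the 8-, 16-, 32-, or 64-bit name of any general-purpose register, and getX86SubSuperRegister is the asserting wrapper. A hypothetical caller (the results follow directly from the switch tables; the surrounding includes are assumed):

    // Sketch: mapping GPR aliases with the helpers declared above.
    unsigned R16  = getX86SubSuperRegister(X86::EAX, 16);         // X86::AX
    unsigned R8   = getX86SubSuperRegister(X86::RCX, 8);          // X86::CL
    unsigned R8H  = getX86SubSuperRegister(X86::DX, 8, true);     // X86::DH
    unsigned Zero = getX86SubSuperRegisterOrZero(X86::XMM0, 32);  // 0: not a GPR

Back in the Mach-O writer, the check below enforces the restriction the preceding comment states.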
if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported relocation of modified symbol", false); + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. Darwin 'as' doesn't // implement most of these correctly. - if (IsPCRel) - report_fatal_error("unsupported pc-relative relocation of difference", - false); + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } // The support for the situation where one or both of the symbols would // require a local relocation is handled just like if the symbols were @@ -168,16 +173,20 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Darwin 'as' doesn't emit correct relocations for this (it ends up with a // single SIGNED relocation); reject it for now. Except the case where both // symbols don't have a base, equal but both NULL. - if (A_Base == B_Base && A_Base) - report_fatal_error("unsupported relocation with identical base", false); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } // A subtraction expression where either symbol is undefined is a // non-relocatable expression. if (A->isUndefined() || B->isUndefined()) { StringRef Name = A->isUndefined() ? A->getName() : B->getName(); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation with subtraction expression, symbol '" + Name + "' can not be undefined in a subtraction expression"); + return; } Value += Writer->getSymbolAddress(*A, Layout) - @@ -244,12 +253,16 @@ void X86MachObjectWriter::RecordX86_64Relocation( FixedValue = Res; return; } else { - report_fatal_error("unsupported relocation of variable '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; } } else { - report_fatal_error("unsupported relocation of undefined symbol '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; } MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); @@ -266,8 +279,9 @@ void X86MachObjectWriter::RecordX86_64Relocation( } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { Type = MachO::X86_64_RELOC_TLV; } else if (Modifier != MCSymbolRefExpr::VK_None) { - report_fatal_error("unsupported symbol modifier in relocation", - false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; } else { Type = MachO::X86_64_RELOC_SIGNED; @@ -292,9 +306,12 @@ void X86MachObjectWriter::RecordX86_64Relocation( } } } else { - if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in branch " - "relocation", false); + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } Type = MachO::X86_64_RELOC_BRANCH; } @@ -309,16 +326,22 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_GOT; IsPCRel = 1; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - 
report_fatal_error("TLVP symbol modifier should have been rip-rel", - false); - } else if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in relocation", false); - else { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { Type = MachO::X86_64_RELOC_UNSIGNED; unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) - report_fatal_error("32-bit absolute addressing is not supported in " - "64-bit mode", false); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } } } } @@ -350,10 +373,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - report_fatal_error("symbol '" + A->getName() + - "' can not be undefined in a subtraction expression", - false); + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -363,10 +389,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression", - false); + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } // Select the appropriate difference relocation type. 
// @@ -387,12 +416,12 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " "relocation entry."); - llvm_unreachable("fatal error returned?!"); + return false; } MachO::any_relocation_info MRE; diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 92f42b6..d045118 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -50,9 +50,11 @@ void X86WinCOFFStreamer::FinishImpl() { MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, - MCCodeEmitter *CE, bool RelaxAll) { + MCCodeEmitter *CE, bool RelaxAll, + bool IncrementalLinkerCompatible) { X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); S->getAssembler().setRelaxAll(RelaxAll); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); return S; } diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index cae865a..619f7c8 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "X86ShuffleDecode.h" -#include "llvm/IR/Constants.h" #include "llvm/CodeGen/MachineValueType.h" //===----------------------------------------------------------------------===// @@ -140,13 +139,14 @@ void DecodePALIGNRMask(MVT VT, unsigned Imm, } } -/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. +/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; unsigned NewImm = Imm; @@ -191,6 +191,16 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm, } } +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumHalfElts = NumElts / 2; + + for (unsigned l = 0; l != NumHalfElts; ++l) + ShuffleMask.push_back(l + NumHalfElts); + for (unsigned h = 0; h != NumHalfElts; ++h) + ShuffleMask.push_back(h); +} + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. @@ -222,7 +232,7 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. 
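Concretely, under the per-lane rule the comment above describes, UNPCKH interleaves the high half of each 128-bit lane from both sources, with second-source elements offset by NumElts in the mask. A small sketch reproducing the loop that follows, with a worked v8i32 result:

    // Sketch: lane-local UNPCKH mask; LaneElts = elements per 128-bit lane.
    #include "llvm/ADT/SmallVector.h"

    static void unpckhMask(unsigned NumElts, unsigned LaneElts,
                           llvm::SmallVectorImpl<int> &Mask) {
      for (unsigned l = 0; l != NumElts; l += LaneElts)
        for (unsigned i = l + LaneElts / 2; i != l + LaneElts; ++i) {
          Mask.push_back(i);           // high-half element from source 1
          Mask.push_back(i + NumElts); // matching element from source 2
        }
    }
    // unpckhMask(8, 4, M) for v8i32: M == {2, 10, 3, 11, 6, 14, 7, 15}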
unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { @@ -253,6 +263,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { } } +/// \brief Decode a shuffle of packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. + if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } +} + void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned HalfSize = VT.getVectorNumElements() / 2; @@ -265,54 +295,6 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - // It is not an error for the PSHUFB mask to not be a vector of i8 because the - // constant pool uniques constants by their bit representation. - // e.g. the following take up the same space in the constant pool: - // i128 -170141183420855150465331762880109871104 - // - // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> - // - // <4 x i32> <i32 -2147483648, i32 -2147483648, - // i32 -2147483648, i32 -2147483648> - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. - return; - // This is a straightforward byte vector. - if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { - int NumElements = MaskTy->getVectorNumElements(); - ShuffleMask.reserve(NumElements); - for (int i = 0; i < NumElements; ++i) { - // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte - // lane of the vector we're inside. - int Base = i < 16 ? 0 : 16; - Constant *COp = C->getAggregateElement(i); - if (!COp) { - ShuffleMask.clear(); - return; - } else if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // If the high bit (7) of the byte is set, the element is zeroed. - if (Element & (1 << 7)) - ShuffleMask.push_back(SM_SentinelZero); - else { - // Only the least significant 4 bits of the byte are used. - int Index = Base + (Element & 0xf); - ShuffleMask.push_back(Index); - } - } - } - // TODO: Handle funny-looking vectors too.
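The constant-pool decoder removed above (its raw-mask sibling remains) documents PSHUFB's semantics in its comments: if bit 7 of a control byte is set, the result byte is zeroed; otherwise the low four bits select a byte within the containing 16-byte lane. A minimal sketch of that rule, assuming this file's SM_SentinelZero marker (-2):

    // Sketch: byte-wise PSHUFB decode at 16-byte-lane granularity.
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"

    static void pshufbBytes(llvm::ArrayRef<uint8_t> Ctrl,
                            llvm::SmallVectorImpl<int> &Mask) {
      for (size_t i = 0, e = Ctrl.size(); i != e; ++i) {
        const unsigned LaneBase = (i / 16) * 16; // never crosses a lane
        if (Ctrl[i] & 0x80)
          Mask.push_back(-2 /* SM_SentinelZero: element is zeroed */);
        else
          Mask.push_back(LaneBase + (Ctrl[i] & 0xf));
      }
    }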
-} - void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, SmallVectorImpl<int> &ShuffleMask) { for (int i = 0, e = RawMask.size(); i < e; ++i) { @@ -357,46 +339,6 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { - Type *MaskTy = C->getType(); - assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); - assert(MaskTy->getVectorElementType()->isIntegerTy() && - "Expected integer constant mask elements!"); - int ElementBits = MaskTy->getScalarSizeInBits(); - int NumElements = MaskTy->getVectorNumElements(); - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && - "Unexpected number of vector elements."); - ShuffleMask.reserve(NumElements); - if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { - assert((unsigned)NumElements == CDS->getNumElements() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - uint64_t Element = CDS->getElementAsInteger(i); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } else if (auto *CV = dyn_cast<ConstantVector>(C)) { - assert((unsigned)NumElements == C->getNumOperands() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - Constant *COp = CV->getOperand(i); - if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } -} - void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { unsigned NumDstElts = DstVT.getVectorNumElements(); unsigned SrcScalarBits = SrcVT.getScalarSizeInBits(); @@ -503,4 +445,20 @@ void DecodeINSERTQIMask(int Len, int Idx, ShuffleMask.push_back(SM_SentinelUndef); } +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 3d10d18..72db6a8 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -23,7 +23,6 @@ //===----------------------------------------------------------------------===// namespace llvm { -class Constant; class MVT; enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; @@ -54,6 +53,9 @@ void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decodes a PSWAPD 3DNow! instruction. +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. 
@@ -69,9 +71,6 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// different datatypes and vector widths. void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a PSHUFB mask from an IR-level vector constant. -void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a PSHUFB mask from a raw array of constants such as from /// BUILD_VECTOR. void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, @@ -83,13 +82,15 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a shuffle packed values at 128-bit granularity +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); - /// \brief Decode a zero extension instruction as a shuffle mask. void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &ShuffleMask); @@ -108,6 +109,14 @@ void DecodeEXTRQIMask(int Len, int Idx, /// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. void DecodeINSERTQIMask(int Len, int Idx, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 8403ae6..01e65b8 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -23,56 +23,48 @@ class FunctionPass; class ImmutablePass; class X86TargetMachine; -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createX86GlobalBaseRegPass - This pass initializes a global base -/// register for PIC on x86-32. -FunctionPass* createX86GlobalBaseRegPass(); +/// This pass initializes a global base register for PIC on x86-32. +FunctionPass *createX86GlobalBaseRegPass(); -/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses -/// to local-dynamic TLS variables so that the TLS base address for the module -/// is only fetched once per execution path through the function. +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. FunctionPass *createCleanupLocalDynamicTLSPass(); -/// createX86FloatingPointStackifierPass - This function returns a pass which -/// converts floating point register references and pseudo instructions into -/// floating point stack references and physical instructions. 
-/// +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. FunctionPass *createX86FloatingPointStackifierPass(); -/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions -/// before each call to avoid transition penalty between functions encoded with -/// AVX and SSE. +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86EmitCodeToMemory - Returns a pass that converts a register -/// allocated function into raw machine code in a dynamically -/// allocated chunk of memory. -/// -FunctionPass *createEmitX86CodeToMemory(); - -/// createX86PadShortFunctions - Return a pass that pads short functions -/// with NOOPs. This will prevent a stall when returning on the Atom. +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); -/// createX86FixupLEAs - Return a a pass that selectively replaces -/// certain instructions (like add, sub, inc, dec, some shifts, -/// and some multiplies) by equivalent LEA instructions, in order -/// to eliminate execution delays in some Atom processors. + +/// Return a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. FunctionPass *createX86FixupLEAs(); -/// createX86CallFrameOptimization - Return a pass that optimizes -/// the code-size of x86 call sequences. This is done by replacing -/// esp-relative movs with pushes. +/// Return a pass that removes redundant LEA instructions and redundant address +/// recalculations. +FunctionPass *createX86OptimizeLEAs(); + +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); -/// createX86WinEHStatePass - Return an IR pass that inserts EH registration -/// stack objects and explicit EH state updates. This pass must run after EH -/// preparation, which does Windows-specific but architecture-neutral -/// preparation. +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation. 
FunctionPass *createX86WinEHStatePass(); /// Return a Machine IR pass that expands X86-specific pseudo diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index 8522674..8902a85 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -37,14 +37,26 @@ def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; -def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", - "Enable MMX instructions">; def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all // SSE1+ processors support them. - [FeatureMMX, FeatureCMOV]>; + [FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", [FeatureSSE1]>; @@ -60,6 +72,11 @@ def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. +def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", "Enable 3DNow! instructions", [FeatureMMX]>; @@ -79,16 +96,13 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; -// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that -// explicit. Also, it seems this would be the default state for most chips -// going forward, so it would probably be better to negate the logic and -// match the 32-byte "slow mem" feature below. -def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", - "IsUAMemFast", "true", - "Fast unaligned memory access">; +// FIXME: This should not apply to CPUs that do not have SSE. 
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", - "IsUAMem32Slow", "true", - "Slow unaligned 32-byte memory access">; + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -120,6 +134,8 @@ def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", "Enable AVX-512 Vector Length eXtensions", [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -168,9 +184,11 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; -def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", +def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", @@ -181,6 +199,11 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; +// TODO: This feature ought to be renamed. +// What it really refers to are CPUs for which certain instructions +// (which ones besides the example below?) are microcoded. +// The best examples of this are the memory forms of CALL and PUSH +// instructions, which should be avoided in favor of a MOV + register CALL/PUSH. 
def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "CallRegIndirect", "true", "Call register indirect">; @@ -208,278 +231,473 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; -def : Proc<"generic", []>; -def : Proc<"i386", []>; -def : Proc<"i486", []>; -def : Proc<"i586", []>; -def : Proc<"pentium", []>; -def : Proc<"pentium-mmx", [FeatureMMX]>; -def : Proc<"i686", []>; -def : Proc<"pentiumpro", [FeatureCMOV]>; -def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; -def : Proc<"pentium3", [FeatureSSE1]>; -def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; -def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"pentium4", [FeatureSSE2]>; -def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"generic", [FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"i686", [FeatureSlowUAMem16]>; +def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, + FeatureFXSR]>; +def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR]>; +def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR]>; +def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureSSE3, FeatureSlowBTMem]>; + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; // NetBurst. -def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : Proc<"prescott", + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; +def : Proc<"nocona", [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem +]>; // Intel Core 2 Solo/Duo. -def : ProcessorModel<"core2", SandyBridgeModel, - [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : ProcessorModel<"penryn", SandyBridgeModel, - [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE41, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; // Atom CPUs. 
class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ - ProcIntelAtom, - FeatureSSSE3, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeatureSlowBTMem, - FeatureLeaForSP, - FeatureSlowDivide32, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeatureLEAUsesAG, - FeaturePadShortFunctions - ]>; + ProcIntelAtom, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureLAHFSAHF +]>; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ - ProcIntelSLM, - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureAES, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeaturePRFCHW, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureSlowBTMem, - FeatureFastUAMem - ]>; + ProcIntelSLM, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. // "Arrandale" along with corei3 and corei5 class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureLAHFSAHF +]>; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureLAHFSAHF +]>; def : WestmereProc<"westmere">; // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF +]>; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. 
class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureADX, - FeatureRDSEED, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : BroadwellProc<"broadwell">; // FIXME: define KNL model -class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec, FeatureMPX]>; +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureMPX, + FeatureLAHFSAHF +]>; def : KnightsLandingProc<"knl">; // FIXME: define SKX model -class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureCDI, - FeatureDQI, FeatureBWI, FeatureVLX, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - 
FeatureSlowIncDec, FeatureMPX]>; +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureLAHFSAHF +]>; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. // AMD CPUs. -def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [Feature3DNow]>; -def : Proc<"k6-3", [Feature3DNow]>; -def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"amdfam10", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + 
FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"barcelona", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; +def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; + // Bobcat -def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowSHLD]>; +def : Proc<"btver1", [ + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Jaguar -def : ProcessorModel<"btver2", BtVer2Model, - [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureAES, FeaturePCLMUL, - FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, - FeatureSlowSHLD]>; - -// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. +def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Bulldozer -def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowSHLD]>; +def : Proc<"bdver1", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Piledriver -def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD]>; +def : Proc<"bdver2", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Steamroller -def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD, - FeatureFSGSBase]>; +def : Proc<"bdver3", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; // Excavator -def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, - FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, - 
FeaturePCLMUL, FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureBMI2, - FeatureTBM, FeatureFMA, FeatureSSE4A, - FeatureFSGSBase]>; - -def : Proc<"geode", [Feature3DNowA]>; - -def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [Feature3DNow]>; -def : Proc<"c3", [Feature3DNow]>; -def : Proc<"c3-2", [FeatureSSE1]>; +def : Proc<"bdver4", [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureBMI2, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -492,8 +710,8 @@ def : Proc<"c3-2", [FeatureSSE1]>; // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, - [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem]>; + [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem ]>; //===----------------------------------------------------------------------===// // Register File Description @@ -520,10 +738,6 @@ include "X86CallingConv.td" // Assembly Parser //===----------------------------------------------------------------------===// -def ATTAsmParser : AsmParser { - string AsmParserClassName = "AsmParser"; -} - def ATTAsmParserVariant : AsmParserVariant { int Variant = 0; @@ -568,7 +782,6 @@ def IntelAsmWriter : AsmWriter { def X86 : Target { // Information about the instructions... let InstructionSet = X86InstrInfo; - let AssemblyParsers = [ATTAsmParser]; let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; } diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index ba33248..2170e62 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -217,10 +217,10 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, if (AsmVariant == 0) O << '%'; unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ? - MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : - ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); - Reg = getX86SubSuperRegister(Reg, VT); + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); } O << X86ATTInstPrinter::getRegisterName(Reg); return; @@ -361,22 +361,21 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, switch (Mode) { default: return true; // Unknown mode. 
case 'b': // Print QImode register - Reg = getX86SubSuperRegister(Reg, MVT::i8); + Reg = getX86SubSuperRegister(Reg, 8); break; case 'h': // Print QImode high register - Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + Reg = getX86SubSuperRegister(Reg, 8, true); break; case 'w': // Print HImode register - Reg = getX86SubSuperRegister(Reg, MVT::i16); + Reg = getX86SubSuperRegister(Reg, 16); break; case 'k': // Print SImode register - Reg = getX86SubSuperRegister(Reg, MVT::i32); + Reg = getX86SubSuperRegister(Reg, 32); break; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. - MVT::SimpleValueType Ty = P.getSubtarget().is64Bit() ? MVT::i64 : MVT::i32; - Reg = getX86SubSuperRegister(Reg, Ty); + Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32); break; } @@ -535,6 +534,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { S, MCConstantExpr::create(int64_t(1), MMI->getContext())); } } + OutStreamer->EmitSyntaxDirective(); } static void @@ -565,10 +565,11 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { - SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); + const DataLayout &DL = MF->getDataLayout(); + SectionKind Kind = CPE.getSectionKind(&DL); const Constant *C = CPE.Val.ConstVal; if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( - getObjFileLowering().getSectionForConstant(Kind, C))) { + getObjFileLowering().getSectionForConstant(DL, Kind, C))) { if (MCSymbol *Sym = S->getCOMDATSymbol()) { if (Sym->isUndefined()) OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global); diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index 7f5d127..9c8bd98 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -78,8 +78,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // outputting it to the OutStream. This allows the shadow tracker to minimise // the number of NOPs used for stackmap padding. 
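All of the asm-modifier cases above now funnel into the size-based getX86SubSuperRegister. A few concrete mappings (a sketch; the signature is inferred from the call sites in this patch):

#include "X86RegisterInfo.h" // declares getX86SubSuperRegister

static void subSuperRegExamples() {
  using namespace llvm;
  unsigned B = getX86SubSuperRegister(X86::EAX, 8);       // X86::AL  ('b')
  unsigned H = getX86SubSuperRegister(X86::EAX, 8, true); // X86::AH  ('h')
  unsigned W = getX86SubSuperRegister(X86::EAX, 16);      // X86::AX  ('w')
  unsigned K = getX86SubSuperRegister(X86::RAX, 32);      // X86::EAX ('k')
  unsigned Q = getX86SubSuperRegister(X86::EAX, 64);      // X86::RAX ('q')
  (void)B; (void)H; (void)W; (void)K; (void)Q;
}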
void EmitAndCountInstruction(MCInst &Inst); - - void InsertStackMapShadows(MachineFunction &MF); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 031ba4b..fc6ee17 100644 --- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" @@ -53,10 +54,13 @@ private: // Information we know about a particular call site struct CallContext { CallContext() - : Call(nullptr), SPCopy(nullptr), ExpectedDist(0), - MovVector(4, nullptr), NoStackParams(false), UsePush(false){}; + : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0), + MovVector(4, nullptr), NoStackParams(false), UsePush(false){} - // Actuall call instruction + // Iterator referring to the frame setup instruction + MachineBasicBlock::iterator FrameSetup; + + // Actual call instruction MachineInstr *Call; // A copy of the stack pointer @@ -75,17 +79,16 @@ private: bool UsePush; }; - typedef DenseMap<MachineInstr *, CallContext> ContextMap; + typedef SmallVector<CallContext, 8> ContextVector; bool isLegal(MachineFunction &MF); - bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); + bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap); void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, CallContext &Context); - bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I, - const CallContext &Context); + bool adjustCallSequence(MachineFunction &MF, const CallContext &Context); MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, unsigned Reg); @@ -100,7 +103,8 @@ private: const char *getPassName() const override { return "X86 Optimize Call Frame"; } const TargetInstrInfo *TII; - const TargetFrameLowering *TFL; + const X86FrameLowering *TFL; + const X86Subtarget *STI; const MachineRegisterInfo *MRI; static char ID; }; @@ -124,8 +128,15 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // No point in running this in 64-bit mode, since some arguments are // passed in-register in all common calling conventions, so the pattern // we're looking for will never match. - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - if (STI.is64Bit()) + if (STI->is64Bit()) + return false; + + // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset + // in the compact unwind encoding that Darwin uses. So, bail if there + // is a danger of that being generated. + if (STI->isTargetDarwin() && + (!MF.getMMI().getLandingPads().empty() || + (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF)))) return false; // You would expect straight-line code between call-frame setup and @@ -161,7 +172,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // Check whether this trasnformation is profitable for a particular // function - in terms of code size. 
bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, - ContextMap &CallSeqMap) { + ContextVector &CallSeqVector) { // This transformation is always a win when we do not expect to have // a reserved call frame. Under other circumstances, it may be either // a win or a loss, and requires a heuristic. @@ -170,24 +181,20 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, return true; // Don't do this when not optimizing for size. - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); - - if (!OptForSize) + if (!MF.getFunction()->optForSize()) return false; unsigned StackAlign = TFL->getStackAlignment(); int64_t Advantage = 0; - for (auto CC : CallSeqMap) { + for (auto CC : CallSeqVector) { // Call sites where no parameters are passed on the stack // do not affect the cost, since there needs to be no // stack adjustment. - if (CC.second.NoStackParams) + if (CC.NoStackParams) continue; - if (!CC.second.UsePush) { + if (!CC.UsePush) { // If we don't use pushes for a particular call site, // we pay for not having a reserved call frame with an // additional sub/add esp pair. The cost is ~3 bytes per instruction, @@ -200,11 +207,11 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, // We'll need a add after the call. Advantage -= 3; // If we have to realign the stack, we'll also need and sub before - if (CC.second.ExpectedDist % StackAlign) + if (CC.ExpectedDist % StackAlign) Advantage -= 3; // Now, for each push, we save ~3 bytes. For small constants, we actually, // save more (up to 5 bytes), but 3 should be a good approximation. - Advantage += (CC.second.ExpectedDist / 4) * 3; + Advantage += (CC.ExpectedDist / 4) * 3; } } @@ -212,8 +219,9 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, } bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { - TII = MF.getSubtarget().getInstrInfo(); - TFL = MF.getSubtarget().getFrameLowering(); + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + TFL = STI->getFrameLowering(); MRI = &MF.getRegInfo(); if (!isLegal(MF)) @@ -223,21 +231,22 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - ContextMap CallSeqMap; + ContextVector CallSeqVector; for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) if (I->getOpcode() == FrameSetupOpcode) { - CallContext &Context = CallSeqMap[I]; + CallContext Context; collectCallInfo(MF, *BB, I, Context); + CallSeqVector.push_back(Context); } - if (!isProfitable(MF, CallSeqMap)) + if (!isProfitable(MF, CallSeqVector)) return false; - for (auto CC : CallSeqMap) - if (CC.second.UsePush) - Changed |= adjustCallSequence(MF, CC.first, CC.second); + for (auto CC : CallSeqVector) + if (CC.UsePush) + Changed |= adjustCallSequence(MF, CC); return Changed; } @@ -307,13 +316,13 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, // Check that this particular call sequence is amenable to the // transformation. 
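The UsePush branch of the Advantage computation above, pulled out as a standalone sketch (same constants; assumed inputs, not the pass itself):

// ~3 bytes per instruction is the approximation used throughout.
static int pushSiteAdvantage(unsigned ExpectedDist, unsigned StackAlign) {
  int Advantage = 0;
  Advantage -= 3;                      // the add esp, N after the call
  if (ExpectedDist % StackAlign)
    Advantage -= 3;                    // plus a realigning sub before it
  Advantage += (ExpectedDist / 4) * 3; // each converted push saves ~3 bytes
  return Advantage; // e.g. ExpectedDist = 16, StackAlign = 16 -> +9, a win
}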
const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - unsigned StackPtr = RegInfo.getStackRegister(); + STI->getRegisterInfo()); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); // We expect to enter this at the beginning of a call sequence assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); MachineBasicBlock::iterator FrameSetup = I++; + Context.FrameSetup = FrameSetup; // How much do we adjust the stack? This puts an upper bound on // the number of parameters actually passed on it. @@ -338,7 +347,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (!I->isCopy() || !I->getOperand(0).isReg()) return; Context.SPCopy = I++; - StackPtr = Context.SPCopy->getOperand(0).getReg(); + + unsigned StackPtr = Context.SPCopy->getOperand(0).getReg(); // Scan the call setup sequence for the pattern we're looking for. // We only handle a simple case - a sequence of MOV32mi or MOV32mr @@ -434,22 +444,22 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, } bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, - MachineBasicBlock::iterator I, const CallContext &Context) { // Ok, we can in fact do the transformation for this call. // Do not remove the FrameSetup instruction, but adjust the parameters. // PEI will end up finalizing the handling of this. - MachineBasicBlock::iterator FrameSetup = I; - MachineBasicBlock &MBB = *(I->getParent()); + MachineBasicBlock::iterator FrameSetup = Context.FrameSetup; + MachineBasicBlock &MBB = *(FrameSetup->getParent()); FrameSetup->getOperand(1).setImm(Context.ExpectedDist); - DebugLoc DL = I->getDebugLoc(); + DebugLoc DL = FrameSetup->getDebugLoc(); // Now, iterate through the vector in reverse order, and replace the movs // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to // replace uses. for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + MachineBasicBlock::iterator Push = nullptr; if (MOV->getOpcode() == X86::MOV32mi) { unsigned PushOpcode = X86::PUSHi32; // If the operand is a small (8-bit) immediate, we can use a @@ -461,21 +471,20 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, if (isInt<8>(Val)) PushOpcode = X86::PUSH32i8; } - BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) + .addOperand(PushOp); } else { unsigned int Reg = PushOp.getReg(); // If PUSHrmm is not slow on this target, try to fold the source of the // push into the instruction. - const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); + bool SlowPUSHrmm = STI->isAtom() || STI->isSLM(); // Check that this is legal to fold. Right now, we're extremely // conservative about that. 
MachineInstr *DefMov = nullptr; if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { - MachineInstr *Push = - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); unsigned NumOps = DefMov->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -483,12 +492,19 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, DefMov->eraseFromParent(); } else { - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) .addReg(Reg) .getInstr(); } } + // For debugging, when using SP-based CFA, we need to adjust the CFA + // offset after each push. + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) + TFL->BuildCFI(MBB, std::next(Push), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); + MBB.erase(MOV); } @@ -532,13 +548,10 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( DefMI->getParent() != FrameSetup->getParent()) return nullptr; - // Now, make sure everything else up until the ADJCALLSTACK is a sequence - // of MOVs. To be less conservative would require duplicating a lot of the - // logic from PeepholeOptimizer. - // FIXME: A possibly better approach would be to teach the PeepholeOptimizer - // to be smarter about folding into pushes. + // Make sure we don't have any instructions between DefMI and the + // push that make folding the load illegal. for (auto I = DefMI; I != FrameSetup; ++I) - if (I->getOpcode() != X86::MOV32rm) + if (I->isLoadFoldBarrier()) return nullptr; return DefMI; diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index 0eb2494..a08160f 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H #define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" @@ -42,6 +43,64 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } +inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of an double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. If this is not + // the end of the split, return, otherwise go on to process the pending + // list + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff. 
+ if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 8f88888..e8b96e7 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -158,6 +158,7 @@ def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, // MMX vector types are always returned in XMM0. CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, @@ -202,6 +203,16 @@ def RetCC_X86_64_AnyReg : CallingConv<[ CCCustom<"CC_X86_AnyReg_Error"> ]>; +// X86-64 HHVM return-value convention. +def RetCC_X86_64_HHVM: CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: could return in any GP register save RSP and R12. + CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14, R15]>> +]>; + // This is the root return-value convention for the X86-32 backend. def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. @@ -227,6 +238,9 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + // Handle HHVM calls. + CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -280,7 +294,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v64i1], CCPromoteToType<v64i8>>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -305,7 +319,7 @@ def CC_X86_64_C : CallingConv<[ // Long doubles get stack slots whose size and alignment depends on the // subtarget. - CCIfType<[f80], CCAssignToStack<0, 0>>, + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, @@ -319,6 +333,23 @@ def CC_X86_64_C : CallingConv<[ CCAssignToStack<64, 64>> ]>; +// Calling convention for X86-64 HHVM. +def CC_X86_64_HHVM : CallingConv<[ + // Use all/any GP registers for args, except RSP. 
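Tracing CC_X86_32_MCUInReg above with RegList = {EAX, EDX, ECX} gives assignments like the following (hypothetical signatures on a 32-bit MCU target; a sketch of the rules, not compiler output):

// i32 arguments take registers one at a time; a split i64/double must fit
// entirely in the registers still free, and never uses more than two.
void f(long long x);               // x -> EAX:EDX (two free, two needed)
void g(int a, long long b);        // a -> EAX; b -> EDX:ECX
void h(int a, int b, long long c); // a -> EAX, b -> EDX; c -> stack,
                                   // since only ECX would remain for it
// An i128 argument splits into four pending i32 parts, which exceeds the
// two-register cap, so all four parts take 4-byte stack slots.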
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, + RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14]>> +]>; + +// Calling convention for helper functions in HHVM. +def CC_X86_64_HHVM_C : CallingConv<[ + // Pass the first argument in RBP. + CCIfType<[i64], CCAssignToReg<[RBP]>>, + + // Otherwise it's the same as the regular C calling convention. + CCDelegateTo<CC_X86_64_C> +]>; + // Calling convention used on Win64 def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. @@ -561,6 +592,23 @@ def CC_X86_32_C : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +def CC_X86_32_MCU : CallingConv<[ + // Handles byval parameters. Note that, like FastCC, we can't rely on + // the delegation to CC_X86_32_Common because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // If the call is not a vararg call, some arguments may be passed + // in integer registers. + CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + def CC_X86_32_FastCall : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -708,18 +756,28 @@ def CC_Intel_OCL_BI : CallingConv<[ CCDelegateTo<CC_X86_32_C> ]>; +def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> +]>; + +def CC_X86_64_Intr : CallingConv<[ + CCAssignToStack<8, 8> +]>; + //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// // This is the root argument convention for the X86-32 backend. def CC_X86_32 : CallingConv<[ + CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, // Otherwise, drop to normal X86-32 CC CCDelegateTo<CC_X86_32_C> @@ -734,6 +792,9 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, + CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, + CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -764,6 +825,18 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; +// The function used by Darwin to obtain the address of a thread-local variable +// uses rdi to pass a single parameter and rax for the return value. All other +// GPRs are preserved. +def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, + R8, R9, R10, R11)>; + +// CSRs that are handled by prologue, epilogue. 
+def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; + +// CSRs that are handled explicitly via copies. +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; + // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; @@ -778,6 +851,11 @@ def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15, RBP, (sequence "XMM%u", 0, 15))>; +def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, + EDI, ESP)>; +def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "XMM%u", 0, 7))>; + def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP, (sequence "XMM%u", 16, 31))>; def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP, @@ -804,3 +882,6 @@ def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, (sequence "ZMM%u", 16, 31), K4, K5, K6, K7)>; + +// Only R12 is preserved for PHP calls in HHVM. +def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 6a5a28e..a09d065 100644 --- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,9 +19,10 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. #include "llvm/IR/GlobalValue.h" using namespace llvm; @@ -141,6 +142,24 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // The EH_RETURN pseudo is really removed during the MC Lowering. return true; } + case X86::IRET: { + // Adjust stack to erase error code + int64_t StackAdj = MBBI->getOperand(0).getImm(); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true); + // Replace pseudo with machine iret + BuildMI(MBB, MBBI, DL, + TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32)); + MBB.erase(MBBI); + return true; + } + case X86::EH_RESTORE: { + // Restore ESP and EBP, and optionally ESI if required. + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( + MBB.getParent()->getFunction()->getPersonalityFn())); + X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); + MBBI->eraseFromParent(); + return true; + } } llvm_unreachable("Previous switch has a fallthrough?"); } diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index b4319c8..f48b479 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -298,8 +298,8 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. 
@@ -433,6 +433,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4A = Subtarget->hasSSE4A(); + bool HasAVX = Subtarget->hasAVX(); + bool IsNonTemporal = MMO && MMO->isNonTemporal(); + // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -449,35 +454,59 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; - case MVT::i32: Opc = X86::MOV32mr; break; - case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. + case MVT::i32: + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr; + break; case MVT::f32: - Opc = X86ScalarSSEf32 ? - (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSS; + else + Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + } else + Opc = X86::ST_Fp32m; break; case MVT::f64: - Opc = X86ScalarSSEf64 ? - (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; + if (X86ScalarSSEf64) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSD; + else + Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + } else + Opc = X86::ST_Fp64m; break; case MVT::v4f32: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + else + Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + } else + Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + else + Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + } else + Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; - else + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + else + Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; + } else Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; break; } @@ -973,6 +1002,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && @@ -1069,12 +1101,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. We saved the argument into - // a virtual register in the entry block, so now we copy the value out - // and into %rax. We also do the same with %eax for Win32. 
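The IsNonTemporal flag consulted above comes from !nontemporal metadata on the IR store. A quick way to produce such stores, assuming a Clang front end (__builtin_nontemporal_store emits exactly that metadata):

    // With SSE2 available, fast-isel now selects MOVNTImr (a streaming
    // store that bypasses the cache) for each of these i32 stores,
    // instead of the ordinary MOV32mr.
    void stream_fill(int *dst, int v, int n) {
      for (int i = 0; i < n; ++i)
        __builtin_nontemporal_store(v, &dst[i]);
    }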
- if (F.hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { + // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + if (F.hasStructRetAttr()) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); @@ -1431,17 +1462,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addMBB(TrueMBB); } - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); - - // Emits an unconditional branch to the FalseBB, obtains the branch - // weight, and adds it to the successor list. - fastEmitBranch(FalseMBB, DbgLoc); - + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1472,12 +1493,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } @@ -1492,12 +1509,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1511,12 +1523,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1945,6 +1952,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned ResultReg; if (Subtarget->hasAVX()) { + const TargetRegisterClass *FR32 = &X86::FR32RegClass; + const TargetRegisterClass *VR128 = &X86::VR128RegClass; + // If we have AVX, create 1 blendv instead of 3 logic instructions. // Blendv was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many @@ -1955,10 +1965,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned BlendOpcode = (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill, - LHSReg, LHSIsKill, CmpReg, true); + unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); @@ -2806,10 +2819,12 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, if (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE) return 0; - if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) - return 0; - if (CS && CS->paramHasAttr(1, Attribute::InReg)) - return 0; + + if (CS) + if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) || + CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU()) + return 0; + return 4; } @@ -2924,7 +2939,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); @@ -3020,8 +3035,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, - ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (Flags.isByVal()) { X86AddressMode SrcAM; SrcAM.Base.Reg = ArgReg; @@ -3252,6 +3267,30 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { updateValueMap(I, Reg); return true; } + case Instruction::BitCast: { + // Select SSE2/AVX bitcasts between 128/256 bit vector types. + if (!Subtarget->hasSSE2()) + return false; + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + if (!SrcVT.isSimple() || !DstVT.isSimple()) + return false; + + if (!SrcVT.is128BitVector() && + !(Subtarget->hasAVX() && SrcVT.is256BitVector())) + return false; + + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) + return false; + + // No instruction is needed for conversion. Reuse the register used by + // the first operand. 
+ updateValueMap(I, Reg); + return true; + } } return false; @@ -3384,8 +3423,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - DL.getPointerSize(), Align); + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, DL.getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index 5eb4fae..1dd69e8 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -9,6 +9,7 @@ // // This file defines the pass that finds instructions that can be // re-written as LEA instructions in order to reduce pipeline delays. +// When optimizing for size it replaces suitable LEAs with INC or DEC. // //===----------------------------------------------------------------------===// @@ -61,6 +62,11 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg + /// and convert them to INC or DEC respectively. + bool fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const; + /// \brief Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); @@ -89,6 +95,8 @@ public: private: MachineFunction *MF; const X86InstrInfo *TII; // Machine instruction info. + bool OptIncDec; + bool OptLEA; }; char FixupLEAPass::ID = 0; } @@ -150,7 +158,10 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); - if (!ST.LEAusesAG() && !ST.slowLEA()) + OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); + OptLEA = ST.LEAusesAG() || ST.slowLEA(); + + if (!OptLEA && !OptIncDec) return false; TII = ST.getInstrInfo(); @@ -187,7 +198,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { if (I == MFI->begin()) { - if (MFI->isPredecessor(MFI)) { + if (MFI->isPredecessor(&*MFI)) { I = --MFI->end(); return true; } else @@ -222,6 +233,60 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return nullptr; } +static inline bool isLEA(const int opcode) { + return opcode == X86::LEA16r || opcode == X86::LEA32r || + opcode == X86::LEA64r || opcode == X86::LEA64_32r; +} + +/// isLEASimpleIncOrDec - Does this LEA have one of these forms: +/// lea %reg, 1(%reg) +/// lea %reg, -1(%reg) +static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) { + unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg(); + unsigned DstReg = LEA->getOperand(0).getReg(); + unsigned AddrDispOp = 1 + X86::AddrDisp; + return SrcReg == DstReg && + LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && + LEA->getOperand(AddrDispOp).isImm() && + (LEA->getOperand(AddrDispOp).getImm() == 1 || + LEA->getOperand(AddrDispOp).getImm() == -1); +} + +bool 
FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const { + MachineInstr *MI = I; + int Opcode = MI->getOpcode(); + if (!isLEA(Opcode)) + return false; + + if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { + int NewOpcode; + bool isINC = MI->getOperand(4).getImm() == 1; + switch (Opcode) { + case X86::LEA16r: + NewOpcode = isINC ? X86::INC16r : X86::DEC16r; + break; + case X86::LEA32r: + case X86::LEA64_32r: + NewOpcode = isINC ? X86::INC32r : X86::DEC32r; + break; + case X86::LEA64r: + NewOpcode = isINC ? X86::INC64r : X86::DEC64r; + break; + } + + MachineInstr *NewMI = + BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + return true; + } + return false; +} + void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { // Process a load, store, or LEA instruction. @@ -265,8 +330,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr *MI = I; const int opcode = MI->getOpcode(); - if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r && - opcode != X86::LEA64_32r) + if (!isLEA(opcode)) return; if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -280,7 +344,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; int addrr_opcode, addri_opcode; switch (opcode) { - default: llvm_unreachable("Unexpected LEA instruction"); + default: + llvm_unreachable("Unexpected LEA instruction"); case X86::LEA16r: addrr_opcode = X86::ADD16rr; addri_opcode = X86::ADD16ri; @@ -330,10 +395,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (MF.getSubtarget<X86Subtarget>().isSLM()) - processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + if (OptIncDec) + if (fixupIncDec(I, MFI)) + continue; + + if (OptLEA) { + if (MF.getSubtarget<X86Subtarget>().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } } return false; } diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 40b9c8a..97bb8ab 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -120,12 +120,10 @@ namespace { // Return a bitmask of FP registers in block's live-in list. 
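A concrete before/after for the fixupIncDec transform implemented above (illustrative assembly; the exact mnemonic follows the LEA width):

    // Eligible only when base == dest, there is no index or segment
    // register, the displacement is +/-1, and isSafeToClobberEFLAGS
    // proves the INC/DEC flag results cannot be observed:
    //   leal  1(%eax), %eax   ->   incl %eax
    //   leaq -1(%rcx), %rcx   ->   decq %rcx
    // INC/DEC encode shorter than LEA, which is why the pass also runs
    // under optForMinSize even on subtargets where INC/DEC are slow.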
static unsigned calcLiveInMask(MachineBasicBlock *MBB) { unsigned Mask = 0; - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) { - unsigned Reg = *I; - if (Reg < X86::FP0 || Reg > X86::FP6) + for (const auto &LI : MBB->liveins()) { + if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) continue; - Mask |= 1 << (Reg - X86::FP0); + Mask |= 1 << (LI.PhysReg - X86::FP0); } return Mask; } @@ -301,8 +299,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { FPIsUsed = true; break; } @@ -321,7 +320,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process the function in depth first order so that we process at least one // of the predecessors for every reachable block in the function. SmallPtrSet<MachineBasicBlock*, 8> Processed; - MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock *Entry = &MF.front(); bool Changed = false; for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) @@ -329,9 +328,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process any unreachable blocks in arbitrary order now. if (MF.size() != Processed.size()) - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - if (Processed.insert(BB).second) - Changed |= processBasicBlock(MF, *BB); + for (MachineBasicBlock &BB : MF) + if (Processed.insert(&BB).second) + Changed |= processBasicBlock(MF, BB); LiveBundles.clear(); @@ -348,13 +347,12 @@ void FPS::bundleCFG(MachineFunction &MF) { LiveBundles.resize(Bundles->getNumBundles()); // Gather the actual live-in masks for all MBBs. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; - const unsigned Mask = calcLiveInMask(MBB); + for (MachineBasicBlock &MBB : MF) { + const unsigned Mask = calcLiveInMask(&MBB); if (!Mask) continue; // Update MBB ingoing bundle mask. 
- LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask; + LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask; } } @@ -546,17 +544,9 @@ namespace { }; } -#ifndef NDEBUG -static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { - for (unsigned i = 0; i != NumEntries-1; ++i) - if (!(Table[i] < Table[i+1])) return false; - return true; -} -#endif - -static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { - const TableEntry *I = std::lower_bound(Table, Table+N, Opcode); - if (I != Table+N && I->from == Opcode) +static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + if (I != Table.end() && I->from == Opcode) return I->to; return -1; } @@ -567,7 +557,7 @@ static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { #define ASSERT_SORTED(TABLE) \ { static bool TABLE##Checked = false; \ if (!TABLE##Checked) { \ - assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \ + assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \ "All lookup tables must be sorted for efficient access!"); \ TABLE##Checked = true; \ } \ @@ -746,7 +736,7 @@ static const TableEntry OpcodeTable[] = { static unsigned getConcreteOpcode(unsigned Opcode) { ASSERT_SORTED(OpcodeTable); - int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode); + int Opc = Lookup(OpcodeTable, Opcode); assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); return Opc; } @@ -797,7 +787,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { RegMap[Stack[--StackTop]] = ~0; // Update state // Check to see if there is a popping version of this instruction... - int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode()); + int Opcode = Lookup(PopTable, I->getOpcode()); if (Opcode != -1) { I->setDesc(TII->get(Opcode)); if (Opcode == X86::UCOM_FPPr) @@ -1193,7 +1183,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { // We decide which form to use based on what is on the top of the stack, and // which operand is killed by this instruction. - const TableEntry *InstTable; + ArrayRef<TableEntry> InstTable; bool isForward = TOS == Op0; bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); if (updateST0) { @@ -1208,8 +1198,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { InstTable = ReverseSTiTable; } - int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table), - MI->getOpcode()); + int Opcode = Lookup(InstTable, MI->getOpcode()); assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); // NotTOS - The register which is not on the top of stack... @@ -1520,31 +1509,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { return; } - case X86::WIN_FTOL_32: - case X86::WIN_FTOL_64: { - // Push the operand into ST0. - MachineOperand &Op = MI->getOperand(0); - assert(Op.isUse() && Op.isReg() && - Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); - unsigned FPReg = getFPReg(Op); - if (Op.isKill()) - moveToTop(FPReg, Inst); - else - duplicateToTop(FPReg, ScratchFPReg, Inst); - - // Emit the call. This will pop the operand. 
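The deletion in progress here is part of dropping the WIN_FTOL pseudos: 32-bit Windows fp-to-unsigned conversions no longer call the _ftol2 runtime helper and are expanded with ordinary instruction sequences instead. The kind of source that used to exercise this path (a sketch; the exact replacement sequence depends on subtarget features):

    // On i686-pc-windows-msvc this conversion previously emitted a
    // WIN_FTOL pseudo, i.e. a call to _ftol2 with the value pushed onto
    // the x87 stack; it is now lowered inline.
    unsigned toUnsigned(double D) { return static_cast<unsigned>(D); }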
- BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) - .addExternalSymbol("_ftol2") - .addReg(X86::ST0, RegState::ImplicitKill) - .addReg(X86::ECX, RegState::ImplicitDefine) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::EDX, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - --StackTop; - - break; - } - case X86::RETQ: case X86::RETL: case X86::RETIL: diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 3a21b57..8b5fd27 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -18,25 +18,23 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" #include <cstdlib> using namespace llvm; -// FIXME: completely move here. -extern cl::opt<bool> ForceStackAlign; - X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride) : TargetFrameLowering(StackGrowsDown, StackAlignOverride, @@ -92,7 +90,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.callsEHReturn() || + MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || MFI->hasStackMap() || MFI->hasPatchPoint()); } @@ -148,21 +146,14 @@ static unsigned getLEArOpcode(unsigned IsLP64) { /// to this register without worry about clobbering it. static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const TargetRegisterInfo *TRI, + const X86RegisterInfo *TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); if (!F || MF->getMMI().callsEHReturn()) return 0; - static const uint16_t CallerSavedRegs32Bit[] = { - X86::EAX, X86::EDX, X86::ECX, 0 - }; - - static const uint16_t CallerSavedRegs64Bit[] = { - X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, - X86::R8, X86::R9, X86::R10, X86::R11, 0 - }; + const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); unsigned Opc = MBBI->getOpcode(); switch (Opc) { @@ -191,10 +182,9 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, Uses.insert(*AI); } - const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; - for (; *CS; ++CS) - if (!Uses.count(*CS)) - return *CS; + for (auto CS : AvailableRegs) + if (!Uses.count(CS) && CS != X86::RIP) + return CS; } } @@ -214,8 +204,12 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -/// Check whether or not the terminators of \p MBB needs to read EFLAGS. -static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { +/// Check if the flags need to be preserved before the terminators. 
+/// This would be the case if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. +static bool +flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { @@ -225,15 +219,27 @@ if (Reg != X86::EFLAGS) continue; - // This terminator needs an eflag that is not defined - // by a previous terminator. + // This terminator needs an eflags that is not defined + // by a previous terminator: + // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; + // This terminator defines the eflags, i.e., we don't need to preserve it. + // However, we still need to check that this specific terminator does not + // read a live-in value. BreakNext = true; } + // We found a definition of the eflags, no need to preserve them. if (BreakNext) - break; + return false; } + + // None of the terminators use or define the eflags. + // Check if they are live-out, that would imply we need to preserve them. + for (const MachineBasicBlock *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + return true; + return false; } @@ -289,6 +295,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) MI->setFlag(MachineInstr::FrameSetup); + else + MI->setFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; continue; } @@ -298,6 +306,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue); if (isSub) MI.setMIFlag(MachineInstr::FrameSetup); + else + MI.setMIFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; } @@ -312,7 +322,11 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // is tricky. bool UseLEA; if (!InEpilogue) { - UseLEA = STI.useLeaForSP(); + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. + // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) - UseLEA = terminatorsNeedFlagsAsInput(MBB); + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. - assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) && + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } @@ -347,30 +361,6 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( return MI; } -/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. 
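The hazard the renamed predicate above guards against, spelled out (illustrative assembly):

    //   cmpl %esi, %edi     ; defines EFLAGS
    //   ...                 ; epilogue stack adjustment inserted here
    //   jne  .LBB0_2        ; terminator reads EFLAGS
    //
    // An ADD/SUB-based adjustment would clobber EFLAGS between the def
    // and the use, so BuildStackAdjustment above must fall back to LEA,
    // which computes the same new stack pointer without touching flags.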
-static -void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = nullptr) { - if (MBBI == MBB.begin()) return; - - MachineBasicBlock::iterator PI = std::prev(MBBI); - unsigned Opc = PI->getOpcode(); - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || - Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes += PI->getOperand(2).getImm(); - MBB.erase(PI); - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes -= PI->getOperand(2).getImm(); - MBB.erase(PI); - } -} - int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, bool doMergeWithPrevious) const { @@ -436,27 +426,265 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, } } -/// usesTheStack - This function checks if any of the users of EFLAGS -/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has -/// to use the stack, and if we don't adjust the stack we clobber the first -/// frame index. -/// See X86InstrInfo::copyPhysReg. -static bool usesTheStack(const MachineFunction &MF) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - - for (MachineRegisterInfo::reg_instr_iterator - ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end(); - ri != re; ++ri) - if (ri->isCopy()) - return true; +MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR()) { + if (InProlog) { + return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + } else { + return emitStackProbeInline(MF, MBB, MBBI, DL, false); + } + } else { + return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + } +} - return false; +void X86FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const { + const StringRef ChkStkStubSymbol = "__chkstk_stub"; + MachineInstr *ChkStkStub = nullptr; + + for (MachineInstr &MI : PrologMBB) { + if (MI.isCall() && MI.getOperand(0).isSymbol() && + ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { + ChkStkStub = &MI; + break; + } + } + + if (ChkStkStub != nullptr) { + MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); + assert(std::prev(MBBI).operator==(ChkStkStub) && + "MBBI expected after __chkstk_stub."); + DebugLoc DL = PrologMBB.findDebugLoc(MBBI); + emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); + ChkStkStub->eraseFromParent(); + } } -void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc DL) const { +MachineInstr *X86FrameLowering::emitStackProbeInline( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + assert(STI.is64Bit() && "different expansion needed for 32 bit"); + assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + // RAX contains the number of bytes of desired stack adjustment. 
+ // The handling here assumes this value has already been updated so as to + // maintain stack alignment. + // + // We need to exit with RSP modified by this amount and execute suitable + // page touches to notify the OS that we're growing the stack responsibly. + // All stack probing must be done without modifying RSP. + // + // MBB: + // SizeReg = RAX; + // ZeroReg = 0 + // CopyReg = RSP + // Flags, TestReg = CopyReg - SizeReg + // FinalReg = !Flags.Ovf ? TestReg : ZeroReg + // LimitReg = gs magic thread env access + // if FinalReg >= LimitReg goto ContinueMBB + // RoundBB: + // RoundReg = page address of FinalReg + // LoopMBB: + // LoopReg = PHI(LimitReg,ProbeReg) + // ProbeReg = LoopReg - PageSize + // [ProbeReg] = 0 + // if (ProbeReg > RoundReg) goto LoopMBB + // ContinueMBB: + // RSP = RSP - RAX + // [rest of original MBB] + + // Set up the new basic blocks + MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); + MF.insert(MBBIter, RoundMBB); + MF.insert(MBBIter, LoopMBB); + MF.insert(MBBIter, ContinueMBB); + + // Split MBB and move the tail portion down to ContinueMBB. + MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); + ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end()); + ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + // Some useful constants + const int64_t ThreadEnvironmentStackLimit = 0x10; + const int64_t PageSize = 0x1000; + const int64_t PageMask = ~(PageSize - 1); + + // Registers we need. For the normal case we use virtual + // registers. For the prolog expansion we use RAX, RCX and RDX. + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterClass *RegClass = &X86::GR64RegClass; + const unsigned SizeReg = InProlog ? (unsigned)X86::RAX + : MRI.createVirtualRegister(RegClass), + ZeroReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + CopyReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + TestReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + FinalReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + RoundedReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + LimitReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + JoinReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + ProbeReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass); + + // SP-relative offsets where we can save RCX and RDX. + int64_t RCXShadowSlot = 0; + int64_t RDXShadowSlot = 0; + + // If inlining in the prolog, save RCX and RDX. + // Future optimization: don't save or restore if not live in. + if (InProlog) { + // Compute the offsets. We need to account for things already + // pushed onto the stack at this point: return address, frame + // pointer (if used), and callee saves. + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); + const bool HasFP = hasFP(MF); + RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); + RDXShadowSlot = RCXShadowSlot + 8; + // Emit the saves. 
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RCXShadowSlot) + .addReg(X86::RCX); + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RDXShadowSlot) + .addReg(X86::RDX); + } else { + // Not in the prolog. Copy RAX to a virtual reg. + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); + } + + // Add code to MBB to check for overflow and set the new target stack pointer + // to zero if so. + BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg) + .addReg(ZeroReg, RegState::Undef) + .addReg(ZeroReg, RegState::Undef); + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP); + BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) + .addReg(CopyReg) + .addReg(SizeReg); + BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg) + .addReg(TestReg) + .addReg(ZeroReg); + + // FinalReg now holds final stack pointer value, or zero if + // allocation would overflow. Compare against the current stack + // limit from the thread environment block. Note this limit is the + // lowest touched page on the stack, not the point at which the OS + // will cause an overflow exception, so this is just an optimization + // to avoid unnecessarily touching pages that are below the current + // SP but already committed to the stack by the OS. + BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(ThreadEnvironmentStackLimit) + .addReg(X86::GS); + BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); + // Jump if the desired stack pointer is at or above the stack limit. + BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + + // Add code to roundMBB to round the final stack pointer to a page boundary. + BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) + .addReg(FinalReg) + .addImm(PageMask); + BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); + + // LimitReg now holds the current stack limit and RoundedReg the + // page-rounded final RSP value. Add code to loopMBB to decrement LimitReg page-by-page + // and probe until we reach RoundedReg. + if (!InProlog) { + BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) + .addReg(LimitReg) + .addMBB(RoundMBB) + .addReg(ProbeReg) + .addMBB(LoopMBB); + } + + addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, + false, -PageSize); + + // Probe by storing a byte onto the stack. + BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) + .addReg(ProbeReg) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(0); + BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) + .addReg(RoundedReg) + .addReg(ProbeReg); + BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + + MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); + + // If in prolog, restore RDX and RCX. + if (InProlog) { + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RCX), + X86::RSP, false, RCXShadowSlot); + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RDX), + X86::RSP, false, RDXShadowSlot); + } + + // Now that the probing is done, add code to continueMBB to update + // the stack pointer for real. + BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(SizeReg); + + // Add the control flow edges we need. 
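Two details of the probe sequence above, made explicit (values are illustrative):

    // 1) The LimitReg load is 'movq %gs:0x10, %rcx': on Win64 GS points
    //    at the TEB, and offset 0x10 is NT_TIB::StackLimit -- the lowest
    //    page the OS has committed for this thread's stack.
    // 2) The AND64ri32 with PageMask rounds the prospective stack
    //    pointer down to a page boundary before the probe loop walks
    //    toward it:
    static_assert((0x7FFDEADB433ULL & ~(0x1000ULL - 1)) == 0x7FFDEADB000ULL,
                  "PageMask = ~(PageSize - 1) keeps the page-aligned bits");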
+ MBB.addSuccessor(ContinueMBB); + MBB.addSuccessor(RoundMBB); + RoundMBB->addSuccessor(LoopMBB); + LoopMBB->addSuccessor(ContinueMBB); + LoopMBB->addSuccessor(LoopMBB); + + // Mark all the instructions added to the prolog as frame setup. + if (InProlog) { + for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { + BeforeMBBI->setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *RoundMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *LoopMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); + CMBBI != ContinueMBBI; ++CMBBI) { + CMBBI->setFlag(MachineInstr::FrameSetup); + } + } + + // Possible TODO: physreg liveness for InProlog case. + + return ContinueMBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; unsigned CallOp; @@ -478,6 +706,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, Symbol = "_chkstk"; MachineInstrBuilder CI; + MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); // All current stack probes take AX and SP as input, clobber flags, and // preserve all registers. x86_64 probes leave RSP unmodified. @@ -507,6 +736,26 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(X86::RSP) .addReg(X86::RAX); } + + if (InProlog) { + // Apply the frame setup flag to all inserted instrs. + for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) + ExpansionMBBI->setFlag(MachineInstr::FrameSetup); + } + + return MBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + + assert(InProlog && "ChkStkStub called outside prolog!"); + + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__chkstk_stub"); + + return MBBI; } static unsigned calculateSetFPREG(uint64_t SPAdjust) { @@ -526,7 +775,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con const MachineFrameInfo *MFI = MF.getFrameInfo(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. unsigned StackAlign = getStackAlignment(); - if (ForceStackAlign) { + if (MF.getFunction()->hasFnAttribute("stackrealign")) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else if (MaxAlign < SlotSize) @@ -537,15 +786,14 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, + DebugLoc DL, unsigned Reg, uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), - StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); + unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); @@ -646,6 +894,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. 
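With the ForceStackAlign command-line option gone, calculateMaxStackAlign above keys off a per-function string attribute instead. A sketch of the producer side — Clang attaches this attribute for -mstackrealign, and a pass or test can set it directly:

    #include "llvm/IR/Function.h"

    void requestStackRealignment(llvm::Function &F) {
      F.addFnAttr("stackrealign"); // the name calculateMaxStackAlign tests
    }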
uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool IsFunclet = MBB.isEHFuncletEntry(); + EHPersonality Personality = EHPersonality::Unknown; + if (Fn->hasPersonalityFn()) + Personality = classifyEHPersonality(Fn->getPersonalityFn()); + bool FnHasClrFunclet = + MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; + bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); @@ -655,9 +910,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() - ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; unsigned BasePtr = TRI->getBaseRegister(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. DebugLoc DL; // Add RETADDR move area to callee saved frame size. @@ -686,11 +943,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // push and pop from the stack. if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !TRI->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64CC && // Win64 has no Red Zone - !usesTheStack(MF) && // Don't push and pop. - !MF.shouldSplitStack()) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64CC && // Win64 has no Red Zone + !MFI->hasOpaqueSPAdjustment() && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -723,6 +980,24 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, uint64_t NumBytes = 0; int stackGrowth = -SlotSize; + // Find the funclet establisher parameter + unsigned Establisher = X86::NoRegister; + if (IsClrFunclet) + Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX; + else if (IsFunclet) + Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; + + if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { + // Immediately spill establisher into the home slot. + // The runtime cares about this. + // MOV64mr %rdx, 16(%rsp) + unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) + .addReg(Establisher) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(Establisher); + } + if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; @@ -739,7 +1014,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. - MFI->setOffsetAdjustment(-NumBytes); + if (!IsFunclet) + MFI->setOffsetAdjustment(-NumBytes); + else + assert(MFI->getOffsetAdjustment() == -(int)NumBytes && + "should calculate same local variable offset for funclets"); // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
X86::PUSH64r : X86::PUSH32r)) @@ -765,35 +1044,46 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - if (!IsWin64Prologue) { + if (!IsWin64Prologue && !IsFunclet) { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); - } - if (NeedsDwarfCFI) { - // Mark effective beginning of when frame pointer becomes valid. - // Define the current CFA to use the EBP/RBP register. - unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + if (NeedsDwarfCFI) { + // Mark effective beginning of when frame pointer becomes valid. + // Define the current CFA to use the EBP/RBP register. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( + nullptr, DwarfFramePtr)); + } } - // Mark the FramePtr as live-in in every block. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - I->addLiveIn(MachineFramePtr); + // Mark the FramePtr as live-in in every block. Don't do this again for + // funclet prologues. + if (!IsFunclet) { + for (MachineBasicBlock &EveryMBB : MF) + EveryMBB.addLiveIn(MachineFramePtr); + } } else { + assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } + // For EH funclets, only allocate enough space for outgoing calls. Save the + // NumBytes value that we would've used for the parent frame. + unsigned ParentFrameNumBytes = NumBytes; + if (IsFunclet) + NumBytes = getWinEHFuncletFrameSize(MF); + // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; while (MBBI != MBB.end() && + MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; @@ -818,9 +1108,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Don't do this for Win64, it needs to realign the stack after the prologue. - if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) { + if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); } // If there is an SUB32ri of ESP immediately before this instruction, merge @@ -839,7 +1129,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. uint64_t AlignedNumBytes = NumBytes; - if (IsWin64Prologue && TRI->needsStackRealignment(MF)) + if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. @@ -876,26 +1166,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(isEAXAlive ? 
NumBytes - 4 : NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); } - // Save a pointer to the MI where we set AX. - MachineBasicBlock::iterator SetRAX = MBBI; - --SetRAX; - // Call __chkstk, __chkstk_ms, or __alloca. - emitStackProbeCall(MF, MBB, MBBI, DL); - - // Apply the frame setup flag to all inserted instrs. - for (; SetRAX != MBBI; ++SetRAX) - SetRAX->setFlag(MachineInstr::FrameSetup); + emitStackProbe(MF, MBB, MBBI, DL, true); if (isEAXAlive) { // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); + MachineInstr *MI = + addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), + StackPtr, false, NumBytes - 4); MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } @@ -909,19 +1191,72 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); int SEHFrameOffset = 0; + unsigned SPOrEstablisher; + if (IsFunclet) { + if (IsClrFunclet) { + // The establisher parameter passed to a CLR funclet is actually a pointer + // to the (mostly empty) frame of its nearest enclosing funclet; we have + // to find the root function establisher frame by loading the PSPSym from + // the intermediate frame. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + MachinePointerInfo NoInfo; + MBB.addLiveIn(Establisher); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), + Establisher, false, PSPSlotOffset) + .addMemOperand(MF.getMachineMemOperand( + NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); + ; + // Save the root establisher back into the current funclet's (mostly + // empty) frame, in case a sub-funclet or the GC needs it. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, + false, PSPSlotOffset) + .addReg(Establisher) + .addMemOperand( + MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + SPOrEstablisher = Establisher; + } else { + SPOrEstablisher = StackPtr; + } + if (IsWin64Prologue && HasFP) { - SEHFrameOffset = calculateSetFPREG(NumBytes); + // Set RBP to a small fixed offset from RSP. In the funclet case, we base + // this calculation on the incoming establisher, which holds the value of + // RSP from the parent frame at the end of the prologue. + SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); if (SEHFrameOffset) addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), - StackPtr, false, SEHFrameOffset); + SPOrEstablisher, false, SEHFrameOffset); else - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) + .addReg(SPOrEstablisher); - if (NeedsWinCFI) + // If this is not a funclet, emit the CFI describing our frame pointer. + if (NeedsWinCFI && !IsFunclet) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); + if (isAsynchronousEHPersonality(Personality)) + MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; + } + } else if (IsFunclet && STI.is32Bit()) { + // Reset EBP / ESI to something good for funclets. + MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); + // If we're a catch funclet, we can be returned to via catchret. Save ESP + // into the registration node so that the runtime will restore it for us. 
+ if (!MBB.isCleanupFuncletEntry()) { + assert(Personality == EHPersonality::MSVC_CXX); + unsigned FrameReg; + int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; + int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); + // ESP is the first field, so no extra displacement is needed. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, + false, EHRegOffset) + .addReg(X86::ESP); + } } while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { @@ -932,7 +1267,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { - int Offset = getFrameIndexOffset(MF, FI); + unsigned IgnoredFrameReg; + int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); Offset += SEHFrameOffset; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) @@ -948,14 +1284,33 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); + if (FnHasClrFunclet && !IsFunclet) { + // Save the so-called Initial-SP (i.e. the value of the stack pointer + // immediately after the prolog) into the PSPSlot so that funclets + // and the GC can recover it. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + auto PSPInfo = MachinePointerInfo::getFixedStack( + MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, + PSPSlotOffset) + .addReg(StackPtr) + .addMemOperand(MF.getMachineMemOperand( + PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + // Realign stack after we spilled callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Win64 requires aligning the stack after the prologue. if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); } + // We already dealt with stack realignment and funclets above. + if (IsFunclet && STI.is32Bit()) + return; + // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer @@ -964,7 +1319,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens @@ -972,18 +1327,21 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); } - if (X86FI->getHasSEHFramePtrSave()) { + if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { // Stash the value of the frame pointer relative to the base pointer for // Win32 EH. This supports Win32 EH, which does the inverse of the above: // it recovers the frame pointer from the base pointer rather than the // other way around. 
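The recurring mechanical change in these frame-lowering hunks, isolated:

    // Old query, offset only (the register it is relative to was implicit):
    //   int Off = getFrameIndexOffset(MF, FI);
    // New query, offset plus the register it is measured from:
    //   unsigned FrameReg;
    //   int Off = getFrameIndexReference(MF, FI, FrameReg);
    // Callers that do not care whether FP, SP or the base pointer was
    // chosen pass a scratch IgnoredFrameReg, as the SEH_SaveXMM hunk
    // above does.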
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, - getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex())) + unsigned UsedReg; + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) .addReg(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } @@ -1015,6 +1373,69 @@ bool X86FrameLowering::canUseLEAForSPInEpilogue( return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); } +static bool isFuncletReturnInstr(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + +// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the +// stack. It holds a pointer to the bottom of the root function frame. The +// establisher frame pointer passed to a nested funclet may point to the +// (mostly empty) frame of its parent funclet, but it will need to find +// the frame of the root function to access locals. To facilitate this, +// every funclet copies the pointer to the bottom of the root function +// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the +// same offset for the PSPSym in the root function frame that's used in the +// funclets' frames allows each funclet to dynamically accept any ancestor +// frame as its establisher argument (the runtime doesn't guarantee the +// immediate parent for some reason lost to history), and also allows the GC, +// which uses the PSPSym for some bookkeeping, to find it in any funclet's +// frame with only a single offset reported for the entire method. +unsigned +X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { + const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); + // getFrameIndexReferenceFromSP has an out ref parameter for the stack + // pointer register; pass a dummy that we ignore + unsigned SPReg; + int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg); + assert(Offset >= 0); + return static_cast<unsigned>(Offset); +} + +unsigned +X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + // This is the size of the pushed CSRs. + unsigned CSSize = + MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // This is the amount of stack a funclet needs to allocate. + unsigned UsedSize; + EHPersonality Personality = + classifyEHPersonality(MF.getFunction()->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + // CLR funclets need to hold enough space to include the PSPSym, at the + // same offset from the stack pointer (immediately after the prolog) as it + // resides at in the main function. + UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; + } else { + // Other funclets just need enough stack for outgoing call arguments. + UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); + } + // RBP is not included in the callee saved register block. After pushing RBP, + // everything is 16 byte aligned. Everything we allocate before an outgoing + // call must also be 16 byte aligned. + unsigned FrameSizeMinusRBP = + RoundUpToAlignment(CSSize + UsedSize, getStackAlignment()); + // Subtract out the size of the callee saved registers. This is how much stack + // each funclet will allocate. 
+ return FrameSizeMinusRBP - CSSize; +} + void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1027,12 +1448,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, const bool Is64BitILP32 = STI.isTarget64BitILP32(); unsigned FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = - Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinCFI = IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); + bool IsFunclet = isFuncletReturnInstr(MBBI); + MachineBasicBlock *TargetMBB = nullptr; // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); @@ -1040,7 +1462,27 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - if (hasFP(MF)) { + if (MBBI->getOpcode() == X86::CATCHRET) { + // SEH shouldn't use catchret. + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && + "SEH should not use CATCHRET"); + + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + TargetMBB = MBBI->getOperand(0).getMBB(); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (MBBI->getOpcode() == X86::CLEANUPRET) { + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; NumBytes = FrameSize - CSSize; @@ -1052,7 +1494,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Pop EBP. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); } else { NumBytes = StackSize - CSSize; } @@ -1063,26 +1506,50 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); - if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && - !PI->isTerminator()) + if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && + (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && + Opc != X86::DBG_VALUE && !PI->isTerminator()) break; --MBBI; } MachineBasicBlock::iterator FirstCSPop = MBBI; + if (TargetMBB) { + // Fill EAX/RAX with the address of the target block. + unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; + if (STI.is64Bit()) { + // LEA64r TargetMBB(%rip), %rax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(TargetMBB) + .addReg(0); + } else { + // MOV32ri $TargetMBB, %eax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) + .addMBB(TargetMBB); + } + // Record that we've taken the address of TargetMBB and no longer just + // reference it in a terminator. 
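    // (Background, not part of the patch: under the MSVC C++ EH ABI the
    // catch funclet returns the address at which the runtime should resume
    // execution, so the CATCHRET epilogue must leave TargetMBB's address in
    // EAX/RAX as done above.)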
+ TargetMBB->setHasAddressTaken(); + } + if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI->hasVarSizedObjects()) - mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); + NumBytes += mergeSPUpdates(MBB, MBBI, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was - // realigned. - if (TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + // realigned. Don't do this if this was a funclet epilogue, since the funclets + // will not do realignment or dynamic stack allocation. + if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && + !IsFunclet) { if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); @@ -1134,9 +1601,24 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +// NOTE: this only has a subset of the full frame index logic. In +// particular, the FI < 0 and AfterFPPop logic is handled in +// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly +// (probably?) it should be moved into here. +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (TRI->hasBasePointer(MF)) + FrameReg = TRI->getBaseRegister(); + else if (TRI->needsStackRealignment(MF)) + FrameReg = TRI->getStackRegister(); + else + FrameReg = TRI->getFrameRegister(MF); + // Offset will hold the offset from the stack pointer at function entry to the // object. // We need to factor in additional offsets applied during the prologue to the @@ -1207,48 +1689,62 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, return Offset + FPDelta; } -int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { - // We can't calculate offset from frame pointer if the stack is realigned, - // so enforce usage of stack/base pointer. The base pointer is used when we - // have dynamic allocas in addition to dynamic realignment. - if (TRI->hasBasePointer(MF)) - FrameReg = TRI->getBaseRegister(); - else if (TRI->needsStackRealignment(MF)) - FrameReg = TRI->getStackRegister(); - else - FrameReg = TRI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); -} - -// Simplified from getFrameIndexOffset keeping only StackPointer cases -int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // Does not include any dynamic realign. const uint64_t StackSize = MFI->getStackSize(); { #ifndef NDEBUG - // Note: LLVM arranges the stack as: - // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) - // > "Stack Slots" (<--SP) - // We can always address StackSlots from RSP. 
We can usually (unless - // needsStackRealignment) address CSRs from RSP, but sometimes need to - // address them from RBP. FixedObjects can be placed anywhere in the stack - // frame depending on their specific requirements (i.e. we can actually - // refer to arguments to the function which are stored in the *callers* - // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs - // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. - - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - // We don't handle tail calls, and shouldn't be seeing them - // either. + // LLVM arranges the stack as follows: + // ... + // ARG2 + // ARG1 + // RETADDR + // PUSH RBP <-- RBP points here + // PUSH CSRs + // ~~~~~~~ <-- possible stack realignment (non-win64) + // ... + // STACK OBJECTS + // ... <-- RSP after prologue points here + // ~~~~~~~ <-- possible stack realignment (win64) + // + // if (hasVarSizedObjects()): + // ... <-- "base pointer" (ESI/RBX) points here + // DYNAMIC ALLOCAS + // ... <-- RSP points here + // + // Case 1: In the simple case of no stack realignment and no dynamic + // allocas, both "fixed" stack objects (arguments and CSRs) are addressable + // with fixed offsets from RSP. + // + // Case 2: In the case of stack realignment with no dynamic allocas, fixed + // stack objects are addressed with RBP and regular stack objects with RSP. + // + // Case 3: In the case of dynamic allocas and stack realignment, RSP is used + // to address stack arguments for outgoing calls and nothing else. The "base + // pointer" points to local variables, and RBP points to fixed objects. + // + // In cases 2 and 3, we can only answer for non-fixed stack objects, and the + // answer we give is relative to the SP after the prologue, and not the + // SP in the middle of the function. + + assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) || + STI.isTargetWin64()) && + "offset from fixed object to SP is not static"); + + // We don't handle tail calls, and shouldn't be seeing them either. int TailCallReturnAddrDelta = MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); #endif } + // Fill in FrameReg output argument. + FrameReg = TRI->getStackRegister(); + // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. Each box below is @@ -1280,15 +1776,6 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F return Offset + StackSize; } -// Simplified from getFrameIndexReference keeping only StackPointer cases -int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - FrameReg = TRI->getStackRegister(); - return getFrameIndexOffsetFromSP(MF, FI); -} bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, @@ -1358,6 +1845,11 @@ bool X86FrameLowering::spillCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); + // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI + // for us, and there are no XMM CSRs on Win32. + if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) + return true; + // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? 
X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { @@ -1399,6 +1891,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; + if (isFuncletReturnInstr(MI) && STI.isOSWindows()) { + // Don't restore CSRs in 32-bit EH funclets. Matches + // spillCalleeSavedRegisters. + if (STI.is32Bit()) + return true; + // Don't restore CSRs before an SEH catchret. SEH except blocks do not form + // funclets. emitEpilogue transforms these to normal jumps. + if (MI->getOpcode() == X86::CATCHRET) { + const Function *Func = MBB.getParent()->getFunction(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(Func->getPersonalityFn())); + if (IsSEH) + return true; + } + } + DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. @@ -1420,7 +1928,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, !X86::GR32RegClass.contains(Reg)) continue; - BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + BuildMI(MBB, MI, DL, TII.get(Opc), Reg) + .setMIFlag(MachineInstr::FrameDestroy); } return true; } @@ -1450,8 +1959,16 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, } // Spill the BasePtr if it's used. - if (TRI->hasBasePointer(MF)) + if (TRI->hasBasePointer(MF)) { SavedRegs.set(TRI->getBaseRegister()); + + // Allocate a spill slot for EBP if we have a base pointer and EH funclets. + if (MF.getMMI().hasEHFunclets()) { + int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setHasSEHFramePtrSave(true); + X86FI->setSEHFramePtrSaveIndex(FI); + } + } } static bool @@ -1545,11 +2062,9 @@ void X86FrameLowering::adjustForSegmentedStacks( // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. 
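// (Background, not part of the patch: the MachineVerifier rejects ordinary
// instructions placed after a terminator, so the MOV into R10 cannot simply
// be appended after allocMBB's RET; it has to live in a preceding block.)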
- for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), - e = PrologueMBB.livein_end(); - i != e; i++) { - allocMBB->addLiveIn(*i); - checkMBB->addLiveIn(*i); + for (const auto &LI : PrologueMBB.liveins()) { + allocMBB->addLiveIn(LI); + checkMBB->addLiveIn(LI); } if (IsNested) @@ -1682,8 +2197,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(Reg10); - MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); @@ -1821,11 +2334,9 @@ void X86FrameLowering::adjustForHiPEPrologue( MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator I = PrologueMBB.livein_begin(), - E = PrologueMBB.livein_end(); - I != E; I++) { - stackCheckMBB->addLiveIn(*I); - incStackMBB->addLiveIn(*I); + for (const auto &LI : PrologueMBB.liveins()) { + stackCheckMBB->addLiveIn(LI); + incStackMBB->addLiveIn(LI); } MF.push_front(incStackMBB); @@ -1870,16 +2381,84 @@ void X86FrameLowering::adjustForHiPEPrologue( .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); - stackCheckMBB->addSuccessor(&PrologueMBB, 99); - stackCheckMBB->addSuccessor(incStackMBB, 1); - incStackMBB->addSuccessor(&PrologueMBB, 99); - incStackMBB->addSuccessor(incStackMBB, 1); + stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); + stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); + incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); + incStackMBB->addSuccessor(incStackMBB, {1, 100}); } #ifdef XDEBUG MF.verify(); #endif } +bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const { + + if (Offset <= 0) + return false; + + if (Offset % SlotSize) + return false; + + int NumPops = Offset / SlotSize; + // This is only worth it if we have at most 2 pops. + if (NumPops != 1 && NumPops != 2) + return false; + + // Handle only the trivial case where the adjustment directly follows + // a call. This is the most common one, anyway. + if (MBBI == MBB.begin()) + return false; + MachineBasicBlock::iterator Prev = std::prev(MBBI); + if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) + return false; + + unsigned Regs[2]; + unsigned FoundRegs = 0; + + auto RegMask = Prev->getOperand(1); + + auto &RegClass = + Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; + // Try to find up to NumPops free registers. + for (auto Candidate : RegClass) { + + // Poor man's liveness: + // Since we're immediately after a call, any register that is clobbered + // by the call and not defined by it can be considered dead. + if (!RegMask.clobbersPhysReg(Candidate)) + continue; + + bool IsDef = false; + for (const MachineOperand &MO : Prev->implicit_operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) { + IsDef = true; + break; + } + } + + if (IsDef) + continue; + + Regs[FoundRegs++] = Candidate; + if (FoundRegs == (unsigned)NumPops) + break; + } + + if (FoundRegs == 0) + return false; + + // If we found only one free register, but need two, reuse the same one twice. + while (FoundRegs < (unsigned)NumPops) + Regs[FoundRegs++] = Regs[0]; + + for (int i = 0; i < NumPops; ++i) + BuildMI(MBB, MBBI, DL, + TII.get(STI.is64Bit() ? 
X86::POP64r : X86::POP32r), Regs[i]); + + return true; +} + void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { @@ -1895,8 +2474,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the // adjcallstackdown instruction into 'add ESP, <amt>' - if (Amount == 0) - return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -1904,15 +2481,68 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = getStackAlignment(); Amount = RoundUpToAlignment(Amount, StackAlign); + MachineModuleInfo &MMI = MF.getMMI(); + const Function *Fn = MF.getFunction(); + bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool DwarfCFI = !WindowsCFI && + (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + + // If we have any exception handlers in this function, and we adjust + // the SP before calls, we may need to indicate this to the unwinder + // using GNU_ARGS_SIZE. Note that this may be necessary even when + // Amount == 0, because the preceding function may have set a non-0 + // GNU_ARGS_SIZE. + // TODO: We don't need to reset this between subsequent functions, + // if it didn't change. + bool HasDwarfEHHandlers = !WindowsCFI && + !MF.getMMI().getLandingPads().empty(); + + if (HasDwarfEHHandlers && !isDestroy && + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences()) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); + + if (Amount == 0) + return; + // Factor out the amount that gets handled inside the sequence // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); + if (Amount) { // Add Amount to SP to destroy a frame, and subtract to setup. int Offset = isDestroy ? Amount : -Amount; - BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + + if (!(Fn->optForMinSize() && + adjustStackWithPops(MBB, I, DL, Offset))) + BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); } + + if (DwarfCFI && !hasFP(MF)) { + // If we don't have FP, but need to generate unwind information, + // we need to set the correct CFA offset after the stack adjustment. + // How much we adjust the CFA offset depends on whether we're emitting + // CFI only for EH purposes or for debugging. EH only requires the CFA + // offset to be correct at each call site, while for debugging we want + // it to be more precise. + int CFAOffset = Amount; + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? 
-CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } + } + return; } @@ -1933,12 +2563,136 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { assert(MBB.getParent() && "Block is not attached to a function!"); + // Win64 has strict requirements in terms of epilogue and we are + // not taking a chance at messing with them. + // I.e., unless this block is already an exit block, we can't use + // it as an epilogue. + if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) + return false; + if (canUseLEAForSPInEpilogue(*MBB.getParent())) return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which - // clobbers the EFLAGS. Check that none of the terminators reads the - // EFLAGS, and if one uses it, conservatively assume this is not + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not // safe to insert the epilogue here. - return !terminatorsNeedFlagsAsInput(MBB); + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); +} + +bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // If we may need to emit frameless compact unwind information, give + // up as this is currently broken: PR25614. + return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +} + +MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool RestoreSP) const { + assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); + assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); + assert(STI.is32Bit() && !Uses64BitFramePtr && + "restoring EBP/ESI on non-32-bit target"); + + MachineFunction &MF = *MBB.getParent(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned BasePtr = TRI->getBaseRegister(); + WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // FIXME: Don't set FrameSetup flag in catchret case. 
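  // (Worked example with made-up numbers, not part of the patch: a 16-byte
  // registration node at EBP-24 gives EHRegOffset = -24 and
  // EndOffset = -(-24) - 16 = 8, the non-negative value the assert below
  // insists on.)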
+ + int FI = FuncInfo.EHRegNodeFrameIndex; + int EHRegSize = MFI->getObjectSize(FI); + + if (RestoreSP) { + // MOV32rm -EHRegSize(%ebp), %esp + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), + X86::EBP, true, -EHRegSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + unsigned UsedReg; + int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); + int EndOffset = -EHRegOffset - EHRegSize; + FuncInfo.EHRegNodeEndOffset = EndOffset; + + if (UsedReg == FramePtr) { + // ADD $offset, %ebp + unsigned ADDri = getADDriOpcode(false, EndOffset); + BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) + .addReg(FramePtr) + .addImm(EndOffset) + .setMIFlag(MachineInstr::FrameSetup) + ->getOperand(3) + .setIsDead(); + assert(EndOffset >= 0 && + "end of registration object above normal EBP position!"); + } else if (UsedReg == BasePtr) { + // LEA offset(%ebp), %esi + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), + FramePtr, false, EndOffset) + .setMIFlag(MachineInstr::FrameSetup); + // MOV32rm SavedEBPOffset(%esi), %ebp + assert(X86FI->getHasSEHFramePtrSave()); + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), + UsedReg, true, Offset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); + } + return MBBI; +} + +unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { + // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. + unsigned Offset = 16; + // RBP is immediately pushed. + Offset += SlotSize; + // All callee-saved registers are then pushed. + Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // Every funclet allocates enough stack space for the largest outgoing call. + Offset += getWinEHFuncletFrameSize(MF); + return Offset; +} + +void X86FrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + // If this function isn't doing Win64-style C++ EH, we don't need to do + // anything. + const Function *Fn = MF.getFunction(); + if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || + classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) + return; + + // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset + // relative to RSP after the prologue. Find the offset of the last fixed + // object, so that we can allocate a slot immediately following it. If there + // were no fixed objects, use offset -SlotSize, which is immediately after the + // return address. Fixed objects have negative frame indices. + MachineFrameInfo *MFI = MF.getFrameInfo(); + int64_t MinFixedObjOffset = -SlotSize; + for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); + + int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; + int UnwindHelpFI = + MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI; + + // Store -2 into UnwindHelp on function entry. We have to scan forwards past + // other frame setup instructions. 
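  // (Background, not part of the patch: as far as the WinEH scheme goes, -2
  // is the initial sentinel the Win64 C++ runtime expects to find in
  // UnwindHelp before any exception has been thrown.)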
+ MachineBasicBlock &MBB = MF.front(); + auto MBBI = MBB.begin(); + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + DebugLoc DL = MBB.findDebugLoc(MBBI); + addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), + UnwindHelpFI) + .addImm(-2); } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 495cfcd..3ab41b4 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -47,11 +47,17 @@ public: unsigned StackPtr; - /// Emit a call to the target's stack probe function. This is required for all + /// Emit target stack probe code. This is required for all /// large stack allocations on Windows. The caller is required to materialize - /// the number of bytes to probe in RAX/EAX. - void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL) const; + /// the number of bytes to probe in RAX/EAX. Returns instruction just + /// after the expansion. + MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool InProlog) const; + + /// Replace a StackProbe inline-stub with the actual probe code inline. + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -91,11 +97,9 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; bool needsFrameIndexResolution(const MachineFunction &MF) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; @@ -103,6 +107,11 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; + /// Check the instruction before/after the passed instruction. If /// it is an ADD/SUB/LEA instruction it is deleted argument and the /// stack adjustment is returned as a positive value for ADD/LEA and @@ -125,7 +134,9 @@ public: /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; -private: + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override; + /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack /// into a series of pushes. @@ -135,22 +146,56 @@ private: MachineBasicBlock::iterator I, uint64_t Amount) const; - uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; - /// Wraps up getting a CFI index and building a MachineInstr for it. void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, MCCFIInstruction CFIInst) const; + /// Sets up EBP and optionally ESI based on the incoming EBP value. Only + /// needed for 32-bit. Used in funclet prologues and at catchret destinations. 
+ MachineBasicBlock::iterator + restoreWin32EHStackPointers(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool RestoreSP = false) const; + +private: + uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; + + /// Emit target stack probe as a call to a helper function + MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit target stack probe as an inline sequence. + MachineInstr *emitStackProbeInline(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit a stub to later inline the target stack probe. + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - uint64_t MaxAlign) const; + unsigned Reg, uint64_t MaxAlign) const; + + /// Make small positive stack adjustments using POPs. + bool adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + int Offset) const; /// Adjusts the stack pointer using LEA, SUB, or ADD. MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, int64_t Offset, bool InEpilogue) const; + + unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const; + + unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d5351d2..868ae4e 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -46,9 +46,8 @@ STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); //===----------------------------------------------------------------------===// namespace { - /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses - /// SDValue's instead of register numbers for the leaves of the matched - /// tree. + /// This corresponds to X86AddressMode, but uses SDValue's instead of register + /// numbers for the leaves of the matched tree. struct X86ISelAddressMode { enum { RegBase, @@ -87,8 +86,7 @@ namespace { IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } - /// isRIPRelative - Return true if this addressing mode is already RIP - /// relative. + /// Return true if this addressing mode is already RIP-relative. bool isRIPRelative() const { if (BaseType != RegBase) return false; if (RegisterSDNode *RegNode = @@ -147,16 +145,16 @@ namespace { namespace { //===--------------------------------------------------------------------===// - /// ISel - X86 specific code to select X86 machine instructions for + /// ISel - X86-specific code to select X86 machine instructions for /// SelectionDAG operations. /// class X86DAGToDAGISel final : public SelectionDAGISel { - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// OptForSize - If true, selector should try to optimize for code size - /// instead of performance. 
+ /// If true, selector should try to optimize for code size instead of + /// performance. bool OptForSize; public: @@ -184,8 +182,7 @@ namespace { return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); } - // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // sign extended field. + // True if the 64-bit immediate fits in a 32-bit sign-extended field. inline bool i64immSExt32(SDNode *N) const { uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); return (int64_t)v == (int32_t)v; @@ -196,50 +193,50 @@ namespace { private: SDNode *Select(SDNode *N) override; - SDNode *SelectGather(SDNode *N, unsigned Opc); - SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); - - bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); - bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); - bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); - bool MatchAddress(SDValue N, X86ISelAddressMode &AM); - bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + SDNode *selectGather(SDNode *N, unsigned Opc); + SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT); + + bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool matchWrapper(SDValue N, X86ISelAddressMode &AM); + bool matchAddress(SDValue N, X86ISelAddressMode &AM); + bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); - bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM); - bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectMOV64Imm32(SDValue N, SDValue &Imm); - bool SelectLEAAddr(SDValue N, SDValue &Base, + bool selectMOV64Imm32(SDValue N, SDValue &Imm); + bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectLEA64_32Addr(SDValue N, SDValue &Base, + bool selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectTLSADDRAddr(SDValue N, SDValue &Base, + bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectScalarSSELoad(SDNode *Root, SDValue N, + bool selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); - bool TryFoldLoad(SDNode *P, SDValue N, + bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. + /// Implement addressing mode selection for inline asm expressions. 
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

-    void EmitSpecialCodeForMain();
+    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL,
                                   SDValue &Base, SDValue &Scale,
@@ -252,7 +249,7 @@ namespace {
              : AM.Base_Reg;
      Scale = getI8Imm(AM.Scale, DL);
      Index = AM.IndexReg;
-      // These are 32-bit even in 64-bit mode since RIP relative offset
+      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
@@ -283,32 +280,105 @@ namespace {
      Segment = CurDAG->getRegister(0, MVT::i32);
    }

-    /// getI8Imm - Return a target constant with the specified value, of type
-    /// i8.
+    // Utility function to determine whether we should avoid selecting
+    // immediate forms of instructions for better code size or not.
+    // At a high level, we'd like to avoid such instructions when
+    // we have similar constants used within the same basic block
+    // that can be kept in a register.
+    //
+    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+      uint32_t UseCount = 0;
+
+      // We do not want to hoist if we're not optimizing for size.
+      // TODO: We'd like to remove this restriction.
+      // See the comment in X86InstrInfo.td for more info.
+      if (!OptForSize)
+        return false;
+
+      // Walk all the users of the immediate.
+      for (SDNode::use_iterator UI = N->use_begin(),
+           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+        SDNode *User = *UI;
+
+        // This user is already selected. Count it as a legitimate use and
+        // move on.
+        if (User->isMachineOpcode()) {
+          UseCount++;
+          continue;
+        }
+
+        // We want to count stores of immediates as real uses.
+        if (User->getOpcode() == ISD::STORE &&
+            User->getOperand(1).getNode() == N) {
+          UseCount++;
+          continue;
+        }
+
+        // We don't currently match users that have > 2 operands (except
+        // for stores, which are handled above).
+        // Those instructions won't match in ISel, for now, and would
+        // be counted incorrectly.
+        // This may change in the future as we add additional instruction
+        // types.
+        if (User->getNumOperands() != 2)
+          continue;
+
+        // Immediates that are used for offsets as part of stack
+        // manipulation should be left alone. These are typically
+        // used to indicate SP offsets for argument passing and
+        // will get pulled into stores/pushes (implicitly).
+        if (User->getOpcode() == X86ISD::ADD ||
+            User->getOpcode() == ISD::ADD ||
+            User->getOpcode() == X86ISD::SUB ||
+            User->getOpcode() == ISD::SUB) {
+
+          // Find the other operand of the add/sub.
+          SDValue OtherOp = User->getOperand(0);
+          if (OtherOp.getNode() == N)
+            OtherOp = User->getOperand(1);
+
+          // Don't count if the other operand is SP.
+          RegisterSDNode *RegNode;
+          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+              (RegNode = dyn_cast_or_null<RegisterSDNode>(
+                 OtherOp->getOperand(1).getNode())))
+            if ((RegNode->getReg() == X86::ESP) ||
+                (RegNode->getReg() == X86::RSP))
+              continue;
+        }
+
+        // ... otherwise, count this and move on.
+        UseCount++;
+      }
+
+      // If we have more than one use, recommend hoisting.
+      return (UseCount > 1);
+    }
+
+    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, SDLoc DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

-    /// getI32Imm - Return a target constant with the specified value, of type
-    /// i32.
+    /// Return a target constant with the specified value, of type i32.

inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } - /// getGlobalBaseReg - Return an SDNode that returns the value of - /// the global base register. Output instructions required to - /// initialize the global base register, if necessary. - /// + /// Return an SDNode that returns the value of the global base register. + /// Output instructions required to initialize the global base register, + /// if necessary. SDNode *getGlobalBaseReg(); - /// getTargetMachine - Return a reference to the TargetMachine, casted - /// to the target-specific type. + /// Return a reference to the TargetMachine, casted to the target-specific + /// type. const X86TargetMachine &getTargetMachine() const { return static_cast<const X86TargetMachine &>(TM); } - /// getInstrInfo - Return a reference to the TargetInstrInfo, casted - /// to the target-specific type. + /// Return a reference to the TargetInstrInfo, casted to the target-specific + /// type. const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } @@ -386,9 +456,9 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { return true; } -/// MoveBelowCallOrigChain - Replace the original chain operand of the call with +/// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. -static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, +static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain) { SmallVector<SDValue, 8> Ops; SDValue Chain = OrigChain.getOperand(0); @@ -418,7 +488,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } -/// isCalleeLoad - Return true if call address is a load and it can be +/// Return true if call address is a load and it can be /// moved below CALLSEQ_START and the chains leading up to the call. /// Return the CALLSEQ_START by reference as a second output. /// In the case of a tail call, there isn't a callseq node between the call @@ -462,11 +532,11 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { void X86DAGToDAGISel::PreprocessISelDAG() { // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + OptForSize = MF->getFunction()->optForSize(); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && // Only does this when target favors doesn't favor register indirect @@ -500,7 +570,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain, HasCallSeq)) continue; - MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; continue; } @@ -577,9 +647,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } -/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in -/// the main function. -void X86DAGToDAGISel::EmitSpecialCodeForMain() { +/// Emit any code that needs to be executed only in the main function. 
+void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; auto &DL = CurDAG->getDataLayout(); @@ -599,7 +668,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. if (const Function *Fn = MF->getFunction()) if (Fn->hasExternalLinkage() && Fn->getName() == "main") - EmitSpecialCodeForMain(); + emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { @@ -612,7 +681,7 @@ static bool isDispSafeForFrameIndex(int64_t Val) { return isInt<31>(Val); } -bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, +bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // Cannot combine ExternalSymbol displacements with integer offsets. if (Offset != 0 && (AM.ES || AM.MCSym)) @@ -634,7 +703,7 @@ bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, } -bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ +bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ SDValue Address = N->getOperand(1); // load gs:0 -> GS segment register. @@ -658,11 +727,10 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ return true; } -/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes -/// into an addressing mode. These wrap things that will resolve down into a -/// symbol reference. If no match is possible, this returns true, otherwise it -/// returns false. -bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { +/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing +/// mode. These wrap things that will resolve down into a symbol reference. +/// If no match is possible, this returns true, otherwise it returns false. +bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { // If the addressing mode already has a symbol as the displacement, we can // never match another symbol. if (AM.hasSymbolicDisplacement()) @@ -685,7 +753,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.GV = G->getGlobal(); AM.SymbolFlags = G->getTargetFlags(); - if (FoldOffsetIntoAddress(G->getOffset(), AM)) { + if (foldOffsetIntoAddress(G->getOffset(), AM)) { AM = Backup; return true; } @@ -694,7 +762,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { AM.CP = CP->getConstVal(); AM.Align = CP->getAlignment(); AM.SymbolFlags = CP->getTargetFlags(); - if (FoldOffsetIntoAddress(CP->getOffset(), AM)) { + if (foldOffsetIntoAddress(CP->getOffset(), AM)) { AM = Backup; return true; } @@ -710,7 +778,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.BlockAddr = BA->getBlockAddress(); AM.SymbolFlags = BA->getTargetFlags(); - if (FoldOffsetIntoAddress(BA->getOffset(), AM)) { + if (foldOffsetIntoAddress(BA->getOffset(), AM)) { AM = Backup; return true; } @@ -758,11 +826,10 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { return true; } -/// MatchAddress - Add the specified node to the specified addressing mode, -/// returning true if it cannot be done. This just pattern matches for the -/// addressing mode. -bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { - if (MatchAddressRecursively(N, AM, 0)) +/// Add the specified node to the specified addressing mode, returning true if +/// it cannot be done. 
This just pattern matches for the addressing mode. +bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { + if (matchAddressRecursively(N, AM, 0)) return true; // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has @@ -790,15 +857,49 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { return false; } +bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. + if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + return true; +} + // Insert a node into the DAG at least before the Pos node's position. This // will reposition the node as needed, and will assign it a node ID that is <= // the Pos node's ID. Note that this does *not* preserve the uniqueness of node // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. -static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { +static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { if (N.getNode()->getNodeId() == -1 || N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { - DAG.RepositionNode(Pos.getNode(), N.getNode()); + DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); N.getNode()->setNodeId(Pos.getNode()->getNodeId()); } } @@ -807,7 +908,7 @@ static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { // safe. This allows us to convert the shift and and into an h-register // extract and a scaled index. Returns false if the simplification is // performed. -static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -835,12 +936,12 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. 
- InsertDAGNode(DAG, N, Eight); - InsertDAGNode(DAG, N, Srl); - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, And); - InsertDAGNode(DAG, N, ShlCount); - InsertDAGNode(DAG, N, Shl); + insertDAGNode(DAG, N, Eight); + insertDAGNode(DAG, N, Srl); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, And); + insertDAGNode(DAG, N, ShlCount); + insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); @@ -850,7 +951,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. -static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, +static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -880,9 +981,9 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, NewAnd); - InsertDAGNode(DAG, N, NewShift); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, NewAnd); + insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); AM.Scale = 1 << ShiftAmt; @@ -917,7 +1018,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // Note that this function assumes the mask is provided as a mask *after* the // value is shifted. The input chain may or may not match that, but computing // such a mask is trivial. -static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -973,7 +1074,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); - InsertDAGNode(DAG, N, NewX); + insertDAGNode(DAG, N, NewX); X = NewX; } SDLoc DL(N); @@ -987,10 +1088,10 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewSRLAmt); - InsertDAGNode(DAG, N, NewSRL); - InsertDAGNode(DAG, N, NewSHLAmt); - InsertDAGNode(DAG, N, NewSHL); + insertDAGNode(DAG, N, NewSRLAmt); + insertDAGNode(DAG, N, NewSRL); + insertDAGNode(DAG, N, NewSHLAmt); + insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); AM.Scale = 1 << AMShiftAmt; @@ -998,7 +1099,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, return false; } -bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, +bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); DEBUG({ @@ -1007,7 +1108,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, }); // Limit recursion. if (Depth > 5) - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. 
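Editor's aside: the rewrite quoted a few hunks up -- "(X << C1) & C2" to "(X & (C2>>C1)) << C1" -- is easy to validate by brute force. The following standalone sketch (not part of the patch; C1 = 3 and C2 = 0xF8 are arbitrary constants satisfying the precondition that C2's low C1 bits are clear) checks the identity that lets the shift become an LEA scale:

// Standalone sanity check of the masked-shift-to-scaled-mask identity.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 3, C2 = 0xF8; // C2 >> C1 == 0x1F; low 3 bits of C2 clear
  for (uint32_t X = 0; X < (1u << 16); ++X)
    assert(((X << C1) & C2) == ((X & (C2 >> C1)) << C1)); // folded form agrees
  return 0;
}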
@@ -1020,7 +1121,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return true; if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) - if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) + if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; return true; } @@ -1038,19 +1139,19 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } case ISD::Constant: { uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); - if (!FoldOffsetIntoAddress(Val, AM)) + if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: case X86ISD::WrapperRIP: - if (!MatchWrapper(N, AM)) + if (!matchWrapper(N, AM)) return false; break; case ISD::LOAD: - if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM)) + if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) return false; break; @@ -1087,7 +1188,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; - if (!FoldOffsetIntoAddress(Disp, AM)) + if (!foldOffsetIntoAddress(Disp, AM)) return false; } @@ -1119,7 +1220,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Try to fold the mask and shift into the scale, and return false if we // succeed. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) return false; break; } @@ -1153,7 +1254,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); - if (FoldOffsetIntoAddress(Disp, AM)) + if (foldOffsetIntoAddress(Disp, AM)) Reg = N.getNode()->getOperand(0); } else { Reg = N.getNode()->getOperand(0); @@ -1179,7 +1280,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; - if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { AM = Backup; break; } @@ -1227,56 +1328,26 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; // Insert the new nodes into the topological ordering. - InsertDAGNode(*CurDAG, N, Zero); - InsertDAGNode(*CurDAG, N, Neg); + insertDAGNode(*CurDAG, N, Zero); + insertDAGNode(*CurDAG, N, Neg); return false; } - case ISD::ADD: { - // Add an artificial use to this node so that we can keep track of - // it if it gets CSE'd with a different node. - HandleSDNode Handle(N); - - X86ISelAddressMode Backup = AM; - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) - return false; - AM = Backup; - - // Try again after commuting the operands. - if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& - !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + case ISD::ADD: + if (!matchAdd(N, AM, Depth)) return false; - AM = Backup; - - // If we couldn't fold both operands into the address at the same time, - // see if we can just put each operand into a register and fold at least - // the add. 
- if (AM.BaseType == X86ISelAddressMode::RegBase && - !AM.Base_Reg.getNode() && - !AM.IndexReg.getNode()) { - N = Handle.getValue(); - AM.Base_Reg = N.getOperand(0); - AM.IndexReg = N.getOperand(1); - AM.Scale = 1; - return false; - } - N = Handle.getValue(); break; - } case ISD::OR: - // Handle "X | C" as "X + C" iff X is known to have C bits clear. - if (CurDAG->isBaseWithConstantOffset(N)) { - X86ISelAddressMode Backup = AM; - ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1)); - - // Start with the LHS as an addr mode. - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !FoldOffsetIntoAddress(CN->getSExtValue(), AM)) - return false; - AM = Backup; - } + // We want to look through a transform in InstCombine and DAGCombiner that + // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. + // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) + // An 'lea' can then be used to match the shift (multiply) and add: + // and $1, %esi + // lea (%rsi, %rdi, 8), %rax + if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && + !matchAdd(N, AM, Depth)) + return false; break; case ISD::AND: { @@ -1299,27 +1370,27 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, uint64_t Mask = N.getConstantOperandVal(1); // Try to fold the mask and shift into an extract and scale. - if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift directly into the scale. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. - if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) return false; break; } } - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); } -/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the +/// Helper for MatchAddress. Add the specified node to the /// specified addressing mode without any further recursion. -bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { +bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. 
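Editor's aside: the new ISD::OR case above relies on the fact that when two operands share no set bits, 'or' computes the same value as 'add' (no carries can occur), so the address is matched exactly like an add and folded into an LEA. A standalone check of that fact, using the (and x, 1) / (shl y, 3) shape from the comment (hypothetical driver, not part of the patch):

// Standalone check: disjoint-bit operands make OR and ADD agree.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 256; ++x)
    for (uint32_t y = 0; y < 256; ++y) {
      uint32_t a = x & 1;  // only bit 0 can be set
      uint32_t b = y << 3; // only bits 3 and up can be set
      assert((a | b) == (a + b)); // no common bits => no carries => or == add
    }
  return 0;
}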
@@ -1339,7 +1410,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1362,7 +1433,7 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, // If Base is 0, the whole address is in index and the Scale is 1 if (isa<ConstantSDNode>(Base)) { - assert(dyn_cast<ConstantSDNode>(Base)->isNullValue() && + assert(cast<ConstantSDNode>(Base)->isNullValue() && "Unexpected base in gather/scatter"); Scale = getI8Imm(1, DL); Base = CurDAG->getRegister(0, MVT::i32); @@ -1375,14 +1446,14 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectAddr - returns true if it is able pattern match an addressing mode. +/// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. /// /// Parent is the parent node of the addr operand that is being matched. It /// is always a load, store, atomic node, or null. It is only null when /// checking memory operands for inline asm nodes. -bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; @@ -1404,7 +1475,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); } - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; MVT VT = N.getSimpleValueType(); @@ -1420,14 +1491,14 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to -/// match a load whose top elements are either undef or zeros. The load flavor -/// is derived from the type of N, which is either v4f32 or v2f64. +/// Match a scalar SSE load. In particular, we want to match a load whose top +/// elements are either undef or zeros. The load flavor is derived from the +/// type of N, which is either v4f32 or v2f64. /// /// We also return: /// PatternChainNode: this is the matched node that has a chain input and /// output. -bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, +bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, @@ -1439,7 +1510,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; return true; } @@ -1457,7 +1528,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { // Okay, this is a zero extending load. Fold it. 
LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; PatternNodeWithChain = SDValue(LD, 0); return true; @@ -1466,7 +1537,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, } -bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { +bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { uint64_t ImmVal = CN->getZExtValue(); if ((uint32_t)ImmVal != (uint64_t)ImmVal) @@ -1495,10 +1566,10 @@ bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { return TM.getCodeModel() == CodeModel::Small; } -bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; SDLoc DL(N); @@ -1533,9 +1604,9 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, return true; } -/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing +/// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. -bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, +bool X86DAGToDAGISel::selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1546,7 +1617,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; @@ -1572,13 +1643,12 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, Complexity++; // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA - // to a LEA. This is determined with some expermentation but is by no means + // to a LEA. This is determined with some experimentation but is by no means // optimal (especially for code size consideration). LEA is nice because of // its three-address nature. Tweak the cost function again when we can run // convertToThreeAddress() at register allocation time. if (AM.hasSymbolicDisplacement()) { - // For X86-64, we should always use lea to materialize RIP relative - // addresses. + // For X86-64, always use LEA to materialize RIP-relative addresses. if (Subtarget->is64Bit()) Complexity = 4; else @@ -1596,8 +1666,8 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, return true; } -/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes. -bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, +/// This is only run on TargetGlobalTLSAddress nodes. 
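// A quick worked example for selectMOV64Imm32 above (the constant is made up
// for illustration): 0x00000000DEADBEEF passes the (uint32_t)Imm == Imm test,
// so it can be materialized with the 5-byte "movl $0xDEADBEEF, %eax", whose
// implicit zero-extension fills the upper half of RAX, instead of the 10-byte
// "movabsq $0xDEADBEEF, %rax".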
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); @@ -1621,7 +1691,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, } -bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, +bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1630,14 +1700,13 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, !IsLegalToFold(N, P, P, OptLevel)) return false; - return SelectAddr(N.getNode(), + return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } -/// getGlobalBaseReg - Return an SDNode that returns the value of -/// the global base register. Output instructions required to -/// initialize the global base register, if necessary. -/// +/// Return an SDNode that returns the value of the global base register. +/// Output instructions required to initialize the global base register, +/// if necessary. SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); auto &DL = MF->getDataLayout(); @@ -1828,7 +1897,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, return Val; } -SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { +SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) { if (Node->hasAnyUseOfValue(0)) return nullptr; @@ -1841,7 +1910,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) + if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) return nullptr; // Which index into the table. @@ -1933,9 +2002,9 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { return CurDAG->getMergeValues(RetVals, dl).getNode(); } -/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has -/// any uses which require the SF or OF bits to be accurate. -static bool HasNoSignedComparisonUses(SDNode *N) { +/// Test whether the given X86ISD::CMP node has any uses which require the SF +/// or OF bits to be accurate. +static bool hasNoSignedComparisonUses(SDNode *N) { // Examine each user of the node. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { @@ -1995,9 +2064,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) { return true; } -/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode -/// is suitable for doing the {load; increment or decrement; store} to modify -/// transformation. +/// Check whether or not the chain ending in StoreNode is suitable for doing +/// the {load; increment or decrement; store} to modify transformation. static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, SDValue StoredVal, SelectionDAG *CurDAG, LoadSDNode* &LoadNode, SDValue &InputChain) { @@ -2081,8 +2149,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, return true; } -/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory -/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +/// Get the appropriate X86 opcode for an in-memory increment or decrement. +/// Opc should be X86ISD::DEC or X86ISD::INC. 
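// For example (illustrative assembly), the {load; increment or decrement;
// store} rewrite enabled by isLoadIncOrDecStore turns
//   movl counter(%rip), %eax
//   incl %eax
//   movl %eax, counter(%rip)
// into the single read-modify-write form whose opcode is chosen below:
//   incl counter(%rip)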
static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { if (Opc == X86ISD::DEC) { if (LdVT == MVT::i64) return X86::DEC64m; if (LdVT == MVT::i32) return X86::DEC32m; @@ -2099,9 +2167,8 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { llvm_unreachable("unrecognized size for LdVT"); } -/// SelectGather - Customized ISel for GATHER operations. -/// -SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { +/// Customized ISel for GATHER operations. +SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) { // Operands of Gather: VSrc, Base, VIdx, VMask, Scale SDValue Chain = Node->getOperand(0); SDValue VSrc = Node->getOperand(2); @@ -2148,6 +2215,27 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::BRIND: { + if (Subtarget->isTargetNaCl()) + // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We + // leave the instruction alone. + break; + if (Subtarget->isTarget64BitILP32()) { + // Converts a 32-bit register to a 64-bit, zero-extended version of + // it. This is needed because x86-64 can do many things, but jmp %r32 + // ain't one of them. + const SDValue &Target = Node->getOperand(1); + assert(Target.getSimpleValueType() == llvm::MVT::i32); + SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); + SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, + Node->getOperand(0), ZextTarget); + ReplaceUses(SDValue(Node, 0), Brind); + SelectCode(ZextTarget.getNode()); + SelectCode(Brind.getNode()); + return nullptr; + } + break; + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); switch (IntNo) { @@ -2190,7 +2278,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; } - SDNode *RetVal = SelectGather(Node, Opc); + SDNode *RetVal = selectGather(Node, Opc); if (RetVal) // We already called ReplaceUses inside selectGather. return nullptr; @@ -2217,7 +2305,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_ADD: { - SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + SDNode *RetVal = selectAtomicLoadArith(Node, NVT); if (RetVal) return RetVal; break; @@ -2404,10 +2492,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commutative. if (!foldedLoad) { - foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (foldedLoad) std::swap(N0, N1); } @@ -2549,7 +2637,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; @@ -2557,7 +2645,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH).
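// (Background: an 8-bit DIV takes its 16-bit dividend in AX and writes the
// quotient to AL and the remainder to AH, so the MOVZX load below guarantees
// that AH starts out zero.)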
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; - if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, @@ -2692,7 +2780,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) + hasNoSignedComparisonUses(Node)) N0 = N0.getOperand(0); // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to @@ -2709,7 +2797,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, convert "testl %eax, $8" to "testb %al, $8" if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && (!(C->getZExtValue() & 0x80) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Reg = N0.getNode()->getOperand(0); @@ -2743,7 +2831,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, "testl %eax, $2048" to "testb %ah, $8". if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { // Shift the immediate right by 8 bits. SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, dl, MVT::i8); @@ -2781,7 +2869,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && N0.getValueType() != MVT::i16 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i16); SDValue Reg = N0.getNode()->getOperand(0); @@ -2804,7 +2892,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && N0.getValueType() == MVT::i64 && (!(C->getZExtValue() & 0x80000000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i32); SDValue Reg = N0.getNode()->getOperand(0); @@ -2854,7 +2942,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { break; SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) break; @@ -2903,7 +2991,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: - if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) + if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } @@ -2916,9 +3004,8 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, +/// ready for instruction scheduling. 
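// A worked example for the test-narrowing cases above (sketch): rewriting
//   testl $imm, %eax   -->   testb $imm, %al
// leaves ZF intact, but SF would now come from bit 7 of the masked result
// instead of bit 31, so each case only narrows unconditionally when the
// immediate cannot set the new sign bit ((imm & 0x80) == 0) and otherwise
// requires hasNoSignedComparisonUses().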
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 0f29b51..1ec93b5 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18,6 +18,7 @@ #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" +#include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" @@ -25,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -67,19 +69,14 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); -// Forward declarations. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2); - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - TD = TM.getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. - static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); @@ -118,13 +115,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); - - // The _ftol2 runtime function has an unusual calling conv, which - // is modeled by a special pseudo-instruction. - setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -175,14 +165,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) + // f32/f64 are legal, f80 is custom. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + else + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget->useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit - // FILD for other targets. + // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } @@ -206,23 +200,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } - // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 - // are Legal, f80 is custom lowered. 
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); - if (X86ScalarSSEf32) { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + if (!Subtarget->useSoftFloat()) { + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed @@ -232,8 +232,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } else { + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + } } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) @@ -242,14 +248,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else + // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - } - if (isTargetFTOL()) { - // Use the _ftol2 runtime function, which has a pseudo-instruction - // to handle its weird calling convention. setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } @@ -262,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } - } + } else if (!Subtarget->is64Bit()) + setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. 
This exposes @@ -274,8 +278,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); @@ -295,6 +298,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); @@ -302,6 +306,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); @@ -312,7 +317,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); + + if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. 
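// (The CRT inline is approximately, for illustration:
//    static __inline float fmodf(float x, float y)
//      { return (float)fmod((double)x, (double)y); }
//  so promoting FREM f32 to f64 mirrors what compiled C code already gets.)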
+ setOperationAction(ISD::FREM , MVT::f32 , Promote); + } else { + setOperationAction(ISD::FREM , MVT::f32 , Expand); + } + setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); @@ -404,15 +419,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); + setOperationAction(ISD::SETCCE , MVT::i8 , Custom); + setOperationAction(ISD::SETCCE , MVT::i16 , Custom); + setOperationAction(ISD::SETCCE , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); + setOperationAction(ISD::SETCCE , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -456,8 +477,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics - for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); @@ -473,13 +493,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - if (Subtarget->is64Bit()) { - setExceptionPointerRegister(X86::RAX); - setExceptionSelectorRegister(X86::RDX); - } else { - setExceptionPointerRegister(X86::EAX); - setExceptionSelectorRegister(X86::EDX); - } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); @@ -492,8 +505,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { - // TargetInfo::X86_64ABIBuiltinVaList + if (Subtarget->is64Bit()) { setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { @@ -505,7 +517,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); @@ -613,8 +625,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87. + // Long double always uses X87, except f128 in MMX. if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -846,15 +866,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + // ISD::CTTZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - // Do not attempt to custom lower non-power-of-2 vectors - if (!isPowerOf2_32(VT.getVectorNumElements())) - continue; - // Do not attempt to custom lower non-128-bit vectors - if (!VT.is128BitVector()) - continue; + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -892,13 +914,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
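// (Bitwise operations are width-agnostic, so funneling AND/OR/XOR for all the
// 128-bit integer types through v2i64, via the AddPromotedToType calls below,
// lets a single set of v2i64 patterns serve every element width.)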
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-128-bit vectors - if (!VT.is128BitVector()) - continue; - + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); @@ -1036,6 +1052,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, MVT::v4i32, Custom); } + if (Subtarget->hasXOP()) { + setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + setOperationAction(ISD::ROTL, MVT::v8i16, Custom); + setOperationAction(ISD::ROTL, MVT::v4i32, Custom); + setOperationAction(ISD::ROTL, MVT::v2i64, Custom); + setOperationAction(ISD::ROTL, MVT::v32i8, Custom); + setOperationAction(ISD::ROTL, MVT::v16i16, Custom); + setOperationAction(ISD::ROTL, MVT::v8i32, Custom); + setOperationAction(ISD::ROTL, MVT::v4i64, Custom); + } + if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); @@ -1126,7 +1153,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -1202,6 +1238,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v32i8, Custom); + setOperationAction(ISD::SMAX, MVT::v16i16, Custom); + setOperationAction(ISD::SMAX, MVT::v8i32, Custom); + setOperationAction(ISD::UMAX, MVT::v32i8, Custom); + setOperationAction(ISD::UMAX, MVT::v16i16, Custom); + setOperationAction(ISD::UMAX, MVT::v8i32, Custom); + setOperationAction(ISD::SMIN, MVT::v32i8, Custom); + setOperationAction(ISD::SMIN, MVT::v16i16, Custom); + setOperationAction(ISD::SMIN, MVT::v8i32, Custom); + setOperationAction(ISD::UMIN, MVT::v32i8, Custom); + setOperationAction(ISD::UMIN, MVT::v16i16, Custom); + setOperationAction(ISD::UMIN, MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be @@ -1243,15 +1292,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasInt256()) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
- for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-256-bit vectors - if (!VT.is256BitVector()) - continue; - + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); @@ -1293,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); @@ -1311,6 +1354,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); @@ -1318,19 +1362,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); - } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); @@ -1348,12 +1383,62 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); + } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, 
MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + } + } + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); @@ -1386,7 +1471,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); @@ -1395,6 +1480,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); @@ -1439,9 +1525,49 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - } + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); + + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); + 
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); + + if (Subtarget->hasVLX()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } else { + setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + } + } // Subtarget->hasCDI() + if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); @@ -1455,7 +1581,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } - if (EltSize >= 32 && VT.getSizeInBits() <= 512) { + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } @@ -1481,15 +1607,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); } } - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; - + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } @@ -1515,22 +1637,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1541,19 +1676,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - const MVT VT = (MVT::SimpleValueType)i; + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); - const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); + } - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); - if (EltSize < 32) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - } + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); } } @@ -1571,6 +1718,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -1595,8 +1744,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1604,9 +1755,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. - for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget->is64Bit()) + continue; // Add/Sub/Mul with overflow operations are custom lowered. - MVT VT = IntVTs[i]; setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); @@ -1615,7 +1767,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, VT, Custom); } - if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. 
setLibcallName(RTLIB::SHL_I128, nullptr); @@ -1658,12 +1809,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); @@ -1671,24 +1826,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); - // On Darwin, -Os means optimize for size without hurting performance, - // do not reduce the limit. MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. - // Predictable cmov don't hurt on atom because it's in-order. + // A predictable cmov does not hurt on an in-order CPU. + // FIXME: Use a CPU attribute to trigger this, not a CPU model. PredictableSelectIsExpensive = !Subtarget->isAtom(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. @@ -1716,40 +1871,43 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, if (!VT.isVector()) return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; - const unsigned NumElts = VT.getVectorNumElements(); - const EVT EltVT = VT.getVectorElementType(); - if (VT.is512BitVector()) { - if (Subtarget->hasAVX512()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - } - if (Subtarget->hasBWI()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 32: return MVT::v32i1; - case 64: return MVT::v64i1; - } - } + if (VT.isSimple()) { + MVT VVT = VT.getSimpleVT(); + const unsigned NumElts = VVT.getVectorNumElements(); + const MVT EltVT = VVT.getVectorElementType(); + if (VVT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } - if (VT.is256BitVector() || VT.is128BitVector()) { - if (Subtarget->hasVLX()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 2: return MVT::v2i1; - case 4: return MVT::v4i1; - case 8: return MVT::v8i1; - } - if (Subtarget->hasBWI() && Subtarget->hasVLX()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - case 32: return MVT::v32i1; - } + if (VVT.is256BitVector() || VVT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } + } } return VT.changeVectorElementTypeToInteger(); @@ -1769,9 +1927,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign); + getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) @@ -1821,10 +1979,11 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && - (Subtarget->isUnalignedMemAccessFast() || + (!Subtarget->isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) @@ -1842,6 +2001,9 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::f64; } } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. 
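// (Concretely, the fallback below: 64-bit targets copy in i64 chunks once at
// least 8 bytes are being moved; everything else uses i32 chunks.)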
if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; @@ -1860,8 +2022,22 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { - if (Fast) - *Fast = Subtarget->isUnalignedMemAccessFast(); + if (Fast) { + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + *Fast = true; + break; + case 128: + *Fast = !Subtarget->isUnalignedMem16Slow(); + break; + case 256: + *Fast = !Subtarget->isUnalignedMem32Slow(); + break; + // TODO: What about AVX-512 (512-bit) accesses? + } + } + // Misaligned accesses of any size are always allowed. return true; } @@ -1964,6 +2140,32 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, return true; } +Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + unsigned AddressSpace, Offset; + if (Subtarget->is64Bit()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x48; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x24 on i386 + Offset = 0x24; + AddressSpace = 256; + } + + return ConstantExpr::getIntToPtr( + ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); +} + bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); @@ -1977,11 +2179,9 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, #include "X86GenCallingConv.inc" -bool -X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { +bool X86TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); @@ -2001,6 +2201,9 @@ X86TargetLowering::LowerReturn(SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + if (CallConv == CallingConv::X86_INTR && !Outs.empty()) + report_fatal_error("X86 interrupts may not return any value"); + SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2025,7 +2228,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { - if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1) + if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); @@ -2108,13 +2311,28 @@ X86TargetLowering::LowerReturn(SDValue Chain, DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } + const 
X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (X86::GR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); + X86ISD::NodeType opcode = X86ISD::RET_FLAG; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2193,7 +2411,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2244,28 +2462,28 @@ enum StructReturnType { StackStructReturn }; static StructReturnType -callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { +callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType -argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { +argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } @@ -2285,17 +2503,34 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } -/// Return true if the calling convention is one that -/// supports tail call optimization. -static bool IsTailCallConvention(CallingConv::ID CC) { +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE); + CC == CallingConv::HiPE || CC == CallingConv::HHVM); +} + +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + // C calling conventions: + case CallingConv::C: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + // Callee pop conventions: + case CallingConv::X86_ThisCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_VectorCall: + case CallingConv::X86_FastCall: + return true; + default: + return canGuaranteeTCO(CC); + } } -/// \brief Return true if the calling convention is a C calling convention. -static bool IsCCallConvention(CallingConv::ID CC) { - return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || - CC == CallingConv::X86_64_SysV); +/// Return true if the function is being made into a tailcall target by +/// changing its ABI. 
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { + return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { @@ -2306,19 +2541,12 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) + if (!mayTailCallThisCC(CalleeCC)) return false; return true; } -/// Return true if the function is being made into -/// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, - bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && IsTailCallConvention(CC); -} - SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, @@ -2329,7 +2557,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe( + bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -2344,6 +2572,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, else ValVT = VA.getValVT(); + // Calculate SP offset of interrupt parameter, re-arrange the slot normally + // taken by a return address. + int Offset = 0; + if (CallConv == CallingConv::X86_INTR) { + const X86Subtarget& Subtarget = + static_cast<const X86Subtarget&>(DAG.getSubtarget()); + // X86 interrupts may take one or two arguments. + // On the stack there will be no return address as in regular call. + // Offset of last argument need to be set to -4/-8 bytes. + // Where offset of the first argument out of two, should be set to 0 bytes. + Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + } + // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -2352,14 +2593,24 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); return ExtendedInMem ? 
DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } @@ -2413,15 +2664,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } -SDValue -X86TargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) - const { +SDValue X86TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); @@ -2436,9 +2682,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); + if (CallConv == CallingConv::X86_INTR) { + bool isLegal = Ins.size() == 1 || + (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || + (!Is64Bit && Ins[1].VT == MVT::i32))); + if (!isLegal) + report_fatal_error("X86 interrupts may take one or two arguments"); + } + // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -2471,6 +2725,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) @@ -2547,8 +2803,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. - if (FuncIsMadeTailCallSafe(CallConv, - MF.getTarget().Options.GuaranteedTailCallOpt)) + if (shouldGuaranteeTCO(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -2561,13 +2817,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MFI->CreateFixedObject(1, StackSize, true)); } - MachineModuleInfo &MMI = MF.getMMI(); - const Function *WinEHParent = nullptr; - if (MMI.hasWinEHFuncInfo(Fn)) - WinEHParent = MMI.getWinEHParent(Fn); - bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn; - bool IsWinEHParent = WinEHParent && WinEHParent == Fn; - // Figure out if XMM registers are in use. 
assert(!(Subtarget->useSoftFloat() && Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && @@ -2631,10 +2880,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); MemOps.push_back(Store); Offset += 8; } @@ -2656,27 +2906,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - } else if (IsWin64 && IsWinEHOutlined) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject( - /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false)); - - MMI.getWinEHFuncInfo(Fn) - .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] = - FuncInfo->getRegSaveFrameIndex(); - - // Store the second integer parameter (rdx) into rsp+16 relative to the - // stack pointer at the entry of the function. - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64); - Chain = DAG.getStore( - Val.getValue(1), dl, Val, RSFIN, - MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()), - /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { @@ -2723,12 +2952,15 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code if present + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. - if (!Is64Bit && !IsTailCallConvention(CallConv) && + if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && - argsAreStructReturn(Ins) == StackStructReturn) + argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } @@ -2743,21 +2975,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setArgumentStackSize(StackSize); - if (IsWinEHParent) { - if (Is64Bit) { - int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); - SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); - MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; - SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64); - Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, - MachinePointerInfo::getFixedStack(UnwindHelpFI), - /*isVolatile=*/true, - /*isNonTemporal=*/false, /*Alignment=*/0); - } else { - // Functions using Win32 EH are considered to have opaque SP adjustments - // to force local variables to be addressed from the frame or base - // pointers. 
- MFI->setHasOpaqueSPAdjustment(true); + if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + assert(Is64Bit); + // TODO: Add a mechanism to frame lowering that will allow us to indicate + // that we'd prefer this slot be allocated towards the bottom of the frame + // (i.e. near the stack pointer after allocating the frame). Every + // funclet needs a copy of this slot in its (mostly empty) frame, and the + // offset from the bottom of this and each funclet's frame must be the + // same, so the size of funclets' (mostly empty) frames is dictated by + // how far this slot is from the bottom (since they allocate just enough + // space to accomodate holding this slot at the correct offset). + int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -2777,9 +3008,10 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } /// Emit a load of return address if tail call @@ -2813,11 +3045,24 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewReturnAddrFI), + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), NewReturnAddrFI), false, false, 0); return Chain; } +/// Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -2835,11 +3080,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - StructReturnType SR = callIsStructReturn(Outs); + StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + if (CallConv == CallingConv::X86_INTR) + report_fatal_error("X86 interrupts may not be called directly"); + if (Attr.getValueAsString() == "true") isTailCall = false; @@ -2878,7 +3126,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ++NumTailCalls; } - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. 
@@ -2892,13 +3140,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && - IsTailCallConvention(CallConv)) + canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2970,7 +3218,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && - Arg.getValueType().getScalarType() == MVT::i1) + Arg.getValueType().getVectorElementType() == MVT::i1) Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. @@ -2987,9 +3235,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0); Arg = SpillSlot; break; } @@ -3125,10 +3374,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Flags, DAG, dl)); } else { // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains2.push_back(DAG.getStore( + ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0)); } } @@ -3207,7 +3456,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ExtraLoad) Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, + false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -3261,9 +3511,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); - // If this is an invoke in a 32-bit function using an MSVC personality, assume - // the function clobbers all registers. If an exception is thrown, the runtime - // will not restore CSRs. + // If this is an invoke in a 32-bit function using a funclet-based + // personality, assume the function clobbers all registers. If an exception + // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { @@ -3272,7 +3522,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallerFn->hasPersonalityFn() ? 
classifyEHPersonality(CallerFn->getPersonalityFn()) : EHPersonality::Unknown; - if (isMSVCEHPersonality(Pers)) + if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } @@ -3300,7 +3550,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything - else if (!Is64Bit && !IsTailCallConvention(CallConv) && + else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee @@ -3358,8 +3608,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // EDI // local1 .. -/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned -/// for a 16 byte align requirement. +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { @@ -3380,9 +3630,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, return Offset; } -/// MatchingStackOffset - Return true if the given stack call argument is -/// already available in the same position (relatively) of the caller's -/// incoming argument stack. +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, @@ -3435,25 +3684,19 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } -/// IsEligibleForTailCallOptimization - Check whether the call is eligible -/// for tail call optimization. Targets which want to do tail call -/// optimization should implement this function. -bool -X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool isVarArg, - bool isCalleeStructRet, - bool isCallerStructRet, - Type *RetTy, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG &DAG) const { - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. +bool X86TargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. 
- const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, @@ -3474,7 +3717,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) + if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } @@ -3493,19 +3736,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // An stdcall/thiscall caller is expected to clean up its arguments; the - // callee isn't going to do that. - // FIXME: this is more restrictive than needed. We could produce a tailcall - // when the stack adjustment matches. For example, with a thiscall that takes - // only one argument. - if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || - CallerCC == CallingConv::X86_ThisCall)) - return false; - // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { - // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) @@ -3573,6 +3806,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + unsigned StackArgsSize = 0; + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -3587,11 +3822,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); - if (CCInfo.getNextStackOffset()) { - MachineFunction &MF = DAG.getMachineFunction(); - if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) - return false; + StackArgsSize = CCInfo.getNextStackOffset(); + if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -3642,6 +3875,21 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + bool CalleeWillPop = + X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt); + + if (unsigned BytesToPop = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + // If we have bytes to pop, the callee must pop them. + bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; + if (!CalleePopMatches) + return false; + } else if (CalleeWillPop && StackArgsSize > 0) { + // If we don't have bytes to pop, make sure the callee doesn't pop any. 
+ return false; + } + return true; } @@ -3672,6 +3920,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: + case X86ISD::INSERTPS: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: @@ -3688,11 +3937,13 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: return true; } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3707,7 +3958,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3772,23 +4023,23 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, return false; } -/// isCalleePop - Determines whether the callee is required to pop its -/// own arguments. Callee pop is necessary to support tail calls. +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, - bool is64Bit, bool IsVarArg, bool TailCallOpt) { + bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { + // If GuaranteeTCO is true, we force some calls to be callee pop so that we + // can guarantee TCO. + if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) + return true; + switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: return !is64Bit; - case CallingConv::Fast: - case CallingConv::GHC: - case CallingConv::HiPE: - if (IsVarArg) - return false; - return TailCallOpt; } } @@ -3807,11 +4058,26 @@ static bool isX86CCUnsigned(unsigned X86CC) { case X86::COND_BE: return true; case X86::COND_AE: return true; } - llvm_unreachable("covered switch fell through?!"); } -/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 -/// specific condition code, returning the condition code and the LHS/RHS of the +static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { + switch (SetCCOpcode) { + default: llvm_unreachable("Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; + } +} + +/// Do a one-to-one translation of a ISD::CondCode to the X86-specific +/// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. 
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { @@ -3833,19 +4099,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } - switch (SetCCOpcode) { - default: llvm_unreachable("Invalid integer condition!"); - case ISD::SETEQ: return X86::COND_E; - case ISD::SETGT: return X86::COND_G; - case ISD::SETGE: return X86::COND_GE; - case ISD::SETLT: return X86::COND_L; - case ISD::SETLE: return X86::COND_LE; - case ISD::SETNE: return X86::COND_NE; - case ISD::SETULT: return X86::COND_B; - case ISD::SETUGT: return X86::COND_A; - case ISD::SETULE: return X86::COND_BE; - case ISD::SETUGE: return X86::COND_AE; - } + return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. @@ -3898,8 +4152,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } -/// hasFPCMov - is there a floating point cmov for the specific X86 condition -/// code. Current x86 isa includes the following FP cmov instructions: +/// Is there a floating point cmov for the specific X86 condition code? +/// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { @@ -3917,7 +4171,36 @@ static bool hasFPCMov(unsigned X86CC) { } } -/// isFPImmLegal - Returns true if the target can instruction select the + +bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + + const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); + if (!IntrData) + return false; + + switch (IntrData->Type) { + case LOADA: + case LOADU: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1); + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + default: + break; + } + + return false; +} + +/// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { @@ -3970,7 +4253,7 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasLZCNT(); } -/// isUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) @@ -3979,19 +4262,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { return true; } -/// isUndefOrInRange - Return true if Val is undef or if its value falls within -/// the specified range (L, H]. +/// Return true if Val is undef or if its value falls within the +/// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } -/// isUndefOrEqual - Val is either less than zero (undef) or equal to the -/// specified value. +/// Val is either less than zero (undef) or equal to the specified value. 
static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } -/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, @@ -4002,9 +4284,8 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// isVEXTRACTIndex - Return true if the specified -/// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for instruction that extract 128 or 256 bit vectors +/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector +/// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4021,7 +4302,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { return Result; } -/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR +/// Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { @@ -4057,8 +4338,8 @@ bool X86::isVEXTRACT256Index(SDNode *N) { static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - llvm_unreachable("Illegal extract subvector for VEXTRACT"); + assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && + "Illegal extract subvector for VEXTRACT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); @@ -4072,8 +4353,8 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - llvm_unreachable("Illegal insert subvector for VINSERT"); + assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && + "Illegal insert subvector for VINSERT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); @@ -4085,53 +4366,71 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { return Index / NumElemsPerChunk; } -/// getExtractVEXTRACT128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } -/// getExtractVEXTRACT256Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. 
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } -/// getInsertVINSERT128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } -/// getInsertVINSERT256Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } -/// isZero - Returns true if Elt is a constant integer zero -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isNullValue(); -} - -/// isZeroNode - Returns true if Elt is a constant zero or a floating point -/// constant +0.0. +/// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (isZero(Elt)) - return true; - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) - return CFP->getValueAPF().isPosZero(); - return false; + return isNullConstant(Elt) || isNullFPConstant(Elt); } -/// getZeroVector - Returns a vector of specified type with all zero elements. -/// -static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, +// Build a vector of constants +// Use an UNDEF node if MaskElt == -1. +// Spilt 64-bit constants in the 32-bit mode. +static SDValue getConstVector(ArrayRef<int> Values, MVT VT, + SelectionDAG &DAG, + SDLoc dl, bool IsMask = false) { + + SmallVector<SDValue, 32> Ops; + bool Split = false; + + MVT ConstVecVT = VT; + unsigned NumElts = VT.getVectorNumElements(); + bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); + if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { + ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); + Split = true; + } + + MVT EltVT = ConstVecVT.getVectorElementType(); + for (unsigned i = 0; i < NumElts; ++i) { + bool IsUndef = Values[i] < 0 && IsMask; + SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(Values[i], dl, EltVT); + Ops.push_back(OpNode); + if (Split) + Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(0, dl, EltVT)); + } + SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); + if (Split) + ConstsNode = DAG.getBitcast(VT, ConstsNode); + return ConstsNode; +} + +/// Returns a vector of specified type with all zero elements. 
+static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4163,7 +4462,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); - } else if (VT.getScalarType() == MVT::i1) { + } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); @@ -4195,19 +4494,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) - * ElemsPerChunk); + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + NormalizedIdxVal, - ElemsPerChunk)); + makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } @@ -4245,13 +4543,13 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) - * ElemsPerChunk); + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -4279,7 +4577,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, Vec, ZeroIndex); // The blend instruction, and therefore its mask, depend on the data type. - MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); + MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); if (ScalarType.isFloatingPoint()) { // Choose either vblendps (float) or vblendpd (double). unsigned ScalarSize = ScalarType.getSizeInBits(); @@ -4302,6 +4600,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); + Result = DAG.getBitcast(CastVT, Result); Vec256 = DAG.getBitcast(CastVT, Vec256); Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); return DAG.getBitcast(ResultVT, Vec256); @@ -4316,6 +4615,81 @@ static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } +/// Insert i1-subvector to i1-vector. 
+static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (IdxVal == 0 && Vec.isUndef()) // the operation is legal + return Op; + + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + unsigned NumElems = OpVT.getVectorNumElements(); + unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); + + assert(IdxVal + SubVecNumElems <= NumElems && + IdxVal % SubVecVT.getSizeInBits() == 0 && + "Unexpected index value in INSERT_SUBVECTOR"); + + // There are 3 possible cases: + // 1. Subvector should be inserted in the lower part (IdxVal == 0) + // 2. Subvector should be inserted in the upper part + // (IdxVal + SubVecNumElems == NumElems) + // 3. Subvector should be inserted in the middle (for example v2i1 + // to v16i1, index 2) + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(OpVT); + SDValue WideSubVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); + if (Vec.isUndef()) + return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; + } + + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + + // Simple case when we put subvector in the upper part + if (IdxVal + SubVecNumElems == NumElems) { + // Zero upper bits of the Vec + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + // Subvector should be inserted in the middle - use shuffle + SmallVector<int, 64> Mask; + for (unsigned i = 0; i < NumElems; ++i) + Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? + i : i + NumElems); + return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); +} + /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower @@ -4334,18 +4708,22 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -/// getOnesVector - Returns a vector of specified type with all bits set. +/// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. 
/// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, - SDLoc dl) { +static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; - if (VT.is256BitVector()) { - if (HasInt256) { // AVX2 + if (VT.is512BitVector()) { + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.is256BitVector()) { + if (Subtarget->hasInt256()) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX @@ -4360,19 +4738,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, return DAG.getBitcast(VT, Vec); } -/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd -/// operation of specified width. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2) { - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask; - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) - Mask.push_back(i); - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); -} - -/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. +/// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4384,7 +4750,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. +/// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4396,10 +4762,10 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified -/// vector of zero or undef vector. This produces a shuffle where the low -/// element of V2 is swizzled into the zero/undef vector, landing at element -/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +/// Return a vector_shuffle of the specified vector of zero or undef vector. +/// This produces a shuffle where the low element of V2 is swizzled into the +/// zero/undef vector, landing at element Idx. +/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, @@ -4415,13 +4781,12 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } -/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. Sets -/// IsUnary to true if only uses one source. Note that this will set IsUnary for -/// shuffles which use a single input multiple times, and in those cases it will +/// Calculates the shuffle mask corresponding to the target-specific opcode. +/// Returns true if the Mask could be calculated. 
Sets IsUnary to true if only +/// uses one source. Note that this will set IsUnary for shuffles which use a +/// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. -/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. -static bool getTargetShuffleMask(SDNode *N, MVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4438,6 +4803,11 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::INSERTPS: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -4482,7 +4852,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. - EVT VT = MaskNode.getValueType(); + MVT VT = MaskNode.getSimpleValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); if (!VT.isInteger()) @@ -4530,8 +4900,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { DecodePSHUFBMask(C, Mask); - if (Mask.empty()) - return false; break; } @@ -4549,11 +4917,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); - if (Mask.empty()) return false; - // Mask only contains negative index if an element is zero. - if (std::any_of(Mask.begin(), Mask.end(), - [](int M){ return M == SM_SentinelZero; })) - return false; + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); @@ -4572,9 +4936,128 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLPS: // Not yet implemented return false; + case X86ISD::VPERMV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(0); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); + SmallVector<uint64_t, 32> RawMask; + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. 
+ assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else if (isa<ConstantSDNode>(Op)) { + APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } else + return false; + } + DecodeVPERMVMask(RawMask, Mask); + break; + } + if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { + unsigned NumEltsInMask = MaskNode->getNumOperands(); + MaskNode = MaskNode->getOperand(0); + if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) { + APInt MaskEltValue = CN->getAPIntValue(); + for (unsigned i = 0; i < NumEltsInMask; ++i) + RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); + DecodeVPERMVMask(RawMask, Mask); + break; + } + // It may be a scalar load + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMVMask(C, VT, Mask); + break; + } + return false; + } + case X86ISD::VPERMV3: { + IsUnary = false; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(1); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + SmallVector<uint64_t, 32> RawMask; + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else { + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } + } + DecodeVPERMV3Mask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMV3Mask(C, VT, Mask); + break; + } + return false; + } default: llvm_unreachable("unknown target shuffle node"); } + // Empty mask indicates the decode failed. + if (Mask.empty()) + return false; + + // Check if we're getting a shuffle mask with zero'd elements. + if (!AllowSentinelZero) + if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; + // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. 
@@ -4586,7 +5069,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, return true; } -/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { @@ -4613,19 +5096,19 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); - unsigned NumElems = ShufVT.getVectorNumElements(); + int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; - if (Elt < 0) + if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufVT.getVectorElementType()); - SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); + SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -4650,8 +5133,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } -/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. -/// +/// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4721,8 +5203,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, return DAG.getBitcast(MVT::v16i8, V); } -/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. -/// +/// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4753,7 +5234,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } -/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. +/// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { @@ -4924,7 +5405,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); - int64_t StartOffset = Offset & ~(RequiredAlign-1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -5157,8 +5638,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - const Function *F = DAG.getMachineFunction().getFunction(); - bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. 
@@ -5188,9 +5668,10 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5329,7 +5810,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } -static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { +static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); @@ -5366,7 +5847,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - SDValue Imm = ConvertI1VectorToInterger(Op, DAG); + SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); @@ -5600,7 +6081,7 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, /// node. static SDValue LowerToAddSub(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); @@ -5662,12 +6143,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, // Update InVec0 and InVec1. if (InVec0.getOpcode() == ISD::UNDEF) { InVec0 = Op0.getOperand(0); - if (InVec0.getValueType() != VT) + if (InVec0.getSimpleValueType() != VT) return SDValue(); } if (InVec1.getOpcode() == ISD::UNDEF) { InVec1 = Op1.getOperand(0); - if (InVec1.getValueType() != VT) + if (InVec1.getSimpleValueType() != VT) return SDValue(); } @@ -5703,7 +6184,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; @@ -5845,7 +6326,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. 
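
ConvertI1VectorToInteger (whose spelling the hunk above fixes) folds a constant vXi1 build_vector into a single integer immediate, with lane 0 landing in bit 0, which is then bitcast to the mask register type. A rough equivalent of the packing step in plain C++ (packI1Lanes is an illustrative name, not the LLVM routine):

#include <cstdint>
#include <vector>

// Pack constant i1 lanes into an immediate, lane i -> bit i.
uint64_t packI1Lanes(const std::vector<bool> &Lanes) {
  uint64_t Imm = 0;
  for (size_t i = 0; i < Lanes.size(); ++i)
    if (Lanes[i])
      Imm |= uint64_t(1) << i;
  return Imm;
}

// e.g. a v4i1 constant <1,0,1,1> packs to 0b1101.
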
- if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) + if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); // Vectors containing all zeros can be matched by pxor and xorps later @@ -5866,7 +6347,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Op; if (!VT.is512BitVector()) - return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); + return getOnesVector(VT, Subtarget, DAG, dl); } BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); @@ -5881,7 +6362,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumZero = 0; unsigned NumNonZero = 0; - unsigned NonZeros = 0; + uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet<SDValue, 8> Values; for (unsigned i = 0; i < NumElems; ++i) { @@ -5895,7 +6376,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (X86::isZeroNode(Elt)) NumZero++; else { - NonZeros |= (1 << i); + assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. + NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } @@ -5919,7 +6401,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); - EVT VecVT = MVT::v4i32; + MVT VecVT = MVT::v4i32; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). @@ -6051,7 +6533,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, - Op.getOperand(Idx)); + Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); @@ -6059,13 +6541,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) - if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; if (EVTBits == 16 && NumElems == 8) - if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS @@ -6077,7 +6559,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -6177,7 +6659,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. 
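
The NonZeros widening in the hunk above matters once build_vectors reach 64 lanes (v64i8 with AVX-512BW): a 32-bit `1 << i` is undefined for i >= 32, so both the accumulator and the shifted constant must be 64-bit. A minimal sketch of the safe form (plain C++; markNonZeroLane is an illustrative name):

#include <cassert>
#include <cstdint>

// Record that build_vector lane i is nonzero; valid for i in [0, 63].
uint64_t markNonZeroLane(uint64_t NonZeros, unsigned i) {
  assert(i < sizeof(NonZeros) * 8 && "shift out of range");
  return NonZeros | (uint64_t(1) << i); // a plain 1 << i breaks for i >= 32
}

int main() {
  uint64_t NonZeros = markNonZeroLane(0, 40); // fine with a 64-bit mask
  assert(NonZeros == (uint64_t(1) << 40));
  return 0;
}
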
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -6193,8 +6675,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), @@ -6213,8 +6695,27 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); + SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), + // Specialize the cases when all, or all but one, of the operands are undef. + unsigned NumOfDefinedOps = 0; + unsigned OpIdx = 0; + for (unsigned i = 0; i < NumOfOperands; i++) + if (!Op.getOperand(i).isUndef()) { + NumOfDefinedOps++; + OpIdx = i; + } + if (NumOfDefinedOps == 0) + return Undef; + if (NumOfDefinedOps == 1) { + unsigned SubVecNumElts = + Op.getOperand(OpIdx).getValueType().getVectorNumElements(); + SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, + Op.getOperand(OpIdx), IdxVal); + } + + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SmallVector<SDValue, 2> Ops; for (unsigned i = 0; i < NumOfOperands/2; i++) @@ -6227,31 +6728,38 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } + // 2 operands SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + assert(V1.getValueType() == V2.getValueType() && + V1.getValueType().getVectorNumElements() == NumElems/2 && + "Unexpected operands in CONCAT_VECTORS"); + + if (ResVT.getSizeInBits() >= 16) + return Op; // The operation is legal with KUNPCK + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); - + SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); if (IsZeroV1 && IsZeroV2) - return getZeroVector(ResVT, Subtarget, DAG, dl); + return ZeroVec; SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(ResVT); - unsigned NumElems = ResVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); + if (V2.isUndef()) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + if (IsZeroV2) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); + + SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); + if (V1.isUndef()) + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); - V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); - V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits); if (IsZeroV1) - return V2; + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); - // Zero the upper bits of V1 - V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); - V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); - if (IsZeroV2) - return V1; - return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); + return 
DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); } static SDValue LowerCONCAT_VECTORS(SDValue Op, @@ -6272,7 +6780,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, return LowerAVXCONCAT_VECTORS(Op, DAG); } - //===----------------------------------------------------------------------===// // Vector shuffle lowering // @@ -6422,6 +6929,127 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, return DAG.getConstant(Imm, DL, MVT::i8); } +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. + SDValue V = M < Size ? V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +// X86 has dedicated unpack instructions that can handle specific blend +// operations: UNPCKH and UNPCKL. +static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 8> Unpckl; + SmallVector<int, 8> Unpckh; + + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); + int HiPos = LoPos + NumEltsInLane / 2; + Unpckl.push_back(LoPos); + Unpckh.push_back(HiPos); + } + + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + + // Commute and try again. + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); + + return SDValue(); +} + +/// \brief Try to emit a bitmask instruction for a shuffle. +/// +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. 
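
For a standalone picture of what computeZeroableShuffleElements (moved ahead of its users in this diff) computes: a lane is "zeroable" when its mask entry is undef or when it selects from an all-zeros input. A simplified plain-C++ sketch over flat arrays, omitting the build_vector digging the real code also performs (names illustrative):

#include <vector>

// Mask entries < 0 are undef, [0, Size) picks V1, [Size, 2*Size) picks V2.
std::vector<bool> computeZeroable(const std::vector<int> &Mask,
                                  bool V1IsZero, bool V2IsZero) {
  int Size = (int)Mask.size();
  std::vector<bool> Zeroable(Mask.size(), false);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0 || (M < Size && V1IsZero) || (M >= Size && V2IsZero))
      Zeroable[i] = true;
  }
  return Zeroable;
}

// e.g. Mask <0, -1, 5, 7> with V2 all-zeros yields {0, 1, 1, 1}.
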
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, DL, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getBitcast(EltVT, Zero); + AllOnes = DAG.getBitcast(EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} + /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are @@ -6431,7 +7059,7 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); - MVT EltVT = VT.getScalarType(); + MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, @@ -6458,22 +7086,62 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is in fact a blend. +/// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, + SDValue V2, ArrayRef<int> Original, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector<int, 8> Mask(Original.begin(), Original.end()); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + bool ForceV1Zero = false, ForceV2Zero = false; + + // Attempt to generate the binary blend mask. If an input is zero then + // we can use any lane. + // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] >= Size) { - if (Mask[i] != i + Size) - return SDValue(); // Shuffled V2 input! + int M = Mask[i]; + if (M < 0) + continue; + if (M == i) + continue; + if (M == i + Size) { BlendMask |= 1u << i; continue; } - if (Mask[i] >= 0 && Mask[i] != i) - return SDValue(); // Shuffled V1 input! 
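
The bitmask lowering added above models a blend-with-zero as a single AND: every surviving lane must come from one input at its identity position, and the built mask vector is all-ones in exactly those lanes. A scalar model of the emitted operation (plain C++; the real code emits one vector ISD::AND or X86ISD::FAND):

#include <cassert>
#include <cstdint>

int main() {
  // v4i32 input lanes and the AND-mask the lowering would build:
  // all-ones where the lane survives, zero where the shuffle needs a zero.
  uint32_t In[4]    = {11, 22, 33, 44};
  uint32_t VMask[4] = {0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0};

  uint32_t Out[4];
  for (int i = 0; i < 4; ++i)
    Out[i] = In[i] & VMask[i]; // a single VPAND in the real lowering

  assert(Out[0] == 11 && Out[1] == 0 && Out[2] == 33 && Out[3] == 0);
  return 0;
}
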
+ if (Zeroable[i]) { + if (V1IsZero) { + ForceV1Zero = true; + Mask[i] = i; + continue; + } + if (V2IsZero) { + ForceV2Zero = true; + BlendMask |= 1u << i; + Mask[i] = i + Size; + continue; + } + } + return SDValue(); // Shuffled input! } + + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + + auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { + unsigned ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1u << i)) + for (int j = 0; j != Scale; ++j) + ScaledMask |= 1u << (i * Scale + j); + return ScaledMask; + }; + switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: @@ -6493,12 +7161,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, if (Subtarget->hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); @@ -6511,12 +7174,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, @@ -6541,9 +7199,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { - assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && + assert((VT.is128BitVector() || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -6760,11 +7422,11 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, Hi = DAG.getBitcast(AlignVT, Hi); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, + VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } - assert(VT.getSizeInBits() == 128 && + assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); @@ -6785,92 +7447,6 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } -/// \brief Compute whether each element of a shuffle is zeroable. -/// -/// A "zeroable" vector shuffle element is one which can be lowered to zero. -/// Either it is an undef element in the shuffle mask, the element of the input -/// referenced is undef, or the element of the input referenced is known to be -/// zero. 
Many x86 shuffles can zero lanes cheaply and we often want to handle -/// as many lanes with this technique as possible to simplify the remaining -/// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); - - while (V1.getOpcode() == ISD::BITCAST) - V1 = V1->getOperand(0); - while (V2.getOpcode() == ISD::BITCAST) - V2 = V2->getOperand(0); - - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - // Handle the easy cases. - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; - continue; - } - - // If this is an index into a build_vector node (which has the same number - // of elements), dig out the input value and use it. - SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) - continue; - - SDValue Input = V.getOperand(M % Size); - // The UNDEF opcode check really should be dead code here, but not quite - // worth asserting on (it isn't invalid, just unexpected). - if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) - Zeroable[i] = true; - } - - return Zeroable; -} - -/// \brief Try to emit a bitmask instruction for a shuffle. -/// -/// This handles cases where we can model a blend exactly as a bitmask due to -/// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { - MVT EltVT = VT.getScalarType(); - int NumEltBits = EltVT.getSizeInBits(); - MVT IntEltVT = MVT::getIntegerVT(NumEltBits); - SDValue Zero = DAG.getConstant(0, DL, IntEltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - IntEltVT); - if (EltVT.isFloatingPoint()) { - Zero = DAG.getBitcast(EltVT, Zero); - AllOnes = DAG.getBitcast(EltVT, AllOnes); - } - SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - SDValue V; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Zeroable[i]) - continue; - if (Mask[i] % Size != i) - return SDValue(); // Not a blend. - if (!V) - V = Mask[i] < Size ? V1 : V2; - else if (V != (Mask[i] < Size ? V1 : V2)) - return SDValue(); // Can only let one input through the mask. - - VMaskOps[i] = AllOnes; - } - if (!V) - return SDValue(); // No non-zeroable elements! - - SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); - V = DAG.getNode(VT.isFloatingPoint() - ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, - DL, VT, V, VMask); - return V; -} - /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -6982,7 +7558,7 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; - for (; Len >= 0; --Len) + for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); @@ -6997,8 +7573,9 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, SDValue &V = (M < Size ? V1 : V2); M = M % Size; - // All mask elements must be in the lower half. 
-      if (M > HalfSize)
+      // The extracted elements must start at a valid index and all mask
+      // elements must be in the lower half.
+      if (i > M || M >= HalfSize)
         return SDValue();
 
       if (Idx < 0 || (Src == V && Idx == (M - i))) {
@@ -7095,64 +7672,104 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
 ///
 /// Given a specific number of elements, element bit width, and extension
 /// stride, produce either a zero or any extension based on the available
-/// features of the subtarget.
+/// features of the subtarget. The extended elements are consecutive and
+/// can start from a nonzero element offset in the input; to avoid excess
+/// shuffling, the offset must either be in the bottom lane or at the start
+/// of a higher lane. All extended elements must be from the same lane.
 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
-    SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
+    SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
     ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   assert(Scale > 1 && "Need a scale to extend.");
-  int NumElements = VT.getVectorNumElements();
   int EltBits = VT.getScalarSizeInBits();
+  int NumElements = VT.getVectorNumElements();
+  int NumEltsPerLane = 128 / EltBits;
+  int OffsetLane = Offset / NumEltsPerLane;
   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
          "Only 8, 16, and 32 bit elements can be extended.");
   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+  assert(0 <= Offset && "Extension offset must be non-negative.");
+  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+         "Extension offset must be in the first lane or start an upper lane.");
+
+  // Check that an index is in the same lane as the base offset.
+  auto SafeOffset = [&](int Idx) {
+    return OffsetLane == (Idx / NumEltsPerLane);
+  };
+
+  // Shift along an input so that the offset base moves to the first element.
+  auto ShuffleOffset = [&](SDValue V) {
+    if (!Offset)
+      return V;
+
+    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+    for (int i = 0; i * Scale < NumElements; ++i) {
+      int SrcIdx = i + Offset;
+      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+    }
+    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+  };
 
   // Found a valid zext mask! Try various lowering strategies based on the
   // input type and available ISA extensions.
   if (Subtarget->hasSSE41()) {
+    // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
+    // PUNPCK will catch this in a later shuffle match.
+    if (Offset && Scale == 2 && VT.is128BitVector())
+      return SDValue();
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
-    return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+    return DAG.getBitcast(VT, InputV);
  }
 
+  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
   // For any extends we can cheat for larger element sizes and use shuffle
   // instructions that can fold with a load and/or copy.
   if (AnyExt && EltBits == 32) {
-    int PSHUFDMask[4] = {0, -1, 1, -1};
+    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ?
Offset + 1 : -1, + -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { - int PSHUFDMask[4] = {0, -1, 0, -1}; + int PSHUFDMask[4] = {Offset / 2, -1, + SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); - int PSHUFHWMask[4] = {1, -1, -1, -1}; + int PSHUFWMask[4] = {1, -1, -1, -1}; + unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, + VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), - getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); + getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); - assert(VT.getSizeInBits() == 128 && "Unexpected vector width!"); + assert(VT.is128BitVector() && "Unexpected vector width!"); + int LoIdx = Offset * EltBits; SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(0, DL, MVT::i8))); - if (isUndefInRange(Mask, NumElements/2, NumElements/2)) + DAG.getConstant(LoIdx, DL, MVT::i8))); + + if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || + !SafeOffset(Offset + 1)) return DAG.getNode(ISD::BITCAST, DL, VT, Lo); - SDValue Hi = - DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(EltBits, DL, MVT::i8))); + int HiIdx = (Offset + 1) * EltBits; + SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } @@ -7163,9 +7780,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; - for (int i = 0; i < 16; ++i) - PSHUFBMask[i] = - DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8); + for (int i = 0; i < 16; ++i) { + int Idx = Offset + (i / Scale); + PSHUFBMask[i] = DAG.getConstant( + (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast(VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, @@ -7173,13 +7792,30 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT::v16i8, PSHUFBMask))); } + // If we are extending from an offset, ensure we start on a boundary that + // we can unpack from. + int AlignToUnpack = Offset % (NumElements / Scale); + if (AlignToUnpack) { + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = AlignToUnpack; i < NumElements; ++i) + ShMask[i - AlignToUnpack] = i; + InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); + Offset -= AlignToUnpack; + } + // Otherwise emit a sequence of unpacks. 
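
The PSHUFB path in this hunk builds a byte-select control where every byte whose index is not a multiple of Scale gets 0x80, which PSHUFB turns into a zero byte. A standalone sketch of the control construction (plain C++, mirroring the loop above but omitting the SafeOffset clamp the real code applies to out-of-lane indices; buildZExtPshufbMask is an illustrative name):

#include <cstdint>
#include <vector>

// Byte i selects source byte Offset + i/Scale when i is a multiple of
// Scale, else 0x80 (PSHUFB zero-fill).
std::vector<uint8_t> buildZExtPshufbMask(int Scale, int Offset) {
  std::vector<uint8_t> M(16);
  for (int i = 0; i < 16; ++i) {
    int Idx = Offset + (i / Scale);
    M[i] = (i % Scale == 0) ? (uint8_t)Idx : 0x80;
  }
  return M;
}

// Scale == 4, Offset == 0 zero-extends the low four bytes:
// {0, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80, 2, ...}.
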
do { + unsigned UnpackLoHi = X86ISD::UNPCKL; + if (Offset >= (NumElements / 2)) { + UnpackLoHi = X86ISD::UNPCKH; + Offset -= (NumElements / 2); + } + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); - InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); + InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; @@ -7205,7 +7841,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); + int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); @@ -7215,8 +7853,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; + int Offset = 0; + int Matches = 0; for (int i = 0; i < NumElements; ++i) { - if (Mask[i] == -1) + int M = Mask[i]; + if (M == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. @@ -7230,14 +7871,29 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( // Each of the base elements needs to be consecutive indices into the // same input vector. - SDValue V = Mask[i] < NumElements ? V1 : V2; - if (!InputV) + SDValue V = M < NumElements ? V1 : V2; + M = M % NumElements; + if (!InputV) { InputV = V; - else if (InputV != V) + Offset = M - (i / Scale); + } else if (InputV != V) return SDValue(); // Flip-flopping inputs. - if (Mask[i] % NumElements != i / Scale) + // Offset must start in the lowest 128-bit lane or at the start of an + // upper lane. + // FIXME: Is it ever worth allowing a negative base offset? + if (!((0 <= Offset && Offset < NumEltsPerLane) || + (Offset % NumEltsPerLane) == 0)) + return SDValue(); + + // If we are offsetting, all referenced entries must come from the same + // lane. + if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) + return SDValue(); + + if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. + Matches++; } // If we fail to find an input, we have a zero-shuffle which should always @@ -7246,8 +7902,13 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (!InputV) return SDValue(); + // If we are offsetting, don't extend if we only match a single input, we + // can always do better by using a basic PSHUF or PUNPCK. + if (Offset != 0 && Matches < 2) + return SDValue(); + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG); + DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -7355,8 +8016,9 @@ static SDValue lowerVectorShuffleAsElementInsertion( // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. 
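
The do/while loop in the hunk above performs the extension in log2(Scale) rounds, interleaving with a zero (or undef) vector and doubling the element width each time. A plain C++ model at byte granularity, which coincides with the word-level second step here because the interleaved filler bytes are zero (unpckl is an illustrative stand-in for UNPCKL):

#include <cassert>
#include <cstdint>
#include <vector>

// Model of one UNPCKL step: interleave the low halves of A and B.
std::vector<uint8_t> unpckl(const std::vector<uint8_t> &A,
                            const std::vector<uint8_t> &B) {
  std::vector<uint8_t> R;
  for (size_t i = 0; i < A.size() / 2; ++i) {
    R.push_back(A[i]);
    R.push_back(B[i]);
  }
  return R;
}

int main() {
  std::vector<uint8_t> V = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<uint8_t> Zero(8, 0);

  std::vector<uint8_t> R1 = unpckl(V, Zero);  // 1,0,2,0,3,0,4,0
  std::vector<uint8_t> R2 = unpckl(R1, Zero); // 1,0,0,0,2,0,0,0
  // Little-endian, each original byte now reads as a zero-extended element.
  assert(R2[0] == 1 && R2[1] == 0 && R2[3] == 0 && R2[4] == 2);
  return 0;
}
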
- if (SDValue V2S = getScalarValueForVectorElement( - V2, Mask[V2Index] - Mask.size(), DAG)) { + SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), + DAG); + if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { @@ -7431,11 +8093,65 @@ static SDValue lowerVectorShuffleAsElementInsertion( return V2; } +/// \brief Try to lower broadcast of a single - truncated - integer element, +/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. +/// +/// This assumes we have AVX2. +static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX2() && + "We can only lower integer broadcasts with AVX2!"); + + EVT EltVT = VT.getVectorElementType(); + EVT V0VT = V0.getValueType(); + + assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); + assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); + + EVT V0EltVT = V0VT.getVectorElementType(); + if (!V0EltVT.isInteger()) + return SDValue(); + + const unsigned EltSize = EltVT.getSizeInBits(); + const unsigned V0EltSize = V0EltVT.getSizeInBits(); + + // This is only a truncation if the original element type is larger. + if (V0EltSize <= EltSize) + return SDValue(); + + assert(((V0EltSize % EltSize) == 0) && + "Scalar type sizes must all be powers of 2 on x86!"); + + const unsigned V0Opc = V0.getOpcode(); + const unsigned Scale = V0EltSize / EltSize; + const unsigned V0BroadcastIdx = BroadcastIdx / Scale; + + if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && + V0Opc != ISD::BUILD_VECTOR) + return SDValue(); + + SDValue Scalar = V0.getOperand(V0BroadcastIdx); + + // If we're extracting non-least-significant bits, shift so we can truncate. + // Hopefully, we can fold away the trunc/srl/load into the broadcast. + // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer + // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. + if (const int OffsetIdx = BroadcastIdx % Scale) + Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, + DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); +} + /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef<int> Mask, const X86Subtarget *Subtarget, @@ -7476,7 +8192,7 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = - BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; @@ -7491,6 +8207,22 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. 
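
lowerVectorShuffleAsTruncBroadcast above turns "broadcast sub-element k of a wider scalar" into a shift-right, truncate, and broadcast. The scalar arithmetic in isolation (plain C++, for an 8-bit sub-element of a 32-bit value; extractByte is an illustrative name):

#include <cassert>
#include <cstdint>

// The SRL + TRUNCATE pair emitted before the VBROADCAST: sub-element
// OffsetIdx of width 8 within a 32-bit scalar (x86 lane order).
uint8_t extractByte(uint32_t Scalar, unsigned OffsetIdx) {
  return (uint8_t)(Scalar >> (OffsetIdx * 8));
}

int main() {
  uint32_t S = 0xDDCCBBAA;
  assert(extractByte(S, 0) == 0xAA); // BroadcastIdx % Scale == 0: no shift
  assert(extractByte(S, 2) == 0xCC); // otherwise: shift, then truncate
  return 0;
}
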
+ // First, look through bitcast: if the original value has a larger element + // type than the shuffle, the broadcast element is in essence truncated. + // Make that explicit to ease folding. + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) + if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( + DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + return TruncBroadcast; + + MVT BroadcastVT = VT; + + // Peek through any bitcast (only useful for loads). + SDValue BC = V; + while (BC.getOpcode() == ISD::BITCAST) + BC = BC.getOperand(0); + + // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); @@ -7499,13 +8231,32 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); + } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { + // 32-bit targets need to load i64 as a f64 and then bitcast the result. + if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64) + BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); + + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. + LoadSDNode *Ld = cast<LoadSDNode>(BC); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = BroadcastVT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } - return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); + V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); + return DAG.getBitcast(VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -7595,9 +8346,10 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(!isSingleInputShuffleMask(Mask) && @@ -7774,10 +8526,9 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. 
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+    return V;
 
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
@@ -7869,10 +8620,9 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Blend;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+    return V;
 
   // Try to use byte rotation instructions.
   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8077,14 +8827,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
-  if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+    return V;
 
   // Otherwise fall back to a SHUFPS lowering strategy.
   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
@@ -8161,14 +8906,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
-  if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+    return V;
 
   // Try to use byte rotation instructions.
   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
@@ -8184,8 +8924,8 @@
                                                     Mask, DAG);
 
   // Try to lower by permuting the inputs into an unpack instruction.
-  if (SDValue Unpack =
-          lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
+  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
+                                                            V2, Mask, DAG))
     return Unpack;
 
   // We implement this with SHUFPS because it can blend from two vectors.
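
All of the hand-written unpack patterns deleted in these hunks (and the analogous ones that follow) are produced from a single formula by the lowerVectorShuffleWithUNPCK helper added earlier in this diff. A standalone reimplementation of just the mask computation, with the expected results asserted for two representative types (plain C++):

#include <cassert>
#include <vector>

// Per-128-bit-lane interleave masks, as in lowerVectorShuffleWithUNPCK.
void makeUnpackMasks(int NumElts, int NumEltsInLane,
                     std::vector<int> &Unpckl, std::vector<int> &Unpckh) {
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
    Unpckl.push_back(LoPos);
    Unpckh.push_back(LoPos + NumEltsInLane / 2);
  }
}

int main() {
  std::vector<int> Lo, Hi;
  makeUnpackMasks(4, 4, Lo, Hi); // v4i32 / v4f32
  assert((Lo == std::vector<int>{0, 4, 1, 5}));
  assert((Hi == std::vector<int>{2, 6, 3, 7}));

  Lo.clear(); Hi.clear();
  makeUnpackMasks(8, 4, Lo, Hi); // v8i32: interleave within each lane
  assert((Lo == std::vector<int>{0, 8, 1, 9, 4, 12, 5, 13}));
  return 0;
}
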
@@ -8218,7 +8958,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
     SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
-  assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
 
   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
@@ -8286,16 +9026,18 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
 
+    bool ThreeAInputs = AToAInputs.size() == 3;
+
     // Compute the index of dword with only one word among the three inputs in
     // a half by taking the sum of the half with three inputs and subtracting
     // the sum of the actual three inputs. The difference is the remaining
     // slot.
     int ADWord, BDWord;
-    int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
-    int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
-    int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
-    ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
-    int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
+    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
+    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
+    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
+    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
+    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
     int TripleNonInputIdx =
         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
@@ -8364,8 +9106,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
       FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
     } else {
       assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
-      int APinnedIdx =
-          AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+      int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
       FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
     }
   }
@@ -8751,10 +9492,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       return Shift;
 
     // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
-      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
-    if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
-      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+      return V;
 
     // Try to use byte rotation instructions.
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
@@ -8798,10 +9538,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+    return V;
 
   // Try to use byte rotation instructions.
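
The ThreeAInputs refactor above keeps the "sum trick" used to locate the one word slot not occupied by the three inputs: the word indices of a half sum to a known constant, so subtracting the three used indices leaves the free one. In isolation (plain C++, simplified to a half at offset 0):

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  // The four word indices of the low half sum to 0 + 1 + 2 + 3 == 6.
  std::vector<int> TripleInputs = {0, 1, 3}; // the three occupied slots
  int TripleInputSum = 0 + 1 + 2 + 3;
  int FreeSlot = TripleInputSum - std::accumulate(TripleInputs.begin(),
                                                  TripleInputs.end(), 0);
  assert(FreeSlot == 2);
  return 0;
}
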
if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -8812,8 +9551,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, + V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it @@ -9037,17 +9776,14 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; } + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 0, 16, 1, 17, 2, 18, 3, 19, - // High half. - 4, 20, 5, 21, 6, 22, 7, 23})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 8, 24, 9, 25, 10, 26, 11, 27, - // High half. - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -9086,8 +9822,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; } @@ -9296,7 +10032,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; - MVT ScalarVT = VT.getScalarType(); + MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build @@ -9308,7 +10044,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; - MVT OrigScalarVT = OrigVT.getScalarType(); + MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; @@ -9478,7 +10214,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, ArrayRef<int> Mask, SelectionDAG &DAG) { // FIXME: This should probably be generalized for 512-bit vectors as well. - assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be @@ -9682,6 +10418,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } +/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. 
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + + bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); + bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); + if (!UndefLower && !UndefUpper) + return SDValue(); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (UndefUpper && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (UndefLower && Subtarget->hasAVX2() && + (VT == MVT::v4f64 || VT == MVT::v4i64)) + return SDValue(); + + // If the shuffle only uses the lower halves of the input operands, + // then extract them and perform the 'half' shuffle at half width. + // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> + int HalfIdx1 = -1, HalfIdx2 = -1; + SmallVector<int, 8> HalfMask; + unsigned Offset = UndefLower ? HalfNumElts : 0; + for (unsigned i = 0; i != HalfNumElts; ++i) { + int M = Mask[i + Offset]; + if (M < 0) { + HalfMask.push_back(M); + continue; + } + + // Determine which of the 4 half vectors this element is from. + // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. + int HalfIdx = M / HalfNumElts; + + // Only shuffle using the lower halves of the inputs. + // TODO: Investigate usefulness of shuffling with upper halves. + if (HalfIdx != 0 && HalfIdx != 2) + return SDValue(); + + // Determine the element index into its half vector source. + int HalfElt = M % HalfNumElts; + + // We can shuffle with up to 2 half vectors, set the new 'half' + // shuffle mask accordingly. + if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { + HalfMask.push_back(HalfElt); + HalfIdx1 = HalfIdx; + continue; + } + if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { + HalfMask.push_back(HalfElt + HalfNumElts); + HalfIdx2 = HalfIdx; + continue; + } + + // Too many half vectors referenced. + return SDValue(); + } + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + auto GetHalfVector = [&](int HalfIdx) { + if (HalfIdx < 0) + return DAG.getUNDEF(HalfVT); + SDValue V = (HalfIdx < 2 ? 
V1 : V2); + HalfIdx = (HalfIdx % 2) * HalfNumElts; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, + DAG.getIntPtrConstant(HalfIdx, DL)); + }; + + SDValue Half1 = GetHalfVector(HalfIdx1); + SDValue Half2 = GetHalfVector(HalfIdx2); + SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, + DAG.getIntPtrConstant(Offset, DL)); +} + /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -9776,16 +10614,10 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG); } - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) @@ -9876,14 +10708,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -9941,14 +10768,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. 
We also need to squash the @@ -9974,9 +10796,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (Subtarget->hasAVX2()) return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, - MVT::v8i32, VPermMask)), - V1); + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -10041,14 +10861,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; } // Try to use shift instructions. @@ -10115,18 +10930,9 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10215,22 +11021,9 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - // Note that these are repeated 128-bit lane unpacks, not unpacks across all - // 256-bit lanes. - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10296,12 +11089,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, VT, V1, V2, Mask, Subtarget, DAG)) return Insertion; - // There is a really nice hard cut-over between AVX1 and AVX2 that means we can - // check for those subtargets here and avoid much of the subtarget querying in - // the per-vector-type lowering routines. 
With AVX1 we have essentially *zero* - // ability to manipulate a 256-bit vector with integer types. Since we'll use - // floating point types there eventually, just immediately cast everything to - // a float and operate entirely in that domain. + // Handle special cases where the lower or upper half is UNDEF. + if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we + // can check for those subtargets here and avoid much of the subtarget + // querying in the per-vector-type lowering routines. With AVX1 we have + // essentially *zero* ability to manipulate a 256-bit vector with integer + // types. Since we'll use floating point types there eventually, just + // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget->hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) @@ -10334,6 +11132,57 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } +/// \brief Try to lower a vector shuffle as 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, + ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + assert(VT.getScalarSizeInBits() == 64 && + "Unexpected element type size for 128bit shuffle."); + + // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() + // is most probably a better solution for that case. + assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); + + SmallVector<int, 4> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + + // Form a 128-bit permutation. + // Convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + unsigned ControlBitsNum = WidenedMask.size() / 2; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // Use first element in place of undef mask. + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; + PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -10345,21 +11194,21 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL.
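As an aside on the encoding in lowerV4X128VectorShuffle above: the loop packs one 2-bit source-lane selector per destination 128-bit lane into VSHUF64X2's immediate control byte. A standalone C++ sketch of the same packing, assuming sentinel values modeled on SM_SentinelUndef/SM_SentinelZero (everything here is illustrative, not the SelectionDAG code):

    #include <cstdio>
    #include <vector>

    // Sentinels modeled on LLVM's SM_SentinelUndef / SM_SentinelZero.
    constexpr int kUndef = -1;
    constexpr int kZero = -2;

    // Pack a widened 4-lane mask into a VSHUF64X2-style immediate:
    // two bits per destination lane select the source 128-bit lane.
    // Returns false for zeroed lanes, which the immediate cannot express.
    bool encodeShuf128Imm(const std::vector<int> &WidenedMask, unsigned &Imm) {
      Imm = 0;
      unsigned ControlBits = WidenedMask.size() / 2; // 2 bits for 4 lanes
      for (unsigned i = 0; i < WidenedMask.size(); ++i) {
        if (WidenedMask[i] == kZero)
          return false;
        // An undef lane may take any source; use lane 0, as the code above does.
        unsigned Lane = (WidenedMask[i] == kUndef) ? 0 : WidenedMask[i];
        Imm |= (Lane % WidenedMask.size()) << (i * ControlBits);
      }
      return true;
    }

    int main() {
      unsigned Imm;
      if (encodeShuf128Imm({1, 0, 3, 2}, Imm)) // swap adjacent lane pairs
        std::printf("imm = 0x%02x\n", Imm);    // prints imm = 0xb1
      return 0;
    }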
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); @@ -10367,22 +11216,11 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. @@ -10396,21 +11234,21 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); @@ -10418,22 +11256,11 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. @@ -10448,8 +11275,7 @@ static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. @@ -10517,6 +11343,60 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } +// Lower vXi1 vector shuffles. +// There is no dedicated instruction on AVX-512 that shuffles the masks. +// The only way to shuffle bits is to sign-extend the mask vector to a SIMD +// vector, shuffle, and then truncate it back.
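That comment is the whole strategy behind lower1BitVectorShuffle, which follows: widen the i1 mask to an ordinary SIMD vector, shuffle there, and truncate back to bits. A scalar C++ model of that round-trip on an 8-bit mask (the helper names are invented for illustration):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Widen: bit i of the mask becomes byte i (0xFF / 0x00, i.e. sign-extended).
    std::array<uint8_t, 8> widenMaskToBytes(uint8_t Mask) {
      std::array<uint8_t, 8> Bytes{};
      for (int i = 0; i < 8; ++i)
        Bytes[i] = ((Mask >> i) & 1) ? 0xFF : 0x00;
      return Bytes;
    }

    // Truncate: byte i collapses back to bit i of the mask.
    uint8_t truncateBytesToMask(const std::array<uint8_t, 8> &Bytes) {
      uint8_t Mask = 0;
      for (int i = 0; i < 8; ++i)
        Mask |= (Bytes[i] & 1) << i;
      return Mask;
    }

    int main() {
      uint8_t Mask = 0x0D;                       // v8i1 value <1,0,1,1,0,0,0,0>
      int Shuffle[8] = {7, 6, 5, 4, 3, 2, 1, 0}; // reverse the elements
      std::array<uint8_t, 8> Wide = widenMaskToBytes(Mask), Out{};
      for (int i = 0; i < 8; ++i)
        Out[i] = Wide[Shuffle[i]];
      std::printf("0x%02x\n", (unsigned)truncateBytesToMask(Out)); // prints 0xb0
      return 0;
    }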
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + MVT ExtVT; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Expected a vector of i1 elements"); + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + + if (ISD::isBuildVectorAllZeros(V1.getNode())) + V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V1.getNode())) + V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + + if (V2.isUndef()) + V2 = DAG.getUNDEF(ExtVT); + else if (ISD::isBuildVectorAllZeros(V2.getNode())) + V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V2.getNode())) + V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -10533,8 +11413,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); + bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + assert((VT.getSizeInBits() != 64 || Is1BitVector) && + "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -10572,7 +11454,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector<int, 16> WidenedMask; - if (VT.getScalarSizeInBits() < 64 && + if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) @@ -10640,17 +11522,17 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, } // For each vector width, delegate to a specialized lowering routine. - if (VT.getSizeInBits() == 128) + if (VT.is128BitVector()) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - if (VT.getSizeInBits() == 256) + if (VT.is256BitVector()) return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - // Force AVX-512 vectors to be scalarized for now. - // FIXME: Implement AVX-512 support! 
- if (VT.getSizeInBits() == 512) + if (VT.is512BitVector()) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (Is1BitVector) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } @@ -10661,11 +11543,16 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, unsigned &MaskValue) { MaskValue = 0; unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + // We don't handle the >2 lanes case right now. unsigned NumLanes = (NumElems - 1) / 8 + 1; + if (NumLanes > 2) + return false; + unsigned NumElemsInLane = NumElems / NumLanes; - // Blend for v16i16 should be symetric for the both lanes. + // Blend for v16i16 should be symmetric for both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { SDValue EltCond = BuildVector->getOperand(i); SDValue SndLaneEltCond = @@ -10673,20 +11560,25 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, int Lane1Cond = -1, Lane2Cond = -1; if (isa<ConstantSDNode>(EltCond)) - Lane1Cond = !isZero(EltCond); + Lane1Cond = !isNullConstant(EltCond); if (isa<ConstantSDNode>(SndLaneEltCond)) - Lane2Cond = !isZero(SndLaneEltCond); + Lane2Cond = !isNullConstant(SndLaneEltCond); + unsigned LaneMask = 0; if (Lane1Cond == Lane2Cond || Lane2Cond < 0) // Lane1Cond != 0, means we want the first argument. // Lane1Cond == 0, means we want the second argument. // The encoding of this argument is 0 for the first argument, 1 // for the second. Therefore, invert the condition. - MaskValue |= !Lane1Cond << i; + LaneMask = !Lane1Cond << i; else if (Lane1Cond < 0) - MaskValue |= !Lane2Cond << i; + LaneMask = !Lane2Cond << i; else return false; + + MaskValue |= LaneMask; + if (NumLanes == 2) + MaskValue |= LaneMask << NumElemsInLane; } return true; } @@ -10711,7 +11603,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( - isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1); + isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) + : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } @@ -10776,9 +11669,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { } if (VT.getSizeInBits() == 16) { - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode( ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, @@ -10801,8 +11693,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || - (isa<ConstantSDNode>(Op.getOperand(1)) && - cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && + isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); @@ -10900,10 +11791,11 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); - //if (IdxVal >= NumElems/2) - // IdxVal -= NumElems/2; - IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; + // Find IdxVal modulo ElemsPerChunk.
Since ElemsPerChunk is a power of 2 + // this can be done with a mask. + IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, dl, MVT::i32)); } @@ -10918,8 +11810,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), @@ -10951,8 +11842,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return Op; // UNPCKHPD the element to the lowest double word, then movsd. @@ -11039,7 +11929,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); - unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; + assert(isPowerOf2_32(NumEltsIn128)); + // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. + unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); @@ -11078,8 +11970,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. - const Function *F = DAG.getMachineFunction().getFunction(); - bool MinSize = F->hasFnAttribute(Attribute::MinSize); + bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -11199,14 +12090,25 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // --> load32 addr if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.is256BitVector() && SubVecVT.is128BitVector() && - !Subtarget->isUnalignedMem32Slow()) { - SDValue SubVec2 = Vec.getOperand(1); - if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { - if (Idx2->getZExtValue() == 0) { - SDValue Ops[] = { SubVec2, SubVec }; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; + OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. 
+ if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + + if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } } } } @@ -11218,37 +12120,9 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - if (OpVT.getVectorElementType() == MVT::i1) { - if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal - return Op; - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(OpVT); - unsigned NumElems = OpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); - - if (IdxVal == OpVT.getVectorNumElements() / 2) { - // Zero upper bits of the Vec - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - if (IdxVal == 0) { - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - // Zero upper bits of the Vec2 - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); - // Zero lower bits of the Vec - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - // Merge them together - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - } + if (OpVT.getVectorElementType() == MVT::i1) + return Insert1BitVector(Op, DAG); + return SDValue(); } @@ -11363,7 +12237,8 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -11430,7 +12305,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -11587,7 +12463,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // The address of the thread local variable is the add of the thread @@ -11599,10 +12476,18 @@ SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + // Cygwin uses emutls. 
+ // FIXME: It may be EmulatedTLS-generic also for X86-Android. + if (Subtarget->isTargetWindowsCygwin()) + return LowerToTLSEmulatedModel(GA, DAG); + const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Subtarget->isTargetELF()) { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -11651,8 +12536,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); + Chain = + DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), + DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -11825,15 +12714,23 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } + SDValue ValueToStore = Op.getOperand(0); + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + SDValue Chain = DAG.getStore( + DAG.getEntryNode(), dl, ValueToStore, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, + false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } @@ -11855,10 +12752,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); - MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); @@ -11884,16 +12780,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, SSFISize, SSFISize); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); - Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, 
false, false, 0); + Result = DAG.getLoad( + Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, false, 0); } return Result; @@ -11937,16 +12833,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); - SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod0 = + DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); - SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod1 = + DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; @@ -11996,10 +12895,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. - EVT DestVT = Op.getValueType(); + MVT DestVT = Op.getSimpleValueType(); if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, @@ -12025,14 +12925,23 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; + // We shouldn't use it when unsafe-fp-math is enabled though: we might later + // reassociate the two FADDs, and if we do that, the algorithm fails + // spectacularly (PR24512). + // FIXME: If we ever have some kind of Machine FMF, this should be marked + // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because + // there's also the MachineCombiner reassociations happening on Machine IR. + if (DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + SDLoc DL(Op); SDValue V = Op->getOperand(0); - EVT VecIntVT = V.getValueType(); + MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; - EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. - if (VecFloatVT != Op->getValueType(0)) + if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); unsigned NumElts = VecIntVT.getVectorNumElements(); @@ -12070,7 +12979,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, SDValue Low, High; if (Subtarget.hasSSE41()) { - EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + MVT VecI16VT = Is128 ? 
MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); @@ -12108,6 +13017,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); + // TODO: Are there any fast-math-flags to propagate here? SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; @@ -12137,11 +13047,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); case MVT::v16i8: case MVT::v16i16: - if (Subtarget->hasAVX512()) - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); + assert(Subtarget->hasAVX512()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } - llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -12150,7 +13059,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (Op.getValueType().isVector()) + if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't @@ -12161,6 +13070,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + + if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && + (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { + // Conversions from unsigned i32 to f32/f64 are legal, + // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. + return Op; + } + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) @@ -12184,7 +13101,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + SDValue ValueToStore = Op.getOperand(0); + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input @@ -12193,10 +13116,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) 
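For reference, the vXi32 algorithm quoted in the comments a few hunks above can be checked in scalar form. This C++ sketch mirrors the magic constants (0x4b000000 is 2^23 as a float, 0x53000000 is 2^39) and keeps the two FADDs in the order the PR24512 comment warns must not be reassociated; it is a model of the idea, not the vectorized lowering:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Scalar model of the uint32 -> float trick: place the low and high
    // 16-bit halves into the mantissas of two biased floats, strip the
    // biases from the high part, then add. Exact until the final FADD.
    float uint32ToFloat(uint32_t V) {
      uint32_t LoBits = (V & 0xFFFFu) | 0x4b000000u; // float 2^23 + lo
      uint32_t HiBits = (V >> 16) | 0x53000000u;     // float 2^39 + hi * 2^16
      float Lo, Hi;
      std::memcpy(&Lo, &LoBits, sizeof(float));
      std::memcpy(&Hi, &HiBits, sizeof(float));
      float FHi = Hi - (0x1.0p39f + 0x1.0p23f); // remove both biases
      return Lo + FHi; // the single rounding step; must not be reassociated
    }

    int main() {
      uint32_t V = 0xFFFFFFFFu;
      std::printf("%a vs %a\n", uint32ToFloat(V), (float)V);
      return 0;
    }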
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, 8, 8); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; @@ -12223,24 +13145,52 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? - SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), - FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, false, 4); + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. + // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } +// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation +// is legal, or has an fp128 or f16 source (which needs to be promoted to f32), +// just return an <SDValue(), SDValue()> pair. +// Otherwise it is assumed to be a conversion from one of f32, f64 or f80 +// to i16, i32 or i64, and we lower it to a legal sequence. +// If lowered to the final integer result we return a <result, SDValue()> pair. +// Otherwise we lower it to a sequence ending with a FIST, return a +// <FIST, StackSlot> pair, and the caller is responsible for loading +// the final integer result from StackSlot. std::pair<SDValue,SDValue> -X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { +X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); + EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { + if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return std::make_pair(SDValue(), SDValue()); + } + + // If using FIST to compute an unsigned i64, we'll need some fixup + // to handle values above the maximum signed i64. A FIST is always + // used for the 32-bit subtarget, but also for f80 on a 64-bit target. + bool UnsignedFixup = !IsSigned && + DstTy == MVT::i64 && + (!Subtarget->is64Bit() || + !isScalarFPTypeInSSEReg(TheVT)); + + if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. + // The low 32 bits of the fist result will have the correct uint32 result. 
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } @@ -12258,42 +13208,87 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 either into FISTP64 followed by a load from a temporary - // stack slot, or into the FTOL runtime function. + // We lower FP->int64 into FISTP64 followed by a load from a temporary + // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; - if (!IsSigned && isIntegerTypeFTOL(DstTy)) - Opc = X86ISD::WIN_FTOL; - else - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); - EVT TheVT = Op.getOperand(0).getValueType(); + SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. + + if (UnsignedFixup) { + // + // Conversion to unsigned i64 is implemented with a select, + // depending on whether the source value fits in the range + // of a signed i64. Let Thresh be the FP equivalent of + // 0x8000000000000000ULL. + // + // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Fist-to-mem64 FistSrc + // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent + // to XOR'ing the high 32 bits with Adjust. + // + // Being a power of 2, Thresh is exactly representable in all FP formats. + // For X87 we'd like to use the smallest FP type for this constant, but + // for DAG type consistency we have to match the FP operand type. + + APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; + bool LosesInfo = false; + if (TheVT == MVT::f64) + // The rounding mode is irrelevant as the conversion should be exact. + Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &LosesInfo); + else if (TheVT == MVT::f80) + Status = Thresh.convert(APFloat::x87DoubleExtended, + APFloat::rmNearestTiesToEven, &LosesInfo); + + assert(Status == APFloat::opOK && !LosesInfo && + "FP conversion should have been exact"); + + SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); + + SDValue Cmp = DAG.getSetCC(DL, + getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i32, Cmp, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0x80000000, DL, MVT::i32)); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); + Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + } + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. 
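The Adjust/FistSrc recipe spelled out in the UnsignedFixup comment above reduces to a short scalar identity. A C++ model (a hypothetical helper; a plain cast stands in for the FIST, so truncation semantics are assumed here):

    #include <cstdint>
    #include <cstdio>

    // Only a signed fp -> i64 convert is available, so values >= 2^63 are
    // rebased below the signed range first, and the sign bit is patched
    // back in afterwards (the XOR matches the high-word XOR above).
    uint64_t doubleToUint64ViaSigned(double Value) {
      const double Thresh = 0x1.0p63; // 2^63, exactly representable
      uint64_t Adjust = (Value < Thresh) ? 0 : 0x8000000000000000ULL;
      double FistSrc = (Value < Thresh) ? Value : Value - Thresh;
      return (uint64_t)(int64_t)FistSrc ^ Adjust;
    }

    int main() {
      // 1.5 * 2^63 is outside the signed range but converts correctly:
      std::printf("%llu\n",
                  (unsigned long long)doubleToUint64ViaSigned(0x1.8p63));
      // prints 13835058055282163712
      return 0;
    }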
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + MachinePointerInfo::getFixedStack(MF, SSFI), false, + false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); @@ -12301,28 +13296,52 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, } MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + if (UnsignedFixup) { + + // Insert the FIST, load its result as two i32's, + // and XOR the high i32 with Adjust. + + SDValue FistOps[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + FistOps, DstTy, MMO); + + SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0); + SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, + DAG.getConstant(4, DL, PtrVT)); + + SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, + MachinePointerInfo(), + false, false, false, 0); + High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); - if (Opc != X86ISD::WIN_FTOL) { + if (Subtarget->is64Bit()) { + // Join High32 and Low32 into a 64-bit result. + // (High32 << 32) | Low32 + Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); + High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); + High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, + DAG.getConstant(32, DL, MVT::i8)); + SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); + return std::make_pair(Result, SDValue()); + } + + SDValue ResultOps[] = { Low32, High32 }; + + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) + : DAG.getMergeValues(ResultOps, DL); + return std::make_pair(pair, SDValue()); + } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); - } else { - SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, - DAG.getVTList(MVT::Other, MVT::Glue), - Chain, Value); - SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, - MVT::i32, ftol.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, - MVT::i32, eax.getValue(2)); - SDValue Ops[] = { eax, edx }; - SDValue pair = IsReplace - ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) - : DAG.getMergeValues(Ops, DL); - return std::make_pair(pair, SDValue()); } } @@ -12333,7 +13352,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); - if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1) + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); // Optimize vectors in AVX mode: @@ -12426,6 +13445,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. + unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shifting packed bytes is not supported natively; bitcast to packed words + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. + unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexpected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so the vector should be extended to packed dword/qword.
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -12443,42 +13518,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // move vector to mask - truncate solution for SKX - if (VT.getVectorElementType() == MVT::i1) { - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) - return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVQ2M - } - if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { - if (VT.getVectorElementType().getSizeInBits() >=8) - return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); - - assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - unsigned NumElts = InVT.getVectorNumElements(); - assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); - if (InVT.getSizeInBits() < 512) { - MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); - InVT = ExtVT; - } - - SDValue OneV = - DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); - SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); - return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); - } + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (Subtarget->hasAVX512()) { + // word to byte only under BWI + if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 + return DAG.getNode(X86ISD::VTRUNC, DL, VT, + DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { @@ -12583,7 +13633,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) return Op; + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12600,7 +13651,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; - assert(FIST.getNode() && "Unexpected failure"); + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 
+ if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12643,6 +13696,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. @@ -12650,11 +13705,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { MVT LogicVT; MVT EltVT; unsigned NumElts; - + if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -12675,9 +13735,10 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + SDValue Mask = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); @@ -12685,7 +13746,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - if (VT.isVector()) + if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, @@ -12704,6 +13765,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -12718,13 +13780,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector<Constant *, 4> CV( - VT == MVT::f64 ? 2 : 4, + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). @@ -12737,11 +13802,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? 
MVT::v2f64 : MVT::v4f32; - SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); + SDValue Mask1 = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). @@ -12750,8 +13817,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -12761,18 +13829,21 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue Val = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa<ConstantFPSDNode>(Op0)) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12859,7 +13930,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } - EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) @@ -12999,14 +14070,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { + if (C->isOne() && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. - if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -13135,13 +14206,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, /// equivalent. 
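The constant-pool masks built in the FCOPYSIGN hunks above implement the usual AND/OR sign transfer. The scalar equivalent, for reference (plain bit manipulation, not the SSE constant-pool form):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // copysign via the same masking FCOPYSIGN lowers to: keep every bit
    // of Mag except the sign, take only the sign bit of Sgn, then OR.
    double copysignViaMasks(double Mag, double Sgn) {
      const uint64_t SignBit = 0x8000000000000000ULL;
      uint64_t MagBits, SgnBits;
      std::memcpy(&MagBits, &Mag, sizeof(double));
      std::memcpy(&SgnBits, &Sgn, sizeof(double));
      uint64_t Result = (MagBits & ~SignBit) | (SgnBits & SignBit);
      double Out;
      std::memcpy(&Out, &Result, sizeof(double));
      return Out;
    }

    int main() {
      std::printf("%f\n", copysignViaMasks(3.5, -0.0)); // prints -3.500000
      return 0;
    }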
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { - if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, dl, DAG); + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); - if (Op0.getValueType() == MVT::i1) - llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); - } + assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { @@ -13150,8 +14219,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && - !DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::MinSize) && + !DAG.getMachineFunction().getFunction()->optForMinSize() && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -13188,6 +14256,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -13261,13 +14332,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. -bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { - return NumUsers > 1; -} - -static bool isAllOnes(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isAllOnesValue(); +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { + return 2; } /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node @@ -13285,8 +14351,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { - if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) - if (And00C->getZExtValue() == 1) { + if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. 
unsigned BitWidth = Op0.getValueSizeInBits(); @@ -13423,7 +14488,7 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected type for boolean compare operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, @@ -13467,8 +14532,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && - Op.getValueType().getScalarType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); @@ -13515,7 +14580,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) for (unsigned i = 0; i < n; ++i) { ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); - if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) + if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) return SDValue(); // Avoid underflow. @@ -13606,13 +14671,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); - EVT OpVT = Op1.getValueType(); + MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); bool MaskResult = (VT.getVectorElementType() == MVT::i1); if (Subtarget->hasAVX512()) { - if (Op1.getValueType().is512BitVector() || + if (Op1.getSimpleValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); @@ -13628,6 +14693,33 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } + // Lower using XOP integer comparisons. + if ((VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { + // Translate compare code to XOP PCOM compare mode. + unsigned CmpMode = 0; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETULT: + case ISD::SETLT: CmpMode = 0x00; break; + case ISD::SETULE: + case ISD::SETLE: CmpMode = 0x01; break; + case ISD::SETUGT: + case ISD::SETGT: CmpMode = 0x02; break; + case ISD::SETUGE: + case ISD::SETGE: CmpMode = 0x03; break; + case ISD::SETEQ: CmpMode = 0x04; break; + case ISD::SETNE: CmpMode = 0x05; break; + } + + // Are we comparing unsigned or signed integers? + unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) + ? X86ISD::VPCOMU : X86ISD::VPCOM; + + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CmpMode, dl, MVT::i8)); + } + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. @@ -13777,7 +14869,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. 
if (FlipSigns) { - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); @@ -13818,11 +14910,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Op1)->isNullValue() && + isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; @@ -13831,17 +14921,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. - if (Op1.getOpcode() == ISD::Constant && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - cast<ConstantSDNode>(Op1)->isNullValue()) && + if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast<ConstantSDNode>(Op1)->isNullValue(); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; @@ -13854,8 +14941,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } } - if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) && + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); @@ -13876,6 +14962,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } +SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -13918,7 +15021,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); - EVT VT = Op1.getValueType(); + MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops @@ -13927,7 +15030,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && - VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { + VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); int SSECC = translateX86FSETCC( cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); @@ -13961,12 +15064,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. - EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; + MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); - EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; + MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); @@ -13980,26 +15083,26 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - if (VT.isVector() && VT.getScalarType() == MVT::i1) { - SDValue Op1Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) - Op1Scalar = ConvertI1VectorToInterger(Op1, DAG); - else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) - Op1Scalar = Op1.getOperand(0); - SDValue Op2Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) - Op2Scalar = ConvertI1VectorToInterger(Op2, DAG); - else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) - Op2Scalar = Op2.getOperand(0); - if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); - if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, newSelect); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, - DAG.getIntPtrConstant(0, DL)); + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, newSelect); + 
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); } } @@ -14026,22 +15129,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && - isZero(Cond.getOperand(1).getOperand(1))) { + isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); - if ((isAllOnes(Op1) || isAllOnes(Op2)) && + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { - SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb - if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) - if (YC->isNullValue() && - (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, DL, @@ -14061,11 +15163,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); - if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); - if (!N2C || !N2C->isNullValue()) + if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -14073,11 +15174,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. @@ -14136,15 +15235,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - // Look pass the truncate if the high bits are known zero. + // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. 
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14166,11 +15264,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && - (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); - if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } @@ -14256,8 +15355,8 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); - MVT InSVT = InVT.getScalarType(); - assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits()); + MVT InSVT = InVT.getVectorElementType(); + assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) return SDValue(); @@ -14276,7 +15375,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. - while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) { + while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); @@ -14286,7 +15385,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = - CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits(); + CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } @@ -14346,7 +15445,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); - MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); @@ -14470,7 +15569,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize / MemVT.getScalarType().getSizeInBits()); + loadRegZize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); @@ -14518,29 +15617,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, return Sext; } - // Otherwise we'll shuffle the small elements in the high bits of the - // larger type and perform an arithmetic shift. If the shift is not legal - // it's better to scalarize. 
- assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && - "We can't implement a sext load without an arithmetic right shift!"); - - // Redistribute the loaded elements into the different locations. - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; - - SDValue Shuff = DAG.getVectorShuffle( - WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); - - Shuff = DAG.getBitcast(RegVT, Shuff); - - // Build the arithmetic shift. - unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - - MemVT.getVectorElementType().getSizeInBits(); - Shuff = - DAG.getNode(ISD::SRA, dl, RegVT, Shuff, - DAG.getConstant(Amt, dl, RegVT)); + // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest + // lanes. + assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && + "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); + SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } @@ -14577,11 +15659,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (N1C && N1C->getAPIntValue() == 1) { + if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - } + Op.getOperand(0).hasOneUse(); return false; } @@ -14597,8 +15677,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && - isa<ConstantSDNode>(Cond.getOperand(1)) && - cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || @@ -14625,11 +15704,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
@@ -14673,16 +15750,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } @@ -14844,8 +15919,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14877,54 +15951,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SplitStack; SDLoc dl(Op); + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDNode* Node = Op.getNode(); - unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" - " not tell us which reg is the stack pointer!"); + " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); - SDValue Tmp1 = SDValue(Node, 0); - SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. 
- SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -14942,10 +16002,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); @@ -14967,9 +16025,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -14980,7 +16043,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); SDLoc DL(Op); - if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { + if (!Subtarget->is64Bit() || + Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); @@ -15019,10 +16083,11 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store ptr to reg_save_area. - FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL)); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( + Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); - Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, - MachinePointerInfo(SV, 16), false, false, 0); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( + SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -15030,10 +16095,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); - assert((Subtarget->isTargetLinux() || - Subtarget->isTargetDarwin()) && - "Unhandled target in LowerVAARG"); assert(Op.getNode()->getNumOperands() == 4); + + MachineFunction &MF = DAG.getMachineFunction(); + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + // The Win64 ABI uses char* instead of a structure. 
+ return DAG.expandVAArg(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); @@ -15061,8 +16129,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget->useSoftFloat() && - !(DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::NoImplicitFloat)) && + !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -15091,8 +16158,14 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - // X86-64 va_list is a struct { i32, i32, i8*, i8* }. + // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, + // where a va_list is still an i8*. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget->isCallingConvWin64( + DAG.getMachineFunction().getFunction()->getCallingConv())) + // Probably a Win64 va_copy. + return DAG.expandVACopy(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); @@ -15230,72 +16303,126 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); - EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -/// \brief Return (and \p Op, \p Mask) for compare instructions or -/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting for \p Mask when lowering masking intrinsics. -static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), - MVT::i1, VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); +/// \brief Return \p Mask with the necessary casting or extending +/// performed according to \p MaskVT when lowering masking intrinsics. +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } - assert(MaskVT.isSimple() && "invalid mask type"); + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In 32-bit mode, a bitcast of i64 is illegal; extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); - if (isAllOnes(Mask)) - return Op; + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT requires fewer than 64 bits. Truncate the mask (this should + // always succeed) and bitcast. 
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting or extending for \p Mask when lowering masking intrinsics. +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); + + if (isAllOnesConstant(Mask)) + return Op; + + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example, vpmovqb requires only AVX512, + // while a vselect that operates on byte elements requires BWI. + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is comming as MVT::i8 and it should be truncated +/// The mask is coming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using -/// "X86select" instead of "vselect". We just can't create the "vselect" node for -/// a scalar instruction. +/// "X86select" instead of "vselect". We just can't create the "vselect" node +/// for a scalar instruction. 
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (isAllOnes(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return Op; - EVT VT = Op.getValueType(); - SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + + if (Op.getOpcode() == X86ISD::FSETCC) + return DAG.getNode(ISD::AND, dl, VT, Op, IMask); + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) + return DAG.getNode(ISD::OR, dl, VT, Op, IMask); - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { @@ -15309,15 +16436,16 @@ static int getSEHRegistrationNodeSize(const Function *Fn) { case EHPersonality::MSVC_CXX: return 16; default: break; } - report_fatal_error("can only recover FP for MSVC EH personality functions"); + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); } -/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize -/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, @@ -15334,29 +16462,35 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, if (!Fn->hasPersonalityFn()) return EntryEBP; - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - // Get an MCSymbol that will ultimately resolve to the frame offset of the EH - // registration. + // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::getRealLinkageName(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); - SDValue RegNodeFrameOffset = + SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
+ const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.is64Bit()) + return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize - // ParentFP = RegNodeBase - RegNodeFrameOffset + // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); - return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { @@ -15365,6 +16499,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -15376,28 +16513,53 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; + // We always add rounding mode to the Node. + // If the rounding mode is not specified, we add the + // "current direction" mode. if (Op.getNumOperands() == 4) - RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + RoundingMode = + DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) + if (IntrWithRoundingModeOpcode != 0) + if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, RoundingMode), Mask, PassThru, Subtarget, DAG); - } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); - SDValue Passthru = Op.getOperand(2); + SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction". 
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, Passthru, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); @@ -15405,7 +16567,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: - // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); @@ -15421,11 +16583,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } - case INTR_TYPE_2OP_MASK: { + case INTR_TYPE_2OP_MASK: + case INTR_TYPE_2OP_IMM8_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + + if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) + Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -15440,8 +16607,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1,Src2), + // TODO: Intrinsics should have fast-math-flags to propagate. + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { @@ -15449,7 +16616,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // We specify 2 possible modes for intrinsics, with/without rounding modes. + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". 
SDValue Rnd; if (Op.getNumOperands() == 6) Rnd = Op.getOperand(5); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_MASK: { + case INTR_TYPE_3OP_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, Src3, Sae), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Imm = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic has a rounding mode (7 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_IMM8_MASK: + case INTR_TYPE_3OP_MASK: + case INSERT_SUBVEC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + + if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) + Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); + else if (IntrData->Type == INSERT_SUBVEC) { + // imm should be adapted to ISD::INSERT_SUBVECTOR behavior + assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); + unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); + Imm *= Src2.getSimpleValueType().getVectorNumElements(); + Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); + } + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
@@ -15486,7 +16698,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: - case VPERM_3OP_MASK: + case VPERM_3OP_MASK:{ + // Src2 is the PassThru + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else + PassThru = DAG.getBitcast(VT, Src2); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src1, Src3), + Mask, PassThru, Subtarget, DAG); + } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { @@ -15494,11 +16726,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element - if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ) + if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; @@ -15523,6 +16755,50 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case TERLOG_OP_MASK: + case TERLOG_OP_MASKZ: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); + SDValue Mask = Op.getOperand(5); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = Src1; + // Set PassThru element. + if (IntrData->Type == TERLOG_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Src4), + Mask, PassThru, Subtarget, DAG); + } + case FPCLASS: { + // FPclass intrinsics with mask + SDValue Src1 = Op.getOperand(1); + MVT VT = Src1.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); + SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), FPclassMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case FPCLASSS: { + SDValue Src1 = Op.getOperand(1); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); + } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. 
@@ -15534,12 +16810,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) - EVT VT = Op.getOperand(1).getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); + MVT VT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { SDValue CC = Op.getOperand(3); @@ -15573,6 +16848,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } + case CMP_MASK_SCALAR_CC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue Mask = Op.getOperand(4); + + SDValue Cmp; + if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + + SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MVT::i1), + Subtarget, DAG); + + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, + DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), + DAG.getValueType(MVT::i1)); + } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); @@ -15584,6 +16885,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast<ConstantSDNode>(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -15598,27 +16917,75 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnes(Mask)) // return data as is + if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } case BLEND: { SDValue Mask = Op.getOperand(3); - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } default: break; } @@ -15832,23 +17199,17 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. 
@@ -15860,7 +17221,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; @@ -15871,25 +17232,19 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. @@ -15907,12 +17262,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); @@ -16034,64 +17388,59 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); - const Function *Fn = MF.getFunction(); - SDLoc dl(Op); SDValue Chain = Op.getOperand(0); + SDValue RegNode = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EH registrations only live in functions using WinEH"); + + // Cast the operand to an alloca, and remember the frame index. + auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); + if (!FINode) + report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); + EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); + + // Return the chain operand without making any DAG nodes. 
+ return Chain; +} - assert(Subtarget->getFrameLowering()->hasFP(MF) && - "using llvm.x86.seh.restoreframe requires a frame pointer"); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VT = TLI.getPointerTy(DAG.getDataLayout()); - - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); - unsigned SPReg = RegInfo->getStackRegister(); - unsigned SlotSize = RegInfo->getSlotSize(); +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); - // Get incoming EBP. - SDValue IncomingEBP = - DAG.getCopyFromReg(Chain, dl, FrameReg, VT); + MVT VT = DataToTruncate.getSimpleValueType(); + MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); - // SP is saved in the first field of every registration node, so load - // [EBP-RegNodeSize] into SP. - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, - DAG.getConstant(-RegNodeSize, dl, VT)); - SDValue NewSP = - DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, - false, VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); - - if (!RegInfo->needsStackRealignment(MF)) { - // Adjust EBP to point back to the original frame position. - SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); - } else { - assert(RegInfo->hasBasePointer(MF) && - "functions with Win32 EH must use frame or base pointer register"); + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); - // Reload the base pointer (ESI) with the adjusted incoming EBP. - SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); - // Reload the spilled EBP value, now that the stack and base pointers are - // set up. - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - X86FI->setHasSEHFramePtrSave(true); - int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize); - X86FI->setSEHFramePtrSaveIndex(FI); - SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT), - MachinePointerInfo(), false, false, false, - VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP); - } + MachineMemOperand *MMO = DAG.getMachineFunction(). 
+ getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); - return Chain; + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, @@ -16100,16 +17449,26 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { - if (IntNo == llvm::Intrinsic::x86_seh_restoreframe) - return LowerSEHRESTOREFRAME(Op, Subtarget, DAG); + if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + return MarkEHRegistrationNode(Op, DAG); + if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || + IntNo == llvm::Intrinsic::x86_flags_read_u64 || + IntNo == llvm::Intrinsic::x86_flags_write_u32 || + IntNo == llvm::Intrinsic::x86_flags_write_u64) { + // We need a frame pointer because this will get lowered to a PUSH/POP + // sequence. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setHasOpaqueSPAdjustment(true); + // Don't do anything here, we will expand these intrinsics out later + // during ExpandISelPseudos in EmitInstrWithCustomInserter. + return SDValue(); + } return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { - default: - llvm_unreachable("Unknown Intrinsic Type"); - break; + default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. @@ -16208,14 +17567,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = DataToCompress.getValueType(); - if (isAllOnes(Mask)) // return just a store + MVT VT = DataToCompress.getSimpleValueType(); + if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); @@ -16227,15 +17585,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a load + if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); @@ -16248,6 +17611,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } + case LOADU: + case LOADA: { + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); + assert(MemIntr && "Expected 
MemIntrinsicSDNode!"); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, + MemIntr->getMemOperand(), ISD::NON_EXTLOAD); + } } } @@ -16359,6 +17741,21 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } +unsigned X86TargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + + return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; +} + +unsigned X86TargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Funclet personalities don't use selectors (the runtime does the selection). + assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; +} + SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); @@ -16497,9 +17894,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.hasAttribute(Idx, Attribute::InReg)) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. - InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; + InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; + } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" @@ -16588,8 +17987,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, 2, 2); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, @@ -16623,12 +18022,75 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { +/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. +// +// 1. i32/i64 128/256-bit vector (native support require VLX) are expended +// to 512-bit vector. +// 2. i8/i16 vector implemented using dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform operation on it's Lo a Hi part and +// concatenate the results. +static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - EVT OpVT = VT; + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (EltVT == MVT::i64 || EltVT == MVT::i32) { + // Extend to 512 bit vector. 
+ assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unsupported value type for operation"); + + MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); + SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getUNDEF(NewVT), + Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, + DAG.getIntPtrConstant(0, dl)); + } + + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && + "Unsupported element type"); + + if (16 < NumElems) { + // Split vector, it's Lo and Hi parts will be handled in next iteration. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); + MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); + + Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); + Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + } + + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + + assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && + "Unsupported value type for operation"); + + // Use native supported vector instruction vplzcntd. + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); + SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); + SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); + + return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); +} + +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. @@ -16658,7 +18120,8 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -16686,13 +18149,39 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - unsigned NumBits = VT.getSizeInBits(); + unsigned NumBits = VT.getScalarSizeInBits(); SDLoc dl(Op); - Op = Op.getOperand(0); + + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + SDValue N0 = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + + // lsb(x) = (x & -x) + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, + DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); + + // cttz_undef(x) = (width - 1) - ctlz(lsb) + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && + TLI.isOperationLegal(ISD::CTLZ, VT)) { + SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, + DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + } + + // cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getConstant(1, dl, VT); + return DAG.getNode(ISD::CTPOP, dl, VT, + DAG.getNode(ISD::SUB, dl, VT, LSB, One)); + } + + assert(Op.getOpcode() == ISD::CTTZ && + "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. 
SDVTList VTs = DAG.getVTList(VT, MVT::i32); - Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { @@ -16753,6 +18242,13 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } +static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -16885,7 +18381,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue AhiBlo = Ahi; SDValue AloBhi = Bhi; // Bit cast to 32-bit vectors for MULUDQ - EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; A = DAG.getBitcast(MulVT, A); B = DAG.getBitcast(MulVT, B); @@ -16962,7 +18458,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); - EVT VT = Op0.getValueType(); + MVT VT = Op0.getSimpleValueType(); SDLoc dl(Op); assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || @@ -17034,7 +18530,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Ops, dl); } -// Return true if the requred (according to Opcode) shift-imm form is natively +// Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17054,14 +18550,14 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, } // The shift amount is a variable, but it is the same for all vector lanes. -// These instrcutions are defined together with shift-immediate. +// These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } -// Return true if the requred (according to Opcode) variable-shift form is +// Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17133,27 +18629,37 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // i64 SRA needs to be performed as partial shifts. 
if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Op.getOpcode() == ISD::SRA) + Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) return ArithmeticShiftRight64(ShiftAmt); - if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - if (Op.getOpcode() == ISD::SHL) { - // Simple i8 add case - if (ShiftAmt == 1) - return DAG.getNode(ISD::ADD, dl, VT, R, R); + // Simple i8 add case + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + + // ashr(R, 7) === cmp_slt(R, 0) + if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + // XOP can shift v16i8 directly instead of as shift v8i16 + mask. + if (VT == MVT::v16i8 && Subtarget->hasXOP()) + return SDValue(); + + if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -17161,24 +18667,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { - if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); - } - - // R s>> a === ((R u>> a) ^ m) - m + // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 32> V(NumElts, - DAG.getConstant(128 >> ShiftAmt, dl, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -17189,35 +18685,51 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. - if (!Subtarget->is64Bit() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Amt.getOpcode() == ISD::BITCAST && - Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + + // Peek through any splat that was introduced for i64 shift vectorization. 
+ int SplatIndex = -1; + if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) + if (SVN->isSplat()) { + SplatIndex = SVN->getSplatIndex(); + Amt = Amt.getOperand(0); + assert(SplatIndex < (int)VT.getVectorNumElements() && + "Splat shuffle referencing second operand"); + } + + if (Amt.getOpcode() != ISD::BITCAST || + Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; + unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); for (unsigned i = 0; i != Ratio; ++i) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } - // Check remaining shift amounts. - for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { - uint64_t ShAmt = 0; - for (unsigned j = 0; j != Ratio; ++j) { - ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); - if (!C) + + // Check remaining shift amounts (if not a splat). + if (SplatIndex < 0) { + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) return SDValue(); - // 6 == Log2(64) - ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } - if (ShAmt != ShiftAmt) - return SDValue(); } if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) @@ -17245,7 +18757,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { SDValue BaseShAmt; - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { // Check if this build_vector node is doing a splat. @@ -17262,7 +18774,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && + assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && "Unexpected shuffle index found!"); BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { @@ -17327,11 +18839,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) - return V; + return V; if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) return Op; + // XOP has 128-bit variable logical/arithmetic shifts. + // +ve/-ve Amt = shift left/right. 
+ if (Subtarget->hasXOP() && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v8i16 || VT == MVT::v16i8)) { + if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); + Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); + } + if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); + if (Op.getOpcode() == ISD::SRA) + return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); + } + // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { @@ -17343,6 +18870,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } + // i64 vector arithmetic shift can be emulated with the transform: + // M = lshr(SIGN_BIT, Amt) + // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. @@ -17351,9 +18891,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector<SDValue, 8> Elts; - EVT SVT = VT.getScalarType(); + MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); - const APInt &One = APInt(SVTBits, 1); + APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i=0; i !=NumElems; ++i) { @@ -17364,7 +18904,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } ConstantSDNode *ND = cast<ConstantSDNode>(Op); - const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); + APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); @@ -17443,7 +18983,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. 
- EVT CastVT = MVT::v4i32; + MVT CastVT = MVT::v4i32; SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); @@ -17507,7 +19047,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } - if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { + if (VT == MVT::v16i8 || + (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); @@ -17627,7 +19168,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } - if (Subtarget->hasInt256() && VT == MVT::v16i16) { + if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); @@ -17710,7 +19251,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors SDValue V1 = Extract128BitVector(R, 0, DAG, dl); @@ -17743,6 +19284,40 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); } +static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector rotates!"); + assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); + assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + + // XOP has 128-bit vector variable + immediate rotates. + // +ve/-ve Amt = rotate left/right. + + // Split 256-bit integers. + if (VT.is256BitVector()) + return Lower256IntArith(Op, DAG); + + assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); + + // Attempt to rotate by immediate. + if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *RotateConst = BVAmt->getConstantSplatNode()) { + uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); + assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + return DAG.getNode(X86ISD::VPROTI, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Use general rotate by variable (per-element). + return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); +} + static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering @@ -17759,8 +19334,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -17775,8 +19349,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. 
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -17827,7 +19400,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { +bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) @@ -17844,21 +19417,23 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. -bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); - return needsCmpXchgNb(PTy->getElementType()); + return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { - return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); @@ -17869,14 +19444,14 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. - return AtomicRMWExpansionKind::None; + return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. - return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return !AI->use_empty() ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -17884,7 +19459,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. - return AtomicRMWExpansionKind::CmpXChg; + return AtomicExpansionKind::CmpXChg; } } @@ -17898,7 +19473,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) { LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. 
@@ -17926,7 +19501,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isAtLeastRelease(Order) but it is not clear - // otherwise, we might be able to be more agressive on relaxed idempotent + // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SynchScope == SingleThread) @@ -18034,24 +19609,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); - SDValue InVec = Op->getOperand(0); - SDLoc dl(Op); - unsigned NumElts = SrcVT.getVectorNumElements(); - EVT SVT = SrcVT.getVectorElementType(); - - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. + SDValue Op0 = Op->getOperand(0); SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i, dl))); - + SDLoc dl(Op); + unsigned NumElts; + MVT SVT; + if (SrcVT.isVector()) { + NumElts = SrcVT.getVectorNumElements(); + SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, + DAG.getIntPtrConstant(i, dl))); + } else { + assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + "Unexpected source type in LowerBITCAST"); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(0, dl))); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(1, dl))); + NumElts = 2; + SVT = MVT::i32; + } // Explicitly mark the extra elements as Undef. Elts.append(NumElts, DAG.getUNDEF(SVT)); @@ -18103,7 +19691,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } @@ -18119,9 +19708,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); - High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. 
@@ -18311,7 +19901,7 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(Op.getValueType().isVector() && + assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } @@ -18357,7 +19947,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getNode()->getSimpleValueType(0); + MVT VT = Op.getNode()->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -18435,31 +20025,203 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. + MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector<SDValue, 16> Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. 
+ if (Op.getNode()->getNumValues() == 2) + return Op; + MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); - EVT VT = N->getValue().getValueType(); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - // X86 scatter kills mask register, so its type should be added to - // the list of return values - if (N->getNumValues() == 1) { - SDValue Index = N->getIndex(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); +} + +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); - SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); - return SDValue(NewScatter.getNode(), 0); + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + 
!VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); } return Op; } @@ -18470,17 +20232,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); - EVT VT = Op.getValueType(); - assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); SDLoc dl(Op); - + MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) { - Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; - DAG.UpdateNodeOperands(N, Ops); + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); } return Op; } @@ -18572,6 +20376,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case 
ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCCE: return LowerSETCCE(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -18592,12 +20397,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); - case ISD::CTLZ: return LowerCTLZ(Op, DAG); - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); + case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -18615,7 +20422,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: @@ -18634,14 +20447,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
-   if (VT != MVT::v2f32)
-     llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+   assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
@@ -18668,17 +20510,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    return;
  }
  case ISD::FP_TO_SINT:
-   // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert
-   // (FP_TO_SINT (load f16)) to FP_TO_INT*.
-   if (N->getOperand(0).getValueType() == MVT::f16)
-     break;
-   // fallthrough
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
-
-   if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
-     return;
-
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
@@ -18707,6 +20541,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getBitcast(MVT::v2i64, VBias));
    Or = DAG.getBitcast(MVT::v2f64, Or);
+   // TODO: Are there any fast-math-flags to propagate here?
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
@@ -18740,6 +20575,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
    }
  }
+ case ISD::INTRINSIC_WO_CHAIN: {
+   if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+     Results.push_back(V);
+   return;
+ }
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                   Results);
@@ -18748,7 +20588,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
-   EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+   MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, dl, HalfT));
@@ -18884,6 +20724,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
+ case X86ISD::IRET:               return "X86ISD::IRET";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
@@ -18910,6 +20751,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::FHADD:              return "X86ISD::FHADD";
  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
  case X86ISD::ABS:                return "X86ISD::ABS";
+ case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
@@ -18937,12 +20779,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
- case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
+ case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
+ case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
  case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
  case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
+ case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -18951,6 +20795,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
+ case X86ISD::VROTLI:             return "X86ISD::VROTLI";
+ case X86ISD::VROTRI:             return "X86ISD::VROTRI";
  case X86ISD::CMPP:               return "X86ISD::CMPP";
  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
@@ -18978,6 +20824,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::TESTM:              return "X86ISD::TESTM";
  case X86ISD::TESTNM:             return "X86ISD::TESTNM";
  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+ case X86ISD::KTEST:              return "X86ISD::KTEST";
  case X86ISD::PACKSS:             return "X86ISD::PACKSS";
  case X86ISD::PACKUS:             return "X86ISD::PACKUS";
  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
@@ -19000,6 +20847,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
  case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
  case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
  case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
@@ -19009,11 +20857,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
  case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
+ case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
+ case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -19022,10 +20872,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::SFENCE:             return "X86ISD::SFENCE";
  case X86ISD::LFENCE:             return "X86ISD::LFENCE";
  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
- case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
  case X86ISD::SAHF:               return "X86ISD::SAHF";
  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
+ case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
+ case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
+ case X86ISD::VPROT:              return "X86ISD::VPROT";
+ case X86ISD::VPROTI:             return "X86ISD::VPROTI";
+ case X86ISD::VPSHA:              return "X86ISD::VPSHA";
+ case X86ISD::VPSHL:              return "X86ISD::VPSHL";
+ case X86ISD::VPCOM:              return "X86ISD::VPCOM";
+ case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
  case X86ISD::FMADD:              return "X86ISD::FMADD";
  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
@@ -19038,7 +20895,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
  case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
  case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
- case X86ISD::RNDSCALE:           return "X86ISD::RNDSCALE";
+ case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
+ case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
+ case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST:              return "X86ISD::XTEST";
@@ -19064,6 +20923,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
  case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
  case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
+ case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
+ case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
  }
  return nullptr;
}
@@ -19218,7 +21079,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
- if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()))
+ if (!Subtarget->hasAnyFMA())
    return false;

  VT = VT.getScalarType();
@@ -19253,11 +21114,11 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return false;

  // Not for i1 vectors
- if (VT.getScalarType() == MVT::i1)
+ if (VT.getSimpleVT().getScalarType() == MVT::i1)
    return false;

  // Very little shuffling can be done for 64-bit vectors right now.
- if (VT.getSizeInBits() == 64)
+ if (VT.getSimpleVT().getSizeInBits() == 64)
    return false;

  // We only care that the types being shuffled are legal. The lowering can
@@ -19282,8 +21143,7 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
  DebugLoc DL = MI->getDebugLoc();
  const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();

  // For the v = xbegin(), we generate
  //
@@ -19407,6 +21267,47 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
  return BB;
}

+static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+                                     const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert input VAL into EAX
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+     .addReg(MI->getOperand(0).getReg());
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+     .addReg(X86::ECX)
+     .addReg(X86::ECX);
+ // insert zero to EDX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX)
+     .addReg(X86::EDX)
+     .addReg(X86::EDX);
+ // insert WRPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+                                     const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+     .addReg(X86::ECX)
+     .addReg(X86::ECX);
+ // insert RDPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
+         MI->getOperand(0).getReg())
+     .addReg(X86::EAX);
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
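
EmitWRPKRU/EmitRDPKRU above pin their operands to the fixed registers the instructions require. As a hedged inline-asm sketch of that contract (not part of the patch): WRPKRU consumes the new protection-key value from EAX and requires ECX = EDX = 0, while RDPKRU requires ECX = 0 and returns the value in EAX, zeroing EDX.

    #include <cstdint>

    static inline void wrpkru(uint32_t val) {
      // EAX = val, ECX = 0, EDX = 0 -- mirrors the XOR32rr zeroing above.
      asm volatile("wrpkru" : : "a"(val), "c"(0), "d"(0));
    }

    static inline uint32_t rdpkru() {
      uint32_t val, zero;
      // ECX = 0 on input; the result arrives in EAX, EDX is zeroed.
      asm volatile("rdpkru" : "=a"(val), "=d"(zero) : "c"(0));
      return val;
    }
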
+
 static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
                                       const X86Subtarget *Subtarget) {
  DebugLoc dl = MI->getDebugLoc();
@@ -19531,8 +21432,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

-   MachineFunction::iterator MBBIter = MBB;
-   ++MBBIter;
+   MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
@@ -19702,8 +21602,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
@@ -19727,7 +21626,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

- if (!Subtarget->isTargetWin64()) {
+ if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
@@ -19744,9 +21643,8 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
-   MachineMemOperand *MMO =
-     F->getMachineMemOperand(
-         MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
+   MachineMemOperand *MMO = F->getMachineMemOperand(
+       MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
@@ -19800,6 +21698,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
  return true;
}

+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic-block with
+// conditional jump around it.
+static bool isCMOVPseudo(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
+   return true;
+
+ default:
+   return false;
+ }
+}
+
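To make Case 1 of the CMOV lowering that follows concrete: source like this (hypothetical, purely for illustration) yields several CMOV pseudos that all test the same EFLAGS condition, so one inserted block with one PHI per select suffices instead of one branch diamond per select.

    struct MinMax { int lo, hi; };

    MinMax order(int a, int b) {
      bool lt = a < b;     // a single compare sets the condition
      MinMax r;
      r.lo = lt ? a : b;   // CMOV pseudo #1 on that condition
      r.hi = lt ? b : a;   // CMOV pseudo #2, same condition, opposite arms
      return r;
    }
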
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
@@ -19811,8 +21742,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();

  //  thisMBB:
  //  ...
@@ -19823,8 +21753,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();

- // We also lower double CMOVs:
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+ // In this case, there are multiple CMOVs in a row, all of which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+ // trickiness here is that in a case like:
+ //
+ //   t2 = CMOV cond1 t1, f1
+ //   t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ //   t2 = PHI t1(BB1), f1(BB2)
+ //   t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ //   t2 = PHI t1(BB1), f1(BB2)
+ //   t3 = PHI t1(BB1), f2(BB2)
+ //
+ // In Case 2, we lower cascaded CMOVs such as
+ //
+ //   (CMOV (CMOV F, T, cc1), T, cc2)
+ //
+ // to two successive branches. For that, we look for another CMOV as the
  // following instruction.
  //
@@ -19890,19 +21853,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // .LBB5_4:
  //         retq
  //
- MachineInstr *NextCMOV = nullptr;
+ MachineInstr *CascadedCMOV = nullptr;
+ MachineInstr *LastCMOV = MI;
+ X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock::iterator NextMIIt =
     std::next(MachineBasicBlock::iterator(MI));
+
+ // Check for case 1, where there are multiple CMOVs with the same condition
+ // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
+ // number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+   // See if we have a string of CMOVS with the same condition.
+   while (NextMIIt != BB->end() &&
+          isCMOVPseudo(NextMIIt) &&
+          (NextMIIt->getOperand(3).getImm() == CC ||
+           NextMIIt->getOperand(3).getImm() == OppCC)) {
+     LastCMOV = &*NextMIIt;
+     ++NextMIIt;
+   }
+ }
+
+ // This checks for case 2, but only do this if we didn't already find
+ // case 1, as indicated by LastCMOV == MI.
+ if (LastCMOV == MI &&
+     NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
     NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
-     NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
-   NextCMOV = &*NextMIIt;
+     NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+   CascadedCMOV = &*NextMIIt;
+ }

  MachineBasicBlock *jcc1MBB = nullptr;

- // If we have a double CMOV, we lower it to two successive branches to
+ // If we have a cascaded CMOV, we lower it to two successive branches to
  // the same block. EFLAGS is used by both, so mark it as live in the second.
- if (NextCMOV) {
+ if (CascadedCMOV) {
    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
@@ -19917,7 +21903,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();

- MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+ MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
     !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
@@ -19926,12 +21912,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
-                 std::next(MachineBasicBlock::iterator(MI)), BB->end());
+                 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
- if (NextCMOV) {
-   // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+ if (CascadedCMOV) {
+   // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
    BB->addSuccessor(jcc1MBB);

    // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
@@ -19946,13 +21932,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
- unsigned Opc =
-   X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ unsigned Opc = X86::GetCondBranchFromCond(CC);
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

- if (NextCMOV) {
+ if (CascadedCMOV) {
    unsigned Opc2 = X86::GetCondBranchFromCond(
-       (X86::CondCode)NextCMOV->getOperand(3).getImm());
+       (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

@@ -19964,28 +21949,110 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
- MachineInstrBuilder MIB =
-   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
-           MI->getOperand(0).getReg())
-     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
-     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+   std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
+
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from earlier PHI's
+ // destination registers, and the registers that went into the PHI.
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+   unsigned DestReg = MIIt->getOperand(0).getReg();
+   unsigned Op1Reg = MIIt->getOperand(1).getReg();
+   unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+   // If this CMOV we are generating is the opposite condition from
+   // the jump we generated, then we have to swap the operands for the
+   // PHI that is going to be generated.
+   if (MIIt->getOperand(3).getImm() == OppCC)
+     std::swap(Op1Reg, Op2Reg);
+
+   if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+     Op1Reg = RegRewriteTable[Op1Reg].first;
+
+   if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+     Op2Reg = RegRewriteTable[Op2Reg].second;
+
+   MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+                 TII->get(X86::PHI), DestReg)
+         .addReg(Op1Reg).addMBB(copy0MBB)
+         .addReg(Op2Reg).addMBB(thisMBB);

-  // If we have a double CMOV, the second Jcc provides the same incoming
+   // Add this PHI to the rewrite table.
+   RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // If we have a cascaded CMOV, the second Jcc provides the same incoming
  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
- if (NextCMOV) {
+ if (CascadedCMOV) {
    MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
    // Copy the PHI result to the register defined by the second CMOV.
    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
-           DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+           DL, TII->get(TargetOpcode::COPY),
+           CascadedCMOV->getOperand(0).getReg())
        .addReg(MI->getOperand(0).getReg());
-   NextCMOV->eraseFromParent();
+   CascadedCMOV->eraseFromParent();
  }

- MI->eraseFromParent();   // The pseudo instruction is gone now.
+ // Now remove the CMOV(s).
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+   (MIIt++)->eraseFromParent();
+
  return sinkMBB;
}

MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+ // Combine the following atomic floating-point modification pattern:
+ //   a.store(reg OP a.load(acquire), release)
+ // Transform them into:
+ //   OPss (%gpr), %xmm
+ //   movss %xmm, (%gpr)
+ // Or sd equivalent for 64-bit operations.
+ unsigned MOp, FOp;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+ case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break;
+ case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break;
+ }
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineOperand MSrc = MI->getOperand(0);
+ unsigned VSrc = MI->getOperand(5).getReg();
+ const MachineOperand &Disp = MI->getOperand(3);
+ MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
+ bool hasDisp = Disp.isGlobal() || Disp.isImm();
+ if (hasDisp && MSrc.isReg())
+   MSrc.setIsKill(false);
+ MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
+     .addOperand(/*Base=*/MSrc)
+     .addImm(/*Scale=*/1)
+     .addReg(/*Index=*/0)
+     .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+     .addReg(0);
+ MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
+                             MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+     .addReg(VSrc)
+     .addOperand(/*Base=*/MSrc)
+     .addImm(/*Scale=*/1)
+     .addReg(/*Index=*/0)
+     .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
+     .addReg(/*Segment=*/0);
+ MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
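
The pattern comment in EmitLoweredAtomicFP corresponds to source of roughly this shape (a sketch under the usual std::atomic lowering on x86; not taken from the patch):

    #include <atomic>

    // acquire load, FP add, release store to the same location: the
    // RELEASE_FADD32mr pseudo folds this into
    //   addss (%mem), %xmm
    //   movss %xmm, (%mem)
    void bump(std::atomic<float> &a, float delta) {
      a.store(a.load(std::memory_order_acquire) + delta,
              std::memory_order_release);
    }
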
+
+MachineBasicBlock *
 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
@@ -20032,8 +22099,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
    sizeVReg = MI->getOperand(1).getReg(),
    physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;

- MachineFunction::iterator MBBIter = BB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++BB->getIterator();

  MF->insert(MBBIter, bumpMBB);
  MF->insert(MBBIter, mallocMBB);
@@ -20120,14 +22186,60 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
+ assert(!Subtarget->isTargetMachO());
  DebugLoc DL = MI->getDebugLoc();
+ MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
+     *BB->getParent(), *BB, MI, DL, false);
+ MachineBasicBlock *ResumeBB = ResumeMI->getParent();
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return ResumeBB;
+}

- assert(!Subtarget->isTargetMachO());
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+ MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB();
+ DebugLoc DL = MI->getDebugLoc();

- Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
-                                                   DL);
+ assert(!isAsynchronousEHPersonality(
+            classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
+        "SEH does not use catchret!");

- MI->eraseFromParent();   // The pseudo instruction is gone now.
+ // Only 32-bit EH needs to worry about manually restoring stack pointers.
+ if (!Subtarget->is32Bit())
+   return BB;
+
+ // C++ EH creates a new target block to hold the restore code, and wires up
+ // the new block to the return destination with a normal JMP_4.
+ MachineBasicBlock *RestoreMBB =
+     MF->CreateMachineBasicBlock(BB->getBasicBlock());
+ assert(BB->succ_size() == 1);
+ MF->insert(std::next(BB->getIterator()), RestoreMBB);
+ RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BB->addSuccessor(RestoreMBB);
+ MI->getOperand(0).setMBB(RestoreMBB);
+
+ auto RestoreMBBI = RestoreMBB->begin();
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
+ BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI,
+                                       MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ const Constant *PerFn = MF->getFunction()->getPersonalityFn();
+ bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
+ // Only 32-bit SEH requires special handling for catchpad.
+ if (IsSEH && Subtarget->is32Bit()) {
+   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+   DebugLoc DL = MI->getDebugLoc();
+   BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
+ }
+ MI->eraseFromParent();
  return BB;
}

@@ -20149,6 +22261,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask =
+     Subtarget->is64Bit() ?
+     Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() :
      Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
  if (Subtarget->is64Bit()) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
@@ -20198,8 +22312,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const BasicBlock *BB = MBB->getBasicBlock();

- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
@@ -20225,7 +22338,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
- //  buf[LabelOffset] = restoreMBB
+ //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
@@ -20245,6 +22358,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);
  MF->push_back(restoreMBB);
+ restoreMBB->setHasAddressTaken();

  MachineInstrBuilder MIB;

@@ -20511,35 +22625,74 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    return BB;
  case X86::WIN_ALLOCA:
    return EmitLoweredWinAlloca(MI, BB);
+ case X86::CATCHRET:
+   return EmitLoweredCatchRet(MI, BB);
+ case X86::CATCHPAD:
+   return EmitLoweredCatchPad(MI, BB);
  case X86::SEG_ALLOCA_32:
  case X86::SEG_ALLOCA_64:
    return EmitLoweredSegAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
- case X86::CMOV_GR8:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
- case X86::CMOV_V4F32:
+ case X86::CMOV_FR128:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
- case X86::CMOV_V8F32:
+ case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
- case X86::CMOV_GR16:
- case X86::CMOV_GR32:
- case X86::CMOV_RFP32:
- case X86::CMOV_RFP64:
- case X86::CMOV_RFP80:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return EmitLoweredSelect(MI, BB);

+ case X86::RDFLAGS32:
+ case X86::RDFLAGS64: {
+   DebugLoc DL = MI->getDebugLoc();
+   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+   unsigned PushF =
+       MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+   unsigned Pop =
+       MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+   BuildMI(*BB, MI, DL, TII->get(PushF));
+   BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
+
+   MI->eraseFromParent(); // The pseudo is gone now.
+   return BB;
+ }
+
+ case X86::WRFLAGS32:
+ case X86::WRFLAGS64: {
+   DebugLoc DL = MI->getDebugLoc();
+   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+   unsigned Push =
+       MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+   unsigned PopF =
+       MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+   BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg());
+   BuildMI(*BB, MI, DL, TII->get(PopF));
+
+   MI->eraseFromParent(); // The pseudo is gone now.
+   return BB;
+ }
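
The RDFLAGS/WRFLAGS pseudos expand to the classic flag-save/restore idiom. An inline-asm sketch of the 64-bit case (illustrative only; the 32-bit variant uses the 32-bit pushf/popf and registers):

    #include <cstdint>

    static inline uint64_t read_flags() {
      uint64_t f;
      asm volatile("pushfq\n\tpopq %0" : "=r"(f));        // PUSHF64 + POP64r
      return f;
    }

    static inline void write_flags(uint64_t f) {
      asm volatile("pushq %0\n\tpopfq" : : "r"(f) : "cc"); // PUSH64r + POPF64
    }
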
+ case X86::RELEASE_FADD32mr:
+ case X86::RELEASE_FADD64mr:
+   return EmitLoweredAtomicFP(MI, BB);
+
  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
@@ -20652,7 +22805,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    // Thread synchronization.
  case X86::MONITOR:
    return EmitMonitor(MI, BB, Subtarget);
-
+   // PKU feature
+ case X86::WRPKRU:
+   return EmitWRPKRU(MI, BB, Subtarget);
+ case X86::RDPKRU:
+   return EmitRDPKRU(MI, BB, Subtarget);
    // xbegin
  case X86::XBEGIN:
    return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
@@ -20793,7 +22950,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    unsigned Depth) const {
  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
-   return Op.getValueType().getScalarType().getSizeInBits();
+   return Op.getValueType().getScalarSizeInBits();

  // Fallback case.
  return 1;
@@ -20814,39 +22971,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512-bit vectors as well.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
@@ -20854,7 +22980,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
@@ -20920,24 +23046,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
    return DCI.CombineTo(N, InsV);
  }

- //===--------------------------------------------------------------------===//
- // Combine some shuffles into subvector extracts and inserts:
- //
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (isShuffleHigh128VectorInsertLow(SVOp)) {
-   SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
-   SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
-   return DCI.CombineTo(N, InsV);
- }
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (isShuffleLow128VectorInsertHigh(SVOp)) {
-   SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-   SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
-   return DCI.CombineTo(N, InsV);
- }
-
  return SDValue();
}

@@ -20966,10 +23074,22 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  MVT RootVT = Root.getSimpleValueType();
  SDLoc DL(Root);

- // Just remove no-op shuffle masks.
  if (Mask.size() == 1) {
-   DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
-                 /*AddTo*/ true);
+   int Index = Mask[0];
+   assert((Index >= 0 || Index == SM_SentinelUndef ||
+           Index == SM_SentinelZero) &&
+          "Invalid shuffle index found!");
+
+   // We may end up with an accumulated mask of size 1 as a result of
+   // widening of shuffle operands (see function canWidenShuffleElements).
+   // If the only shuffle index is equal to SM_SentinelZero then propagate
+   // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle
+   // mask, and therefore the entire chain of shuffles can be folded away.
+   if (Index == SM_SentinelZero)
+     DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
+   else
+     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+                   /*AddTo*/ true);
    return true;
  }

@@ -20985,7 +23105,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  // doesn't preclude something switching to the shorter encoding post-RA.
  //
  // FIXME: Should teach these routines about AVX vector widths.
- if (FloatDomain && VT.getSizeInBits() == 128) {
+ if (FloatDomain && VT.is128BitVector()) {
    if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
      bool Lo = Mask.equals({0, 0});
      unsigned Shuffle;
@@ -21049,7 +23169,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
  // variants as none of these have single-instruction variants that are
  // superior to the UNPCK formulation.
- if (!FloatDomain && VT.getSizeInBits() == 128 &&
+ if (!FloatDomain && VT.is128BitVector() &&
     (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
      Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
      Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
@@ -21176,7 +23296,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
    return false;
  SmallVector<int, 16> OpMask;
  bool IsUnary;
- bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary);
  // We only can combine unary shuffles which we can decode the mask for.
  if (!HaveMask || !IsUnary)
    return false;
@@ -21226,26 +23346,28 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,

  // See if we can recurse into the operand to combine more things.
  switch (Op.getOpcode()) {
-   case X86ISD::PSHUFB:
-     HasPSHUFB = true;
-   case X86ISD::PSHUFD:
-   case X86ISD::PSHUFHW:
-   case X86ISD::PSHUFLW:
-     if (Op.getOperand(0).hasOneUse() &&
-         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                       HasPSHUFB, DAG, DCI, Subtarget))
-       return true;
-     break;
+ case X86ISD::PSHUFB:
+   HasPSHUFB = true;
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+   if (Op.getOperand(0).hasOneUse() &&
+       combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                     HasPSHUFB, DAG, DCI, Subtarget))
+     return true;
+   break;

-   case X86ISD::UNPCKL:
-   case X86ISD::UNPCKH:
-     assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
-     // We can't check for single use, we have to check that this shuffle is the only user.
-     if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
-         combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                       HasPSHUFB, DAG, DCI, Subtarget))
-       return true;
-     break;
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+   assert(Op.getOperand(0) == Op.getOperand(1) &&
+          "We only combine unary shuffles!");
+   // We can't check for single use; we have to check that this shuffle is the
+   // only user.
+   if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+       combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                     HasPSHUFB, DAG, DCI, Subtarget))
+     return true;
+   break;
  }

  // Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -21271,7 +23393,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);
@@ -21360,8 +23482,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
-     if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
-         V.getSimpleValueType().getScalarType() != MVT::i16)
+     if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
+         V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
@@ -21438,7 +23560,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
  return V;
}

-/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
@@ -21520,6 +23643,66 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
+ case X86ISD::UNPCKL: {
+   // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
+   // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
+   // moves upper half elements into the lower half part. For example:
+   //
+   // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
+   //     undef:v16i8
+   // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
+   //
+   // will be combined to:
+   //
+   // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
+
+   // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
+   // happen due to advanced instructions.
+   if (!VT.is128BitVector())
+     return SDValue();
+
+   auto Op0 = N.getOperand(0);
+   auto Op1 = N.getOperand(1);
+   if (Op0.getOpcode() == ISD::UNDEF &&
+       Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
+     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
+
+     unsigned NumElts = VT.getVectorNumElements();
+     SmallVector<int, 8> ExpectedMask(NumElts, -1);
+     std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
+               NumElts / 2);
+
+     auto ShufOp = Op1.getOperand(0);
+     if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
+       return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
+   }
+   return SDValue();
+ }
+ case X86ISD::BLENDI: {
+   SDValue V0 = N->getOperand(0);
+   SDValue V1 = N->getOperand(1);
+   assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
+          "Unexpected input vector types");
+
+   // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+   // operands and changing the mask to 1. This saves us a bunch of
+   // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+   // x86InstrInfo knows how to commute this back after instruction selection
+   // if it would help register allocation.
+
+   // TODO: If optimizing for size or a processor that doesn't suffer from
+   // partial register update stalls, this should be transformed into a MOVSD
+   // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+   if (VT == MVT::v2f64)
+     if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+       if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+         SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+         return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+       }
+
+   return SDValue();
+ }
  default:
    return SDValue();
  }
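
The canonicalization above relies on bit i of the BLENDPD immediate selecting lane i from the second operand, so swapping the operands and changing the mask from 2 to 1 is value-preserving. A quick spot-check with SSE4.1 intrinsics (hypothetical test code, compile with -msse4.1; not from the patch):

    #include <immintrin.h>
    #include <cassert>

    int main() {
      __m128d v0 = _mm_set_pd(10.0, 1.0);   // lanes {1.0, 10.0}
      __m128d v1 = _mm_set_pd(20.0, 2.0);   // lanes {2.0, 20.0}
      __m128d a = _mm_blend_pd(v0, v1, 2);  // {v0[0], v1[1]} = {1.0, 20.0}
      __m128d b = _mm_blend_pd(v1, v0, 1);  // same lanes after the swap
      double xa[2], xb[2];
      _mm_storeu_pd(xa, a);
      _mm_storeu_pd(xb, b);
      assert(xa[0] == xb[0] && xa[1] == xb[1]);
      return 0;
    }
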
@@ -21535,7 +23718,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
-   assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
+   assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.
@@ -21613,9 +23796,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
/// the operands which explicitly discard the lanes which are unused by this
/// operation to try to flow through the rest of the combiner the fact that
/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
+ if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+     (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+   return SDValue();

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
@@ -21624,14 +23811,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
    return SDValue();

  auto *SVN = cast<ShuffleVectorSDNode>(N);
- ArrayRef<int> Mask = SVN->getMask();
+ SmallVector<int, 8> Mask;
+ for (int M : SVN->getMask())
+   Mask.push_back(M);
+
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

- // We require the first shuffle operand to be the SUB node, and the second to
- // be the ADD node.
- // FIXME: We should support the commuted patterns.
- if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
+ // We require the first shuffle operand to be the FSUB node, and the second to
+ // be the FADD node.
+ if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
+   ShuffleVectorSDNode::commuteMask(Mask);
+   std::swap(V1, V2);
+ } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
    return SDValue();

  // If there are other uses of these operations we can't fold them.
@@ -21652,12 +23844,6 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
       isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
    return SDValue();

- // Only specific types are legal at this point, assert so we notice if and
- // when these change.
- assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
-         VT == MVT::v4f64) &&
-        "Unknown vector type encountered!");
-
  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}

@@ -21677,12 +23863,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,

  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
- if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
-   if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
+ if (TLI.isTypeLegal(VT))
+   if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
     return AddSub;

  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (Subtarget->hasFp256() && VT.is256BitVector() &&
+ if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() &&
     N->getOpcode() == ISD::VECTOR_SHUFFLE)
    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);

@@ -21780,6 +23966,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
+ EVT EltVT = N->getValueType(0);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();
@@ -21808,14 +23995,22 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,

  SmallVector<int, 16> ShuffleMask;
  bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
+ if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                           ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+ int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
+
+ if (Idx == SM_SentinelZero)
+   return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
+                            : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
+ if (Idx == SM_SentinelUndef)
+   return DAG.getUNDEF(EltVT);
+
+ assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                        : InVec.getOperand(1);
@@ -21840,7 +24035,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

- EVT EltVT = N->getValueType(0);
  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
@@ -21866,21 +24060,45 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                    EltNo);
}

-/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-/// special and don't usually play with other vector types, it's better to
-/// handle them early to be sure we emit efficient code by avoiding
-/// store-load conversions.
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
-  if (N->getValueType(0) != MVT::x86mmx ||
-      N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
-      N->getOperand(0)->getValueType(0) != MVT::v2i32)
-    return SDValue();
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);

-  SDValue V = N->getOperand(0);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
-  if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
-    return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
-                       N->getValueType(0), V.getOperand(0));
+ // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+ // special and don't usually play with other vector types, it's better to
+ // handle them early to be sure we emit efficient code by avoiding
+ // store-load conversions.
+ if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
+     N0.getValueType() == MVT::v2i32 &&
+     isNullConstant(N0.getOperand(1))) {
+   SDValue N00 = N0->getOperand(0);
+   if (N00.getValueType() == MVT::i32)
+     return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+ }
+
+ // Convert a bitcasted integer logic operation that has one bitcasted
+ // floating-point operand and one constant operand into a floating-point
+ // logic operation. This may create a load of the constant, but that is
+ // cheaper than materializing the constant in an integer register and
+ // transferring it to an SSE register or transferring the SSE operand to
+ // integer register and back.
+ unsigned FPOpcode;
+ switch (N0.getOpcode()) {
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ default: return SDValue();
+ }
+ if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
+      (Subtarget->hasSSE2() && VT == MVT::f64)) &&
+     isa<ConstantSDNode>(N0.getOperand(1)) &&
+     N0.getOperand(0).getOpcode() == ISD::BITCAST &&
+     N0.getOperand(0).getOperand(0).getValueType() == VT) {
+   SDValue N000 = N0.getOperand(0).getOperand(0);
+   SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
+   return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
+ }

  return SDValue();
}
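
The shape this combine targets is integer bit-twiddling applied to the bits of a float, e.g. clearing the sign bit. A sketch of such source (illustrative; an fabs call is normally recognized by other means, this only shows the bitcast-logic-bitcast chain that becomes FAND):

    #include <cstdint>
    #include <cstring>

    float clear_sign(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof bits);  // bitcast f32 -> i32
      bits &= 0x7fffffffu;                  // (and (bitcast x), C)
      std::memcpy(&x, &bits, sizeof x);     // bitcast i32 -> f32
      return x;                             // becomes FAND(x, bitcast C), i.e. ANDPS
    }
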
@@ -21910,26 +24128,26 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                     InputVector.getNode()->getOperand(0));

    // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
-   SDValue MMXSrcOp = MMXSrc.getOperand(0);
    if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
-       MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
-       MMXSrcOp.getOpcode() == ISD::BITCAST &&
-       MMXSrcOp.getValueType() == MVT::v1i64 &&
-       MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
-     return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
-                        N->getValueType(0),
-                        MMXSrcOp.getOperand(0));
+       MMXSrc.getValueType() == MVT::i64) {
+     SDValue MMXSrcOp = MMXSrc.getOperand(0);
+     if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
+         MMXSrcOp.getValueType() == MVT::v1i64 &&
+         MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+       return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+                          N->getValueType(0), MMXSrcOp.getOperand(0));
+   }
  }

  EVT VT = N->getValueType(0);

- if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
+ if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
     InputVector.getOpcode() == ISD::BITCAST &&
-     dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
+     isa<ConstantSDNode>(InputVector.getOperand(0))) {
    uint64_t ExtractedElt =
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    uint64_t InputValue =
        cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
    uint64_t Res = (InputValue >> ExtractedElt) & 1;
    return DAG.getConstant(Res, dl, MVT::i1);
  }
@@ -22036,96 +24254,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
  return SDValue();
}

-/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static std::pair<unsigned, bool>
-matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
-                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
-  if (!VT.isVector())
-    return std::make_pair(0, false);
-
-  bool NeedSplit = false;
-  switch (VT.getSimpleVT().SimpleTy) {
-  default: return std::make_pair(0, false);
-  case MVT::v4i64:
-  case MVT::v2i64:
-    if (!Subtarget->hasVLX())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v64i8:
-  case MVT::v32i16:
-    if (!Subtarget->hasBWI())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v16i32:
-  case MVT::v8i64:
-    if (!Subtarget->hasAVX512())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v32i8:
-  case MVT::v16i16:
-  case MVT::v8i32:
-    if (!Subtarget->hasAVX2())
-      NeedSplit = true;
-    if (!Subtarget->hasAVX())
-      return std::make_pair(0, false);
-    break;
-  case MVT::v16i8:
-  case MVT::v8i16:
-  case MVT::v4i32:
-    if (!Subtarget->hasSSE2())
-      return std::make_pair(0, false);
-  }
-
-  // SSE2 has only a small subset of the operations.
-  bool hasUnsigned = Subtarget->hasSSE41() ||
-                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
-  bool hasSigned = Subtarget->hasSSE41() ||
-                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
-
-  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
-  unsigned Opc = 0;
-  // Check for x CC y ? x : y.
-  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
-      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
-    switch (CC) {
-    default: break;
-    case ISD::SETULT:
-    case ISD::SETULE:
-      Opc = hasUnsigned ? ISD::UMIN : 0; break;
-    case ISD::SETUGT:
-    case ISD::SETUGE:
-      Opc = hasUnsigned ? ISD::UMAX : 0; break;
-    case ISD::SETLT:
-    case ISD::SETLE:
-      Opc = hasSigned ? ISD::SMIN : 0; break;
-    case ISD::SETGT:
-    case ISD::SETGE:
-      Opc = hasSigned ? ISD::SMAX : 0; break;
-    }
-  // Check for x CC y ? y : x -- a min/max with reversed arms.
-  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
-             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
-    switch (CC) {
-    default: break;
-    case ISD::SETULT:
-    case ISD::SETULE:
-      Opc = hasUnsigned ? ISD::UMAX : 0; break;
-    case ISD::SETUGT:
-    case ISD::SETUGE:
-      Opc = hasUnsigned ? ISD::UMIN : 0; break;
-    case ISD::SETLT:
-    case ISD::SETLE:
-      Opc = hasSigned ? ISD::SMAX : 0; break;
-    case ISD::SETGT:
-    case ISD::SETGE:
-      Opc = hasSigned ? ISD::SMIN : 0; break;
-    }
-  }
-
-  return std::make_pair(Opc, NeedSplit);
-}
-
static SDValue
transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
@@ -22189,7 +24317,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
  // ignored in unsafe-math mode).
  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-     VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+     VT != MVT::f80 && VT != MVT::f128 &&
+     (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
     (Subtarget->hasSSE2() ||
      (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -22535,32 +24664,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    }
  }

- // Try to match a min/max vector operation.
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
-   std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
-   unsigned Opc = ret.first;
-   bool NeedSplit = ret.second;
-
-   if (Opc && NeedSplit) {
-     unsigned NumElems = VT.getVectorNumElements();
-     // Extract the LHS vectors
-     SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
-     SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
-
-     // Extract the RHS vectors
-     SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
-     SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
-
-     // Create min/max for each subvector
-     LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
-     RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
-
-     // Merge the result
-     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
-   } else if (Opc)
-     return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
-
  // Simplify vector selection if condition value type matches vselect
  // operand type
  if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
@@ -22635,7 +24738,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
     !DCI.isBeforeLegalize() &&
     !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
-   unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+   unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();

    // Don't optimize vector selects that map to mask-registers.
    if (BitWidth == 1)
@@ -22656,14 +24759,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // FIXME: We don't support i16-element blends currently. We could and
    // should support them by making *all* the bits in the condition be set
    // rather than just the high bit and using an i8-element blend.
-   if (VT.getScalarType() == MVT::i16)
+   if (VT.getVectorElementType() == MVT::i16)
      return SDValue();
    // Dynamic blending was only available from SSE4.1 onward.
-   if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+   if (VT.is128BitVector() && !Subtarget->hasSSE41())
      return SDValue();
    // Byte blends are only available in AVX2
-   if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
-       !Subtarget->hasAVX2())
+   if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
      return SDValue();

    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
@@ -22773,12 +24875,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
      SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
-     ConstantSDNode *CS;
-     if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
-         CS->getZExtValue() == 1)
+     if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
-     if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
-         CS->getZExtValue() == 1)
+     if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx == -1)
        break;
@@ -22857,8 +24956,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  if (Cond->getOpcode() == X86ISD::CMP) {
-   ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
-   if (!CondOp1C || !CondOp1C->isNullValue())
+   if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
@@ -23102,106 +25200,15 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
  return SDValue();
}

-static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
-                                                const X86Subtarget *Subtarget) {
-  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-  switch (IntNo) {
-  default: return SDValue();
-  // SSE/AVX/AVX2 blend intrinsics.
-  case Intrinsic::x86_avx2_pblendvb:
-    // Don't try to simplify this intrinsic if we don't have AVX2.
-    if (!Subtarget->hasAVX2())
-      return SDValue();
-    // FALL-THROUGH
-  case Intrinsic::x86_avx_blendv_pd_256:
-  case Intrinsic::x86_avx_blendv_ps_256:
-    // Don't try to simplify this intrinsic if we don't have AVX.
-    if (!Subtarget->hasAVX())
-      return SDValue();
-    // FALL-THROUGH
-  case Intrinsic::x86_sse41_blendvps:
-  case Intrinsic::x86_sse41_blendvpd:
-  case Intrinsic::x86_sse41_pblendvb: {
-    SDValue Op0 = N->getOperand(1);
-    SDValue Op1 = N->getOperand(2);
-    SDValue Mask = N->getOperand(3);
-
-    // Don't try to simplify this intrinsic if we don't have SSE4.1.
-    if (!Subtarget->hasSSE41())
-      return SDValue();
-
-    // fold (blend A, A, Mask) -> A
-    if (Op0 == Op1)
-      return Op0;
-    // fold (blend A, B, allZeros) -> A
-    if (ISD::isBuildVectorAllZeros(Mask.getNode()))
-      return Op0;
-    // fold (blend A, B, allOnes) -> B
-    if (ISD::isBuildVectorAllOnes(Mask.getNode()))
-      return Op1;
-
-    // Simplify the case where the mask is a constant i32 value.
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
-      if (C->isNullValue())
-        return Op0;
-      if (C->isAllOnesValue())
-        return Op1;
-    }
-
-    return SDValue();
-  }
-
-  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
- case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - EVT VT = Op0.getValueType(); - assert(VT.isVector() && "Expected a vector type!"); - - if (isa<BuildVectorSDNode>(Op1)) - Op1 = Op1.getOperand(0); - - if (!isa<ConstantSDNode>(Op1)) - return SDValue(); - - EVT SVT = VT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - - ConstantSDNode *CND = cast<ConstantSDNode>(Op1); - const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); - uint64_t ShAmt = C.getZExtValue(); - - // Don't try to convert this shift into a ISD::SRA if the shift - // count is bigger than or equal to the element size. - if (ShAmt >= SVTBits) - return SDValue(); - - // Trivial case: if the shift count is zero, then fold this - // into the first operand. - if (ShAmt == 0) - return Op0; - - // Replace this packed shift intrinsic with a target independent - // shift dag node. - SDLoc DL(N); - SDValue Splat = DAG.getConstant(C, DL, VT); - return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat); - } - } -} - /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + // An imul is usually smaller than the alternative sequence. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); @@ -23228,9 +25235,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, MulAmt1 = 3; MulAmt2 = MulAmt / 3; } + + SDLoc DL(N); + SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -23239,7 +25248,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, // is an add. std::swap(MulAmt1, MulAmt2); - SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); @@ -23253,10 +25261,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + } + + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) // Do not add new nodes to DAG combiner worklist. 
DCI.CombineTo(N, NewMul, false); - } + return SDValue(); } @@ -23272,18 +25301,34 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY || - ((N00.getOpcode() == ISD::ANY_EXTEND || - N00.getOpcode() == ISD::ZERO_EXTEND) && - N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { - APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); - APInt ShAmt = N1C->getAPIntValue(); - Mask = Mask.shl(ShAmt); - if (Mask != 0) { - SDLoc DL(N); - return DAG.getNode(ISD::AND, DL, VT, - N00, DAG.getConstant(Mask, DL, VT)); - } + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + bool MaskOK = false; + // We can handle cases concerning bit-widening nodes containing setcc_c if + // we carefully interrogate the mask to make sure we are semantics + // preserving. + // The transform is not safe if the result of C1 << C2 exceeds the bitwidth + // of the underlying setcc_c operation if the setcc_c was zero extended. + // Consider the following example: + // zext(setcc_c) -> i32 0x0000FFFF + // c1 -> i32 0x0000FFFF + // c2 -> i32 0x00000001 + // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE + // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if (N00.getOpcode() == ISD::SIGN_EXTEND && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || + N00.getOpcode() == ISD::ANY_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); + } + if (MaskOK && Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } @@ -23304,6 +25349,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). + // However the MOVs have 2 advantages to a SHIFT: + // 1. MOVs can write to a register that differs from source + // 2. 
MOVs accept memory operands + + if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || + N0.getOperand(1).getOpcode() != ISD::Constant) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); + APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); + EVT CVT = N1.getValueType(); + + if (SarConst.isNegative()) + return SDValue(); + + for (MVT SVT : MVT::integer_valuetypes()) { + unsigned ShiftSize = SVT.getSizeInBits(); + // skipping types without corresponding sext/zext and + // ShlConst that is not one of [56,48,32,24,16] + if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + continue; + SDLoc DL(N); + SDValue NN = + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); + SarConst = SarConst - (Size - ShiftSize); + if (SarConst == 0) + return NN; + else if (SarConst.isNegative()) + return DAG.getNode(ISD::SHL, DL, VT, NN, + DAG.getConstant(-SarConst, DL, CVT)); + else + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); + } + return SDValue(); +} + /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. @@ -23321,14 +25419,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { APInt ShiftAmt = AmtSplat->getAPIntValue(); - unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); + unsigned MaxAmount = + VT.getSimpleVT().getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s // if the shift amount is bigger than or equal to // the element size. The constant shift amount will be // encoded as a 8-bit immediate. if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT, Subtarget, DAG, DL); + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); @@ -23342,6 +25441,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (SDValue V = PerformSHLCombine(N, DAG)) return V; + if (N->getOpcode() == ISD::SRA) + if (SDValue V = PerformSRACombine(N, DAG)) + return V; + // Try to fold this logical shift into a zero vector. if (N->getOpcode() != ISD::SRA) if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) @@ -23537,7 +25640,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. 
N0 = N0->getOperand(0); if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), SDValue(RHSConstSplat, 0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); @@ -23552,9 +25655,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: { - unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); + unsigned InBits = NarrowVT.getScalarSizeInBits(); APInt Mask = APInt::getAllOnesValue(InBits); - Mask = Mask.zext(VT.getScalarType().getSizeInBits()); + Mask = Mask.zext(VT.getScalarSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(Mask, DL, VT)); } @@ -23656,6 +25759,41 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(N0.getValueType(), NewShuffle); } +/// If both input operands of a logic op are being cast from floating point +/// types, try to convert this into a floating point logic node to avoid +/// unnecessary moves from SSE to integer registers. +static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned FPOpcode = ISD::DELETED_NODE; + if (N->getOpcode() == ISD::AND) + FPOpcode = X86ISD::FAND; + else if (N->getOpcode() == ISD::OR) + FPOpcode = X86ISD::FOR; + else if (N->getOpcode() == ISD::XOR) + FPOpcode = X86ISD::FXOR; + + assert(FPOpcode != ISD::DELETED_NODE && + "Unexpected input node for FP logic conversion"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + ((Subtarget->hasSSE1() && VT == MVT::i32) || + (Subtarget->hasSSE2() && VT == MVT::i64))) { + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + } + return SDValue(); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -23668,6 +25806,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -23728,6 +25869,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); @@ -23799,7 +25943,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (!Subtarget->hasSSE41()) return SDValue(); - EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; + MVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); @@ -23813,9 +25957,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -23913,17 +26055,188 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +// Try to turn tests against the signbit in the form of: +// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +// into: +// SETGT(X, -1) +static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { + // This is only worth doing if the output type is i8. + if (N->getValueType(0) != MVT::i8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We should be performing an xor against a truncated shift. + if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) + return SDValue(); + + // Make sure we are performing an xor against one. + if (!isOneConstant(N1)) + return SDValue(); + + // SetCC on x86 zero extends so only act on this if it's a logical shift. + SDValue Shift = N0.getOperand(0); + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) + return SDValue(); + + // Make sure we are truncating from one of i16, i32 or i64. + EVT ShiftTy = Shift.getValueType(); + if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) + return SDValue(); + + // Make sure the shift amount extracts the sign bit. + if (!isa<ConstantSDNode>(Shift.getOperand(1)) || + Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. + // N.B. Using SETGE against 0 works but we want a canonical looking + // comparison, using SETGT matches up with what TranslateX86CC. + SDLoc DL(N); + SDValue ShiftOp = Shift.getOperand(0); + EVT ShiftOpTy = ShiftOp.getValueType(); + SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + return Cond; +} + static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + if (Subtarget->hasCMov()) if (SDValue RV = performIntegerAbsCombine(N, DAG)) return RV; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + return SDValue(); +} + +/// This function detects the AVG pattern between vectors of unsigned i8/i16, +/// which is c = (a + b + 1) / 2, and replace this operation with the efficient +/// X86ISD::AVG instruction. 
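// Aside: a standalone check of the arithmetic detectAVGPattern matches,
// assuming unsigned 8-bit lanes (illustrative only, not part of the patch).
// The zero extension makes the rounding +1 safe from wraparound, which is
// exactly the PAVGB semantics.
#include <cstdint>
static uint8_t avgRoundedUp(uint8_t A, uint8_t B) {
  uint32_t Wide = (uint32_t)A + (uint32_t)B + 1; // cannot overflow in i32
  return (uint8_t)(Wide >> 1);                   // lshr by one, then truncate
}
// avgRoundedUp(255, 255) == 255, while an eight-bit (A + B + 1) / 2 would
// have wrapped to 127; that is why the matched IR widens before adding.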
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget, SDLoc DL) {
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+        isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in the AVG pattern and it should be
+  // greater than the original input type (i8/i16).
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (Subtarget->hasAVX512()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget->hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %4, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda that checks whether the given SDValue is a constant vector with
+  // every element in the range [Min, Max].
+  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C)
+        return false;
+      uint64_t Val = C->getZExtValue();
+      if (Val < Min || Val > Max)
+        return false;
+    }
+    return true;
+  };
+
+  // Check that each element of the vector is logically shifted right by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorInRange(RHS, 1, 1))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+
+  // Take care of the case when one of the operands is a constant vector whose
+  // elements are in the range [1, 256].
+  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+      Operands[0].getOperand(0).getValueType() == VT) {
+    // The pattern is detected. Subtract one from the constant vector, then
+    // demote it and emit the X86ISD::AVG instruction.
+    SDValue One = DAG.getConstant(1, DL, InScalarVT);
+    SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+                               SmallVector<SDValue, 8>(NumElems, One));
+    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1]);
+  }
+
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+ for (int i = 0; i < 3; ++i) { + if (!IsConstVectorInRange(Operands[i], 1, 1)) + continue; + std::swap(Operands[i], Operands[2]); + + // Check if Operands[0] and Operands[1] are results of type promotion. + for (int j = 0; j < 2; ++j) + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + + // The pattern is detected, emit X86ISD::AVG instruction. + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1].getOperand(0)); + } + return SDValue(); } @@ -23940,10 +26253,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); + bool Fast; + unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && + Ext == ISD::NON_EXTLOAD && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24012,8 +26328,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); } @@ -24026,8 +26342,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -24055,7 +26371,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::NON_EXTLOAD); SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); - } /// PerformMSTORECombine - Resolve truncating stores static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24073,6 +26388,15 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. 
+ if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); @@ -24096,12 +26420,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -24133,8 +26457,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24148,10 +26473,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. + bool Fast; + unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - StVT == VT && !IsAligned) { + if (VT.is256BitVector() && StVT == VT && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24178,12 +26505,29 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. + SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. 
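// Aside: the truncating-store paths above widen the stored value and shuffle
// narrow element I to wide position I * SizeRatio so the elements become
// contiguous. A small sketch of how such a mask is built, using the usual -1
// convention for undef lanes (illustrative only, not part of the patch):
#include <vector>
static std::vector<int> truncStoreShuffleMask(unsigned NumElems,
                                              unsigned SizeRatio) {
  std::vector<int> Mask(NumElems * SizeRatio, -1); // unused lanes stay undef
  for (unsigned I = 0; I != NumElems; ++I)
    Mask[I] = (int)(I * SizeRatio); // pick the lane holding narrow element I
  return Mask;
}
// For NumElems = 4 and SizeRatio = 2 this yields {0, 2, 4, 6, -1, -1, -1, -1}.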
@@ -24306,7 +26650,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget->is64Bit() || F64IsLegal) { - EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), @@ -24539,8 +26883,234 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (OutSVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. + auto RegNum = Regs.size(); + for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); + j < e; j *= 2, RegNum /= 2) { + for (unsigned i = 0; i < RegNum; i++) + Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); + for (unsigned i = 0; i < RegNum / 2; i++) + Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], + Regs[i * 2 + 1]); + } + + // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and + // then extract a subvector as the result since v8i8 is not a legal type. + if (OutVT == MVT::v8i8) { + Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); + Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], + DAG.getIntPtrConstant(0, DL)); + return Regs[0]; + } else if (RegNum > 1) { + Regs.resize(RegNum); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. +static SDValue +combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); + EVT OutVT = N->getValueType(0); + SDLoc DL(N); + + // Shift left by 16 bits, then arithmetic-shift right by 16 bits. 
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with
+/// each element extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on them.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides
+  // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS
+  // to truncate 2 x v4i32 to v8i16.
+  if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
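// Aside: the PACKSS path above depends on a sign-fill trick. Shifting each
// i32 lane left by 16 and then arithmetic-shifting right by 16 leaves a value
// in [-32768, 32767], so the signed-saturating pack can never clamp and acts
// as a plain truncation. One-lane sketch, assuming arithmetic right shift on
// signed int (illustrative only, not part of the patch):
#include <cstdint>
static int16_t packssLane(int32_t X) {
  int32_t SignFilled = (int32_t)((uint32_t)X << 16) >> 16; // sign-fill top half
  return (int16_t)SignFilled; // saturation is a no-op on this range
}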
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  SDValue Arg = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating an FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once they become available.
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                       Arg.getOperand(1), Zero);
+  }
+
+  // If we're negating an FMA node, then we can adjust the
+  // instruction to include the extra negation.
+  if (Arg.hasOneUse()) {
+    switch (Arg.getOpcode()) {
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    }
+  }
+  return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under the DQ
+    // extension. These logic operations may be executed in the integer domain.
+    SDLoc dl(N);
+    MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
+    MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
+
+    SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+    unsigned IntOpcode = 0;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unexpected FP logic op");
+    case X86ISD::FOR: IntOpcode = ISD::OR; break;
+    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+    case X86ISD::FAND: IntOpcode = ISD::AND; break;
+    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+    }
+    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+    return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+  }
+  return SDValue();
+}
 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
-static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
   // F[X]OR(0.0, x) -> x
@@ -24552,7 +27122,8 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(0);
-  return SDValue();
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
 }
 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
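// Aside: the FNEG-of-FMA rewrites above are plain sign algebra; negating each
// fused form flips it to its counterpart:
//   -(a*b + c)    == -(a*b) - c   (FMADD  -> FNMSUB)
//   -(a*b - c)    == -(a*b) + c   (FMSUB  -> FNMADD)
//   -(-(a*b) + c) ==  a*b - c     (FNMADD -> FMSUB)
//   -(-(a*b) - c) ==  a*b + c     (FNMSUB -> FMADD)
// A scalar reference for the first mapping (illustrative only, not part of
// the patch; the real nodes are fused, so only rounding differs):
static double fnmsubRef(double A, double B, double C) { return -(A * B) - C; }
// fnmsubRef(A, B, 0.0) also models the negated-FMUL case; the signed-zero
// corner cases introduced by the extra zero addend are why that path requires
// the no-signed-zeros flag.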
@@ -24576,8 +27147,65 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (Subtarget->useSoftFloat()) + return SDValue(); + + // TODO: Check for global or instruction-level "nnan". In that case, we + // should be able to lower to FMAX/FMIN alone. + // TODO: If an operand is already known to be a NaN or not a NaN, this + // should be an optional swap and FMAX/FMIN. + + EVT VT = N->getValueType(0); + if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + return SDValue(); + + // This takes at least 3 instructions, so favor a library call when operating + // on a scalar and minimizing code size. + if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + /// Do target-specific dag combines on X86ISD::FAND nodes. 
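// Aside: a scalar model of the NaN handling in performFMinNumFMaxNumCombine
// above, assuming x86 maxsd semantics where the second source passes through
// whenever the compare is false, including on NaN (illustrative only, not
// part of the patch):
#include <cmath>
static double x86MaxRef(double A, double B) { return A > B ? A : B; }
static double fmaxnumRef(double Op0, double Op1) {
  double MinOrMax = x86MaxRef(Op1, Op0); // a NaN in either input yields Op0
  bool IsOp0Nan = std::isnan(Op0);       // the unordered self-compare (SETUO)
  return IsOp0Nan ? Op1 : MinOrMax;      // select away the NaN input
}
// If Op0 is NaN we return Op1 (a number, or Op1's NaN when both are NaN); if
// only Op1 is NaN the max already picked Op0; otherwise it is the ordinary
// maximum, matching the 2x2 table in the comment above.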
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24588,11 +27216,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24603,7 +27232,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -24673,6 +27302,83 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// sext(add_nsw(x, C)) --> add(sext(x), C_sext) +/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities +/// to combine math ops, use an LEA, or use a complex addressing mode. This can +/// eliminate extend, add, and shift instructions. +static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // TODO: This should be valid for other integer types. + EVT VT = Sext->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + // We need an 'add nsw' feeding into the 'sext'. + SDValue Add = Sext->getOperand(0); + if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) + return SDValue(); + + // Having a constant operand to the 'add' ensures that we are not increasing + // the instruction count because the constant is extended for free below. + // A constant operand can also become the displacement field of an LEA. + auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); + if (!AddOp1) + return SDValue(); + + // Don't make the 'add' bigger if there's no hope of combining it with some + // other 'add' or 'shl' instruction. + // TODO: It may be profitable to generate simpler LEA instructions in place + // of single 'add' instructions, but the cost model for selecting an LEA + // currently has a high threshold. + bool HasLEAPotential = false; + for (auto *User : Sext->uses()) { + if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { + HasLEAPotential = true; + break; + } + } + if (!HasLEAPotential) + return SDValue(); + + // Everything looks good, so pull the 'sext' ahead of the 'add'. + int64_t AddConstant = AddOp1->getSExtValue(); + SDValue AddOp0 = Add.getOperand(0); + SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); + SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); + + // The wider add is guaranteed to not wrap because both operands are + // sign-extended. 
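// Aside: concretely, both sext(x) and the extended constant lie in
// [-2^31, 2^31 - 1], so their i64 sum is exact and cannot wrap.
// A standalone sketch of the equivalence being exploited, valid only when the
// i32 add really was nsw (illustrative only, not part of the patch):
#include <cstdint>
static int64_t beforePromotion(int32_t X, int32_t C) {
  return (int64_t)(X + C); // sext(add nsw (x, C)); X + C must not overflow
}
static int64_t afterPromotion(int32_t X, int32_t C) {
  return (int64_t)X + (int64_t)C; // add(sext(x), C_sext), the LEA-friendly form
}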
+ SDNodeFlags Flags; + Flags.setNoSignedWrap(true); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); +} + +/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> +/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) +/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly +/// extends from AH (which we otherwise need to do contortions to access). +static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + auto OpcodeN = N->getOpcode(); + auto OpcodeN0 = N0.getOpcode(); + if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || + (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT InVT = N0.getValueType(); + if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) + return SDValue(); + + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG + : X86ISD::UDIVREM8_ZEXT_HREG; + SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), + N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -24683,18 +27389,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, EVT InSVT = InVT.getScalarType(); SDLoc DL(N); - // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> - // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) - // This exposes the sext to the sdivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). - if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - InVT == MVT::i8 && VT == MVT::i32) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { @@ -24763,13 +27459,13 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, } } - if (!Subtarget->hasFp256()) - return SDValue(); - - if (VT.isVector() && VT.getSizeInBits() == 256) + if (Subtarget->hasAVX() && VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; + if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) + return NewAdd; + return SDValue(); } @@ -24783,9 +27479,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -24830,8 +27524,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!C || C->getZExtValue() != 1) + if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, @@ -24856,19 +27549,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = 
WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - // (i8,i32 zext (udivrem (i8 x, i8 y)) -> - // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) - // This exposes the zext to the udivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). - if (N0.getOpcode() == ISD::UDIVREM && - N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && - (VT == MVT::i32 || VT == MVT::i64)) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; return SDValue(); } @@ -24884,21 +27566,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) - if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) - if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { @@ -24936,75 +27616,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, - SelectionDAG &DAG) { - SDLoc dl(Load); - MVT VT = Load->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue Addr = Load->getOperand(1); - SDValue NewAddr = DAG.getNode( - ISD::ADD, dl, Addr.getSimpleValueType(), Addr, - DAG.getConstant(Index * EVT.getStoreSize(), dl, - Addr.getSimpleValueType())); - - SDValue NewLoad = - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, - DAG.getMachineFunction().getMachineMemOperand( - Load->getMemOperand(), 0, EVT.getStoreSize())); - return NewLoad; -} - -static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDLoc dl(N); - MVT VT = N->getOperand(1)->getSimpleValueType(0); - assert((VT == MVT::v4f32 || VT == MVT::v4i32) && - "X86insertps is only defined for v4x32"); - - SDValue Ld = N->getOperand(1); - if (MayFoldLoad(Ld)) { - // Extract the countS bits from the immediate so we can get the proper - // address when narrowing the vector load to a specific element. 
-    // When the second source op is a memory address, insertps doesn't use
-    // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
-    // Create this as a scalar to vector to match the instruction pattern.
-    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-    // countS bits are ignored when loading from memory on insertps, which
-    // means we don't need to explicitly set them to 0.
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                       LoadScalarToVector, N->getOperand(2));
-  }
-  return SDValue();
-}
-
-static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
-  SDValue V0 = N->getOperand(0);
-  SDValue V1 = N->getOperand(1);
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
-  // operands and changing the mask to 1. This saves us a bunch of
-  // pattern-matching possibilities related to scalar math ops in SSE/AVX.
-  // x86InstrInfo knows how to commute this back after instruction selection
-  // if it would help register allocation.
-
-  // TODO: If optimizing for size or a processor that doesn't suffer from
-  // partial register update stalls, this should be transformed into a MOVSD
-  // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
-
-  if (VT == MVT::v2f64)
-    if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
-      if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
-        SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
-        return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
-      }
-
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1. So the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG may be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+    NewOps[2] = Mask.getOperand(0);
+    DAG.UpdateNodeOperands(N, NewOps);
+  }
   return SDValue();
 }
@@ -25182,7 +27804,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   // a 32-bit target where SSE doesn't support i64->FP operations.
-  if (Op0.getOpcode() == ISD::LOAD) {
+  if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
     EVT LdVT = Ld->getValueType(0);
@@ -25357,15 +27979,14 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
   }
   // Check if we can bypass extracting and re-inserting an element of an input
-  // vector. Essentialy:
+  // vector. Essentially:
   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
     SDValue ExtractedV = V.getOperand(0);
     SDValue OrigV = ExtractedV.getOperand(0);
-    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
-      if (ExtractIdx->getZExtValue() == 0) {
+    if (isNullConstant(ExtractedV.getOperand(1))) {
       MVT OrigVT = OrigV.getSimpleValueType();
       // Extract a subvector if necessary...
       if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
@@ -25394,7 +28015,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND:
     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
-  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG);
+  case ISD::BITCAST:        return PerformBITCASTCombine(N, DAG, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
@@ -25414,12 +28035,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
+  case ISD::FNEG:           return PerformFNEGCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTRUNCATECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
-  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FOR:         return PerformFORCombine(N, DAG, Subtarget);
   case X86ISD::FMIN:
   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
-  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
-  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:        return performFMinNumFMaxNumCombine(N, DAG,
+                                                                Subtarget);
+  case X86ISD::FAND:        return PerformFANDCombine(N, DAG, Subtarget);
+  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG, Subtarget);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ANY_EXTEND:
@@ -25433,6 +28059,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGNR:
+  case X86ISD::BLENDI:
   case X86ISD::UNPCKH:
   case X86ISD::UNPCKL:
   case X86ISD::MOVHLPS:
@@ -25447,14 +28074,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case ISD::INTRINSIC_WO_CHAIN:
-    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS: {
-    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return PerformINSERTPSCombine(N, DAG, Subtarget);
-    break;
-  }
-  case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
+  case ISD::MGATHER:
+  case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
   }
   return SDValue();
@@ -25489,6 +28110,18 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   }
 }
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  return any_of(MRI.reg_instructions(X86::EFLAGS),
+                [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
 /// IsDesirableToPromoteOp - This method queries the target whether it is
 /// beneficial for the dag combiner to promote the specified node. If true, it
 /// should return the desired promotion type by reference.
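// Aside: hasCopyImplyingStackAdjustment above is a single any_of scan over
// every instruction that reads or writes EFLAGS. A generic standalone sketch
// of the idiom with a stand-in Instr type instead of MachineInstr
// (illustrative only, not part of the patch):
#include <algorithm>
#include <vector>
struct Instr {
  bool IsCopy;
};
static bool hasCopyOfFlags(const std::vector<Instr> &FlagUsers) {
  return std::any_of(FlagUsers.begin(), FlagUsers.end(),
                     [](const Instr &I) { return I.IsCopy; });
}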
@@ -26084,6 +28717,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: @@ -26168,17 +28802,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { unsigned Size = VT.getSizeInBits(); - MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8 - : Size == 16 ? MVT::i16 - : Size == 32 ? MVT::i32 - : Size == 64 ? MVT::i64 - : MVT::Other; - unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy); + if (Size == 1) Size = 8; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { Res.first = DestReg; - Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass - : SimpleTy == MVT::i16 ? &X86::GR16RegClass - : SimpleTy == MVT::i32 ? &X86::GR32RegClass + Res.second = Size == 8 ? &X86::GR8RegClass + : Size == 16 ? &X86::GR16RegClass + : Size == 32 ? &X86::GR32RegClass : &X86::GR64RegClass; assert(Res.second->contains(Res.first) && "Register in register class"); } else { @@ -26196,6 +28826,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // target independent register mapper will just pick the first match it can // find, ignoring the required type. + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) @@ -26244,6 +28875,63 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool X86TargetLowering::isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on x86 is expensive. However, when aggressively optimizing + // for code size, we prefer to use a div instruction, as it is usually smaller + // than the alternative sequence. + // The exception to this is vector division. Since x86 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. + bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); + return OptSize && !VT.isVector(); +} + +void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + if (!Subtarget->is64Bit()) + return; + + // Update IsSplitCSR in X86MachineFunctionInfo. 
+ X86MachineFunctionInfo *AFI = + Entry->getParent()->getInfo<X86MachineFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void X86TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (X86::GR64RegClass.contains(*I)) + RC = &X86::GR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 723d530..0ab786e 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -126,6 +126,9 @@ namespace llvm { /// 1 is the number of bytes of stack to pop. RET_FLAG, + /// Return from interrupt. Operand 0 is the number of bytes to pop. + IRET, + /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, @@ -182,6 +185,8 @@ namespace llvm { /// Compute Sum of Absolute Differences. PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, /// Bitwise Logical AND NOT of Packed FP values. ANDNP, @@ -211,6 +216,8 @@ namespace llvm { // FP vector get exponent FGETEXP_RND, + // Extract Normalized Mantissas + VGETMANT, // FP Scale SCALEF, // Integer add/sub with unsigned saturation. @@ -236,6 +243,9 @@ namespace llvm { // Integer absolute value ABS, + // Detect Conflicts Within a Vector + CONFLICT, + /// Floating point max and min. FMAX, FMIN, @@ -282,9 +292,8 @@ namespace llvm { // Vector integer truncate. VTRUNC, - - // Vector integer truncate with mask. - VTRUNCM, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, // Vector FP extend. VFPEXT, @@ -295,6 +304,9 @@ namespace llvm { // Vector signed/unsigned integer to double. CVTDQ2PD, CVTUDQ2PD, + // Convert a vector to mask, set bits base on MSB. + CVT2MASK, + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -304,6 +316,9 @@ namespace llvm { // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, + // Bit rotate by immediate + VROTLI, VROTRI, + // Vector packed double/float comparison. CMPP, @@ -349,6 +364,7 @@ namespace llvm { // OR/AND test for masks KORTEST, + KTEST, // Several flavors of instructions with vector shuffle behaviors. 
PACKSS, @@ -382,12 +398,24 @@ namespace llvm { VPERMIV3, VPERMI, VPERM2X128, - //Fix Up Special Packed Float32/64 values + // Bitwise ternary logic + VPTERNLOG, + // Fix Up Special Packed Float32/64 values VFIXUPIMM, - //Range Restriction Calculation For Packed Pairs of Float32/64 values + // Range Restriction Calculation For Packed Pairs of Float32/64 values VRANGE, + // Reduce - Perform Reduction Transformation on scalar\packed FP + VREDUCE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits + VRNDSCALE, + // VFPCLASS - Tests Types Of a FP Values for packed types. + VFPCLASS, + // VFPCLASSS - Tests Types Of a FP Values for scalar types. + VFPCLASSS, // Broadcast scalar to vector VBROADCAST, + // Broadcast mask to vector + VBROADCASTM, // Broadcast subvector to vector SUBV_BROADCAST, // Insert/Extract vector element @@ -397,13 +425,21 @@ namespace llvm { /// SSE4A Extraction and Insertion. EXTRQI, INSERTQI, + // XOP variable/immediate rotations + VPROT, VPROTI, + // XOP arithmetic/logical shifts + VPSHA, VPSHL, + // XOP signed/unsigned integer comparisons + VPCOM, VPCOMU, + // Vector multiply packed unsigned doubleword integers PMULUDQ, // Vector multiply packed signed doubleword integers PMULDQ, // Vector Multiply Packed UnsignedIntegers with Round and Scale MULHRS, - + // Multiply and Add Packed Integers + VPMADDUBSW, VPMADDWD, // FMA nodes FMADD, FNMADD, @@ -418,7 +454,6 @@ namespace llvm { FNMSUB_RND, FMADDSUB_RND, FMSUBADD_RND, - RNDSCALE, // Compress and expand COMPRESS, @@ -443,9 +478,6 @@ namespace llvm { // falls back to heap allocation if not. SEG_ALLOCA, - // Windows's _ftol2 runtime routine to do fptoui. - WIN_FTOL, - // Memory barrier MEMBARRIER, MFENCE, @@ -580,15 +612,6 @@ namespace llvm { bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt); - /// AVX512 static rounding constants. These need to match the values in - /// avx512fintrin.h. - enum STATIC_ROUNDING { - TO_NEAREST_INT = 0, - TO_NEG_INF = 1, - TO_POS_INF = 2, - TO_ZERO = 3, - CUR_DIRECTION = 4 - }; } //===--------------------------------------------------------------------===// @@ -677,6 +700,10 @@ namespace llvm { /// and some i16 instructions are slow. bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + /// Return true if the MachineFunction contains a COPY which would imply + /// HasOpaqueSPAdjustment. + bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; @@ -813,6 +840,13 @@ namespace llvm { /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map + /// to a MemIntrinsicNode (touches memory). If this is the case, it returns + /// true and stores the intrinsic information into the IntrinsicInfo that was + /// passed to the function. + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -850,16 +884,7 @@ namespace llvm { /// register, not on the X87 floating point stack. 
    bool isScalarFPTypeInSSEReg(EVT VT) const {
      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
-            (VT == MVT::f32 && X86ScalarSSEf32);  // f32 is when SSE1
-    }
-
-    /// Return true if the target uses the MSVC _ftol2 routine for fptoui.
-    bool isTargetFTOL() const;
-
-    /// Return true if the MSVC _ftol2 routine should be used for fptoui to the
-    /// given type.
-    bool isIntegerTypeFTOL(EVT VT) const {
-      return isTargetFTOL() && VT == MVT::i64;
+            (VT == MVT::f32 && X86ScalarSSEf32);  // f32 is when SSE1
    }

    /// \brief Returns true if it is beneficial to convert a load of a constant
@@ -879,6 +904,16 @@ namespace llvm {
    unsigned getRegisterByName(const char* RegName, EVT VT,
                               SelectionDAG &DAG) const override;

+   /// If a physical register, this returns the register that receives the
+   /// exception address on entry to an EH pad.
+   unsigned
+   getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+   /// If a physical register, this returns the register that receives the
+   /// exception typeid on entry to a landing pad.
+   unsigned
+   getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
    /// This method returns a target specific FastISel object,
    /// or null if the target does not support "fast" ISel.
    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -890,6 +925,11 @@ namespace llvm {
    bool getStackCookieLocation(unsigned &AddressSpace,
                                unsigned &Offset) const override;

+   /// If the target stores the SafeStack pointer at a fixed offset in some
+   /// non-standard address space, this returns the address of that location
+   /// (the address space and offset are target-defined).
+   Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
                      SelectionDAG &DAG) const;

@@ -899,6 +939,8 @@ namespace llvm {
    /// \brief Customize the preferred legalization strategy for certain types.
    LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;

+   bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
  protected:
    std::pair<const TargetRegisterClass *, uint8_t>
    findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -908,7 +950,6 @@ namespace llvm {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;
-   const DataLayout *TD;

    /// Select between SSE or x87 floating point ops.
    /// When SSE is available, use it for f32 operations.
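The isIntDivCheap override declared in this hunk (its body appears in the X86ISelLowering.cpp hunk earlier) encodes a size-versus-speed trade: under minsize a scalar division by a constant keeps the small div instruction, while at higher optimization levels it is strength-reduced to a multiply-by-magic-constant sequence; vector divisions are excluded because x86 has no vector integer divide and the expansion would have to scalarize. A rough illustration (divideBy10 is illustrative; the described output is typical codegen, not guaranteed):

// Built for x86 with -Oz (minsize), the division below typically stays a
// single small 'div' instruction; at -O2 it becomes the larger
// magic-number multiply plus shift sequence.
unsigned divideBy10(unsigned X) {
  return X / 10;
}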
@@ -955,7 +996,6 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const; - bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const; @@ -969,7 +1009,6 @@ namespace llvm { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; @@ -994,9 +1033,9 @@ namespace llvm { SDValue LowerToBT(SDValue And, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; @@ -1028,6 +1067,15 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; @@ -1042,27 +1090,16 @@ namespace llvm { const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; - bool needsCmpXchgNb(const Type *MemType) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, max, min, umax, umin). It takes the corresponding instruction to - /// expand, the associated machine basic block, and the associated X86 - /// opcodes for reg/reg. - MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, add, sub, swap) for 64-bit operands on 32-bit target. - MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const; + bool needsCmpXchgNb(Type *MemType) const; // Utility function to emit the low-level va_arg code for X86-64. 
    MachineBasicBlock *EmitVAARG64WithCustomInserter(
@@ -1077,18 +1114,24 @@ namespace llvm {
    MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
                                         MachineBasicBlock *BB) const;

+   MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I,
+                                          MachineBasicBlock *BB) const;
+
    MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
                                            MachineBasicBlock *BB) const;

+   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+   MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
                                            MachineBasicBlock *BB) const;

    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
                                          MachineBasicBlock *BB) const;

-   MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
-                                         MachineBasicBlock *BB) const;
-
    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
                                        MachineBasicBlock *MBB) const;

@@ -1121,7 +1164,7 @@ namespace llvm {
                                 unsigned &RefinementSteps) const override;

    /// Reassociate floating point divisions into multiply by reciprocal.
-   bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+   unsigned combineRepeatedFPDivisors() const override;
  };

  namespace X86 {
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
index faa9150..49be648 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -79,7 +79,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
                  !if (!eq (TypeVariantName, "i"),
                       !if (!eq (Size, 128), "v2i64",
                       !if (!eq (Size, 256), "v4i64",
-                      !if (!eq (Size, 512),
+                      !if (!eq (Size, 512),
                           !if (!eq (EltSize, 64), "v8i64", "v16i32"),
                      VTName))), VTName));
@@ -145,6 +145,8 @@ def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
// We map scalar types to the smallest (128-bit) vector type
// with the appropriate element type. This allows us to use the same masking logic.
+def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
+def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
@@ -186,7 +188,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
  let isCommutable = IsCommutable in
    def NAME: AVX512<O, F, Outs, Ins,
                     OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
-                               "$dst , "#IntelSrcAsm#"}",
+                               "$dst, "#IntelSrcAsm#"}",
                     Pattern, itin>;

  // Prefer over VMOV*rrk Pat<>
@@ -274,6 +276,22 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
                     OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                     (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;

+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
+// operand differs from the output VT. This requires a bitconvert on
+// the preserved vector going into the vselect.
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, + X86VectorVTInfo InVT, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, OutVT, Outs, + !con((ins InVT.RC:$src1), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect InVT.KRCWM:$mask, RHS, + (bitconvert InVT.RC:$src1))>; + multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, @@ -305,18 +323,16 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, list<dag> Pattern, - list<dag> MaskingPattern, - string Round = "", - InstrItinClass itin = NoItinerary> { + list<dag> MaskingPattern> { def NAME: AVX512<O, F, Outs, Ins, - OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# - "$dst "#Round#", "#IntelSrcAsm#"}", - Pattern, itin>; + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst, "#IntelSrcAsm#"}", + Pattern, NoItinerary>; def NAME#k: AVX512<O, F, Outs, MaskingIns, - OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"# - "$dst {${mask}}, "#IntelSrcAsm#Round#"}", - MaskingPattern, itin>, EVEX_K; + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern, NoItinerary>, EVEX_K; } multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, @@ -324,33 +340,27 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Ins, dag MaskingIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskingRHS, - string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS, dag MaskingRHS> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.KRC:$dst, RHS)], - [(set _.KRC:$dst, MaskingRHS)], - Round, NoItinerary>; + [(set _.KRC:$dst, MaskingRHS)]>; multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS> : AVX512_maskable_common_cmp<O, F, _, Outs, Ins, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (and _.KRCWM:$mask, RHS), - Round, itin>; + (and _.KRCWM:$mask, RHS)>; multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, - AttSrcAsm, IntelSrcAsm, - [],[],"", NoItinerary>; + AttSrcAsm, IntelSrcAsm, [],[]>; // Bitcasts between 512-bit vector types. 
Return the original type since // no instruction is needed for the conversion @@ -471,84 +481,123 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // - -multiclass vinsert_for_size_no_alt<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> { +multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { - def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, From.RC:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1), - (From.VT From.RC:$src2), - (iPTR imm)))]>, - EVEX_4V, EVEX_V512; + defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V; - let mayLoad = 1 in - def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - []>, - EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>; - } -} - -multiclass vinsert_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> : - vinsert_for_size_no_alt<Opcode, From, To, - vinsert_insert, INSERT_get_vinsert_imm> { - // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for - // vinserti32x4. Only add this if 64x2 and friends are not supported - // natively via AVX512DQ. 
- let Predicates = [NoDQI] in + let mayLoad = 1 in + defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } +} + +multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { + let Predicates = p in { def : Pat<(vinsert_insert:$ins - (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr") - VR512:$src1, From.RC:$src2, - (INSERT_get_vinsert_imm VR512:$ins)))>; + (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rm") + To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + } } multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, ValueType EltVT64, int Opcode256> { - defm NAME # "32x4" : vinsert_for_size<Opcode128, + + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vinsert128_insert>, EVEX_V256; + + defm NAME # "32x4Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 4, EltVT32, VR128X>, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 2, EltVT64, VR128X>, + vinsert128_insert>, EVEX_V512; + + defm NAME # "64x4Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>; - let Predicates = [HasDQI] in - defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128, + vinsert256_insert>, VEX_W, EVEX_V512; + + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vinsert128_insert>, VEX_W, EVEX_V256; + + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>, VEX_W; - defm NAME # "64x4" : vinsert_for_size<Opcode256, - X86VectorVTInfo< 4, EltVT64, VR256X>, - X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>, VEX_W; - let Predicates = [HasDQI] in - defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256, - X86VectorVTInfo< 8, EltVT32, VR256X>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>; + vinsert128_insert>, VEX_W, EVEX_V512; + + defm NAME # "32x8Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert>, EVEX_V512; + } } defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; +// Codegen pattern with the alternative types, +// Only add this if 64x2 and its friends are not supported natively 
via AVX512DQ.
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+
+// Codegen pattern with the alternative types insert VEC128 into VEC256
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+// Codegen pattern with the alternative types insert VEC128 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+// Codegen pattern with the alternative types insert VEC256 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
// vinsertps - insert f32 to XMM
def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
      (ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
@@ -566,90 +615,158 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
// AVX-512 VECTOR EXTRACT
//---

+multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From,
+                                                     X86VectorVTInfo To> {
+  // A subvector extract from the first vector position is
+  // a subregister copy that needs no instruction.
+  def NAME # To.NumElts:
+      Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))),
+          (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>;
+}
+
 multiclass vextract_for_size<int Opcode,
-                             X86VectorVTInfo From, X86VectorVTInfo To,
-                             X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo,
-                             PatFrag vextract_extract,
-                             SDNodeXForm EXTRACT_get_vextract_imm> {
+                             X86VectorVTInfo From, X86VectorVTInfo To,
+                             PatFrag vextract_extract> :
+  vextract_for_size_first_position_lowering<From, To> {

+  let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+    // Use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
+    // vextract_extract); we are interested only in patterns without a mask,
+    // as the intrinsic patterns are matched separately below.
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), - (ins VR512:$src1, u8imm:$idx), - "vextract" # To.EltTypeName # "x4", + (ins From.RC:$src1, i32u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts, "$idx, $src1", "$src1, $idx", - [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), + [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)))]>, - AVX512AIi8Base, EVEX, EVEX_V512; - let mayStore = 1 in - def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), - (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2), - "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" - "$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; - } - - // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for - // vextracti32x4 - def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, - (EXTRACT_get_vextract_imm To.RC:$ext)))>; - - // A 128/256-bit subvector extract from the first 512-bit vector position is - // a subregister copy that needs no instruction. - def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), - (To.VT - (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; - - // And for the alternative types. - def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), - (AltTo.VT - (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + AVX512AIi8Base, EVEX; + let mayStore = 1 in { + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX; + + def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, To.KRCWM:$mask, + From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst {${mask}}|" + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, EVEX; + }//mayStore = 1 + } // Intrinsic call with masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0, - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrk") + To.RC:$src0, + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call with zero-masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrkz") - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrkz") + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call without masking. 
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), - (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rr") + From.RC:$src1, imm:$idx)>; +} + +// Codegen pattern for the alternative types +multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> : + vextract_for_size_first_position_lowering<From, To> { + + let Predicates = p in + def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; } -multiclass vextract_for_type<ValueType EltVT32, int Opcode32, - ValueType EltVT64, int Opcode64> { - defm NAME # "32x4" : vextract_for_size<Opcode32, +multiclass vextract_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>; - defm NAME # "64x4" : vextract_for_size<Opcode64, + vextract128_extract>, + VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vextract_for_size<Opcode128, X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; + defm NAME # "32x8Z" : vextract_for_size<Opcode256, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - vextract256_extract, - EXTRACT_get_vextract256_imm>, VEX_W; + X86VectorVTInfo< 8, EltVT32, VR256X>, + vextract256_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + } } defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; +// extract_subvector codegen patterns with the alternative types. +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. 
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types extract VEC256 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), @@ -677,6 +794,10 @@ def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), @@ -694,50 +815,49 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, - ValueType svt, X86VectorVTInfo _> { - defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix), - "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>, - T8PD, EVEX; - let mayLoad = 1 in { - defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src), - "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src", - (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>, - T8PD, EVEX; - } +multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { + + defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>, + T8PD, 
EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))>, + T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>; } -multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode, - AVX512VLVectorVTInfo _> { - defm Z : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>, +multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>, - EVEX_V256; + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; } } let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, - avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss", + avx512vl_f32_info>; let Predicates = [HasVLX] in { - defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, - v4f32, v4f32x_info>, EVEX_V128, - EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss", + v4f32x_info, v4f32x_info>, EVEX_V128; } } let ExeDomain = SSEPackedDouble in { - defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd", + avx512vl_f64_info>, VEX_W; } // avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. -// Later, we can canonize broadcast instructions before ISel phase and +// Later, we can canonize broadcast instructions before ISel phase and // eliminate additional patterns on ISel. 
// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar // representations of source @@ -834,70 +954,50 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; -multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, - RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, - RegisterClass KRC> { - def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; - def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} |${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}} , $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, (OpVT (vselect KRC:$mask, - (X86VBroadcast (ld_frag addr:$src)), - (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ; - } -} - -defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem, - loadi32, VR512, v16i32, v4i32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, - loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VT1>; +// Provide aliases for broadcast from the same register class that +// automatically does the extract. +multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r") + (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>; +} + +multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + avx512_int_broadcast_rm_lowering<_.info512, _.info256>, + EVEX_V512; + // Defined separately to avoid redefinition. 
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>; + } + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + avx512_int_broadcast_rm_lowering<_.info256, _.info256>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + EVEX_V128; + } +} + +defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", + avx512vl_i8_info, HasBWI>; +defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", + avx512vl_i16_info, HasBWI>; +defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", + avx512vl_i32_info, HasAVX512>; +defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", + avx512vl_i64_info, HasAVX512>, VEX_W; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _Dst.RC:$dst, - (_Dst.VT (X86SubVBroadcast - (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - } + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + AVX5128IBase, EVEX; } defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", @@ -944,10 +1044,45 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", EVEX_V512, EVEX_CD8<32, CD8VT8>; } -def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), - (VPBROADCASTDZrr VR128X:$src)>; -def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), - (VPBROADCASTQZrr VR128X:$src)>; +multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src, + SDNode OpNode = X86SubVBroadcast> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode + (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>, + T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>; +} + +multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasDQI] in + defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + let Predicates = [HasDQI, HasVLX] in + defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; +} + +multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> : + avx512_common_broadcast_32x2<opc, OpcodeStr, _> { + + let Predicates = [HasDQI, HasVLX] in + defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128, + X86SubV32x2Broadcast>, EVEX_V128; +} + +defm VPBROADCASTI32X2 : 
avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info>; +defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info>; def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; @@ -959,21 +1094,6 @@ def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v8i32 VR256X:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm))>; - -def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v8i64 (X86VBroadcast (v4i64 VR256X:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm))>; - -def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZr VR128X:$src)>; -def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZr VR128X:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), @@ -985,170 +1105,178 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- - -multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { -let Predicates = [HasCDI] in -def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V512; - -let Predicates = [HasCDI, HasVLX] in { -def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V128; -def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src), +multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V256; + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; } + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256; + defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128; + } } -let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", - VK16>; + avx512vl_i32_info, VK16>; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", - VK8>, VEX_W; -} + avx512vl_i64_info, VK8>, VEX_W; //===----------------------------------------------------------------------===// -// AVX-512 - VPERM -// -// -- immediate form -- -multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { - def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, u8imm:$src2), - 
!strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, - EVEX; - def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.MemOp:$src1, u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode (_.LdFrag addr:$src1), - (i8 imm:$src2))))]>, - EVEX, EVEX_CD8<_.EltSize, CD8VF>; +// -- VPERMI2 - 3 source operands form -- +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + EVEX_4V, AVX5128IBase; + } } +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermi2X IdxVT.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, - X86VectorVTInfo Ctrl> : - avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> { - let ExeDomain = _.ExeDomain in { - def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT Ctrl.RC:$src2))))]>, - EVEX_4V; - def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, Ctrl.MemOp:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>, - EVEX_4V; - } -} -defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, - EVEX_V512; -defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, - EVEX_V512, VEX_W; - -def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - -// -- VPERM2I - 3 source operands form -- -multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, 
VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +// VPERMT2 +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let Constraints = "$src1 = $dst" in { defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.RC:$src3), + (ins IdxVT.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V, AVX5128IBase; let mayLoad = 1 in defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.MemOp:$src3), + (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, + (bitconvert (_.LdFrag addr:$src3))))>, EVEX_4V, AVX5128IBase; } } -multiclass avx512_perm_3src_mb<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let mayLoad = 1, Constraints = "$src1 = $dst" in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.ScalarMemOp:$src3), + (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (_.VT (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_perm_3src_sizes<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { - let Predicates = [HasAVX512] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, 
VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; } } -multiclass avx512_perm_3src_sizes_w<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { + +multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { let Predicates = [HasBWI] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, - EVEX_V512; + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; - } -} -defm VPERMI2D : avx512_perm_3src_sizes<0x76, "vpermi2d", X86VPermiv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src_sizes<0x76, "vpermi2q", X86VPermiv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src_sizes<0x77, "vpermi2ps", X86VPermiv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src_sizes<0x77, "vpermi2pd", X86VPermiv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2D : avx512_perm_3src_sizes<0x7E, "vpermt2d", X86VPermv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_3src_sizes<0x7E, "vpermt2q", X86VPermv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_3src_sizes<0x7F, "vpermt2ps", X86VPermv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_3src_sizes<0x7F, "vpermt2pd", X86VPermv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2W : avx512_perm_3src_sizes_w<0x7D, "vpermt2w", X86VPermv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPERMI2W : avx512_perm_3src_sizes_w<0x75, "vpermi2w", X86VPermiv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2W : avx512_perm_t_sizes_w<0x7D, "vpermt2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask 
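Before the masked-blend hunks that follow, a note on the VPERMI2*/VPERMT2* rework just above: both mnemonics perform the same two-source, index-driven permute and differ only in which operand is tied to the destination (the index vector for VPERMI2, the first data source for VPERMT2). A hedged user-level sketch through the standard AVX512F intrinsic (the function name interleave_low_dwords is illustrative):

#include <immintrin.h>

// Each index lane selects one element from the concatenation {A, B};
// bit 4 of the index picks the source vector. Whether vpermi2d or
// vpermt2d is emitted depends on which input the register allocator
// allows the instruction to overwrite.
__m512i interleave_low_dwords(__m512i A, __m512i B) {
  const __m512i Idx = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4,
                                       19, 3, 18, 2, 17, 1, 16, 0);
  return _mm512_permutex2var_epi32(A, Idx, B);
}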
@@ -1158,7 +1286,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V; def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), @@ -1175,7 +1303,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), @@ -1265,41 +1393,85 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), //===----------------------------------------------------------------------===// // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - SDNode OpNode, ValueType VT, - PatFrag ld_frag, string Suffix> { - def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, + +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{ + + defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V; + let mayLoad = 1 in + defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + // Accept explicit immediate argument form instead of comparison code. 
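The asm-parser-only `_alt` variants defined next take the predicate as a raw imm8 operand instead of baking it into a vcmpeqss-style mnemonic. A small sketch of the masked scalar compare itself (assumes AVX-512F hardware and <immintrin.h>; `_CMP_LT_OS` is the usual imm8 predicate encoding):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128 a = _mm_set_ss(1.0f);
        __m128 b = _mm_set_ss(2.0f);
        /* vcmpss with predicate LT_OS: the result is a bit in a mask
           register, not an all-ones XMM lane as in pre-AVX-512 forms */
        __mmask8 k = _mm_cmp_ss_mask(a, b, _CMP_LT_OS);
        printf("%u\n", (unsigned)(k & 1));   /* 1, since 1.0f < 2.0f */
        return 0;
    }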
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs VK1:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, + EVEX_4V, EVEX_B; + }// let isAsmParserOnly = 1, hasSideEffects = 0 + + let isCodeGenOnly = 1 in { + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + _.FRC:$src2, + imm:$cc))], IIC_SSE_ALU_F32S_RR>, EVEX_4V; - def rm : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; let mayLoad = 1 in - def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs _.KRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; } } let Predicates = [HasAVX512] in { -defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">, - XS; -defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">, - XD, VEX_W; + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, + AVX512XSIi8Base; + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, + AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -1651,7 +1823,7 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2,{sae}", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (X86cmpmRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, @@ -1662,8 +1834,8 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $cc">, EVEX_B; + "$cc, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $cc">, EVEX_B; } } @@ -1700,6 +1872,128 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), (v16i32 (SUBREG_TO_REG (i32 0), 
VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; +// ---------------------------------------------------------------- +// FPClass +//handle fpclass instruction mask = op(reg_scalar,imm) +// op(mem_scalar,imm) +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1, AddedComplexity = 20 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + } + } +} + +//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) +// fpclass(reg_vec, mem_vec, imm) +// fpclass(reg_vec, broadcast(eltVt), imm) +multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, string mem, string broadcast>{ + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst|$dst, ${src1}" + ##_.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>,EVEX_B; + def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, 
${src1}"## + _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## + _.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, + EVEX_B, EVEX_K; + } +} + +multiclass avx512_vector_fpclass_all<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, + string broadcast>{ + let Predicates = [prd] in { + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}", + broadcast>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}", + broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}", + broadcast>, EVEX_V256; + } +} + +multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, + bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{ + defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, + VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>; + defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, + VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W; + defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f32x_info, prd>, EVEX_CD8<32, CD8VT1>; + defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W; +} + +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, + X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX; + //----------------------------------------------------------------- // Mask register copy, including // - copy between mask registers @@ -1786,6 +2080,11 @@ let Predicates = [HasDQI] in { (KMOVBmk addr:$dst, VK8:$src)>; def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), (KMOVBkm addr:$src)>; + + def : Pat<(store VK4:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(store VK2:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>; } let Predicates = [HasAVX512, NoDQI] in { def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), @@ -1837,10 +2136,15 @@ let Predicates = [HasAVX512] in { (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; + def : Pat<(i8 (zext VK1:$src)), (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; + def : Pat<(i8 (anyext VK1:$src)), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; + def : Pat<(i64 (zext VK1:$src)), (AND64ri8 (SUBREG_TO_REG (i64 0), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; @@ -1848,17 +2152,19 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; - def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; - def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -} -let Predicates = [HasBWI] in { - def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; - def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; } +def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; +def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +def : Pat<(v4i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK4)>; +def : Pat<(v2i1 
(scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK2)>; +def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; +def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. @@ -1955,11 +2261,12 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, } multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, bit IsCommutable> { + SDPatternOperator OpNode, bit IsCommutable, + Predicate prdW = HasAVX512> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - HasAVX512, IsCommutable>, VEX_4V, VEX_L, PS; + prdW, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, @@ -1974,6 +2281,7 @@ defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; multiclass avx512_mask_binop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -2047,59 +2355,49 @@ def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; // Mask unpacking -multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { - let Predicates = [HasAVX512] in - def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; -} +multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, + RegisterClass KRCSrc, Predicate prd> { + let Predicates = [prd] in { + let hasSideEffects = 0 in + def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), + (ins KRC:$src1, KRC:$src2), + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_L; -multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { - defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16>, - VEX_4V, VEX_L, PD; + def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), + (!cast<Instruction>(NAME##rr) + (COPY_TO_REGCLASS KRCSrc:$src2, KRC), + (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + } } -defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; -def : Pat<(v16i1 (concat_vectors (v8i1 VK8:$src1), (v8i1 VK8:$src2))), - (KUNPCKBWrr (COPY_TO_REGCLASS VK8:$src2, VK16), - (COPY_TO_REGCLASS VK8:$src1, VK16))>; - - -multiclass avx512_mask_unpck_int<string IntName, string InstName> { - let Predicates = [HasAVX512] in - def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_bw") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"BWrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode> { - let 
Predicates = [HasAVX512], Defs = [EFLAGS] in + SDNode OpNode, Predicate prd> { + let Predicates = [prd], Defs = [EFLAGS] in def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; } -multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, PS; - let Predicates = [HasDQI] in - defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>, - VEX, PD; - let Predicates = [HasBWI] in { - defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>, - VEX, PS, VEX_W; - defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>, - VEX, PD, VEX_W; - } +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>, + VEX, PD; + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>, + VEX, PS; + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>, + VEX, PD, VEX_W; } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; // Mask shift multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, @@ -2124,7 +2422,7 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, let Predicates = [HasDQI] in defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, VEX, TAPD; - } + } } defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; @@ -2167,24 +2465,52 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), + (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; -let Predicates = [HasVLX] in { - def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; - def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; - def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; - def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; - def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; -} +def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + +def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; + +def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; + +def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 
(COPY_TO_REGCLASS VK4:$src, VK8))>; +def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + +def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>; + +def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>; + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS @@ -2304,23 +2630,21 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore> { - let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), - OpcodeStr # "\t{$src, $dst|$dst, $src}", [], - _.ExeDomain>, EVEX; - let Constraints = "$src1 = $dst" in - def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2), - OpcodeStr # - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}", + + def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # ".s\t{$src, $dst|$dst, $src}", + [], _.ExeDomain>, EVEX; + def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# + "${dst} {${mask}}, $src}", [], _.ExeDomain>, EVEX, EVEX_K; - def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), - OpcodeStr # - "\t{$src, ${dst} {${mask}} {z}|" # + OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # "${dst} {${mask}} {z}, $src}", [], _.ExeDomain>, EVEX, EVEX_KZ; - } + let mayStore = 1 in { def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -2383,30 +2707,6 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; -def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), - (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), 
GR16:$mask)), - (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VMOVAPDZrm addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VMOVAPSZrm addr:$ptr)>; - def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), GR16:$mask), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), @@ -2425,22 +2725,6 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), - (VMOVUPSZmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} - defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, @@ -2502,17 +2786,6 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } -// NoVLX patterns -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Zmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} // Move Int Doubleword to Packed Double Int // @@ -2520,32 +2793,37 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG; + EVEX; def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector GR64:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX, VEX_W; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>; let isCodeGenOnly = 1 in { -def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), +def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert GR64:$src))], + [(set 
FR64X:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), +def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bitconvert FR64:$src))], + [(set GR64:$dst, (bitconvert FR64X:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -} -def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), +def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + [(store (i64 (bitconvert FR64X:$src)), addr:$dst)], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, EVEX_CD8<64, CD8VT1>; +} // Move Int Doubleword to Single Scalar // @@ -2553,27 +2831,27 @@ let isCodeGenOnly = 1 in { def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX; def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move doubleword from xmm register to r/m32 // def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, - EVEX, VEX_LIG; + EVEX; def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128X:$src), + [(store (i32 (extractelt (v4i32 VR128X:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX, EVEX_CD8<32, CD8VT1>; // Move quadword from xmm1 register to r/m64 // @@ -2581,16 +2859,28 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), (iPTR 0)))], - IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_LIG, VEX_W, + IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Requires<[HasAVX512, In64BitMode]>; -def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs), - (ins i64mem:$dst, VR128X:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), - addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, PD, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>, - Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +let hasSideEffects = 0 in +def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq.s\t{$src, $dst|$dst, $src}",[]>, + EVEX, VEX_W; // Move Scalar Single to Double Int // 
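The definitions under this heading use `bitconvert` patterns, which select a plain vmovd register move; no value conversion happens. In portable C the same reinterpretation looks like this (nothing here is AVX-specific):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        /* what (bitconvert FR32X:$src) means: the f32 bit pattern is
           moved unchanged into a 32-bit integer register */
        float f = 1.0f;
        uint32_t bits;
        memcpy(&bits, &f, sizeof bits);
        printf("0x%08x\n", bits);      /* 0x3f800000 */
        return 0;
    }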
@@ -2599,92 +2889,95 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))], - IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG; + IIC_SSE_MOVD_ToGP>, EVEX; def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move Quadword Int to Packed Quadword Int // -def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, - EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; //===----------------------------------------------------------------------===// // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar <string asm, RegisterClass RC, - SDNode OpNode, ValueType vt, - X86MemOperand x86memop, PatFrag mem_pat> { - let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, (vt (OpNode VR128X:$src1, - (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; - let Constraints = "$src1 = $dst" in - def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), - !strconcat(asm, - "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), - [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; - def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, - EVEX, VEX_LIG; +multiclass avx512_move_scalar <string asm, SDNode OpNode, + X86VectorVTInfo _> { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } let mayStore = 1 in { - def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG; - def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - [], IIC_SSE_MOV_S_MR>, - EVEX, 
VEX_LIG, EVEX_K; + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; } // mayStore - } //hasSideEffects = 0 } -let ExeDomain = SSEPackedSingle in -defm VMOVSSZ : avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem, - loadf32>, XS, EVEX_CD8<32, CD8VT1>; +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; -let ExeDomain = SSEPackedDouble in -defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem, - loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; -// For the disassembler -let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR32X:$src2), - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XS, EVEX_4V, VEX_LIG; - def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR64X:$src2), - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XD, EVEX_4V, VEX_LIG, VEX_W; -} +defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovss.s", "$src2, $src1", "$src1, $src2", []>, + XS, EVEX_4V, VEX_LIG; + +defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, + XD, EVEX_4V, VEX_LIG, VEX_W; let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { @@ -2768,10 +3061,10 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; // Extract and store. 
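The Pats that follow fold `store (extractelt v, 0)` into a single scalar store. As a reference for the selected behavior, a minimal sketch using the SSE intrinsic that maps to the same movss store (any SSE-capable compiler):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
        float lo;
        _mm_store_ss(&lo, v);          /* stores only element 0 */
        printf("%g\n", lo);            /* 1 */
        return 0;
    }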
- def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; @@ -2835,7 +3128,7 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; -let AddedComplexity = 20 in +let AddedComplexity = 20 , isCodeGenOnly = 1 in def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -2964,7 +3257,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, OpndItins itins, bit IsCommutable = 0> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr, IsCommutable>, @@ -2972,7 +3265,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), @@ -2986,7 +3279,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, @@ -3058,20 +3351,20 @@ multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd, + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, IsCommutable>; - defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd, + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, IsCommutable>; } multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd, + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd, IsCommutable>; - defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd, + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd, IsCommutable>; } @@ -3086,15 +3379,15 @@ multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, } multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, - SDNode OpNode,X86VectorVTInfo _Src, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst, bit IsCommutable = 0> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, 
_Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), - itins.rr, IsCommutable>, + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), @@ -3106,12 +3399,12 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512BIBase, EVEX_4V; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Dst.BroadcastStr##", $src1", "$src1, ${src2}"##_Dst.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Dst.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Dst.VT (X86VBroadcast (_Dst.ScalarLdFrag addr:$src2)))))), itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; @@ -3127,24 +3420,24 @@ defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, SSE_INTALU_ITINS_P, HasBWI, 0>; defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, - SSE_INTALU_ITINS_P, HasBWI, 1>; + SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, - SSE_INTALU_ITINS_P, HasBWI, 0>; -defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, - SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; -defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P, - HasBWI, 1>; -defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P, - HasBWI, 1>, T8PD; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, - SSE_INTALU_ITINS_P, HasBWI, 1>; - + SSE_INTALU_ITINS_P, HasBWI, 1>; + multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, SDNode OpNode, bit IsCommutable = 0> { @@ -3159,7 +3452,7 @@ multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, v4i32x_info, v2i64x_info, IsCommutable>, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; } -} +} defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, X86pmuldq, 1>,T8PD; @@ -3170,25 +3463,25 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { let mayLoad = 1 in { defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, 
${src2}"##_Src.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast (_Src.ScalarLdFrag addr:$src2))))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; } } -multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode,X86VectorVTInfo _Src, +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2)))>, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; let mayLoad = 1 in { @@ -3229,102 +3522,59 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, v16i8x_info>, EVEX_V128; } } + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128>, EVEX_V128; + } +} + let Predicates = [HasBWI] in { defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; + + defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; } -defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax, +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax, +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax, +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax, +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin, +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin, +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin, +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm 
VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin, +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; - -//===----------------------------------------------------------------------===// -// AVX-512 - Unpack Instructions -//===----------------------------------------------------------------------===// - -multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt, - PatFrag mem_frag, RegisterClass RC, - X86MemOperand x86memop, string asm, - Domain d> { - def rr : AVX512PI<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, RC:$src2)))], - d>, EVEX_4V; - def rm : AVX512PI<opc, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, - (bitconvert (mem_frag addr:$src2)))))], - d>, EVEX_4V; -} - -defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64, - VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64, - VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64, - VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64, - VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], - IIC_SSE_UNPCK>, EVEX_4V; - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))], - IIC_SSE_UNPCK>, EVEX_4V; -} -defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; -defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// @@ -3362,12 +3612,12 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, let isCodeGenOnly = 1, isCommutable = IsCommutable, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.FRC:$src2), + (ins _.FRC:$src1, _.FRC:$src2), 
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], itins.rr>; def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.ScalarMemOp:$src2), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))], itins.rr>; @@ -3375,7 +3625,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, } multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - SDNode VecNode, OpndItins itins, bit IsCommutable> { + SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, @@ -3470,7 +3720,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, EVEX_4V, EVEX_B; } -multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, bit IsCommutable = 0> { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, IsCommutable>, EVEX_V512, PS, @@ -3514,7 +3764,7 @@ defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; @@ -3550,13 +3800,34 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, }//let mayLoad = 1 } -multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, +multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>; + let mayLoad = 1 in { + defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>, + EVEX_4V,EVEX_CD8<32, CD8VT1>; + defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>, + EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, @@ -3569,7 +3840,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; } } -defm VSCALEF : avx512_fp_scalef_all<0x2C, "vscalef", X86scalef>, T8PD; +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions @@ -3586,7 +3857,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), + (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; @@ -3748,12 +4019,12 @@ multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, VTInfo.info256>, EVEX_V256; defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, - avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, EVEX_V128; } } -multiclass avx512_shift_rmi_w<bits<8> opcw, +multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3785,8 +4056,8 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; -defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V; -defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V; +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; @@ -3846,6 +4117,27 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, avx512vl_i64_info>, VEX_W; } +// Use 512bit version to implement 128/256 bit in case NoVLX. 
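The lowering multiclass below handles the 128/256-bit per-element word shifts on machines with BWI but no VLX: the operands are inserted into a 512-bit register, the Z-form instruction runs, and the low subvector is extracted. The underlying operation gives every 16-bit lane its own shift count; a sketch of the 512-bit form (assumes AVX-512BW hardware and -mavx512bw):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* vpsllvw: each 16-bit element is shifted left by its own count */
        __m512i v = _mm512_set1_epi16(1);
        __m512i c = _mm512_set1_epi16(3);
        __m512i r = _mm512_sllv_epi16(v, c);
        short out[32];
        _mm512_storeu_si512(out, r);
        printf("%d\n", (int)out[0]);   /* 8 == 1 << 3 */
        return 0;
    }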
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> { + let Predicates = [HasBWI, NoVLX] in { + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + (_.info256.VT _.info256.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + (_.info128.VT _.info128.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; + } +} + multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3861,11 +4153,14 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, } defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, - avx512_var_shift_w<0x12, "vpsllvw", shl>; + avx512_var_shift_w<0x12, "vpsllvw", shl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, shl>; defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, - avx512_var_shift_w<0x11, "vpsravw", sra>; + avx512_var_shift_w<0x11, "vpsravw", sra>, + avx512_var_shift_w_lowering<avx512vl_i16_info, sra>; defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, - avx512_var_shift_w<0x10, "vpsrlvw", srl>; + avx512_var_shift_w<0x10, "vpsrlvw", srl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, srl>; defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; @@ -3916,19 +4211,77 @@ defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", X86VPermi, avx512vl_f64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPERMIL +//===----------------------------------------------------------------------===// + +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo Ctrl> { + defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2)))>, + T8PD, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (X86VBroadcast + (Ctrl.ScalarLdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + }//let mayLoad = 1 +} + +multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512, + Ctrl.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128, + Ctrl.info128>, 
EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256, + Ctrl.info256>, EVEX_V256; + } +} + +multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>; + defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, + X86VPermilpi, _>, + EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; +} +defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, + avx512vl_i32_info>; +defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, + avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW //===----------------------------------------------------------------------===// defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", - X86PShufd, avx512vl_i32_info>, + X86PShufd, avx512vl_i32_info>, EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", - X86PShufhw>, EVEX, AVX512XSIi8Base, VEX_W; + X86PShufhw>, EVEX, AVX512XSIi8Base; defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", - X86PShuflw>, EVEX, AVX512XDIi8Base, VEX_W; - + X86PShuflw>, EVEX, AVX512XDIi8Base; + multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; @@ -3942,55 +4295,6 @@ multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; //===----------------------------------------------------------------------===// -// AVX-512 - MOVDDUP -//===----------------------------------------------------------------------===// - -multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, - X86MemOperand x86memop, PatFrag memop_frag> { -def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX; -def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, - (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; -} - -defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))), - (VMOVDDUPZrm addr:$src)>; - -//===---------------------------------------------------------------------===// -// Replicate Single FP - MOVSHDUP and MOVSLDUP -//===---------------------------------------------------------------------===// -multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, - ValueType vt, RegisterClass RC, PatFrag mem_frag, - X86MemOperand x86memop> { - def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX; - let mayLoad = 1 in - def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX; -} - -defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm 
VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))), - (VMOVSHDUPZrm addr:$src)>; -def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))), - (VMOVSLDUPZrm addr:$src)>; - -//===----------------------------------------------------------------------===// // Move Low to High and High to Low packed FP Instructions //===----------------------------------------------------------------------===// def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst), @@ -4017,6 +4321,115 @@ let Predicates = [HasAVX512] in { } //===----------------------------------------------------------------------===// +// VMOVHPS/PD VMOVLPS Instructions +// All patterns were taken from the SSE implementation. +//===----------------------------------------------------------------------===// +multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, + (_.VT (bitconvert + (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))], + IIC_SSE_MOV_LH>, EVEX_4V; +} + +defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; +defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; + +let Predicates = [HasAVX512] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVHPD patterns + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPS patterns + def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPD patterns + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; +} + +let mayStore = 1 in { +def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), + (bc_v2f64 (v4f32 VR128X:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX,
EVEX_CD8<32, CD8VT2>; +def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128X:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +} +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128X:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDZ128mr addr:$dst, VR128X:$src)>; + // VMOVLPS patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + // VMOVLPD patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; +} +//===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // @@ -4034,7 +4447,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, - AVX512FMA3Base; + AVX512FMA3Base; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -4435,50 +4848,55 @@ def : Pat<(f64 (uint_to_fp GR64:$src)), //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from float/double to integer //===----------------------------------------------------------------------===// -multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - Intrinsic Int, Operand memop, ComplexPattern mem_cpat, - string asm> { -let hasSideEffects = 0 in { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; - let mayLoad = 1 in - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; -} // hasSideEffects = 0 +multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat, string asm> { + let hasSideEffects = 0, Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; + def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, + 
EVEX, VEX_LIG, EVEX_B, EVEX_RC; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; + } // hasSideEffects = 0, Predicates = [HasAVX512] } -let Predicates = [HasAVX512] in { + // Convert float/double to signed/unsigned int 32/64 -defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, +defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, ssmem, sse_load_f32, "cvtss2si">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64, +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi, +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtss2usi, ssmem, sse_load_f32, "cvtss2usi">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtss2usi64, ssmem, sse_load_f32, "cvtss2usi">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, +defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64, +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi, +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtsd2usi, sdmem, sse_load_f64, "cvtsd2usi">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtsd2usi64, sdmem, sse_load_f64, "cvtsd2usi">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", SSE_CVT_Scalar, 0>, XS, EVEX_4V; @@ -4495,121 +4913,170 @@ let isCodeGenOnly = 1 in { defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", SSE_CVT_Scalar, 0>, XD, EVEX_4V; -} // isCodeGenOnly = 1 +} // isCodeGenOnly = 1, Predicates = [HasAVX512] // Convert float/double to signed/unsigned int 32/64 with truncation -let isCodeGenOnly = 1 in { - defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si, - ssmem, sse_load_f32, "cvttss2si">, - XS, EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse_cvttss2si64, ssmem, sse_load_f32, - "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si, - sdmem, sse_load_f64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, - "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttss2usi, ssmem, sse_load_f32, - "cvttss2usi">, XS, EVEX_CD8<32, CD8VT1>; - defm 
Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttss2usi64, ssmem, - sse_load_f32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttsd2usi, - sdmem, sse_load_f64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttsd2usi64, sdmem, - sse_load_f64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -} // isCodeGenOnly = 1 - -multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), +multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd>{ +let Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; + def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + []>, EVEX, EVEX_B; + def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX; -} - -defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, + EVEX; + + let isCodeGenOnly = 1,hasSideEffects = 0 in { + def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; + def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B; + let mayLoad = 1 in + def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + []>, EVEX, VEX_LIG; + + } // isCodeGenOnly = 1, hasSideEffects = 0 +} //HasAVX512 +} + + +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, + fp_to_sint,X86cvttss2IntRnd>, + XS, 
EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, + fp_to_sint,X86cvttss2IntRnd>, + VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, + fp_to_sint,X86cvttsd2IntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, + fp_to_sint,X86cvttsd2IntRnd>, + VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS,VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +let Predicates = [HasAVX512] in { + def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), + (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), + (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), + (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), + (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + } // HasAVX512 //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back //===----------------------------------------------------------------------===// -let hasSideEffects = 0 in { -def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst), - (ins FR32X:$src1, FR32X:$src2), - "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; -let mayLoad = 1 in -def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst), - (ins FR32X:$src1, f32mem:$src2), - "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, - EVEX_CD8<32, CD8VT1>; - -// Convert scalar double to scalar single -def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst), - (ins FR64X:$src1, FR64X:$src2), - "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>; -let mayLoad = 1 in -def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst), - (ins FR64X:$src1, f64mem:$src2), - "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, VEX_W, - Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>; -} - -def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>, - Requires<[HasAVX512]>; -def : Pat<(fextend (loadf32 addr:$src)), - (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>; - -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, +multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode> { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + defm rm : 
AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT (scalar_to_vector + (_Src.ScalarLdFrag addr:$src2)))))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +// Scalar Conversion with SAE - suppress all exceptions +multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), + (i32 FROUND_NO_EXC)))>, + EVEX_4V, VEX_LIG, EVEX_B; +} + +// Scalar Conversion with rounding control (RC) +multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + EVEX_B, EVEX_RC; +} +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, + OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, + EVEX_V512, XD; + } +} + +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; + } +} +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, + X86froundRnd, f64x_info, f32x_info>; +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, + X86fpextRnd,f32x_info, f64x_info >; + +def : Pat<(f64 (fextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), + (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +def : Pat<(f64 (fextend (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[HasAVX512, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, - Requires<[HasAVX512, OptForSpeed]>; +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + Requires<[HasAVX512, OptForSpeed]>; -def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, +def : Pat<(f32 (fround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), + (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; - //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to 
float/double // and from float/double to signed/unsigned integer @@ -4992,7 +5459,7 @@ defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; -let Predicates = [NoVLX] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -5024,40 +5491,102 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", - []>, EVEX; - let hasSideEffects = 0, mayLoad = 1 in - def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX; -} - -multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), - (ins srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX; - let hasSideEffects = 0, mayStore = 1 in - def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_CURRENT))>, T8PD; + let hasSideEffects = 0, mayLoad = 1 in { + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), + (i32 FROUND_CURRENT))>, T8PD; + } +} + +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "{sae}, $src", "$src, {sae}", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC))>, T8PD, EVEX_B; + } -defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; -defm VCVTPS2PHZ : avx512_cvtps2ph<VR256X, VR512, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; +let Predicates = [HasAVX512] in { + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop> { + defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, $src1", "$src1, $src2", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 
FROUND_CURRENT))>, AVX512AIi8Base; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), (i32 FROUND_CURRENT) )), + addr:$dst)]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + []>, EVEX_K; + } +} +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base; +} +let Predicates = [HasAVX512] in { + defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>, + avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} -def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src), - imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))), - (VCVTPS2PHZrr VR512:$src, imm:$rc)>; +// Unordered/Ordered scalar fp compare with SAE and set EFLAGS +multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr> { + def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), + [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2, + (i32 FROUND_NO_EXC)))], + IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[WriteFAdd]>; +} -def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src), - (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))), - (VCVTPH2PSZrr VR256X:$src)>; +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, @@ -5067,10 +5596,10 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { "ucomisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; let Pattern = []<dag> in { - defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load, + defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, "comiss">, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load, + defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, "comisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } @@ -5092,50 +5621,31 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { 
} /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd -multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - let hasSideEffects = 0 in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; +multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V; let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V; } } } -defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; - -def : Pat <(v4f32 (int_x86_avx512_rcp14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rcp14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; - -def : Pat <(v4f32 (int_x86_avx512_rsqrt14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; +defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5183,20 +5693,6 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; -def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRSQRT14PSZr 
VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRSQRT14PDZr VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRCP14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRCP14PDZr VR512:$src)>; - /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode> { @@ -5232,6 +5728,8 @@ let hasSideEffects = 0, Predicates = [HasERI] in { defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; } + +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -5322,67 +5820,6 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, } } -multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, - Intrinsic F32Int, Intrinsic F64Int, - OpndItins itins_s, OpndItins itins_d> { - def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst), - (ins FR32X:$src1, FR32X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rr>, XS, EVEX_4V; - let isCodeGenOnly = 1 in - def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XS, EVEX_4V; - let mayLoad = 1 in { - def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst), - (ins FR32X:$src1, f32mem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - let isCodeGenOnly = 1 in - def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, ssmem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, sse_load_f32:$src2))], - itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - } - def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst), - (ins FR64X:$src1, FR64X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W; - let isCodeGenOnly = 1 in - def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XD, EVEX_4V, VEX_W; - let mayLoad = 1 in { - def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst), - (ins FR64X:$src1, f64mem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - let isCodeGenOnly = 1 in - def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, sdmem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - } -} - multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, @@ -5416,93 +5853,77 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, 
string OpcodeStr, v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; } +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + string SUFF, SDNode OpNode, SDNode OpNodeRnd> { + + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; + + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$rc))>, + EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + } + + def : Pat<(_.EltVT (OpNode _.FRC:$src)), + (!cast<Instruction>(NAME#SUFF#Zr) + (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; + + def : Pat<(_.EltVT (OpNode (load addr:$src))), + (!cast<Instruction>(NAME#SUFF#Zm) + (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>; +} + +multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { + defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt, + X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt, + X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; +} + defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>; -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>; +defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; let Predicates = [HasAVX512] in { - def : Pat<(f32 (fsqrt FR32X:$src)), - (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; - def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f64 (fsqrt FR64X:$src)), - (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; - def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f32 (X86frsqrt FR32X:$src)), - (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRT14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>, + (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[OptForSize]>; - def : Pat<(f32 (X86frcp FR32X:$src)), - (VRCP14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>, + (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[OptForSize]>; - - def : Pat<(int_x86_sse_sqrt_ss 
VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR32)), - VR128X)>; - def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), - (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; - - def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR64)), - VR128X)>; - def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), - (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; -} - - -multiclass avx512_rndscale<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag, Domain d> { -let ExeDomain = d in { - // Intrinsic operation, reg. - // Vector intrinsic operation, reg - def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; - - // Vector intrinsic operation, mem - def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; -} // ExeDomain } -defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, - loadv16f32, SSEPackedSingle>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - -def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), - imm:$src2, (v16f32 VR512:$src1), (i16 -1), - FROUND_CURRENT)), - (VRNDSCALEPSZr VR512:$src1, imm:$src2)>; - - -defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, - loadv8f64, SSEPackedDouble>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - -def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), - imm:$src2, (v8f64 VR512:$src1), (i8 -1), - FROUND_CURRENT)), - (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; - multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { @@ -5510,20 +5931,20 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, - "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; let mayLoad = 1 in defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; } @@ -5568,109 +5989,238 @@ defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; -let Predicates = [HasAVX512] in { -def : Pat<(v16f32 (ffloor VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x1))>; -def : Pat<(v16f32 (fnearbyint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 
0xC))>; -def : Pat<(v16f32 (fceil VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x2))>; -def : Pat<(v16f32 (frint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x4))>; -def : Pat<(v16f32 (ftrunc VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x3))>; - -def : Pat<(v8f64 (ffloor VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x1))>; -def : Pat<(v8f64 (fnearbyint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0xC))>; -def : Pat<(v8f64 (fceil VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x2))>; -def : Pat<(v8f64 (frint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x4))>; -def : Pat<(v8f64 (ftrunc VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x3))>; -} //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- -multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, - RegisterClass dstRC, RegisterClass srcRC, - RegisterClass KRC, X86MemOperand x86memop> { - def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins srcRC:$src), - !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), +multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, + X86MemOperand x86memop> { + + defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, + EVEX, T8XS; + + // for intrinsic pattern matching + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + undef)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.RC:$src0)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, + DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + let mayStore = 1 in { + def mr : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX; - def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + def mrk : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>, EVEX, EVEX_K; + }//mayStore = 1 +} - def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; +multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, + PatFrag truncFrag, PatFrag mtruncFrag > { - def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX; + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; - def mrk : AVX512XS8I<opc, MRMDestMem, (outs), - (ins x86memop:$dst, KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, 
$src}"), - []>, EVEX, EVEX_K; + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT SrcInfo.RC:$src)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, string sat > { + + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, + (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), + (SrcInfo.VT SrcInfo.RC:$src))>; + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; - -def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>; -def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>; -def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>; -def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; -def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; - -def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; -def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; +multiclass 
avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + truncFrag, mtruncFrag>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + truncFrag, mtruncFrag>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, + truncFrag, mtruncFrag>, EVEX_V512; +} + +multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + sat>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + sat>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ, + sat>, EVEX_V512; +} + +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + sat>, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + sat>, 
EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + sat, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; +defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; + +let Predicates = [HasAVX512, NoVLX] in { +def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))), + (v8i16 (EXTRACT_SUBREG + (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))), + (v4i32 (EXTRACT_SUBREG + (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +} + +let Predicates = [HasBWI, NoVLX] in { +def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm))), sub_xmm))>; +} multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, @@ -5985,163 +6535,11 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, 
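[Aside, not part of the diff: the VPMOV*, VPMOVS* and VPMOVUS* families defined above are the plain, signed-saturating and unsigned-saturating down-converts, and the mr/mrk forms are their (masked) truncating stores. A minimal C++ sketch of the same operations through the published Intel intrinsics, assuming an AVX-512F-capable compiler; the wrapper names are illustrative only.]

```cpp
#include <immintrin.h>

// Eight i64 lanes narrowed to bytes: VPMOVQB / VPMOVSQB / VPMOVUSQB.
__m128i qb_trunc(__m512i v) { return _mm512_cvtepi64_epi8(v);   } // wrap
__m128i qb_ssat (__m512i v) { return _mm512_cvtsepi64_epi8(v);  } // signed sat
__m128i qb_usat (__m512i v) { return _mm512_cvtusepi64_epi8(v); } // unsigned sat

// The mrk memory form: narrow and store only the lanes selected by the mask,
// matching the masked_truncstore patterns added above.
void qb_store(void *p, __mmask8 k, __m512i v) {
  _mm512_mask_cvtepi64_storeu_epi8(p, k, v); // masked VPMOVQB store
}
```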
MRM6m, "vscatterpf1dpd defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -//===----------------------------------------------------------------------===// -// VSHUFPS - VSHUFPD Operations - -multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, - ValueType vt, string OpcodeStr, PatFrag mem_frag, - Domain d> { - def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), - (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; - def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffle]>; -} - -defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32, - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64, - SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v16i32 (X86Shufp VR512:$src1, - (loadv16i32 addr:$src2), (i8 imm:$imm))), - (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; - -def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v8i64 (X86Shufp VR512:$src1, - (loadv8i64 addr:$src2), (i8 imm:$imm))), - (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; // Helper fragments to match sext vXi1 to vXiY. 
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; -multiclass avx512_conflict<bits<8> opc, string OpcodeStr, - RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, - X86MemOperand x86scalar_mop, string BrdcstStr> { - let hasSideEffects = 0 in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst}|${dst}, ${src}", BrdcstStr, "}"), - []>, EVEX, EVEX_B; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}", - BrdcstStr, "}"), - []>, EVEX, EVEX_KZ, EVEX_B; - - let Constraints = "$src1 = $dst" in { - def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), - !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, - ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), - []>, EVEX, EVEX_K, EVEX_B; - } - } -} - -let Predicates = [HasCDI] in { -defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_conflict_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPCONFLICTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), - (VPCONFLICTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -let Predicates = [HasCDI] in { -defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPLZCNTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), 
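[Aside, not part of the diff: the v16i1sextv16i32/v8i1sextv8i64 fragments above match the all-ones-per-negative-lane mask produced by an arithmetic shift; they let the VPABS patterns later in this file rewrite the classic branchless-abs xor/add pair to VPABSD/VPABSQ. A scalar sketch of that idiom, for reference only.]

```cpp
#include <cstdint>

// Branchless abs: m is all-ones exactly when x is negative (the "sext vXi1"
// value the PatLeafs above match, i.e. x >> 31 or x >> 63, assuming the
// arithmetic shift x86 provides); then (x + m) ^ m == |x|.
int32_t abs_idiom(int32_t x) {
  int32_t m = x >> 31;   // 0 or -1
  return (x + m) ^ m;    // vectorized, this is what VPABSD/VPABSQ replace
}
```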
- (VPLZCNTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))), - (VPLZCNTDrm addr:$src)>; -def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), - (VPLZCNTDrr VR512:$src)>; -def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))), - (VPLZCNTQrm addr:$src)>; -def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), - (VPLZCNTQrr VR512:$src)>; - def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; @@ -6197,7 +6595,7 @@ defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _.KRC:$dst, (trunc (_.VT _.RC:$src)))]>, EVEX; + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; } multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, @@ -6230,19 +6628,19 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; let mayStore = 1 in { def mr : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), - OpcodeStr # "\t{$src, $dst |$dst, $src}", + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX_CD8<_.EltSize, CD8VT1>; def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(store (_.VT (vselect _.KRCWM:$mask, + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + [(store (_.VT (vselect _.KRCWM:$mask, (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), addr:$dst)]>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; @@ -6272,7 +6670,7 @@ defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; let mayLoad = 1 in @@ -6302,6 +6700,62 @@ defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, EVEX, VEX_W; +//handle instruction reg_vec1 = op(reg_vec,imm) +// op(mem_vec,imm) +// op(broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1,
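[Aside, not part of the diff: for context on the vector-to-mask (X86cvt2mask) and compress/expand definitions above, a small C++ sketch via the published intrinsics. VPMOVD2M needs AVX512DQ, compress/expand are AVX512F; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VPMOVD2M: the sign bit of each dword lane becomes one mask bit.
__mmask16 to_mask(__m512i v) { return _mm512_movepi32_mask(v); }

// VPCOMPRESSD: pack the mask-selected lanes together at the bottom...
__m512i pack(__mmask16 k, __m512i v) {
  return _mm512_maskz_compress_epi32(k, v);
}

// ...or stream them contiguously to memory (the mrk store form above).
void pack_store(void *p, __mmask16 k, __m512i v) {
  _mm512_mask_compressstoreu_epi32(p, k, v);
}

// VPEXPANDD: the inverse, scattering consecutive elements into masked lanes.
__m512i unpack(__mmask16 k, const void *p) {
  return _mm512_maskz_expandloadu_epi32(k, p);
}
```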
i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr##", $src2", + (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, EVEX_B; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, {sae}, $src1", + "$src1, {sae}, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256; + } +} + //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) @@ -6309,49 +6763,60 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let mayLoad = 1 in { defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>, EVEX_B; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) +multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ + + defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT SrcInfo.RC:$src2), + (i8 imm:$src3)))>; + let mayLoad = 1 in + defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT (bitconvert + (SrcInfo.LdFrag addr:$src2))), + (i8 imm:$src3)))>; +} + 
+//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ - defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (i8 imm:$src3))>; - let mayLoad = 1 in { - defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i8 imm:$src3))>; + X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + + let mayLoad = 1 in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", @@ -6359,7 +6824,6 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), (i8 imm:$src3))>, EVEX_B; - } } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) @@ -6369,20 +6833,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let mayLoad = 1 in { defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let isAsmParserOnly = 1 in { @@ -6398,18 +6862,25 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), - OpcodeStr, "$src3,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $src3", + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_NO_EXC))>, EVEX_B; } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { - defm NAME: avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _>; + defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; } multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, @@ -6428,6 +6899,20 @@ multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, } } +multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string 
OpStr, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{ + let Predicates = [HasBWI] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512, + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + } + let Predicates = [HasBWI, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128, + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256, + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + } +} + multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode>{ let Predicates = [HasAVX512] in { @@ -6447,6 +6932,14 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, } } +multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{ + defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, + opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>; + defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, + opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W; +} + defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd", avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6461,6 +6954,14 @@ defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info, 0x55, X86VFixupimm, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, + X86VReduce, HasDQI>, AVX512AIi8Base, EVEX; +defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, + X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX; +defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, + X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX; + + defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, 0x50, X86VRange, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6475,6 +6976,19 @@ defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, 0x51, X86VRange, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode = X86Shuf128>{ @@ -6486,6 +7000,29 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; } } +let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + 
(VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; +} defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; @@ -6496,31 +7033,51 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; -multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, - AVX512VLVectorVTInfo VTInfo_FP>{ +multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, AVX512AIi8Base, EVEX_4V; - let isCodeGenOnly = 1 in { - defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>, - AVX512AIi8Base, EVEX_4V; - } } -defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>, +defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>, +defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{ + let Predicates = p in + def NAME#_.VTName#rri: + Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME#_.ZSuffix#rri) + _.RC:$src1, _.RC:$src2, imm:$imm)>; +} + +multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>: + avx512_vpalign_lowering<_.info512, [HasBWI]>, + avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>, + avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>; + +defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , + avx512vl_i8_info, avx512vl_i8_info>, + avx512_vpalign_lowering_common<avx512vl_i16_info>, + avx512_vpalign_lowering_common<avx512vl_i32_info>, + avx512_vpalign_lowering_common<avx512vl_f32_info>, + avx512_vpalign_lowering_common<avx512vl_i64_info>, + avx512_vpalign_lowering_common<avx512vl_f64_info>, + EVEX_CD8<8, CD8VF>; + +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , + avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; + multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr##_.Suffix, + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.MemOp:$src1), OpcodeStr##_.Suffix, + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; @@ -6531,7 +7088,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_unary_rm<opc, OpcodeStr, OpNode, _> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src1), 
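[Aside, not part of the diff: the immediates in the VRNDSCALE patterns above encode the rounding override. imm[1:0] selects the mode (0 nearest, 1 down, 2 up, 3 truncate), imm[2] defers to MXCSR, and imm[3] suppresses the precision exception, hence 0x1/0x2/0x3 for floor/ceil/trunc, 0x4 for rint and 0xC for nearbyint. Illustration through the standard intrinsic; wrapper names are mine.]

```cpp
#include <immintrin.h>

// Each call lowers to VRNDSCALEPS with the same immediate the patterns use.
__m512 floor16(__m512 v)     { return _mm512_roundscale_ps(v, 0x1); } // ffloor
__m512 ceil16(__m512 v)      { return _mm512_roundscale_ps(v, 0x2); } // fceil
__m512 trunc16(__m512 v)     { return _mm512_roundscale_ps(v, 0x3); } // ftrunc
__m512 rint16(__m512 v)      { return _mm512_roundscale_ps(v, 0x4); } // frint
__m512 nearbyint16(__m512 v) { return _mm512_roundscale_ps(v, 0xC); } // quiet
```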
OpcodeStr##_.Suffix, + (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, (_.VT (OpNode (X86VBroadcast @@ -6568,15 +7125,16 @@ multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr, OpNode, avx512vl_i64_info, + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info, prd>, VEX_W; - defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr, OpNode, avx512vl_i32_info, prd>; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info, + prd>; } multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm W : avx512_unary_rm_vl<opc_w, OpcodeStr, OpNode, avx512vl_i16_info, prd>; - defm B : avx512_unary_rm_vl<opc_b, OpcodeStr, OpNode, avx512vl_i8_info, prd>; + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>; } multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, @@ -6598,3 +7156,332 @@ def : Pat<(xor (bc_v8i64 (v8i1sextv8i64)), (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), (VPABSQZrr VR512:$src)>; + +multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ + + defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>; +} + +defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info, + HasAVX512>, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; + +//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, + EVEX, EVEX_CD8<_.EltSize, CD8VH>; +} + +multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, + avx512vl_f64_info>, XD, VEX_W; +} + +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; + +def : Pat<(X86Movddup 
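[Aside, not part of the diff: VPLZCNT and VPCONFLICT reappear above as ordinary unary ops via avx512_unary_rm_vl_dq, replacing the hand-written register-class versions deleted earlier in this file. Their semantics, sketched through the AVX512CD intrinsics; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VPLZCNTD: per-lane count of leading zero bits (the ctlz node above).
__m512i lzcnt32(__m512i v) { return _mm512_lzcnt_epi32(v); }

// VPCONFLICTD: for each lane, a bitmask of earlier lanes holding the same
// value; zero means "no conflict", which is what vectorized histogram
// loops test before combining duplicate updates.
__m512i conflicts(__m512i v) { return _mm512_conflict_epi32(v); }
```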
(loadv2f64 addr:$src)), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>; + +defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; + +defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Extract & Insert Integer Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayStore = 1 in + def mr : AVX512Ii8<opc, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1), + imm:$src2)))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + } +} + +multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, PD; + + def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + } +} + +multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, + RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GRC:$dst, + (extractelt (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + let mayStore = 1 in + def mr : AVX512Ii8<0x16, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, 
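[Aside, not part of the diff: a quick semantic reference for the replicate and unpack forms above, using standard intrinsics and assuming AVX-512F; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VMOVDDUP: duplicate the even f64 lane of each 128-bit chunk.
__m512d dup_even(__m512d v) { return _mm512_movedup_pd(v); }

// VMOVSHDUP: duplicate the odd f32 lane of each pair (VMOVSLDUP: the even).
__m512 dup_odd(__m512 v) { return _mm512_movehdup_ps(v); }

// VPUNPCKLDQ: interleave the low dwords of each 128-bit chunk of a and b.
__m512i interleave_lo(__m512i a, __m512i b) {
  return _mm512_unpacklo_epi32(a, b);
}
```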
$dst|$dst, $src1, $src2}", + [(store (extractelt (_.VT _.RC:$src1), + imm:$src2),addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD; + } +} + +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>; +defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; +defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; + +multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; + } +} + +multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GRC:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, + EVEX_4V, TAPD; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, + _.ScalarLdFrag>, TAPD; + } +} + +defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, + extloadi8>, TAPD; +defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, + extloadi16>, PD; +defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; +defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations +//===----------------------------------------------------------------------===// +multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, + AVX512VLVectorVTInfo VTInfo_FP>{ + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>, + EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, + AVX512AIi8Base, EVEX_4V; +} + +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + def rr : AVX512<opc, MRMr, + (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512<opc, MRMm, + (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set 
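[Aside, not part of the diff: the VPEXTR/VPINSR definitions above give EVEX encodings to the familiar element-access operations, so under AVX512BW/DQ a compiler may select them for the classic SSE4.1 intrinsics. A sketch; wrapper names are mine.]

```cpp
#include <immintrin.h>

// VPEXTRB / VPINSRB semantics on the 128-bit types these defs use.
int get_byte(__m128i v)            { return _mm_extract_epi8(v, 3); }
__m128i set_byte(__m128i v, int x) { return _mm_insert_epi8(v, x, 3); }

// VPEXTRQ needs the DQ forms (GR64 destination, x86-64 only), matching the
// HasDQI predicate above.
long long get_quad(__m128i v)      { return _mm_extract_epi64(v, 1); }
```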
_.RC:$dst,(_.VT (OpNode + (_.LdFrag addr:$src1), (i8 imm:$src2))))]>; +} + +multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, Predicate prd>{ + let Predicates = [prd] in + defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v8i64_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v4i64x_info>, EVEX_V256; + defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v2i64x_info>, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; + + +multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86VectorVTInfo _dst, + X86VectorVTInfo _src>{ + def rr : AVX512BI<opc, MRMSrcReg, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT (bitconvert + (_src.LdFrag addr:$src2))))))]>; +} + +multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, + string OpcodeStr, Predicate prd> { + let Predicates = [prd] in + defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info, + v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info, + v32i8x_info>, EVEX_V256; + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info, + v16i8x_info>, EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + HasBWI>, EVEX_4V; + +multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + let Constraints = "$src1 = $dst" in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT _.RC:$src3), + (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (bitconvert (_.LdFrag addr:$src3))), + (i8 imm:$src4))>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (i8 imm:$src4))>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + } + }// Constraints = "$src1 = $dst" +} + +multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ + let Predicates = [HasAVX512] in + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512; + let Predicates = 
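[Aside, not part of the diff: VPSADBW above computes sums of absolute byte differences into 64-bit lanes, which is why its destination VTInfo (v8i64) differs from its source (v64i8). Illustrative intrinsic use, assuming AVX512BW; the wrapper name is mine.]

```cpp
#include <immintrin.h>

// For every group of eight bytes: sum |a[i] - b[i]| into one 64-bit lane.
// A classic building block for motion estimation / block matching.
__m512i sad64(__m512i a, __m512i b) { return _mm512_sad_epu8(a, b); }
```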
[HasAVX512, HasVLX] in { + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256; + } +} + +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index 5e19ad4..1a2e786 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -615,14 +615,14 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; -def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem , - Imm8 , i8imm , imm, i8imm , invalid_node, +def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, + Imm8, i8imm, imm8_su, i8imm, invalid_node, 0, OpSizeFixed, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, - Imm16, i16imm, imm, i16i8imm, i16immSExt8, + Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, 1, OpSize16, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, - Imm32, i32imm, imm, i32i8imm, i32immSExt8, + Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, 1, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, @@ -928,15 +928,22 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let hasSideEffects = 0; } -// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define +// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define // and use EFLAGS. -class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands> +class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, IIC_BIN_CARRY_NONMEM> { let Uses = [areg, EFLAGS]; } +// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Defs = [EFLAGS]; +} + /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". 
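[Aside, not part of the diff: the VPTERNLOGD/Q definitions earlier in this hunk treat the immediate as an 8-entry truth table indexed by the bit triple (src1,src2,src3), so any three-input boolean function is a single instruction. A sketch computing bitwise majority (table 0xE8); the wrapper name is mine.]

```cpp
#include <immintrin.h>

// imm bit (a<<2 | b<<1 | c) gives the output bit; 0xE8 sets table entries
// 3,5,6,7, i.e. "at least two of the three inputs set" -- bitwise majority.
__m512i majority(__m512i a, __m512i b, __m512i c) {
  return _mm512_ternarylogic_epi32(a, b, c, 0xE8);
}
```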
/// @@ -1092,14 +1099,14 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Uses = [EFLAGS], Defs = [EFLAGS] - def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1170,14 +1177,14 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Defs = [EFLAGS] - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } @@ -1246,14 +1253,14 @@ let isCompare = 1 in { "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; } // Defs = [EFLAGS] - def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, - "{$src, %al|al, $src}">; - def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, - "{$src, %ax|ax, $src}">; - def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, - "{$src, %eax|eax, $src}">; - def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, - "{$src, %rax|rax, $src}">; + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, + "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, + "{$src, %ax|ax, $src}">; + def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, + "{$src, %eax|eax, $src}">; + def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, + "{$src, %rax|rax, $src}">; } // isCompare //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index 2056056..787f15b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -156,10 +156,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { Flags |= MachineMemOperand::MOLoad; if (MCID.mayStore()) Flags |= MachineMemOperand::MOStore; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset), - Flags, MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); return addOffset(MIB.addFrameIndex(FI), Offset) .addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 
315f213..c73c950 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// -// SetCC instructions. +// CMOV instructions. multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", isCommutable = 1, SchedRW = [WriteALU] in { diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index 7f850d6..96a29ca 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -132,26 +132,6 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), Requires<[In64BitMode]>; } -// The MSVC runtime contains an _ftol2 routine for converting floating-point -// to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is -// used as a temporary register. No other registers (aside from flags) are -// touched. -// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 -// variant is unnecessary. - -let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { - def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), - "# win32 fptoui", - [(X86WinFTOL RFP32:$src)]>, - Requires<[Not64BitMode]>; - - def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), - "# win32 fptoui", - [(X86WinFTOL RFP64:$src)]>, - Requires<[Not64BitMode]>; -} - //===----------------------------------------------------------------------===// // EH Pseudo Instructions // @@ -172,6 +152,29 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), } +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1 in { + def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>; + + // CATCHRET needs a custom inserter for SEH. + let usesCustomInserter = 1 in + def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from), + "# CATCHRET", + [(catchret bb:$dst, bb:$from)]>; +} + +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in +def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>; + +// This instruction is responsible for re-establishing stack pointers after an +// exception has been caught and we are rejoining normal control flow in the +// parent function or funclet. It generally sets ESP and EBP, and optionally +// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us +// elsewhere. +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in +def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; + let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), @@ -259,18 +262,33 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { let AddedComplexity = 20; } +let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], + AddedComplexity = 1 in { + // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, + // which only require 3 bytes compared to MOV32ri which requires 5. 
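[Aside, not part of the diff: to see why the MOV32r1/MOV32r_1 pseudos described in the comment above and defined just below are worth having: in 32-bit mode, 1 or -1 can be built from XOR plus INC/DEC in three bytes instead of a five-byte MOV32ri. A sketch of source that benefits; the assembly in the comments is what one would expect, not a guarantee.]

```cpp
// Compiled for 32-bit x86 at -Os (OptForSize + NotSlowIncDec):
int one()       { return 1;  }  // xorl %eax,%eax ; incl %eax  (3 bytes)
int minus_one() { return -1; }  // xorl %eax,%eax ; decl %eax  (3 bytes)
// In 64-bit mode INC/DEC need a two-byte encoding, hence Not64BitMode.
```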
+ let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { + def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 1)]>; + def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, -1)]>; + } + + // MOV16ri is 4 bytes, so the instructions above are smaller. + def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; + def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; +} + // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. -let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1, hasSideEffects = 0 in -def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), - "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; +let isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, hasSideEffects = 0 in +def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>; // This 64-bit pseudo-move can be used for both a 64-bit constant that is -// actually the zero-extension of a 32-bit constant, and for labels in the +// actually the zero-extension of a 32-bit constant and for labels in the // x86-64 small code model. -def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; +def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; let AddedComplexity = 1 in def : Pat<(i64 mov64imm32:$src), @@ -509,6 +527,7 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _FR128 : CMOVrr_PSEUDO<FR128, f128>; defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; @@ -535,8 +554,8 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { // TODO: Get this to fold the constant into the instruction. let isCodeGenOnly = 1, Defs = [EFLAGS] in def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), - "or{l}\t{$zero, $dst|$dst, $zero}", - [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK, + "or{l}\t{$zero, $dst|$dst, $zero}", [], + IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK, Sched<[WriteALULd, WriteRMW]>; let hasSideEffects = 1 in @@ -752,67 +771,111 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", /* The following multiclass tries to make sure that in code like * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) * an operation directly on memory is generated instead of wasting a register. 
* It is not automatic as atomic_store/load are only lowered to MOV instructions * extremely late to prevent them from being accidentally reordered in the backend * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) */ -multiclass RELEASE_BINOP_MI<string op> { +multiclass RELEASE_BINOP_MI<SDNode op> { def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (op (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), GR8:$src))]>; // NAME#16 is not generated as 16-bit arithmetic instructions are considered // costly and avoided as far as possible by this backend anyway def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (op (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), GR32:$src))]>; def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, (op (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), GR64:$src))]>; +} +let Defs = [EFLAGS] in { + defm RELEASE_ADD : RELEASE_BINOP_MI<add>; + defm RELEASE_AND : RELEASE_BINOP_MI<and>; + defm RELEASE_OR : RELEASE_BINOP_MI<or>; + defm RELEASE_XOR : RELEASE_BINOP_MI<xor>; + // Note: we don't deal with sub, because substractions of constants are + // optimized into additions before this code can run. +} + +// Same as above, but for floating-point. +// FIXME: imm version. +// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI<SDNode op> { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (op + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (op + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; +// FIXME: Add fsub, fmul, fdiv, ... 
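[Aside, not part of the diff: the RELEASE_* pseudos above exist so that the idiom described in the comment folds into a single memory-destination instruction. A minimal C++ illustration; whether the fold actually fires depends on the backend recognizing the acquire-load/release-store pair on the same address.]

```cpp
#include <atomic>

std::atomic<int> counter;

void bump() {
  // With the mi/mr pseudos above this can lower to `addl $5, counter(%rip)`
  // rather than a separate load, register add, and store.
  counter.store(counter.load(std::memory_order_acquire) + 5,
                std::memory_order_release);
}
```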
} -defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; -defm RELEASE_AND : RELEASE_BINOP_MI<"and">; -defm RELEASE_OR : RELEASE_BINOP_MI<"or">; -defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; -// Note: we don't deal with sub, because substractions of constants are -// optimized into additions before this code can run multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"8m PSEUDO!", [(atomic_store_8 addr:$dst, dag8)]>; def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"16m PSEUDO!", [(atomic_store_16 addr:$dst, dag16)]>; def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"32m PSEUDO!", [(atomic_store_32 addr:$dst, dag32)]>; def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"64m PSEUDO!", [(atomic_store_64 addr:$dst, dag64)]>; } -defm RELEASE_INC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 1)), - (add (atomic_load_16 addr:$dst), (i16 1)), - (add (atomic_load_32 addr:$dst), (i32 1)), - (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; -defm RELEASE_DEC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 -1)), - (add (atomic_load_16 addr:$dst), (i16 -1)), - (add (atomic_load_32 addr:$dst), (i32 -1)), - (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +let Defs = [EFLAGS] in { + defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; + defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +} /* TODO: These don't work because the type inference of TableGen fails. TODO: find a way to fix it. -defm RELEASE_NEG : RELEASE_UNOP< - (ineg (atomic_load_8 addr:$dst)), - (ineg (atomic_load_16 addr:$dst)), - (ineg (atomic_load_32 addr:$dst)), - (ineg (atomic_load_64 addr:$dst))>; +let Defs = [EFLAGS] in { + defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +} +// NOT doesn't set flags. 
defm RELEASE_NOT : RELEASE_UNOP< (not (atomic_load_8 addr:$dst)), (not (atomic_load_16 addr:$dst)), @@ -821,42 +884,42 @@ defm RELEASE_NOT : RELEASE_UNOP< */ def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV8mi PSEUDO!", [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV16mi PSEUDO!", [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV32mi PSEUDO!", [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV8mr PSEUDO!", [(atomic_store_8 addr:$dst, GR8 :$src)]>; def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV16mr PSEUDO!", [(atomic_store_16 addr:$dst, GR16:$src)]>; def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV32mr PSEUDO!", [(atomic_store_32 addr:$dst, GR32:$src)]>; def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV64mr PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV8rm PSEUDO!", [(set GR8:$dst, (atomic_load_8 addr:$src))]>; def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV16rm PSEUDO!", [(set GR16:$dst, (atomic_load_16 addr:$src))]>; def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV32rm PSEUDO!", [(set GR32:$dst, (atomic_load_32 addr:$src))]>; def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV64rm PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// @@ -1077,11 +1140,11 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; // zextload bool -> zextload byte def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; -def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>; -def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>; def : Pat<(zextloadi64i1 addr:$src), (SUBREG_TO_REG (i64 0), - (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; + (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit @@ -1298,7 +1361,6 @@ def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), sub_32bit)>; // r & (2^16-1) ==> movz -let AddedComplexity = 1 in // Give priority over i64immZExt32. 
def : Pat<(and GR64:$src, 0xffff), (SUBREG_TO_REG (i64 0), (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index 4cd5563..8c351a5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -53,6 +53,19 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>; def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16; + + // The machine return from interrupt instruction, but sometimes we need to + // perform a post-epilogue stack adjustment. Codegen emits the pseudo form + // which expands to include an SP adjustment if necessary. + def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, + OpSize16; + def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], + IIC_IRET>, OpSize32; + def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [], + IIC_IRET>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>; + } // Unconditional branches. diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index c4b2d6d..af43d9f 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -98,22 +98,22 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALU]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>; def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALU]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register @@ -146,18 +146,22 @@ def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), Sched<[WriteALULd]>, Requires<[In64BitMode]>; // movzbq and movzwq encodings for the disassembler -def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; -def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; +let hasSideEffects = 0 in { +def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + 
TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +} // 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a // 32-bit register. diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index 7cc3b59..fd800cf 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -15,13 +15,31 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// -let Constraints = "$src1 = $dst" in { +// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined +// below, both the register and memory variants are commutable. +// For the register form the commutable operands are 1, 2 and 3. +// For the memory variant the folded operand must be in 3. Thus, +// in that case, only the operands 1 and 2 can be swapped. +// Commuting some of operands may require the opcode change. +// FMA*213*: +// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); +// operands 1 and 3 (register forms only): *213* --> *231*; +// operands 2 and 3 (register forms only): *213* --> *132*. +// FMA*132*: +// operands 1 and 2 (memory & register forms): *132* --> *231*; +// operands 1 and 3 (register forms only): *132* --> *132*(no changes); +// operands 2 and 3 (register forms only): *132* --> *213*. +// FMA*231*: +// operands 1 and 2 (memory & register forms): *231* --> *132*; +// operands 1 and 3 (register forms only): *231* --> *213*; +// operands 2 and 3 (register forms only): *231* --> *231*(no changes). 
+ +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator Op = null_frag> { - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -29,7 +47,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, @@ -37,7 +55,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -45,7 +63,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>, VEX_L; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, @@ -54,34 +72,20 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, (OpVT256 (Op VR256:$src2, VR256:$src1, (MemFrag256 addr:$src3))))]>, VEX_L; } -} // Constraints = "$src1 = $dst" multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - // For 213, both the register and memory variant are commutable. - // Indeed, the commutable operands are 1 and 2 and both live in registers - // for both variants. defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, "213", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 1, - Op>; -let hasSideEffects = 0 in { + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; - // For 231, only the register variant is commutable. - // For the memory variant the folded operand must be in 3. Thus, - // in that case, it cannot be swapped with 2. defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, "231", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 0>; -} // hasSideEffects = 0 + MemFrag128, MemFrag256, OpTy128, OpTy256>; } // Fused Multiply-Add @@ -126,83 +130,122 @@ let ExeDomain = SSEPackedDouble in { v4f64>, VEX_W; } -let Constraints = "$src1 = $dst" in { -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, - RegisterClass RC, ValueType OpVT, PatFrag mem_frag, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, +// All source register operands of FMA opcodes defined in fma3s_rm multiclass +// can be commuted. 
In many cases such commute transformation requres an opcode +// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form +// would require an opcode change to FMA*231: +// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; +// --> +// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; +// Please see more detailed comment at the very beginning of the section +// defining FMA3 opcodes above. +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, SDPatternOperator OpNode = null_frag> { - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, - (mem_frag addr:$src3))))]>; + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; +} + +// These FMA*_Int instructions are defined specially for being used when +// the scalar FMA intrinsics are lowered to machine instructions, and in that +// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. +// instructions. +// +// All of the FMA*_Int opcodes are defined as commutable here. +// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial +// and the corresponding optimizations have been developed. +// Commuting the 1st operand of FMA*_Int requires some additional analysis, +// the commute optimization is legal only if all users of FMA*_Int use only +// the lowest element of the FMA*_Int instruction. Even though such analysis +// may be not implemented yet we allow the routines doing the actual commute +// transformation to decide if one or another instruction is commutable or not. +let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, + hasSideEffects = 0 in +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, + Operand memopr, RegisterClass RC> { + def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>; + + let mayLoad = 1 in + def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, memopr:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>; } -} // Constraints = "$src1 = $dst" multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, string PT2, Intrinsic Int, - SDNode OpNode, RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, PatFrag mem_frag, - ComplexPattern mem_cpat> { -let hasSideEffects = 0 in { - defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), - x86memop, RC, OpVT, mem_frag>; - // See the other defm of r231 for the explanation regarding the - // commutable flags. 
- defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), - x86memop, RC, OpVT, mem_frag, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 0>; + string OpStr, string PackTy, + SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop> { + defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>; + defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC, + OpNode>; + defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>; } -// See the other defm of r213 for the explanation regarding the -// commutable flags. -defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), - x86memop, RC, OpVT, mem_frag, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 1, - OpNode>; +// The FMA 213 form is created for lowering of scalar FMA intrinscis +// to machine instructions. +// The FMA 132 form can trivially be get by commuting the 2nd and 3rd operands +// of FMA 213 form. +// The FMA 231 form can be get only by commuting the 1st operand of 213 or 132 +// forms and is possible only after special analysis of all uses of the initial +// instruction. Such analysis do not exist yet and thus introducing the 231 +// form of FMA*_Int instructions is done using an optimistic assumption that +// such analysis will be implemented eventually. +multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, + RegisterClass RC, Operand memop> { + defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy), + memop, RC>; + defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), + memop, RC>; + defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy), + memop, RC>; } multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpStr, Intrinsic IntF32, Intrinsic IntF64, SDNode OpNode> { - defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode, - FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>; - defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode, - FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W; + let ExeDomain = SSEPackedSingle in + defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode, + FR32, f32mem>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>; + + let ExeDomain = SSEPackedDouble in + defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode, + FR64, f64mem>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>, + VEX_W; -// These patterns use the 123 ordering, instead of 213, even though -// they match the intrinsic to the 213 version of the instruction. -// This is because src1 is tied to dest, and the scalar intrinsics -// require the pass-through values to come from the first source -// operand, not the second. + // These patterns use the 123 ordering, instead of 213, even though + // they match the intrinsic to the 213 version of the instruction. + // This is because src1 is tied to dest, and the scalar intrinsics + // require the pass-through values to come from the first source + // operand, not the second. 
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SSr213r") - (COPY_TO_REGCLASS $src1, FR32), - (COPY_TO_REGCLASS $src2, FR32), - (COPY_TO_REGCLASS $src3, FR32)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int") + $src1, $src2, $src3), VR128)>; def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SDr213r") - (COPY_TO_REGCLASS $src1, FR64), - (COPY_TO_REGCLASS $src2, FR64), - (COPY_TO_REGCLASS $src3, FR64)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int") + $src1, $src2, $src3), VR128)>; } defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, @@ -334,36 +377,23 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { } // isCodeGenOnly = 1 } -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; - let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, @@ -379,6 +409,22 @@ let ExeDomain = SSEPackedSingle in { } let ExeDomain = SSEPackedDouble in { + // Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + 
fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 49068e9..03ae211 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. -multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { +multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, + bit Forward = 1> { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, - (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode 
RFP32:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i16)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i32)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{l}\t$src")>; } let Defs = [FPSW] in { @@ -213,14 +243,14 @@ defm DIV : FPBinary_rr<fdiv>; let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary<fadd, MRM0m, "add">; defm SUB : FPBinary<fsub, MRM4m, "sub">; -defm SUBR: FPBinary<fsub ,MRM5m, "subr">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary<fmul, MRM1m, "mul">; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary<fdiv, MRM6m, "div">; -defm DIVR: FPBinary<fdiv, MRM7m, "divr">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; } } @@ -306,13 +336,13 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; -def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; -def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">; -def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">; +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">; // Floating point cmovs. 
class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : @@ -633,16 +663,18 @@ def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; -def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; -def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], - IIC_FXSAVE>, TB, Requires<[In64BitMode]>; -def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; -def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], - IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +let Predicates = [HasFXSR] in { + def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; + def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; + def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; + def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +} // Predicates = [FeatureFXSR] } // SchedRW //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 1f61ffa..6432863 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,6 +38,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -58,13 +60,17 @@ def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>; +def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>; def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", @@ -74,11 +80,18 @@ def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTypeProfile<1, 2, 
[SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86psadbw : SDNode<"X86ISD::PSADBW", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, SDTCisInt<3>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -86,9 +99,11 @@ def X86psign : SDNode<"X86ISD::PSIGN", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, + SDTCisPtrTy<2>]>>; def X86pextrw : SDNode<"X86ISD::PEXTRW", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, + SDTCisPtrTy<2>]>>; def X86pinsrb : SDNode<"X86ISD::PINSRB", SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; @@ -114,19 +129,17 @@ def X86vsext : SDNode<"X86ISD::VSEXT", SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>]>>; -def X86vtrunc : SDNode<"X86ISD::VTRUNC", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<0, 1>]>>; +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + def X86trunc : SDNode<"X86ISD::TRUNC", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>>; - -def X86vtruncm : SDNode<"X86ISD::VTRUNCM", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisVec<2>, SDTCisInt<2>, - SDTCisOpSmallerThanOp<0, 2>]>>; def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, @@ -136,6 +149,35 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>]>>; +def X86fround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86froundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisInt<3>]>>; + +def X86fpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86fpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<3>]>>; + def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; @@ -159,10 +201,15 @@ def X86CmpMaskCCRound : def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, 
SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; -def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; -def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86CmpMaskCCScalarRound : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; + +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>; def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -178,6 +225,32 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>; +def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>; + +def X86vprot : SDNode<"X86ISD::VPROT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vproti : SDNode<"X86ISD::VPROTI", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>]>>; + +def X86vpshl : SDNode<"X86ISD::VPSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vpsha : SDNode<"X86ISD::VPSHA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; + +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; + def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; @@ -190,6 +263,7 @@ def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>, @@ -201,11 +275,15 @@ def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>, def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86pmuldq : SDNode<"X86ISD::PMULDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86extrqi : SDNode<"X86ISD::EXTRQI", SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, @@ -221,24 +299,30 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI", def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; -def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - 
SDTCisVec<2>]>; + SDTCisSameSizeAs<0,2>, + SDTCisSameNumEltsAs<0,2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0,1>, SDTCisInt<2>]>; + SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>; +def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; +def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, + SDTCisVT<4, i8>]>; + def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; @@ -250,15 +334,17 @@ def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<2>]>; + SDTCisVec<0>, SDTCisVT<2, i32>]>; def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>]>; + SDTCisVec<0>, SDTCisVT<3, i32>]>; def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>; + SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; -def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; + +def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; +def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; @@ -281,33 +367,74 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; -def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>; def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; -def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; +def X86VPermv : SDNode<"X86ISD::VPERMV", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; -def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; -def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>; +def X86VPermt2 : SDNode<"X86ISD::VPERMV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + 
SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>, + SDTCisSameSizeAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86VPermi2X : SDNode<"X86ISD::VPERMIV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; -def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; -def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>; +def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisFP<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisVT<2, i32>]>, []>; +def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", + SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSubVecOfVec<1, 0>]>, []>; +// SDTCisSubVecOfVec restriction cannot be applied for 128 bit version of VBROADCASTI32x2. +def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisSameAs<0,1>]>, []>; + def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; + [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, + SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; + [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; @@ -317,11 +444,13 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; -def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; -def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; -def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>; +def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; @@ -341,9 +470,11 @@ def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; -def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; -def X86rcp28s : SDNode<"X86ISD::RCP28", 
STDFp2SrcRm>; -def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; +def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>; +def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>; +def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -362,7 +493,8 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, - SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVT<3, i32>]>; def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; @@ -371,9 +503,12 @@ def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>, + SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>; def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; - +def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, + SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>; def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCVecEltisVT<1, i32>, SDTCisInt<2>]>; @@ -392,6 +527,10 @@ def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; +def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>; +def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>; +def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>; // Vector with rounding mode // cvtt fp-to-int staff @@ -417,17 +556,35 @@ def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; +def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>, + SDTCisFP<0>, + SDTCisVT<2, i32>]> >; + +def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisFP<1>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>]> >; def X86vfpextRnd : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, SDTCisOpSmallerThanOp<1, 0>, - SDTCisInt<2>]>>; + SDTCisVT<2, i32>]>>; def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, - SDTCisInt<2>]>>; + SDTCisOpSmallerThanOp<0, 1>, + SDTCisVT<2, i32>]>>; + +def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; //===----------------------------------------------------------------------===// // SSE Complex Patterns @@ -436,10 +593,10 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", // These are 'extloads' from a scalar to the 
low element of a vector, zeroing // the top elements. These are used for the SSE 'ss' and 'sd' instruction // forms. -def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [], +def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; -def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], +def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; @@ -490,9 +647,9 @@ def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def loadf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>; def loadf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>; // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -590,9 +747,9 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def memopfsf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; def memopfsf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a @@ -604,32 +761,6 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; -// MOVNT Support -// Like 'store', but requires the non-temporal bit to be set -def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal(); - return false; -}]>; - -def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && !ST->isTruncatingStore() && - ST->getAddressingMode() == ISD::UNINDEXED && - ST->getAlignment() >= 16; - return false; -}]>; - -def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && - ST->getAlignment() < 16; - return false; -}]>; - def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -851,29 +982,59 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), return isa<MaskedLoadSDNode>(N); }]>; +// masked store fragments. 
+// X86mstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; + def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 16; return false; }]>; def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 32; return false; }]>; def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 64; return false; }]>; def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ return isa<MaskedStoreSDNode>(N); }]>; +// masked truncstore fragments +// X86mtruncstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index cf68ef0..246804e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -101,9 +101,11 @@ struct X86MemoryFoldTableEntry { void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) - : X86GenInstrInfo( - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32), + X86::CATCHRET), Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { @@ -332,6 +334,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, + { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, + { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, + { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, @@ -495,7 +500,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, @@ -605,7 +609,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, @@ -1647,6 +1650,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, @@ -1729,11 +1738,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, @@ -1749,11 +1764,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, 
X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, @@ -1769,11 +1790,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, @@ -1789,11 +1816,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, @@ -2282,7 +2315,35 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::FsVMOVAPSrm: case X86::FsVMOVAPDrm: case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: { + case X86::FsMOVAPDrm: + // AVX-512 + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. 
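// (Illustrative aside, not from this patch, assuming the standard
// TargetInstrInfo contract for this hook: a client such as the register
// allocator asks roughly
//   if (TII->isTriviallyReMaterializable(DefMI, AA))
//     TII->reMaterialize(MBB, InsertPt, NewReg, /*SubIdx=*/0, DefMI, TRI);
// so listing the AVX-512 load opcodes above lets a constant-pool vector
// load be re-executed at its use point instead of being spilled and
// reloaded across a call.)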
if (MI->getOperand(1+X86::AddrBaseReg).isReg() && MI->getOperand(1+X86::AddrScaleAmt).isImm() && @@ -2363,9 +2424,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // It is safe to clobber EFLAGS at the end of a block if no successor has it // live in. if (Iter == E) { - for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), - SE = MBB.succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *S : MBB.successors()) + if (S->isLiveIn(X86::EFLAGS)) return false; return true; } @@ -2411,13 +2471,29 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const { - // MOV32r0 is implemented with a xor which clobbers condition code. - // Re-materialize it as movri instructions to avoid side effects. - unsigned Opc = Orig->getOpcode(); - if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) { + bool ClobbersEFLAGS = false; + for (const MachineOperand &MO : Orig->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + ClobbersEFLAGS = true; + break; + } + } + + if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { + // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side + // effects. + int Value; + switch (Orig->getOpcode()) { + case X86::MOV32r0: Value = 0; break; + case X86::MOV32r1: Value = 1; break; + case X86::MOV32r_1: Value = -1; break; + default: + llvm_unreachable("Unexpected instruction!"); + } + DebugLoc DL = Orig->getDebugLoc(); BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) - .addImm(0); + .addImm(Value); } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); MBB.insert(I, MI); @@ -2428,7 +2504,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, } /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. -static bool hasLiveCondCodeDef(MachineInstr *MI) { +bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef() && @@ -2453,7 +2529,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI, inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address // instructions if we can encode them appropriately. - // A LEA instruction utilizes a SIB byte to encode it's scale factor. + // A LEA instruction utilizes a SIB byte to encode its scale factor. // The SIB.scale field is two bits wide which means that we can encode any // shift amount less than 4. return ShAmt < 4 && ShAmt > 0; @@ -2493,7 +2569,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, ImplicitOp = Src; ImplicitOp.setImplicit(); - NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); + NewSrc = getX86SubSuperRegister(Src.getReg(), 64); MachineBasicBlock::LivenessQueryResult LQR = MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); @@ -2914,10 +2990,162 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// We have a few instructions that must be hacked on to commute them. -/// -MachineInstr * -X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// The second parameter is optional and is used as the second return from +/// the function.
It is set to true if the given instruction has FMA3 opcode +/// that is used for lowering of scalar FMA intrinsics, and it is set to false +/// otherwise. +static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { + if (IsIntrinsic) + *IsIntrinsic = false; + + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case 
X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + if (IsIntrinsic) + 
*IsIntrinsic = true; + return true; + default: + return false; + } + llvm_unreachable("Opcode not handled by the switch"); +} + +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) @@ -2944,7 +3172,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } MI->setDesc(get(Opc)); MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -2980,7 +3208,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ @@ -2995,7 +3223,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMPPDrri: case X86::CMPPSrri: @@ -3016,7 +3244,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI = MF.CloneMachineInstr(MI); NewMI = false; } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); default: return nullptr; } @@ -3045,7 +3273,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -3124,11 +3352,272 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Fallthrough intended. } default: - return TargetInstrInfo::commuteInstruction(MI, NewMI); + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. + if (SrcOpIdx1 != CommuteAnyOperandIndex && + (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + return false; + if (SrcOpIdx2 != CommuteAnyOperandIndex && + (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + return false; + + // Look for two different register operands assumed to be commutable + // regardless of the FMA opcode. The FMA opcode is adjusted later. 
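// (Sketch of the assumed calling protocol, mirroring how the generic
// TargetInstrInfo hooks are driven elsewhere in this patch:
//   unsigned Idx1 = TargetInstrInfo::CommuteAnyOperandIndex;
//   unsigned Idx2 = TargetInstrInfo::CommuteAnyOperandIndex;
//   if (TII->findCommutedOpIndices(MI, Idx1, Idx2))
//     TII->commuteInstruction(MI, /*NewMI=*/false, Idx1, Idx2);
// Passing CommuteAnyOperandIndex leaves the choice of one or both
// operands to the routine below.)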
+ if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of the operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Neither of the operands is fixed. By default set one of the commutable + // operands to the last register operand of the instruction. + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only one of the operands is not fixed. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // is therefore useless. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + + // Check if we can adjust the opcode to preserve the semantics when + // commuting the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned Opc = MI->getOpcode(); + + // Define the array that holds FMA opcodes in groups + // of 3 opcodes (132, 213, 231) in each group.
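// (For reference, the three forms written over the source operands
// (S1, S2, S3), shown for FMADD; the SUB/NMADD/NMSUB variants permute
// identically:
//   132: Result = S1 * S3 + S2
//   213: Result = S2 * S1 + S3
//   231: Result = S2 * S3 + S1
// Worked example: the 132 form over (A, C, b) computes A*b + C. After
// swapping S1 and S2 the sources are (C, A, b), and the 231 form then
// computes S2*S3 + S1 = A*b + C -- the same value, which is why the
// tables below pair 132 with 231 when operands 1 and 2 are exchanged.)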
+ static const unsigned RegularOpcodeGroups[][3] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, 
X86::VFMADDSUBPSr231rY }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + const unsigned FormsNum = 3; + + bool IsIntrinOpcode; + isFMA3(Opc, &IsIntrinOpcode); + + size_t GroupsNum; + const unsigned (*OpcodeGroups)[3]; + if (IsIntrinOpcode) { + GroupsNum = array_lengthof(IntrinOpcodeGroups); + OpcodeGroups = IntrinOpcodeGroups; + } else { + GroupsNum = array_lengthof(RegularOpcodeGroups); + OpcodeGroups = RegularOpcodeGroups; + } + + const unsigned *FoundOpcodesGroup = nullptr; + size_t FormIndex; + + // Look for the input opcode in the corresponding opcodes table. + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; + break; + } + } + } + + // The input opcode does not match with any of the opcodes from the tables. + // The unsupported FMA opcode must be added to one of the two opcode groups + // defined above. 
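// (Hypothetical usage sketch: for an MI with opcode X86::VFMADDSSr213r,
//   unsigned NewOpc = getFMA3OpcodeToCommuteOperands(MI, 2, 3);
// yields X86::VFMADDSSr132r, because swapping S2 and S3 of the 213 form
// "S2*S1 + S3" produces exactly the value the 132 form computes over the
// new operand order; a return value of 0 means no opcode expresses the
// requested swap.)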
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + + // Put the lowest index in SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such an analysis + // is not implemented yet, so just return 0 in that case. + // When such an analysis becomes available, this will be the right place to + // call it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + + unsigned Case; + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + Case = 0; + else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + Case = 1; + else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + Case = 2; + else + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FoundOpcodesGroup[FormIndex]; +} + +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { case X86::CMPPDrri: @@ -3141,46 +3630,22 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI->getOperand(3).getImm() & 0x7; switch (Imm) { - case 0x00: // EQUAL - case 0x03: // UNORDERED - case 0x04: // NOT EQUAL - case 0x07: // ORDERED - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here.
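// (These four are exactly the symmetric predicates: for example, an
// immediate of 0x00 makes CMPPSrri compute a lane-wise a == b, which is
// unchanged under operand exchange, whereas 0x01 (LT) is not symmetric
// and is deliberately absent from the list above.)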
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); } return false; } - case X86::VFMADDPDr231r: - case X86::VFMADDPSr231r: - case X86::VFMADDSDr231r: - case X86::VFMADDSSr231r: - case X86::VFMSUBPDr231r: - case X86::VFMSUBPSr231r: - case X86::VFMSUBSDr231r: - case X86::VFMSUBSSr231r: - case X86::VFNMADDPDr231r: - case X86::VFNMADDPSr231r: - case X86::VFNMADDSDr231r: - case X86::VFNMADDSSr231r: - case X86::VFNMSUBPDr231r: - case X86::VFNMSUBPSr231r: - case X86::VFNMSUBSDr231r: - case X86::VFNMSUBSSr231r: - case X86::VFMADDPDr231rY: - case X86::VFMADDPSr231rY: - case X86::VFMSUBPDr231rY: - case X86::VFMSUBPSr231rY: - case X86::VFNMADDPDr231rY: - case X86::VFNMADDPSr231rY: - case X86::VFNMSUBPDr231rY: - case X86::VFNMSUBPSr231rY: - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } + return false; } static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { @@ -3821,15 +4286,58 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } -inline static bool MaskRegClassContains(unsigned Reg) { +static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } + +static bool GRRegClassContains(unsigned Reg) { + return X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg) || + X86::GR16RegClass.contains(Reg) || + X86::GR8RegClass.contains(Reg); +} +static +unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVBrk; + } + if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVBkr; + } + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) + return X86::KMOVQkk; + if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) + return X86::KMOVDrk; + if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) + return X86::KMOVQrk; + if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + return X86::KMOVDkr; + if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) + return X86::KMOVQkr; + return 0; +} + static -unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, + const X86Subtarget &Subtarget) +{ + if (Subtarget.hasDQI()) + if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) + return Opc; + if (Subtarget.hasBWI()) + if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) + return Opc; if (X86::VR128XRegClass.contains(DestReg, SrcReg) || X86::VR256XRegClass.contains(DestReg, SrcReg) || X86::VR512RegClass.contains(DestReg, SrcReg)) { @@ -3837,21 +4345,14 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } - if (MaskRegClassContains(DestReg) && - MaskRegClassContains(SrcReg)) + if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) return X86::KMOVWkk; - if (MaskRegClassContains(DestReg) && - 
(X86::GR32RegClass.contains(SrcReg) || - X86::GR16RegClass.contains(SrcReg) || - X86::GR8RegClass.contains(SrcReg))) { - SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32); + if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); return X86::KMOVWkr; } - if ((X86::GR32RegClass.contains(DestReg) || - X86::GR16RegClass.contains(DestReg) || - X86::GR8RegClass.contains(DestReg)) && - MaskRegClassContains(SrcReg)) { - DestReg = getX86SubSuperRegister(DestReg, MVT::i32); + if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); return X86::KMOVWrk; } return 0; @@ -3886,7 +4387,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; else if (HasAVX512) - Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) @@ -3900,34 +4401,91 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - // Moving EFLAGS to / from another register requires a push and a pop. - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - if (SrcReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF64)); - BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } return; } - if (X86::GR32RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF32)); - BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); - return; + + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using a + // signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH + // using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure it + // isn't clobbered by PUSH/POP'ing it before and after saving/restoring + // the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags are + // actually live inside EFLAGS. 
Were we able to do a single SETcc instead of + // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. + // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. + + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + if (!AXDead) { + // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may + // actually be dead. This is not a problem for correctness as we are just + // (unnecessarily) saving+restoring a dead register. However the + // MachineVerifier expects operands that read from dead registers + // to be marked with the "undef" flag. + // An example of this can be found in + // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and + // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using + // -verify-machineinstrs. + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); } - } - if (DestReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH64r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF64)); - return; + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); } - if (X86::GR32RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH32r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF32)); - return; + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) @@ -4602,9 +5160,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { MachineBasicBlock *MBB = CmpInstr->getParent(); - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *Successor : MBB->successors()) + if (Successor->isLiveIn(X86::EFLAGS)) return false; } @@ -4645,8 +5202,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. - for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) - OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); + for (auto &Op : OpsToUpdate) + Op.first->setDesc(get(Op.second)); return true; } @@ -4694,8 +5251,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. - MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI); - if (FoldMI) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -4725,6 +5281,38 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, return true; } +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two %k0 reads. 
+/// This is used for mapping: +/// %k4 = K_SET1 +/// to: +/// %k4 = KXNORrr %k0, %k0 +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc, unsigned Reg) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + MIB->setDesc(Desc); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + return true; +} + +static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, + bool MinusOne) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + + // Insert the XOR. + BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Turn the pseudo into an INC or DEC. + MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); + MIB.addReg(Reg); + + return true; +} + // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -4735,8 +5323,8 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) @@ -4753,6 +5341,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); + case X86::MOV32r1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); + case X86::MOV32r_1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -4777,10 +5369,25 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; + case X86::MOV32ri64: + MI->setDesc(get(X86::MOV32ri)); + return true; + + // KNL does not recognize dependency-breaking idioms for mask registers, + // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. + // Using %k0 as the undef input register is a performance heuristic based + // on the assumption that %k0 is used less frequently than the other mask + // registers, since it is not usable as a write mask. + // FIXME: A more advanced approach would be to choose the best input mask + // register based on context. 
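// (Resulting expansions under the %k0 heuristic, for illustration:
//   %k4 = KSET0D  ->  %k4 = KXORDrr  %k0<undef>, %k0<undef>
//   %k4 = KSET1Q  ->  %k4 = KXNORQrr %k0<undef>, %k0<undef>
// Both inputs are marked 'undef', so no real value of %k0 is read; the
// choice of input register only matters to the hardware's dependence
// tracking.)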
case X86::KSET0B: - case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); + case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); + case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); + case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); case X86::KSET1B: - case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); + case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); + case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; @@ -4788,12 +5395,28 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return false; } -static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) { +static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, + int PtrOffset = 0) { unsigned NumAddrOps = MOs.size(); - for (unsigned i = 0; i != NumAddrOps; ++i) - MIB.addOperand(MOs[i]); - if (NumAddrOps < 4) // FrameIndex only - addOffset(MIB, 0); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether it's zero or not). + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. + assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + } } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, @@ -4828,7 +5451,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI, const TargetInstrInfo &TII) { + MachineInstr *MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); @@ -4838,7 +5462,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, MachineOperand &MO = MI->getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); - addOperands(MIB, MOs); + addOperands(MIB, MOs, PtrOffset); } else { MIB.addOperand(MO); } @@ -4860,6 +5484,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, return MIB.addImm(0); } +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + // Attempt to convert the load of the inserted vector into a folded load + // of a single float.
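// (Sketch of the transform: when operand 2 comes from a full 16-byte
// load but the immediate selects only element SrcIdx of it, the fold
// loads just that float -- the displacement is advanced by SrcIdx*4 and
// the new immediate becomes (DstIdx << 4) | ZMask, since a scalar load
// always supplies element 0.)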
+ if (OpNum == 2) { + unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm + : X86::INSERTPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + }; + + return nullptr; +} + MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, @@ -4869,10 +5527,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; - // For CPUs that favor the register form of a call, - // do not fold loads into calls. - if (isCallRegIndirect && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || + MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || + MI->getOpcode() == X86::PUSH64r)) return nullptr; unsigned NumOps = MI->getDesc().getNumOperands(); @@ -4886,6 +5547,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. + if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + // Folding a memory location into the two-address part of a two-address // instruction is different than folding it in other places. It requires // replacing the *two* registers with the memory location. @@ -4963,60 +5630,56 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { - unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); - bool Tied0 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands is tied to the destination // then we cannot commute + fold. - if ((HasDef && Reg0 == Reg1 && Tied0) || - (HasDef && Reg0 == Reg2 && Tied1)) + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) return nullptr; - if ((CommuteOpIdx1 == OriginalOpIdx) || - (CommuteOpIdx2 == OriginalOpIdx)) { - MachineInstr *CommutedMI = commuteInstruction(MI, false); - if (!CommutedMI) { - // Unable to commute.
- return nullptr; - } - if (CommutedMI != MI) { - // New instruction. We can't fold from this. - CommutedMI->eraseFromParent(); - return nullptr; - } + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } - // Attempt to fold with the commuted version of the instruction. - unsigned CommuteOp = - (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); - NewMI = - foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align, - /*AllowCommute=*/false); - if (NewMI) - return NewMI; - - // Folding failed again - undo the commute before returning. - MachineInstr *UncommutedMI = commuteInstruction(MI, false); - if (!UncommutedMI) { - // Unable to commute. - return nullptr; - } - if (UncommutedMI != MI) { - // New instruction. It doesn't need to be kept. - UncommutedMI->eraseFromParent(); - return nullptr; - } + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; - // Return here to prevent duplicate fuse failure report. + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. return nullptr; } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. + return nullptr; } } @@ -5208,13 +5871,14 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, // If MI kills this register, the false dependence is already broken. if (MI->killsRegister(Reg, TRI)) return; + if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = Subtarget.hasAVX(); - unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + MI->addRegisterKilled(Reg, TRI, true); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. 
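// (The classic case handled here, stated informally: an instruction such
// as CVTSI2SSrr writes only the low lanes of its XMM destination and so
// carries a false dependence on the register's previous contents; the
// inserted xorps/vxorps, with both source operands marked 'undef', severs
// that dependence without reading any real value.)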
@@ -5222,21 +5886,20 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); - } else - return; - MI->addRegisterKilled(Reg, TRI, true); + MI->addRegisterKilled(Reg, TRI, true); + } } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex) const { // Check switch flag - if (NoFusing) return nullptr; + if (NoFusing) + return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -5303,6 +5966,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: + case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: + case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: return false; default: return true; @@ -5318,6 +5987,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: + case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: + case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: return false; default: return true; @@ -5342,10 +6017,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Check switch flag if (NoFusing) return nullptr; - // Unless optimizing for size, don't fold to avoid partial - // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + // Avoid partial register update stalls unless optimizing for size. + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; // Determine the alignment of the load. @@ -5460,62 +6133,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( /*Size=*/0, Alignment, /*AllowCommute=*/true); } -bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // Check switch flag - if (NoFusing) return 0; - - if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { - switch (MI->getOpcode()) { - default: return false; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - return true; - case X86::ADD32ri: - // FIXME: AsmPrinter doesn't know how to handle - // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 
- if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return false; - break; - } - } - - if (Ops.size() != 1) - return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - unsigned NumOps = MI->getDesc().getNumOperands(); - bool isTwoAddr = NumOps > 1 && - MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // Folding a memory location into the two-address part of a two-address - // instruction is different than folding it other places. It requires - // replacing the *two* registers with the memory location. - const DenseMap<unsigned, - std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; - if (isTwoAddr && NumOps >= 2 && OpNum < 2) { - OpcodeTablePtr = &RegOp2MemOpTable2Addr; - } else if (OpNum == 0) { - if (Opc == X86::MOV32r0) - return true; - - OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (OpNum == 1) { - OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (OpNum == 2) { - OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (OpNum == 3) { - OpcodeTablePtr = &RegOp2MemOpTable3; - } - - if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) - return true; - return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); -} - bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr*> &NewMIs) const { @@ -5536,9 +6153,10 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. @@ -5582,20 +6200,19 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, if (FoldedStore) MIB.addReg(Reg, RegState::Define); - for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) - MIB.addOperand(BeforeOps[i]); + for (MachineOperand &BeforeOp : BeforeOps) + MIB.addOperand(BeforeOp); if (FoldedLoad) MIB.addReg(Reg); - for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) - MIB.addOperand(AfterOps[i]); - for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { - MachineOperand &MO = ImpOps[i]; - MIB.addReg(MO.getReg(), - getDefRegState(MO.isDef()) | + for (MachineOperand &AfterOp : AfterOps) + MIB.addOperand(AfterOp); + for (MachineOperand &ImpOp : ImpOps) { + MIB.addReg(ImpOp.getReg(), + getDefRegState(ImpOp.isDef()) | RegState::Implicit | - getKillRegState(MO.isKill()) | - getDeadRegState(MO.isDead()) | - getUndefRegState(MO.isUndef())); + getKillRegState(ImpOp.isKill()) | + getDeadRegState(ImpOp.isDead()) | + getUndefRegState(ImpOp.isUndef())); } // Change CMP32ri r, 0 back to TEST32rr r, r, etc. switch (DataMI->getOpcode()) { @@ -5686,9 +6303,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned load. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 
32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -5729,9 +6348,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned store. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -6192,16 +6813,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } @@ -6347,230 +6968,181 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel, return isHighLatencyDef(DefMI->getOpcode()); } -static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst, - const MachineBasicBlock *MBB) { - assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators"); - const MachineOperand &Op1 = Inst.getOperand(1); - const MachineOperand &Op2 = Inst.getOperand(2); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - // We need virtual register definitions. - MachineInstr *MI1 = nullptr; - MachineInstr *MI2 = nullptr; - if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) - MI1 = MRI.getUniqueVRegDef(Op1.getReg()); - if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) - MI2 = MRI.getUniqueVRegDef(Op2.getReg()); - - // And they need to be in the trace (otherwise, they won't have a depth). - if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB) - return true; - - return false; -} - -static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { - const MachineBasicBlock *MBB = Inst.getParent(); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); - MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); - unsigned AssocOpcode = Inst.getOpcode(); - - // If only one operand has the same opcode and it's the second source operand, - // the operands must be commuted. - Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; - if (Commuted) - std::swap(MI1, MI2); - - // 1. The previous instruction must be the same type as Inst. - // 2. The previous instruction must have virtual register definitions for its - // operands in the same basic block as Inst. - // 3. The previous instruction's result must only be used by Inst. 
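The helpers being rewritten in this hunk all serve one transform: turning B = A op X; C = B op Y into T = X op Y; C = A op T, so the leaf operation no longer waits on A. A plain-C++ sketch of the intent (not the pass itself); it also shows why the FP cases that follow are gated on UnsafeFPMath, since FP addition and multiplication are not associative:

#include <cstdio>

// Before: the chain serializes along a -> b -> result.
double serial(double a, double x, double y) {
  double b = a * x; // Prev
  return b * y;     // Root: must wait for b
}

// After reassociation: x * y is independent of a, shortening the critical
// path when a is the late-arriving value.
double reassociated(double a, double x, double y) {
  double t = x * y;
  return a * t;
}

int main() {
  // Reassociation can change FP results, which is why it requires
  // relaxed-math flags:
  float a = 1e20f, x = -1e20f, y = 1.0f;
  std::printf("%g vs %g\n", (a + x) + y, a + (x + y)); // prints: 1 vs 0
  return 0;
}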
-  if (MI1->getOpcode() == AssocOpcode &&
-      hasVirtualRegDefsInBasicBlock(*MI1, MBB) &&
-      MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
-    return true;
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+                                           const MachineBasicBlock *MBB) const {
+  assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+         "Reassociation needs binary operators");
+
+  // Integer binary math/logic instructions have a third source operand:
+  // the EFLAGS register. That operand must be both defined here and never
+  // used; i.e., it must be dead. If the EFLAGS operand is live, then we
+  // cannot change anything because rearranging the operands could affect other
+  // instructions that depend on the exact status flags (zero, sign, etc.)
+  // that are set by using these particular operands with this operation.
+  if (Inst.getNumOperands() == 4) {
+    assert(Inst.getOperand(3).isReg() &&
+           Inst.getOperand(3).getReg() == X86::EFLAGS &&
+           "Unexpected operand in reassociable instruction");
+    if (!Inst.getOperand(3).isDead())
+      return false;
+  }
 
-  return false;
+  return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
 }
 
 // TODO: There are many more machine instruction opcodes to match:
 //       1. Other data types (integer, vectors)
-//       2. Other math / logic operations (and, or)
-static bool isAssociativeAndCommutative(unsigned Opcode) {
-  switch (Opcode) {
+//       2. Other math / logic operations (xor, or)
+//       3. Other forms of the same operation (intrinsics and other variants)
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+  switch (Inst.getOpcode()) {
+  case X86::AND8rr:
+  case X86::AND16rr:
+  case X86::AND32rr:
+  case X86::AND64rr:
+  case X86::OR8rr:
+  case X86::OR16rr:
+  case X86::OR32rr:
+  case X86::OR64rr:
+  case X86::XOR8rr:
+  case X86::XOR16rr:
+  case X86::XOR32rr:
+  case X86::XOR64rr:
+  case X86::IMUL16rr:
+  case X86::IMUL32rr:
+  case X86::IMUL64rr:
+  case X86::PANDrr:
+  case X86::PORrr:
+  case X86::PXORrr:
+  case X86::VPANDrr:
+  case X86::VPANDYrr:
+  case X86::VPORrr:
+  case X86::VPORYrr:
+  case X86::VPXORrr:
+  case X86::VPXORYrr:
+  // Normal min/max instructions are not commutative because of NaN and signed
+  // zero semantics, but these are. Thus, there's no need to check for global
+  // relaxed math; the instructions themselves have the properties we need.
+  case X86::MAXCPDrr:
+  case X86::MAXCPSrr:
+  case X86::MAXCSDrr:
+  case X86::MAXCSSrr:
+  case X86::MINCPDrr:
+  case X86::MINCPSrr:
+  case X86::MINCSDrr:
+  case X86::MINCSSrr:
+  case X86::VMAXCPDrr:
+  case X86::VMAXCPSrr:
+  case X86::VMAXCPDYrr:
+  case X86::VMAXCPSYrr:
+  case X86::VMAXCSDrr:
+  case X86::VMAXCSSrr:
+  case X86::VMINCPDrr:
+  case X86::VMINCPSrr:
+  case X86::VMINCPDYrr:
+  case X86::VMINCPSYrr:
+  case X86::VMINCSDrr:
+  case X86::VMINCSSrr:
+    return true;
+  case X86::ADDPDrr:
+  case X86::ADDPSrr:
   case X86::ADDSDrr:
   case X86::ADDSSrr:
-  case X86::VADDSDrr:
-  case X86::VADDSSrr:
+  case X86::MULPDrr:
+  case X86::MULPSrr:
   case X86::MULSDrr:
   case X86::MULSSrr:
+  case X86::VADDPDrr:
+  case X86::VADDPSrr:
+  case X86::VADDPDYrr:
+  case X86::VADDPSYrr:
+  case X86::VADDSDrr:
+  case X86::VADDSSrr:
+  case X86::VMULPDrr:
+  case X86::VMULPSrr:
+  case X86::VMULPDYrr:
+  case X86::VMULPSYrr:
   case X86::VMULSDrr:
   case X86::VMULSSrr:
-    return true;
+    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
   default:
     return false;
   }
 }
 
-/// Return true if the input instruction is part of a chain of dependent ops
-/// that are suitable for reassociation, otherwise return false.
-/// If the instruction's operands must be commuted to have a previous -/// instruction of the same type define the first source operand, Commuted will -/// be set to true. -static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) { - // 1. The operation must be associative and commutative. - // 2. The instruction must have virtual register definitions for its - // operands in the same basic block. - // 3. The instruction must have a reassociable sibling. - if (isAssociativeAndCommutative(Inst.getOpcode()) && - hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && - hasReassocSibling(Inst, Commuted)) - return true; - - return false; -} - -// FIXME: This has the potential to be expensive (compile time) while not -// improving the code at all. Some ways to limit the overhead: -// 1. Track successful transforms; bail out if hit rate gets too low. -// 2. Only enable at -O3 or some other non-default optimization level. -// 3. Pre-screen pattern candidates here: if an operand of the previous -// instruction is known to not increase the critical path, then don't match -// that pattern. -bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { - if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) - return false; - - // TODO: There is nothing x86-specific here except the instruction type. - // This logic could be hoisted into the machine combiner pass itself. - - // Look for this reassociation pattern: - // B = A op X (Prev) - // C = B op Y (Root) - - bool Commute; - if (isReassocCandidate(Root, Commute)) { - // We found a sequence of instructions that may be suitable for a - // reassociation of operands to increase ILP. Specify each commutation - // possibility for the Prev instruction in the sequence and let the - // machine combiner decide if changing the operands is worthwhile. - if (Commute) { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB); - } else { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY); - } - return true; - } +/// This is an architecture-specific helper function of reassociateOps. +/// Set special operand attributes for new instructions after reassociation. +void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, + MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const { + // Integer instructions define an implicit EFLAGS source register operand as + // the third source (fourth total) operand. 
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) + return; - return false; + assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + "Unexpected instruction type for reassociation"); + + MachineOperand &OldOp1 = OldMI1.getOperand(3); + MachineOperand &OldOp2 = OldMI2.getOperand(3); + MachineOperand &NewOp1 = NewMI1.getOperand(3); + MachineOperand &NewOp2 = NewMI2.getOperand(3); + + assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + + (void)OldOp1; + (void)OldOp2; + + assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + + // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations + // of this pass or other passes. The EFLAGS operands must be dead in these new + // instructions because the EFLAGS operands in the original instructions must + // be dead in order for reassociation to occur. + NewOp1.setIsDead(); + NewOp2.setIsDead(); } -/// Attempt the following reassociation to reduce critical path length: -/// B = A op X (Prev) -/// C = B op Y (Root) -/// ===> -/// B = X op Y -/// C = A op B -static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { - MachineFunction *MF = Root.getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); - - // This array encodes the operand index for each parameter because the - // operands may be commuted. Each row corresponds to a pattern value, - // and each column specifies the index of A, B, X, Y. - unsigned OpIdx[4][4] = { - { 1, 1, 2, 2 }, - { 1, 2, 2, 1 }, - { 2, 1, 1, 2 }, - { 2, 2, 1, 1 } - }; - - MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]); - MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]); - MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); - MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); - MachineOperand &OpC = Root.getOperand(0); - - unsigned RegA = OpA.getReg(); - unsigned RegB = OpB.getReg(); - unsigned RegX = OpX.getReg(); - unsigned RegY = OpY.getReg(); - unsigned RegC = OpC.getReg(); - - if (TargetRegisterInfo::isVirtualRegister(RegA)) - MRI.constrainRegClass(RegA, RC); - if (TargetRegisterInfo::isVirtualRegister(RegB)) - MRI.constrainRegClass(RegB, RC); - if (TargetRegisterInfo::isVirtualRegister(RegX)) - MRI.constrainRegClass(RegX, RC); - if (TargetRegisterInfo::isVirtualRegister(RegY)) - MRI.constrainRegClass(RegY, RC); - if (TargetRegisterInfo::isVirtualRegister(RegC)) - MRI.constrainRegClass(RegC, RC); - - // Create a new virtual register for the result of (X op Y) instead of - // recycling RegB because the MachineCombiner's computation of the critical - // path requires a new register definition rather than an existing one. 
- unsigned NewVR = MRI.createVirtualRegister(RC); - InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); - - unsigned Opcode = Root.getOpcode(); - bool KillA = OpA.isKill(); - bool KillX = OpX.isKill(); - bool KillY = OpY.isKill(); - - // Create new instructions for insertion. - MachineInstrBuilder MIB1 = - BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) - .addReg(RegX, getKillRegState(KillX)) - .addReg(RegY, getKillRegState(KillY)); - InsInstrs.push_back(MIB1); - - MachineInstrBuilder MIB2 = - BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) - .addReg(RegA, getKillRegState(KillA)) - .addReg(NewVR, getKillRegState(true)); - InsInstrs.push_back(MIB2); - - // Record old instructions for deletion. - DelInstrs.push_back(&Prev); - DelInstrs.push_back(&Root); +std::pair<unsigned, unsigned> +X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF, 0u); } -void X86InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { - MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); - - // Select the previous instruction in the sequence based on the input pattern. - MachineInstr *Prev = nullptr; - switch (Pattern) { - case MachineCombinerPattern::MC_REASSOC_AX_BY: - case MachineCombinerPattern::MC_REASSOC_XA_BY: - Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - break; - case MachineCombinerPattern::MC_REASSOC_AX_YB: - case MachineCombinerPattern::MC_REASSOC_XA_YB: - Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); - } - assert(Prev && "Unknown pattern for machine combiner"); - - reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); - return; +ArrayRef<std::pair<unsigned, const char *>> +X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace X86II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, + {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, + {MO_GOT, "x86-got"}, + {MO_GOTOFF, "x86-gotoff"}, + {MO_GOTPCREL, "x86-gotpcrel"}, + {MO_PLT, "x86-plt"}, + {MO_TLSGD, "x86-tlsgd"}, + {MO_TLSLD, "x86-tlsld"}, + {MO_TLSLDM, "x86-tlsldm"}, + {MO_GOTTPOFF, "x86-gottpoff"}, + {MO_INDNTPOFF, "x86-indntpoff"}, + {MO_TPOFF, "x86-tpoff"}, + {MO_DTPOFF, "x86-dtpoff"}, + {MO_NTPOFF, "x86-ntpoff"}, + {MO_GOTNTPOFF, "x86-gotntpoff"}, + {MO_DLLIMPORT, "x86-dllimport"}, + {MO_DARWIN_STUB, "x86-darwin-stub"}, + {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, + {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, + {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, + {MO_TLVP, "x86-tlvp"}, + {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, + {MO_SECREL, "x86-secrel"}}; + return makeArrayRef(TargetFlags); } namespace { diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index bf63336..edd09d6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -26,19 +26,6 @@ namespace llvm { class X86RegisterInfo; class X86Subtarget; - namespace MachineCombinerPattern { - enum MC_PATTERN : int { - // These are commutative variants for reassociating a computation chain - // of the form: - // B = A op X (Prev) - // C = B op Y (Root) - MC_REASSOC_AX_BY = 0, - MC_REASSOC_AX_YB = 1, - MC_REASSOC_XA_BY = 
2,
-      MC_REASSOC_XA_YB = 3,
-    };
-  } // end namespace MachineCombinerPattern
-
   namespace X86 {
     // X86 specific condition code. These correspond to X86_*_COND in
     // X86InstrInfo.td. They must be kept in synch.
@@ -259,14 +246,64 @@ public:
                               MachineBasicBlock::iterator &MBBI,
                               LiveVariables *LV) const override;
 
-  /// commuteInstruction - We have a few instructions that must be hacked on to
-  /// commute them.
+  /// Returns true iff the routine could find two commutable operands in the
+  /// given machine instruction.
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+  /// input values can be re-defined in this method only if the input values
+  /// are not pre-defined, which is designated by the special value
+  /// 'CommuteAnyOperandIndex' assigned to them.
+  /// If both indices are pre-defined and refer to some operands, then the
+  /// method simply returns true if the corresponding operands are commutable
+  /// and returns false otherwise.
   ///
-  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
-
+  /// For example, calling this method this way:
+  ///     unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+  ///     findCommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking to find an operand that would be
+  /// commutable with operand #1.
   bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  /// Returns true if the routine could find two commutable operands
+  /// in the given FMA instruction. Otherwise, returns false.
+  ///
+  /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
+  /// The output indices of the commuted operands are returned in these
+  /// arguments. Also, the input values of these arguments may be preset either
+  /// to indices of operands that must be commuted or to the special value
+  /// 'CommuteAnyOperandIndex', which means that the corresponding
+  /// operand index is not set and this method is free to pick any of the
+  /// available commutable operands.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
+  ///     findFMA3CommutedOpIndices(MI, Idx1, Idx2);
+  /// can be interpreted as a query asking if operand #1 can be swapped
+  /// with any other available operand (e.g. operand #2, operand #3, etc.).
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results in an instruction with the adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  bool findFMA3CommutedOpIndices(MachineInstr *MI,
+                                 unsigned &SrcOpIdx1,
+                                 unsigned &SrcOpIdx2) const;
+
+  /// Returns an adjusted FMA opcode that must be used in an FMA instruction
+  /// that performs the same computations as the given MI but which has the
+  /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+  /// It may return 0 if it is unsafe to commute the operands.
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given \p MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results in an instruction with the adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+                                          unsigned SrcOpIdx1,
+                                          unsigned SrcOpIdx2) const;
+
   // Branch analysis.
  bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -342,11 +379,6 @@ public:
                                   MachineBasicBlock::iterator InsertPt,
                                   MachineInstr *LoadMI) const override;
 
-  /// canFoldMemoryOperand - Returns true if the specified load / store is
-  /// folding is possible.
-  bool canFoldMemoryOperand(const MachineInstr *,
-                            ArrayRef<unsigned>) const override;
-
   /// unfoldMemoryOperand - Separate a single instruction which folded a load or
   /// a store or a load and a store into two or more instructions. If this is
   /// possible, returns true as well as the new instructions by reference.
@@ -406,10 +438,9 @@ public:
   bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const;
 
-  static bool isX86_64ExtendedReg(const MachineOperand &MO) {
-    if (!MO.isReg()) return false;
-    return X86II::isX86_64ExtendedReg(MO.getReg());
-  }
+  /// True if MI has a condition code def, e.g. EFLAGS, that is
+  /// not marked dead.
+  bool hasLiveCondCodeDef(MachineInstr *MI) const;
 
   /// getGlobalBaseReg - Return a virtual register initialized with
   /// the global base register value. Output instructions required to
@@ -452,26 +483,19 @@ public:
                             const MachineInstr *DefMI, unsigned DefIdx,
                             const MachineInstr *UseMI,
                             unsigned UseIdx) const override;
 
-  bool useMachineCombiner() const override { return true; }
-
-  /// Return true when there is potentially a faster code sequence
-  /// for an instruction chain ending in <Root>. All potential patterns are
-  /// output in the <Pattern> array.
-  bool getMachineCombinerPatterns(
-      MachineInstr &Root,
-      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override;
-
-  /// When getMachineCombinerPatterns() finds a pattern, this function generates
-  /// the instructions that could replace the original code sequence.
-  void genAlternativeCodeSequence(
-      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
-      SmallVectorImpl<MachineInstr *> &InsInstrs,
-      SmallVectorImpl<MachineInstr *> &DelInstrs,
-      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+  bool hasReassociableOperands(const MachineInstr &Inst,
+                               const MachineBasicBlock *MBB) const override;
+
+  void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+                             MachineInstr &NewMI1,
+                             MachineInstr &NewMI2) const override;
 
   /// analyzeCompare - For a comparison instruction, return the source registers
   /// in SrcReg and SrcReg2 if having two register operands, and the value it
@@ -500,12 +524,42 @@ public:
                              unsigned &FoldAsLoadDefReg,
                              MachineInstr *&DefMI) const override;
 
+  std::pair<unsigned, unsigned>
+  decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableDirectMachineOperandTargetFlags() const override;
+
+protected:
+  /// Commutes the operands in the given instruction by changing the operands
+  /// order and/or changing the instruction's opcode and/or the immediate value
+  /// operand.
+  ///
+  /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+  /// to be commuted.
+  ///
+  /// Do not call this method for a non-commutable instruction or
+  /// non-commutable operands.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands; a null pointer is returned in such cases.
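The FMA commute helpers declared above must be free to rewrite the opcode because the 132/213/231 suffix encodes which sources feed the multiply and which feeds the add. A standalone C++ check of the scalar semantics (the fma132/fma213/fma231 helpers are illustrative models of the instruction behavior, not LLVM API):

#include <cassert>

// d is the destructive first operand of each form.
static double fma132(double d, double s2, double s3) { return d * s3 + s2; }
static double fma213(double d, double s2, double s3) { return s2 * d + s3; }
static double fma231(double d, double s2, double s3) { return s2 * s3 + d; }

int main() {
  double a = 2.0, b = 3.0, c = 5.0;
  // Operands #1 and #2 both feed the multiply, so swapping them needs no
  // opcode change:
  assert(fma213(a, b, c) == fma213(b, a, c));
  // Swapping #1 and #3 moves a source between the multiply and the add, so
  // the opcode must be rewritten from 213 to 231 to preserve the value:
  assert(fma213(a, b, c) == fma231(c, b, a));
  // Likewise, swapping #2 and #3 rewrites 213 to 132:
  assert(fma213(a, b, c) == fma132(a, c, b));
  return 0;
}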
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned CommuteOpIdx1, + unsigned CommuteOpIdx2) const override; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const; + /// Handles memory folding for special case instructions, for instance those + /// requiring custom manipulation of the address. + MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const; + /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool isFrameOperand(const MachineInstr *MI, unsigned int Op, diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 52bab9c..9c8339a 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -106,8 +106,6 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; -def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>; - def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; @@ -158,6 +156,8 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; def X86vastart_save_xmm_regs : SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", @@ -250,9 +250,6 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL, - [SDNPHasChain, SDNPOutGlue]>; - //===----------------------------------------------------------------------===// // X86 Operand Definitions. // @@ -344,18 +341,21 @@ def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>; def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; -// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of -// plain GR64, so that it doesn't potentially require a REX prefix. -def i8mem_NOREX : Operand<i64> { +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { let PrintMethod = "printi8mem"; - let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm); + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm); let ParserMatchClass = X86Mem8AsmOperand; let OperandType = "OPERAND_MEMORY"; } // GPRs available for tailcall. // It represents GR32_TC, GR64_TC or GR64_TCW64. -def ptr_rc_tailcall : PointerLikeRegClass<2>; +def ptr_rc_tailcall : PointerLikeRegClass<4>; // Special i32mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled @@ -697,34 +697,34 @@ def lea64mem : Operand<i64> { // X86 Complex Pattern Definitions. 
// -// Define X86 specific addressing mode. -def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; -def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", +// Define X86-specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex], []>; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. -def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr", +def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>; +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. @@ -767,12 +767,21 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">, def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">, AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"!Subtarget->hasPKU()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; +def HasFXSR : Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; @@ -794,6 +803,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -867,20 +877,54 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{ }]>; -def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; -def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; -def i64immSExt8 
: ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
+def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
+def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
+
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges; however, the immediates should be trivial to rematerialize by
+// the RA in the event of high register pressure.
+// TODO: This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2: This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow up on enabling O2 after we fix that
+// issue.
+// TODO3: This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm8_su : PatLeaf<(i8 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
 
-def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
 
 // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
 // unsigned field.
-def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
+def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
 
 def i64immZExt32SExt8 : ImmLeaf<i64, [{
-  return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
+  return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
 }]>;
 
 // Helper fragments for loads.
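A concrete view of what the *_su predicates above buy when optimizing for size: re-encoding a wide immediate at every use costs more bytes than materializing it once in a register. Byte counts in the comments are approximate and for illustration only:

// Each "movl $0x12345678, disp(%rdi)" re-encodes the 4-byte immediate
// (roughly 6-7 bytes per store). Hoisting the constant costs one ~5-byte
// "movl $imm, %eax" plus ~3-byte register stores, so repeated uses shrink,
// at the price of a stretched live range.
void fill(int *p) {
  p[0] = 0x12345678;
  p[1] = 0x12345678;
  p[2] = 0x12345678;
}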
@@ -914,11 +958,12 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ return false; }]>; -def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; -def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; -def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; -def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; -def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; @@ -1020,12 +1065,8 @@ def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], IIC_PUSH_REG>, OpSize16; -def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], - IIC_PUSH_MEM>, OpSize16; def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; -def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], - IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; @@ -1039,6 +1080,40 @@ def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; } // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + IIC_PUSH_MEM>, OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW], Defs = [ESP] in { + let Uses = [ESP, EFLAGS] in + def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_flags_read_u32))]>, + Requires<[Not64BitMode]>; + + let Uses = [RSP, EFLAGS] in + def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), + [(set GR64:$dst, (int_x86_flags_read_u64))]>, + Requires<[In64BitMode]>; +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW] in { + let Defs = [ESP, EFLAGS], Uses = [ESP] in + def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), + [(int_x86_flags_write_u32 GR32:$src)]>, + Requires<[Not64BitMode]>; + + let Defs = [RSP, EFLAGS], Uses = [RSP] in + def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), + [(int_x86_flags_write_u64 GR64:$src)]>, + Requires<[In64BitMode]>; } let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, @@ -1071,15 +1146,18 @@ def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), 
"push{q}\t$src", [], IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>; -} // mayStore, SchedRW +} // mayLoad, mayStore, SchedRW } let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), - "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>; + "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[In64BitMode]>; def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[In64BitMode]>; @@ -1195,7 +1273,7 @@ def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins), let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins), "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; -let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in +let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins), "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; @@ -1275,13 +1353,13 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>; + [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; + [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; + [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; @@ -1457,10 +1535,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1894,37 +1974,38 @@ def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; } // Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, Sched<[WriteLoad]>; let SchedRW = [WriteMicrocoded] in { // ASCII Adjust After Addition -// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, Requires<[Not64BitMode]>; // ASCII Adjust AX Before Division -// sets AL, AH and EFLAGS and uses AL and AH +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>; // ASCII Adjust AX After Multiply -// sets AL, AH and EFLAGS and uses AL +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAM8i8 : 
Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
                 "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
 
 // ASCII Adjust AL After Subtraction
-// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
 def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
           Requires<[Not64BitMode]>;
 
 // Decimal Adjust AL after Addition
-// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
 def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
           Requires<[Not64BitMode]>;
 
 // Decimal Adjust AL after Subtraction
-// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
 def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
           Requires<[Not64BitMode]>;
 } // SchedRW
@@ -2357,6 +2438,32 @@ defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
 } // HasTBM, EFLAGS
 
 //===----------------------------------------------------------------------===//
+// MONITORX/MWAITX Instructions
+//
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [],
+                    IIC_SSE_MONITOR>, TB;
+let Uses = [ECX, EAX, EBX] in
+def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>,
+               TB;
+} // SchedRW
+
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
+      Requires<[Not64BitMode]>;
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+      Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// CLZERO Instruction
+//
+let Uses = [EAX] in
+def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+
+//===----------------------------------------------------------------------===//
 // Pattern fragments to auto-generate TBM instructions.
//===----------------------------------------------------------------------===// @@ -2498,8 +2605,8 @@ def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"loopz", "loope", "att">; -def : MnemonicAlias<"loopnz", "loopne", "att">; +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; @@ -2532,14 +2639,15 @@ def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; -def : MnemonicAlias<"repe", "rep", "att">; -def : MnemonicAlias<"repz", "rep", "att">; -def : MnemonicAlias<"repnz", "repne", "att">; +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sal", "shl", "intel">; def : MnemonicAlias<"salb", "shlb", "att">; def : MnemonicAlias<"salw", "shlw", "att">; def : MnemonicAlias<"sall", "shll", "att">; @@ -2579,14 +2687,14 @@ def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; -def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll", "att">; def : MnemonicAlias<"fistpq", "fistpll", "att">; def : MnemonicAlias<"fisttpq", "fisttpll", "att">; def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; -def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; def : MnemonicAlias<"fwait", "wait">; def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; @@ -2594,7 +2702,9 @@ def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; def : MnemonicAlias<"xsaveq", "xsave64", "att">; def : MnemonicAlias<"xrstorq", "xrstor64", "att">; def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; - +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, string VariantName> @@ -2640,61 +2750,61 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; //===----------------------------------------------------------------------===// // aad/aam default to base 10 if no operand is specified. -def : InstAlias<"aad", (AAD8i8 10)>; -def : InstAlias<"aam", (AAM8i8 10)>; +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. 
-def : InstAlias<"bt {$imm, $mem|$mem, $imm}", +def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btc {$imm, $mem|$mem, $imm}", +def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btr {$imm, $mem|$mem, $imm}", +def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"bts {$imm, $mem|$mem, $imm}", +def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; // clr aliases. -def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; -def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; -def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; -def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; +def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; // lods aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. -def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; -def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; // stos aliases. Accept the source being omitted because it's implicit in // the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the source. 
-def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // scas aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. -def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // div and idiv aliases for explicit A register. def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; @@ -2719,8 +2829,10 @@ def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. // For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; @@ -2780,90 +2892,90 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. 
-def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; - -def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul <imm>, B" is an alias for "imul <imm>, B, B". 
-def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; -def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; -def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>; -def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>; -def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp -def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. 
-def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; -def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; +def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; +def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsx aliases -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. 
// outb %dx -> outb %al, %dx def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; -def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>; -def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>; -def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity @@ -2940,3 +3052,34 @@ def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. +def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index eaa7894..83f9b14 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -249,6 +249,7 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs 
GR32:$dst), (ins VR64:$src), (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; +let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (bitconvert GR64:$src))], @@ -262,7 +263,7 @@ def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. -let SchedRW = [WriteMove] in { +let SchedRW = [WriteMove], isBitcast = 1 in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", @@ -303,7 +304,7 @@ def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (bitconvert - (i64 (vector_extract (v2i64 VR128:$src), + (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))))))], IIC_MMX_MOVQ_RR>; @@ -326,6 +327,7 @@ def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), } } // SchedRW +let Predicates = [HasSSE1] in def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], @@ -355,6 +357,7 @@ defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, MMX_INTALU_ITINS, 1>; defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, MMX_INTALU_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, MMX_INTALUQ_ITINS, 1>; defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, @@ -382,6 +385,7 @@ defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, MMX_INTALU_ITINS>; defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, MMX_INTALU_ITINS>; +let Predicates = [HasSSE2] in defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, MMX_INTALUQ_ITINS>; @@ -408,8 +412,10 @@ defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE1] in defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, MMX_PMUL_ITINS, 1>; let isCommutable = 1 in @@ -422,6 +428,7 @@ defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +let Predicates = [HasSSE1] in { defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, MMX_MISC_FUNC_ITINS, 1>; defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, @@ -439,6 +446,7 @@ defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, MMX_PSADBW_ITINS, 1>; +} defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, MMX_MISC_FUNC_ITINS>; @@ -594,6 +602,7 @@ let Constraints = "$src1 = $dst" in { } // Extract / Insert +let Predicates = [HasSSE1] in def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -601,6 +610,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, imm:$src2))], IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = 
"$src1 = $dst" in { +let Predicates = [HasSSE1] in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), @@ -618,8 +628,10 @@ let Constraints = "$src1 = $dst" in { imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; } +} // Mask creation +let Predicates = [HasSSE1] in def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src), "pmovmskb\t{$src, $dst|$dst, $src}", @@ -639,12 +651,12 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), // Misc. let SchedRW = [WriteShuffle] in { -let Uses = [EDI] in +let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], IIC_MMX_MASKMOV>; -let Uses = [RDI] in +let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], @@ -653,10 +665,6 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), // 64-bit bit convert. let Predicates = [HasSSE2] in { -def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), - (MMX_MOVD64to64rr GR64:$src)>; -def : Pat<(i64 (bitconvert (x86mmx VR64:$src))), - (MMX_MOVD64from64rr VR64:$src)>; def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td index cf5e2e3..71ab973 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMPX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td @@ -15,10 +15,10 @@ multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, In64BitMode]>; } @@ -26,16 +26,16 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS; @@ -43,28 +43,28 @@ defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD; defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD; def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def 
BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), - "bndstx \t{$src, $dst|$dst, $src}", []>, TB, + "bndstx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndldx \t{$src, $dst|$dst, $src}", []>, TB, - Requires<[HasMPX]>;
\ No newline at end of file + "bndldx\t{$src, $dst|$dst, $src}", []>, PS, + Requires<[HasMPX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 99386b0..6a7c456 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -330,9 +330,9 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, //===----------------------------------------------------------------------===// // A vector extract of the first f32/f64 position is a subregister copy -def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; -def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; // A 128-bit subvector extract from the first 256-bit vector position @@ -413,6 +413,8 @@ let Predicates = [HasSSE2] in { def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; + def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; } // Bitcasts between 256-bit vector types. Return the original type since @@ -650,10 +652,10 @@ let Predicates = [UseAVX] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>; @@ -736,7 +738,7 @@ let Predicates = [UseSSE1] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; @@ -770,7 +772,7 @@ let Predicates = [UseSSE2] in { } // Extract and store. 
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; @@ -935,22 +937,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, IIC_SSE_MOVU_P_RR>, VEX, VEX_L; } -let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86vzmovl - (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4i64 (X86vzmovl - (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v8f32 (X86vzmovl - (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4f64 (X86vzmovl - (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -} - - def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), @@ -1172,12 +1158,13 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode, string base_opc, InstrItinClass itin> { - defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Predicates = [UseAVX] in + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", itin>, VEX_4V; -let Constraints = "$src1 = $dst" in - defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $dst|$dst, $src2}", itin>; } @@ -1188,29 +1175,31 @@ let AddedComplexity = 20 in { } let SchedRW = [WriteStore] in { +let Predicates = [UseAVX] in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +}// UseAVX def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // Shuffle with VMOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (VMOVLPSrm VR128:$src1, addr:$src2)>; @@ -1243,7 +1232,7 @@ let Predicates = [HasAVX] in { let Predicates = [UseSSE1] in { // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), + def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), (iPTR 0))), 
addr:$src1), (MOVLPSmr addr:$src1, VR128:$src2)>; @@ -1297,31 +1286,33 @@ let AddedComplexity = 20 in { let SchedRW = [WriteStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. +let Predicates = [UseAVX] in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +} // UseAVX def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // VMOVHPS patterns def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (VMOVHPSrm VR128:$src1, addr:$src2)>; @@ -1345,7 +1336,7 @@ let Predicates = [HasAVX] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; @@ -1377,7 +1368,7 @@ let Predicates = [UseSSE2] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (MOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; @@ -1475,6 +1466,8 @@ def SSE_CVT_SD2SI : OpndItins< IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM >; +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false dependencies (see sse_fp_unop_s for details) multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, OpndItins itins> { @@ -1498,6 +1491,8 @@ let hasSideEffects = 0 in { } } +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false dependencies (see sse_fp_unop_s for details) multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { let hasSideEffects = 0, Predicates = [UseAVX] in { @@ -1635,6 +1630,8 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s).
+// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false dependencies (see sse_fp_unop_s for details) multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, Operand memop, ComplexPattern mem_cpat, string asm, OpndItins itins> { @@ -1811,7 +1808,7 @@ def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; /// SSE 2 Only @@ -2073,14 +2070,16 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { - def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), (VCVTDQ2PSrm addr:$src)>; +} - def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), @@ -2149,7 +2148,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), @@ -2306,7 +2305,9 @@ let Predicates = [HasAVX] in { (VCVTDQ2PSYrr VR256:$src)>; def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; +} +let Predicates = [HasAVX, NoVLX] in { // Match fround and fextend for 128/256-bit conversions def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (VCVTPD2PSrr VR128:$src)>; @@ -2452,9 +2453,9 @@ let Defs = [EFLAGS] in { defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, "ucomisd">, PD, VEX, VEX_LIG; let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS, VEX, VEX_LIG; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD, VEX, VEX_LIG; } @@ -2475,9 +2476,9 @@ let Defs = [EFLAGS] in { "ucomisd">, PD; let Pattern = []<dag> in { - defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS; - defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD; } @@ -2605,19 +2606,20 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, Sched<[WriteFShuffle]>; } -defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, +let Predicates = [HasAVX, NoVLX] in { + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f32, SSEPackedSingle>,
PS, VEX_4V; -defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; -defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv2f64, SSEPackedDouble>, PD, VEX_4V; -defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; - +} let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", @@ -2627,7 +2629,7 @@ let Constraints = "$src1 = $dst" in { memopv2f64, SSEPackedDouble>, PD; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Shufp VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; @@ -2694,6 +2696,7 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, Sched<[WriteFShuffleLd, ReadAfterLd]>; } +let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, VEX_4V; @@ -2719,7 +2722,7 @@ defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, VEX_4V, VEX_L; - +}// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", @@ -2845,8 +2848,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, ValueType OpVT128, ValueType OpVT256, - OpndItins itins, bit IsCommutable = 0> { -let Predicates = [HasAVX, NoVLX] in + OpndItins itins, bit IsCommutable = 0, Predicate prd> { +let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; @@ -2854,7 +2857,7 @@ let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, memopv2i64, i128mem, itins, IsCommutable, 1>; -let Predicates = [HasAVX2, NoVLX] in +let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; @@ -2863,13 +2866,13 @@ let Predicates = [HasAVX2, NoVLX] in // These are ordered here for pattern ordering requirements with the fp versions defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 0>; + SSE_VEC_BIT_ITINS_P, 0, NoVLX>; 
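// With the new trailing Predicate, an instantiation such as PAND above is
// expected to yield the bare SSE def plus VEX/VEX_L defs guarded by
// [HasAVX, NoVLX] / [HasAVX2, NoVLX], so that on AVX-512VL targets the
// EVEX-encoded vpandd/vpandq patterns can take over (a sketch of the
// intent, inferred from the PDI_binop_all multiclass above).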
//===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions @@ -2911,7 +2914,7 @@ let isCodeGenOnly = 1 in { // Multiclass for vectors using the X86 logical operation aliases for FP. multiclass sse12_fp_packed_vector_logical_alias< bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V; @@ -2923,7 +2926,7 @@ multiclass sse12_fp_packed_vector_logical_alias< defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, PS, VEX_4V, VEX_L; - + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, PD, VEX_4V, VEX_L; @@ -3183,7 +3186,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE1] in { // extracted scalar math op with insert via movss def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3198,7 +3201,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3215,7 +3218,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3241,7 +3244,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE2] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3256,7 +3259,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3271,14 +3274,14 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract 
(v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; // extracted scalar math op with insert via blend def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3390,9 +3393,18 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def : Pat<(Intr (load addr:$src)), (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) addr:$src), VR128))>; - def : Pat<(Intr mem_cpat:$src), - (!cast<Instruction>(NAME#Suffix##m_Int) - (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // movss mem, %xmm0 + // rcpss %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // rcpss mem, %xmm0 + let Predicates = [target, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } } @@ -3423,34 +3435,43 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, } } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // vmovss mem, %xmm0 + // vrcpss %xmm0, %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // vrcpss mem, %xmm0, %xmm0 + // TODO: In theory, we could fold the load, and avoid the stall caused by + // the partial register store, either in ExeDepFix or with smarter RA. let Predicates = [UseAVX] in { def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; - - def : Pat<(vt (OpNode mem_cpat:$src)), - (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), - mem_cpat:$src)>; - } let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), VR128:$src)>; - - def : Pat<(Intr mem_cpat:$src), - (!cast<Instruction>("V"#NAME#Suffix##m_Int) + } + let Predicates = [HasAVX, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } - let Predicates = [UseAVX, OptForSize] in - def : Pat<(ScalarVT (OpNode (load addr:$src))), - (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), - addr:$src)>; + let Predicates = [UseAVX, OptForSize] in { + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; + def : Pat<(vt (OpNode mem_cpat:$src)), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), + mem_cpat:$src)>; + } } /// sse1_fp_unop_p - SSE1 unops in packed form. 
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { -let Predicates = [HasAVX] in { + OpndItins itins, list<Predicate> prds> { +let Predicates = prds in { def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), @@ -3546,16 +3567,16 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>; + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>; + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; // There is no f64 version of the reciprocal approximation instructions. @@ -4018,39 +4039,43 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, } // ExeDomain = SSEPackedInt defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 1>; + SSE_INTALUQ_ITINS_P, 1, NoVLX>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 0>; + SSE_INTALUQ_ITINS_P, 0, NoVLX>; defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, - 
SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; // Intrinsic forms defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, @@ -4067,26 +4092,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; -defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, - int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; -defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, - int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; -defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, - int_x86_avx2_psad_bw, SSE_PMADD, 1>; - -let Predicates = [HasAVX2] in - def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), - (v32i8 VR256:$src2))), - (VPSADBWYrr VR256:$src2, VR256:$src1)>; let Predicates = [HasAVX] in - def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (VPSADBWrr VR128:$src2, VR128:$src1)>; - -def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (PSADBWrr VR128:$src2, VR128:$src1)>; +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, @@ -4105,9 +4122,6 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, //===---------------------------------------------------------------------===// let Predicates = [HasAVX, NoVLX] in { -defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4115,9 +4129,6 @@ defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4125,14 +4136,26 @@ defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] 
in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , + Predicates = [HasAVX, NoVLX_Or_NoBWI]in { // 128-bit logical shifts. def VPSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -4147,13 +4170,9 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] let Predicates = [HasAVX2, NoVLX] in { -defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4161,9 +4180,6 @@ defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4171,14 +4187,25 @@ defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , + Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // 256-bit logical shifts. 
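// These shift by whole bytes, independently within each 128-bit lane; e.g.
// "vpslldq $3, %ymm1, %ymm0" (AT&T syntax; an illustration) moves both
// lanes of %ymm1 left by three bytes, shifting in zeros.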
def VPSLLDQYri : PDIi8<0x73, MRM7r, (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), @@ -4193,8 +4220,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX2] +} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, @@ -4247,17 +4273,17 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { //===---------------------------------------------------------------------===// defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions @@ -4511,40 +4537,43 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, bc_v8i16, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - bc_v4i32, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - bc_v2i64, loadv2i64, 0>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, bc_v8i16, loadv2i64, 0>, VEX_4V; +} +let Predicates = [HasAVX, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, loadv2i64, 0>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, bc_v2i64, loadv2i64, 0>, VEX_4V; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, bc_v16i16>, VEX_4V, VEX_L; - defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, - bc_v8i32>, VEX_4V, VEX_L; - defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, - bc_v4i64>, VEX_4V, VEX_L; - defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, 
X86Unpckh, bc_v16i16>, VEX_4V, VEX_L; +} +let Predicates = [HasAVX2, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V, VEX_L; defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, bc_v8i32>, VEX_4V, VEX_L; defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, @@ -4600,7 +4629,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4615,7 +4644,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, Sched<[WriteShuffleLd, ReadAfterLd]>; // Insert -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in @@ -4683,7 +4712,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), } // ExeDomain = SSEPackedInt //===---------------------------------------------------------------------===// -// SSE2 - Move Doubleword +// SSE2 - Move Doubleword/Quadword //===---------------------------------------------------------------------===// //===---------------------------------------------------------------------===// @@ -4770,23 +4799,23 @@ let isCodeGenOnly = 1 in { // def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4808,24 +4837,25 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), let SchedRW = [WriteMove] in { def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), - (iPTR 0)))], + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX; def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>; } //SchedRW let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst), - (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src), + 
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src), +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4883,30 +4913,18 @@ let isCodeGenOnly = 1 in { IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } -//===---------------------------------------------------------------------===// -// Patterns and instructions to describe movd/movq to XMM register zero-extends -// -let isCodeGenOnly = 1, SchedRW = [WriteMove] in { -let AddedComplexity = 15 in { -def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "movq\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>, - VEX, VEX_W; -def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>; -} -} // isCodeGenOnly, SchedRW - let Predicates = [UseAVX] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; + } // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. 
let AddedComplexity = 20 in { @@ -4924,16 +4942,16 @@ let Predicates = [UseAVX] in { def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (MOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (MOV64toPQIrr GR64:$src)>; + } let AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; @@ -4985,12 +5003,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX; def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>; } // ExeDomain, SchedRW @@ -5119,7 +5137,7 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v4f32, VR128, loadv4f32, f128mem>, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", @@ -5134,7 +5152,7 @@ defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, memopv4f32, f128mem>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), @@ -5190,21 +5208,30 @@ def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (v4f64 (X86Movddup - (scalar_to_vector (loadf64 addr:$src)))))]>, + (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; } defm MOVDDUP : sse3_replicate_dfp<"movddup">; -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX] in { def : Pat<(X86Movddup (loadv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (loadv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), @@ -5212,16 +5239,6 @@ let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (v2i64 
(scalar_to_vector (loadi64 addr:$src))))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - - // 256-bit version - def : Pat<(X86Movddup (loadv4f64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (loadv4i64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 VR256:$src)), - (VMOVDDUPYrr VR256:$src)>; } let Predicates = [UseAVX, OptForSize] in { @@ -5791,37 +5808,37 @@ let Predicates = [HasAVX2] in let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGN : ssse3_palignr<"palignr">; -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } let Predicates = [UseSSSE3] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } //===---------------------------------------------------------------------===// @@ -6145,7 +6162,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -6170,7 +6187,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in 
defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -6194,7 +6211,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -6217,7 +6234,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, REX_W; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -6285,7 +6302,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -6311,7 +6328,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -6337,7 +6354,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -6543,71 +6560,71 @@ let Predicates = [HasAVX] in { let Predicates = [UseAVX] in { def : Pat<(ffloor FR32:$src), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; } let Predicates = [HasAVX] in { def : Pat<(v4f32 (ffloor VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x1))>; + (VROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x2))>; + (VROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc 
VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x3))>; + (VROUNDPSr VR128:$src, (i32 0xB))>; def : Pat<(v2f64 (ffloor VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x1))>; + (VROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x2))>; + (VROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x3))>; + (VROUNDPDr VR128:$src, (i32 0xB))>; def : Pat<(v8f32 (ffloor VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x1))>; + (VROUNDYPSr VR256:$src, (i32 0x9))>; def : Pat<(v8f32 (fnearbyint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0xC))>; def : Pat<(v8f32 (fceil VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x2))>; + (VROUNDYPSr VR256:$src, (i32 0xA))>; def : Pat<(v8f32 (frint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0x4))>; def : Pat<(v8f32 (ftrunc VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x3))>; + (VROUNDYPSr VR256:$src, (i32 0xB))>; def : Pat<(v4f64 (ffloor VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x1))>; + (VROUNDYPDr VR256:$src, (i32 0x9))>; def : Pat<(v4f64 (fnearbyint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0xC))>; def : Pat<(v4f64 (fceil VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x2))>; + (VROUNDYPDr VR256:$src, (i32 0xA))>; def : Pat<(v4f64 (frint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0x4))>; def : Pat<(v4f64 (ftrunc VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x3))>; + (VROUNDYPDr VR256:$src, (i32 0xB))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6619,47 +6636,47 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; def : Pat<(v4f32 (ffloor VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x1))>; + (ROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x2))>; + (ROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x3))>; + (ROUNDPSr VR128:$src, (i32 0xB))>; def : 
Pat<(v2f64 (ffloor VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x1))>; + (ROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x2))>; + (ROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x3))>; + (ROUNDPDr VR128:$src, (i32 0xB))>; } //===----------------------------------------------------------------------===// @@ -7815,62 +7832,56 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // VBROADCAST - Load from memory and broadcast to all elements of the // destination operand // -class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> : - AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; - -class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, +class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, PatFrag ld_frag, SchedWrite Sched> : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, - Sched<[Sched]>, VEX { - let mayLoad = 1; -} + Sched<[Sched]>, VEX; // AVX2 adds register forms -class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, - Intrinsic Int, SchedWrite Sched> : +class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32, loadf32, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32, loadf32, WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64, loadf64, WriteFShuffleLd>, VEX_L; -def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, - int_x86_avx_vbroadcastf128_pd_256, - WriteFShuffleLd>, VEX_L; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, - int_x86_avx2_vbroadcast_ss_ps, - WriteFShuffle>; - def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256, - WriteFShuffle256>, VEX_L; + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, WriteFShuffle>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256, - WriteFShuffle256>, VEX_L; +def VBROADCASTSDYrr : 
avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; -let mayLoad = 1, Predicates = [HasAVX2] in +let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, Sched<[WriteLoad]>, VEX, VEX_L; +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + Sched<[WriteFShuffleLd]>, VEX, VEX_L; + let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; @@ -7891,7 +7902,7 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, @@ -8080,17 +8091,19 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, (bitconvert (i_frag addr:$src2))))]>, VEX_4V, Sched<[WriteFShuffleLd, ReadAfterLd]>; - def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + let Predicates = [HasAVX, NoVLX] in { + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; - def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; + }// Predicates = [HasAVX, NoVLX] } let ExeDomain = SSEPackedSingle in { @@ -8106,7 +8119,7 @@ let ExeDomain = SSEPackedDouble in { loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), (VPERMILPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), @@ -8244,12 +8257,15 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VCVTPH2PSrm addr:$src)>; - def : Pat<(store (f64 (vector_extract (bc_v2f64 (v8i16 + def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v8i16 + def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; @@ -8309,97 +8325,62 @@ defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, // multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, - Intrinsic Int128, Intrinsic Int256> { - def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + ValueType OpVT128, ValueType OpVT256, Predicate prd> { + let Predicates = [HasAVX2, prd] in { + def rr : AVX28I<opc, 
MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int128 VR128:$src))]>, + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle]>, VEX; - def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX; - def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int256 VR128:$src))]>, + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; - def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX, VEX_L; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), + (!cast<Instruction>(NAME#"Yrr") + (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; + } } defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, - int_x86_avx2_pbroadcastb_128, - int_x86_avx2_pbroadcastb_256>; + v16i8, v32i8, NoVLX_Or_NoBWI>; defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, - int_x86_avx2_pbroadcastw_128, - int_x86_avx2_pbroadcastw_256>; + v8i16, v16i16, NoVLX_Or_NoBWI>; defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, - int_x86_avx2_pbroadcastd_128, - int_x86_avx2_pbroadcastd_256>; + v4i32, v8i32, NoVLX>; defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, - int_x86_avx2_pbroadcastq_128, - int_x86_avx2_pbroadcastq_256>; + v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2] in { - def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBrm addr:$src)>; - def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBYrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQYrm addr:$src)>; - - def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBrr VR128:$src)>; - def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBYrr VR128:$src)>; - def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWrr VR128:$src)>; - def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWYrr VR128:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDrr VR128:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDYrr VR128:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 
VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQYrr VR128:$src)>; - def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSrr VR128:$src)>; - def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSYrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), - (VBROADCASTSDYrr VR128:$src)>; + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; // Provide aliases for broadcast from the same register class that // automatically does the extract. - def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), - (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), - sub_xmm)))>; - def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), - (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), - sub_xmm)))>; - def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), - (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), - sub_xmm)))>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), - (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), - sub_xmm)))>; def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))>; @@ -8598,7 +8579,7 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, @@ -8722,16 +8703,16 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), @@ -8776,10 +8757,10 @@ def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0) (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), VR128:$mask)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), 
(VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), @@ -8804,10 +8785,10 @@ def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0) (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), VR256:$mask)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), @@ -8865,12 +8846,13 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; } -defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; -defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; -defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; -defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; -defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; - +let Predicates = [HasAVX2, NoVLX] in { + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; +} //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, @@ -8905,3 +8887,59 @@ let mayLoad = 1, Constraints defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; } } + +//===----------------------------------------------------------------------===// +// Extra selection patterns for FR128, f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. +def : Pat<(store (f128 FR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; + +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fand FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(and FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86for FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(or FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fxor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(xor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index caecf70..c1df978 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -31,21 +31,21 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] -def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
-def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shl{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shl{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "shl{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -85,19 +85,19 @@ def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), "shl{b}\t{$src, $dst|$dst, $src}", [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src), "shl{w}\t{$src, $dst|$dst, $src}", [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), "shl{l}\t{$src, $dst|$dst, $src}", [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), "shl{q}\t{$src, $dst|$dst, $src}", [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -137,18 +137,18 @@ def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } -def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2), "shr{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shr{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shr{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "shr{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -185,19 +185,19 @@ def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), "shr{b}\t{$src, $dst|$dst, $src}", [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), 
addr:$dst)], IIC_SR>; -def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src), "shr{w}\t{$src, $dst|$dst, $src}", [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), "shr{l}\t{$src, $dst|$dst, $src}", [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), "shr{q}\t{$src, $dst|$dst, $src}", [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -241,20 +241,20 @@ def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), IIC_SR>; } -def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "sar{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "sar{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "sar{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "sar{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -298,19 +298,19 @@ def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src), +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), "sar{b}\t{$src, $dst|$dst, $src}", [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src), +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src), "sar{w}\t{$src, $dst|$dst, $src}", [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src), +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), "sar{l}\t{$src, $dst|$dst, $src}", [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src), +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), "sar{q}\t{$src, $dst|$dst, $src}", [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -342,7 +342,7 @@ let hasSideEffects = 0 in { let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), @@ 
-350,7 +350,7 @@ def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), @@ -358,7 +358,7 @@ def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), @@ -367,7 +367,7 @@ def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), @@ -376,7 +376,7 @@ def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), @@ -384,7 +384,7 @@ def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), @@ -392,7 +392,7 @@ def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), @@ -400,7 +400,7 @@ def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), @@ -411,36 +411,36 @@ def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { def RCL8m1 : I<0xD0, MRM2m, (outs), (ins 
i8mem:$dst), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in { @@ -482,19 +482,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } -def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "rol{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "rol{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -537,19 +537,19 @@ def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], 
IIC_SR>; } -def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1), +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), "rol{b}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; -def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1), +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1), "rol{w}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize16; -def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1), +def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), "rol{l}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize32; -def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1), +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), "rol{q}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; @@ -589,19 +589,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } -def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "ror{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "ror{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -644,19 +644,19 @@ def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src), +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), "ror{b}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src), +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src), "ror{w}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src), +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), "ror{l}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src), +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), "ror{q}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -727,42 +727,42 @@ def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), let isCommutable = 1 in { // These instructions commute to each other. 
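For the double-shift definitions that follow: X86shld shifts $src1 left by the count and fills the vacated low bits from the high end of $src2, which is why the SHLD/SHRD pair with complementary counts commute, per the isCommutable annotation above. A C sketch of the 32-bit operation, assuming a count in 1..31 (0 and 32 would make the second shift undefined in C); Clang and GCC can, but are not guaranteed to, select shld/shrd for this funnel-shift shape:

    /* shld semantics: result = (hi << n) | (lo >> (32 - n)),
       equivalently shrd(lo, hi, 32 - n). */
    unsigned shld32(unsigned hi, unsigned lo, unsigned n) {
        return (hi << n) | (lo >> (32 - n));
    }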
def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, @@ -801,14 +801,14 @@ def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), } def SHLD16mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD16_MEM_IM>, TB, OpSize16; def SHRD16mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], @@ -816,14 +816,14 @@ def SHRD16mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize16; def SHLD32mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD32_MEM_IM>, TB, OpSize32; def SHRD32mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], @@ -831,14 +831,14 @@ def SHRD32mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize32; def SHLD64mri8 : RIi8<0xA4, MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD64_MEM_IM>, TB; def SHRD64mri8 : RIi8<0xAC, 
MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], @@ -860,12 +860,12 @@ def ROT64L2R_imm8 : SDNodeXForm<imm, [{ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { let hasSideEffects = 0 in { - def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShift]>; let mayLoad = 1 in def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + (ins x86memop:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShiftLd]>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 0350566..a97d1e5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -44,7 +44,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", let SchedRW = [WriteSystem] in { -def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", +def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)], IIC_INT>; @@ -60,12 +60,6 @@ def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], IIC_SYS_ENTER_EXIT>, TB; def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [], IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>; - -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize16; -def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>, - OpSize32; -def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, - Requires<[In64BitMode]>; } // SchedRW def : Pat<(debugtrap), @@ -88,13 +82,13 @@ def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32; let Defs = [AL] in -def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port), "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; let Defs = [AX] in -def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16; let Defs = [EAX] in -def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32; let Uses = [DX, AL] in @@ -108,13 +102,13 @@ def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32; let Uses = [AL] in -def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port), "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; let Uses = [AX] in -def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16; let Uses = [EAX] in -def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32; } // SchedRW @@ -478,39 +472,60 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; 
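The XSAVE rework in the next hunk gates each instruction behind its own feature predicate and gives the save/restore forms real selection patterns: the memory operand moves from (outs) to (ins), and each form is matched to an int_x86_xsave* intrinsic whose 64-bit feature bitmap the compiler splits into the EDX:EAX pair the hardware reads. A minimal C sketch of what these patterns exist to select, assuming Clang/GCC's <immintrin.h> _xsave builtin and compilation with -mxsave:

    #include <immintrin.h>

    /* The save area must be 64-byte aligned and at least 576 bytes
       (512-byte legacy region plus the 64-byte XSAVE header); the
       hardware saves the components in RFBM & XCR0, so ~0ULL requests
       everything currently enabled. */
    void save_ext_state(void *area) {
        _xsave(area, ~0ULL);
    }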
//===----------------------------------------------------------------------===// // XSAVE instructions let SchedRW = [WriteSystem] in { +let Predicates = [HasXSAVE] in { let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; +} -let Uses = [RDX, RAX] in { - def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave\t$dst", []>, TB; - def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>; +let Uses = [EDX, EAX] in { +let Predicates = [HasXSAVE] in { + def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB; + def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor\t$dst", []>, TB; + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt\t$dst", []>, PS; - def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>; - + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEOPT] in { + def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS; + def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEC] in { + def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec\t$dst", + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVES] in { + def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors\t$dst", []>, TB; + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB; def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec\t$dst", []>, TB; - def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves\t$dst", []>, TB; - def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>; + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; } +} // Uses } // SchedRW //===----------------------------------------------------------------------===// @@ -534,6 +549,19 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { } let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in def 
MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; +//===----------------------------------------------------------------------===// +// PKU - enable protection key +let usesCustomInserter = 1 in { + def WRPKRU : PseudoI<(outs), (ins GR32:$src), + [(int_x86_wrpkru GR32:$src)]>; + def RDPKRU : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_rdpkru))]>; +} + +let Defs = [EAX, EDX], Uses = [ECX] in + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; +let Uses = [EAX, ECX, EDX] in + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; //===----------------------------------------------------------------------===// // FS/GS Base Instructions diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td index 8455b8d..4cb2304 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -83,57 +83,64 @@ let ExeDomain = SSEPackedDouble in { defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; } -multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, XOP_4VOp3; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>, - XOP_4V, VEX_W; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>, - XOP_4VOp3; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), + (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { - defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; - defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; - defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; - defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; - defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; - defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; - defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; - defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; - defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; - defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; - defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; - defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; + defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>; + defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>; + defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>; + defm
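Two things happen in the XSAVE hunk above: each instruction family is gated behind its own predicate (HasXSAVE, HasXSAVEOPT, HasXSAVEC, HasXSAVES), and every save/restore form gains a pattern tying it to its intrinsic with the requested-feature bitmap split across EDX:EAX. That split mirrors how the C-level builtin passes its 64-bit mask; a usage sketch, assuming -mxsave, with the 4096-byte size standing in for the real size reported by CPUID leaf 0Dh:

    #include <immintrin.h>
    #include <cstdlib>
    #include <cstring>

    void snapshot_x87_sse() {
      void *area = std::aligned_alloc(64, 4096); // XSAVE area is 64-byte aligned
      std::memset(area, 0, 4096);                // header area should start zeroed
      _xsave(area, 0x3);                         // mask EDX:EAX = x87 | SSE state
      std::free(area);
    }

The PKU additions pair custom-inserted pseudos for int_x86_rdpkru/int_x86_wrpkru with the real RDPKRU/WRPKRU encodings, whose Uses/Defs pin PKRU traffic to EAX, with ECX (and, for WRPKRU, EDX) expected to be zero. Each protection key owns two PKRU bits, access-disable and write-disable; a hedged sketch of making a key read-only (pkey is a hypothetical key index, e.g. obtained via pkey_alloc() on Linux):

    #include <immintrin.h>

    void make_key_read_only(unsigned pkey) {
      unsigned pkru = _rdpkru_u32();    // should select the RDPKRU pseudo above
      pkru &= ~(1u << (2 * pkey));      // clear the access-disable bit
      pkru |=  (1u << (2 * pkey + 1));  // set the write-disable bit
      _wrpkru(pkru);                    // should select the WRPKRU pseudo
    }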
VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>; + defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>; } -multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, XOP; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP; } let ExeDomain = SSEPackedInt in { - defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; - defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; - defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>; } // Instruction where second source can be memory, but third must be register @@ -170,30 +177,34 @@ let ExeDomain = SSEPackedInt in { } // Instruction where second source can be memory, third must be imm8 -multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { +multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> { let isCommutable = 1 in def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>, + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + i8immZExt3:$cc)))]>, XOP_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), - i8immZExt3:$cc))]>, XOP_4V; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))), + i8immZExt3:$cc)))]>, + XOP_4V; let isAsmParserOnly = 1, hasSideEffects = 0 in { def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; let mayLoad = 1 in def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; @@ -201,14 +212,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { } let ExeDomain = SSEPackedInt in { // SSE integer instructions - defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; - defm VPCOMW : xopvpcom<0xCD, "w", 
int_x86_xop_vpcomw>; - defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; - defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; - defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; - defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; - defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; - defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; + defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>; + defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>; + defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>; + defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>; + defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>; + defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>; + defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; } // Instruction where either second or third source can be memory @@ -270,42 +281,52 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { let ExeDomain = SSEPackedInt in defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +let Predicates = [HasXOP] in { + def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>; +} + multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, VEX_W, MemOp4; def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, VEX_W, MemOp4, VEX_L; def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, 
$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 2c8b95b..b525d5e 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -18,14 +18,19 @@ namespace llvm { enum IntrinsicType { INTR_NO_TYPE, - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, - CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, - INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, - VPERM_3OP_MASKZ, - INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, - EXPAND_FROM_MEM, BLEND + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, + INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, + FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, + VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, + INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, + EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; struct IntrinsicData { @@ -138,6 +143,54 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + 
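One change slightly further up deserves a note: the new VPCMOV patterns select the generic bitwise-select DAG directly, with no intrinsic involved. Since X86andnp(x, y) computes ~x & y, the matched expression (or (and $src3, $src1), (X86andnp $src3, $src2)) is the classic per-bit select. A scalar model of mine, not the patch's:

    #include <cstdint>

    // vpcmov: each result bit comes from a where sel is 1, from b where sel is 0.
    uint64_t pcmov(uint64_t a, uint64_t b, uint64_t sel) {
      return (sel & a) | (~sel & b);
    }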
X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -209,6 +262,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -241,6 +296,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), @@ -274,16 +330,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + 
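Among the AVX2 table entries above, avx2_pavg_b/w now lower through X86ISD::AVG and avx2_psad_bw through X86ISD::PSADBW instead of remaining opaque intrinsic calls. Per-element models (my sketches, widened internally so the arithmetic cannot overflow):

    #include <cstdint>
    #include <cstdlib>

    // PAVGB: unsigned byte average with rounding, (a + b + 1) >> 1.
    uint8_t pavgb(uint8_t a, uint8_t b) {
      return static_cast<uint8_t>((unsigned(a) + unsigned(b) + 1) >> 1);
    }

    // PSADBW, one 8-byte lane: sum of absolute differences of the bytes.
    unsigned psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
      unsigned sum = 0;
      for (int i = 0; i < 8; ++i)
        sum += static_cast<unsigned>(std::abs(int(a[i]) - int(b[i])));
      return sum;
    }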
X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -371,6 +467,50 @@ static const IntrinsicData 
IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -388,6 +528,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -415,7 +559,184 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - + 
X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, X86ISD::VFPROUND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, X86ISD::VFPEXT), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 
X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTUDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, @@ -452,6 +773,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, @@ -464,6 +793,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + 
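A reading aid for the cvtt* block above: the extra 't' is "convert with truncation", i.e. round toward zero, which is exactly C's float-to-integer cast. That is why these entries map onto the plain ISD::FP_TO_SINT / ISD::FP_TO_UINT opcodes, while their non-truncating cvt* siblings carry the rounding-mode-aware *_RND nodes:

    // vcvttps2dq-style semantics: (int)x truncates toward zero,
    // so no rounding-mode operand is needed in the DAG node.
    int cvtt_model(float x) { return static_cast<int>(x); }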
X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, @@ -472,10 +857,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, @@ -484,10 +869,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), - 
X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, @@ -554,6 +961,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), @@ -596,6 +1009,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, 
INTR_TYPE_2OP_MASK, ISD::SMAX, 0), @@ -644,6 +1069,186 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), 
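The pmov/pmovs/pmovus triples running through here encode three flavors of vector narrowing, mapped to X86ISD::VTRUNC, VTRUNCS and VTRUNCUS respectively: plain bit truncation, signed saturation, and unsigned saturation. Scalar sketches for the dword-to-byte case (mine, not the patch's):

    #include <cstdint>
    #include <algorithm>

    uint8_t pmov_db(int32_t x) {      // VTRUNC: keep the low 8 bits
      return static_cast<uint8_t>(x);
    }
    int8_t pmovs_db(int32_t x) {      // VTRUNCS: clamp to [-128, 127]
      return static_cast<int8_t>(std::min(127, std::max(-128, x)));
    }
    uint8_t pmovus_db(uint32_t x) {   // VTRUNCUS: source read as unsigned,
      return static_cast<uint8_t>(std::min(x, 255u));  // clamped to 255
    }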
+ X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, @@ -680,28 +1285,139 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + 
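The prol/pror versus prolv/prorv split above is immediate-count versus variable-count rotation: the immediate forms keep target nodes (X86ISD::VROTLI/VROTRI) while the variable forms map straight onto the generic ISD::ROTL/ROTR opcodes. For reference, a scalar rotate-left spelled out so the count-zero case stays well defined (C++20's std::rotl computes the same thing):

    #include <cstdint>

    uint32_t rotl32(uint32_t x, unsigned c) {
      c &= 31;                                   // count is taken mod 32
      return (x << c) | (x >> ((32 - c) & 31));  // the final & 31 handles c == 0
    }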
X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv16_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv2_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv32hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, 
ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + 
X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), @@ -728,16 +1444,98 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK, + 
X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + 
X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, @@ -750,6 +1548,38 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, @@ -758,6 +1588,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, @@ -782,9 +1616,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + 
X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, @@ -821,7 +1700,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, @@ -852,54 +1730,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK, + 
X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK, @@ -910,7 +1790,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), 
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, @@ -959,14 +1850,59 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), + X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + 
X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), @@ -1017,6 +1953,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), @@ -1024,6 +1962,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), @@ -1066,12 +2005,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), @@ -1105,7 +2038,31 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), - X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0) + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, 
X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0) }; /* @@ -1128,6 +2085,102 @@ static void verifyIntrinsicTables() { std::is_sorted(std::begin(IntrinsicsWithChain), std::end(IntrinsicsWithChain)) && "Intrinsic data tables should be sorted by Intrinsic ID"); + assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) == + std::end(IntrinsicsWithoutChain)) && + (std::adjacent_find(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) == + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should have unique entries"); +} + +// X86 specific compare constants. 
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple <isOrdered, X86 condcode> +*/ +static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal 
(unordered, nonsignaling) + return std::make_tuple(false , X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } } } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 3415ced..e1ca558 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -14,6 +14,7 @@ #include "X86AsmPrinter.h" #include "X86RegisterInfo.h" +#include "X86ShuffleDecodeConstantPool.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "Utils/X86ShuffleDecode.h" @@ -92,7 +93,6 @@ namespace llvm { SmallVector<MCFixup, 4> Fixups; raw_svector_ostream VecOS(Code); CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); CurrentShadowSize += Code.size(); if (CurrentShadowSize >= RequiredShadowSize) InShadow = false; // The shadow is big enough. Stop counting. @@ -128,7 +128,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. 
MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = MF.getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); MCSymbol *Sym = nullptr; @@ -151,7 +151,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } if (!Suffix.empty()) - Name += DL->getPrivateGlobalPrefix(); + Name += DL.getPrivateGlobalPrefix(); unsigned PrefixLen = Name.size(); @@ -159,7 +159,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { const GlobalValue *GV = MO.getGlobal(); AsmPrinter.getNameWithPrefix(Name, GV); } else if (MO.isSymbol()) { - Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else if (MO.isMBB()) { assert(Suffix.empty()); Sym = MO.getMBB()->getSymbol(); @@ -455,12 +455,9 @@ ReSimplify: "LEA has segment specified!"); break; - case X86::MOV32ri64: - OutMI.setOpcode(X86::MOV32ri); - break; - // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B // if one of the registers is extended, but the other isn't. + case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -478,18 +475,19 @@ ReSimplify: unsigned NewOpc; switch (OutMI.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } OutMI.setOpcode(NewOpc); } @@ -532,6 +530,23 @@ ReSimplify: break; } + case X86::CLEANUPRET: { + // Replace CLEANUPRET with the appropriate RET. + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CATCHRET: { + // Replace CATCHRET with the appropriate RET. + const X86Subtarget &Subtarget = AsmPrinter.getSubtarget(); + unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(Subtarget)); + OutMI.addOperand(MCOperand::createReg(ReturnReg)); + break; + } + // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
case X86::TAILJMPr: case X86::TAILJMPd: @@ -598,17 +613,29 @@ ReSimplify: case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; @@ -875,7 +902,10 @@ void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, MCInst LoadMI; LoadMI.setOpcode(LoadOpcode); - LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + + if (LoadDefRegister != X86::NoRegister) + LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, E = MI.operands_end(); I != E; ++I) @@ -1062,6 +1092,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86ATTInstPrinter::getRegisterName(Reg)); break; } + case X86::CLEANUPRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CLEANUPRET"); + break; + } + + case X86::CATCHRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CATCHRET"); + break; + } + case X86::TAILJMPr: case X86::TAILJMPm: case X86::TAILJMPd: @@ -1095,12 +1137,30 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + const X86FrameLowering* FrameLowering = + MF->getSubtarget<X86Subtarget>().getFrameLowering(); + bool hasFP = FrameLowering->hasFP(*MF); + + // TODO: This is needed only if we require precise CFA. 
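+ // (Illustrative gloss, not part of the original commit: the CALLpcrel32 emitted above pushes a return address, moving the CFA by one slot; when no frame pointer tracks the frame, the DWARF frame info must be adjusted via the paired EmitCFIAdjustCfaOffset calls below and restored once POP32r pops the PIC base.)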
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && + !OutStreamer->getDwarfFrameInfos().back().End; + + int stackGrowth = -RI->getSlotSize(); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth); + } + // Emit the label. OutStreamer->EmitLabel(PICBase); // popl $reg EmitAndCountInstruction(MCInstBuilder(X86::POP32r) .addReg(MI->getOperand(0).getReg())); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth); + } return; } @@ -1206,19 +1266,48 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - // Lower PSHUFB and VPERMILP normally but add a comment if we can find - // a constant shuffle mask. We won't be able to do this at the MC layer - // because the mask isn't an immediate. + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. case X86::PSHUFBrm: case X86::VPSHUFBrm: - case X86::VPSHUFBYrm: { + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrm: + case X86::VPSHUFBZrmk: + case X86::VPSHUFBZrmkz: { if (!OutStreamer->isVerboseAsm()) break; - assert(MI->getNumOperands() > 5 && - "We should always have at least 5 operands!"); + unsigned SrcIdx, MaskIdx; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZrm: + SrcIdx = 1; MaskIdx = 5; break; + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrmkz: + SrcIdx = 2; MaskIdx = 6; break; + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZrmk: + SrcIdx = 3; MaskIdx = 7; break; + } + + assert(MI->getNumOperands() >= 6 && + "We should always have at least 6 operands!"); const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp = MI->getOperand(1); - const MachineOperand &MaskOp = MI->getOperand(5); + const MachineOperand &SrcOp = MI->getOperand(SrcIdx); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; @@ -1240,35 +1329,53 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &SrcOp = MI->getOperand(1); const MachineOperand &MaskOp = MI->getOperand(5); + unsigned ElSize; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break; + case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break; + } + if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; - DecodeVPERMILPMask(C, Mask); + DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); } break; } - // For loads from a constant pool to a vector register, print the constant - // loaded. 
- case X86::MOVAPDrm: - case X86::VMOVAPDrm: - case X86::VMOVAPDYrm: - case X86::MOVUPDrm: - case X86::VMOVUPDrm: - case X86::VMOVUPDYrm: - case X86::MOVAPSrm: - case X86::VMOVAPSrm: - case X86::VMOVAPSYrm: - case X86::MOVUPSrm: - case X86::VMOVUPSrm: - case X86::VMOVUPSYrm: - case X86::MOVDQArm: - case X86::VMOVDQArm: - case X86::VMOVDQAYrm: - case X86::MOVDQUrm: - case X86::VMOVDQUrm: - case X86::VMOVDQUYrm: +#define MOV_CASE(Prefix, Suffix) \ + case X86::Prefix##MOVAPD##Suffix##rm: \ + case X86::Prefix##MOVAPS##Suffix##rm: \ + case X86::Prefix##MOVUPD##Suffix##rm: \ + case X86::Prefix##MOVUPS##Suffix##rm: \ + case X86::Prefix##MOVDQA##Suffix##rm: \ + case X86::Prefix##MOVDQU##Suffix##rm: + +#define MOV_AVX512_CASE(Suffix) \ + case X86::VMOVDQA64##Suffix##rm: \ + case X86::VMOVDQA32##Suffix##rm: \ + case X86::VMOVDQU64##Suffix##rm: \ + case X86::VMOVDQU32##Suffix##rm: \ + case X86::VMOVDQU16##Suffix##rm: \ + case X86::VMOVDQU8##Suffix##rm: \ + case X86::VMOVAPS##Suffix##rm: \ + case X86::VMOVAPD##Suffix##rm: \ + case X86::VMOVUPS##Suffix##rm: \ + case X86::VMOVUPD##Suffix##rm: + +#define CASE_ALL_MOV_RM() \ + MOV_CASE(, ) /* SSE */ \ + MOV_CASE(V, ) /* AVX-128 */ \ + MOV_CASE(V, Y) /* AVX-256 */ \ + MOV_AVX512_CASE(Z) \ + MOV_AVX512_CASE(Z256) \ + MOV_AVX512_CASE(Z128) + + // For loads from a constant pool to a vector register, print the constant + // loaded. + CASE_ALL_MOV_RM() if (!OutStreamer->isVerboseAsm()) break; if (MI->getNumOperands() > 4) @@ -1302,7 +1409,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (isa<UndefValue>(COp)) { CS << "u"; } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { - CS << CI->getZExtValue(); + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { SmallString<32> Str; CF->getValueAPF().toString(Str); diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index ac2cdc8c..c9e636f 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===// +//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index e6db970..00515dd 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// +//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -84,14 +84,18 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// of pushes to pass function parameters. bool HasPushSequences = false; - /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill - /// slot for the frame pointer. + /// True if the function recovers from an SEH exception, and therefore needs + /// to spill and restore the frame pointer. 
bool HasSEHFramePtrSave = false; /// The frame index of a stack object containing the original frame pointer /// used to address arguments in a function using a base pointer. int SEHFramePtrSaveIndex = 0; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR = false; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. @@ -100,7 +104,7 @@ private: public: X86MachineFunctionInfo() = default; - explicit X86MachineFunctionInfo(MachineFunction &MF) {}; + explicit X86MachineFunctionInfo(MachineFunction &MF) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -160,6 +164,9 @@ public: SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } + + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 0000000..45cc0ae --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,503 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does two things: +// 1) If there are two LEA instructions calculating addresses which only differ +// by displacement inside a basic block, one of them is removed. +// 2) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, + cl::desc("X86: Enable LEA optimizations."), + cl::init(false)); + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); +STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions, if it's already + /// been calculated by LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns a distance between two instructions inside one basic block. 
+ /// A negative result means that the instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in the \p MI instruction. Return the address displacement + /// and the distance between \p MI and the chosen \p LEA in \p AddrDispShift + /// and \p Dist. + bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operands are identical and they are not + /// physical registers. + bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA. + bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if the \p Last LEA instruction can be replaced by the + /// \p First. The difference between displacements of the addresses calculated + /// by these LEAs is returned in \p AddrDispShift. It'll be used for proper + /// replacement of the \p Last LEA's uses with the \p First's def register. + bool isReplaceable(const MachineInstr &First, const MachineInstr &Last, + int64_t &AddrDispShift); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. Also, assign position + /// numbers to all instructions in the basic block to speed up calculation of + /// distance between them. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes redundant address calculations. + bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes LEAs which calculate similar addresses. + bool removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List); + + DenseMap<const MachineInstr *, unsigned> InstrPos; + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + // Both instructions must be in the same basic block and they must be + // present in InstrPos. + assert(Last.getParent() == First.getParent() && + "Instructions are in different basic blocks"); + assert(InstrPos.find(&First) != InstrPos.end() && + InstrPos.find(&Last) != InstrPos.end() && + "Instructions' positions are undefined"); + + return InstrPos[&Last] - InstrPos[&First]; +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible.
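+//
+// An illustrative sketch of the substitution these requirements enable
+// (editorial aside, not from the commit; registers and displacements are
+// hypothetical):
+//
+//   leaq 16(%rdi), %rax     ; candidate LEA: address = rdi + 16
+//   movq 24(%rdi), %rcx     ; MI:            address = rdi + 24
+//
+// The addresses differ only by displacement (AddrDispShift = 24 - 16 = 8), so
+// the load can reuse the LEA def as its base: movq 8(%rax), %rcx. Requirement
+// 3 prefers this LEA over one whose shifted displacement would need 4 bytes.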
+bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare the instructions' memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure the address displacement fits in 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that the LEA def register can be used as the MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI; however, since this case is very rare and hard to + // reproduce in a test, it's just more reliable to skip the LEA. + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we take the resulting address displacement into + // account as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. + int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update the returned LEA if the current one provides a displacement + // that fits in 1 byte while the new candidate's does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. + if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check that the Last LEA can be replaced by the First LEA. For this to be +// possible, the following requirements must be met: +// 1) Addresses calculated by the LEAs differ only by displacement. +// 2) Def registers of the LEAs belong to the same class. +// 3) All uses of the Last LEA def register are replaceable, i.e. the +// register is used only as an address base. +bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, + const MachineInstr &Last, + int64_t &AddrDispShift) { + assert(isLEA(First) && isLEA(Last) && + "The function works only with LEA instructions"); + + // Compare the instructions' memory operands. + if (!isSimilarMemOp(Last, 1, First, 1, AddrDispShift)) + return false; + + // Make sure that the LEA def registers belong to the same class. There may be + // instructions (like MOV8mr_NOREX) which allow a limited set of registers to + // be used as their operands, so we must be sure that replacing one LEA + // with another won't lead to putting a wrong register in the instruction.
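+ // (An illustrative case, not from the original comment: if the Last def + // register had been constrained to a NOREX class by a MOV8mr_NOREX user + // while the First def is in a plain GR class, the classes differ and the + // replacement is conservatively rejected below.)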
+ if (MRI->getRegClass(First.getOperand(0).getReg()) != + MRI->getRegClass(Last.getOperand(0).getReg())) + return false; + + // Loop over all uses of the Last LEA to check that its def register is + // used only as an address base for memory accesses. If so, it can be + // replaced; otherwise it cannot. + for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) { + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()); + + // If the use instruction has no memory operand, the LEA is not + // replaceable. + if (MemOpNo < 0) + return false; + + MemOpNo += X86II::getOperandBias(Desc); + + // If the address base of the use instruction is not the LEA def register, + // the LEA is not replaceable. + if (!isIdenticalOp(MI.getOperand(MemOpNo + X86::AddrBaseReg), MO)) + return false; + + // If the LEA def register is used as any other operand of the use + // instruction, the LEA is not replaceable. + for (unsigned i = 0; i < MI.getNumOperands(); i++) + if (i != (unsigned)(MemOpNo + X86::AddrBaseReg) && + isIdenticalOp(MI.getOperand(i), MO)) + return false; + + // Check that the new address displacement will fit in 4 bytes. + if (MI.getOperand(MemOpNo + X86::AddrDisp).isImm() && + !isInt<32>(MI.getOperand(MemOpNo + X86::AddrDisp).getImm() + + AddrDispShift)) + return false; + } + + return true; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List) { + unsigned Pos = 0; + for (auto &MI : MBB) { + // Assign a position number to the instruction. Note that we are going to + // move some instructions during the optimization; however, there will never + // be a need to move two instructions before any selected instruction. So, to + // avoid multiple position updates during moves, we just increase the position + // counter by two, leaving a free slot for an instruction that may be moved. + InstrPos[&MI] = Pos += 2; + + if (isLEA(MI)) + List.push_back(const_cast<MachineInstr *>(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register.
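+// A hypothetical sketch of the rewrite performed below (operand values are +// illustrative, not from the original patch): a store such as +//   MOV32mi %rdi, 4, %rsi, 24, %noreg, 0 +// whose address was already produced by +//   %rax = LEA64r %rdi, 4, %rsi, 16, %noreg +// ends up with base = %rax, scale = 1, index = %noreg, disp = 8 and +// segment = %noreg, i.e. MOV32mi %rax, 1, %noreg, 8, %noreg, 0.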
+bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in the basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // The instruction must be a load or a store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If the instruction has no memory operand, skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace the address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If the LEA occurs before the current instruction, we can freely replace + // the instruction. If the LEA occurs after it, we can lift the LEA above the + // instruction and thereby be able to replace it. Since the LEA and the + // instruction have similar memory operands (and thus the same def + // instructions for those operands), we can always do that without + // worrying about using registers before their defs. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + InstrPos[DefMI] = InstrPos[&MI] - 1; + + // Make sure the instructions' position numbers are sane. + assert(((InstrPos[DefMI] == 1 && DefMI == MBB->begin()) || + InstrPos[DefMI] > + InstrPos[std::prev(MachineBasicBlock::iterator(DefMI))]) && + "Instruction positioning is broken"); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change the instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +// Try to find similar LEAs in the list and replace one with another. +bool +OptimizeLEAPass::removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + // Loop over all LEA pairs. + auto I1 = List.begin(); + while (I1 != List.end()) { + MachineInstr &First = **I1; + auto I2 = std::next(I1); + while (I2 != List.end()) { + MachineInstr &Last = **I2; + int64_t AddrDispShift; + + // LEAs should be in occurrence order in the list, so we can freely + // replace later LEAs with earlier ones. + assert(calcInstrDist(First, Last) > 0 && + "LEAs must be in occurrence order in the list"); + + // Check that the Last LEA instruction can be replaced by the First. + if (!isReplaceable(First, Last, AddrDispShift)) { + ++I2; + continue; + } + + // Loop over all uses of the Last LEA and update their operands. Note that + // the correctness of this has already been checked in the isReplaceable + // function.
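+ // Worked example with made-up values: if First computes base + 8 and + // Last computes base + 24, then AddrDispShift is 16; a use addressing + // 4(%last) is rewritten below to 20(%first), which is the same location + // because 24 + 4 == 8 + 20.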
+ for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()), + UE = MRI->use_end(); + UI != UE;) { + MachineOperand &MO = *UI++; + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + // Update the address base. + MO.setReg(First.getOperand(0).getReg()); + + // Update the address disp. + MachineOperand *Op = &MI.getOperand(MemOpNo + X86::AddrDisp); + if (Op->isImm()) + Op->setImm(Op->getImm() + AddrDispShift); + else if (Op->isGlobal()) + Op->setOffset(Op->getOffset() + AddrDispShift); + else + llvm_unreachable("Invalid address displacement operand"); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(First.getOperand(0).getReg()); + + ++NumRedundantLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump();); + + // At this point, all of the Last LEA's uses have been replaced, so we can + // freely remove it. + assert(MRI->use_empty(Last.getOperand(0).getReg()) && + "The LEA's def register must have no uses"); + Last.eraseFromParent(); + + // Erase the removed LEA from the list. + I2 = List.erase(I2); + + Changed = true; + } + ++I1; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + // Perform this optimization only if we care about code size. + if (!EnableX86LEAOpt || !MF.getFunction()->optForSize()) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector<MachineInstr *, 16> LEAs; + InstrPos.clear(); + + // Find all LEA instructions in the basic block. + findLEAs(MBB, LEAs); + + // If the current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant LEA instructions. The optimization may have a negative + // effect on performance, so do it only for -Oz. + if (MF.getFunction()->optForMinSize()) + Changed |= removeRedundantLEAs(LEAs); + + // Remove redundant address calculations. + Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index 143e70b..0f425e2 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -93,8 +93,7 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize)) { + if (MF.getFunction()->optForSize()) { return false; } @@ -107,7 +106,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); VisitedBBs.clear(); - findReturns(MF.begin()); + findReturns(&MF.front()); bool MadeChange = false; diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index d8495e5..274b566 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" @@ -44,12 +43,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "X86GenRegisterInfo.inc" -cl::opt<bool> -ForceStackAlign("force-align-stack", - cl::desc("Force align the stack to the minimum alignment" - " needed for the function."), - cl::init(false), cl::Hidden); - static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); @@ -174,21 +167,34 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; - case 2: // Available for tailcall (not callee-saved GPRs). - const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) - return &X86::GR64_TCW64RegClass; - else if (Is64Bit) - return &X86::GR64_TCRegClass; - - bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); - if (hasHipeCC) - return &X86::GR32RegClass; - return &X86::GR32_TCRegClass; + case 2: // NOREX GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREXRegClass; + return &X86::GR32_NOREXRegClass; + case 3: // NOREX GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREX_NOSPRegClass; + return &X86::GR32_NOREX_NOSPRegClass; + case 4: // Available for tailcall (not callee-saved GPRs). + return getGPRsForTailCall(MF); } } const TargetRegisterClass * +X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + return &X86::GR64_TCW64RegClass; + else if (Is64Bit) + return &X86::GR64_TCRegClass; + + bool hasHipeCC = (F ? 
F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; + return &X86::GR32_TCRegClass; +} + +const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) @@ -222,6 +228,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); bool CallsEHReturn = MF->getMMI().callsEHReturn(); @@ -241,6 +248,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ? + CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; @@ -254,6 +266,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_Intel_OCL_BI_SaveList; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; @@ -264,6 +278,18 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + else + return CSR_64_AllRegs_SaveList; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_SaveList; + else + return CSR_32_AllRegs_SaveList; + } default: break; } @@ -280,10 +306,20 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_32_SaveList; } +const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR()) + return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); @@ -301,6 +337,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; @@ -314,16 +354,30 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_64_Intel_OCL_BI_RegMask; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_RegMask; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - default: - break; case CallingConv::X86_64_Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_RegMask; + else + return CSR_64_AllRegs_RegMask; + } else { + if (HasSSE) + return 
CSR_32_AllRegs_SSE_RegMask; + else + return CSR_32_AllRegs_RegMask; + } + default: + break; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check @@ -341,6 +395,10 @@ X86RegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } +const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const { + return CSR_64_TLS_Darwin_RegMask; +} + BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const X86FrameLowering *TFI = getFrameLowering(MF); @@ -371,8 +429,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64, - false); + unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); @@ -439,6 +496,10 @@ void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// +static bool CantUseSP(const MachineFrameInfo *MFI) { + return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); +} + bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -451,13 +512,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // reference locals while also adjusting the stack pointer. When we can't // use both the SP and the FP, we need a separate base pointer register. bool CantUseFP = needsStackRealignment(MF); - bool CantUseSP = - MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); - return CantUseFP && CantUseSP; + return CantUseFP && CantUseSP(MFI); } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + if (!TargetRegisterInfo::canRealignStack(MF)) return false; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -470,26 +529,11 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { // If a base pointer is necessary. Check that it isn't too late to reserve // it. - if (MFI->hasVarSizedObjects()) + if (CantUseSP(MFI)) return MRI->canReserveReg(BasePtr); return true; } -bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - // If we've requested that we force align the stack do so now. - if (ForceStackAlign) - return canRealignStack(MF); - - return requiresRealignment && canRealignStack(MF); -} - bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { // Since X86 defines assignCalleeSavedSpillSlots which always return true @@ -510,6 +554,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Opc = MI.getOpcode(); bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; + if (hasBasePointer(MF)) BasePtr = (FrameIndex < 0 ? 
FramePtr : getBaseRegister()); else if (needsStackRealignment(MF)) @@ -524,14 +569,11 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. + unsigned IgnoredFrameReg; if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int Offset; - if (IsWinEH) - Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex); - else - Offset = TFI->getFrameIndexOffset(MF, FrameIndex); + Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); FI.ChangeToImmediate(Offset); return; } @@ -540,7 +582,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) - BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false); + BasePtr = getX86SubSuperRegister(BasePtr, 64); // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. @@ -553,7 +595,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const MachineFrameInfo *MFI = MF.getFrameInfo(); FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); } else - FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); if (BasePtr == StackPtr) FIOffset += SPAdj; @@ -592,193 +634,11 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); unsigned FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) - FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); + FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; } -namespace llvm { -unsigned getX86SubSuperRegisterOrZero(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - switch (VT) { - default: return 0; - case MVT::i8: - if (High) { - switch (Reg) { - default: return getX86SubSuperRegister(Reg, MVT::i64); - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AH; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DH; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CH; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BH; - } - } else { - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AL; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DL; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CL; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BL; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return 
X86::SIL; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DIL; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BPL; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SPL; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8B; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9B; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10B; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11B; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12B; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13B; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14B; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15B; - } - } - case MVT::i16: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8W; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9W; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10W; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11W; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12W; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13W; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14W; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15W; - } - case MVT::i32: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::EAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::EDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::ECX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::EBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::ESI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::EDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::EBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::ESP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8D; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9D; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10D; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11D; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12D; - case X86::R13B: case X86::R13W: case X86::R13D: 
case X86::R13: - return X86::R13D; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14D; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15D; - } - case MVT::i64: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::RAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::RDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::RCX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::RBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::RSI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::RDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::RBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::RSP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15; - } - } -} - -unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - unsigned Res = getX86SubSuperRegisterOrZero(Reg, VT, High); - if (Res == 0) - llvm_unreachable("Unexpected register or VT"); - return Res; -} - -unsigned get512BitSuperRegister(unsigned Reg) { +unsigned llvm::get512BitSuperRegister(unsigned Reg) { if (Reg >= X86::XMM0 && Reg <= X86::XMM31) return X86::ZMM0 + (Reg - X86::XMM0); if (Reg >= X86::YMM0 && Reg <= X86::YMM31) @@ -787,5 +647,3 @@ unsigned get512BitSuperRegister(unsigned Reg) { return Reg; llvm_unreachable("Unexpected SIMD register"); } - -} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 8de1d0b..8d0094c 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -87,6 +87,11 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + /// getGPRsForTailCall - Returns a register class with registers that can be + /// used in forming tail calls. + const TargetRegisterClass * + getGPRsForTailCall(const MachineFunction &MF) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -94,9 +99,15 @@ public: /// callee-save registers on this target. const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. 
+ const uint32_t *getDarwinTLSCallPreservedMask() const; /// getReservedRegs - Returns a bitset indexed by physical register number /// indicating if a register is a special register that has particular uses and @@ -108,9 +119,7 @@ public: bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const override; @@ -128,16 +137,6 @@ public: unsigned getSlotSize() const { return SlotSize; } }; -/// Returns the sub or super register of a specific X86 register. -/// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX. -/// Aborts on error. -unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); - -/// Returns the sub or super register of a specific X86 register. -/// Like getX86SubSuperRegister() but returns 0 on error. -unsigned getX86SubSuperRegisterOrZero(unsigned, MVT::SimpleValueType, - bool High = false); - //get512BitRegister - X86 utility - returns 512-bit super register unsigned get512BitSuperRegister(unsigned Reg); diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index cdb151c..56f0d93 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -225,15 +225,15 @@ let SubRegIndices = [sub_ymm] in { } } - // Mask Registers, used by AVX-512 instructions. - def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; - def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; - def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; - def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; - def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; - def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; - def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; - def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; +// Mask Registers, used by AVX-512 instructions. +def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; +def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; +def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; +def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; +def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; +def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; +def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; +def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That @@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, - R8, R9, R11)>; + R8, R9, R10, R11, RIP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, @@ -423,6 +423,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>; + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. 
This will cause us to spill @@ -442,10 +444,11 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { } // Generic vector registers: VR64 and VR128. +// Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -459,8 +462,8 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, - (sequence "ZMM%u", 0, 31)>; +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 31)>; // Scalar AVX-512 floating point registers. def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; @@ -468,10 +471,10 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - 256, (sequence "YMM%u", 0, 31)>; +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 31)>; // Mask registers def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} @@ -491,4 +494,4 @@ def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} // Bound registers -def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
\ No newline at end of file +def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index ce79fcf..b1a0161 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -44,13 +44,10 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget<X86Subtarget>(); @@ -74,10 +71,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); - if (const char *bzeroEntry = V && + if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -94,7 +91,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, 0) .setDiscardResult(); - std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -144,8 +141,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; @@ -172,9 +169,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : - X86::ECX, - Left, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, + Left, InFlag); InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; @@ -249,17 +245,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? 
X86::RSI : - X86::ESI, - Src, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp new file mode 100644 index 0000000..ef16c5b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -0,0 +1,190 @@ +//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. +// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecodeConstantPool.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/Constants.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + +#ifndef NDEBUG + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); +#endif + + // This is a straightforward byte vector. + if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { + int NumElements = MaskTy->getVectorNumElements(); + ShuffleMask.reserve(NumElements); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i & ~0xf; + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. 
the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure it's an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // Support any element type from byte up to element size. + // This is necessary primarily because 64-bit elements get split to 32-bit + // in the constant pool on 32-bit targets. + unsigned EltTySize = VecEltTy->getIntegerBitWidth(); + if (EltTySize < 8 || EltTySize > ElSize) + return; + + unsigned NumElements = MaskTySize / ElSize; + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + ShuffleMask.reserve(NumElements); + unsigned NumElementsPerLane = 128 / ElSize; + unsigned Factor = ElSize / EltTySize; + + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i * Factor); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + int Index = i & ~(NumElementsPerLane - 1); + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + if (ElSize == 64) + Index += (Element >> 1) & 0x1; + else + Index += Element & 0x3; + ShuffleMask.push_back(Index); + } + + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + if (MaskTy->isVectorTy()) { + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements) - 1; + ShuffleMask.push_back(Element); + } + } + } + return; + } + // Scalar value; just broadcast it. + if (!isa<ConstantInt>(C)) + return; + uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); + int NumElements = VT.getVectorNumElements(); + Element &= (1 << NumElements) - 1; + for (int i = 0; i < NumElements; ++i) + ShuffleMask.push_back(Element); +} + +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements*2) - 1; + ShuffleMask.push_back(Element); + } + } + } +} +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h new file mode 100644 index 0000000..bcf4632 --- /dev/null +++
b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -0,0 +1,45 @@ +//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H +#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H + +#include "llvm/ADT/SmallVector.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class Constant; +class MVT; + +/// \brief Decode a PSHUFB mask from an IR-level vector constant. +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index dff3624..8ef08c9 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -44,9 +44,8 @@ X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, cl::desc("Enable early if-conversion on X86")); -/// ClassifyBlockAddressReference - Classify a blockaddress reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a blockaddress reference for the current subtarget according to how +/// we should reference it in a non-pcrel context. unsigned char X86Subtarget::ClassifyBlockAddressReference() const { if (isPICStyleGOT()) // 32-bit ELF targets. return X86II::MO_GOTOFF; @@ -58,9 +57,8 @@ unsigned char X86Subtarget::ClassifyBlockAddressReference() const { return X86II::MO_NO_FLAG; } -/// ClassifyGlobalReference - Classify a global variable reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a global variable reference for the current subtarget according to +/// how we should reference it in a non-pcrel context. 
unsigned char X86Subtarget:: ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // DLLImport only exists on windows, it is implemented as a load from a @@ -147,9 +145,9 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { } -/// getBZeroEntry - This function returns the name of a function which has an -/// interface like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over memset with zero +/// This function returns the name of a function which has an interface like +/// the non-standard bzero function, if such a function exists on the +/// current subtarget and it is considered preferable over memset with zero /// passed as the second argument. Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. @@ -166,8 +164,7 @@ bool X86Subtarget::hasSinCos() const { is64Bit(); } -/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls -/// to immediate address. +/// Return true if the subtarget allows calls to immediate address. bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, @@ -192,9 +189,25 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); + // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of + // 16-bytes and under that are reasonably fast. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures respectively. 
+ if (hasSSE42() || hasSSE4A()) + IsUAMem16Slow = false; + InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -224,13 +237,18 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { } void X86Subtarget::initializeEnvironment() { - X86SSELevel = NoMMXSSE; + X86SSELevel = NoSSE; X863DNowLevel = NoThreeDNow; HasCMov = false; HasX86_64 = false; HasPOPCNT = false; HasSSE4A = false; HasAES = false; + HasFXSR = false; + HasXSAVE = false; + HasXSAVEOPT = false; + HasXSAVEC = false; + HasXSAVES = false; HasPCLMUL = false; HasFMA = false; HasFMA4 = false; @@ -252,13 +270,15 @@ void X86Subtarget::initializeEnvironment() { HasBWI = false; HasVLX = false; HasADX = false; + HasPKU = false; HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasLAHFSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; - IsUAMemFast = false; + IsUAMem16Slow = false; IsUAMem32Slow = false; HasSSEUnalignedMem = false; HasCmpxchg16b = false; diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index f026d42..13d1026 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -47,11 +47,11 @@ class X86Subtarget final : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F }; enum X863DNowEnum { - NoThreeDNow, ThreeDNow, ThreeDNowA + NoThreeDNow, MMX, ThreeDNow, ThreeDNowA }; enum X86ProcFamilyEnum { @@ -64,10 +64,10 @@ protected: /// Which PIC style to use PICStyles::Style PICStyle; - /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. + /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. X86SSEEnum X86SSELevel; - /// 3DNow, 3DNow Athlon, or none supported. + /// MMX, 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel; /// True if this processor has conditional move instructions @@ -86,6 +86,18 @@ protected: /// Target has AES instructions bool HasAES; + /// Target has FXSAVE/FXRESTOR instructions + bool HasFXSR; + + /// Target has XSAVE instructions + bool HasXSAVE; + /// Target has XSAVEOPT instructions + bool HasXSAVEOPT; + /// Target has XSAVEC instructions + bool HasXSAVEC; + /// Target has XSAVES instructions + bool HasXSAVES; + /// Target has carry-less multiplication bool HasPCLMUL; @@ -140,16 +152,19 @@ protected: /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; /// True if SHLD instructions are slow. bool IsSHLDSlow; - /// True if unaligned memory access is fast. - bool IsUAMemFast; + /// True if unaligned memory accesses of 16-bytes are slow. + bool IsUAMem16Slow; - /// True if unaligned 32-byte memory accesses are slow. + /// True if unaligned memory accesses of 32-bytes are slow. bool IsUAMem32Slow; /// True if SSE operations can have unaligned memory operands. 
@@ -208,6 +223,9 @@ protected: /// Processor has AVX-512 Vector Length eXtenstions bool HasVLX; + /// Processor has PKU extenstions + bool HasPKU; + /// Processot supports MPX - Memory Protection Extensions bool HasMPX; @@ -319,7 +337,6 @@ public: void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } bool hasCMov() const { return HasCMov; } - bool hasMMX() const { return X86SSELevel >= MMX; } bool hasSSE1() const { return X86SSELevel >= SSE1; } bool hasSSE2() const { return X86SSELevel >= SSE2; } bool hasSSE3() const { return X86SSELevel >= SSE3; } @@ -332,14 +349,22 @@ public: bool hasFp256() const { return hasAVX(); } bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } + bool hasMMX() const { return X863DNowLevel >= MMX; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } bool hasAES() const { return HasAES; } + bool hasFXSR() const { return HasFXSR; } + bool hasXSAVE() const { return HasXSAVE; } + bool hasXSAVEOPT() const { return HasXSAVEOPT; } + bool hasXSAVEC() const { return HasXSAVEC; } + bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } - bool hasFMA() const { return HasFMA; } - // FIXME: Favor FMA when both are enabled. Is this the right thing to do? - bool hasFMA4() const { return HasFMA4 && !HasFMA; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. + bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } @@ -355,9 +380,10 @@ public: bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } - bool isUnalignedMemAccessFast() const { return IsUAMemFast; } + bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } @@ -375,6 +401,7 @@ public: bool hasDQI() const { return HasDQI; } bool hasBWI() const { return HasBWI; } bool hasVLX() const { return HasVLX; } + bool hasPKU() const { return HasPKU; } bool hasMPX() const { return HasMPX; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -394,9 +421,11 @@ public: bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetWindowsMSVC() const { return TargetTriple.isWindowsMSVCEnvironment(); @@ -406,6 +435,10 @@ public: return TargetTriple.isKnownWindowsMSVCEnvironment(); } + bool isTargetWindowsCoreCLR() const { + return TargetTriple.isWindowsCoreCLREnvironment(); + } + bool isTargetWindowsCygwin() const { 
return TargetTriple.isWindowsCygwinEnvironment(); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index fb9cb4b..0e7e4c0 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -28,10 +28,17 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +namespace llvm { +void initializeWinEHStatePassPass(PassRegistry &); +} + extern "C" void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeWinEHStatePassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -45,7 +52,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSBinFormatELF()) return make_unique<X86ELFTargetObjectFile>(); - if (TT.isKnownWindowsMSVCEnvironment()) + if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) return make_unique<X86WindowsTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) return make_unique<TargetLoweringObjectFileCOFF>(); @@ -175,8 +182,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, //===----------------------------------------------------------------------===// TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(X86TTIImpl(this, F)); + }); } @@ -246,6 +254,9 @@ bool X86PassConfig::addPreISel() { } void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + addPass(createX86CallFrameOptimization()); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 6f900ea..782768d 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetLowering.h" @@ -152,9 +153,8 @@ static std::string scalarConstantToHexString(const Constant *C) { } } -MCSection * -X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *X86WindowsTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst() && C) { const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | @@ -171,5 +171,5 @@ X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, COFF::IMAGE_COMDAT_SELECT_ANY); } - return TargetLoweringObjectFile::getSectionForConstant(Kind, C); + return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index 66366b2..6b2448c 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -58,7 +58,7 @@ namespace llvm { /// \brief Given a 
mergeable constant with the specified size and relocation /// information, return a section that it should be placed in. - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7df7260..2e7bbb2 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" + using namespace llvm; #define DEBUG_TYPE "x86tti" @@ -62,8 +63,8 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (ST->is64Bit()) return 64; - return 32; + return 32; } unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { @@ -84,12 +85,12 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned X86TTIImpl::getArithmeticInstrCost( +int X86TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -101,10 +102,9 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence SRA + SRL + ADD + SRA. // The OperandValue properties many not be same as that of previous // operation;conservatively assume OP_None. - unsigned Cost = - 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -115,8 +115,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return Cost; } - static const CostTblEntry<MVT::SimpleValueType> - AVX2UniformConstCostTable[] = { + static const CostTblEntry AVX2UniformConstCostTable[] = { { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. 
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -127,12 +126,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = { + static const CostTblEntry AVX512CostTable[] = { { ISD::SHL, MVT::v16i32, 1 }, { ISD::SRL, MVT::v16i32, 1 }, { ISD::SRA, MVT::v16i32, 1 }, @@ -141,7 +140,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v8i64, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = { + if (ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. { ISD::SHL, MVT::v4i32, 1 }, @@ -154,7 +158,57 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 1 }, { ISD::SHL, MVT::v4i64, 1 }, { ISD::SRL, MVT::v4i64, 1 }, + }; + + // Look for AVX2 lowering tricks. + if (ST->hasAVX2()) { + if (ISD == ISD::SHL && LT.second == MVT::v16i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX2, a packed v16i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return LT.first; + + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry XOPCostTable[] = { + // 128bit shifts take 1cy, but right shifts require negation beforehand. + { ISD::SHL, MVT::v16i8, 1 }, + { ISD::SRL, MVT::v16i8, 2 }, + { ISD::SRA, MVT::v16i8, 2 }, + { ISD::SHL, MVT::v8i16, 1 }, + { ISD::SRL, MVT::v8i16, 2 }, + { ISD::SRA, MVT::v8i16, 2 }, + { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 2 }, + { ISD::SRA, MVT::v4i32, 2 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 2 }, + { ISD::SRA, MVT::v2i64, 2 }, + // 256bit shifts require splitting if AVX2 didn't catch them above. + { ISD::SHL, MVT::v32i8, 2 }, + { ISD::SRL, MVT::v32i8, 4 }, + { ISD::SRA, MVT::v32i8, 4 }, + { ISD::SHL, MVT::v16i16, 2 }, + { ISD::SRL, MVT::v16i16, 4 }, + { ISD::SRA, MVT::v16i16, 4 }, + { ISD::SHL, MVT::v8i32, 2 }, + { ISD::SRL, MVT::v8i32, 4 }, + { ISD::SRA, MVT::v8i32, 4 }, + { ISD::SHL, MVT::v4i64, 2 }, + { ISD::SRL, MVT::v4i64, 4 }, + { ISD::SRA, MVT::v4i64, 4 }, + }; + // Look for XOP lowering tricks. + if (ST->hasXOP()) { + if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CustomCostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. @@ -163,7 +217,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. // Vectorizing division is a bad idea. 
See the SSE2 table for more comments. { ISD::SDIV, MVT::v32i8, 32*20 }, @@ -176,44 +231,44 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::UDIV, MVT::v4i64, 4*20 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX512CostTable[Idx].Cost; - } - // Look for AVX2 lowering tricks. + // Look for AVX2 lowering tricks for custom cases. if (ST->hasAVX2()) { - if (ISD == ISD::SHL && LT.second == MVT::v16i16 && - (Op2Info == TargetTransformInfo::OK_UniformConstantValue || - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) - // On AVX2, a packed v16i16 shift left by a constant build_vector - // is lowered into a vector multiply (vpmullw). - return LT.first; - - int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> + static const CostTblEntry SSE2UniformConstCostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // Constant splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i8, 1 }, // psllw. + { ISD::SHL, MVT::v32i8, 2 }, // psllw. { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v16i16, 2 }, // psllw. { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v8i32, 2 }, // pslld { ISD::SHL, MVT::v2i64, 1 }, // psllq. + { ISD::SHL, MVT::v4i64, 2 }, // psllq. { ISD::SRL, MVT::v16i8, 1 }, // psrlw. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw. { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v16i16, 2 }, // psrlw. { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v8i32, 2 }, // psrld. { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + { ISD::SRL, MVT::v4i64, 2 }, // psrlq. { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v16i16, 2 }, // psraw. { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence @@ -227,27 +282,34 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } if (ISD == ISD::SHL && Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { - EVT VT = LT.second; + MVT VT = LT.second; + // Vector shift left by non uniform constant can be lowered + // into vector multiply (pmullw/pmulld). if ((VT == MVT::v8i16 && ST->hasSSE2()) || (VT == MVT::v4i32 && ST->hasSSE41())) - // Vector shift left by non uniform constant can be lowered - // into vector multiply (pmullw/pmulld). return LT.first; + + // v16i16 and v8i32 shifts by non-uniform constants are lowered into a + // sequence of extract + two vector multiply + insert. 
+ if ((VT == MVT::v8i32 || VT == MVT::v16i16) && + (ST->hasAVX() && !ST->hasAVX2())) + ISD = ISD::MUL; + + // A vector shift left by non uniform constant is converted + // into a vector multiply; the new multiply is eventually + // lowered into a sequence of shuffles and 2 x pmuludq. if (VT == MVT::v4i32 && ST->hasSSE2()) - // A vector shift left by non uniform constant is converted - // into a vector multiply; the new multiply is eventually - // lowered into a sequence of shuffles and 2 x pmuludq. ISD = ISD::MUL; } - static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = { + static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // For some cases, where the shift amount is a scalar we would be able @@ -257,20 +319,31 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // used for vectorization and we don't want to make vectorized code worse // than scalar code. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. - { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. // It is not a good idea to vectorize division. We have to scalarize it and // in the process we will often end up having to spilling regular @@ -289,12 +362,11 @@ unsigned X86TTIImpl::getArithmeticInstrCost( }; if (ST->hasSSE2()) { - int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = { + static const CostTblEntry AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. @@ -314,29 +386,21 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // Look for AVX1 lowering tricks. 
if (ST->hasAVX() && !ST->hasAVX2()) { - EVT VT = LT.second; + MVT VT = LT.second; - // v16i16 and v8i32 shifts by non-uniform constants are lowered into a - // sequence of extract + two vector multiply + insert. - if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) && - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) - ISD = ISD::MUL; - - int Idx = CostTableLookup(AVX1CostTable, ISD, VT); - if (Idx != -1) - return LT.first * AVX1CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + return LT.first * Entry->Cost; } // Custom lowering of vectors. - static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = { + static const CostTblEntry CustomLowered[] = { // A v2i64/v4i64 and multiply is custom lowered as a series of long // multiplies(3), shifts(4) and adds(2). { ISD::MUL, MVT::v2i64, 9 }, { ISD::MUL, MVT::v4i64, 9 }, }; - int Idx = CostTableLookup(CustomLowered, ISD, LT.second); - if (Idx != -1) - return LT.first * CustomLowered[Idx].Cost; + if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) + return LT.first * Entry->Cost; // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, // 2x pmuludq, 2x shuffle. @@ -348,15 +412,15 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } -unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only estimate the cost of reverse and alternate shuffles. if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - unsigned Cost = 1; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + int Cost = 1; if (LT.second.getSizeInBits() > 128) Cost = 3; // Extract + insert + copy. @@ -367,14 +431,14 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // The backend knows how to generate a single VEX.256 version of // instruction VPBLENDW if the target supports AVX2. if (ST->hasAVX2() && LT.second == MVT::v16i16) return LT.first; - static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = { + static const CostTblEntry AVXAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd @@ -390,13 +454,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} }; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * AVXAltShuffleTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = { + static const CostTblEntry SSE41AltShuffleTbl[] = { // These are lowered into movsd. 
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, @@ -414,13 +477,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} }; - if (ST->hasSSE41()) { - int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSE41AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = { + static const CostTblEntry SSSE3AltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -433,13 +495,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or }; - if (ST->hasSSSE3()) { - int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSSE3AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = { + static const CostTblEntry SSEAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -454,65 +515,47 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, }; // Fall-back (SSE3 and SSE2). - int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSEAltShuffleTbl[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - SSE2ConvTbl[] = { - // These are somewhat magic numbers justified by looking at the output of - // Intel's IACA, running some kernels and making sure when we take - // legalization into account the throughput will be overestimated. - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - // There are faster sequences for float conversions. 
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, }; - if (ST->hasSSE2() && !ST->hasAVX()) { - int Idx = - ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second); - if (Idx != -1) - return LTSrc.first * SSE2ConvTbl[Idx].Cost; - } - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX512ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, - { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, - { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 }, // v16i1 -> v16i32 - load + broadcast { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, @@ -522,33 +565,49 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, - }; - if (ST->hasAVX512()) { - int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second, - LTSrc.second); - if (Idx != -1) - return AVX512ConversionTbl[Idx].Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); - EVT DstTy = TLI->getValueType(DL, Dst); - - // The function getSimpleVT only handles simple value types. 
- if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX2ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, @@ -579,8 +638,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVXConversionTbl[] = { + static const TypeConversionCostTblEntry AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, @@ -650,34 +708,158 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, }; + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { 
ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { + // These are somewhat magic numbers justified by looking at the output of + // Intel's IACA, running some kernels and making sure when we take + // legalization into account the throughput will be overestimated. + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + // There are faster sequences for float conversions. + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, + }; + + std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); + + if (ST->hasSSE2() && !ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + LTDest.second, LTSrc.second)) + return LTSrc.first * Entry->Cost; + } + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + // The function getSimpleVT only handles simple value types. 
+ if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + if (ST->hasAVX2()) { - int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return AVX2ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } if (ST->hasAVX()) { - int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return AVXConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = { + static const CostTblEntry SSE42CostTbl[] = { { ISD::SETCC, MVT::v2f64, 1 }, { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, @@ -686,7 +868,7 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = { + static const CostTblEntry AVX1CostTbl[] = { { ISD::SETCC, MVT::v4f64, 1 }, { ISD::SETCC, MVT::v8f32, 1 }, // AVX1 does not support 8-wide integer compare. 
@@ -696,54 +878,45 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v32i8, 4 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = { + static const CostTblEntry AVX2CostTbl[] = { { ISD::SETCC, MVT::v4i64, 1 }, { ISD::SETCC, MVT::v8i32, 1 }, { ISD::SETCC, MVT::v16i16, 1 }, { ISD::SETCC, MVT::v32i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = { + static const CostTblEntry AVX512CostTbl[] = { { ISD::SETCC, MVT::v8i64, 1 }, { ISD::SETCC, MVT::v16i32, 1 }, { ISD::SETCC, MVT::v8f64, 1 }, { ISD::SETCC, MVT::v16f32, 1 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX512CostTbl[Idx].Cost; - } + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX2CostTbl[Idx].Cost; - } + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTbl[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -761,10 +934,9 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, - bool Extract) { +int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { assert (Ty->isVectorTy() && "Can only scalarize vectors"); - unsigned Cost = 0; + int Cost = 0; for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { if (Insert) @@ -776,9 +948,8 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, return Cost; } -unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -796,22 +967,21 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume that all other non-power-of-two numbers are scalarized. 
if (!isPowerOf2_32(NumElem)) { - unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), - Alignment, AddressSpace); - unsigned SplitCost = getScalarizationOverhead(Src, - Opcode == Instruction::Load, - Opcode==Instruction::Store); + int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, + AddressSpace); + int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, + Opcode == Instruction::Store); return NumElem * Cost + SplitCost; } } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); // Each load/store unit costs 1. - unsigned Cost = LT.first * 1; + int Cost = LT.first * 1; // On Sandybridge 256bit load/stores are double pumped // (but not on Haswell). @@ -821,9 +991,9 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return Cost; } -unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask @@ -832,34 +1002,33 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem); - if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) || - (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) || + if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { // Scalarization - unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); - unsigned ScalarCompareCost = - getCmpSelInstrCost(Instruction::ICmp, - Type::getInt8Ty(getGlobalContext()), NULL); - unsigned BranchCost = getCFInstrCost(Instruction::Br); - unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - - unsigned ValueSplitCost = - getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load, - Opcode == Instruction::Store); - unsigned MemopCost = + int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = getCmpSelInstrCost( + Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + + int ValueSplitCost = getScalarizationOverhead( + SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); - unsigned Cost = 0; - if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() && + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + auto VT = TLI->getValueType(DL, SrcVTy); + int Cost = 0; + if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. 
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) + - getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0); + Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); else if (LT.second.getVectorNumElements() > NumElem) { VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), @@ -874,7 +1043,7 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -887,10 +1056,10 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return BaseT::getAddressComputationCost(Ty, IsComplex); } -unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwise) { +int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -900,7 +1069,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = { + static const CostTblEntry SSE42CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -908,7 +1077,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = { + static const CostTblEntry AVX1CostTblPairWise[] = { { ISD::FADD, MVT::v4f32, 4 }, { ISD::FADD, MVT::v4f64, 5 }, { ISD::FADD, MVT::v8f32, 7 }, @@ -919,7 +1088,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i32, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE42CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -927,7 +1096,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". 
}; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = { + static const CostTblEntry AVX1CostTblNoPairWise[] = { { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v4f64, 3 }, { ISD::FADD, MVT::v8f32, 4 }, @@ -939,29 +1108,21 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, }; if (IsPairwise) { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } else { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblNoPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblNoPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); @@ -970,7 +1131,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned X86TTIImpl::getIntImmCost(int64_t Val) { +int X86TTIImpl::getIntImmCost(int64_t Val) { if (Val == 0) return TTI::TCC_Free; @@ -980,7 +1141,7 @@ unsigned X86TTIImpl::getIntImmCost(int64_t Val) { return 2 * TTI::TCC_Basic; } -unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1004,18 +1165,18 @@ unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialze the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1038,6 +1199,26 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::Store: ImmIdx = 0; break; + case Instruction::ICmp: + // This is an imperfect hack to prevent constant hoisting of + // compares that might be trying to check if a 64-bit value fits in + // 32-bits. The backend can optimize these cases using a right shift by 32. + // Ideally we would check the compare predicate here. 
There also other + // similar immediates the backend can use shifts for. + if (Idx == 1 && Imm.getBitWidth() == 64) { + uint64_t ImmVal = Imm.getZExtValue(); + if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) + return TTI::TCC_Free; + } + ImmIdx = 1; + break; + case Instruction::And: + // We support 64-bit ANDs with immediates with 32-bits of leading zeroes + // by using a 32-bit operation with implicit zero extension. Detect such + // immediates here as the normal path expects bit 31 to be sign extended. + if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) + return TTI::TCC_Free; + // Fallthrough case Instruction::Add: case Instruction::Sub: case Instruction::Mul: @@ -1045,10 +1226,8 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::ICmp: ImmIdx = 1; break; // Always return TCC_Free for the shift value of a shift instruction. @@ -1073,18 +1252,18 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return X86TTIImpl::getIntImmCost(Imm, Ty); } -unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1118,23 +1297,181 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return X86TTIImpl::getIntImmCost(Imm, Ty); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { - int DataWidth = DataTy->getPrimitiveSizeInBits(); +// Return an average cost of Gather / Scatter instruction, maybe improved later +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace) { + + assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); + unsigned VF = SrcVTy->getVectorNumElements(); + + // Try to reduce index size from 64 bit (default for GEP) + // to 32. It is essential for VF 16. If the index can't be reduced to 32, the + // operation will use 16 x 64 indices which do not fit in a zmm and needs + // to split. Also check that the base pointer is the same for all lanes, + // and that there's at most one variable index. 
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + unsigned IndexSize = DL.getPointerSizeInBits(); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (IndexSize < 64 || !GEP) + return IndexSize; + + unsigned NumOfVarIndices = 0; + Value *Ptrs = GEP->getPointerOperand(); + if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) + return IndexSize; + for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { + if (isa<Constant>(GEP->getOperand(i))) + continue; + Type *IndxTy = GEP->getOperand(i)->getType(); + if (IndxTy->isVectorTy()) + IndxTy = IndxTy->getVectorElementType(); + if ((IndxTy->getPrimitiveSizeInBits() == 64 && + !isa<SExtInst>(GEP->getOperand(i))) || + ++NumOfVarIndices > 1) + return IndexSize; // 64 + } + return (unsigned)32; + }; + + + // Trying to reduce IndexSize to 32 bits for vector 16. + // By default the IndexSize is equal to pointer size. + unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + DL.getPointerSizeInBits(); + + Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(), + IndexSize), VF); + std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); + int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + if (SplitFactor > 1) { + // Handle splitting of vector of pointers + Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, + AddressSpace); + } + + // The gather / scatter cost is given by Intel architects. It is a rough + // number since we are looking at one instruction in a time. + const int GSOverhead = 2; + return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); +} + +/// Return the cost of full scalarization of gather / scatter operation. +/// +/// Opcode - Load or Store instruction. +/// SrcVTy - The type of the data vector that should be gathered or scattered. +/// VariableMask - The mask is non-constant at compile time. +/// Alignment - Alignment for one element. +/// AddressSpace - pointer[s] address space. +/// +int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, unsigned Alignment, + unsigned AddressSpace) { + unsigned VF = SrcVTy->getVectorNumElements(); + + int MaskUnpackCost = 0; + if (VariableMask) { + VectorType *MaskTy = + VectorType::get(Type::getInt1Ty(getGlobalContext()), VF); + MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()), + nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); + } - // Todo: AVX512 allows gather/scatter, works with strided and random as well - if ((DataWidth < 32) || (Consecutive == 0)) + // The cost of the scalar loads/stores. 
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ Alignment, AddressSpace);
+
+ int InsertExtractCost = 0;
+ if (Opcode == Instruction::Load)
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of inserting each scalar load into the vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+ else
+ for (unsigned i = 0; i < VF; ++i)
+ // Add the cost of extracting each element out of the data vector
+ InsertExtractCost +=
+ getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+ return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of a Gather / Scatter operation.
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+ Value *Ptr, bool VariableMask,
+ unsigned Alignment) {
+ assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+ unsigned VF = SrcVTy->getVectorNumElements();
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+ if (!PtrTy && Ptr->getType()->isVectorTy())
+ PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ assert(PtrTy && "Unexpected type for Ptr argument");
+ unsigned AddressSpace = PtrTy->getAddressSpace();
+
+ bool Scalarize = false;
+ if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+ (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+ Scalarize = true;
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
+ // A vector-4 gather/scatter instruction does not exist on KNL. We could
+ // extend it to 8 elements, but zeroing the upper bits of the mask vector
+ // would add more instructions. Right now we give the scalar cost of
+ // vector-4 for KNL. TODO: Check whether the gather/scatter instruction is
+ // better in the VariableMask case.
+ if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ Scalarize = true;
+
+ if (Scalarize)
+ return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
+
+ return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+ return (DataWidth >= 32 && ST->hasAVX2());
+}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
+ return isLegalMaskedLoad(DataType);
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+ // This function is currently called in two cases: from the Loop Vectorizer
+ // and from the Scalarizer.
+ // When the Loop Vectorizer asks about legality of the feature,
+ // the vectorization factor is not calculated yet. The Loop Vectorizer
+ // sends a scalar type and the decision is based on the width of the
+ // scalar element.
+ // Later on, the cost model will estimate usage of this intrinsic based on
+ // the vector type.
+ // The Scalarizer asks again about legality. It sends a vector type.
+ // In this case we can reject non-power-of-2 vectors.
+ if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
return false;
- if (ST->hasAVX512() || ST->hasAVX2())
- return true;
- return false;
+ Type *ScalarTy = DataTy->getScalarType();
+ int DataWidth = isa<PointerType>(ScalarTy) ?
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + // AVX-512 allows gather and scatter + return DataWidth >= 32 && ST->hasAVX512(); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) { - return isLegalMaskedLoad(DataType, Consecutive); +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + return isLegalMaskedGather(DataType); } -bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const { +bool X86TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); // Work this as a subsetting of subtarget features. diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h index da3f36c..adb745e 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -33,13 +33,13 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { const X86Subtarget *ST; const X86TargetLowering *TLI; - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); const X86Subtarget *getST() const { return ST; } const X86TargetLowering *getTLI() const { return TLI; } public: - explicit X86TTIImpl(const X86TargetMachine *TM, Function &F) + explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -62,38 +62,44 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - - unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex); - - unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); - - unsigned getIntImmCost(int64_t); - - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); - bool isLegalMaskedLoad(Type *DataType, int Consecutive); - bool isLegalMaskedStore(Type *DataType, int Consecutive); - bool hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const; + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + 
unsigned AddressSpace); + int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment); + int getAddressComputationCost(Type *PtrTy, bool IsComplex); + + int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + int getIntImmCost(int64_t); + + int getIntImmCost(const APInt &Imm, Type *Ty); + + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + bool isLegalMaskedLoad(Type *DataType); + bool isLegalMaskedStore(Type *DataType); + bool isLegalMaskedGather(Type *DataType); + bool isLegalMaskedScatter(Type *DataType); + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; +private: + int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, + unsigned Alignment, unsigned AddressSpace); + int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp index 9190d0b..dce94a9 100644 --- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp @@ -15,7 +15,8 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" @@ -38,12 +39,16 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "winehstate" +namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); } + namespace { class WinEHStatePass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - WinEHStatePass() : FunctionPass(ID) {} + WinEHStatePass() : FunctionPass(ID) { + initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &Fn) override; @@ -62,18 +67,13 @@ private: void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler); void unlinkExceptionRegistration(IRBuilder<> &Builder); - void addCXXStateStores(Function &F, MachineModuleInfo &MMI); - void addSEHStateStores(Function &F, MachineModuleInfo &MMI); - void addCXXStateStoresToFunclet(Value *ParentRegNode, WinEHFuncInfo &FuncInfo, - Function &F, int BaseState); + void addStateStores(Function &F, WinEHFuncInfo &FuncInfo); void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State); Value *emitEHLSDA(IRBuilder<> &Builder, Function *F); Function *generateLSDAInEAXThunk(Function *ParentFunc); - int escapeRegNode(Function &F); - // Module-level type getters. 
Type *getEHLinkRegistrationType();
Type *getSEHRegistrationType();
@@ -111,6 +111,9 @@ FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
char WinEHStatePass::ID = 0;
+INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
+ "Insert stores for EH state numbers", false, false)
+
bool WinEHStatePass::doInitialization(Module &M) {
TheModule = &M;
FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape);
@@ -138,14 +141,7 @@ void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
}
bool WinEHStatePass::runOnFunction(Function &F) {
- // If this is an outlined handler, don't do anything. We'll do state insertion
- // for it in the parent.
- StringRef WinEHParentName =
- F.getFnAttribute("wineh-parent").getValueAsString();
- if (WinEHParentName != F.getName() && !WinEHParentName.empty())
- return false;
-
- // Check the personality. Do nothing if this is not an MSVC personality.
+ // Check the personality. Do nothing if this personality doesn't use funclets.
if (!F.hasPersonalityFn())
return false;
PersonalityFn =
@@ -153,7 +149,19 @@ bool WinEHStatePass::runOnFunction(Function &F) {
if (!PersonalityFn)
return false;
Personality = classifyEHPersonality(PersonalityFn);
- if (!isMSVCEHPersonality(Personality))
+ if (!isFuncletEHPersonality(Personality))
+ return false;
+
+ // Skip this function if there are no EH pads and we aren't using IR-level
+ // outlining.
+ bool HasPads = false;
+ for (BasicBlock &BB : F) {
+ if (BB.isEHPad()) {
+ HasPads = true;
+ break;
+ }
+ }
+ if (!HasPads)
return false;
// Disable frame pointer elimination in this function.
@@ -163,14 +171,13 @@ bool WinEHStatePass::runOnFunction(Function &F) {
emitExceptionRegistrationRecord(&F);
- auto *MMIPtr = getAnalysisIfAvailable<MachineModuleInfo>();
- assert(MMIPtr && "MachineModuleInfo should always be available");
- MachineModuleInfo &MMI = *MMIPtr;
- switch (Personality) {
- default: llvm_unreachable("unexpected personality function");
- case EHPersonality::MSVC_CXX: addCXXStateStores(F, MMI); break;
- case EHPersonality::MSVC_X86SEH: addSEHStateStores(F, MMI); break;
- }
+ // The state numbers calculated here in IR must agree with what we calculate
+ // later on for the MachineFunction. In particular, if an IR pass deletes an
+ // unreachable EH pad after this point but before machine CFG construction,
+ // we will be in trouble. If this assumption is ever broken, we should turn
+ // the numbers into an immutable analysis pass.
+ WinEHFuncInfo FuncInfo;
+ addStateStores(F, FuncInfo);
// Reset per-function state.
PersonalityFn = nullptr;
@@ -261,7 +268,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -1
StateFieldIndex = 2;
- insertStateNumberStore(RegNode, Builder.GetInsertPoint(), -1);
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1);
// Handler = __ehhandler$F
Function *Trampoline = generateLSDAInEAXThunk(F);
Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
@@ -278,7 +285,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
// TryLevel = -2 / -1
StateFieldIndex = 4;
- insertStateNumberStore(RegNode, Builder.GetInsertPoint(),
+ insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(),
UseStackGuard ?
-2 : -1); // ScopeTable = llvm.x86.seh.lsda(F) Value *FI8 = Builder.CreateBitCast(F, Int8PtrType); @@ -347,7 +354,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { Value *CastPersonality = Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); auto AI = Trampoline->arg_begin(); - Value *Args[5] = {LSDA, AI++, AI++, AI++, AI++}; + Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; CallInst *Call = Builder.CreateCall(CastPersonality, Args); // Can't use musttail due to prototype mismatch, but we can use tail. Call->setTailCall(true); @@ -391,160 +398,53 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { Builder.CreateStore(Next, FSZero); } -void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - calculateWinCXXEHStateNumbers(&F, FuncInfo); - - // The base state for the parent is -1. - addCXXStateStoresToFunclet(RegNode, FuncInfo, F, -1); - - // Set up RegNodeEscapeIndex - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Only insert stores in catch handlers. - Constant *FI8 = - ConstantExpr::getBitCast(&F, Type::getInt8PtrTy(TheModule->getContext())); - for (auto P : FuncInfo.HandlerBaseState) { - Function *Handler = const_cast<Function *>(P.first); - int BaseState = P.second; - IRBuilder<> Builder(&Handler->getEntryBlock(), - Handler->getEntryBlock().begin()); - // FIXME: Find and reuse such a call if present. - Value *ParentFP = Builder.CreateCall(FrameAddress, {Builder.getInt32(1)}); - Value *RecoveredRegNode = Builder.CreateCall( - FrameRecover, {FI8, ParentFP, Builder.getInt32(RegNodeEscapeIndex)}); - RecoveredRegNode = - Builder.CreateBitCast(RecoveredRegNode, RegNodeTy->getPointerTo(0)); - addCXXStateStoresToFunclet(RecoveredRegNode, FuncInfo, *Handler, BaseState); - } -} - -/// Escape RegNode so that we can access it from child handlers. Find the call -/// to localescape, if any, in the entry block and append RegNode to the list -/// of arguments. -int WinEHStatePass::escapeRegNode(Function &F) { - // Find the call to localescape and extract its arguments. - IntrinsicInst *EscapeCall = nullptr; - for (Instruction &I : F.getEntryBlock()) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::localescape) { - EscapeCall = II; - break; - } - } - SmallVector<Value *, 8> Args; - if (EscapeCall) { - auto Ops = EscapeCall->arg_operands(); - Args.append(Ops.begin(), Ops.end()); - } - Args.push_back(RegNode); - - // Replace the call (if it exists) with new one. Otherwise, insert at the end - // of the entry block. - Instruction *InsertPt = EscapeCall; - if (!EscapeCall) - InsertPt = F.getEntryBlock().getTerminator(); - IRBuilder<> Builder(&F.getEntryBlock(), InsertPt); - Builder.CreateCall(FrameEscape, Args); - if (EscapeCall) - EscapeCall->eraseFromParent(); - return Args.size() - 1; -} +void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { + // Mark the registration node. The backend needs to know which alloca it is so + // that it can recover the original frame pointer. + IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator())); + Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy()); + Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); + + // Calculate state numbers. 
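(A rough illustration of what addStateStores emits, with assumed state numbers: before every may-throw call or invoke, the current state is stored into the registration node's TryLevel slot, conceptually

  RegNode->TryLevel = 0;    // about to execute an invoke in state 0
  invoke f();
  RegNode->TryLevel = -1;   // back in the parent function's base state
  call g();

so the runtime can always read the active state out of the in-memory node.)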
+ if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&F, FuncInfo); + else + calculateWinCXXEHStateNumbers(&F, FuncInfo); -void WinEHStatePass::addCXXStateStoresToFunclet(Value *ParentRegNode, - WinEHFuncInfo &FuncInfo, - Function &F, int BaseState) { // Iterate all the instructions and emit state number stores. + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F); for (BasicBlock &BB : F) { + // Figure out what state we should assign calls in this block. + int BaseState = -1; + auto &BBColors = BlockColors[&BB]; + + assert(BBColors.size() == 1 && + "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + if (auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; + } + for (Instruction &I : BB) { if (auto *CI = dyn_cast<CallInst>(&I)) { // Possibly throwing call instructions have no actions to take after // an unwind. Ensure they are in the -1 state. if (CI->doesNotThrow()) continue; - insertStateNumberStore(ParentRegNode, CI, BaseState); + insertStateNumberStore(RegNode, CI, BaseState); } else if (auto *II = dyn_cast<InvokeInst>(&I)) { // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - // FIXME: Why does this assertion fail? - //assert(FuncInfo.LandingPadStateMap.count(LPI) && "LP has no state!"); - int State = FuncInfo.LandingPadStateMap[LPI]; - insertStateNumberStore(ParentRegNode, II, State); - } - } - } -} - -/// Assign every distinct landingpad a unique state number for SEH. Unlike C++ -/// EH, we can use this very simple algorithm while C++ EH cannot because catch -/// handlers aren't outlined and the runtime doesn't have to figure out which -/// catch handler frame to unwind to. -/// FIXME: __finally blocks are outlined, so this approach may break down there. -void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - - // Remember and return the index that we used. We save it in WinEHFuncInfo so - // that we can lower llvm.x86.seh.recoverfp later in filter functions without - // too much trouble. - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Iterate all the instructions and emit state number stores. - int CurState = 0; - SmallPtrSet<BasicBlock *, 4> ExceptBlocks; - for (BasicBlock &BB : F) { - for (auto I = BB.begin(), E = BB.end(); I != E; ++I) { - if (auto *CI = dyn_cast<CallInst>(I)) { - auto *Intrin = dyn_cast<IntrinsicInst>(CI); - if (Intrin) { - // Calls that "don't throw" are considered to be able to throw asynch - // exceptions, but intrinsics cannot. - continue; - } - insertStateNumberStore(RegNode, CI, -1); - } else if (auto *II = dyn_cast<InvokeInst>(I)) { - // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - auto InsertionPair = - FuncInfo.LandingPadStateMap.insert(std::make_pair(LPI, CurState)); - auto Iter = InsertionPair.first; - int &State = Iter->second; - bool Inserted = InsertionPair.second; - if (Inserted) { - // Each action consumes a state number. 
- auto *EHActions = cast<IntrinsicInst>(LPI->getNextNode()); - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(EHActions, ActionList); - assert(!ActionList.empty()); - CurState += ActionList.size(); - State += ActionList.size() - 1; - - // Remember all the __except block targets. - for (auto &Handler : ActionList) { - if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) { - auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc()); -#ifndef NDEBUG - for (BasicBlock *Pred : predecessors(BA->getBasicBlock())) - assert(Pred->isLandingPad() && - "WinEHPrepare failed to split block"); -#endif - ExceptBlocks.insert(BA->getBasicBlock()); - } - } - } + assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!"); + int State = FuncInfo.InvokeStateMap[II]; insertStateNumberStore(RegNode, II, State); } } } - - // Insert llvm.x86.seh.restoreframe() into each __except block. - Function *RestoreFrame = - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_restoreframe); - for (BasicBlock *ExceptBB : ExceptBlocks) { - IRBuilder<> Builder(ExceptBB->begin()); - Builder.CreateCall(RestoreFrame, {}); - } } void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode, diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 2e44ac9..aaf267a 100644 --- a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -224,7 +224,7 @@ static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (Val > 11) return MCDisassembler::Fail; - static unsigned Values[] = { + static const unsigned Values[] = { 32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32 }; Inst.addOperand(MCOperand::createImm(Values[Val])); diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h index 6fd2dec..dc513f7 100644 --- a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h +++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h @@ -19,8 +19,6 @@ namespace llvm { -class TargetMachine; - class XCoreInstPrinter : public MCInstPrinter { public: XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 702056d..b00cdd5 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -115,14 +115,14 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { EmitSpecialLLVMGlobal(GV)) return; - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); OutStreamer->SwitchSection( getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); MCSymbol *GVSym = getSymbol(GV); const Constant *C = GV->getInitializer(); - unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType()); - + unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType()); + // Mark the start of the global getTargetStreamer().emitCCTopData(GVSym->getName()); @@ -154,15 +154,15 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GV->isThreadLocal()) { report_fatal_error("TLS is not supported by this target!"); } - unsigned Size = TD->getTypeAllocSize(C->getType()); + unsigned Size = DL.getTypeAllocSize(C->getType()); if (MAI->hasDotTypeDotSizeDirective()) { 
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject); OutStreamer->emitELFSize(cast<MCSymbolELF>(GVSym), MCConstantExpr::create(Size, OutContext)); } OutStreamer->EmitLabel(GVSym); - - EmitGlobalConstant(C); + + EmitGlobalConstant(DL, C); // The ABI requires that unsigned scalar types smaller than 32 bits // are padded to 32 bits. if (Size < 4) @@ -208,7 +208,7 @@ printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O, void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: @@ -224,8 +224,8 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, getSymbol(MO.getGlobal())->print(O, MAI); break; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() - << '_' << MO.getIndex(); + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' + << MO.getIndex(); break; case MachineOperand::MO_BlockAddress: GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index 76c3d81..ae493de 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -160,27 +160,26 @@ static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList, /// As offsets are negative, the largest offsets will be first. static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList, MachineFrameInfo *MFI, XCoreFunctionInfo *XFI, + const Constant *PersonalityFn, const TargetLowering *TL) { assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots"); - const int* EHSlot = XFI->getEHSpillSlot(); - SpillList.push_back(StackSlotInfo(EHSlot[0], - MFI->getObjectOffset(EHSlot[0]), - TL->getExceptionPointerRegister())); - SpillList.push_back(StackSlotInfo(EHSlot[0], - MFI->getObjectOffset(EHSlot[1]), - TL->getExceptionSelectorRegister())); + const int *EHSlot = XFI->getEHSpillSlot(); + SpillList.push_back( + StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[0]), + TL->getExceptionPointerRegister(PersonalityFn))); + SpillList.push_back( + StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[1]), + TL->getExceptionSelectorRegister(PersonalityFn))); std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset); } - static MachineMemOperand * getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) { MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - flags, MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), flags, + MFI.getObjectSize(FrameIndex), MFI.getObjectAlignment(FrameIndex)); return MMO; } @@ -323,8 +322,11 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF, if (XFI->hasEHSpillSlot()) { // The unwinder requires stack slot & CFI offsets for the exception info. // We do not save/spill these registers. - SmallVector<StackSlotInfo,2> SpillList; - GetEHSpillList(SpillList, MFI, XFI, + const Function *Fn = MF.getFunction(); + const Constant *PersonalityFn = + Fn->hasPersonalityFn() ? 
Fn->getPersonalityFn() : nullptr; + SmallVector<StackSlotInfo, 2> SpillList; + GetEHSpillList(SpillList, MFI, XFI, PersonalityFn, MF.getSubtarget().getTargetLowering()); assert(SpillList.size()==2 && "Unexpected SpillList size"); EmitCfiOffset(MBB, MBBI, dl, TII, MMI, @@ -355,8 +357,12 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF, if (RetOpcode == XCore::EH_RETURN) { // 'Restore' the exception info the unwinder has placed into the stack // slots. - SmallVector<StackSlotInfo,2> SpillList; - GetEHSpillList(SpillList, MFI, XFI, MF.getSubtarget().getTargetLowering()); + const Function *Fn = MF.getFunction(); + const Constant *PersonalityFn = + Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr; + SmallVector<StackSlotInfo, 2> SpillList; + GetEHSpillList(SpillList, MFI, XFI, PersonalityFn, + MF.getSubtarget().getTargetLowering()); RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList); // Return to the landing pad. diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 9d4a966..9f61c84 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -151,8 +151,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { MVT::Other, CPIdx, CurDAG->getEntryNode()); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4); + MemOp[0] = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1); return node; } diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp index d62e742..105b2cf 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -79,9 +79,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM, // Compute derived properties from the register classes computeRegisterProperties(Subtarget.getRegisterInfo()); - // Division is expensive - setIntDivIsCheap(false); - setStackPointerRegisterToSaveRestore(XCore::SP); setSchedulingPreference(Sched::Source); @@ -154,8 +151,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM, // Exception handling setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); - setExceptionPointerRegister(XCore::R0); - setExceptionSelectorRegister(XCore::R1); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); // Atomic operations @@ -839,7 +834,7 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); return DAG.getLoad( getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN, - MachinePointerInfo::getFixedStack(FI), false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0); } SDValue XCoreTargetLowering:: @@ -1367,8 +1362,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, //from this parameter SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, + false, false, 0); } const ArgDataPair ADP = { ArgIn, Ins[i].Flags }; ArgData.push_back(ADP); @@ -1517,9 +1512,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // Create a SelectionDAG node corresponding to a 
store // to this memory location. SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); - MemOpChains.push_back(DAG.getStore(Chain, dl, OutVals[i], FIN, - MachinePointerInfo::getFixedStack(FI), false, false, - 0)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, OutVals[i], FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, 0)); } // Transform all store nodes into one single node because @@ -1567,8 +1563,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -1828,9 +1823,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, SDValue Chain = ST->getChain(); unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits(); - if (StoreBits % 8) { - break; - } + assert((StoreBits % 8) == 0 && + "Store size in bits must be a multiple of 8"); unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment( ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext())); unsigned Alignment = ST->getAlignment(); diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h index ddd675c..b6f09ff 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h +++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h @@ -125,6 +125,20 @@ namespace llvm { bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return XCore::R0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return XCore::R1; + } + private: const TargetMachine &TM; const XCoreSubtarget &Subtarget; diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index ee30344..e4129ae 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -368,11 +368,10 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); BuildMI(MBB, I, DL, get(XCore::STWFI)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FrameIndex) @@ -391,11 +390,10 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg) .addFrameIndex(FrameIndex) .addImm(0) diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 996c6f5..f0b7201 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -228,12 +228,9 @@ bool XCoreLowerThreadLocal::runOnModule(Module &M) { // Find thread local globals. 
bool MadeChange = false; SmallVector<GlobalVariable *, 16> ThreadLocalGlobals; - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ++GVI) { - GlobalVariable *GV = GVI; - if (GV->isThreadLocal()) - ThreadLocalGlobals.push_back(GV); - } + for (GlobalVariable &GV : M.globals()) + if (GV.isThreadLocal()) + ThreadLocalGlobals.push_back(&GV); for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) { MadeChange |= lowerGlobal(ThreadLocalGlobals[I]); } diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index 9ef9752..6c77096 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- XCoreMachineFuctionInfo.cpp - XCore machine function info ---------===// +//===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h index 078ffde..cdcc52f 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- XCoreMachineFuctionInfo.h - XCore machine function info -*- C++ -*-===// +//===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp index f420081..4a79dac 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp @@ -85,7 +85,7 @@ extern "C" void LLVMInitializeXCoreTarget() { } TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(XCoreTTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp index b5a9905..aa16ecc 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp @@ -123,18 +123,21 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, if (Kind.isMergeableConst16()) return MergeableConst16Section; } Type *ObjType = GV->getType()->getPointerElementType(); + auto &DL = GV->getParent()->getDataLayout(); if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() || - TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) { + DL.getTypeAllocSize(ObjType) < CodeModelLargeSize) { if (Kind.isReadOnly()) return UseCPRel? ReadOnlySection : DataRelROSection; if (Kind.isBSS() || Kind.isCommon())return BSSSection; - if (Kind.isDataRel()) return DataSection; + if (Kind.isData()) + return DataSection; if (Kind.isReadOnlyWithRel()) return DataRelROSection; } else { if (Kind.isReadOnly()) return UseCPRel? 
ReadOnlySectionLarge : DataRelROSectionLarge; if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge; - if (Kind.isDataRel()) return DataSectionLarge; + if (Kind.isData()) + return DataSectionLarge; if (Kind.isReadOnlyWithRel()) return DataRelROSectionLarge; } @@ -142,9 +145,8 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, report_fatal_error("Target does not support TLS or Common sections"); } -MCSection * -XCoreTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *XCoreTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst4()) return MergeableConst4Section; if (Kind.isMergeableConst8()) return MergeableConst8Section; if (Kind.isMergeableConst16()) return MergeableConst16Section; diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h index 2a5ac23..6701c66 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h @@ -33,7 +33,7 @@ static const unsigned CodeModelLargeSize = 256; Mangler &Mang, const TargetMachine &TM) const override; - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h index e23aef3..b2cb889 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h @@ -37,7 +37,7 @@ class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> { const XCoreTargetLowering *getTLI() const { return TLI; } public: - explicit XCoreTTIImpl(const XCoreTargetMachine *TM, Function &F) + explicit XCoreTTIImpl(const XCoreTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 4762011..0e05129 100644 --- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -34,8 +34,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" @@ -63,7 +66,8 @@ namespace { /// struct ArgPromotion : public CallGraphSCCPass { void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -81,7 +85,8 @@ namespace { bool isDenselyPacked(Type *type, const DataLayout &DL); bool canPaddingBeAccessed(Argument *Arg); CallGraphNode *PromoteArguments(CallGraphNode *CGN); - bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const; + bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, + AAResults &AAR) const; CallGraphNode *DoPromotion(Function *F, SmallPtrSetImpl<Argument*> &ArgsToPromote, 
SmallPtrSetImpl<Argument*> &ByValArgsToTransform); @@ -90,15 +95,15 @@ namespace { bool doInitialization(CallGraph &CG) override; /// The maximum number of elements to expand, or 0 for unlimited. unsigned maxElements; - DenseMap<const Function *, DISubprogram *> FunctionDIs; }; } char ArgPromotion::ID = 0; INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", "Promote 'by reference' arguments to scalars", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(ArgPromotion, "argpromotion", "Promote 'by reference' arguments to scalars", false, false) @@ -217,9 +222,9 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // First check: see if there are any pointer arguments! If not, quick exit. SmallVector<Argument*, 16> PointerArgs; - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) - if (I->getType()->isPointerTy()) - PointerArgs.push_back(I); + for (Argument &I : F->args()) + if (I.getType()->isPointerTy()) + PointerArgs.push_back(&I); if (PointerArgs.empty()) return nullptr; // Second check: make sure that all callers are direct callers. We can't @@ -237,6 +242,14 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { const DataLayout &DL = F->getParent()->getDataLayout(); + // We need to manually construct BasicAA directly in order to disable its use + // of other function analyses. + BasicAAResult BAR(createLegacyPMBasicAAResult(*this, *F)); + + // Construct our own AA results for this function. We do this manually to + // work around the limitations of the legacy pass manager. + AAResults AAR(createLegacyPMAAResults(*this, *F, BAR)); + // Check to see which arguments are promotable. If an argument is promotable, // add it to ArgsToPromote. SmallPtrSet<Argument*, 8> ArgsToPromote; @@ -281,8 +294,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // If all the elements are single-value types, we can promote it. bool AllSimple = true; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - if (!STy->getElementType(i)->isSingleValueType()) { + for (const auto *EltTy : STy->elements()) { + if (!EltTy->isSingleValueType()) { AllSimple = false; break; } @@ -303,8 +316,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { if (isSelfRecursive) { if (StructType *STy = dyn_cast<StructType>(AgTy)) { bool RecursiveType = false; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - if (STy->getElementType(i) == PtrArg->getType()) { + for (const auto *EltTy : STy->elements()) { + if (EltTy == PtrArg->getType()) { RecursiveType = true; break; } @@ -315,7 +328,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { } // Otherwise, see if we can promote the pointer to its value. - if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr())) + if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR)) ArgsToPromote.insert(PtrArg); } @@ -416,7 +429,8 @@ static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark, /// elements of the aggregate in order to avoid exploding the number of /// arguments passed in. 
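(For intuition, an illustrative pair not taken from the patch:

  int f(int *p) { return *p; }                  // load always executes
  int g(int *p, bool c) { return c ? *p : 0; }  // load is guarded by c

f's argument can be promoted to a plain int because the load runs on every call, while promoting g's argument would hoist a load that some callers never perform, so it is only legal when the pointer is otherwise known safe to load unconditionally.)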
bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
- bool isByValOrInAlloca) const {
+ bool isByValOrInAlloca,
+ AAResults &AAR) const {
typedef std::set<IndicesVector> GEPIndicesSet;
// Quick exit for unused arguments
@@ -453,12 +467,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// First, iterate the entry block and mark loads of (geps of) arguments as
// safe.
- BasicBlock *EntryBlock = Arg->getParent()->begin();
+ BasicBlock &EntryBlock = Arg->getParent()->front();
// Declare this here so we can reuse it
IndicesVector Indices;
- for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end();
- I != E; ++I)
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ for (Instruction &I : EntryBlock)
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
Value *V = LI->getPointerOperand();
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
V = GEP->getPointerOperand();
@@ -501,12 +514,11 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
if (GEP->use_empty()) {
// Dead GEPs cause trouble later. Just remove them if we run into
// them.
- getAnalysis<AliasAnalysis>().deleteValue(GEP);
GEP->eraseFromParent();
// TODO: This runs the above loop over and over again for dead GEPs.
// Couldn't we just increment the UI iterator earlier and erase the
// use?
- return isSafeToPromoteArgument(Arg, isByValOrInAlloca);
+ return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR);
}
// Ensure that all of the indices are constants.
@@ -563,8 +575,6 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// blocks we know to be transparent to the load.
SmallPtrSet<BasicBlock*, 16> TranspBlocks;
- AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-
for (unsigned i = 0, e = Loads.size(); i != e; ++i) {
// Check to see if the load is invalidated from the start of the block to
// the load itself.
@@ -572,8 +582,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
BasicBlock *BB = Load->getParent();
MemoryLocation Loc = MemoryLocation::get(Load);
- if (AA.canInstructionRangeModRef(BB->front(), *Load, Loc,
- AliasAnalysis::Mod))
+ if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
return false; // Pointer is invalidated!
// Now check every path from the entry block to the load for transparency.
@@ -581,7 +590,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
// loading block.
for (BasicBlock *P : predecessors(BB)) {
for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
- if (AA.canBasicBlockModify(*TranspBB, Loc))
+ if (AAR.canBasicBlockModify(*TranspBB, Loc))
return false;
}
}
@@ -637,13 +646,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
unsigned ArgIndex = 1;
for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
++I, ++ArgIndex) {
- if (ByValArgsToTransform.count(I)) {
+ if (ByValArgsToTransform.count(&*I)) {
// Simple byval argument? Just add all the struct element types.
Type *AgTy = cast<PointerType>(I->getType())->getElementType();
StructType *STy = cast<StructType>(AgTy);
Params.insert(Params.end(), STy->element_begin(), STy->element_end());
++NumByValArgsPromoted;
- } else if (!ArgsToPromote.count(I)) {
+ } else if (!ArgsToPromote.count(&*I)) {
// Unchanged argument
Params.push_back(I->getType());
AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
@@ -661,7 +670,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// In this table, we will track which indices are loaded from the argument
// (where direct loads are tracked as no indices).
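(Illustrative: for an argument of type 'struct S { int a, b; } *', a direct load of the pointer is recorded as the empty index vector, while a load of 'GEP ptr, 0, 1' is recorded as {0, 1}; each distinct vector collected in this table later becomes one scalar parameter of the rewritten function.)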
- ScalarizeTable &ArgIndices = ScalarizedElements[I]; + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; for (User *U : I->users()) { Instruction *UI = cast<Instruction>(U); Type *SrcTy; @@ -687,7 +696,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, else // Take any load, we will use it only to update Alias Analysis OrigLoad = cast<LoadInst>(UI->user_back()); - OriginalLoads[std::make_pair(I, Indices)] = OrigLoad; + OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; } // Add a parameter to the function for each element passed in. @@ -722,15 +731,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, NF->copyAttributesFrom(F); // Patch the pointer to LLVM function in debug info descriptor. - auto DI = FunctionDIs.find(F); - if (DI != FunctionDIs.end()) { - DISubprogram *SP = DI->second; - SP->replaceFunction(NF); - // Ensure the map is updated so it can be reused on subsequent argument - // promotions of the same function. - FunctionDIs.erase(DI); - FunctionDIs[NF] = SP; - } + NF->setSubprogram(F->getSubprogram()); + F->setSubprogram(nullptr); DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" << "From: " << *F); @@ -740,13 +742,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec)); AttributesVec.clear(); - F->getParent()->getFunctionList().insert(F, NF); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); - // Get the alias analysis information that we need to update to reflect our - // changes. - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - // Get the callgraph information that we need to update to reflect our // changes. CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); @@ -775,7 +773,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, ArgIndex = 1; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I, ++AI, ++ArgIndex) - if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { Args.push_back(*AI); // Unmodified argument if (CallPAL.hasAttributes(ArgIndex)) { @@ -783,7 +781,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, AttributesVec. push_back(AttributeSet::get(F->getContext(), Args.size(), B)); } - } else if (ByValArgsToTransform.count(I)) { + } else if (ByValArgsToTransform.count(&*I)) { // Emit a GEP and load for each element of the struct. Type *AgTy = cast<PointerType>(I->getType())->getElementType(); StructType *STy = cast<StructType>(AgTy); @@ -798,14 +796,14 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } } else if (!I->use_empty()) { // Non-dead argument: insert GEPs and loads as appropriate. - ScalarizeTable &ArgIndices = ScalarizedElements[I]; + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; // Store the Value* version of the indices in here, but declare it now // for reuse. std::vector<Value*> Ops; for (ScalarizeTable::iterator SI = ArgIndices.begin(), E = ArgIndices.end(); SI != E; ++SI) { Value *V = *AI; - LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, SI->second)]; + LoadInst *OrigLoad = OriginalLoads[std::make_pair(&*I, SI->second)]; if (!SI->second.empty()) { Ops.reserve(SI->second.size()); Type *ElTy = V->getType(); @@ -873,10 +871,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, Args.clear(); AttributesVec.clear(); - // Update the alias analysis implementation to know that we are replacing - // the old call with a new one. 
- AA.replaceWithNewValue(Call, New); - // Update the callgraph to know that the callsite has been transformed. CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()]; CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN); @@ -901,20 +895,19 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), I2 = NF->arg_begin(); I != E; ++I) { - if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { // If this is an unmodified argument, move the name and users over to the // new version. - I->replaceAllUsesWith(I2); - I2->takeName(I); - AA.replaceWithNewValue(I, I2); + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); ++I2; continue; } - if (ByValArgsToTransform.count(I)) { + if (ByValArgsToTransform.count(&*I)) { // In the callee, we create an alloca, and store each of the new incoming // arguments into the alloca. - Instruction *InsertPt = NF->begin()->begin(); + Instruction *InsertPt = &NF->begin()->front(); // Just add all the struct element types. Type *AgTy = cast<PointerType>(I->getType())->getElementType(); @@ -929,13 +922,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i), InsertPt); I2->setName(I->getName()+"."+Twine(i)); - new StoreInst(I2++, Idx, InsertPt); + new StoreInst(&*I2++, Idx, InsertPt); } // Anything that used the arg should now use the alloca. I->replaceAllUsesWith(TheAlloca); - TheAlloca->takeName(I); - AA.replaceWithNewValue(I, TheAlloca); + TheAlloca->takeName(&*I); // If the alloca is used in a call, we must clear the tail flag since // the callee now uses an alloca from the caller. @@ -948,23 +940,20 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, continue; } - if (I->use_empty()) { - AA.deleteValue(I); + if (I->use_empty()) continue; - } // Otherwise, if we promoted this argument, then all users are load // instructions (or GEPs with only load users), and all loads should be // using the new argument that we added. - ScalarizeTable &ArgIndices = ScalarizedElements[I]; + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; while (!I->use_empty()) { if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) { assert(ArgIndices.begin()->second.empty() && "Load element should sort to front!"); I2->setName(I->getName()+".val"); - LI->replaceAllUsesWith(I2); - AA.replaceWithNewValue(LI, I2); + LI->replaceAllUsesWith(&*I2); LI->eraseFromParent(); DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName() << "' in function '" << F->getName() << "'\n"); @@ -1000,11 +989,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // the argument specified by ArgNo. while (!GEP->use_empty()) { LoadInst *L = cast<LoadInst>(GEP->user_back()); - L->replaceAllUsesWith(TheArg); - AA.replaceWithNewValue(L, TheArg); + L->replaceAllUsesWith(&*TheArg); L->eraseFromParent(); } - AA.deleteValue(GEP); GEP->eraseFromParent(); } } @@ -1013,10 +1000,6 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, std::advance(I2, ArgIndices.size()); } - // Tell the alias analysis that the old function is about to disappear. - AA.replaceWithNewValue(F, NF); - - NF_CGN->stealCalledFunctionsFrom(CG[F]); // Now that the old function is dead, delete it. 
If there is a dangling @@ -1032,6 +1015,5 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } bool ArgPromotion::doInitialization(CallGraph &CG) { - FunctionDIs = makeSubprogramMap(CG.getModule()); return CallGraphSCCPass::doInitialization(CG); } diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 8ce7646..0aa49d6 100644 --- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -119,7 +119,7 @@ bool ConstantMerge::runOnModule(Module &M) { // First: Find the canonical constants others will be merged with. for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { - GlobalVariable *GV = GVI++; + GlobalVariable *GV = &*GVI++; // If this GV is dead, remove it. GV->removeDeadConstantUsers(); @@ -160,7 +160,7 @@ bool ConstantMerge::runOnModule(Module &M) { // invalidating the Constant* pointers in CMap. for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { - GlobalVariable *GV = GVI++; + GlobalVariable *GV = &*GVI++; // Only process constants with initializers in the default address space. if (!GV->isConstant() || !GV->hasDefinitiveInitializer() || diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp new file mode 100644 index 0000000..5bbb751 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -0,0 +1,166 @@ +//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass exports all llvm.bitset's found in the module in the form of a +// __cfi_check function, which can be used to verify cross-DSO call targets. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cross-dso-cfi" + +STATISTIC(TypeIds, "Number of unique type identifiers"); + +namespace { + +struct CrossDSOCFI : public ModulePass { + static char ID; + CrossDSOCFI() : ModulePass(ID) { + initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry()); + } + + Module *M; + MDNode *VeryLikelyWeights; + + ConstantInt *extractBitSetTypeId(MDNode *MD); + void buildCFICheck(); + + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; +}; + +} // anonymous namespace + +INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, + false) +INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false) +char CrossDSOCFI::ID = 0; + +ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; } + +bool CrossDSOCFI::doInitialization(Module &Mod) { + M = &Mod; + VeryLikelyWeights = + MDBuilder(M->getContext()).createBranchWeights((1U << 20) - 1, 1); + + return false; +} + +/// extractBitSetTypeId - Extracts TypeId from a hash-based bitset MDNode. +ConstantInt *CrossDSOCFI::extractBitSetTypeId(MDNode *MD) { + // This check excludes vtables for classes inside anonymous namespaces. + auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(0)); + if (!TM) + return nullptr; + auto C = dyn_cast_or_null<ConstantInt>(TM->getValue()); + if (!C) return nullptr; + // We are looking for i64 constants. + if (C->getBitWidth() != 64) return nullptr; + + // Sanity check. + auto FM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(1)); + // Can be null if a function was removed by an optimization. + if (FM) { + auto F = dyn_cast<Function>(FM->getValue()); + // But can never be a function declaration. + assert(!F || !F->isDeclaration()); + (void)F; // Suppress unused variable warning in the no-asserts build. + } + return C; +} + +/// buildCFICheck - emits __cfi_check for the current module. +void CrossDSOCFI::buildCFICheck() { + // FIXME: verify that __cfi_check ends up near the end of the code section, + // but before the jump slots created in LowerBitSets. 
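For orientation before the body of buildCFICheck() continues below: the __cfi_check function it assembles has a simple shape once rendered back as source. The following is a hand-written C++ sketch of that control flow; bitsetTest and the 64-bit case value are hypothetical stand-ins for the llvm.bitset.test intrinsic and for one hash-based type id pulled from the llvm.bitsets metadata.

    #include <cstdint>
    #include <cstdlib>

    // Stand-in for the llvm.bitset.test intrinsic: does Addr belong to the
    // bit set named by this type id? Stubbed out purely for illustration.
    static bool bitsetTest(void *Addr, uint64_t TypeId) {
      (void)Addr;
      (void)TypeId;
      return false;
    }

    // Rough shape of the generated function: switch on the caller-supplied
    // type id (unknown ids go straight to the trap), run the membership test
    // for the matching id, and either return ("exit" block) or trap.
    extern "C" void cfi_check_sketch(uint64_t CallSiteTypeId, void *Addr) {
      switch (CallSiteTypeId) {
      case UINT64_C(0x7F0E441D): // one "test" block per unique type id
        if (bitsetTest(Addr, UINT64_C(0x7F0E441D)))
          return; // valid cross-DSO call target
        break;
      default:
        break;
      }
      abort(); // "trap" block: llvm.trap(), does-not-return
    }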
+ llvm::DenseSet<uint64_t> BitSetIds; + NamedMDNode *BitSetNM = M->getNamedMetadata("llvm.bitsets"); + + if (BitSetNM) + for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) + if (ConstantInt *TypeId = extractBitSetTypeId(BitSetNM->getOperand(I))) + BitSetIds.insert(TypeId->getZExtValue()); + + LLVMContext &Ctx = M->getContext(); + Constant *C = M->getOrInsertFunction( + "__cfi_check", + FunctionType::get( + Type::getVoidTy(Ctx), + {Type::getInt64Ty(Ctx), PointerType::getUnqual(Type::getInt8Ty(Ctx))}, + false)); + Function *F = dyn_cast<Function>(C); + F->setAlignment(4096); + auto args = F->arg_begin(); + Argument &CallSiteTypeId = *(args++); + CallSiteTypeId.setName("CallSiteTypeId"); + Argument &Addr = *(args++); + Addr.setName("Addr"); + assert(args == F->arg_end()); + + BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F); + + BasicBlock *TrapBB = BasicBlock::Create(Ctx, "trap", F); + IRBuilder<> IRBTrap(TrapBB); + Function *TrapFn = Intrinsic::getDeclaration(M, Intrinsic::trap); + llvm::CallInst *TrapCall = IRBTrap.CreateCall(TrapFn); + TrapCall->setDoesNotReturn(); + TrapCall->setDoesNotThrow(); + IRBTrap.CreateUnreachable(); + + BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F); + IRBuilder<> IRBExit(ExitBB); + IRBExit.CreateRetVoid(); + + IRBuilder<> IRB(BB); + SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, BitSetIds.size()); + for (uint64_t TypeId : BitSetIds) { + ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId); + BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F); + IRBuilder<> IRBTest(TestBB); + Function *BitsetTestFn = + Intrinsic::getDeclaration(M, Intrinsic::bitset_test); + + Value *Test = IRBTest.CreateCall( + BitsetTestFn, {&Addr, MetadataAsValue::get( + Ctx, ConstantAsMetadata::get(CaseTypeId))}); + BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB); + BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights); + + SI->addCase(CaseTypeId, TestBB); + ++TypeIds; + } +} + +bool CrossDSOCFI::runOnModule(Module &M) { + if (M.getModuleFlag("Cross-DSO CFI") == nullptr) + return false; + buildCFICheck(); + return true; +} diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d044764..4de3d95 100644 --- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -35,6 +35,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <map> #include <set> #include <tuple> @@ -121,14 +122,6 @@ namespace { typedef SmallVector<RetOrArg, 5> UseVector; - // Map each LLVM function to corresponding metadata with debug info. If - // the function is replaced with another one, we should patch the pointer - // to LLVM function in metadata. - // As the code generation for module is finished (and DIBuilder is - // finalized) we assume that subprogram descriptors won't be changed, and - // they are stored in map for short duration anyway. - DenseMap<const Function *, DISubprogram *> FunctionDIs; - protected: // DAH uses this to specify a different ID. explicit DAE(char &ID) : ModulePass(ID) {} @@ -198,6 +191,13 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { if (Fn.hasAddressTaken()) return false; + // Don't touch naked functions. The assembly might be using an argument, or + // otherwise rely on the frame layout in a way that this analysis will not + // see. 
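The comment just above (and its two twins later in this file) guards against one hazard: a naked function's body is bare assembly, so an argument can look unused at the IR level while the assembly still reads its register. The guard itself follows immediately below; here is a minimal illustration of why the pass must bail out, a sketch using clang's attribute syntax on x86-64 SysV (the function name is invented for the example).

    // 'X' has no IR-level uses here, so dead-argument elimination would
    // happily drop it -- but the inline assembly reads it out of %edi, the
    // register the SysV ABI assigns to the first integer argument.
    __attribute__((naked)) int identity_of_first_arg(int X) {
      __asm__ volatile("movl %edi, %eax\n\t"
                       "ret");
    }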
+ if (Fn.hasFnAttribute(Attribute::Naked)) { + return false; + } + // Okay, we know we can transform this function if safe. Scan its body // looking for calls marked musttail or calls to llvm.vastart. for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { @@ -229,7 +229,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { // Create the new function body and insert it into the module... Function *NF = Function::Create(NFTy, Fn.getLinkage()); NF->copyAttributesFrom(&Fn); - Fn.getParent()->getFunctionList().insert(&Fn, NF); + Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF); NF->takeName(&Fn); // Loop over all of the callers of the function, transforming the call sites @@ -296,20 +296,12 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(), I2 = NF->arg_begin(); I != E; ++I, ++I2) { // Move the name and users over to the new version. - I->replaceAllUsesWith(I2); - I2->takeName(I); + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); } // Patch the pointer to LLVM function in debug info descriptor. - auto DI = FunctionDIs.find(&Fn); - if (DI != FunctionDIs.end()) { - DISubprogram *SP = DI->second; - SP->replaceFunction(NF); - // Ensure the map is updated so it can be reused on non-varargs argument - // eliminations of the same function. - FunctionDIs.erase(DI); - FunctionDIs[NF] = SP; - } + NF->setSubprogram(Fn.getSubprogram()); // Fix up any BlockAddresses that refer to the function. Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType())); @@ -345,16 +337,19 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg()) return false; + // Don't touch naked functions. The assembly might be using an argument, or + // otherwise rely on the frame layout in a way that this analysis will not + // see. + if (Fn.hasFnAttribute(Attribute::Naked)) + return false; + if (Fn.use_empty()) return false; SmallVector<unsigned, 8> UnusedArgs; - for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(); - I != E; ++I) { - Argument *Arg = I; - - if (Arg->use_empty() && !Arg->hasByValOrInAllocaAttr()) - UnusedArgs.push_back(Arg->getArgNo()); + for (Argument &Arg : Fn.args()) { + if (Arg.use_empty() && !Arg.hasByValOrInAllocaAttr()) + UnusedArgs.push_back(Arg.getArgNo()); } if (UnusedArgs.empty()) @@ -485,6 +480,10 @@ DAE::Liveness DAE::SurveyUse(const Use *U, if (F) { // Used in a direct call. + // The function argument is live if it is used as a bundle operand. + if (CS.isBundleOperand(U)) + return Live; + // Find the argument number. We know for sure that this use is an // argument, since if it was the function argument this would be an // indirect call and the we know can't be looking at a value of the @@ -543,6 +542,14 @@ void DAE::SurveyFunction(const Function &F) { return; } + // Don't touch naked functions. The assembly might be using an argument, or + // otherwise rely on the frame layout in a way that this analysis will not + // see. + if (F.hasFnAttribute(Attribute::Naked)) { + MarkLive(F); + return; + } + unsigned RetCount = NumRetVals(&F); // Assume all return values are dead typedef SmallVector<Liveness, 5> RetVals; @@ -648,7 +655,7 @@ void DAE::SurveyFunction(const Function &F) { } else { // See what the effect of this use is (recording any uses that cause // MaybeLive in MaybeLiveArgUses). - Result = SurveyUses(AI, MaybeLiveArgUses); + Result = SurveyUses(&*AI, MaybeLiveArgUses); } // Mark the result. 
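SurveyFunction and SurveyUses above fold every observation into a two-point lattice: a return slot or argument is either definitely Live or only MaybeLive, pending the liveness of its dependents. A compressed model of that join, a sketch with an invented helper name rather than the pass's real data structures:

    #include <cassert>

    // DAE's liveness lattice: Live absorbs everything, MaybeLive survives a
    // join only while no path has proven the value live.
    enum Liveness { Live, MaybeLive };

    static Liveness join(Liveness A, Liveness B) {
      return (A == Live || B == Live) ? Live : MaybeLive;
    }

    int main() {
      assert(join(MaybeLive, MaybeLive) == MaybeLive); // still deferrable
      assert(join(MaybeLive, Live) == Live);           // one live use wins
      return 0;
    }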
@@ -878,7 +885,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { NF->setAttributes(NewPAL); // Insert the new function before the old function, so we won't be processing // it again. - F->getParent()->getFunctionList().insert(F, NF); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); NF->takeName(F); // Loop over all of the callers of the function, transforming the call sites @@ -946,7 +953,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, "", Call); + Args, "", Call->getParent()); cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); cast<InvokeInst>(New)->setAttributes(NewCallPAL); } else { @@ -976,9 +983,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { " must have been a struct or an array!"); Instruction *InsertPt = Call; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - BasicBlock::iterator IP = II->getNormalDest()->begin(); - while (isa<PHINode>(IP)) ++IP; - InsertPt = IP; + BasicBlock *NewEdge = SplitEdge(New->getParent(), II->getNormalDest()); + InsertPt = &*NewEdge->getFirstInsertionPt(); } // We used to return a struct or array. Instead of doing smart stuff @@ -1026,8 +1032,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[i]) { // If this is a live argument, move the name and users over to the new // version. - I->replaceAllUsesWith(I2); - I2->takeName(I); + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); ++I2; } else { // If this argument is dead, replace any uses of it with null constants @@ -1079,9 +1085,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { } // Patch the pointer to LLVM function in debug info descriptor. - auto DI = FunctionDIs.find(F); - if (DI != FunctionDIs.end()) - DI->second->replaceFunction(NF); + NF->setSubprogram(F->getSubprogram()); // Now that the old function is dead, delete it. F->eraseFromParent(); @@ -1092,9 +1096,6 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { bool DAE::runOnModule(Module &M) { bool Changed = false; - // Collect debug info descriptors for functions. - FunctionDIs = makeSubprogramMap(M); - // First pass: Do a simple check to see if any functions can have their "..." // removed. We can do this if they never call va_start. This loop cannot be // fused with the next loop, because deleting a function invalidates @@ -1119,7 +1120,7 @@ bool DAE::runOnModule(Module &M) { for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { // Increment now, because the function will probably get removed (ie. // replaced by a new one). 
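The comment closing the hunk above, together with the 'Function *F = &*I++;' change right below it, shows the two iterator idioms this import leans on everywhere: an explicit '&*' because ilist iterators no longer convert implicitly to pointers, and increment-before-use so that erasing the current element cannot invalidate the loop. A standalone sketch, with std::list standing in for LLVM's ilist:

    #include <list>

    int main() {
      std::list<int> Funcs = {1, 2, 3, 4};
      for (std::list<int>::iterator I = Funcs.begin(), E = Funcs.end();
           I != E;) {
        std::list<int>::iterator Cur = I++; // advance first: the '&*I++' idea
        int *F = &*Cur; // explicit dereference-then-address-of, not implicit
        if (*F % 2 == 0)
          Funcs.erase(Cur); // removing Cur leaves I (already past it) valid
      }
      return 0;
    }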
- Function *F = I++; + Function *F = &*I++; Changed |= RemoveDeadStuffFromFunction(F); } diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index 67ba72d..af313a6 100644 --- a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -1,4 +1,5 @@ -//===-- ElimAvailExtern.cpp - DCE unreachable internal functions ----------------===// +//===-- ElimAvailExtern.cpp - DCE unreachable internal functions +//----------------===// // // The LLVM Compiler Infrastructure // @@ -15,9 +16,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Pass.h" using namespace llvm; @@ -28,18 +27,18 @@ STATISTIC(NumFunctions, "Number of functions removed"); STATISTIC(NumVariables, "Number of global variables removed"); namespace { - struct EliminateAvailableExternally : public ModulePass { - static char ID; // Pass identification, replacement for typeid - EliminateAvailableExternally() : ModulePass(ID) { - initializeEliminateAvailableExternallyPass( - *PassRegistry::getPassRegistry()); - } +struct EliminateAvailableExternally : public ModulePass { + static char ID; // Pass identification, replacement for typeid + EliminateAvailableExternally() : ModulePass(ID) { + initializeEliminateAvailableExternallyPass( + *PassRegistry::getPassRegistry()); + } - // run - Do the EliminateAvailableExternally pass on the specified module, - // optionally updating the specified callgraph to reflect the changes. - // - bool runOnModule(Module &M) override; - }; + // run - Do the EliminateAvailableExternally pass on the specified module, + // optionally updating the specified callgraph to reflect the changes. + // + bool runOnModule(Module &M) override; +}; } char EliminateAvailableExternally::ID = 0; @@ -54,30 +53,31 @@ bool EliminateAvailableExternally::runOnModule(Module &M) { bool Changed = false; // Drop initializers of available externally global variables. - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - if (!I->hasAvailableExternallyLinkage()) + for (GlobalVariable &GV : M.globals()) { + if (!GV.hasAvailableExternallyLinkage()) continue; - if (I->hasInitializer()) { - Constant *Init = I->getInitializer(); - I->setInitializer(nullptr); + if (GV.hasInitializer()) { + Constant *Init = GV.getInitializer(); + GV.setInitializer(nullptr); if (isSafeToDestroyConstant(Init)) Init->destroyConstant(); } - I->removeDeadConstantUsers(); - I->setLinkage(GlobalValue::ExternalLinkage); + GV.removeDeadConstantUsers(); + GV.setLinkage(GlobalValue::ExternalLinkage); NumVariables++; + Changed = true; } // Drop the bodies of available externally functions. 
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!I->hasAvailableExternallyLinkage()) + for (Function &F : M) { + if (!F.hasAvailableExternallyLinkage()) continue; - if (!I->isDeclaration()) + if (!F.isDeclaration()) // This will set the linkage to external - I->deleteBody(); - I->removeDeadConstantUsers(); + F.deleteBody(); + F.removeDeadConstantUsers(); NumFunctions++; + Changed = true; } return Changed; diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp index b9462f2..1a3b925 100644 --- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -83,7 +83,7 @@ namespace { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { bool Delete = - deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); + deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration(); if (!Delete) { if (I->hasAvailableExternallyLinkage()) continue; @@ -103,7 +103,7 @@ namespace { // Visit the Functions. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { bool Delete = - deleteStuff == (bool)Named.count(I) && !I->isDeclaration(); + deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration(); if (!Delete) { if (I->hasAvailableExternallyLinkage()) continue; @@ -124,7 +124,7 @@ namespace { Module::alias_iterator CurI = I; ++I; - bool Delete = deleteStuff == (bool)Named.count(CurI); + bool Delete = deleteStuff == (bool)Named.count(&*CurI); makeVisible(*CurI, Delete); if (Delete) { @@ -143,7 +143,7 @@ namespace { } CurI->replaceAllUsesWith(Declaration); - delete CurI; + delete &*CurI; } } diff --git a/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp new file mode 100644 index 0000000..6df0447 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -0,0 +1,121 @@ +//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "forceattrs" + +static cl::list<std::string> + ForceAttributes("force-attribute", cl::Hidden, + cl::desc("Add an attribute to a function. This should be a " + "pair of 'function-name:attribute-name', for " + "example -force-attribute=foo:noinline. 
This " + "option can be specified multiple times.")); + +static Attribute::AttrKind parseAttrKind(StringRef Kind) { + return StringSwitch<Attribute::AttrKind>(Kind) + .Case("alwaysinline", Attribute::AlwaysInline) + .Case("builtin", Attribute::Builtin) + .Case("cold", Attribute::Cold) + .Case("convergent", Attribute::Convergent) + .Case("inlinehint", Attribute::InlineHint) + .Case("jumptable", Attribute::JumpTable) + .Case("minsize", Attribute::MinSize) + .Case("naked", Attribute::Naked) + .Case("nobuiltin", Attribute::NoBuiltin) + .Case("noduplicate", Attribute::NoDuplicate) + .Case("noimplicitfloat", Attribute::NoImplicitFloat) + .Case("noinline", Attribute::NoInline) + .Case("nonlazybind", Attribute::NonLazyBind) + .Case("noredzone", Attribute::NoRedZone) + .Case("noreturn", Attribute::NoReturn) + .Case("norecurse", Attribute::NoRecurse) + .Case("nounwind", Attribute::NoUnwind) + .Case("optnone", Attribute::OptimizeNone) + .Case("optsize", Attribute::OptimizeForSize) + .Case("readnone", Attribute::ReadNone) + .Case("readonly", Attribute::ReadOnly) + .Case("argmemonly", Attribute::ArgMemOnly) + .Case("returns_twice", Attribute::ReturnsTwice) + .Case("safestack", Attribute::SafeStack) + .Case("sanitize_address", Attribute::SanitizeAddress) + .Case("sanitize_memory", Attribute::SanitizeMemory) + .Case("sanitize_thread", Attribute::SanitizeThread) + .Case("ssp", Attribute::StackProtect) + .Case("sspreq", Attribute::StackProtectReq) + .Case("sspstrong", Attribute::StackProtectStrong) + .Case("uwtable", Attribute::UWTable) + .Default(Attribute::None); +} + +/// If F has any forced attributes given on the command line, add them. +static void addForcedAttributes(Function &F) { + for (auto &S : ForceAttributes) { + auto KV = StringRef(S).split(':'); + if (KV.first != F.getName()) + continue; + + auto Kind = parseAttrKind(KV.second); + if (Kind == Attribute::None) { + DEBUG(dbgs() << "ForcedAttribute: " << KV.second + << " unknown or not handled!\n"); + continue; + } + if (F.hasFnAttribute(Kind)) + continue; + F.addFnAttr(Kind); + } +} + +PreservedAnalyses ForceFunctionAttrsPass::run(Module &M) { + if (ForceAttributes.empty()) + return PreservedAnalyses::all(); + + for (Function &F : M.functions()) + addForcedAttributes(F); + + // Just conservatively invalidate analyses, this isn't likely to be important. + return PreservedAnalyses::none(); +} + +namespace { +struct ForceFunctionAttrsLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ForceFunctionAttrsLegacyPass() : ModulePass(ID) { + initializeForceFunctionAttrsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (ForceAttributes.empty()) + return false; + + for (Function &F : M.functions()) + addForcedAttributes(F); + + // Conservatively assume we changed something. + return true; + } +}; +} + +char ForceFunctionAttrsLegacyPass::ID = 0; +INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs", + "Force set function attributes", false, false) + +Pass *llvm::createForceFunctionAttrsLegacyPass() { + return new ForceFunctionAttrsLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index bb5e64a..527fdd1 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -6,16 +6,11 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -// -// This file implements a simple interprocedural pass which walks the -// call-graph, looking for functions which do not access or only read -// non-local memory, and marking them readnone/readonly. It does the -// same with function arguments independently, marking them readonly/ -// readnone/nocapture. Finally, well-known library call declarations -// are marked with all attributes that are consistent with the -// function's standard definition. This pass is implemented as a -// bottom-up traversal of the call-graph. -// +/// +/// \file +/// This file implements interprocedural passes which walk the +/// call-graph deducing and/or propagating function attributes. +/// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO.h" @@ -23,14 +18,21 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; @@ -42,230 +44,185 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); STATISTIC(NumNoAlias, "Number of function returns marked noalias"); -STATISTIC(NumAnnotated, "Number of attributes added to library functions"); +STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); +STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); namespace { - struct FunctionAttrs : public CallGraphSCCPass { - static char ID; // Pass identification, replacement for typeid - FunctionAttrs() : CallGraphSCCPass(ID), AA(nullptr) { - initializeFunctionAttrsPass(*PassRegistry::getPassRegistry()); - } +typedef SmallSetVector<Function *, 8> SCCNodeSet; +} - // runOnSCC - Analyze the SCC, performing the transformation if possible. - bool runOnSCC(CallGraphSCC &SCC) override; +namespace { +struct PostOrderFunctionAttrs : public CallGraphSCCPass { + static char ID; // Pass identification, replacement for typeid + PostOrderFunctionAttrs() : CallGraphSCCPass(ID) { + initializePostOrderFunctionAttrsPass(*PassRegistry::getPassRegistry()); + } - // AddReadAttrs - Deduce readonly/readnone attributes for the SCC. - bool AddReadAttrs(const CallGraphSCC &SCC); + bool runOnSCC(CallGraphSCC &SCC) override; - // AddArgumentAttrs - Deduce nocapture attributes for the SCC. - bool AddArgumentAttrs(const CallGraphSCC &SCC); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + CallGraphSCCPass::getAnalysisUsage(AU); + } - // IsFunctionMallocLike - Does this function allocate new memory? 
- bool IsFunctionMallocLike(Function *F, - SmallPtrSet<Function*, 8> &) const; +private: + TargetLibraryInfo *TLI; +}; +} - // AddNoAliasAttrs - Deduce noalias attributes for the SCC. - bool AddNoAliasAttrs(const CallGraphSCC &SCC); +char PostOrderFunctionAttrs::ID = 0; +INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrs, "functionattrs", + "Deduce function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(PostOrderFunctionAttrs, "functionattrs", + "Deduce function attributes", false, false) - // Utility methods used by inferPrototypeAttributes to add attributes - // and maintain annotation statistics. +Pass *llvm::createPostOrderFunctionAttrsPass() { return new PostOrderFunctionAttrs(); } - void setDoesNotAccessMemory(Function &F) { - if (!F.doesNotAccessMemory()) { - F.setDoesNotAccessMemory(); - ++NumAnnotated; - } - } +namespace { +/// The three kinds of memory access relevant to 'readonly' and +/// 'readnone' attributes. +enum MemoryAccessKind { + MAK_ReadNone = 0, + MAK_ReadOnly = 1, + MAK_MayWrite = 2 +}; +} - void setOnlyReadsMemory(Function &F) { - if (!F.onlyReadsMemory()) { - F.setOnlyReadsMemory(); - ++NumAnnotated; - } - } +static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR, + const SCCNodeSet &SCCNodes) { + FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F); + if (MRB == FMRB_DoesNotAccessMemory) + // Already perfect! + return MAK_ReadNone; + + // Definitions with weak linkage may be overridden at linktime with + // something that writes memory, so treat them like declarations. + if (F.isDeclaration() || F.mayBeOverridden()) { + if (AliasAnalysis::onlyReadsMemory(MRB)) + return MAK_ReadOnly; + + // Conservatively assume it writes to memory. + return MAK_MayWrite; + } - void setDoesNotThrow(Function &F) { - if (!F.doesNotThrow()) { - F.setDoesNotThrow(); - ++NumAnnotated; - } - } + // Scan the function body for instructions that may read or write memory. + bool ReadsMemory = false; + for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { + Instruction *I = &*II; + + // Some instructions can be ignored even if they read or write memory. + // Detect these now, skipping to the next instruction if one is found. + CallSite CS(cast<Value>(I)); + if (CS) { + // Ignore calls to functions in the same SCC. + if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) + continue; + FunctionModRefBehavior MRB = AAR.getModRefBehavior(CS); - void setDoesNotCapture(Function &F, unsigned n) { - if (!F.doesNotCapture(n)) { - F.setDoesNotCapture(n); - ++NumAnnotated; - } - } + // If the call doesn't access memory, we're done. + if (!(MRB & MRI_ModRef)) + continue; - void setOnlyReadsMemory(Function &F, unsigned n) { - if (!F.onlyReadsMemory(n)) { - F.setOnlyReadsMemory(n); - ++NumAnnotated; + if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) { + // The call could access any memory. If that includes writes, give up. + if (MRB & MRI_Mod) + return MAK_MayWrite; + // If it reads, note it. + if (MRB & MRI_Ref) + ReadsMemory = true; + continue; } - } - void setDoesNotAlias(Function &F, unsigned n) { - if (!F.doesNotAlias(n)) { - F.setDoesNotAlias(n); - ++NumAnnotated; - } - } + // Check whether all pointer arguments point to local memory, and + // ignore calls that only access local memory. 
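checkFunctionMemoryAccess, in progress here, boils the whole body scan down to a three-point verdict; the per-argument pointer check continues immediately below. The fold itself is just a maximum over the MAK_ lattice introduced earlier, as this minimal model shows (the helper name is invented):

    #include <algorithm>
    #include <vector>

    enum MemoryAccessKind { MAK_ReadNone = 0, MAK_ReadOnly = 1, MAK_MayWrite = 2 };

    // One write anywhere pins the summary at MAK_MayWrite; otherwise any read
    // yields MAK_ReadOnly; only a clean scan stays MAK_ReadNone.
    static MemoryAccessKind
    summarize(const std::vector<MemoryAccessKind> &PerInstruction) {
      MemoryAccessKind Result = MAK_ReadNone;
      for (MemoryAccessKind K : PerInstruction)
        Result = std::max(Result, K);
      return Result;
    }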
+ for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); + CI != CE; ++CI) { + Value *Arg = *CI; + if (!Arg->getType()->isPtrOrPtrVectorTy()) + continue; - // inferPrototypeAttributes - Analyze the name and prototype of the - // given function and set any applicable attributes. Returns true - // if any attributes were set and false otherwise. - bool inferPrototypeAttributes(Function &F); + AAMDNodes AAInfo; + I->getAAMetadata(AAInfo); + MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo); - // annotateLibraryCalls - Adds attributes to well-known standard library - // call declarations. - bool annotateLibraryCalls(const CallGraphSCC &SCC); + // Skip accesses to local or constant memory as they don't impact the + // externally visible mod/ref behavior. + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - CallGraphSCCPass::getAnalysisUsage(AU); + if (MRB & MRI_Mod) + // Writes non-local memory. Give up. + return MAK_MayWrite; + if (MRB & MRI_Ref) + // Ok, it reads non-local memory. + ReadsMemory = true; + } + continue; + } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + // Ignore non-volatile loads from local memory. (Atomic is okay here.) + if (!LI->isVolatile()) { + MemoryLocation Loc = MemoryLocation::get(LI); + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + } + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + // Ignore non-volatile stores to local memory. (Atomic is okay here.) + if (!SI->isVolatile()) { + MemoryLocation Loc = MemoryLocation::get(SI); + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + } + } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { + // Ignore vaargs on local memory. + MemoryLocation Loc = MemoryLocation::get(VI); + if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; } - private: - AliasAnalysis *AA; - TargetLibraryInfo *TLI; - }; -} + // Any remaining instructions need to be taken seriously! Check if they + // read or write memory. + if (I->mayWriteToMemory()) + // Writes memory. Just give up. + return MAK_MayWrite; -char FunctionAttrs::ID = 0; -INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", - "Deduce function attributes", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", - "Deduce function attributes", false, false) - -Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); } - - -/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC. -bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { - SmallPtrSet<Function*, 8> SCCNodes; + // If this instruction may read memory, remember that. + ReadsMemory |= I->mayReadFromMemory(); + } - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) - SCCNodes.insert((*I)->getFunction()); + return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone; +} +/// Deduce readonly/readnone attributes for the SCC. +template <typename AARGetterT> +static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { // Check if any of the functions in the SCC read or write memory. 
If they // write memory then they can't be marked readnone or readonly. bool ReadsMemory = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + for (Function *F : SCCNodes) { + // Call the callable parameter to look up AA results for this function. + AAResults &AAR = AARGetter(*F); - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or node we don't want to optimize - assume it may write - // memory and give up. + switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) { + case MAK_MayWrite: return false; - - AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(F); - if (MRB == AliasAnalysis::DoesNotAccessMemory) - // Already perfect! - continue; - - // Definitions with weak linkage may be overridden at linktime with - // something that writes memory, so treat them like declarations. - if (F->isDeclaration() || F->mayBeOverridden()) { - if (!AliasAnalysis::onlyReadsMemory(MRB)) - // May write memory. Just give up. - return false; - + case MAK_ReadOnly: ReadsMemory = true; - continue; - } - - // Scan the function body for instructions that may read or write memory. - for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { - Instruction *I = &*II; - - // Some instructions can be ignored even if they read or write memory. - // Detect these now, skipping to the next instruction if one is found. - CallSite CS(cast<Value>(I)); - if (CS) { - // Ignore calls to functions in the same SCC. - if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) - continue; - AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(CS); - // If the call doesn't access arbitrary memory, we may be able to - // figure out something. - if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { - // If the call does access argument pointees, check each argument. - if (AliasAnalysis::doesAccessArgPointees(MRB)) - // Check whether all pointer arguments point to local memory, and - // ignore calls that only access local memory. - for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); - CI != CE; ++CI) { - Value *Arg = *CI; - if (Arg->getType()->isPointerTy()) { - AAMDNodes AAInfo; - I->getAAMetadata(AAInfo); - - MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo); - if (!AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) { - if (MRB & AliasAnalysis::Mod) - // Writes non-local memory. Give up. - return false; - if (MRB & AliasAnalysis::Ref) - // Ok, it reads non-local memory. - ReadsMemory = true; - } - } - } - continue; - } - // The call could access any memory. If that includes writes, give up. - if (MRB & AliasAnalysis::Mod) - return false; - // If it reads, note it. - if (MRB & AliasAnalysis::Ref) - ReadsMemory = true; - continue; - } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - // Ignore non-volatile loads from local memory. (Atomic is okay here.) - if (!LI->isVolatile()) { - MemoryLocation Loc = MemoryLocation::get(LI); - if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - // Ignore non-volatile stores to local memory. (Atomic is okay here.) - if (!SI->isVolatile()) { - MemoryLocation Loc = MemoryLocation::get(SI); - if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } - } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { - // Ignore vaargs on local memory. 
- MemoryLocation Loc = MemoryLocation::get(VI); - if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) - continue; - } - - // Any remaining instructions need to be taken seriously! Check if they - // read or write memory. - if (I->mayWriteToMemory()) - // Writes memory. Just give up. - return false; - - // If this instruction may read memory, remember that. - ReadsMemory |= I->mayReadFromMemory(); + break; + case MAK_ReadNone: + // Nothing to do! + break; } } // Success! Functions in this SCC do not access memory, or only read memory. // Give them the appropriate attribute. bool MadeChange = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - + for (Function *F : SCCNodes) { if (F->doesNotAccessMemory()) // Already perfect! continue; @@ -278,11 +235,10 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { // Clear out any existing attributes. AttrBuilder B; - B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); - F->removeAttributes(AttributeSet::FunctionIndex, - AttributeSet::get(F->getContext(), - AttributeSet::FunctionIndex, B)); + B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); + F->removeAttributes( + AttributeSet::FunctionIndex, + AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B)); // Add in the new attribute. F->addAttribute(AttributeSet::FunctionIndex, @@ -298,124 +254,140 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { } namespace { - // For a given pointer Argument, this retains a list of Arguments of functions - // in the same SCC that the pointer data flows into. We use this to build an - // SCC of the arguments. - struct ArgumentGraphNode { - Argument *Definition; - SmallVector<ArgumentGraphNode*, 4> Uses; - }; - - class ArgumentGraph { - // We store pointers to ArgumentGraphNode objects, so it's important that - // that they not move around upon insert. - typedef std::map<Argument*, ArgumentGraphNode> ArgumentMapTy; +/// For a given pointer Argument, this retains a list of Arguments of functions +/// in the same SCC that the pointer data flows into. We use this to build an +/// SCC of the arguments. +struct ArgumentGraphNode { + Argument *Definition; + SmallVector<ArgumentGraphNode *, 4> Uses; +}; + +class ArgumentGraph { + // We store pointers to ArgumentGraphNode objects, so it's important that + // that they not move around upon insert. + typedef std::map<Argument *, ArgumentGraphNode> ArgumentMapTy; + + ArgumentMapTy ArgumentMap; + + // There is no root node for the argument graph, in fact: + // void f(int *x, int *y) { if (...) f(x, y); } + // is an example where the graph is disconnected. The SCCIterator requires a + // single entry point, so we maintain a fake ("synthetic") root node that + // uses every node. Because the graph is directed and nothing points into + // the root, it will not participate in any SCCs (except for its own). 
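The comment block above is the one subtle piece of the argument graph: SCC iteration wants a single entry node, but argument-to-argument data flow has no natural root. A toy version of the synthetic-root construction that the declaration below implements (type names invented):

    #include <vector>

    struct ToyNode {
      int Id;
      std::vector<ToyNode *> Uses; // edges: this value flows into these nodes
    };

    // A fake root that uses every real node makes the whole (possibly
    // disconnected) graph reachable from one entry point. Nothing points back
    // at the root, so it can only ever form its own singleton SCC.
    struct ToyArgumentGraph {
      std::vector<ToyNode *> Nodes;
      ToyNode SyntheticRoot{-1, {}};

      ToyNode *getEntryNode() {
        SyntheticRoot.Uses.assign(Nodes.begin(), Nodes.end());
        return &SyntheticRoot;
      }
    };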
+ ArgumentGraphNode SyntheticRoot; + +public: + ArgumentGraph() { SyntheticRoot.Definition = nullptr; } + + typedef SmallVectorImpl<ArgumentGraphNode *>::iterator iterator; + + iterator begin() { return SyntheticRoot.Uses.begin(); } + iterator end() { return SyntheticRoot.Uses.end(); } + ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; } + + ArgumentGraphNode *operator[](Argument *A) { + ArgumentGraphNode &Node = ArgumentMap[A]; + Node.Definition = A; + SyntheticRoot.Uses.push_back(&Node); + return &Node; + } +}; - ArgumentMapTy ArgumentMap; +/// This tracker checks whether callees are in the SCC, and if so it does not +/// consider that a capture, instead adding it to the "Uses" list and +/// continuing with the analysis. +struct ArgumentUsesTracker : public CaptureTracker { + ArgumentUsesTracker(const SCCNodeSet &SCCNodes) + : Captured(false), SCCNodes(SCCNodes) {} - // There is no root node for the argument graph, in fact: - // void f(int *x, int *y) { if (...) f(x, y); } - // is an example where the graph is disconnected. The SCCIterator requires a - // single entry point, so we maintain a fake ("synthetic") root node that - // uses every node. Because the graph is directed and nothing points into - // the root, it will not participate in any SCCs (except for its own). - ArgumentGraphNode SyntheticRoot; + void tooManyUses() override { Captured = true; } - public: - ArgumentGraph() { SyntheticRoot.Definition = nullptr; } + bool captured(const Use *U) override { + CallSite CS(U->getUser()); + if (!CS.getInstruction()) { + Captured = true; + return true; + } - typedef SmallVectorImpl<ArgumentGraphNode*>::iterator iterator; + Function *F = CS.getCalledFunction(); + if (!F || F->isDeclaration() || F->mayBeOverridden() || + !SCCNodes.count(F)) { + Captured = true; + return true; + } - iterator begin() { return SyntheticRoot.Uses.begin(); } - iterator end() { return SyntheticRoot.Uses.end(); } - ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; } + // Note: the callee and the two successor blocks *follow* the argument + // operands. This means there is no need to adjust UseIndex to account for + // these. - ArgumentGraphNode *operator[](Argument *A) { - ArgumentGraphNode &Node = ArgumentMap[A]; - Node.Definition = A; - SyntheticRoot.Uses.push_back(&Node); - return &Node; - } - }; + unsigned UseIndex = + std::distance(const_cast<const Use *>(CS.arg_begin()), U); - // This tracker checks whether callees are in the SCC, and if so it does not - // consider that a capture, instead adding it to the "Uses" list and - // continuing with the analysis. - struct ArgumentUsesTracker : public CaptureTracker { - ArgumentUsesTracker(const SmallPtrSet<Function*, 8> &SCCNodes) - : Captured(false), SCCNodes(SCCNodes) {} + assert(UseIndex < CS.data_operands_size() && + "Indirect function calls should have been filtered above!"); - void tooManyUses() override { Captured = true; } + if (UseIndex >= CS.getNumArgOperands()) { + // Data operand, but not an argument operand -- must be a bundle operand + assert(CS.hasOperandBundles() && "Must be!"); - bool captured(const Use *U) override { - CallSite CS(U->getUser()); - if (!CS.getInstruction()) { Captured = true; return true; } + // CaptureTracking told us that we're being captured by an operand bundle + // use. In this case it does not matter if the callee is within our SCC + // or not -- we've been captured in some unknown way, and we have to be + // conservative.
+ Captured = true; + return true; + } - Function *F = CS.getCalledFunction(); - if (!F || !SCCNodes.count(F)) { Captured = true; return true; } - - bool Found = false; - Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end(); - PI != PE; ++PI, ++AI) { - if (AI == AE) { - assert(F->isVarArg() && "More params than args in non-varargs call"); - Captured = true; - return true; - } - if (PI == U) { - Uses.push_back(AI); - Found = true; - break; - } - } - assert(Found && "Capturing call-site captured nothing?"); - (void)Found; - return false; + if (UseIndex >= F->arg_size()) { + assert(F->isVarArg() && "More params than args in non-varargs call"); + Captured = true; + return true; } - bool Captured; // True only if certainly captured (used outside our SCC). - SmallVector<Argument*, 4> Uses; // Uses within our SCC. + Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); + return false; + } + + bool Captured; // True only if certainly captured (used outside our SCC). + SmallVector<Argument *, 4> Uses; // Uses within our SCC. - const SmallPtrSet<Function*, 8> &SCCNodes; - }; + const SCCNodeSet &SCCNodes; +}; } namespace llvm { - template<> struct GraphTraits<ArgumentGraphNode*> { - typedef ArgumentGraphNode NodeType; - typedef SmallVectorImpl<ArgumentGraphNode*>::iterator ChildIteratorType; +template <> struct GraphTraits<ArgumentGraphNode *> { + typedef ArgumentGraphNode NodeType; + typedef SmallVectorImpl<ArgumentGraphNode *>::iterator ChildIteratorType; - static inline NodeType *getEntryNode(NodeType *A) { return A; } - static inline ChildIteratorType child_begin(NodeType *N) { - return N->Uses.begin(); - } - static inline ChildIteratorType child_end(NodeType *N) { - return N->Uses.end(); - } - }; - template<> struct GraphTraits<ArgumentGraph*> - : public GraphTraits<ArgumentGraphNode*> { - static NodeType *getEntryNode(ArgumentGraph *AG) { - return AG->getEntryNode(); - } - static ChildIteratorType nodes_begin(ArgumentGraph *AG) { - return AG->begin(); - } - static ChildIteratorType nodes_end(ArgumentGraph *AG) { - return AG->end(); - } - }; + static inline NodeType *getEntryNode(NodeType *A) { return A; } + static inline ChildIteratorType child_begin(NodeType *N) { + return N->Uses.begin(); + } + static inline ChildIteratorType child_end(NodeType *N) { + return N->Uses.end(); + } +}; +template <> +struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { + static NodeType *getEntryNode(ArgumentGraph *AG) { + return AG->getEntryNode(); + } + static ChildIteratorType nodes_begin(ArgumentGraph *AG) { + return AG->begin(); + } + static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); } +}; } -// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. +/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. static Attribute::AttrKind determinePointerReadAttrs(Argument *A, - const SmallPtrSet<Argument*, 8> &SCCNodes) { - - SmallVector<Use*, 32> Worklist; - SmallSet<Use*, 32> Visited; - int Count = 0; + const SmallPtrSet<Argument *, 8> &SCCNodes) { + + SmallVector<Use *, 32> Worklist; + SmallSet<Use *, 32> Visited; // inalloca arguments are always clobbered by the call. if (A->hasInAllocaAttr()) @@ -425,9 +397,6 @@ determinePointerReadAttrs(Argument *A, // We don't need to track IsWritten. If A is written to, return immediately. 
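Before the use-walk below, it helps to see the verdicts determinePointerReadAttrs is choosing between. A few one-line functions and the attribute each pointer argument would plausibly earn (illustrative only; the real deduction inspects the transitive IR uses):

    int reads(const int *P) { return *P; }     // P: readonly (and nocapture)
    int ignores(int *P) { (void)P; return 7; } // P: readnone, never accessed
    void writes(int *P) { *P = 1; }            // P: written, so no read attr
    int *leaks(int *P) { return P; }           // P: captured, no attributes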
for (Use &U : A->uses()) { - if (Count++ >= 20) - return Attribute::None; - Visited.insert(&U); Worklist.push_back(&U); } @@ -435,7 +404,6 @@ determinePointerReadAttrs(Argument *A, while (!Worklist.empty()) { Use *U = Worklist.pop_back_val(); Instruction *I = cast<Instruction>(U->getUser()); - Value *V = U->get(); switch (I->getOpcode()) { case Instruction::BitCast: @@ -479,24 +447,44 @@ determinePointerReadAttrs(Argument *A, return Attribute::None; } - Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (CallSite::arg_iterator A = B; A != E; ++A, ++AI) { - if (A->get() == V) { - if (AI == AE) { - assert(F->isVarArg() && - "More params than args in non-varargs call."); - return Attribute::None; - } - Captures &= !CS.doesNotCapture(A - B); - if (SCCNodes.count(AI)) - continue; - if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B)) - return Attribute::None; - if (!CS.doesNotAccessMemory(A - B)) - IsRead = true; - } + // Note: the callee and the two successor blocks *follow* the argument + // operands. This means there is no need to adjust UseIndex to account + // for these. + + unsigned UseIndex = std::distance(CS.arg_begin(), U); + + // U cannot be the callee operand use: since we're exploring the + // transitive uses of an Argument, having such a use be a callee would + // imply the CallSite is an indirect call or invoke; and we'd take the + // early exit above. + assert(UseIndex < CS.data_operands_size() && + "Data operand use expected!"); + + bool IsOperandBundleUse = UseIndex >= CS.getNumArgOperands(); + + if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { + assert(F->isVarArg() && "More params than args in non-varargs call"); + return Attribute::None; } + + Captures &= !CS.doesNotCapture(UseIndex); + + // Since the optimizer (by design) cannot see the data flow corresponding + // to an operand bundle use, these cannot participate in the optimistic SCC + // analysis. Instead, we model the operand bundle uses as arguments in + // a call to a function external to the SCC. + if (!SCCNodes.count(&*std::next(F->arg_begin(), UseIndex)) || + IsOperandBundleUse) { + + // The accessors used on CallSite here do the right thing for calls and + // invokes with operand bundles. + + if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(UseIndex)) + return Attribute::None; + if (!CS.doesNotAccessMemory(UseIndex)) + IsRead = true; + } + AddUsersToWorklistIfCapturing(); break; } @@ -517,21 +505,10 @@ determinePointerReadAttrs(Argument *A, return IsRead ? Attribute::ReadOnly : Attribute::ReadNone; } -/// AddArgumentAttrs - Deduce nocapture attributes for the SCC. -bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { +/// Deduce nocapture attributes for the SCC. +static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; - SmallPtrSet<Function*, 8> SCCNodes; - - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - if (F && !F->isDeclaration() && !F->mayBeOverridden() && - !F->hasFnAttribute(Attribute::OptimizeNone)) - SCCNodes.insert(F); - } - ArgumentGraph AG; AttrBuilder B; @@ -539,14 +516,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // Check each function in turn, determining which pointer arguments are not // captured.
- for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or function we're trying not to optimize - only a problem - // for arguments that we pass to it. - continue; - + for (Function *F : SCCNodes) { // Definitions with weak linkage may be overridden at linktime with // something that captures pointers, so treat them like declarations. if (F->isDeclaration() || F->mayBeOverridden()) @@ -556,8 +526,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // a value can't capture arguments. Don't analyze them. if (F->onlyReadsMemory() && F->doesNotThrow() && F->getReturnType()->isVoidTy()) { - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); - A != E; ++A) { + for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; + ++A) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; @@ -567,26 +537,30 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { continue; } - for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); - A != E; ++A) { - if (!A->getType()->isPointerTy()) continue; + for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; + ++A) { + if (!A->getType()->isPointerTy()) + continue; bool HasNonLocalUses = false; if (!A->hasNoCaptureAttr()) { ArgumentUsesTracker Tracker(SCCNodes); - PointerMayBeCaptured(A, &Tracker); + PointerMayBeCaptured(&*A, &Tracker); if (!Tracker.Captured) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. - A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B)); + A->addAttr( + AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. - ArgumentGraphNode *Node = AG[A]; - for (SmallVectorImpl<Argument*>::iterator UI = Tracker.Uses.begin(), - UE = Tracker.Uses.end(); UI != UE; ++UI) { + ArgumentGraphNode *Node = AG[&*A]; + for (SmallVectorImpl<Argument *>::iterator + UI = Tracker.Uses.begin(), + UE = Tracker.Uses.end(); + UI != UE; ++UI) { Node->Uses.push_back(AG[*UI]); if (*UI != A) HasNonLocalUses = true; @@ -600,9 +574,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // Note that we don't allow any calls at all here, or else our result // will be dependent on the iteration order through the functions in the // SCC. - SmallPtrSet<Argument*, 8> Self; - Self.insert(A); - Attribute::AttrKind R = determinePointerReadAttrs(A, Self); + SmallPtrSet<Argument *, 8> Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); if (R != Attribute::None) { AttrBuilder B; B.addAttribute(R); @@ -621,10 +595,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // made. If the definition doesn't have a 'nocapture' attribute by now, it // captures. - for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG); !I.isAtEnd(); ++I) { + for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) { const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I; if (ArgumentSCC.size() == 1) { - if (!ArgumentSCC[0]->Definition) continue; // synthetic root node + if (!ArgumentSCC[0]->Definition) + continue; // synthetic root node // eg. 
"void f(int* x) { if (...) f(x); }" if (ArgumentSCC[0]->Uses.size() == 1 && @@ -646,9 +621,10 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { SCCCaptured = true; } } - if (SCCCaptured) continue; + if (SCCCaptured) + continue; - SmallPtrSet<Argument*, 8> ArgumentSCCNodes; + SmallPtrSet<Argument *, 8> ArgumentSCCNodes; // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for // quickly looking up whether a given Argument is in this ArgumentSCC. for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E; ++I) { @@ -658,8 +634,9 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) { ArgumentGraphNode *N = *I; - for (SmallVectorImpl<ArgumentGraphNode*>::iterator UI = N->Uses.begin(), - UE = N->Uses.end(); UI != UE; ++UI) { + for (SmallVectorImpl<ArgumentGraphNode *>::iterator UI = N->Uses.begin(), + UE = N->Uses.end(); + UI != UE; ++UI) { Argument *A = (*UI)->Definition; if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) continue; @@ -667,7 +644,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { break; } } - if (SCCCaptured) continue; + if (SCCCaptured) + continue; for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; @@ -704,8 +682,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { if (ReadAttr != Attribute::None) { AttrBuilder B, R; B.addAttribute(ReadAttr); - R.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + R.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; // Clear out existing readonly/readnone attributes @@ -720,10 +697,11 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { return Changed; } -/// IsFunctionMallocLike - A function is malloc-like if it returns either null -/// or a pointer that doesn't alias any other pointer visible to the caller. -bool FunctionAttrs::IsFunctionMallocLike(Function *F, - SmallPtrSet<Function*, 8> &SCCNodes) const { +/// Tests whether a function is "malloc-like". +/// +/// A function is "malloc-like" if it returns either null or a pointer that +/// doesn't alias any other pointer visible to the caller. +static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { SmallSetVector<Value *, 8> FlowsToReturn; for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator())) @@ -744,39 +722,38 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, if (Instruction *RVI = dyn_cast<Instruction>(RetVal)) switch (RVI->getOpcode()) { - // Extend the analysis by looking upwards. - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::AddrSpaceCast: - FlowsToReturn.insert(RVI->getOperand(0)); - continue; - case Instruction::Select: { - SelectInst *SI = cast<SelectInst>(RVI); - FlowsToReturn.insert(SI->getTrueValue()); - FlowsToReturn.insert(SI->getFalseValue()); - continue; - } - case Instruction::PHI: { - PHINode *PN = cast<PHINode>(RVI); - for (Value *IncValue : PN->incoming_values()) - FlowsToReturn.insert(IncValue); - continue; - } + // Extend the analysis by looking upwards. 
+ case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::AddrSpaceCast: + FlowsToReturn.insert(RVI->getOperand(0)); + continue; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(RVI); + FlowsToReturn.insert(SI->getTrueValue()); + FlowsToReturn.insert(SI->getFalseValue()); + continue; + } + case Instruction::PHI: { + PHINode *PN = cast<PHINode>(RVI); + for (Value *IncValue : PN->incoming_values()) + FlowsToReturn.insert(IncValue); + continue; + } - // Check whether the pointer came from an allocation. - case Instruction::Alloca: + // Check whether the pointer came from an allocation. + case Instruction::Alloca: + break; + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(RVI); + if (CS.paramHasAttr(0, Attribute::NoAlias)) + break; + if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) break; - case Instruction::Call: - case Instruction::Invoke: { - CallSite CS(RVI); - if (CS.paramHasAttr(0, Attribute::NoAlias)) - break; - if (CS.getCalledFunction() && - SCCNodes.count(CS.getCalledFunction())) - break; - } // fall-through - default: - return false; // Did not come from an allocation. + } // fall-through + default: + return false; // Did not come from an allocation. } if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false)) @@ -786,24 +763,11 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F, return true; } -/// AddNoAliasAttrs - Deduce noalias attributes for the SCC. -bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { - SmallPtrSet<Function*, 8> SCCNodes; - - // Fill SCCNodes with the elements of the SCC. Used for quickly - // looking up whether a given CallGraphNode is in this SCC. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) - SCCNodes.insert((*I)->getFunction()); - +/// Deduce noalias attributes for the SCC. +static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { // Check each function in turn, determining which functions return noalias // pointers. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); - - if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) - // External node or node we don't want to optimize - skip it; - return false; - + for (Function *F : SCCNodes) { // Already noalias. if (F->doesNotAlias(0)) continue; @@ -813,18 +777,17 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { if (F->isDeclaration() || F->mayBeOverridden()) return false; - // We annotate noalias return values, which are only applicable to + // We annotate noalias return values, which are only applicable to // pointer types. if (!F->getReturnType()->isPointerTy()) continue; - if (!IsFunctionMallocLike(F, SCCNodes)) + if (!isFunctionMallocLike(F, SCCNodes)) return false; } bool MadeChange = false; - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + for (Function *F : SCCNodes) { if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy()) continue; @@ -836,880 +799,308 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { return MadeChange; } -/// inferPrototypeAttributes - Analyze the name and prototype of the -/// given function and set any applicable attributes. Returns true -/// if any attributes were set and false otherwise. -bool FunctionAttrs::inferPrototypeAttributes(Function &F) { - if (F.hasFnAttribute(Attribute::OptimizeNone)) - return false; +/// Tests whether this function is known to not return null. 
+/// +/// Requires that the function returns a pointer. +/// +/// Returns true if it believes the function will not return a null, and sets +/// \p Speculative based on whether the returned conclusion is a speculative +/// conclusion due to SCC calls. +static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, + const TargetLibraryInfo &TLI, bool &Speculative) { + assert(F->getReturnType()->isPointerTy() && + "nonnull only meaningful on pointer types"); + Speculative = false; - FunctionType *FTy = F.getFunctionType(); - LibFunc::Func TheLibFunc; - if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc))) - return false; + SmallSetVector<Value *, 8> FlowsToReturn; + for (BasicBlock &BB : *F) + if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) + FlowsToReturn.insert(Ret->getReturnValue()); - switch (TheLibFunc) { - case LibFunc::strlen: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::strchr: - case LibFunc::strrchr: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isIntegerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - break; - case LibFunc::strtol: - case LibFunc::strtod: - case LibFunc::strtof: - case LibFunc::strtoul: - case LibFunc::strtoll: - case LibFunc::strtold: - case LibFunc::strtoull: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::strcpy: - case LibFunc::stpcpy: - case LibFunc::strcat: - case LibFunc::strncat: - case LibFunc::strncpy: - case LibFunc::stpncpy: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::strxfrm: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::strcmp: //0,1 - case LibFunc::strspn: // 0,1 - case LibFunc::strncmp: // 0,1 - case LibFunc::strcspn: //0,1 - case LibFunc::strcoll: //0,1 - case LibFunc::strcasecmp: // 0,1 - case LibFunc::strncasecmp: // - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::strstr: - case LibFunc::strpbrk: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::strtok: - case LibFunc::strtok_r: - if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::scanf: - if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::setbuf: - case LibFunc::setvbuf: - if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::strdup: - case 
LibFunc::strndup: - if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::stat: - case LibFunc::statvfs: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::sscanf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::sprintf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::snprintf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 3); - break; - case LibFunc::setitimer: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::system: - if (FTy->getNumParams() != 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - // May throw; "system" is a valid pthread cancellation point. 
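An aside for readers tracing the removed switch: every case follows the same pattern, a prototype sanity check followed by per-index attribute setters defined earlier in this file. A minimal sketch of the likely shape of two such helpers, assuming this version's Function API (index 0 is the return value, argument indices are 1-based); the real helpers also bump statistics counters:

static void setDoesNotCapture(Function &F, unsigned N) {
  if (!F.doesNotCapture(N))
    F.setDoesNotCapture(N); // attaches Attribute::NoCapture at index N
}
static void setOnlyReadsMemory(Function &F, unsigned N) {
  if (!F.onlyReadsMemory(N))
    F.setOnlyReadsMemory(N); // attaches Attribute::ReadOnly at index N
}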
- setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::malloc: - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::memcmp: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::memchr: - case LibFunc::memrchr: - if (FTy->getNumParams() != 3) - return false; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - break; - case LibFunc::modf: - case LibFunc::modff: - case LibFunc::modfl: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::memcpy: - case LibFunc::memccpy: - case LibFunc::memmove: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::memalign: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotAlias(F, 0); - break; - case LibFunc::mkdir: - if (FTy->getNumParams() == 0 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::mktime: - if (FTy->getNumParams() == 0 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::realloc: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - break; - case LibFunc::read: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "read" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - break; - case LibFunc::rewind: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::rmdir: - case LibFunc::remove: - case LibFunc::realpath: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::rename: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::readlink: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::write: - if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "write" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::bcopy: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::bcmp: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::bzero: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::calloc: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::chmod: - case LibFunc::chown: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::ctermid: - case LibFunc::clearerr: - case LibFunc::closedir: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::atoi: - case LibFunc::atol: - case LibFunc::atof: - case LibFunc::atoll: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::access: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::fopen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fdopen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::feof: - case LibFunc::free: - case LibFunc::fseek: - case LibFunc::ftell: - case LibFunc::fgetc: - case LibFunc::fseeko: - case LibFunc::ftello: - case LibFunc::fileno: - case LibFunc::fflush: - case LibFunc::fclose: - case LibFunc::fsetpos: - case LibFunc::flockfile: - case LibFunc::funlockfile: - case LibFunc::ftrylockfile: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::ferror: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F); - break; - case LibFunc::fputc: - case LibFunc::fstat: - case LibFunc::frexp: - case LibFunc::frexpf: - case LibFunc::frexpl: - case LibFunc::fstatvfs: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::fgets: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - 
!FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 3); - break; - case LibFunc::fread: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(3)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 4); - break; - case LibFunc::fwrite: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(3)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 4); - break; - case LibFunc::fputs: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::fscanf: - case LibFunc::fprintf: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fgetpos: - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - case LibFunc::getc: - case LibFunc::getlogin_r: - case LibFunc::getc_unlocked: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::getenv: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::gets: - case LibFunc::getchar: - setDoesNotThrow(F); - break; - case LibFunc::getitimer: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::getpwnam: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::ungetc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::uname: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::unlink: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::unsetenv: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::utime: - case LibFunc::utimes: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::putc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::puts: - case LibFunc::printf: - case 
LibFunc::perror: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::pread: - if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "pread" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - break; - case LibFunc::pwrite: - if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) - return false; - // May throw; "pwrite" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::putchar: - setDoesNotThrow(F); - break; - case LibFunc::popen: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::pclose: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::vscanf: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::vsscanf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::vfscanf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::valloc: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::vprintf: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::vfprintf: - case LibFunc::vsprintf: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::vsnprintf: - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - setOnlyReadsMemory(F, 3); - break; - case LibFunc::open: - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) - return false; - // May throw; "open" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::opendir: - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::tmpfile: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::times: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::htonl: - case LibFunc::htons: - case LibFunc::ntohl: - case LibFunc::ntohs: - setDoesNotThrow(F); - setDoesNotAccessMemory(F); - break; - case LibFunc::lstat: - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::lchown: - if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::qsort: - if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) - return false; - // May throw; places call through function pointer. - setDoesNotCapture(F, 4); - break; - case LibFunc::dunder_strdup: - case LibFunc::dunder_strndup: - if (FTy->getNumParams() < 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::dunder_strtok_r: - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::under_IO_getc: - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::under_IO_putc: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::dunder_isoc99_scanf: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::stat64: - case LibFunc::lstat64: - case LibFunc::statvfs64: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::dunder_isoc99_sscanf: - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case LibFunc::fopen64: - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - setOnlyReadsMemory(F, 1); - setOnlyReadsMemory(F, 2); - break; - case 
LibFunc::fseeko64: - case LibFunc::ftello64: - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - break; - case LibFunc::tmpfile64: - if (!FTy->getReturnType()->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - break; - case LibFunc::fstat64: - case LibFunc::fstatvfs64: - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return false; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - break; - case LibFunc::open64: - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + for (unsigned i = 0; i != FlowsToReturn.size(); ++i) { + Value *RetVal = FlowsToReturn[i]; + + // If this value is locally known to be non-null, we're good + if (isKnownNonNull(RetVal, &TLI)) + continue; + + // Otherwise, we need to look upwards since we can't make any local + // conclusions. + Instruction *RVI = dyn_cast<Instruction>(RetVal); + if (!RVI) return false; - // May throw; "open" is a valid pthread cancellation point. - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F, 1); - break; - case LibFunc::gettimeofday: - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) + switch (RVI->getOpcode()) { + // Extend the analysis by looking upwards. + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::AddrSpaceCast: + FlowsToReturn.insert(RVI->getOperand(0)); + continue; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(RVI); + FlowsToReturn.insert(SI->getTrueValue()); + FlowsToReturn.insert(SI->getFalseValue()); + continue; + } + case Instruction::PHI: { + PHINode *PN = cast<PHINode>(RVI); + for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + FlowsToReturn.insert(PN->getIncomingValue(i)); + continue; + } + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(RVI); + Function *Callee = CS.getCalledFunction(); + // A call to a node within the SCC is assumed to return null until + // proven otherwise + if (Callee && SCCNodes.count(Callee)) { + Speculative = true; + continue; + } return false; - // Currently some platforms have the restrict keyword on the arguments to - // gettimeofday. To be conservative, do not add noalias to gettimeofday's - // arguments. - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - break; - default: - // Didn't mark any attributes. - return false; + } + default: + return false; // Unknown source, may be null + }; + llvm_unreachable("should have either continued or returned"); } return true; } -/// annotateLibraryCalls - Adds attributes to well-known standard library -/// call declarations. -bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { +/// Deduce nonnull attributes for the SCC. +static bool addNonNullAttrs(const SCCNodeSet &SCCNodes, + const TargetLibraryInfo &TLI) { + // Speculative that all functions in the SCC return only nonnull + // pointers. We may refute this as we analyze functions. + bool SCCReturnsNonNull = true; + bool MadeChange = false; - // Check each function in turn annotating well-known library function - // declarations with attributes. - for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - Function *F = (*I)->getFunction(); + // Check each function in turn, determining which functions return nonnull + // pointers. + for (Function *F : SCCNodes) { + // Already nonnull. 
+    if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                        Attribute::NonNull))
+      continue;
+
+    // Definitions with weak linkage may be overridden at link time, so
+    // treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden())
+      return false;
+
+    // We annotate nonnull return values, which are only applicable to
+    // pointer types.
+    if (!F->getReturnType()->isPointerTy())
+      continue;
+
+    bool Speculative = false;
+    if (isReturnNonNull(F, SCCNodes, TLI, Speculative)) {
+      if (!Speculative) {
+        // Mark the function eagerly since we may discover a function
+        // which prevents us from speculating about the entire SCC.
+        DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
+        F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+        ++NumNonNullReturn;
+        MadeChange = true;
+      }
+      continue;
+    }
+    // At least one function returns something which could be null; we can't
+    // speculate any more.
+    SCCReturnsNonNull = false;
+  }
+
+  if (SCCReturnsNonNull) {
+    for (Function *F : SCCNodes) {
+      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                          Attribute::NonNull) ||
+          !F->getReturnType()->isPointerTy())
+        continue;
-    if (F && F->isDeclaration())
-      MadeChange |= inferPrototypeAttributes(*F);
+      DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
+      F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+      ++NumNonNullReturn;
+      MadeChange = true;
+    }
  }

  return MadeChange;
}

-bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
-  AA = &getAnalysis<AliasAnalysis>();
+static bool setDoesNotRecurse(Function &F) {
+  if (F.doesNotRecurse())
+    return false;
+  F.setDoesNotRecurse();
+  ++NumNoRecurse;
+  return true;
+}
+
+static bool addNoRecurseAttrs(const CallGraphSCC &SCC) {
+  // Try to identify functions that do not recurse.
+
+  // If the SCC contains multiple nodes we know for sure there is recursion.
+  if (!SCC.isSingular())
+    return false;
+
+  const CallGraphNode *CGN = *SCC.begin();
+  Function *F = CGN->getFunction();
+  if (!F || F->isDeclaration() || F->doesNotRecurse())
+    return false;
+
+  // If all of the calls in F are identifiable and are to norecurse functions, F
+  // is norecurse. This check also detects self-recursion, as F is not currently
+  // marked norecurse, so any call from F to F will not be treated as norecurse.
+  if (std::all_of(CGN->begin(), CGN->end(),
+                  [](const CallGraphNode::CallRecord &CR) {
+                    Function *F = CR.second->getFunction();
+                    return F && F->doesNotRecurse();
+                  }))
+    // Every call in F resolves to a norecurse function, so F cannot recurse.
+    return setDoesNotRecurse(*F);
+
+  // Nothing else we can deduce usefully during the postorder traversal.
+  return false;
+}
+
+bool PostOrderFunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  bool Changed = false;
+
+  // We compute dedicated AA results for each function in the SCC as needed. We
+  // use a lambda referencing external objects so that they live long enough to
+  // be queried, but we re-use them each time.
+  Optional<BasicAAResult> BAR;
+  Optional<AAResults> AAR;
+  auto AARGetter = [&](Function &F) -> AAResults & {
+    BAR.emplace(createLegacyPMBasicAAResult(*this, F));
+    AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
+    return *AAR;
+  };
+
+  // Fill SCCNodes with the elements of the SCC. Used for quickly looking up
+  // whether a given CallGraphNode is in this SCC. Also track whether there are
+  // any external or opt-none nodes that will prevent us from optimizing any
+  // part of the SCC.
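To make the Speculative flag in addNonNullAttrs above concrete, here is a hypothetical input (illustration only, not from the patch) that only the SCC-level speculation can prove returns nonnull:

// While analyzing the SCC {pick}, the recursive call's result is
// optimistically assumed nonnull, and the base case returns a global's
// address, which isKnownNonNull proves locally; the attribute is
// committed only once every return in the SCC checks out.
static int G;
int *pick(int N) {
  return N > 0 ? pick(N - 1) : &G;
}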
+  SCCNodeSet SCCNodes;
+  bool ExternalNode = false;
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (!F || F->hasFnAttribute(Attribute::OptimizeNone)) {
+      // External node or function we're trying not to optimize - we both avoid
+      // transforming them and avoid leveraging the information they provide.
+      ExternalNode = true;
+      continue;
+    }
+
+    SCCNodes.insert(F);
+  }
+
+  Changed |= addReadAttrs(SCCNodes, AARGetter);
+  Changed |= addArgumentAttrs(SCCNodes);
+
+  // If we have no external nodes participating in the SCC, we can deduce some
+  // more precise attributes as well.
+  if (!ExternalNode) {
+    Changed |= addNoAliasAttrs(SCCNodes);
+    Changed |= addNonNullAttrs(SCCNodes, *TLI);
+  }
+
+  Changed |= addNoRecurseAttrs(SCC);
+  return Changed;
+}
+
+namespace {
+/// A pass to do RPO deduction and propagation of function attributes.
+///
+/// This pass provides a general RPO or "top down" propagation of
+/// function attributes. For a few (rare) cases, we can deduce significantly
+/// more about function attributes by working in RPO, so this pass
+/// provides the complement to the post-order pass above where the majority of
+/// deduction is performed.
+// FIXME: Currently there is no RPO CGSCC pass structure to slide into and so
+// this is a boring module pass, but eventually it should be an RPO CGSCC pass
+// when such infrastructure is available.
+struct ReversePostOrderFunctionAttrs : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  ReversePostOrderFunctionAttrs() : ModulePass(ID) {
+    initializeReversePostOrderFunctionAttrsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<CallGraphWrapperPass>();
+  }
+};
+}
+
+char ReversePostOrderFunctionAttrs::ID = 0;
+INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrs, "rpo-functionattrs",
+                      "Deduce function attributes in RPO", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(ReversePostOrderFunctionAttrs, "rpo-functionattrs",
+                    "Deduce function attributes in RPO", false, false)
+
+Pass *llvm::createReversePostOrderFunctionAttrsPass() {
+  return new ReversePostOrderFunctionAttrs();
+}
+
+static bool addNoRecurseAttrsTopDown(Function &F) {
+  // We check the preconditions for the function prior to calling this to avoid
+  // the cost of building up a reversible post-order list. We assert them here
+  // to make sure none of the invariants this relies on were violated.
+  assert(!F.isDeclaration() && "Cannot deduce norecurse without a definition!");
+  assert(!F.doesNotRecurse() &&
+         "This function has already been deduced as norecurse!");
+  assert(F.hasInternalLinkage() &&
+         "Can only do top-down deduction for internal linkage functions!");
+
+  // If F is internal and all of its uses are calls from non-recursive
+  // functions, then none of its calls could in fact recurse without going
+  // through a function marked norecurse, and so we can mark this function too
+  // as norecurse. Note that the uses must actually be calls -- otherwise
+  // a pointer to this function could be returned from a norecurse function but
+  // this function could be recursively (indirectly) called. Note that this
+  // also detects whether F is directly recursive, as F is not yet marked as
+  // a norecurse function.
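A hypothetical source-level picture of the condition the loop below enforces (illustration only, not from the patch):

// 'leaf' has internal linkage and its only use is a direct call from
// 'driver'; if 'driver' is already marked norecurse, every path into
// 'leaf' goes through a norecurse caller, so 'leaf' can be marked too.
static int leaf(int X) { return X + 1; } // internal linkage
int driver(int X) { return leaf(X); }    // assume already norecurse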
+  for (auto *U : F.users()) {
+    auto *I = dyn_cast<Instruction>(U);
+    if (!I)
+      return false;
+    CallSite CS(I);
+    if (!CS || !CS.getParent()->getParent()->doesNotRecurse())
+      return false;
+  }
+  return setDoesNotRecurse(F);
+}
+
+bool ReversePostOrderFunctionAttrs::runOnModule(Module &M) {
+  // We only have a post-order SCC traversal (because SCCs are inherently
+  // discovered in post-order), so we accumulate them in a vector and then walk
+  // it in reverse. This is simpler than using the RPO iterator infrastructure
+  // because we need to combine SCC detection and the PO walk of the call
+  // graph. We can also cheat egregiously because we're primarily interested in
+  // synthesizing norecurse, and so we need only save the singleton SCCs; SCCs
+  // with multiple functions in them will clearly be recursive.
+  auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  SmallVector<Function *, 16> Worklist;
+  for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+    if (I->size() != 1)
+      continue;
+
+    Function *F = I->front()->getFunction();
+    if (F && !F->isDeclaration() && !F->doesNotRecurse() &&
+        F->hasInternalLinkage())
+      Worklist.push_back(F);
+  }
+
+  bool Changed = false;
+  for (auto *F : reverse(Worklist))
+    Changed |= addNoRecurseAttrsTopDown(*F);
-  bool Changed = annotateLibraryCalls(SCC);
-  Changed |= AddReadAttrs(SCC);
-  Changed |= AddArgumentAttrs(SCC);
-  Changed |= AddNoAliasAttrs(SCC);
  return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
new file mode 100644
index 0000000..5e0df95
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -0,0 +1,445 @@
+//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Function import based on summaries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionImport.h"
+
+#include "llvm/ADT/StringSet.h"
+#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Object/FunctionIndexObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include <map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "function-import"
+
+/// Limit on instruction count of imported functions.
+static cl::opt<unsigned> ImportInstrLimit(
+    "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
+    cl::desc("Only import functions with fewer than N instructions"));
+
+// Lazily load a module from \p FileName in \p Context.
+static std::unique_ptr<Module> loadFile(const std::string &FileName,
+                                        LLVMContext &Context) {
+  SMDiagnostic Err;
+  DEBUG(dbgs() << "Loading '" << FileName << "'\n");
+  // Metadata isn't loaded or linked until after all functions are
+  // imported, after which it will be materialized and linked.
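A usage sketch for loadFile (the file name is hypothetical), mirroring the ModuleLoader lambda that FunctionImportPass builds near the end of this file:

LLVMContext Context;
auto ModuleLoader = [&Context](StringRef Identifier) {
  return loadFile(Identifier.str(), Context);
};
std::unique_ptr<Module> M = ModuleLoader("other_module.bc");
// Function bodies and metadata remain unparsed until materialized.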
+  std::unique_ptr<Module> Result =
+      getLazyIRFileModule(FileName, Err, Context,
+                          /* ShouldLazyLoadMetadata = */ true);
+  if (!Result) {
+    Err.print("function-import", errs());
+    return nullptr;
+  }
+
+  return Result;
+}
+
+namespace {
+/// Helper to load a Module from a file on demand and cache it for subsequent
+/// queries. It can be used with the FunctionImporter.
+class ModuleLazyLoaderCache {
+  /// Cache of lazily loaded modules for import.
+  StringMap<std::unique_ptr<Module>> ModuleMap;
+
+  /// Retrieve a Module from the cache or lazily load it on demand.
+  std::function<std::unique_ptr<Module>(StringRef FileName)> createLazyModule;
+
+public:
+  /// Create the loader; the Modules will be initialized in \p Context.
+  ModuleLazyLoaderCache(std::function<
+      std::unique_ptr<Module>(StringRef FileName)> createLazyModule)
+      : createLazyModule(createLazyModule) {}
+
+  /// Retrieve a Module from the cache or lazily load it on demand.
+  Module &operator()(StringRef FileName);
+
+  std::unique_ptr<Module> takeModule(StringRef FileName) {
+    auto I = ModuleMap.find(FileName);
+    assert(I != ModuleMap.end());
+    std::unique_ptr<Module> Ret = std::move(I->second);
+    ModuleMap.erase(I);
+    return Ret;
+  }
+};
+
+// Get a Module for \p FileName from the cache, or load it lazily.
+Module &ModuleLazyLoaderCache::operator()(StringRef Identifier) {
+  auto &Module = ModuleMap[Identifier];
+  if (!Module)
+    Module = createLazyModule(Identifier);
+  return *Module;
+}
+} // anonymous namespace
+
+/// Walk through the instructions in \p F looking for external
+/// calls not already in the \p CalledFunctions set. If any are
+/// found, they are added to the \p Worklist for importing.
+static void findExternalCalls(const Module &DestModule, Function &F,
+                              const FunctionInfoIndex &Index,
+                              StringSet<> &CalledFunctions,
+                              SmallVector<StringRef, 64> &Worklist) {
+  // We need to suffix internal function calls imported from other modules;
+  // prepare the suffix ahead of time.
+  std::string Suffix;
+  if (F.getParent() != &DestModule)
+    Suffix =
+        (Twine(".llvm.") +
+         Twine(Index.getModuleId(F.getParent()->getModuleIdentifier()))).str();
+
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (isa<CallInst>(I)) {
+        auto CalledFunction = cast<CallInst>(I).getCalledFunction();
+        // Insert any new external calls that have not already been
+        // added to the set/worklist.
+        if (!CalledFunction || !CalledFunction->hasName())
+          continue;
+        // Ignore intrinsics early.
+        if (CalledFunction->isIntrinsic()) {
+          assert(CalledFunction->getIntrinsicID() != 0);
+          continue;
+        }
+        auto ImportedName = CalledFunction->getName();
+        auto Renamed = (ImportedName + Suffix).str();
+        // Rename internal functions.
+        if (CalledFunction->hasInternalLinkage()) {
+          ImportedName = Renamed;
+        }
+        auto It = CalledFunctions.insert(ImportedName);
+        if (!It.second) {
+          // This is a call to a function we already considered, skip.
+          continue;
+        }
+        // Ignore functions already present in the destination module.
+        auto *SrcGV = DestModule.getNamedValue(ImportedName);
+        if (SrcGV) {
+          if (GlobalAlias *SGA = dyn_cast<GlobalAlias>(SrcGV))
+            SrcGV = SGA->getBaseObject();
+          assert(isa<Function>(SrcGV) && "Name collision during import");
+          if (!cast<Function>(SrcGV)->isDeclaration()) {
+            DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Ignoring "
+                         << ImportedName << " already in DestinationModule\n");
+            continue;
+          }
+        }
+
+        Worklist.push_back(It.first->getKey());
+        DEBUG(dbgs() << DestModule.getModuleIdentifier()
+                     << ": Adding callee for : " << ImportedName << " : "
+                     << F.getName() << "\n");
+      }
+    }
+  }
+}
+
+// Helper function: given a worklist and an index, processes the entire
+// worklist and decides what to import based on the summary information.
+//
+// Nothing is actually imported; functions are materialized in their source
+// module and analyzed there.
+//
+// \p ModuleToFunctionsToImportMap is filled with the set of Functions to
+// import per Module.
+static void GetImportList(Module &DestModule,
+                          SmallVector<StringRef, 64> &Worklist,
+                          StringSet<> &CalledFunctions,
+                          std::map<StringRef, DenseSet<const GlobalValue *>>
+                              &ModuleToFunctionsToImportMap,
+                          const FunctionInfoIndex &Index,
+                          ModuleLazyLoaderCache &ModuleLoaderCache) {
+  while (!Worklist.empty()) {
+    auto CalledFunctionName = Worklist.pop_back_val();
+    DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Process import for "
+                 << CalledFunctionName << "\n");
+
+    // Try to get a summary for this function call.
+    auto InfoList = Index.findFunctionInfoList(CalledFunctionName);
+    if (InfoList == Index.end()) {
+      DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": No summary for "
+                   << CalledFunctionName << " Ignoring.\n");
+      continue;
+    }
+    assert(!InfoList->second.empty() && "No summary, error at import?");
+
+    // A comdat can have multiple entries. FIXME: what do we do with them?
+    auto &Info = InfoList->second[0];
+    assert(Info && "Nullptr in list, error importing summaries?\n");
+
+    auto *Summary = Info->functionSummary();
+    if (!Summary) {
+      // FIXME: in case we are lazy-loading summaries, we can do it now.
+      DEBUG(dbgs() << DestModule.getModuleIdentifier()
+                   << ": Missing summary for " << CalledFunctionName
+                   << ", error at import?\n");
+      llvm_unreachable("Missing summary");
+    }
+
+    if (Summary->instCount() > ImportInstrLimit) {
+      DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Skip import of "
+                   << CalledFunctionName << " with " << Summary->instCount()
+                   << " instructions (limit " << ImportInstrLimit << ")\n");
+      continue;
+    }
+
+    // Get the module path from the summary.
+    auto ModuleIdentifier = Summary->modulePath();
+    DEBUG(dbgs() << DestModule.getModuleIdentifier() << ": Importing "
+                 << CalledFunctionName << " from " << ModuleIdentifier << "\n");
+
+    auto &SrcModule = ModuleLoaderCache(ModuleIdentifier);
+
+    // The function that we will import!
+    GlobalValue *SGV = SrcModule.getNamedValue(CalledFunctionName);
+
+    if (!SGV) {
+      // The destination module references functions by their renamed names
+      // when importing a function that was originally local in the source
+      // module. The source module we have might not have been renamed, so we
+      // try to remove the suffix added during the renaming to recover the
+      // original name in the source module.
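A concrete, hypothetical instance of the renaming scheme described above:

// An internal "foo" promoted out of a module whose id is 42 is referenced
// as "foo.llvm.42" by importers; on a lookup miss the suffix is split back
// off to find the original symbol in the source module.
StringRef Renamed("foo.llvm.42");
std::pair<StringRef, StringRef> Split = Renamed.split(".llvm.");
// Split.first == "foo", Split.second == "42"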
+      std::pair<StringRef, StringRef> Split =
+          CalledFunctionName.split(".llvm.");
+      SGV = SrcModule.getNamedValue(Split.first);
+      assert(SGV && "Can't find function to import in source module");
+    }
+    if (!SGV) {
+      report_fatal_error(Twine("Can't load function '") + CalledFunctionName +
+                         "' in Module '" + SrcModule.getModuleIdentifier() +
+                         "', error in the summary?\n");
+    }
+
+    Function *F = dyn_cast<Function>(SGV);
+    if (!F && isa<GlobalAlias>(SGV)) {
+      auto *SGA = dyn_cast<GlobalAlias>(SGV);
+      F = dyn_cast<Function>(SGA->getBaseObject());
+      CalledFunctionName = F->getName();
+    }
+    assert(F && "Imported Function is ... not a Function");
+
+    // We cannot import weak_any functions/aliases without possibly affecting
+    // the order they are seen and selected by the linker, changing program
+    // semantics.
+    if (SGV->hasWeakAnyLinkage()) {
+      DEBUG(dbgs() << DestModule.getModuleIdentifier()
+                   << ": Ignoring import request for weak-any "
+                   << (isa<Function>(SGV) ? "function " : "alias ")
+                   << CalledFunctionName << " from "
+                   << SrcModule.getModuleIdentifier() << "\n");
+      continue;
+    }
+
+    // Add the function to the import list.
+    auto &Entry = ModuleToFunctionsToImportMap[SrcModule.getModuleIdentifier()];
+    Entry.insert(F);
+
+    // Process the newly imported function and add its callees to the worklist.
+    F->materialize();
+    findExternalCalls(DestModule, *F, Index, CalledFunctions, Worklist);
+  }
+}
+
+// Automatically import functions in Module \p DestModule based on the
+// summaries index.
+//
+// The current implementation imports every called function that exists in the
+// summaries index.
+bool FunctionImporter::importFunctions(Module &DestModule) {
+  DEBUG(dbgs() << "Starting import for Module "
+               << DestModule.getModuleIdentifier() << "\n");
+  unsigned ImportedCount = 0;
+
+  /// First step is collecting the called external functions.
+  StringSet<> CalledFunctions;
+  SmallVector<StringRef, 64> Worklist;
+  for (auto &F : DestModule) {
+    if (F.isDeclaration() || F.hasFnAttribute(Attribute::OptimizeNone))
+      continue;
+    findExternalCalls(DestModule, F, Index, CalledFunctions, Worklist);
+  }
+  if (Worklist.empty())
+    return false;
+
+  /// Second step: for every call to an external function, try to import it.
+
+  // Linker that will be used for importing functions.
+  Linker TheLinker(DestModule);
+
+  // Map of Module -> List of Functions to import from that Module.
+  std::map<StringRef, DenseSet<const GlobalValue *>>
+      ModuleToFunctionsToImportMap;
+
+  // Analyze the summaries and get the list of functions to import by
+  // populating ModuleToFunctionsToImportMap.
+  ModuleLazyLoaderCache ModuleLoaderCache(ModuleLoader);
+  GetImportList(DestModule, Worklist, CalledFunctions,
+                ModuleToFunctionsToImportMap, Index, ModuleLoaderCache);
+  assert(Worklist.empty() && "Worklist hasn't been flushed in GetImportList");
+
+  StringMap<std::unique_ptr<DenseMap<unsigned, MDNode *>>>
+      ModuleToTempMDValsMap;
+
+  // Do the actual import of functions now, one Module at a time.
+  for (auto &FunctionsToImportPerModule : ModuleToFunctionsToImportMap) {
+    // Get the module for the import.
+    auto &FunctionsToImport = FunctionsToImportPerModule.second;
+    std::unique_ptr<Module> SrcModule =
+        ModuleLoaderCache.takeModule(FunctionsToImportPerModule.first);
+    assert(&DestModule.getContext() == &SrcModule->getContext() &&
+           "Context mismatch");
+
+    // Save the mapping of value ids to temporary metadata created when
+    // importing this function.
If we have already imported from this module, + // add new temporary metadata to the existing mapping. + auto &TempMDVals = ModuleToTempMDValsMap[SrcModule->getModuleIdentifier()]; + if (!TempMDVals) + TempMDVals = llvm::make_unique<DenseMap<unsigned, MDNode *>>(); + + // Link in the specified functions. + if (TheLinker.linkInModule(std::move(SrcModule), Linker::Flags::None, + &Index, &FunctionsToImport, TempMDVals.get())) + report_fatal_error("Function Import: link error"); + + ImportedCount += FunctionsToImport.size(); + } + + // Now link in metadata for all modules from which we imported functions. + for (StringMapEntry<std::unique_ptr<DenseMap<unsigned, MDNode *>>> &SME : + ModuleToTempMDValsMap) { + // Load the specified source module. + auto &SrcModule = ModuleLoaderCache(SME.getKey()); + // The modules were created with lazy metadata loading. Materialize it + // now, before linking it. + SrcModule.materializeMetadata(); + UpgradeDebugInfo(SrcModule); + + // Link in all necessary metadata from this module. + if (TheLinker.linkInMetadata(SrcModule, SME.getValue().get())) + return false; + } + + DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module " + << DestModule.getModuleIdentifier() << "\n"); + return ImportedCount; +} + +/// Summary file to use for function importing when using -function-import from +/// the command line. +static cl::opt<std::string> + SummaryFile("summary-file", + cl::desc("The summary file to use for function importing.")); + +static void diagnosticHandler(const DiagnosticInfo &DI) { + raw_ostream &OS = errs(); + DiagnosticPrinterRawOStream DP(OS); + DI.print(DP); + OS << '\n'; +} + +/// Parse the function index out of an IR file and return the function +/// index object if found, or nullptr if not. +static std::unique_ptr<FunctionInfoIndex> +getFunctionIndexForFile(StringRef Path, std::string &Error, + DiagnosticHandlerFunction DiagnosticHandler) { + std::unique_ptr<MemoryBuffer> Buffer; + ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr = + MemoryBuffer::getFile(Path); + if (std::error_code EC = BufferOrErr.getError()) { + Error = EC.message(); + return nullptr; + } + Buffer = std::move(BufferOrErr.get()); + ErrorOr<std::unique_ptr<object::FunctionIndexObjectFile>> ObjOrErr = + object::FunctionIndexObjectFile::create(Buffer->getMemBufferRef(), + DiagnosticHandler); + if (std::error_code EC = ObjOrErr.getError()) { + Error = EC.message(); + return nullptr; + } + return (*ObjOrErr)->takeIndex(); +} + +namespace { +/// Pass that performs cross-module function import provided a summary file. +class FunctionImportPass : public ModulePass { + /// Optional function summary index to use for importing, otherwise + /// the summary-file option must be specified. 
+ const FunctionInfoIndex *Index; + +public: + /// Pass identification, replacement for typeid + static char ID; + + /// Specify pass name for debug output + const char *getPassName() const override { + return "Function Importing"; + } + + explicit FunctionImportPass(const FunctionInfoIndex *Index = nullptr) + : ModulePass(ID), Index(Index) {} + + bool runOnModule(Module &M) override { + if (SummaryFile.empty() && !Index) + report_fatal_error("error: -function-import requires -summary-file or " + "file from frontend\n"); + std::unique_ptr<FunctionInfoIndex> IndexPtr; + if (!SummaryFile.empty()) { + if (Index) + report_fatal_error("error: -summary-file and index from frontend\n"); + std::string Error; + IndexPtr = getFunctionIndexForFile(SummaryFile, Error, diagnosticHandler); + if (!IndexPtr) { + errs() << "Error loading file '" << SummaryFile << "': " << Error + << "\n"; + return false; + } + Index = IndexPtr.get(); + } + + // First we need to promote to global scope and rename any local values that + // are potentially exported to other modules. + if (renameModuleForThinLTO(M, Index)) { + errs() << "Error renaming module\n"; + return false; + } + + // Perform the import now. + auto ModuleLoader = [&M](StringRef Identifier) { + return loadFile(Identifier, M.getContext()); + }; + FunctionImporter Importer(*Index, ModuleLoader); + return Importer.importFunctions(M); + } +}; +} // anonymous namespace + +char FunctionImportPass::ID = 0; +INITIALIZE_PASS_BEGIN(FunctionImportPass, "function-import", + "Summary Based Function Import", false, false) +INITIALIZE_PASS_END(FunctionImportPass, "function-import", + "Summary Based Function Import", false, false) + +namespace llvm { +Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr) { + return new FunctionImportPass(Index); +} +} diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 61d0ff9..9b276ed 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -92,33 +92,28 @@ bool GlobalDCE::runOnModule(Module &M) { ComdatMembers.insert(std::make_pair(C, &GA)); // Loop over the module, adding globals which are obviously necessary. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (Function &F : M) { + Changed |= RemoveUnusedGlobalValue(F); // Functions with external linkage are needed if they have a body - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { - if (!I->isDiscardableIfUnused()) - GlobalIsNeeded(I); - } + if (!F.isDeclaration() && !F.hasAvailableExternallyLinkage()) + if (!F.isDiscardableIfUnused()) + GlobalIsNeeded(&F); } - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (GlobalVariable &GV : M.globals()) { + Changed |= RemoveUnusedGlobalValue(GV); // Externally visible & appending globals are needed, if they have an // initializer. - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) { - if (!I->isDiscardableIfUnused()) - GlobalIsNeeded(I); - } + if (!GV.isDeclaration() && !GV.hasAvailableExternallyLinkage()) + if (!GV.isDiscardableIfUnused()) + GlobalIsNeeded(&GV); } - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) { - Changed |= RemoveUnusedGlobalValue(*I); + for (GlobalAlias &GA : M.aliases()) { + Changed |= RemoveUnusedGlobalValue(GA); // Externally visible aliases are needed. 
- if (!I->isDiscardableIfUnused()) { - GlobalIsNeeded(I); - } + if (!GA.isDiscardableIfUnused()) + GlobalIsNeeded(&GA); } // Now that all globals which are needed are in the AliveGlobals set, we loop @@ -126,52 +121,50 @@ bool GlobalDCE::runOnModule(Module &M) { // // The first pass is to drop initializers of global variables which are dead. - std::vector<GlobalVariable*> DeadGlobalVars; // Keep track of dead globals - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - if (!AliveGlobals.count(I)) { - DeadGlobalVars.push_back(I); // Keep track of dead globals - if (I->hasInitializer()) { - Constant *Init = I->getInitializer(); - I->setInitializer(nullptr); + std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals + for (GlobalVariable &GV : M.globals()) + if (!AliveGlobals.count(&GV)) { + DeadGlobalVars.push_back(&GV); // Keep track of dead globals + if (GV.hasInitializer()) { + Constant *Init = GV.getInitializer(); + GV.setInitializer(nullptr); if (isSafeToDestroyConstant(Init)) Init->destroyConstant(); } } // The second pass drops the bodies of functions which are dead... - std::vector<Function*> DeadFunctions; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (!AliveGlobals.count(I)) { - DeadFunctions.push_back(I); // Keep track of dead globals - if (!I->isDeclaration()) - I->deleteBody(); + std::vector<Function *> DeadFunctions; + for (Function &F : M) + if (!AliveGlobals.count(&F)) { + DeadFunctions.push_back(&F); // Keep track of dead globals + if (!F.isDeclaration()) + F.deleteBody(); } // The third pass drops targets of aliases which are dead... std::vector<GlobalAlias*> DeadAliases; - for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; - ++I) - if (!AliveGlobals.count(I)) { - DeadAliases.push_back(I); - I->setAliasee(nullptr); + for (GlobalAlias &GA : M.aliases()) + if (!AliveGlobals.count(&GA)) { + DeadAliases.push_back(&GA); + GA.setAliasee(nullptr); } if (!DeadFunctions.empty()) { // Now that all interferences have been dropped, delete the actual objects // themselves. - for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadFunctions[i]); - M.getFunctionList().erase(DeadFunctions[i]); + for (Function *F : DeadFunctions) { + RemoveUnusedGlobalValue(*F); + M.getFunctionList().erase(F); } NumFunctions += DeadFunctions.size(); Changed = true; } if (!DeadGlobalVars.empty()) { - for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadGlobalVars[i]); - M.getGlobalList().erase(DeadGlobalVars[i]); + for (GlobalVariable *GV : DeadGlobalVars) { + RemoveUnusedGlobalValue(*GV); + M.getGlobalList().erase(GV); } NumVariables += DeadGlobalVars.size(); Changed = true; @@ -179,9 +172,9 @@ bool GlobalDCE::runOnModule(Module &M) { // Now delete any dead aliases. if (!DeadAliases.empty()) { - for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) { - RemoveUnusedGlobalValue(*DeadAliases[i]); - M.getAliasList().erase(DeadAliases[i]); + for (GlobalAlias *GA : DeadAliases) { + RemoveUnusedGlobalValue(*GA); + M.getAliasList().erase(GA); } NumAliases += DeadAliases.size(); Changed = true; @@ -222,21 +215,15 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) { // any globals used will be marked as needed. 
Function *F = cast<Function>(G); - if (F->hasPrefixData()) - MarkUsedGlobalsAsNeeded(F->getPrefixData()); - - if (F->hasPrologueData()) - MarkUsedGlobalsAsNeeded(F->getPrologueData()); + for (Use &U : F->operands()) + MarkUsedGlobalsAsNeeded(cast<Constant>(U.get())); - if (F->hasPersonalityFn()) - MarkUsedGlobalsAsNeeded(F->getPersonalityFn()); - - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U) - if (GlobalValue *GV = dyn_cast<GlobalValue>(*U)) + for (BasicBlock &BB : *F) + for (Instruction &I : BB) + for (Use &U : I.operands()) + if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) GlobalIsNeeded(GV); - else if (Constant *C = dyn_cast<Constant>(*U)) + else if (Constant *C = dyn_cast<Constant>(U)) MarkUsedGlobalsAsNeeded(C); } } @@ -247,9 +234,9 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) { // Loop over all of the operands of the constant, adding any globals they // use to the list of needed globals. - for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) { + for (Use &U : C->operands()) { // If we've already processed this constant there's no need to do it again. - Constant *Op = dyn_cast<Constant>(*I); + Constant *Op = dyn_cast<Constant>(U); if (Op && SeenConstants.insert(Op).second) MarkUsedGlobalsAsNeeded(Op); } @@ -262,7 +249,8 @@ void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) { // might make it deader. // bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) { - if (GV.use_empty()) return false; + if (GV.use_empty()) + return false; GV.removeDeadConstantUsers(); return GV.use_empty(); } diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 5ffe15d..fd77369 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -54,7 +55,6 @@ STATISTIC(NumSRA , "Number of aggregate globals broken into scalars"); STATISTIC(NumHeapSRA , "Number of heap objects SRA'd"); STATISTIC(NumSubstitute,"Number of globals with initializers stored into them"); STATISTIC(NumDeleted , "Number of globals deleted"); -STATISTIC(NumFnDeleted , "Number of functions deleted"); STATISTIC(NumGlobUses , "Number of global uses devirtualized"); STATISTIC(NumLocalized , "Number of globals localized"); STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans"); @@ -69,6 +69,7 @@ namespace { struct GlobalOpt : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { @@ -81,11 +82,14 @@ namespace { bool OptimizeFunctions(Module &M); bool OptimizeGlobalVars(Module &M); bool OptimizeGlobalAliases(Module &M); - bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI); - bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI, - const GlobalStatus &GS); + bool deleteIfDead(GlobalValue &GV); + bool processGlobal(GlobalValue &GV); + bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS); bool 
OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); + bool isPointerValueDeadOnEntryToFunction(const Function *F, + GlobalValue *GV); + TargetLibraryInfo *TLI; SmallSet<const Comdat *, 8> NotDiscardableComdats; }; @@ -95,13 +99,14 @@ char GlobalOpt::ID = 0; INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } -/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker -/// as a root? If so, we might not really want to eliminate the stores to it. +/// Is this global variable possibly used by a leak checker as a root? If so, +/// we might not really want to eliminate the stores to it. static bool isLeakCheckerRoot(GlobalVariable *GV) { // A global variable is a root if it is a pointer, or could plausibly contain // a pointer. There are two challenges; one is that we could have a struct @@ -176,10 +181,9 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { } while (1); } -/// CleanupPointerRootUsers - This GV is a pointer root. Loop over all users -/// of the global and clean up any that obviously don't assign the global a -/// value that isn't dynamically allocated. -/// +/// This GV is a pointer root. Loop over all users of the global and clean up +/// any that obviously don't assign the global a value that isn't dynamically +/// allocated. static bool CleanupPointerRootUsers(GlobalVariable *GV, const TargetLibraryInfo *TLI) { // A brief explanation of leak checkers. The goal is to find bugs where @@ -263,10 +267,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, return Changed; } -/// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all -/// users of the global, cleaning up the obvious ones. This is largely just a -/// quick scan over the use list to clean up the easy and obvious cruft. This -/// returns true if it made a change. +/// We just marked GV constant. Loop over all users of the global, cleaning up +/// the obvious ones. This is largely just a quick scan over the use list to +/// clean up the easy and obvious cruft. This returns true if it made a change. static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, const DataLayout &DL, TargetLibraryInfo *TLI) { @@ -353,8 +356,8 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, return Changed; } -/// isSafeSROAElementUse - Return true if the specified instruction is a safe -/// user of a derived expression from a global that we want to SROA. +/// Return true if the specified instruction is a safe user of a derived +/// expression from a global that we want to SROA. static bool isSafeSROAElementUse(Value *V) { // We might have a dead and dangling constant hanging off of here. if (Constant *C = dyn_cast<Constant>(V)) @@ -385,9 +388,8 @@ static bool isSafeSROAElementUse(Value *V) { } -/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value. -/// Look at it and its uses and decide whether it is safe to SROA this global. -/// +/// U is a direct user of the specified global value. Look at it and its uses +/// and decide whether it is safe to SROA this global. static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { // The user of the global must be a GEP Inst or a ConstantExpr GEP. 
if (!isa<GetElementPtrInst>(U) && @@ -452,9 +454,8 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { return true; } -/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it -/// is safe for us to perform this transformation. -/// +/// Look at all uses of the global and decide whether it is safe for us to +/// perform this transformation. static bool GlobalUsersSafeToSRA(GlobalValue *GV) { for (User *U : GV->users()) if (!IsUserOfGlobalSafeForSRA(U, GV)) @@ -464,10 +465,10 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { } -/// SRAGlobal - Perform scalar replacement of aggregates on the specified global -/// variable. This opens the door for other optimizations by exposing the -/// behavior of the program in a more fine-grained way. We have determined that -/// this transformation is safe already. We return the first global variable we +/// Perform scalar replacement of aggregates on the specified global variable. +/// This opens the door for other optimizations by exposing the behavior of the +/// program in a more fine-grained way. We have determined that this +/// transformation is safe already. We return the first global variable we /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. @@ -497,7 +498,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { In, GV->getName()+"."+Twine(i), GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - Globals.insert(GV, NGV); + NGV->setExternallyInitialized(GV->isExternallyInitialized()); + Globals.push_back(NGV); NewGlobals.push_back(NGV); // Calculate the known alignment of the field. If the original aggregate @@ -530,7 +532,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { In, GV->getName()+"."+Twine(i), GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - Globals.insert(GV, NGV); + NGV->setExternallyInitialized(GV->isExternallyInitialized()); + Globals.push_back(NGV); NewGlobals.push_back(NGV); // Calculate the known alignment of the field. If the original aggregate @@ -545,7 +548,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { if (NewGlobals.empty()) return nullptr; - DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV); + DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); @@ -610,9 +613,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr; } -/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified -/// value will trap if the value is dynamically null. PHIs keeps track of any -/// phi nodes we've seen to avoid reprocessing them. +/// Return true if all users of the specified value will trap if the value is +/// dynamically null. PHIs keeps track of any phi nodes we've seen to avoid +/// reprocessing them. static bool AllUsesOfValueWillTrapIfNull(const Value *V, SmallPtrSetImpl<const PHINode*> &PHIs) { for (const User *U : V->users()) @@ -653,9 +656,9 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, return true; } -/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads -/// from GV will trap if the loaded value is null. 
Note that this also permits -/// comparisons of the loaded value against null, as a special case. +/// Return true if all uses of any loads from GV will trap if the loaded value +/// is null. Note that this also permits comparisons of the loaded value +/// against null, as a special case. static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { for (const User *U : GV->users()) if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { @@ -735,10 +738,10 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { } -/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null -/// value stored into it. If there are uses of the loaded value that would trap -/// if the loaded value is dynamically null, then we know that they cannot be -/// reachable with a null optimize away the load. +/// The specified global has only one non-null value stored into it. If there +/// are uses of the loaded value that would trap if the loaded value is +/// dynamically null, then we know that they cannot be reachable with a null, +/// so we can optimize away the load. static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, const DataLayout &DL, TargetLibraryInfo *TLI) { @@ -778,7 +781,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, } if (Changed) { - DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV); + DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV << "\n"); ++NumGlobUses; } @@ -801,8 +804,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, return Changed; } -/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the -/// instructions that are foldable. +/// Walk the use list of V, constant folding all of the instructions that are +/// foldable. static void ConstantPropUsersOf(Value *V, const DataLayout &DL, TargetLibraryInfo *TLI) { for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; ) @@ -818,11 +821,11 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL, } } -/// OptimizeGlobalAddressOfMalloc - This function takes the specified global -/// variable, and transforms the program as if it always contained the result of -/// the specified malloc. Because it is always the result of the specified -/// malloc, there is no reason to actually DO the malloc. Instead, turn the -/// malloc into a global, and any loads of GV as uses of the new global. +/// This function takes the specified global variable, and transforms the +/// program as if it always contained the result of the specified malloc. +/// Because it is always the result of the specified malloc, there is no reason +/// to actually DO the malloc. Instead, turn the malloc into a global, and any +/// loads of GV as uses of the new global. static GlobalVariable * OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, ConstantInt *NElements, const DataLayout &DL, @@ -838,13 +841,10 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // Create the new global variable. The contents of the malloc'd memory is // undefined, so initialize with an undef value.
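For a concrete picture of the code this transformation targets, consider the following hypothetical source-level example (the pass itself operates on IR, and the small-size limit is enforced by the caller later in this file):

    #include <cstdlib>

    // An internal global stored to exactly once, with the result of a
    // malloc of statically known small size.
    static int *Buf;

    void init() { Buf = static_cast<int *>(std::malloc(16 * sizeof(int))); }
    int get(unsigned i) { return Buf[i]; }

    // GlobalOpt can replace Buf with a 16-element global array "Buf.body",
    // delete the malloc entirely, and rewrite loads of Buf into uses of
    // the new global, tracking "has init run" in a bool only if the
    // pointer is ever tested against null.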
- GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(), - GlobalType, false, - GlobalValue::InternalLinkage, - UndefValue::get(GlobalType), - GV->getName()+".body", - GV, - GV->getThreadLocalMode()); + GlobalVariable *NewGV = new GlobalVariable( + *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage, + UndefValue::get(GlobalType), GV->getName() + ".body", nullptr, + GV->getThreadLocalMode()); // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update @@ -935,7 +935,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, cast<StoreInst>(InitBool->user_back())->eraseFromParent(); delete InitBool; } else - GV->getParent()->getGlobalList().insert(GV, InitBool); + GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool); // Now the GV is dead, nuke it and the malloc.. GV->eraseFromParent(); @@ -951,10 +951,9 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, return NewGV; } -/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking -/// to make sure that there are no complex uses of V. We permit simple things -/// like dereferencing the pointer, but not storing through the address, unless -/// it is to the specified global. +/// Scan the use-list of V checking to make sure that there are no complex uses +/// of V. We permit simple things like dereferencing the pointer, but not +/// storing through the address, unless it is to the specified global. static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, const GlobalVariable *GV, SmallPtrSetImpl<const PHINode*> &PHIs) { @@ -998,10 +997,9 @@ static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, return true; } -/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV -/// somewhere. Transform all uses of the allocation into loads from the -/// global and uses of the resultant pointer. Further, delete the store into -/// GV. This assumes that these value pass the +/// The Alloc pointer is stored into GV somewhere. Transform all uses of the +/// allocation into loads from the global and uses of the resultant pointer. +/// Further, delete the store into GV. This assumes that these values pass the /// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate. static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, GlobalVariable *GV) { @@ -1043,9 +1041,9 @@ static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, } } -/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi -/// of a load) are simple enough to perform heap SRA on. This permits GEP's -/// that index through the array and struct field, icmps of null, and PHIs. +/// Verify that all uses of V (a load, or a phi of a load) are simple enough to +/// perform heap SRA on. This permits GEP's that index through the array and +/// struct field, icmps of null, and PHIs. static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs, SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) { @@ -1096,8 +1094,8 @@ static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, } -/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from -/// GV are simple enough to perform HeapSRA, return true. +/// If all users of values loaded from GV are simple enough to perform HeapSRA, +/// return true.
static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, Instruction *StoredVal) { SmallPtrSet<const PHINode*, 32> LoadUsingPHIs; @@ -1186,8 +1184,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, return FieldVals[FieldNo] = Result; } -/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from -/// the load, rewrite the derived value to use the HeapSRoA'd load. +/// Given a load instruction and a value derived from the load, rewrite the +/// derived value to use the HeapSRoA'd load. static void RewriteHeapSROALoadUser(Instruction *LoadUser, DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { @@ -1248,10 +1246,9 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser, } } -/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global. Ptr -/// is a value loaded from the global. Eliminate all uses of Ptr, making them -/// use FieldGlobals instead. All uses of loaded values satisfy -/// AllGlobalLoadUsesSimpleEnoughForHeapSRA. +/// We are performing Heap SRoA on a global. Ptr is a value loaded from the +/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead. +/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA. static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues, std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) { @@ -1266,8 +1263,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, } } -/// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break -/// it up into multiple allocations of arrays of the fields. +/// CI is an allocation of an array of structures. Break it up into multiple +/// allocations of arrays of the fields. static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, Value *NElems, const DataLayout &DL, const TargetLibraryInfo *TLI) { @@ -1291,12 +1288,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, Type *FieldTy = STy->getElementType(FieldNo); PointerType *PFieldTy = PointerType::get(FieldTy, AS); - GlobalVariable *NGV = - new GlobalVariable(*GV->getParent(), - PFieldTy, false, GlobalValue::InternalLinkage, - Constant::getNullValue(PFieldTy), - GV->getName() + ".f" + Twine(FieldNo), GV, - GV->getThreadLocalMode()); + GlobalVariable *NGV = new GlobalVariable( + *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage, + Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo), + nullptr, GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); unsigned TypeSize = DL.getTypeAllocSize(FieldTy); @@ -1336,7 +1331,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, // Split the basic block at the old malloc. BasicBlock *OrigBB = CI->getParent(); - BasicBlock *ContBB = OrigBB->splitBasicBlock(CI, "malloc_cont"); + BasicBlock *ContBB = + OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont"); // Create the block to check the first condition. Put all these blocks at the // end of the function as they are unlikely to be executed. @@ -1376,9 +1372,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, // CI is no longer needed, remove it. CI->eraseFromParent(); - /// InsertedScalarizedLoads - As we process loads, if we can't immediately - /// update all uses of the load, keep track of what scalarized loads are - /// inserted for a given load. 
+ /// As we process loads, if we can't immediately update all uses of the load, + /// keep track of what scalarized loads are inserted for a given load. DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues; InsertedScalarizedValues[GV] = FieldGlobals; @@ -1454,13 +1449,11 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, return cast<GlobalVariable>(FieldGlobals[0]); } -/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a -/// pointer global variable with a single value stored it that is a malloc or -/// cast of malloc. -static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, +/// This function is called when we see a pointer global variable with a single +/// value stored into it that is a malloc or cast of malloc. +static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, Type *AllocTy, AtomicOrdering Ordering, - Module::global_iterator &GVI, const DataLayout &DL, TargetLibraryInfo *TLI) { // If this is a malloc of an abstract type, don't touch it. @@ -1499,7 +1492,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, // (2048 bytes currently), as we don't want to introduce a 16M global or // something. if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) { - GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); + OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI); return true; } @@ -1544,19 +1537,18 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, CI = cast<CallInst>(Malloc); } - GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), - DL, TLI); + PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL, + TLI); return true; } return false; } -// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge -// that only one value (besides its initializer) is ever stored to the global. -static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, +// Try to optimize globals based on the knowledge that only one value (besides +// its initializer) is ever stored to the global. +static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, AtomicOrdering Ordering, - Module::global_iterator &GVI, const DataLayout &DL, TargetLibraryInfo *TLI) { // Ignore no-op GEPs and bitcasts. @@ -1577,9 +1569,8 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, return true; } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { Type *MallocType = getMallocAllocatedType(CI, TLI); - if (MallocType && - TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI, - DL, TLI)) + if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, + Ordering, DL, TLI)) return true; } } @@ -1587,10 +1578,10 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, return false; } -/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only -/// two values ever stored into GV are its initializer and OtherVal. See if we -/// can shrink the global into a boolean and select between the two values -/// whenever it is used. This exposes the values to other scalar optimizations. +/// At this point, we have learned that the only two values ever stored into GV +/// are its initializer and OtherVal. See if we can shrink the global into a +/// boolean and select between the two values whenever it is used.
This exposes +/// the values to other scalar optimizations. static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { Type *GVElType = GV->getType()->getElementType(); @@ -1610,7 +1601,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { if (!isa<LoadInst>(U) && !isa<StoreInst>(U)) return false; - DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV); + DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n"); // Create the new global, initializing it to false. GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()), @@ -1620,7 +1611,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GV->getName()+".b", GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); - GV->getParent()->getGlobalList().insert(GV, NewGV); + GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV); Constant *InitVal = GV->getInitializer(); assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) && @@ -1688,61 +1679,213 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { return true; } +bool GlobalOpt::deleteIfDead(GlobalValue &GV) { + GV.removeDeadConstantUsers(); -/// ProcessGlobal - Analyze the specified global variable and optimize it if -/// possible. If we make a change, return true. -bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, - Module::global_iterator &GVI) { - // Do more involved optimizations if the global is internal. - GV->removeDeadConstantUsers(); + if (!GV.isDiscardableIfUnused()) + return false; - if (GV->use_empty()) { - DEBUG(dbgs() << "GLOBAL DEAD: " << *GV); - GV->eraseFromParent(); - ++NumDeleted; - return true; - } + if (const Comdat *C = GV.getComdat()) + if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C)) + return false; - if (!GV->hasLocalLinkage()) + bool Dead; + if (auto *F = dyn_cast<Function>(&GV)) + Dead = F->isDefTriviallyDead(); + else + Dead = GV.use_empty(); + if (!Dead) + return false; + + DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n"); + GV.eraseFromParent(); + ++NumDeleted; + return true; +} + +/// Analyze the specified global variable and optimize it if possible. If we +/// make a change, return true. +bool GlobalOpt::processGlobal(GlobalValue &GV) { + // Do more involved optimizations if the global is internal. + if (!GV.hasLocalLinkage()) return false; GlobalStatus GS; - if (GlobalStatus::analyzeGlobal(GV, GS)) + if (GlobalStatus::analyzeGlobal(&GV, GS)) return false; - if (!GS.IsCompared && !GV->hasUnnamedAddr()) { - GV->setUnnamedAddr(true); + bool Changed = false; + if (!GS.IsCompared && !GV.hasUnnamedAddr()) { + GV.setUnnamedAddr(true); NumUnnamed++; + Changed = true; } - if (GV->isConstant() || !GV->hasInitializer()) + auto *GVar = dyn_cast<GlobalVariable>(&GV); + if (!GVar) + return Changed; + + if (GVar->isConstant() || !GVar->hasInitializer()) + return Changed; + + return processInternalGlobal(GVar, GS) || Changed; +} + +bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) { + // Find all uses of GV. We expect them all to be in F, and if we can't + // identify any of the uses we bail out. + // + // On each of these uses, identify if the memory that GV points to is + // used/required/live at the start of the function. If it is not, for example + // if the first thing the function does is store to the GV, the GV can + // possibly be demoted. 
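For example, a global of the following shape satisfies this condition (hypothetical source; per the caller below, the single accessing function must also be known non-recursive):

    // 'scratch' is accessed only from compute(), and is written before it
    // is ever read, so its value on entry to compute() is dead and the
    // global can be demoted to a local alloca.
    static int scratch;

    int compute(int x) {
      scratch = x * 2;    // this store dominates...
      return scratch + 1; // ...this load, and stores at least as many bits
    }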
+ // + // We don't do an exhaustive search for memory operations - simply look + // through bitcasts as they're quite common and benign. + const DataLayout &DL = GV->getParent()->getDataLayout(); + SmallVector<LoadInst *, 4> Loads; + SmallVector<StoreInst *, 4> Stores; + for (auto *U : GV->users()) { + if (Operator::getOpcode(U) == Instruction::BitCast) { + for (auto *UU : U->users()) { + if (auto *LI = dyn_cast<LoadInst>(UU)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast<StoreInst>(UU)) + Stores.push_back(SI); + else + return false; + } + continue; + } + + Instruction *I = dyn_cast<Instruction>(U); + if (!I) + return false; + assert(I->getParent()->getParent() == F); + + if (auto *LI = dyn_cast<LoadInst>(I)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast<StoreInst>(I)) + Stores.push_back(SI); + else + return false; + } + + // We have identified all uses of GV into loads and stores. Now check if all + // of them are known not to depend on the value of the global at the function + // entry point. We do this by ensuring that every load is dominated by at + // least one store. + auto &DT = getAnalysis<DominatorTreeWrapperPass>(*const_cast<Function *>(F)) + .getDomTree(); + + // The below check is quadratic. Check we're not going to do too many tests. + // FIXME: Even though this will always have worst-case quadratic time, we + // could put effort into minimizing the average time by putting stores that + // have been shown to dominate at least one load at the beginning of the + // Stores array, making subsequent dominance checks more likely to succeed + // early. + // + // The threshold here is fairly large because global->local demotion is a + // very powerful optimization should it fire. + const unsigned Threshold = 100; + if (Loads.size() * Stores.size() > Threshold) return false; - return ProcessInternalGlobal(GV, GVI, GS); + for (auto *L : Loads) { + auto *LTy = L->getType(); + if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) { + auto *STy = S->getValueOperand()->getType(); + // The load is only dominated by the store if DomTree says so + // and the number of bits loaded in L is less than or equal to + // the number of bits stored in S. + return DT.dominates(S, L) && + DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy); + })) + return false; + } + // All loads have known dependences inside F, so the global can be localized. + return true; +} + +/// C may have non-instruction users. Can all of those users be turned into +/// instructions? +static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) { + // We don't do this exhaustively. The most common pattern that we really need + // to care about is a constant GEP or constant bitcast - so just looking + // through one single ConstantExpr. + // + // The set of constants that this function returns true for must be able to be + // handled by makeAllConstantUsesInstructions. + for (auto *U : C->users()) { + if (isa<Instruction>(U)) + continue; + if (!isa<ConstantExpr>(U)) + // Non instruction, non-constantexpr user; cannot convert this. + return false; + for (auto *UU : U->users()) + if (!isa<Instruction>(UU)) + // A constantexpr used by another constant. We don't try and recurse any + // further but just bail out at this point. + return false; + } + + return true; +} + +/// C may have non-instruction users, and +/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the +/// non-instruction users to instructions. 
+static void makeAllConstantUsesInstructions(Constant *C) { + SmallVector<ConstantExpr*,4> Users; + for (auto *U : C->users()) { + if (isa<ConstantExpr>(U)) + Users.push_back(cast<ConstantExpr>(U)); + else + // We should never get here; allNonInstructionUsersCanBeMadeInstructions + // should not have returned true for C. + assert( + isa<Instruction>(U) && + "Can't transform non-constantexpr non-instruction to instruction!"); + } + + SmallVector<Value*,4> UUsers; + for (auto *U : Users) { + UUsers.clear(); + for (auto *UU : U->users()) + UUsers.push_back(UU); + for (auto *UU : UUsers) { + Instruction *UI = cast<Instruction>(UU); + Instruction *NewU = U->getAsInstruction(); + NewU->insertBefore(UI); + UI->replaceUsesOfWith(U, NewU); + } + U->dropAllReferences(); + } } -/// ProcessInternalGlobal - Analyze the specified global variable and optimize +/// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. -bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, - Module::global_iterator &GVI, +bool GlobalOpt::processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS) { auto &DL = GV->getParent()->getDataLayout(); - // If this is a first class global and has only one accessing function - // and this function is main (which we know is not recursive), we replace - // the global with a local alloca in this function. + // If this is a first class global and has only one accessing function and + // this function is non-recursive, we replace the global with a local alloca + // in this function. // // NOTE: It doesn't make sense to promote non-single-value types since we // are just replacing static memory to stack memory. // // If the global is in different address space, don't bring it to stack. if (!GS.HasMultipleAccessingFunctions && - GS.AccessingFunction && !GS.HasNonInstructionUser && + GS.AccessingFunction && GV->getType()->getElementType()->isSingleValueType() && - GS.AccessingFunction->getName() == "main" && - GS.AccessingFunction->hasExternalLinkage() && - GV->getType()->getAddressSpace() == 0) { - DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV); + GV->getType()->getAddressSpace() == 0 && + !GV->isExternallyInitialized() && + allNonInstructionUsersCanBeMadeInstructions(GV) && + GS.AccessingFunction->doesNotRecurse() && + isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) { + DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n"); Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction ->getEntryBlock().begin()); Type *ElemTy = GV->getType()->getElementType(); @@ -1752,6 +1895,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (!isa<UndefValue>(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); + makeAllConstantUsesInstructions(GV); + GV->replaceAllUsesWith(Alloca); GV->eraseFromParent(); ++NumLocalized; @@ -1761,7 +1906,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // If the global is never loaded (but may be stored to), it is dead. // Delete it now. if (!GS.IsLoaded) { - DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); + DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n"); bool Changed; if (isLeakCheckerRoot(GV)) { @@ -1800,11 +1945,9 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return true; } else if (!GV->getInitializer()->getType()->isSingleValueType()) { const DataLayout &DL = GV->getParent()->getDataLayout(); - if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) { - GVI = FirstNewGV; // Don't skip the newly produced globals! 
+ if (SRAGlobal(GV, DL)) return true; - } - } else if (GS.StoredType == GlobalStatus::StoredOnce) { + } else if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) { // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the @@ -1822,8 +1965,6 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, << "simplify all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; - } else { - GVI = GV; } ++NumSubstitute; return true; @@ -1831,8 +1972,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, GVI, - DL, TLI)) + if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -1850,8 +1990,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return false; } -/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified -/// function, changing them to FastCC. +/// Walk all of the direct calls of the specified function, changing them to +/// FastCC. static void ChangeCalleesToFastCall(Function *F) { for (User *U : F->users()) { if (isa<BlockAddress>(U)) @@ -1898,38 +2038,38 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { bool Changed = false; // Optimize functions. for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { - Function *F = FI++; + Function *F = &*FI++; // Functions without names cannot be referenced outside this module. if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) F->setLinkage(GlobalValue::InternalLinkage); - const Comdat *C = F->getComdat(); - bool inComdat = C && NotDiscardableComdats.count(C); - F->removeDeadConstantUsers(); - if ((!inComdat || F->hasLocalLinkage()) && F->isDefTriviallyDead()) { - F->eraseFromParent(); + if (deleteIfDead(*F)) { Changed = true; - ++NumFnDeleted; - } else if (F->hasLocalLinkage()) { - if (isProfitableToMakeFastCC(F) && !F->isVarArg() && - !F->hasAddressTaken()) { - // If this function has a calling convention worth changing, is not a - // varargs function, and is only called directly, promote it to use the - // Fast calling convention. - F->setCallingConv(CallingConv::Fast); - ChangeCalleesToFastCall(F); - ++NumFastCallFns; - Changed = true; - } + continue; + } - if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && - !F->hasAddressTaken()) { - // The function is not used by a trampoline intrinsic, so it is safe - // to remove the 'nest' attribute. - RemoveNestAttribute(F); - ++NumNestRemoved; - Changed = true; - } + Changed |= processGlobal(*F); + + if (!F->hasLocalLinkage()) + continue; + if (isProfitableToMakeFastCC(F) && !F->isVarArg() && + !F->hasAddressTaken()) { + // If this function has a calling convention worth changing, is not a + // varargs function, and is only called directly, promote it to use the + // Fast calling convention. + F->setCallingConv(CallingConv::Fast); + ChangeCalleesToFastCall(F); + ++NumFastCallFns; + Changed = true; + } + + if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && + !F->hasAddressTaken()) { + // The function is not used by a trampoline intrinsic, so it is safe + // to remove the 'nest' attribute. 
+ RemoveNestAttribute(F); + ++NumNestRemoved; + Changed = true; } } return Changed; @@ -1940,7 +2080,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { - GlobalVariable *GV = GVI++; + GlobalVariable *GV = &*GVI++; // Global variables without names cannot be referenced outside this module. if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) GV->setLinkage(GlobalValue::InternalLinkage); @@ -1953,12 +2093,12 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { GV->setInitializer(New); } - if (GV->isDiscardableIfUnused()) { - if (const Comdat *C = GV->getComdat()) - if (NotDiscardableComdats.count(C) && !GV->hasLocalLinkage()) - continue; - Changed |= ProcessGlobal(GV, GVI); + if (deleteIfDead(*GV)) { + Changed = true; + continue; } + + Changed |= processGlobal(*GV); } return Changed; } @@ -1968,8 +2108,8 @@ isSimpleEnoughValueToCommit(Constant *C, SmallPtrSetImpl<Constant *> &SimpleConstants, const DataLayout &DL); -/// isSimpleEnoughValueToCommit - Return true if the specified constant can be -/// handled by the code generator. We don't want to generate something like: +/// Return true if the specified constant can be handled by the code generator. +/// We don't want to generate something like: /// void *X = &X/42; /// because the code generator doesn't have a relocation that can handle that. /// @@ -2044,11 +2184,11 @@ isSimpleEnoughValueToCommit(Constant *C, } -/// isSimpleEnoughPointerToCommit - Return true if this constant is simple -/// enough for us to understand. In particular, if it is a cast to anything -/// other than from one pointer type to another pointer type, we punt. -/// We basically just support direct accesses to globals and GEP's of -/// globals. This should be kept up to date with CommitValueTo. +/// Return true if this constant is simple enough for us to understand. In +/// particular, if it is a cast to anything other than from one pointer type to +/// another pointer type, we punt. We basically just support direct accesses to +/// globals and GEP's of globals. This should be kept up to date with +/// CommitValueTo. static bool isSimpleEnoughPointerToCommit(Constant *C) { // Conservatively, avoid aggregate types. This is because we don't // want to worry about them partially overlapping other stores. @@ -2095,9 +2235,9 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) { return false; } -/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global -/// initializer. This returns 'Init' modified to reflect 'Val' stored into it. -/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into. +/// Evaluate a piece of a constantexpr store into a global initializer. This +/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the +/// GEP operands of Addr [0, OpNo) have been stepped into. static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, ConstantExpr *Addr, unsigned OpNo) { // Base case of the recursion. @@ -2144,7 +2284,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, return ConstantVector::get(Elts); } -/// CommitValueTo - We have decided that Addr (which satisfies the predicate +/// We have decided that Addr (which satisfies the predicate /// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen. 
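The recursion in EvaluateStoreInto bottoms out by rebuilding one aggregate level with a single element swapped out. A minimal sketch of that one level, assuming the era's Constant API and ignoring ConstantAggregateZero/UndefValue aggregates (which the real code must also expand); the helper name is hypothetical:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    // Return a copy of Agg with element Idx replaced by Val. One GEP index
    // of the store's address is consumed per level of recursion.
    static Constant *replaceAggregateElement(Constant *Agg, unsigned Idx,
                                             Constant *Val) {
      SmallVector<Constant *, 8> Elts;
      for (unsigned i = 0, e = Agg->getNumOperands(); i != e; ++i)
        Elts.push_back(i == Idx ? Val : cast<Constant>(Agg->getOperand(i)));
      if (auto *STy = dyn_cast<StructType>(Agg->getType()))
        return ConstantStruct::get(STy, Elts);
      return ConstantArray::get(cast<ArrayType>(Agg->getType()), Elts);
    }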
static void CommitValueTo(Constant *Val, Constant *Addr) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { @@ -2160,10 +2300,10 @@ static void CommitValueTo(Constant *Val, Constant *Addr) { namespace { -/// Evaluator - This class evaluates LLVM IR, producing the Constant -/// representing each SSA instruction. Changes to global variables are stored -/// in a mapping that can be iterated over after the evaluation is complete. -/// Once an evaluation call fails, the evaluation object should not be reused. +/// This class evaluates LLVM IR, producing the Constant representing each SSA +/// instruction. Changes to global variables are stored in a mapping that can +/// be iterated over after the evaluation is complete. Once an evaluation call +/// fails, the evaluation object should not be reused. class Evaluator { public: Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI) @@ -2180,15 +2320,15 @@ public: Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); } - /// EvaluateFunction - Evaluate a call to function F, returning true if - /// successful, false if we can't evaluate it. ActualArgs contains the formal - /// arguments for the function. + /// Evaluate a call to function F, returning true if successful, false if we + /// can't evaluate it. ActualArgs contains the formal arguments for the + /// function. bool EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl<Constant*> &ActualArgs); - /// EvaluateBlock - Evaluate all instructions in block BB, returning true if - /// successful, false if we can't evaluate it. NewBB returns the next BB that - /// control flows into, or null upon return. + /// Evaluate all instructions in block BB, returning true if successful, false + /// if we can't evaluate it. NewBB returns the next BB that control flows + /// into, or null upon return. bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB); Constant *getVal(Value *V) { @@ -2213,32 +2353,31 @@ public: private: Constant *ComputeLoadResult(Constant *P); - /// ValueStack - As we compute SSA register values, we store their contents - /// here. The back of the deque contains the current function and the stack - /// contains the values in the calling frames. + /// As we compute SSA register values, we store their contents here. The back + /// of the deque contains the current function and the stack contains the + /// values in the calling frames. std::deque<DenseMap<Value*, Constant*>> ValueStack; - /// CallStack - This is used to detect recursion. In pathological situations - /// we could hit exponential behavior, but at least there is nothing - /// unbounded. + /// This is used to detect recursion. In pathological situations we could hit + /// exponential behavior, but at least there is nothing unbounded. SmallVector<Function*, 4> CallStack; - /// MutatedMemory - For each store we execute, we update this map. Loads - /// check this to get the most up-to-date value. If evaluation is successful, - /// this state is committed to the process. + /// For each store we execute, we update this map. Loads check this to get + /// the most up-to-date value. If evaluation is successful, this state is + /// committed to the process. DenseMap<Constant*, Constant*> MutatedMemory; - /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable - /// to represent its body. This vector is needed so we can delete the - /// temporary globals when we are done. 
+ /// To 'execute' an alloca, we create a temporary global variable to represent + /// its body. This vector is needed so we can delete the temporary globals + /// when we are done. SmallVector<std::unique_ptr<GlobalVariable>, 32> AllocaTmps; - /// Invariants - These global variables have been marked invariant by the - /// static constructor. + /// These global variables have been marked invariant by the static + /// constructor. SmallPtrSet<GlobalVariable*, 8> Invariants; - /// SimpleConstants - These are constants we have checked and know to be - /// simple enough to live in a static initializer of a global. + /// These are constants we have checked and know to be simple enough to live + /// in a static initializer of a global. SmallPtrSet<Constant*, 8> SimpleConstants; const DataLayout &DL; @@ -2247,9 +2386,8 @@ private: } // anonymous namespace -/// ComputeLoadResult - Return the value that would be computed by a load from -/// P after the stores reflected by 'memory' have been performed. If we can't -/// decide, return null. +/// Return the value that would be computed by a load from P after the stores +/// reflected by 'memory' have been performed. If we can't decide, return null. Constant *Evaluator::ComputeLoadResult(Constant *P) { // If this memory location has been recently stored, use the stored value: it // is the most up-to-date. @@ -2275,9 +2413,9 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) { return nullptr; // don't know how to evaluate. } -/// EvaluateBlock - Evaluate all instructions in block BB, returning true if -/// successful, false if we can't evaluate it. NewBB returns the next BB that -/// control flows into, or null upon return. +/// Evaluate all instructions in block BB, returning true if successful, false +/// if we can't evaluate it. NewBB returns the next BB that control flows into, +/// or null upon return. bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB) { // This is the main evaluation loop. @@ -2438,7 +2576,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, InstResult = AllocaTmps.back().get(); DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { - CallSite CS(CurInst); + CallSite CS(&*CurInst); // Debug info can safely be ignored here. if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { @@ -2504,6 +2642,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // Continue even if we do nothing. ++CurInst; continue; + } else if (II->getIntrinsicID() == Intrinsic::assume) { + DEBUG(dbgs() << "Skipping assume intrinsic.\n"); + ++CurInst; + continue; } DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n"); @@ -2600,7 +2742,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult)) InstResult = ConstantFoldConstantExpression(CE, DL, TLI); - setVal(CurInst, InstResult); + setVal(&*CurInst, InstResult); } // If we just processed an invoke, we finished evaluating the block. @@ -2615,9 +2757,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, } } -/// EvaluateFunction - Evaluate a call to function F, returning true if -/// successful, false if we can't evaluate it. ActualArgs contains the formal -/// arguments for the function. +/// Evaluate a call to function F, returning true if successful, false if we +/// can't evaluate it. ActualArgs contains the formal arguments for the +/// function. 
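The payoff of this interpreter is folding dynamic initializers at compile time. A hypothetical C++ input it can handle:

    // The initializer for G requires a function call, so the front end
    // emits a static constructor. The evaluator executes square(7)
    // symbolically, commits 49 as G's initializer, and the constructor
    // is removed.
    static int square(int x) { return x * x; }
    int G = square(7);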
bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl<Constant*> &ActualArgs) { // Check to see if this function is already executing (recursion). If so, @@ -2631,7 +2773,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, unsigned ArgNo = 0; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI, ++ArgNo) - setVal(AI, ActualArgs[ArgNo]); + setVal(&*AI, ActualArgs[ArgNo]); // ExecutedBlocks - We only handle non-looping, non-recursive code. As such, // we can only evaluate any one basic block at most once. This set keeps @@ -2639,7 +2781,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, SmallPtrSet<BasicBlock*, 32> ExecutedBlocks; // CurBB - The current basic block we're evaluating. - BasicBlock *CurBB = F->begin(); + BasicBlock *CurBB = &F->front(); BasicBlock::iterator CurInst = CurBB->begin(); @@ -2679,8 +2821,8 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, } } -/// EvaluateStaticConstructor - Evaluate static constructors in the function, if -/// we can. Return true if we can, false otherwise. +/// Evaluate static constructors in the function, if we can. Return true if we +/// can, false otherwise. static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, const TargetLibraryInfo *TLI) { // Call the function. @@ -2708,7 +2850,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, } static int compareNames(Constant *const *A, Constant *const *B) { - return (*A)->getName().compare((*B)->getName()); + return (*A)->stripPointerCasts()->getName().compare( + (*B)->stripPointerCasts()->getName()); } static void setUsedInitializer(GlobalVariable &V, @@ -2742,7 +2885,7 @@ static void setUsedInitializer(GlobalVariable &V, } namespace { -/// \brief An easy to access representation of llvm.used and llvm.compiler.used. +/// An easy to access representation of llvm.used and llvm.compiler.used. class LLVMUsed { SmallPtrSet<GlobalValue *, 8> Used; SmallPtrSet<GlobalValue *, 8> CompilerUsed; @@ -2861,10 +3004,17 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;) { - Module::alias_iterator J = I++; + GlobalAlias *J = &*I++; + // Aliases without names cannot be referenced outside this module. if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) J->setLinkage(GlobalValue::InternalLinkage); + + if (deleteIfDead(*J)) { + Changed = true; + continue; + } + // If the aliasee may change at link time, nothing can be done - bail out. if (J->mayBeOverridden()) continue; @@ -2889,15 +3039,15 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. - Target->takeName(J); + Target->takeName(&*J); Target->setLinkage(J->getLinkage()); Target->setVisibility(J->getVisibility()); Target->setDLLStorageClass(J->getDLLStorageClass()); - if (Used.usedErase(J)) + if (Used.usedErase(&*J)) Used.usedInsert(Target); - if (Used.compilerUsedErase(J)) + if (Used.compilerUsedErase(&*J)) Used.compilerUsedInsert(Target); } else if (mayHaveOtherReferences(*J, Used)) continue; @@ -2936,8 +3086,8 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { return Fn; } -/// cxxDtorIsEmpty - Returns whether the given function is an empty C++ -/// destructor and can therefore be eliminated. +/// Returns whether the given function is an empty C++ destructor and can +/// therefore be eliminated. 
/// Note that we assume that other optimization passes have already simplified /// the code so we only look for a function with a single basic block, where /// the only allowed instructions are 'ret', 'call' to an empty C++ dtor and @@ -3081,3 +3231,4 @@ bool GlobalOpt::runOnModule(Module &M) { return Changed; } + diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index 50f56b0..89629cf0 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements the common infrastructure (including C bindings) for -// libLLVMIPO.a, which implements several transformations over the LLVM +// This file implements the common infrastructure (including C bindings) for +// libLLVMIPO.a, which implements several transformations over the LLVM // intermediate representation. // //===----------------------------------------------------------------------===// @@ -24,14 +24,16 @@ using namespace llvm; void llvm::initializeIPO(PassRegistry &Registry) { initializeArgPromotionPass(Registry); initializeConstantMergePass(Registry); + initializeCrossDSOCFIPass(Registry); initializeDAEPass(Registry); initializeDAHPass(Registry); - initializeFunctionAttrsPass(Registry); + initializeForceFunctionAttrsLegacyPassPass(Registry); initializeGlobalDCEPass(Registry); initializeGlobalOptPass(Registry); initializeIPCPPass(Registry); initializeAlwaysInlinerPass(Registry); initializeSimpleInlinerPass(Registry); + initializeInferFunctionAttrsLegacyPassPass(Registry); initializeInternalizePassPass(Registry); initializeLoopExtractorPass(Registry); initializeBlockExtractorPassPass(Registry); @@ -39,14 +41,18 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeLowerBitSetsPass(Registry); initializeMergeFunctionsPass(Registry); initializePartialInlinerPass(Registry); + initializePostOrderFunctionAttrsPass(Registry); + initializeReversePostOrderFunctionAttrsPass(Registry); initializePruneEHPass(Registry); - initializeStripDeadPrototypesPassPass(Registry); + initializeStripDeadPrototypesLegacyPassPass(Registry); initializeStripSymbolsPass(Registry); initializeStripDebugDeclarePass(Registry); initializeStripDeadDebugInfoPass(Registry); initializeStripNonDebugSymbolsPass(Registry); initializeBarrierNoopPass(Registry); initializeEliminateAvailableExternallyPass(Registry); + initializeSampleProfileLoaderPass(Registry); + initializeFunctionImportPassPass(Registry); } void LLVMInitializeIPO(LLVMPassRegistryRef R) { @@ -66,7 +72,7 @@ void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) { } void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createFunctionAttrsPass()); + unwrap(PM)->add(createPostOrderFunctionAttrsPass()); } void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp new file mode 100644 index 0000000..4295a75 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -0,0 +1,988 @@ +//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "inferattrs" + +STATISTIC(NumReadNone, "Number of functions inferred as readnone"); +STATISTIC(NumReadOnly, "Number of functions inferred as readonly"); +STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly"); +STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind"); +STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture"); +STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly"); +STATISTIC(NumNoAlias, "Number of function returns inferred as noalias"); +STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns"); + +static bool setDoesNotAccessMemory(Function &F) { + if (F.doesNotAccessMemory()) + return false; + F.setDoesNotAccessMemory(); + ++NumReadNone; + return true; +} + +static bool setOnlyReadsMemory(Function &F) { + if (F.onlyReadsMemory()) + return false; + F.setOnlyReadsMemory(); + ++NumReadOnly; + return true; +} + +static bool setOnlyAccessesArgMemory(Function &F) { + if (F.onlyAccessesArgMemory()) + return false; + F.setOnlyAccessesArgMemory(); + ++NumArgMemOnly; + return true; +} + + +static bool setDoesNotThrow(Function &F) { + if (F.doesNotThrow()) + return false; + F.setDoesNotThrow(); + ++NumNoUnwind; + return true; +} + +static bool setDoesNotCapture(Function &F, unsigned n) { + if (F.doesNotCapture(n)) + return false; + F.setDoesNotCapture(n); + ++NumNoCapture; + return true; +} + +static bool setOnlyReadsMemory(Function &F, unsigned n) { + if (F.onlyReadsMemory(n)) + return false; + F.setOnlyReadsMemory(n); + ++NumReadOnlyArg; + return true; +} + +static bool setDoesNotAlias(Function &F, unsigned n) { + if (F.doesNotAlias(n)) + return false; + F.setDoesNotAlias(n); + ++NumNoAlias; + return true; +} + +static bool setNonNull(Function &F, unsigned n) { + assert((n != AttributeSet::ReturnIndex || + F.getReturnType()->isPointerTy()) && + "nonnull applies only to pointers"); + if (F.getAttributes().hasAttribute(n, Attribute::NonNull)) + return false; + F.addAttribute(n, Attribute::NonNull); + ++NumNonNull; + return true; +} + +/// Analyze the name and prototype of the given function and set any applicable +/// attributes. +/// +/// Returns true if any attributes were set and false otherwise.
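The body of this function is one large switch over the LibFunc enum: each recognized prototype is validated and then annotated with the setters above. A trimmed sketch of that dispatch for a single entry (the function name here is hypothetical; the helpers are the ones defined earlier in this file):

    // Argument attribute indices are 1-based; index 0 is the return value.
    static bool inferStrlenAttrs(Function &F, const TargetLibraryInfo &TLI) {
      LibFunc::Func TheLibFunc;
      if (!TLI.getLibFunc(F.getName(), TheLibFunc) || !TLI.has(TheLibFunc) ||
          TheLibFunc != LibFunc::strlen)
        return false;
      FunctionType *FTy = F.getFunctionType();
      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
        return false;                     // not the expected prototype
      bool Changed = false;
      Changed |= setOnlyReadsMemory(F);   // strlen only reads memory
      Changed |= setDoesNotThrow(F);
      Changed |= setDoesNotCapture(F, 1); // the string pointer doesn't escape
      return Changed;
    }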
+static bool inferPrototypeAttributes(Function &F, + const TargetLibraryInfo &TLI) { + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + + FunctionType *FTy = F.getFunctionType(); + LibFunc::Func TheLibFunc; + if (!(TLI.getLibFunc(F.getName(), TheLibFunc) && TLI.has(TheLibFunc))) + return false; + + bool Changed = false; + switch (TheLibFunc) { + case LibFunc::strlen: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::strchr: + case LibFunc::strrchr: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isIntegerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::strtol: + case LibFunc::strtod: + case LibFunc::strtof: + case LibFunc::strtoul: + case LibFunc::strtoll: + case LibFunc::strtold: + case LibFunc::strtoull: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::strcpy: + case LibFunc::stpcpy: + case LibFunc::strcat: + case LibFunc::strncat: + case LibFunc::strncpy: + case LibFunc::stpncpy: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::strxfrm: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::strcmp: // 0,1 + case LibFunc::strspn: // 0,1 + case LibFunc::strncmp: // 0,1 + case LibFunc::strcspn: // 0,1 + case LibFunc::strcoll: // 0,1 + case LibFunc::strcasecmp: // 0,1 + case LibFunc::strncasecmp: // + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::strstr: + case LibFunc::strpbrk: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::strtok: + case LibFunc::strtok_r: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::scanf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::setbuf: + case LibFunc::setvbuf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::strdup: + case LibFunc::strndup: + if (FTy->getNumParams() < 1 || 
!FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::stat: + case LibFunc::statvfs: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::sscanf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::sprintf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::snprintf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 3); + return Changed; + case LibFunc::setitimer: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::system: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "system" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::malloc: + if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::memcmp: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::memchr: + case LibFunc::memrchr: + if (FTy->getNumParams() != 3) + return false; + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::modf: + case LibFunc::modff: + case LibFunc::modfl: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::memcpy: + case LibFunc::memccpy: + case LibFunc::memmove: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::memalign: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::mkdir: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::mktime: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::realloc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::read: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "read" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::rewind: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::rmdir: + case LibFunc::remove: + case LibFunc::realpath: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::rename: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::readlink: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::write: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "write" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::bcopy: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::bcmp: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::bzero: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::calloc: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::chmod: + case LibFunc::chown: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::ctermid: + case LibFunc::clearerr: + case LibFunc::closedir: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::atoi: + case LibFunc::atol: + case LibFunc::atof: + case LibFunc::atoll: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::access: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= 
setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::fopen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fdopen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::feof: + case LibFunc::free: + case LibFunc::fseek: + case LibFunc::ftell: + case LibFunc::fgetc: + case LibFunc::fseeko: + case LibFunc::ftello: + case LibFunc::fileno: + case LibFunc::fflush: + case LibFunc::fclose: + case LibFunc::fsetpos: + case LibFunc::flockfile: + case LibFunc::funlockfile: + case LibFunc::ftrylockfile: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::ferror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F); + return Changed; + case LibFunc::fputc: + case LibFunc::fstat: + case LibFunc::frexp: + case LibFunc::frexpf: + case LibFunc::frexpl: + case LibFunc::fstatvfs: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::fgets: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 3); + return Changed; + case LibFunc::fread: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 4); + return Changed; + case LibFunc::fwrite: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 4); + return Changed; + case LibFunc::fputs: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::fscanf: + case LibFunc::fprintf: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fgetpos: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= 
setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::getc: + case LibFunc::getlogin_r: + case LibFunc::getc_unlocked: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::getenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setOnlyReadsMemory(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::gets: + case LibFunc::getchar: + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::getitimer: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::getpwnam: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::ungetc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::uname: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::unlink: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::unsetenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::utime: + case LibFunc::utimes: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::putc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::puts: + case LibFunc::printf: + case LibFunc::perror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::pread: + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "pread" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::pwrite: + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "pwrite" is a valid pthread cancellation point. 
+ Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::putchar: + Changed |= setDoesNotThrow(F); + return Changed; + case LibFunc::popen: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::pclose: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::vscanf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::vsscanf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::vfscanf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::valloc: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::vprintf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::vfprintf: + case LibFunc::vsprintf: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::vsnprintf: + if (FTy->getNumParams() != 4 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 3); + Changed |= setOnlyReadsMemory(F, 3); + return Changed; + case LibFunc::open: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. 
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::opendir:
+    if (FTy->getNumParams() != 1 || !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::tmpfile:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::times:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::htonl:
+  case LibFunc::htons:
+  case LibFunc::ntohl:
+  case LibFunc::ntohs:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAccessMemory(F);
+    return Changed;
+  case LibFunc::lstat:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::lchown:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::qsort:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
+      return false;
+    // May throw; places call through function pointer.
+    Changed |= setDoesNotCapture(F, 4);
+    return Changed;
+  case LibFunc::dunder_strdup:
+  case LibFunc::dunder_strndup:
+    if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::dunder_strtok_r:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::under_IO_getc:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::under_IO_putc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::dunder_isoc99_scanf:
+    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::stat64:
+  case LibFunc::lstat64:
+  case LibFunc::statvfs64:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::dunder_isoc99_sscanf:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return
false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fopen64: + if (FTy->getNumParams() != 2 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + Changed |= setOnlyReadsMemory(F, 1); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + case LibFunc::fseeko64: + case LibFunc::ftello64: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + return Changed; + case LibFunc::tmpfile64: + if (!FTy->getReturnType()->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotAlias(F, 0); + return Changed; + case LibFunc::fstat64: + case LibFunc::fstatvfs64: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 2); + return Changed; + case LibFunc::open64: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; + case LibFunc::gettimeofday: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + // Currently some platforms have the restrict keyword on the arguments to + // gettimeofday. To be conservative, do not add noalias to gettimeofday's + // arguments. + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setDoesNotCapture(F, 2); + return Changed; + + case LibFunc::Znwj: // new(unsigned int) + case LibFunc::Znwm: // new(unsigned long) + case LibFunc::Znaj: // new[](unsigned int) + case LibFunc::Znam: // new[](unsigned long) + case LibFunc::msvc_new_int: // new(unsigned int) + case LibFunc::msvc_new_longlong: // new(unsigned long long) + case LibFunc::msvc_new_array_int: // new[](unsigned int) + case LibFunc::msvc_new_array_longlong: // new[](unsigned long long) + if (FTy->getNumParams() != 1) + return false; + // Operator new always returns a nonnull noalias pointer + Changed |= setNonNull(F, AttributeSet::ReturnIndex); + Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex); + return Changed; + + //TODO: add LibFunc entries for: + //case LibFunc::memset_pattern4: + //case LibFunc::memset_pattern8: + case LibFunc::memset_pattern16: + if (FTy->isVarArg() || FTy->getNumParams() != 3 || + !isa<PointerType>(FTy->getParamType(0)) || + !isa<PointerType>(FTy->getParamType(1)) || + !isa<IntegerType>(FTy->getParamType(2))) + return false; + + Changed |= setOnlyAccessesArgMemory(F); + Changed |= setOnlyReadsMemory(F, 2); + return Changed; + + default: + // FIXME: It'd be really nice to cover all the library functions we're + // aware of here. + return false; + } +} + +static bool inferAllPrototypeAttributes(Module &M, + const TargetLibraryInfo &TLI) { + bool Changed = false; + + for (Function &F : M.functions()) + // We only infer things using the prototype if the definition isn't around + // to analyze directly. 
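Every case in the switch above follows the same two-step shape: validate that the prototype matches the expected libc signature, then annotate through the idempotent setters, OR-ing the results into Changed. As a hedged, self-contained illustration, a hypothetical handler for strnlen (which this switch does not cover) would look like:

// Illustrative only; strnlen(const char *s, size_t n) reads memory, cannot
// unwind, and does not let its pointer argument escape.
static bool inferStrnlenAttrs(Function &F) {
  FunctionType *FTy = F.getFunctionType();
  if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
    return false; // prototype mismatch: infer nothing
  bool Changed = false;
  Changed |= setOnlyReadsMemory(F);
  Changed |= setDoesNotThrow(F);
  Changed |= setDoesNotCapture(F, 1); // argument indices are 1-based here
  return Changed;
}
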
+ if (F.isDeclaration()) + Changed |= inferPrototypeAttributes(F, TLI); + + return Changed; +} + +PreservedAnalyses InferFunctionAttrsPass::run(Module &M, + AnalysisManager<Module> *AM) { + auto &TLI = AM->getResult<TargetLibraryAnalysis>(M); + + if (!inferAllPrototypeAttributes(M, TLI)) + // If we didn't infer anything, preserve all analyses. + return PreservedAnalyses::all(); + + // Otherwise, we may have changed fundamental function attributes, so clear + // out all the passes. + return PreservedAnalyses::none(); +} + +namespace { +struct InferFunctionAttrsLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + InferFunctionAttrsLegacyPass() : ModulePass(ID) { + initializeInferFunctionAttrsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + + bool runOnModule(Module &M) override { + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + return inferAllPrototypeAttributes(M, TLI); + } +}; +} + +char InferFunctionAttrsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs", + "Infer set function attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs", + "Infer set function attributes", false, false) + +Pass *llvm::createInferFunctionAttrsLegacyPass() { + return new InferFunctionAttrsLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp index dc56a02..1704bfe 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp @@ -14,10 +14,10 @@ #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -35,17 +35,15 @@ namespace { /// \brief Inliner pass which only handles "always inline" functions. class AlwaysInliner : public Inliner { - InlineCostAnalysis *ICA; public: // Use extremely low threshold. 
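InferFunctionAttrsPass::run above returns PreservedAnalyses::all() when nothing was inferred and PreservedAnalyses::none() otherwise, since newly inferred attributes can invalidate almost any cached analysis. A rough usage sketch against the interim pointer-taking run() signature declared above (analysis registration is elided and assumed to be set up elsewhere):

ModuleAnalysisManager MAM; // assumed: TargetLibraryAnalysis registered
InferFunctionAttrsPass P;
PreservedAnalyses PA = P.run(M, &MAM);
if (PA.areAllPreserved()) {
  // No declaration gained attributes; cached analysis results remain valid.
}
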
- AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), - ICA(nullptr) { + AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } AlwaysInliner(bool InsertLifetime) - : Inliner(ID, -2000000000, InsertLifetime), ICA(nullptr) { + : Inliner(ID, -2000000000, InsertLifetime) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } @@ -53,9 +51,6 @@ public: InlineCost getInlineCost(CallSite CS) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnSCC(CallGraphSCC &SCC) override; - using llvm::Pass::doFinalization; bool doFinalization(CallGraph &CG) override { return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true); @@ -67,10 +62,9 @@ public: char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) @@ -99,19 +93,8 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) { // that are viable for inlining. FIXME: We shouldn't even get here for // declarations. if (Callee && !Callee->isDeclaration() && - CS.hasFnAttr(Attribute::AlwaysInline) && - ICA->isInlineViable(*Callee)) + CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee)) return InlineCost::getAlways(); return InlineCost::getNever(); } - -bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) { - ICA = &getAnalysis<InlineCostAnalysis>(); - return Inliner::runOnSCC(SCC); -} - -void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<InlineCostAnalysis>(); - Inliner::getAnalysisUsage(AU); -} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp index 9b01d81..45609f8 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/InlinerPass.h" using namespace llvm; @@ -37,26 +38,30 @@ namespace { /// inliner pass and the always inliner pass. The two passes use different cost /// analyses to determine when to inline. 
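The -2000000000 threshold above is effectively negative infinity: the always-inliner never inlines on cost grounds, only when a call site carries always_inline and the callee is viable to inline. A minimal legacy-pass-manager sketch using the factory from llvm/Transforms/IPO.h (module setup assumed):

legacy::PassManager PM;
PM.add(createAlwaysInlinerPass(/*InsertLifetime=*/true));
PM.run(M); // inlines only always_inline call sites, regardless of cost
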
class SimpleInliner : public Inliner { - InlineCostAnalysis *ICA; public: - SimpleInliner() : Inliner(ID), ICA(nullptr) { + SimpleInliner() : Inliner(ID) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } SimpleInliner(int Threshold) - : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) { + : Inliner(ID, Threshold, /*InsertLifetime*/ true) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) override { - return ICA->getInlineCost(CS, getInlineThreshold(CS)); + Function *Callee = CS.getCalledFunction(); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + return llvm::getInlineCost(CS, getInlineThreshold(CS), TTI, ACT); } bool runOnSCC(CallGraphSCC &SCC) override; void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + TargetTransformInfoWrapperPass *TTIWP; }; static int computeThresholdFromOptLevels(unsigned OptLevel, @@ -75,10 +80,10 @@ static int computeThresholdFromOptLevels(unsigned OptLevel, char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining", false, false) @@ -95,11 +100,11 @@ Pass *llvm::createFunctionInliningPass(unsigned OptLevel, } bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { - ICA = &getAnalysis<InlineCostAnalysis>(); + TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); return Inliner::runOnSCC(SCC); } void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<InlineCostAnalysis>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); Inliner::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index 5273c3d..bbe5f876 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -64,20 +65,22 @@ ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(225), // Threshold to use when optsize is specified (and there is no -inline-limit). const int OptSizeThreshold = 75; -Inliner::Inliner(char &ID) - : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) {} +Inliner::Inliner(char &ID) + : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) { +} Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime) - : CallGraphSCCPass(ID), InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? - InlineLimit : Threshold), - InsertLifetime(InsertLifetime) {} + : CallGraphSCCPass(ID), + InlineThreshold(InlineLimit.getNumOccurrences() > 0 ? InlineLimit + : Threshold), + InsertLifetime(InsertLifetime) {} /// For this class, we declare that we require and preserve the call graph. /// If the derived class implements this method, it should /// always explicitly call the implementation here. 
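With InlineCostAnalysis removed, the simple inliner above queries the free llvm::getInlineCost(), parameterized by the callee's TargetTransformInfo and the assumption cache tracker. A sketch of consuming the result (CS, TTIWP, and ACT as in the class above):

InlineCost IC = llvm::getInlineCost(CS, getInlineThreshold(CS),
                                    TTIWP->getTTI(*CS.getCalledFunction()),
                                    ACT);
if (IC) {
  // explicit operator bool: the computed cost is under the threshold, so
  // this call site is profitable (or required, for always-inline) to inline.
}
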
void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); CallGraphSCCPass::getAnalysisUsage(AU); } @@ -85,39 +88,6 @@ void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { typedef DenseMap<ArrayType*, std::vector<AllocaInst*> > InlinedArrayAllocasTy; -/// \brief If the inlined function had a higher stack protection level than the -/// calling function, then bump up the caller's stack protection level. -static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { - // If upgrading the SSP attribute, clear out the old SSP Attributes first. - // Having multiple SSP attributes doesn't actually hurt, but it adds useless - // clutter to the IR. - AttrBuilder B; - B.addAttribute(Attribute::StackProtect) - .addAttribute(Attribute::StackProtectStrong) - .addAttribute(Attribute::StackProtectReq); - AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(), - AttributeSet::FunctionIndex, - B); - - if (Callee->hasFnAttribute(Attribute::SafeStack)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::SafeStack); - } else if (Callee->hasFnAttribute(Attribute::StackProtectReq) && - !Caller->hasFnAttribute(Attribute::SafeStack)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::StackProtectReq); - } else if (Callee->hasFnAttribute(Attribute::StackProtectStrong) && - !Caller->hasFnAttribute(Attribute::SafeStack) && - !Caller->hasFnAttribute(Attribute::StackProtectReq)) { - Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); - Caller->addFnAttr(Attribute::StackProtectStrong); - } else if (Callee->hasFnAttribute(Attribute::StackProtect) && - !Caller->hasFnAttribute(Attribute::SafeStack) && - !Caller->hasFnAttribute(Attribute::StackProtectReq) && - !Caller->hasFnAttribute(Attribute::StackProtectStrong)) - Caller->addFnAttr(Attribute::StackProtect); -} - /// If it is possible to inline the specified call site, /// do so and update the CallGraph for this operation. /// @@ -126,18 +96,26 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { /// available from other functions inlined into the caller. If we are able to /// inline this call site we attempt to reuse already available allocas or add /// any new allocas to the set if not possible. -static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, +static bool InlineCallIfPossible(Pass &P, CallSite CS, InlineFunctionInfo &IFI, InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory, bool InsertLifetime) { Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); + // We need to manually construct BasicAA directly in order to disable + // its use of other function analyses. + BasicAAResult BAR(createLegacyPMBasicAAResult(P, *Callee)); + + // Construct our own AA results for this function. We do this manually to + // work around the limitations of the legacy pass manager. + AAResults AAR(createLegacyPMAAResults(P, *Callee, BAR)); + // Try to inline the function. Get the list of static allocas that were // inlined. - if (!InlineFunction(CS, IFI, InsertLifetime)) + if (!InlineFunction(CS, IFI, &AAR, InsertLifetime)) return false; - AdjustCallerSSPLevel(Caller, Callee); + AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee); // Look at all of the allocas that we inlined through this call site. 
If we
   // have already inlined other allocas through other calls into this function,
@@ -219,6 +197,14 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
       DEBUG(dbgs() << "    ***MERGED ALLOCA: " << *AI << "\n\t\tINTO: "
                    << *AvailableAlloca << '\n');
 
+      // Move affected dbg.declare calls immediately after the new alloca to
+      // avoid the situation when a dbg.declare precedes its alloca.
+      if (auto *L = LocalAsMetadata::getIfExists(AI))
+        if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+          for (User *U : MDV->users())
+            if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+              DDI->moveBefore(AvailableAlloca->getNextNode());
+
       AI->replaceAllUsesWith(AvailableAlloca);
 
       if (Align1 != Align2) {
@@ -258,39 +244,64 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
 }
 
 unsigned Inliner::getInlineThreshold(CallSite CS) const {
-  int thres = InlineThreshold; // -inline-threshold or else selected by
-                               // overall opt level
+  int Threshold = InlineThreshold; // -inline-threshold or else selected by
+                                   // overall opt level
 
   // If -inline-threshold is not given, listen to the optsize attribute when it
   // would decrease the threshold.
   Function *Caller = CS.getCaller();
   bool OptSize = Caller && !Caller->isDeclaration() &&
+                 // FIXME: Use Function::optForSize().
                  Caller->hasFnAttribute(Attribute::OptimizeForSize);
   if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
-      OptSizeThreshold < thres)
-    thres = OptSizeThreshold;
+      OptSizeThreshold < Threshold)
+    Threshold = OptSizeThreshold;
 
-  // Listen to the inlinehint attribute when it would increase the threshold
-  // and the caller does not need to minimize its size.
   Function *Callee = CS.getCalledFunction();
-  bool InlineHint = Callee && !Callee->isDeclaration() &&
-                    Callee->hasFnAttribute(Attribute::InlineHint);
-  if (InlineHint && HintThreshold > thres &&
-      !Caller->hasFnAttribute(Attribute::MinSize))
-    thres = HintThreshold;
+  if (!Callee || Callee->isDeclaration())
+    return Threshold;
+
+  // If profile information is available, use that to adjust the threshold of
+  // hot and cold functions.
+  // FIXME: The heuristics used below for determining hotness and coldness are
+  // based on preliminary SPEC tuning and may not be optimal. Replace this with
+  // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
+  uint64_t FunctionCount = 0, MaxFunctionCount = 0;
+  bool HasPGOCounts = false;
+  if (Callee->getEntryCount() &&
+      Callee->getParent()->getMaximumFunctionCount()) {
+    HasPGOCounts = true;
+    FunctionCount = Callee->getEntryCount().getValue();
+    MaxFunctionCount =
+        Callee->getParent()->getMaximumFunctionCount().getValue();
+  }
 
-  // Listen to the cold attribute when it would decrease the threshold.
-  bool ColdCallee = Callee && !Callee->isDeclaration() &&
-                    Callee->hasFnAttribute(Attribute::Cold);
+  // Listen to the inlinehint attribute or profile-based hotness information
+  // when it would increase the threshold and the caller does not need to
+  // minimize its size.
+  bool InlineHint =
+      Callee->hasFnAttribute(Attribute::InlineHint) ||
+      (HasPGOCounts &&
+       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
+  if (InlineHint && HintThreshold > Threshold &&
+      !Caller->hasFnAttribute(Attribute::MinSize))
+    Threshold = HintThreshold;
+
+  // Listen to the cold attribute or profile-based coldness information
+  // when it would decrease the threshold.
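Concretely, the 0.3 and 0.01 cutoffs used here work out as follows (illustrative numbers, same arithmetic as the code):

// With a module-wide maximum entry count of 10000, a callee entered 3000 or
// more times (>= 30%) counts as hot and may be raised to HintThreshold; one
// entered at most 100 times (<= 1%) counts as cold and may be lowered to
// ColdThreshold.
uint64_t MaxFunctionCount = 10000, FunctionCount = 3000;
bool Hot = FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount);   // true
bool Cold = FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount); // false
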
+ bool ColdCallee = + Callee->hasFnAttribute(Attribute::Cold) || + (HasPGOCounts && + FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount)); // Command line argument for InlineLimit will override the default // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, // do not use the default cold threshold even if it is smaller. if ((InlineLimit.getNumOccurrences() == 0 || ColdThreshold.getNumOccurrences() > 0) && ColdCallee && - ColdThreshold < thres) - thres = ColdThreshold; + ColdThreshold < Threshold) + Threshold = ColdThreshold; - return thres; + return Threshold; } static void emitAnalysis(CallSite CS, const Twine &Msg) { @@ -430,10 +441,8 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); - AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - AliasAnalysis *AA = &getAnalysis<AliasAnalysis>(); + ACT = &getAnalysis<AssumptionCacheTracker>(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SmallPtrSet<Function*, 8> SCCFunctions; DEBUG(dbgs() << "Inliner visiting SCC:"); @@ -469,8 +478,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // If this is a direct call to an external function, we can never inline // it. If it is an indirect call, inlining may resolve it to be a // direct call, so we keep it. - if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration()) - continue; + if (Function *Callee = CS.getCalledFunction()) + if (Callee->isDeclaration()) + continue; CallSites.push_back(std::make_pair(CS, -1)); } @@ -492,7 +502,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, AA, ACT); + InlineFunctionInfo InlineInfo(&CG, ACT); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -513,7 +523,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // just delete the call instead of trying to inline it, regardless of // size. This happens because IPSCCP propagates the result out of the // call and then we're left with the dead call. - if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) { + if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) { DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction() << "\n"); // Update the call graph by deleting the edge from Callee to Caller. @@ -550,7 +560,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { } // Attempt to inline the function. - if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, + if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas, InlineHistoryID, InsertLifetime)) { emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, Twine(Callee->getName() + @@ -647,8 +657,8 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { // Scan for all of the functions, looking for ones that should now be removed // from the program. Insert the dead ones in the FunctionsToRemove set. 
-  for (auto I : CG) {
-    CallGraphNode *CGN = I.second;
+  for (const auto &I : CG) {
+    CallGraphNode *CGN = I.second.get();
     Function *F = CGN->getFunction();
     if (!F || F->isDeclaration())
       continue;
diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
index 7950163..21bb5d0 100644
--- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -60,6 +60,10 @@ namespace {
     explicit InternalizePass();
     explicit InternalizePass(ArrayRef<const char *> ExportList);
     void LoadFile(const char *Filename);
+    bool maybeInternalize(GlobalValue &GV,
+                          const std::set<const Comdat *> &ExternalComdats);
+    void checkComdatVisibility(GlobalValue &GV,
+                               std::set<const Comdat *> &ExternalComdats);
     bool runOnModule(Module &M) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -105,40 +109,85 @@ void InternalizePass::LoadFile(const char *Filename) {
   }
 }
 
-static bool shouldInternalize(const GlobalValue &GV,
-                              const std::set<std::string> &ExternalNames) {
+static bool isExternallyVisible(const GlobalValue &GV,
+                                const std::set<std::string> &ExternalNames) {
   // Function must be defined here
   if (GV.isDeclaration())
-    return false;
+    return true;
 
   // Available externally is really just a "declaration with a body".
   if (GV.hasAvailableExternallyLinkage())
-    return false;
+    return true;
 
   // Assume that dllexported symbols are referenced elsewhere
   if (GV.hasDLLExportStorageClass())
-    return false;
-
-  // Already has internal linkage
-  if (GV.hasLocalLinkage())
-    return false;
+    return true;
 
   // Marked to keep external?
-  if (ExternalNames.count(GV.getName()))
-    return false;
+  if (!GV.hasLocalLinkage() && ExternalNames.count(GV.getName()))
+    return true;
+
+  return false;
+}
 
+// Internalize GV if it is possible to do so, i.e. it is not externally visible
+// and is not a member of an externally visible comdat.
+bool InternalizePass::maybeInternalize(
+    GlobalValue &GV, const std::set<const Comdat *> &ExternalComdats) {
+  if (Comdat *C = GV.getComdat()) {
+    if (ExternalComdats.count(C))
+      return false;
+
+    // If a comdat is not externally visible we can drop it.
+    if (auto GO = dyn_cast<GlobalObject>(&GV))
+      GO->setComdat(nullptr);
+
+    if (GV.hasLocalLinkage())
+      return false;
+  } else {
+    if (GV.hasLocalLinkage())
+      return false;
+
+    if (isExternallyVisible(GV, ExternalNames))
+      return false;
+  }
+
+  GV.setVisibility(GlobalValue::DefaultVisibility);
+  GV.setLinkage(GlobalValue::InternalLinkage);
   return true;
 }
 
+// If GV is part of a comdat and is externally visible, keep track of its
+// comdat so that we don't internalize any of its members.
+void InternalizePass::checkComdatVisibility(
+    GlobalValue &GV, std::set<const Comdat *> &ExternalComdats) {
+  Comdat *C = GV.getComdat();
+  if (!C)
+    return;
+
+  if (isExternallyVisible(GV, ExternalNames))
+    ExternalComdats.insert(C);
+}
+
 bool InternalizePass::runOnModule(Module &M) {
   CallGraphWrapperPass *CGPass = getAnalysisIfAvailable<CallGraphWrapperPass>();
   CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
   CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
-  bool Changed = false;
 
   SmallPtrSet<GlobalValue *, 8> Used;
   collectUsedGlobalVariables(M, Used, false);
 
+  // Collect comdat visibility information for the module.
+ std::set<const Comdat *> ExternalComdats; + if (!M.getComdatSymbolTable().empty()) { + for (Function &F : M) + checkComdatVisibility(F, ExternalComdats); + for (GlobalVariable &GV : M.globals()) + checkComdatVisibility(GV, ExternalComdats); + for (GlobalAlias &GA : M.aliases()) + checkComdatVisibility(GA, ExternalComdats); + } + // We must assume that globals in llvm.used have a reference that not even // the linker can see, so we don't internalize them. // For llvm.compiler.used the situation is a bit fuzzy. The assembler and @@ -153,20 +202,16 @@ bool InternalizePass::runOnModule(Module &M) { } // Mark all functions not in the api as internal. - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + for (Function &I : M) { + if (!maybeInternalize(I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - if (ExternalNode) // Remove a callgraph edge from the external node to this function. - ExternalNode->removeOneAbstractEdgeTo((*CG)[I]); + ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]); - Changed = true; ++NumFunctions; - DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n"); + DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n"); } // Never internalize the llvm.used symbol. It is used to implement @@ -191,12 +236,9 @@ bool InternalizePass::runOnModule(Module &M) { // internal as well. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + if (!maybeInternalize(*I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; ++NumGlobals; DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n"); } @@ -204,17 +246,20 @@ bool InternalizePass::runOnModule(Module &M) { // Mark all aliases that are not in the api as internal as well. for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E; ++I) { - if (!shouldInternalize(*I, ExternalNames)) + if (!maybeInternalize(*I, ExternalComdats)) continue; - I->setVisibility(GlobalValue::DefaultVisibility); - I->setLinkage(GlobalValue::InternalLinkage); - Changed = true; ++NumAliases; DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n"); } - return Changed; + // We do not keep track of whether this pass changed the module because + // it adds unnecessary complexity: + // 1) This pass will generally be near the start of the pass pipeline, so + // there will be no analyses to invalidate. + // 2) This pass will most likely end up changing the module and it isn't worth + // worrying about optimizing the case where the module is unchanged. 
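The comdat handling introduced above is two-phase: first record every comdat that has an externally visible member, then internalize only values that are neither externally visible themselves nor members of such a comdat (dropping the comdat from anything that does get internalized). A condensed sketch of the phase ordering; GVs is a hypothetical unified range standing in for the pass's three separate walks over functions, globals, and aliases:

std::set<const Comdat *> ExternalComdats;
for (GlobalValue *GV : GVs)                    // hypothetical unified range
  checkComdatVisibility(*GV, ExternalComdats); // phase 1: mark live comdats
for (GlobalValue *GV : GVs)
  maybeInternalize(*GV, ExternalComdats);      // phase 2: internalize the rest
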
+ return true; } ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 41334ca..3c6a7bb 100644 --- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -38,17 +38,18 @@ namespace { static char ID; // Pass identification, replacement for typeid unsigned NumLoops; - explicit LoopExtractor(unsigned numLoops = ~0) + explicit LoopExtractor(unsigned numLoops = ~0) : LoopPass(ID), NumLoops(numLoops) { initializeLoopExtractorPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(BreakCriticalEdgesID); AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); } }; } @@ -79,7 +80,7 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single", // Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); } -bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; @@ -92,6 +93,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); bool Changed = false; // If there is more than one top-level loop in this function, extract all of @@ -120,14 +122,14 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { } if (ShouldExtractLoop) { - // We must omit landing pads. Landing pads must accompany the invoke + // We must omit EH pads. EH pads must accompany the invoke // instruction. But this would result in a loop in the extracted // function. An infinite cycle occurs when it tries to extract that loop as // well. SmallVector<BasicBlock*, 8> ExitBlocks; L->getExitBlocks(ExitBlocks); for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (ExitBlocks[i]->isLandingPad()) { + if (ExitBlocks[i]->isEHPad()) { ShouldExtractLoop = false; break; } @@ -141,7 +143,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. - LPM.deleteLoopFromQueue(L); + LI.markAsRemoved(L); } ++NumExtracted; } @@ -259,7 +261,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { // Figure out which index the basic block is in its function. 
Function::iterator BBI = MF->begin(); std::advance(BBI, std::distance(F->begin(), Function::iterator(BB))); - TranslatedBlocksToNotExtract.insert(BBI); + TranslatedBlocksToNotExtract.insert(&*BBI); } while (!BlocksToNotExtractByName.empty()) { @@ -278,7 +280,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { BasicBlock &BB = *BI; if (BB.getName() != BlockName) continue; - TranslatedBlocksToNotExtract.insert(BI); + TranslatedBlocksToNotExtract.insert(&*BI); } } @@ -291,8 +293,8 @@ bool BlockExtractorPass::runOnModule(Module &M) { for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { SplitLandingPadPreds(&*F); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - if (!TranslatedBlocksToNotExtract.count(BB)) - BlocksToExtract.push_back(BB); + if (!TranslatedBlocksToNotExtract.count(&*BB)) + BlocksToExtract.push_back(&*BB); } for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) { diff --git a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp index c6795c6..7b51574 100644 --- a/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LowerBitSets.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalObject.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -26,6 +28,8 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -59,9 +63,9 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { bool BitSetInfo::containsValue( const DataLayout &DL, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout, Value *V, uint64_t COffset) const { - if (auto GV = dyn_cast<GlobalVariable>(V)) { + if (auto GV = dyn_cast<GlobalObject>(V)) { auto I = GlobalLayout.find(GV); if (I == GlobalLayout.end()) return false; @@ -90,6 +94,21 @@ bool BitSetInfo::containsValue( return false; } +void BitSetInfo::print(raw_ostream &OS) const { + OS << "offset " << ByteOffset << " size " << BitSize << " align " + << (1 << AlignLog2); + + if (isAllOnes()) { + OS << " all-ones\n"; + return; + } + + OS << " { "; + for (uint64_t B : Bits) + OS << B << ' '; + OS << "}\n"; +} + BitSetInfo BitSetBuilder::build() { if (Min > Max) Min = 0; @@ -193,34 +212,48 @@ struct LowerBitSets : public ModulePass { Module *M; bool LinkerSubsectionsViaSymbols; + Triple::ArchType Arch; + Triple::ObjectFormatType ObjectFormat; IntegerType *Int1Ty; IntegerType *Int8Ty; IntegerType *Int32Ty; Type *Int32PtrTy; IntegerType *Int64Ty; - Type *IntPtrTy; + IntegerType *IntPtrTy; // The llvm.bitsets named metadata. NamedMDNode *BitSetNM; - // Mapping from bitset mdstrings to the call sites that test them. - DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites; + // Mapping from bitset identifiers to the call sites that test them. 
+ DenseMap<Metadata *, std::vector<CallInst *>> BitSetTestCallSites; std::vector<ByteArrayInfo> ByteArrayInfos; BitSetInfo - buildBitSet(MDString *BitSet, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); + buildBitSet(Metadata *BitSet, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); ByteArrayInfo *createByteArray(BitSetInfo &BSI); void allocateByteArrays(); Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI, Value *BitOffset); + void lowerBitSetCalls(ArrayRef<Metadata *> BitSets, + Constant *CombinedGlobalAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); Value * lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - GlobalVariable *CombinedGlobal, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout); - void buildBitSetsFromGlobals(const std::vector<MDString *> &BitSets, - const std::vector<GlobalVariable *> &Globals); + Constant *CombinedGlobal, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout); + void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> BitSets, + ArrayRef<GlobalVariable *> Globals); + unsigned getJumpTableEntrySize(); + Type *getJumpTableEntryType(); + Constant *createJumpTableEntry(GlobalObject *Src, Function *Dest, + unsigned Distance); + void verifyBitSetMDNode(MDNode *Op); + void buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, + ArrayRef<Function *> Functions); + void buildBitSetsFromDisjointSet(ArrayRef<Metadata *> BitSets, + ArrayRef<GlobalObject *> Globals); bool buildBitSets(); bool eraseBitSetMetadata(); @@ -228,7 +261,7 @@ struct LowerBitSets : public ModulePass { bool runOnModule(Module &M) override; }; -} // namespace +} // anonymous namespace INITIALIZE_PASS_BEGIN(LowerBitSets, "lowerbitsets", "Lower bitset metadata", false, false) @@ -244,6 +277,8 @@ bool LowerBitSets::doInitialization(Module &Mod) { Triple TargetTriple(M->getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); + Arch = TargetTriple.getArch(); + ObjectFormat = TargetTriple.getObjectFormat(); Int1Ty = Type::getInt1Ty(M->getContext()); Int8Ty = Type::getInt8Ty(M->getContext()); @@ -262,8 +297,8 @@ bool LowerBitSets::doInitialization(Module &Mod) { /// Build a bit set for BitSet using the object layouts in /// GlobalLayout. BitSetInfo LowerBitSets::buildBitSet( - MDString *BitSet, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Metadata *BitSet, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { BitSetBuilder BSB; // Compute the byte offset of each element of this bitset. 
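BitSetInfo models membership as one bit per aligned offset: ByteOffset is where the covered region starts, 1 << AlignLog2 is the stride, and BitSize is the number of bit positions. A hedged sketch of how member offsets become a bitset via BitSetBuilder (per the LowerBitSets.h header; values are illustrative):

BitSetBuilder BSB;
BSB.addOffset(0);  // member at byte offset 0 of the combined global
BSB.addOffset(64); // member at byte offset 64
BitSetInfo BSI = BSB.build();
// Expected shape: ByteOffset = 0, AlignLog2 = 6 (both offsets are 64-aligned),
// BitSize = 2, with both bits set, so the isAllOnes() fast path applies:
BSI.print(dbgs()); // prints "offset 0 size 2 align 64 all-ones"
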
@@ -271,8 +306,11 @@ BitSetInfo LowerBitSets::buildBitSet( for (MDNode *Op : BitSetNM->operands()) { if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) continue; - auto OpGlobal = dyn_cast<GlobalVariable>( - cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + Constant *OpConst = + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue(); + if (auto GA = dyn_cast<GlobalAlias>(OpConst)) + OpConst = GA->getAliasee(); + auto OpGlobal = dyn_cast<GlobalObject>(OpConst); if (!OpGlobal) continue; uint64_t Offset = @@ -360,9 +398,8 @@ void LowerBitSets::allocateByteArrays() { if (LinkerSubsectionsViaSymbols) { BAI->ByteArray->replaceAllUsesWith(GEP); } else { - GlobalAlias *Alias = - GlobalAlias::create(PointerType::getUnqual(Int8Ty), - GlobalValue::PrivateLinkage, "bits", GEP, M); + GlobalAlias *Alias = GlobalAlias::create( + Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, M); BAI->ByteArray->replaceAllUsesWith(Alias); } BAI->ByteArray->eraseFromParent(); @@ -404,7 +441,7 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, // Each use of the byte array uses a different alias. This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. - ByteArray = GlobalAlias::create(BAI->ByteArray->getType(), + ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0, GlobalValue::PrivateLinkage, "bits_use", ByteArray, M); } @@ -421,17 +458,16 @@ Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, /// replace the call with. Value *LowerBitSets::lowerBitSetCall( CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - GlobalVariable *CombinedGlobal, - const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) { + Constant *CombinedGlobalIntAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { Value *Ptr = CI->getArgOperand(0); const DataLayout &DL = M->getDataLayout(); if (BSI.containsValue(DL, GlobalLayout, Ptr)) - return ConstantInt::getTrue(CombinedGlobal->getParent()->getContext()); + return ConstantInt::getTrue(M->getContext()); - Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy); Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( - GlobalAsInt, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); + CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); BasicBlock *InitialBB = CI->getParent(); @@ -490,18 +526,19 @@ Value *LowerBitSets::lowerBitSetCall( /// Given a disjoint set of bitsets and globals, layout the globals, build the /// bit sets and lower the llvm.bitset.test calls. -void LowerBitSets::buildBitSetsFromGlobals( - const std::vector<MDString *> &BitSets, - const std::vector<GlobalVariable *> &Globals) { +void LowerBitSets::buildBitSetsFromGlobalVariables( + ArrayRef<Metadata *> BitSets, ArrayRef<GlobalVariable *> Globals) { // Build a new global with the combined contents of the referenced globals. + // This global is a struct whose even-indexed elements contain the original + // contents of the referenced globals and whose odd-indexed elements contain + // any padding required to align the next element to the next power of 2. 
std::vector<Constant *> GlobalInits; const DataLayout &DL = M->getDataLayout(); for (GlobalVariable *G : Globals) { GlobalInits.push_back(G->getInitializer()); - uint64_t InitSize = DL.getTypeAllocSize(G->getInitializer()->getType()); + uint64_t InitSize = DL.getTypeAllocSize(G->getValueType()); - // Compute the amount of padding required to align the next element to the - // next power of 2. + // Compute the amount of padding required. uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; // Cap at 128 was found experimentally to have a good data/instruction @@ -515,34 +552,20 @@ void LowerBitSets::buildBitSetsFromGlobals( if (!GlobalInits.empty()) GlobalInits.pop_back(); Constant *NewInit = ConstantStruct::getAnon(M->getContext(), GlobalInits); - auto CombinedGlobal = + auto *CombinedGlobal = new GlobalVariable(*M, NewInit->getType(), /*isConstant=*/true, GlobalValue::PrivateLinkage, NewInit); - const StructLayout *CombinedGlobalLayout = - DL.getStructLayout(cast<StructType>(NewInit->getType())); + StructType *NewTy = cast<StructType>(NewInit->getType()); + const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy); // Compute the offsets of the original globals within the new global. - DenseMap<GlobalVariable *, uint64_t> GlobalLayout; + DenseMap<GlobalObject *, uint64_t> GlobalLayout; for (unsigned I = 0; I != Globals.size(); ++I) // Multiply by 2 to account for padding elements. GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); - // For each bitset in this disjoint set... - for (MDString *BS : BitSets) { - // Build the bitset. - BitSetInfo BSI = buildBitSet(BS, GlobalLayout); - - ByteArrayInfo *BAI = 0; - - // Lower each call to llvm.bitset.test for this bitset. - for (CallInst *CI : BitSetTestCallSites[BS]) { - ++NumBitSetCallsLowered; - Value *Lowered = lowerBitSetCall(CI, BSI, BAI, CombinedGlobal, GlobalLayout); - CI->replaceAllUsesWith(Lowered); - CI->eraseFromParent(); - } - } + lowerBitSetCalls(BitSets, CombinedGlobal, GlobalLayout); // Build aliases pointing to offsets into the combined global for each // global from which we built the combined global, and replace references @@ -556,9 +579,11 @@ void LowerBitSets::buildBitSetsFromGlobals( if (LinkerSubsectionsViaSymbols) { Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr); } else { - GlobalAlias *GAlias = - GlobalAlias::create(Globals[I]->getType(), Globals[I]->getLinkage(), - "", CombinedGlobalElemPtr, M); + assert(Globals[I]->getType()->getAddressSpace() == 0); + GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0, + Globals[I]->getLinkage(), "", + CombinedGlobalElemPtr, M); + GAlias->setVisibility(Globals[I]->getVisibility()); GAlias->takeName(Globals[I]); Globals[I]->replaceAllUsesWith(GAlias); } @@ -566,6 +591,331 @@ void LowerBitSets::buildBitSetsFromGlobals( } } +void LowerBitSets::lowerBitSetCalls( + ArrayRef<Metadata *> BitSets, Constant *CombinedGlobalAddr, + const DenseMap<GlobalObject *, uint64_t> &GlobalLayout) { + Constant *CombinedGlobalIntAddr = + ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy); + + // For each bitset in this disjoint set... + for (Metadata *BS : BitSets) { + // Build the bitset. + BitSetInfo BSI = buildBitSet(BS, GlobalLayout); + DEBUG({ + if (auto BSS = dyn_cast<MDString>(BS)) + dbgs() << BSS->getString() << ": "; + else + dbgs() << "<unnamed>: "; + BSI.print(dbgs()); + }); + + ByteArrayInfo *BAI = nullptr; + + // Lower each call to llvm.bitset.test for this bitset. 
+ for (CallInst *CI : BitSetTestCallSites[BS]) { + ++NumBitSetCallsLowered; + Value *Lowered = + lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalLayout); + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); + } + } +} + +void LowerBitSets::verifyBitSetMDNode(MDNode *Op) { + if (Op->getNumOperands() != 3) + report_fatal_error( + "All operands of llvm.bitsets metadata must have 3 elements"); + if (!Op->getOperand(1)) + return; + + auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); + if (!OpConstMD) + report_fatal_error("Bit set element must be a constant"); + auto OpGlobal = dyn_cast<GlobalObject>(OpConstMD->getValue()); + if (!OpGlobal) + return; + + if (OpGlobal->isThreadLocal()) + report_fatal_error("Bit set element may not be thread-local"); + if (OpGlobal->hasSection()) + report_fatal_error("Bit set element may not have an explicit section"); + + if (isa<GlobalVariable>(OpGlobal) && OpGlobal->isDeclarationForLinker()) + report_fatal_error("Bit set global var element must be a definition"); + + auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); + if (!OffsetConstMD) + report_fatal_error("Bit set element offset must be a constant"); + auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); + if (!OffsetInt) + report_fatal_error("Bit set element offset must be an integer constant"); +} + +static const unsigned kX86JumpTableEntrySize = 8; + +unsigned LowerBitSets::getJumpTableEntrySize() { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + return kX86JumpTableEntrySize; +} + +// Create a constant representing a jump table entry for the target. This +// consists of an instruction sequence containing a relative branch to Dest. The +// constant will be laid out at address Src+(Len*Distance) where Len is the +// target-specific jump table entry size. +Constant *LowerBitSets::createJumpTableEntry(GlobalObject *Src, Function *Dest, + unsigned Distance) { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + const unsigned kJmpPCRel32Code = 0xe9; + const unsigned kInt3Code = 0xcc; + + ConstantInt *Jmp = ConstantInt::get(Int8Ty, kJmpPCRel32Code); + + // Build a constant representing the displacement between the constant's + // address and Dest. This will resolve to a PC32 relocation referring to Dest. + Constant *DestInt = ConstantExpr::getPtrToInt(Dest, IntPtrTy); + Constant *SrcInt = ConstantExpr::getPtrToInt(Src, IntPtrTy); + Constant *Disp = ConstantExpr::getSub(DestInt, SrcInt); + ConstantInt *DispOffset = + ConstantInt::get(IntPtrTy, Distance * kX86JumpTableEntrySize + 5); + Constant *OffsetedDisp = ConstantExpr::getSub(Disp, DispOffset); + OffsetedDisp = ConstantExpr::getTruncOrBitCast(OffsetedDisp, Int32Ty); + + ConstantInt *Int3 = ConstantInt::get(Int8Ty, kInt3Code); + + Constant *Fields[] = { + Jmp, OffsetedDisp, Int3, Int3, Int3, + }; + return ConstantStruct::getAnon(Fields, /*Packed=*/true); +} + +Type *LowerBitSets::getJumpTableEntryType() { + if (Arch != Triple::x86 && Arch != Triple::x86_64) + report_fatal_error("Unsupported architecture for jump tables"); + + return StructType::get(M->getContext(), + {Int8Ty, Int32Ty, Int8Ty, Int8Ty, Int8Ty}, + /*Packed=*/true); +} + +/// Given a disjoint set of bitsets and functions, build a jump table for the +/// functions, build the bit sets and lower the llvm.bitset.test calls. 
+void LowerBitSets::buildBitSetsFromFunctions(ArrayRef<Metadata *> BitSets, + ArrayRef<Function *> Functions) { + // Unlike the global bitset builder, the function bitset builder cannot + // re-arrange functions in a particular order and base its calculations on the + // layout of the functions' entry points, as we have no idea how large a + // particular function will end up being (the size could even depend on what + // this pass does!) Instead, we build a jump table, which is a block of code + // consisting of one branch instruction for each of the functions in the bit + // set that branches to the target function, and redirect any taken function + // addresses to the corresponding jump table entry. In the object file's + // symbol table, the symbols for the target functions also refer to the jump + // table entries, so that addresses taken outside the module will pass any + // verification done inside the module. + // + // In more concrete terms, suppose we have three functions f, g, h which are + // members of a single bitset, and a function foo that returns their + // addresses: + // + // f: + // mov 0, %eax + // ret + // + // g: + // mov 1, %eax + // ret + // + // h: + // mov 2, %eax + // ret + // + // foo: + // mov f, %eax + // mov g, %edx + // mov h, %ecx + // ret + // + // To create a jump table for these functions, we instruct the LLVM code + // generator to output a jump table in the .text section. This is done by + // representing the instructions in the jump table as an LLVM constant and + // placing them in a global variable in the .text section. The end result will + // (conceptually) look like this: + // + // f: + // jmp .Ltmp0 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // g: + // jmp .Ltmp1 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // h: + // jmp .Ltmp2 ; 5 bytes + // int3 ; 1 byte + // int3 ; 1 byte + // int3 ; 1 byte + // + // .Ltmp0: + // mov 0, %eax + // ret + // + // .Ltmp1: + // mov 1, %eax + // ret + // + // .Ltmp2: + // mov 2, %eax + // ret + // + // foo: + // mov f, %eax + // mov g, %edx + // mov h, %ecx + // ret + // + // Because the addresses of f, g, h are evenly spaced at a power of 2, in the + // normal case the check can be carried out using the same kind of simple + // arithmetic that we normally use for globals. + + assert(!Functions.empty()); + + // Build a simple layout based on the regular layout of jump tables. + DenseMap<GlobalObject *, uint64_t> GlobalLayout; + unsigned EntrySize = getJumpTableEntrySize(); + for (unsigned I = 0; I != Functions.size(); ++I) + GlobalLayout[Functions[I]] = I * EntrySize; + + // Create a constant to hold the jump table. + ArrayType *JumpTableType = + ArrayType::get(getJumpTableEntryType(), Functions.size()); + auto JumpTable = new GlobalVariable(*M, JumpTableType, + /*isConstant=*/true, + GlobalValue::PrivateLinkage, nullptr); + JumpTable->setSection(ObjectFormat == Triple::MachO + ? "__TEXT,__text,regular,pure_instructions" + : ".text"); + lowerBitSetCalls(BitSets, JumpTable, GlobalLayout); + + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. 
+ for (unsigned I = 0; I != Functions.size(); ++I) { + Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( + ConstantExpr::getGetElementPtr( + JumpTableType, JumpTable, + ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}), + Functions[I]->getType()); + if (LinkerSubsectionsViaSymbols || Functions[I]->isDeclarationForLinker()) { + Functions[I]->replaceAllUsesWith(CombinedGlobalElemPtr); + } else { + assert(Functions[I]->getType()->getAddressSpace() == 0); + GlobalAlias *GAlias = GlobalAlias::create(Functions[I]->getValueType(), 0, + Functions[I]->getLinkage(), "", + CombinedGlobalElemPtr, M); + GAlias->setVisibility(Functions[I]->getVisibility()); + GAlias->takeName(Functions[I]); + Functions[I]->replaceAllUsesWith(GAlias); + } + if (!Functions[I]->isDeclarationForLinker()) + Functions[I]->setLinkage(GlobalValue::PrivateLinkage); + } + + // Build and set the jump table's initializer. + std::vector<Constant *> JumpTableEntries; + for (unsigned I = 0; I != Functions.size(); ++I) + JumpTableEntries.push_back( + createJumpTableEntry(JumpTable, Functions[I], I)); + JumpTable->setInitializer( + ConstantArray::get(JumpTableType, JumpTableEntries)); +} + +void LowerBitSets::buildBitSetsFromDisjointSet( + ArrayRef<Metadata *> BitSets, ArrayRef<GlobalObject *> Globals) { + llvm::DenseMap<Metadata *, uint64_t> BitSetIndices; + llvm::DenseMap<GlobalObject *, uint64_t> GlobalIndices; + for (unsigned I = 0; I != BitSets.size(); ++I) + BitSetIndices[BitSets[I]] = I; + for (unsigned I = 0; I != Globals.size(); ++I) + GlobalIndices[Globals[I]] = I; + + // For each bitset, build a set of indices that refer to globals referenced by + // the bitset. + std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); + if (BitSetNM) { + for (MDNode *Op : BitSetNM->operands()) { + // Op = { bitset name, global, offset } + if (!Op->getOperand(1)) + continue; + auto I = BitSetIndices.find(Op->getOperand(0)); + if (I == BitSetIndices.end()) + continue; + + auto OpGlobal = dyn_cast<GlobalObject>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); + if (!OpGlobal) + continue; + BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); + } + } + + // Order the sets of indices by size. The GlobalLayoutBuilder works best + // when given small index sets first. + std::stable_sort( + BitSetMembers.begin(), BitSetMembers.end(), + [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { + return O1.size() < O2.size(); + }); + + // Create a GlobalLayoutBuilder and provide it with index sets as layout + // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as + // close together as possible. + GlobalLayoutBuilder GLB(Globals.size()); + for (auto &&MemSet : BitSetMembers) + GLB.addFragment(MemSet); + + // Build the bitsets from this disjoint set. + if (Globals.empty() || isa<GlobalVariable>(Globals[0])) { + // Build a vector of global variables with the computed layout. + std::vector<GlobalVariable *> OrderedGVs(Globals.size()); + auto OGI = OrderedGVs.begin(); + for (auto &&F : GLB.Fragments) { + for (auto &&Offset : F) { + auto GV = dyn_cast<GlobalVariable>(Globals[Offset]); + if (!GV) + report_fatal_error( + "Bit set may not contain both global variables and functions"); + *OGI++ = GV; + } + } + + buildBitSetsFromGlobalVariables(BitSets, OrderedGVs); + } else { + // Build a vector of functions with the computed layout. 
+ std::vector<Function *> OrderedFns(Globals.size()); + auto OFI = OrderedFns.begin(); + for (auto &&F : GLB.Fragments) { + for (auto &&Offset : F) { + auto Fn = dyn_cast<Function>(Globals[Offset]); + if (!Fn) + report_fatal_error( + "Bit set may not contain both global variables and functions"); + *OFI++ = Fn; + } + } + + buildBitSetsFromFunctions(BitSets, OrderedFns); + } +} + /// Lower all bit sets in this module. bool LowerBitSets::buildBitSets() { Function *BitSetTestFunc = @@ -576,24 +926,36 @@ bool LowerBitSets::buildBitSets() { // Equivalence class set containing bitsets and the globals they reference. // This is used to partition the set of bitsets in the module into disjoint // sets. - typedef EquivalenceClasses<PointerUnion<GlobalVariable *, MDString *>> + typedef EquivalenceClasses<PointerUnion<GlobalObject *, Metadata *>> GlobalClassesTy; GlobalClassesTy GlobalClasses; + // Verify the bitset metadata and build a mapping from bitset identifiers to + // their last observed index in BitSetNM. This will be used later to + // deterministically order the list of bitset identifiers. + llvm::DenseMap<Metadata *, unsigned> BitSetIdIndices; + if (BitSetNM) { + for (unsigned I = 0, E = BitSetNM->getNumOperands(); I != E; ++I) { + MDNode *Op = BitSetNM->getOperand(I); + verifyBitSetMDNode(Op); + BitSetIdIndices[Op->getOperand(0)] = I; + } + } + for (const Use &U : BitSetTestFunc->uses()) { auto CI = cast<CallInst>(U.getUser()); auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); - if (!BitSetMDVal || !isa<MDString>(BitSetMDVal->getMetadata())) + if (!BitSetMDVal) report_fatal_error( - "Second argument of llvm.bitset.test must be metadata string"); - auto BitSet = cast<MDString>(BitSetMDVal->getMetadata()); + "Second argument of llvm.bitset.test must be metadata"); + auto BitSet = BitSetMDVal->getMetadata(); // Add the call site to the list of call sites for this bit set. We also use // BitSetTestCallSites to keep track of whether we have seen this bit set // before. If we have, we don't need to re-add the referenced globals to the // equivalence class. - std::pair<DenseMap<MDString *, std::vector<CallInst *>>::iterator, + std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator, bool> Ins = BitSetTestCallSites.insert( std::make_pair(BitSet, std::vector<CallInst *>())); @@ -608,31 +970,16 @@ bool LowerBitSets::buildBitSets() { if (!BitSetNM) continue; - // Verify the bitset metadata and add the referenced globals to the bitset's - // equivalence class. + // Add the referenced globals to the bitset's equivalence class.
for (MDNode *Op : BitSetNM->operands()) { - if (Op->getNumOperands() != 3) - report_fatal_error( - "All operands of llvm.bitsets metadata must have 3 elements"); - if (Op->getOperand(0) != BitSet || !Op->getOperand(1)) continue; - auto OpConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(1)); - if (!OpConstMD) - report_fatal_error("Bit set element must be a constant"); - auto OpGlobal = dyn_cast<GlobalVariable>(OpConstMD->getValue()); + auto OpGlobal = dyn_cast<GlobalObject>( + cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); if (!OpGlobal) continue; - auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Op->getOperand(2)); - if (!OffsetConstMD) - report_fatal_error("Bit set element offset must be a constant"); - auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue()); - if (!OffsetInt) - report_fatal_error( - "Bit set element offset must be an integer constant"); - CurSet = GlobalClasses.unionSets( CurSet, GlobalClasses.findLeader(GlobalClasses.insert(OpGlobal))); } @@ -641,79 +988,51 @@ bool LowerBitSets::buildBitSets() { if (GlobalClasses.empty()) return false; - // For each disjoint set we found... + // Build a list of disjoint sets ordered by their maximum BitSetNM index + // for determinism. + std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets; for (GlobalClassesTy::iterator I = GlobalClasses.begin(), E = GlobalClasses.end(); I != E; ++I) { if (!I->isLeader()) continue; - ++NumBitSetDisjointSets; - // Build the list of bitsets and referenced globals in this disjoint set. - std::vector<MDString *> BitSets; - std::vector<GlobalVariable *> Globals; - llvm::DenseMap<MDString *, uint64_t> BitSetIndices; - llvm::DenseMap<GlobalVariable *, uint64_t> GlobalIndices; + unsigned MaxIndex = 0; for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I); MI != GlobalClasses.member_end(); ++MI) { - if ((*MI).is<MDString *>()) { - BitSetIndices[MI->get<MDString *>()] = BitSets.size(); - BitSets.push_back(MI->get<MDString *>()); - } else { - GlobalIndices[MI->get<GlobalVariable *>()] = Globals.size(); - Globals.push_back(MI->get<GlobalVariable *>()); - } + if ((*MI).is<Metadata *>()) + MaxIndex = std::max(MaxIndex, BitSetIdIndices[MI->get<Metadata *>()]); } + Sets.emplace_back(I, MaxIndex); + } + std::sort(Sets.begin(), Sets.end(), + [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1, + const std::pair<GlobalClassesTy::iterator, unsigned> &S2) { + return S1.second < S2.second; + }); - // For each bitset, build a set of indices that refer to globals referenced - // by the bitset. - std::vector<std::set<uint64_t>> BitSetMembers(BitSets.size()); - if (BitSetNM) { - for (MDNode *Op : BitSetNM->operands()) { - // Op = { bitset name, global, offset } - if (!Op->getOperand(1)) - continue; - auto I = BitSetIndices.find(cast<MDString>(Op->getOperand(0))); - if (I == BitSetIndices.end()) - continue; - - auto OpGlobal = dyn_cast<GlobalVariable>( - cast<ConstantAsMetadata>(Op->getOperand(1))->getValue()); - if (!OpGlobal) - continue; - BitSetMembers[I->second].insert(GlobalIndices[OpGlobal]); - } + // For each disjoint set we found... + for (const auto &S : Sets) { + // Build the list of bitsets in this disjoint set. 
+ std::vector<Metadata *> BitSets; + std::vector<GlobalObject *> Globals; + for (GlobalClassesTy::member_iterator MI = + GlobalClasses.member_begin(S.first); + MI != GlobalClasses.member_end(); ++MI) { + if ((*MI).is<Metadata *>()) + BitSets.push_back(MI->get<Metadata *>()); + else + Globals.push_back(MI->get<GlobalObject *>()); } - // Order the sets of indices by size. The GlobalLayoutBuilder works best - // when given small index sets first. - std::stable_sort( - BitSetMembers.begin(), BitSetMembers.end(), - [](const std::set<uint64_t> &O1, const std::set<uint64_t> &O2) { - return O1.size() < O2.size(); - }); - - // Create a GlobalLayoutBuilder and provide it with index sets as layout - // fragments. The GlobalLayoutBuilder tries to lay out members of fragments - // as close together as possible. - GlobalLayoutBuilder GLB(Globals.size()); - for (auto &&MemSet : BitSetMembers) - GLB.addFragment(MemSet); - - // Build a vector of globals with the computed layout. - std::vector<GlobalVariable *> OrderedGlobals(Globals.size()); - auto OGI = OrderedGlobals.begin(); - for (auto &&F : GLB.Fragments) - for (auto &&Offset : F) - *OGI++ = Globals[Offset]; - - // Order bitsets by name for determinism. - std::sort(BitSets.begin(), BitSets.end(), [](MDString *S1, MDString *S2) { - return S1->getString() < S2->getString(); + // Order bitsets by BitSetNM index for determinism. This ordering is stable + // as there is a one-to-one mapping between metadata and indices. + std::sort(BitSets.begin(), BitSets.end(), [&](Metadata *M1, Metadata *M2) { + return BitSetIdIndices[M1] < BitSetIdIndices[M2]; }); - // Build the bitsets from this disjoint set. - buildBitSetsFromGlobals(BitSets, OrderedGlobals); + // Lower the bitsets in this disjoint set. + buildBitSetsFromDisjointSet(BitSets, Globals); } allocateByteArrays(); diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 2e3519e..8a209a1 100644 --- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -27,6 +27,14 @@ // -- We define Function* container class with custom "operator<" (FunctionPtr). // -- "FunctionPtr" instances are stored in std::set collection, so every // std::set::insert operation will give you result in log(N) time. +// +// As an optimization, a hash of the function structure is calculated first, and +// two functions are only compared if they have the same hash. This hash is +// cheap to compute, and has the property that if function F == G according to +// the comparison function, then hash(F) == hash(G). This consistency property +// is critical to ensuring all possible merging opportunities are exploited. +// Collisions in the hash affect the speed of the pass but not the correctness +// or determinism of the resulting transformation. // // When a match is found the functions are folded. 
If both functions are // overridable, we move the functionality into a new internal function and @@ -87,6 +95,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Hashing.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -97,12 +106,14 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <vector> + using namespace llvm; #define DEBUG_TYPE "mergefunc" @@ -121,21 +132,64 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck( namespace { +/// GlobalNumberState assigns an integer to each global value in the program, +/// which is used by the comparison routine to order references to globals. This +/// state must be preserved throughout the pass, because Functions and other +/// globals need to maintain their relative order. Globals are assigned a number +/// when they are first visited. This order is deterministic, and so the +/// assigned numbers are as well. When two functions are merged, neither number +/// is updated. If the symbols are weak, this would be incorrect. If they are +/// strong, then one will be replaced at all references to the other, and so +/// direct callsites will now see one or the other symbol, and no update is +/// necessary. Note that if we were guaranteed unique names, we could just +/// compare those, but this would not work for stripped bitcodes or for those +/// few symbols without a name. +class GlobalNumberState { + struct Config : ValueMapConfig<GlobalValue*> { + enum { FollowRAUW = false }; + }; + // Each GlobalValue is mapped to an identifier. The Config ensures when RAUW + // occurs, the mapping does not change. Tracking changes is unnecessary, and + // also problematic for weak symbols (which may be overwritten). + typedef ValueMap<GlobalValue *, uint64_t, Config> ValueNumberMap; + ValueNumberMap GlobalNumbers; + // The next unused serial number to assign to a global. + uint64_t NextNumber; + public: + GlobalNumberState() : GlobalNumbers(), NextNumber(0) {} + uint64_t getNumber(GlobalValue* Global) { + ValueNumberMap::iterator MapIter; + bool Inserted; + std::tie(MapIter, Inserted) = GlobalNumbers.insert({Global, NextNumber}); + if (Inserted) + NextNumber++; + return MapIter->second; + } + void clear() { + GlobalNumbers.clear(); + } +}; + /// FunctionComparator - Compares two functions to determine whether or not /// they will generate machine code with the same behaviour. DataLayout is /// used if available. The comparator always fails conservatively (erring on the /// side of claiming that two functions are different). class FunctionComparator { public: - FunctionComparator(const Function *F1, const Function *F2) - : FnL(F1), FnR(F2) {} + FunctionComparator(const Function *F1, const Function *F2, + GlobalNumberState* GN) + : FnL(F1), FnR(F2), GlobalNumbers(GN) {} /// Test whether the two functions have equivalent behaviour. int compare(); + /// Hash a function. Equivalent functions will have the same hash, and unequal + /// functions will have different hashes with high probability. + typedef uint64_t FunctionHash; + static FunctionHash functionHash(Function &); private: /// Test whether two basic blocks have equivalent behaviour. 
- int compare(const BasicBlock *BBL, const BasicBlock *BBR); + int cmpBasicBlocks(const BasicBlock *BBL, const BasicBlock *BBR); /// Constants comparison. /// It's analogous to lexicographical comparison between hypothetical numbers @@ -241,6 +295,10 @@ private: /// If these properties are equal - compare their contents. int cmpConstants(const Constant *L, const Constant *R); + /// Compares two global values by number. Uses the GlobalNumberState to + /// identify the same globals across function calls. + int cmpGlobalValues(GlobalValue *L, GlobalValue *R); + /// Assign or look up previously assigned numbers for the two values, and /// return whether the numbers are equal. Numbers are assigned in the order /// visited. @@ -320,8 +378,9 @@ private: /// /// 1. If types are of different kind (different type IDs). /// Return result of type IDs comparison, treating them as numbers. - /// 2. If types are vectors or integers, compare Type* values as numbers. - /// 3. Types has same ID, so check whether they belongs to the next group: + /// 2. If types are integers, check that they have the same width. If they + /// are vectors, check that they have the same count and subtype. + /// 3. Types have the same ID, so check whether they are one of: /// * Void /// * Float /// * Double @@ -330,8 +389,7 @@ private: /// * PPC_FP128 /// * Label /// * Metadata - /// If so - return 0, yes - we can treat these types as equal only because - /// their IDs are same. + /// We can treat these types as equal whenever their IDs are the same. /// 4. If Left and Right are pointers, return result of address space /// comparison (numbers comparison). We can treat pointer types of same /// address space as equal. @@ -343,11 +401,13 @@ private: int cmpTypes(Type *TyL, Type *TyR) const; int cmpNumbers(uint64_t L, uint64_t R) const; - int cmpAPInts(const APInt &L, const APInt &R) const; int cmpAPFloats(const APFloat &L, const APFloat &R) const; - int cmpStrings(StringRef L, StringRef R) const; + int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const; + int cmpMem(StringRef L, StringRef R) const; int cmpAttrs(const AttributeSet L, const AttributeSet R) const; + int cmpRangeMetadata(const MDNode* L, const MDNode* R) const; + int cmpOperandBundlesSchema(const Instruction *L, const Instruction *R) const; // The two functions undergoing comparison. const Function *FnL, *FnR; @@ -386,30 +446,30 @@ private: /// could be operands from further BBs we didn't scan yet. /// So it's impossible to use dominance properties in general. DenseMap<const Value*, int> sn_mapL, sn_mapR; + + // The global state we will use + GlobalNumberState* GlobalNumbers; }; class FunctionNode { mutable AssertingVH<Function> F; - + FunctionComparator::FunctionHash Hash; public: - FunctionNode(Function *F) : F(F) {} + // Note the hash is recalculated potentially multiple times, but it is cheap. + FunctionNode(Function *F) + : F(F), Hash(FunctionComparator::functionHash(*F)) {} Function *getFunc() const { return F; } + FunctionComparator::FunctionHash getHash() const { return Hash; } /// Replace the reference to the function F by the function G, assuming their /// implementations are equal.
void replaceBy(Function *G) const { - assert(!(*this < FunctionNode(G)) && !(FunctionNode(G) < *this) && - "The two functions must be equal"); - F = G; } - void release() { F = 0; } - bool operator<(const FunctionNode &RHS) const { - return (FunctionComparator(F, RHS.getFunc()).compare()) == -1; - } + void release() { F = nullptr; } }; -} +} // end anonymous namespace int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { if (L < R) return -1; @@ -426,13 +486,25 @@ int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const { } int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const { - if (int Res = cmpNumbers((uint64_t)&L.getSemantics(), - (uint64_t)&R.getSemantics())) + // Floats are ordered first by semantics (i.e. float, double, half, etc.), + // then by value interpreted as a bitstring (aka APInt). + const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics(); + if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL), + APFloat::semanticsPrecision(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL), + APFloat::semanticsMaxExponent(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL), + APFloat::semanticsMinExponent(SR))) + return Res; + if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL), + APFloat::semanticsSizeInBits(SR))) return Res; return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt()); } -int FunctionComparator::cmpStrings(StringRef L, StringRef R) const { +int FunctionComparator::cmpMem(StringRef L, StringRef R) const { // Prevent heavy comparison, compare sizes first. if (int Res = cmpNumbers(L.size(), R.size())) return Res; @@ -466,6 +538,59 @@ int FunctionComparator::cmpAttrs(const AttributeSet L, return 0; } +int FunctionComparator::cmpRangeMetadata(const MDNode* L, + const MDNode* R) const { + if (L == R) + return 0; + if (!L) + return -1; + if (!R) + return 1; + // Range metadata is a sequence of numbers. Make sure they are the same + // sequence. + // TODO: Note that as this is metadata, it is possible to drop and/or merge + // this data when considering functions to merge. Thus this comparison would + // return 0 (i.e. equivalent), but merging would become more complicated + // because the ranges would need to be unioned. It is not likely that + // functions differ ONLY in this metadata if they are actually the same + // function semantically. + if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands())) + return Res; + for (size_t I = 0; I < L->getNumOperands(); ++I) { + ConstantInt* LLow = mdconst::extract<ConstantInt>(L->getOperand(I)); + ConstantInt* RLow = mdconst::extract<ConstantInt>(R->getOperand(I)); + if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue())) + return Res; + } + return 0; +} + +int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L, + const Instruction *R) const { + ImmutableCallSite LCS(L); + ImmutableCallSite RCS(R); + + assert(LCS && RCS && "Must be calls or invokes!"); + assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!"); + + if (int Res = + cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles())) + return Res; + + for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) { + auto OBL = LCS.getOperandBundleAt(i); + auto OBR = RCS.getOperandBundleAt(i); + + if (int Res = OBL.getTagName().compare(OBR.getTagName())) + return Res; + + if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size())) + return Res; + } + + return 0; +} + /// Constants comparison: /// 1. 
Check whether type of L constant could be losslessly bitcasted to R /// type. @@ -500,9 +625,9 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) { unsigned TyLWidth = 0; unsigned TyRWidth = 0; - if (const VectorType *VecTyL = dyn_cast<VectorType>(TyL)) + if (auto *VecTyL = dyn_cast<VectorType>(TyL)) TyLWidth = VecTyL->getBitWidth(); - if (const VectorType *VecTyR = dyn_cast<VectorType>(TyR)) + if (auto *VecTyR = dyn_cast<VectorType>(TyR)) TyRWidth = VecTyR->getBitWidth(); if (TyLWidth != TyRWidth) @@ -538,11 +663,29 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) { if (!L->isNullValue() && R->isNullValue()) return -1; + auto GlobalValueL = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(L)); + auto GlobalValueR = const_cast<GlobalValue*>(dyn_cast<GlobalValue>(R)); + if (GlobalValueL && GlobalValueR) { + return cmpGlobalValues(GlobalValueL, GlobalValueR); + } + if (int Res = cmpNumbers(L->getValueID(), R->getValueID())) return Res; + if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) { + const auto *SeqR = cast<ConstantDataSequential>(R); + // This handles ConstantDataArray and ConstantDataVector. Note that we + // compare the two raw data arrays, which might differ depending on the host + // endianness. This isn't a problem though, because the endianness of a module + // will affect the order of the constants, but this order is the same + // for a given input module and host platform. + return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues()); + } + switch (L->getValueID()) { - case Value::UndefValueVal: return TypesRes; + case Value::UndefValueVal: + case Value::ConstantTokenNoneVal: + return TypesRes; case Value::ConstantIntVal: { const APInt &LInt = cast<ConstantInt>(L)->getValue(); const APInt &RInt = cast<ConstantInt>(R)->getValue(); @@ -609,19 +752,55 @@ int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) { } return 0; } - case Value::FunctionVal: - case Value::GlobalVariableVal: - case Value::GlobalAliasVal: - default: // Unknown constant, cast L and R pointers to numbers and compare. - return cmpNumbers((uint64_t)L, (uint64_t)R); + case Value::BlockAddressVal: { + const BlockAddress *LBA = cast<BlockAddress>(L); + const BlockAddress *RBA = cast<BlockAddress>(R); + if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction())) + return Res; + if (LBA->getFunction() == RBA->getFunction()) { + // They are BBs in the same function. Order by which comes first in the + // BB order of the function. This order is deterministic. + Function* F = LBA->getFunction(); + BasicBlock *LBB = LBA->getBasicBlock(); + BasicBlock *RBB = RBA->getBasicBlock(); + if (LBB == RBB) + return 0; + for (BasicBlock &BB : F->getBasicBlockList()) { + if (&BB == LBB) { + assert(&BB != RBB); + return -1; + } + if (&BB == RBB) + return 1; + } + llvm_unreachable("Basic Block Address does not point to a basic block in " + "its function."); + return -1; + } else { + // cmpValues said the functions are the same. So because they aren't + // literally the same pointer, they must respectively be the left and + // right functions. + assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR); + // cmpValues will tell us if these are equivalent BasicBlocks, in the + // context of their respective functions. + return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock()); + } } + default: // Unknown constant, abort.
+ DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n"); + llvm_unreachable("Constant ValueID not recognized."); + return -1; + } +} + +int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue* R) { + return cmpNumbers(GlobalNumbers->getNumber(L), GlobalNumbers->getNumber(R)); } /// cmpType - compares two types, /// defines total ordering among the types set. /// See method declaration comments for more details. int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { - PointerType *PTyL = dyn_cast<PointerType>(TyL); PointerType *PTyR = dyn_cast<PointerType>(TyR); @@ -642,10 +821,15 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { llvm_unreachable("Unknown type!"); // Fall through in Release mode. case Type::IntegerTyID: - case Type::VectorTyID: - // TyL == TyR would have returned true earlier. - return cmpNumbers((uint64_t)TyL, (uint64_t)TyR); - + return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(), + cast<IntegerType>(TyR)->getBitWidth()); + case Type::VectorTyID: { + VectorType *VTyL = cast<VectorType>(TyL), *VTyR = cast<VectorType>(TyR); + if (int Res = cmpNumbers(VTyL->getNumElements(), VTyR->getNumElements())) + return Res; + return cmpTypes(VTyL->getElementType(), VTyR->getElementType()); + } + // TyL == TyR would have returned true earlier, because types are uniqued. case Type::VoidTyID: case Type::FloatTyID: case Type::DoubleTyID: @@ -654,6 +838,7 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { case Type::PPC_FP128TyID: case Type::LabelTyID: case Type::MetadataTyID: + case Type::TokenTyID: return 0; case Type::PointerTyID: { @@ -759,8 +944,8 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope())) return Res; - return cmpNumbers((uint64_t)LI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); + return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range), + cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range)); } if (const StoreInst *SI = dyn_cast<StoreInst>(L)) { if (int Res = @@ -783,20 +968,24 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes())) return Res; - return cmpNumbers( - (uint64_t)CI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); + if (int Res = cmpOperandBundlesSchema(CI, R)) + return Res; + return cmpRangeMetadata( + CI->getMetadata(LLVMContext::MD_range), + cast<CallInst>(R)->getMetadata(LLVMContext::MD_range)); } - if (const InvokeInst *CI = dyn_cast<InvokeInst>(L)) { - if (int Res = cmpNumbers(CI->getCallingConv(), + if (const InvokeInst *II = dyn_cast<InvokeInst>(L)) { + if (int Res = cmpNumbers(II->getCallingConv(), cast<InvokeInst>(R)->getCallingConv())) return Res; if (int Res = - cmpAttrs(CI->getAttributes(), cast<InvokeInst>(R)->getAttributes())) + cmpAttrs(II->getAttributes(), cast<InvokeInst>(R)->getAttributes())) + return Res; + if (int Res = cmpOperandBundlesSchema(II, R)) return Res; - return cmpNumbers( - (uint64_t)CI->getMetadata(LLVMContext::MD_range), - (uint64_t)cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); + return cmpRangeMetadata( + II->getMetadata(LLVMContext::MD_range), + cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range)); } if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) { ArrayRef<unsigned> LIndices = IVI->getIndices(); @@ -876,9 +1065,8 @@ int 
FunctionComparator::cmpGEPs(const GEPOperator *GEPL, if (GEPL->accumulateConstantOffset(DL, OffsetL) && GEPR->accumulateConstantOffset(DL, OffsetR)) return cmpAPInts(OffsetL, OffsetR); - - if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(), - (uint64_t)GEPR->getPointerOperand()->getType())) + if (int Res = cmpTypes(GEPL->getSourceElementType(), + GEPR->getSourceElementType())) return Res; if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands())) @@ -892,6 +1080,28 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL, return 0; } +int FunctionComparator::cmpInlineAsm(const InlineAsm *L, + const InlineAsm *R) const { + // InlineAsm's are uniqued. If they are the same pointer, obviously they are + // the same, otherwise compare the fields. + if (L == R) + return 0; + if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType())) + return Res; + if (int Res = cmpMem(L->getAsmString(), R->getAsmString())) + return Res; + if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString())) + return Res; + if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects())) + return Res; + if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack())) + return Res; + if (int Res = cmpNumbers(L->getDialect(), R->getDialect())) + return Res; + llvm_unreachable("InlineAsm blocks were not uniqued."); + return 0; +} + /// Compare two values used by the two functions under pair-wise comparison. If /// this is the first time the values are seen, they're added to the mapping so /// that we will detect mismatches on next use. @@ -926,7 +1136,7 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) { const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R); if (InlineAsmL && InlineAsmR) - return cmpNumbers((uint64_t)L, (uint64_t)R); + return cmpInlineAsm(InlineAsmL, InlineAsmR); if (InlineAsmL) return 1; if (InlineAsmR) @@ -938,12 +1148,13 @@ int FunctionComparator::cmpValues(const Value *L, const Value *R) { return cmpNumbers(LeftSN.first->second, RightSN.first->second); } // Test whether two basic blocks have equivalent behaviour. -int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { +int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL, + const BasicBlock *BBR) { BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end(); BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end(); do { - if (int Res = cmpValues(InstL, InstR)) + if (int Res = cmpValues(&*InstL, &*InstR)) return Res; const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(InstL); @@ -961,7 +1172,7 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { if (int Res = cmpGEPs(GEPL, GEPR)) return Res; } else { - if (int Res = cmpOperations(InstL, InstR)) + if (int Res = cmpOperations(&*InstL, &*InstR)) return Res; assert(InstL->getNumOperands() == InstR->getNumOperands()); @@ -970,11 +1181,8 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { Value *OpR = InstR->getOperand(i); if (int Res = cmpValues(OpL, OpR)) return Res; - if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID())) - return Res; - // TODO: Already checked in cmpOperation - if (int Res = cmpTypes(OpL->getType(), OpR->getType())) - return Res; + // cmpValues should ensure this is true. 
+ assert(cmpTypes(OpL->getType(), OpR->getType()) == 0); } } @@ -990,7 +1198,6 @@ int FunctionComparator::compare(const BasicBlock *BBL, const BasicBlock *BBR) { // Test whether the two functions have equivalent behaviour. int FunctionComparator::compare() { - sn_mapL.clear(); sn_mapR.clear(); @@ -1001,7 +1208,7 @@ int FunctionComparator::compare() { return Res; if (FnL->hasGC()) { - if (int Res = cmpNumbers((uint64_t)FnL->getGC(), (uint64_t)FnR->getGC())) + if (int Res = cmpMem(FnL->getGC(), FnR->getGC())) return Res; } @@ -1009,7 +1216,7 @@ int FunctionComparator::compare() { return Res; if (FnL->hasSection()) { - if (int Res = cmpStrings(FnL->getSection(), FnR->getSection())) + if (int Res = cmpMem(FnL->getSection(), FnR->getSection())) return Res; } @@ -1033,7 +1240,7 @@ int FunctionComparator::compare() { ArgRI = FnR->arg_begin(), ArgLE = FnL->arg_end(); ArgLI != ArgLE; ++ArgLI, ++ArgRI) { - if (cmpValues(ArgLI, ArgRI) != 0) + if (cmpValues(&*ArgLI, &*ArgRI) != 0) llvm_unreachable("Arguments repeat!"); } @@ -1055,7 +1262,7 @@ int FunctionComparator::compare() { if (int Res = cmpValues(BBL, BBR)) return Res; - if (int Res = compare(BBL, BBR)) + if (int Res = cmpBasicBlocks(BBL, BBR)) return Res; const TerminatorInst *TermL = BBL->getTerminator(); @@ -1074,6 +1281,68 @@ int FunctionComparator::compare() { } namespace { +// Accumulate the hash of a sequence of 64-bit integers. This is similar to a +// hash of a sequence of 64bit ints, but the entire input does not need to be +// available at once. This interface is necessary for functionHash because it +// needs to accumulate the hash as the structure of the function is traversed +// without saving these values to an intermediate buffer. This form of hashing +// is not often needed, as usually the object to hash is just read from a +// buffer. +class HashAccumulator64 { + uint64_t Hash; +public: + // Initialize to random constant, so the state isn't zero. + HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; } + void add(uint64_t V) { + Hash = llvm::hashing::detail::hash_16_bytes(Hash, V); + } + // No finishing is required, because the entire hash value is used. + uint64_t getHash() { return Hash; } +}; +} // end anonymous namespace + +// A function hash is calculated by considering only the number of arguments and +// whether a function is varargs, the order of basic blocks (given by the +// successors of each basic block in depth first order), and the order of +// opcodes of each instruction within each of these basic blocks. This mirrors +// the strategy compare() uses to compare functions by walking the BBs in depth +// first order and comparing each instruction in sequence. Because this hash +// does not look at the operands, it is insensitive to things such as the +// target of calls and the constants used in the function, which makes it useful +// when possibly merging functions which are the same modulo constants and call +// targets. +FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) { + HashAccumulator64 H; + H.add(F.isVarArg()); + H.add(F.arg_size()); + + SmallVector<const BasicBlock *, 8> BBs; + SmallSet<const BasicBlock *, 16> VisitedBBs; + + // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(), + // accumulating the hash of the function "structure." 
(BB and opcode sequence) + BBs.push_back(&F.getEntryBlock()); + VisitedBBs.insert(BBs[0]); + while (!BBs.empty()) { + const BasicBlock *BB = BBs.pop_back_val(); + // This random value acts as a block header, as otherwise the partition of + // opcodes into BBs wouldn't affect the hash, only the order of the opcodes + H.add(45798); + for (auto &Inst : *BB) { + H.add(Inst.getOpcode()); + } + const TerminatorInst *Term = BB->getTerminator(); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + if (!VisitedBBs.insert(Term->getSuccessor(i)).second) + continue; + BBs.push_back(Term->getSuccessor(i)); + } + } + return H.getHash(); +} + + +namespace { /// MergeFunctions finds functions which will generate identical machine code, /// by considering all pointer types to be equivalent. Once identified, @@ -1084,14 +1353,31 @@ class MergeFunctions : public ModulePass { public: static char ID; MergeFunctions() - : ModulePass(ID), HasGlobalAliases(false) { + : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)), FNodesInTree(), + HasGlobalAliases(false) { initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override; private: - typedef std::set<FunctionNode> FnTreeType; + // The function comparison operator is provided here so that FunctionNodes do + // not need to become larger with another pointer. + class FunctionNodeCmp { + GlobalNumberState* GlobalNumbers; + public: + FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {} + bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const { + // Order first by hashes, then full function comparison. + if (LHS.getHash() != RHS.getHash()) + return LHS.getHash() < RHS.getHash(); + FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers); + return FCmp.compare() == -1; + } + }; + typedef std::set<FunctionNode, FunctionNodeCmp> FnTreeType; + + GlobalNumberState GlobalNumbers; /// A work queue of functions that may have been modified and should be /// analyzed again. @@ -1133,17 +1419,23 @@ private: void writeAlias(Function *F, Function *G); /// Replace function F with function G in the function tree. - void replaceFunctionInTree(FnTreeType::iterator &IterToF, Function *G); + void replaceFunctionInTree(const FunctionNode &FN, Function *G); /// The set of all distinct functions. Use the insert() and remove() methods - /// to modify it. + /// to modify it. The map allows efficient lookup and deferring of Functions. FnTreeType FnTree; + // Map functions to the iterators of the FunctionNode which contains them + // in the FnTree. This must be updated carefully whenever the FnTree is + // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid + // dangling iterators into FnTree. The invariant that preserves this is that + // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree. + ValueMap<Function*, FnTreeType::iterator> FNodesInTree; /// Whether or not the target supports global aliases. 
bool HasGlobalAliases; }; -} // end anonymous namespace +} // end anonymous namespace char MergeFunctions::ID = 0; INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) @@ -1166,8 +1458,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) { Function *F1 = cast<Function>(*I); Function *F2 = cast<Function>(*J); - int Res1 = FunctionComparator(F1, F2).compare(); - int Res2 = FunctionComparator(F2, F1).compare(); + int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare(); + int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare(); // If F1 <= F2, then F2 >= F1, otherwise report failure. if (Res1 != -Res2) { @@ -1188,8 +1480,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { continue; Function *F3 = cast<Function>(*K); - int Res3 = FunctionComparator(F1, F3).compare(); - int Res4 = FunctionComparator(F2, F3).compare(); + int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare(); + int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare(); bool Transitive = true; @@ -1227,11 +1519,33 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) - Deferred.push_back(WeakVH(I)); + // All functions in the module, ordered by hash. Functions with a unique + // hash value are easily eliminated. + std::vector<std::pair<FunctionComparator::FunctionHash, Function *>> + HashedFuncs; + for (Function &Func : M) { + if (!Func.isDeclaration() && !Func.hasAvailableExternallyLinkage()) { + HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func}); + } } + std::stable_sort( + HashedFuncs.begin(), HashedFuncs.end(), + [](const std::pair<FunctionComparator::FunctionHash, Function *> &a, + const std::pair<FunctionComparator::FunctionHash, Function *> &b) { + return a.first < b.first; + }); + + auto S = HashedFuncs.begin(); + for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) { + // If the hash value matches the previous value or the next one, we must + // consider merging it. Otherwise it is dropped and never considered again. + if ((I != S && std::prev(I)->first == I->first) || + (std::next(I) != IE && std::next(I)->first == I->first) ) { + Deferred.push_back(WeakVH(I->second)); + } + } + do { std::vector<WeakVH> Worklist; Deferred.swap(Worklist); @@ -1270,6 +1584,7 @@ bool MergeFunctions::runOnModule(Module &M) { } while (!Deferred.empty()); FnTree.clear(); + GlobalNumbers.clear(); return Changed; } @@ -1282,6 +1597,32 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { ++UI; CallSite CS(U->getUser()); if (CS && CS.isCallee(U)) { + // Transfer the called function's attributes to the call site. Due to the + // bitcast we will 'lose' ABI changing attributes because the 'called + // function' is no longer a Function* but the bitcast. Code that looks up + // the attributes from the called function will fail. + + // FIXME: This is not actually true, at least not anymore. The callsite + // will always have the same ABI affecting attributes as the callee, + // because otherwise the original input has UB. Note that Old and New + // always have matching ABI, so no attributes need to be changed. 
+ // Transferring other attributes may help other optimizations, but that + // should be done uniformly and not in this ad-hoc way. + auto &Context = New->getContext(); + auto NewFuncAttrs = New->getAttributes(); + auto CallSiteAttrs = CS.getAttributes(); + + CallSiteAttrs = CallSiteAttrs.addAttributes( + Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes()); + + for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) { + AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx); + if (Attrs.getNumSlots()) + CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs); + } + + CS.setAttributes(CallSiteAttrs); + remove(CS.getInstruction()->getParent()->getParent()); U->set(BitcastNew); } @@ -1352,15 +1693,15 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { SmallVector<Value *, 16> Args; unsigned i = 0; FunctionType *FFTy = F->getFunctionType(); - for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); - AI != AE; ++AI) { - Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i))); + for (Argument & AI : NewG->args()) { + Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i))); ++i; } CallInst *CI = Builder.CreateCall(F, Args); CI->setTailCall(); CI->setCallingConv(F->getCallingConv()); + CI->setAttributes(F->getAttributes()); if (NewG->getReturnType()->isVoidTy()) { Builder.CreateRetVoid(); } else { @@ -1379,8 +1720,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { // Replace G with an alias to F and delete G. void MergeFunctions::writeAlias(Function *F, Function *G) { - PointerType *PTy = G->getType(); - auto *GA = GlobalAlias::create(PTy, G->getLinkage(), "", F); + auto *GA = GlobalAlias::create(G->getLinkage(), "", F); F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); GA->takeName(G); GA->setVisibility(G->getVisibility()); @@ -1425,19 +1765,24 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { ++NumFunctionsMerged; } -/// Replace function F for function G in the map. -void MergeFunctions::replaceFunctionInTree(FnTreeType::iterator &IterToF, +/// Replace function F by function G. +void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN, Function *G) { - Function *F = IterToF->getFunc(); - - // A total order is already guaranteed otherwise because we process strong - // functions before weak functions. - assert(((F->mayBeOverridden() && G->mayBeOverridden()) || - (!F->mayBeOverridden() && !G->mayBeOverridden())) && - "Only change functions if both are strong or both are weak"); - (void)F; - - IterToF->replaceBy(G); + Function *F = FN.getFunc(); + assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 && + "The two functions must be equal"); + + auto I = FNodesInTree.find(F); + assert(I != FNodesInTree.end() && "F should be in FNodesInTree"); + assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G"); + + FnTreeType::iterator IterToFNInFnTree = I->second; + assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree."); + // Remove F -> FN and insert G -> FN + FNodesInTree.erase(I); + FNodesInTree.insert({G, IterToFNInFnTree}); + // Replace F with G in FN, which is stored inside the FnTree. 
+ FN.replaceBy(G); } // Insert a ComparableFunction into the FnTree, or merge it away if equal to one @@ -1447,6 +1792,8 @@ bool MergeFunctions::insert(Function *NewFunction) { FnTree.insert(FunctionNode(NewFunction)); if (Result.second) { + assert(FNodesInTree.count(NewFunction) == 0); + FNodesInTree.insert({NewFunction, Result.first}); DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n'); return false; } @@ -1476,7 +1823,7 @@ bool MergeFunctions::insert(Function *NewFunction) { if (OldF.getFunc()->getName() > NewFunction->getName()) { // Swap the two functions. Function *F = OldF.getFunc(); - replaceFunctionInTree(Result.first, NewFunction); + replaceFunctionInTree(*Result.first, NewFunction); NewFunction = F; assert(OldF.getFunc() != F && "Must have swapped the functions."); } @@ -1495,18 +1842,13 @@ bool MergeFunctions::insert(Function *NewFunction) { // Remove a function from FnTree. If it was already in FnTree, add // it to Deferred so that we'll look at it in the next round. void MergeFunctions::remove(Function *F) { - // We need to make sure we remove F, not a function "equal" to F per the - // function equality comparator. - FnTreeType::iterator found = FnTree.find(FunctionNode(F)); - size_t Erased = 0; - if (found != FnTree.end() && found->getFunc() == F) { - Erased = 1; - FnTree.erase(found); - } - - if (Erased) { - DEBUG(dbgs() << "Removed " << F->getName() - << " from set and deferred it.\n"); + auto I = FNodesInTree.find(F); + if (I != FNodesInTree.end()) { + DEBUG(dbgs() << "Deferred " << F->getName() << ".\n"); + FnTree.erase(I->second); + // I->second has been invalidated; remove it from the FNodesInTree map to + // preserve the invariant. + FNodesInTree.erase(I); Deferred.emplace_back(F); } } @@ -1516,6 +1858,8 @@ void MergeFunctions::remove(Function *F) { void MergeFunctions::removeUsers(Value *V) { std::vector<Value *> Worklist; Worklist.push_back(V); + SmallSet<Value*, 8> Visited; + Visited.insert(V); while (!Worklist.empty()) { Value *V = Worklist.back(); Worklist.pop_back(); @@ -1526,8 +1870,10 @@ void MergeFunctions::removeUsers(Value *V) { } else if (isa<GlobalValue>(U)) { // do nothing } else if (Constant *C = dyn_cast<Constant>(U)) { - for (User *UU : C->users()) - Worklist.push_back(UU); + for (User *UU : C->users()) { + if (Visited.insert(UU).second) + Worklist.push_back(UU); + } } } } diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp index 4a7cb7b..0c5c84b 100644 --- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -50,7 +50,7 @@ ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); } Function* PartialInliner::unswitchFunction(Function* F) { // First, verify that this function is an unswitching candidate... - BasicBlock* entryBlock = F->begin(); + BasicBlock *entryBlock = &F->front(); BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator()); if (!BR || BR->isUnconditional()) return nullptr; @@ -89,18 +89,18 @@ Function* PartialInliner::unswitchFunction(Function* F) { // of which will go outside. 
BasicBlock* preReturn = newReturnBlock; newReturnBlock = newReturnBlock->splitBasicBlock( - newReturnBlock->getFirstNonPHI()); + newReturnBlock->getFirstNonPHI()->getIterator()); BasicBlock::iterator I = preReturn->begin(); - BasicBlock::iterator Ins = newReturnBlock->begin(); + Instruction *Ins = &newReturnBlock->front(); while (I != preReturn->end()) { PHINode* OldPhi = dyn_cast<PHINode>(I); if (!OldPhi) break; - - PHINode* retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); + + PHINode *retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); OldPhi->replaceAllUsesWith(retPhi); Ins = newReturnBlock->getFirstNonPHI(); - - retPhi->addIncoming(I, preReturn); + + retPhi->addIncoming(&*I, preReturn); retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock), newEntryBlock); OldPhi->removeIncomingValue(newEntryBlock); @@ -116,8 +116,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { FE = duplicateFunction->end(); FI != FE; ++FI) if (&*FI != newEntryBlock && &*FI != newReturnBlock && &*FI != newNonReturnBlock) - toExtract.push_back(FI); - + toExtract.push_back(&*FI); + // The CodeExtractor needs a dominator tree. DominatorTree DT; DT.recalculate(*duplicateFunction); diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 909baae..faada9c 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -12,19 +12,26 @@ // //===----------------------------------------------------------------------===// - #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm-c/Transforms/PassManagerBuilder.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CFLAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Verifier.h" +#include "llvm/IR/FunctionInfo.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ForceFunctionAttrs.h" +#include "llvm/Transforms/IPO/InferFunctionAttrs.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Vectorize.h" @@ -89,11 +96,21 @@ static cl::opt<bool> EnableLoopDistribute( "enable-loop-distribute", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopDistribution Pass")); +static cl::opt<bool> EnableNonLTOGlobalsModRef( + "enable-non-lto-gmr", cl::init(true), cl::Hidden, + cl::desc( + "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline.")); + +static cl::opt<bool> EnableLoopLoadElim( + "enable-loop-load-elim", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopLoadElimination Pass")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; LibraryInfo = nullptr; Inliner = nullptr; + FunctionIndex = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; @@ -143,10 +160,9 @@ void PassManagerBuilder::addInitialAliasAnalysisPasses( // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. 
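For orientation, PassManagerBuilder is driven from embedder code such as the sketch below. A minimal example assuming LLVM 3.8-era legacy pass-manager headers: the builder fields and populate calls are real API, while the surrounding function is hypothetical.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

// Populate both pipelines roughly the way a -O2 frontend would.
static void buildPipelines(llvm::legacy::PassManager &MPM,
                           llvm::legacy::FunctionPassManager &FPM) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;         // -O2
  PMB.SizeLevel = 0;        // not -Os / -Oz
  PMB.LoopVectorize = true; // toggles consulted by the pipeline code here
  PMB.SLPVectorize = true;
  PMB.populateFunctionPassManager(FPM);
  PMB.populateModulePassManager(MPM);
}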
if (UseCFLAA) - PM.add(createCFLAliasAnalysisPass()); - PM.add(createTypeBasedAliasAnalysisPass()); - PM.add(createScopedNoAliasAAPass()); - PM.add(createBasicAliasAnalysisPass()); + PM.add(createCFLAAWrapperPass()); + PM.add(createTypeBasedAAWrapperPass()); + PM.add(createScopedNoAliasAAWrapperPass()); } void PassManagerBuilder::populateFunctionPassManager( @@ -172,6 +188,9 @@ void PassManagerBuilder::populateFunctionPassManager( void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { + // Allow forcing function attributes as a debugging and tuning aid. + MPM.add(createForceFunctionAttrsLegacyPass()); + // If all optimizations are disabled, just run the always-inline pass and, // if enabled, the function merging pass. if (OptLevel == 0) { @@ -201,10 +220,15 @@ void PassManagerBuilder::populateModulePassManager( addInitialAliasAnalysisPasses(MPM); if (!DisableUnitAtATime) { + // Infer attributes about declarations if possible. + MPM.add(createInferFunctionAttrsLegacyPass()); + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createGlobalOptimizerPass()); // Optimize out global vars + // Promote any localized global vars + MPM.add(createPromoteMemoryToRegisterPass()); MPM.add(createDeadArgEliminationPass()); // Dead argument elimination @@ -213,6 +237,12 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } + if (EnableNonLTOGlobalsModRef) + // We add a module alias analysis pass here. In part due to bugs in the + // analysis infrastructure this "works" in that the analysis stays alive + // for the entire SCC pass run below. + MPM.add(createGlobalsAAWrapperPass()); + // Start of CallGraph SCC passes. if (!DisableUnitAtATime) MPM.add(createPruneEHPass()); // Remove dead EH info @@ -221,7 +251,7 @@ void PassManagerBuilder::populateModulePassManager( Inliner = nullptr; } if (!DisableUnitAtATime) - MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs + MPM.add(createPostOrderFunctionAttrsPass()); if (OptLevel > 2) MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args @@ -245,6 +275,7 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + MPM.add(createCFGSimplificationPass()); MPM.add(createInstructionCombiningPass()); MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. @@ -315,9 +346,45 @@ void PassManagerBuilder::populateModulePassManager( // we must insert a no-op module pass to reset the pass manager. MPM.add(createBarrierNoopPass()); + if (!DisableUnitAtATime) + MPM.add(createReversePostOrderFunctionAttrsPass()); + + if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO) { + // Remove avail extern fns and globals definitions if we aren't + // compiling an object file for later LTO. For LTO we want to preserve + // these so they are eligible for inlining at link-time. Note if they + // are unreferenced they will be removed by GlobalDCE later, so + // this only impacts referenced available externally globals. + // Eventually they will be suppressed during codegen, but eliminating + // here enables more opportunity for GlobalDCE as it may make + // globals referenced by available external functions dead + // and saves running remaining passes on the eliminated functions. 
+ MPM.add(createEliminateAvailableExternallyPass()); + } + + if (EnableNonLTOGlobalsModRef) + // We add a fresh GlobalsModRef run at this point. This is particularly + // useful as the above will have inlined, DCE'ed, and function-attr + // propagated everything. We should at this point have a reasonably minimal + // and richly annotated call graph. By computing aliasing and mod/ref + // information for all local globals here, the late loop passes and notably + // the vectorizer will be able to use them to help recognize vectorizable + // memory operations. + // + // Note that this relies on a bug in the pass manager which preserves + // a module analysis into a function pass pipeline (and throughout it) so + // long as the first function pass doesn't invalidate the module analysis. + // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for + // this to work. Fortunately, it is trivial to preserve AliasAnalysis + // (doing nothing preserves it as it is required to be conservatively + // correct in the face of IR changes). + MPM.add(createGlobalsAAWrapperPass()); + if (RunFloat2Int) MPM.add(createFloat2IntPass()); + addExtensionsToPM(EP_VectorizerStart, MPM); + // Re-rotate loops in all our loop nests. These may have fallen out of // rotated form due to GVN or other transformations, and the vectorizer relies // on the rotated form. Disable header duplication at -Oz. @@ -329,6 +396,12 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopDistributePass()); MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize)); + + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + if (EnableLoopLoadElim) + MPM.add(createLoopLoadEliminationPass()); + // FIXME: Because of #pragma vectorize enable, the passes below are always // inserted in the pipeline, even when the vectorizer doesn't run (ex. when // on -O1 and no #pragma is found). Would be good to have these two passes @@ -402,17 +475,6 @@ void PassManagerBuilder::populateModulePassManager( // GlobalOpt already deletes dead functions and globals; at -O2 try a // late pass of GlobalDCE. It is capable of deleting dead cycles. if (OptLevel > 1) { - if (!PrepareForLTO) { - // Remove avail extern fns and globals definitions if we aren't - // compiling an object file for later LTO. For LTO we want to preserve - // these so they are eligible for inlining at link-time. Note if they - // are unreferenced they will be removed by GlobalDCE below, so - // this only impacts referenced available externally globals. - // Eventually they will be suppressed during codegen, but eliminating - // here enables more opportunity for GlobalDCE as it may make - // globals referenced by available external functions dead. - MPM.add(createEliminateAvailableExternallyPass()); - } MPM.add(createGlobalDCEPass()); // Remove dead fns and globals. MPM.add(createConstantMergePass()); // Merge dup global constants } @@ -428,13 +490,26 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); + if (FunctionIndex) + PM.add(createFunctionImportPass(FunctionIndex)); + + // Allow forcing function attributes as a debugging and tuning aid. + PM.add(createForceFunctionAttrsLegacyPass()); + + // Infer attributes about declarations if possible. + PM.add(createInferFunctionAttrsLegacyPass()); + + // Propagate constants at call sites into the functions they call.
This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. PM.add(createIPSCCPPass()); // Now that we internalized some globals, see if we can hack on them! + PM.add(createPostOrderFunctionAttrsPass()); + PM.add(createReversePostOrderFunctionAttrsPass()); PM.add(createGlobalOptimizerPass()); + // Promote any localized global vars. + PM.add(createPromoteMemoryToRegisterPass()); // Linking modules together can lead to duplicated global constants, only // keep one copy of each constant. @@ -480,8 +555,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createScalarReplAggregatesPass()); // Run a few AA driven optimizations here and now, to cleanup the code. - PM.add(createFunctionAttrsPass()); // Add nocapture. - PM.add(createGlobalsModRefPass()); // IP alias analysis. + PM.add(createPostOrderFunctionAttrsPass()); // Add nocapture. + PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. PM.add(createLICMPass()); // Hoist loop invariants. if (EnableMLSM) @@ -500,6 +575,15 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createLoopVectorizePass(true, LoopVectorize)); + // Now that we've optimized loops (in particular loop induction variables), + // we may have exposed more scalar opportunities. Run parts of the scalar + // optimizer again at this point. + PM.add(createInstructionCombiningPass()); // Initial cleanup + PM.add(createCFGSimplificationPass()); // if-convert + PM.add(createSCCPPass()); // Propagate exposed constants + PM.add(createInstructionCombiningPass()); // Clean up again + PM.add(createBitTrackingDCEPass()); + // More scalar chains could be vectorized due to more alias information if (RunSLPAfterLoopVectorization) if (SLPVectorize) @@ -524,6 +608,9 @@ void PassManagerBuilder::addLateLTOOptimizationPasses( // Delete basic blocks, which optimization passes may have killed. PM.add(createCFGSimplificationPass()); + // Drop bodies of available externally objects to improve GlobalDCE. + PM.add(createEliminateAvailableExternallyPass()); + // Now that we have optimized the program, discard unreachable functions. PM.add(createGlobalDCEPass()); @@ -543,6 +630,10 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (OptLevel > 1) addLTOOptimizationPasses(PM); + // Create a function that performs CFI checks for cross-DSO calls with targets + // in the current module. + PM.add(createCrossDSOCFIPass()); + // Lower bit sets to globals. This pass supports Clang's control flow // integrity mechanisms (-fsanitize=cfi*) and needs to run at link time if CFI // is enabled. The pass does nothing if CFI is disabled. diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index b2f1010..3af4afb 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -21,7 +21,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -153,21 +153,16 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { // If the SCC doesn't unwind or doesn't throw, note this fact. 
if (!SCCMightUnwind || !SCCMightReturn) for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { - AttrBuilder NewAttributes; - - if (!SCCMightUnwind) - NewAttributes.addAttribute(Attribute::NoUnwind); - if (!SCCMightReturn) - NewAttributes.addAttribute(Attribute::NoReturn); - Function *F = (*I)->getFunction(); - const AttributeSet &PAL = F->getAttributes().getFnAttributes(); - const AttributeSet &NPAL = AttributeSet::get( - F->getContext(), AttributeSet::FunctionIndex, NewAttributes); - if (PAL != NPAL) { + if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) { + F->addFnAttr(Attribute::NoUnwind); + MadeChange = true; + } + + if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) { + F->addFnAttr(Attribute::NoReturn); MadeChange = true; - F->addAttributes(AttributeSet::FunctionIndex, NPAL); } } @@ -191,9 +186,13 @@ bool PruneEH::SimplifyFunction(Function *F) { for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + // Insert a call instruction before the invoke. - CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); + CallInst *Call = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); Call->takeName(II); Call->setCallingConv(II->getCallingConv()); Call->setAttributes(II->getAttributes()); @@ -233,7 +232,7 @@ bool PruneEH::SimplifyFunction(Function *F) { // Remove the uncond branch and add an unreachable. BB->getInstList().pop_back(); - new UnreachableInst(BB->getContext(), BB); + new UnreachableInst(BB->getContext(), &*BB); DeleteBasicBlock(New); // Delete the new BB. 
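The invoke-to-call conversion in the hunk above follows a standard lowering pattern; below it is shown in isolation together with the control-flow cleanup the pass performs around it. This is a sketch assuming LLVM 3.8-era APIs (the calls mirror those visible in the hunk), not a drop-in replacement for the pass code:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"

// Replace a nounwind invoke with a plain call, preserving arguments,
// operand bundles, name, calling convention, and attributes.
static void lowerInvokeToCall(llvm::InvokeInst *II) {
  using namespace llvm;
  SmallVector<Value *, 8> Args(II->arg_begin(), II->arg_end());
  SmallVector<OperandBundleDef, 1> OpBundles;
  II->getOperandBundlesAsDefs(OpBundles);

  CallInst *Call =
      CallInst::Create(II->getCalledValue(), Args, OpBundles, "", II);
  Call->takeName(II);
  Call->setCallingConv(II->getCallingConv());
  Call->setAttributes(II->getAttributes());
  Call->setDebugLoc(II->getDebugLoc());

  // The call cannot unwind, so the normal destination is the only successor:
  // branch there unconditionally, detach the unwind edge, drop the invoke.
  II->replaceAllUsesWith(Call);
  BranchInst::Create(II->getNormalDest(), II);
  II->getUnwindDest()->removePredecessor(II->getParent());
  II->eraseFromParent();
}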
MadeChange = true; diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp index c8dfa54..928d92e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -22,7 +22,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -44,7 +43,11 @@ #include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Cloning.h" #include <cctype> using namespace llvm; @@ -61,27 +64,51 @@ static cl::opt<unsigned> SampleProfileMaxPropagateIterations( "sample-profile-max-propagate-iterations", cl::init(100), cl::desc("Maximum number of iterations to go through when propagating " "sample block/edge weights through the CFG.")); +static cl::opt<unsigned> SampleProfileRecordCoverage( + "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"), + cl::desc("Emit a warning if less than N% of records in the input profile " + "are matched to the IR.")); +static cl::opt<unsigned> SampleProfileSampleCoverage( + "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"), + cl::desc("Emit a warning if less than N% of samples in the input profile " + "are matched to the IR.")); +static cl::opt<double> SampleProfileHotThreshold( + "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"), + cl::desc("Inlined functions that account for more than N% of all samples " + "collected in the parent function will be inlined again.")); +static cl::opt<double> SampleProfileGlobalHotThreshold( + "sample-profile-global-hot-threshold", cl::init(30), cl::value_desc("N"), + cl::desc("Top-level functions that account for more than N% of all samples " + "collected in the profile will be marked as hot for the inliner " + "to consider.")); +static cl::opt<double> SampleProfileGlobalColdThreshold( + "sample-profile-global-cold-threshold", cl::init(0.5), cl::value_desc("N"), + cl::desc("Top-level functions that account for less than N% of all samples " + "collected in the profile will be marked as cold for the inliner " + "to consider.")); namespace { -typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; -typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; -typedef std::pair<BasicBlock *, BasicBlock *> Edge; -typedef DenseMap<Edge, unsigned> EdgeWeightMap; -typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; +typedef DenseMap<const BasicBlock *, uint64_t> BlockWeightMap; +typedef DenseMap<const BasicBlock *, const BasicBlock *> EquivalenceClassMap; +typedef std::pair<const BasicBlock *, const BasicBlock *> Edge; +typedef DenseMap<Edge, uint64_t> EdgeWeightMap; +typedef DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>> + BlockEdgeMap; /// \brief Sample profile pass. /// /// This pass reads profile data from the file specified by /// -sample-profile-file and annotates every affected function with the /// profile information found in that file. 
-class SampleProfileLoader : public FunctionPass { +class SampleProfileLoader : public ModulePass { public: // Class identification, replacement for typeinfo static char ID; SampleProfileLoader(StringRef Name = SampleProfileFile) - : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr), - Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) { + : ModulePass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Reader(), + Samples(nullptr), Filename(Name), ProfileIsValid(false), + TotalCollectedSamples(0) { initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); } @@ -91,36 +118,37 @@ public: const char *getPassName() const override { return "Sample profile pass"; } - bool runOnFunction(Function &F) override; + bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTree>(); } protected: + bool runOnFunction(Function &F); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); - unsigned getInstWeight(Instruction &I); - unsigned getBlockWeight(BasicBlock *BB); + ErrorOr<uint64_t> getInstWeight(const Instruction &I) const; + ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB) const; + const FunctionSamples *findCalleeFunctionSamples(const CallInst &I) const; + const FunctionSamples *findFunctionSamples(const Instruction &I) const; + bool inlineHotFunctions(Function &F); + bool emitInlineHints(Function &F); void printEdgeWeight(raw_ostream &OS, Edge E); - void printBlockWeight(raw_ostream &OS, BasicBlock *BB); - void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); + void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; + void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); bool computeBlockWeights(Function &F); void findEquivalenceClasses(Function &F); void findEquivalencesFor(BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, DominatorTreeBase<BasicBlock> *DomTree); void propagateWeights(Function &F); - unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); + uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); bool propagateThroughEdges(Function &F); - - /// \brief Line number for the function header. Used to compute absolute - /// line numbers from the relative line numbers found in the profile. - unsigned HeaderLineno; + void computeDominanceAndLoopInfo(Function &F); + unsigned getOffset(unsigned L, unsigned H) const; + void clearFunctionData(); /// \brief Map basic blocks to their computed weights. /// @@ -135,7 +163,7 @@ protected: EdgeWeightMap EdgeWeights; /// \brief Set of visited blocks during propagation. - SmallPtrSet<BasicBlock *, 128> VisitedBlocks; + SmallPtrSet<const BasicBlock *, 128> VisitedBlocks; /// \brief Set of visited edges during propagation. SmallSet<Edge, 128> VisitedEdges; @@ -149,9 +177,9 @@ protected: EquivalenceClassMap EquivalenceClass; /// \brief Dominance, post-dominance and loop information. - DominatorTree *DT; - PostDominatorTree *PDT; - LoopInfo *LI; + std::unique_ptr<DominatorTree> DT; + std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT; + std::unique_ptr<LoopInfo> LI; /// \brief Predecessors for each basic block in the CFG. BlockEdgeMap Predecessors; @@ -159,9 +187,6 @@ protected: /// \brief Successors for each basic block in the CFG. 
BlockEdgeMap Successors; - /// \brief LLVM context holding the debug data we need. - LLVMContext *Ctx; - /// \brief Profile reader object. std::unique_ptr<SampleProfileReader> Reader; @@ -173,7 +198,207 @@ protected: /// \brief Flag indicating whether the profile input loaded successfully. bool ProfileIsValid; + + /// \brief Total number of samples collected in this profile. + /// + /// This is the sum of all the samples collected in all the functions executed + /// at runtime. + uint64_t TotalCollectedSamples; }; + +class SampleCoverageTracker { +public: + SampleCoverageTracker() : SampleCoverage(), TotalUsedSamples(0) {} + + bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, + uint32_t Discriminator, uint64_t Samples); + unsigned computeCoverage(unsigned Used, unsigned Total) const; + unsigned countUsedRecords(const FunctionSamples *FS) const; + unsigned countBodyRecords(const FunctionSamples *FS) const; + uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } + uint64_t countBodySamples(const FunctionSamples *FS) const; + void clear() { + SampleCoverage.clear(); + TotalUsedSamples = 0; + } + +private: + typedef std::map<LineLocation, unsigned> BodySampleCoverageMap; + typedef DenseMap<const FunctionSamples *, BodySampleCoverageMap> + FunctionSamplesCoverageMap; + + /// Coverage map for sampling records. + /// + /// This map keeps a record of sampling records that have been matched to + /// an IR instruction. This is used to detect some form of staleness in + /// profiles (see flag -sample-profile-check-coverage). + /// + /// Each entry in the map corresponds to a FunctionSamples instance. This is + /// another map that counts how many times the sample record at the + /// given location has been used. + FunctionSamplesCoverageMap SampleCoverage; + + /// Number of samples used from the profile. + /// + /// When a sampling record is used for the first time, the samples from + /// that record are added to this accumulator. Coverage is later computed + /// based on the total number of samples available in this function and + /// its callsites. + /// + /// Note that this accumulator tracks samples used from a single function + /// and all the inlined callsites. Strictly, we should have a map of counters + /// keyed by FunctionSamples pointers, but these stats are cleared after + /// every function, so we just need to keep a single counter. + uint64_t TotalUsedSamples; +}; + +SampleCoverageTracker CoverageTracker; + +/// Return true if the given callsite is hot with respect to its caller. +/// +/// Functions that were inlined in the original binary will be represented +/// in the inline stack in the sample profile. If the profile shows that +/// the original inline decision was "good" (i.e., the callsite is executed +/// frequently), then we will recreate the inline decision and apply the +/// profile from the inlined callsite. +/// +/// To decide whether an inlined callsite is hot, we compute the fraction +/// of samples used by the callsite with respect to the total number of samples +/// collected in the caller. +/// +/// If that fraction is larger than the default given by +/// SampleProfileHotThreshold, the callsite will be inlined again. +bool callsiteIsHot(const FunctionSamples *CallerFS, + const FunctionSamples *CallsiteFS) { + if (!CallsiteFS) + return false; // The callsite was not inlined in the original binary. + + uint64_t ParentTotalSamples = CallerFS->getTotalSamples(); + if (ParentTotalSamples == 0) + return false; // Avoid division by zero. 
+ + uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); + if (CallsiteTotalSamples == 0) + return false; // Callsite is trivially cold. + + double PercentSamples = + (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0; + return PercentSamples >= SampleProfileHotThreshold; +} + +} + +/// Mark as used the sample record for the given function samples at +/// (LineOffset, Discriminator). +/// +/// \returns true if this is the first time we mark the given record. +bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS, + uint32_t LineOffset, + uint32_t Discriminator, + uint64_t Samples) { + LineLocation Loc(LineOffset, Discriminator); + unsigned &Count = SampleCoverage[FS][Loc]; + bool FirstTime = (++Count == 1); + if (FirstTime) + TotalUsedSamples += Samples; + return FirstTime; +} + +/// Return the number of sample records that were applied from this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { + auto I = SampleCoverage.find(FS); + + // The size of the coverage map for FS represents the number of records + // that were marked used at least once. + unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0; + + // If there are inlined callsites in this function, count the records found + // in the respective bodies. However, do not bother counting callees with 0 + // total samples; these are callees that were never invoked at runtime. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countUsedRecords(CalleeSamples); + } + + return Count; +} + +/// Return the number of sample records in the body of this profile. +/// +/// This count does not include records from cold inlined callsites. +unsigned +SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { + unsigned Count = FS->getBodySamples().size(); + + // Only count records in hot callsites. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countBodyRecords(CalleeSamples); + } + + return Count; } + +/// Return the number of samples collected in the body of this profile. +/// +/// This count does not include samples from cold inlined callsites. +uint64_t +SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { + uint64_t Total = 0; + for (const auto &I : FS->getBodySamples()) + Total += I.second.getSamples(); + + // Only count samples in hot callsites. + for (const auto &I : FS->getCallsiteSamples()) { + const FunctionSamples *CalleeSamples = &I.second; + if (callsiteIsHot(FS, CalleeSamples)) + Total += countBodySamples(CalleeSamples); + } + + return Total; } + +/// Return the fraction of sample records used in this profile. +/// +/// The returned value is an unsigned integer in the range 0-100 indicating +/// the percentage of sample records that were used while applying this +/// profile to the associated function. +unsigned SampleCoverageTracker::computeCoverage(unsigned Used, + unsigned Total) const { + assert(Used <= Total && + "number of used records cannot exceed the total number of records"); + return Total > 0 ? Used * 100 / Total : 100; +} + +/// Clear all the per-function data used to load samples and propagate weights. 
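The hot-callsite test in callsiteIsHot above reduces to simple percentage arithmetic. A minimal sketch with hypothetical sample counts; the helper and the numbers are illustrative only, and the 0.1 default comes from the -sample-profile-inline-hot-threshold option introduced earlier:

#include <cstdint>

// True when the callsite's share of its caller's samples meets the
// threshold, expressed as a percentage.
static bool isHotFraction(uint64_t CallsiteSamples, uint64_t CallerSamples,
                          double ThresholdPercent) {
  if (CallerSamples == 0 || CallsiteSamples == 0)
    return false; // mirrors the zero checks in callsiteIsHot
  double Percent = (double)CallsiteSamples / (double)CallerSamples * 100.0;
  return Percent >= ThresholdPercent;
}

// Example: 50 of 40,000 caller samples is 0.125% >= 0.1%, so the callsite
// is hot and its original inlining is recreated; 20 samples (0.05%) stay cold.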
+void SampleProfileLoader::clearFunctionData() { + BlockWeights.clear(); + EdgeWeights.clear(); + VisitedBlocks.clear(); + VisitedEdges.clear(); + EquivalenceClass.clear(); + DT = nullptr; + PDT = nullptr; + LI = nullptr; + Predecessors.clear(); + Successors.clear(); + CoverageTracker.clear(); +} + +/// \brief Returns the offset of lineno \p L to head_lineno \p H +/// +/// \param L Lineno +/// \param H Header lineno of the function +/// +/// \returns offset to the header lineno. 16 bits are used to represent offset. +/// We assume that a single function will not exceed 65535 LOC. +unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const { + return (L - H) & 0xffff; } /// \brief Print the weight of edge \p E on stream \p OS. @@ -190,8 +415,8 @@ void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { /// \param OS Stream to emit the output to. /// \param BB Block to print. void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, - BasicBlock *BB) { - BasicBlock *Equiv = EquivalenceClass[BB]; + const BasicBlock *BB) { + const BasicBlock *Equiv = EquivalenceClass[BB]; OS << "equivalence[" << BB->getName() << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; } @@ -200,8 +425,11 @@ void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, /// /// \param OS Stream to emit the output to. /// \param BB Block to print. -void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { - OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; +void SampleProfileLoader::printBlockWeight(raw_ostream &OS, + const BasicBlock *BB) const { + const auto &I = BlockWeights.find(BB); + uint64_t W = (I == BlockWeights.end() ? 0 : I->second); + OS << "weight[" << BB->getName() << "]: " << W << "\n"; } /// \brief Get the weight for an instruction. @@ -214,51 +442,67 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { /// /// \param Inst Instruction to query. /// -/// \returns The profiled weight of I. -unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { +/// \returns the weight of \p Inst. +ErrorOr<uint64_t> +SampleProfileLoader::getInstWeight(const Instruction &Inst) const { DebugLoc DLoc = Inst.getDebugLoc(); if (!DLoc) - return 0; + return std::error_code(); - unsigned Lineno = DLoc.getLine(); - if (Lineno < HeaderLineno) - return 0; + const FunctionSamples *FS = findFunctionSamples(Inst); + if (!FS) + return std::error_code(); const DILocation *DIL = DLoc; - int LOffset = Lineno - HeaderLineno; - unsigned Discriminator = DIL->getDiscriminator(); - unsigned Weight = Samples->samplesAt(LOffset, Discriminator); - DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst - << " (line offset: " << LOffset << "." << Discriminator - << " - weight: " << Weight << ")\n"); - return Weight; + unsigned Lineno = DLoc.getLine(); + unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine(); + + uint32_t LineOffset = getOffset(Lineno, HeaderLineno); + uint32_t Discriminator = DIL->getDiscriminator(); + ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); + if (R) { + bool FirstMark = + CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); + if (FirstMark) { + const Function *F = Inst.getParent()->getParent(); + LLVMContext &Ctx = F->getContext(); + emitOptimizationRemark( + Ctx, DEBUG_TYPE, *F, DLoc, + Twine("Applied ") + Twine(*R) + " samples from profile (offset: " + + Twine(LineOffset) + + ((Discriminator) ? 
Twine(".") + Twine(Discriminator) : "") + ")"); + } + DEBUG(dbgs() << " " << Lineno << "." << DIL->getDiscriminator() << ":" + << Inst << " (line offset: " << Lineno - HeaderLineno << "." + << DIL->getDiscriminator() << " - weight: " << R.get() + << ")\n"); + } + return R; } /// \brief Compute the weight of a basic block. /// /// The weight of basic block \p BB is the maximum weight of all the -/// instructions in BB. The weight of \p BB is computed and cached in -/// the BlockWeights map. +/// instructions in BB. /// /// \param BB The basic block to query. /// -/// \returns The computed weight of BB. -unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { - // If we've computed BB's weight before, return it. - std::pair<BlockWeightMap::iterator, bool> Entry = - BlockWeights.insert(std::make_pair(BB, 0)); - if (!Entry.second) - return Entry.first->second; - - // Otherwise, compute and cache BB's weight. - unsigned Weight = 0; +/// \returns the weight for \p BB. +ErrorOr<uint64_t> +SampleProfileLoader::getBlockWeight(const BasicBlock *BB) const { + bool Found = false; + uint64_t Weight = 0; for (auto &I : BB->getInstList()) { - unsigned InstWeight = getInstWeight(I); - if (InstWeight > Weight) - Weight = InstWeight; + const ErrorOr<uint64_t> &R = getInstWeight(I); + if (R && R.get() >= Weight) { + Weight = R.get(); + Found = true; + } } - Entry.first->second = Weight; - return Weight; + if (Found) + return Weight; + else + return std::error_code(); } /// \brief Compute and store the weights of every basic block. @@ -270,15 +514,199 @@ unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { bool SampleProfileLoader::computeBlockWeights(Function &F) { bool Changed = false; DEBUG(dbgs() << "Block weights\n"); - for (auto &BB : F) { - unsigned Weight = getBlockWeight(&BB); - Changed |= (Weight > 0); + for (const auto &BB : F) { + ErrorOr<uint64_t> Weight = getBlockWeight(&BB); + if (Weight) { + BlockWeights[&BB] = Weight.get(); + VisitedBlocks.insert(&BB); + Changed = true; + } DEBUG(printBlockWeight(dbgs(), &BB)); } return Changed; } +/// \brief Get the FunctionSamples for a call instruction. +/// +/// The FunctionSamples of a call instruction \p Inst is the inlined +/// instance in which that call instruction is calling to. It contains +/// all samples that resides in the inlined instance. We first find the +/// inlined instance in which the call instruction is from, then we +/// traverse its children to find the callsite with the matching +/// location and callee function name. +/// +/// \param Inst Call instruction to query. +/// +/// \returns The FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findCalleeFunctionSamples(const CallInst &Inst) const { + const DILocation *DIL = Inst.getDebugLoc(); + if (!DIL) { + return nullptr; + } + DISubprogram *SP = DIL->getScope()->getSubprogram(); + if (!SP) + return nullptr; + + Function *CalleeFunc = Inst.getCalledFunction(); + if (!CalleeFunc) { + return nullptr; + } + + StringRef CalleeName = CalleeFunc->getName(); + const FunctionSamples *FS = findFunctionSamples(Inst); + if (FS == nullptr) + return nullptr; + + return FS->findFunctionSamplesAt( + CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), + DIL->getDiscriminator(), CalleeName)); +} + +/// \brief Get the FunctionSamples for an instruction. +/// +/// The FunctionSamples of an instruction \p Inst is the inlined instance +/// in which that instruction is coming from. 
We traverse the inline stack +/// of that instruction, and match it with the tree nodes in the profile. +/// +/// \param Inst Instruction to query. +/// +/// \returns the FunctionSamples pointer to the inlined instance. +const FunctionSamples * +SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { + SmallVector<CallsiteLocation, 10> S; + const DILocation *DIL = Inst.getDebugLoc(); + if (!DIL) { + return Samples; + } + StringRef CalleeName; + for (const DILocation *DIL = Inst.getDebugLoc(); DIL; + DIL = DIL->getInlinedAt()) { + DISubprogram *SP = DIL->getScope()->getSubprogram(); + if (!SP) + return nullptr; + if (!CalleeName.empty()) { + S.push_back(CallsiteLocation(getOffset(DIL->getLine(), SP->getLine()), + DIL->getDiscriminator(), CalleeName)); + } + CalleeName = SP->getLinkageName(); + } + if (S.size() == 0) + return Samples; + const FunctionSamples *FS = Samples; + for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { + FS = FS->findFunctionSamplesAt(S[i]); + } + return FS; +} + +/// \brief Emit an inline hint if \p F is globally hot or cold. +/// +/// If \p F consumes a significant fraction of samples (indicated by +/// SampleProfileGlobalHotThreshold), apply the InlineHint attribute for the +/// inliner to consider the function hot. +/// +/// If \p F consumes a small fraction of samples (indicated by +/// SampleProfileGlobalColdThreshold), apply the Cold attribute for the inliner +/// to consider the function cold. +/// +/// FIXME - This setting of inline hints is sub-optimal. Instead of marking a +/// function globally hot or cold, we should be annotating individual callsites. +/// This is not currently possible, but work on the inliner will eventually +/// provide this ability. See http://reviews.llvm.org/D15003 for details and +/// discussion. +/// +/// \returns True if either attribute was applied to \p F. +bool SampleProfileLoader::emitInlineHints(Function &F) { + if (TotalCollectedSamples == 0) + return false; + + uint64_t FunctionSamples = Samples->getTotalSamples(); + double SamplesPercent = + (double)FunctionSamples / (double)TotalCollectedSamples * 100.0; + + // If the function collected more samples than the hot threshold, mark + // it globally hot. + if (SamplesPercent >= SampleProfileGlobalHotThreshold) { + F.addFnAttr(llvm::Attribute::InlineHint); + std::string Msg; + raw_string_ostream S(Msg); + S << "Applied inline hint to globally hot function '" << F.getName() + << "' with " << format("%.2f", SamplesPercent) + << "% of samples (threshold: " + << format("%.2f", SampleProfileGlobalHotThreshold.getValue()) << "%)"; + S.flush(); + emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); + return true; + } + + // If the function collected fewer samples than the cold threshold, mark + // it globally cold. + if (SamplesPercent <= SampleProfileGlobalColdThreshold) { + F.addFnAttr(llvm::Attribute::Cold); + std::string Msg; + raw_string_ostream S(Msg); + S << "Applied cold hint to globally cold function '" << F.getName() + << "' with " << format("%.2f", SamplesPercent) + << "% of samples (threshold: " + << format("%.2f", SampleProfileGlobalColdThreshold.getValue()) << "%)"; + S.flush(); + emitOptimizationRemark(F.getContext(), DEBUG_TYPE, F, DebugLoc(), Msg); + return true; + } + + return false; +} + +/// \brief Iteratively inline hot callsites of a function. +/// +/// Iteratively traverse all callsites of the function \p F, and find if +/// the corresponding inlined instance exists and is hot in profile. 
If +/// it is hot enough, inline the callsites and add new callsites of the +/// callee into the caller. +/// +/// TODO: investigate the possibility of not invoking InlineFunction directly. +/// +/// \param F function to perform iterative inlining. +/// +/// \returns True if any inlining happened. +bool SampleProfileLoader::inlineHotFunctions(Function &F) { + bool Changed = false; + LLVMContext &Ctx = F.getContext(); + while (true) { + bool LocalChanged = false; + SmallVector<CallInst *, 10> CIS; + for (auto &BB : F) { + for (auto &I : BB.getInstList()) { + CallInst *CI = dyn_cast<CallInst>(&I); + if (CI && callsiteIsHot(Samples, findCalleeFunctionSamples(*CI))) + CIS.push_back(CI); + } + } + for (auto CI : CIS) { + InlineFunctionInfo IFI; + Function *CalledFunction = CI->getCalledFunction(); + DebugLoc DLoc = CI->getDebugLoc(); + uint64_t NumSamples = findCalleeFunctionSamples(*CI)->getTotalSamples(); + if (InlineFunction(CI, IFI)) { + LocalChanged = true; + emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc, + Twine("inlined hot callee '") + + CalledFunction->getName() + "' with " + + Twine(NumSamples) + " samples into '" + + F.getName() + "'"); + } + } + if (LocalChanged) { + Changed = true; + } else { + break; + } + } + return Changed; +} + /// \brief Find equivalence classes for the given block. /// /// This finds all the blocks that are guaranteed to execute the same @@ -305,12 +733,13 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) { void SampleProfileLoader::findEquivalencesFor( BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, DominatorTreeBase<BasicBlock> *DomTree) { - for (auto *BB2 : Descendants) { + const BasicBlock *EC = EquivalenceClass[BB1]; + uint64_t Weight = BlockWeights[EC]; + for (const auto *BB2 : Descendants) { bool IsDomParent = DomTree->dominates(BB2, BB1); bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); - if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent && - IsInSameLoop) { - EquivalenceClass[BB2] = BB1; + if (BB1 != BB2 && IsDomParent && IsInSameLoop) { + EquivalenceClass[BB2] = EC; // If BB2 is heavier than BB1, make BB2 have the same weight // as BB1. @@ -320,11 +749,10 @@ void SampleProfileLoader::findEquivalencesFor( // during the propagation phase. Right now, we just want to // make sure that BB1 has the largest weight of all the // members of its equivalence set. - unsigned &BB1Weight = BlockWeights[BB1]; - unsigned &BB2Weight = BlockWeights[BB2]; - BB1Weight = std::max(BB1Weight, BB2Weight); + Weight = std::max(Weight, BlockWeights[BB2]); } } + BlockWeights[EC] = Weight; } /// \brief Find equivalence classes. @@ -364,19 +792,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) { // class by making BB2's equivalence class be BB1. DominatedBBs.clear(); DT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, PDT->DT); - - // Repeat the same logic for all the blocks post-dominated by BB1. - // We are looking for every basic block BB2 such that: - // - // 1- BB1 post-dominates BB2. - // 2- BB2 dominates BB1. - // 3- BB1 and BB2 are in the same loop nest. - // - // If all those conditions hold, BB2's equivalence class is BB1. - DominatedBBs.clear(); - PDT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, DT); + findEquivalencesFor(BB1, DominatedBBs, PDT.get()); DEBUG(printBlockEquivalence(dbgs(), BB1)); } @@ -389,8 +805,8 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) { // to all the blocks in that equivalence class. 
DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); for (auto &BI : F) { - BasicBlock *BB = &BI; - BasicBlock *EquivBB = EquivalenceClass[BB]; + const BasicBlock *BB = &BI; + const BasicBlock *EquivBB = EquivalenceClass[BB]; if (BB != EquivBB) BlockWeights[BB] = BlockWeights[EquivBB]; DEBUG(printBlockWeight(dbgs(), BB)); @@ -407,7 +823,7 @@ void SampleProfileLoader::findEquivalenceClasses(Function &F) { /// \param UnknownEdge Set if E has not been visited before. /// /// \returns E's weight, if known. Otherwise, return 0. -unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, +uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge) { if (!VisitedEdges.count(E)) { (*NumUnknownEdges)++; @@ -432,8 +848,9 @@ unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, bool SampleProfileLoader::propagateThroughEdges(Function &F) { bool Changed = false; DEBUG(dbgs() << "\nPropagation through edges\n"); - for (auto &BI : F) { - BasicBlock *BB = &BI; + for (const auto &BI : F) { + const BasicBlock *BB = &BI; + const BasicBlock *EC = EquivalenceClass[BB]; // Visit all the predecessor and successor edges to determine // which ones have a weight assigned already. Note that it doesn't @@ -441,7 +858,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { // only case we are interested in handling is when only a single // edge is unknown (see setEdgeOrBlockWeight). for (unsigned i = 0; i < 2; i++) { - unsigned TotalWeight = 0; + uint64_t TotalWeight = 0; unsigned NumUnknownEdges = 0; Edge UnknownEdge, SelfReferentialEdge; @@ -485,7 +902,7 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { // all edges will get a weight, or iteration will stop when // it reaches SampleProfileMaxPropagateIterations. if (NumUnknownEdges <= 1) { - unsigned &BBWeight = BlockWeights[BB]; + uint64_t &BBWeight = BlockWeights[EC]; if (NumUnknownEdges == 0) { // If we already know the weight of all edges, the weight of the // basic block can be computed. It should be no larger than the sum @@ -497,9 +914,9 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { << " known. Set weight for block: "; printBlockWeight(dbgs(), BB);); } - if (VisitedBlocks.insert(BB).second) + if (VisitedBlocks.insert(EC).second) Changed = true; - } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { + } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) { // If there is a single unknown edge and the block has been // visited, then we can compute E's weight. if (BBWeight >= TotalWeight) @@ -511,8 +928,8 @@ bool SampleProfileLoader::propagateThroughEdges(Function &F) { DEBUG(dbgs() << "Set weight for edge: "; printEdgeWeight(dbgs(), UnknownEdge)); } - } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) { - unsigned &BBWeight = BlockWeights[BB]; + } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) { + uint64_t &BBWeight = BlockWeights[BB]; // We have a self-referential edge and the weight of BB is known. if (BBWeight >= TotalWeight) EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; @@ -578,7 +995,7 @@ void SampleProfileLoader::buildEdges(Function &F) { /// known). void SampleProfileLoader::propagateWeights(Function &F) { bool Changed = true; - unsigned i = 0; + unsigned I = 0; // Add an entry count to the function using the samples gathered // at the function entry. 
@@ -592,14 +1009,15 @@ void SampleProfileLoader::propagateWeights(Function &F) { buildEdges(F); // Propagate until we converge or we go past the iteration limit. - while (Changed && i++ < SampleProfileMaxPropagateIterations) { + while (Changed && I++ < SampleProfileMaxPropagateIterations) { Changed = propagateThroughEdges(F); } // Generate MD_prof metadata for every branch instruction using the // edge weights computed during propagation. DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); - MDBuilder MDB(F.getContext()); + LLVMContext &Ctx = F.getContext(); + MDBuilder MDB(Ctx); for (auto &BI : F) { BasicBlock *BB = &BI; TerminatorInst *TI = BB->getTerminator(); @@ -610,24 +1028,44 @@ void SampleProfileLoader::propagateWeights(Function &F) { DEBUG(dbgs() << "\nGetting weights for branch at line " << TI->getDebugLoc().getLine() << ".\n"); - SmallVector<unsigned, 4> Weights; - bool AllWeightsZero = true; + SmallVector<uint32_t, 4> Weights; + uint32_t MaxWeight = 0; + DebugLoc MaxDestLoc; for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { BasicBlock *Succ = TI->getSuccessor(I); Edge E = std::make_pair(BB, Succ); - unsigned Weight = EdgeWeights[E]; + uint64_t Weight = EdgeWeights[E]; DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); - Weights.push_back(Weight); - if (Weight != 0) - AllWeightsZero = false; + // Use uint32_t saturated arithmetic to adjust the incoming weights, + // if needed. Sample counts in profiles are 64-bit unsigned values, + // but internally branch weights are expressed as 32-bit values. + if (Weight > std::numeric_limits<uint32_t>::max()) { + DEBUG(dbgs() << " (saturated due to uint32_t overflow)"); + Weight = std::numeric_limits<uint32_t>::max(); + } + Weights.push_back(static_cast<uint32_t>(Weight)); + if (Weight != 0) { + if (Weight > MaxWeight) { + MaxWeight = Weight; + MaxDestLoc = Succ->getFirstNonPHIOrDbgOrLifetime()->getDebugLoc(); + } + } } // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. - if (!AllWeightsZero) { + if (MaxWeight > 0) { DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + DebugLoc BranchLoc = TI->getDebugLoc(); + emitOptimizationRemark( + Ctx, DEBUG_TYPE, F, MaxDestLoc, + Twine("most popular destination for conditional branches at ") + + ((BranchLoc) ? Twine(BranchLoc->getFilename() + ":" + + Twine(BranchLoc.getLine()) + ":" + + Twine(BranchLoc.getCol())) + : Twine("<UNKNOWN LOCATION>"))); } else { DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); } @@ -649,7 +1087,7 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { if (DISubprogram *S = getDISubprogram(&F)) return S->getLine(); - // If could not find the start of \p F, emit a diagnostic to inform the user + // If the start of \p F is missing, emit a diagnostic to inform the user // about the missed opportunity. F.getContext().diagnose(DiagnosticInfoSampleProfile( "No debug information found in function " + F.getName() + @@ -658,6 +1096,17 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { return 0; } +void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { + DT.reset(new DominatorTree); + DT->recalculate(F); + + PDT.reset(new DominatorTreeBase<BasicBlock>(true)); + PDT->recalculate(F); + + LI.reset(new LoopInfo); + LI->analyze(*DT); +} + /// \brief Generate branch weight metadata for all branches in \p F. 
/// /// Branch weights are computed out of instruction samples using a @@ -710,18 +1159,23 @@ unsigned SampleProfileLoader::getFunctionLoc(Function &F) { bool SampleProfileLoader::emitAnnotations(Function &F) { bool Changed = false; - // Initialize invariants used during computation and propagation. - HeaderLineno = getFunctionLoc(F); - if (HeaderLineno == 0) + if (getFunctionLoc(F) == 0) return false; DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() - << ": " << HeaderLineno << "\n"); + << ": " << getFunctionLoc(F) << "\n"); + + Changed |= emitInlineHints(F); + + Changed |= inlineHotFunctions(F); // Compute basic block weights. Changed |= computeBlockWeights(F); if (Changed) { + // Compute dominance and loop info needed for propagation. + computeDominanceAndLoopInfo(F); + // Find equivalence classes. findEquivalenceClasses(F); @@ -729,24 +1183,48 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { propagateWeights(F); } + // If coverage checking was requested, compute it now. + if (SampleProfileRecordCoverage) { + unsigned Used = CoverageTracker.countUsedRecords(Samples); + unsigned Total = CoverageTracker.countBodyRecords(Samples); + unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); + if (Coverage < SampleProfileRecordCoverage) { + F.getContext().diagnose(DiagnosticInfoSampleProfile( + getDISubprogram(&F)->getFilename(), getFunctionLoc(F), + Twine(Used) + " of " + Twine(Total) + " available profile records (" + + Twine(Coverage) + "%) were applied", + DS_Warning)); + } + } + + if (SampleProfileSampleCoverage) { + uint64_t Used = CoverageTracker.getTotalUsedSamples(); + uint64_t Total = CoverageTracker.countBodySamples(Samples); + unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); + if (Coverage < SampleProfileSampleCoverage) { + F.getContext().diagnose(DiagnosticInfoSampleProfile( + getDISubprogram(&F)->getFilename(), getFunctionLoc(F), + Twine(Used) + " of " + Twine(Total) + " available profile samples (" + + Twine(Coverage) + "%) were applied", + DS_Warning)); + } + } return Changed; } char SampleProfileLoader::ID = 0; INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", "Sample Profile loader", false, false) bool SampleProfileLoader::doInitialization(Module &M) { - auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext()); + auto &Ctx = M.getContext(); + auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); if (std::error_code EC = ReaderOrErr.getError()) { std::string Msg = "Could not open profile: " + EC.message(); - M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); + Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); return false; } Reader = std::move(ReaderOrErr.get()); @@ -754,22 +1232,32 @@ bool SampleProfileLoader::doInitialization(Module &M) { return true; } -FunctionPass *llvm::createSampleProfileLoaderPass() { +ModulePass *llvm::createSampleProfileLoaderPass() { return new SampleProfileLoader(SampleProfileFile); } -FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { +ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { return new SampleProfileLoader(Name); } -bool SampleProfileLoader::runOnFunction(Function &F) { +bool 
SampleProfileLoader::runOnModule(Module &M) { if (!ProfileIsValid) return false; - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - PDT = &getAnalysis<PostDominatorTree>(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - Ctx = &F.getParent()->getContext(); + // Compute the total number of samples collected in this profile. + for (const auto &I : Reader->getProfiles()) + TotalCollectedSamples += I.second.getTotalSamples(); + + bool retval = false; + for (auto &F : M) + if (!F.isDeclaration()) { + clearFunctionData(); + retval |= runOnFunction(F); + } + return retval; +} + +bool SampleProfileLoader::runOnFunction(Function &F) { Samples = Reader->getSamplesFor(F); if (!Samples->empty()) return emitAnnotations(F); diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 956991a..c94cc7c 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -7,47 +7,31 @@ // //===----------------------------------------------------------------------===// // -// This pass loops over all of the functions in the input module, looking for +// This pass loops over all of the functions in the input module, looking for // dead declarations and removes them. Dead declarations are declarations of // functions for which no implementation is available (i.e., declarations for // unused library functions). // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" + using namespace llvm; #define DEBUG_TYPE "strip-dead-prototypes" STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed"); -namespace { - -/// @brief Pass to remove unused function declarations. -class StripDeadPrototypesPass : public ModulePass { -public: - static char ID; // Pass identification, replacement for typeid - StripDeadPrototypesPass() : ModulePass(ID) { - initializeStripDeadPrototypesPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override; -}; - -} // end anonymous namespace - -char StripDeadPrototypesPass::ID = 0; -INITIALIZE_PASS(StripDeadPrototypesPass, "strip-dead-prototypes", - "Strip Unused Function Prototypes", false, false) - -bool StripDeadPrototypesPass::runOnModule(Module &M) { +static bool stripDeadPrototypes(Module &M) { bool MadeChange = false; - + // Erase dead function prototypes. for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { - Function *F = I++; + Function *F = &*I++; // Function must be a prototype and unused. if (F->isDeclaration() && F->use_empty()) { F->eraseFromParent(); @@ -59,16 +43,42 @@ bool StripDeadPrototypesPass::runOnModule(Module &M) { // Erase dead global var prototypes. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ) { - GlobalVariable *GV = I++; + GlobalVariable *GV = &*I++; // Global must be a prototype and unused. if (GV->isDeclaration() && GV->use_empty()) GV->eraseFromParent(); } - + // Return an indication of whether we changed anything or not. 
return MadeChange; } +PreservedAnalyses StripDeadPrototypesPass::run(Module &M) { + if (stripDeadPrototypes(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { + +class StripDeadPrototypesLegacyPass : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + StripDeadPrototypesLegacyPass() : ModulePass(ID) { + initializeStripDeadPrototypesLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + return stripDeadPrototypes(M); + } +}; + +} // end anonymous namespace + +char StripDeadPrototypesLegacyPass::ID = 0; +INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes", + "Strip Unused Function Prototypes", false, false) + ModulePass *llvm::createStripDeadPrototypesPass() { - return new StripDeadPrototypesPass(); + return new StripDeadPrototypesLegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index a4f30c5..46f352f 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -211,13 +211,13 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { - if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) + if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage } for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0) + if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0) if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg")) I->setName(""); // Internal symbols can't participate in linkage StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo); @@ -305,6 +305,12 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { SmallVector<Metadata *, 64> LiveSubprograms; DenseSet<const MDNode *> VisitedSet; + std::set<DISubprogram *> LiveSPs; + for (Function &F : M) { + if (DISubprogram *SP = F.getSubprogram()) + LiveSPs.insert(SP); + } + for (DICompileUnit *DIC : F.compile_units()) { // Create our live subprogram list. bool SubprogramChange = false; @@ -314,7 +320,7 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { continue; // If the function referenced by DISP is not null, the function is live. 
- if (DISP->getFunction()) + if (LiveSPs.count(DISP)) LiveSubprograms.push_back(DISP); else SubprogramChange = true; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 2d2c109f..6f49399 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1,4 +1,4 @@ -//===- InstCombineAddSub.cpp ----------------------------------------------===// +//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/PatternMatch.h" + using namespace llvm; using namespace PatternMatch; @@ -67,17 +68,17 @@ namespace { private: bool insaneIntVal(int V) { return V > 4 || V < -4; } - APFloat *getFpValPtr(void) + APFloat *getFpValPtr() { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); } - const APFloat *getFpValPtr(void) const + const APFloat *getFpValPtr() const { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); } - const APFloat &getFpVal(void) const { + const APFloat &getFpVal() const { assert(IsFp && BufHasFpVal && "Incorrect state"); return *getFpValPtr(); } - APFloat &getFpVal(void) { + APFloat &getFpVal() { assert(IsFp && BufHasFpVal && "Incorrect state"); return *getFpValPtr(); } @@ -92,8 +93,8 @@ namespace { // TODO: We should get rid of this function when APFloat can be constructed // from a *SIGNED* integer. APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val); - private: + private: bool IsFp; // True iff FpValBuf contains an instance of APFloat. @@ -114,10 +115,10 @@ namespace { /// class FAddend { public: - FAddend() { Val = nullptr; } + FAddend() : Val(nullptr) {} - Value *getSymVal (void) const { return Val; } - const FAddendCoef &getCoef(void) const { return Coeff; } + Value *getSymVal() const { return Val; } + const FAddendCoef &getCoef() const { return Coeff; } bool isConstant() const { return Val == nullptr; } bool isZero() const { return Coeff.isZero(); } @@ -182,7 +183,6 @@ namespace { InstCombiner::BuilderTy *Builder; Instruction *Instr; - private: // Debugging stuff is clustered here. #ifndef NDEBUG unsigned CreateInstrNum; @@ -193,7 +193,8 @@ namespace { void incCreateInstNum() {} #endif }; -} + +} // anonymous namespace //===----------------------------------------------------------------------===// // @@ -602,7 +603,6 @@ Value *FAddCombine::simplify(Instruction *I) { } Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { - unsigned AddendNum = Addends.size(); assert(AddendNum <= 4 && "Too many addends"); @@ -886,7 +886,7 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero, return Op0ZeroPosition >= Op1OnePosition; } -/// WillNotOverflowSignedAdd - Return true if we can prove that: +/// Return true if we can prove that: /// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) /// This basically requires proving that the add in the original type would not /// overflow to change the sign bit or have a carry out.
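The no-signed-overflow condition above can be checked concretely. Below is a minimal standalone C++ sketch (an illustration only, not part of the patch; i8AddOverflows is an invented helper) of why sign extension distributes over addition exactly when the narrow add does not overflow, assuming the usual two's-complement wrap on narrowing:

    #include <cassert>
    #include <cstdint>

    // Returns true when the i8 add of L and R overflows, i.e. when the
    // wide i16 sum does not survive a round trip through i8.
    static bool i8AddOverflows(int8_t L, int8_t R) {
      int16_t Wide = int16_t(L) + int16_t(R);
      return Wide != int16_t(int8_t(Wide));
    }

    int main() {
      // 100 + 27 = 127 fits in i8: sext(add i8) == add(sext, sext).
      assert(!i8AddOverflows(100, 27));
      assert(int16_t(int8_t(100 + 27)) == int16_t(100) + int16_t(27));
      // 100 + 28 = 128 overflows i8: the narrow add wraps to -128, so
      // sext(add i8) gives -128 while add(sext, sext) gives 128.
      assert(i8AddOverflows(100, 28));
    }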
@@ -1118,8 +1118,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C, // transform them into (X + (signbit ^ C)) if (XorRHS->getValue().isSignBit()) - return BinaryOperator::CreateAdd(XorLHS, - ConstantExpr::getXor(XorRHS, CI)); + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); } } @@ -1421,7 +1421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { return Changed ? &I : nullptr; } - /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -1589,7 +1588,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } } - { Value *Y; // X-(X+Y) == -Y X-(Y+X) == -Y @@ -1611,32 +1609,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(A, B); } - // (sub (select (a, c, b)), (select (a, d, b))) -> (select (a, (sub c, d), 0)) - // (sub (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (sub c, d))) - if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { - if (auto *SI1 = dyn_cast<SelectInst>(Op1)) { - if (SI0->getCondition() == SI1->getCondition()) { - if (Value *V = SimplifySubInst( - SI0->getFalseValue(), SI1->getFalseValue(), I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), - Builder->CreateSub(SI0->getTrueValue(), SI1->getTrueValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap()), - V); - if (Value *V = SimplifySubInst(SI0->getTrueValue(), SI1->getTrueValue(), - I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), V, - Builder->CreateSub(SI0->getFalseValue(), SI1->getFalseValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap())); - } - } - } - if (Op0->hasOneUse()) { Value *Y = nullptr; // ((X | Y) - X) --> (~X & Y) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 15e0889..95c50d3 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -37,9 +37,9 @@ static inline Value *dyn_castNotVal(Value *V) { return nullptr; } -/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp -/// predicate into a three bit mask. It also returns whether it is an ordered -/// predicate by reference. +/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into +/// a three bit mask. It also returns whether it is an ordered predicate by +/// reference. static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { isOrdered = false; switch (CC) { @@ -64,10 +64,10 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { } } -/// getNewICmpValue - This is the complement of getICmpCode, which turns an -/// opcode and two operands into either a constant true or false, or a brand -/// new ICmp instruction. The sign is passed in to determine which kind -/// of predicate to use in the new icmp instruction. +/// This is the complement of getICmpCode, which turns an opcode and two +/// operands into either a constant true or false, or a brand new ICmp +/// instruction. 
The sign is passed in to determine which kind of predicate to +/// use in the new icmp instruction. static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { ICmpInst::Predicate NewPred; @@ -76,9 +76,9 @@ static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, return Builder->CreateICmp(NewPred, LHS, RHS); } -/// getFCmpValue - This is the complement of getFCmpCode, which turns an -/// opcode and two operands into either a FCmp instruction. isordered is passed -/// in to determine which kind of predicate to use in the new fcmp instruction. +/// This is the complement of getFCmpCode, which turns an opcode and two +/// operands into an FCmp instruction. isordered is passed in to determine +/// which kind of predicate to use in the new fcmp instruction. static Value *getFCmpValue(bool isordered, unsigned code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { @@ -150,14 +150,13 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) { else //if (Op == Instruction::Xor) BinOp = Builder->CreateXor(NewLHS, NewRHS); - Module *M = I.getParent()->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); + Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy); return Builder->CreateCall(F, BinOp); } -// OptAndOp - This handles expressions of the form ((val OP C1) & C2). Where -// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is -// guaranteed to be a binary operator. +/// This handles expressions of the form ((val OP C1) & C2), where +/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is +/// guaranteed to be a binary operator. Instruction *InstCombiner::OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, @@ -341,10 +340,10 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, return Builder->CreateICmpUGT(Add, LowerBound); } -// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with -// any number of 0s on either side. The 1s are allowed to wrap from LSB to -// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is -// not, since all 1s are not contiguous. +/// Returns true iff Val consists of one contiguous run of 1s with any number +/// of 0s on either side. The 1s are allowed to wrap from LSB to MSB, +/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is +/// not, since all 1s are not contiguous. static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { const APInt& V = Val->getValue(); uint32_t BitWidth = Val->getType()->getBitWidth(); @@ -357,9 +356,8 @@ static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { return true; } -/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask, -/// where isSub determines whether the operator is a sub. If we can fold one of -/// the following xforms: +/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines +/// whether the operator is a sub. If we can fold one of the following xforms: /// /// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask /// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 @@ -449,8 +447,8 @@ enum MaskedICmpType { FoldMskICmp_BMask_NotMixed = 512 }; -/// return the set of pattern classes (from MaskedICmpType) -/// that (icmp SCC (A & B), C) satisfies +/// Return the set of pattern classes (from MaskedICmpType) +/// that (icmp SCC (A & B), C) satisfies.
static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ICmpInst::Predicate SCC) { @@ -538,8 +536,8 @@ static unsigned conjugateICmpMask(unsigned Mask) { return NewMask; } -/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) -/// if possible. The returned predicate is either == or !=. Returns false if +/// Decompose an icmp into the form ((X & Y) pred Z) if possible. +/// The returned predicate is either == or !=. Returns false if /// decomposition fails. static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, Value *&X, Value *&Y, Value *&Z) { @@ -585,10 +583,9 @@ static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, return true; } -/// foldLogOpOfMaskedICmpsHelper: -/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// return the set of pattern classes (from MaskedICmpType) -/// that both LHS and RHS satisfy +/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// Return the set of pattern classes (from MaskedICmpType) +/// that both LHS and RHS satisfy. static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value*& B, Value*& C, Value*& D, Value*& E, @@ -700,9 +697,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC); return left_type & right_type; } -/// foldLogOpOfMaskedICmps: -/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// into a single (icmp(A & X) ==/!= Y) + +/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy *Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; @@ -879,7 +876,7 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, return Builder->CreateICmp(NewPred, Input, RangeEnd); } -/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. +/// Fold (icmp)&(icmp) if possible. Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1123,9 +1120,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { return nullptr; } -/// FoldAndOfFCmps - Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function. Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { @@ -1203,6 +1199,54 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return nullptr; } +/// Match De Morgan's Laws: +/// (~A & ~B) == (~(A | B)) +/// (~A | ~B) == (~(A & B)) +static Instruction *matchDeMorgansLaws(BinaryOperator &I, + InstCombiner::BuilderTy *Builder) { + auto Opcode = I.getOpcode(); + assert((Opcode == Instruction::And || Opcode == Instruction::Or) && + "Trying to match De Morgan's Laws with something other than and/or"); + // Flip the logic operation. + if (Opcode == Instruction::And) + Opcode = Instruction::Or; + else + Opcode = Instruction::And; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + // TODO: Use pattern matchers instead of dyn_cast. 
+ if (Value *Op0NotVal = dyn_castNotVal(Op0)) + if (Value *Op1NotVal = dyn_castNotVal(Op1)) + if (Op0->hasOneUse() && Op1->hasOneUse()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, + I.getName() + ".demorgan"); + return BinaryOperator::CreateNot(LogicOp); + } + + // De Morgan's Law in disguise: + // (zext(bool A) ^ 1) & (zext(bool B) ^ 1) -> zext(~(A | B)) + // (zext(bool A) ^ 1) | (zext(bool B) ^ 1) -> zext(~(A & B)) + Value *A = nullptr; + Value *B = nullptr; + ConstantInt *C1 = nullptr; + if (match(Op0, m_OneUse(m_Xor(m_ZExt(m_Value(A)), m_ConstantInt(C1)))) && + match(Op1, m_OneUse(m_Xor(m_ZExt(m_Value(B)), m_Specific(C1))))) { + // TODO: This check could be loosened to handle different type sizes. + // Alternatively, we could fix the definition of m_Not to recognize a not + // operation hidden by a zext? + if (A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1) && + C1->isOne()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, A, B, + I.getName() + ".demorgan"); + Value *Not = Builder->CreateNot(LogicOp); + return CastInst::CreateZExtOrBitCast(Not, I.getType()); + } + } + + return nullptr; +} + Instruction *InstCombiner::visitAnd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -1273,6 +1317,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I)) return BinaryOperator::CreateAnd(V, AndRHS); + // -x & 1 -> x & 1 + if (AndRHSMask == 1 && match(Op0LHS, m_Zero())) + return BinaryOperator::CreateAnd(Op0RHS, AndRHS); + // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS // has 1's for all bits that the subtraction with A might affect. if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) { @@ -1329,15 +1377,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return NV; } - - // (~A & ~B) == (~(A | B)) - De Morgan's Law - if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal, - I.getName()+".demorgan"); - return BinaryOperator::CreateNot(Or); - } + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; @@ -1446,14 +1487,15 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return ReplaceInstUsesWith(I, Res); - // fold (and (cast A), (cast B)) -> (cast (and A, B)) - if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { + Value *Op0COp = Op0C->getOperand(0); + Type *SrcTy = Op0COp->getType(); + // fold (and (cast A), (cast B)) -> (cast (and A, B)) if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { - Type *SrcTy = Op0C->getOperand(0)->getType(); if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ? SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { - Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); + Value *Op1COp = Op1C->getOperand(0); // Only do this if the casts both really cause code to be generated. if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && @@ -1478,6 +1520,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } + // If we are masking off the sign bit of a floating-point value, convert + // this to the canonical fabs intrinsic call and cast back to integer. + // The backend should know how to optimize fabs(). 
+ // TODO: This transform should also apply to vectors. + ConstantInt *CI; + if (isa<BitCastInst>(Op0C) && SrcTy->isFloatingPointTy() && + match(Op1, m_ConstantInt(CI)) && CI->isMaxValue(true)) { + Module *M = I.getModule(); + Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, SrcTy); + Value *Call = Builder->CreateCall(Fabs, Op0COp, "fabs"); + return CastInst::CreateBitOrPointerCast(Call, I.getType()); + } + } + { Value *X = nullptr; bool OpsSwapped = false; @@ -1509,163 +1565,195 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return Changed ? &I : nullptr; } -/// CollectBSwapParts - Analyze the specified subexpression and see if it is -/// capable of providing pieces of a bswap. The subexpression provides pieces -/// of a bswap if it is proven that each of the non-zero bytes in the output of -/// the expression came from the corresponding "byte swapped" byte in some other -/// value. For example, if the current subexpression is "(shl i32 %X, 24)" then -/// we know that the expression deposits the low byte of %X into the high byte -/// of the bswap result and that all other bytes are zero. This expression is -/// accepted, the high byte of ByteValues is set to X to indicate a correct -/// match. + +/// Analyze the specified subexpression and see if it is capable of providing +/// pieces of a bswap or bitreverse. The subexpression provides a potential +/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in +/// the output of the expression came from a corresponding bit in some other +/// value. This function is recursive, and the end result is a mapping of +/// (value, bitnumber) to bitnumber. It is the caller's responsibility to +/// validate that all `value`s are identical and that the bitnumber to bitnumber +/// mapping is correct for a bswap or bitreverse. +/// +/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know +/// that the expression deposits the low byte of %X into the high byte of the +/// result and that all other bits are zero. This expression is accepted, +/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7]. /// /// This function returns true if the match was unsuccessful and false otherwise. /// On entry to the function the "OverallLeftShift" is a signed integer value -/// indicating the number of bytes that the subexpression is later shifted. For + indicating the number of bits that the subexpression is later shifted. For /// example, if the expression is later right shifted by 16 bits, the -/// OverallLeftShift value would be -2 on entry. This is used to specify which -/// byte of ByteValues is actually being set. +/// OverallLeftShift value would be -16 on entry. This is used to specify which +/// bits of BitValues are actually being set. /// -/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding -/// byte is masked to zero by a user. For example, in (X & 255), X will be -/// processed with a bytemask of 1. Because bytemask is 32-bits, this limits -/// this function to working on up to 32-byte (256 bit) values. ByteMask is -/// always in the local (OverallLeftShift) coordinate space. +/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding +/// bit is masked to zero by a user. For example, in (X & 255), X will be +/// processed with a bitmask of 255. BitMask is always in the local +/// (OverallLeftShift) coordinate space.
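To make the new bookkeeping concrete, here is a tiny standalone sketch (an illustration only; the 8-bit width and the local array are assumptions, not the patch's actual types) of how a left shift populates the provenance map described above:

    #include <array>
    #include <cassert>

    int main() {
      // Model "X << 4" on an 8-bit value: input bit i lands at output bit
      // i + 4, so the provenance of output bits [4,7] is input bits [0,3].
      constexpr unsigned BW = 8, Shift = 4;
      std::array<int, BW> BitProvenance;
      BitProvenance.fill(-1); // -1: output bit is known zero / unclaimed
      for (unsigned i = 0; i + Shift < BW; ++i)
        BitProvenance[i + Shift] = static_cast<int>(i);
      assert(BitProvenance[4] == 0 && BitProvenance[7] == 3);
      assert(BitProvenance[0] == -1); // low bits are zero-filled by the shift
    }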
/// -static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, - SmallVectorImpl<Value *> &ByteValues) { +static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask, + SmallVectorImpl<Value *> &BitValues, + SmallVectorImpl<int> &BitProvenance) { if (Instruction *I = dyn_cast<Instruction>(V)) { // If this is an or instruction, it may be an inner node of the bswap. - if (I->getOpcode() == Instruction::Or) { - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues) || - CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask, - ByteValues); - } - - // If this is a logical shift by a constant multiple of 8, recurse with - // OverallLeftShift and ByteMask adjusted. + if (I->getOpcode() == Instruction::Or) + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance) || + CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask, + BitValues, BitProvenance); + + // If this is a logical shift by a constant, recurse with OverallLeftShift + // and BitMask adjusted. if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { unsigned ShAmt = - cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); - // Ensure the shift amount is defined and of a byte value. - if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size())) + cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); + // Ensure the shift amount is defined. + if (ShAmt > BitValues.size()) return true; - unsigned ByteShift = ShAmt >> 3; + unsigned BitShift = ShAmt; if (I->getOpcode() == Instruction::Shl) { - // X << 2 -> collect(X, +2) - OverallLeftShift += ByteShift; - ByteMask >>= ByteShift; + // X << C -> collect(X, +C) + OverallLeftShift += BitShift; + BitMask = BitMask.lshr(BitShift); } else { - // X >>u 2 -> collect(X, -2) - OverallLeftShift -= ByteShift; - ByteMask <<= ByteShift; - ByteMask &= (~0U >> (32-ByteValues.size())); + // X >>u C -> collect(X, -C) + OverallLeftShift -= BitShift; + BitMask = BitMask.shl(BitShift); } - if (OverallLeftShift >= (int)ByteValues.size()) return true; - if (OverallLeftShift <= -(int)ByteValues.size()) return true; + if (OverallLeftShift >= (int)BitValues.size()) + return true; + if (OverallLeftShift <= -(int)BitValues.size()) + return true; - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } - // If this is a logical 'and' with a mask that clears bytes, clear the - // corresponding bytes in ByteMask. + // If this is a logical 'and' with a mask that clears bits, clear the + // corresponding bits in BitMask. if (I->getOpcode() == Instruction::And && isa<ConstantInt>(I->getOperand(1))) { - // Scan every byte of the and mask, seeing if the byte is either 0 or 255. - unsigned NumBytes = ByteValues.size(); - APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255); + unsigned NumBits = BitValues.size(); + APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); - for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) { - // If this byte is masked out by a later operation, we don't care what + for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) { + // If this bit is masked out by a later operation, we don't care what // the and mask is. - if ((ByteMask & (1 << i)) == 0) + if (BitMask[i] == 0) continue; - // If the AndMask is all zeros for this byte, clear the bit. 
- APInt MaskB = AndMask & Byte; + // If the AndMask is zero for this bit, clear the bit. + APInt MaskB = AndMask & Bit; if (MaskB == 0) { - ByteMask &= ~(1U << i); + BitMask.clearBit(i); continue; } - // If the AndMask is not all ones for this byte, it's not a bytezap. - if (MaskB != Byte) - return true; - - // Otherwise, this byte is kept. + // Otherwise, this bit is kept. } - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } } // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be - // the input value to the bswap. Some observations: 1) if more than one byte - // is demanded from this input, then it could not be successfully assembled - // into a byteswap. At least one of the two bytes would not be aligned with - // their ultimate destination. - if (!isPowerOf2_32(ByteMask)) return true; - unsigned InputByteNo = countTrailingZeros(ByteMask); - - // 2) The input and ultimate destinations must line up: if byte 3 of an i32 - // is demanded, it needs to go into byte 0 of the result. This means that the - // byte needs to be shifted until it lands in the right byte bucket. The - // shift amount depends on the position: if the byte is coming from the high - // part of the value (e.g. byte 3) then it must be shifted right. If from the - // low part, it must be shifted left. - unsigned DestByteNo = InputByteNo + OverallLeftShift; - if (ByteValues.size()-1-DestByteNo != InputByteNo) + // the input value to the bswap/bitreverse. To be part of a bswap or + // bitreverse we must be demanding a contiguous range of bits from it. + unsigned InputBitLen = BitMask.countPopulation(); + unsigned InputBitNo = BitMask.countTrailingZeros(); + if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo != + InputBitLen) + // Not a contiguous range of set bits! return true; - // If the destination byte value is already defined, the values are or'd - // together, which isn't a bswap (unless it's an or of the same bits). - if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V) + // We know we're moving a contiguous range of bits from the input to the + // output. Record which bits in the output came from which bits in the input. + unsigned DestBitNo = InputBitNo + OverallLeftShift; + for (unsigned I = 0; I < InputBitLen; ++I) + BitProvenance[DestBitNo + I] = InputBitNo + I; + + // If the destination bit value is already defined, the values are or'd + // together, which isn't a bswap/bitreverse (unless it's an or of the same + // bits). + if (BitValues[DestBitNo] && BitValues[DestBitNo] != V) return true; - ByteValues[DestByteNo] = V; + for (unsigned I = 0; I < InputBitLen; ++I) + BitValues[DestBitNo + I] = V; + return false; } -/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom. -/// If so, insert the new bswap intrinsic and return it. -Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { - IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); - if (!ITy || ITy->getBitWidth() % 16 || - // ByteMask only allows up to 32-byte values. - ITy->getBitWidth() > 32*8) - return nullptr; // Can only bswap pairs of bytes. Can't do vectors. +static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, + unsigned BitWidth) { + if (From % 8 != To % 8) + return false; + // Convert from bit indices to byte indices and check for a byte reversal.
+ From >>= 3; + To >>= 3; + BitWidth >>= 3; + return From == BitWidth - To - 1; +} - /// ByteValues - For each byte of the result, we keep track of which value - /// defines each byte. - SmallVector<Value*, 8> ByteValues; - ByteValues.resize(ITy->getBitWidth()/8); +static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, + unsigned BitWidth) { + return From == BitWidth - To - 1; +} +/// Given an OR instruction, check to see if this is a bswap or bitreverse +/// idiom. If so, insert the new intrinsic and return it. +Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) { + IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); + if (!ITy) + return nullptr; // Can't do vectors. + unsigned BW = ITy->getBitWidth(); + + /// We keep track of which bit (BitProvenance) inside which value (BitValues) + /// defines each bit in the result. + SmallVector<Value *, 8> BitValues(BW, nullptr); + SmallVector<int, 8> BitProvenance(BW, -1); + // Try to find all the pieces corresponding to the bswap. - uint32_t ByteMask = ~0U >> (32-ByteValues.size()); - if (CollectBSwapParts(&I, 0, ByteMask, ByteValues)) + APInt BitMask = APInt::getAllOnesValue(BitValues.size()); + if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance)) return nullptr; - // Check to see if all of the bytes come from the same value. - Value *V = ByteValues[0]; - if (!V) return nullptr; // Didn't find a byte? Must be zero. + // Check to see if all of the bits come from the same value. + Value *V = BitValues[0]; + if (!V) return nullptr; // Didn't find a bit? Must be zero. - // Check to make sure that all of the bytes come from the same value. - for (unsigned i = 1, e = ByteValues.size(); i != e; ++i) - if (ByteValues[i] != V) - return nullptr; - Module *M = I.getParent()->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); + if (!std::all_of(BitValues.begin(), BitValues.end(), + [&](const Value *X) { return X == V; })) + return nullptr; + + // Now, is the bit permutation correct for a bswap or a bitreverse? We can + // only byteswap values with an even number of bytes. + bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true; + for (unsigned i = 0, e = BitValues.size(); i != e; ++i) { + OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); + OKForBitReverse &= + bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); + } + + Intrinsic::ID Intrin; + if (OKForBSwap) + Intrin = Intrinsic::bswap; + else if (OKForBitReverse) + Intrin = Intrinsic::bitreverse; + else + return nullptr; + + Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy); return CallInst::Create(F, V); } -/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D). Check -/// If A is (cond?-1:0) and either B or D is ~(cond?-1,0) or (cond?0,-1), then -/// we can simplify this expression to "cond ? C : D or B". +/// We have an expression of the form (A&C)|(B&D). If A is (cond?-1:0) and +/// either B or D is ~(cond?-1:0) or (cond?0:-1), then we can simplify this +/// expression to "cond ? C : D or B". static Instruction *MatchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D) { // If A is not a select of -1/0, this cannot match. @@ -1688,7 +1776,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B, return nullptr; } -/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible. +/// Fold (icmp)|(icmp) if possible.
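The two predicates above reduce to a simple statement about bit positions: bswap preserves a bit's offset within its byte while reversing byte order, and bitreverse mirrors the whole bit index. A standalone restatement (an illustration only; okForBSwap and okForBitReverse are local stand-ins for the static helpers in the hunk):

    #include <cassert>

    static bool okForBSwap(unsigned From, unsigned To, unsigned BW) {
      if (From % 8 != To % 8)
        return false; // bswap never moves a bit within its byte
      return From / 8 == BW / 8 - To / 8 - 1; // bytes are reversed
    }
    static bool okForBitReverse(unsigned From, unsigned To, unsigned BW) {
      return From == BW - To - 1; // the whole bit index is mirrored
    }

    int main() {
      // In a 32-bit bswap, input bit 0 (bit 0 of byte 0) must land at bit 24
      // (bit 0 of byte 3); a bitreverse would send it to bit 31 instead.
      assert(okForBSwap(0, 24, 32) && !okForBSwap(0, 31, 32));
      assert(okForBitReverse(0, 31, 32) && !okForBitReverse(0, 24, 32));
    }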
Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1905,14 +1993,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: if (LHS->getOperand(0) == RHS->getOperand(0)) { // if LHSCst and RHSCst differ only by one bit: - // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 + // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2 assert(LHSCst->getValue().ule(RHSCst->getValue())); APInt Xor = LHSCst->getValue() ^ RHSCst->getValue(); if (Xor.isPowerOf2()) { - Value *NegCst = Builder->getInt(~Xor); - Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst); - return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst); + Value *Cst = Builder->getInt(Xor); + Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst); + return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst); } } @@ -2020,9 +2108,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return nullptr; } -/// FoldOrOfFCmps - Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function. Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_UNO && RHS->getPredicate() == FCmpInst::FCMP_UNO && @@ -2080,7 +2167,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return nullptr; } -/// FoldOrWithConstants - This helper function folds: +/// This helper function folds: /// /// ((A | B) & C1) | (B & C2) /// @@ -2199,14 +2286,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { ConstantInt *C1 = nullptr, *C2 = nullptr; // (A | B) | C and A | (B | C) -> bswap if possible. + bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) || + match(Op1, m_Or(m_Value(), m_Value())); // (A >> B) | (C << D) and (A << B) | (C >> D) -> bswap if possible. - if (match(Op0, m_Or(m_Value(), m_Value())) || - match(Op1, m_Or(m_Value(), m_Value())) || - (match(Op0, m_LogicalShift(m_Value(), m_Value())) && - match(Op1, m_LogicalShift(m_Value(), m_Value())))) { - if (Instruction *BSwap = MatchBSwap(I)) + bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) && + match(Op1, m_LogicalShift(m_Value(), m_Value())); + // (A & B) | (C & D) -> bswap if possible. + bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) && + match(Op1, m_And(m_Value(), m_Value())); + + if (OrOfOrs || OrOfShifts || OrOfAnds) + if (Instruction *BSwap = MatchBSwapOrBitReverse(I)) return BSwap; - } // (X^C)|Y -> (X|Y)^C iff Y&C == 0 if (Op0->hasOneUse() && @@ -2360,14 +2451,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); - // (~A | ~B) == (~(A & B)) - De Morgan's Law - if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal, - I.getName()+".demorgan"); - return BinaryOperator::CreateNot(And); - } + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; // Canonicalize xor to the RHS.
bool SwappedForXor = false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6de380b..090245d 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -67,8 +67,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned CopyAlign = MI->getAlignment(); if (CopyAlign < MinAlign) { - MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), - MinAlign, false)); + MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false)); return MI; } @@ -198,12 +197,140 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return nullptr; } +static Value *SimplifyX86immshift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + return nullptr; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + LogicalShift = false; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + LogicalShift = true; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + LogicalShift = true; ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + // Simplify if count is constant. + auto Arg1 = II.getArgOperand(1); + auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); + auto CDV = dyn_cast<ConstantDataVector>(Arg1); + auto CInt = dyn_cast<ConstantInt>(Arg1); + if (!CAZ && !CDV && !CInt) + return nullptr; + + APInt Count(64, 0); + if (CDV) { + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + auto VT = cast<VectorType>(CDV->getType()); + unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); + assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); + unsigned NumSubElts = 64 / BitWidth; + + // Concatenate the sub-elements to create the 64-bit value. 
+ for (unsigned i = 0; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); + Count = Count.shl(BitWidth); + Count |= SubElt->getValue().zextOrTrunc(64); + } + } + else if (CInt) + Count = CInt->getValue(); + + auto Vec = II.getArgOperand(0); + auto VT = cast<VectorType>(Vec->getType()); + auto SVT = VT->getElementType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If shift-by-zero then just return the original value. + if (Count == 0) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *SimplifyX86extend(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, + bool SignExtend) { + VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(II.getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign/zero extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0), + UndefValue::get(SrcTy), ShuffleMask); + return SignExtend ? Builder.CreateSExt(SV, DstTy) + : Builder.CreateZExt(SV, DstTy); +} + static Value *SimplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { VectorType *VecTy = cast<VectorType>(II.getType()); assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - + // The immediate permute control byte looks like this: // [3:0] - zero mask for each 32-bit lane // [5:4] - select one 32-bit destination lane @@ -248,12 +375,202 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II, // Replace the selected destination lane with the selected source lane. ShuffleMask[DestLane] = SourceLane + 4; } - + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); } return nullptr; } +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." 
+ APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt = Elt.lshr(Index).zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->equalsInt(0)) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. 
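A scalar model of both folds (an illustration only; extrqFold and insertqFold are invented names) using the same "length 0 means 64" convention; the Index + Length > 64 case is left unhandled because the hardware result is undefined there:

    #include <cassert>
    #include <cstdint>

    static uint64_t extrqFold(uint64_t V, unsigned Index, unsigned Length) {
      Length = Length ? Length : 64; // a length of zero encodes 64
      uint64_t Mask = Length == 64 ? ~0ULL : ((1ULL << Length) - 1);
      return (V >> Index) & Mask; // shift field down to bit 0, keep Length bits
    }

    static uint64_t insertqFold(uint64_t Dst, uint64_t Src, unsigned Index,
                                unsigned Length) {
      Length = Length ? Length : 64;
      uint64_t Low = Length == 64 ? ~0ULL : ((1ULL << Length) - 1);
      uint64_t Mask = Low << Index;
      // Clear the destination field, then drop the low Length bits of Src in.
      return (Dst & ~Mask) | ((Src << Index) & Mask);
    }

    int main() {
      assert(extrqFold(0xABCD, 8, 8) == 0xAB);
      assert(insertqFold(0xFFFF, 0x12, 8, 8) == 0x12FF);
    }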
+ // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit /// source vectors, unless a zero bit is set. If a zero bit is set, /// then ignore that half of the mask and clear that half of the vector. @@ -289,7 +606,7 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // The high bit of the selection field chooses the 1st or 2nd operand. bool LowInputSelect = Imm & 0x02; bool HighInputSelect = Imm & 0x20; - + // The low bit of the selection field chooses the low or high half // of the selected operand. bool LowHalfSelect = Imm & 0x01; @@ -298,11 +615,11 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // Determine which operand(s) are actually in use for this instruction. Value *V0 = LowInputSelect ? 
II.getArgOperand(1) : II.getArgOperand(0); Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); - + // If needed, replace operands based on zero mask. V0 = LowHalfZero ? ZeroVector : V0; V1 = HighHalfZero ? ZeroVector : V1; - + // Permute low half of result. unsigned StartIndex = LowHalfSelect ? HalfSize : 0; for (unsigned i = 0; i < HalfSize; ++i) @@ -319,6 +636,43 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, return nullptr; } +/// Decode XOP integer vector comparison intrinsics. +static Value *SimplifyX86vpcom(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + uint64_t Imm = CInt->getZExtValue() & 0x7; + VectorType *VecTy = cast<VectorType>(II.getType()); + CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + + switch (Imm) { + case 0x0: + Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; + break; + case 0x1: + Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; + break; + case 0x2: + Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; + break; + case 0x3: + Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; + break; + case 0x4: + Pred = ICmpInst::ICMP_EQ; break; + case 0x5: + Pred = ICmpInst::ICMP_NE; break; + case 0x6: + return ConstantInt::getSigned(VecTy, 0); // FALSE + case 0x7: + return ConstantInt::getSigned(VecTy, -1); // TRUE + } + + if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), II.getArgOperand(1))) + return Builder.CreateSExtOrTrunc(Cmp, VecTy); + } + return nullptr; +} + /// visitCallInst - CallInst simplification. This mostly only handles folding /// of intrinsic instructions. For normal calls, it allows visitCallSite to do /// the heavy lifting. @@ -371,7 +725,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) { if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) if (GVSrc->isConstant()) { - Module *M = CI.getParent()->getParent()->getParent(); + Module *M = CI.getModule(); Intrinsic::ID MemCpyID = Intrinsic::memcpy; Type *Tys[3] = { CI.getArgOperand(0)->getType(), CI.getArgOperand(1)->getType(), @@ -400,6 +754,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } + auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth) + { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { @@ -427,6 +788,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::bitreverse: { + Value *IIOperand = II->getArgOperand(0); + Value *X = nullptr; + + // bitreverse(bitreverse(x)) -> x + if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) + return ReplaceInstUsesWith(CI, X); + break; + } + case Intrinsic::powi: if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // powi(x, 0) -> 1.0 @@ -669,6 +1040,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return new StoreInst(II->getArgOperand(0), Ptr); } break; + case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: @@ -682,6 +1054,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; + case Intrinsic::x86_vcvtph2ps_128: + case Intrinsic::x86_vcvtph2ps_256: { + auto Arg = 
II->getArgOperand(0); + auto ArgType = cast<VectorType>(Arg->getType()); + auto RetType = cast<VectorType>(II->getType()); + unsigned ArgWidth = ArgType->getNumElements(); + unsigned RetWidth = RetType->getNumElements(); + assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); + assert(ArgType->isIntOrIntVectorTy() && + ArgType->getScalarSizeInBits() == 16 && + "CVTPH2PS input type should be 16-bit integer vector"); + assert(RetType->getScalarType()->isFloatTy() && + "CVTPH2PS output type should be 32-bit float vector"); + + // Constant folding: Convert to generic half to single conversion. + if (isa<ConstantAggregateZero>(Arg)) + return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); + + if (isa<ConstantDataVector>(Arg)) { + auto VectorHalfAsShorts = Arg; + if (RetWidth < ArgWidth) { + SmallVector<int, 8> SubVecMask; + for (unsigned i = 0; i != RetWidth; ++i) + SubVecMask.push_back((int)i); + VectorHalfAsShorts = Builder->CreateShuffleVector( + Arg, UndefValue::get(ArgType), SubVecMask); + } + + auto VectorHalfType = + VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); + auto VectorHalfs = + Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType); + auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType); + return ReplaceInstUsesWith(*II, VectorFloats); + } + + // We only use the lowest lanes of the argument. + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { + II->setArgOperand(0, V); + return II; + } + break; + } + case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: case Intrinsic::x86_sse_cvttss2si: @@ -692,194 +1108,229 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_cvttsd2si64: { // These intrinsics only demand the 0th element of their input vectors. If // we can simplify the input based on that, do so now. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - APInt DemandedElts(VWidth, 1); - APInt UndefElts(VWidth, 0); - if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), - DemandedElts, UndefElts)) { + Value *Arg = II->getArgOperand(0); + unsigned VWidth = Arg->getType()->getVectorNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { II->setArgOperand(0, V); return II; } break; } - // Constant fold <A x Bi> << Ci. - // FIXME: We don't handle _dq because it's a shift of an i128, but is - // represented in the IR as <2 x i64>. A per element shift is wrong. - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: + // Constant fold ashr( <A x Bi>, Ci ). + // Constant fold lshr( <A x Bi>, Ci ). + // Constant fold shl( <A x Bi>, Ci ). 
+ case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: { - // Simplify if count is constant. To 0 if >= BitWidth, - // otherwise to shl/lshr. - auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1)); - auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1)); - if (!CDV && !CInt) - break; - ConstantInt *Count; - if (CDV) - Count = cast<ConstantInt>(CDV->getElementAsConstant(0)); - else - Count = CInt; - - auto Vec = II->getArgOperand(0); - auto VT = cast<VectorType>(Vec->getType()); - if (Count->getZExtValue() > - VT->getElementType()->getPrimitiveSizeInBits() - 1) - return ReplaceInstUsesWith( - CI, ConstantAggregateZero::get(Vec->getType())); - - bool isPackedShiftLeft = true; - switch (II->getIntrinsicID()) { - default : break; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break; - } - - unsigned VWidth = VT->getNumElements(); - // Get a constant vector of the same type as the first operand. - auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); - if (isPackedShiftLeft) - return BinaryOperator::CreateShl(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); - - return BinaryOperator::CreateLShr(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: { + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector + // operand to compute the shift amount. 
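As a reader aid, the demanded-elements note above is easier to see with a scalar model of what SimplifyX86immshift is folding. The sketch below is an illustration only (assumed 32-bit elements, and it assumes the usual arithmetic right shift for signed values): the key point is that hardware packed shifts saturate on out-of-range counts, while IR shl/lshr/ashr are undefined there, so the fold must treat counts >= the element width specially.

    #include <cstdint>

    // Assumed scalar model of an x86 packed shift (not LLVM code). The
    // non-immediate forms read the count from the low 64 bits of the second
    // vector operand, which is why only those lanes are demanded above.
    uint32_t psrlElt(uint32_t V, uint64_t Count) {
      return Count >= 32 ? 0 : V >> unsigned(Count); // logical: shifts in zeros
    }
    int32_t psraElt(int32_t V, uint64_t Count) {
      return V >> (Count >= 32 ? 31u : unsigned(Count)); // arithmetic: keeps sign
    }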
+ Value *Arg1 = II->getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = Arg1->getType()->getVectorNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + II->setArgOperand(1, V); + return II; + } + break; } - case Intrinsic::x86_sse41_pmovsxbw: - case Intrinsic::x86_sse41_pmovsxwd: - case Intrinsic::x86_sse41_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxbd: + case Intrinsic::x86_avx2_pmovsxbq: + case Intrinsic::x86_avx2_pmovsxbw: + case Intrinsic::x86_avx2_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxwd: + case Intrinsic::x86_avx2_pmovsxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_pmovzxbd: + case Intrinsic::x86_sse41_pmovzxbq: case Intrinsic::x86_sse41_pmovzxbw: + case Intrinsic::x86_sse41_pmovzxdq: case Intrinsic::x86_sse41_pmovzxwd: - case Intrinsic::x86_sse41_pmovzxdq: { - // pmov{s|z}x ignores the upper half of their input vectors. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - unsigned LowHalfElts = VWidth / 2; - APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts)); - APInt UndefElts(VWidth, 0); - if (Value *TmpV = SimplifyDemandedVectorElts( - II->getArgOperand(0), InputDemandedElts, UndefElts)) { - II->setArgOperand(0, TmpV); + case Intrinsic::x86_sse41_pmovzxwq: + case Intrinsic::x86_avx2_pmovzxbd: + case Intrinsic::x86_avx2_pmovzxbq: + case Intrinsic::x86_avx2_pmovzxbw: + case Intrinsic::x86_avx2_pmovzxdq: + case Intrinsic::x86_avx2_pmovzxwd: + case Intrinsic::x86_avx2_pmovzxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_insertps: + if (Value *V = SimplifyX86insertps(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CILength = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + II->setArgOperand(1, V); return II; } break; } - case Intrinsic::x86_sse41_insertps: - if (Value *V = SimplifyX86insertps(*II, *Builder)) + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. 
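A standalone sketch of the EXTRQI semantics just described, using plain 64-bit integers instead of APInt (a hypothetical helper, not LLVM code); per the AMD documentation a length field of zero means 64, and an out-of-range field pair gives an undefined result:

    #include <cassert>
    #include <cstdint>

    // Pull 'Length' bits starting at bit 'Index' out of the low 64 bits of
    // the source and zero-extend them, mirroring what SimplifyX86extrq models.
    uint64_t extrqiBits(uint64_t Src, unsigned Index, unsigned Length) {
      if (Length == 0)
        Length = 64; // a zero length field means 64 per the ISA
      assert(Index + Length <= 64 && "result undefined per the ISA");
      uint64_t Lo = Src >> Index;
      return Length == 64 ? Lo : (Lo & ((1ULL << Length) - 1));
    }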
+ Value *Op0 = II->getArgOperand(0); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) return ReplaceInstUsesWith(*II, V); + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + Op1->getType()->getVectorNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. + if (CI11) { + APInt V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } break; - + } + case Intrinsic::x86_sse4a_insertqi: { - // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top - // ones undef - // TODO: eventually we should lower this intrinsic to IR - if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) { - if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) { - unsigned Index = CIStart->getZExtValue(); - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = CIWidth->equalsInt(0) ? 64 : CIWidth->getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if ((Index + Length) > 64) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - - if (Length == 64 && Index == 0) { - Value *Vec = II->getArgOperand(1); - Value *Undef = UndefValue::get(Vec->getType()); - const uint32_t Mask[] = { 0, 2 }; - return ReplaceInstUsesWith( - CI, - Builder->CreateShuffleVector( - Vec, Undef, ConstantDataVector::get( - II->getContext(), makeArrayRef(Mask)))); - - } else if (auto Source = - dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { - if (Source->hasOneUse() && - Source->getArgOperand(1) == II->getArgOperand(1)) { - // If the source of the insert has only one use and it's another - // insert (and they're both inserting from the same vector), try to - // bundle both together. 
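The constant-fold branch of the INSERTQ/INSERTQI handling above computes a masked bit insertion. A minimal standalone model of it, assuming plain 64-bit integers in place of APInt (hypothetical helper, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Insert the low 'Length' bits of Src into Dst starting at bit 'Index'.
    // A length field of zero means 64, and Index + Length > 64 is undefined,
    // matching the checks in the code above.
    uint64_t insertqiBits(uint64_t Dst, uint64_t Src, unsigned Index,
                          unsigned Length) {
      if (Length == 0)
        Length = 64;
      assert(Index + Length <= 64 && "result undefined per the ISA");
      uint64_t Mask = Length == 64 ? ~0ULL : (((1ULL << Length) - 1) << Index);
      return (Dst & ~Mask) | ((Src << Index) & Mask);
    }

For example, insertqiBits(~0ULL, 0, 8, 8) clears exactly byte 1, giving 0xFFFFFFFFFFFF00FF.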
- auto CISourceWidth = - dyn_cast<ConstantInt>(Source->getArgOperand(2)); - auto CISourceStart = - dyn_cast<ConstantInt>(Source->getArgOperand(3)); - if (CISourceStart && CISourceWidth) { - unsigned Start = CIStart->getZExtValue(); - unsigned Width = CIWidth->getZExtValue(); - unsigned End = Start + Width; - unsigned SourceStart = CISourceStart->getZExtValue(); - unsigned SourceWidth = CISourceWidth->getZExtValue(); - unsigned SourceEnd = SourceStart + SourceWidth; - unsigned NewStart, NewWidth; - bool ShouldReplace = false; - if (Start <= SourceStart && SourceStart <= End) { - NewStart = Start; - NewWidth = std::max(End, SourceEnd) - NewStart; - ShouldReplace = true; - } else if (SourceStart <= Start && Start <= SourceEnd) { - NewStart = SourceStart; - NewWidth = std::max(SourceEnd, End) - NewStart; - ShouldReplace = true; - } - - if (ShouldReplace) { - Constant *ConstantWidth = ConstantInt::get( - II->getArgOperand(2)->getType(), NewWidth, false); - Constant *ConstantStart = ConstantInt::get( - II->getArgOperand(3)->getType(), NewStart, false); - Value *Args[4] = { Source->getArgOperand(0), - II->getArgOperand(1), ConstantWidth, - ConstantStart }; - Module *M = CI.getParent()->getParent()->getParent(); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); - } - } - } - } - } + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + II->setArgOperand(1, V); + return II; } break; } @@ -894,7 +1345,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // This optimization is convoluted because the intrinsic is defined as // getting a vector of floats or doubles for the ps and pd versions. // FIXME: That should be changed. + + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); Value *Mask = II->getArgOperand(2); + + // fold (blend A, A, Mask) -> A + if (Op0 == Op1) + return ReplaceInstUsesWith(CI, Op0); + + // Zero Mask - select 1st argument. + if (isa<ConstantAggregateZero>(Mask)) + return ReplaceInstUsesWith(CI, Op0); + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 
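Per lane, the blend being folded to an IR select works like the sketch below (assumed 32-bit lanes, illustration only): only the top bit of each mask lane matters, which is why a constant mask can be collapsed into an i1 selector vector.

    #include <cstdint>

    // Assumed scalar model of one BLENDV lane (not LLVM code): a set top
    // bit in the mask lane selects the second operand.
    uint32_t blendvLane(uint32_t Op0, uint32_t Op1, uint32_t MaskLane) {
      return (MaskLane & 0x80000000u) ? Op1 : Op0;
    }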
if (auto C = dyn_cast<ConstantDataVector>(Mask)) { auto Tyi1 = Builder->getInt1Ty(); auto SelectorType = cast<VectorType>(Mask->getType()); @@ -917,11 +1381,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); } auto NewSelector = ConstantVector::get(Selectors); - return SelectInst::Create(NewSelector, II->getArgOperand(1), - II->getArgOperand(0), "blendv"); - } else { - break; + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); } + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: { + // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant. + auto *V = II->getArgOperand(1); + auto *VTy = cast<VectorType>(V->getType()); + unsigned NumElts = VTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32) && + "Unexpected number of elements in shuffle mask!"); + // Initialize the resulting shuffle mask to all zeroes. + uint32_t Indexes[32] = {0}; + + if (auto *Mask = dyn_cast<ConstantDataVector>(V)) { + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + int8_t Index = Mask->getElementAsInteger(I); + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index is the least significant 4 bits of the + // shuffle control byte. + Indexes[I] = (Index < 0) ? NumElts : Index & 0xF; + } + } else if (!isa<ConstantAggregateZero>(V)) + break; + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + for (unsigned I = 16; I < NumElts; ++I) + Indexes[I] += I & 0xF0; + + auto NewC = ConstantDataVector::get(V->getContext(), + makeArrayRef(Indexes, NumElts)); + auto V1 = II->getArgOperand(0); + auto V2 = Constant::getNullValue(II->getType()); + auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC); + return ReplaceInstUsesWith(CI, Shuffle); } case Intrinsic::x86_avx_vpermilvar_ps: @@ -972,6 +1475,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(*II, V); break; + case Intrinsic::x86_xop_vpcomb: + case Intrinsic::x86_xop_vpcomd: + case Intrinsic::x86_xop_vpcomq: + case Intrinsic::x86_xop_vpcomw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_xop_vpcomub: + case Intrinsic::x86_xop_vpcomud: + case Intrinsic::x86_xop_vpcomuq: + case Intrinsic::x86_xop_vpcomuw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -1115,15 +1634,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // happen when variable allocas are DCE'd. if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { if (SS->getIntrinsicID() == Intrinsic::stacksave) { - BasicBlock::iterator BI = SS; - if (&*++BI == II) + if (&*++SS->getIterator() == II) return EraseInstFromFunction(CI); } } // Scan down this block to see if there is another stack restore in the // same block without an intervening call/alloca. 
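Looking back at the PSHUFB fold earlier in this hunk, a minimal model of the byte-shuffle semantics it decodes may help (assumed illustration, not LLVM code):

    #include <array>
    #include <cstdint>

    // Assumed model of 128-bit PSHUFB: each control byte selects a source
    // byte through its low 4 bits, or produces zero when its sign bit is
    // set, which is why the fold shuffles against a zero vector.
    std::array<uint8_t, 16> pshufb128(const std::array<uint8_t, 16> &Src,
                                      const std::array<uint8_t, 16> &Ctl) {
      std::array<uint8_t, 16> R{};
      for (int I = 0; I != 16; ++I)
        R[I] = (Ctl[I] & 0x80) ? 0 : Src[Ctl[I] & 0x0F];
      return R;
    }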
- BasicBlock::iterator BI = II; + BasicBlock::iterator BI(II); TerminatorInst *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { @@ -1153,6 +1671,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return EraseInstFromFunction(CI); break; } + case Intrinsic::lifetime_start: { + // Remove trivially empty lifetime_start/end ranges, i.e. a start + // immediately followed by an end (ignoring debuginfo or other + // lifetime markers in between). + BasicBlock::iterator BI = II->getIterator(), BE = II->getParent()->end(); + for (++BI; BI != BE; ++BI) { + if (IntrinsicInst *LTE = dyn_cast<IntrinsicInst>(BI)) { + if (isa<DbgInfoIntrinsic>(LTE) || + LTE->getIntrinsicID() == Intrinsic::lifetime_start) + continue; + if (LTE->getIntrinsicID() == Intrinsic::lifetime_end) { + if (II->getOperand(0) == LTE->getOperand(0) && + II->getOperand(1) == LTE->getOperand(1)) { + EraseInstFromFunction(*LTE); + return EraseInstFromFunction(*II); + } + continue; + } + } + break; + } + break; + } case Intrinsic::assume: { // Canonicalize assume(a && b) -> assume(a); assume(b); // Note: New assumption intrinsics created here are registered by @@ -1206,8 +1747,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Translate facts known about a pointer before relocating into // facts about the relocate value, while being careful to // preserve relocation semantics. - GCRelocateOperands Operands(II); - Value *DerivedPtr = Operands.getDerivedPtr(); + Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr(); auto *GCRelocateType = cast<PointerType>(II->getType()); // Remove the relocation if unused, note that this check is required @@ -1233,7 +1773,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } // isKnownNonNull -> nonnull attribute - if (isKnownNonNull(DerivedPtr)) + if (isKnownNonNullAt(DerivedPtr, II, DT, TLI)) II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); // isDereferenceablePointer -> deref attribute @@ -1355,9 +1895,10 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp, Value *TrampMem) { // Visit all the previous instructions in the basic block, and try to find a // init.trampoline which has a direct path to the adjust.trampoline. - for (BasicBlock::iterator I = AdjustTramp, - E = AdjustTramp->getParent()->begin(); I != E; ) { - Instruction *Inst = --I; + for (BasicBlock::iterator I = AdjustTramp->getIterator(), + E = AdjustTramp->getParent()->begin(); + I != E;) { + Instruction *Inst = &*--I; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) if (II->getIntrinsicID() == Intrinsic::init_trampoline && II->getOperand(0) == TrampMem) @@ -1400,20 +1941,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. This is helpful for inlining calls to functions with null // checks on their arguments. 
+ SmallVector<unsigned, 4> Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { - if (!CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && - isKnownNonNull(V)) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && + isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); + if (!Indices.empty()) { + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, + Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + Changed = true; + } + // If the callee is a pointer to a function, attempt to move any casts to the // arguments of the call/invoke. Value *Callee = CS.getCalledValue(); @@ -1725,16 +2273,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(), attrVec); + SmallVector<OperandBundleDef, 1> OpBundles; + CS.getOperandBundlesAsDefs(OpBundles); + Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { - NC = Builder->CreateInvoke(Callee, II->getNormalDest(), - II->getUnwindDest(), Args); + NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles); NC->takeName(II); cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv()); cast<InvokeInst>(NC)->setAttributes(NewCallerPAL); } else { CallInst *CI = cast<CallInst>(Caller); - NC = Builder->CreateCall(Callee, Args); + NC = Builder->CreateCall(Callee, Args, OpBundles); NC->takeName(CI); if (CI->isTailCall()) cast<CallInst>(NC)->setTailCall(); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 48ab0eb..0f01d18 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -21,11 +21,11 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear -/// expression. If so, decompose it, returning some value X, such that Val is +/// Analyze 'Val', seeing if it is a simple linear expression. +/// If so, decompose it, returning some value X, such that Val is /// X*Scale+Offset. /// -static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, +static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale, uint64_t &Offset) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { Offset = CI->getZExtValue(); @@ -62,7 +62,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, // where C1 is divisible by C2. unsigned SubScale; Value *SubVal = - DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); + decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); Offset += RHS->getZExtValue(); Scale = SubScale; return SubVal; @@ -76,14 +76,14 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, return Val; } -/// PromoteCastOfAllocation - If we find a cast of an allocation instruction, -/// try to eliminate the cast by moving the type information into the alloc. +/// If we find a cast of an allocation instruction, try to eliminate the cast by +/// moving the type information into the alloc. 
Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { PointerType *PTy = cast<PointerType>(CI.getType()); BuilderTy AllocaBuilder(*Builder); - AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); + AllocaBuilder.SetInsertPoint(&AI); // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); @@ -114,7 +114,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, unsigned ArraySizeScale; uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. - DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); + decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. @@ -154,9 +154,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, return ReplaceInstUsesWith(CI, New); } -/// EvaluateInDifferentType - Given an expression that -/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually -/// insert the code to evaluate the expression. +/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns +/// true for, actually insert the code to evaluate the expression. Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { @@ -261,9 +260,9 @@ isEliminableCastPair(const CastInst *CI, ///< First cast instruction return Instruction::CastOps(Res); } -/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually -/// results in any code being generated and is interesting to optimize out. If -/// the cast can be eliminated by some other simple transformation, we prefer +/// Return true if the cast from "V to Ty" actually results in any code being +/// generated and is interesting to optimize out. +/// If the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, Type *Ty) { @@ -318,9 +317,9 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { return nullptr; } -/// CanEvaluateTruncated - Return true if we can evaluate the specified -/// expression tree as type Ty instead of its larger type, and arrive with the -/// same value. This is used by code that tries to eliminate truncates. +/// Return true if we can evaluate the specified expression tree as type Ty +/// instead of its larger type, and arrive with the same value. +/// This is used by code that tries to eliminate truncates. /// /// Ty will always be a type smaller than V. We should return true if trunc(V) /// can be computed by computing V in the smaller type. If V is an instruction, @@ -329,7 +328,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, Instruction *CxtI) { // We can always evaluate constants in another type. if (isa<Constant>(V)) @@ -359,8 +358,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, case Instruction::Or: case Instruction::Xor: // These operators can all arbitrarily be extended or truncated. 
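That claim can be made concrete; the recursion that continues just below relies on identities like this sketch (assumed i32 to i8 truncation, illustration only). The same identity holds for and/or/xor and for add/sub/mul, which is what lets canEvaluateTruncated recurse through them:

    #include <cstdint>

    // Computing in the narrow type first yields the same low bits as
    // truncating the wide result (not LLVM code).
    uint8_t truncAfter(uint32_t A, uint32_t B)  { return uint8_t(A & B); }
    uint8_t truncBefore(uint32_t A, uint32_t B) { return uint8_t(A) & uint8_t(B); }
    // truncAfter(A, B) == truncBefore(A, B) for every A and B.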
- return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); case Instruction::UDiv: case Instruction::URem: { @@ -371,8 +370,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth); if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) && IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); } } break; @@ -383,7 +382,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { uint32_t BitWidth = Ty->getScalarSizeInBits(); if (CI->getLimitedValue(BitWidth) < BitWidth) - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } break; case Instruction::LShr: @@ -396,7 +395,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (IC.MaskedValueIsZero(I->getOperand(0), APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) && CI->getLimitedValue(BitWidth) < BitWidth) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } } break; @@ -410,8 +409,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return true; case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); - return CanEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && - CanEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); + return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && + canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); } case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never @@ -419,7 +418,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, // instructions with a single use. PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!CanEvaluateTruncated(IncValue, Ty, IC, CxtI)) + if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI)) return false; return true; } @@ -431,6 +430,50 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return false; } +/// Given a vector that is bitcast to an integer, optionally logically +/// right-shifted, and truncated, convert it to an extractelement. 
+/// Example (big endian):
+///   trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
+///   --->
+///   extractelement <4 x i32> %X, 1
+static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC,
+                                         const DataLayout &DL) {
+  Value *TruncOp = Trunc.getOperand(0);
+  Type *DestType = Trunc.getType();
+  if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
+    return nullptr;
+
+  Value *VecInput = nullptr;
+  ConstantInt *ShiftVal = nullptr;
+  if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)),
+                                  m_LShr(m_BitCast(m_Value(VecInput)),
+                                         m_ConstantInt(ShiftVal)))) ||
+      !isa<VectorType>(VecInput->getType()))
+    return nullptr;
+
+  VectorType *VecType = cast<VectorType>(VecInput->getType());
+  unsigned VecWidth = VecType->getPrimitiveSizeInBits();
+  unsigned DestWidth = DestType->getPrimitiveSizeInBits();
+  unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0;
+
+  if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0))
+    return nullptr;
+
+  // If the element type of the vector doesn't match the result type,
+  // bitcast it to a vector type that we can extract from.
+  unsigned NumVecElts = VecWidth / DestWidth;
+  if (VecType->getElementType() != DestType) {
+    VecType = VectorType::get(DestType, NumVecElts);
+    VecInput = IC.Builder->CreateBitCast(VecInput, VecType, "bc");
+  }
+
+  unsigned Elt = ShiftAmount / DestWidth;
+  if (DL.isBigEndian())
+    Elt = NumVecElts - 1 - Elt;
+
+  return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
+}
+
 Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (Instruction *Result = commonCastTransforms(CI))
     return Result;
@@ -441,7 +484,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // min/max.
   Value *LHS, *RHS;
   if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
-    if (matchSelectPattern(SI, LHS, RHS) != SPF_UNKNOWN)
+    if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
       return nullptr;
 
   // See if we can simplify any instructions used by the input whose sole
@@ -457,7 +500,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // expression tree to something weird like i93 unless the source is also
   // strange.
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateTruncated(Src, DestTy, *this, &CI)) {
+      canEvaluateTruncated(Src, DestTy, *this, &CI)) {
 
     // If this cast is a truncate, evaluating in a different type always
     // eliminates the cast, so it is always a win.
@@ -470,7 +513,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
-    Constant *One = ConstantInt::get(Src->getType(), 1);
+    Constant *One = ConstantInt::get(SrcTy, 1);
     Src = Builder->CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
     return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
@@ -489,31 +532,54 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
     // If the shift amount is larger than the size of A, then the result is
     // known to be zero because all the input bits got shifted out.
     if (Cst->getZExtValue() >= ASize)
-      return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType()));
+      return ReplaceInstUsesWith(CI, Constant::getNullValue(DestTy));
 
     // Since we're doing an lshr and a zero extend, and know that the shift
     // amount is smaller than ASize, it is always safe to do the shift in A's
     // type, then zero extend or truncate to the result.
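The hunk continues below with a new companion fold, trunc(lshr(sext A), Cst) -> ashr A, Cst. A standalone demonstration of why the two agree, assuming an i8 source sign-extended to i32 and Cst < 8 as the transform requires (and assuming the usual arithmetic shift for signed values; illustration only, not LLVM code):

    #include <cstdint>

    int8_t viaSextLshrTrunc(int8_t A, unsigned Cst) {
      int32_t Wide = A;                          // sext i8 -> i32
      uint32_t Shifted = uint32_t(Wide) >> Cst;  // lshr i32
      return int8_t(Shifted);                    // trunc i32 -> i8
    }
    int8_t viaAshr(int8_t A, unsigned Cst) {
      return int8_t(A >> Cst);                   // ashr i8
    }
    // For every A and every Cst < 8 the two agree: the bits the lshr pulls
    // down out of the sign extension are exactly the sign-bit copies an
    // ashr would produce.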
Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue()); Shift->takeName(Src); - return CastInst::CreateIntegerCast(Shift, CI.getType(), false); + return CastInst::CreateIntegerCast(Shift, DestTy, false); + } + + // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type + // conversion. + // It works because bits coming from sign extension have the same value as + // the sign bit of the original value; performing ashr instead of lshr + // generates bits of the same value as the sign bit. + if (Src->hasOneUse() && + match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && + cast<Instruction>(Src)->getOperand(0)->hasOneUse()) { + const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + // This optimization can be only performed when zero bits generated by + // the original lshr aren't pulled into the value after truncation, so we + // can only shift by values smaller than the size of destination type (in + // bits). + if (Cst->getValue().ult(ASize)) { + Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); + Shift->takeName(Src); + return CastInst::CreateIntegerCast(Shift, CI.getType(), true); + } } // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest // type isn't non-native. - if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && - ShouldChangeType(Src->getType(), CI.getType()) && + if (Src->hasOneUse() && isa<IntegerType>(SrcTy) && + ShouldChangeType(SrcTy, DestTy) && match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) { - Value *NewTrunc = Builder->CreateTrunc(A, CI.getType(), A->getName()+".tr"); + Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr"); return BinaryOperator::CreateAnd(NewTrunc, - ConstantExpr::getTrunc(Cst, CI.getType())); + ConstantExpr::getTrunc(Cst, DestTy)); } + if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL)) + return I; + return nullptr; } -/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (zext icmp) to bitwise / integer operations in order to eliminate +/// the icmp. Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, bool DoXform) { // If we are just checking for a icmp eq of a single bit and zext'ing it @@ -525,19 +591,19 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. 
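The two rewrites named in the comment above, written out as a runnable scalar sketch with an assumed 32-bit width (illustration only, not LLVM code):

    #include <cstdint>

    // zext(x <s 0): the unsigned shift moves the sign bit down to bit 0.
    uint32_t zextIsNegative(int32_t X)    { return uint32_t(X) >> 31; }
    // zext(x >s -1): the same shift, inverted with an xor.
    uint32_t zextIsNonNegative(int32_t X) { return (uint32_t(X) >> 31) ^ 1; }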
if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) || - (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())) { + (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) { if (!DoXform) return ICI; Value *In = ICI->getOperand(0); Value *Sh = ConstantInt::get(In->getType(), - In->getType()->getScalarSizeInBits()-1); - In = Builder->CreateLShr(In, Sh, In->getName()+".lobit"); + In->getType()->getScalarSizeInBits() - 1); + In = Builder->CreateLShr(In, Sh, In->getName() + ".lobit"); if (In->getType() != CI.getType()) In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/); if (ICI->getPredicate() == ICmpInst::ICMP_SGT) { Constant *One = ConstantInt::get(In->getType(), 1); - In = Builder->CreateXor(In, One, In->getName()+".not"); + In = Builder->CreateXor(In, One, In->getName() + ".not"); } return ReplaceInstUsesWith(CI, In); @@ -573,13 +639,13 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, return ReplaceInstUsesWith(CI, Res); } - uint32_t ShiftAmt = KnownZeroMask.logBase2(); + uint32_t ShAmt = KnownZeroMask.logBase2(); Value *In = ICI->getOperand(0); - if (ShiftAmt) { + if (ShAmt) { // Perform a logical shr by shiftamt. // Insert the shift to put the result in the low bit. - In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt), - In->getName()+".lobit"); + In = Builder->CreateLShr(In, ConstantInt::get(In->getType(), ShAmt), + In->getName() + ".lobit"); } if ((Op1CV != 0) == isNE) { // Toggle the low bit. @@ -637,8 +703,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, return nullptr; } -/// CanEvaluateZExtd - Determine if the specified value can be computed in the -/// specified wider type and produce the same low bits. If not, return false. +/// Determine if the specified value can be computed in the specified wider type +/// and produce the same low bits. If not, return false. /// /// If this function returns true, it can also return a non-zero number of bits /// (in BitsToClear) which indicates that the value it computes is correct for @@ -655,7 +721,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, /// clear the top bits anyway, doing this has no extra cost. /// /// This function works on both vectors and scalars. -static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, +static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, InstCombiner &IC, Instruction *CxtI) { BitsToClear = 0; if (isa<Constant>(V)) @@ -685,8 +751,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::Add: case Instruction::Sub: case Instruction::Mul: - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) return false; // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) @@ -713,7 +779,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote shl(x, cst) if we can promote x. Since shl overwrites the // upper bits we can reduce BitsToClear by the shift amount. 
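The shl case is the interesting part of the BitsToClear bookkeeping, and the code just below implements it. A concrete instance with an assumed i8 to i32 zero extension: evaluating the shift in the wide type is correct once the bits above the original width are masked off, and the shift itself supplies zeros at the low end, which is the reduction the comment above describes (illustration only, not LLVM code):

    #include <cstdint>

    uint32_t viaNarrowShl(uint8_t X, unsigned C) {
      return uint8_t(X << C);                  // shl in i8, then zext
    }
    uint32_t viaWideShl(uint8_t X, unsigned C) {
      return (uint32_t(X) << C) & 0xFF;        // shl in i32, then clear high bits
    }
    // viaNarrowShl(X, C) == viaWideShl(X, C) for every X and C < 8.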
if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; uint64_t ShiftAmt = Amt->getZExtValue(); BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; @@ -724,7 +790,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; BitsToClear += Amt->getZExtValue(); if (BitsToClear > V->getType()->getScalarSizeInBits()) @@ -734,8 +800,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // Cannot promote variable LSHR. return false; case Instruction::Select: - if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || + if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear are // known zero in the disagreeing side. Tmp != BitsToClear) @@ -747,10 +813,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast<PHINode>(I); - if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) return false; for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || + if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear // are known zero in the disagreeing input. Tmp != BitsToClear) @@ -787,13 +853,13 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // strange. unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { + canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid zero extend: " << CI); + " to avoid zero extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); @@ -897,8 +963,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { return nullptr; } -/// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp. 
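The canonical instance of the transform named above, as a scalar sketch with an assumed 32-bit width (and assuming the usual arithmetic shift for signed values; not LLVM code):

    #include <cstdint>

    // sext(x <s 0): an arithmetic shift smears the sign bit across the
    // word, yielding all-ones for negative x and zero otherwise.
    int32_t sextIsNegative(int32_t X) { return X >> 31; }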
 Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
   Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
   ICmpInst::Predicate Pred = ICI->getPredicate();
@@ -985,15 +1050,14 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
   return nullptr;
 }
 
-/// CanEvaluateSExtd - Return true if we can take the specified value
-/// and return it as type Ty without inserting any new casts and without
-/// changing the value of the common low bits. This is used by code that tries
-/// to promote integer operations to a wider types will allow us to eliminate
-/// the extension.
+/// Return true if we can take the specified value and return it as type Ty
+/// without inserting any new casts and without changing the value of the common
+/// low bits. This is used by code that tries to promote integer operations to
+/// a wider type, which will allow us to eliminate the extension.
 ///
 /// This function works on both vectors and scalars.
 ///
-static bool CanEvaluateSExtd(Value *V, Type *Ty) {
+static bool canEvaluateSExtd(Value *V, Type *Ty) {
   assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
          "Can't sign extend type to a smaller type");
   // If this is a constant, it can be trivially promoted.
@@ -1023,15 +1087,15 @@ static bool canEvaluateSExtd(Value *V, Type *Ty) {
   case Instruction::Sub:
   case Instruction::Mul:
     // These operators can all arbitrarily be extended if their inputs can.
-    return CanEvaluateSExtd(I->getOperand(0), Ty) &&
-           CanEvaluateSExtd(I->getOperand(1), Ty);
+    return canEvaluateSExtd(I->getOperand(0), Ty) &&
+           canEvaluateSExtd(I->getOperand(1), Ty);
 
   //case Instruction::Shl:   TODO
   //case Instruction::LShr:  TODO
 
   case Instruction::Select:
-    return CanEvaluateSExtd(I->getOperand(1), Ty) &&
-           CanEvaluateSExtd(I->getOperand(2), Ty);
+    return canEvaluateSExtd(I->getOperand(1), Ty) &&
+           canEvaluateSExtd(I->getOperand(2), Ty);
 
   case Instruction::PHI: {
     // We can change a phi if we can change all operands. Note that we never
@@ -1039,7 +1103,7 @@ static bool canEvaluateSExtd(Value *V, Type *Ty) {
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
     for (Value *IncValue : PN->incoming_values())
-      if (!CanEvaluateSExtd(IncValue, Ty)) return false;
+      if (!canEvaluateSExtd(IncValue, Ty)) return false;
     return true;
   }
   default:
@@ -1081,10 +1145,10 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
   // expression tree to something weird like i93 unless the source is also
   // strange.
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateSExtd(Src, DestTy)) {
+      canEvaluateSExtd(Src, DestTy)) {
     // Okay, we can transform this!  Insert the new expression now.
     DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
-          " to avoid sign extend: " << CI);
+                 " to avoid sign extend: " << CI << '\n');
     Value *Res = EvaluateInDifferentType(Src, DestTy, true);
     assert(Res->getType() == DestTy);
@@ -1149,9 +1213,9 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
 }
 
-/// FitsInFPType - Return a Constant* for the specified FP constant if it fits
+/// Return a Constant* for the specified floating-point constant if it fits
 /// in the specified FP type without changing its value.
-static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { bool losesInfo; APFloat F = CFP->getValueAPF(); (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); @@ -1160,12 +1224,12 @@ static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { return nullptr; } -/// LookThroughFPExtensions - If this is an fp extension instruction, look +/// If this is a floating-point extension instruction, look /// through it until we get the source value. -static Value *LookThroughFPExtensions(Value *V) { +static Value *lookThroughFPExtensions(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::FPExt) - return LookThroughFPExtensions(I->getOperand(0)); + return lookThroughFPExtensions(I->getOperand(0)); // If this value is a constant, return the constant in the smallest FP type // that can accurately represent it. This allows us to turn @@ -1174,14 +1238,14 @@ static Value *LookThroughFPExtensions(Value *V) { if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext())) return V; // No constant folding of this. // See if the value can be truncated to half and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEhalf)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf)) return V; // See if the value can be truncated to float and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle)) return V; if (CFP->getType()->isDoubleTy()) return V; // Won't shrink. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble)) return V; // Don't try to shrink to various long double types. } @@ -1193,7 +1257,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to - // simpilify this expression to avoid one or more of the trunc/extend + // simplify this expression to avoid one or more of the trunc/extend // operations if we can do so without changing the numerical results. // // The exact manner in which the widths of the operands interact to limit @@ -1201,8 +1265,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // is explained below in the various case statements. BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0)); if (OpI && OpI->hasOneUse()) { - Value *LHSOrig = LookThroughFPExtensions(OpI->getOperand(0)); - Value *RHSOrig = LookThroughFPExtensions(OpI->getOperand(1)); + Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0)); + Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1)); unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth(); unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth(); @@ -1307,10 +1371,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // (fptrunc (select cond, R1, Cst)) --> // (select cond, (fptrunc R1), (fptrunc Cst)) + // + // - but only if this isn't part of a min/max operation, else we'll + // ruin min/max canonical form which is to have the select and + // compare's operands be of the same type with no casts to look through. 
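Returning to the fitsInFPType helper renamed earlier in this hunk: its round-trip test can be pictured with native types. The sketch below assumes IEEE-754 conversion behavior and glosses over NaN payloads, which the real APFloat-based code handles precisely (illustration only, not LLVM code):

    // A double can be narrowed to float when converting back reproduces
    // the original value exactly.
    bool fitsInFloat(double D) {
      float F = static_cast<float>(D);
      return static_cast<double>(F) == D;
    }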
+ Value *LHS, *RHS; SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)); if (SI && (isa<ConstantFP>(SI->getOperand(1)) || - isa<ConstantFP>(SI->getOperand(2)))) { + isa<ConstantFP>(SI->getOperand(2))) && + matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) { Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1), CI.getType()); Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2), @@ -1327,9 +1397,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0), CI.getType()); Type *IntrinsicType[] = { CI.getType() }; - Function *Overload = - Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(), - II->getIntrinsicID(), IntrinsicType); + Function *Overload = Intrinsic::getDeclaration( + CI.getModule(), II->getIntrinsicID(), IntrinsicType); Value *Args[] = { InnerTrunc }; return CallInst::Create(Overload, Args, II->getName()); @@ -1483,12 +1552,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false); } -/// OptimizeVectorResize - This input value (which is known to have vector type) -/// is being zero extended or truncated to the specified vector type. Try to -/// replace it with a shuffle (and vector/vector bitcast) if possible. +/// This input value (which is known to have vector type) is being zero extended +/// or truncated to the specified vector type. +/// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, +static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. @@ -1548,8 +1617,8 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { return Value / Ty->getPrimitiveSizeInBits(); } -/// CollectInsertionElements - V is a value which is inserted into a vector of -/// VecEltTy. Look through the value to see if we can decompose it into +/// V is a value which is inserted into a vector of VecEltTy. +/// Look through the value to see if we can decompose it into /// insertions into the vector. See the example in the comment for /// OptimizeIntegerToVectorInsertions for the pattern this handles. /// The type of V is always a non-zero multiple of VecEltTy's size. @@ -1558,7 +1627,7 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { /// /// This returns false if the pattern can't be matched or true if it can, /// filling in Elements with the elements found here. -static bool CollectInsertionElements(Value *V, unsigned Shift, +static bool collectInsertionElements(Value *V, unsigned Shift, SmallVectorImpl<Value *> &Elements, Type *VecEltTy, bool isBigEndian) { assert(isMultipleOfTypeSize(Shift, VecEltTy) && @@ -1595,7 +1664,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, // If the constant is the size of a vector element, we just need to bitcast // it to the right type so it gets properly inserted. if (NumElts == 1) - return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), + return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), Shift, Elements, VecEltTy, isBigEndian); // Okay, this is a constant that covers multiple elements. 
Slice it up into @@ -1611,7 +1680,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), ShiftI)); Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy, isBigEndian)) return false; } @@ -1625,19 +1694,19 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Or: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian) && - CollectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, + collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. @@ -1645,7 +1714,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, if (!CI) return false; Shift += CI->getZExtValue(); if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); } @@ -1653,8 +1722,8 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, } -/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we -/// may be doing shifts and ors to assemble the elements of the vector manually. +/// If the input is an 'or' instruction, we may be doing shifts and ors to +/// assemble the elements of the vector manually. /// Try to rip the code out and replace it with insertelements. This is to /// optimize code like this: /// @@ -1667,13 +1736,13 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, /// %tmp43 = bitcast i64 %ins35 to <2 x float> /// /// Into two insertelements that do "buildvector{%inc, %inc5}". -static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, +static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); - if (!CollectInsertionElements(IntInput, 0, Elements, + if (!collectInsertionElements(IntInput, 0, Elements, DestVecTy->getElementType(), IC.getDataLayout().isBigEndian())) return nullptr; @@ -1692,63 +1761,29 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, return Result; } - -/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double -/// bitcast. The various long double bitcasts can't get in here. -static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI, InstCombiner &IC, +/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the +/// vector followed by extract element. 
The backend tends to handle bitcasts of +/// vectors better than bitcasts of scalars because vector registers are +/// usually not type-specific like scalar integer or scalar floating-point. +static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, + InstCombiner &IC, const DataLayout &DL) { - Value *Src = CI.getOperand(0); - Type *DestTy = CI.getType(); - - // If this is a bitcast from int to float, check to see if the int is an - // extraction from a vector. - Value *VecInput = nullptr; - // bitcast(trunc(bitcast(somevector))) - if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } - - unsigned Elt = 0; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } + // TODO: Create and use a pattern matcher for ExtractElementInst. + auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0)); + if (!ExtElt || !ExtElt->hasOneUse()) + return nullptr; - // bitcast(trunc(lshr(bitcast(somevector), cst)) - ConstantInt *ShAmt = nullptr; - if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), - m_ConstantInt(ShAmt)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 && - ShAmt->getZExtValue() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } + // The bitcast must be to a vectorizable type, otherwise we can't make a new + // type to extract from. + Type *DestType = BitCast.getType(); + if (!VectorType::isValidElementType(DestType)) + return nullptr; - unsigned Elt = ShAmt->getZExtValue() / DestWidth; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } - return nullptr; + unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements(); + auto *NewVecType = VectorType::get(DestType, NumElts); + auto *NewBC = IC.Builder->CreateBitCast(ExtElt->getVectorOperand(), + NewVecType, "bc"); + return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand()); } Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { @@ -1794,11 +1829,6 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - // Try to optimize int -> float bitcasts. 
- if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) - if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this, DL)) - return I; - if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType()); @@ -1815,7 +1845,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast<CastInst>(Src); if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0), + if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) return I; } @@ -1823,7 +1853,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If the input is an 'or' instruction, we may be doing shifts and ors to // assemble the elements of the vector manually. Try to rip the code out // and replace it with insertelements. - if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this)) + if (Value *V = optimizeIntegerToVectorInsertions(CI, *this)) return ReplaceInstUsesWith(CI, V); } } @@ -1872,6 +1902,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } + if (Instruction *I = canonicalizeBitCastExtElt(CI, *this, DL)) + return I; + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 95bba3c..c0786af 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -216,8 +216,6 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, Max = KnownOne|UnknownBits; } - - /// FoldCmpLoadFromIndexedGlobal - Called we see this pattern: /// cmp pred (load (gep GV, ...)), cmpcst /// where GV is a global variable with a constant initializer. Try to simplify @@ -371,7 +369,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, } } - // If this element is in range, update our magic bitvector. if (i < 64 && IsTrueForElt) MagicBitvector |= 1ULL << i; @@ -469,7 +466,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End); } - // If a magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 @@ -496,7 +492,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return nullptr; } - /// EvaluateGEPOffsetExpression - Return a value that can be used to compare /// the *offset* implied by a GEP to zero. For example, if we have &A[i], we /// want to return 'i' for "icmp ne i, 0". Note that, in general, indices can @@ -562,8 +557,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, } } - - // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. 
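The EvaluateGEPOffsetExpression logic whose context closes the hunk above reduces a pointer comparison to an offset comparison: for &A[i] against &A[0], the addresses differ exactly when i is nonzero. A minimal standalone C++ sketch of that identity (the local array is a hypothetical stand-in for the GEP base, not anything from the patch):

#include <cassert>

int main() {
  int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  for (int I = 0; I < 8; ++I) {
    // icmp ne (gep A, I), A  <=>  icmp ne I, 0: with a nonzero element
    // size, the two addresses differ exactly when the offset is nonzero.
    assert((&A[I] != &A[0]) == (I != 0));
  }
  return 0;
}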
@@ -737,6 +730,83 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return nullptr; } +Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, + Value *Other) { + assert(ICI.isEquality() && "Cannot fold non-equality comparison."); + + // It would be tempting to fold away comparisons between allocas and any + // pointer not based on that alloca (e.g. an argument). However, even + // though such pointers cannot alias, they can still compare equal. + // + // But LLVM doesn't specify where allocas get their memory, so if the alloca + // doesn't escape we can argue that it's impossible to guess its value, and we + // can therefore act as if any such guesses are wrong. + // + // The code below checks that the alloca doesn't escape, and that it's only + // used in a comparison once (the current instruction). The + // single-comparison-use condition ensures that we're trivially folding all + // comparisons against the alloca consistently, and avoids the risk of + // erroneously folding a comparison of the pointer with itself. + + unsigned MaxIter = 32; // Break cycles and bound to constant-time. + + SmallVector<Use *, 32> Worklist; + for (Use &U : Alloca->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + + unsigned NumCmps = 0; + while (!Worklist.empty()) { + assert(Worklist.size() <= MaxIter); + Use *U = Worklist.pop_back_val(); + Value *V = U->getUser(); + --MaxIter; + + if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) || + isa<SelectInst>(V)) { + // Track the uses. + } else if (isa<LoadInst>(V)) { + // Loading from the pointer doesn't escape it. + continue; + } else if (auto *SI = dyn_cast<StoreInst>(V)) { + // Storing *to* the pointer is fine, but storing the pointer escapes it. + if (SI->getValueOperand() == U->get()) + return nullptr; + continue; + } else if (isa<ICmpInst>(V)) { + if (NumCmps++) + return nullptr; // Found more than one cmp. + continue; + } else if (auto *Intrin = dyn_cast<IntrinsicInst>(V)) { + switch (Intrin->getIntrinsicID()) { + // These intrinsics don't escape or compare the pointer. Memset is safe + // because we don't allow ptrtoint. Memcpy and memmove are safe because + // we don't allow stores, so src cannot point to V. + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: + continue; + default: + return nullptr; + } + } else { + return nullptr; + } + for (Use &U : V->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + } + + Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); + return ReplaceInstUsesWith( + ICI, + ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); +} + /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X". Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, @@ -851,7 +921,6 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, // to the same result value. HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false); } - } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0. if (CmpRHSV == 0) { // (X / pos) op 0 // Can't overflow. e.g. 
X/2 op 0 --> [-1, 2) @@ -996,7 +1065,6 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, return Res; } - // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; @@ -1074,18 +1142,22 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A, if (AP1 == AP2) return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); - // Get the distance between the highest bit that's set. int Shift; - // Both the constants are negative, take their positive to calculate log. if (IsAShr && AP1.isNegative()) - // Get the ones' complement of AP2 and AP1 when computing the distance. - Shift = (~AP2).logBase2() - (~AP1).logBase2(); + Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes(); else - Shift = AP2.logBase2() - AP1.logBase2(); + Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros(); if (Shift > 0) { - if (IsAShr ? AP1 == AP2.ashr(Shift) : AP1 == AP2.lshr(Shift)) + if (IsAShr && AP1 == AP2.ashr(Shift)) { + // There are multiple solutions if we are comparing against -1 and the LHS + // of the ashr is not a power of two. + if (AP1.isAllOnesValue() && !AP2.isPowerOf2()) + return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift)); + return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } else if (AP1 == AP2.lshr(Shift)) { return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } } // Shifting const2 will never be equal to const1. return getConstant(false); @@ -1145,6 +1217,14 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, switch (LHSI->getOpcode()) { case Instruction::Trunc: + if (RHS->isOne() && RHSV.getBitWidth() > 1) { + // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI->getOperand(0), m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } if (ICI.isEquality() && LHSI->hasOneUse()) { // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all // of the high bits truncated out of x are known. @@ -1447,9 +1527,35 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE, LHSI->getOperand(0), SubOne(RHS)); + + // (icmp eq (and %A, C), 0) -> (icmp sgt (trunc %A), -1) + // iff C is a power of 2 + if (ICI.isEquality() && LHSI->hasOneUse() && match(RHS, m_Zero())) { + if (auto *CI = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { + const APInt &AI = CI->getValue(); + int32_t ExactLogBase2 = AI.exactLogBase2(); + if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { + Type *NTy = IntegerType::get(ICI.getContext(), ExactLogBase2 + 1); + Value *Trunc = Builder->CreateTrunc(LHSI->getOperand(0), NTy); + return new ICmpInst(ICI.getPredicate() == ICmpInst::ICMP_EQ + ? 
ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + Trunc, Constant::getNullValue(NTy)); + } + } + } break; case Instruction::Or: { + if (RHS->isOne()) { + // icmp slt signum(V) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI, m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } + if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse()) break; Value *P, *Q; @@ -2083,11 +2189,9 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // If the pattern matches, truncate the inputs to the narrower type and // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. - Module *M = I.getParent()->getParent()->getParent(); - Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); - Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, - NewType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy *Builder = IC.Builder; @@ -2123,6 +2227,12 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, return true; }; + // If the overflow check was an add followed by a compare, the insertion point + // may be pointing to the compare. We want to insert the new instructions + // before the add in case there are uses of the add between the add and the + // compare. + Builder->SetInsertPoint(&OrigI); + switch (OCF) { case OCF_INVALID: llvm_unreachable("bad overflow check kind!"); @@ -2223,7 +2333,9 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); - Instruction *MulInstr = cast<Instruction>(MulVal); + auto *MulInstr = dyn_cast<Instruction>(MulVal); + if (!MulInstr) + return nullptr; assert(MulInstr->getOpcode() == Instruction::Mul); auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)), @@ -2357,7 +2469,6 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, InstCombiner::BuilderTy *Builder = IC.Builder; Builder->SetInsertPoint(MulInstr); - Module *M = I.getParent()->getParent()->getParent(); // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) Value *MulA = A, *MulB = B; @@ -2365,8 +2476,8 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder->CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder->CreateZExt(B, MulType); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul"); IC.Worklist.Add(MulInstr); @@ -2468,7 +2579,6 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, default: return APInt::getAllOnesValue(BitWidth); } - } /// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst @@ -2905,7 +3015,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_NE: { @@ -2950,7 +3059,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_ULT: @@ -3103,7 +3211,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // comparison into the select arms, which will cause one to be // constant folded and 
the select turned into a bitwise or. Value *Op1 = nullptr, *Op2 = nullptr; - ConstantInt *CI = 0; + ConstantInt *CI = nullptr; if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); CI = dyn_cast<ConstantInt>(Op1); @@ -3177,6 +3285,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ICmpInst::getSwappedPredicate(I.getPredicate()), I)) return NI; + // Try to optimize equality comparisons against alloca-based pointers. + if (Op0->getType()->isPointerTy() && I.isEquality()) { + assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op1)) + return New; + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op0)) + return New; + } + // Test to see if the operands of the icmp are casted versions of other // values. If the ptr->ptr cast can be stripped off both arguments, we do so // now. @@ -3304,6 +3423,26 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); + // icmp sgt X, (Y + -1) -> icmp sge X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); + + // icmp sle X, (Y + -1) -> icmp slt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); + + // icmp sge X, (Y + 1) -> icmp sgt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); + + // icmp slt X, (Y + 1) -> icmp sle X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); + // if C1 has greater magnitude than C2: // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y // s.t. C3 = C1 - C2 @@ -3473,6 +3612,18 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } } + + if (BO0) { + // Transform A & (L - 1) `ult` L --> L != 0 + auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); + auto BitwiseAnd = + m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); + + if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { + auto *Zero = Constant::getNullValue(BO0->getType()); + return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); + } + } } { Value *A, *B; @@ -3697,15 +3848,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); - // Check to see that the input is converted from an integer type that is small - // enough that preserves all bits. TODO: check here for "known" sign bits. - // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. - unsigned InputSize = IntTy->getScalarSizeInBits(); - - // If this is a uitofp instruction, we need an extra bit to hold the sign. bool LHSUnsigned = isa<UIToFPInst>(LHSI); - if (LHSUnsigned) - ++InputSize; if (I.isEquality()) { FCmpInst::Predicate P = I.getPredicate(); @@ -3732,13 +3875,30 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // equality compares as integer? } - // Comparisons with zero are a special case where we know we won't lose - // information. 
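The new `A & (L - 1) ult L --> L != 0` transform in the visitICmpInst hunk above is exhaustively checkable: for L == 0 nothing is unsigned-less-than zero, and for L != 0 the masked value is at most L - 1. A brute-force sketch over i8, with plain unsigned arithmetic standing in for APInt:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned L = 0; L < 256; ++L) {
      bool Original = (A & ((L - 1) & 0xFF)) < L; // A & (L - 1) `ult` L on i8
      bool Folded = (L != 0);                     // the replacement
      assert(Original == Folded);
    }
  return 0;
}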
- bool IsCmpZero = RHS.isPosZero(); + // Check to see that the input is converted from an integer type that is small + // enough that preserves all bits. TODO: check here for "known" sign bits. + // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. + unsigned InputSize = IntTy->getScalarSizeInBits(); - // If the conversion would lose info, don't hack on this. - if ((int)InputSize > MantissaWidth && !IsCmpZero) - return nullptr; + // Following test does NOT adjust InputSize downwards for signed inputs, + // because the most negative value still requires all the mantissa bits + // to distinguish it from one less than that value. + if ((int)InputSize > MantissaWidth) { + // Conversion would lose accuracy. Check if loss can impact comparison. + int Exp = ilogb(RHS); + if (Exp == APFloat::IEK_Inf) { + int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics())); + if (MaxExponent < (int)InputSize - !LHSUnsigned) + // Conversion could create infinity. + return nullptr; + } else { + // Note that if RHS is zero or NaN, then Exp is negative + // and first condition is trivially false. + if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned) + // Conversion could affect comparison. + return nullptr; + } + } // Otherwise, we can potentially simplify the comparison. We know that it // will always come through as an integer value and we know the constant is diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ac934f1..e4e5065 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -281,6 +281,7 @@ public: ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); + Instruction *FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, Value *Other); Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I); Instruction *commonCastTransforms(CastInst &CI); @@ -341,6 +342,7 @@ public: const unsigned SIOpd); private: + bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; @@ -360,6 +362,11 @@ private: /// \brief Try to optimize a sequence of instructions checking if an operation /// on LHS and RHS overflows. /// + /// If this overflow check is done via one of the overflow check intrinsics, + /// then CtxI has to be the call instruction calling that intrinsic. If this + /// overflow check is done by arithmetic followed by a compare, then CtxI has + /// to be the arithmetic instruction. + /// /// If a simplification is possible, stores the simplified result of the /// operation in OperationResult and result of the overflow check in /// OverflowResult, and return true. If no simplification is possible, @@ -393,7 +400,7 @@ public: assert(New && !New->getParent() && "New instruction already inserted into a basic block!"); BasicBlock *BB = Old.getParent(); - BB->getInstList().insert(&Old, New); // Insert inst + BB->getInstList().insert(Old.getIterator(), New); // Insert inst Worklist.Add(New); return New; } @@ -407,7 +414,7 @@ public: /// \brief A combiner-aware RAUW-like routine. /// /// This method is to be used when an instruction is found to be dead, - /// replacable with another preexisting expression. 
Here we add all uses of + /// replaceable with another preexisting expression. Here we add all uses of /// I to the worklist, replace all uses of I with the new value, then return /// I, so that the inst combiner will know that I was modified. Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) { @@ -539,6 +546,7 @@ private: Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); + Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN); Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); @@ -548,7 +556,7 @@ private: Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned, bool Inside); Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); - Instruction *MatchBSwap(BinaryOperator &I); + Instruction *MatchBSwapOrBitReverse(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e3179db..47406b9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" @@ -90,21 +91,23 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (CS.isCallee(&U)) continue; + unsigned DataOpNo = CS.getDataOperandNo(&U); + bool IsArgOperand = CS.isArgOperand(&U); + // Inalloca arguments are clobbered by the call. - unsigned ArgNo = CS.getArgumentNo(&U); - if (CS.isInAllocaArgument(ArgNo)) + if (IsArgOperand && CS.isInAllocaArgument(DataOpNo)) return false; // If this is a readonly/readnone call site, then we know it is just a // load (but one that potentially returns the value itself), so we can // ignore it if we know that the value isn't captured. if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) + (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo))) continue; // If this is being passed as a byval argument, the caller is making a // copy, so it is only a read of the alloca. - if (CS.isByValArgument(ArgNo)) + if (IsArgOperand && CS.isByValArgument(DataOpNo)) continue; } @@ -186,7 +189,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { // Scan to the end of the allocation instructions, to skip over a block of // allocas if possible...also skip interleaved debug info // - BasicBlock::iterator It = New; + BasicBlock::iterator It(New); while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It; @@ -367,7 +370,13 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT MDB.createRange(NonNullInt, NullInt)); } break; - + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These only directly apply if the new type is also a pointer. 
+ if (NewTy->isPointerTy()) + NewLoad->setMetadata(ID, N); + break; case LLVMContext::MD_range: // FIXME: It would be nice to propagate this in some way, but the type // conversions make it hard. If the new type is a pointer, we could @@ -418,6 +427,9 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value case LLVMContext::MD_invariant_load: case LLVMContext::MD_nonnull: case LLVMContext::MD_range: + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: // These don't apply for stores. break; } @@ -511,16 +523,46 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { if (!T->isAggregateType()) return nullptr; - assert(LI.getAlignment() && "Alignement must be set at this point"); + assert(LI.getAlignment() && "Alignment must be set at this point"); if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), ".unpack"); return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, LI.getName())); } + + // We don't want to break loads with padding here as we'd loose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return nullptr; + + auto Name = LI.getName(); + SmallString<16> LoadName = Name; + LoadName += ".unpack"; + SmallString<16> EltName = Name; + EltName += ".elt"; + auto *Addr = LI.getPointerOperand(); + Value *V = UndefValue::get(T); + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName); + auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName); + V = IC.Builder->CreateInsertValue(V, L, i); + } + + V->setName(Name); + return IC.ReplaceInstUsesWith(LI, V); } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -681,7 +723,7 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, // FIXME: If the GEP is not inbounds, and there are extra indices after the // one we'll replace, those could cause the address computation to wrap // (rendering the IsAllNonNegative() check below insufficient). We can do - // better, ignoring zero indicies (and other indicies we can prove small + // better, ignoring zero indices (and other indices we can prove small // enough not to wrap). if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds()) return false; @@ -748,19 +790,19 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. 
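unpackLoadToAggregate above splits a single-struct load into one load per element feeding insertvalue, and deliberately bails on padded layouts so the knowledge that padding exists is not lost. In C++ terms the rewrite amounts to the equivalence below (the Pair type is an illustrative stand-in for a padding-free struct, not part of the patch):

#include <cassert>
#include <cstdint>

struct Pair { uint32_t A; uint32_t B; }; // padding-free, so SL->hasPadding() would be false

int main() {
  Pair P = {7, 9};
  Pair Whole = P;  // the original aggregate load
  Pair Rebuilt;    // the unpacked form, built element by element
  Rebuilt.A = P.A; // load of GEP {0, 0}, then insertvalue at index 0
  Rebuilt.B = P.B; // load of GEP {0, 1}, then insertvalue at index 1
  assert(Whole.A == Rebuilt.A && Whole.B == Rebuilt.B);
  return 0;
}

The same element-by-element pattern, with extractvalue feeding per-field stores, appears in unpackStoreToAggregate further down.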
- BasicBlock::iterator BBI = &LI; + BasicBlock::iterator BBI(LI); AAMDNodes AATags; - if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI, - 6, AA, &AATags)) { + if (Value *AvailableVal = + FindAvailableLoadedValue(Op, LI.getParent(), BBI, + DefMaxInstsToScan, AA, &AATags)) { if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) { unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull, + LLVMContext::MD_invariant_group, LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null}; combineMetadata(NLI, &LI, KnownIDs); }; @@ -822,7 +864,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } // load (select (cond, null, P)) -> load P - if (isa<ConstantPointerNull>(SI->getOperand(1)) && + if (isa<ConstantPointerNull>(SI->getOperand(1)) && LI.getPointerAddressSpace() == 0) { LI.setOperand(0, SI->getOperand(2)); return &LI; @@ -857,7 +899,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { /// /// \returns true if the store was successfully combined away. This indicates /// the caller must erase the store instruction. We have to let the caller erase -/// the store instruction sas otherwise there is no way to signal whether it was +/// the store instruction as otherwise there is no way to signal whether it was /// combined or not: IC.EraseInstFromFunction returns a null pointer. static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and atomic @@ -893,11 +935,38 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { V = IC.Builder->CreateExtractValue(V, 0); combineStoreToNewValue(IC, SI, V); return true; } + + // We don't want to break loads with padding here as we'd loose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return false; + + SmallString<16> EltName = V->getName(); + EltName += ".elt"; + auto *Addr = SI.getPointerOperand(); + SmallString<16> AddrName = Addr->getName(); + AddrName += ".repack"; + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), AddrName); + auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); + IC.Builder->CreateStore(Val, Ptr); + } + + return true; } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -971,9 +1040,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return &SI; } - // Don't hack volatile/atomic stores. - // FIXME: Some bits are legal for atomic stores; needs refactoring. - if (!SI.isSimple()) return nullptr; + // Don't hack volatile/ordered stores. + // FIXME: Some bits are legal for ordered atomic stores; needs refactoring. 
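When visitLoadInst above forwards an earlier loaded value and one load replaces another, combineMetadata merges the two instructions' metadata over the expanded KnownIDs list so nothing unsound survives; for presence-style kinds such as !nonnull that conservative merge behaves like keeping only what both loads carry. A loose sketch of that idea only (string kind names and a plain set intersection are illustrative, not the LLVM API, and some kinds merge differently, e.g. !range takes the most general range):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <set>
#include <string>

int main() {
  std::set<std::string> NewLoad = {"tbaa", "nonnull", "align"};
  std::set<std::string> OldLoad = {"tbaa", "range"};
  std::set<std::string> KeptOnBoth;
  // Keep only metadata both loads can vouch for.
  std::set_intersection(NewLoad.begin(), NewLoad.end(),
                        OldLoad.begin(), OldLoad.end(),
                        std::inserter(KeptOnBoth, KeptOnBoth.begin()));
  assert(KeptOnBoth == std::set<std::string>{"tbaa"});
  return 0;
}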
+ if (!SI.isUnordered()) return nullptr; // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. @@ -991,7 +1060,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // Do really simple DSE, to catch cases where there are several consecutive // stores to the same location, separated by a few arithmetic operations. This // situation often occurs with bitfield accesses. - BasicBlock::iterator BBI = &SI; + BasicBlock::iterator BBI(SI); for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts; --ScanInsts) { --BBI; @@ -1005,7 +1074,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? - if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), + if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1))) { ++NumDeadStore; ++BBI; @@ -1019,9 +1088,10 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // the pointer we're loading and is producing the pointer we're storing, // then *this* store is dead (X = load P; store X -> P). if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { - if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && - LI->isSimple()) + if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) { + assert(SI.isUnordered() && "can't eliminate ordering operation"); return EraseInstFromFunction(SI); + } // Otherwise, this is a load from some other location. Stores before it // may not be dead. @@ -1047,10 +1117,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (isa<UndefValue>(Val)) return EraseInstFromFunction(SI); + // The code below needs to be audited and adjusted for unordered atomics + if (!SI.isSimple()) + return nullptr; + // If this store is the last instruction in the basic block (possibly // excepting debug info instructions), and if the block ends with an // unconditional branch, try to move it to the successor block. - BBI = &SI; + BBI = SI.getIterator(); do { ++BBI; } while (isa<DbgInfoIntrinsic>(BBI) || @@ -1106,7 +1180,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; // Verify that the other block ends in a branch and is not otherwise empty. - BasicBlock::iterator BBI = OtherBB->getTerminator(); + BasicBlock::iterator BBI(OtherBB->getTerminator()); BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); if (!OtherBr || BBI == OtherBB->begin()) return false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a554e9f..160792b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -22,9 +22,9 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// simplifyValueKnownNonZero - The specific integer value is used in a context -/// where it is known to be non-zero. If this allows us to simplify the -/// computation, do so and return the new operand, otherwise return null. +/// The specific integer value is used in a context where it is known to be +/// non-zero. If this allows us to simplify the computation, do so and return +/// the new operand, otherwise return null. 
static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, Instruction &CxtI) { // If V has multiple uses, then we would have to do more analysis to determine @@ -76,8 +76,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, } -/// MultiplyOverflows - True if the multiply can not be expressed in an int -/// this size. +/// True if the multiply can not be expressed in an int this size. static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product, bool IsSigned) { bool Overflow; @@ -95,6 +94,14 @@ static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient, assert(C1.getBitWidth() == C2.getBitWidth() && "Inconsistent width of constants!"); + // Bail if we will divide by zero. + if (C2.isMinValue()) + return false; + + // Bail if we would divide INT_MIN by -1. + if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue()) + return false; + APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned); if (IsSigned) APInt::sdivrem(C1, C2, Quotient, Remainder); @@ -629,7 +636,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { // if pattern detected emit alternate sequence if (OpX && OpY) { BuilderTy::FastMathFlagGuard Guard(*Builder); - Builder->SetFastMathFlags(Log2->getFastMathFlags()); + Builder->setFastMathFlags(Log2->getFastMathFlags()); Log2->setArgOperand(0, OpY); Value *FMulVal = Builder->CreateFMul(OpX, Log2); Value *FSub = Builder->CreateFSub(FMulVal, OpX); @@ -645,7 +652,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { bool IgnoreZeroSign = I.hasNoSignedZeros(); if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) { BuilderTy::FastMathFlagGuard Guard(*Builder); - Builder->SetFastMathFlags(I.getFastMathFlags()); + Builder->setFastMathFlags(I.getFastMathFlags()); Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign); Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign); @@ -686,7 +693,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Y) { BuilderTy::FastMathFlagGuard Guard(*Builder); - Builder->SetFastMathFlags(I.getFastMathFlags()); + Builder->setFastMathFlags(I.getFastMathFlags()); Value *T = Builder->CreateFMul(Opnd1, Opnd1); Value *R = Builder->CreateFMul(T, Y); @@ -705,8 +712,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return Changed ? &I : nullptr; } -/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select -/// instruction. +/// Try to fold a divide or remainder of a select instruction. bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { SelectInst *SI = cast<SelectInst>(I.getOperand(1)); @@ -740,7 +746,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { return true; // Scan the current block backward, looking for other uses of SI. 
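The early bail-outs added to IsMultiple above refuse the two division inputs APInt::sdivrem/udivrem must never see: a zero divisor, and the signed pair INT_MIN / -1, whose quotient needs one bit more than the operand width. A sketch of the overflow case on i8:

#include <cassert>
#include <cstdint>

int main() {
  // i8's INT_MIN / -1 is +128, one more bit than i8 can represent, and a
  // zero divisor has no quotient at all -- the two cases IsMultiple refuses.
  int WideQuotient = int(INT8_MIN) / -1;
  assert(WideQuotient == 128);     // representable once widened...
  assert(WideQuotient > INT8_MAX); // ...but it overflows the original i8
  return 0;
}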
- BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin(); + BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin(); while (BBI != BBFront) { --BBI; @@ -754,10 +760,10 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { I != E; ++I) { if (*I == SI) { *I = SI->getOperand(NonNullOperand); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } else if (*I == SelectCond) { *I = Builder->getInt1(NonNullOperand == 1); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 460f6eb..f1aa98b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "instcombine" @@ -245,7 +246,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { /// non-address-taken alloca. Doing so will cause us to not promote the alloca /// to a register. static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { - BasicBlock::iterator BBI = L, E = L->getParent()->end(); + BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end(); for (++BBI; BBI != E; ++BBI) if (BBI->mayWriteToMemory()) @@ -349,24 +350,40 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { Value *InVal = FirstLI->getOperand(0); NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); + LoadInst *NewLI = new LoadInst(NewPN, "", isVolatile, LoadAlignment); + + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null, + }; - // Add all operands to the new PHI. + for (unsigned ID : KnownIDs) + NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + + // Add all operands to the new PHI and combine TBAA metadata. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { - Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0); + LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i)); + combineMetadata(NewLI, LI, KnownIDs); + Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } - Value *PhiVal; if (InVal) { // The new PHI unions all of the same values together. This is really // common, so we handle it intelligently here for compile-time speed. - PhiVal = InVal; + NewLI->setOperand(0, InVal); delete NewPN; } else { InsertNewInstBefore(NewPN, PN); - PhiVal = NewPN; } // If this was a volatile load that we are merging, make sure to loop through @@ -376,17 +393,94 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { for (Value *IncValue : PN.incoming_values()) cast<LoadInst>(IncValue)->setVolatile(false); - LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment); NewLI->setDebugLoc(FirstLI->getDebugLoc()); return NewLI; } +/// TODO: This function could handle other cast types, but then it might +/// require special-casing a cast from the 'i1' type. See the comment in +/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. 
+Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = Phi.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + + // Early exit for the common case of a phi with two operands. These are + // handled elsewhere. See the comment below where we check the count of zexts + // and constants for more details. + unsigned NumIncomingValues = Phi.getNumIncomingValues(); + if (NumIncomingValues < 3) + return nullptr; + // Find the narrower type specified by the first zext. + Type *NarrowType = nullptr; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + NarrowType = Zext->getSrcTy(); + break; + } + } + if (!NarrowType) + return nullptr; + + // Walk the phi operands checking that we only have zexts or constants that + // we can shrink for free. Store the new operands for the new phi. + SmallVector<Value *, 4> NewIncoming; + unsigned NumZexts = 0; + unsigned NumConsts = 0; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + // All zexts must be identical and have one use. + if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse()) + return nullptr; + NewIncoming.push_back(Zext->getOperand(0)); + NumZexts++; + } else if (auto *C = dyn_cast<Constant>(V)) { + // Make sure that constants can fit in the new type. + Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType); + if (ConstantExpr::getZExt(Trunc, C->getType()) != C) + return nullptr; + NewIncoming.push_back(Trunc); + NumConsts++; + } else { + // If it's not a cast or a constant, bail out. + return nullptr; + } + } + + // The more common cases of a phi with no constant operands or just one + // variable operand are handled by FoldPHIArgOpIntoPHI() and FoldOpIntoPhi() + // respectively. FoldOpIntoPhi() wants to do the opposite transform that is + // performed here. It tries to replicate a cast in the phi operand's basic + // block to expose other folding opportunities. Thus, InstCombine will + // infinite loop without this check. + if (NumConsts == 0 || NumZexts < 2) + return nullptr; + + // All incoming values are zexts or constants that are safe to truncate. + // Create a new phi node of the narrow type, phi together all of the new + // operands, and zext the result back to the original type. + PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues, + Phi.getName() + ".shrunk"); + for (unsigned i = 0; i != NumIncomingValues; ++i) + NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i)); + + InsertNewInstBefore(NewPhi, Phi); + return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); +} /// If all operands to a PHI node are the same "unary" operator and they all are /// only used by the PHI, PHI together their inputs, and do the operation once, /// to the result of the PHI. Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = PN.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); if (isa<GetElementPtrInst>(FirstInst)) @@ -740,7 +834,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { } // Otherwise, do an extract in the predecessor. 
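FoldPHIArgZextsIntoPHI above narrows a phi whose incoming values are all one-use zexts from a common source type, or constants that survive a trunc/zext round-trip, into a narrow phi followed by a single zext. The two facts it relies on, sketched on i8 -> i32 with a select standing in for the phi merge (values here are made up for illustration):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t A = 200, B = 7;
  for (bool Cond : {false, true}) {
    // Widening commutes with the merge point.
    uint32_t ZextOfPhi = uint32_t(Cond ? A : B);
    uint32_t PhiOfZexts = Cond ? uint32_t(A) : uint32_t(B);
    assert(ZextOfPhi == PhiOfZexts);
  }
  // A constant incoming value is legal only if it survives the
  // ConstantExpr::getTrunc/getZExt round-trip performed above.
  assert(uint32_t(uint8_t(200)) == 200u); // fits i8: may be shrunk
  assert(uint32_t(uint8_t(300)) != 300u); // does not fit: phi is rejected
  return 0;
}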
- Builder->SetInsertPoint(Pred, Pred->getTerminator()); + Builder->SetInsertPoint(Pred->getTerminator()); Value *Res = InVal; if (Offset) Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(), @@ -787,6 +881,9 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC)) return ReplaceInstUsesWith(PN, V); + if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN)) + return Result; + // If all PHI operands are the same operation, pull them through the PHI, // reducing code size. if (isa<Instruction>(PN.getIncomingValue(0)) && diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f51442a..51219bc 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -38,7 +38,8 @@ getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) { } } -static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { +static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF, + bool Ordered=false) { switch (SPF) { default: llvm_unreachable("unhandled!"); @@ -51,17 +52,22 @@ static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { return ICmpInst::ICMP_SGT; case SPF_UMAX: return ICmpInst::ICMP_UGT; + case SPF_FMINNUM: + return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; + case SPF_FMAXNUM: + return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; } } static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder, SelectPatternFlavor SPF, Value *A, Value *B) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF); + assert(CmpInst::isIntPredicate(Pred)); return Builder->CreateSelect(Builder->CreateICmp(Pred, A, B), A, B); } -/// GetSelectFoldableOperands - We want to turn code that looks like this: +/// We want to turn code that looks like this: /// %C = or %A, %B /// %D = select %cond, %C, %A /// into: @@ -90,8 +96,8 @@ static unsigned GetSelectFoldableOperands(Instruction *I) { } } -/// GetSelectFoldableConstant - For the same transformation as the previous -/// function, return the identity constant that goes into the select. +/// For the same transformation as the previous function, return the identity +/// constant that goes into the select. static Constant *GetSelectFoldableConstant(Instruction *I) { switch (I->getOpcode()) { default: llvm_unreachable("This cannot happen!"); @@ -110,7 +116,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) { } } -/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI +/// Here we have (select c, TI, FI), and we know that TI and FI /// have the same opcode and only one use each. Try to simplify this. Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI) { @@ -197,8 +203,8 @@ static bool isSelect01(Constant *C1, Constant *C2) { C2I->isOne() || C2I->isAllOnesValue(); } -/// FoldSelectIntoOp - Try fold the select into one of the operands to -/// facilitate further optimization. +/// Try to fold the select into one of the operands to allow further +/// optimization. 
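getCmpPredicateForMinMax above grows an Ordered flag because the float min/max flavors split into ordered (OLT/OGT) and unordered (ULT/UGT) compares, which select different operands once a NaN is involved. A standalone sketch of that difference:

#include <cassert>
#include <cmath>

int main() {
  double A = NAN, B = 1.0;
  bool OLT = (A < B);   // FCMP_OLT: ordered, false when either input is NaN
  bool ULT = !(A >= B); // FCMP_ULT: unordered, true when either input is NaN
  double SelOrdered = OLT ? A : B;
  double SelUnordered = ULT ? A : B;
  assert(SelOrdered == 1.0);        // the ordered form falls through to B
  assert(std::isnan(SelUnordered)); // the unordered form returns the NaN
  return 0;
}

Hence SPF_FMINNUM maps to FCMP_OLT only when the matched pattern was the ordered form, and to FCMP_ULT otherwise.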
Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, Value *FalseVal) { // See the comment above GetSelectFoldableOperands for a description of the @@ -276,7 +282,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, return nullptr; } -/// foldSelectICmpAndOr - We want to turn: +/// We want to turn: /// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) /// into: /// (or (shl (and X, C1), C3), y) @@ -394,9 +400,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, return nullptr; } -/// visitSelectInstWithICmp - Visit a SelectInst that has an -/// ICmpInst as its first operand. -/// +/// Visit a SelectInst that has an ICmpInst as its first operand. Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { bool Changed = false; @@ -595,10 +599,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } -/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a -/// PHI node (but the two may be in different blocks). See if the true/false -/// values (V) are live in all of the predecessor blocks of the PHI. For -/// example, cases like this cannot be mapped: +/// SI is a select whose condition is a PHI node (but the two may be in +/// different blocks). See if the true/false values (V) are live in all of the +/// predecessor blocks of the PHI. For example, cases like this can't be mapped: /// /// X = phi [ C1, BB1], [C2, BB2] /// Y = add @@ -632,7 +635,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, return false; } -/// FoldSPFofSPF - We have an SPF (e.g. a min or max) of an SPF of the form: +/// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, @@ -745,10 +748,10 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, return nullptr; } -/// foldSelectICmpAnd - If one of the constants is zero (we know they can't -/// both be) and we have an icmp instruction with zero, and we have an 'and' -/// with the non-constant value and a power of two we can turn the select -/// into a shift on the result of the 'and'. +/// If one of the constants is zero (we know they can't both be) and we have an +/// icmp instruction with zero, and we have an 'and' with the non-constant value +/// and a power of two we can turn the select into a shift on the result of the +/// 'and'. static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, ConstantInt *FalseVal, InstCombiner::BuilderTy *Builder) { @@ -926,6 +929,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->setFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal, FCI->getName() + ".inv"); @@ -967,6 +972,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? 
X : Y if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->setFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal, FCI->getName() + ".inv"); @@ -1054,35 +1061,50 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } // See if we can fold the select into one of our operands. - if (SI.getType()->isIntOrIntVectorTy()) { + if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) { if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal)) return FoldI; Value *LHS, *RHS, *LHS2, *RHS2; Instruction::CastOps CastOp; - SelectPatternFlavor SPF = matchSelectPattern(&SI, LHS, RHS, &CastOp); + SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); + auto SPF = SPR.Flavor; - if (SPF) { + if (SelectPatternResult::isMinOrMax(SPF)) { // Canonicalize so that type casts are outside select patterns. if (LHS->getType()->getPrimitiveSizeInBits() != SI.getType()->getPrimitiveSizeInBits()) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); - Value *Cmp = Builder->CreateICmp(Pred, LHS, RHS); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered); + + Value *Cmp; + if (CmpInst::isIntPredicate(Pred)) { + Cmp = Builder->CreateICmp(Pred, LHS, RHS); + } else { + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); + Builder->setFastMathFlags(FMF); + Cmp = Builder->CreateFCmp(Pred, LHS, RHS); + } + Value *NewSI = Builder->CreateCast(CastOp, Builder->CreateSelect(Cmp, LHS, RHS), SI.getType()); return ReplaceInstUsesWith(SI, NewSI); } + } + if (SPF) { // MAX(MAX(a, b), a) -> MAX(a, b) // MIN(MIN(a, b), a) -> MIN(a, b) // MAX(MIN(a, b), a) -> a // MIN(MAX(a, b), a) -> a - if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2)) + // ABS(ABS(a)) -> ABS(a) + // NABS(NABS(a)) -> NABS(a) + if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) return R; - if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2)) + if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2, SI, SPF, LHS)) return R; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index d04ed58..0c7defa 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -55,7 +55,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return nullptr; } -/// CanEvaluateShifted - See if we can compute the specified value, but shifted +/// See if we can compute the specified value, but shifted /// logically to the left or right by some number of bits. This should return /// true if the expression can be computed for the same cost as the current /// expression tree. This is used to eliminate extraneous shifting from things @@ -184,7 +184,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, } } -/// GetShiftedValue - When CanEvaluateShifted returned true for an expression, +/// When CanEvaluateShifted returned true for an expression, /// this value inserts the new computation that produces the shifted value. 
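The FoldSPFofSPF rules referenced above (MAX(MAX(a,b),a) -> MAX(a,b) and friends) are plain lattice identities; a small exhaustive check in C++:

#include <algorithm>
#include <cassert>

int main() {
  for (int A = -3; A <= 3; ++A)
    for (int B = -3; B <= 3; ++B) {
      assert(std::max(std::max(A, B), A) == std::max(A, B)); // MAX(MAX(a,b),a)
      assert(std::min(std::min(A, B), A) == std::min(A, B)); // MIN(MIN(a,b),a)
      assert(std::max(std::min(A, B), A) == A);              // MAX(MIN(a,b),a)
      assert(std::min(std::max(A, B), A) == A);              // MIN(MAX(a,b),a)
    }
  return 0;
}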
static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, InstCombiner &IC, const DataLayout &DL) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 80628b2..743d514 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -410,9 +410,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If this is a select as part of a min/max pattern, don't simplify any // further in case we break the structure. Value *LHS, *RHS; - if (matchSelectPattern(I, LHS, RHS) != SPF_UNKNOWN) + if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN) return nullptr; - + if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero, RHSKnownOne, Depth + 1) || SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero, @@ -1057,7 +1057,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts); if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) { for (unsigned i = 0; i < VWidth; i++) { - if (CV->getAggregateElement(i)->isNullValue()) + Constant *CElt = CV->getAggregateElement(i); + // Method isNullValue always returns false when called on a + // ConstantExpr. If CElt is a ConstantExpr then skip it in order to + // to avoid propagating incorrect information. + if (isa<ConstantExpr>(CElt)) + continue; + if (CElt->isNullValue()) LeftDemanded.clearBit(i); else RightDemanded.clearBit(i); @@ -1082,6 +1088,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (!VTy) break; unsigned InVWidth = VTy->getNumElements(); APInt InputDemandedElts(InVWidth, 0); + UndefElts2 = APInt(InVWidth, 0); unsigned Ratio; if (VWidth == InVWidth) { @@ -1089,29 +1096,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // elements as are demanded of us. Ratio = 1; InputDemandedElts = DemandedElts; - } else if (VWidth > InVWidth) { - // Untested so far. - break; - - // If there are more elements in the result than there are in the source, - // then an input element is live if any of the corresponding output - // elements are live. - Ratio = VWidth/InVWidth; - for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an input element is live if any of the + // corresponding output elements are live. + Ratio = VWidth / InVWidth; + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) if (DemandedElts[OutIdx]) - InputDemandedElts.setBit(OutIdx/Ratio); - } - } else { - // Untested so far. - break; - - // If there are more elements in the source than there are in the result, - // then an input element is live if the corresponding output element is - // live. - Ratio = InVWidth/VWidth; + InputDemandedElts.setBit(OutIdx / Ratio); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an input element is live if the + // corresponding output element is live. 
+ Ratio = InVWidth / VWidth; for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (DemandedElts[InIdx/Ratio]) + if (DemandedElts[InIdx / Ratio]) InputDemandedElts.setBit(InIdx); + } else { + // Unsupported so far. + break; } // div/rem demand all inputs, because they don't want divide by zero. @@ -1122,24 +1125,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, MadeChange = true; } - UndefElts = UndefElts2; - if (VWidth > InVWidth) { - llvm_unreachable("Unimp"); - // If there are more elements in the result than there are in the source, - // then an output element is undef if the corresponding input element is - // undef. + if (VWidth == InVWidth) { + UndefElts = UndefElts2; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an output element is undef if the + // corresponding input element is undef. for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) - if (UndefElts2[OutIdx/Ratio]) + if (UndefElts2[OutIdx / Ratio]) + UndefElts.setBit(OutIdx); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an output element is undef if all of the + // corresponding input elements are undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); + if (SubUndef.countPopulation() == Ratio) UndefElts.setBit(OutIdx); - } else if (VWidth < InVWidth) { + } + } else { llvm_unreachable("Unimp"); - // If there are more elements in the source than there are in the result, - // then a result element is undef if all of the corresponding input - // elements are undef. - UndefElts = ~0ULL >> (64-VWidth); // Start out all undef. - for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (!UndefElts2[InIdx]) // Not undef? - UndefElts.clearBit(InIdx/Ratio); // Clear undef bit. } break; } @@ -1237,6 +1242,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // like undef&0. The result is known zero, not undef. UndefElts &= UndefElts2; break; + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. + case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2); + break; } break; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 2730472..5cde31a 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -22,10 +22,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// CheapToScalarize - Return true if the value is cheaper to scalarize than it -/// is to leave as a vector operation. isConstant indicates whether we're -/// extracting one known element. If false we're extracting a variable index. -static bool CheapToScalarize(Value *V, bool isConstant) { +/// Return true if the value is cheaper to scalarize than it is to leave as a +/// vector operation. isConstant indicates whether we're extracting one known +/// element. If false we're extracting a variable index. 
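The rewritten bitcast case above replaces the two untested VWidth > InVWidth / VWidth < InVWidth branches with mappings that work whenever one element count divides the other. A standalone model of the demanded-element direction, with std::vector<bool> standing in for APInt (invented names, not LLVM code):

#include <vector>

std::vector<bool> demandedInputElts(const std::vector<bool> &DemandedOut,
                                    unsigned InVWidth) {
  unsigned VWidth = DemandedOut.size();
  std::vector<bool> DemandedIn(InVWidth, false);
  if (VWidth % InVWidth == 0) {
    // Wider output: an input element is live if any of the Ratio output
    // elements it expands into is live.
    unsigned Ratio = VWidth / InVWidth;
    for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
      if (DemandedOut[OutIdx])
        DemandedIn[OutIdx / Ratio] = true;
  } else if (InVWidth % VWidth == 0) {
    // Narrower output: an input element is live if the single output
    // element it contributes to is live.
    unsigned Ratio = InVWidth / VWidth;
    for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
      if (DemandedOut[InIdx / Ratio])
        DemandedIn[InIdx] = true;
  }
  return DemandedIn;
}

The undef-element direction runs the same ratios in reverse: a wide output element is undef if the input element it came from is undef, while a narrow output element is undef only when all Ratio input elements feeding it are undef, which is what the countPopulation() == Ratio test above checks.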
+static bool cheapToScalarize(Value *V, bool isConstant) { if (Constant *C = dyn_cast<Constant>(V)) { if (isConstant) return true; @@ -50,13 +50,13 @@ static bool CheapToScalarize(Value *V, bool isConstant) { return true; if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) if (BO->hasOneUse() && - (CheapToScalarize(BO->getOperand(0), isConstant) || - CheapToScalarize(BO->getOperand(1), isConstant))) + (cheapToScalarize(BO->getOperand(0), isConstant) || + cheapToScalarize(BO->getOperand(1), isConstant))) return true; if (CmpInst *CI = dyn_cast<CmpInst>(I)) if (CI->hasOneUse() && - (CheapToScalarize(CI->getOperand(0), isConstant) || - CheapToScalarize(CI->getOperand(1), isConstant))) + (cheapToScalarize(CI->getOperand(0), isConstant) || + cheapToScalarize(CI->getOperand(1), isConstant))) return true; return false; @@ -82,7 +82,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || - !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) + !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true)) return nullptr; // Create a scalar PHI node that will replace the vector PHI node @@ -115,8 +115,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { Instruction *pos = dyn_cast<Instruction>(PHIInVal); BasicBlock::iterator InsertPos; if (pos && !isa<PHINode>(pos)) { - InsertPos = pos; - ++InsertPos; + InsertPos = ++pos->getIterator(); } else { InsertPos = inBB->getFirstInsertionPt(); } @@ -137,7 +136,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If vector val is constant with all elements the same, replace EI with // that element. We handle a known element # below. if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) - if (CheapToScalarize(C, false)) + if (cheapToScalarize(C, false)) return ReplaceInstUsesWith(EI, C->getAggregateElement(0U)); // If extracting a specified index from the vector, see if we can recursively @@ -163,7 +162,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } } - // If the this extractelement is directly using a bitcast from a vector of + // If this extractelement is directly using a bitcast from a vector of // the same number of elements, see if we can find the source element from // it. In this case, we will end up needing to bitcast the scalars. if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { @@ -184,10 +183,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) { // Push extractelement into predecessor operation if legal and - // profitable to do so + // profitable to do so. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { if (I->hasOneUse() && - CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { + cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { Value *newEI0 = Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1), EI.getName()+".lhs"); @@ -230,8 +229,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { SrcIdx, false)); } } else if (CastInst *CI = dyn_cast<CastInst>(I)) { - // Canonicalize extractelement(cast) -> cast(extractelement) - // bitcasts can change the number of vector elements and they cost nothing + // Canonicalize extractelement(cast) -> cast(extractelement). 
+ // Bitcasts can change the number of vector elements, and they cost + // nothing. if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { Value *EE = Builder->CreateExtractElement(CI->getOperand(0), EI.getIndexOperand()); @@ -245,7 +245,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // fight the vectorizer. // If we are extracting an element from a vector select or a select on - // vectors, a select on the scalars extracted from the vector arguments. + // vectors, create a select on the scalars extracted from the vector + // arguments. Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); @@ -275,10 +276,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { return nullptr; } -/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns -/// elements from either LHS or RHS, return the shuffle mask and true. -/// Otherwise, return false. -static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, +/// If V is a shuffle of values that ONLY returns elements from either LHS or +/// RHS, return the shuffle mask and true. Otherwise, return false. +static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, SmallVectorImpl<Constant*> &Mask) { assert(LHS->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); @@ -315,7 +315,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector. // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted undef. Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext())); return true; @@ -330,7 +330,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted value. if (EI->getOperand(0) == LHS) { Mask[InsertedIdx % NumElts] = @@ -352,6 +352,58 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, return false; } +/// If we have insertion into a vector that is wider than the vector that we +/// are extracting from, try to widen the source vector to allow a single +/// shufflevector to replace one or more insert/extract pairs. +static void replaceExtractElements(InsertElementInst *InsElt, + ExtractElementInst *ExtElt, + InstCombiner &IC) { + VectorType *InsVecType = InsElt->getType(); + VectorType *ExtVecType = ExtElt->getVectorOperandType(); + unsigned NumInsElts = InsVecType->getVectorNumElements(); + unsigned NumExtElts = ExtVecType->getVectorNumElements(); + + // The inserted-to vector must be wider than the extracted-from vector. + if (InsVecType->getElementType() != ExtVecType->getElementType() || + NumExtElts >= NumInsElts) + return; + + // Create a shuffle mask to widen the extended-from vector using undefined + // values. The mask selects all of the values of the original vector followed + // by as many undefined values as needed to create a vector of the same length + // as the inserted-to vector. 
+ SmallVector<Constant *, 16> ExtendMask; + IntegerType *IntType = Type::getInt32Ty(InsElt->getContext()); + for (unsigned i = 0; i < NumExtElts; ++i) + ExtendMask.push_back(ConstantInt::get(IntType, i)); + for (unsigned i = NumExtElts; i < NumInsElts; ++i) + ExtendMask.push_back(UndefValue::get(IntType)); + + Value *ExtVecOp = ExtElt->getVectorOperand(); + auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), + ConstantVector::get(ExtendMask)); + + // Insert the new shuffle after the vector operand of the extract is defined + // (as long as it's not a PHI) or at the start of the basic block of the + // extract, so any subsequent extracts in the same basic block can use it. + // TODO: Insert before the earliest ExtractElementInst that is replaced. + auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp); + if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) + WideVec->insertAfter(ExtVecOpInst); + else + IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt()); + + // Replace extracts from the original narrow vector with extracts from the new + // wide vector. + for (User *U : ExtVecOp->users()) { + ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U); + if (!OldExt || OldExt->getParent() != WideVec->getParent()) + continue; + auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1)); + NewExt->insertAfter(WideVec); + IC.ReplaceInstUsesWith(*OldExt, NewExt); + } +} /// We are building a shuffle to create V, which is a sequence of insertelement, /// extractelement pairs. If PermittedRHS is set, then we must either use it or @@ -363,9 +415,10 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, /// often been chosen carefully to be efficiently implementable on the target. typedef std::pair<Value *, Value *> ShuffleOps; -static ShuffleOps CollectShuffleElements(Value *V, +static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<Constant *> &Mask, - Value *PermittedRHS) { + Value *PermittedRHS, + InstCombiner &IC) { assert(V->getType()->isVectorTy() && "Invalid shuffle!"); unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); @@ -396,10 +449,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // otherwise we'd end up with a shuffle of three inputs. if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { Value *RHS = EI->getOperand(0); - ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); + ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC); assert(LR.second == nullptr || LR.second == RHS); if (LR.first->getType() != RHS->getType()) { + // Although we are giving up for now, see if we can create extracts + // that match the inserts for another round of combining. + replaceExtractElements(IEI, EI, IC); + // We tried our best, but we can't find anything compatible with RHS // further up the chain. Return a trivial shuffle. for (unsigned i = 0; i < NumElts; ++i) @@ -429,14 +486,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // If this insertelement is a chain that comes from exactly these two // vectors, return the vector and the effective shuffle. if (EI->getOperand(0)->getType() == PermittedRHS->getType() && - CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, + collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, Mask)) return std::make_pair(EI->getOperand(0), PermittedRHS); } } } - // Otherwise, can't do anything fancy. Return an identity vector. + // Otherwise, we can't do anything fancy. Return an identity vector. 
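The mask that replaceExtractElements builds is just the first NumExtElts lane indices followed by undef padding up to NumInsElts lanes. The same construction in a standalone sketch, using the -1 convention for an undef lane (an invented helper, not LLVM code):

#include <vector>

// For NumExtElts = 2 and NumInsElts = 4 this yields {0, 1, -1, -1}.
std::vector<int> widenMask(unsigned NumExtElts, unsigned NumInsElts) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumExtElts; ++i)
    Mask.push_back(static_cast<int>(i));  // keep the original lanes
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    Mask.push_back(-1);                   // pad with undef lanes
  return Mask;
}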
for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); return std::make_pair(V, nullptr); @@ -512,7 +569,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { // (and any insertelements it points to), into one big shuffle. if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) { SmallVector<Constant*, 16> Mask; - ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr); + ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this); // The proposed shuffle may be trivial, in which case we shouldn't // perform the combine. @@ -588,8 +645,8 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::GetElementPtr: { - for (int i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) + for (Value *Operand : I->operands()) { + if (!CanEvaluateShuffled(Operand, Mask, Depth-1)) return false; } return true; @@ -617,7 +674,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, /// Rebuild a new instruction just like 'I' but with the new operands given. /// In the event of type mismatch, the type of the operands is correct. -static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { +static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) { // We don't want to use the IRBuilder here because we want the replacement // instructions to appear next to 'I', not the builder's insertion point. switch (I->getOpcode()) { @@ -760,7 +817,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { NeedsRebuild |= (V != I->getOperand(i)); } if (NeedsRebuild) { - return BuildNew(I, NewOps); + return buildNew(I, NewOps); } return I; } @@ -792,7 +849,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } -static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, +static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, bool &isLHSID, bool &isRHSID) { isLHSID = isRHSID = true; @@ -891,7 +948,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (VWidth == LHSWidth) { // Analyze the shuffle: are the LHS or RHS identity shuffles? bool isLHSID, isRHSID; - RecognizeIdentityMask(Mask, isLHSID, isRHSID); + recognizeIdentityMask(Mask, isLHSID, isRHSID); // Eliminate identity shuffles. if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); @@ -1177,7 +1234,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If the result mask is an identity, replace uses of this instruction with // the corresponding argument.
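recognizeIdentityMask treats a shuffle as an LHS identity when every defined lane i reads element i, and as an RHS identity when every defined lane i reads element i + NumElts; an undef lane is compatible with both. A standalone model, assuming both operands have the same element count as the result (invented names, not LLVM code):

#include <vector>

void recognizeIdentity(const std::vector<int> &Mask,
                       bool &IsLHSID, bool &IsRHSID) {
  IsLHSID = IsRHSID = true;
  int N = static_cast<int>(Mask.size());
  for (int i = 0; i != N; ++i) {
    if (Mask[i] < 0)
      continue;                    // undef lane matches either operand
    IsLHSID &= (Mask[i] == i);     // lane i of the first operand
    IsRHSID &= (Mask[i] == i + N); // lane i of the second operand
  }
}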
bool isLHSID, isRHSID; - RecognizeIdentityMask(newMask, isLHSID, isRHSID); + recognizeIdentityMask(newMask, isLHSID, isRHSID); if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS); if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fd34a24..903a0b5 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,8 +42,9 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -79,14 +80,12 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(Builder, DL, GEP); } -/// ShouldChangeType - Return true if it is desirable to convert a computation -/// from 'From' to 'To'. We don't want to convert from a legal to an illegal -/// type for example, or from a smaller to a larger illegal type. -bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { - assert(From->isIntegerTy() && To->isIntegerTy()); - - unsigned FromWidth = From->getPrimitiveSizeInBits(); - unsigned ToWidth = To->getPrimitiveSizeInBits(); +/// Return true if it is desirable to convert an integer computation from a +/// given bit width to a new bit width. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(unsigned FromWidth, + unsigned ToWidth) const { bool FromLegal = DL.isLegalInteger(FromWidth); bool ToLegal = DL.isLegalInteger(ToWidth); @@ -103,6 +102,17 @@ bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { return true; } +/// Return true if it is desirable to convert a computation from 'From' to 'To'. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { + assert(From->isIntegerTy() && To->isIntegerTy()); + + unsigned FromWidth = From->getPrimitiveSizeInBits(); + unsigned ToWidth = To->getPrimitiveSizeInBits(); + return ShouldChangeType(FromWidth, ToWidth); +} + // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does @@ -156,27 +166,26 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) { I.setFastMathFlags(FMF); } -/// SimplifyAssociativeOrCommutative - This performs a few simplifications for -/// operators which are associative or commutative: -// -// Commutative operators: -// -// 1. Order operands such that they are listed from right (least complex) to -// left (most complex). This puts constants before unary operators before -// binary operators. -// -// Associative operators: -// -// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. -// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. -// -// Associative and commutative operators: -// -// 4. 
Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. -// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. -// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" -// if C1 and C2 are constants. -// +/// This performs a few simplifications for operators that are associative or +/// commutative: +/// +/// Commutative operators: +/// +/// 1. Order operands such that they are listed from right (least complex) to +/// left (most complex). This puts constants before unary operators before +/// binary operators. +/// +/// Associative operators: +/// +/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. +/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. +/// +/// Associative and commutative operators: +/// +/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. +/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. +/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" +/// if C1 and C2 are constants. bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Instruction::BinaryOps Opcode = I.getOpcode(); bool Changed = false; @@ -322,7 +331,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { } while (1); } -/// LeftDistributesOverRight - Whether "X LOp (Y ROp Z)" is always equal to +/// Return whether "X LOp (Y ROp Z)" is always equal to /// "(X LOp Y) ROp (X LOp Z)". static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -361,7 +370,7 @@ static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, } } -/// RightDistributesOverLeft - Whether "(X LOp Y) ROp Z" is always equal to +/// Return whether "(X LOp Y) ROp Z" is always equal to /// "(X ROp Z) LOp (Y ROp Z)". static bool RightDistributesOverLeft(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -519,7 +528,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, if (isa<OverflowingBinaryOperator>(Op1)) HasNSW &= Op1->hasNoSignedWrap(); - // We can propogate 'nsw' if we know that + // We can propagate 'nsw' if we know that // %Y = mul nsw i16 %X, C // %Z = add nsw i16 %Y, %X // => @@ -537,11 +546,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, return SimplifiedInst; } -/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations -/// which some other binary operation distributes over either by factorizing -/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this -/// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is -/// a win). Returns the simplified value, or null if it didn't simplify. +/// This tries to simplify binary operations which some other binary operation +/// distributes over either by factorizing out common terms +/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in +/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). +/// Returns the simplified value, or null if it didn't simplify. 
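As a quick numeric check of the two rewrite directions described above, under ordinary C++ integer arithmetic (an invented example, not part of the patch):

#include <cassert>

int main() {
  int A = 7, B = 3, C = 5;
  // Factorize a common term: (A*B)+(A*C) -> A*(B+C).
  assert(A * B + A * C == A * (B + C));
  // Expand when it simplifies: A & (B | C) -> (A&B) | (A&C).
  assert((A & (B | C)) == ((A & B) | (A & C)));
  return 0;
}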
Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); @@ -623,12 +632,38 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { } } + // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0)) + // (op (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (op c, d))) + if (auto *SI0 = dyn_cast<SelectInst>(LHS)) { + if (auto *SI1 = dyn_cast<SelectInst>(RHS)) { + if (SI0->getCondition() == SI1->getCondition()) { + Value *SI = nullptr; + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect(SI0->getCondition(), + Builder->CreateBinOp(TopLevelOpcode, + SI0->getTrueValue(), + SI1->getTrueValue()), + V); + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(), + SI1->getTrueValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect( + SI0->getCondition(), V, + Builder->CreateBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue())); + if (SI) { + SI->takeName(&I); + return SI; + } + } + } + } + return nullptr; } -// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction -// if the LHS is a constant zero (which is the 'negate' form). -// +/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a +/// constant zero (which is the 'negate' form). Value *InstCombiner::dyn_castNegVal(Value *V) const { if (BinaryOperator::isNeg(V)) return BinaryOperator::getNegArgument(V); @@ -644,10 +679,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { return nullptr; } -// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the -// instruction if the LHS is a constant negative zero (which is the 'negate' -// form). -// +/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is +/// a constant negative zero (which is the 'negate' form). Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { if (BinaryOperator::isFNeg(V, IgnoreZeroSign)) return BinaryOperator::getFNegArgument(V); @@ -700,10 +733,10 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, llvm_unreachable("Unknown binary instruction type!"); } -// FoldOpIntoSelect - Given an instruction with a select as one operand and a -// constant as the other operand, try to fold the binary operator into the -// select arguments. This also works for Cast instructions, which obviously do -// not have a second operand. +/// Given an instruction with a select as one operand and a constant as the +/// other operand, try to fold the binary operator into the select arguments. +/// This also works for Cast instructions, which obviously do not have a second +/// operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions if (!SI->hasOneUse()) return nullptr; @@ -752,10 +785,9 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return nullptr; } -/// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which -/// has a PHI node as operand #0, see if we can fold the instruction into the -/// PHI (which is only possible if all operands to the PHI are constants). 
-/// +/// Given a binary operator, cast instruction, or select which has a PHI node as +/// operand #0, see if we can fold the instruction into the PHI (which is only +/// possible if all operands to the PHI are constants). Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast<PHINode>(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); @@ -819,7 +851,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { NewPN->takeName(PN); // If we are going to have to insert a new computation, do so right before the - // predecessors terminator. + // predecessor's terminator. if (NonConstBB) Builder->SetInsertPoint(NonConstBB->getTerminator()); @@ -893,10 +925,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return ReplaceInstUsesWith(I, NewPN); } -/// FindElementAtOffset - Given a pointer type and a constant offset, determine -/// whether or not there is a sequence of GEP indices into the pointed type that -/// will land us at the specified offset. If so, fill them into NewIndices and -/// return the resultant element type, otherwise return null. +/// Given a pointer type and a constant offset, determine whether or not there +/// is a sequence of GEP indices into the pointed type that will land us at the +/// specified offset. If so, fill them into NewIndices and return the resultant +/// element type, otherwise return null. Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, SmallVectorImpl<Value *> &NewIndices) { Type *Ty = PtrTy->getElementType(); @@ -965,8 +997,8 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) { return true; } -/// Descale - Return a value X such that Val = X * Scale, or null if none. If -/// the multiplication is known not to overflow then NoSignedWrap is set. +/// Return a value X such that Val = X * Scale, or null if none. +/// If the multiplication is known not to overflow, then NoSignedWrap is set. Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!"); assert(cast<IntegerType>(Val->getType())->getBitWidth() == @@ -1008,11 +1040,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // 0'th operand of Val. std::pair<Instruction*, unsigned> Parent; - // RequireNoSignedWrap - Set if the transform requires a descaling at deeper - // levels that doesn't overflow. + // Set if the transform requires a descaling at deeper levels that doesn't + // overflow. bool RequireNoSignedWrap = false; - // logScale - log base 2 of the scale. Negative if not a power of 2. + // Log base 2 of the scale. Negative if not a power of 2. int32_t logScale = Scale.exactLogBase2(); for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down @@ -1213,16 +1245,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { /// specified one but with other operands. 
static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS, InstCombiner::BuilderTy *B) { - Value *BORes = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); - if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BORes)) { - if (isa<OverflowingBinaryOperator>(NewBO)) { - NewBO->setHasNoSignedWrap(Inst.hasNoSignedWrap()); - NewBO->setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap()); - } - if (isa<PossiblyExactOperator>(NewBO)) - NewBO->setIsExact(Inst.isExact()); - } - return BORes; + Value *BO = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); + // If LHS and RHS are constant, BO won't be a binary operator. + if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO)) + NewBO->copyIRFlags(&Inst); + return BO; } /// \brief Makes transformation of binary operation specific for vector types. @@ -1256,9 +1283,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { LShuf->getMask() == RShuf->getMask()) { Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0), RShuf->getOperand(0), Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask()); - return Res; } } @@ -1294,18 +1320,11 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { } if (MayChange) { Constant *C2 = ConstantVector::get(C2M); - Value *NewLHS, *NewRHS; - if (isa<Constant>(LHS)) { - NewLHS = C2; - NewRHS = Shuffle->getOperand(0); - } else { - NewLHS = Shuffle->getOperand(0); - NewRHS = C2; - } + Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0); + Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2; Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(Inst.getType()), Shuffle->getMask()); - return Res; } } @@ -1323,7 +1342,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. bool MadeChange = false; - Type *IntPtrTy = DL.getIntPtrType(GEP.getPointerOperandType()); + Type *IntPtrTy = + DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; @@ -1333,21 +1353,25 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!SeqTy) continue; + // Index type should have the same width as IntPtr + Type *IndexTy = (*I)->getType(); + Type *NewIndexType = IndexTy->isVectorTy() ? + VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy; + // If the element type has zero size then any index over it is equivalent // to an index of zero, so replace it with zero if it is not zero already. if (SeqTy->getElementType()->isSized() && DL.getTypeAllocSize(SeqTy->getElementType()) == 0) if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { - *I = Constant::getNullValue(IntPtrTy); + *I = Constant::getNullValue(NewIndexType); MadeChange = true; } - Type *IndexTy = (*I)->getType(); - if (IndexTy != IntPtrTy) { + if (IndexTy != NewIndexType) { // If we are using a wider index than needed for this platform, shrink // it to what we need. If narrower, sign-extend it to what we need. // This explicit cast can make subsequent optimizations more obvious. 
- *I = Builder->CreateIntCast(*I, IntPtrTy, true); + *I = Builder->CreateIntCast(*I, NewIndexType, true); MadeChange = true; } } @@ -1421,8 +1445,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } - GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + // If not all GEPs are identical we'll have to create a new PHI node. + // Check that the old PHI node has only one use so that it will get + // removed. + if (DI != -1 && !PN->hasOneUse()) + return nullptr; + GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. @@ -1432,11 +1461,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // All the GEPs feeding the PHI differ at a single offset. Clone a GEP // into the current block so it can be merged, and create a new PHI to // set that index. - Instruction *InsertPt = Builder->GetInsertPoint(); - Builder->SetInsertPoint(PN); - PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), - PN->getNumOperands()); - Builder->SetInsertPoint(InsertPt); + PHINode *NewPN; + { + IRBuilderBase::InsertPointGuard Guard(*Builder); + Builder->SetInsertPoint(PN); + NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + } for (auto &I : PN->operands()) NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), @@ -1790,7 +1821,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { I->takeName(BCI); - BCI->getParent()->getInstList().insert(BCI, I); + BCI->getParent()->getInstList().insert(BCI->getIterator(), I); ReplaceInstUsesWith(*BCI, I); } return &GEP; @@ -1931,7 +1962,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) { // Replace invoke with a NOP intrinsic to maintain the original CFG - Module *M = II->getParent()->getParent()->getParent(); + Module *M = II->getModule(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), None, "", II->getParent()); @@ -2280,9 +2311,10 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { } if (LoadInst *L = dyn_cast<LoadInst>(Agg)) // If the (non-volatile) load only has one use, we can rewrite this to a - // load from a GEP. This reduces the size of the load. - // FIXME: If a load is used only by extractvalue instructions then this - // could be done regardless of having multiple uses. + // load from a GEP. This reduces the size of the load. If a load is used + // only by extractvalue instructions then this either must have been + // optimized before, or it is a struct with padding, in which case we + // don't want to do the transformation as it loses padding knowledge. if (L->isSimple() && L->hasOneUse()) { // extractvalue has integer indices, getelementptr has Value*s. Convert. SmallVector<Value*, 4> Indices; @@ -2294,7 +2326,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // We need to insert these at the location of the old load, not at that of // the extractvalue. 
- Builder->SetInsertPoint(L->getParent(), L); + Builder->SetInsertPoint(L); Value *GEP = Builder->CreateInBoundsGEP(L->getType(), L->getPointerOperand(), Indices); // Returning the load directly will cause the main loop to insert it in @@ -2312,7 +2344,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { return nullptr; } -/// isCatchAll - Return 'true' if the given typeinfo will match anything. +/// Return 'true' if the given typeinfo will match anything. static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { switch (Personality) { case EHPersonality::GNU_C: @@ -2330,6 +2362,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { case EHPersonality::MSVC_X86SEH: case EHPersonality::MSVC_Win64SEH: case EHPersonality::MSVC_CXX: + case EHPersonality::CoreCLR: return TypeInfo->isNullValue(); } llvm_unreachable("invalid enum"); @@ -2441,10 +2474,24 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { SawCatchAll = true; break; } - if (AlreadyCaught.count(TypeInfo)) - // Already caught by an earlier clause, so having it in the filter - // is pointless. - continue; + + // Even if we've seen a type in a catch clause, we don't want to + // remove it from the filter. An unexpected type handler may be + // set up for a call site which throws an exception of the same + // type caught. In order for the exception thrown by the unexpected + // handler to propagate correctly, the filter must be correctly + // described for the call site. + // + // Example: + // + // void unexpected() { throw 1;} + // void foo() throw (int) { + // std::set_unexpected(unexpected); + // try { + // throw 2.0; + // } catch (int i) {} + // } + // There is no point in having multiple copies of the same typeinfo in // a filter, so only add it if we didn't already. if (SeenInFilter.insert(TypeInfo).second) @@ -2637,15 +2684,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } -/// TryToSinkInstruction - Try to move the specified instruction from its -/// current block into the beginning of DestBlock, which can only happen if it's -/// safe to move the instruction past all of the instructions between it and the -/// end of its block. +/// Try to move the specified instruction from its current block into the +/// beginning of DestBlock, which can only happen if it's safe to move the +/// instruction past all of the instructions between it and the end of its +/// block. static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { assert(I->hasOneUse() && "Invariants didn't hold!"); // Cannot move control-flow-involving, volatile loads, vaarg, etc. - if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() || + if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() || isa<TerminatorInst>(I)) return false; @@ -2654,17 +2701,24 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { &DestBlock->getParent()->getEntryBlock()) return false; + // Do not sink convergent call instructions. + if (auto *CI = dyn_cast<CallInst>(I)) { + if (CI->isConvergent()) + return false; + } + // We can only sink load instructions if there is nothing between the load and // the end of the block that could change the value.
if (I->mayReadFromMemory()) { - for (BasicBlock::iterator Scan = I, E = I->getParent()->end(); + for (BasicBlock::iterator Scan = I->getIterator(), + E = I->getParent()->end(); Scan != E; ++Scan) if (Scan->mayWriteToMemory()) return false; } BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt(); - I->moveBefore(InsertPos); + I->moveBefore(&*InsertPos); ++NumSunkInst; return true; } @@ -2698,6 +2752,27 @@ bool InstCombiner::run() { } } + // In general, it is possible for computeKnownBits to determine all bits in a + // value even when the operands are not all constants. + if (!I->use_empty() && I->getType()->isIntegerTy()) { + unsigned BitWidth = I->getType()->getScalarSizeInBits(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I); + if ((KnownZero | KnownOne).isAllOnesValue()) { + Constant *C = ConstantInt::get(I->getContext(), KnownOne); + DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << + " from: " << *I << '\n'); + + // Add operands to the worklist. + ReplaceInstUsesWith(*I, C); + ++NumConstProp; + EraseInstFromFunction(*I); + MadeIRChange = true; + continue; + } + } + // See if we can trivially sink this instruction to a successor basic block. if (I->hasOneUse()) { BasicBlock *BB = I->getParent(); @@ -2738,7 +2813,7 @@ bool InstCombiner::run() { } // Now that we have an instruction, try combining it to simplify it. - Builder->SetInsertPoint(I->getParent(), I); + Builder->SetInsertPoint(I); Builder->SetCurrentDebugLocation(I->getDebugLoc()); #ifndef NDEBUG @@ -2768,7 +2843,7 @@ bool InstCombiner::run() { // Insert the new instruction into the basic block... BasicBlock *InstParent = I->getParent(); - BasicBlock::iterator InsertPos = I; + BasicBlock::iterator InsertPos = I->getIterator(); // If we replace a PHI with something that isn't a PHI, fix up the // insertion point. @@ -2801,8 +2876,8 @@ bool InstCombiner::run() { return MadeIRChange; } -/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding -/// all reachable code to the worklist. +/// Walk the function in depth-first order, adding all reachable code to the +/// worklist. /// /// This has a couple of tricks to make the code faster and more powerful. In /// particular, we constant fold and DCE instructions as we go, to avoid adding @@ -2829,7 +2904,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, continue; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { - Instruction *Inst = BBI++; + Instruction *Inst = &*BBI++; // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst, TLI)) { @@ -2900,8 +2975,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, } } - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - Worklist.push_back(TI->getSuccessor(i)); + for (BasicBlock *SuccBB : TI->successors()) + Worklist.push_back(SuccBB); } while (!Worklist.empty()); // Once we've found all of the instructions to add to instcombine's worklist, @@ -2909,8 +2984,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, // of the function down. This jives well with the way that it adds all uses // of instructions to the worklist after doing a transformation, thus avoiding // some N^2 behavior in pathological cases. 
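The new block at the top of run() constant-folds an instruction whenever computeKnownBits pins down every bit, even though not all operands are constants: if each bit is known to be either zero or one, the only possible value is the constant formed from the known-one bits. A standalone model of the final test, with 32-bit masks standing in for APInt (invented names, not LLVM code):

#include <cstdint>
#include <optional>

std::optional<uint32_t> foldIfFullyKnown(uint32_t KnownZero,
                                         uint32_t KnownOne) {
  // Every bit must be accounted for by one of the two sets.
  if ((KnownZero | KnownOne) == UINT32_MAX)
    return KnownOne;  // the value can only be this constant
  return std::nullopt;
}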
- ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], - InstrsForInstCombineWorklist.size()); + ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist); return MadeIRChange; } @@ -2930,13 +3004,13 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, // track of which blocks we visit. SmallPtrSet<BasicBlock *, 64> Visited; MadeIRChange |= - AddReachableCodeToWorklist(F.begin(), DL, Visited, ICWorklist, TLI); + AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI); // Do a quick scan over the function. If we find any blocks that are // unreachable, remove any instructions inside of them. This prevents // the instcombine code from having to deal with some bad special cases. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (Visited.count(BB)) + if (Visited.count(&*BB)) continue; // Delete the instructions backwards, as it has a reduced likelihood of @@ -2944,11 +3018,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. while (EndInst != BB->begin()) { // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; - if (!Inst->use_empty()) + Instruction *Inst = &*--EndInst->getIterator(); + if (!Inst->use_empty() && !Inst->getType()->isTokenTy()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { + if (Inst->isEHPad() || Inst->getType()->isTokenTy()) { EndInst = Inst; continue; } @@ -2968,8 +3041,6 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, LoopInfo *LI = nullptr) { - // Minimizing size? - bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize); auto &DL = F.getParent()->getDataLayout(); /// Builder - This is an IRBuilder that automatically inserts new @@ -2992,7 +3063,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist)) Changed = true; - InstCombiner IC(Worklist, &Builder, MinimizeSize, + InstCombiner IC(Worklist, &Builder, F.optForMinSize(), AA, &AC, &TLI, &DT, DL, LI); if (IC.run()) Changed = true; @@ -3046,11 +3117,12 @@ public: void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } bool InstructionCombiningPass::runOnFunction(Function &F) { @@ -3058,7 +3130,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { return false; // Required analyses. 
- auto AA = &getAnalysis<AliasAnalysis>(); + auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -3076,7 +3148,8 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index e7ef9f9..a9df5e5 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -90,7 +91,9 @@ static const char *const kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *const kAsanInitName = "__asan_init_v5"; +static const char *const kAsanInitName = "__asan_init"; +static const char *const kAsanVersionCheckName = + "__asan_version_mismatch_check_v6"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; @@ -119,6 +122,10 @@ static const unsigned kAllocaRzSize = 32; static cl::opt<bool> ClEnableKasan( "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClRecover( + "asan-recover", + cl::desc("Enable recovery mode (continue-after-error)."), + cl::Hidden, cl::init(false)); // This flag may need to be replaced with -f[no-]asan-reads. 
static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", @@ -177,7 +184,7 @@ static cl::opt<std::string> ClMemoryAccessCallbackPrefix( cl::init("__asan_")); static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas", cl::desc("instrument dynamic allocas"), - cl::Hidden, cl::init(false)); + cl::Hidden, cl::init(true)); static cl::opt<bool> ClSkipPromotableAllocas( "asan-skip-promotable-allocas", cl::desc("Do not instrument promotable allocas"), cl::Hidden, @@ -273,6 +280,11 @@ class GlobalsMetadata { GlobalsMetadata() : inited_(false) {} + void reset() { + inited_ = false; + Entries.clear(); + } + void init(Module &M) { assert(!inited_); inited_ = true; @@ -321,7 +333,7 @@ struct ShadowMapping { static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, bool IsKasan) { - bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; + bool IsAndroid = TargetTriple.isAndroid(); bool IsIOS = TargetTriple.isiOS(); bool IsFreeBSD = TargetTriple.isOSFreeBSD(); bool IsLinux = TargetTriple.isOSLinux(); @@ -338,6 +350,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, ShadowMapping Mapping; if (LongSize == 32) { + // Android is always PIE, which means that the beginning of the address + // space is always available. if (IsAndroid) Mapping.Offset = 0; else if (IsMIPS32) @@ -376,7 +390,8 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, // OR-ing shadow offset is more efficient (at least on x86) if the offset // is a power of two, but on ppc64 we have to use add since the shadow // offset is not necessarily 1/8-th of the address space. - Mapping.OrShadowOffset = !IsPPC64 && !(Mapping.Offset & (Mapping.Offset - 1)); + Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 + && !(Mapping.Offset & (Mapping.Offset - 1)); return Mapping; } @@ -389,8 +404,9 @@ static size_t RedzoneSizeForScale(int MappingScale) { /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public FunctionPass { - explicit AddressSanitizer(bool CompileKernel = false) - : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan) { + explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false) + : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan), + Recover(Recover || ClRecover) { initializeAddressSanitizerPass(*PassRegistry::getPassRegistry()); } const char *getPassName() const override { @@ -437,7 +453,9 @@ struct AddressSanitizer : public FunctionPass { Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool runOnFunction(Function &F) override; bool maybeInsertAsanInitAtFunctionEntry(Function &F); + void markEscapedLocalAllocas(Function &F); bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; static char ID; // Pass identification, replacement for typeid DominatorTree &getDominatorTree() const { return *DT; } @@ -450,10 +468,21 @@ struct AddressSanitizer : public FunctionPass { bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr, uint64_t TypeSize) const; + /// Helper to clean up per-function state.
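For reference, the mapping these parameters configure computes Shadow = (Addr >> Scale) + Offset, and OrShadowOffset swaps the add for an OR, which is only sound when the offset is a lone high bit that cannot collide with any bit of the shifted address. A standalone sketch with an invented power-of-two offset, not a real platform's constant:

#include <cstdint>

constexpr unsigned kScale = 3;            // 8 application bytes per shadow byte
constexpr uint64_t kOffset = 1ULL << 36;  // illustrative power-of-two offset

constexpr bool isPowerOfTwo(uint64_t X) { return X && !(X & (X - 1)); }

uint64_t memToShadow(uint64_t Addr, bool OrShadowOffset) {
  uint64_t Shifted = Addr >> kScale;
  // OR and ADD agree when the offset bit lies above every bit of Shifted.
  return (OrShadowOffset && isPowerOfTwo(kOffset)) ? (Shifted | kOffset)
                                                   : (Shifted + kOffset);
}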
+ struct FunctionStateRAII { + AddressSanitizer *Pass; + FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) { + assert(Pass->ProcessedAllocas.empty() && + "last pass forgot to clear cache"); + } + ~FunctionStateRAII() { Pass->ProcessedAllocas.clear(); } + }; + LLVMContext *C; Triple TargetTriple; int LongSize; bool CompileKernel; + bool Recover; Type *IntptrTy; ShadowMapping Mapping; DominatorTree *DT; @@ -477,8 +506,10 @@ struct AddressSanitizer : public FunctionPass { class AddressSanitizerModule : public ModulePass { public: - explicit AddressSanitizerModule(bool CompileKernel = false) - : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan) {} + explicit AddressSanitizerModule(bool CompileKernel = false, + bool Recover = false) + : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), + Recover(Recover || ClRecover) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid const char *getPassName() const override { return "AddressSanitizerModule"; } @@ -496,6 +527,7 @@ class AddressSanitizerModule : public ModulePass { GlobalsMetadata GlobalsMD; bool CompileKernel; + bool Recover; Type *IntptrTy; LLVMContext *C; Triple TargetTriple; @@ -525,6 +557,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { ShadowMapping Mapping; SmallVector<AllocaInst *, 16> AllocaVec; + SmallSetVector<AllocaInst *, 16> NonInstrumentedStaticAllocaVec; SmallVector<Instruction *, 8> RetVec; unsigned StackAlignment; @@ -545,12 +578,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { SmallVector<AllocaInst *, 1> DynamicAllocaVec; SmallVector<IntrinsicInst *, 1> StackRestoreVec; AllocaInst *DynamicAllocaLayout = nullptr; + IntrinsicInst *LocalEscapeCall = nullptr; // Maps Value to an AllocaInst from which the Value is originated. typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy; AllocaForValueMapTy AllocaForValue; - bool HasNonEmptyInlineAsm; + bool HasNonEmptyInlineAsm = false; + bool HasReturnsTwiceCall = false; std::unique_ptr<CallInst> EmptyInlineAsm; FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) @@ -562,7 +597,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping), StackAlignment(1 << Mapping.Scale), - HasNonEmptyInlineAsm(false), EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {} bool runOnFunction() { @@ -596,9 +630,24 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, Value *SavedStack) { IRBuilder<> IRB(InstBefore); + Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy); + // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we + // need to adjust extracted SP to compute the address of the most recent + // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for + // this purpose. 
+ if (!isa<ReturnInst>(InstBefore)) { + Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, + {IntptrTy}); + + Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {}); + + DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy), + DynamicAreaOffset); + } + IRB.CreateCall(AsanAllocasUnpoisonFunc, - {IRB.CreateLoad(DynamicAllocaLayout), - IRB.CreatePtrToInt(SavedStack, IntptrTy)}); + {IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr}); } // Unpoison dynamic allocas redzones. @@ -625,7 +674,10 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { /// \brief Collect Alloca instructions we want (and can) handle. void visitAllocaInst(AllocaInst &AI) { - if (!ASan.isInterestingAlloca(AI)) return; + if (!ASan.isInterestingAlloca(AI)) { + if (AI.isStaticAlloca()) NonInstrumentedStaticAllocaVec.insert(&AI); + return; + } StackAlignment = std::max(StackAlignment, AI.getAlignment()); if (ASan.isDynamicAlloca(AI)) @@ -639,6 +691,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { void visitIntrinsicInst(IntrinsicInst &II) { Intrinsic::ID ID = II.getIntrinsicID(); if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II); + if (ID == Intrinsic::localescape) LocalEscapeCall = &II; if (!ClCheckLifetime) return; if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end) return; @@ -660,9 +713,13 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AllocaPoisonCallVec.push_back(APC); } - void visitCallInst(CallInst &CI) { - HasNonEmptyInlineAsm |= - CI.isInlineAsm() && !CI.isIdenticalTo(EmptyInlineAsm.get()); + void visitCallSite(CallSite CS) { + Instruction *I = CS.getInstruction(); + if (CallInst *CI = dyn_cast<CallInst>(I)) { + HasNonEmptyInlineAsm |= + CI->isInlineAsm() && !CI->isIdenticalTo(EmptyInlineAsm.get()); + HasReturnsTwiceCall |= CI->canReturnTwice(); + } } // ---------------------- Helpers. @@ -689,7 +746,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { Instruction *ThenTerm, Value *ValueIfFalse); }; -} // namespace +} // anonymous namespace char AddressSanitizer::ID = 0; INITIALIZE_PASS_BEGIN( @@ -697,12 +754,15 @@ INITIALIZE_PASS_BEGIN( "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( AddressSanitizer, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) -FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel) { - return new AddressSanitizer(CompileKernel); +FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel, + bool Recover) { + assert(!CompileKernel || Recover); + return new AddressSanitizer(CompileKernel, Recover); } char AddressSanitizerModule::ID = 0; @@ -711,8 +771,10 @@ INITIALIZE_PASS( "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
"ModulePass", false, false) -ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel) { - return new AddressSanitizerModule(CompileKernel); +ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel, + bool Recover) { + assert(!CompileKernel || Recover); + return new AddressSanitizerModule(CompileKernel, Recover); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -799,8 +861,10 @@ bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) { getAllocaSizeInBytes(&AI) > 0 && // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. - (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI) || - isDynamicAlloca(AI))); + (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + // inalloca allocas are not treated as static, and we don't want + // dynamic alloca instrumentation for them as well. + !AI.isUsedWithInAlloca()); ProcessedAllocas[&AI] = IsInteresting; return IsInteresting; @@ -868,10 +932,8 @@ static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) { } else { return false; } - if (!isPointerOperand(I->getOperand(0)) || - !isPointerOperand(I->getOperand(1))) - return false; - return true; + return isPointerOperand(I->getOperand(0)) && + isPointerOperand(I->getOperand(1)); } bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { @@ -919,7 +981,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, // If initialization order checking is disabled, a simple access to a // dynamically initialized global is always valid. GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL)); - if (G != NULL && (!ClInitializers || GlobalIsLinkerInitialized(G)) && + if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) && isSafeAccess(ObjSizeVis, Addr, TypeSize)) { NumOptimizedAccessesToGlobalVar++; return; @@ -1041,13 +1103,17 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, BasicBlock *NextBB = CheckTerm->getSuccessor(0); IRB.SetInsertPoint(CheckTerm); Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); - BasicBlock *CrashBlock = + if (Recover) { + CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false); + } else { + BasicBlock *CrashBlock = BasicBlock::Create(*C, "", NextBB->getParent(), NextBB); - CrashTerm = new UnreachableInst(*C, CrashBlock); - BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); - ReplaceInstWithInst(CheckTerm, NewTerm); + CrashTerm = new UnreachableInst(*C, CrashBlock); + BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); + ReplaceInstWithInst(CheckTerm, NewTerm); + } } else { - CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true); + CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover); } Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite, @@ -1084,7 +1150,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment( void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName) { // Set up the arguments to our poison/unpoison functions. - IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt()); + IRBuilder<> IRB(&GlobalInit.front(), + GlobalInit.front().getFirstInsertionPt()); // Add a call to poison all external globals before the given function starts. 
Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy); @@ -1147,6 +1214,14 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { // Do not instrument globals from special LLVM sections. if (Section.find("__llvm") != StringRef::npos) return false; + // Do not instrument function pointers to initialization and termination + // routines: dynamic linker will not properly handle redzones. + if (Section.startswith(".preinit_array") || + Section.startswith(".init_array") || + Section.startswith(".fini_array")) { + return false; + } + // Callbacks put into the CRT initializer/terminator sections // should not be instrumented. // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 @@ -1162,10 +1237,7 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { bool TAAParsed; std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier( Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize); - if (!ErrorCode.empty()) { - assert(false && "Invalid section specifier."); - return false; - } + assert(ErrorCode.empty() && "Invalid section specifier."); // Ignore the globals from the __OBJC section. The ObjC runtime assumes // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to @@ -1383,13 +1455,11 @@ void AddressSanitizer::initializeCallbacks(Module &M) { const std::string TypeStr = AccessIsWrite ? "store" : "load"; const std::string ExpStr = Exp ? "exp_" : ""; const std::string SuffixStr = CompileKernel ? "N" : "_n"; - const std::string EndingStr = CompileKernel ? "_noabort" : ""; - const Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr; - // TODO(glider): for KASan builds add _noabort to error reporting - // functions and make them actually noabort (remove the UnreachableInst). + const std::string EndingStr = Recover ? "_noabort" : ""; + Type *ExpType = Exp ? 
Type::getInt32Ty(*C) : nullptr; AsanErrorCallbackSized[AccessIsWrite][Exp] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( - kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr, + kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr + EndingStr, IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr)); AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1400,7 +1470,7 @@ void AddressSanitizer::initializeCallbacks(Module &M) { const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex); AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( - kAsanReportErrorTemplate + ExpStr + Suffix, + kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr, IRB.getVoidTy(), IntptrTy, ExpType, nullptr)); AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( @@ -1448,15 +1518,20 @@ bool AddressSanitizer::doInitialization(Module &M) { if (!CompileKernel) { std::tie(AsanCtorFunction, AsanInitFunction) = - createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName, kAsanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}); + createSanitizerCtorAndInitFunctions( + M, kAsanModuleCtorName, kAsanInitName, + /*InitArgTypes=*/{}, /*InitArgs=*/{}, kAsanVersionCheckName); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); } Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel); return true; } +bool AddressSanitizer::doFinalization(Module &M) { + GlobalsMD.reset(); + return false; +} + bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // For each NSObject descendant having a +load method, this method is invoked // by the ObjC runtime before any of the static constructors is called. @@ -1466,13 +1541,41 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // We cannot just ignore these methods, because they may call other // instrumented functions. if (F.getName().find(" load]") != std::string::npos) { - IRBuilder<> IRB(F.begin()->begin()); + IRBuilder<> IRB(&F.front(), F.front().begin()); IRB.CreateCall(AsanInitFunction, {}); return true; } return false; } +void AddressSanitizer::markEscapedLocalAllocas(Function &F) { + // Find the one possible call to llvm.localescape and pre-mark allocas passed + // to it as uninteresting. This assumes we haven't started processing allocas + // yet. This check is done up front because iterating the use list in + // isInterestingAlloca would be algorithmically slower. + assert(ProcessedAllocas.empty() && "must process localescape before allocas"); + + // Try to get the declaration of llvm.localescape. If it's not in the module, + // we can exit early. + if (!F.getParent()->getFunction("llvm.localescape")) return; + + // Look for a call to llvm.localescape call in the entry block. It can't be in + // any other block. + for (Instruction &I : F.getEntryBlock()) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (II && II->getIntrinsicID() == Intrinsic::localescape) { + // We found a call. Mark all the allocas passed in as uninteresting. 
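+      // The arguments may be wrapped in pointer casts, so look through them
+      // before classifying each one as an alloca.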
+ for (Value *Arg : II->arg_operands()) { + AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts()); + assert(AI && AI->isStaticAlloca() && + "non-static alloca arg to localescape"); + ProcessedAllocas[AI] = false; + } + break; + } + } +} + bool AddressSanitizer::runOnFunction(Function &F) { if (&F == AsanCtorFunction) return false; if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; @@ -1488,6 +1591,12 @@ bool AddressSanitizer::runOnFunction(Function &F) { if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false; + FunctionStateRAII CleanupObj(this); + + // We can't instrument allocas used with llvm.localescape. Only static allocas + // can be passed to that intrinsic. + markEscapedLocalAllocas(F); + // We want to instrument every address only once per basic block (unless there // are calls between uses). SmallSet<Value *, 16> TempsToInstrument; @@ -1715,6 +1824,16 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() { void FunctionStackPoisoner::poisonStack() { assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); + // Insert poison calls for lifetime intrinsics for alloca. + bool HavePoisonedAllocas = false; + for (const auto &APC : AllocaPoisonCallVec) { + assert(APC.InsBefore); + assert(APC.AI); + IRBuilder<> IRB(APC.InsBefore); + poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); + HavePoisonedAllocas |= APC.DoPoison; + } + if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) { // Handle dynamic allocas. createDynamicAllocasInitStorage(); @@ -1723,7 +1842,7 @@ void FunctionStackPoisoner::poisonStack() { unpoisonDynamicAllocas(); } - if (AllocaVec.size() == 0) return; + if (AllocaVec.empty()) return; int StackMallocIdx = -1; DebugLoc EntryDebugLocation; @@ -1734,6 +1853,19 @@ void FunctionStackPoisoner::poisonStack() { IRBuilder<> IRB(InsBefore); IRB.SetCurrentDebugLocation(EntryDebugLocation); + // Make sure non-instrumented allocas stay in the entry block. Otherwise, + // debug info is broken, because only entry-block allocas are treated as + // regular stack slots. + auto InsBeforeB = InsBefore->getParent(); + assert(InsBeforeB == &F.getEntryBlock()); + for (BasicBlock::iterator I(InsBefore); I != InsBeforeB->end(); ++I) + if (auto *AI = dyn_cast<AllocaInst>(I)) + if (NonInstrumentedStaticAllocaVec.count(AI) > 0) + AI->moveBefore(InsBefore); + + // If we have a call to llvm.localescape, keep it in the entry block. + if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore); + SmallVector<ASanStackVariableDescription, 16> SVD; SVD.reserve(AllocaVec.size()); for (AllocaInst *AI : AllocaVec) { @@ -1751,10 +1883,15 @@ void FunctionStackPoisoner::poisonStack() { uint64_t LocalStackSize = L.FrameSize; bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel && LocalStackSize <= kMaxStackMallocSize; - // Don't do dynamic alloca or stack malloc in presence of inline asm: - // too often it makes assumptions on which registers are available. - bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm; - DoStackMalloc &= !HasNonEmptyInlineAsm; + bool DoDynamicAlloca = ClDynamicAllocaStack; + // Don't do dynamic alloca or stack malloc if: + // 1) There is inline asm: too often it makes assumptions on which registers + // are available. + // 2) There is a returns_twice call (typically setjmp), which is + // optimization-hostile, and doesn't play well with introduced indirect + // register-relative calculation of local variable addresses. 
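+  //    (For example, locals addressed relative to a fake-stack frame may be
+  //    recomputed from stale register values after the second return.)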
+ DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall; + DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall; Value *StaticAlloca = DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false); @@ -1804,16 +1941,6 @@ void FunctionStackPoisoner::poisonStack() { DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca; } - // Insert poison calls for lifetime intrinsics for alloca. - bool HavePoisonedAllocas = false; - for (const auto &APC : AllocaPoisonCallVec) { - assert(APC.InsBefore); - assert(APC.AI); - IRBuilder<> IRB(APC.InsBefore); - poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); - HavePoisonedAllocas |= APC.DoPoison; - } - // Replace Alloca instructions with base+offset. for (const auto &Desc : SVD) { AllocaInst *AI = Desc.AI; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index f685803..fd3dfd9 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -106,7 +106,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { } ++ChecksAdded; - Instruction *Inst = Builder->GetInsertPoint(); + BasicBlock::iterator Inst = Builder->GetInsertPoint(); BasicBlock *OldBB = Inst->getParent(); BasicBlock *Cont = OldBB->splitBasicBlock(Inst); OldBB->getTerminator()->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h new file mode 100644 index 0000000..c47fdbf --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h @@ -0,0 +1,217 @@ +//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a Union-find algorithm to compute Minimum Spanning Tree +// for a given CFG. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <string> +#include <utility> +#include <vector> + +namespace llvm { + +#define DEBUG_TYPE "cfgmst" + +/// \brief An union-find based Minimum Spanning Tree for CFG +/// +/// Implements a Union-find algorithm to compute Minimum Spanning Tree +/// for a given CFG. +template <class Edge, class BBInfo> class CFGMST { +public: + Function &F; + + // Store all the edges in CFG. It may contain some stale edges + // when Removed is set. + std::vector<std::unique_ptr<Edge>> AllEdges; + + // This map records the auxiliary information for each BB. + DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos; + + // Find the root group of the G and compress the path from G to the root. + BBInfo *findAndCompressGroup(BBInfo *G) { + if (G->Group != G) + G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group)); + return static_cast<BBInfo *>(G->Group); + } + + // Union BB1 and BB2 into the same group and return true. 
+  // Returns false if BB1 and BB2 are already in the same group.
+  bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
+    BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
+    BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
+
+    if (BB1G == BB2G)
+      return false;
+
+    // Make the smaller-rank tree a direct child of the root of the
+    // higher-rank tree.
+    if (BB1G->Rank < BB2G->Rank)
+      BB1G->Group = BB2G;
+    else {
+      BB2G->Group = BB1G;
+      // If the ranks are the same, increment the rank of the new root by one.
+      if (BB1G->Rank == BB2G->Rank)
+        BB1G->Rank++;
+    }
+    return true;
+  }
+
+  // Given a BB, return its auxiliary information.
+  BBInfo &getBBInfo(const BasicBlock *BB) const {
+    auto It = BBInfos.find(BB);
+    assert(It->second.get() != nullptr);
+    return *It->second.get();
+  }
+
+  // Traverse the CFG using a stack. Find all the edges and assign the weight.
+  // Edges with large weight will be put into MST first so they are less likely
+  // to be instrumented.
+  void buildEdges() {
+    DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+
+    const BasicBlock *BB = &(F.getEntryBlock());
+    uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+    // Add a fake edge to the entry.
+    addEdge(nullptr, BB, EntryWeight);
+
+    // Special handling for single-BB functions.
+    if (succ_empty(BB)) {
+      addEdge(BB, nullptr, EntryWeight);
+      return;
+    }
+
+    static const uint32_t CriticalEdgeMultiplier = 1000;
+
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+      TerminatorInst *TI = BB->getTerminator();
+      uint64_t BBWeight =
+          (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
+      uint64_t Weight = 2;
+      if (int successors = TI->getNumSuccessors()) {
+        for (int i = 0; i != successors; ++i) {
+          BasicBlock *TargetBB = TI->getSuccessor(i);
+          bool Critical = isCriticalEdge(TI, i);
+          uint64_t scaleFactor = BBWeight;
+          if (Critical) {
+            if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+              scaleFactor *= CriticalEdgeMultiplier;
+            else
+              scaleFactor = UINT64_MAX;
+          }
+          if (BPI != nullptr)
+            Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
+          addEdge(&*BB, TargetBB, Weight).IsCritical = Critical;
+          DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
+                       << TargetBB->getName() << " w=" << Weight << "\n");
+        }
+      } else {
+        addEdge(&*BB, nullptr, BBWeight);
+        DEBUG(dbgs() << " Edge: from " << BB->getName() << " to exit"
+                     << " w = " << BBWeight << "\n");
+      }
+    }
+  }
+
+  // Sort CFG edges by weight.
+  void sortEdgesByWeight() {
+    std::stable_sort(AllEdges.begin(), AllEdges.end(),
+                     [](const std::unique_ptr<Edge> &Edge1,
+                        const std::unique_ptr<Edge> &Edge2) {
+                       return Edge1->Weight > Edge2->Weight;
+                     });
+  }
+
+  // Traverse all the edges and compute the Minimum Weight Spanning Tree
+  // using the union-find algorithm.
+  void computeMinimumSpanningTree() {
+    // First, put all critical edges whose Dest is a landing pad into the MST.
+    // This works around insufficient support for splitting critical edges
+    // whose destination BB is a landing pad.
+    for (auto &Ei : AllEdges) {
+      if (Ei->Removed)
+        continue;
+      if (Ei->IsCritical) {
+        if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
+          if (unionGroups(Ei->SrcBB, Ei->DestBB))
+            Ei->InMST = true;
+        }
+      }
+    }
+
+    for (auto &Ei : AllEdges) {
+      if (Ei->Removed)
+        continue;
+      if (unionGroups(Ei->SrcBB, Ei->DestBB))
+        Ei->InMST = true;
+    }
+  }
+
+  // Dump the debug information about the instrumentation.
+ void dumpEdges(raw_ostream &OS, const Twine &Message) const { + if (!Message.str().empty()) + OS << Message << "\n"; + OS << " Number of Basic Blocks: " << BBInfos.size() << "\n"; + for (auto &BI : BBInfos) { + const BasicBlock *BB = BI.first; + OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " " + << BI.second->infoString() << "\n"; + } + + OS << " Number of Edges: " << AllEdges.size() + << " (*: Instrument, C: CriticalEdge, -: Removed)\n"; + uint32_t Count = 0; + for (auto &EI : AllEdges) + OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->" + << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n"; + } + + // Add an edge to AllEdges with weight W. + Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) { + uint32_t Index = BBInfos.size(); + auto Iter = BBInfos.end(); + bool Inserted; + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); + if (Inserted) { + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + Index++; + } + std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); + if (Inserted) + // Newly inserted, update the real info. + Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + AllEdges.emplace_back(new Edge(Src, Dest, W)); + return *AllEdges.back(); + } + + BranchProbabilityInfo *BPI; + BlockFrequencyInfo *BFI; + +public: + CFGMST(Function &Func, BranchProbabilityInfo *BPI_ = nullptr, + BlockFrequencyInfo *BFI_ = nullptr) + : F(Func), BPI(BPI_), BFI(BFI_) { + buildEdges(); + sortEdgesByWeight(); + computeMinimumSpanningTree(); + } +}; + +#undef DEBUG_TYPE // "cfgmst" +} // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2de6e1a..d459fc5 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -72,6 +72,11 @@ using namespace llvm; +// External symbol to be used when generating the shadow address for +// architectures with multiple VMAs. Instead of using a constant integer +// the runtime will set the external mask based on the VMA range. +static const char *const kDFSanExternShadowPtrMask = "__dfsan_shadow_ptr_mask"; + // The -dfsan-preserve-alignment flag controls whether this pass assumes that // alignment requirements provided by the input IR are correct. 
For example, // if the input IR contains a load with alignment 8, this flag will cause @@ -124,6 +129,7 @@ static cl::opt<bool> ClDebugNonzeroLabels( "load or return with a nonzero label"), cl::Hidden); + namespace { StringRef GetGlobalTypeString(const GlobalValue &G) { @@ -231,6 +237,7 @@ class DataFlowSanitizer : public ModulePass { void *(*GetRetvalTLSPtr)(); Constant *GetArgTLS; Constant *GetRetvalTLS; + Constant *ExternalShadowMask; FunctionType *DFSanUnionFnTy; FunctionType *DFSanUnionLoadFnTy; FunctionType *DFSanUnimplementedFnTy; @@ -248,7 +255,7 @@ class DataFlowSanitizer : public ModulePass { DFSanABIList ABIList; DenseMap<Value *, Function *> UnwrappedFnMap; AttributeSet ReadOnlyNoneAttrs; - DenseMap<const Function *, DISubprogram *> FunctionDIs; + bool DFSanRuntimeShadowMask; Value *getShadowAddress(Value *Addr, Instruction *Pos); bool isInstrumented(const Function *F); @@ -362,7 +369,8 @@ llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles, DataFlowSanitizer::DataFlowSanitizer( const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(), void *(*getRetValTLS)()) - : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS), + DFSanRuntimeShadowMask(false) { std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), ClABIListFiles.end()); @@ -420,6 +428,8 @@ bool DataFlowSanitizer::doInitialization(Module &M) { bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el; + bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 || + TargetTriple.getArch() == llvm::Triple::aarch64_be; const DataLayout &DL = M.getDataLayout(); @@ -434,6 +444,9 @@ bool DataFlowSanitizer::doInitialization(Module &M) { ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); else if (IsMIPS64) ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); + // AArch64 supports multiple VMAs and the shadow mask is set at runtime. 
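+  // (AArch64 Linux kernels are configured with different virtual address
+  // sizes, such as 39-bit or 42-bit, so no single compile-time constant mask
+  // fits all of them.)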
+ else if (IsAArch64) + DFSanRuntimeShadowMask = true; else report_fatal_error("unsupported triple"); @@ -578,7 +591,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true); Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) - DFSF.ValShadowMap[ValAI] = ShadowAI; + DFSF.ValShadowMap[&*ValAI] = &*ShadowAI; DFSanVisitor(DFSF).visitCallInst(*CI); if (!FT->getReturnType()->isVoidTy()) new StoreInst(DFSF.getShadow(RI->getReturnValue()), @@ -592,8 +605,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (ABIList.isIn(M, "skip")) return false; - FunctionDIs = makeSubprogramMap(M); - if (!GetArgTLSPtr) { Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); @@ -606,6 +617,9 @@ bool DataFlowSanitizer::runOnModule(Module &M) { G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); } + ExternalShadowMask = + Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy); + DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy); if (Function *F = dyn_cast<Function>(DFSanUnionFn)) { F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); @@ -643,16 +657,16 @@ bool DataFlowSanitizer::runOnModule(Module &M) { std::vector<Function *> FnsToInstrument; llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI; - for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { - if (!i->isIntrinsic() && - i != DFSanUnionFn && - i != DFSanCheckedUnionFn && - i != DFSanUnionLoadFn && - i != DFSanUnimplementedFn && - i != DFSanSetLabelFn && - i != DFSanNonzeroLabelFn && - i != DFSanVarargWrapperFn) - FnsToInstrument.push_back(&*i); + for (Function &i : M) { + if (!i.isIntrinsic() && + &i != DFSanUnionFn && + &i != DFSanCheckedUnionFn && + &i != DFSanUnionLoadFn && + &i != DFSanUnimplementedFn && + &i != DFSanSetLabelFn && + &i != DFSanNonzeroLabelFn && + &i != DFSanVarargWrapperFn) + FnsToInstrument.push_back(&i); } // Give function aliases prefixes when necessary, and build wrappers where the @@ -710,7 +724,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { NewFArg = NewF->arg_begin(), FArgEnd = F.arg_end(); FArg != FArgEnd; ++FArg, ++NewFArg) { - FArg->replaceAllUsesWith(NewFArg); + FArg->replaceAllUsesWith(&*NewFArg); } NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); @@ -750,11 +764,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) { ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); F.replaceAllUsesWith(WrappedFnCst); - // Patch the pointer to LLVM function in debug info descriptor. 
- auto DI = FunctionDIs.find(&F); - if (DI != FunctionDIs.end()) - DI->second->replaceFunction(&F); - UnwrappedFnMap[WrappedFnCst] = &F; *i = NewF; @@ -842,7 +851,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { if (Instruction *I = dyn_cast<Instruction>(V)) Pos = I->getNextNode(); else - Pos = DFSF.F->getEntryBlock().begin(); + Pos = &DFSF.F->getEntryBlock().front(); while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) Pos = Pos->getNextNode(); IRBuilder<> IRB(Pos); @@ -864,7 +873,7 @@ Value *DFSanFunction::getArgTLSPtr() { if (DFS.ArgTLS) return ArgTLSPtr = DFS.ArgTLS; - IRBuilder<> IRB(F->getEntryBlock().begin()); + IRBuilder<> IRB(&F->getEntryBlock().front()); return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {}); } @@ -874,7 +883,7 @@ Value *DFSanFunction::getRetvalTLS() { if (DFS.RetvalTLS) return RetvalTLSPtr = DFS.RetvalTLS; - IRBuilder<> IRB(F->getEntryBlock().begin()); + IRBuilder<> IRB(&F->getEntryBlock().front()); return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {}); } @@ -906,7 +915,7 @@ Value *DFSanFunction::getShadow(Value *V) { Function::arg_iterator i = F->arg_begin(); while (ArgIdx--) ++i; - Shadow = i; + Shadow = &*i; assert(Shadow->getType() == DFS.ShadowTy); break; } @@ -928,9 +937,15 @@ void DFSanFunction::setShadow(Instruction *I, Value *Shadow) { Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { assert(Addr != RetvalTLS && "Reinstrumenting?"); IRBuilder<> IRB(Pos); + Value *ShadowPtrMaskValue; + if (DFSanRuntimeShadowMask) + ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask); + else + ShadowPtrMaskValue = ShadowPtrMask; return IRB.CreateIntToPtr( IRB.CreateMul( - IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask), + IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), + IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)), ShadowPtrMul), ShadowPtrTy); } @@ -991,7 +1006,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { Call->addAttribute(2, Attribute::ZExt); BasicBlock *Tail = BI->getSuccessor(0); - PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin()); + PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); Phi->addIncoming(Call, Call->getParent()); Phi->addIncoming(V1, Head); @@ -1105,7 +1120,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow); BasicBlock *Head = Pos->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(Pos); + BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator()); if (DomTreeNode *OldNode = DT.getNode(Head)) { std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); @@ -1475,8 +1490,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if (FT->isVarArg()) { auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy, CS.arg_size() - FT->getNumParams()); - auto *LabelVAAlloca = new AllocaInst(LabelVATy, "labelva", - DFSF.F->getEntryBlock().begin()); + auto *LabelVAAlloca = new AllocaInst( + LabelVATy, "labelva", &DFSF.F->getEntryBlock().front()); for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) { auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n); @@ -1490,7 +1505,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if (!DFSF.LabelReturnAlloca) { DFSF.LabelReturnAlloca = new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn", - DFSF.F->getEntryBlock().begin()); + &DFSF.F->getEntryBlock().front()); } Args.push_back(DFSF.LabelReturnAlloca); } @@ -1529,13 +1544,14 @@ void DFSanVisitor::visitCallSite(CallSite CS) { if 
(!CS.getType()->isVoidTy()) { if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { if (II->getNormalDest()->getSinglePredecessor()) { - Next = II->getNormalDest()->begin(); + Next = &II->getNormalDest()->front(); } else { BasicBlock *NewBB = SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); - Next = NewBB->begin(); + Next = &NewBB->front(); } } else { + assert(CS->getIterator() != CS->getParent()->end()); Next = CS->getNextNode(); } @@ -1568,7 +1584,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { unsigned VarArgSize = CS.arg_size() - FT->getNumParams(); ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize); AllocaInst *VarArgShadow = - new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin()); + new AllocaInst(VarArgArrayTy, "", &DFSF.F->getEntryBlock().front()); Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0)); for (unsigned n = 0; i != e; ++i, ++n) { IRB.CreateStore( diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 9a3ed5c..fa939ae 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -138,6 +138,7 @@ namespace { Module *M; LLVMContext *Ctx; SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; + DenseMap<DISubprogram *, Function *> FnMap; }; } @@ -309,13 +310,12 @@ namespace { // object users can construct, the blocks and lines will be rooted here. class GCOVFunction : public GCOVRecord { public: - GCOVFunction(const DISubprogram *SP, raw_ostream *os, uint32_t Ident, - bool UseCfgChecksum, bool ExitBlockBeforeBody) + GCOVFunction(const DISubprogram *SP, Function *F, raw_ostream *os, + uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody) : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0), ReturnBlock(1, os) { this->os = os; - Function *F = SP->getFunction(); DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); uint32_t i = 0; @@ -347,8 +347,8 @@ namespace { std::string EdgeDestinations; raw_string_ostream EDOS(EdgeDestinations); Function *F = Blocks.begin()->first->getParent(); - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = getBlock(I); + for (BasicBlock &I : *F) { + GCOVBlock &Block = getBlock(&I); for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) EDOS << Block.OutEdges[i]->Number; } @@ -389,8 +389,8 @@ namespace { // Emit edges between blocks. if (Blocks.empty()) return; Function *F = Blocks.begin()->first->getParent(); - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - GCOVBlock &Block = getBlock(I); + for (BasicBlock &I : *F) { + GCOVBlock &Block = getBlock(&I); if (Block.OutEdges.empty()) continue; writeBytes(EdgeTag, 4); @@ -405,9 +405,8 @@ namespace { } // Emit lines for each block. 
- for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - getBlock(I).writeOut(); - } + for (BasicBlock &I : *F) + getBlock(&I).writeOut(); } private: @@ -451,6 +450,12 @@ bool GCOVProfiler::runOnModule(Module &M) { this->M = &M; Ctx = &M.getContext(); + FnMap.clear(); + for (Function &F : M) { + if (DISubprogram *SP = F.getSubprogram()) + FnMap[SP] = &F; + } + if (Options.EmitNotes) emitProfileNotes(); if (Options.EmitData) return emitProfileArcs(); return false; @@ -495,7 +500,7 @@ void GCOVProfiler::emitProfileNotes() { unsigned FunctionIdent = 0; for (auto *SP : CU->getSubprograms()) { - Function *F = SP->getFunction(); + Function *F = FnMap[SP]; if (!F) continue; if (!functionHasLines(F)) continue; @@ -507,13 +512,13 @@ void GCOVProfiler::emitProfileNotes() { ++It; EntryBlock.splitBasicBlock(It); - Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++, + Funcs.push_back(make_unique<GCOVFunction>(SP, F, &out, FunctionIdent++, Options.UseCfgChecksum, Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - GCOVBlock &Block = Func.getBlock(BB); + GCOVBlock &Block = Func.getBlock(&*BB); TerminatorInst *TI = BB->getTerminator(); if (int successors = TI->getNumSuccessors()) { for (int i = 0; i != successors; ++i) { @@ -574,7 +579,7 @@ bool GCOVProfiler::emitProfileArcs() { auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; for (auto *SP : CU->getSubprograms()) { - Function *F = SP->getFunction(); + Function *F = FnMap[SP]; if (!F) continue; if (!functionHasLines(F)) continue; if (!Result) Result = true; @@ -605,7 +610,7 @@ bool GCOVProfiler::emitProfileArcs() { int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors(); if (Successors) { if (Successors == 1) { - IRBuilder<> Builder(BB->getFirstInsertionPt()); + IRBuilder<> Builder(&*BB->getFirstInsertionPt()); Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Edge); Value *Count = Builder.CreateLoad(Counter); @@ -625,7 +630,7 @@ bool GCOVProfiler::emitProfileArcs() { Count = Builder.CreateAdd(Count, Builder.getInt64(1)); Builder.CreateStore(Count, Counter); } else { - ComplexEdgePreds.insert(BB); + ComplexEdgePreds.insert(&*BB); for (int i = 0; i != Successors; ++i) ComplexEdgeSuccs.insert(TI->getSuccessor(i)); } @@ -641,13 +646,13 @@ bool GCOVProfiler::emitProfileArcs() { GlobalVariable *EdgeState = getEdgeStateValue(); for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { - IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt()); + IRBuilder<> Builder(&*ComplexEdgePreds[i + 1]->getFirstInsertionPt()); Builder.CreateStore(Builder.getInt32(i), EdgeState); } for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { // Call runtime to perform increment. 
- IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt()); + IRBuilder<> Builder(&*ComplexEdgeSuccs[i + 1]->getFirstInsertionPt()); Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); @@ -731,8 +736,8 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( IRBuilder<> Builder(Succ); Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Edge + i); - EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) + - (Preds.idFor(BB)-1)] = cast<Constant>(Counter); + EdgeTable[((Succs.idFor(Succ) - 1) * Preds.size()) + + (Preds.idFor(&*BB) - 1)] = cast<Constant>(Counter); } } Edge += Successors; @@ -901,7 +906,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { // uint32_t pred = *predecessor; // if (pred == 0xffffffff) return; - Argument *Arg = Fn->arg_begin(); + Argument *Arg = &*Fn->arg_begin(); Arg->setName("predecessor"); Value *Pred = Builder.CreateLoad(Arg, "pred"); Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff)); @@ -912,7 +917,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { // uint64_t *counter = counters[pred]; // if (!counter) return; Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty()); - Arg = std::next(Fn->arg_begin()); + Arg = &*std::next(Fn->arg_begin()); Arg->setName("counters"); Value *GEP = Builder.CreateGEP(Type::getInt64PtrTy(*Ctx), Arg, ZExtPred); Value *Counter = Builder.CreateLoad(GEP, "counter"); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 712bf8e..28483e7 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -7,18 +7,18 @@ // //===----------------------------------------------------------------------===// // -// This pass lowers instrprof_increment intrinsics emitted by a frontend for -// profiling. It also builds the data structures and initialization code needed -// for updating execution counts and emitting the profile at runtime. +// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling. +// It also builds the data structures and initialization code needed for +// updating execution counts and emitting the profile at runtime. // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Instrumentation.h" - #include "llvm/ADT/Triple.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -49,7 +49,15 @@ public: private: InstrProfOptions Options; Module *M; - DenseMap<GlobalVariable *, GlobalVariable *> RegionCounters; + typedef struct PerFunctionProfileData { + uint32_t NumValueSites[IPVK_Last+1]; + GlobalVariable* RegionCounters; + GlobalVariable* DataVar; + PerFunctionProfileData() : RegionCounters(nullptr), DataVar(nullptr) { + memset(NumValueSites, 0, sizeof(uint32_t) * (IPVK_Last+1)); + } + } PerFunctionProfileData; + DenseMap<GlobalVariable *, PerFunctionProfileData> ProfileDataMap; std::vector<Value *> UsedVars; bool isMachO() const { @@ -58,29 +66,35 @@ private: /// Get the section name for the counter variables. StringRef getCountersSection() const { - return isMachO() ? 
"__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts"; + return getInstrProfCountersSectionName(isMachO()); } /// Get the section name for the name variables. StringRef getNameSection() const { - return isMachO() ? "__DATA,__llvm_prf_names" : "__llvm_prf_names"; + return getInstrProfNameSectionName(isMachO()); } /// Get the section name for the profile data variables. StringRef getDataSection() const { - return isMachO() ? "__DATA,__llvm_prf_data" : "__llvm_prf_data"; + return getInstrProfDataSectionName(isMachO()); } /// Get the section name for the coverage mapping data. StringRef getCoverageSection() const { - return isMachO() ? "__DATA,__llvm_covmap" : "__llvm_covmap"; + return getInstrProfCoverageSectionName(isMachO()); } + /// Count the number of instrumented value sites for the function. + void computeNumValueSiteCounts(InstrProfValueProfileInst *Ins); + + /// Replace instrprof_value_profile with a call to runtime library. + void lowerValueProfileInst(InstrProfValueProfileInst *Ins); + /// Replace instrprof_increment with an increment of the appropriate value. void lowerIncrement(InstrProfIncrementInst *Inc); - /// Set up the section and uses for coverage data and its references. - void lowerCoverageData(GlobalVariable *CoverageData); + /// Force emitting of name vars for unused functions. + void lowerCoverageData(GlobalVariable *CoverageNamesVar); /// Get the region counters for an increment, creating them if necessary. /// @@ -117,20 +131,37 @@ bool InstrProfiling::runOnModule(Module &M) { bool MadeChange = false; this->M = &M; - RegionCounters.clear(); + ProfileDataMap.clear(); UsedVars.clear(); + // We did not know how many value sites there would be inside + // the instrumented function. This is counting the number of instrumented + // target value sites to enter it as field in the profile data variable. 
for (Function &F : M) for (BasicBlock &BB : F) for (auto I = BB.begin(), E = BB.end(); I != E;) - if (auto *Inc = dyn_cast<InstrProfIncrementInst>(I++)) { + if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I++)) + computeNumValueSiteCounts(Ind); + + for (Function &F : M) + for (BasicBlock &BB : F) + for (auto I = BB.begin(), E = BB.end(); I != E;) { + auto Instr = I++; + if (auto *Inc = dyn_cast<InstrProfIncrementInst>(Instr)) { lowerIncrement(Inc); MadeChange = true; + } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) { + lowerValueProfileInst(Ind); + MadeChange = true; } - if (GlobalVariable *Coverage = M.getNamedGlobal("__llvm_coverage_mapping")) { - lowerCoverageData(Coverage); + } + + if (GlobalVariable *CoverageNamesVar = + M.getNamedGlobal(getCoverageNamesVarName())) { + lowerCoverageData(CoverageNamesVar); MadeChange = true; } + if (!MadeChange) return false; @@ -141,10 +172,59 @@ bool InstrProfiling::runOnModule(Module &M) { return true; } +static Constant *getOrInsertValueProfilingCall(Module &M) { + LLVMContext &Ctx = M.getContext(); + auto *ReturnTy = Type::getVoidTy(M.getContext()); + Type *ParamTypes[] = { +#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType +#include "llvm/ProfileData/InstrProfData.inc" + }; + auto *ValueProfilingCallTy = + FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false); + return M.getOrInsertFunction(getInstrProfValueProfFuncName(), + ValueProfilingCallTy); +} + +void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { + + GlobalVariable *Name = Ind->getName(); + uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); + uint64_t Index = Ind->getIndex()->getZExtValue(); + auto It = ProfileDataMap.find(Name); + if (It == ProfileDataMap.end()) { + PerFunctionProfileData PD; + PD.NumValueSites[ValueKind] = Index + 1; + ProfileDataMap[Name] = PD; + } else if (It->second.NumValueSites[ValueKind] <= Index) + It->second.NumValueSites[ValueKind] = Index + 1; +} + +void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { + + GlobalVariable *Name = Ind->getName(); + auto It = ProfileDataMap.find(Name); + assert(It != ProfileDataMap.end() && It->second.DataVar && + "value profiling detected in function with no counter incerement"); + + GlobalVariable *DataVar = It->second.DataVar; + uint64_t ValueKind = Ind->getValueKind()->getZExtValue(); + uint64_t Index = Ind->getIndex()->getZExtValue(); + for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind) + Index += It->second.NumValueSites[Kind]; + + IRBuilder<> Builder(Ind); + Value* Args[3] = {Ind->getTargetValue(), + Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), + Builder.getInt32(Index)}; + Ind->replaceAllUsesWith( + Builder.CreateCall(getOrInsertValueProfilingCall(*M), Args)); + Ind->eraseFromParent(); +} + void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { GlobalVariable *Counters = getOrCreateRegionCounters(Inc); - IRBuilder<> Builder(Inc->getParent(), *Inc); + IRBuilder<> Builder(Inc); uint64_t Index = Inc->getIndex()->getZExtValue(); Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); Value *Count = Builder.CreateLoad(Addr, "pgocount"); @@ -153,29 +233,16 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { Inc->eraseFromParent(); } -void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) { - CoverageData->setSection(getCoverageSection()); - CoverageData->setAlignment(8); - - Constant *Init = CoverageData->getInitializer(); - // We're 
expecting { i32, i32, i32, i32, [n x { i8*, i32, i32 }], [m x i8] }
-  // for some C. If not, the frontend's given us something broken.
-  assert(Init->getNumOperands() == 6 && "bad number of fields in coverage map");
-  assert(isa<ConstantArray>(Init->getAggregateElement(4)) &&
-         "invalid function list in coverage map");
-  ConstantArray *Records = cast<ConstantArray>(Init->getAggregateElement(4));
-  for (unsigned I = 0, E = Records->getNumOperands(); I < E; ++I) {
-    Constant *Record = Records->getOperand(I);
-    Value *V = const_cast<Value *>(Record->getOperand(0))->stripPointerCasts();
+void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
+  ConstantArray *Names =
+      cast<ConstantArray>(CoverageNamesVar->getInitializer());
+  for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
+    Constant *NC = Names->getOperand(I);
+    Value *V = NC->stripPointerCasts();
     assert(isa<GlobalVariable>(V) && "Missing reference to function name");
     GlobalVariable *Name = cast<GlobalVariable>(V);
-    // If we have region counters for this name, we've already handled it.
-    auto It = RegionCounters.find(Name);
-    if (It != RegionCounters.end())
-      continue;
-
     // Move the name variable to the right section.
     Name->setSection(getNameSection());
     Name->setAlignment(1);
@@ -183,69 +250,108 @@
 }
 
 /// Get the name of a profiling variable for a particular function.
-static std::string getVarName(InstrProfIncrementInst *Inc, StringRef VarName) {
-  auto *Arr = cast<ConstantDataArray>(Inc->getName()->getInitializer());
-  StringRef Name = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString();
-  return ("__llvm_profile_" + VarName + "_" + Name).str();
+static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
+  StringRef NamePrefix = getInstrProfNameVarPrefix();
+  StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
+  return (Prefix + Name).str();
+}
+
+static inline bool shouldRecordFunctionAddr(Function *F) {
+  // Check the linkage.
+  if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
+      !F->hasAvailableExternallyLinkage())
+    return true;
+  // Check for uses of this function other than direct calls or invokes to it.
+  return F->hasAddressTaken();
+}
+
+static inline Comdat *getOrCreateProfileComdat(Module &M,
+                                               InstrProfIncrementInst *Inc) {
+  // COFF format requires a COMDAT section to have a key symbol with the same
+  // name. The COFF linker also requires that the section a COMDAT section is
+  // associated with precede the associating section. For this reason, we must
+  // choose the name var's name as the name of the comdat.
+  StringRef ComdatPrefix = (Triple(M.getTargetTriple()).isOSBinFormatCOFF()
+                                ? getInstrProfNameVarPrefix()
+                                : getInstrProfComdatPrefix());
+  return M.getOrInsertComdat(StringRef(getVarName(Inc, ComdatPrefix)));
 }
 
 GlobalVariable *
 InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
-  GlobalVariable *Name = Inc->getName();
-  auto It = RegionCounters.find(Name);
-  if (It != RegionCounters.end())
-    return It->second;
-
-  // Move the name variable to the right section. Make sure it is placed in the
-  // same comdat as its associated function. Otherwise, we may get multiple
-  // counters for the same function in certain cases.
+ GlobalVariable *NamePtr = Inc->getName(); + auto It = ProfileDataMap.find(NamePtr); + PerFunctionProfileData PD; + if (It != ProfileDataMap.end()) { + if (It->second.RegionCounters) + return It->second.RegionCounters; + PD = It->second; + } + + // Move the name variable to the right section. Place them in a COMDAT group + // if the associated function is a COMDAT. This will make sure that + // only one copy of counters of the COMDAT function will be emitted after + // linking. Function *Fn = Inc->getParent()->getParent(); - Name->setSection(getNameSection()); - Name->setAlignment(1); - Name->setComdat(Fn->getComdat()); + Comdat *ProfileVarsComdat = nullptr; + if (Fn->hasComdat()) + ProfileVarsComdat = getOrCreateProfileComdat(*M, Inc); + NamePtr->setSection(getNameSection()); + NamePtr->setAlignment(1); + NamePtr->setComdat(ProfileVarsComdat); uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); LLVMContext &Ctx = M->getContext(); ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); // Create the counters variable. - auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), - Constant::getNullValue(CounterTy), - getVarName(Inc, "counters")); - Counters->setVisibility(Name->getVisibility()); - Counters->setSection(getCountersSection()); - Counters->setAlignment(8); - Counters->setComdat(Fn->getComdat()); - - RegionCounters[Inc->getName()] = Counters; + auto *CounterPtr = + new GlobalVariable(*M, CounterTy, false, NamePtr->getLinkage(), + Constant::getNullValue(CounterTy), + getVarName(Inc, getInstrProfCountersVarPrefix())); + CounterPtr->setVisibility(NamePtr->getVisibility()); + CounterPtr->setSection(getCountersSection()); + CounterPtr->setAlignment(8); + CounterPtr->setComdat(ProfileVarsComdat); // Create data variable. - auto *NameArrayTy = Name->getType()->getPointerElementType(); - auto *Int32Ty = Type::getInt32Ty(Ctx); - auto *Int64Ty = Type::getInt64Ty(Ctx); auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); - auto *Int64PtrTy = Type::getInt64PtrTy(Ctx); - - Type *DataTypes[] = {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy}; + auto *Int16Ty = Type::getInt16Ty(Ctx); + auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last+1); + Type *DataTypes[] = { + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType, + #include "llvm/ProfileData/InstrProfData.inc" + }; auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes)); + + Constant *FunctionAddr = shouldRecordFunctionAddr(Fn) ? 
+ ConstantExpr::getBitCast(Fn, Int8PtrTy) : + ConstantPointerNull::get(Int8PtrTy); + + Constant *Int16ArrayVals[IPVK_Last+1]; + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) + Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + Constant *DataVals[] = { - ConstantInt::get(Int32Ty, NameArrayTy->getArrayNumElements()), - ConstantInt::get(Int32Ty, NumCounters), - ConstantInt::get(Int64Ty, Inc->getHash()->getZExtValue()), - ConstantExpr::getBitCast(Name, Int8PtrTy), - ConstantExpr::getBitCast(Counters, Int64PtrTy)}; - auto *Data = new GlobalVariable(*M, DataTy, true, Name->getLinkage(), + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init, + #include "llvm/ProfileData/InstrProfData.inc" + }; + auto *Data = new GlobalVariable(*M, DataTy, false, NamePtr->getLinkage(), ConstantStruct::get(DataTy, DataVals), - getVarName(Inc, "data")); - Data->setVisibility(Name->getVisibility()); + getVarName(Inc, getInstrProfDataVarPrefix())); + Data->setVisibility(NamePtr->getVisibility()); Data->setSection(getDataSection()); - Data->setAlignment(8); - Data->setComdat(Fn->getComdat()); + Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT); + Data->setComdat(ProfileVarsComdat); + + PD.RegionCounters = CounterPtr; + PD.DataVar = Data; + ProfileDataMap[NamePtr] = PD; // Mark the data variable as used so that it isn't stripped out. UsedVars.push_back(Data); - return Counters; + return CounterPtr; } void InstrProfiling::emitRegistration() { @@ -253,20 +359,24 @@ void InstrProfiling::emitRegistration() { if (Triple(M->getTargetTriple()).isOSDarwin()) return; + // Use linker script magic to get data/cnts/name start/end. + if (Triple(M->getTargetTriple()).isOSLinux() || + Triple(M->getTargetTriple()).isOSFreeBSD()) + return; + // Construct the function. auto *VoidTy = Type::getVoidTy(M->getContext()); auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext()); auto *RegisterFTy = FunctionType::get(VoidTy, false); auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage, - "__llvm_profile_register_functions", M); + getInstrProfRegFuncsName(), M); RegisterF->setUnnamedAddr(true); - if (Options.NoRedZone) - RegisterF->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) RegisterF->addFnAttr(Attribute::NoRedZone); auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false); auto *RuntimeRegisterF = Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage, - "__llvm_profile_register_function", M); + getInstrProfRegFuncName(), M); IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF)); for (Value *Data : UsedVars) @@ -275,26 +385,27 @@ void InstrProfiling::emitRegistration() { } void InstrProfiling::emitRuntimeHook() { - const char *const RuntimeVarName = "__llvm_profile_runtime"; - const char *const RuntimeUserName = "__llvm_profile_runtime_user"; - // If the module's provided its own runtime, we don't need to do anything. - if (M->getGlobalVariable(RuntimeVarName)) + // We expect the linker to be invoked with -u<hook_var> flag for linux, + // for which case there is no need to emit the user function. + if (Triple(M->getTargetTriple()).isOSLinux()) return; + // If the module's provided its own runtime, we don't need to do anything. + if (M->getGlobalVariable(getInstrProfRuntimeHookVarName())) return; + // Declare an external variable that will pull in the runtime initialization. 
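+  // Referencing that variable from a hidden, no-inline helper (created below)
+  // forces the linker to pull in the runtime object that defines it.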
auto *Int32Ty = Type::getInt32Ty(M->getContext()); auto *Var = new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, - nullptr, RuntimeVarName); + nullptr, getInstrProfRuntimeHookVarName()); // Make a function that uses it. - auto *User = - Function::Create(FunctionType::get(Int32Ty, false), - GlobalValue::LinkOnceODRLinkage, RuntimeUserName, M); + auto *User = Function::Create(FunctionType::get(Int32Ty, false), + GlobalValue::LinkOnceODRLinkage, + getInstrProfRuntimeHookVarUseFuncName(), M); User->addFnAttr(Attribute::NoInline); - if (Options.NoRedZone) - User->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) User->addFnAttr(Attribute::NoRedZone); User->setVisibility(GlobalValue::HiddenVisibility); IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User)); @@ -330,26 +441,23 @@ void InstrProfiling::emitUses() { LLVMUsed = new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage, ConstantArray::get(ATy, MergedVars), "llvm.used"); - LLVMUsed->setSection("llvm.metadata"); } void InstrProfiling::emitInitialization() { std::string InstrProfileOutput = Options.InstrProfileOutput; - Constant *RegisterF = M->getFunction("__llvm_profile_register_functions"); - if (!RegisterF && InstrProfileOutput.empty()) - return; + Constant *RegisterF = M->getFunction(getInstrProfRegFuncsName()); + if (!RegisterF && InstrProfileOutput.empty()) return; // Create the initialization function. auto *VoidTy = Type::getVoidTy(M->getContext()); - auto *F = - Function::Create(FunctionType::get(VoidTy, false), - GlobalValue::InternalLinkage, "__llvm_profile_init", M); + auto *F = Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::InternalLinkage, + getInstrProfInitFuncName(), M); F->setUnnamedAddr(true); F->addFnAttr(Attribute::NoInline); - if (Options.NoRedZone) - F->addFnAttr(Attribute::NoRedZone); + if (Options.NoRedZone) F->addFnAttr(Attribute::NoRedZone); // Add the basic block and the necessary calls. IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); @@ -358,9 +466,8 @@ void InstrProfiling::emitInitialization() { if (!InstrProfileOutput.empty()) { auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); auto *SetNameTy = FunctionType::get(VoidTy, Int8PtrTy, false); - auto *SetNameF = - Function::Create(SetNameTy, GlobalValue::ExternalLinkage, - "__llvm_profile_override_default_filename", M); + auto *SetNameF = Function::Create(SetNameTy, GlobalValue::ExternalLinkage, + getInstrProfFileOverriderFuncName(), M); // Create variable for profile name. Constant *ProfileNameConst = diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index 2750585..a05a5fa 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -12,12 +12,47 @@ // //===----------------------------------------------------------------------===// -#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm-c/Initialization.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" using namespace llvm; +/// Moves I before IP. Returns new insert point. +static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) { + // If I is IP, move the insert point down. + if (I == IP) + return ++IP; + // Otherwise, move I before IP and return IP. 
+ I->moveBefore(&*IP); + return IP; +} + +/// Instrumentation passes often insert conditional checks into entry blocks. +/// Call this function before splitting the entry block to move instructions +/// that must remain in the entry block up before the split point. Static +/// allocas and llvm.localescape calls, for example, must remain in the entry +/// block. +BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB, + BasicBlock::iterator IP) { + assert(&BB.getParent()->getEntryBlock() == &BB); + for (auto I = IP, E = BB.end(); I != E; ++I) { + bool KeepInEntry = false; + if (auto *AI = dyn_cast<AllocaInst>(I)) { + if (AI->isStaticAlloca()) + KeepInEntry = true; + } else if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == llvm::Intrinsic::localescape) + KeepInEntry = true; + } + if (KeepInEntry) + IP = moveBeforeInsertPoint(I, IP); + } + return IP; +} + /// initializeInstrumentation - Initialize all passes in the Instrumentation /// library. void llvm::initializeInstrumentation(PassRegistry &Registry) { @@ -25,6 +60,8 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingPass(Registry); initializeGCOVProfilerPass(Registry); + initializePGOInstrumentationGenPass(Registry); + initializePGOInstrumentationUsePass(Registry); initializeInstrProfilingPass(Registry); initializeMemorySanitizerPass(Registry); initializeThreadSanitizerPass(Registry); diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 286a563..34aaa7f 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -148,7 +148,7 @@ static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call", cl::desc("poison uninitialized stack variables with a call"), cl::Hidden, cl::init(false)); static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern", - cl::desc("poison uninitialized stack variables with the given patter"), + cl::desc("poison uninitialized stack variables with the given pattern"), cl::Hidden, cl::init(0xff)); static cl::opt<bool> ClPoisonUndef("msan-poison-undef", cl::desc("poison undef temps"), @@ -222,10 +222,17 @@ static const MemoryMapParams Linux_I386_MemoryMapParams = { // x86_64 Linux static const MemoryMapParams Linux_X86_64_MemoryMapParams = { +#ifdef MSAN_LINUX_X86_64_OLD_MAPPING 0x400000000000, // AndMask 0, // XorMask (not used) 0, // ShadowBase (not used) 0x200000000000, // OriginBase +#else + 0, // AndMask (not used) + 0x500000000000, // XorMask + 0, // ShadowBase (not used) + 0x100000000000, // OriginBase +#endif }; // mips64 Linux @@ -244,6 +251,14 @@ static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = { 0x1C0000000000, // OriginBase }; +// aarch64 Linux +static const MemoryMapParams Linux_AArch64_MemoryMapParams = { + 0, // AndMask (not used) + 0x06000000000, // XorMask + 0, // ShadowBase (not used) + 0x01000000000, // OriginBase +}; + // i386 FreeBSD static const MemoryMapParams FreeBSD_I386_MemoryMapParams = { 0x000180000000, // AndMask @@ -266,15 +281,20 @@ static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = { }; static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = { - NULL, + nullptr, &Linux_MIPS64_MemoryMapParams, }; static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = { - NULL, + nullptr, &Linux_PowerPC64_MemoryMapParams, }; +static
const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = { + nullptr, + &Linux_AArch64_MemoryMapParams, +}; + static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = { &FreeBSD_I386_MemoryMapParams, &FreeBSD_X86_64_MemoryMapParams, @@ -353,8 +373,9 @@ class MemorySanitizer : public FunctionPass { friend struct MemorySanitizerVisitor; friend struct VarArgAMD64Helper; friend struct VarArgMIPS64Helper; + friend struct VarArgAArch64Helper; }; -} // namespace +} // anonymous namespace char MemorySanitizer::ID = 0; INITIALIZE_PASS(MemorySanitizer, "msan", @@ -377,7 +398,6 @@ static GlobalVariable *createPrivateNonConstGlobalForString(Module &M, GlobalValue::PrivateLinkage, StrConst, ""); } - /// \brief Insert extern declaration of runtime-provided functions and globals. void MemorySanitizer::initializeCallbacks(Module &M) { // Only do this once. @@ -496,6 +516,10 @@ bool MemorySanitizer::doInitialization(Module &M) { case Triple::ppc64le: MapParams = Linux_PowerPC_MemoryMapParams.bits64; break; + case Triple::aarch64: + case Triple::aarch64_be: + MapParams = Linux_ARM_MemoryMapParams.bits64; + break; default: report_fatal_error("unsupported architecture"); } @@ -668,7 +692,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { const DataLayout &DL = F.getParent()->getDataLayout(); unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); - if (isa<StructType>(Shadow->getType())) { + if (Shadow->getType()->isAggregateType()) { paintOrigin(IRB, updateOrigin(Origin, IRB), getOriginPtr(Addr, IRB, Alignment), StoreSize, OriginAlignment); @@ -697,7 +721,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Cmp = IRB.CreateICmpNE( ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = SplitBlockAndInsertIfThen( - Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + Cmp, &*IRB.GetInsertPoint(), false, MS.OriginStoreWeights); IRBuilder<> IRBNew(CheckTerm); paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), getOriginPtr(Addr, IRBNew, Alignment), StoreSize, @@ -893,16 +917,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Offset = (Addr & ~AndMask) ^ XorMask Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) { + Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy); + uint64_t AndMask = MS.MapParams->AndMask; - assert(AndMask != 0 && "AndMask shall be specified"); - Value *OffsetLong = - IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, ~AndMask)); + if (AndMask) + OffsetLong = + IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask)); uint64_t XorMask = MS.MapParams->XorMask; - if (XorMask != 0) - OffsetLong = IRB.CreateXor(OffsetLong, - ConstantInt::get(MS.IntptrTy, XorMask)); + if (XorMask) + OffsetLong = + IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask)); return OffsetLong; } @@ -1339,6 +1364,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } void visitBitCastInst(BitCastInst &I) { + // Special case: if this is the bitcast (there is exactly 1 allowed) between + // a musttail call and a ret, don't instrument. New instructions are not + // allowed after a musttail call. 
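// A hedged illustration (not from this change) of the only IR shape the
// verifier allows here -- a musttail call, at most one bitcast of its
// result, then the ret:
//   %res = musttail call i8* @callee(i8* %arg)
//   %cst = bitcast i8* %res to i8*
//   ret i8* %cst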
+ if (auto *CI = dyn_cast<CallInst>(I.getOperand(0))) + if (CI->isMustTailCall()) + return; IRBuilder<> IRB(&I); setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I))); setOrigin(&I, getOrigin(&I, 0)); @@ -1570,18 +1601,24 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Type *EltTy = Ty->getSequentialElementType(); SmallVector<Constant *, 16> Elements; for (unsigned Idx = 0; Idx < NumElements; ++Idx) { - ConstantInt *Elt = - dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx)); - APInt V = Elt->getValue(); - APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); - Elements.push_back(ConstantInt::get(EltTy, V2)); + if (ConstantInt *Elt = + dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) { + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + Elements.push_back(ConstantInt::get(EltTy, V2)); + } else { + Elements.push_back(ConstantInt::get(EltTy, 1)); + } } ShadowMul = ConstantVector::get(Elements); } else { - ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg); - APInt V = Elt->getValue(); - APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); - ShadowMul = ConstantInt::get(Elt->getType(), V2); + if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) { + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + ShadowMul = ConstantInt::get(Ty, V2); + } else { + ShadowMul = ConstantInt::get(Ty, 1); + } } IRBuilder<> IRB(&I); @@ -1730,25 +1767,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// \brief Instrument signed relational comparisons. /// - /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by - /// propagating the highest bit of the shadow. Everything else is delegated - /// to handleShadowOr(). + /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest + /// bit of the shadow. Everything else is delegated to handleShadowOr(). 
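/// A hedged plain-C++ model of that propagation (illustrative; the function
/// name is invented): x < 0 depends only on the sign bit of x, so the result
/// is poisoned exactly when the sign bit of x's shadow is set, which is what
/// the CreateICmpSLT(shadow, clean shadow) below computes:
///   static bool exampleResultPoisoned(int32_t ShadowOfX) {
///     return ShadowOfX < 0; // tests the sign bit of the shadow
///   }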
void handleSignedRelationalComparison(ICmpInst &I) { - Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); - Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); - Value* op = nullptr; - CmpInst::Predicate pre = I.getPredicate(); - if (constOp0 && constOp0->isNullValue() && - (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) { - op = I.getOperand(1); - } else if (constOp1 && constOp1->isNullValue() && - (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) { + Constant *constOp; + Value *op = nullptr; + CmpInst::Predicate pre; + if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) { op = I.getOperand(0); + pre = I.getPredicate(); + } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) { + op = I.getOperand(1); + pre = I.getSwappedPredicate(); + } else { + handleShadowOr(I); + return; } - if (op) { + + if ((constOp->isNullValue() && + (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) || + (constOp->isAllOnesValue() && + (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) { IRBuilder<> IRB(&I); - Value* Shadow = - IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt"); + Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), + "_msprop_icmp_s"); setShadow(&I, Shadow); setOrigin(&I, getOrigin(op)); } else { @@ -1860,25 +1902,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { VAHelper->visitVACopyInst(I); } - enum IntrinsicKind { - IK_DoesNotAccessMemory, - IK_OnlyReadsMemory, - IK_WritesMemory - }; - - static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) { - const int DoesNotAccessMemory = IK_DoesNotAccessMemory; - const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory; - const int OnlyReadsMemory = IK_OnlyReadsMemory; - const int OnlyAccessesArgumentPointees = IK_WritesMemory; - const int UnknownModRefBehavior = IK_WritesMemory; -#define GET_INTRINSIC_MODREF_BEHAVIOR -#define ModRefBehavior IntrinsicKind -#include "llvm/IR/Intrinsics.gen" -#undef ModRefBehavior -#undef GET_INTRINSIC_MODREF_BEHAVIOR - } - /// \brief Handle vector store-like intrinsics. /// /// Instrument intrinsics that look like a simple SIMD store: writes memory, @@ -1978,17 +2001,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (NumArgOperands == 0) return false; - Intrinsic::ID iid = I.getIntrinsicID(); - IntrinsicKind IK = getIntrinsicKind(iid); - bool OnlyReadsMemory = IK == IK_OnlyReadsMemory; - bool WritesMemory = IK == IK_WritesMemory; - assert(!(OnlyReadsMemory && WritesMemory)); - if (NumArgOperands == 2 && I.getArgOperand(0)->getType()->isPointerTy() && I.getArgOperand(1)->getType()->isVectorTy() && I.getType()->isVoidTy() && - WritesMemory) { + !I.onlyReadsMemory()) { // This looks like a vector store. return handleVectorStoreIntrinsic(I); } @@ -1996,12 +2013,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (NumArgOperands == 1 && I.getArgOperand(0)->getType()->isPointerTy() && I.getType()->isVectorTy() && - OnlyReadsMemory) { + I.onlyReadsMemory()) { // This looks like a vector load. return handleVectorLoadIntrinsic(I); } - if (!OnlyReadsMemory && !WritesMemory) + if (I.doesNotAccessMemory()) if (maybeHandleSimpleNomemIntrinsic(I)) return true; @@ -2493,13 +2510,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Now, get the shadow for the RetVal. if (!I.getType()->isSized()) return; + // Don't emit the epilogue for musttail call returns. 
+ if (CS.isCall() && cast<CallInst>(&I)->isMustTailCall()) return; IRBuilder<> IRBBefore(&I); // Until we have full dynamic coverage, make sure the retval shadow is 0. Value *Base = getShadowPtrForRetval(&I, IRBBefore); IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); - Instruction *NextInsn = nullptr; + BasicBlock::iterator NextInsn; if (CS.isCall()) { - NextInsn = I.getNextNode(); + NextInsn = ++I.getIterator(); + assert(NextInsn != I.getParent()->end()); } else { BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest(); if (!NormalDest->getSinglePredecessor()) { @@ -2511,10 +2531,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return; } NextInsn = NormalDest->getFirstInsertionPt(); - assert(NextInsn && + assert(NextInsn != NormalDest->end() && "Could not find insertion point for retval shadow load"); } - IRBuilder<> IRBAfter(NextInsn); + IRBuilder<> IRBAfter(&*NextInsn); Value *RetvalShadow = IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter), kShadowTLSAlignment, "_msret"); @@ -2523,10 +2543,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter))); } + bool isAMustTailRetVal(Value *RetVal) { + if (auto *I = dyn_cast<BitCastInst>(RetVal)) { + RetVal = I->getOperand(0); + } + if (auto *I = dyn_cast<CallInst>(RetVal)) { + return I->isMustTailCall(); + } + return false; + } + void visitReturnInst(ReturnInst &I) { IRBuilder<> IRB(&I); Value *RetVal = I.getReturnValue(); if (!RetVal) return; + // Don't emit the epilogue for musttail call returns. + if (isAMustTailRetVal(RetVal)) return; Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); if (CheckReturnValue) { insertShadowCheck(RetVal, &I); @@ -2653,6 +2685,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, getCleanOrigin()); } + void visitCatchSwitchInst(CatchSwitchInst &I) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + + void visitFuncletPadInst(FuncletPadInst &I) { + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + void visitGetElementPtrInst(GetElementPtrInst &I) { handleShadowOr(I); } @@ -2696,6 +2738,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Nothing to do here. } + void visitCleanupReturnInst(CleanupReturnInst &CRI) { + DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n"); + // Nothing to do here. + } + + void visitCatchReturnInst(CatchReturnInst &CRI) { + DEBUG(dbgs() << "CatchReturn: " << CRI << "\n"); + // Nothing to do here. + } + void visitInstruction(Instruction &I) { // Everything else: stop propagating and check for poisoned shadow. if (ClDumpStrictInstructions) @@ -2808,6 +2860,8 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVAStartInst(VAStartInst &I) override { + if (F.getCallingConv() == CallingConv::X86_64_Win64) + return; IRBuilder<> IRB(&I); VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); @@ -2820,6 +2874,8 @@ struct VarArgAMD64Helper : public VarArgHelper { } void visitVACopyInst(VACopyInst &I) override { + if (F.getCallingConv() == CallingConv::X86_64_Win64) + return; IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); @@ -2979,6 +3035,242 @@ struct VarArgMIPS64Helper : public VarArgHelper { } }; + +/// \brief AArch64-specific implementation of VarArgHelper. 
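/// A hedged restatement (illustrative) of the va_arg shadow TLS layout that
/// the offset constants below imply:
///   [0, 56)    shadow of GP-register arguments (AArch64GrBeg/EndOffset)
///   [56, 64)   padding so the VR area is 16-byte aligned
///   [64, 192)  shadow of FP/SIMD-register arguments (AArch64VrBeg/EndOffset)
///   [192, ...) shadow of stack-passed (overflow) arguments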
+struct VarArgAArch64Helper : public VarArgHelper { + static const unsigned kAArch64GrArgSize = 56; + static const unsigned kAArch64VrArgSize = 128; + + static const unsigned AArch64GrBegOffset = 0; + static const unsigned AArch64GrEndOffset = kAArch64GrArgSize; + // Make VR space aligned to 16 bytes. + static const unsigned AArch64VrBegOffset = AArch64GrEndOffset + 8; + static const unsigned AArch64VrEndOffset = AArch64VrBegOffset + + kAArch64VrArgSize; + static const unsigned AArch64VAEndOffset = AArch64VrEndOffset; + + Function &F; + MemorySanitizer &MS; + MemorySanitizerVisitor &MSV; + Value *VAArgTLSCopy; + Value *VAArgOverflowSize; + + SmallVector<CallInst*, 16> VAStartInstrumentationList; + + VarArgAArch64Helper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) + : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr), + VAArgOverflowSize(nullptr) {} + + enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; + + ArgKind classifyArgument(Value* arg) { + Type *T = arg->getType(); + if (T->isFPOrFPVectorTy()) + return AK_FloatingPoint; + if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) + || (T->isPointerTy())) + return AK_GeneralPurpose; + return AK_Memory; + } + + // The instrumentation stores the argument shadow in a non-ABI-specific + // format because it does not know which argument is named (since Clang, + // as in the x86_64 case, lowers the va_args in the frontend and this pass + // only sees the low-level code that deals with va_list internals). + // The first seven GR registers are saved in the first 56 bytes of the + // va_arg TLS array, followed by the first 8 FP/SIMD registers, and then + // the remaining arguments. + // Using a constant offset within the va_arg TLS array allows a fast copy + // in the finalize instrumentation. + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override { + unsigned GrOffset = AArch64GrBegOffset; + unsigned VrOffset = AArch64VrBegOffset; + unsigned OverflowOffset = AArch64VAEndOffset; + + const DataLayout &DL = F.getParent()->getDataLayout(); + for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + ArgKind AK = classifyArgument(A); + if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset) + AK = AK_Memory; + if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset) + AK = AK_Memory; + Value *Base; + switch (AK) { + case AK_GeneralPurpose: + Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset); + GrOffset += 8; + break; + case AK_FloatingPoint: + Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset); + VrOffset += 16; + break; + case AK_Memory: + uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); + Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); + OverflowOffset += RoundUpToAlignment(ArgSize, 8); + break; + } + IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + } + Constant *OverflowSize = + ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset); + IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS); + } + + /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, + int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), + "_msarg"); + } + + void visitVAStartInst(VAStartInst &I) override { + IRBuilder<> IRB(&I); + VAStartInstrumentationList.push_back(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants (size of va_list). + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */32, /* alignment */8, false); + } + + void visitVACopyInst(VACopyInst &I) override { + IRBuilder<> IRB(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants (size of va_list). + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */32, /* alignment */8, false); + } + + // Retrieve a va_list field of 'void*' size. + Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) { + Value *SaveAreaPtrPtr = + IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, offset)), + Type::getInt64PtrTy(*MS.C)); + return IRB.CreateLoad(SaveAreaPtrPtr); + } + + // Retrieve a va_list field of 'int' size. + Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) { + Value *SaveAreaPtr = + IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), + ConstantInt::get(MS.IntptrTy, offset)), + Type::getInt32PtrTy(*MS.C)); + Value *SaveArea32 = IRB.CreateLoad(SaveAreaPtr); + return IRB.CreateSExt(SaveArea32, MS.IntptrTy); + } + + void finalizeInstrumentation() override { + assert(!VAArgOverflowSize && !VAArgTLSCopy && + "finalizeInstrumentation called twice"); + if (!VAStartInstrumentationList.empty()) { + // If there is a va_start in this function, make a backup copy of + // va_arg_tls somewhere in the function entry block. + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS); + Value *CopySize = + IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset), + VAArgOverflowSize); + VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8); + } + + Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize); + Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize); + + // Instrument va_start, copy va_list shadow from the backup copy of + // the TLS contents. + for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) { + CallInst *OrigInst = VAStartInstrumentationList[i]; + IRBuilder<> IRB(OrigInst->getNextNode()); + + Value *VAListTag = OrigInst->getArgOperand(0); + + // The variadic ABI for AArch64 creates two areas to save the incoming + // argument registers (one for the 64-bit general registers xn-x7 and + // another for the 128-bit FP/SIMD registers vn-v7). + // We then need to propagate the shadow arguments to both regions + // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'. + // The remaining arguments are saved on the shadow for 'va::stack'.
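// (A hedged worked example of the adjustment described next: since AAPCS64
// defines __gr_offs as '0 - ((8 - named_gr) * 8)', one named GP argument
// gives __gr_offs = -56, so the copy starts at TLS offset 56 - 56 = 0 and
// takes the whole GP shadow area; three named arguments give an offset of
// 56 - 40 = 16, skipping the shadow bytes that belong to named arguments.)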
+ // One caveat is that only the non-named arguments need to be + // propagated; however, the call site instrumentation saves 'all' the + // arguments. So to copy the shadow values from the va_arg TLS array + // we need to adjust the offset for both GR and VR fields based on + // the __{gr,vr}_offs value (since they are stored based on the incoming + // named arguments). + + // Read the stack pointer from the va_list. + Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0); + + // Read both the __gr_top and __gr_off and add them up. + Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8); + Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24); + + Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea); + + // Read both the __vr_top and __vr_off and add them up. + Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16); + Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28); + + Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea); + + // We do not know how many named arguments are being used and, at the + // call site, all the arguments were saved. Since __gr_off is defined as + // '0 - ((8 - named_gr) * 8)', the idea is to just propagate the variadic + // arguments by ignoring the shadow bytes of the named arguments. + Value *GrRegSaveAreaShadowPtrOff = + IRB.CreateAdd(GrArgSize, GrOffSaveArea); + + Value *GrRegSaveAreaShadowPtr = + MSV.getShadowPtr(GrRegSaveAreaPtr, IRB.getInt8Ty(), IRB); + + Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, + GrRegSaveAreaShadowPtrOff); + Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff); + + IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, GrSrcPtr, GrCopySize, 8); + + // Again, but for FP/SIMD values. + Value *VrRegSaveAreaShadowPtrOff = + IRB.CreateAdd(VrArgSize, VrOffSaveArea); + + Value *VrRegSaveAreaShadowPtr = + MSV.getShadowPtr(VrRegSaveAreaPtr, IRB.getInt8Ty(), IRB); + + Value *VrSrcPtr = IRB.CreateInBoundsGEP( + IRB.getInt8Ty(), + IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, + IRB.getInt32(AArch64VrBegOffset)), + VrRegSaveAreaShadowPtrOff); + Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff); + + IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, VrSrcPtr, VrCopySize, 8); + + // And finally for the remaining arguments. + Value *StackSaveAreaShadowPtr = + MSV.getShadowPtr(StackSaveAreaPtr, IRB.getInt8Ty(), IRB); + + Value *StackSrcPtr = + IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, + IRB.getInt32(AArch64VAEndOffset)); + + IRB.CreateMemCpy(StackSaveAreaShadowPtr, StackSrcPtr, + VAArgOverflowSize, 16); + } + } +}; + /// \brief A no-op implementation of VarArgHelper.
struct VarArgNoOpHelper : public VarArgHelper { VarArgNoOpHelper(Function &F, MemorySanitizer &MS, @@ -3003,11 +3295,13 @@ VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, else if (TargetTriple.getArch() == llvm::Triple::mips64 || TargetTriple.getArch() == llvm::Triple::mips64el) return new VarArgMIPS64Helper(Func, Msan, Visitor); + else if (TargetTriple.getArch() == llvm::Triple::aarch64) + return new VarArgAArch64Helper(Func, Msan, Visitor); else return new VarArgNoOpHelper(Func, Msan, Visitor); } -} // namespace +} // anonymous namespace bool MemorySanitizer::runOnFunction(Function &F) { if (&F == MsanCtorFunction) diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp new file mode 100644 index 0000000..4b59b93 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -0,0 +1,718 @@ +//===-- PGOInstrumentation.cpp - MST-based PGO Instrumentation ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements PGO instrumentation using a minimum spanning tree based +// on the following paper: +// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points +// for program frequency counts. BIT Numerical Mathematics 1973, Volume 13, +// Issue 3, pp 313-322 +// The idea of the algorithm is based on the fact that for each node (except +// for the entry and exit), the sum of incoming edge counts equals the sum of +// outgoing edge counts. The count of an edge on the spanning tree can be +// derived from the counts of edges not on the spanning tree. Knuth proves +// this method instruments the minimum number of edges. +// +// The minimal spanning tree here is actually a maximum weight tree -- on-tree +// edges have higher frequencies (more likely to execute). The idea is to +// instrument those less frequently executed edges to reduce the runtime +// overhead of instrumented binaries. +// +// This file contains two passes: +// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge +// count profile, and +// (2) Pass PGOInstrumentationUse which reads the edge count profile and +// annotates the branch weights. +// To get precise counter information, these two passes need to be invoked at +// the same compilation point (so they see the same IR). For pass +// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For +// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc, and +// the profile is opened at the module level and passed to each PGOUseFunc +// instance. The shared code for PGOInstrumentationGen and PGOInstrumentationUse +// is put in class FuncPGOInstrumentation. +// +// Class PGOEdge represents a CFG edge and some auxiliary information. Class +// BBInfo contains auxiliary information for each BB. These two classes are used +// in pass PGOInstrumentationGen. Classes PGOUseEdge and UseBBInfo are the +// derived classes of PGOEdge and BBInfo, respectively; they contain extra data +// structures used in populating the profile counters. +// The MST implementation is in class CFGMST (CFGMST.h).
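// A hedged worked example of the scheme above (illustrative, not from the
// sources): for a diamond CFG  Entry -> {A, B} -> Exit  plus the fake edge
// Exit -> Entry that makes counts flow-conserving at every node, |E| = 5 and
// |V| = 4, so only |E| - |V| + 1 = 2 off-tree edges need counters. With
// A->Exit = m and B->Exit = n instrumented, conservation yields
// Entry->A = m, Entry->B = n, and a function entry count of m + n.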
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "CFGMST.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProfReader.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/JamCRC.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <string> +#include <utility> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "pgo-instrumentation" + +STATISTIC(NumOfPGOInstrument, "Number of edges instrumented."); +STATISTIC(NumOfPGOEdge, "Number of edges."); +STATISTIC(NumOfPGOBB, "Number of basic-blocks."); +STATISTIC(NumOfPGOSplit, "Number of critical edge splits."); +STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts."); +STATISTIC(NumOfPGOMismatch, "Number of functions having mismatched profiles."); +STATISTIC(NumOfPGOMissing, "Number of functions without profile."); + +// Command line option to specify the file to read the profile from. This is +// mainly used for testing. +static cl::opt<std::string> + PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of the profile data file. This is " + "mainly for test purposes.")); + +namespace { +class PGOInstrumentationGen : public ModulePass { +public: + static char ID; + + PGOInstrumentationGen() : ModulePass(ID) { + initializePGOInstrumentationGenPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "PGOInstrumentationGenPass"; + } + +private: + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); + } +}; + +class PGOInstrumentationUse : public ModulePass { +public: + static char ID; + + // Provide the profile filename as the parameter.
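// (A hedged usage sketch for the -pgo-test-profile-file flag above; the pass
// names come from the INITIALIZE_PASS registrations below, but the exact opt
// invocation is illustrative, not taken from this change:
//   opt -pgo-instr-gen foo.bc -o foo.inst.bc
//   opt -pgo-instr-use -pgo-test-profile-file=foo.profdata foo.bc -o foo.opt.bc
// The constructor that follows accepts the same filename programmatically.)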
+ PGOInstrumentationUse(std::string Filename = "") + : ModulePass(ID), ProfileFileName(Filename) { + if (!PGOTestProfileFile.empty()) + ProfileFileName = PGOTestProfileFile; + initializePGOInstrumentationUsePass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "PGOInstrumentationUsePass"; + } + +private: + std::string ProfileFileName; + std::unique_ptr<IndexedInstrProfReader> PGOReader; + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<BlockFrequencyInfoWrapperPass>(); + } +}; +} // end anonymous namespace + +char PGOInstrumentationGen::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationGen, "pgo-instr-gen", + "PGO instrumentation.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationGen, "pgo-instr-gen", + "PGO instrumentation.", false, false) + +ModulePass *llvm::createPGOInstrumentationGenPass() { + return new PGOInstrumentationGen(); +} + +char PGOInstrumentationUse::ID = 0; +INITIALIZE_PASS_BEGIN(PGOInstrumentationUse, "pgo-instr-use", + "Read PGO instrumentation profile.", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(PGOInstrumentationUse, "pgo-instr-use", + "Read PGO instrumentation profile.", false, false) + +ModulePass *llvm::createPGOInstrumentationUsePass(StringRef Filename) { + return new PGOInstrumentationUse(Filename.str()); +} + +namespace { +/// \brief An MST-based instrumentation for PGO +/// +/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO +/// at the function level. +struct PGOEdge { + // This class implements the CFG edges. Note the CFG can be a multi-graph. + // So there might be multiple edges with the same SrcBB and DestBB. + const BasicBlock *SrcBB; + const BasicBlock *DestBB; + uint64_t Weight; + bool InMST; + bool Removed; + bool IsCritical; + PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : SrcBB(Src), DestBB(Dest), Weight(W), InMST(false), Removed(false), + IsCritical(false) {} + // Return the information string of an edge. + const std::string infoString() const { + return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") + + (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str(); + } +}; + +// This class stores the auxiliary information for each BB. +struct BBInfo { + BBInfo *Group; + uint32_t Index; + uint32_t Rank; + + BBInfo(unsigned IX) : Group(this), Index(IX), Rank(0) {} + + // Return the information string of this object. + const std::string infoString() const { + return (Twine("Index=") + Twine(Index)).str(); + } +}; + +// This class implements the instrumentation code shared by +// PGOInstrumentationGen and PGOInstrumentationUse (see the file comment). +template <class Edge, class BBInfo> class FuncPGOInstrumentation { +private: + Function &F; + void computeCFGHash(); + +public: + std::string FuncName; + GlobalVariable *FuncNameVar; + // CFG hash value for this function. + uint64_t FunctionHash; + + // The Minimum Spanning Tree of the function's CFG. + CFGMST<Edge, BBInfo> MST; + + // Given an edge, find the BB that will be instrumented. + // Return nullptr if there is no BB to be instrumented. + BasicBlock *getInstrBB(Edge *E); + + // Return the auxiliary BB information. + BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); } + + // Dump edges and BB information.
+ void dumpInfo(std::string Str = "") const { + MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " + + Twine(FunctionHash) + "\t" + Str); + } + + FuncPGOInstrumentation(Function &Func, bool CreateGlobalVar = false, + BranchProbabilityInfo *BPI = nullptr, + BlockFrequencyInfo *BFI = nullptr) + : F(Func), FunctionHash(0), MST(F, BPI, BFI) { + FuncName = getPGOFuncName(F); + computeCFGHash(); + DEBUG(dumpInfo("after CFGMST")); + + NumOfPGOBB += MST.BBInfos.size(); + for (auto &E : MST.AllEdges) { + if (E->Removed) + continue; + NumOfPGOEdge++; + if (!E->InMST) + NumOfPGOInstrument++; + } + + if (CreateGlobalVar) + FuncNameVar = createPGOFuncNameVar(F, FuncName); + }; +}; + +// Compute the hash value for the CFG: the lower 32 bits are the CRC32 of the +// index values of the BBs in the CFG. The higher 32 bits record the number of +// edges. +template <class Edge, class BBInfo> +void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { + std::vector<char> Indexes; + JamCRC JC; + for (auto &BB : F) { + const TerminatorInst *TI = BB.getTerminator(); + for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + uint32_t Index = getBBInfo(Succ).Index; + for (int J = 0; J < 4; J++) + Indexes.push_back((char)(Index >> (J * 8))); + } + } + JC.update(Indexes); + FunctionHash = (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC(); +} + +// Given a CFG edge E to be instrumented, find which BB to place the +// instrumentation code in. The function will split the critical edge if +// necessary. +template <class Edge, class BBInfo> +BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) { + if (E->InMST || E->Removed) + return nullptr; + + BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB); + BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB); + // For a fake edge, instrument the real BB. + if (SrcBB == nullptr) + return DestBB; + if (DestBB == nullptr) + return SrcBB; + + // Instrument the SrcBB if it has a single successor, + // otherwise, the DestBB if this is not a critical edge. + TerminatorInst *TI = SrcBB->getTerminator(); + if (TI->getNumSuccessors() <= 1) + return SrcBB; + if (!E->IsCritical) + return DestBB; + + // For a critical edge, we have to split. Instrument the newly + // created BB. + NumOfPGOSplit++; + DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index << " --> " + << getBBInfo(DestBB).Index << "\n"); + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum); + assert(InstrBB && "Critical edge is not split"); + + E->Removed = true; + return InstrBB; +} + +// Visit all edges and instrument the edges not in the MST. +// Critical edges will be split.
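// Each counter placed by the function below is a single intrinsic call of
// the following shape (a hedged sketch of the emitted IR; operand names are
// illustrative):
//   call void @llvm.instrprof.increment(i8* <FuncNameVar>, i64 <FunctionHash>,
//                                       i32 <NumCounters>, i32 <Index>)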
+static void instrumentOneFunc(Function &F, Module *M, + BranchProbabilityInfo *BPI, + BlockFrequencyInfo *BFI) { + unsigned NumCounters = 0; + FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, true, BPI, BFI); + for (auto &E : FuncInfo.MST.AllEdges) { + if (!E->InMST && !E->Removed) + NumCounters++; + } + + uint32_t I = 0; + for (auto &E : FuncInfo.MST.AllEdges) { + BasicBlock *InstrBB = FuncInfo.getInstrBB(E.get()); + if (!InstrBB) + continue; + + IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt()); + assert(Builder.GetInsertPoint() != InstrBB->end() && + "Cannot get the Instrumentation point"); + Type *I8PtrTy = Type::getInt8PtrTy(M->getContext()); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment), + {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); + } +} + +// This class represents a CFG edge in profile use compilation. +struct PGOUseEdge : public PGOEdge { + bool CountValid; + uint64_t CountValue; + PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : PGOEdge(Src, Dest, W), CountValid(false), CountValue(0) {} + + // Set edge count value + void setEdgeCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string for this object. + const std::string infoString() const { + if (!CountValid) + return PGOEdge::infoString(); + return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +typedef SmallVector<PGOUseEdge *, 2> DirectEdges; + +// This class stores the auxiliary information for each BB. +struct UseBBInfo : public BBInfo { + uint64_t CountValue; + bool CountValid; + int32_t UnknownCountInEdge; + int32_t UnknownCountOutEdge; + DirectEdges InEdges; + DirectEdges OutEdges; + UseBBInfo(unsigned IX) + : BBInfo(IX), CountValue(0), CountValid(false), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + UseBBInfo(unsigned IX, uint64_t C) + : BBInfo(IX), CountValue(C), CountValid(true), UnknownCountInEdge(0), + UnknownCountOutEdge(0) {} + + // Set the profile count value for this BB. + void setBBInfoCount(uint64_t Value) { + CountValue = Value; + CountValid = true; + } + + // Return the information string of this object. + const std::string infoString() const { + if (!CountValid) + return BBInfo::infoString(); + return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str(); + } +}; + +// Sum up the count values for all the edges. +static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) { + uint64_t Total = 0; + for (auto &E : Edges) { + if (E->Removed) + continue; + Total += E->CountValue; + } + return Total; +} + +class PGOUseFunc { +private: + Function &F; + Module *M; + // This member stores the shared information with class PGOGenFunc. + FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo; + + // Return the auxiliary BB information. + UseBBInfo &getBBInfo(const BasicBlock *BB) const { + return FuncInfo.getBBInfo(BB); + } + + // The maximum count value in the profile. This is only used in PGO use + // compilation. + uint64_t ProgramMaxCount; + + // Find the Instrumented BB and set the value. + void setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile); + + // Set the edge counter value for the unknown edge -- there should be only + // one unknown edge. 
+ void setEdgeCount(DirectEdges &Edges, uint64_t Value); + + // Return the FuncName string. + const std::string getFuncName() const { return FuncInfo.FuncName; } + + // Set the hot/cold inline hints based on the count values. + // FIXME: This function should be removed once the functionality in + // the inliner is implemented. + void applyFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { + if (ProgramMaxCount == 0) + return; + // Threshold of the hot functions. + const BranchProbability HotFunctionThreshold(1, 100); + // Threshold of the cold functions. + const BranchProbability ColdFunctionThreshold(2, 10000); + if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount)) + F.addFnAttr(llvm::Attribute::InlineHint); + else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount)) + F.addFnAttr(llvm::Attribute::Cold); + } + +public: + PGOUseFunc(Function &Func, Module *Modu, BranchProbabilityInfo *BPI = nullptr, + BlockFrequencyInfo *BFI = nullptr) + : F(Func), M(Modu), FuncInfo(Func, false, BPI, BFI) {} + + // Read counts for the instrumented BB from the profile. + bool readCounters(IndexedInstrProfReader *PGOReader); + + // Populate the counts for all BBs. + void populateCounters(); + + // Set the branch weights based on the count values. + void setBranchWeights(); +}; + +// Visit all the edges and assign the count values for the instrumented +// edges and the BBs. +void PGOUseFunc::setInstrumentedCounts( + const std::vector<uint64_t> &CountFromProfile) { + + // Use a worklist as we will update the vector during the iteration. + std::vector<PGOUseEdge *> WorkList; + for (auto &E : FuncInfo.MST.AllEdges) + WorkList.push_back(E.get()); + + uint32_t I = 0; + for (auto &E : WorkList) { + BasicBlock *InstrBB = FuncInfo.getInstrBB(E); + if (!InstrBB) + continue; + uint64_t CountValue = CountFromProfile[I++]; + if (!E->Removed) { + getBBInfo(InstrBB).setBBInfoCount(CountValue); + E->setEdgeCount(CountValue); + continue; + } + + // Need to add two new edges. + BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB); + BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB); + // Add new edge of SrcBB->InstrBB. + PGOUseEdge &NewEdge = FuncInfo.MST.addEdge(SrcBB, InstrBB, 0); + NewEdge.setEdgeCount(CountValue); + // Add new edge of InstrBB->DestBB. + PGOUseEdge &NewEdge1 = FuncInfo.MST.addEdge(InstrBB, DestBB, 0); + NewEdge1.setEdgeCount(CountValue); + NewEdge1.InMST = true; + getBBInfo(InstrBB).setBBInfoCount(CountValue); + } +} + +// Set the count value for the unknown edge. There should be one and only one +// unknown edge in the Edges vector. +void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) { + for (auto &E : Edges) { + if (E->CountValid) + continue; + E->setEdgeCount(Value); + + getBBInfo(E->SrcBB).UnknownCountOutEdge--; + getBBInfo(E->DestBB).UnknownCountInEdge--; + return; + } + llvm_unreachable("Cannot find the unknown count edge"); +} + +// Read the profile from ProfileFileName and assign the values to the +// instrumented BBs and edges. This function also updates ProgramMaxCount. +// Return true if the profile is successfully read, and false on errors.
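// (A hedged worked example, with invented numbers, for the thresholds in
// applyFunctionAttributes above: if ProgramMaxCount = 1,000,000, a function
// gets inlinehint when EntryCount >= 1/100 * 1,000,000 = 10,000, and cold
// when MaxCount <= 2/10000 * 1,000,000 = 200.)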
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) { + auto &Ctx = M->getContext(); + ErrorOr<InstrProfRecord> Result = + PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash); + if (std::error_code EC = Result.getError()) { + if (EC == instrprof_error::unknown_function) + NumOfPGOMissing++; + else if (EC == instrprof_error::hash_mismatch || + EC == llvm::instrprof_error::malformed) + NumOfPGOMismatch++; + + std::string Msg = EC.message() + std::string(" ") + F.getName().str(); + Ctx.diagnose( + DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning)); + return false; + } + std::vector<uint64_t> &CountFromProfile = Result.get().Counts; + + NumOfPGOFunc++; + DEBUG(dbgs() << CountFromProfile.size() << " counts\n"); + uint64_t ValueSum = 0; + for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) { + DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n"); + ValueSum += CountFromProfile[I]; + } + + DEBUG(dbgs() << "SUM = " << ValueSum << "\n"); + + getBBInfo(nullptr).UnknownCountOutEdge = 2; + getBBInfo(nullptr).UnknownCountInEdge = 2; + + setInstrumentedCounts(CountFromProfile); + ProgramMaxCount = PGOReader->getMaximumFunctionCount(); + return true; +} + +// Populate the counters from instrumented BBs to all BBs. +// At the end of this operation, all BBs should have a valid count value. +void PGOUseFunc::populateCounters() { + // First, set up the Count variable for all BBs. + for (auto &E : FuncInfo.MST.AllEdges) { + if (E->Removed) + continue; + + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + UseBBInfo &SrcInfo = getBBInfo(SrcBB); + UseBBInfo &DestInfo = getBBInfo(DestBB); + SrcInfo.OutEdges.push_back(E.get()); + DestInfo.InEdges.push_back(E.get()); + SrcInfo.UnknownCountOutEdge++; + DestInfo.UnknownCountInEdge++; + + if (!E->CountValid) + continue; + DestInfo.UnknownCountInEdge--; + SrcInfo.UnknownCountOutEdge--; + } + + bool Changes = true; + unsigned NumPasses = 0; + while (Changes) { + NumPasses++; + Changes = false; + + // For efficient traversal, it's better to start from the end as most + // of the instrumented edges are at the end. + for (auto &BB : reverse(F)) { + UseBBInfo &Count = getBBInfo(&BB); + if (!Count.CountValid) { + if (Count.UnknownCountOutEdge == 0) { + Count.CountValue = sumEdgeCount(Count.OutEdges); + Count.CountValid = true; + Changes = true; + } else if (Count.UnknownCountInEdge == 0) { + Count.CountValue = sumEdgeCount(Count.InEdges); + Count.CountValid = true; + Changes = true; + } + } + if (Count.CountValid) { + if (Count.UnknownCountOutEdge == 1) { + uint64_t Total = Count.CountValue - sumEdgeCount(Count.OutEdges); + setEdgeCount(Count.OutEdges, Total); + Changes = true; + } + if (Count.UnknownCountInEdge == 1) { + uint64_t Total = Count.CountValue - sumEdgeCount(Count.InEdges); + setEdgeCount(Count.InEdges, Total); + Changes = true; + } + } + } + } + + DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n"); + // Assert every BB has a valid counter. + uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue; + uint64_t FuncMaxCount = FuncEntryCount; + for (auto &BB : F) { + assert(getBBInfo(&BB).CountValid && "BB count is not valid"); + uint64_t Count = getBBInfo(&BB).CountValue; + if (Count > FuncMaxCount) + FuncMaxCount = Count; + } + applyFunctionAttributes(FuncEntryCount, FuncMaxCount); + + DEBUG(FuncInfo.dumpInfo("after reading profile.")); +} + +// Assign the scaled count values to the BBs with multiple out edges.
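// (A hedged example of the scaling below, assuming calculateCountScale
// chooses a divisor that fits MaxCount into 32 bits: out-edge counts of
// 6,000,000,000 and 2,000,000,000 do not fit in a uint32_t, so both are
// divided by the same scale before being recorded as !prof branch weights,
// preserving the 3:1 ratio the optimizers consume.)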
+void PGOUseFunc::setBranchWeights() { + // Generate MD_prof metadata for every branch instruction. + DEBUG(dbgs() << "\nSetting branch weights.\n"); + MDBuilder MDB(M->getContext()); + for (auto &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (TI->getNumSuccessors() < 2) + continue; + if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) + continue; + if (getBBInfo(&BB).CountValue == 0) + continue; + + // We have a non-zero Branch BB. + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + unsigned Size = BBCountInfo.OutEdges.size(); + SmallVector<unsigned, 2> EdgeCounts(Size, 0); + uint64_t MaxCount = 0; + for (unsigned s = 0; s < Size; s++) { + const PGOUseEdge *E = BBCountInfo.OutEdges[s]; + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + if (DestBB == 0) + continue; + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + uint64_t EdgeCount = E->CountValue; + if (EdgeCount > MaxCount) + MaxCount = EdgeCount; + EdgeCounts[SuccNum] = EdgeCount; + } + assert(MaxCount > 0 && "Bad max count"); + uint64_t Scale = calculateCountScale(MaxCount); + SmallVector<unsigned, 4> Weights; + for (const auto &ECI : EdgeCounts) + Weights.push_back(scaleBranchCount(ECI, Scale)); + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + DEBUG(dbgs() << "Weight is: "; + for (const auto &W : Weights) { dbgs() << W << " "; } + dbgs() << "\n";); + } +} +} // end anonymous namespace + +bool PGOInstrumentationGen::runOnModule(Module &M) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); + instrumentOneFunc(F, &M, BPI, BFI); + } + return true; +} + +static void setPGOCountOnFunc(PGOUseFunc &Func, + IndexedInstrProfReader *PGOReader) { + if (Func.readCounters(PGOReader)) { + Func.populateCounters(); + Func.setBranchWeights(); + } +} + +bool PGOInstrumentationUse::runOnModule(Module &M) { + DEBUG(dbgs() << "Read in profile counters: "); + auto &Ctx = M.getContext(); + // Read the counter array from file. 
+ auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName); + if (std::error_code EC = ReaderOrErr.getError()) { + Ctx.diagnose( + DiagnosticInfoPGOProfile(ProfileFileName.data(), EC.message())); + return false; + } + + PGOReader = std::move(ReaderOrErr.get()); + if (!PGOReader) { + Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(), + "Cannot get PGOReader")); + return false; + } + + for (auto &F : M) { + if (F.isDeclaration()) + continue; + BranchProbabilityInfo *BPI = + &(getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI()); + BlockFrequencyInfo *BFI = + &(getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI()); + PGOUseFunc Func(F, &M, BPI, BFI); + setPGOCountOnFunc(Func, PGOReader.get()); + } + return true; +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp index 6b185a2..abed465 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp @@ -18,8 +18,9 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -37,6 +38,8 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_os_ostream.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -44,6 +47,17 @@ using namespace llvm; #define DEBUG_TYPE "safestack" +enum UnsafeStackPtrStorageVal { ThreadLocalUSP, SingleThreadUSP }; + +static cl::opt<UnsafeStackPtrStorageVal> USPStorage("safe-stack-usp-storage", + cl::Hidden, cl::init(ThreadLocalUSP), + cl::desc("Type of storage for the unsafe stack pointer"), + cl::values(clEnumValN(ThreadLocalUSP, "thread-local", + "Thread-local storage"), + clEnumValN(SingleThreadUSP, "single-thread", + "Non-thread-local storage"), + clEnumValEnd)); + namespace llvm { STATISTIC(NumFunctions, "Total number of functions"); @@ -54,118 +68,48 @@ STATISTIC(NumUnsafeStackRestorePointsFunctions, STATISTIC(NumAllocas, "Total number of allocas"); STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas"); STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments"); STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); } // namespace llvm namespace { -/// Check whether a given alloca instruction (AI) should be put on the safe -/// stack or not. The function analyzes all uses of AI and checks whether it is -/// only accessed in a memory safe way (as decided statically). -bool IsSafeStackAlloca(const AllocaInst *AI) { - // Go through all uses of this alloca and check whether all accesses to the - // allocated object are statically known to be memory safe and, hence, the - // object can be placed on the safe stack. - - SmallPtrSet<const Value *, 16> Visited; - SmallVector<const Instruction *, 8> WorkList; - WorkList.push_back(AI); +/// Rewrite an SCEV expression for a memory access address to an expression that +/// represents offset from the given alloca. 
+/// +/// The implementation simply replaces all mentions of the alloca with zero. +class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> { + const Value *AllocaPtr; - // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. - while (!WorkList.empty()) { - const Instruction *V = WorkList.pop_back_val(); - for (const Use &UI : V->uses()) { - auto I = cast<const Instruction>(UI.getUser()); - assert(V == UI.get()); - - switch (I->getOpcode()) { - case Instruction::Load: - // Loading from a pointer is safe. - break; - case Instruction::VAArg: - // "va-arg" from a pointer is safe. - break; - case Instruction::Store: - if (V == I->getOperand(0)) - // Stored the pointer - conservatively assume it may be unsafe. - return false; - // Storing to the pointee is safe. - break; - - case Instruction::GetElementPtr: - if (!cast<const GetElementPtrInst>(I)->hasAllConstantIndices()) - // GEP with non-constant indices can lead to memory errors. - // This also applies to inbounds GEPs, as the inbounds attribute - // represents an assumption that the address is in bounds, rather than - // an assertion that it is. - return false; - - // We assume that GEP on static alloca with constant indices is safe, - // otherwise a compiler would detect it and warn during compilation. - - if (!isa<const ConstantInt>(AI->getArraySize())) - // However, if the array size itself is not constant, the access - // might still be unsafe at runtime. - return false; - - /* fallthrough */ - - case Instruction::BitCast: - case Instruction::IntToPtr: - case Instruction::PHI: - case Instruction::PtrToInt: - case Instruction::Select: - // The object can be safe or not, depending on how the result of the - // instruction is used. - if (Visited.insert(I).second) - WorkList.push_back(cast<const Instruction>(I)); - break; - - case Instruction::Call: - case Instruction::Invoke: { - // FIXME: add support for memset and memcpy intrinsics. - ImmutableCallSite CS(I); - - // LLVM 'nocapture' attribute is only set for arguments whose address - // is not stored, passed around, or used in any other non-trivial way. - // We assume that passing a pointer to an object as a 'nocapture' - // argument is safe. - // FIXME: a more precise solution would require an interprocedural - // analysis here, which would look at all uses of an argument inside - // the function being called. - ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) - if (A->get() == V && !CS.doesNotCapture(A - B)) - // The parameter is not marked 'nocapture' - unsafe. - return false; - continue; - } +public: + AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr) + : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {} - default: - // The object is unsafe if it is used in any other way. - return false; - } - } + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + if (Expr->getValue() == AllocaPtr) + return SE.getZero(Expr->getType()); + return Expr; } +}; - // All uses of the alloca are safe, we can place it on the safe stack. - return true; -} - -/// The SafeStack pass splits the stack of each function into the -/// safe stack, which is only accessed through memory safe dereferences -/// (as determined statically), and the unsafe stack, which contains all -/// local variables that are accessed in unsafe ways. 
+/// The SafeStack pass splits the stack of each function into the safe +/// stack, which is only accessed through memory safe dereferences (as +/// determined statically), and the unsafe stack, which contains all +/// local variables that are accessed in ways that we can't prove to +/// be safe. class SafeStack : public FunctionPass { + const TargetMachine *TM; + const TargetLoweringBase *TL; const DataLayout *DL; + ScalarEvolution *SE; Type *StackPtrTy; Type *IntPtrTy; Type *Int32Ty; Type *Int8Ty; - Constant *UnsafeStackPtr = nullptr; + Value *UnsafeStackPtr = nullptr; /// Unsafe stack alignment. Each stack frame must ensure that the stack is /// aligned to this value. We need to re-align the unsafe stack if the @@ -175,26 +119,31 @@ class SafeStack : public FunctionPass { /// might expect to appear on the stack on most common targets. enum { StackAlignment = 16 }; - /// \brief Build a constant representing a pointer to the unsafe stack - /// pointer. - Constant *getOrCreateUnsafeStackPtr(Module &M); + /// \brief Build a value representing a pointer to the unsafe stack pointer. + Value *getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F); /// \brief Find all static allocas, dynamic allocas, return instructions and /// stack restore points (exception unwind blocks and setjmp calls) in the /// given function and append them to the respective vectors. void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas, SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<Argument *> &ByValArguments, SmallVectorImpl<ReturnInst *> &Returns, SmallVectorImpl<Instruction *> &StackRestorePoints); + /// \brief Calculate the allocation size of a given alloca. Returns 0 if the + /// size can not be statically determined. + uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI); + /// \brief Allocate space for all static allocas in \p StaticAllocas, /// replace allocas with pointers into the unsafe stack and generate code to /// restore the stack pointer before all return instructions in \p Returns. /// /// \returns A pointer to the top of the unsafe stack after all unsafe static /// allocas are allocated. - Value *moveStaticAllocasToUnsafeStack(Function &F, + Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns); /// \brief Generate code to restore the stack after all stack restore points @@ -203,7 +152,7 @@ class SafeStack : public FunctionPass { /// \returns A local variable in which to maintain the dynamic top of the /// unsafe stack if needed. AllocaInst * - createStackRestorePoints(Function &F, + createStackRestorePoints(IRBuilder<> &IRB, Function &F, ArrayRef<Instruction *> StackRestorePoints, Value *StaticTop, bool NeedDynamicTop); @@ -214,17 +163,26 @@ class SafeStack : public FunctionPass { AllocaInst *DynamicTop, ArrayRef<AllocaInst *> DynamicAllocas); + bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize); + + bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, + const Value *AllocaPtr, uint64_t AllocaSize); + bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr, + uint64_t AllocaSize); + public: static char ID; // Pass identification, replacement for typeid. 
- SafeStack() : FunctionPass(ID), DL(nullptr) { + SafeStack(const TargetMachine *TM) + : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) { initializeSafeStackPass(*PassRegistry::getPassRegistry()); } + SafeStack() : SafeStack(nullptr) {} - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); } - virtual bool doInitialization(Module &M) { + bool doInitialization(Module &M) override { DL = &M.getDataLayout(); StackPtrTy = Type::getInt8PtrTy(M.getContext()); @@ -235,51 +193,203 @@ public: return false; } - bool runOnFunction(Function &F); - + bool runOnFunction(Function &F) override; }; // class SafeStack -Constant *SafeStack::getOrCreateUnsafeStackPtr(Module &M) { - // The unsafe stack pointer is stored in a global variable with a magic name. - const char *kUnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; +uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { + uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + auto C = dyn_cast<ConstantInt>(AI->getArraySize()); + if (!C) + return 0; + Size *= C->getZExtValue(); + } + return Size; +} + +bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, + const Value *AllocaPtr, uint64_t AllocaSize) { + AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); + + uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); + ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); + ConstantRange SizeRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); + ConstantRange AccessRange = AccessStartRange.add(SizeRange); + ConstantRange AllocaRange = + ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize)); + bool Safe = AllocaRange.contains(AccessRange); + + DEBUG(dbgs() << "[SafeStack] " + << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ") + << *AllocaPtr << "\n" + << " Access " << *Addr << "\n" + << " SCEV " << *Expr + << " U: " << SE->getUnsignedRange(Expr) + << ", S: " << SE->getSignedRange(Expr) << "\n" + << " Range " << AccessRange << "\n" + << " AllocaRange " << AllocaRange << "\n" + << " " << (Safe ? "safe" : "unsafe") << "\n"); + + return Safe; +} + +bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U, + const Value *AllocaPtr, + uint64_t AllocaSize) { + // All MemIntrinsics have destination address in Arg0 and size in Arg2. + if (MI->getRawDest() != U) return true; + const auto *Len = dyn_cast<ConstantInt>(MI->getLength()); + // Non-constant size => unsafe. FIXME: try SCEV getRange. + if (!Len) return false; + return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize); +} + +/// Check whether a given allocation must be put on the safe +/// stack or not. The function analyzes all uses of AI and checks whether it is +/// only accessed in a memory safe way (as decided statically). +bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { + // Go through all uses of this alloca and check whether all accesses to the + // allocated object are statically known to be memory safe and, hence, the + // object can be placed on the safe stack. + SmallPtrSet<const Value *, 16> Visited; + SmallVector<const Value *, 8> WorkList; + WorkList.push_back(AllocaPtr); + + // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. 
+ while (!WorkList.empty()) { + const Value *V = WorkList.pop_back_val(); + for (const Use &UI : V->uses()) { + auto I = cast<const Instruction>(UI.getUser()); + assert(V == UI.get()); + + switch (I->getOpcode()) { + case Instruction::Load: { + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + AllocaSize)) + return false; + break; + } + case Instruction::VAArg: + // "va-arg" from a pointer is safe. + break; + case Instruction::Store: { + if (V == I->getOperand(0)) { + // Stored the pointer - conservatively assume it may be unsafe. + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n store of address: " << *I << "\n"); + return false; + } + + if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + AllocaPtr, AllocaSize)) + return false; + break; + } + case Instruction::Ret: { + // Information leak. + return false; + } + + case Instruction::Call: + case Instruction::Invoke: { + ImmutableCallSite CS(I); + + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) + continue; + } + + if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { + if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n unsafe memintrinsic: " << *I + << "\n"); + return false; + } + continue; + } + // LLVM 'nocapture' attribute is only set for arguments whose address + // is not stored, passed around, or used in any other non-trivial way. + // We assume that passing a pointer to an object as a 'nocapture + // readnone' argument is safe. + // FIXME: a more precise solution would require an interprocedural + // analysis here, which would look at all uses of an argument inside + // the function being called. + ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); + for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) + if (A->get() == V) + if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) || + CS.doesNotAccessMemory()))) { + DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr + << "\n unsafe call: " << *I << "\n"); + return false; + } + continue; + } + + default: + if (Visited.insert(I).second) + WorkList.push_back(cast<const Instruction>(I)); + } + } + } + + // All uses of the alloca are safe, we can place it on the safe stack. + return true; +} + +Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) { + // Check if there is a target-specific location for the unsafe stack pointer. + if (TL) + if (Value *V = TL->getSafeStackPointerLocation(IRB)) + return V; + + // Otherwise, assume the target links with compiler-rt, which provides a + // thread-local variable with a magic name. + Module &M = *F.getParent(); + const char *UnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; auto UnsafeStackPtr = - dyn_cast_or_null<GlobalVariable>(M.getNamedValue(kUnsafeStackPtrVar)); + dyn_cast_or_null<GlobalVariable>(M.getNamedValue(UnsafeStackPtrVar)); + + bool UseTLS = USPStorage == ThreadLocalUSP; if (!UnsafeStackPtr) { + auto TLSModel = UseTLS ? + GlobalValue::InitialExecTLSModel : + GlobalValue::NotThreadLocal; // The global variable is not defined yet, define it ourselves. - // We use the initial-exec TLS model because we do not support the variable - // living anywhere other than in the main executable. 
+ // We use the initial-exec TLS model because we do not support the + // variable living anywhere other than in the main executable. UnsafeStackPtr = new GlobalVariable( - /*Module=*/M, /*Type=*/StackPtrTy, - /*isConstant=*/false, /*Linkage=*/GlobalValue::ExternalLinkage, - /*Initializer=*/0, /*Name=*/kUnsafeStackPtrVar, - /*InsertBefore=*/nullptr, - /*ThreadLocalMode=*/GlobalValue::InitialExecTLSModel); + M, StackPtrTy, false, GlobalValue::ExternalLinkage, nullptr, + UnsafeStackPtrVar, nullptr, TLSModel); } else { // The variable exists, check its type and attributes. - if (UnsafeStackPtr->getValueType() != StackPtrTy) { - report_fatal_error(Twine(kUnsafeStackPtrVar) + " must have void* type"); - } - - if (!UnsafeStackPtr->isThreadLocal()) { - report_fatal_error(Twine(kUnsafeStackPtrVar) + " must be thread-local"); - } + if (UnsafeStackPtr->getValueType() != StackPtrTy) + report_fatal_error(Twine(UnsafeStackPtrVar) + " must have void* type"); + if (UseTLS != UnsafeStackPtr->isThreadLocal()) + report_fatal_error(Twine(UnsafeStackPtrVar) + " must " + + (UseTLS ? "" : "not ") + "be thread-local"); } - return UnsafeStackPtr; } void SafeStack::findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas, SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<Argument *> &ByValArguments, SmallVectorImpl<ReturnInst *> &Returns, SmallVectorImpl<Instruction *> &StackRestorePoints) { - for (Instruction &I : inst_range(&F)) { + for (Instruction &I : instructions(&F)) { if (auto AI = dyn_cast<AllocaInst>(&I)) { ++NumAllocas; - if (IsSafeStackAlloca(AI)) + uint64_t Size = getStaticAllocaAllocationSize(AI); + if (IsSafeStackAlloca(AI, Size)) continue; if (AI->isStaticAlloca()) { @@ -304,19 +414,26 @@ void SafeStack::findInsts(Function &F, "gcroot intrinsic not compatible with safestack attribute"); } } + for (Argument &Arg : F.args()) { + if (!Arg.hasByValAttr()) + continue; + uint64_t Size = + DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + if (IsSafeStackAlloca(&Arg, Size)) + continue; + + ++NumUnsafeByValArguments; + ByValArguments.push_back(&Arg); + } } AllocaInst * -SafeStack::createStackRestorePoints(Function &F, +SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F, ArrayRef<Instruction *> StackRestorePoints, Value *StaticTop, bool NeedDynamicTop) { if (StackRestorePoints.empty()) return nullptr; - IRBuilder<> IRB(StaticTop - ? cast<Instruction>(StaticTop)->getNextNode() - : (Instruction *)F.getEntryBlock().getFirstInsertionPt()); - // We need the current value of the shadow stack pointer to restore // after longjmp or exception catching. @@ -342,7 +459,7 @@ SafeStack::createStackRestorePoints(Function &F, for (Instruction *I : StackRestorePoints) { ++NumUnsafeStackRestorePoints; - IRB.SetInsertPoint(cast<Instruction>(I->getNextNode())); + IRB.SetInsertPoint(I->getNextNode()); Value *CurrentTop = DynamicTop ? 
IRB.CreateLoad(DynamicTop) : StaticTop; IRB.CreateStore(CurrentTop, UnsafeStackPtr); } @@ -350,14 +467,12 @@ SafeStack::createStackRestorePoints(Function &F, return DynamicTop; } -Value * -SafeStack::moveStaticAllocasToUnsafeStack(Function &F, - ArrayRef<AllocaInst *> StaticAllocas, - ArrayRef<ReturnInst *> Returns) { - if (StaticAllocas.empty()) +Value *SafeStack::moveStaticAllocasToUnsafeStack( + IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns) { + if (StaticAllocas.empty() && ByValArguments.empty()) return nullptr; - IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt()); DIBuilder DIB(*F.getParent()); // We explicitly compute and set the unsafe stack layout for all unsafe @@ -377,6 +492,13 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, // Compute maximum alignment among static objects on the unsafe stack. unsigned MaxAlignment = 0; + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + if (Align > MaxAlignment) + MaxAlignment = Align; + } for (AllocaInst *AI : StaticAllocas) { Type *Ty = AI->getAllocatedType(); unsigned Align = @@ -388,22 +510,51 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, if (MaxAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. assert(isPowerOf2_32(MaxAlignment)); - IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast<Instruction>(IRB.CreateIntToPtr( IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))), StackPtrTy)); } - // Allocate space for every unsafe static AllocaInst on the unsafe stack. int64_t StaticOffset = 0; // Current stack top. + IRB.SetInsertPoint(BasePointer->getNextNode()); + + for (Argument *Arg : ByValArguments) { + Type *Ty = Arg->getType()->getPointerElementType(); + + uint64_t Size = DL->getTypeStoreSize(Ty); + if (Size == 0) + Size = 1; // Don't create zero-sized stack objects. + + // Ensure the object is properly aligned. + unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + Arg->getParamAlignment()); + + // Add alignment. + // NOTE: we ensure that BasePointer itself is aligned to >= Align. + StaticOffset += Size; + StaticOffset = RoundUpToAlignment(StaticOffset, Align); + + Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* + ConstantInt::get(Int32Ty, -StaticOffset)); + Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(), + Arg->getName() + ".unsafe-byval"); + + // Replace alloc with the new location. + replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB, + /*Deref=*/true, -StaticOffset); + Arg->replaceAllUsesWith(NewArg); + IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode()); + IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment()); + } + + // Allocate space for every unsafe static AllocaInst on the unsafe stack. for (AllocaInst *AI : StaticAllocas) { IRB.SetInsertPoint(AI); - auto CArraySize = cast<ConstantInt>(AI->getArraySize()); Type *Ty = AI->getAllocatedType(); - - uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); + uint64_t Size = getStaticAllocaAllocationSize(AI); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. 
@@ -423,7 +574,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, cast<Instruction>(NewAI)->takeName(AI); // Replace alloc with the new location. - replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true); + replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -StaticOffset); AI->replaceAllUsesWith(NewAI); AI->eraseFromParent(); } @@ -434,7 +585,7 @@ SafeStack::moveStaticAllocasToUnsafeStack(Function &F, StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment); // Update shadow stack pointer in the function epilogue. - IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + IRB.SetInsertPoint(BasePointer->getNextNode()); Value *StaticTop = IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), @@ -478,7 +629,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( if (DynamicTop) IRB.CreateStore(NewTop, DynamicTop); - Value *NewAI = IRB.CreateIntToPtr(SP, AI->getType()); + Value *NewAI = IRB.CreatePointerCast(NewTop, AI->getType()); if (AI->hasName() && isa<Instruction>(NewAI)) NewAI->takeName(AI); @@ -513,8 +664,6 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( } bool SafeStack::runOnFunction(Function &F) { - auto AA = &getAnalysis<AliasAnalysis>(); - DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n"); if (!F.hasFnAttribute(Attribute::SafeStack)) { @@ -529,6 +678,9 @@ bool SafeStack::runOnFunction(Function &F) { return false; } + TL = TM ? TM->getSubtargetImpl(F)->getTargetLowering() : nullptr; + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + { // Make sure the regular stack protector won't run on this function // (safestack attribute takes precedence). @@ -541,16 +693,11 @@ bool SafeStack::runOnFunction(Function &F) { AttributeSet::get(F.getContext(), AttributeSet::FunctionIndex, B)); } - if (AA->onlyReadsMemory(&F)) { - // XXX: we don't protect against information leak attacks for now. - DEBUG(dbgs() << "[SafeStack] function only reads memory\n"); - return false; - } - ++NumFunctions; SmallVector<AllocaInst *, 16> StaticAllocas; SmallVector<AllocaInst *, 4> DynamicAllocas; + SmallVector<Argument *, 4> ByValArguments; SmallVector<ReturnInst *, 4> Returns; // Collect all points where stack gets unwound and needs to be restored @@ -562,23 +709,26 @@ bool SafeStack::runOnFunction(Function &F) { // Find all static and dynamic alloca instructions that must be moved to the // unsafe stack, all return instructions and stack restore points. - findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); + findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns, + StackRestorePoints); if (StaticAllocas.empty() && DynamicAllocas.empty() && - StackRestorePoints.empty()) + ByValArguments.empty() && StackRestorePoints.empty()) return false; // Nothing to do in this function. - if (!StaticAllocas.empty() || !DynamicAllocas.empty()) + if (!StaticAllocas.empty() || !DynamicAllocas.empty() || + !ByValArguments.empty()) ++NumUnsafeStackFunctions; // This function has the unsafe stack. if (!StackRestorePoints.empty()) ++NumUnsafeStackRestorePointsFunctions; - if (!UnsafeStackPtr) - UnsafeStackPtr = getOrCreateUnsafeStackPtr(*F.getParent()); + IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt()); + UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F); // The top of the unsafe stack after all unsafe static allocas are allocated. 
- Value *StaticTop = moveStaticAllocasToUnsafeStack(F, StaticAllocas, Returns); + Value *StaticTop = moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, + ByValArguments, Returns); // Safe stack object that stores the current unsafe stack top. It is updated // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. @@ -587,7 +737,7 @@ bool SafeStack::runOnFunction(Function &F) { // FIXME: a better alternative might be to store the unsafe stack pointer // before setjmp / invoke instructions. AllocaInst *DynamicTop = createStackRestorePoints( - F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); + IRB, F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); // Handle dynamic allocas. moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop, @@ -597,13 +747,14 @@ bool SafeStack::runOnFunction(Function &F) { return true; } -} // end anonymous namespace +} // anonymous namespace char SafeStack::ID = 0; -INITIALIZE_PASS_BEGIN(SafeStack, "safe-stack", - "Safe Stack instrumentation pass", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(SafeStack, "safe-stack", "Safe Stack instrumentation pass", - false, false) +INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack", + "Safe Stack instrumentation pass", false, false) +INITIALIZE_TM_PASS_END(SafeStack, "safe-stack", + "Safe Stack instrumentation pass", false, false) -FunctionPass *llvm::createSafeStackPass() { return new SafeStack(); } +FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) { + return new SafeStack(TM); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 7a5b4cb..09de7a2 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -31,6 +31,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -59,6 +60,7 @@ static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16"; static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter"; static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block"; static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp"; +static const char *const kSanCovTraceSwitch = "__sanitizer_cov_trace_switch"; static const char *const kSanCovModuleCtorName = "sancov.module_ctor"; static const uint64_t kSanCtorAndDtorPriority = 2; @@ -148,19 +150,25 @@ class SanitizerCoverageModule : public ModulePass { void InjectCoverageForIndirectCalls(Function &F, ArrayRef<Instruction *> IndirCalls); void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); + void InjectTraceForSwitch(Function &F, + ArrayRef<Instruction *> SwitchTraceTargets); bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks); void SetNoSanitizeMetadata(Instruction *I); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); unsigned NumberOfInstrumentedBlocks() { - return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); + return SanCovFunction->getNumUses() + + SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() + + SanCovTraceEnter->getNumUses(); } Function *SanCovFunction; Function *SanCovWithCheckFunction; Function 
*SanCovIndirCallFunction; Function *SanCovTraceEnter, *SanCovTraceBB; Function *SanCovTraceCmpFunction; + Function *SanCovTraceSwitchFunction; InlineAsm *EmptyAsm; - Type *IntptrTy, *Int64Ty; + Type *IntptrTy, *Int64Ty, *Int64PtrTy; + Module *CurModule; LLVMContext *C; const DataLayout *DL; @@ -177,11 +185,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { return false; C = &(M.getContext()); DL = &M.getDataLayout(); + CurModule = &M; IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits()); Type *VoidTy = Type::getVoidTy(*C); IRBuilder<> IRB(*C); Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty()); Int64Ty = IRB.getInt64Ty(); SanCovFunction = checkSanitizerInterfaceFunction( @@ -194,18 +204,19 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { SanCovTraceCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( kSanCovTraceCmp, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr)); + SanCovTraceSwitchFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kSanCovTraceSwitch, VoidTy, Int64Ty, Int64PtrTy, nullptr)); // We insert an empty inline asm after cov callbacks to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), StringRef(""), StringRef(""), /*hasSideEffects=*/true); - if (Options.TraceBB) { - SanCovTraceEnter = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); - SanCovTraceBB = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); - } + SanCovTraceEnter = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); + SanCovTraceBB = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); // At this point we create a dummy array of guards because we don't // know how many elements we will need. @@ -280,11 +291,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { if (F.empty()) return false; if (F.getName().find(".module_ctor") != std::string::npos) return false; // Should not instrument sanitizer init functions. + // Don't instrument functions using SEH for now. Splitting basic blocks like + // we do for coverage breaks WinEHPrepare. + // FIXME: Remove this when SEH no longer uses landingpad pattern matching. 
+ if (F.hasPersonalityFn() && + isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) + return false; if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) SplitAllCriticalEdges(F); SmallVector<Instruction*, 8> IndirCalls; SmallVector<BasicBlock*, 16> AllBlocks; SmallVector<Instruction*, 8> CmpTraceTargets; + SmallVector<Instruction*, 8> SwitchTraceTargets; for (auto &BB : F) { AllBlocks.push_back(&BB); for (auto &Inst : BB) { @@ -293,13 +311,18 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { if (CS && !CS.getCalledFunction()) IndirCalls.push_back(&Inst); } - if (Options.TraceCmp && isa<ICmpInst>(&Inst)) - CmpTraceTargets.push_back(&Inst); + if (Options.TraceCmp) { + if (isa<ICmpInst>(&Inst)) + CmpTraceTargets.push_back(&Inst); + if (isa<SwitchInst>(&Inst)) + SwitchTraceTargets.push_back(&Inst); + } } } InjectCoverage(F, AllBlocks); InjectCoverageForIndirectCalls(F, IndirCalls); InjectTraceForCmp(F, CmpTraceTargets); + InjectTraceForSwitch(F, SwitchTraceTargets); return true; } @@ -348,6 +371,45 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( } } +// For every switch statement we insert a call: +// __sanitizer_cov_trace_switch(CondValue, +// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... }) + +void SanitizerCoverageModule::InjectTraceForSwitch( + Function &F, ArrayRef<Instruction *> SwitchTraceTargets) { + for (auto I : SwitchTraceTargets) { + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + IRBuilder<> IRB(I); + SmallVector<Constant *, 16> Initializers; + Value *Cond = SI->getCondition(); + if (Cond->getType()->getScalarSizeInBits() > + Int64Ty->getScalarSizeInBits()) + continue; + Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases())); + Initializers.push_back( + ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits())); + if (Cond->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + Cond = IRB.CreateIntCast(Cond, Int64Ty, false); + for (auto It: SI->cases()) { + Constant *C = It.getCaseValue(); + if (C->getType()->getScalarSizeInBits() < + Int64Ty->getScalarSizeInBits()) + C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty); + Initializers.push_back(C); + } + ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size()); + GlobalVariable *GV = new GlobalVariable( + *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage, + ConstantArray::get(ArrayOfInt64Ty, Initializers), + "__sancov_gen_cov_switch_values"); + IRB.CreateCall(SanCovTraceSwitchFunction, + {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)}); + } + } +} + + void SanitizerCoverageModule::InjectTraceForCmp( Function &F, ArrayRef<Instruction *> CmpTraceTargets) { for (auto I : CmpTraceTargets) { @@ -369,8 +431,7 @@ void SanitizerCoverageModule::InjectTraceForCmp( void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) { I->setMetadata( - I->getParent()->getParent()->getParent()->getMDKindID("nosanitize"), - MDNode::get(*C, None)); + I->getModule()->getMDKindID("nosanitize"), MDNode::get(*C, None)); } void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, @@ -382,34 +443,31 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, // locations. if (isa<UnreachableInst>(BB.getTerminator())) return; - BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end(); - // Skip static allocas at the top of the entry block so they don't become - // dynamic when we split the block. 
If we used our optimized stack layout, - // then there will only be one alloca and it will come first. - for (; IP != BE; ++IP) { - AllocaInst *AI = dyn_cast<AllocaInst>(IP); - if (!AI || !AI->isStaticAlloca()) - break; - } + BasicBlock::iterator IP = BB.getFirstInsertionPt(); bool IsEntryBB = &BB == &F.getEntryBlock(); DebugLoc EntryLoc; if (IsEntryBB) { if (auto SP = getDISubprogram(&F)) EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP); + // Keep static allocas and llvm.localescape calls in the entry block. Even + // if we aren't splitting the block, it's nice for allocas to be before + // calls. + IP = PrepareToSplitEntryBlock(BB, IP); } else { EntryLoc = IP->getDebugLoc(); } - IRBuilder<> IRB(IP); + IRBuilder<> IRB(&*IP); IRB.SetCurrentDebugLocation(EntryLoc); - SmallVector<Value *, 1> Indices; Value *GuardP = IRB.CreateAdd( IRB.CreatePointerCast(GuardArray, IntptrTy), ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - if (UseCalls) { + if (Options.TraceBB) { + IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); + } else if (UseCalls) { IRB.CreateCall(SanCovWithCheckFunction, GuardP); } else { LoadInst *Load = IRB.CreateLoad(GuardP); @@ -418,7 +476,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(Load); Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); Instruction *Ins = SplitBlockAndInsertIfThen( - Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); + Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); IRB.SetInsertPoint(Ins); IRB.SetCurrentDebugLocation(EntryLoc); // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. @@ -427,7 +485,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } if (Options.Use8bitCounters) { - IRB.SetInsertPoint(IP); + IRB.SetInsertPoint(&*IP); Value *P = IRB.CreateAdd( IRB.CreatePointerCast(EightBitCounterArray, IntptrTy), ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1)); @@ -438,13 +496,6 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, SetNoSanitizeMetadata(LI); SetNoSanitizeMetadata(SI); } - - if (Options.TraceBB) { - // Experimental support for tracing. - // Insert a callback with the same guard variable as used for coverage. - IRB.SetInsertPoint(IP); - IRB.CreateCall(IsEntryBB ? 
SanCovTraceEnter : SanCovTraceBB, GuardP); - } } char SanitizerCoverageModule::ID = 0; diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 1a46bbb..9331e1d 100644 --- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -142,37 +142,35 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), nullptr)); OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { - const size_t ByteSize = 1 << i; - const size_t BitSize = ByteSize * 8; - SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); + const unsigned ByteSize = 1U << i; + const unsigned BitSize = ByteSize * 8; + std::string ByteSizeStr = utostr(ByteSize); + std::string BitSizeStr = utostr(BitSize); + SmallString<32> ReadName("__tsan_read" + ByteSizeStr); TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); + SmallString<32> WriteName("__tsan_write" + ByteSizeStr); TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<64> UnalignedReadName("__tsan_unaligned_read" + - itostr(ByteSize)); + SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr); TsanUnalignedRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); - SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + - itostr(ByteSize)); + SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr); TsanUnalignedWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); Type *Ty = Type::getIntNTy(M.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); - SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + - "_load"); + SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load"); TsanAtomicLoad[i] = checkSanitizerInterfaceFunction( M.getOrInsertFunction(AtomicLoadName, Ty, PtrTy, OrdTy, nullptr)); - SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + - "_store"); + SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store"); TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr)); @@ -201,7 +199,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { M.getOrInsertFunction(RMWName, Ty, PtrTy, Ty, OrdTy, nullptr)); } - SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) + + SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr + "_compare_exchange_val"); TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr)); @@ -513,8 +511,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -527,8 +525,8 
@@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -544,8 +542,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx]; if (!F) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), @@ -558,8 +556,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) { int Idx = getMemoryAccessFuncIndex(Addr, DL); if (Idx < 0) return false; - const size_t ByteSize = 1 << Idx; - const size_t BitSize = ByteSize * 8; + const unsigned ByteSize = 1U << Idx; + const unsigned BitSize = ByteSize * 8; Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); Type *PtrTy = Ty->getPointerTo(); Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h deleted file mode 100644 index 636c65c..0000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ARCInstKind.h +++ /dev/null @@ -1,123 +0,0 @@ -//===--- ARCInstKind.h - ARC instruction equivalence classes -*- C++ -*----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCINSTKIND_H - -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Function.h" - -namespace llvm { -namespace objcarc { - -/// \enum ARCInstKind -/// -/// \brief Equivalence classes of instructions in the ARC Model. -/// -/// Since we do not have "instructions" to represent ARC concepts in LLVM IR, -/// we instead operate on equivalence classes of instructions. -/// -/// TODO: This should be split into two enums: a runtime entry point enum -/// (possibly united with the ARCRuntimeEntrypoint class) and an enum that deals -/// with effects of instructions in the ARC model (which would handle the notion -/// of a User or CallOrUser). -enum class ARCInstKind { - Retain, ///< objc_retain - RetainRV, ///< objc_retainAutoreleasedReturnValue - RetainBlock, ///< objc_retainBlock - Release, ///< objc_release - Autorelease, ///< objc_autorelease - AutoreleaseRV, ///< objc_autoreleaseReturnValue - AutoreleasepoolPush, ///< objc_autoreleasePoolPush - AutoreleasepoolPop, ///< objc_autoreleasePoolPop - NoopCast, ///< objc_retainedObject, etc. 
- FusedRetainAutorelease, ///< objc_retainAutorelease - FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue - LoadWeakRetained, ///< objc_loadWeakRetained (primitive) - StoreWeak, ///< objc_storeWeak (primitive) - InitWeak, ///< objc_initWeak (derived) - LoadWeak, ///< objc_loadWeak (derived) - MoveWeak, ///< objc_moveWeak (derived) - CopyWeak, ///< objc_copyWeak (derived) - DestroyWeak, ///< objc_destroyWeak (derived) - StoreStrong, ///< objc_storeStrong (derived) - IntrinsicUser, ///< clang.arc.use - CallOrUser, ///< could call objc_release and/or "use" pointers - Call, ///< could call objc_release - User, ///< could "use" a pointer - None ///< anything that is inert from an ARC perspective. -}; - -raw_ostream &operator<<(raw_ostream &OS, const ARCInstKind Class); - -/// \brief Test if the given class is a kind of user. -bool IsUser(ARCInstKind Class); - -/// \brief Test if the given class is objc_retain or equivalent. -bool IsRetain(ARCInstKind Class); - -/// \brief Test if the given class is objc_autorelease or equivalent. -bool IsAutorelease(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which return their -/// argument verbatim. -bool IsForwarding(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which do nothing if -/// passed a null pointer. -bool IsNoopOnNull(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the "tail" keyword. -bool IsAlwaysTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are never safe -/// to mark with the "tail" keyword. -bool IsNeverTail(ARCInstKind Class); - -/// \brief Test if the given class represents instructions which are always safe -/// to mark with the nounwind attribute. -bool IsNoThrow(ARCInstKind Class); - -/// Test whether the given instruction can autorelease any pointer or cause an -/// autoreleasepool pop. -bool CanInterruptRV(ARCInstKind Class); - -/// \brief Determine if F is one of the special known Functions. If it isn't, -/// return ARCInstKind::CallOrUser. -ARCInstKind GetFunctionClass(const Function *F); - -/// \brief Determine which objc runtime call instruction class V belongs to. -/// -/// This is similar to GetARCInstKind except that it only detects objc -/// runtime calls. This allows it to be faster. -/// -static inline ARCInstKind GetBasicARCInstKind(const Value *V) { - if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (const Function *F = CI->getCalledFunction()) - return GetFunctionClass(F); - // Otherwise, be conservative. - return ARCInstKind::CallOrUser; - } - - // Otherwise, be conservative. - return isa<InvokeInst>(V) ? ARCInstKind::CallOrUser : ARCInstKind::User; -} - -/// Map V to its ARCInstKind equivalence class. -ARCInstKind GetARCInstKind(const Value *V); - -/// Returns false if conservatively we can prove that any instruction mapped to -/// this kind can not decrement ref counts. Returns true otherwise. 
-bool CanDecrementRefCount(ARCInstKind Kind); - -} // end namespace objcarc -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 4edd029..9d78e5a 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -49,7 +49,7 @@ bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, assert(CS && "Only calls can alter reference counts!"); // See if AliasAnalysis can help us with the call. - AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); + FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); if (AliasAnalysis::onlyReadsMemory(MRB)) return false; if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { @@ -226,7 +226,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, SmallPtrSetImpl<Instruction *> &DependingInsts, SmallPtrSetImpl<const BasicBlock *> &Visited, ProvenanceAnalysis &PA) { - BasicBlock::iterator StartPos = StartInst; + BasicBlock::iterator StartPos = StartInst->getIterator(); SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; Worklist.push_back(std::make_pair(StartBB, StartPos)); @@ -252,7 +252,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, break; } - Instruction *Inst = --LocalStartPos; + Instruction *Inst = &*--LocalStartPos; if (Depends(Flavor, Inst, Arg, PA)) { DependingInsts.insert(Inst); break; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index 6ea038b..d860723 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -26,18 +26,10 @@ namespace llvm { using namespace llvm; using namespace llvm::objcarc; -/// \brief A handy option to enable/disable all ARC Optimizations. -bool llvm::objcarc::EnableARCOpts; -static cl::opt<bool, true> -EnableARCOptimizations("enable-objc-arc-opts", - cl::desc("enable/disable all ARC Optimizations"), - cl::location(EnableARCOpts), - cl::init(true)); - /// initializeObjCARCOptsPasses - Initialize all passes linked into the /// ObjCARCOpts library. void llvm::initializeObjCARCOpts(PassRegistry &Registry) { - initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCAAWrapperPassPass(Registry); initializeObjCARCAPElimPass(Registry); initializeObjCARCExpandPass(Registry); initializeObjCARCContractPass(Registry); diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h index 7595e2d..5fd45b0 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h @@ -26,6 +26,8 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Optional.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ObjCARCAnalysisUtils.h" +#include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" @@ -34,7 +36,6 @@ #include "llvm/Pass.h" #include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Utils/Local.h" -#include "ARCInstKind.h" namespace llvm { class raw_ostream; @@ -43,99 +44,6 @@ class raw_ostream; namespace llvm { namespace objcarc { -/// \brief A handy option to enable/disable all ARC Optimizations. -extern bool EnableARCOpts; - -/// \brief Test if the given module looks interesting to run ARC optimization -/// on. 
-static inline bool ModuleHasARC(const Module &M) { - return - M.getNamedValue("objc_retain") || - M.getNamedValue("objc_release") || - M.getNamedValue("objc_autorelease") || - M.getNamedValue("objc_retainAutoreleasedReturnValue") || - M.getNamedValue("objc_retainBlock") || - M.getNamedValue("objc_autoreleaseReturnValue") || - M.getNamedValue("objc_autoreleasePoolPush") || - M.getNamedValue("objc_loadWeakRetained") || - M.getNamedValue("objc_loadWeak") || - M.getNamedValue("objc_destroyWeak") || - M.getNamedValue("objc_storeWeak") || - M.getNamedValue("objc_initWeak") || - M.getNamedValue("objc_moveWeak") || - M.getNamedValue("objc_copyWeak") || - M.getNamedValue("objc_retainedObject") || - M.getNamedValue("objc_unretainedObject") || - M.getNamedValue("objc_unretainedPointer") || - M.getNamedValue("clang.arc.use"); -} - -/// \brief This is a wrapper around getUnderlyingObject which also knows how to -/// look through objc_retain and objc_autorelease calls, which we know to return -/// their argument verbatim. -static inline const Value *GetUnderlyingObjCPtr(const Value *V, - const DataLayout &DL) { - for (;;) { - V = GetUnderlyingObject(V, DL); - if (!IsForwarding(GetBasicARCInstKind(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - - return V; -} - -/// The RCIdentity root of a value \p V is a dominating value U for which -/// retaining or releasing U is equivalent to retaining or releasing V. In other -/// words, ARC operations on \p V are equivalent to ARC operations on \p U. -/// -/// We use this in the ARC optimizer to make it easier to match up ARC -/// operations by always mapping ARC operations to RCIdentityRoots instead of -/// pointers themselves. -/// -/// The two ways that we see RCIdentical values in ObjC are via: -/// -/// 1. PointerCasts -/// 2. Forwarding Calls that return their argument verbatim. -/// -/// Thus this function strips off pointer casts and forwarding calls. *NOTE* -/// This implies that two RCIdentical values must alias. -static inline const Value *GetRCIdentityRoot(const Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicARCInstKind(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; -} - -/// Helper which calls const Value *GetRCIdentityRoot(const Value *V) and just -/// casts away the const of the result. For documentation about what an -/// RCIdentityRoot (and by extension GetRCIdentityRoot is) look at that -/// function. -static inline Value *GetRCIdentityRoot(Value *V) { - return const_cast<Value *>(GetRCIdentityRoot((const Value *)V)); -} - -/// \brief Assuming the given instruction is one of the special calls such as -/// objc_retain or objc_release, return the RCIdentity root of the argument of -/// the call. -static inline Value *GetArgRCIdentityRoot(Value *Inst) { - return GetRCIdentityRoot(cast<CallInst>(Inst)->getArgOperand(0)); -} - -static inline bool IsNullOrUndef(const Value *V) { - return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); -} - -static inline bool IsNoopInstruction(const Instruction *I) { - return isa<BitCastInst>(I) || - (isa<GetElementPtrInst>(I) && - cast<GetElementPtrInst>(I)->hasAllZeroIndices()); -} - - /// \brief Erase the given instruction. /// /// Many ObjC calls return their argument verbatim, @@ -162,152 +70,6 @@ static inline void EraseInstruction(Instruction *CI) { RecursivelyDeleteTriviallyDeadInstructions(OldArg); } -/// \brief Test whether the given value is possible a retainable object pointer. 
-static inline bool IsPotentialRetainableObjPtr(const Value *Op) { - // Pointers to static or stack storage are not valid retainable object - // pointers. - if (isa<Constant>(Op) || isa<AllocaInst>(Op)) - return false; - // Special arguments can not be a valid retainable object pointer. - if (const Argument *Arg = dyn_cast<Argument>(Op)) - if (Arg->hasByValAttr() || - Arg->hasInAllocaAttr() || - Arg->hasNestAttr() || - Arg->hasStructRetAttr()) - return false; - // Only consider values with pointer types. - // - // It seemes intuitive to exclude function pointer types as well, since - // functions are never retainable object pointers, however clang occasionally - // bitcasts retainable object pointers to function-pointer type temporarily. - PointerType *Ty = dyn_cast<PointerType>(Op->getType()); - if (!Ty) - return false; - // Conservatively assume anything else is a potential retainable object - // pointer. - return true; -} - -static inline bool IsPotentialRetainableObjPtr(const Value *Op, - AliasAnalysis &AA) { - // First make the rudimentary check. - if (!IsPotentialRetainableObjPtr(Op)) - return false; - - // Objects in constant memory are not reference-counted. - if (AA.pointsToConstantMemory(Op)) - return false; - - // Pointers in constant memory are not pointing to reference-counted objects. - if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) - if (AA.pointsToConstantMemory(LI->getPointerOperand())) - return false; - - // Otherwise assume the worst. - return true; -} - -/// \brief Helper for GetARCInstKind. Determines what kind of construct CS -/// is. -static inline ARCInstKind GetCallSiteClass(ImmutableCallSite CS) { - for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) - if (IsPotentialRetainableObjPtr(*I)) - return CS.onlyReadsMemory() ? ARCInstKind::User : ARCInstKind::CallOrUser; - - return CS.onlyReadsMemory() ? ARCInstKind::None : ARCInstKind::Call; -} - -/// \brief Return true if this value refers to a distinct and identifiable -/// object. -/// -/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses -/// special knowledge of ObjC conventions. -static inline bool IsObjCIdentifiedObject(const Value *V) { - // Assume that call results and arguments have their own "provenance". - // Constants (including GlobalVariables) and Allocas are never - // reference-counted. - if (isa<CallInst>(V) || isa<InvokeInst>(V) || - isa<Argument>(V) || isa<Constant>(V) || - isa<AllocaInst>(V)) - return true; - - if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { - const Value *Pointer = - GetRCIdentityRoot(LI->getPointerOperand()); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { - // A constant pointer can't be pointing to an object on the heap. It may - // be reference-counted, but it won't be deleted. - if (GV->isConstant()) - return true; - StringRef Name = GV->getName(); - // These special variables are known to hold values which are not - // reference-counted pointers. 
- if (Name.startswith("\01l_objc_msgSend_fixup_")) - return true; - - StringRef Section = GV->getSection(); - if (Section.find("__message_refs") != StringRef::npos || - Section.find("__objc_classrefs") != StringRef::npos || - Section.find("__objc_superrefs") != StringRef::npos || - Section.find("__objc_methname") != StringRef::npos || - Section.find("__cstring") != StringRef::npos) - return true; - } - } - - return false; -} - -enum class ARCMDKindID { - ImpreciseRelease, - CopyOnEscape, - NoObjCARCExceptions, -}; - -/// A cache of MDKinds used by various ARC optimizations. -class ARCMDKindCache { - Module *M; - - /// The Metadata Kind for clang.imprecise_release metadata. - llvm::Optional<unsigned> ImpreciseReleaseMDKind; - - /// The Metadata Kind for clang.arc.copy_on_escape metadata. - llvm::Optional<unsigned> CopyOnEscapeMDKind; - - /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata. - llvm::Optional<unsigned> NoObjCARCExceptionsMDKind; - -public: - void init(Module *Mod) { - M = Mod; - ImpreciseReleaseMDKind = NoneType::None; - CopyOnEscapeMDKind = NoneType::None; - NoObjCARCExceptionsMDKind = NoneType::None; - } - - unsigned get(ARCMDKindID ID) { - switch (ID) { - case ARCMDKindID::ImpreciseRelease: - if (!ImpreciseReleaseMDKind) - ImpreciseReleaseMDKind = - M->getContext().getMDKindID("clang.imprecise_release"); - return *ImpreciseReleaseMDKind; - case ARCMDKindID::CopyOnEscape: - if (!CopyOnEscapeMDKind) - CopyOnEscapeMDKind = - M->getContext().getMDKindID("clang.arc.copy_on_escape"); - return *CopyOnEscapeMDKind; - case ARCMDKindID::NoObjCARCExceptions: - if (!NoObjCARCExceptionsMDKind) - NoObjCARCExceptionsMDKind = - M->getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); - return *NoObjCARCExceptionsMDKind; - } - llvm_unreachable("Covered switch isn't covered?!"); - } -}; - } // end namespace objcarc } // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index d318643..969e77c 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -72,12 +72,9 @@ bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { if (const Function *Callee = CS.getCalledFunction()) { if (Callee->isDeclaration() || Callee->mayBeOverridden()) return true; - for (Function::const_iterator I = Callee->begin(), E = Callee->end(); - I != E; ++I) { - const BasicBlock *BB = I; - for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); - J != F; ++J) - if (ImmutableCallSite JCS = ImmutableCallSite(J)) + for (const BasicBlock &BB : *Callee) { + for (const Instruction &I : BB) + if (ImmutableCallSite JCS = ImmutableCallSite(&I)) // This recursion depth limit is arbitrary. It's just great // enough to cover known interesting testcases. if (Depth < 3 && @@ -96,7 +93,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Instruction *Push = nullptr; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; switch (GetBasicARCInstKind(Inst)) { case ARCInstKind::AutoreleasepoolPush: Push = Inst; @@ -169,7 +166,7 @@ bool ObjCARCAPElim::runOnModule(Module &M) { if (std::next(F->begin()) != F->end()) continue; // Ok, a single-block constructor function definition. Try to optimize it. 
- Changed |= OptimizeBB(F->begin()); + Changed |= OptimizeBB(&F->front()); } return Changed; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h deleted file mode 100644 index eecc82f..0000000 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h +++ /dev/null @@ -1,74 +0,0 @@ -//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- C++ -*-----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares a simple ARC-aware AliasAnalysis using special knowledge -/// of Objective C to enhance other optimization passes which rely on the Alias -/// Analysis infrastructure. -/// -/// WARNING: This file knows about certain library functions. It recognizes them -/// by name, and hardwires knowledge of their semantics. -/// -/// WARNING: This file knows about how certain Objective-C library functions are -/// used. Naive LLVM IR transformations which would otherwise be -/// behavior-preserving may break these assumptions. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H -#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H - -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Pass.h" - -namespace llvm { -namespace objcarc { - - /// \brief This is a simple alias analysis implementation that uses knowledge - /// of ARC constructs to answer queries. - /// - /// TODO: This class could be generalized to know about other ObjC-specific - /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing - /// even though their offsets are dynamic. - class ObjCARCAliasAnalysis : public ImmutablePass, - public AliasAnalysis { - public: - static char ID; // Class identification, replacement for typeinfo - ObjCARCAliasAnalysis() : ImmutablePass(ID) { - initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - private: - bool doInitialization(Module &M) override; - - /// This method is used when a pass implements an analysis interface through - /// multiple inheritance. If needed, it should override this to adjust the - /// this pointer as needed for the specified pass info. 
- void *getAdjustedAnalysisPointer(const void *PI) override { - if (PI == &AliasAnalysis::ID) - return static_cast<AliasAnalysis *>(this); - return this; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override; - AliasResult alias(const MemoryLocation &LocA, - const MemoryLocation &LocB) override; - bool pointsToConstantMemory(const MemoryLocation &Loc, - bool OrLocal) override; - ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override; - ModRefBehavior getModRefBehavior(const Function *F) override; - ModRefResult getModRefInfo(ImmutableCallSite CS, - const MemoryLocation &Loc) override; - ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) override; - }; - -} // namespace objcarc -} // namespace llvm - -#endif diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index baca76b..1cdf568 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -119,9 +119,9 @@ bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) { return false; // Check that the call is next to the retain. - BasicBlock::const_iterator I = Call; - ++I; - while (IsNoopInstruction(I)) ++I; + BasicBlock::const_iterator I = ++Call->getIterator(); + while (IsNoopInstruction(&*I)) + ++I; if (&*I != Retain) return false; @@ -247,7 +247,7 @@ static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load, // Ok, now we know we have not seen a store yet. See if Inst can write to // our load location, if it can not, just ignore the instruction. - if (!(AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod)) + if (!(AA->getModRefInfo(Inst, Loc) & MRI_Mod)) continue; Store = dyn_cast<StoreInst>(Inst); @@ -282,9 +282,9 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store, Instruction *Release, ProvenanceAnalysis &PA) { // Walk up from the Store to find the retain. - BasicBlock::iterator I = Store; + BasicBlock::iterator I = Store->getIterator(); BasicBlock::iterator Begin = Store->getParent()->begin(); - while (I != Begin && GetBasicARCInstKind(I) != ARCInstKind::Retain) { + while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) { Instruction *Inst = &*I; // It is only safe to move the retain to the store if we can prove @@ -294,7 +294,7 @@ findRetainForStoreStrongContraction(Value *New, StoreInst *Store, return nullptr; --I; } - Instruction *Retain = I; + Instruction *Retain = &*I; if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain) return nullptr; if (GetArgRCIdentityRoot(Retain) != New) @@ -429,7 +429,7 @@ bool ObjCARCContract::tryToPeepholeInstruction( // insert it now. if (!RetainRVMarker) return false; - BasicBlock::iterator BBI = Inst; + BasicBlock::iterator BBI = Inst->getIterator(); BasicBlock *InstParent = Inst->getParent(); // Step up to see if the call immediately precedes the RetainRV call. 
@@ -440,11 +440,11 @@ bool ObjCARCContract::tryToPeepholeInstruction( BasicBlock *Pred = InstParent->getSinglePredecessor(); if (!Pred) goto decline_rv_optimization; - BBI = Pred->getTerminator(); + BBI = Pred->getTerminator()->getIterator(); break; } --BBI; - } while (IsNoopInstruction(BBI)); + } while (IsNoopInstruction(&*BBI)); if (&*BBI == GetArgRCIdentityRoot(Inst)) { DEBUG(dbgs() << "Adding inline asm marker for " @@ -511,10 +511,10 @@ bool ObjCARCContract::runOnFunction(Function &F) { return false; Changed = false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n"); @@ -629,13 +629,13 @@ bool ObjCARCContract::runOnFunction(Function &F) { char ObjCARCContract::ID = 0; INITIALIZE_PASS_BEGIN(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ObjCARCContract, "objc-arc-contract", "ObjC ARC contraction", false, false) void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); } diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 9edbb17..f0ee6e2 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -28,7 +28,6 @@ #include "ARCRuntimeEntryPoints.h" #include "BlotMapVector.h" #include "DependencyAnalysis.h" -#include "ObjCARCAliasAnalysis.h" #include "ProvenanceAnalysis.h" #include "PtrState.h" #include "llvm/ADT/DenseMap.h" @@ -36,6 +35,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ObjCARCAliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" @@ -482,7 +482,7 @@ namespace { /// A flag indicating whether this optimization pass should run. bool Run; - /// Flags which determine whether each of the interesting runtine functions + /// Flags which determine whether each of the interesting runtime functions /// is in fact used in the current function. unsigned UsedInThisFunction; @@ -556,7 +556,7 @@ namespace { char ObjCARCOpt::ID = 0; INITIALIZE_PASS_BEGIN(ObjCARCOpt, "objc-arc", "ObjC ARC optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass) INITIALIZE_PASS_END(ObjCARCOpt, "objc-arc", "ObjC ARC optimization", false, false) @@ -565,8 +565,8 @@ Pass *llvm::createObjCARCOptPass() { } void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ObjCARCAliasAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ObjCARCAAWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); // ARC optimization doesn't currently split critical edges. 
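//===----------------------------------------------------------------------===//
// Editor's note: the recurring migration in this import is that passes no
// longer depend on the old AliasAnalysis analysis group; they require
// AAResultsWrapperPass and pull the aggregated AAResults out of it, as the
// ObjCARCContract hunks above do. A minimal sketch of a legacy pass written
// against the new interface (the pass itself is illustrative, not from the
// patch; INITIALIZE_PASS registration boilerplate is omitted):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
struct ExampleAAClient : public FunctionPass {
  static char ID;
  ExampleAAClient() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Was: AU.addRequired<AliasAnalysis>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override {
    // Was: AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
    AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
    (void)AA; // Issue queries: AA.alias(...), AA.getModRefInfo(...), ...
    return false;
  }
};
}
char ExampleAAClient::ID = 0;
//===----------------------------------------------------------------------===//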
AU.setPreservesCFG(); } @@ -581,16 +581,18 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { ImmutableCallSite CS(Arg); if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { - BasicBlock::const_iterator I = Call; + BasicBlock::const_iterator I(Call); ++I; - while (IsNoopInstruction(I)) ++I; + while (IsNoopInstruction(&*I)) + ++I; if (&*I == RetainRV) return false; } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock *RetainRVParent = RetainRV->getParent(); if (II->getNormalDest() == RetainRVParent) { BasicBlock::const_iterator I = RetainRVParent->begin(); - while (IsNoopInstruction(I)) ++I; + while (IsNoopInstruction(&*I)) + ++I; if (&*I == RetainRV) return false; } @@ -599,18 +601,21 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for being preceded by an objc_autoreleaseReturnValue on the same // pointer. In this case, we can delete the pair. - BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); + BasicBlock::iterator I = RetainRV->getIterator(), + Begin = RetainRV->getParent()->begin(); if (I != Begin) { - do --I; while (I != Begin && IsNoopInstruction(I)); - if (GetBasicARCInstKind(I) == ARCInstKind::AutoreleaseRV && - GetArgRCIdentityRoot(I) == Arg) { + do + --I; + while (I != Begin && IsNoopInstruction(&*I)); + if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV && + GetArgRCIdentityRoot(&*I) == Arg) { Changed = true; ++NumPeeps; DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n" << "Erasing " << *RetainRV << "\n"); - EraseInstruction(I); + EraseInstruction(&*I); EraseInstruction(RetainRV); return true; } @@ -1216,7 +1221,7 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { - Instruction *Inst = std::prev(I); + Instruction *Inst = &*std::prev(I); // Invoke instructions are visited as part of their successors (below). if (isa<InvokeInst>(Inst)) @@ -1264,7 +1269,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, Arg = GetArgRCIdentityRoot(Inst); TopDownPtrState &S = MyStates.getPtrTopDownState(Arg); NestingDetected |= S.InitTopDown(Class, Inst); - // A retain can be a potential use; procede to the generic checking + // A retain can be a potential use; proceed to the generic checking // code below. break; } @@ -1342,12 +1347,10 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, << "Performing Dataflow:\n"); // Visit all the instructions, top-down. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - Instruction *Inst = I; + for (Instruction &Inst : *BB) { + DEBUG(dbgs() << " Visiting " << Inst << "\n"); - DEBUG(dbgs() << " Visiting " << *Inst << "\n"); - - NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); + NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates); } DEBUG(llvm::dbgs() << "\nState Before Checking for CFG Hazards:\n" @@ -1413,16 +1416,15 @@ ComputePostOrders(Function &F, // Functions may have many exits, and there are also blocks which we treat // as exits due to ignored edges. 
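//===----------------------------------------------------------------------===//
// Editor's note: an illustrative (hand-written) IR instance of the pair that
// OptimizeRetainRVCall above erases. When an objc_retainAutoreleasedReturnValue
// is preceded, modulo no-op instructions, by an objc_autoreleaseReturnValue of
// the same reference-count identity root, the two calls cancel and both are
// deleted:
//
//   %0 = call i8* @objc_autoreleaseReturnValue(i8* %x)
//   %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %x)
//
// The net reference-count effect of the pair is zero, so neither call is
// needed.
//===----------------------------------------------------------------------===//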
SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *ExitBB = I; - BBState &MyStates = BBStates[ExitBB]; + for (BasicBlock &ExitBB : F) { + BBState &MyStates = BBStates[&ExitBB]; if (!MyStates.isExit()) continue; MyStates.SetAsExit(); - PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); - Visited.insert(ExitBB); + PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin())); + Visited.insert(&ExitBB); while (!PredStack.empty()) { reverse_dfs_next_succ: BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); @@ -1830,7 +1832,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { // analysis too, but that would want caching. A better approach would be to // use the technique that EarlyCSE uses. inst_iterator Current = std::prev(I); - BasicBlock *CurrentBB = Current.getBasicBlockIterator(); + BasicBlock *CurrentBB = &*Current.getBasicBlockIterator(); for (BasicBlock::iterator B = CurrentBB->begin(), J = Current.getInstructionIterator(); J != B; --J) { @@ -2008,10 +2010,7 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain, // Check that the call is a regular call. ARCInstKind Class = GetBasicARCInstKind(Call); - if (Class != ARCInstKind::CallOrUser && Class != ARCInstKind::Call) - return false; - - return true; + return Class == ARCInstKind::CallOrUser || Class == ARCInstKind::Call; } /// Find a dependent retain that precedes the given autorelease for which there @@ -2081,9 +2080,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { SmallPtrSet<Instruction *, 4> DependingInstructions; SmallPtrSet<const BasicBlock *, 4> Visited; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - BasicBlock *BB = FI; - ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); + for (BasicBlock &BB: F) { + ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back()); DEBUG(dbgs() << "Visiting: " << *Ret << "\n"); @@ -2095,19 +2093,16 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // Look for an ``autorelease'' instruction that is a predecessor of Ret and // dependent on Arg such that there are no instructions dependent on Arg // that need a positive ref count in between the autorelease and Ret. 
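//===----------------------------------------------------------------------===//
// Editor's note: ComputePostOrders and OptimizeReturns above trade manual
// Function::iterator loops for range-based for. Since the loop variable is
// now a reference rather than an iterator, call sites that relied on the
// implicit iterator-to-pointer conversion must take an address instead. A
// minimal before/after sketch (collectBlocks is an illustrative name):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static void collectBlocks(Function &F, SmallPtrSetImpl<BasicBlock *> &Set) {
  // Old style (no longer compiles once the implicit conversion is gone):
  //   for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
  //     Set.insert(I);
  for (BasicBlock &BB : F)
    Set.insert(&BB); // Take the address explicitly.
}
//===----------------------------------------------------------------------===//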
- CallInst *Autorelease = - FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret, - DependingInstructions, Visited, - PA); + CallInst *Autorelease = FindPredecessorAutoreleaseWithSafePath( + Arg, &BB, Ret, DependingInstructions, Visited, PA); DependingInstructions.clear(); Visited.clear(); if (!Autorelease) continue; - CallInst *Retain = - FindPredecessorRetainWithSafePath(Arg, BB, Autorelease, - DependingInstructions, Visited, PA); + CallInst *Retain = FindPredecessorRetainWithSafePath( + Arg, &BB, Autorelease, DependingInstructions, Visited, PA); DependingInstructions.clear(); Visited.clear(); @@ -2192,7 +2187,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) { DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>" "\n"); - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); #ifndef NDEBUG if (AreStatisticsEnabled()) { diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h index 0ac41d3..1a12b659 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h @@ -26,10 +26,10 @@ #define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/AliasAnalysis.h" namespace llvm { class Value; - class AliasAnalysis; class DataLayout; class PHINode; class SelectInst; diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index 0be75af..c274e81 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -35,7 +35,7 @@ char PAEval::ID = 0; PAEval::PAEval() : FunctionPass(ID) {} void PAEval::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } static StringRef getName(Value *V) { @@ -65,7 +65,7 @@ bool PAEval::runOnFunction(Function &F) { } ProvenanceAnalysis PA; - PA.setAA(&getAnalysis<AliasAnalysis>()); + PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults()); const DataLayout &DL = F.getParent()->getDataLayout(); for (Value *V1 : Values) { @@ -89,6 +89,6 @@ FunctionPass *llvm::createPAEvalPass() { return new PAEval(); } INITIALIZE_PASS_BEGIN(PAEval, "pa-eval", "Evaluate ProvenanceAnalysis on all pairs", false, true) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(PAEval, "pa-eval", "Evaluate ProvenanceAnalysis on all pairs", false, true) diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp index ae20e7e..df64fa3 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp @@ -256,9 +256,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, // one of its successor blocks, since we can't insert code after it // in its own block, and we don't want to split critical edges. 
if (isa<InvokeInst>(Inst)) - InsertReverseInsertPt(BB->getFirstInsertionPt()); + InsertReverseInsertPt(&*BB->getFirstInsertionPt()); else - InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + InsertReverseInsertPt(&*++Inst->getIterator()); SetSeq(S_Use); } else if (Seq == S_Release && IsUser(Class)) { DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; " @@ -268,9 +268,9 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, assert(!HasReverseInsertPts()); // As above; handle invoke specially. if (isa<InvokeInst>(Inst)) - InsertReverseInsertPt(BB->getFirstInsertionPt()); + InsertReverseInsertPt(&*BB->getFirstInsertionPt()); else - InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst))); + InsertReverseInsertPt(&*++Inst->getIterator()); } break; case S_Stop: diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h index e45e1ea..9749e44 100644 --- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h +++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h @@ -17,8 +17,8 @@ #ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H #define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H -#include "ARCInstKind.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Value.h" #include "llvm/Support/raw_ostream.h" @@ -96,7 +96,7 @@ struct RRInfo { }; /// \brief This class summarizes several per-pointer runtime properties which -/// are propogated through the flow graph. +/// are propagated through the flow graph. class PtrState { protected: /// True if the reference count is known to be incremented. @@ -172,7 +172,7 @@ struct BottomUpPtrState : PtrState { bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I); /// Return true if this set of releases can be paired with a release. Modifies - /// state appropriately to reflect that the matching occured if it is + /// state appropriately to reflect that the matching occurred if it is /// successful. /// /// It is assumed that one has already checked that the RCIdentity of the @@ -194,7 +194,7 @@ struct TopDownPtrState : PtrState { /// Return true if this set of retains can be paired with the given /// release. Modifies state appropriately to reflect that the matching - /// occured. + /// occurred. 
bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release); void HandlePotentialUse(Instruction *Inst, const Value *Ptr, diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index d6fc916..590a52d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -1,4 +1,4 @@ -//===- DCE.cpp - Code to perform dead code elimination --------------------===// +//===- ADCE.cpp - Code to perform dead code elimination -------------------===// // // The LLVM Compiler Infrastructure // @@ -14,52 +14,33 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "adce" STATISTIC(NumRemoved, "Number of instructions removed"); -namespace { -struct ADCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) { - initializeADCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; - - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - } -}; -} - -char ADCE::ID = 0; -INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) - -bool ADCE::runOnFunction(Function& F) { - if (skipOptnoneFunction(F)) - return false; - +static bool aggressiveDCE(Function& F) { SmallPtrSet<Instruction*, 128> Alive; SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { + for (Instruction &I : instructions(F)) { + if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() || + I.mayHaveSideEffects()) { Alive.insert(&I); Worklist.push_back(&I); } @@ -79,7 +60,7 @@ bool ADCE::runOnFunction(Function& F) { // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. // NOTE: We reuse the Worklist vector here for memory efficiency. 
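//===----------------------------------------------------------------------===//
// Editor's note: the ADCE root-collection change above widens "always live"
// from landingpads to Instruction::isEHPad(), so the newer EH pad
// instructions (catchpad, cleanuppad, catchswitch) are also kept as roots.
// A sketch of the predicate as this patch leaves it (isAlwaysLive here
// mirrors the inline check in aggressiveDCE):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool isAlwaysLive(const Instruction &I) {
  // Terminators, debug intrinsics, EH pads, and side-effecting instructions
  // must be treated as live roots of the liveness propagation.
  return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() ||
         I.mayHaveSideEffects();
}
//===----------------------------------------------------------------------===//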
- for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { if (!Alive.count(&I)) { Worklist.push_back(&I); I.dropAllReferences(); @@ -94,6 +75,34 @@ bool ADCE::runOnFunction(Function& F) { return !Worklist.empty(); } -FunctionPass *llvm::createAggressiveDCEPass() { - return new ADCE(); +PreservedAnalyses ADCEPass::run(Function &F) { + if (aggressiveDCE(F)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } + +namespace { +struct ADCELegacyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCELegacyPass() : FunctionPass(ID) { + initializeADCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function& F) override { + if (skipOptnoneFunction(F)) + return false; + return aggressiveDCE(F); + } + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} + +char ADCELegacyPass::ID = 0; +INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", + false, false) + +FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 8918909..4b721d3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -21,6 +21,8 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -54,13 +56,15 @@ struct AlignmentFromAssumptions : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); } // For memory transfers, we need a common alignment for both the source and @@ -84,7 +88,7 @@ INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) @@ -249,8 +253,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, // The mask must have some trailing ones (otherwise the condition is // trivial and tells us nothing about the alignment of the left operand). 
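//===----------------------------------------------------------------------===//
// Editor's note: the ADCE rewrite above follows the standard two-pass-manager
// port: the transform becomes a free function, a new-pass-manager class
// exposes it through run() returning PreservedAnalyses, and a thin legacy
// FunctionPass shim keeps old pipelines working. A minimal sketch of the
// shape (the Example* names are illustrative; registration boilerplate is
// omitted):

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
using namespace llvm;

static bool runExampleTransform(Function &F) {
  return false; // Shared implementation; both shells delegate here.
}

// New pass manager: a lightweight value type with a run() method.
struct ExamplePass {
  PreservedAnalyses run(Function &F) {
    return runExampleTransform(F) ? PreservedAnalyses::none()
                                  : PreservedAnalyses::all();
  }
};

// Legacy pass manager: the familiar FunctionPass boilerplate.
namespace {
struct ExampleLegacyPass : public FunctionPass {
  static char ID;
  ExampleLegacyPass() : FunctionPass(ID) {}
  bool runOnFunction(Function &F) override { return runExampleTransform(F); }
};
}
char ExampleLegacyPass::ID = 0;
//===----------------------------------------------------------------------===//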
- unsigned TrailingOnes = - MaskSCEV->getValue()->getValue().countTrailingOnes(); + unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes(); if (!TrailingOnes) return false; @@ -270,7 +273,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, OffSCEV = nullptr; if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) { AAPtr = PToI->getPointerOperand(); - OffSCEV = SE->getConstant(Int64Ty, 0); + OffSCEV = SE->getZero(Int64Ty); } else if (const SCEVAddExpr* AndLHSAddSCEV = dyn_cast<SCEVAddExpr>(AndLHSSCEV)) { // Try to find the ptrtoint; subtract it and the rest is the offset. @@ -410,7 +413,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { bool AlignmentFromAssumptions::runOnFunction(Function &F) { bool Changed = false; auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); NewDestAlignments.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp index 09c605e..cb9b8b6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -15,26 +15,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; #define DEBUG_TYPE "bdce" @@ -53,342 +45,42 @@ struct BDCE : public FunctionPass { void getAnalysisUsage(AnalysisUsage& AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<DemandedBits>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - - void determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2); - - AssumptionCache *AC; - DominatorTree *DT; }; } char BDCE::ID = 0; INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -static bool isAlwaysLive(Instruction *I) { - return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I->mayHaveSideEffects(); -} - -void BDCE::determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2) { - unsigned BitWidth = AB.getBitWidth(); - - // We're called once per operand, but 
for some instructions, we need to - // compute known bits of both operands in order to determine the live bits of - // either (when both operands are instructions themselves). We don't, - // however, want to do this twice, so we cache the result in APInts that live - // in the caller. For the two-relevant-operands case, both operand values are - // provided here. - auto ComputeKnownBits = - [&](unsigned BitWidth, const Value *V1, const Value *V2) { - const DataLayout &DL = I->getModule()->getDataLayout(); - KnownZero = APInt(BitWidth, 0); - KnownOne = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, - AC, UserI, DT); - - if (V2) { - KnownZero2 = APInt(BitWidth, 0); - KnownOne2 = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, - 0, AC, UserI, DT); - } - }; - - switch (UserI->getOpcode()) { - default: break; - case Instruction::Call: - case Instruction::Invoke: - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::bswap: - // The alive bits of the input are the swapped alive bits of - // the output. - AB = AOut.byteSwap(); - break; - case Intrinsic::ctlz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the left of, and including, the leftmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getHighBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countLeadingZeros()+1)); - } - break; - case Intrinsic::cttz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the right of, and including, the rightmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getLowBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countTrailingZeros()+1)); - } - break; - } - break; - case Instruction::Add: - case Instruction::Sub: - // Find the highest live output bit. We don't need any more input - // bits than that (adds, and thus subtracts, ripple only to the - // left). - AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); - break; - case Instruction::Shl: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.lshr(ShiftAmt); - - // If the shift is nuw/nsw, then the high bits are not dead - // (because we've promised that they *must* be zero). - const ShlOperator *S = cast<ShlOperator>(UserI); - if (S->hasNoSignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); - else if (S->hasNoUnsignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::LShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<LShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::AShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. 
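//===----------------------------------------------------------------------===//
// Editor's note: a worked instance of the Add/Sub rule above. Carries ripple
// only leftward, so if AOut has k active bits (highest live output bit is
// bit k-1), no input bit above bit k-1 can influence a live output bit. For
// AOut = 0b00000110, getActiveBits() == 3 and the live input mask is
// 0b00000111. (liveAddOperandBits is an illustrative helper, not the
// patch's own code.)

#include "llvm/ADT/APInt.h"
using namespace llvm;

static APInt liveAddOperandBits(const APInt &AOut) {
  // Keep exactly the input bits at or below the highest live output bit.
  return APInt::getLowBitsSet(AOut.getBitWidth(), AOut.getActiveBits());
}
//===----------------------------------------------------------------------===//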
- if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) - .getBoolValue()) - AB.setBit(BitWidth-1); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<AShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::And: - AB = AOut; - - // For bits that are known zero, the corresponding bits in the - // other operand are dead (unless they're both zero, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownZero2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownZero & ~KnownZero2); - } - break; - case Instruction::Or: - AB = AOut; - - // For bits that are known one, the corresponding bits in the - // other operand are dead (unless they're both one, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownOne2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownOne & ~KnownOne2); - } - break; - case Instruction::Xor: - case Instruction::PHI: - AB = AOut; - break; - case Instruction::Trunc: - AB = AOut.zext(BitWidth); - break; - case Instruction::ZExt: - AB = AOut.trunc(BitWidth); - break; - case Instruction::SExt: - AB = AOut.trunc(BitWidth); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. - if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), - AOut.getBitWidth() - BitWidth)) - .getBoolValue()) - AB.setBit(BitWidth-1); - break; - case Instruction::Select: - if (OperandNo != 0) - AB = AOut; - break; - } -} - bool BDCE::runOnFunction(Function& F) { if (skipOptnoneFunction(F)) return false; + DemandedBits &DB = getAnalysis<DemandedBits>(); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - DenseMap<Instruction *, APInt> AliveBits; SmallVector<Instruction*, 128> Worklist; - - // The set of visited instructions (non-integer-typed only). - SmallPtrSet<Instruction*, 128> Visited; - - // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (!isAlwaysLive(&I)) - continue; - - DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); - // For integer-valued instructions, set up an initial empty set of alive - // bits and add the instruction to the work list. For other instructions - // add their operands to the work list (for integer values operands, mark - // all bits as live). - if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { - if (!AliveBits.count(&I)) { - AliveBits[&I] = APInt(IT->getBitWidth(), 0); - Worklist.push_back(&I); - } - - continue; - } - - // Non-integer-typed instructions... - for (Use &OI : I.operands()) { - if (Instruction *J = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) - AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); - Worklist.push_back(J); - } - } - // To save memory, we don't add I to the Visited set here. 
Instead, we - // check isAlwaysLive on every instruction when searching for dead - // instructions later (we need to check isAlwaysLive for the - // integer-typed instructions anyway). - } - - // Propagate liveness backwards to operands. - while (!Worklist.empty()) { - Instruction *UserI = Worklist.pop_back_val(); - - DEBUG(dbgs() << "BDCE: Visiting: " << *UserI); - APInt AOut; - if (UserI->getType()->isIntegerTy()) { - AOut = AliveBits[UserI]; - DEBUG(dbgs() << " Alive Out: " << AOut); - } - DEBUG(dbgs() << "\n"); - - if (!UserI->getType()->isIntegerTy()) - Visited.insert(UserI); - - APInt KnownZero, KnownOne, KnownZero2, KnownOne2; - // Compute the set of alive bits for each operand. These are anded into the - // existing set, if any, and if that changes the set of alive bits, the - // operand is added to the work-list. - for (Use &OI : UserI->operands()) { - if (Instruction *I = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { - unsigned BitWidth = IT->getBitWidth(); - APInt AB = APInt::getAllOnesValue(BitWidth); - if (UserI->getType()->isIntegerTy() && !AOut && - !isAlwaysLive(UserI)) { - AB = APInt(BitWidth, 0); - } else { - // If all bits of the output are dead, then all bits of the input - // Bits of each operand that are used to compute alive bits of the - // output are alive, all others are dead. - determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, - KnownZero, KnownOne, - KnownZero2, KnownOne2); - } - - // If we've added to the set of alive bits (or the operand has not - // been previously visited), then re-queue the operand to be visited - // again. - APInt ABPrev(BitWidth, 0); - auto ABI = AliveBits.find(I); - if (ABI != AliveBits.end()) - ABPrev = ABI->second; - - APInt ABNew = AB | ABPrev; - if (ABNew != ABPrev || ABI == AliveBits.end()) { - AliveBits[I] = std::move(ABNew); - Worklist.push_back(I); - } - } else if (!Visited.count(I)) { - Worklist.push_back(I); - } - } - } - } - bool Changed = false; - // The inverse of the live set is the dead set. These are those instructions - // which have no side effects and do not influence the control flow or return - // value of the function, and may therefore be deleted safely. - // NOTE: We reuse the Worklist vector here for memory efficiency. - for (Instruction &I : inst_range(F)) { - // For live instructions that have all dead bits, first make them dead by - // replacing all uses with something else. Then, if they don't need to - // remain live (because they have side effects, etc.) we can remove them. - if (I.getType()->isIntegerTy()) { - auto ABI = AliveBits.find(&I); - if (ABI != AliveBits.end()) { - if (ABI->second.getBoolValue()) - continue; - - DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); - // FIXME: In theory we could substitute undef here instead of zero. - // This should be reconsidered once we settle on the semantics of - // undef, poison, etc. - Value *Zero = ConstantInt::get(I.getType(), 0); - ++NumSimplified; - I.replaceAllUsesWith(Zero); - Changed = true; - } - } else if (Visited.count(&I)) { - continue; + for (Instruction &I : instructions(F)) { + if (I.getType()->isIntegerTy() && + !DB.getDemandedBits(&I).getBoolValue()) { + // For live instructions that have all dead bits, first make them dead by + // replacing all uses with something else. Then, if they don't need to + // remain live (because they have side effects, etc.) we can remove them. 
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); + // FIXME: In theory we could substitute undef here instead of zero. + // This should be reconsidered once we settle on the semantics of + // undef, poison, etc. + Value *Zero = ConstantInt::get(I.getType(), 0); + ++NumSimplified; + I.replaceAllUsesWith(Zero); + Changed = true; } - - if (isAlwaysLive(&I)) + if (!DB.isInstructionDead(&I)) continue; Worklist.push_back(&I); diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 4288742..84f7f5f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -223,10 +223,10 @@ Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, } // The simple and common case. This also includes constant expressions. - if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst)) + if (!isa<PHINode>(Inst) && !Inst->isEHPad()) return Inst; - // We can't insert directly before a phi node or landing pad. Insert before + // We can't insert directly before a phi node or an eh pad. Insert before // the terminator of the incoming or dominating block. assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!"); if (Idx != ~0U && isa<PHINode>(Inst)) @@ -365,9 +365,9 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, /// into an instruction itself. void ConstantHoisting::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (Function::iterator BB : Fn) - for (BasicBlock::iterator Inst : *BB) - collectConstantCandidates(ConstCandMap, Inst); + for (BasicBlock &BB : Fn) + for (Instruction &Inst : BB) + collectConstantCandidates(ConstCandMap, &Inst); } /// \brief Find the base constant within the given range and rebase all other diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 79624b2..686bd40 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/IR/CFG.h" @@ -32,6 +33,7 @@ STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); STATISTIC(NumCmps, "Number of comparisons propagated"); +STATISTIC(NumReturns, "Number of return values propagated"); STATISTIC(NumDeadCases, "Number of switch cases removed"); namespace { @@ -43,6 +45,11 @@ namespace { bool processMemAccess(Instruction *I); bool processCmp(CmpInst *C); bool processSwitch(SwitchInst *SI); + bool processCallSite(CallSite CS); + + /// Return a constant value for V usable at At and everything it + /// dominates. If no such Constant can be found, return nullptr. 
+ Constant *getConstantAt(Value *V, Instruction *At); public: static char ID; @@ -54,6 +61,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } @@ -178,44 +186,33 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { return true; } -/// processCmp - If the value of this comparison could be determined locally, -/// constant propagation would already have figured it out. Instead, walk -/// the predecessors and statically evaluate the comparison based on information -/// available on that edge. If a given static evaluation is true on ALL -/// incoming edges, then it's true universally and we can simplify the compare. +/// processCmp - See if LazyValueInfo's ability to exploit edge conditions, +/// or range information is sufficient to prove this comparison. Even for +/// local conditions, this can sometimes prove conditions instcombine can't by +/// exploiting range information. bool CorrelatedValuePropagation::processCmp(CmpInst *C) { Value *Op0 = C->getOperand(0); - if (isa<Instruction>(Op0) && - cast<Instruction>(Op0)->getParent() == C->getParent()) - return false; - Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); if (!Op1) return false; - pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); - if (PI == PE) return false; + // As a policy choice, we choose not to waste compile time on anything where + // the comparison is testing local values. While LVI can sometimes reason + // about such cases, it's not its primary purpose. We do make sure to do + // the block local query for uses from terminator instructions, but that's + // handled in the code for each terminator. + auto *I = dyn_cast<Instruction>(Op0); + if (I && I->getParent() == C->getParent()) + return false; - LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C); if (Result == LazyValueInfo::Unknown) return false; - ++PI; - while (PI != PE) { - LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); - if (Res != Result) return false; - ++PI; - } - ++NumCmps; - if (Result == LazyValueInfo::True) C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); else C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); - C->eraseFromParent(); return true; @@ -307,6 +304,59 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { return Changed; } +/// processCallSite - Infer nonnull attributes for the arguments at the +/// specified callsite. 
+bool CorrelatedValuePropagation::processCallSite(CallSite CS) { + SmallVector<unsigned, 4> Indices; + unsigned ArgNo = 0; + + for (Value *V : CS.args()) { + PointerType *Type = dyn_cast<PointerType>(V->getType()); + + if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && + LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, + ConstantPointerNull::get(Type), + CS.getInstruction()) == LazyValueInfo::False) + Indices.push_back(ArgNo + 1); + ArgNo++; + } + + assert(ArgNo == CS.arg_size() && "sanity check"); + + if (Indices.empty()) + return false; + + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + + return true; +} + +Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { + if (Constant *C = LVI->getConstant(V, At->getParent(), At)) + return C; + + // TODO: The following really should be sunk inside LVI's core algorithm, or + // at least the outer shims around such. + auto *C = dyn_cast<CmpInst>(V); + if (!C) return nullptr; + + Value *Op0 = C->getOperand(0); + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); + if (!Op1) return nullptr; + + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At); + if (Result == LazyValueInfo::Unknown) + return nullptr; + + return (Result == LazyValueInfo::True) ? + ConstantInt::getTrue(C->getContext()) : + ConstantInt::getFalse(C->getContext()); +} + bool CorrelatedValuePropagation::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -318,7 +368,7 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { bool BBChanged = false; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { - Instruction *II = BI++; + Instruction *II = &*BI++; switch (II->getOpcode()) { case Instruction::Select: BBChanged |= processSelect(cast<SelectInst>(II)); @@ -334,6 +384,10 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Store: BBChanged |= processMemAccess(II); break; + case Instruction::Call: + case Instruction::Invoke: + BBChanged |= processCallSite(CallSite(II)); + break; } } @@ -342,7 +396,21 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Switch: BBChanged |= processSwitch(cast<SwitchInst>(Term)); break; + case Instruction::Ret: { + auto *RI = cast<ReturnInst>(Term); + // Try to determine the return value if we can. This is mainly here to + // simplify the writing of unit tests, but also helps to enable IPO by + // constant folding the return values of callees. 
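//===----------------------------------------------------------------------===//
// Editor's note: processCallSite above reduces nonnull inference to a single
// LazyValueInfo query: an argument is provably nonnull when "arg == null"
// folds to False at the call site. The same query in isolation (an
// illustrative helper; it assumes LVI was obtained from the pass as above):

#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isProvablyNonNull(LazyValueInfo *LVI, Value *V, Instruction *At) {
  auto *PT = dyn_cast<PointerType>(V->getType());
  if (!PT)
    return false; // Only pointer values can carry nonnull.
  // If "V == null" is known false at 'At', then V is nonnull there.
  return LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
                             ConstantPointerNull::get(PT),
                             At) == LazyValueInfo::False;
}
//===----------------------------------------------------------------------===//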
+ auto *RetVal = RI->getReturnValue(); + if (!RetVal) break; // handle "ret void" + if (isa<Constant>(RetVal)) break; // nothing to do + if (auto *C = getConstantAt(RetVal, RI)) { + ++NumReturns; + RI->replaceUsesOfWith(RetVal, C); + BBChanged = true; + } } + }; FnChanged |= BBChanged; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index 3b262a2..b67c3c7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" @@ -46,7 +47,7 @@ namespace { TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { Inst->eraseFromParent(); Changed = true; @@ -92,6 +93,34 @@ namespace { char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) +static bool DCEInstruction(Instruction *I, + SmallSetVector<Instruction *, 16> &WorkList, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI, TLI)) + WorkList.insert(OpI); + } + + I->eraseFromParent(); + ++DCEEliminated; + return true; + } + return false; +} + bool DCE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -99,39 +128,24 @@ bool DCE::runOnFunction(Function &F) { auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - // Start out with all of the instructions in the worklist... - std::vector<Instruction*> WorkList; - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) - WorkList.push_back(&*i); - - // Loop over the worklist finding instructions that are dead. If they are - // dead make them drop all of their uses, making other instructions - // potentially dead, and work until the worklist is empty. - // bool MadeChange = false; + SmallSetVector<Instruction *, 16> WorkList; + // Iterate over the original function, only adding insts to the worklist + // if they actually need to be revisited. This avoids having to pre-init + // the worklist with the entire function's worth of instructions. + for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) { + Instruction *I = &*FI; + ++FI; + + // We're visiting this instruction now, so make sure it's not in the + // worklist from an earlier visit. + if (!WorkList.count(I)) + MadeChange |= DCEInstruction(I, WorkList, TLI); + } + while (!WorkList.empty()) { - Instruction *I = WorkList.back(); - WorkList.pop_back(); - - if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead. 
- // Loop over all of the values that the instruction uses, if there are - // instructions being used, add them to the worklist, because they might - // go dead after this one is removed. - // - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *Used = dyn_cast<Instruction>(*OI)) - WorkList.push_back(Used); - - // Remove the instruction. - I->eraseFromParent(); - - // Remove the instruction from the worklist if it still exists in it. - WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), - WorkList.end()); - - MadeChange = true; - ++DCEEliminated; - } + Instruction *I = WorkList.pop_back_val(); + MadeChange |= DCEInstruction(I, WorkList, TLI); } return MadeChange; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index c505584..36ad0a5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -40,6 +41,7 @@ using namespace llvm; #define DEBUG_TYPE "dse" +STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -59,23 +61,24 @@ namespace { if (skipOptnoneFunction(F)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = AA->getTargetLibraryInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock &I : F) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
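//===----------------------------------------------------------------------===//
// Editor's note: the DCE rewrite above replaces "seed the worklist with every
// instruction" by a single sweep that only enqueues operands which may have
// just become dead; the deduplicating SmallSetVector also removes the old
// linear erase-from-worklist scan. A compact sketch of that driver under the
// same assumptions (runSimpleDCE is an illustrative name; the TLI parameter
// is omitted for brevity):

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static bool runSimpleDCE(Function &F) {
  bool Changed = false;
  SmallSetVector<Instruction *, 16> WorkList;
  for (inst_iterator It = inst_begin(F), E = inst_end(F); It != E;) {
    Instruction *I = &*It++; // Advance first: I may be erased below.
    if (!WorkList.count(I) && isInstructionTriviallyDead(I)) {
      for (Value *OpV : I->operand_values())
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          WorkList.insert(OpI); // May become dead once I is gone.
      I->eraseFromParent();
      Changed = true;
    }
  }
  // Drain instructions that became dead while their operands were deleted.
  while (!WorkList.empty()) {
    Instruction *I = WorkList.pop_back_val();
    if (isInstructionTriviallyDead(I)) {
      for (Value *OpV : I->operand_values())
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (OpI != I)
            WorkList.insert(OpI);
      I->eraseFromParent();
      Changed = true;
    }
  }
  return Changed;
}
//===----------------------------------------------------------------------===//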
- if (DT->isReachableFromEntry(I)) - Changed |= runOnBasicBlock(*I); + if (DT->isReachableFromEntry(&I)) + Changed |= runOnBasicBlock(I); AA = nullptr; MD = nullptr; DT = nullptr; return Changed; } bool runOnBasicBlock(BasicBlock &BB); + bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const MemoryLocation &LoadedLoc, @@ -85,10 +88,11 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } }; @@ -97,8 +101,10 @@ namespace { char DSE::ID = 0; INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } @@ -115,7 +121,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, - const TargetLibraryInfo *TLI, + const TargetLibraryInfo &TLI, SmallSetVector<Value*, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -140,7 +146,7 @@ static void DeleteDeadInstruction(Instruction *I, if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI, TLI)) + if (isInstructionTriviallyDead(OpI, &TLI)) NowDeadInsts.push_back(OpI); } @@ -153,7 +159,7 @@ static void DeleteDeadInstruction(Instruction *I, /// hasMemoryWrite - Does this instruction write some memory? This only returns /// true for things that we can analyze with other helpers below. 
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { +static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) { if (isa<StoreInst>(I)) return true; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -170,20 +176,20 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - if (TLI && TLI->has(LibFunc::strcpy) && - F->getName() == TLI->getName(LibFunc::strcpy)) { + if (TLI.has(LibFunc::strcpy) && + F->getName() == TLI.getName(LibFunc::strcpy)) { return true; } - if (TLI && TLI->has(LibFunc::strncpy) && - F->getName() == TLI->getName(LibFunc::strncpy)) { + if (TLI.has(LibFunc::strncpy) && + F->getName() == TLI.getName(LibFunc::strncpy)) { return true; } - if (TLI && TLI->has(LibFunc::strcat) && - F->getName() == TLI->getName(LibFunc::strcat)) { + if (TLI.has(LibFunc::strcat) && + F->getName() == TLI.getName(LibFunc::strcat)) { return true; } - if (TLI && TLI->has(LibFunc::strncat) && - F->getName() == TLI->getName(LibFunc::strncat)) { + if (TLI.has(LibFunc::strncat) && + F->getName() == TLI.getName(LibFunc::strncat)) { return true; } } @@ -224,9 +230,9 @@ static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// getLocForRead - Return the location read by the specified "hasMemoryWrite" /// instruction if any. -static MemoryLocation getLocForRead(Instruction *Inst, AliasAnalysis &AA) { - assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) && - "Unknown instruction case"); +static MemoryLocation getLocForRead(Instruction *Inst, + const TargetLibraryInfo &TLI) { + assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case"); // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). @@ -313,9 +319,9 @@ static Value *getStoredPointerOperand(Instruction *I) { } static uint64_t getPointerSize(const Value *V, const DataLayout &DL, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo &TLI) { uint64_t Size; - if (getObjectSize(V, Size, DL, TLI)) + if (getObjectSize(V, Size, DL, &TLI)) return Size; return MemoryLocation::UnknownSize; } @@ -336,7 +342,7 @@ namespace { static OverwriteResult isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, - const TargetLibraryInfo *TLI, + const TargetLibraryInfo &TLI, int64_t &EarlierOff, int64_t &LaterOff) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -442,10 +448,12 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, /// because the DSE inducing instruction may be a self-read. static bool isPossibleSelfRead(Instruction *Inst, const MemoryLocation &InstStoreLoc, - Instruction *DepWrite, AliasAnalysis &AA) { + Instruction *DepWrite, + const TargetLibraryInfo &TLI, + AliasAnalysis &AA) { // Self reads can only happen for instructions that read memory. Get the // location read. - MemoryLocation InstReadLoc = getLocForRead(Inst, AA); + MemoryLocation InstReadLoc = getLocForRead(Inst, TLI); if (!InstReadLoc.Ptr) return false; // Not a reading instruction. // If the read and written loc obviously don't alias, it isn't a read. @@ -459,7 +467,7 @@ static bool isPossibleSelfRead(Instruction *Inst, // Here we don't know if A/B may alias, but we do know that B/B are must // aliases, so removing the first memcpy is safe (assuming it writes <= # // bytes as the second one. 
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, AA); + MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI); if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; @@ -475,11 +483,12 @@ static bool isPossibleSelfRead(Instruction *Inst, //===----------------------------------------------------------------------===// bool DSE::runOnBasicBlock(BasicBlock &BB) { + const DataLayout &DL = BB.getModule()->getDataLayout(); bool MadeChange = false; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { - Instruction *Inst = BBI++; + Instruction *Inst = &*BBI++; // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst, TLI)) { @@ -488,42 +497,68 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } // If we find something that writes memory, get its memory dependence. - if (!hasMemoryWrite(Inst, TLI)) - continue; - - MemDepResult InstDep = MD->getDependency(Inst); - - // Ignore any store where we can't find a local dependence. - // FIXME: cross-block DSE would be fun. :) - if (!InstDep.isDef() && !InstDep.isClobber()) + if (!hasMemoryWrite(Inst, *TLI)) continue; // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { + + auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) { + // DeleteDeadInstruction can delete the current instruction. Save BBI + // in case we need it. + WeakVH NextInst(&*BBI); + + DeleteDeadInstruction(DeadInst, *MD, *TLI); + + if (!NextInst) // Next instruction deleted. + BBI = BB.begin(); + else if (BBI != BB.begin()) // Revisit this instruction if possible. + --BBI; + ++NumRedundantStores; + MadeChange = true; + }; + + if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && isRemovable(SI)) { + isRemovable(SI) && + MemoryIsNotModifiedBetween(DepLoad, SI)) { + DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - // DeleteDeadInstruction can delete the current instruction. Save BBI - // in case we need it. - WeakVH NextInst(BBI); + RemoveDeadInstAndUpdateBBI(SI); + continue; + } + } - DeleteDeadInstruction(SI, *MD, TLI); + // Remove null stores into the calloc'ed objects + Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand()); - if (!NextInst) // Next instruction deleted. - BBI = BB.begin(); - else if (BBI != BB.begin()) // Revisit this instruction if possible. - --BBI; - ++NumFastStores; - MadeChange = true; + if (StoredConstant && StoredConstant->isNullValue() && + isRemovable(SI)) { + Instruction *UnderlyingPointer = dyn_cast<Instruction>( + GetUnderlyingObject(SI->getPointerOperand(), DL)); + + if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && + MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) { + DEBUG(dbgs() + << "DSE: Remove null store to the calloc'ed object:\n DEAD: " + << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); + + RemoveDeadInstAndUpdateBBI(SI); continue; } } } + MemDepResult InstDep = MD->getDependency(Inst); + + // Ignore any store where we can't find a local dependence. + // FIXME: cross-block DSE would be fun. :) + if (!InstDep.isDef() && !InstDep.isClobber()) + continue; + // Figure out what location is being stored to. 
MemoryLocation Loc = getLocForWrite(Inst, *AA); @@ -549,24 +584,22 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. if (isRemovable(DepWrite) && - !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { + !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - const DataLayout &DL = BB.getModule()->getDataLayout(); OverwriteResult OR = - isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(), - DepWriteOffset, InstWriteOffset); + isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD, TLI); + DeleteDeadInstruction(DepWrite, *MD, *TLI); ++NumFastStores; MadeChange = true; // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. - BBI = Inst; + BBI = Inst->getIterator(); if (BBI != BB.begin()) --BBI; break; @@ -609,10 +642,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) + if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) break; - InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); + InstDep = MD->getPointerDependencyFrom(Loc, false, + DepWrite->getIterator(), &BB); } } @@ -624,6 +658,64 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { return MadeChange; } +/// Returns true if the memory which is accessed by the second instruction is not +/// modified between the first and the second instruction. +/// Precondition: Second instruction must be dominated by the first +/// instruction. +bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, + Instruction *SecondI) { + SmallVector<BasicBlock *, 16> WorkList; + SmallPtrSet<BasicBlock *, 8> Visited; + BasicBlock::iterator FirstBBI(FirstI); + ++FirstBBI; + BasicBlock::iterator SecondBBI(SecondI); + BasicBlock *FirstBB = FirstI->getParent(); + BasicBlock *SecondBB = SecondI->getParent(); + MemoryLocation MemLoc = MemoryLocation::get(SecondI); + + // Start checking the store-block. + WorkList.push_back(SecondBB); + bool isFirstBlock = true; + + // Check all blocks going backward until we reach the load-block. + while (!WorkList.empty()) { + BasicBlock *B = WorkList.pop_back_val(); + + // Ignore instructions before LI if this is the FirstBB. + BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin()); + + BasicBlock::iterator EI; + if (isFirstBlock) { + // Ignore instructions after SI if this is the first visit of SecondBB. + assert(B == SecondBB && "first block is not the store block"); + EI = SecondBBI; + isFirstBlock = false; + } else { + // It's not SecondBB or (in case of a loop) the second visit of SecondBB. + // In this case we also have to look at instructions after SI. 
+ EI = B->end(); + } + for (; BI != EI; ++BI) { + Instruction *I = &*BI; + if (I->mayWriteToMemory() && I != SecondI) { + auto Res = AA->getModRefInfo(I, MemLoc); + if (Res != MRI_NoModRef) + return false; + } + } + if (B != FirstBB) { + assert(B != &FirstBB->getParent()->getEntryBlock() && + "Should not hit the entry block because SI must be dominated by LI"); + for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) { + if (!Visited.insert(*PredI).second) + continue; + WorkList.push_back(*PredI); + } + } + } + return true; +} + /// Find all blocks that will unconditionally lead to the block BB and append /// them to F. static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, @@ -655,10 +747,11 @@ bool DSE::HandleFree(CallInst *F) { Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; - MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); + MemDepResult Dep = + MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) + if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = @@ -668,10 +761,10 @@ bool DSE::HandleFree(CallInst *F) { if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - Instruction *Next = std::next(BasicBlock::iterator(Dependency)); + auto Next = ++Dependency->getIterator(); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD, TLI); + DeleteDeadInstruction(Dependency, *MD, *TLI); ++NumFastStores; MadeChange = true; @@ -704,23 +797,22 @@ bool DSE::handleEndBlock(BasicBlock &BB) { SmallSetVector<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. - BasicBlock *Entry = BB.getParent()->begin(); - for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { - if (isa<AllocaInst>(I)) - DeadStackObjects.insert(I); + BasicBlock &Entry = BB.getParent()->front(); + for (Instruction &I : Entry) { + if (isa<AllocaInst>(&I)) + DeadStackObjects.insert(&I); // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) - DeadStackObjects.insert(I); + else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true)) + DeadStackObjects.insert(&I); } // Treat byval or inalloca arguments the same, stores to them are dead at the // end of the function. - for (Function::arg_iterator AI = BB.getParent()->arg_begin(), - AE = BB.getParent()->arg_end(); AI != AE; ++AI) - if (AI->hasByValOrInAllocaAttr()) - DeadStackObjects.insert(AI); + for (Argument &AI : BB.getParent()->args()) + if (AI.hasByValOrInAllocaAttr()) + DeadStackObjects.insert(&AI); const DataLayout &DL = BB.getModule()->getDataLayout(); @@ -729,10 +821,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { --BBI; // If we find a store, check to see if it points into a dead stack value. - if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { + if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; - GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL); + GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL); // Stores to stack values are valid candidates for removal. 
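The walk MemoryIsNotModifiedBetween performs generalizes to any CFG: scan backward from the second instruction across all paths until the first is reached, failing on any potential clobber. A self-contained toy model follows, with illustrative names and an integer-indexed CFG standing in for the LLVM structures:

#include <cassert>
#include <vector>

struct Block {
  std::vector<bool> MayWrite; // one flag per instruction in the block
  std::vector<int> Preds;     // indices of predecessor blocks
};

// Returns true if no instruction strictly between First and Second (on any
// path) may write memory. First must dominate Second, as in the real code.
bool notModifiedBetween(const std::vector<Block> &CFG, int FirstBB,
                        int FirstIdx, int SecondBB, int SecondIdx) {
  std::vector<int> WorkList{SecondBB};
  std::vector<bool> Visited(CFG.size(), false);
  bool IsFirstVisitOfSecondBB = true;
  while (!WorkList.empty()) {
    int B = WorkList.back();
    WorkList.pop_back();
    // Skip everything up to and including First in its own block.
    int Begin = (B == FirstBB) ? FirstIdx + 1 : 0;
    // On the first visit of Second's block stop at Second itself; a revisit
    // (via a loop) means the whole block lies on a path and is scanned fully.
    int End = (IsFirstVisitOfSecondBB && B == SecondBB)
                  ? SecondIdx
                  : (int)CFG[B].MayWrite.size();
    IsFirstVisitOfSecondBB = false;
    for (int I = Begin; I < End; ++I)
      if (CFG[B].MayWrite[I])
        return false; // a potential clobber on some path
    if (B != FirstBB) // keep walking backward until the first block
      for (int P : CFG[B].Preds)
        if (!Visited[P]) {
          Visited[P] = true;
          WorkList.push_back(P);
        }
  }
  return true;
}

int main() {
  // B0 holds First at index 0 and feeds B1, which holds Second at index 1.
  std::vector<Block> CFG(2);
  CFG[0] = {{false, false}, {}};
  CFG[1] = {{false, false}, {0}};
  assert(notModifiedBetween(CFG, 0, 0, 1, 1));
  CFG[1].MayWrite[0] = true; // introduce a may-write between the two
  assert(!notModifiedBetween(CFG, 0, 0, 1, 1));
}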
bool AllDead = true; @@ -744,7 +836,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } if (AllDead) { - Instruction *Dead = BBI++; + Instruction *Dead = &*BBI++; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Objects: "; @@ -757,7 +849,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -765,9 +857,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. - if (isInstructionTriviallyDead(BBI, TLI)) { - Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); + if (isInstructionTriviallyDead(&*BBI, TLI)) { + Instruction *Inst = &*BBI++; + DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -776,15 +868,15 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (isa<AllocaInst>(BBI)) { // Remove allocas from the list of dead stack objects; there can't be // any references before the definition. - DeadStackObjects.remove(BBI); + DeadStackObjects.remove(&*BBI); continue; } - if (auto CS = CallSite(BBI)) { + if (auto CS = CallSite(&*BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. - if (isAllocLikeFn(BBI, TLI)) - DeadStackObjects.remove(BBI); + if (isAllocLikeFn(&*BBI, TLI)) + DeadStackObjects.remove(&*BBI); // If this call does not access memory, it can't be loading any of our // pointers. @@ -795,10 +887,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - AliasAnalysis::ModRefResult A = AA->getModRefInfo( - CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); + ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); - return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + return A == MRI_ModRef || A == MRI_Ref; }); // If all of the allocas were clobbered by the call then we're not going @@ -864,8 +955,7 @@ void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc, // Remove objects that could alias LoadedLoc. DeadStackObjects.remove_if([&](Value *I) { // See if the loaded location could alias the stack location. - MemoryLocation StackLoc(I, - getPointerSize(I, DL, AA->getTargetLibraryInfo())); + MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI)); return !AA->isNoAlias(StackLoc, LoadedLoc); }); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 029b44c..7ef062e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -263,7 +264,6 @@ namespace { /// expected that a later pass of GVN will catch the interesting/hard cases. class EarlyCSE { public: - Function &F; const TargetLibraryInfo &TLI; const TargetTransformInfo &TTI; DominatorTree &DT; @@ -281,20 +281,37 @@ public: /// that dominated values can succeed in their lookup. 
ScopedHTType AvailableValues; - /// \brief A scoped hash table of the current values of loads. + /// A scoped hash table of the current values of previously encountered memory + /// locations. /// - /// This allows us to get efficient access to dominating loads when we have - /// a fully redundant load. In addition to the most recent load, we keep - /// track of a generation count of the read, which is compared against the - /// current generation count. The current generation count is incremented + /// This allows us to get efficient access to dominating loads or stores when + /// we have a fully redundant load. In addition to the most recent load, we + /// keep track of a generation count of the read, which is compared against + /// the current generation count. The current generation count is incremented /// after every possibly writing memory operation, which ensures that we only - /// CSE loads with other loads that have no intervening store. - typedef RecyclingAllocator< - BumpPtrAllocator, - ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>> + /// CSE loads with other loads that have no intervening store. Ordering + /// events (such as fences or atomic instructions) increment the generation + /// count as well; essentially, we model these as writes to all possible + /// locations. Note that atomic and/or volatile loads and stores can be + /// present in the table; it is the responsibility of the consumer to inspect + /// the atomicity/volatility if needed. + struct LoadValue { + Value *Data; + unsigned Generation; + int MatchingId; + bool IsAtomic; + LoadValue() + : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {} + LoadValue(Value *Data, unsigned Generation, unsigned MatchingId, + bool IsAtomic) + : Data(Data), Generation(Generation), MatchingId(MatchingId), + IsAtomic(IsAtomic) {} + }; + typedef RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<Value *, LoadValue>> LoadMapAllocator; - typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>, - DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType; + typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, + LoadMapAllocator> LoadHTType; LoadHTType AvailableLoads; /// \brief A scoped hash table of the current values of read-only call @@ -308,10 +325,9 @@ public: unsigned CurrentGeneration; /// \brief Set up the EarlyCSE runner for a particular function.
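A toy model of the generation scheme documented above, stripped of scoping, matching ids, and atomicity; names are illustrative and this is not the pass's data structure:

#include <iostream>
#include <map>
#include <string>

// Stand-in for LoadValue; the real table is a scoped hash table keyed on
// pointer operands, with matching ids and atomicity flags as well.
struct LoadValueSketch {
  int Data;
  unsigned Generation;
};

int main() {
  std::map<std::string, LoadValueSketch> AvailableLoads;
  unsigned CurrentGeneration = 0;

  AvailableLoads["%p"] = {42, CurrentGeneration}; // %v1 = load %p
  LoadValueSketch Hit = AvailableLoads["%p"];     // %v2 = load %p
  std::cout << (Hit.Generation == CurrentGeneration) << '\n'; // 1: CSE ok

  ++CurrentGeneration; // a store, fence, or atomic operation was seen
  Hit = AvailableLoads["%p"];
  std::cout << (Hit.Generation == CurrentGeneration) << '\n'; // 0: stale
}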
- EarlyCSE(Function &F, const TargetLibraryInfo &TLI, - const TargetTransformInfo &TTI, DominatorTree &DT, - AssumptionCache &AC) - : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} + EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, + DominatorTree &DT, AssumptionCache &AC) + : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} bool run(); @@ -382,57 +398,91 @@ private: class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), Vol(false), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { - MayReadFromMemory = Inst->mayReadFromMemory(); - MayWriteToMemory = Inst->mayWriteToMemory(); - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - MemIntrinsicInfo Info; - if (!TTI.getTgtMemIntrinsic(II, Info)) - return; - if (Info.NumMemRefs == 1) { - Store = Info.WriteMem; - Load = Info.ReadMem; - MatchingId = Info.MatchingId; - MayReadFromMemory = Info.ReadMem; - MayWriteToMemory = Info.WriteMem; - Vol = Info.Vol; - Ptr = Info.PtrVal; - } - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Load = true; - Vol = !LI->isSimple(); - Ptr = LI->getPointerOperand(); + : IsTargetMemInst(false), Inst(Inst) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) + if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) + IsTargetMemInst = true; + } + bool isLoad() const { + if (IsTargetMemInst) return Info.ReadMem; + return isa<LoadInst>(Inst); + } + bool isStore() const { + if (IsTargetMemInst) return Info.WriteMem; + return isa<StoreInst>(Inst); + } + bool isAtomic() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + return Inst->isAtomic(); + } + bool isUnordered() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return true; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isUnordered(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + return SI->isUnordered(); + } + // Conservative answer + return !Inst->isAtomic(); + } + + bool isVolatile() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isVolatile(); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - Store = true; - Vol = !SI->isSimple(); - Ptr = SI->getPointerOperand(); + return SI->isVolatile(); } + // Conservative answer + return true; } - bool isLoad() { return Load; } - bool isStore() { return Store; } - bool isVolatile() { return Vol; } - bool isMatchingMemLoc(const ParseMemoryInst &Inst) { - return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + + + bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { + return (getPointerOperand() == Inst.getPointerOperand() && + getMatchingId() == Inst.getMatchingId()); } - bool isValid() { return Ptr != nullptr; } - int getMatchingId() { return MatchingId; } - Value *getPtr() { return Ptr; } - bool mayReadFromMemory() { return MayReadFromMemory; } - bool mayWriteToMemory() { return MayWriteToMemory; } + bool isValid() const { return getPointerOperand() != nullptr; } - private: - bool Load; - bool Store; - bool Vol; - bool MayReadFromMemory; - bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. 
That field contains // non-negative values only. - int MatchingId; - Value *Ptr; + int getMatchingId() const { + if (IsTargetMemInst) return Info.MatchingId; + return -1; + } + Value *getPointerOperand() const { + if (IsTargetMemInst) return Info.PtrVal; + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + return SI->getPointerOperand(); + } + return nullptr; + } + bool mayReadFromMemory() const { + if (IsTargetMemInst) return Info.ReadMem; + return Inst->mayReadFromMemory(); + } + bool mayWriteToMemory() const { + if (IsTargetMemInst) return Info.WriteMem; + return Inst->mayWriteToMemory(); + } + + private: + bool IsTargetMemInst; + MemIntrinsicInfo Info; + Instruction *Inst; }; bool processNode(DomTreeNode *Node); @@ -497,7 +547,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // Dead instructions should just be removed. if (isInstructionTriviallyDead(Inst, &TLI)) { @@ -548,24 +598,26 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. if (MemInst.isValid() && MemInst.isLoad()) { - // Ignore volatile loads. - if (MemInst.isVolatile()) { + // (conservatively) we can't peak past the ordering implied by this + // operation, but we can add this load to our set of available values + if (MemInst.isVolatile() || !MemInst.isUnordered()) { LastStore = nullptr; - // Don't CSE across synchronization boundaries. - if (Inst->mayWriteToMemory()) - ++CurrentGeneration; - continue; + ++CurrentGeneration; } // If we have an available version of this load, and if it is the right // generation, replace this instruction. - std::pair<Value *, unsigned> InVal = - AvailableLoads.lookup(MemInst.getPtr()); - if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - Value *Op = getOrCreateResult(InVal.first, Inst->getType()); + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); + if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && + InVal.MatchingId == MemInst.getMatchingId() && + // We don't yet handle removing loads with ordering of any kind. + !MemInst.isVolatile() && MemInst.isUnordered() && + // We can't replace an atomic load with one which isn't also atomic. + InVal.IsAtomic >= MemInst.isAtomic()) { + Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); if (Op != nullptr) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst - << " to: " << *InVal.first << '\n'); + << " to: " << *InVal.Data << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(Op); Inst->eraseFromParent(); @@ -576,8 +628,10 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, remember that we have this instruction. - AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( - Inst, CurrentGeneration)); + AvailableLoads.insert( + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); LastStore = nullptr; continue; } @@ -613,6 +667,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // A release fence requires that all stores complete before it, but does + // not prevent the reordering of following loads 'before' the fence. 
As a + // result, we don't need to consider it as writing to memory and don't need + // to advance the generation. We do need to prevent DSE across the fence, + // but that's handled above. + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + if (FI->getOrdering() == Release) { + assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above"); + continue; + } + + // write back DSE - If we write back the same value we just loaded from + // the same location and haven't passed any intervening writes or ordering + // operations, we can remove the write. The primary benefit is in allowing + // the available load table to remain valid and value forward past where + // the store originally was. + if (MemInst.isValid() && MemInst.isStore()) { + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); + if (InVal.Data && + InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) && + InVal.Generation == CurrentGeneration && + InVal.MatchingId == MemInst.getMatchingId() && + // We don't yet handle removing stores with ordering of any kind. + !MemInst.isVolatile() && MemInst.isUnordered()) { + assert((!LastStore || + ParseMemoryInst(LastStore, TTI).getPointerOperand() == + MemInst.getPointerOperand()) && + "can't have an intervening store!"); + DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n'); + Inst->eraseFromParent(); + Changed = true; + ++NumDSE; + // We can avoid incrementing the generation count since we were able + // to eliminate this store. + continue; + } + } + // Okay, this isn't something we can CSE at all. Check to see if it is // something that could modify memory. If so, our available memory values // cannot be used so bump the generation count. @@ -622,8 +714,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (MemInst.isValid() && MemInst.isStore()) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. + // At the moment, we don't remove ordered stores, but do remove + // unordered atomic stores. There's no special requirement (for + // unordered atomics) about removing atomic stores only in favor of + // other atomic stores since we we're going to execute the non-atomic + // one anyway and the atomic one might never have become visible. if (LastStore) { ParseMemoryInst LastStoreMemInst(LastStore, TTI); + assert(LastStoreMemInst.isUnordered() && + !LastStoreMemInst.isVolatile() && + "Violated invariant"); if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " << *Inst << '\n'); @@ -640,12 +740,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // version of the pointer. It is safe to forward from volatile stores // to non-volatile loads, so we don't have to check for volatility of // the store. - AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( - Inst, CurrentGeneration)); - - // Remember that this was the last store we saw for DSE. - if (!MemInst.isVolatile()) + AvailableLoads.insert( + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); + + // Remember that this was the last unordered store we saw for DSE. We + // don't yet handle DSE on ordered or volatile stores since we don't + // have a good way to model the ordering requirement for following + // passes once the store is removed. 
We could insert a fence, but + // since fences are slightly stronger than stores in their ordering, + // it's not clear this is a profitable transform. Another option would + // be to merge the ordering with that of the post dominating store. + if (MemInst.isUnordered() && !MemInst.isVolatile()) LastStore = Inst; + else + LastStore = nullptr; } } } @@ -714,7 +824,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, auto &DT = AM->getResult<DominatorTreeAnalysis>(F); auto &AC = AM->getResult<AssumptionAnalysis>(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); if (!CSE.run()) return PreservedAnalyses::all(); @@ -751,7 +861,7 @@ public: auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); return CSE.run(); } @@ -761,6 +871,7 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } }; diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 0430c18..185cdbd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -30,7 +30,7 @@ public: bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } private: @@ -41,7 +41,7 @@ private: char FlattenCFGPass::ID = 0; INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) @@ -59,7 +59,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(BBIt++, AA)) { + if (FlattenCFG(&*BBIt++, AA)) { LocalChange = true; } } @@ -69,7 +69,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } bool FlattenCFGPass::runOnFunction(Function &F) { - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverChanged = false; // iterativelyFlattenCFG can make some blocks dead. while (iterativelyFlattenCFG(F, AA)) { diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp index c931422..7f5d786 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" @@ -41,7 +43,7 @@ using namespace llvm; // integer domain inputs, produce an integer output; fadd, for example. // // If a non-mappable instruction is seen, this entire def-use graph is marked -// as non-transformable. If we see an instruction that converts from the +// as non-transformable. 
If we see an instruction that converts from the // integer domain to FP domain (uitofp,sitofp), we terminate our walk. /// The largest integer type worth dealing with. @@ -60,6 +62,7 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); @@ -82,7 +85,9 @@ namespace { } char Float2Int::ID = 0; -INITIALIZE_PASS(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false) // Given a FCmp predicate, return a matching ICmp predicate if one // exists, otherwise return BAD_ICMP_PREDICATE. @@ -125,7 +130,9 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { + if (isa<VectorType>(I.getType())) + continue; switch (I.getOpcode()) { default: break; case Instruction::FPToUI: @@ -133,7 +140,7 @@ void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { Roots.insert(&I); break; case Instruction::FCmp: - if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != CmpInst::BAD_ICMP_PREDICATE) Roots.insert(&I); break; @@ -176,7 +183,7 @@ ConstantRange Float2Int::validateRange(ConstantRange R) { // - walkForwards: Iterate over SeenInsts in reverse order, so we visit // defs before their uses. Calculate the real range info. -// Breadth-first walk of the use-def graph; determine the set of nodes +// Breadth-first walk of the use-def graph; determine the set of nodes // we care about and eagerly determine if some of them are poisonous. void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { std::deque<Instruction*> Worklist(Roots.begin(), Roots.end()); @@ -222,14 +229,14 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { seen(I, unknownRange()); break; } - + for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { // Unify def-use chains if they interfere. ECs.unionSets(I, OI); - if (SeenInsts.find(I)->second != badRange()) + if (SeenInsts.find(I)->second != badRange()) Worklist.push_back(OI); - } else if (!isa<ConstantFP>(O)) { + } else if (!isa<ConstantFP>(O)) { // Not an instruction or ConstantFP? we can't do anything. seen(I, badRange()); } @@ -240,11 +247,11 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { // Walk forwards down the list of seen instructions, so we visit defs before // uses. void Float2Int::walkForwards() { - for (auto It = SeenInsts.rbegin(), E = SeenInsts.rend(); It != E; ++It) { - if (It->second != unknownRange()) + for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) { + if (It.second != unknownRange()) continue; - Instruction *I = It->first; + Instruction *I = It.first; std::function<ConstantRange(ArrayRef<ConstantRange>)> Op; switch (I->getOpcode()) { // FIXME: Handle select and phi nodes. 
@@ -299,7 +306,7 @@ void Float2Int::walkForwards() { for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { assert(SeenInsts.find(OI) != SeenInsts.end() && - "def not seen before use!"); + "def not seen before use!"); OpRanges.push_back(SeenInsts.find(OI)->second); } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) { // Work out if the floating point number can be losslessly represented @@ -314,11 +321,11 @@ void Float2Int::walkForwards() { APFloat F = CF->getValueAPF(); // First, weed out obviously incorrect values. Non-finite numbers - // can't be represented and neither can negative zero, unless + // can't be represented and neither can negative zero, unless // we're in fast math mode. if (!F.isFinite() || (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && - !I->hasNoSignedZeros())) { + !I->hasNoSignedZeros())) { seen(I, badRange()); Abort = true; break; @@ -345,7 +352,7 @@ void Float2Int::walkForwards() { // Reduce the operands' ranges to a single range and return. if (!Abort) - seen(I, Op(OpRanges)); + seen(I, Op(OpRanges)); } } @@ -395,7 +402,7 @@ bool Float2Int::validateAndTransform() { R.isFullSet() || R.isSignWrappedSet()) continue; assert(ConvertedToTy && "Must have set the convertedtoty by this point!"); - + // The number of bits required is the maximum of the upper and // lower limits, plus one so it can be signed. unsigned MinBW = std::max(R.getLower().getMinSignedBits(), @@ -505,9 +512,8 @@ Value *Float2Int::convert(Instruction *I, Type *ToTy) { // Perform dead code elimination on the instructions we just modified. void Float2Int::cleanup() { - for (auto I = ConvertedInsts.rbegin(), E = ConvertedInsts.rend(); - I != E; ++I) - I->first->eraseFromParent(); + for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend())) + I.first->eraseFromParent(); } bool Float2Int::runOnFunction(Function &F) { @@ -534,7 +540,4 @@ bool Float2Int::runOnFunction(Function &F) { return Modified; } -FunctionPass *llvm::createFloat2IntPass() { - return new Float2Int(); -} - +FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 89a0d0a..a028b8c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -128,6 +129,7 @@ namespace { uint32_t lookup(Value *V) const; uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); + bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); void erase(Value *v); @@ -388,6 +390,9 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { } } +/// Returns true if a value number exists for the specified value. +bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } + /// lookup_or_add - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t ValueTable::lookup_or_add(Value *V) { @@ -608,6 +613,10 @@ namespace { DenseMap<uint32_t, LeaderTableEntry> LeaderTable; BumpPtrAllocator TableAllocator; + // Block-local map of equivalent values to their leader, does not + // propagate to any successors. 
Entries added mid-block are applied + // to the remaining instructions in the block. + SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap; SmallVector<Instruction*, 8> InstrsToErase; typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; @@ -689,16 +698,17 @@ namespace { AU.addRequired<TargetLibraryInfoWrapperPass>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - // Helper fuctions of redundant load elimination + // Helper functions of redundant load elimination bool processLoad(LoadInst *L); bool processNonLocalLoad(LoadInst *L); + bool processAssumeIntrinsic(IntrinsicInst *II); void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); @@ -719,7 +729,9 @@ namespace { void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); - bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool replaceOperandsWithConsts(Instruction *I) const; + bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge); bool processFoldableCondBr(BranchInst *BI); void addDeadBlock(BasicBlock *BB); void assignValNumForDeadCode(); @@ -738,7 +750,8 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1290,8 +1303,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { - const AvailableValueInBlock &AV = ValuesPerBlock[i]; + for (const AvailableValueInBlock &AV : ValuesPerBlock) { BasicBlock *BB = AV.BB; if (SSAUpdate.HasValueForBlock(BB)) @@ -1301,24 +1313,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, } // Perform PHI construction. - Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - - // If new PHI nodes were created, notify alias analysis. - if (V->getType()->getScalarType()->isPointerTy()) { - AliasAnalysis *AA = gvn.getAliasAnalysis(); - - // Scan the new PHIs and inform alias analysis that we've added potentially - // escaping uses to any values that are operands to these PHIs. - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { - PHINode *P = NewPHIs[i]; - for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - AA->addEscapingUse(P->getOperandUse(jj)); - } - } - } - - return V; + return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); } Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, @@ -1518,9 +1513,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // that we only have to insert *one* load (which means we're basically moving // the load, not inserting a new one). 
- SmallPtrSet<BasicBlock *, 4> Blockers; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - Blockers.insert(UnavailableBlocks[i]); + SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(), + UnavailableBlocks.end()); // Let's find the first basic block with more than one predecessor. Walk // backwards through predecessors if needed. @@ -1550,15 +1544,22 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // available. MapVector<BasicBlock *, Value *> PredLoads; DenseMap<BasicBlock*, char> FullyAvailableBlocks; - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) - FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - FullyAvailableBlocks[UnavailableBlocks[i]] = false; + for (const AvailableValueInBlock &AV : ValuesPerBlock) + FullyAvailableBlocks[AV.BB] = true; + for (BasicBlock *UnavailableBB : UnavailableBlocks) + FullyAvailableBlocks[UnavailableBB] = false; SmallVector<BasicBlock *, 4> CriticalEdgePred; - for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); - PI != E; ++PI) { - BasicBlock *Pred = *PI; + for (BasicBlock *Pred : predecessors(LoadBB)) { + // If any predecessor block is an EH pad that does not allow non-PHI + // instructions before the terminator, we can't PRE the load. + if (Pred->getTerminator()->isEHPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } @@ -1570,9 +1571,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; } - if (LoadBB->isLandingPad()) { + if (LoadBB->isEHPad()) { DEBUG(dbgs() - << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" << Pred->getName() << "': " << *LI << '\n'); return false; } @@ -1655,12 +1656,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, << *NewInsts.back() << '\n'); // Assign value numbers to the new instructions. - for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { + for (Instruction *I : NewInsts) { // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. - VN.lookup_or_add(NewInsts[i]); + VN.lookup_or_add(I); } for (const auto &PredLoad : PredLoads) { @@ -1677,6 +1678,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (Tags) NewLoad->setAAMetadata(Tags); + if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load)) + NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD); + if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group)) + NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD); + // Transfer DebugLoc. NewLoad->setDebugLoc(LI->getDebugLoc()); @@ -1704,6 +1710,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, /// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { + // non-local speculations are not allowed under asan. + if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress)) + return false; + // Step 1: Find the non-local dependencies of the load. 
LoadDepVect Deps; MD->getNonLocalPointerDependency(LI, Deps); @@ -1777,6 +1787,63 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { + assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && + "This function can only be called with llvm.assume intrinsic"); + Value *V = IntrinsicI->getArgOperand(0); + + if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { + if (Cond->isZero()) { + Type *Int8Ty = Type::getInt8Ty(V->getContext()); + // Insert a new store to null instruction before the load to indicate that + // this code is not reachable. FIXME: We could insert unreachable + // instruction directly because we can modify the CFG. + new StoreInst(UndefValue::get(Int8Ty), + Constant::getNullValue(Int8Ty->getPointerTo()), + IntrinsicI); + } + markInstructionForDeletion(IntrinsicI); + return false; + } + + Constant *True = ConstantInt::getTrue(V->getContext()); + bool Changed = false; + + for (BasicBlock *Successor : successors(IntrinsicI->getParent())) { + BasicBlockEdge Edge(IntrinsicI->getParent(), Successor); + + // This property is only true in dominated successors, propagateEquality + // will check dominance for us. + Changed |= propagateEquality(V, True, Edge, false); + } + + // We can replace assume value with true, which covers cases like this: + // call void @llvm.assume(i1 %cmp) + // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true + ReplaceWithConstMap[V] = True; + + // If one of *cmp *eq operand is const, adding it to map will cover this: + // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen + // call void @llvm.assume(i1 %cmp) + // ret float %0 ; will change it to ret float 3.000000e+00 + if (auto *CmpI = dyn_cast<CmpInst>(V)) { + if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || + CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + CmpI->getFastMathFlags().noNaNs())) { + Value *CmpLHS = CmpI->getOperand(0); + Value *CmpRHS = CmpI->getOperand(1); + if (isa<Constant>(CmpLHS)) + std::swap(CmpLHS, CmpRHS); + auto *RHSConst = dyn_cast<Constant>(CmpRHS); + + // If only one operand is constant. + if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) + ReplaceWithConstMap[CmpLHS] = RHSConst; + } + } + return Changed; +} static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value @@ -1789,7 +1856,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { // FIXME: If both the original and replacement value are part of the // same control-flow region (meaning that the execution of one - // guarentees the executation of the other), then we can combine the + // guarantees the execution of the other), then we can combine the // noalias scopes here and do better than the general conservative // answer used in combineMetadata(). @@ -1797,13 +1864,10 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { // regions, and so we need a conservative combination of the noalias // scopes. 
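A toy model of the ReplaceWithConstMap mechanism introduced above: within one block, operands that llvm.assume proved equal to a constant are rewritten before each instruction is processed. The representation and names here are illustrative only:

#include <iostream>
#include <map>
#include <string>
#include <vector>

// One "instruction" with named operands; purely a stand-in for llvm::Instruction.
struct Inst {
  std::string Op;
  std::vector<std::string> Operands;
};

// Rewrite any operand that the block-local map proved equal to a constant.
bool replaceOperandsWithConsts(Inst &I,
                               const std::map<std::string, std::string> &M) {
  bool Changed = false;
  for (std::string &Operand : I.Operands) {
    auto It = M.find(Operand);
    if (It != M.end()) {
      Operand = It->second; // e.g. %x -> 3.0 after assume(fcmp oeq %x, 3.0)
      Changed = true;
    }
  }
  return Changed;
}

int main() {
  std::map<std::string, std::string> ReplaceWithConstMap{{"%x", "3.0"}};
  Inst Ret{"ret", {"%x"}};
  replaceOperandsWithConsts(Ret, ReplaceWithConstMap);
  std::cout << Ret.Op << ' ' << Ret.Operands[0] << '\n'; // prints: ret 3.0
}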
static const unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_invariant_group}; combineMetadata(ReplInst, I, KnownIDs); } } @@ -1890,10 +1954,8 @@ bool GVN::processLoad(LoadInst *L) { ++NumGVNLoad; return true; } - } - // If the value isn't available, don't do anything! - if (Dep.isClobber()) { + // If the value isn't available, don't do anything! DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -2049,11 +2111,31 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } +// Tries to replace instruction with const, using information from +// ReplaceWithConstMap. +bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { + bool Changed = false; + for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceWithConstMap.find(Operand); + if (it != ReplaceWithConstMap.end()) { + assert(!isa<Constant>(Operand) && + "Replacing constants with constants is invalid"); + DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second + << " in instruction " << *Instr << '\n'); + Instr->setOperand(OpNum, it->second); + Changed = true; + } + } + return Changed; +} + /// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. -bool GVN::propagateEquality(Value *LHS, Value *RHS, - const BasicBlockEdge &Root) { +/// If DominatesByEdge is false, then it means that it is dominated by Root.End. +bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; @@ -2065,11 +2147,13 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, std::pair<Value*, Value*> Item = Worklist.pop_back_val(); LHS = Item.first; RHS = Item.second; - if (LHS == RHS) continue; + if (LHS == RHS) + continue; assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); // Don't try to propagate equalities between constants. - if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + continue; // Prefer a constant on the right-hand side, or an Argument if no constants. if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) @@ -2108,7 +2192,11 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // LHS always has at least one use that is not dominated by Root, this will // never do anything if LHS has only one use. if (!LHS->hasOneUse()) { - unsigned NumReplacements = replaceDominatedUsesWith(LHS, RHS, *DT, Root); + unsigned NumReplacements = + DominatesByEdge + ? 
replaceDominatedUsesWith(LHS, RHS, *DT, Root) + : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd()); + Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2180,7 +2268,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, Value *NotCmp = findLeader(Root.getEnd(), Num); if (NotCmp && isa<Instruction>(NotCmp)) { unsigned NumReplacements = - replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root); + DominatesByEdge + ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) + : replaceDominatedUsesWith(NotCmp, NotVal, *DT, + Root.getEnd()); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2220,6 +2311,10 @@ bool GVN::processInstruction(Instruction *I) { return true; } + if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I)) + if (IntrinsicI->getIntrinsicID() == Intrinsic::assume) + return processAssumeIntrinsic(IntrinsicI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (processLoad(LI)) return true; @@ -2250,11 +2345,11 @@ bool GVN::processInstruction(Instruction *I) { Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext()); BasicBlockEdge TrueE(Parent, TrueSucc); - Changed |= propagateEquality(BranchCond, TrueVal, TrueE); + Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true); Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext()); BasicBlockEdge FalseE(Parent, FalseSucc); - Changed |= propagateEquality(BranchCond, FalseVal, FalseE); + Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true); return Changed; } @@ -2276,7 +2371,7 @@ bool GVN::processInstruction(Instruction *I) { // If there is only a single edge, propagate the case value into it. if (SwitchEdges.lookup(Dst) == 1) { BasicBlockEdge E(Parent, Dst); - Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); + Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true); } } return Changed; @@ -2284,7 +2379,8 @@ bool GVN::processInstruction(Instruction *I) { // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. - if (I->getType()->isVoidTy()) return false; + if (I->getType()->isVoidTy()) + return false; uint32_t NextNum = VN.getNextUnusedValueNumber(); unsigned Num = VN.lookup_or_add(I); @@ -2306,17 +2402,21 @@ bool GVN::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. - Value *repl = findLeader(I->getParent(), Num); - if (!repl) { + Value *Repl = findLeader(I->getParent(), Num); + if (!Repl) { // Failure, just remember this instance for future use. addToLeaderTable(Num, I, I->getParent()); return false; + } else if (Repl == I) { + // If I was the result of a shortcut PRE, it might already be in the table + // and the best replacement for itself. Nothing to do. + return false; } // Remove it! 
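The fast-path elimination in miniature: expressions hashing to an existing value number collapse onto the first definition seen, the leader. A standalone toy that ignores scopes, dominance, and commutativity; purely illustrative:

#include <iostream>
#include <map>
#include <string>
#include <tuple>

int main() {
  std::map<std::tuple<std::string, int, int>, int> VN; // expr -> value number
  std::map<int, std::string> Leader; // value number -> first definition seen
  int Next = 0;
  auto Number = [&](const std::string &Op, int A, int B,
                    const std::string &Name) {
    auto Key = std::make_tuple(Op, A, B);
    auto It = VN.find(Key);
    if (It != VN.end()) { // known expression: report its leader
      std::cout << Name << " -> replaced by leader " << Leader[It->second]
                << '\n';
      return It->second;
    }
    VN[Key] = Next; // fresh expression: it becomes its own leader
    Leader[Next] = Name;
    return Next++;
  };
  int X = Next++, Y = Next++; // incoming arguments get fresh numbers
  Number("add", X, Y, "%a");  // first "add X, Y": kept, becomes the leader
  Number("add", X, Y, "%b");  // identical expression: folds onto %a
}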
- patchAndReplaceAllUsesWith(I, repl); - if (MD && repl->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(repl); + patchAndReplaceAllUsesWith(I, Repl); + if (MD && Repl->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Repl); markInstructionForDeletion(I); return true; } @@ -2331,7 +2431,7 @@ bool GVN::runOnFunction(Function& F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); + VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2341,10 +2441,10 @@ bool GVN::runOnFunction(Function& F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; - bool removedBlock = MergeBlockIntoPredecessor( - BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); + bool removedBlock = + MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD); if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; @@ -2382,7 +2482,6 @@ bool GVN::runOnFunction(Function& F) { return Changed; } - bool GVN::processBlock(BasicBlock *BB) { // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function // (and incrementing BI before processing an instruction). @@ -2391,11 +2490,16 @@ bool GVN::processBlock(BasicBlock *BB) { if (DeadBlocks.count(BB)) return false; + // Clearing map before every BB because it can be used only for single BB. + ReplaceWithConstMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - ChangedFunction |= processInstruction(BI); + if (!ReplaceWithConstMap.empty()) + ChangedFunction |= replaceOperandsWithConsts(&*BI); + ChangedFunction |= processInstruction(&*BI); + if (InstrsToErase.empty()) { ++BI; continue; @@ -2439,7 +2543,14 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Value *Op = Instr->getOperand(i); if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) continue; - + // This could be a newly inserted instruction, in which case, we won't + // find a value number, and should give up before we hurt ourselves. + // FIXME: Rewrite the infrastructure to let it easier to value number + // and process newly inserted instructions. + if (!VN.exists(Op)) { + success = false; + break; + } if (Value *V = findLeader(Pred, VN.lookup(Op))) { Instr->setOperand(i, V); } else { @@ -2499,9 +2610,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { BasicBlock *CurrentBlock = CurInst->getParent(); predMap.clear(); - for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); - PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(CurrentBlock)) { // We're not interested in PRE where the block is its // own predecessor, or in blocks with predecessors // that are not reachable. @@ -2570,7 +2679,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // Create a PHI to make the value available in this block. 
PHINode *Phi = PHINode::Create(CurInst->getType(), predMap.size(), - CurInst->getName() + ".pre-phi", CurrentBlock->begin()); + CurInst->getName() + ".pre-phi", &CurrentBlock->front()); for (unsigned i = 0, e = predMap.size(); i != e; ++i) { if (Value *V = predMap[i].first) Phi->addIncoming(V, predMap[i].second); @@ -2582,18 +2691,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->getScalarType()->isPointerTy()) { - // Because we have added a PHI-use of the pointer value, it has now - // "escaped" from alias analysis' perspective. We need to inform - // AA of this. - for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); - } - - if (MD) - MD->invalidateCachedPointerInfo(Phi); - } + if (MD && Phi->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Phi); VN.erase(CurInst); removeFromLeaderTable(ValNo, CurInst, CurrentBlock); @@ -2616,15 +2715,15 @@ bool GVN::performPRE(Function &F) { if (CurrentBlock == &F.getEntryBlock()) continue; - // Don't perform PRE on a landing pad. - if (CurrentBlock->isLandingPad()) + // Don't perform PRE on an EH pad. + if (CurrentBlock->isEHPad()) continue; for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE;) { - Instruction *CurInst = BI++; - Changed = performScalarPRE(CurInst); + Instruction *CurInst = &*BI++; + Changed |= performScalarPRE(CurInst); } } @@ -2637,8 +2736,8 @@ bool GVN::performPRE(Function &F) { /// Split the critical edge connecting the given two blocks, and return /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { - BasicBlock *BB = SplitCriticalEdge( - Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + BasicBlock *BB = + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); if (MD) MD->invalidateCachedPredecessors(); return BB; @@ -2652,7 +2751,7 @@ bool GVN::splitCriticalEdges() { do { std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + CriticalEdgeSplittingOptions(DT)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); return true; @@ -2728,17 +2827,14 @@ void GVN::addDeadBlock(BasicBlock *BB) { DeadBlocks.insert(Dom.begin(), Dom.end()); // Figure out the dominance-frontier(D). 
- for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(), - E = Dom.end(); I != E; I++) { - BasicBlock *B = *I; - for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) { - BasicBlock *S = *SI; + for (BasicBlock *B : Dom) { + for (BasicBlock *S : successors(B)) { if (DeadBlocks.count(S)) continue; bool AllPredDead = true; - for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++) - if (!DeadBlocks.count(*PI)) { + for (BasicBlock *P : predecessors(S)) + if (!DeadBlocks.count(P)) { AllPredDead = false; break; } @@ -2766,10 +2862,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { continue; SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); - for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(), - PE = Preds.end(); PI != PE; PI++) { - BasicBlock *P = *PI; - + for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; @@ -2794,7 +2887,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { // R be the target of the dead out-coming edge. // 1) Identify the set of dead blocks implied by the branch's dead outcoming // edge. The result of this step will be {X| X is dominated by R} -// 2) Identify those blocks which haves at least one dead prodecessor. The +// 2) Identify those blocks which haves at least one dead predecessor. The // result of this step will be dominance-frontier(R). // 3) Update the PHIs in DF(R) by replacing the operands corresponding to // dead blocks with "UndefVal" in an hope these PHIs will optimized away. @@ -2829,14 +2922,10 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { // instructions, it makes more sense just to "fabricate" a val-number for the // dead code than checking if instruction involved is dead or not. void GVN::assignValNumForDeadCode() { - for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), - E = DeadBlocks.end(); I != E; I++) { - BasicBlock *BB = *I; - for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); - II != EE; II++) { - Instruction *Inst = &*II; - unsigned ValNum = VN.lookup_or_add(Inst); - addToLeaderTable(ValNum, Inst, BB); + for (BasicBlock *BB : DeadBlocks) { + for (Instruction &Inst : *BB) { + unsigned ValNum = VN.lookup_or_add(&Inst); + addToLeaderTable(ValNum, &Inst, BB); } } } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 2a954d9..ec5e15f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -28,9 +28,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" @@ -48,6 +50,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; @@ -83,64 +86,62 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue( namespace { struct RewritePhi; -} -namespace { - class IndVarSimplify : public LoopPass { - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - - SmallVector<WeakVH, 16> 
DeadInsts; - bool Changed; - public: - - static char ID; // Pass identification, replacement for typeid - IndVarSimplify() - : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { - initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); - } +class IndVarSimplify : public LoopPass { + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addPreservedID(LoopSimplifyID); - AU.addPreservedID(LCSSAID); - AU.setPreservesCFG(); - } + SmallVector<WeakVH, 16> DeadInsts; + bool Changed; +public: - private: - void releaseMemory() override { - DeadInsts.clear(); - } + static char ID; // Pass identification, replacement for typeid + IndVarSimplify() + : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { + initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool isValidRewrite(Value *FromVal, Value *ToVal); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.setPreservesCFG(); + } - void HandleFloatingPointIV(Loop *L, PHINode *PH); - void RewriteNonIntegerIVs(Loop *L); +private: + void releaseMemory() override { + DeadInsts.clear(); + } - void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); + bool isValidRewrite(Value *FromVal, Value *ToVal); - bool CanLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); + void handleFloatingPointIV(Loop *L, PHINode *PH); + void rewriteNonIntegerIVs(Loop *L); - Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - PHINode *IndVar, SCEVExpander &Rewriter); + void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - void SinkUnusedInvariants(Loop *L); + bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); + void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, - Instruction *InsertPt, Type *Ty, - bool &IsHighCostExpansion); - }; + Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, + PHINode *IndVar, SCEVExpander &Rewriter); + + void sinkUnusedInvariants(Loop *L); + + Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, + Instruction *InsertPt, Type *Ty); +}; } char IndVarSimplify::ID = 0; @@ -148,7 +149,7 @@ INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) 
INITIALIZE_PASS_END(IndVarSimplify, "indvars", @@ -158,10 +159,10 @@ Pass *llvm::createIndVarSimplifyPass() { return new IndVarSimplify(); } -/// isValidRewrite - Return true if the SCEV expansion generated by the -/// rewriter can replace the original value. SCEV guarantees that it -/// produces the same value, but the way it is produced may be illegal IR. -/// Ideally, this function will only be called for verification. +/// Return true if the SCEV expansion generated by the rewriter can replace the +/// original value. SCEV guarantees that it produces the same value, but the way +/// it is produced may be illegal IR. Ideally, this function will only be +/// called for verification. bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // If an SCEV expression subsumed multiple pointers, its expansion could // reassociate the GEP changing the base pointer. This is illegal because the @@ -175,10 +176,10 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // because it understands lcssa phis while SCEV does not. Value *FromPtr = FromVal; Value *ToPtr = ToVal; - if (GEPOperator *GEP = dyn_cast<GEPOperator>(FromVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) { FromPtr = GEP->getPointerOperand(); } - if (GEPOperator *GEP = dyn_cast<GEPOperator>(ToVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) { ToPtr = GEP->getPointerOperand(); } if (FromPtr != FromVal || ToPtr != ToVal) { @@ -215,7 +216,7 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { /// loop. For PHI nodes, there may be multiple uses, so compute the nearest /// common dominator for the incoming blocks. static Instruction *getInsertPointForUses(Instruction *User, Value *Def, - DominatorTree *DT) { + DominatorTree *DT, LoopInfo *LI) { PHINode *PHI = dyn_cast<PHINode>(User); if (!PHI) return User; @@ -234,17 +235,28 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert((!isa<Instruction>(Def) || - DT->dominates(cast<Instruction>(Def), InsertPt)) && - "def does not dominate all uses"); - return InsertPt; + + auto *DefI = dyn_cast<Instruction>(Def); + if (!DefI) + return InsertPt; + + assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses"); + + auto *L = LI->getLoopFor(DefI->getParent()); + assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent()))); + + for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom()) + if (LI->getLoopFor(DTN->getBlock()) == L) + return DTN->getBlock()->getTerminator(); + + llvm_unreachable("DefI dominates InsertPt!"); } //===----------------------------------------------------------------------===// -// RewriteNonIntegerIVs and helpers. Prefer integer IVs. +// rewriteNonIntegerIVs and helpers. Prefer integer IVs. //===----------------------------------------------------------------------===// -/// ConvertToSInt - Convert APF to an integer, if possible. +/// Convert APF to an integer, if possible. static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; // See if we can convert this to an int64_t @@ -256,8 +268,8 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { return true; } -/// HandleFloatingPointIV - If the loop has floating induction variable -/// then insert corresponding integer induction variable if possible. +/// If the loop has a floating-point induction variable, then insert a +/// corresponding integer induction variable if possible.
/// For example, /// for(double i = 0; i < 10000; ++i) /// bar(i) @@ -265,13 +277,12 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { /// for(int i = 0; i < 10000; ++i) /// bar((double)i); /// -void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { +void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; // Check incoming value. - ConstantFP *InitValueVal = - dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); + auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) @@ -279,8 +290,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Check IV increment. Reject this PN if increment operation is not // an add or increment value can not be represented by an integer. - BinaryOperator *Incr = - dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); + auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp @@ -456,14 +466,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstInsertionPt()); + &*PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); } Changed = true; } -void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { +void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { // First step. Check to see if there are any floating-point recurrences. // If there are, change them into integer recurrences, permitting analysis by // the SCEV routines. @@ -477,7 +487,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { for (unsigned i = 0, e = PHIs.size(); i != e; ++i) if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) - HandleFloatingPointIV(L, PN); + handleFloatingPointIV(L, PN); // If the loop previously had floating-point IV, ScalarEvolution // may not have been able to compute a trip count. Now that we've done some @@ -488,7 +498,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { namespace { // Collect information about PHI nodes which can be transformed in -// RewriteLoopExitValues. +// rewriteLoopExitValues. struct RewritePhi { PHINode *PN; unsigned Ith; // Ith incoming value. @@ -501,70 +511,37 @@ struct RewritePhi { }; } -Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, +Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, Instruction *InsertPt, - Type *ResultTy, - bool &IsHighCostExpansion) { - using namespace llvm::PatternMatch; - - if (!Rewriter.isHighCostExpansion(S, L)) { - IsHighCostExpansion = false; - return Rewriter.expandCodeFor(S, ResultTy, InsertPt); - } - + Type *ResultTy) { // Before expanding S into an expensive LLVM expression, see if we can use an - // already existing value as the expansion for S. There is potential to make - // this significantly smarter, but this simple heuristic already gets some - // interesting cases. 
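For the ConvertToSInt helper above, here is a rough standalone analogue (plain C++; the real code uses APFloat::convertToInteger with an isExact flag, so this is only a sketch of the exactness requirement):

#include <cassert>
#include <cmath>
#include <cstdint>

// Accept a double only if it is exactly representable as an int64_t:
// finite, in range, and with no fractional part.
bool convertToSInt(double d, int64_t &out) {
  if (std::isnan(d) || d < -9223372036854775808.0 ||
      d >= 9223372036854775808.0)
    return false;
  if (d != std::trunc(d))
    return false; // not an exact integer value
  out = (int64_t)d;
  return true;
}

int main() {
  int64_t v;
  assert(convertToSInt(10000.0, v) && v == 10000); // the bar(i) example
  assert(!convertToSInt(0.5, v));                  // rejected: inexact
  return 0;
}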
- - SmallVector<BasicBlock *, 4> Latches; - L->getLoopLatches(Latches); - - for (BasicBlock *BB : Latches) { - ICmpInst::Predicate Pred; - Instruction *LHS, *RHS; - BasicBlock *TrueBB, *FalseBB; - - if (!match(BB->getTerminator(), - m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), - TrueBB, FalseBB))) - continue; - - if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) { - IsHighCostExpansion = false; - return LHS; - } - - if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) { - IsHighCostExpansion = false; - return RHS; - } - } + // already existing value as the expansion for S. + if (Value *ExistingValue = Rewriter.findExistingExpansion(S, InsertPt, L)) + if (ExistingValue->getType() == ResultTy) + return ExistingValue; // We didn't find anything, fall back to using SCEVExpander. - assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!"); - IsHighCostExpansion = true; return Rewriter.expandCodeFor(S, ResultTy, InsertPt); } //===----------------------------------------------------------------------===// -// RewriteLoopExitValues - Optimize IV users outside the loop. +// rewriteLoopExitValues - Optimize IV users outside the loop. // As a side effect, reduces the amount of IV processing within the loop. //===----------------------------------------------------------------------===// -/// RewriteLoopExitValues - Check to see if this loop has a computable -/// loop-invariant execution count. If so, this means that we can compute the -/// final value of any expressions that are recurrent in the loop, and -/// substitute the exit values from the loop into any instructions outside of -/// the loop that use the final values of the current expressions. +/// Check to see if this loop has a computable loop-invariant execution count. +/// If so, this means that we can compute the final value of any expressions +/// that are recurrent in the loop, and substitute the exit values from the loop +/// into any instructions outside of the loop that use the final values of the +/// current expressions. /// /// This is mostly redundant with the regular IndVarSimplify activities that /// happen later, except that it's more powerful in some cases, because it's /// able to brute-force evaluate arbitrary instructions as long as they have /// constant operands at the beginning of the loop. -void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { - // Verify the input to the pass in already in LCSSA form. - assert(L->isLCSSAForm(*DT)); +void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { + // Check a pre-condition. + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -679,9 +656,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { continue; } - bool HighCost = false; - Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, - PN->getType(), HighCost); + bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst); + Value *ExitVal = + expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType()); DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' << " LoopVal = " << *Inst << "\n"); @@ -698,7 +675,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } } - bool LoopCanBeDel = CanLoopBeDeleted(L, RewritePhiSet); + bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet); // Transformation. 
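The findExistingExpansion call above replaces the old hand-rolled latch scan; the underlying idea is reuse-before-recompute. A generic standalone sketch of that pattern (ordinary map-based caching, not the SCEVExpander API):

#include <map>
#include <string>

int computeExpensively(const std::string &expr) { return (int)expr.size(); }

// Return a previously materialized value for `expr` when one exists;
// otherwise fall back to the expensive expansion and remember the result.
int expandIfNeeded(std::map<std::string, int> &existing,
                   const std::string &expr) {
  auto it = existing.find(expr);
  if (it != existing.end())
    return it->second; // reuse an already existing value
  return existing[expr] = computeExpensively(expr);
}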
for (const RewritePhi &Phi : RewritePhiSet) { @@ -735,10 +712,10 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { Rewriter.clearInsertPoint(); } -/// CanLoopBeDeleted - Check whether it is possible to delete the loop after -/// rewriting exit value. If it is possible, ignore ReplaceExitValue and -/// do rewriting aggressively. -bool IndVarSimplify::CanLoopBeDeleted( +/// Check whether it is possible to delete the loop after rewriting exit +/// value. If it is possible, ignore ReplaceExitValue and do rewriting +/// aggressively. +bool IndVarSimplify::canLoopBeDeleted( Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { BasicBlock *Preheader = L->getLoopPreheader(); @@ -782,14 +759,9 @@ bool IndVarSimplify::CanLoopBeDeleted( ++BI; } - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); BI != BE; - ++BI) { - if (BI->mayHaveSideEffects()) - return false; - } - } + for (auto *BB : L->blocks()) + if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) + return false; return true; } @@ -799,22 +771,19 @@ bool IndVarSimplify::CanLoopBeDeleted( //===----------------------------------------------------------------------===// namespace { - // Collect information about induction variables that are used by sign/zero - // extend operations. This information is recorded by CollectExtend and - // provides the input to WidenIV. - struct WideIVInfo { - PHINode *NarrowIV; - Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was a sext user seen before a zext? - - WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), - IsSigned(false) {} - }; +// Collect information about induction variables that are used by sign/zero +// extend operations. This information is recorded by CollectExtend and provides +// the input to WidenIV. +struct WideIVInfo { + PHINode *NarrowIV = nullptr; + Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext + bool IsSigned = false; // Was a sext user seen before a zext? +}; } -/// visitCast - Update information about the induction variable that is -/// extended by this sign or zero extend operation. This is used to determine -/// the final width of the IV before actually widening it. +/// Update information about the induction variable that is extended by this +/// sign or zero extend operation. This is used to determine the final width of +/// the IV before actually widening it. static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, const TargetTransformInfo *TTI) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; @@ -855,24 +824,29 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, namespace { -/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the -/// WideIV that computes the same value as the Narrow IV def. This avoids -/// caching Use* pointers. +/// Record a link in the Narrow IV def-use chain along with the WideIV that +/// computes the same value as the Narrow IV def. This avoids caching Use* +/// pointers. 
struct NarrowIVDefUse { - Instruction *NarrowDef; - Instruction *NarrowUse; - Instruction *WideDef; - - NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} - - NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): - NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} + Instruction *NarrowDef = nullptr; + Instruction *NarrowUse = nullptr; + Instruction *WideDef = nullptr; + + // True if the narrow def is never negative. Tracking this information lets + // us use a sign extension instead of a zero extension or vice versa, when + // profitable and legal. + bool NeverNegative = false; + + NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD, + bool NeverNegative) + : NarrowDef(ND), NarrowUse(NU), WideDef(WD), + NeverNegative(NeverNegative) {} }; -/// WidenIV - The goal of this transform is to remove sign and zero extends -/// without creating any new induction variables. To do this, it creates a new -/// phi of the wider type and redirects all users, either removing extends or -/// inserting truncs whenever we stop propagating the type. +/// The goal of this transform is to remove sign and zero extends without +/// creating any new induction variables. To do this, it creates a new phi of +/// the wider type and redirects all users, either removing extends or inserting +/// truncs whenever we stop propagating the type. /// class WidenIV { // Parameters @@ -913,32 +887,35 @@ public: assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); } - PHINode *CreateWideIV(SCEVExpander &Rewriter); + PHINode *createWideIV(SCEVExpander &Rewriter); protected: - Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use); + Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); - Instruction *CloneIVUser(NarrowIVDefUse DU); + Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR); + Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR); + Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU); - const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse); - const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU); - const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, + const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const; - Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); + Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); - bool WidenLoopCompare(NarrowIVDefUse DU); + bool widenLoopCompare(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -/// isLoopInvariant - Perform a quick domtree based check for loop invariance -/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems -/// gratuitous for this purpose. +/// Perform a quick domtree based check for loop invariance assuming that V is +/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this +/// purpose. 
static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { Instruction *Inst = dyn_cast<Instruction>(V); if (!Inst) @@ -947,8 +924,8 @@ static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { return DT->properlyDominates(Inst->getParent(), L->getHeader()); } -Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use) { +Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType, + bool IsSigned, Instruction *Use) { // Set the debug location and conservative insertion point. IRBuilder<> Builder(Use); // Hoist the insertion point into loop preheaders as far as possible. @@ -961,10 +938,11 @@ Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, Builder.CreateZExt(NarrowOper, WideType); } -/// CloneIVUser - Instantiate a wide operation to replace a narrow -/// operation. This only needs to handle operations that can evaluation to -/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. -Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { +/// Instantiate a wide operation to replace a narrow operation. This only needs +/// to handle operations that can evaluate to SCEVAddRec. It can safely return +/// 0 for any operation we decide not to clone. +Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: @@ -973,40 +951,140 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { case Instruction::Mul: case Instruction::UDiv: case Instruction::Sub: + return cloneArithmeticIVUser(DU, WideAR); + case Instruction::And: case Instruction::Or: case Instruction::Xor: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); - - // Replace NarrowDef operands with WideDef. Otherwise, we don't know - // anything about the narrow operand yet so must insert a [sz]ext. It is - // probably loop invariant and will be folded or hoisted. If it actually - // comes from a widened IV, it should be removed during a future call to - // WidenIVUse. - Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); - Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); - - BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); - BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), - LHS, RHS, - NarrowBO->getName()); - IRBuilder<> Builder(DU.NarrowUse); - Builder.Insert(WideBO); - if (const OverflowingBinaryOperator *OBO = - dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { - if (OBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap(); - if (OBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap(); + return cloneBitwiseIVUser(DU); + } +} + +Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n"); + + // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything + // about the narrow operand yet so must insert a [sz]ext. It is probably loop + // invariant and will be folded or hoisted. If it actually comes from a + // widened IV, it should be removed during a future call to widenIVUse.
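A plain C++ stand-in for the operand rule just described (hypothetical 32-to-64-bit widening; the real code builds IR instructions, so this only mirrors the value-level effect):

#include <cstdint>

// wideIV is the already-widened IV value; `other` is a narrow non-IV
// operand that must be extended to match. isSigned selects sext vs. zext.
int64_t widenAddUser(int64_t wideIV, int32_t other, bool isSigned) {
  int64_t wideOther =
      isSigned ? (int64_t)other : (int64_t)(uint32_t)other; // [sz]ext
  return wideIV + wideOther;
}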
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + IsSigned, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + IsSigned, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; +} + +Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); + + unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1; + + // We're trying to find X such that + // + // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X + // + // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef), + // and check using SCEV if any of them are correct. + + // Returns true if extending NonIVNarrowDef according to `SignExt` is a + // correct solution to X. + auto GuessNonIVOperand = [&](bool SignExt) { + const SCEV *WideLHS; + const SCEV *WideRHS; + + auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) { + if (SignExt) + return SE->getSignExtendExpr(S, Ty); + return SE->getZeroExtendExpr(S, Ty); + }; + + if (IVOpIdx == 0) { + WideLHS = SE->getSCEV(WideDef); + const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1)); + WideRHS = GetExtend(NarrowRHS, WideType); + } else { + const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0)); + WideLHS = GetExtend(NarrowLHS, WideType); + WideRHS = SE->getSCEV(WideDef); + } + + // WideUse is "WideDef `op.wide` X" as described in the comment. + const SCEV *WideUse = nullptr; + + switch (NarrowUse->getOpcode()) { + default: + llvm_unreachable("No other possibility!"); + + case Instruction::Add: + WideUse = SE->getAddExpr(WideLHS, WideRHS); + break; + + case Instruction::Mul: + WideUse = SE->getMulExpr(WideLHS, WideRHS); + break; + + case Instruction::UDiv: + WideUse = SE->getUDivExpr(WideLHS, WideRHS); + break; + + case Instruction::Sub: + WideUse = SE->getMinusSCEV(WideLHS, WideRHS); + break; } - return WideBO; + + return WideUse == WideAR; + }; + + bool SignExtend = IsSigned; + if (!GuessNonIVOperand(SignExtend)) { + SignExtend = !SignExtend; + if (!GuessNonIVOperand(SignExtend)) + return nullptr; } + + Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + SignExtend, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + SignExtend, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; } -const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, +const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const { if (OpCode == Instruction::Add) return SE->getAddExpr(LHS, RHS); @@ -1022,7 +1100,7 @@ const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, /// operands. 
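The guess-and-check in cloneArithmeticIVUser above can be checked numerically; a hedged standalone sketch, with plain integers standing in for the SCEV expressions the real code compares:

#include <cassert>
#include <cstdint>

int main() {
  int32_t narrowIV = -5, other = 7;
  int64_t wideIV = narrowIV; // the IV was already sign-extended
  // Widen(narrowIV + other): the value the wide operation must reproduce.
  int64_t expected = (int64_t)(int32_t)(narrowIV + other);
  int64_t guessSExt = wideIV + (int64_t)other;           // X = sext(other)
  int64_t guessZExt = wideIV + (int64_t)(uint32_t)other; // X = zext(other)
  // At least one of the two guesses must match, or the clone is rejected.
  assert(guessSExt == expected || guessZExt == expected);
  return 0;
}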
Generate the SCEV value for the widened operation without /// actually modifying the IR yet. If the expression after extending the /// operands is an AddRec for this loop, return it. -const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { +const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) { // Handle the common case of add<nsw/nuw> const unsigned OpCode = DU.NarrowUse->getOpcode(); @@ -1062,19 +1140,18 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { if (ExtendOperIdx == 0) std::swap(lhs, rhs); const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode)); + dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode)); if (!AddRec || AddRec->getLoop() != L) return nullptr; return AddRec; } -/// GetWideRecurrence - Is this instruction potentially interesting for further -/// simplification after widening it's type? In other words, can the -/// extend be safely hoisted out of the loop with SCEV reducing the value to a -/// recurrence on the same loop. If so, return the sign or zero extended -/// recurrence. Otherwise return NULL. -const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { +/// Is this instruction potentially interesting for further simplification after +/// widening its type? In other words, can the extend be safely hoisted out of +/// the loop with SCEV reducing the value to a recurrence on the same loop? If +/// so, return the sign or zero extended recurrence. Otherwise return NULL. +const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return nullptr; @@ -1097,10 +1174,11 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { /// This IV user cannot be widened. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. -static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) { DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1108,13 +1186,27 @@ static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { /// If the narrow use is a compare instruction, then widen the compare /// (and possibly the other operand). The extend operation is hoisted into the /// loop preheader as far as possible. -bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { +bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse); if (!Cmp) return false; - // Sign of IV user and compare must match. - if (IsSigned != CmpInst::isSigned(Cmp->getPredicate())) + // We can legally widen the comparison in the following two cases: + // + // - The signedness of the IV extension and comparison match + // + // - The narrow IV is always positive (and thus its sign extension is equal + // to its zero extension). For instance, let's say we're zero extending + // %narrow for the following use + // + // icmp slt i32 %narrow, %val ... (A) + // + // and %narrow is always positive.
Then + // + // (A) == icmp slt i32 sext(%narrow), sext(%val) + // == icmp slt i32 zext(%narrow), sext(%val) + + if (!(DU.NeverNegative || IsSigned == Cmp->isSigned())) return false; Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); @@ -1123,20 +1215,21 @@ bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); // Widen the other operand of the compare, if necessary. if (CastWidth < IVWidth) { - Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp); + Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp); DU.NarrowUse->replaceUsesOfWith(Op, ExtOp); } return true; } -/// WidenIVUse - Determine whether an individual user of the narrow IV can be -/// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { +/// Determine whether an individual user of the narrow IV can be widened. If so, +/// return the wide clone of the user. +Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) { @@ -1145,13 +1238,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); else { PHINode *WidePhi = PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", UsePhi); WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); - IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt()); + IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt()); Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); UsePhi->replaceAllUsesWith(Trunc); DeadInsts.emplace_back(UsePhi); @@ -1200,20 +1293,20 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { } // Does this user itself evaluate to a recurrence after widening? - const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); + const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse); if (!WideAddRec) - WideAddRec = GetExtendedOperandRecurrence(DU); + WideAddRec = getExtendedOperandRecurrence(DU); if (!WideAddRec) { // If the use is a loop condition, try to promote the condition instead of // truncating the IV first. - if (WidenLoopCompare(DU)) + if (widenLoopCompare(DU)) return nullptr; // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); return nullptr; } // Assume block terminators cannot evaluate to a recurrence.
We can't to @@ -1228,7 +1321,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { - WideUse = CloneIVUser(DU); + WideUse = cloneIVUser(DU, WideAddRec); if (!WideUse) return nullptr; } @@ -1248,9 +1341,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return WideUse; } -/// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. +/// Add eligible users of NarrowDef to NarrowIVUsers. /// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { + const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef); + bool NeverNegative = + SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV, + SE->getConstant(NarrowSCEV->getType(), 0)); for (User *U : NarrowDef->users()) { Instruction *NarrowUser = cast<Instruction>(U); @@ -1258,21 +1355,21 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { if (!Widened.insert(NarrowUser).second) continue; - NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); + NarrowIVUsers.push_back( + NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative)); } } -/// CreateWideIV - Process a single induction variable. First use the -/// SCEVExpander to create a wide induction variable that evaluates to the same -/// recurrence as the original narrow IV. Then use a worklist to forward -/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all -/// interesting IV users, the narrow IV will be isolated for removal by -/// DeleteDeadPHIs. +/// Process a single induction variable. First use the SCEVExpander to create a +/// wide induction variable that evaluates to the same recurrence as the +/// original narrow IV. Then use a worklist to forward traverse the narrow IV's +/// def-use chain. After widenIVUse has processed all interesting IV users, the +/// narrow IV will be isolated for removal by DeleteDeadPHIs. /// /// It would be simpler to delete uses as they are processed, but we must avoid /// invalidating SCEV expressions. /// -PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { +PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); if (!AddRec) @@ -1302,11 +1399,11 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // either find an existing phi or materialize a new one. Either way, we // expect a well-formed cyclic phi-with-increments. i.e. any operand not part // of the phi-SCC dominates the loop entry. - Instruction *InsertPt = L->getHeader()->begin(); + Instruction *InsertPt = &L->getHeader()->front(); WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt)); // Remembering the WideIV increment generated by SCEVExpander allows - // WidenIVUse to reuse it when widening the narrow IV's increment. We don't + // widenIVUse to reuse it when widening the narrow IV's increment. We don't // employ a general reuse mechanism because the call above is the only call to // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses. if (BasicBlock *LatchBlock = L->getLoopLatch()) { @@ -1329,13 +1426,13 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Process a def-use edge. This may replace the use, so don't hold a // use_iterator across it. 
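The NeverNegative flag computed in pushNarrowIVUsers above rests on a simple fact, demonstrated here in standalone C++ (integer casts standing in for the IR-level sext/zext):

#include <cassert>
#include <cstdint>

int main() {
  int32_t nonNeg = 42; // the NeverNegative case
  assert((int64_t)nonNeg == (int64_t)(uint32_t)nonNeg); // sext == zext
  int32_t negative = -1; // extensions diverge once the value can be negative
  assert((int64_t)negative != (int64_t)(uint32_t)negative);
  return 0;
}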
- Instruction *WideUse = WidenIVUse(DU, Rewriter); + Instruction *WideUse = widenIVUse(DU, Rewriter); // Follow all def-use edges from the previous narrow use. if (WideUse) pushNarrowIVUsers(DU.NarrowUse, WideUse); - // WidenIVUse may have removed the def-use edge. + // widenIVUse may have removed the def-use edge. if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } @@ -1352,38 +1449,38 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// namespace { - class IndVarSimplifyVisitor : public IVVisitor { - ScalarEvolution *SE; - const TargetTransformInfo *TTI; - PHINode *IVPhi; - - public: - WideIVInfo WI; - - IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, - const TargetTransformInfo *TTI, - const DominatorTree *DTree) - : SE(SCEV), TTI(TTI), IVPhi(IV) { - DT = DTree; - WI.NarrowIV = IVPhi; - if (ReduceLiveIVs) - setSplitOverflowIntrinsics(); - } +class IndVarSimplifyVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetTransformInfo *TTI; + PHINode *IVPhi; - // Implement the interface used by simplifyUsersOfIV. - void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } - }; +public: + WideIVInfo WI; + + IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, + const TargetTransformInfo *TTI, + const DominatorTree *DTree) + : SE(SCEV), TTI(TTI), IVPhi(IV) { + DT = DTree; + WI.NarrowIV = IVPhi; + if (ReduceLiveIVs) + setSplitOverflowIntrinsics(); + } + + // Implement the interface used by simplifyUsersOfIV. + void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } +}; } -/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV -/// users. Each successive simplification may push more users which may -/// themselves be candidates for simplification. +/// Iteratively perform simplification on a worklist of IV users. Each +/// successive simplification may push more users which may themselves be +/// candidates for simplification. /// /// Sign/Zero extend elimination is interleaved with IV simplification. /// -void IndVarSimplify::SimplifyAndExtend(Loop *L, +void IndVarSimplify::simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, - LPPassManager &LPM) { + LoopInfo *LI) { SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; @@ -1400,14 +1497,14 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyAndExtend. + // other SCEV based analysis prior to simplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. 
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); + Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor); if (Visitor.WI.WidestNativeType) { WideIVs.push_back(Visitor.WI); @@ -1416,7 +1513,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, for (; !WideIVs.empty(); WideIVs.pop_back()) { WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); - if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { + if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } @@ -1425,12 +1522,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, } //===----------------------------------------------------------------------===// -// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +// linearFunctionTestReplace and its kin. Rewrite the loop exit condition. //===----------------------------------------------------------------------===// -/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken -/// count expression can be safely and cheaply expanded into an instruction -/// sequence that can be used by LinearFunctionTestReplace. +/// Return true if this loop's backedge taken count expression can be safely and +/// cheaply expanded into an instruction sequence that can be used by +/// linearFunctionTestReplace. /// /// TODO: This fails for pointer-type loop counters with greater than one byte /// strides, consequently preventing LFTR from running. For the purpose of LFTR @@ -1461,8 +1558,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE, return true; } -/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop -/// invariant value to the phi. +/// Return the loop header phi IFF IncV adds a loop invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { Instruction *IncI = dyn_cast<Instruction>(IncV); if (!IncI) @@ -1513,8 +1609,8 @@ static ICmpInst *getLoopTest(Loop *L) { return dyn_cast<ICmpInst>(BI->getCondition()); } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. +/// linearFunctionTestReplace policy. Return true unless we can show that the +/// current exit test is already sufficiently canonical. static bool needsLFTR(Loop *L, DominatorTree *DT) { // Do LFTR to simplify the exit condition to an ICMP. ICmpInst *Cond = getLoopTest(L); @@ -1574,10 +1670,10 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, return false; // Optimistically handle other instructions. - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { - if (!Visited.insert(*OI).second) + for (Value *Op : I->operands()) { + if (!Visited.insert(Op).second) continue; - if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) + if (!hasConcreteDefImpl(Op, Visited, Depth+1)) return false; } return true; @@ -1594,8 +1690,8 @@ static bool hasConcreteDef(Value *V) { return hasConcreteDefImpl(V, Visited, 0); } -/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to -/// be rewritten) loop exit test. +/// Return true if this IV has any uses other than the (soon to be rewritten) +/// loop exit test. 
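hasConcreteDefImpl above is a depth-capped DFS over operands with a visited set; its generic shape, in standalone C++ with integer node IDs (an illustrative assumption, not the LLVM types):

#include <set>
#include <vector>

// ops[n] lists n's operands; bad[n] marks undef-like nodes. Shared
// subtrees are scanned once thanks to the visited set, and deep
// expressions are conservatively rejected by the depth cap.
bool allConcrete(int n, const std::vector<std::vector<int>> &ops,
                 const std::vector<bool> &bad, std::set<int> &visited,
                 unsigned depth) {
  if (bad[n])
    return false;
  if (depth > 6)
    return false; // give up rather than recurse without bound
  for (int op : ops[n]) {
    if (!visited.insert(op).second)
      continue; // already checked
    if (!allConcrete(op, ops, bad, visited, depth + 1))
      return false;
  }
  return true;
}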
static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); Value *IncV = Phi->getIncomingValue(LatchIdx); @@ -1608,7 +1704,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { return true; } -/// FindLoopCounter - Find an affine IV in canonical form. +/// Find an affine IV in canonical form. /// /// BECount may be an i8* pointer type. The pointer difference is already /// valid count without scaling the address stride, so it remains a pointer @@ -1702,8 +1798,8 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, return BestPhi; } -/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that -/// holds the RHS of the new loop test. +/// Help linearFunctionTestReplace by generating a value that holds the RHS of +/// the new loop test. static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, SCEVExpander &Rewriter, ScalarEvolution *SE) { const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); @@ -1785,13 +1881,13 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, } } -/// LinearFunctionTestReplace - This method rewrites the exit condition of the -/// loop to be a canonical != comparison against the incremented loop induction -/// variable. This pass is able to rewrite the exit tests of any loop where the -/// SCEV analysis can determine a loop-invariant trip count of the loop, which -/// is actually a much broader range than just linear tests. +/// This method rewrites the exit condition of the loop to be a canonical != +/// comparison against the incremented loop induction variable. This pass is +/// able to rewrite the exit tests of any loop where the SCEV analysis can +/// determine a loop-invariant trip count of the loop, which is actually a much +/// broader range than just linear tests. Value *IndVarSimplify:: -LinearFunctionTestReplace(Loop *L, +linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter) { @@ -1809,7 +1905,7 @@ LinearFunctionTestReplace(Loop *L, // This addition may overflow, which is valid as long as the comparison is // truncated to BackedgeTakenCount->getType(). IVCount = SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); + SE->getOne(BackedgeTakenCount->getType())); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. @@ -1847,8 +1943,8 @@ LinearFunctionTestReplace(Loop *L, const SCEV *ARStep = AR->getStepRecurrence(*SE); // For constant IVCount, avoid truncation. if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { - const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); - APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); + const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt(); + APInt Count = cast<SCEVConstant>(IVCount)->getAPInt(); // Note that the post-inc value of BackedgeTakenCount may have overflowed // above such that IVCount is now zero. if (IVCount != BackedgeTakenCount && Count == 0) { @@ -1886,21 +1982,21 @@ LinearFunctionTestReplace(Loop *L, } //===----------------------------------------------------------------------===// -// SinkUnusedInvariants. A late subpass to cleanup loop preheaders. +// sinkUnusedInvariants. A late subpass to cleanup loop preheaders. 
//===----------------------------------------------------------------------===// /// If there's a single exit block, sink any loop-invariant values that /// were defined in the preheader but not used inside the loop into the /// exit block to reduce register pressure in the loop. -void IndVarSimplify::SinkUnusedInvariants(Loop *L) { +void IndVarSimplify::sinkUnusedInvariants(Loop *L) { BasicBlock *ExitBlock = L->getExitBlock(); if (!ExitBlock) return; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); - BasicBlock::iterator I = Preheader->getTerminator(); + Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt(); + BasicBlock::iterator I(Preheader->getTerminator()); while (I != Preheader->begin()) { --I; // New instructions were inserted at the end of the preheader. @@ -1920,8 +2016,8 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; - // Skip landingpad instructions. - if (isa<LandingPadInst>(I)) + // Skip eh pad instructions. + if (I->isEHPad()) continue; // Don't sink alloca: we never want to sink static alloca's out of the @@ -1953,7 +2049,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { continue; // Otherwise, sink it to the exit block. - Instruction *ToMove = I; + Instruction *ToMove = &*I; bool Done = false; if (I != Preheader->begin()) { @@ -1994,7 +2090,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return false; LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; @@ -2007,7 +2103,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. - RewriteNonIntegerIVs(L); + rewriteNonIntegerIVs(L); const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); @@ -2024,7 +2120,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. Rewriter.disableCanonicalMode(); - SimplifyAndExtend(L, Rewriter, LPM); + simplifyAndExtend(L, Rewriter, LI); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -2034,7 +2130,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // if (ReplaceExitValue != NeverRepl && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) - RewriteLoopExitValues(L, Rewriter); + rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); @@ -2054,7 +2150,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // explicitly check any assumptions made by SCEV. Brittle. 
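As background for the linearFunctionTestReplace call below: the IVCount = BackedgeTakenCount + 1 computation noted earlier may wrap, which is why the exit comparison stays in (or is truncated to) the narrow type. A two-assert standalone demonstration:

#include <cassert>
#include <cstdint>

int main() {
  // With an i8 trip count, a loop whose backedge is taken 255 times runs
  // 256 times, and the +1 wraps the count to 0 in the narrow type.
  uint8_t backedgeTaken = 255;
  uint8_t ivCount = (uint8_t)(backedgeTaken + 1);
  assert(ivCount == 0);
  return 0;
}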
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); if (!AR || AR->getLoop()->getLoopPreheader()) - (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } } @@ -2074,13 +2170,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop-invariant instructions in the preheader that aren't used in the // loop may be sunk below the loop to reduce register pressure. - SinkUnusedInvariants(L); + sinkUnusedInvariants(L); // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader(), TLI); + // Check a post-condition. - assert(L->isLCSSAForm(*DT) && - "Indvars did not leave the loop in lcssa form!"); + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index cbdacad..dea61f6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -214,8 +214,8 @@ public: AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addRequired<BranchProbabilityInfo>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<BranchProbabilityInfoWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -224,8 +224,15 @@ public: char InductiveRangeCheckElimination::ID = 0; } -INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", - "Inductive range check elimination", false, false) +INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) const char *InductiveRangeCheck::rangeCheckKindToStr( InductiveRangeCheck::RangeCheckKind RCK) { @@ -1044,9 +1051,9 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( auto BBInsertLocation = std::next(Function::iterator(LS.Latch)); RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", - &F, BBInsertLocation); + &F, &*BBInsertLocation); RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, - BBInsertLocation); + &*BBInsertLocation); BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); bool Increasing = LS.IndVarIncreasing; @@ -1399,8 +1406,9 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { LLVMContext &Context = Preheader->getContext(); InductiveRangeCheck::AllocatorTy IRCAlloc; SmallVector<InductiveRangeCheck *, 16> RangeChecks; - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); - BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + BranchProbabilityInfo &BPI = + getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); for (auto BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) diff --git 
a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 1130d22..dcdcfed 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -18,15 +18,22 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -36,6 +43,8 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <memory> using namespace llvm; #define DEBUG_TYPE "jump-threading" @@ -49,6 +58,13 @@ BBDuplicateThreshold("jump-threading-threshold", cl::desc("Max block size to duplicate for jump threading"), cl::init(6), cl::Hidden); +static cl::opt<unsigned> +ImplicationSearchThreshold( + "jump-threading-implication-search-threshold", + cl::desc("The number of predecessors to search for a stronger " + "condition to use to thread over a weaker condition"), + cl::init(3), cl::Hidden); + namespace { // These are at global scope so static functions can use them too. 
typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; @@ -80,10 +96,13 @@ namespace { class JumpThreading : public FunctionPass { TargetLibraryInfo *TLI; LazyValueInfo *LVI; + std::unique_ptr<BlockFrequencyInfo> BFI; + std::unique_ptr<BranchProbabilityInfo> BPI; + bool HasProfileData; #ifdef NDEBUG - SmallPtrSet<BasicBlock*, 16> LoopHeaders; + SmallPtrSet<const BasicBlock *, 16> LoopHeaders; #else - SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders; + SmallSet<AssertingVH<const BasicBlock>, 16> LoopHeaders; #endif DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; @@ -114,9 +133,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } + void releaseMemory() override { + BFI.reset(); + BPI.reset(); + } + void FindLoopHeaders(Function &F); bool ProcessBlock(BasicBlock *BB); bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, @@ -134,9 +159,17 @@ namespace { bool ProcessBranchOnPHI(PHINode *PN); bool ProcessBranchOnXOR(BinaryOperator *BO); + bool ProcessImpliedCondition(BasicBlock *BB); bool SimplifyPartiallyRedundantLoad(LoadInst *LI); bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); + bool TryToUnfoldSelectInCurrBB(BasicBlock *BB); + + private: + BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, + const char *Suffix); + void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, + BasicBlock *NewBB, BasicBlock *SuccBB); }; } @@ -160,23 +193,34 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); + BFI.reset(); + BPI.reset(); + // When profile data is available, we need to update edge weights after + // successful jump threading, which requires both BPI and BFI being available. HasProfileData = F.getEntryCount().hasValue(); + if (HasProfileData) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } // Remove unreachable blocks from the function as they may result in an infinite // loop. We do threading if we found something profitable. Jump threading a // branch can create other opportunities. If these opportunities form a cycle - // i.e. if any jump treading is undoing previous threading in the path, then + // i.e. if any jump threading is undoing previous threading in the path, then // we will loop forever. We take care of this issue by not jump threading for // back edges. This works for normal cases but not for unreachable blocks as // they may have a cycle with no back edge. - removeUnreachableBlocks(F); + bool EverChanged = false; + EverChanged |= removeUnreachableBlocks(F, LVI); FindLoopHeaders(F); - bool Changed, EverChanged = false; + bool Changed; do { Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E;) { - BasicBlock *BB = I; + BasicBlock *BB = &*I; // Thread all of the branches we can over this block. while (ProcessBlock(BB)) Changed = true; @@ -239,11 +283,26 @@ bool JumpThreading::runOnFunction(Function &F) { static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, unsigned Threshold) { /// Ignore PHI nodes; these will be flattened when duplication happens.
- BasicBlock::const_iterator I = BB->getFirstNonPHI(); + BasicBlock::const_iterator I(BB->getFirstNonPHI()); // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. + unsigned Bonus = 0; + const TerminatorInst *BBTerm = BB->getTerminator(); + // Threading through a switch statement is particularly profitable. If this + // block ends in a switch, decrease its cost to make it more likely to happen. + if (isa<SwitchInst>(BBTerm)) + Bonus = 6; + + // The same holds for indirect branches, but slightly more so. + if (isa<IndirectBrInst>(BBTerm)) + Bonus = 8; + + // Bump the threshold up so the early exit from the loop doesn't skip the + // terminator-based Size adjustment at the end. + Threshold += Bonus; + // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; @@ -260,6 +319,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, if (isa<BitCastInst>(I) && I->getType()->isPointerTy()) continue; + // Bail out if this instruction gives back a token type, it is not possible + // to duplicate it if it is used outside this BB. + if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) + return ~0U; + // All other instructions count for at least one unit. ++Size; @@ -268,7 +332,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->cannotDuplicate()) + if (CI->cannotDuplicate() || CI->isConvergent()) // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated. return ~0U; @@ -279,16 +343,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, } } - // Threading through a switch statement is particularly profitable. If this - // block ends in a switch, decrease its cost to make it more likely to happen. - if (isa<SwitchInst>(I)) - Size = Size > 6 ? Size-6 : 0; - - // The same holds for indirect branches, but slightly more so. - if (isa<IndirectBrInst>(I)) - Size = Size > 8 ? Size-8 : 0; - - return Size; + return Size > Bonus ? Size - Bonus : 0; } /// FindLoopHeaders - We do not want jump threading to turn proper loop @@ -310,8 +365,8 @@ void JumpThreading::FindLoopHeaders(Function &F) { SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; FindFunctionBackedges(F, Edges); - for (unsigned i = 0, e = Edges.size(); i != e; ++i) - LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second)); + for (const auto &Edge : Edges) + LoopHeaders.insert(Edge.second); } /// getKnownConstant - Helper method to determine if we can thread over a @@ -357,8 +412,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // If V is a constant, then it is known in all predecessors. if (Constant *KC = getKnownConstant(V, Preference)) { - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - Result.push_back(std::make_pair(KC, *PI)); + for (BasicBlock *Pred : predecessors(BB)) + Result.push_back(std::make_pair(KC, Pred)); return true; } @@ -381,8 +436,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // "X < 4" and "X < 3" is known true but "X < 4" itself is not available. // Perhaps getConstantOnEdge should be smart enough to do this? 
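The getJumpThreadDuplicationCost hunk above reworks the switch/indirect-branch discount: instead of subtracting it after the size loop, where the early exit on Size > Threshold could return before the adjustment ever ran, the bonus is now folded into the threshold up front and subtracted at the end. A rough standalone sketch of the resulting arithmetic; InstKind, the per-instruction weights, and duplicationCost are illustrative stand-ins, not the LLVM API:

    #include <cstdio>
    #include <vector>

    enum class InstKind { Plain, Free, Call, Switch, IndirectBr, NoDuplicate };

    unsigned duplicationCost(const std::vector<InstKind> &Body, InstKind Term,
                             unsigned Threshold) {
      unsigned Bonus = 0;
      if (Term == InstKind::Switch)
        Bonus = 6;            // threading through a switch is extra profitable
      if (Term == InstKind::IndirectBr)
        Bonus = 8;            // and through an indirect branch, slightly more so
      // Add the bonus to the threshold up front so the early exit below can no
      // longer skip the terminator-based adjustment (the bug the hunk fixes).
      Threshold += Bonus;

      unsigned Size = 0;
      for (InstKind K : Body) {
        if (Size > Threshold)           // block already too big to duplicate
          return Size;
        if (K == InstKind::Free)        // e.g. debug intrinsics, pointer bitcasts
          continue;
        if (K == InstKind::NoDuplicate) // noduplicate/convergent calls: never copy
          return ~0U;
        Size += (K == InstKind::Call) ? 3 : 1; // calls cost roughly 3x
      }
      return Size > Bonus ? Size - Bonus : 0;
    }

    int main() {
      std::vector<InstKind> Body(7, InstKind::Plain);
      // Seven plain instructions exceed the default threshold of 6 on their
      // own, but the switch terminator's bonus keeps the block duplicatable.
      std::printf("%u\n", duplicationCost(Body, InstKind::Switch, 6)); // 1
    }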
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI); @@ -438,22 +492,17 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // Scan for the sentinel. If we find an undef, force it to the // interesting value: x|undef -> true and x&undef -> false. - for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) - if (LHSVals[i].first == InterestingVal || - isa<UndefValue>(LHSVals[i].first)) { - Result.push_back(LHSVals[i]); - Result.back().first = InterestingVal; - LHSKnownBBs.insert(LHSVals[i].second); + for (const auto &LHSVal : LHSVals) + if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) { + Result.emplace_back(InterestingVal, LHSVal.second); + LHSKnownBBs.insert(LHSVal.second); } - for (unsigned i = 0, e = RHSVals.size(); i != e; ++i) - if (RHSVals[i].first == InterestingVal || - isa<UndefValue>(RHSVals[i].first)) { + for (const auto &RHSVal : RHSVals) + if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) { // If we already inferred a value for this block on the LHS, don't // re-add it. - if (!LHSKnownBBs.count(RHSVals[i].second)) { - Result.push_back(RHSVals[i]); - Result.back().first = InterestingVal; - } + if (!LHSKnownBBs.count(RHSVal.second)) + Result.emplace_back(InterestingVal, RHSVal.second); } return !Result.empty(); @@ -469,8 +518,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, return false; // Invert the known values. - for (unsigned i = 0, e = Result.size(); i != e; ++i) - Result[i].first = ConstantExpr::getNot(Result[i].first); + for (auto &R : Result) + R.first = ConstantExpr::getNot(R.first); return true; } @@ -485,12 +534,12 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, WantInteger, CxtI); // Try to use constant folding to simplify the binary operator. - for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { - Constant *V = LHSVals[i].first; + for (const auto &LHSVal : LHSVals) { + Constant *V = LHSVal.first; Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); if (Constant *KC = getKnownConstant(Folded, WantInteger)) - Result.push_back(std::make_pair(KC, LHSVals[i].second)); + Result.push_back(std::make_pair(KC, LHSVal.second)); } } @@ -538,8 +587,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB);PI != E; ++PI){ - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. 
LazyValueInfo::Tristate Res = @@ -562,12 +610,12 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, WantInteger, CxtI); - for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { - Constant *V = LHSVals[i].first; + for (const auto &LHSVal : LHSVals) { + Constant *V = LHSVal.first; Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), V, CmpConst); if (Constant *KC = getKnownConstant(Folded, WantInteger)) - Result.push_back(std::make_pair(KC, LHSVals[i].second)); + Result.push_back(std::make_pair(KC, LHSVal.second)); } return !Result.empty(); @@ -584,8 +632,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, if ((TrueVal || FalseVal) && ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds, WantInteger, CxtI)) { - for (unsigned i = 0, e = Conds.size(); i != e; ++i) { - Constant *Cond = Conds[i].first; + for (auto &C : Conds) { + Constant *Cond = C.first; // Figure out what value to use for the condition. bool KnownCond; @@ -602,7 +650,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // See if the select has a known constant value for this predecessor. if (Constant *Val = KnownCond ? TrueVal : FalseVal) - Result.push_back(std::make_pair(Val, Conds[i].second)); + Result.push_back(std::make_pair(Val, C.second)); } return !Result.empty(); @@ -612,8 +660,8 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // If all else fails, see if LVI can figure out a constant value for us. Constant *CI = LVI->getConstant(V, BB, CxtI); if (Constant *KC = getKnownConstant(CI, Preference)) { - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - Result.push_back(std::make_pair(KC, *PI)); + for (BasicBlock *Pred : predecessors(BB)) + Result.push_back(std::make_pair(KC, Pred)); } return !Result.empty(); @@ -669,7 +717,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - if (SinglePred->getTerminator()->getNumSuccessors() == 1 && + const TerminatorInst *TI = SinglePred->getTerminator(); + if (!TI->isExceptional() && TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) @@ -682,6 +731,9 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } } + if (TryToUnfoldSelectInCurrBB(BB)) + return true; + // What kind of constant we're looking for. ConstantPreference Preference = WantInteger; @@ -761,7 +813,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. We only handle comparisons // against a constant at this time. - // TODO: This should be extended to handle switches as well. + // TODO: This should be extended to handle switches as well. 
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); if (CondBr && CondConst && CondBr->isConditional()) { @@ -829,9 +881,40 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst)); + // Search for a stronger dominating condition that can be used to simplify a + // conditional branch leaving BB. + if (ProcessImpliedCondition(BB)) + return true; + + return false; +} + +bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { + auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + Value *Cond = BI->getCondition(); + BasicBlock *CurrentBB = BB; + BasicBlock *CurrentPred = BB->getSinglePredecessor(); + unsigned Iter = 0; - // TODO: If we have: "br (X > 0)" and we have a predecessor where we know - // "(X == 4)", thread through this block. + auto &DL = BB->getModule()->getDataLayout(); + + while (CurrentPred && Iter++ < ImplicationSearchThreshold) { + auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); + if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB) + return false; + + if (isImpliedCondition(PBI->getCondition(), Cond, DL)) { + BI->getSuccessor(1)->removePredecessor(BB); + BranchInst::Create(BI->getSuccessor(0), BI); + BI->eraseFromParent(); + return true; + } + CurrentBB = CurrentPred; + CurrentPred = CurrentBB->getSinglePredecessor(); + } return false; } @@ -850,10 +933,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; - // If the load is defined in a landing pad, it can't be partially redundant, - // because the edges between the invoke and the landing pad cannot have other + // If the load is defined in an EH pad, it can't be partially redundant, + // because the edges between the invoke and the EH pad cannot have other // instructions between them. - if (LoadBB->isLandingPad()) + if (LoadBB->isEHPad()) return false; Value *LoadedPtr = LI->getOperand(0); @@ -866,11 +949,11 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. - BasicBlock::iterator BBIt = LI; + BasicBlock::iterator BBIt(LI); if (Value *AvailableVal = - FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) { - // If the value if the load is locally available within the block, just use + FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) { + // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; @@ -903,10 +986,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If we got here, the loaded value is transparent through to the start of the // block. Check to see if it is available in any of the predecessor blocks. - for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); - PI != PE; ++PI) { - BasicBlock *PredBB = *PI; - + for (BasicBlock *PredBB : predecessors(LoadBB)) { // If we already scanned this predecessor, skip it. if (!PredsScanned.insert(PredBB).second) continue; @@ -914,7 +994,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. 
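ProcessImpliedCondition, added above, walks up a chain of unique predecessors (capped by -jump-threading-implication-search-threshold) and folds the branch when a dominating condition on the true edge already implies the current one, e.g. a dominating x < 3 implies x < 4. A toy model of that search; Block, Cond, and implies are invented stand-ins for the LLVM types, with implication reduced to comparing integer upper bounds:

    #include <cstdio>

    struct Cond { int Bound; };          // models the condition "x < Bound"

    // (x < A.Bound) implies (x < B.Bound) exactly when A.Bound <= B.Bound;
    // this stands in for LLVM's isImpliedCondition.
    static bool implies(Cond A, Cond B) { return A.Bound <= B.Bound; }

    struct Block {
      Cond BrCond;        // condition of this block's conditional branch
      Block *TrueSucc;    // successor taken when BrCond holds
      Block *SinglePred;  // unique predecessor, or nullptr if there are several
    };

    // Walk up to Threshold single-predecessor hops; if every hop enters along
    // the predecessor's true edge and some predecessor's condition implies
    // BB's, then BB's conditional branch can fold to its true successor.
    static bool impliedByDominatingCondition(Block *BB, unsigned Threshold) {
      Cond C = BB->BrCond;
      Block *Cur = BB;
      Block *Pred = BB->SinglePred;
      for (unsigned Iter = 0; Pred && Iter < Threshold; ++Iter) {
        if (Pred->TrueSucc != Cur)   // must arrive along the true edge
          return false;
        if (implies(Pred->BrCond, C))
          return true;
        Cur = Pred;
        Pred = Cur->SinglePred;
      }
      return false;
    }

    int main() {
      // pred: br (x < 3), bb, ...   bb: br (x < 4), t, f   =>  bb folds to t.
      Block BB = {{4}, nullptr, nullptr};
      Block Pred = {{3}, &BB, nullptr};
      BB.SinglePred = &Pred;
      std::printf("%d\n", impliedByDominatingCondition(&BB, 3) ? 1 : 0); // 1
    }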
BBIt = PredBB->end(); AAMDNodes ThisAATags; - Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, + Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, + DefMaxInstsToScan, nullptr, &ThisAATags); if (!PredAvailable) { OneUnavailablePred = PredBB; @@ -952,13 +1033,11 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { SmallVector<BasicBlock*, 8> PredsToSplit; SmallPtrSet<BasicBlock*, 8> AvailablePredSet; - for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i) - AvailablePredSet.insert(AvailablePreds[i].first); + for (const auto &AvailablePred : AvailablePreds) + AvailablePredSet.insert(AvailablePred.first); // Add all the unavailable predecessors to the PredsToSplit list. - for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); - PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(LoadBB)) { // If the predecessor is an indirect goto, we can't split the edge. if (isa<IndirectBrInst>(P->getTerminator())) return false; @@ -968,8 +1047,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { } // Split them out to their own block. - UnavailablePred = - SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); + UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be @@ -995,7 +1073,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Create a PHI node at the start of the block for the PRE'd load value. pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", - LoadBB->begin()); + &LoadBB->front()); PN->takeName(LI); PN->setDebugLoc(LI->getDebugLoc()); @@ -1044,9 +1122,9 @@ FindMostPopularDest(BasicBlock *BB, // blocks with known and real destinations to threading undef. We'll handle // them later if interesting. DenseMap<BasicBlock*, unsigned> DestPopularity; - for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) - if (PredToDestList[i].second) - DestPopularity[PredToDestList[i].second]++; + for (const auto &PredToDest : PredToDestList) + if (PredToDest.second) + DestPopularity[PredToDest.second]++; // Find the most popular dest. DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin(); @@ -1109,10 +1187,10 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, "ComputeValueKnownInPredecessors returned true with no values"); DEBUG(dbgs() << "IN BB: " << *BB; - for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { + for (const auto &PredValue : PredValues) { dbgs() << " BB '" << BB->getName() << "': FOUND condition = " - << *PredValues[i].first - << " for pred '" << PredValues[i].second->getName() << "'.\n"; + << *PredValue.first + << " for pred '" << PredValue.second->getName() << "'.\n"; }); // Decide what we want to thread through. Convert our list of known values to @@ -1125,8 +1203,8 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, BasicBlock *OnlyDest = nullptr; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; - for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { - BasicBlock *Pred = PredValues[i].second; + for (const auto &PredValue : PredValues) { + BasicBlock *Pred = PredValue.second; if (!SeenPreds.insert(Pred).second) continue; // Duplicate predecessor entry. 
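FindMostPopularDest, modernized above to range-based for, is a frequency count over the (predecessor, destination) pairs followed by an arg-max, skipping null destinations that stand for undef/unknown. A minimal standalone version; the names are illustrative, and the real code additionally breaks ties by successor order, which this sketch ignores:

    #include <cstdio>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Count how many predecessors branch to each destination and return the
    // most popular one; null destinations are skipped, matching the
    // DestPopularity loop in the hunk above.
    template <typename BlockT>
    BlockT *findMostPopularDest(
        const std::vector<std::pair<BlockT *, BlockT *>> &PredToDest) {
      std::unordered_map<BlockT *, unsigned> Popularity;
      for (const auto &PD : PredToDest)
        if (PD.second)
          ++Popularity[PD.second];

      BlockT *Best = nullptr;
      unsigned BestCount = 0;
      for (const auto &Entry : Popularity)
        if (Entry.second > BestCount) {
          Best = Entry.first;
          BestCount = Entry.second;
        }
      return Best;
    }

    int main() {
      struct Block { const char *Name; };
      Block A = {"a"}, B = {"b"}, P1 = {"p1"}, P2 = {"p2"}, P3 = {"p3"};
      std::vector<std::pair<Block *, Block *>> PredToDest = {
          {&P1, &A}, {&P2, &B}, {&P3, &B}};
      std::printf("%s\n", findMostPopularDest(PredToDest)->Name); // "b"
    }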
@@ -1135,7 +1213,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, if (isa<IndirectBrInst>(Pred->getTerminator())) continue; - Constant *Val = PredValues[i].first; + Constant *Val = PredValue.first; BasicBlock *DestBB; if (isa<UndefValue>(Val)) @@ -1175,16 +1253,15 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, // Now that we know what the most popular destination is, factor all // predecessors that will jump to it into a single predecessor. SmallVector<BasicBlock*, 16> PredsToFactor; - for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) - if (PredToDestList[i].second == MostPopularDest) { - BasicBlock *Pred = PredToDestList[i].first; + for (const auto &PredToDest : PredToDestList) + if (PredToDest.second == MostPopularDest) { + BasicBlock *Pred = PredToDest.first; // This predecessor may be a switch or something else that has multiple // edges to the block. Factor each of these edges by listing them // according to # occurrences in PredsToFactor. - TerminatorInst *PredTI = Pred->getTerminator(); - for (unsigned i = 0, e = PredTI->getNumSuccessors(); i != e; ++i) - if (PredTI->getSuccessor(i) == BB) + for (BasicBlock *Succ : successors(Pred)) + if (Succ == BB) PredsToFactor.push_back(Pred); } @@ -1262,7 +1339,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Into: // BB': // %Y = icmp ne i32 %A, %B - // br i1 %Z, ... + // br i1 %Y, ... PredValueInfoTy XorOpValues; bool isLHS = true; @@ -1281,11 +1358,11 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Scan the information to see which is most popular: true or false. The // predecessors can be of the set true, false, or undef. unsigned NumTrue = 0, NumFalse = 0; - for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { - if (isa<UndefValue>(XorOpValues[i].first)) + for (const auto &XorOpValue : XorOpValues) { + if (isa<UndefValue>(XorOpValue.first)) // Ignore undefs for the count. continue; - if (cast<ConstantInt>(XorOpValues[i].first)->isZero()) + if (cast<ConstantInt>(XorOpValue.first)->isZero()) ++NumFalse; else ++NumTrue; @@ -1301,12 +1378,11 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Collect all of the blocks that this can be folded into so that we can // factor this once and clone it once. SmallVector<BasicBlock*, 8> BlocksToFoldInto; - for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { - if (XorOpValues[i].first != SplitVal && - !isa<UndefValue>(XorOpValues[i].first)) + for (const auto &XorOpValue : XorOpValues) { + if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first)) continue; - BlocksToFoldInto.push_back(XorOpValues[i].second); + BlocksToFoldInto.push_back(XorOpValue.second); } // If we inferred a value for all of the predecessors, then duplication won't @@ -1387,14 +1463,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // And finally, do it! @@ -1415,6 +1491,13 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BB->getParent(), BB); NewBB->moveAfter(PredBB); + // Set the block frequency of NewBB. 
+ if (HasProfileData) { + auto NewBBFreq = + BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); @@ -1425,7 +1508,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, Instruction *New = BI->clone(); New->setName(BI->getName()); NewBB->getInstList().push_back(New); - ValueMapping[BI] = New; + ValueMapping[&*BI] = New; // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) @@ -1438,7 +1521,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. - BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB); + BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the @@ -1451,10 +1534,10 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // PHI insertion, of which we are prepared to do, clean these up now. SSAUpdater SSAUpdate; SmallVector<Use*, 16> UsesToRename; - for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + for (Instruction &I : *BB) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. - for (Use &U : I->uses()) { + for (Use &U : I.uses()) { Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { if (UserPN->getIncomingBlock(U) == BB) @@ -1469,14 +1552,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, if (UsesToRename.empty()) continue; - DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. - SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); @@ -1499,11 +1582,98 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // frequently happens because of phi translation. SimplifyInstructionsInBlock(NewBB, TLI); + // Update the edge weight from BB to SuccBB, which should be less than before. + UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); + // Threaded an edge! ++NumThreads; return true; } +/// Create a new basic block that will be the predecessor of BB and successor of +/// all blocks in Preds. When profile data is available, update the frequency of +/// this new block. +BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB, + ArrayRef<BasicBlock *> Preds, + const char *Suffix) { + // Collect the frequencies of all predecessors of BB, which will be used to + // update the edge weight on BB->SuccBB.
+ BlockFrequency PredBBFreq(0); + if (HasProfileData) + for (auto Pred : Preds) + PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB); + + BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix); + + // Set the block frequency of the newly created PredBB, which is the sum of + // frequencies of Preds. + if (HasProfileData) + BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency()); + return PredBB; +} + +/// Update the block frequency of BB and branch weight and the metadata on the +/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - +/// Freq(PredBB->BB) / Freq(BB->SuccBB). +void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, + BasicBlock *BB, + BasicBlock *NewBB, + BasicBlock *SuccBB) { + if (!HasProfileData) + return; + + assert(BFI && BPI && "BFI & BPI should have been created here"); + + // As the edge from PredBB to BB is deleted, we have to update the block + // frequency of BB. + auto BBOrigFreq = BFI->getBlockFreq(BB); + auto NewBBFreq = BFI->getBlockFreq(NewBB); + auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); + auto BBNewFreq = BBOrigFreq - NewBBFreq; + BFI->setBlockFreq(BB, BBNewFreq.getFrequency()); + + // Collect updated outgoing edges' frequencies from BB and use them to update + // edge probabilities. + SmallVector<uint64_t, 4> BBSuccFreq; + for (BasicBlock *Succ : successors(BB)) { + auto SuccFreq = (Succ == SuccBB) + ? BB2SuccBBFreq - NewBBFreq + : BBOrigFreq * BPI->getEdgeProbability(BB, Succ); + BBSuccFreq.push_back(SuccFreq.getFrequency()); + } + + uint64_t MaxBBSuccFreq = + *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); + + SmallVector<BranchProbability, 4> BBSuccProbs; + if (MaxBBSuccFreq == 0) + BBSuccProbs.assign(BBSuccFreq.size(), + {1, static_cast<unsigned>(BBSuccFreq.size())}); + else { + for (uint64_t Freq : BBSuccFreq) + BBSuccProbs.push_back( + BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq)); + // Normalize edge probabilities so that they sum up to one. + BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), + BBSuccProbs.end()); + } + + // Update edge probabilities in BPI. + for (int I = 0, E = BBSuccProbs.size(); I < E; I++) + BPI->setEdgeProbability(BB, I, BBSuccProbs[I]); + + if (BBSuccProbs.size() >= 2) { + SmallVector<uint32_t, 4> Weights; + for (auto Prob : BBSuccProbs) + Weights.push_back(Prob.getNumerator()); + + auto TI = BB->getTerminator(); + TI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); + } +} + /// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch /// to BB which contains an i1 PHI node and a conditional branch on that PHI. /// If we can duplicate the contents of BB up into PredBB do so now, this @@ -1530,14 +1700,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // Okay, we decided to do this! 
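The new UpdateBlockFreqAndEdgeWeight above deducts the threaded-away frequency from BB, recomputes each outgoing edge's frequency, and converts the results back into normalized branch probabilities for the prof metadata. A simplified take on just the normalization step, assuming the frequencies are already collected; Denom is an arbitrary stand-in for BranchProbability's denominator and overflow handling is omitted for the sketch:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Turn raw successor frequencies into weights that sum (approximately) to
    // a fixed denominator -- a simplified take on getBranchProbability plus
    // normalizeProbabilities in the hunk above.
    static std::vector<uint32_t> normalize(const std::vector<uint64_t> &Freq,
                                           uint32_t Denom = 1u << 20) {
      std::vector<uint32_t> W(Freq.size());
      uint64_t Sum = 0;
      for (uint64_t F : Freq)
        Sum += F;
      if (Sum == 0) {                // all successors look cold: split evenly
        for (uint32_t &X : W)
          X = static_cast<uint32_t>(Denom / W.size());
        return W;
      }
      for (size_t I = 0; I < Freq.size(); ++I)
        W[I] = static_cast<uint32_t>(Freq[I] * Denom / Sum);
      return W;
    }

    int main() {
      // After threading, the BB->SuccBB edge keeps only the frequency not
      // siphoned off into NewBB, e.g. 10 units against 90 on the other edge.
      for (uint32_t W : normalize({10, 90}))
        std::printf("%u\n", W);      // ~10% and ~90% of Denom
    }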
Clone all the instructions in BB onto the end @@ -1581,12 +1751,12 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, if (Value *IV = SimplifyInstruction(New, BB->getModule()->getDataLayout())) { delete New; - ValueMapping[BI] = IV; + ValueMapping[&*BI] = IV; } else { // Otherwise, insert the new instruction into the block. New->setName(BI->getName()); - PredBB->getInstList().insert(OldPredBranch, New); - ValueMapping[BI] = New; + PredBB->getInstList().insert(OldPredBranch->getIterator(), New); + ValueMapping[&*BI] = New; } } @@ -1604,10 +1774,10 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // PHI insertion, of which we are prepared to do, clean these up now. SSAUpdater SSAUpdate; SmallVector<Use*, 16> UsesToRename; - for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + for (Instruction &I : *BB) { // Scan all uses of this instruction to see if it is used outside of its // block, and if so, record them in UsesToRename. - for (Use &U : I->uses()) { + for (Use &U : I.uses()) { Instruction *User = cast<Instruction>(U.getUser()); if (PHINode *UserPN = dyn_cast<PHINode>(User)) { if (UserPN->getIncomingBlock(U) == BB) @@ -1622,14 +1792,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, if (UsesToRename.empty()) continue; - DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. - SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); @@ -1724,3 +1894,62 @@ bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { } return false; } + +/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form +/// bb: +/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ... +/// %s = select p, trueval, falseval +/// +/// And expand the select into a branch structure. This later enables +/// jump-threading over bb in this pass. +/// +/// Using a similar approach to SimplifyCFG::FoldCondBranchOnPHI(), unfold the +/// select if the associated PHI has at least one constant. If the unfolded +/// select is not jump-threaded, it will be folded again in later +/// optimizations. +bool JumpThreading::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { + // If threading this would thread across a loop header, don't thread the edge. + // See the comments above FindLoopHeaders for justifications and caveats. + if (LoopHeaders.count(BB)) + return false; + + // Look for a Phi/Select pair in the same basic block. The Phi feeds the + // condition of the Select and at least one of the incoming values is a + // constant.
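Before the function body below, a source-level picture of the unfolding this comment describes may help; the following is a hand-written analogue of the transformation, not output of the pass:

    // Before unfolding: the select hides the control dependence, so jump
    // threading cannot exploit the constant phi inputs feeding %p.
    int before(bool p, int tv, int fv) {
      return p ? tv : fv;      // %s = select i1 %p, i32 %tv, i32 %fv
    }

    // After a SplitBlockAndInsertIfThen-style unfolding: the same value
    // computed through an explicit branch; only a "then" block is inserted,
    // so the false value flows along the fall-through edge into the new phi.
    int after(bool p, int tv, int fv) {
      int s = fv;              // fall-through (false) path keeps falseval
      if (p)
        s = tv;                // inserted "then" block supplies trueval
      return s;                // s corresponds to the new two-input phi
    }

    int main() { return before(true, 1, 2) == after(true, 1, 2) ? 0 : 1; }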
+ for (BasicBlock::iterator BI = BB->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { + unsigned NumPHIValues = PN->getNumIncomingValues(); + if (NumPHIValues == 0 || !PN->hasOneUse()) + continue; + + SelectInst *SI = dyn_cast<SelectInst>(PN->user_back()); + if (!SI || SI->getParent() != BB) + continue; + + Value *Cond = SI->getCondition(); + if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1)) + continue; + + bool HasConst = false; + for (unsigned i = 0; i != NumPHIValues; ++i) { + if (PN->getIncomingBlock(i) == BB) + return false; + if (isa<ConstantInt>(PN->getIncomingValue(i))) + HasConst = true; + } + + if (HasConst) { + // Expand the select. + TerminatorInst *Term = + SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); + PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); + NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); + NewPN->addIncoming(SI->getFalseValue(), BB); + SI->replaceAllUsesWith(NewPN); + SI->eraseFromParent(); + return true; + } + } + + return false; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 43fc50e..8923ff7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -34,10 +34,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -72,10 +75,12 @@ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::desc("Disable memory promotion in LICM pass")); static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); -static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop); +static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, + const LICMSafetyInfo *SafetyInfo); static bool hoist(Instruction &I, BasicBlock *Preheader); static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, - const Loop *CurLoop, AliasSetTracker *CurAST ); + const Loop *CurLoop, AliasSetTracker *CurAST, + const LICMSafetyInfo *SafetyInfo); static bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -89,10 +94,10 @@ static bool isSafeToExecuteUnconditionally(const Instruction &Inst, static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, const AAMDNodes &AAInfo, AliasSetTracker *CurAST); -static Instruction *CloneInstructionInExitBlock(const Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN, - const LoopInfo *LI); +static Instruction * +CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, + const LoopInfo *LI, + const LICMSafetyInfo *SafetyInfo); static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, @@ -118,9 +123,12 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + 
AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -164,9 +172,12 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) Pass *llvm::createLICMPass() { return new LICM(); } @@ -183,7 +194,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Get our Loop and Alias Analysis information... LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -192,9 +203,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST = new AliasSetTracker(*AA); // Collect Alias info from subloops. - for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end(); - LoopItr != LoopItrE; ++LoopItr) { - Loop *InnerL = *LoopItr; + for (Loop *InnerL : L->getSubLoops()) { AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL]; assert(InnerAST && "Where is my AST?"); @@ -216,9 +225,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Because subloops have already been incorporated into AST, we skip blocks in // subloops. // - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) { - BasicBlock *BB = *I; + for (BasicBlock *BB : L->blocks()) { if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops. CurAST->add(*BB); // Incorporate the specified basic block } @@ -252,9 +259,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { PredIteratorCache PIC; // Loop over all of the alias sets in the tracker object. - for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); - I != E; ++I) - Changed |= promoteLoopAccessesToScalars(*I, ExitBlocks, InsertPts, + for (AliasSet &AS : *CurAST) + Changed |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT, CurLoop, CurAST, &SafetyInfo); @@ -264,9 +270,10 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // FIXME: This is really heavy handed. It would be a bit better to use an // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. - if (Changed) - formLCSSARecursively(*L, *DT, LI, - getAnalysisIfAvailable<ScalarEvolution>()); + if (Changed) { + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr); + } } // Check that neither this loop nor its parent have had LCSSA broken. LICM is @@ -312,9 +319,9 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // We are processing blocks in reverse dfo, so process children first. 
const std::vector<DomTreeNode*> &Children = N->getChildren(); - for (unsigned i = 0, e = Children.size(); i != e; ++i) - Changed |= - sinkRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + for (DomTreeNode *Child : Children) + Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). if (inSubLoop(BB,CurLoop,LI)) return Changed; @@ -338,10 +345,10 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // outside of the loop. In this case, it doesn't even matter if the // operands of the instruction are loop invariant. // - if (isNotUsedInLoop(I, CurLoop) && + if (isNotUsedInLoop(I, CurLoop, SafetyInfo) && canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo)) { ++II; - Changed |= sink(I, LI, DT, CurLoop, CurAST); + Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo); } } return Changed; @@ -395,14 +402,13 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, } const std::vector<DomTreeNode*> &Children = N->getChildren(); - for (unsigned i = 0, e = Children.size(); i != e; ++i) - Changed |= - hoistRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + for (DomTreeNode *Child : Children) + Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); return Changed; } /// Computes loop safety information, checks loop body & header -/// for the possiblity of may throw exception. +/// for the possibility of may throw exception. /// void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { assert(CurLoop != nullptr && "CurLoop cant be null"); @@ -410,7 +416,7 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { // Setting default safety values. SafetyInfo->MayThrow = false; SafetyInfo->HeaderMayThrow = false; - // Iterate over header and compute dafety info. + // Iterate over header and compute safety info. for (BasicBlock::iterator I = Header->begin(), E = Header->end(); (I != E) && !SafetyInfo->HeaderMayThrow; ++I) SafetyInfo->HeaderMayThrow |= I->mayThrow(); @@ -422,6 +428,14 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); (I != E) && !SafetyInfo->MayThrow; ++I) SafetyInfo->MayThrow |= I->mayThrow(); + + // Compute funclet colors if we might sink/hoist in a function with a funclet + // personality routine. + Function *Fn = CurLoop->getHeader()->getParent(); + if (Fn->hasPersonalityFn()) + if (Constant *PersonalityFn = Fn->getPersonalityFn()) + if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))) + SafetyInfo->BlockColors = colorEHFunclets(*Fn); } /// canSinkOrHoistInst - Return true if the hoister and sinker can handle this @@ -445,7 +459,7 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; if (LI->getType()->isSized()) - Size = AA->getTypeStoreSize(LI->getType()); + Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType()); AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); @@ -456,17 +470,30 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, if (isa<DbgInfoIntrinsic>(I)) return false; + // Don't sink calls which can throw. + if (CI->mayThrow()) + return false; + // Handle simple cases by querying alias analysis. 
- AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); - if (Behavior == AliasAnalysis::DoesNotAccessMemory) + FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); + if (Behavior == FMRB_DoesNotAccessMemory) return true; if (AliasAnalysis::onlyReadsMemory(Behavior)) { + // A readonly argmemonly function only reads from memory pointed to by + // its arguments with arbitrary offsets. If we can prove there are no + // writes to this memory in the loop, we can hoist or sink. + if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) { + for (Value *Op : CI->arg_operands()) + if (Op->getType()->isPointerTy() && + pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize, + AAMDNodes(), CurAST)) + return false; + return true; + } // If this call only reads from memory and there are no writes to memory // in the loop, we can hoist or sink the call as appropriate. bool FoundMod = false; - for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); - I != E; ++I) { - AliasSet &AS = *I; + for (AliasSet &AS : *CurAST) { if (!AS.isForwardingAliasSet() && AS.isMod()) { FoundMod = true; break; @@ -513,10 +540,24 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) { /// the loop. If this is true, we can sink the instruction to the exit /// blocks of the loop. /// -static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) { +static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, + const LICMSafetyInfo *SafetyInfo) { + const auto &BlockColors = SafetyInfo->BlockColors; for (const User *U : I.users()) { const Instruction *UI = cast<Instruction>(U); if (const PHINode *PN = dyn_cast<PHINode>(UI)) { + const BasicBlock *BB = PN->getParent(); + // We cannot sink uses in catchswitches. + if (isa<CatchSwitchInst>(BB->getTerminator())) + return false; + + // We need to sink a callsite to a unique funclet. Avoid sinking if the + // phi use is too muddled. + if (isa<CallInst>(I)) + if (!BlockColors.empty() && + BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1) + return false; + // A PHI node where all of the incoming values are this instruction are // special -- they can just be RAUW'ed with the instruction and thus // don't require a use in the predecessor. This is a particular important @@ -544,11 +585,41 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) { return true; } -static Instruction *CloneInstructionInExitBlock(const Instruction &I, - BasicBlock &ExitBlock, - PHINode &PN, - const LoopInfo *LI) { - Instruction *New = I.clone(); +static Instruction * +CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, + const LoopInfo *LI, + const LICMSafetyInfo *SafetyInfo) { + Instruction *New; + if (auto *CI = dyn_cast<CallInst>(&I)) { + const auto &BlockColors = SafetyInfo->BlockColors; + + // Sinking call-sites need to be handled differently from other + // instructions. The cloned call-site needs a funclet bundle operand + // appropriate for its location in the CFG.
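The canSinkOrHoistInst change above adds a middle case between "no memory access" and plain "readonly": a readonly argmemonly call can move if none of its pointer arguments may be written inside the loop, so only those locations are queried instead of every alias set. A schematic version of that decision ladder; MemBehavior, PtrArg, and the write-set representation are invented for the sketch, whereas the pass consults its AliasSetTracker:

    #include <vector>

    enum class MemBehavior {
      DoesNotAccessMemory, ReadOnlyArgMem, ReadOnly, Other
    };

    struct PtrArg { int Id; }; // identifies the memory a pointer arg names

    // Stand-in for pointerInvalidatedByLoop: is there a possible store in the
    // loop to the memory this pointer argument refers to?
    static bool mayBeWrittenInLoop(const PtrArg &P,
                                   const std::vector<int> &LoopWrites) {
      for (int Id : LoopWrites)
        if (Id == P.Id)
          return true;
      return false;
    }

    // Mirrors the new ladder: no memory access is always movable;
    // readonly+argmemonly only needs its pointer arguments checked; plain
    // readonly needs the whole loop to be free of stores.
    static bool canHoistOrSinkCall(MemBehavior B,
                                   const std::vector<PtrArg> &PtrArgs,
                                   const std::vector<int> &LoopWrites) {
      if (B == MemBehavior::DoesNotAccessMemory)
        return true;
      if (B == MemBehavior::ReadOnlyArgMem) {
        for (const PtrArg &P : PtrArgs)
          if (mayBeWrittenInLoop(P, LoopWrites))
            return false;
        return true;
      }
      if (B == MemBehavior::ReadOnly)
        return LoopWrites.empty(); // any modifying alias set blocks the motion
      return false;
    }

    int main() {
      std::vector<PtrArg> Args = {{1}, {2}};
      std::vector<int> LoopWrites = {3}; // the loop writes only to location 3
      // Movable as argmemonly even though the loop is not store-free overall.
      return canHoistOrSinkCall(MemBehavior::ReadOnlyArgMem, Args, LoopWrites)
                 ? 0 : 1;
    }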
+ SmallVector<OperandBundleDef, 1> OpBundles; + for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles(); + BundleIdx != BundleEnd; ++BundleIdx) { + OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx); + if (Bundle.getTagID() == LLVMContext::OB_funclet) + continue; + + OpBundles.emplace_back(Bundle); + } + + if (!BlockColors.empty()) { + const ColorVector &CV = BlockColors.find(&ExitBlock)->second; + assert(CV.size() == 1 && "non-unique color for exit block!"); + BasicBlock *BBColor = CV.front(); + Instruction *EHPad = BBColor->getFirstNonPHI(); + if (EHPad->isEHPad()) + OpBundles.emplace_back("funclet", EHPad); + } + + New = CallInst::Create(CI, OpBundles); + } else { + New = I.clone(); + } + ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New); if (!I.getName().empty()) New->setName(I.getName() + ".le"); @@ -566,7 +637,7 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I, if (!OLoop->contains(&PN)) { PHINode *OpPN = PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), - OInst->getName() + ".lcssa", ExitBlock.begin()); + OInst->getName() + ".lcssa", &ExitBlock.front()); for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); *OI = OpPN; @@ -580,7 +651,8 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I, /// position, and may either delete it or move it to outside of the loop. /// static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, - const Loop *CurLoop, AliasSetTracker *CurAST ) { + const Loop *CurLoop, AliasSetTracker *CurAST, + const LICMSafetyInfo *SafetyInfo) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); bool Changed = false; if (isa<LoadInst>(I)) ++NumMovedLoads; @@ -631,7 +703,7 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, New = It->second; else New = SunkCopies[ExitBlock] = - CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI); + CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo); PN->replaceAllUsesWith(New); PN->eraseFromParent(); @@ -651,6 +723,10 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) { // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); + // Metadata can be dependent on the condition we are hoisting above. + // Conservatively strip all metadata on the instruction. + I.dropUnknownNonDebugMetadata(); + if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumHoisted; @@ -699,8 +775,8 @@ static bool isGuaranteedToExecute(const Instruction &Inst, CurLoop->getExitBlocks(ExitBlocks); // Verify that the block dominates each of the exit blocks of the loop. - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) + for (BasicBlock *ExitBlock : ExitBlocks) + if (!DT->dominates(Inst.getParent(), ExitBlock)) return false; // As a degenerate case, if the loop is statically infinite then we haven't @@ -730,9 +806,9 @@ namespace { if (!L->contains(BB)) { // We need to create an LCSSA PHI node for the incoming value and // store that. 
- PHINode *PN = PHINode::Create( - I->getType(), PredCache.size(BB), - I->getName() + ".lcssa", BB->begin()); + PHINode *PN = + PHINode::Create(I->getType(), PredCache.size(BB), + I->getName() + ".lcssa", &BB->front()); for (BasicBlock *Pred : PredCache.get(BB)) PN->addIncoming(I, Pred); return PN; @@ -867,17 +943,17 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // If there is an non-load/store instruction in the loop, we can't promote // it. - if (const LoadInst *load = dyn_cast<LoadInst>(UI)) { - assert(!load->isVolatile() && "AST broken"); - if (!load->isSimple()) + if (const LoadInst *Load = dyn_cast<LoadInst>(UI)) { + assert(!Load->isVolatile() && "AST broken"); + if (!Load->isSimple()) return Changed; - } else if (const StoreInst *store = dyn_cast<StoreInst>(UI)) { + } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. if (UI->getOperand(1) != ASIV) continue; - assert(!store->isVolatile() && "AST broken"); - if (!store->isSimple()) + assert(!Store->isVolatile() && "AST broken"); + if (!Store->isSimple()) return Changed; // Don't sink stores from loops without dedicated block exits. Exits // containing indirect branches are not transformed by loop simplify, @@ -895,7 +971,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // restrictive (and performant) alignment and if we are sure this // instruction will be executed, update the alignment. // Larger is better, with the exception of 0 being the best alignment. - unsigned InstAlignment = store->getAlignment(); + unsigned InstAlignment = Store->getAlignment(); if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) { GuaranteedToExecute = true; @@ -925,6 +1001,21 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, if (!GuaranteedToExecute) return Changed; + // Figure out the loop exits and their insertion points, if this is the + // first promotion. + if (ExitBlocks.empty()) { + CurLoop->getUniqueExitBlocks(ExitBlocks); + InsertPts.clear(); + InsertPts.reserve(ExitBlocks.size()); + for (BasicBlock *ExitBlock : ExitBlocks) + InsertPts.push_back(&*ExitBlock->getFirstInsertionPt()); + } + + // Can't insert into a catchswitch. + for (BasicBlock *ExitBlock : ExitBlocks) + if (isa<CatchSwitchInst>(ExitBlock->getTerminator())) + return Changed; + // Otherwise, this is safe to promote, lets do it! DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n'); Changed = true; @@ -936,15 +1027,6 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // location is better than none. DebugLoc DL = LoopUses[0]->getDebugLoc(); - // Figure out the loop exits and their insertion points, if this is the - // first promotion. - if (ExitBlocks.empty()) { - CurLoop->getUniqueExitBlocks(ExitBlocks); - InsertPts.resize(ExitBlocks.size()); - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt(); - } - // We use the SSAUpdater interface to insert phi nodes as required. SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); @@ -973,7 +1055,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, return Changed; } -/// Simple Analysis hook. Clone alias set info. +/// Simple analysis hook. Clone alias set info. 
/// void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp index c19cd19..1648878 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -56,7 +57,7 @@ class LoadCombine : public BasicBlockPass { public: LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); + initializeLoadCombinePass(*PassRegistry::getPassRegistry()); } using llvm::Pass::doInitialization; @@ -223,7 +224,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { if (skipOptnoneFunction(BB)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); IRBuilder<true, TargetFolder> TheBuilder( BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); @@ -262,8 +263,8 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } char LoadCombine::ID = 0; @@ -274,7 +275,8 @@ BasicBlockPass *llvm::createLoadCombinePass() { INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 98b068e..7b1940b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" @@ -35,18 +36,19 @@ namespace { } // Possibly eliminate loop L if it is dead. 
- bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } @@ -64,7 +66,7 @@ INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", @@ -130,7 +132,7 @@ bool LoopDeletion::isLoopDead(Loop *L, /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. -bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; @@ -169,7 +171,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; @@ -242,9 +244,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { for (BasicBlock *BB : blocks) loopInfo.removeBlock(BB); - // The last step is to inform the loop pass manager that we've - // eliminated this loop. - LPM.deleteLoopFromQueue(L); + // The last step is to update LoopInfo now that we've eliminated this loop. + loopInfo.markAsRemoved(L); Changed = true; ++NumDeleted; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1b9859b..3d3cf3e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <list> @@ -54,6 +55,11 @@ static cl::opt<bool> DistributeNonIfConvertible( "if-convertible by the loop vectorizer"), cl::init(false)); +static cl::opt<unsigned> DistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Distribution")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -164,9 +170,7 @@ public: // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. 
- for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) { - auto *Inst = *I; - + for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); Inst->eraseFromParent(); @@ -373,7 +377,7 @@ public: /// \brief This performs the main chunk of the work of cloning the loops for /// the partitions. - void cloneLoops(Pass *P) { + void cloneLoops() { BasicBlock *OrigPH = L->getLoopPreheader(); // At this point the predecessor of the preheader is either the memcheck // block or the top part of the original preheader. @@ -547,11 +551,11 @@ public: MemoryInstructionDependences( const SmallVectorImpl<Instruction *> &Instructions, - const SmallVectorImpl<Dependence> &InterestingDependences) { + const SmallVectorImpl<Dependence> &Dependences) { Accesses.append(Instructions.begin(), Instructions.end()); DEBUG(dbgs() << "Backward dependences:\n"); - for (auto &Dep : InterestingDependences) + for (auto &Dep : Dependences) if (Dep.isPossiblyBackward()) { // Note that the designations source and destination follow the program // order, i.e. source is always first. (The direction is given by the @@ -567,25 +571,6 @@ private: AccessesType Accesses; }; -/// \brief Returns the instructions that use values defined in the loop. -static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { - SmallVector<Instruction *, 8> UsedOutside; - - for (auto *Block : L->getBlocks()) - // FIXME: I believe that this could use copy_if if the Inst reference could - // be adapted into a pointer. - for (auto &Inst : *Block) { - auto Users = Inst.users(); - if (std::any_of(Users.begin(), Users.end(), [&](User *U) { - auto *Use = cast<Instruction>(U); - return !L->contains(Use->getParent()); - })) - UsedOutside.push_back(&Inst); - } - - return UsedOutside; -} - /// \brief The pass class. class LoopDistribute : public FunctionPass { public: @@ -597,6 +582,7 @@ public: LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); LAA = &getAnalysis<LoopAccessAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Build up a worklist of inner-loops to vectorize. This is necessary as the // act of distributing a loop creates new loops and can invalidate iterators @@ -619,6 +605,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<LoopAccessAnalysis>(); @@ -629,6 +616,45 @@ public: static char ID; private: + /// \brief Filter out checks between pointers from the same partition. + /// + /// \p PtrToPartition contains the partition number for pointers. Partition + /// number -1 means that the pointer is used in multiple partitions. In this + /// case we can't safely omit the check. 
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> + includeOnlyCrossPartitionChecks( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, + const SmallVectorImpl<int> &PtrToPartition, + const RuntimePointerChecking *RtPtrChecking) { + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (unsigned PtrIdx1 : Check.first->Members) + for (unsigned PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) + if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1, PtrIdx2)) + return true; + return false; + }); + + return Checks; + } + /// \brief Try to distribute an inner-most loop. bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -655,9 +681,8 @@ private: DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); return false; } - auto *InterestingDependences = - LAI.getDepChecker().getInterestingDependences(); - if (!InterestingDependences || InterestingDependences->empty()) { + auto *Dependences = LAI.getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) { DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); return false; } @@ -685,7 +710,7 @@ private: // NumUnsafeDependencesActive reaches 0. const MemoryDepChecker &DepChecker = LAI.getDepChecker(); MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), - *InterestingDependences); + *Dependences); int NumUnsafeDependencesActive = 0; for (auto &InstDep : MID) { @@ -735,6 +760,13 @@ private: return false; } + // Don't distribute the loop if we need too many SCEV run-time checks. + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); + if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { + DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + return false; + } + DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from // instructions to partitions. @@ -746,20 +778,25 @@ private: if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) SplitBlock(PH, PH->getTerminator(), DT, LI); - // If we need run-time checks to disambiguate pointers are run-time, version - // the loop now. + // If we need run-time checks, version the loop now. 
auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); - LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition); - if (LVer.needsRuntimeChecks()) { + const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); + const auto &AllChecks = RtPtrChecking->getChecks(); + auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, + RtPtrChecking); + + if (!Pred.isAlwaysTrue() || !Checks.empty()) { DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition)); - LVer.versionLoop(this); - LVer.addPHINodes(DefsUsedOutside); + DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LoopVersioning LVer(LAI, L, LI, DT, SE, false); + LVer.setAliasChecks(std::move(Checks)); + LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LVer.versionLoop(DefsUsedOutside); } // Create identical copies of the original loop for each partition and hook // them up sequentially. - Partitions.cloneLoops(this); + Partitions.cloneLoops(); // Now, we remove the instruction from each loop that don't belong to that // partition. @@ -780,6 +817,7 @@ private: LoopInfo *LI; LoopAccessAnalysis *LAA; DominatorTree *DT; + ScalarEvolution *SE; }; } // anonymous namespace @@ -790,6 +828,7 @@ INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a21ca24..4521640 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -31,11 +31,6 @@ // void foo(_Complex float *P) // for (i) { __real__(*P) = 0; __imag__(*P) = 0; } // -// We should enhance this to handle negative strides through memory. -// Alternatively (and perhaps better) we could rely on an earlier pass to force -// forward iteration through memory, which is generally better for cache -// behavior. Negative strides *do* happen for memset/memcpy loops. -// // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). // @@ -44,7 +39,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -67,149 +65,87 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { - class LoopIdiomRecognize; +class LoopIdiomRecognize : public LoopPass { + Loop *CurLoop; + AliasAnalysis *AA; + DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + static char ID; + explicit LoopIdiomRecognize() : LoopPass(ID) { + initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + } - /// This class defines some utility functions for loop idiom recognization. 
- class LIRUtil { - public: - /// Return true iff the block contains nothing but an uncondition branch - /// (aka goto instruction). - static bool isAlmostEmpty(BasicBlock *); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } - static BranchInst *getBranch(BasicBlock *BB) { - return dyn_cast<BranchInst>(BB->getTerminator()); - } +private: + typedef SmallVector<StoreInst *, 8> StoreList; + StoreList StoreRefsForMemset; + StoreList StoreRefsForMemcpy; + bool HasMemset; + bool HasMemsetPattern; + bool HasMemcpy; - /// Derive the precondition block (i.e the block that guards the loop - /// preheader) from the given preheader. - static BasicBlock *getPrecondBb(BasicBlock *PreHead); - }; - - /// This class is to recoginize idioms of population-count conducted in - /// a noncountable loop. Currently it only recognizes this pattern: - /// \code - /// while(x) {cnt++; ...; x &= x - 1; ...} - /// \endcode - class NclPopcountRecognize { - LoopIdiomRecognize &LIR; - Loop *CurLoop; - BasicBlock *PreCondBB; - - typedef IRBuilder<> IRBuilderTy; - - public: - explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); - bool recognize(); - - private: - /// Take a glimpse of the loop to see if we need to go ahead recoginizing - /// the idiom. - bool preliminaryScreen(); - - /// Check if the given conditional branch is based on the comparison - /// between a variable and zero, and if the variable is non-zero, the - /// control yields to the loop entry. If the branch matches the behavior, - /// the variable involved in the comparion is returned. This function will - /// be called to see if the precondition and postcondition of the loop - /// are in desirable form. - Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; - - /// Return true iff the idiom is detected in the loop. and 1) \p CntInst - /// is set to the instruction counting the population bit. 2) \p CntPhi - /// is set to the corresponding phi node. 3) \p Var is set to the value - /// whose population bits are being counted. - bool detectIdiom - (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; - - /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); - - /// Create llvm.ctpop.* intrinsic function. 
- CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); - }; - - class LoopIdiomRecognize : public LoopPass { - Loop *CurLoop; - DominatorTree *DT; - ScalarEvolution *SE; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - public: - static char ID; - explicit LoopIdiomRecognize() : LoopPass(ID) { - initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - DT = nullptr; - SE = nullptr; - TLI = nullptr; - TTI = nullptr; - } + /// \name Countable Loop Idiom Handling + /// @{ - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks); - - bool processLoopStore(StoreInst *SI, const SCEV *BECount); - bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - - bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, - Value *SplatValue, Instruction *TheStore, - const SCEVAddRecExpr *Ev, - const SCEV *BECount); - bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount); - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks); - DominatorTree *getDominatorTree() { - return DT ? DT - : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); - } + void collectStores(BasicBlock *BB); + bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy); + bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - ScalarEvolution *getScalarEvolution() { - return SE ? SE : (SE = &getAnalysis<ScalarEvolution>()); - } + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *StoredVal, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride); + bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); - TargetLibraryInfo *getTargetLibraryInfo() { - if (!TLI) - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + /// @} + /// \name Noncountable Loop Idiom Handling + /// @{ - return TLI; - } + bool runOnNoncountableLoop(); - const TargetTransformInfo *getTargetTransformInfo() { - return TTI ? TTI - : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *CurLoop->getHeader()->getParent())); - } + bool recognizePopcount(); + void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, + PHINode *CntPhi, Value *Var); - Loop *getLoop() const { return CurLoop; } + /// @} +}; - private: - bool runOnNoncountableLoop(); - bool runOnCountableLoop(); - }; -} +} // End anonymous namespace. 
char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", @@ -218,9 +154,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -242,448 +181,244 @@ static void deleteDeadInstruction(Instruction *I, //===----------------------------------------------------------------------===// // -// Implementation of LIRUtil -// -//===----------------------------------------------------------------------===// - -// This function will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader function is -// "almost" empty such that generated intrinsic functions can be moved across -// the preheader and be placed at the end of the precondition block without -// the concern of breaking data dependence. -bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { - if (BranchInst *Br = getBranch(BB)) { - return Br->isUnconditional() && Br == BB->begin(); - } - return false; -} - -BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { - if (BasicBlock *BB = PreHead->getSinglePredecessor()) { - BranchInst *Br = getBranch(BB); - return Br && Br->isConditional() ? BB : nullptr; - } - return nullptr; -} - -//===----------------------------------------------------------------------===// -// -// Implementation of NclPopcountRecognize +// Implementation of LoopIdiomRecognize // //===----------------------------------------------------------------------===// -NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): - LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { -} - -bool NclPopcountRecognize::preliminaryScreen() { - const TargetTransformInfo *TTI = LIR.getTargetTransformInfo(); - if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) - return false; - - // Counting population are usually conducted by few arithmetic instructions. - // Such instructions can be easilly "absorbed" by vacant slots in a - // non-compact loop. Therefore, recognizing popcount idiom only makes sense - // in a compact loop. - - // Give up if the loop has multiple blocks or multiple backedges. - if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) - return false; - - BasicBlock *LoopBody = *(CurLoop->block_begin()); - if (LoopBody->size() >= 20) { - // The loop is too big, bail out. +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) return false; - } - // It should have a preheader containing nothing but a goto instruction. - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead)) + CurLoop = L; + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. 
+ if (!L->getLoopPreheader()) return false; - // It should have a precondition block where the generated popcount instrinsic - // function will be inserted. - PreCondBB = LIRUtil::getPrecondBb(PreHead); - if (!PreCondBB) + // Disable loop idiom recognition if the function's name is a common idiom. + StringRef Name = L->getHeader()->getParent()->getName(); + if (Name == "memset" || Name == "memcpy") return false; - return true; -} - -Value *NclPopcountRecognize::matchCondition(BranchInst *Br, - BasicBlock *LoopEntry) const { - if (!Br || !Br->isConditional()) - return nullptr; + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *CurLoop->getHeader()->getParent()); + DL = &CurLoop->getHeader()->getModule()->getDataLayout(); - ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); - if (!Cond) - return nullptr; + HasMemset = TLI->has(LibFunc::memset); + HasMemsetPattern = TLI->has(LibFunc::memset_pattern16); + HasMemcpy = TLI->has(LibFunc::memcpy); - ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); - if (!CmpZero || !CmpZero->isZero()) - return nullptr; - - ICmpInst::Predicate Pred = Cond->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || - (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) - return Cond->getOperand(0); + if (HasMemset || HasMemsetPattern || HasMemcpy) + if (SE->hasLoopInvariantBackedgeTakenCount(L)) + return runOnCountableLoop(); - return nullptr; + return runOnNoncountableLoop(); } -bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, - PHINode *&CntPhi, - Value *&Var) const { - // Following code tries to detect this idiom: - // - // if (x0 != 0) - // goto loop-exit // the precondition of the loop - // cnt0 = init-val; - // do { - // x1 = phi (x0, x2); - // cnt1 = phi(cnt0, cnt2); - // - // cnt2 = cnt1 + 1; - // ... - // x2 = x1 & (x1 - 1); - // ... - // } while(x != 0); - // - // loop-exit: - // - - // step 1: Check to see if the look-back branch match this pattern: - // "if (a!=0) goto loop-entry". - BasicBlock *LoopEntry; - Instruction *DefX2, *CountInst; - Value *VarX1, *VarX0; - PHINode *PhiX, *CountPhi; - - DefX2 = CountInst = nullptr; - VarX1 = VarX0 = nullptr; - PhiX = CountPhi = nullptr; - LoopEntry = *(CurLoop->block_begin()); - - // step 1: Check if the loop-back branch is in desirable form. 
- { - if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry)) - DefX2 = dyn_cast<Instruction>(T); - else - return false; - } - - // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" - { - if (!DefX2 || DefX2->getOpcode() != Instruction::And) - return false; - - BinaryOperator *SubOneOp; - - if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) - VarX1 = DefX2->getOperand(1); - else { - VarX1 = DefX2->getOperand(0); - SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); - } - if (!SubOneOp) - return false; - - Instruction *SubInst = cast<Instruction>(SubOneOp); - ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); - if (!Dec || - !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || - (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) { - return false; - } - } +bool LoopIdiomRecognize::runOnCountableLoop() { + const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); + assert(!isa<SCEVCouldNotCompute>(BECount) && + "runOnCountableLoop() called on a loop without a predictable" + "backedge-taken count"); - // step 3: Check the recurrence of variable X - { - PhiX = dyn_cast<PHINode>(VarX1); - if (!PhiX || - (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { + // If this loop executes exactly one time, then it should be peeled, not + // optimized by this pass. + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + if (BECst->getAPInt() == 0) return false; - } - } - // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 - { - CountInst = nullptr; - for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), - IterE = LoopEntry->end(); Iter != IterE; Iter++) { - Instruction *Inst = Iter; - if (Inst->getOpcode() != Instruction::Add) - continue; - - ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); - if (!Inc || !Inc->isOne()) - continue; - - PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) - continue; - - // Check if the result of the instruction is live of the loop. - bool LiveOutLoop = false; - for (User *U : Inst->users()) { - if ((cast<Instruction>(U))->getParent() != LoopEntry) { - LiveOutLoop = true; break; - } - } - - if (LiveOutLoop) { - CountInst = Inst; - CountPhi = Phi; - break; - } - } + SmallVector<BasicBlock *, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); - if (!CountInst) - return false; - } + DEBUG(dbgs() << "loop-idiom Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() << "] Loop %" + << CurLoop->getHeader()->getName() << "\n"); - // step 5: check if the precondition is in this form: - // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" - { - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader()); - if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) - return false; + bool MadeChange = false; + // Scan all the blocks in the loop that are not in subloops. + for (auto *BB : CurLoop->getBlocks()) { + // Ignore blocks in subloops. 
+ if (LI->getLoopFor(BB) != CurLoop) + continue; - CntInst = CountInst; - CntPhi = CountPhi; - Var = T; + MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); } - - return true; + return MadeChange; } -void NclPopcountRecognize::transform(Instruction *CntInst, - PHINode *CntPhi, Value *Var) { - - ScalarEvolution *SE = LIR.getScalarEvolution(); - TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo(); - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - const DebugLoc DL = CntInst->getDebugLoc(); - - // Assuming before transformation, the loop is following: - // if (x) // the precondition - // do { cnt++; x &= x - 1; } while(x); - - // Step 1: Insert the ctpop instruction at the end of the precondition block - IRBuilderTy Builder(PreCondBr); - Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; - { - PopCnt = createPopcntIntrinsic(Builder, Var, DL); - NewCount = PopCntZext = - Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); - - if (NewCount != PopCnt) - (cast<Instruction>(NewCount))->setDebugLoc(DL); - - // TripCnt is exactly the number of iterations the loop has - TripCnt = NewCount; - - // If the population counter's initial value is not zero, insert Add Inst. - Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); - ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); - if (!InitConst || !InitConst->isZero()) { - NewCount = Builder.CreateAdd(NewCount, CntInitVal); - (cast<Instruction>(NewCount))->setDebugLoc(DL); - } - } - - // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to - // "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic - // function would be partial dead code, and downstream passes will drag - // it back from the precondition block to the preheader. - { - ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); - - Value *Opnd0 = PopCntZext; - Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); - if (PreCond->getOperand(0) != Var) - std::swap(Opnd0, Opnd1); - - ICmpInst *NewPreCond = - cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); - PreCondBr->setCondition(NewPreCond); +static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { + uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); + assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && + "Don't overflow unsigned."); + return (unsigned)SizeInBits >> 3; +} - RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); - } +static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { + const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); + return ConstStride->getAPInt().getZExtValue(); +} - // Step 3: Note that the population count is exactly the trip count of the - // loop in question, which enble us to to convert the loop from noncountable - // loop into a countable one. The benefit is twofold: - // - // - If the loop only counts population, the entire loop become dead after - // the transformation. It is lots easier to prove a countable loop dead - // than to prove a noncountable one. (In some C dialects, a infite loop - // isn't dead even if it computes nothing useful. In general, DCE needs - // to prove a noncountable loop finite before safely delete it.) - // - // - If the loop also performs something else, it remains alive. 
- // Since it is transformed to countable form, it can be aggressively - // optimized by some optimizations which are in general not applicable - // to a noncountable loop. - // - // After this step, this loop (conceptually) would look like following: - // newcnt = __builtin_ctpop(x); - // t = newcnt; - // if (x) - // do { cnt++; x &= x-1; t--) } while (t > 0); - BasicBlock *Body = *(CurLoop->block_begin()); - { - BranchInst *LbBr = LIRUtil::getBranch(Body); - ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); - Type *Ty = TripCnt->getType(); +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in. Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { + // If the value isn't a constant, we can't promote it to being in a constant + // array. We could theoretically do a store to an alloca or something, but + // that doesn't seem worthwhile. + Constant *C = dyn_cast<Constant>(V); + if (!C) + return nullptr; - PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); + // Only handle simple values that are a power of two bytes in size. + uint64_t Size = DL->getTypeSizeInBits(V->getType()); + if (Size == 0 || (Size & 7) || (Size & (Size - 1))) + return nullptr; - Builder.SetInsertPoint(LbCond); - Value *Opnd1 = cast<Value>(TcPhi); - Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); - Instruction *TcDec = - cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); + // Don't care enough about darwin/ppc to implement this. + if (DL->isBigEndian()) + return nullptr; - TcPhi->addIncoming(TripCnt, PreHead); - TcPhi->addIncoming(TcDec, Body); + // Convert to size in bytes. + Size /= 8; - CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? - CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; - LbCond->setPredicate(Pred); - LbCond->setOperand(0, TcDec); - LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); - } + // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see + // if the top and bottom are the same (e.g. for vectors and large integers). + if (Size > 16) + return nullptr; - // Step 4: All the references to the original population counter outside - // the loop are replaced with the NewCount -- the value returned from - // __builtin_ctpop(). - CntInst->replaceUsesOutsideBlock(NewCount, Body); + // If the constant is exactly 16 bytes, just use it. + if (Size == 16) + return C; - // step 5: Forget the "non-computable" trip-count SCEV associated with the - // loop. The loop would otherwise not be deleted even if it becomes empty. - SE->forgetLoop(CurLoop); + // Otherwise, we'll use an array of the constants. 
+ unsigned ArraySize = 16 / Size; + ArrayType *AT = ArrayType::get(V->getType(), ArraySize); + return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); } -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, - Value *Val, DebugLoc DL) { - Value *Ops[] = { Val }; - Type *Tys[] = { Val->getType() }; - - Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); - Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); - CallInst *CI = IRBuilder.CreateCall(Func, Ops); - CI->setDebugLoc(DL); - - return CI; -} +bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, + bool &ForMemcpy) { + // Don't touch volatile stores. + if (!SI->isSimple()) + return false; -/// recognize - detect population count idiom in a non-countable loop. If -/// detected, transform the relevant code to popcount intrinsic function -/// call, and return true; otherwise, return false. -bool NclPopcountRecognize::recognize() { + Value *StoredVal = SI->getValueOperand(); + Value *StorePtr = SI->getPointerOperand(); - if (!LIR.getTargetTransformInfo()) + // Reject stores that are so large that they overflow an unsigned. + uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); + if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) return false; - LIR.getScalarEvolution(); - - if (!preliminaryScreen()) + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. + const SCEVAddRecExpr *StoreEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) return false; - Instruction *CntInst; - PHINode *CntPhi; - Value *Val; - if (!detectIdiom(CntInst, CntPhi, Val)) + // Check to see if we have a constant stride. + if (!isa<SCEVConstant>(StoreEv->getOperand(1))) return false; - transform(CntInst, CntPhi, Val); - return true; -} + // See if the store can be turned into a memset. -//===----------------------------------------------------------------------===// -// -// Implementation of LoopIdiomRecognize -// -//===----------------------------------------------------------------------===// + // If the stored value is a byte-wise value (like i32 -1), then it may be + // turned into a memset of i8 -1, assuming that all the consecutive bytes + // are stored. A store of i32 0x01020304 can never be turned into a memset, + // but it can be turned into memset_pattern if the target supports it. + Value *SplatValue = isBytewiseValue(StoredVal); + Constant *PatternValue = nullptr; -bool LoopIdiomRecognize::runOnCountableLoop() { - const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); - assert(!isa<SCEVCouldNotCompute>(BECount) && - "runOnCountableLoop() called on a loop without a predictable" - "backedge-taken count"); + // If we're allowed to form a memset, and the stored value would be + // acceptable for memset, use it. + if (HasMemset && SplatValue && + // Verify that the stored value is loop invariant. If not, we can't + // promote the memset. + CurLoop->isLoopInvariant(SplatValue)) { + // It looks like we can use SplatValue. + ForMemset = true; + return true; + } else if (HasMemsetPattern && + // Don't create memset_pattern16s with address spaces. + StorePtr->getType()->getPointerAddressSpace() == 0 && + (PatternValue = getMemSetPatternValue(StoredVal, DL))) { + // It looks like we can use PatternValue! 
+ ForMemset = true; + return true; + } - // If this loop executes exactly one time, then it should be peeled, not - // optimized by this pass. - if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) - if (BECst->getValue()->getValue() == 0) + // Otherwise, see if the store can be turned into a memcpy. + if (HasMemcpy) { + // Check to see if the stride matches the size of the store. If so, then we + // know that every byte is touched in the loop. + unsigned Stride = getStoreStride(StoreEv); + unsigned StoreSize = getStoreSizeInBytes(SI, DL); + if (StoreSize != Stride && StoreSize != -Stride) return false; - // set DT - (void)getDominatorTree(); - - LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - - // set TLI - (void)getTargetLibraryInfo(); - - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); - - DEBUG(dbgs() << "loop-idiom Scanning: F[" - << CurLoop->getHeader()->getParent()->getName() - << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + // The store must be feeding a non-volatile load. + LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); + if (!LI || !LI->isSimple()) + return false; - bool MadeChange = false; - // Scan all the blocks in the loop that are not in subloops. - for (auto *BB : CurLoop->getBlocks()) { - // Ignore blocks in subloops. - if (LI.getLoopFor(BB) != CurLoop) - continue; + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided load. If we have something else, it's a + // random load we can't handle. + const SCEVAddRecExpr *LoadEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); + if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) + return false; - MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); - } - return MadeChange; -} + // The store and load must share the same stride. + if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) + return false; -bool LoopIdiomRecognize::runOnNoncountableLoop() { - NclPopcountRecognize Popcount(*this); - if (Popcount.recognize()) + // Success. This store can be converted into a memcpy. + ForMemcpy = true; return true; - + } + // This store can't be transformed into a memset/memcpy. return false; } -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - CurLoop = L; - - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!L->getLoopPreheader()) - return false; +void LoopIdiomRecognize::collectStores(BasicBlock *BB) { + StoreRefsForMemset.clear(); + StoreRefsForMemcpy.clear(); + for (Instruction &I : *BB) { + StoreInst *SI = dyn_cast<StoreInst>(&I); + if (!SI) + continue; - // Disable loop idiom recognition if the function's name is a common idiom. - StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") - return false; + bool ForMemset = false; + bool ForMemcpy = false; + // Make sure this is a strided store with a constant stride. + if (!isLegalStore(SI, ForMemset, ForMemcpy)) + continue; - SE = &getAnalysis<ScalarEvolution>(); - if (SE->hasLoopInvariantBackedgeTakenCount(L)) - return runOnCountableLoop(); - return runOnNoncountableLoop(); + // Save the store locations. 
+ if (ForMemset) + StoreRefsForMemset.push_back(SI); + else if (ForMemcpy) + StoreRefsForMemcpy.push_back(SI); + } } /// runOnLoopBlock - Process the specified block, which lives in a counted loop /// with the specified backedge count. This block is known to be in the current /// loop and not in any subloops. -bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks) { +bool LoopIdiomRecognize::runOnLoopBlock( + BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks) { // We can only promote stores in this block if they are unconditionally // executed in the loop. For a block to be unconditionally executed, it has // to dominate all the exit blocks of the loop. Verify this now. @@ -692,25 +427,24 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return false; bool MadeChange = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; - // Look for store instructions, which may be optimized to memset/memcpy. - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopStore(SI, BECount)) continue; - MadeChange = true; + // Look for store instructions, which may be optimized to memset/memcpy. + collectStores(BB); - // If processing the store invalidated our iterator, start over from the - // top of the block. - if (!InstPtr) - I = BB->begin(); - continue; - } + // Look for a single store which can be optimized into a memset. + for (auto &SI : StoreRefsForMemset) + MadeChange |= processLoopStore(SI, BECount); + // Optimize the store into a memcpy, if it feeds an similarly strided load. + for (auto &SI : StoreRefsForMemcpy) + MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *Inst = &*I++; // Look for memset instructions, which may be optimized to a larger memset. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopMemSet(MSI, BECount)) continue; + if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { + WeakVH InstPtr(&*I); + if (!processLoopMemSet(MSI, BECount)) + continue; MadeChange = true; // If processing the memset invalidated our iterator, start over from the @@ -724,71 +458,34 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return MadeChange; } - -/// processLoopStore - See if this store can be promoted to a memset or memcpy. +/// processLoopStore - See if this store can be promoted to a memset. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (!SI->isSimple()) return false; + assert(SI->isSimple() && "Expected only non-volatile stores."); Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); - // Reject stores that are so large that they overflow an unsigned. - auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); - uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType()); - if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) - return false; - - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided store. If we have something else, it's a - // random store we can't handle. 
- const SCEVAddRecExpr *StoreEv = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) - return false; - // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. - unsigned StoreSize = (unsigned)SizeInBits >> 3; - const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); - - if (!Stride || StoreSize != Stride->getValue()->getValue()) { - // TODO: Could also handle negative stride here someday, that will require - // the validity check in mayLoopAccessLocation to be updated though. - // Enable this to print exact negative strides. - if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) { - dbgs() << "NEGATIVE STRIDE: " << *SI << "\n"; - dbgs() << "BB: " << *SI->getParent(); - } - + const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + unsigned Stride = getStoreStride(StoreEv); + unsigned StoreSize = getStoreSizeInBytes(SI, DL); + if (StoreSize != Stride && StoreSize != -Stride) return false; - } - // See if we can optimize just this store in isolation. - if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount)) - return true; + bool NegStride = StoreSize == -Stride; - // If the stored value is a strided load in the same loop with the same stride - // this this may be transformable into a memcpy. This kicks in for stuff like - // for (i) A[i] = B[i]; - if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { - const SCEVAddRecExpr *LoadEv = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); - if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && - StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) - if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) - return true; - } - //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n"; - - return false; + // See if we can optimize just this store in isolation. + return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), + StoredVal, SI, StoreEv, BECount, NegStride); } /// processLoopMemSet - See if this memset can be promoted to a large memset. -bool LoopIdiomRecognize:: -processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { +bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, + const SCEV *BECount) { // We can only handle non-volatile memsets with a constant size. - if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false; + if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) + return false; // If we're not allowed to hack on memset, we fail. if (!TLI->has(LibFunc::memset)) @@ -817,18 +514,23 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { if (!Stride || MSI->getLength() != Stride->getValue()) return false; + // Verify that the memset value is loop invariant. If not, we can't promote + // the memset. + Value *SplatValue = MSI->getValue(); + if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) + return false; + return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), MSI->getValue(), - MSI, Ev, BECount); + MSI->getAlignment(), SplatValue, MSI, Ev, + BECount, /*NegStride=*/false); } - /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). 
-static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, - Loop *L, const SCEV *BECount, - unsigned StoreSize, AliasAnalysis &AA, +static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, Instruction *IgnoredStore) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts @@ -838,7 +540,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, // If the loop iterates a fixed number of times, we can refine the access size // to be exactly the size of the memset, which is (BECount+1)*StoreSize if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) - AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; + AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize; // TODO: For this to be really effective, we have to dive into the pointer // operand in the store. Store to &A[i] of 100 will always return may alias @@ -849,96 +551,55 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && - (AA.getModRefInfo(I, StoreLoc) & Access)) + if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) return true; return false; } -/// getMemSetPatternValue - If a strided store of the specified value is safe to -/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should -/// be passed in. Otherwise, return null. -/// -/// Note that we don't ever attempt to use memset_pattern8 or 4, because these -/// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { - // If the value isn't a constant, we can't promote it to being in a constant - // array. We could theoretically do a store to an alloca or something, but - // that doesn't seem worthwhile. - Constant *C = dyn_cast<Constant>(V); - if (!C) return nullptr; - - // Only handle simple values that are a power of two bytes in size. - uint64_t Size = DL.getTypeSizeInBits(V->getType()); - if (Size == 0 || (Size & 7) || (Size & (Size-1))) - return nullptr; - - // Don't care enough about darwin/ppc to implement this. - if (DL.isBigEndian()) - return nullptr; - - // Convert to size in bytes. - Size /= 8; - - // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see - // if the top and bottom are the same (e.g. for vectors and large integers). - if (Size > 16) return nullptr; - - // If the constant is exactly 16 bytes, just use it. - if (Size == 16) return C; - - // Otherwise, we'll use an array of the constants. - unsigned ArraySize = 16/Size; - ArrayType *AT = ArrayType::get(V->getType(), ArraySize); - return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); +// If we have a negative stride, Start refers to the end of the memory location +// we're trying to memset. Therefore, we need to recompute the base pointer, +// which is just Start - BECount*Size. 
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, + Type *IntPtr, unsigned StoreSize, + ScalarEvolution *SE) { + const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); + if (StoreSize != 1) + Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), + SCEV::FlagNUW); + return SE->getMinusSCEV(Start, Index); } - /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. -bool LoopIdiomRecognize:: -processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount) { - - // If the stored value is a byte-wise value (like i32 -1), then it may be - // turned into a memset of i8 -1, assuming that all the consecutive bytes - // are stored. A store of i32 0x01020304 can never be turned into a memset, - // but it can be turned into memset_pattern if the target supports it. +bool LoopIdiomRecognize::processLoopStridedStore( + Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; - auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); - unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); - // If we're allowed to form a memset, and the stored value would be acceptable - // for memset, use it. - if (SplatValue && TLI->has(LibFunc::memset) && - // Verify that the stored value is loop invariant. If not, we can't - // promote the memset. - CurLoop->isLoopInvariant(SplatValue)) { - // Keep and use SplatValue. - PatternValue = nullptr; - } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) && - (PatternValue = getMemSetPatternValue(StoredVal, DL))) { - // Don't create memset_pattern16s with address spaces. - // It looks like we can use PatternValue! - SplatValue = nullptr; - } else { - // Otherwise, this isn't an idiom we can transform. For example, we can't - // do anything with a 3-byte store. - return false; - } + if (!SplatValue) + PatternValue = getMemSetPatternValue(StoredVal, DL); + + assert((SplatValue || PatternValue) && + "Expected either splat value or pattern value."); // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. + unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + + const SCEV *Start = Ev->getStart(); + // Handle negative strided loops. + if (NegStride) + Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this @@ -946,12 +607,9 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. 
Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, - Preheader->getTerminator()); - - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { + Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); + if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, + *AA, TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -962,36 +620,30 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW); if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); } Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = Builder.CreateMemSet(BasePtr, - SplatValue, - NumBytes, - StoreAlignment); + NewCall = + Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getParent()->getParent()->getParent(); - Value *MSP = M->getOrInsertFunction("memset_pattern16", - Builder.getVoidTy(), - Int8PtrTy, - Int8PtrTy, - IntPtr, - (void*)nullptr); + Module *M = TheStore->getModule(); + Value *MSP = + M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), + Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. @@ -1015,26 +667,43 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, return true; } -/// processLoopStoreOfLoopLoad - We see a strided store whose value is a -/// same-strided load. -bool LoopIdiomRecognize:: -processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount) { - // If we're not allowed to form memcpy, we fail. - if (!TLI->has(LibFunc::memcpy)) - return false; +/// If the stored value is a strided load in the same loop with the same stride +/// this may be transformable into a memcpy. This kicks in for stuff like +/// for (i) A[i] = B[i]; +bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, + const SCEV *BECount) { + assert(SI->isSimple() && "Expected only non-volatile stores."); + Value *StorePtr = SI->getPointerOperand(); + const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + unsigned Stride = getStoreStride(StoreEv); + unsigned StoreSize = getStoreSizeInBytes(SI, DL); + bool NegStride = StoreSize == -Stride; + + // The store must be feeding a non-volatile load. LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); + assert(LI->isSimple() && "Expected only non-volatile stores."); + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided load. 
If we have something else, it's a + // random load we can't handle. + const SCEVAddRecExpr *LoadEv = + cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - const DataLayout &DL = Preheader->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); + + const SCEV *StrStart = StoreEv->getStart(); + unsigned StrAS = SI->getPointerAddressSpace(); + Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + + // Handle negative strided loops. + if (NegStride) + StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1042,29 +711,31 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // or write the memory region we're storing to. This includes the load that // feeds the stores. Check for an alias by generating the base address and // checking everything. - Value *StoreBasePtr = - Expander.expandCodeFor(StoreEv->getStart(), - Builder.getInt8PtrTy(SI->getPointerAddressSpace()), - Preheader->getTerminator()); - - if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, StoreSize, - getAnalysis<AliasAnalysis>(), SI)) { + Value *StoreBasePtr = Expander.expandCodeFor( + StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + + if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, + StoreSize, *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } + const SCEV *LdStart = LoadEv->getStart(); + unsigned LdAS = LI->getPointerAddressSpace(); + + // Handle negative strided loops. + if (NegStride) + LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. - Value *LoadBasePtr = - Expander.expandCodeFor(LoadEv->getStart(), - Builder.getInt8PtrTy(LI->getPointerAddressSpace()), - Preheader->getTerminator()); + Value *LoadBasePtr = Expander.expandCodeFor( + LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); - if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), SI)) { + if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, + *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -1074,34 +745,368 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, everything is safe, we can transform this! - // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
- Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); if (StoreSize != 1) NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = - Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, - std::min(SI->getAlignment(), LI->getAlignment())); + Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, + std::min(SI->getAlignment(), LI->getAlignment())); NewCall->setDebugLoc(SI->getDebugLoc()); DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - - // Okay, the memset has been formed. Zap the original store and anything that + // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. deleteDeadInstruction(SI, TLI); ++NumMemCpy; return true; } + +bool LoopIdiomRecognize::runOnNoncountableLoop() { + return recognizePopcount(); +} + +/// Check if the given conditional branch is based on the comparison between +/// a variable and zero, and if the variable is non-zero, the control yields to +/// the loop entry. If the branch matches the behavior, the variable involved +/// in the comparion is returned. This function will be called to see if the +/// precondition and postcondition of the loop are in desirable form. +static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { + if (!BI || !BI->isConditional()) + return nullptr; + + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return nullptr; + + ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); + if (!CmpZero || !CmpZero->isZero()) + return nullptr; + + ICmpInst::Predicate Pred = Cond->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) || + (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry)) + return Cond->getOperand(0); + + return nullptr; +} + +/// Return true iff the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction counting the population bit. +/// 2) \p CntPhi is set to the corresponding phi node. +/// 3) \p Var is set to the value whose population bits are being counted. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 != 0) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val; +/// do { +/// x1 = phi (x0, x2); +/// cnt1 = phi(cnt0, cnt2); +/// +/// cnt2 = cnt1 + 1; +/// ... +/// x2 = x1 & (x1 - 1); +/// ... +/// } while(x != 0); +/// +/// loop-exit: +/// \endcode +static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, + Instruction *&CntInst, PHINode *&CntPhi, + Value *&Var) { + // step 1: Check to see if the look-back branch match this pattern: + // "if (a!=0) goto loop-entry". + BasicBlock *LoopEntry; + Instruction *DefX2, *CountInst; + Value *VarX1, *VarX0; + PHINode *PhiX, *CountPhi; + + DefX2 = CountInst = nullptr; + VarX1 = VarX0 = nullptr; + PhiX = CountPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form. 
+ { + if (Value *T = matchCondition( + dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) + DefX2 = dyn_cast<Instruction>(T); + else + return false; + } + + // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" + { + if (!DefX2 || DefX2->getOpcode() != Instruction::And) + return false; + + BinaryOperator *SubOneOp; + + if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) + VarX1 = DefX2->getOperand(1); + else { + VarX1 = DefX2->getOperand(0); + SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); + } + if (!SubOneOp) + return false; + + Instruction *SubInst = cast<Instruction>(SubOneOp); + ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); + if (!Dec || + !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || + (SubInst->getOpcode() == Instruction::Add && + Dec->isAllOnesValue()))) { + return false; + } + } + + // step 3: Check the recurrence of variable X + { + PhiX = dyn_cast<PHINode>(VarX1); + if (!PhiX || + (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { + return false; + } + } + + // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1 + { + CountInst = nullptr; + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), + IterE = LoopEntry->end(); + Iter != IterE; Iter++) { + Instruction *Inst = &*Iter; + if (Inst->getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); + if (!Inc || !Inc->isOne()) + continue; + + PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); + if (!Phi || Phi->getParent() != LoopEntry) + continue; + + // Check if the result of the instruction is live out of the loop. + bool LiveOutLoop = false; + for (User *U : Inst->users()) { + if ((cast<Instruction>(U))->getParent() != LoopEntry) { + LiveOutLoop = true; + break; + } + } + + if (LiveOutLoop) { + CountInst = Inst; + CountPhi = Phi; + break; + } + } + + if (!CountInst) + return false; + } + + // step 5: check if the precondition is in this form: + // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" + { + auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); + if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) + return false; + + CntInst = CountInst; + CntPhi = CountPhi; + Var = T; + } + + return true; +} + +/// Recognizes a population count idiom in a non-countable loop. +/// +/// If detected, transforms the relevant code to issue the popcount intrinsic +/// function call, and returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizePopcount() { + if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) + return false; + + // Counting population is usually conducted by a few arithmetic instructions. + // Such instructions can be easily "absorbed" by vacant slots in a + // non-compact loop. Therefore, recognizing the popcount idiom only makes + // sense in a compact loop. + + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + BasicBlock *LoopBody = *(CurLoop->block_begin()); + if (LoopBody->size() >= 20) { + // The loop is too big, bail out. + return false; + } + + // It should have a preheader containing nothing but an unconditional branch.
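// The CFG shape being checked below, roughly (editorial sketch, hypothetical
// block names):
//
//   precond:                      ; PreCondBB, ends in a conditional branch
//     %guard = icmp ne i32 %x, 0
//     br i1 %guard, label %ph, label %exit
//   ph:                           ; preheader: a lone unconditional branch
//     br label %body
//   body:                         ; the single-block loop
//     ...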
+ BasicBlock *PH = CurLoop->getLoopPreheader(); + if (!PH) + return false; + if (&PH->front() != PH->getTerminator()) + return false; + auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); + if (!EntryBI || EntryBI->isConditional()) + return false; + + // It should have a precondition block where the generated popcount intrinsic + // function can be inserted. + auto *PreCondBB = PH->getSinglePredecessor(); + if (!PreCondBB) + return false; + auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + if (!PreCondBI || PreCondBI->isUnconditional()) + return false; + + Instruction *CntInst; + PHINode *CntPhi; + Value *Val; + if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val)) + return false; + + transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val); + return true; +} + +static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + DebugLoc DL) { + Value *Ops[] = {Val}; + Type *Tys[] = {Val->getType()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, + Instruction *CntInst, + PHINode *CntPhi, Value *Var) { + BasicBlock *PreHead = CurLoop->getLoopPreheader(); + auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + const DebugLoc DL = CntInst->getDebugLoc(); + + // Assuming that before the transformation, the loop looks like: + // if (x) // the precondition + // do { cnt++; x &= x - 1; } while(x); + + // Step 1: Insert the ctpop instruction at the end of the precondition block + IRBuilder<> Builder(PreCondBr); + Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; + { + PopCnt = createPopcntIntrinsic(Builder, Var, DL); + NewCount = PopCntZext = + Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); + + if (NewCount != PopCnt) + (cast<Instruction>(NewCount))->setDebugLoc(DL); + + // TripCnt is exactly the number of iterations the loop has. + TripCnt = NewCount; + + // If the population counter's initial value is not zero, insert Add Inst. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); + ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); + if (!InitConst || !InitConst->isZero()) { + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + (cast<Instruction>(NewCount))->setDebugLoc(DL); + } + } + + // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to + // "if (NewCount == 0) loop-exit". Without this change, the intrinsic + // function would be partially dead code, and downstream passes will drag + // it back from the precondition block to the preheader. + { + ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); + + Value *Opnd0 = PopCntZext; + Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); + if (PreCond->getOperand(0) != Var) + std::swap(Opnd0, Opnd1); + + ICmpInst *NewPreCond = cast<ICmpInst>( + Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); + PreCondBr->setCondition(NewPreCond); + + RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); + } + + // Step 3: Note that the population count is exactly the trip count of the + // loop in question, which enables us to convert the loop from a noncountable + // loop into a countable one. The benefit is twofold: + // + // - If the loop only counts population, the entire loop becomes dead after + // the transformation.
It is a lot easier to prove a countable loop dead + // than to prove a noncountable one. (In some C dialects, an infinite loop + // isn't dead even if it computes nothing useful. In general, DCE needs + // to prove a noncountable loop finite before safely deleting it.) + // + // - If the loop also performs something else, it remains alive. + // Since it is transformed to countable form, it can be aggressively + // optimized by optimizations which are in general not applicable + // to a noncountable loop. + // + // After this step, this loop (conceptually) would look like the following: + // newcnt = __builtin_ctpop(x); + // t = newcnt; + // if (x) + // do { cnt++; x &= x-1; t--; } while (t > 0); + BasicBlock *Body = *(CurLoop->block_begin()); + { + auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); + Type *Ty = TripCnt->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + + Builder.SetInsertPoint(LbCond); + Instruction *TcDec = cast<Instruction>( + Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), + "tcdec", false, true)); + + TcPhi->addIncoming(TripCnt, PreHead); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = + (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + } + + // Step 4: All the references to the original population counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctpop(). + CntInst->replaceUsesOutsideBlock(NewCount, Body); + + // Step 5: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index e125026..b4102fe 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -48,7 +48,7 @@ namespace { AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; @@ -112,7 +112,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Simplify instructions in the current basic block. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = BI++; + Instruction *I = &*BI++; // The first time through the loop ToSimplify is empty and we try to // simplify all instructions. On later iterations ToSimplify is not diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9d7e57f..4295235 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -99,7 +99,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, return false; if (St && !St->isSimple()) return false; - MemInstr.push_back(I); + MemInstr.push_back(&*I); } } @@ -176,7 +176,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } } - // We don't have a DepMatrix to check legality return false + // We don't have a DepMatrix to check legality; return false.
if (DepMatrix.size() == 0) return false; return true; @@ -331,9 +331,9 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - LoopInterchange *Pass) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass), - InnerLoopHasReduction(false) {} + LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), + PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -357,9 +357,10 @@ private: Loop *OuterLoop; Loop *InnerLoop; - /// Scev analysis. ScalarEvolution *SE; - LoopInterchange *CurrentPass; + LoopInfo *LI; + DominatorTree *DT; + bool PreserveLCSSA; bool InnerLoopHasReduction; }; @@ -371,7 +372,7 @@ public: LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} - /// Check if the loop interchange is profitable + /// Check if the loop interchange is profitable. bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); @@ -385,12 +386,12 @@ private: ScalarEvolution *SE; }; -/// LoopInterchangeTransform interchanges the loop +/// LoopInterchangeTransform interchanges the loop. class LoopInterchangeTransform { public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, - LoopInterchange *Pass, BasicBlock *LoopNestExit, + BasicBlock *LoopNestExit, bool InnerLoopContainsReductions) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LoopExit(LoopNestExit), @@ -424,21 +425,22 @@ private: bool InnerLoopHasReduction; }; -// Main LoopInterchange Pass +// Main LoopInterchange Pass. struct LoopInterchange : public FunctionPass { static char ID; ScalarEvolution *SE; LoopInfo *LI; DependenceAnalysis *DA; DominatorTree *DT; + bool PreserveLCSSA; LoopInterchange() : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DependenceAnalysis>(); @@ -447,11 +449,13 @@ struct LoopInterchange : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<DependenceAnalysis>(); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + // Build up a worklist of loop pairs to analyze. SmallVector<LoopVector, 8> Worklist; @@ -489,7 +493,7 @@ struct LoopInterchange : public FunctionPass { unsigned selectLoopForInterchange(LoopVector LoopList) { // TODO: Add a better heuristic to select the loop to be interchanged based - // on the dependece matrix. Currently we select the innermost loop. + // on the dependence matrix. Currently we select the innermost loop. 
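// For reference (editorial sketch): the classic shape the pass targets is a
// strided inner access under row-major layout, e.g.
//
//   for (int i = 0; i < N; ++i)       // outer
//     for (int j = 0; j < M; ++j)     // inner, stride of a full row
//       A[j][i] += B[j][i];
//
// Interchanging i and j makes the innermost accesses contiguous and improves
// cache reuse, provided the dependence matrix proves the swap legal.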
return LoopList.size() - 1; } @@ -544,7 +548,7 @@ struct LoopInterchange : public FunctionPass { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best posible position. + // Move the selected loop outwards to the best possible position. for (unsigned i = SelecLoopId; i > 0; i--) { bool Interchanged = processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); @@ -574,7 +578,8 @@ struct LoopInterchange : public FunctionPass { Loop *InnerLoop = LoopList[InnerLoopId]; Loop *OuterLoop = LoopList[OuterLoopId]; - LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this); + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, + PreserveLCSSA); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); return false; @@ -586,7 +591,7 @@ struct LoopInterchange : public FunctionPass { return false; } - LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this, + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); DEBUG(dbgs() << "Loops interchanged\n"); @@ -655,7 +660,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n"); // We do not have any basic block in between; now make sure the outer header - // and outer loop latch doesnt contain any unsafe instructions. + // and outer loop latch don't contain any unsafe instructions. if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || containsUnsafeInstructionsInLatch(OuterLoopLatch)) return false; @@ -698,9 +703,9 @@ bool LoopInterchangeLegality::findInductionAndReductions( return false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { RecurrenceDescriptor RD; + InductionDescriptor ID; PHINode *PHI = cast<PHINode>(I); - ConstantInt *StepValue = nullptr; - if (isInductionPHI(PHI, SE, StepValue)) + if (InductionDescriptor::isInductionPHI(PHI, SE, ID)) Inductions.push_back(PHI); else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) Reductions.push_back(PHI); @@ -836,7 +841,7 @@ bool LoopInterchangeLegality::currentLimitations() { else FoundInduction = true; } - // The loop latch ended and we didnt find the induction variable; return as + // The loop latch ended and we didn't find the induction variable; return as // a current limitation. if (!FoundInduction) return true; @@ -867,12 +872,14 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || isa<PHINode>(OuterLoopPreHeader->begin()) || !OuterLoopPreHeader->getUniquePredecessor()) { - OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass); + OuterLoopPreHeader = + InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); } if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || InnerLoopPreHeader == OuterLoop->getHeader()) { - InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass); + InnerLoopPreHeader = + InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); } // TODO: The loops could not be interchanged due to current limitations in the @@ -966,7 +973,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // TODO: Add Better Profitibility checks. + // TODO: Add better profitability checks.
// e.g. // 1) Construct dependency matrix and move the one with no loop carried dep // inside to enable vectorization. @@ -980,7 +987,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, if (Cost < 0) return true; - // It is not profitable as per current cache profitibility model. But check if + // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. bool ImprovesPar = isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix); @@ -996,7 +1003,7 @@ void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, return; } } - assert(false && "Couldn't find loop"); + llvm_unreachable("Couldn't find loop"); } void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, @@ -1045,7 +1052,7 @@ bool LoopInterchangeTransform::transform() { splitInnerLoopLatch(InnerIndexVar); DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); - // Splits the inner loops phi nodes out into a seperate basic block. + // Splits the inner loop's phi nodes out into a separate basic block. splitInnerLoopHeader(); DEBUG(dbgs() << "splitInnerLoopHeader Done\n"); } @@ -1113,8 +1120,8 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { auto &ToList = InsertBefore->getParent()->getInstList(); auto &FromList = FromBB->getInstList(); - ToList.splice(InsertBefore, FromList, FromList.begin(), - FromBB->getTerminator()); + ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), + FromBB->getTerminator()->getIterator()); } void LoopInterchangeTransform::adjustOuterLoopPreheader() { @@ -1181,8 +1188,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) return false; - BasicBlock *InnerLoopHeaderSucessor = InnerLoopHeader->getUniqueSuccessor(); - if (!InnerLoopHeaderSucessor) + BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); + if (!InnerLoopHeaderSuccessor) return false; // Adjust Loop Preheader and headers @@ -1198,11 +1205,11 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch) OuterLoopHeaderBI->setSuccessor(i, LoopExit); else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader) - OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSucessor); + OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor); } // Adjust reduction PHI's now that the incoming block has changed.
- updateIncomingBlock(InnerLoopHeaderSucessor, InnerLoopHeader, + updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, OuterLoopHeader); BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); @@ -1286,10 +1293,10 @@ bool LoopInterchangeTransform::adjustLoopLinks() { char LoopInterchange::ID = 0; INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp new file mode 100644 index 0000000..1064d08 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -0,0 +1,566 @@ +//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a loop-aware load elimination pass. +// +// It uses LoopAccessAnalysis to identify loop-carried dependences with a +// distance of one between stores and loads. These form the candidates for the +// transformation. The source value of each store is then propagated to the +// user of the corresponding load. This makes the load dead. +// +// The pass can also version the loop and add memchecks in order to prove that +// may-aliasing stores can't change the value in memory before it's read by the +// load. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include <forward_list> + +#define LLE_OPTION "loop-load-elim" +#define DEBUG_TYPE LLE_OPTION + +using namespace llvm; + +static cl::opt<unsigned> CheckPerElim( + "runtime-check-per-loop-load-elim", cl::Hidden, + cl::desc("Max number of memchecks allowed per eliminated load on average"), + cl::init(1)); + +static cl::opt<unsigned> LoadElimSCEVCheckThreshold( + "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Load Elimination")); + + +STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE"); + +namespace { + +/// \brief Represent a store-to-load forwarding candidate. +struct StoreToLoadForwardingCandidate { + LoadInst *Load; + StoreInst *Store; + + StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store) + : Load(Load), Store(Store) {} + + /// \brief Return true if the dependence from the store to the load has a + /// distance of one. E.g.
A[i+1] = A[i] + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { + Value *LoadPtr = Load->getPointerOperand(); + Value *StorePtr = Store->getPointerOperand(); + Type *LoadPtrType = LoadPtr->getType(); + Type *LoadType = LoadPtrType->getPointerElementType(); + + assert(LoadPtrType->getPointerAddressSpace() == + StorePtr->getType()->getPointerAddressSpace() && + LoadType == StorePtr->getType()->getPointerElementType() && + "Should be a known dependence"); + + auto &DL = Load->getParent()->getModule()->getDataLayout(); + unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); + + auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr)); + auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr)); + + // We don't need to check non-wrapping here because forward/backward + // dependence wouldn't be valid if these weren't monotonic accesses. + auto *Dist = cast<SCEVConstant>( + PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); + const APInt &Val = Dist->getAPInt(); + return Val.abs() == TypeByteSize; + } + + Value *getLoadPtr() const { return Load->getPointerOperand(); } + +#ifndef NDEBUG + friend raw_ostream &operator<<(raw_ostream &OS, + const StoreToLoadForwardingCandidate &Cand) { + OS << *Cand.Store << " -->\n"; + OS.indent(2) << *Cand.Load << "\n"; + return OS; + } +#endif +}; + +/// \brief Check if the store dominates all latches, so as long as there is no +/// intervening store this value will be loaded in the next iteration. +bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, + DominatorTree *DT) { + SmallVector<BasicBlock *, 8> Latches; + L->getLoopLatches(Latches); + return std::all_of(Latches.begin(), Latches.end(), + [&](const BasicBlock *Latch) { + return DT->dominates(StoreBlock, Latch); + }); +} + +/// \brief The per-loop class that does most of the work. +class LoadEliminationForLoop { +public: + LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, + DominatorTree *DT) + : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {} + + /// \brief Look through the loop-carried and loop-independent dependences in + /// this loop and find store->load dependences. + /// + /// Note that no candidate is returned if LAA has failed to analyze the loop + /// (e.g. if it's not bottom-tested, contains volatile memops, etc.) + std::forward_list<StoreToLoadForwardingCandidate> + findStoreToLoadDependences(const LoopAccessInfo &LAI) { + std::forward_list<StoreToLoadForwardingCandidate> Candidates; + + const auto *Deps = LAI.getDepChecker().getDependences(); + if (!Deps) + return Candidates; + + // Find store->load dependences (consequently true dep). Both lexically + // forward and backward dependences qualify. Disqualify loads that have + // other unknown dependences. + + SmallSet<Instruction *, 4> LoadsWithUnknownDepedence; + + for (const auto &Dep : *Deps) { + Instruction *Source = Dep.getSource(LAI); + Instruction *Destination = Dep.getDestination(LAI); + + if (Dep.Type == MemoryDepChecker::Dependence::Unknown) { + if (isa<LoadInst>(Source)) + LoadsWithUnknownDepedence.insert(Source); + if (isa<LoadInst>(Destination)) + LoadsWithUnknownDepedence.insert(Destination); + continue; + } + + if (Dep.isBackward()) + // Note that the designations source and destination follow the program + // order, i.e. source is always first. (The direction is given by the + // DepType.) 
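// Editorial sketch of the two shapes, with hypothetical code:
//
//   A[i]     = ...;    // store (source)
//   ...      = A[i];   // load (destination)   -> lexically forward
//
//   ...      = A[i];   // load (source)
//   A[i + 1] = ...;    // store (destination)  -> backward, carried over the
//                      //    backedge; swapped below so the store is Source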
+ std::swap(Source, Destination); + else + assert(Dep.isForward() && "Needs to be a forward dependence"); + + auto *Store = dyn_cast<StoreInst>(Source); + if (!Store) + continue; + auto *Load = dyn_cast<LoadInst>(Destination); + if (!Load) + continue; + Candidates.emplace_front(Load, Store); + } + + if (!LoadsWithUnknownDepedence.empty()) + Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) { + return LoadsWithUnknownDepedence.count(C.Load); + }); + + return Candidates; + } + + /// \brief Return the index of the instruction according to program order. + unsigned getInstrIndex(Instruction *Inst) { + auto I = InstOrder.find(Inst); + assert(I != InstOrder.end() && "No index for instruction"); + return I->second; + } + + /// \brief If a load has multiple candidates associated (i.e. different + /// stores), it means that it could be forwarding from multiple stores + /// depending on control flow. Remove these candidates. + /// + /// Here, we rely on LAA to include the relevant loop-independent dependences. + /// LAA is known to omit these in the very simple case when the read and the + /// write within an alias set always take place using the *same* pointer. + /// + /// However, we know that this is not the case here, i.e. we can rely on LAA + /// to provide us with loop-independent dependences for the cases we're + /// interested in. Consider the case for example where a loop-independent + /// dependence S1->S2 invalidates the forwarding S3->S2. + /// + /// A[i] = ... (S1) + /// ... = A[i] (S2) + /// A[i+1] = ... (S3) + /// + /// LAA will perform dependence analysis here because there are two + /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]). + void removeDependencesFromMultipleStores( + std::forward_list<StoreToLoadForwardingCandidate> &Candidates) { + // If Store is nullptr it means that we have multiple stores forwarding to + // this load. + typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *> + LoadToSingleCandT; + LoadToSingleCandT LoadToSingleCand; + + for (const auto &Cand : Candidates) { + bool NewElt; + LoadToSingleCandT::iterator Iter; + + std::tie(Iter, NewElt) = + LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand)); + if (!NewElt) { + const StoreToLoadForwardingCandidate *&OtherCand = Iter->second; + // Already multiple stores forward to this load. + if (OtherCand == nullptr) + continue; + + // Handle the very basic case where the two stores are in the same + // block, so deciding which one forwards is easy. The later one forwards + // as long as they both have a dependence distance of one to the load. + if (Cand.Store->getParent() == OtherCand->Store->getParent() && + Cand.isDependenceDistanceOfOne(PSE) && + OtherCand->isDependenceDistanceOfOne(PSE)) { + // They are in the same block; the later one will forward to the load. + if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) + OtherCand = &Cand; + } else + OtherCand = nullptr; + } + } + + Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) { + if (LoadToSingleCand[Cand.Load] != &Cand) { + DEBUG(dbgs() << "Removing from candidates: \n" << Cand + << " The load may have multiple stores forwarding to " + << "it\n"); + return true; + } + return false; + }); + } + + /// \brief Given two pointer operations by their RuntimePointerChecking + /// indices, return true if they require an alias check. + /// + /// We need a check if one is a pointer for a candidate load and the other is + /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2, + const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath, + const std::set<Value *> &CandLoadPtrs) { + Value *Ptr1 = + LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue; + Value *Ptr2 = + LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue; + return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) || + (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1))); + } + + /// \brief Return pointers that are possibly written to on the path from a + /// forwarding store to a load. + /// + /// These pointers need to be alias-checked against the forwarding candidates. + SmallSet<Value *, 4> findPointersWrittenOnForwardingPath( + const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { + // From FirstStore to LastLoad neither of the elimination candidate loads + // should overlap with any of the stores. + // + // E.g.: + // + // st1 C[i] + // ld1 B[i] <-------, + // ld0 A[i] <----, | * LastLoad + // ... | | + // st2 E[i] | | + // st3 B[i+1] -- | -' * FirstStore + // st0 A[i+1] ---' + // st4 D[i] + // + // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with + // ld0. + + LoadInst *LastLoad = + std::max_element(Candidates.begin(), Candidates.end(), + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Load) < getInstrIndex(B.Load); + }) + ->Load; + StoreInst *FirstStore = + std::min_element(Candidates.begin(), Candidates.end(), + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Store) < + getInstrIndex(B.Store); + }) + ->Store; + + // We're looking for stores after the first forwarding store until the end + // of the loop, then from the beginning of the loop until the last + // forwarded-to load. Collect the pointers of these stores. + SmallSet<Value *, 4> PtrsWrittenOnFwdingPath; + + auto InsertStorePtr = [&](Instruction *I) { + if (auto *S = dyn_cast<StoreInst>(I)) + PtrsWrittenOnFwdingPath.insert(S->getPointerOperand()); + }; + const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions(); + std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1, + MemInstrs.end(), InsertStorePtr); + std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)], + InsertStorePtr); + + return PtrsWrittenOnFwdingPath; + } + + /// \brief Determine the pointer alias checks to prove that there are no + /// intervening stores. + SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks( + const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { + + SmallSet<Value *, 4> PtrsWrittenOnFwdingPath = + findPointersWrittenOnForwardingPath(Candidates); + + // Collect the pointers of the candidate loads. + // FIXME: SmallSet does not work with std::inserter.
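// (Editorial note on the FIXME above: std::inserter yields a
// std::insert_iterator whose assignment calls c.insert(pos, value), an
// overload std::set provides but llvm::SmallSet does not. A minimal sketch:
//
//   std::set<int> S;
//   *std::inserter(S, S.begin()) = 42;  // OK: set::insert(const_iterator, v)
//   llvm::SmallSet<int, 4> SS;          // only insert(const T &) and no
//                                       // iterators, so std::inserter(SS,
//                                       // ...) would not compile.)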
+ std::set<Value *> CandLoadPtrs; + std::transform(Candidates.begin(), Candidates.end(), + std::inserter(CandLoadPtrs, CandLoadPtrs.begin()), + std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr)); + + const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks(); + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (auto PtrIdx1 : Check.first->Members) + for (auto PtrIdx2 : Check.second->Members) + if (needsChecking(PtrIdx1, PtrIdx2, + PtrsWrittenOnFwdingPath, CandLoadPtrs)) + return true; + return false; + }); + + DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n"); + DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + + return Checks; + } + + /// \brief Perform the transformation for a candidate. + void + propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, + SCEVExpander &SEE) { + // + // loop: + // %x = load %gep_i + // = ... %x + // store %y, %gep_i_plus_1 + // + // => + // + // ph: + // %x.initial = load %gep_0 + // loop: + // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] + // %x = load %gep_i <---- now dead + // = ... %x.storeforward + // store %y, %gep_i_plus_1 + + Value *Ptr = Cand.Load->getPointerOperand(); + auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr)); + auto *PH = L->getLoopPreheader(); + Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), + PH->getTerminator()); + Value *Initial = + new LoadInst(InitialPtr, "load_initial", PH->getTerminator()); + PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", + &L->getHeader()->front()); + PHI->addIncoming(Initial, PH); + PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch()); + + Cand.Load->replaceAllUsesWith(PHI); + } + + /// \brief Top-level driver for each loop: find store->load forwarding + /// candidates, add run-time checks and perform transformation. + bool processLoop() { + DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() + << "\" checking " << *L << "\n"); + // Look for store-to-load forwarding cases across the + // backedge. E.g.: + // + // loop: + // %x = load %gep_i + // = ... %x + // store %y, %gep_i_plus_1 + // + // => + // + // ph: + // %x.initial = load %gep_0 + // loop: + // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] + // %x = load %gep_i <---- now dead + // = ... %x.storeforward + // store %y, %gep_i_plus_1 + + // First start with store->load dependences. + auto StoreToLoadDependences = findStoreToLoadDependences(LAI); + if (StoreToLoadDependences.empty()) + return false; + + // Generate an index for each load and store according to the original + // program order. This will be used later. + InstOrder = LAI.getDepChecker().generateInstructionOrderMap(); + + // To keep things simple for now, remove those where the load is potentially + // fed by multiple stores. + removeDependencesFromMultipleStores(StoreToLoadDependences); + if (StoreToLoadDependences.empty()) + return false; + + // Filter the candidates further. + SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; + unsigned NumForwarding = 0; + for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) { + DEBUG(dbgs() << "Candidate " << Cand); + // Make sure that the stored value is available everywhere in the loop in + // the next iteration.
+ if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT)) + continue; + + // Check whether the SCEV difference is the same as the induction step, + // thus we load the value in the next iteration. + if (!Cand.isDependenceDistanceOfOne(PSE)) + continue; + + ++NumForwarding; + DEBUG(dbgs() + << NumForwarding + << ". Valid store-to-load forwarding across the loop backedge\n"); + Candidates.push_back(Cand); + } + if (Candidates.empty()) + return false; + + // Check intervening may-alias stores. These need runtime checks for alias + // disambiguation. + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks = + collectMemchecks(Candidates); + + // Too many checks are likely to outweigh the benefits of forwarding. + if (Checks.size() > Candidates.size() * CheckPerElim) { + DEBUG(dbgs() << "Too many run-time checks needed.\n"); + return false; + } + + if (LAI.PSE.getUnionPredicate().getComplexity() > + LoadElimSCEVCheckThreshold) { + DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + return false; + } + + // Point of no return: start the transformation. First, version the loop if + // necessary. + if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) { + LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); + LV.setAliasChecks(std::move(Checks)); + LV.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LV.versionLoop(); + } + + // Next, propagate the value stored by the store to the users of the load. + // Also for the first iteration, generate the initial value of the load. + SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), + "storeforward"); + for (const auto &Cand : Candidates) + propagateStoredValueToLoadUsers(Cand, SEE); + NumLoopLoadEliminted += NumForwarding; + + return true; + } + +private: + Loop *L; + + /// \brief Maps the load/store instructions to their index according to + /// program order. + DenseMap<Instruction *, unsigned> InstOrder; + + // Analyses used. + LoopInfo *LI; + const LoopAccessInfo &LAI; + DominatorTree *DT; + PredicatedScalarEvolution PSE; +}; + +/// \brief The pass. Most of the work is delegated to the per-loop +/// LoadEliminationForLoop class. +class LoopLoadElimination : public FunctionPass { +public: + LoopLoadElimination() : FunctionPass(ID) { + initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *LAA = &getAnalysis<LoopAccessAnalysis>(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + // Build up a worklist of inner loops to transform. This is necessary as + // the act of versioning a loop creates new loops and can invalidate + // iterators across the loops. + SmallVector<Loop *, 8> Worklist; + + for (Loop *TopLevelLoop : *LI) + for (Loop *L : depth_first(TopLevelLoop)) + // We only handle inner-most loops. + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + for (Loop *L : Worklist) { + const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); + // The actual work is performed by LoadEliminationForLoop. + LoadEliminationForLoop LEL(L, LI, LAI, DT); + Changed |= LEL.processLoop(); + } + + // All worklist loops have now been processed.
+ return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + static char ID; +}; +} + +char LoopLoadElimination::ID; +static const char LLE_name[] = "Loop Load Elimination"; + +INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) + +namespace llvm { +FunctionPass *createLoopLoadEliminationPass() { + return new LoopLoadElimination(); +} +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index ed103e6..27c2d88 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -147,12 +147,12 @@ namespace { bool runOnLoop(Loop *L, LPPassManager &LPM) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -162,11 +162,15 @@ namespace { ScalarEvolution *SE; TargetLibraryInfo *TLI; DominatorTree *DT; + bool PreserveLCSSA; typedef SmallVector<Instruction *, 16> SmallInstructionVector; typedef SmallSet<Instruction *, 16> SmallInstructionSet; - // A chain of isomorphic instructions, indentified by a single-use PHI, + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> IVToIncMap; + + // A chain of isomorphic instructions, identified by a single-use PHI // representing a reduction. Only the last value may be used outside the // loop. struct SimpleLoopReduction { @@ -300,22 +304,6 @@ namespace { // The functions below can be called after we've finished processing all // instructions in the loop, and we know which reductions were selected. - // Is the provided instruction the PHI of a reduction selected for - // rerolling? 
- bool isSelectedPHI(Instruction *J) { - if (!isa<PHINode>(J)) - return false; - - for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - if (cast<Instruction>(J) == PossibleReds[i].getPHI()) - return true; - } - - return false; - } - bool validateSelected(); void replaceSelected(); @@ -335,7 +323,7 @@ namespace { // x[i*3+1] = y2 // x[i*3+2] = y3 // - // Base instruction -> i*3 + // Base instruction -> i*3 // +---+----+ // / | \ // ST[y1] +1 +2 <-- Roots @@ -366,8 +354,11 @@ namespace { struct DAGRootTracker { DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {} + TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA, + DenseMap<Instruction *, int64_t> &IncrMap) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), + PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {} /// Stage 1: Find all the DAG roots for the induction variable. bool findRoots(); @@ -413,11 +404,14 @@ namespace { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; + DominatorTree *DT; + LoopInfo *LI; + bool PreserveLCSSA; // The loop induction variable. Instruction *IV; // Loop step amount. - uint64_t Inc; + int64_t Inc; // Loop reroll count; if Inc == 1, this records the scaling applied // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; // If Inc is not 1, Scale = Inc. @@ -430,6 +424,8 @@ namespace { // they are used in (or specially, IL_All for instructions // used in the loop increment mechanism). UsesTy Uses; + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> &IVToIncMap; }; void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); @@ -442,10 +438,10 @@ namespace { char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -477,21 +473,20 @@ void LoopReroll::collectPossibleIVs(Loop *L, continue; if (const SCEVAddRecExpr *PHISCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) { if (PHISCEV->getLoop() != L) continue; if (!PHISCEV->isAffine()) continue; if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { - if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + const APInt &AInt = IncSCEV->getAPInt().abs(); + if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc)) continue; - if (IncSCEV->getValue()->uge(MaxInc)) - continue; - - DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << - *PHISCEV << "\n"); - PossibleIVs.push_back(I); + IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV + << "\n"); + PossibleIVs.push_back(&*I); } } } @@ -552,7 +547,7 @@ void LoopReroll::collectPossibleReductions(Loop *L, if (!I->getType()->isSingleValueType()) continue; - SimpleLoopReduction SLR(I, L); + SimpleLoopReduction SLR(&*I, L); if (!SLR.valid()) continue; @@ -699,17 +694,11 @@ collectPossibleRoots(Instruction *Base, 
std::map<int64_t,Instruction*> &Roots) { } } - int64_t V = CI->getValue().getSExtValue(); + int64_t V = std::abs(CI->getValue().getSExtValue()); if (Roots.find(V) != Roots.end()) // No duplicates, please. return false; - // FIXME: Add support for negative values. - if (V < 0) { - DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); - return false; - } - Roots[V] = cast<Instruction>(I); } @@ -731,7 +720,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { unsigned NumBaseUses = BaseUsers.size(); if (NumBaseUses == 0) NumBaseUses = Roots.begin()->second->getNumUses(); - + // Check that every node has the same number of users. for (auto &KV : Roots) { if (KV.first == 0) @@ -744,7 +733,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { } } - return true; + return true; } bool LoopReroll::DAGRootTracker:: @@ -787,7 +776,7 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { if (!collectPossibleRoots(IVU, V)) return false; - // If we didn't get a root for index zero, then IVU must be + // If we didn't get a root for index zero, then IVU must be // subsumed. if (V.find(0) == V.end()) SubsumedInsts.insert(IVU); @@ -818,13 +807,10 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { } bool LoopReroll::DAGRootTracker::findRoots() { - - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); - Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> - getValue()->getZExtValue(); + Inc = IVToIncMap[IV]; assert(RootSets.empty() && "Unclean state!"); - if (Inc == 1) { + if (std::abs(Inc) == 1) { for (auto *IVU : IV->users()) { if (isLoopIncrement(IVU, IV)) LoopIncs.push_back(cast<Instruction>(IVU)); @@ -996,6 +982,25 @@ bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, return false; } +static bool isIgnorableInst(const Instruction *I) { + if (isa<DbgInfoIntrinsic>(I)) + return true; + const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + switch (II->getIntrinsicID()) { + default: + return false; + case llvm::Intrinsic::annotation: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + // TODO: the following intrinsics may also be whitelisted: + // lifetime_start, lifetime_end, invariant_start, invariant_end + return true; + } + return false; +} + bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // We now need to check for equivalence of the use graph of each root with // that of the primary induction variable (excluding the roots). Our goal @@ -1029,7 +1034,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // Make sure all instructions in the loop are in one and only one // set. for (auto &KV : Uses) { - if (KV.second.count() != 1) { + if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " << *KV.first << " (#uses=" << KV.second.count() << ")\n"); return false; @@ -1103,15 +1108,15 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { " vs. " << *RootInst << "\n"); return false; } - + RootIt = TryIt; RootInst = TryIt->first; } // All instructions between the last root and this root - // may belong to some other iteration. If they belong to a + // may belong to some other iteration. If they belong to a // future iteration, then they're dangerous to alias with. 
- // + // // Note that because we allow a limited amount of flexibility in the order // that we visit nodes, LastRootIt might be *before* RootIt, in which // case we've already checked this set of instructions so we shouldn't @@ -1267,6 +1272,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { ++J; } + bool Negative = IVToIncMap[IV] < 0; const DataLayout &DL = Header->getModule()->getDataLayout(); // We need to create a new induction variable for each different BaseInst. @@ -1275,13 +1281,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); const SCEV *Start = RealIVSCEV->getStart(); - const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> - (SE->getAddRecExpr(Start, - SE->getConstant(RealIVSCEV->getType(), 1), - L, SCEV::FlagAnyWrap)); + const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr( + Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L, + SCEV::FlagAnyWrap)); { // Limit the lifetime of SCEVExpander. SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); + Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front()); for (auto &KV : Uses) { if (KV.second.find_first() == 0) @@ -1294,8 +1299,8 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = - SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); + const SCEV *ICMinus1SCEV = SE->getMinusSCEV( + ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1)); Value *ICMinus1; // Iteration count minus 1 if (isa<SCEVConstant>(ICMinus1SCEV)) { @@ -1303,7 +1308,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { } else { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) - Preheader = InsertPreheaderForLoop(L, Parent); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), Preheader->getTerminator()); @@ -1444,13 +1449,14 @@ void LoopReroll::ReductionTracker::replaceSelected() { bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions) { - DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI); + DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, + IVToIncMap); if (!DAGRoots.findRoots()) return false; DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV << "\n"); - + if (!DAGRoots.validate(Reductions)) return false; if (!Reductions.validateSelected()) @@ -1469,11 +1475,12 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << @@ -1490,13 +1497,13 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; const SCEV *LIBETC = SE->getBackedgeTakenCount(L); - const SCEV *IterCount = - 
SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType())); DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); // First, we need to find the induction variable with respect to which we can // reroll (there may be several possible options). SmallInstructionVector PossibleIVs; + IVToIncMap.clear(); collectPossibleIVs(L, PossibleIVs); if (PossibleIVs.empty()) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index a675e12..5e6c2da 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,11 +13,15 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -41,95 +45,6 @@ DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); -namespace { - - class LoopRotate : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { - initializeLoopRotatePass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); - } - - // LCSSA form makes instruction renaming easier. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L, bool SimplifiedLatch); - - private: - unsigned MaxHeaderSize; - LoopInfo *LI; - const TargetTransformInfo *TTI; - AssumptionCache *AC; - DominatorTree *DT; - }; -} - -char LoopRotate::ID = 0; -INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) - -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { - return new LoopRotate(MaxHeaderSize); -} - -/// Rotate Loop L as many times as possible. Return true if -/// the loop is rotated at least once. -bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - // Save the loop metadata. 
- MDNode *LoopMD = L->getLoopID(); - - Function &F = *L->getHeader()->getParent(); - - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - - // Simplify the loop latch before attempting to rotate the header - // upward. Rotation may not be needed if the loop tail can be folded into the - // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L); - - // One loop can be rotated multiple times. - bool MadeChange = false; - while (rotateLoop(L, SimplifiedLatch)) { - MadeChange = true; - SimplifiedLatch = false; - } - - // Restore the loop metadata. - // NB! We presume LoopRotation DOESN'T ADD its own metadata. - if ((MadeChange || SimplifiedLatch) && LoopMD) - L->setLoopID(LoopMD); - - return MadeChange; -} /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by @@ -147,7 +62,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // as necessary. SSAUpdater SSA; for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = I; + Value *OrigHeaderVal = &*I; // If there are no uses of the value (e.g. because it returns void), there // is nothing to rewrite. @@ -196,127 +111,6 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End, Loop *L) { - bool seenIncrement = false; - bool MultiExitLoop = false; - - if (!L->getExitingBlock()) - MultiExitLoop = true; - - for (BasicBlock::iterator I = Begin; I != End; ++I) { - - if (!isSafeToSpeculativelyExecute(I)) - return false; - - if (isa<DbgInfoIntrinsic>(I)) - continue; - - switch (I->getOpcode()) { - default: - return false; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GEPOperator>(I)->hasAllConstantIndices()) - return false; - // fall-thru to increment case - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: { - Value *IVOpnd = !isa<Constant>(I->getOperand(0)) - ? I->getOperand(0) - : !isa<Constant>(I->getOperand(1)) - ? I->getOperand(1) - : nullptr; - if (!IVOpnd) - return false; - - // If increment operand is used outside of the loop, this speculation - // could cause extra live range interference. - if (MultiExitLoop) { - for (User *UseI : IVOpnd->users()) { - auto *UserInst = cast<Instruction>(UseI); - if (!L->contains(UserInst)) - return false; - } - } - - if (seenIncrement) - return false; - seenIncrement = true; - break; - } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - // ignore type conversions - break; - } - } - return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. 
In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || Latch->hasAddressTaken()) - return false; - - BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); - if (!Jmp || !Jmp->isUnconditional()) - return false; - - BasicBlock *LastExit = Latch->getSinglePredecessor(); - if (!LastExit || !L->isLoopExiting(LastExit)) - return false; - - BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); - if (!BI) - return false; - - if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L)) - return false; - - DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); - - // Hoist the instructions from Latch into LastExit. - LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); - return true; -} - /// Rotate loop LP. Return true if the loop is rotated. /// /// \param SimplifiedLatch is true if the latch was just folded into the final @@ -327,7 +121,10 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { +static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, + bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -382,7 +179,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Anything ScalarEvolution may know about this loop or the PHI nodes // in its header will soon be invalidated. - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + if (SE) SE->forgetLoop(L); DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); @@ -420,7 +217,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); while (I != E) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // If the instruction's operands are invariant and it doesn't read or write // memory, then it is safe to hoist. Doing this doesn't change the order of @@ -465,8 +262,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's // successors by duplicating their incoming values for OrigHeader. 
TerminatorInst *TI = OrigHeader->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); + for (BasicBlock *SuccBB : TI->successors()) + for (BasicBlock::iterator BI = SuccBB->begin(); PHINode *PN = dyn_cast<PHINode>(BI); ++BI) PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); @@ -607,3 +404,221 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { ++NumRotated; return true; } + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, + BasicBlock::iterator End, Loop *L) { + bool seenIncrement = false; + bool MultiExitLoop = false; + + if (!L->getExitingBlock()) + MultiExitLoop = true; + + for (BasicBlock::iterator I = Begin; I != End; ++I) { + + if (!isSafeToSpeculativelyExecute(&*I)) + return false; + + if (isa<DbgInfoIntrinsic>(I)) + continue; + + switch (I->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast<GEPOperator>(I)->hasAllConstantIndices()) + return false; + // fall-thru to increment case + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: { + Value *IVOpnd = !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) + ? I->getOperand(1) + : nullptr; + if (!IVOpnd) + return false; + + // If increment operand is used outside of the loop, this speculation + // could cause extra live range interference. + if (MultiExitLoop) { + for (User *UseI : IVOpnd->users()) { + auto *UserInst = cast<Instruction>(UseI); + if (!L->contains(UserInst)) + return false; + } + } + + if (seenIncrement) + return false; + seenIncrement = true; + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // ignore type conversions + break; + } + } + return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return false; + + BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return false; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return false; + + BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); + if (!BI) + return false; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) + return false; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. 
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), + Latch->begin(), Jmp->getIterator()); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DT) + DT->eraseNode(Latch); + Latch->eraseFromParent(); + return true; +} + +/// Rotate \c L as many times as possible. Return true if the loop is rotated +/// at least once. +static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, + AssumptionCache *AC, DominatorTree *DT, + ScalarEvolution *SE) { + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT); + + // One loop can be rotated multiple times. + bool MadeChange = false; + while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) { + MadeChange = true; + SimplifiedLatch = false; + } + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + + return MadeChange; +} + +namespace { + +class LoopRotate : public LoopPass { + unsigned MaxHeaderSize; + +public: + static char ID; // Pass ID, replacement for typeid + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); + } + + // LCSSA form makes instruction renaming easier. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipOptnoneFunction(L)) + return false; + Function &F = *L->getHeader()->getParent(); + + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + auto *SE = SEWP ? 
&SEWP->getSE() : nullptr; + + return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE); + } +}; +} + +char LoopRotate::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) + +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 4b59f3d..2101225 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -105,10 +105,33 @@ static bool StressIVChain = false; namespace { -/// RegSortData - This class holds data which is used to order reuse candidates. +struct MemAccessTy { + /// Used in situations where the accessed memory type is unknown. + static const unsigned UnknownAddressSpace = ~0u; + + Type *MemTy; + unsigned AddrSpace; + + MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {} + + MemAccessTy(Type *Ty, unsigned AS) : + MemTy(Ty), AddrSpace(AS) {} + + bool operator==(MemAccessTy Other) const { + return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; + } + + bool operator!=(MemAccessTy Other) const { return !(*this == Other); } + + static MemAccessTy getUnknown(LLVMContext &Ctx) { + return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace); + } +}; + +/// This class holds data which is used to order reuse candidates. class RegSortData { public: - /// UsedByIndices - This represents the set of LSRUse indices which reference + /// This represents the set of LSRUse indices which reference /// a particular register. SmallBitVector UsedByIndices; @@ -122,16 +145,14 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegSortData::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// RegUseTracker - Map register candidates to information about how they are -/// used. +/// Map register candidates to information about how they are used. 
class RegUseTracker { typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; @@ -139,9 +160,9 @@ class RegUseTracker { SmallVector<const SCEV *, 16> RegSequence; public: - void CountRegister(const SCEV *Reg, size_t LUIdx); - void DropRegister(const SCEV *Reg, size_t LUIdx); - void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); + void countRegister(const SCEV *Reg, size_t LUIdx); + void dropRegister(const SCEV *Reg, size_t LUIdx); + void swapAndDropUse(size_t LUIdx, size_t LastLUIdx); bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; @@ -160,7 +181,7 @@ public: } void -RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) { std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.insert(std::make_pair(Reg, RegSortData())); RegSortData &RSD = Pair.first->second; @@ -171,7 +192,7 @@ RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) { RegUsesTy::iterator It = RegUsesMap.find(Reg); assert(It != RegUsesMap.end()); RegSortData &RSD = It->second; @@ -180,7 +201,7 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { +RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) { assert(LUIdx <= LastLUIdx); // Update RegUses. The data structure is not optimized for this purpose; @@ -219,9 +240,8 @@ void RegUseTracker::clear() { namespace { -/// Formula - This class holds information that describes a formula for -/// computing satisfying a use. It may include broken-out immediates and scaled -/// registers. +/// This class holds information that describes a formula for computing +/// satisfying a use. It may include broken-out immediates and scaled registers. struct Formula { /// Global base address used for complex addressing. GlobalValue *BaseGV; @@ -235,8 +255,8 @@ struct Formula { /// The scale of any complex addressing. int64_t Scale; - /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty. The canonical representation of a formula is + /// The list of "base" registers for this use. When this is non-empty. The + /// canonical representation of a formula is /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). /// #1 enforces that the scaled register is always used when at least two @@ -247,31 +267,31 @@ struct Formula { /// form. SmallVector<const SCEV *, 4> BaseRegs; - /// ScaledReg - The 'scaled' register for this use. This should be non-null - /// when Scale is not zero. + /// The 'scaled' register for this use. This should be non-null when Scale is + /// not zero. const SCEV *ScaledReg; - /// UnfoldedOffset - An additional constant offset which added near the - /// use. This requires a temporary register, but the offset itself can - /// live in an add immediate field rather than a register. + /// An additional constant offset which added near the use. This requires a + /// temporary register, but the offset itself can live in an add immediate + /// field rather than a register. 
int64_t UnfoldedOffset; Formula() : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(nullptr), UnfoldedOffset(0) {} - void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); + void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); bool isCanonical() const; - void Canonicalize(); + void canonicalize(); - bool Unscale(); + bool unscale(); size_t getNumRegs() const; Type *getType() const; - void DeleteBaseReg(const SCEV *&S); + void deleteBaseReg(const SCEV *&S); bool referencesReg(const SCEV *S) const; bool hasRegsUsedByUsesOtherThan(size_t LUIdx, @@ -283,7 +303,7 @@ struct Formula { } -/// DoInitialMatch - Recursion helper for InitialMatch. +/// Recursion helper for initialMatch. static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl<const SCEV *> &Good, SmallVectorImpl<const SCEV *> &Bad, @@ -336,10 +356,9 @@ static void DoInitialMatch(const SCEV *S, Loop *L, Bad.push_back(S); } -/// InitialMatch - Incorporate loop-variant parts of S into this Formula, -/// attempting to keep all loop-invariant and loop-computable values in a -/// single base register. -void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { +/// Incorporate loop-variant parts of S into this Formula, attempting to keep +/// all loop-invariant and loop-computable values in a single base register. +void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { SmallVector<const SCEV *, 4> Good; SmallVector<const SCEV *, 4> Bad; DoInitialMatch(S, L, Good, Bad, SE); @@ -355,7 +374,7 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } - Canonicalize(); + canonicalize(); } /// \brief Check whether or not this formula statisfies the canonical @@ -373,7 +392,7 @@ bool Formula::isCanonical() const { /// field. Otherwise, we would have to do special cases everywhere in LSR /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... /// On the other hand, 1*reg should be canonicalized into reg. -void Formula::Canonicalize() { +void Formula::canonicalize() { if (isCanonical()) return; // So far we did not need this case. This is easy to implement but it is @@ -394,7 +413,7 @@ void Formula::Canonicalize() { /// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2. /// \return true if it was possible to get rid of the scale, false otherwise. /// \note After this operation the formula may not be in the canonical form. -bool Formula::Unscale() { +bool Formula::unscale() { if (Scale != 1) return false; Scale = 0; @@ -403,15 +422,14 @@ bool Formula::Unscale() { return true; } -/// getNumRegs - Return the total number of register operands used by this -/// formula. This does not include register uses implied by non-constant -/// addrec strides. +/// Return the total number of register operands used by this formula. This does +/// not include register uses implied by non-constant addrec strides. size_t Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } -/// getType - Return the type of this formula, if it has one, or null -/// otherwise. This type is meaningless except for the bit size. +/// Return the type of this formula, if it has one, or null otherwise. This type +/// is meaningless except for the bit size. Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? 
ScaledReg->getType() : @@ -419,21 +437,21 @@ Type *Formula::getType() const { nullptr; } -/// DeleteBaseReg - Delete the given base reg from the BaseRegs list. -void Formula::DeleteBaseReg(const SCEV *&S) { +/// Delete the given base reg from the BaseRegs list. +void Formula::deleteBaseReg(const SCEV *&S) { if (&S != &BaseRegs.back()) std::swap(S, BaseRegs.back()); BaseRegs.pop_back(); } -/// referencesReg - Test if this formula references the given register. +/// Test if this formula references the given register. bool Formula::referencesReg(const SCEV *S) const { return S == ScaledReg || std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); } -/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers -/// which are used by uses other than the use with the given index. +/// Test whether this formula uses registers which are used by uses other than +/// the use with the given index. bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, const RegUseTracker &RegUses) const { if (ScaledReg) @@ -481,30 +499,29 @@ void Formula::print(raw_ostream &OS) const { } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Formula::dump() const { print(errs()); errs() << '\n'; } -#endif -/// isAddRecSExtable - Return true if the given addrec can be sign-extended -/// without changing its value. +/// Return true if the given addrec can be sign-extended without changing its +/// value. static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); } -/// isAddSExtable - Return true if the given add can be sign-extended -/// without changing its value. +/// Return true if the given add can be sign-extended without changing its +/// value. static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); } -/// isMulSExtable - Return true if the given mul can be sign-extended -/// without changing its value. +/// Return true if the given mul can be sign-extended without changing its +/// value. static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), @@ -512,12 +529,11 @@ static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); } -/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined -/// and if the remainder is known to be zero, or null otherwise. If -/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified -/// to Y, ignoring that the multiplication may overflow, which is useful when -/// the result will be used in a context where the most significant bits are -/// ignored. +/// Return an expression for LHS /s RHS, if it can be determined and if the +/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits +/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that +/// the multiplication may overflow, which is useful when the result will be +/// used in a context where the most significant bits are ignored. 
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits = false) { @@ -528,7 +544,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Handle a few RHS special cases. const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); if (RC) { - const APInt &RA = RC->getValue()->getValue(); + const APInt &RA = RC->getAPInt(); // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do // some folding. if (RA.isAllOnesValue()) @@ -542,8 +558,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { if (!RC) return nullptr; - const APInt &LA = C->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); + const APInt &LA = C->getAPInt(); + const APInt &RA = RC->getAPInt(); if (LA.srem(RA) != 0) return nullptr; return SE.getConstant(LA.sdiv(RA)); @@ -603,12 +619,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, return nullptr; } -/// ExtractImmediate - If S involves the addition of a constant integer value, -/// return that integer value, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a constant integer value, return that integer +/// value, and mutate S to point to a new SCEV with that value excluded. static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { - if (C->getValue()->getValue().getMinSignedBits() <= 64) { + if (C->getAPInt().getMinSignedBits() <= 64) { S = SE.getConstant(C->getType(), 0); return C->getValue()->getSExtValue(); } @@ -630,9 +645,8 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { return 0; } -/// ExtractSymbol - If S involves the addition of a GlobalValue address, -/// return that symbol, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a GlobalValue address, return that symbol, and +/// mutate S to point to a new SCEV with that value excluded. static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { @@ -657,8 +671,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { return nullptr; } -/// isAddressUse - Returns true if the specified instruction is using the -/// specified value as an address. +/// Returns true if the specified instruction is using the specified value as an +/// address. static bool isAddressUse(Instruction *Inst, Value *OperandVal) { bool isAddress = isa<LoadInst>(Inst); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { @@ -682,12 +696,15 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { return isAddress; } -/// getAccessType - Return the type of the memory being accessed. -static Type *getAccessType(const Instruction *Inst) { - Type *AccessTy = Inst->getType(); - if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) - AccessTy = SI->getOperand(0)->getType(); - else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { +/// Return the type of the memory being accessed. 
+static MemAccessTy getAccessType(const Instruction *Inst) { + MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); + if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + AccessTy.MemTy = SI->getOperand(0)->getType(); + AccessTy.AddrSpace = SI->getPointerAddressSpace(); + } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + AccessTy.AddrSpace = LI->getPointerAddressSpace(); + } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { // Addressing modes can also be folded into prefetches and a variety // of intrinsics. switch (II->getIntrinsicID()) { @@ -696,21 +713,21 @@ static Type *getAccessType(const Instruction *Inst) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getArgOperand(0)->getType(); + AccessTy.MemTy = II->getArgOperand(0)->getType(); break; } } // All pointers have the same requirements, so canonicalize them to an // arbitrary pointer type to minimize variation. - if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) - AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), - PTy->getAddressSpace()); + if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy)) + AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), + PTy->getAddressSpace()); return AccessTy; } -/// isExistingPhi - Return true if this AddRec is already a phi in its loop. +/// Return true if this AddRec is already a phi in its loop. static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { @@ -793,9 +810,8 @@ static bool isHighCostExpansion(const SCEV *S, return true; } -/// DeleteTriviallyDeadInstructions - If any of the instructions is the -/// specified set are trivially dead, delete them and see if this makes any of -/// their operands subsequently dead. +/// If any of the instructions is the specified set are trivially dead, delete +/// them and see if this makes any of their operands subsequently dead. static bool DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { bool Changed = false; @@ -842,7 +858,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, namespace { -/// Cost - This class is used to measure and compare candidate formulae. +/// This class is used to measure and compare candidate formulae. class Cost { /// TODO: Some of these could be merged. Also, a lexical ordering /// isn't always optimal. @@ -905,7 +921,7 @@ private: } -/// RateRegister - Tally up interesting quantities from the given register. +/// Tally up interesting quantities from the given register. void Cost::RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -951,9 +967,9 @@ void Cost::RateRegister(const SCEV *Reg, SE.hasComputableLoopEvolution(Reg, L); } -/// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. Optional LoserRegs provides a way to declare any formula -/// that refers to one of those regs an instant loser. +/// Record this register in the set. If we haven't seen it before, rate +/// it. Optional LoserRegs provides a way to declare any formula that refers to +/// one of those regs an instant loser. 
void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -1024,7 +1040,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, assert(isValid() && "invalid cost"); } -/// Lose - Set this cost to a losing value. +/// Set this cost to a losing value. void Cost::Lose() { NumRegs = ~0u; AddRecCost = ~0u; @@ -1035,7 +1051,7 @@ void Cost::Lose() { ScaleCost = ~0u; } -/// operator< - Choose the lower cost. +/// Choose the lower cost. bool Cost::operator<(const Cost &Other) const { return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost, ImmCost, SetupCost) < @@ -1061,37 +1077,35 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Cost::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// LSRFixup - An operand value in an instruction which is to be replaced -/// with some equivalent, possibly strength-reduced, replacement. +/// An operand value in an instruction which is to be replaced with some +/// equivalent, possibly strength-reduced, replacement. struct LSRFixup { - /// UserInst - The instruction which will be updated. + /// The instruction which will be updated. Instruction *UserInst; - /// OperandValToReplace - The operand of the instruction which will - /// be replaced. The operand may be used more than once; every instance - /// will be replaced. + /// The operand of the instruction which will be replaced. The operand may be + /// used more than once; every instance will be replaced. Value *OperandValToReplace; - /// PostIncLoops - If this user is to use the post-incremented value of an - /// induction variable, this variable is non-null and holds the loop - /// associated with the induction variable. + /// If this user is to use the post-incremented value of an induction + /// variable, this variable is non-null and holds the loop associated with the + /// induction variable. PostIncLoopSet PostIncLoops; - /// LUIdx - The index of the LSRUse describing the expression which - /// this fixup needs, minus an offset (below). + /// The index of the LSRUse describing the expression which this fixup needs, + /// minus an offset (below). size_t LUIdx; - /// Offset - A constant offset to be added to the LSRUse expression. - /// This allows multiple fixups to share the same LSRUse with different - /// offsets, for example in an unrolled loop. + /// A constant offset to be added to the LSRUse expression. This allows + /// multiple fixups to share the same LSRUse with different offsets, for + /// example in an unrolled loop. int64_t Offset; bool isUseFullyOutsideLoop(const Loop *L) const; @@ -1108,8 +1122,7 @@ LSRFixup::LSRFixup() : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), Offset(0) {} -/// isUseFullyOutsideLoop - Test whether this fixup always uses its -/// value outside of the given loop. +/// Test whether this fixup always uses its value outside of the given loop. bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { // PHI nodes use their value in their incoming blocks. 
if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { @@ -1149,16 +1162,15 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRFixup::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding -/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. +/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted +/// SmallVectors of const SCEV*. struct UniquifierDenseMapInfo { static SmallVector<const SCEV *, 4> getEmptyKey() { SmallVector<const SCEV *, 4> V; @@ -1182,17 +1194,17 @@ struct UniquifierDenseMapInfo { } }; -/// LSRUse - This class holds the state that LSR keeps for each use in -/// IVUsers, as well as uses invented by LSR itself. It includes information -/// about what kinds of things can be folded into the user, information about -/// the user itself, and information about how the use may be satisfied. -/// TODO: Represent multiple users of the same expression in common? +/// This class holds the state that LSR keeps for each use in IVUsers, as well +/// as uses invented by LSR itself. It includes information about what kinds of +/// things can be folded into the user, information about the user itself, and +/// information about how the use may be satisfied. TODO: Represent multiple +/// users of the same expression in common? class LSRUse { DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; public: - /// KindType - An enum for a kind of use, indicating what types of - /// scaled and immediate operands it might support. + /// An enum for a kind of use, indicating what types of scaled and immediate + /// operands it might support. enum KindType { Basic, ///< A normal use, with no folding. Special, ///< A special case of basic, allowing -1 scales. @@ -1204,15 +1216,14 @@ public: typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; KindType Kind; - Type *AccessTy; + MemAccessTy AccessTy; SmallVector<int64_t, 8> Offsets; int64_t MinOffset; int64_t MaxOffset; - /// AllFixupsOutsideLoop - This records whether all of the fixups using this - /// LSRUse are outside of the loop, in which case some special-case heuristics - /// may be used. + /// This records whether all of the fixups using this LSRUse are outside of + /// the loop, in which case some special-case heuristics may be used. bool AllFixupsOutsideLoop; /// RigidFormula is set to true to guarantee that this use will be associated @@ -1222,26 +1233,24 @@ public: /// changing the formula. bool RigidFormula; - /// WidestFixupType - This records the widest use type for any fixup using - /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different - /// max fixup widths to be equivalent, because the narrower one may be relying - /// on the implicit truncation to truncate away bogus bits. + /// This records the widest use type for any fixup using this + /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max + /// fixup widths to be equivalent, because the narrower one may be relying on + /// the implicit truncation to truncate away bogus bits. Type *WidestFixupType; - /// Formulae - A list of ways to build a value that can satisfy this user. - /// After the list is populated, one of these is selected heuristically and - /// used to formulate a replacement for OperandValToReplace in UserInst. 
+ /// A list of ways to build a value that can satisfy this user. After the + /// list is populated, one of these is selected heuristically and used to + /// formulate a replacement for OperandValToReplace in UserInst. SmallVector<Formula, 12> Formulae; - /// Regs - The set of register candidates used by all formulae in this LSRUse. + /// The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), - MinOffset(INT64_MAX), - MaxOffset(INT64_MIN), - AllFixupsOutsideLoop(true), - RigidFormula(false), - WidestFixupType(nullptr) {} + LSRUse(KindType K, MemAccessTy AT) + : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), + AllFixupsOutsideLoop(true), RigidFormula(false), + WidestFixupType(nullptr) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); @@ -1254,8 +1263,8 @@ public: } -/// HasFormula - Test whether this use as a formula which has the same -/// registers as the given formula. +/// Test whether this use as a formula which has the same registers as the given +/// formula. bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); @@ -1264,9 +1273,8 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { return Uniquifier.count(Key); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. -/// The formula must be in canonical form. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true. Return false otherwise. The formula must be in canonical form. bool LSRUse::InsertFormula(const Formula &F) { assert(F.isCanonical() && "Invalid canonical representation"); @@ -1300,14 +1308,14 @@ bool LSRUse::InsertFormula(const Formula &F) { return true; } -/// DeleteFormula - Remove the given formula from this use's list. +/// Remove the given formula from this use's list. void LSRUse::DeleteFormula(Formula &F) { if (&F != &Formulae.back()) std::swap(F, Formulae.back()); Formulae.pop_back(); } -/// RecomputeRegs - Recompute the Regs field, and update RegUses. +/// Recompute the Regs field, and update RegUses. void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Now that we've filtered out some formulae, recompute the Regs set. SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); @@ -1320,7 +1328,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Update the RegTracker. 
for (const SCEV *S : OldRegs) if (!Regs.count(S)) - RegUses.DropRegister(S, LUIdx); + RegUses.dropRegister(S, LUIdx); } void LSRUse::print(raw_ostream &OS) const { @@ -1331,10 +1339,13 @@ void LSRUse::print(raw_ostream &OS) const { case ICmpZero: OS << "ICmpZero"; break; case Address: OS << "Address of "; - if (AccessTy->isPointerTy()) + if (AccessTy.MemTy->isPointerTy()) OS << "pointer"; // the full pointer type could be really verbose - else - OS << *AccessTy; + else { + OS << *AccessTy.MemTy; + } + + OS << " in addrspace(" << AccessTy.AddrSpace << ')'; } OS << ", Offsets={"; @@ -1353,19 +1364,19 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRUse::dump() const { print(errs()); errs() << '\n'; } -#endif static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, + HasBaseReg, Scale, AccessTy.AddrSpace); case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to @@ -1412,7 +1423,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Check for overflow. @@ -1433,7 +1444,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F) { // For the purpose of isAMCompletelyFolded either having a canonical formula // or a scale not equal to zero is correct. @@ -1447,11 +1458,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } -/// isLegalUse - Test whether we know how to expand the current formula. +/// Test whether we know how to expand the current formula. static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // We know how to expand completely foldable formulae. 
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale) || @@ -1463,8 +1474,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - const Formula &F) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, const Formula &F) { return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } @@ -1490,14 +1501,12 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { // Check the scaling factor cost with both the min and max offsets. - int ScaleCostMinOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MinOffset, - F.HasBaseReg, F.Scale); - int ScaleCostMaxOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MaxOffset, - F.HasBaseReg, F.Scale); + int ScaleCostMinOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); + int ScaleCostMaxOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && "Legal addressing mode has an illegal cost!"); @@ -1515,7 +1524,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg) { // Fast-path: zero is always foldable. @@ -1539,7 +1548,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, ScalarEvolution &SE, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, - Type *AccessTy, const SCEV *S, bool HasBaseReg) { + MemAccessTy AccessTy, const SCEV *S, + bool HasBaseReg) { // Fast-path: zero is always foldable. if (S->isZero()) return true; @@ -1564,9 +1574,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, namespace { -/// IVInc - An individual increment in a Chain of IV increments. -/// Relate an IV user to an expression that computes the IV it uses from the IV -/// used by the previous link in the Chain. +/// An individual increment in a Chain of IV increments. Relate an IV user to +/// an expression that computes the IV it uses from the IV used by the previous +/// link in the Chain. /// /// For the head of a chain, IncExpr holds the absolute SCEV expression for the /// original IVOperand. The head of the chain's IVOperand is only valid during @@ -1582,8 +1592,8 @@ struct IVInc { UserInst(U), IVOperand(O), IncExpr(E) {} }; -// IVChain - The list of IV increments in program order. -// We typically add the head of a chain without finding subsequent links. +// The list of IV increments in program order. We typically add the head of a +// chain without finding subsequent links. struct IVChain { SmallVector<IVInc,1> Incs; const SCEV *ExprBase; @@ -1595,7 +1605,7 @@ struct IVChain { typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; - // begin - return the first increment in the chain. + // Return the first increment in the chain. 
const_iterator begin() const { assert(!Incs.empty()); return std::next(Incs.begin()); @@ -1604,32 +1614,30 @@ struct IVChain { return Incs.end(); } - // hasIncs - Returns true if this chain contains any increments. + // Returns true if this chain contains any increments. bool hasIncs() const { return Incs.size() >= 2; } - // add - Add an IVInc to the end of this chain. + // Add an IVInc to the end of this chain. void add(const IVInc &X) { Incs.push_back(X); } - // tailUserInst - Returns the last UserInst in the chain. + // Returns the last UserInst in the chain. Instruction *tailUserInst() const { return Incs.back().UserInst; } - // isProfitableIncrement - Returns true if IncExpr can be profitably added to - // this chain. + // Returns true if IncExpr can be profitably added to this chain. bool isProfitableIncrement(const SCEV *OperExpr, const SCEV *IncExpr, ScalarEvolution&); }; -/// ChainUsers - Helper for CollectChains to track multiple IV increment uses. -/// Distinguish between FarUsers that definitely cross IV increments and -/// NearUsers that may be used between IV increments. +/// Helper for CollectChains to track multiple IV increment uses. Distinguish +/// between FarUsers that definitely cross IV increments and NearUsers that may +/// be used between IV increments. struct ChainUsers { SmallPtrSet<Instruction*, 4> FarUsers; SmallPtrSet<Instruction*, 4> NearUsers; }; -/// LSRInstance - This class holds state for the main loop strength reduction -/// logic. +/// This class holds state for the main loop strength reduction logic. class LSRInstance { IVUsers &IU; ScalarEvolution &SE; @@ -1639,25 +1647,25 @@ class LSRInstance { Loop *const L; bool Changed; - /// IVIncInsertPos - This is the insert position that the current loop's - /// induction variable increment should be placed. In simple loops, this is - /// the latch block's terminator. But in more complicated cases, this is a - /// position which will dominate all the in-loop post-increment users. + /// This is the insert position that the current loop's induction variable + /// increment should be placed. In simple loops, this is the latch block's + /// terminator. But in more complicated cases, this is a position which will + /// dominate all the in-loop post-increment users. Instruction *IVIncInsertPos; - /// Factors - Interesting factors between use strides. + /// Interesting factors between use strides. SmallSetVector<int64_t, 8> Factors; - /// Types - Interesting use types, to facilitate truncation reuse. + /// Interesting use types, to facilitate truncation reuse. SmallSetVector<Type *, 4> Types; - /// Fixups - The list of operands which are to be replaced. + /// The list of operands which are to be replaced. SmallVector<LSRFixup, 16> Fixups; - /// Uses - The list of interesting uses. + /// The list of interesting uses. SmallVector<LSRUse, 16> Uses; - /// RegUses - Track which uses use which register candidates. + /// Track which uses use which register candidates. RegUseTracker RegUses; // Limit the number of chains to avoid quadratic behavior. We don't expect to @@ -1665,10 +1673,10 @@ class LSRInstance { // back to normal LSR behavior for those uses. static const unsigned MaxChains = 8; - /// IVChainVec - IV users can form a chain of IV increments. + /// IV users can form a chain of IV increments. SmallVector<IVChain, MaxChains> IVChainVec; - /// IVIncSet - IV users that belong to profitable IVChains. + /// IV users that belong to profitable IVChains. 
SmallPtrSet<Use*, MaxChains> IVIncSet; void OptimizeShadowIV(); @@ -1696,11 +1704,10 @@ class LSRInstance { UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy); + LSRUse::KindType Kind, MemAccessTy AccessTy); - std::pair<size_t, int64_t> getUse(const SCEV *&Expr, - LSRUse::KindType Kind, - Type *AccessTy); + std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, + MemAccessTy AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -1769,18 +1776,16 @@ class LSRInstance { void RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; + SmallVectorImpl<WeakVH> &DeadInsts) const; void Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; - void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P); + SmallVectorImpl<WeakVH> &DeadInsts) const; + void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution); public: - LSRInstance(Loop *L, Pass *P); + LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, + LoopInfo &LI, const TargetTransformInfo &TTI); bool getChanged() const { return Changed; } @@ -1793,8 +1798,8 @@ public: } -/// OptimizeShadowIV - If IV is used in a int-to-float cast -/// inside the loop then try to eliminate the cast operation. +/// If IV is used in a int-to-float cast inside the loop then try to eliminate +/// the cast operation. void LSRInstance::OptimizeShadowIV() { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) @@ -1902,9 +1907,8 @@ void LSRInstance::OptimizeShadowIV() { } } -/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, -/// set the IV user and stride information and return true, otherwise return -/// false. +/// If Cond has an operand that is an expression of an IV, set the IV user and +/// stride information and return true, otherwise return false. bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { for (IVStrideUse &U : IU) if (U.getUser() == Cond) { @@ -1917,8 +1921,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { return false; } -/// OptimizeMax - Rewrite the loop's terminating condition if it uses -/// a max computation. +/// Rewrite the loop's terminating condition if it uses a max computation. /// /// This is a narrow solution to a specific, but acute, problem. For loops /// like this: @@ -2076,8 +2079,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { return NewCond; } -/// OptimizeLoopTermCond - Change loop terminating condition to use the -/// postinc iv when possible. +/// Change loop terminating condition to use the postinc iv when possible. void LSRInstance::OptimizeLoopTermCond() { SmallPtrSet<Instruction *, 4> PostIncs; @@ -2152,16 +2154,18 @@ LSRInstance::OptimizeLoopTermCond() { C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse. 
- Type *AccessTy = getAccessType(UI->getUser()); + MemAccessTy AccessTy = getAccessType(UI->getUser()); int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; } } @@ -2180,7 +2184,7 @@ LSRInstance::OptimizeLoopTermCond() { ICmpInst *OldCond = Cond; Cond = cast<ICmpInst>(Cond->clone()); Cond->setName(L->getHeader()->getName() + ".termcond"); - ExitingBlock->getInstList().insert(TermBr, Cond); + ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond); // Clone the IVUse, as the old use still exists! CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); @@ -2213,15 +2217,14 @@ LSRInstance::OptimizeLoopTermCond() { } } -/// reconcileNewOffset - Determine if the given use can accommodate a fixup -/// at the given offset and other details. If so, update the use and -/// return true. -bool -LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy) { +/// Determine if the given use can accommodate a fixup at the given offset and +/// other details. If so, update the use and return true. +bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, + bool HasBaseReg, LSRUse::KindType Kind, + MemAccessTy AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; - Type *NewAccessTy = AccessTy; + MemAccessTy NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to // something conservative, however this can pessimize in the case that one of @@ -2232,8 +2235,10 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // Check for a mismatched access type, and fall back conservatively as needed. // TODO: Be less conservative when the type is similar and can use the same // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + if (Kind == LSRUse::Address) { + if (AccessTy != LU.AccessTy) + NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext()); + } // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { @@ -2257,12 +2262,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, return true; } -/// getUse - Return an LSRUse index and an offset value for a fixup which -/// needs the given expression, with the given kind and optional access type. -/// Either reuse an existing use or create a new one, as needed. -std::pair<size_t, int64_t> -LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, Type *AccessTy) { +/// Return an LSRUse index and an offset value for a fixup which needs the given +/// expression, with the given kind and optional access type. Either reuse an +/// existing use or create a new one, as needed. 
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + MemAccessTy AccessTy) { const SCEV *Copy = Expr; int64_t Offset = ExtractImmediate(Expr, SE); @@ -2300,18 +2305,18 @@ LSRInstance::getUse(const SCEV *&Expr, return std::make_pair(LUIdx, Offset); } -/// DeleteUse - Delete the given use from the Uses list. +/// Delete the given use from the Uses list. void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { if (&LU != &Uses.back()) std::swap(LU, Uses.back()); Uses.pop_back(); // Update RegUses. - RegUses.SwapAndDropUse(LUIdx, Uses.size()); + RegUses.swapAndDropUse(LUIdx, Uses.size()); } -/// FindUseWithFormula - Look for a use distinct from OrigLU which is has -/// a formula that has the same registers as the given formula. +/// Look for a use distinct from OrigLU which is has a formula that has the same +/// registers as the given formula. LSRUse * LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, const LSRUse &OrigLU) { @@ -2396,14 +2401,14 @@ void LSRInstance::CollectInterestingTypesAndFactors() { if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } else if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, NewStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } } @@ -2415,9 +2420,9 @@ void LSRInstance::CollectInterestingTypesAndFactors() { DEBUG(print_factors_and_types(dbgs())); } -/// findIVOperand - Helper for CollectChains that finds an IV operand (computed -/// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped -/// Instructions to IVStrideUses, we could partially skip this. +/// Helper for CollectChains that finds an IV operand (computed by an AddRec in +/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to +/// IVStrideUses, we could partially skip this. static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE) { @@ -2436,29 +2441,28 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE, return OI; } -/// getWideOperand - IVChain logic must consistenctly peek base TruncInst -/// operands, so wrap it in a convenient helper. +/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in +/// a convenient helper. static Value *getWideOperand(Value *Oper) { if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) return Trunc->getOperand(0); return Oper; } -/// isCompatibleIVType - Return true if we allow an IV chain to include both -/// types. +/// Return true if we allow an IV chain to include both types. static bool isCompatibleIVType(Value *LVal, Value *RVal) { Type *LType = LVal->getType(); Type *RType = RVal->getType(); return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); } -/// getExprBase - Return an approximation of this SCEV expression's "base", or -/// NULL for any constant. Returning the expression itself is -/// conservative. Returning a deeper subexpression is more precise and valid as -/// long as it isn't less complex than another subexpression. 
For expressions -/// involving multiple unscaled values, we need to return the pointer-type -/// SCEVUnknown. This avoids forming chains across objects, such as: -/// PrevOper==a[i], IVOper==b[i], IVInc==b-a. +/// Return an approximation of this SCEV expression's "base", or NULL for any +/// constant. Returning the expression itself is conservative. Returning a +/// deeper subexpression is more precise and valid as long as it isn't less +/// complex than another subexpression. For expressions involving multiple +/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids +/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i], +/// IVInc==b-a. /// /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost /// SCEVUnknown, we simply return the rightmost SCEV operand. @@ -2601,8 +2605,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, return cost < 0; } -/// ChainInstruction - Add this IV user to an existing chain or make it the head -/// of a new chain. +/// Add this IV user to an existing chain or make it the head of a new chain. void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, SmallVectorImpl<ChainUsers> &ChainUsersVec) { // When IVs are used as types of varying widths, they are generally converted @@ -2714,7 +2717,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); } -/// CollectChains - Populate the vector of Chains. +/// Populate the vector of Chains. /// /// This decreases ILP at the architecture level. Targets with ample registers, /// multiple memory ports, and no register renaming probably don't want @@ -2755,19 +2758,19 @@ void LSRInstance::CollectChains() { for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); I != E; ++I) { // Skip instructions that weren't seen by IVUsers analysis. - if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) + if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I)) continue; // Ignore users that are part of a SCEV expression. This way we only // consider leaf IV Users. This effectively rediscovers a portion of // IVUsers analysis but in program order this time. - if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) + if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I))) continue; // Remove this instruction from any NearUsers set it may be in. for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); ChainIdx < NChains; ++ChainIdx) { - ChainUsersVec[ChainIdx].NearUsers.erase(I); + ChainUsersVec[ChainIdx].NearUsers.erase(&*I); } // Search for operands that can be chained. SmallPtrSet<Instruction*, 4> UniqueOperands; @@ -2776,7 +2779,7 @@ void LSRInstance::CollectChains() { while (IVOpIter != IVOpEnd) { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); if (UniqueOperands.insert(IVOpInst).second) - ChainInstruction(I, IVOpInst, ChainUsersVec); + ChainInstruction(&*I, IVOpInst, ChainUsersVec); IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } } // Continue walking down the instructions. 
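A mechanical change that recurs throughout these hunks is spelling &*I wherever a BasicBlock::iterator used to convert to an Instruction * implicitly, since the ilist iterators no longer allow that conversion. A generic illustration of the idiom using std::list:

#include <cassert>
#include <list>
int main() {
  std::list<int> Insts = {1, 2, 3};
  std::list<int>::iterator I = Insts.begin();
  // int *P = I;   // ill-formed: no implicit iterator-to-pointer conversion
  int *P = &*I;    // dereference to the element, then take its address
  assert(*P == 1);
  return 0;
}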
@@ -2828,20 +2831,20 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, if (!IncConst || !isAddressUse(UserInst, Operand)) return false; - if (IncConst->getValue()->getValue().getMinSignedBits() > 64) + if (IncConst->getAPInt().getMinSignedBits() > 64) return false; + MemAccessTy AccessTy = getAccessType(UserInst); int64_t IncOffset = IncConst->getValue()->getSExtValue(); - if (!isAlwaysFoldable(TTI, LSRUse::Address, - getAccessType(UserInst), /*BaseGV=*/ nullptr, - IncOffset, /*HaseBaseReg=*/ false)) + if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, + IncOffset, /*HasBaseReg=*/false)) return false; return true; } -/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to -/// materialize the IV user's operand from the previous IV user's operand. +/// Generate an add or subtract for each IVInc in a chain to materialize the IV +/// user's operand from the previous IV user's operand. void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced @@ -2961,7 +2964,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = U.getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - Type *AccessTy = nullptr; + MemAccessTy AccessTy; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -3027,9 +3030,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { DEBUG(print_fixups(dbgs())); } -/// InsertInitialFormula - Insert a formula for the given expression into -/// the given use, separating out loop-variant portions from loop-invariant -/// and loop-computable portions. +/// Insert a formula for the given expression into the given use, separating out +/// loop-variant portions from loop-invariant and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { // Mark uses whose expressions cannot be expanded. @@ -3037,13 +3039,13 @@ LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { LU.RigidFormula = true; Formula F; - F.InitialMatch(S, L, SE); + F.initialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } -/// InsertSupplementalFormula - Insert a simple single-register formula for -/// the given expression into the given use. +/// Insert a simple single-register formula for the given expression into the +/// given use. void LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { @@ -3054,17 +3056,16 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } -/// CountRegisters - Note which registers are used by the given formula, -/// updating RegUses. +/// Note which registers are used by the given formula, updating RegUses. void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { if (F.ScaledReg) - RegUses.CountRegister(F.ScaledReg, LUIdx); + RegUses.countRegister(F.ScaledReg, LUIdx); for (const SCEV *BaseReg : F.BaseRegs) - RegUses.CountRegister(BaseReg, LUIdx); + RegUses.countRegister(BaseReg, LUIdx); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true.
Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { // Do not insert formula that we will not be able to expand. assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && @@ -3076,9 +3077,9 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { return true; } -/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of -/// loop-invariant values which we're tracking. These other uses will pin these -/// values in registers, making them less profitable for elimination. +/// Check for other uses of loop-invariant values which we're tracking. These +/// other uses will pin these values in registers, making them less profitable +/// for elimination. /// TODO: This currently misses non-constant addrec step registers. /// TODO: Should this give more weight to users inside the loop? void @@ -3124,6 +3125,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { PHINode::getIncomingValueNumForOperand(U.getOperandNo())); if (!DT.dominates(L->getHeader(), UseBB)) continue; + // Don't bother if the instruction is in a BB which ends in an EHPad. + if (UseBB->getTerminator()->isEHPad()) + continue; // Ignore uses which are part of other SCEV expressions, to avoid // analyzing them multiple times. if (SE.isSCEVable(UserInst->getType())) { @@ -3148,7 +3152,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = const_cast<Instruction *>(UserInst); LF.OperandValToReplace = U; - std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); + std::pair<size_t, int64_t> P = getUse( + S, LSRUse::Basic, MemAccessTy()); LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; @@ -3165,8 +3170,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } } -/// CollectSubexprs - Split S into subexpressions which can be pulled out into -/// separate registers. If C is non-null, multiply each subexpression by C. +/// Split S into subexpressions which can be pulled out into separate +/// registers. If C is non-null, multiply each subexpression by C. /// /// Return remainder expression after factoring the subexpressions captured by /// Ops. If Ops is complete, return NULL. @@ -3300,7 +3305,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the // formula accordingly. - F.Canonicalize(); + F.canonicalize(); if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like @@ -3309,8 +3314,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, } } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. +/// Split out subexpressions from adds and the bases of addrecs. void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth) { assert(Base.isCanonical() && "Input must be in the canonical form"); @@ -3326,8 +3330,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, /* Idx */ -1, /* IsScaledReg */ true); } -/// GenerateCombinations - Generate a formula consisting of all of the -/// loop-dominating registers added into a single register. +/// Generate a formula consisting of all of the loop-dominating registers added +/// into a single register. 
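GenerateCombinations, whose new comment appears just above, folds every loop-invariant base register of a formula into a single summed operand so that together they cost only one register. A toy rendering of that partition-and-fold step, with plain ints standing in for SCEV registers and a trivial predicate faking the invariance test:

#include <iostream>
#include <numeric>
#include <vector>
// Pretend the even "registers" are the loop-invariant ones.
static bool isLoopInvariantSketch(int Reg) { return Reg % 2 == 0; }
int main() {
  std::vector<int> BaseRegs = {3, 4, 6, 7};
  std::vector<int> Kept, Invariant;
  for (int Reg : BaseRegs)
    (isLoopInvariantSketch(Reg) ? Invariant : Kept).push_back(Reg);
  // Only worthwhile when two or more registers can be merged; the real code
  // likewise refuses to add a combined register whose sum folds to zero.
  if (Invariant.size() > 1) {
    int Sum = std::accumulate(Invariant.begin(), Invariant.end(), 0);
    if (Sum != 0)
      Kept.push_back(Sum);
  }
  for (int Reg : Kept)
    std::cout << Reg << " "; // prints "3 7 10": two registers became one
  std::cout << "\n";
  return 0;
}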
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. @@ -3336,7 +3340,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before // processing the formula. - Base.Unscale(); + Base.unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector<const SCEV *, 4> Ops; @@ -3354,7 +3358,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); - F.Canonicalize(); + F.canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } @@ -3379,7 +3383,7 @@ void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, (void)InsertFormula(LU, LUIdx, F); } -/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. +/// Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. @@ -3410,8 +3414,8 @@ void LSRInstance::GenerateConstantOffsetsImpl( F.Scale = 0; F.ScaledReg = nullptr; } else - F.DeleteBaseReg(F.BaseRegs[Idx]); - F.Canonicalize(); + F.deleteBaseReg(F.BaseRegs[Idx]); + F.canonicalize(); } else if (IsScaledReg) F.ScaledReg = NewG; else @@ -3452,8 +3456,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, /* IsScaledReg */ true); } -/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up -/// the comparison. For example, x == y -> x*c == y*c. +/// For ICmpZero, check to see if we can scale up the comparison. For example, x +/// == y -> x*c == y*c. void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (LU.Kind != LSRUse::ICmpZero) return; @@ -3538,8 +3542,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, } } -/// GenerateScales - Generate stride factor reuse formulae by making use of -/// scaled-offset address modes, for example. +/// Generate stride factor reuse formulae by making use of scaled-offset address +/// modes, for example. void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // Determine the integer type for the base formula. Type *IntTy = Base.getType(); @@ -3547,10 +3551,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // If this Formula already has a scaled register, we can't add another one. // Try to unscale the formula to generate a better scale. - if (Base.Scale != 0 && !Base.Unscale()) + if (Base.Scale != 0 && !Base.unscale()) return; - assert(Base.Scale == 0 && "Unscale did not did its job!"); + assert(Base.Scale == 0 && "unscale did not do its job!"); // Check each interesting stride. for (int64_t Factor : Factors) { @@ -3587,7 +3591,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: This could be optimized to avoid all the copying. Formula F = Base; F.ScaledReg = Quotient; - F.DeleteBaseReg(F.BaseRegs[i]); + F.deleteBaseReg(F.BaseRegs[i]); // The canonical representation of 1*reg is reg, which is already in // Base. In that case, do not try to insert the formula, it will be // rejected anyway. @@ -3599,7 +3603,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { } } -/// GenerateTruncates - Generate reuse formulae from different IV types. +/// Generate reuse formulae from different IV types.
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { // Don't bother truncating symbolic values. if (Base.BaseGV) return; @@ -3629,9 +3633,9 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { namespace { -/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to -/// defer modifications so that the search phase doesn't have to worry about -/// the data structures moving underneath it. +/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer +/// modifications so that the search phase doesn't have to worry about the data +/// structures moving underneath it. struct WorkItem { size_t LUIdx; int64_t Imm; @@ -3651,14 +3655,13 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void WorkItem::dump() const { print(errs()); errs() << '\n'; } -#endif -/// GenerateCrossUseConstantOffsets - Look for registers which are a constant -/// distance apart and try to form reuse opportunities between them. +/// Look for registers which are a constant distance apart and try to form reuse +/// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. typedef std::map<int64_t, const SCEV *> ImmMapTy; @@ -3751,7 +3754,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // very similar but slightly different. Investigate if they // could be merged. That way, we would not have to unscale the // Formula. - F.Unscale(); + F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3770,14 +3773,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) - if (C->getValue()->isNegative() != - (NewF.BaseOffset < 0) && - (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) - .ule(std::abs(NewF.BaseOffset))) + if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && + (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) + .ule(std::abs(NewF.BaseOffset))) continue; // OK, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3801,15 +3803,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) - if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( - std::abs(NewF.BaseOffset)) && - (C->getValue()->getValue() + - NewF.BaseOffset).countTrailingZeros() >= - countTrailingZeros<uint64_t>(NewF.BaseOffset)) + if ((C->getAPInt() + NewF.BaseOffset) + .abs() + .slt(std::abs(NewF.BaseOffset)) && + (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >= + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3819,7 +3821,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { } } -/// GenerateAllReuseFormulae - Generate formulae for each use. +/// Generate formulae for each use. 
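The WorkItem::dump change above (and the matching LSRInstance::dump change later in this file) drops the #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) guard in favor of LLVM_DUMP_METHOD, which expands to noinline and used attributes so the symbol is still emitted and callable from a debugger. A generic sketch of that macro pattern, assuming a GCC/Clang-style compiler:

#include <iostream>
#if defined(__GNUC__) || defined(__clang__)
// Keep the method out of line and force it to be emitted even if nothing in
// the program calls it, so "call W.dump()" works from a debugger session.
#define DUMP_METHOD_SKETCH __attribute__((noinline)) __attribute__((used))
#else
#define DUMP_METHOD_SKETCH
#endif
struct WorkItemSketch {
  int LUIdx = 0;
  long Imm = 0;
  DUMP_METHOD_SKETCH void dump() const {
    std::cout << "in=" << LUIdx << " , add offset " << Imm << "\n";
  }
};
int main() {
  WorkItemSketch W{3, 42};
  W.dump(); // normally only a debugger would invoke this
  return 0;
}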
void LSRInstance::GenerateAllReuseFormulae() { // This is split into multiple loops so that hasRegsUsedByUsesOtherThan @@ -3959,10 +3961,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // This is a rough guess that seems to work fairly well. static const size_t ComplexityLimit = UINT16_MAX; -/// EstimateSearchSpaceComplexity - Estimate the worst-case number of -/// solutions the solver might have to consider. It almost never considers -/// this many solutions because it prune the search space, but the pruning -/// isn't always sufficient. +/// Estimate the worst-case number of solutions the solver might have to +/// consider. It almost never considers this many solutions because it prunes the +/// search space, but the pruning isn't always sufficient. size_t LSRInstance::EstimateSearchSpaceComplexity() const { size_t Power = 1; for (const LSRUse &LU : Uses) { @@ -3978,10 +3979,9 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const { return Power; } -/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset -/// of the registers of another formula, it won't help reduce register -/// pressure (though it may not necessarily hurt register pressure); remove -/// it to simplify the system. +/// When one formula uses a superset of the registers of another formula, it +/// won't help reduce register pressure (though it may not necessarily hurt +/// register pressure); remove it to simplify the system. void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -4042,9 +4042,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { } } -/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers -/// for expressions like A, A+1, A+2, etc., allocate a single register for -/// them. +/// When there are many registers for expressions like A, A+1, A+2, etc., +/// allocate a single register for them. void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; @@ -4121,8 +4120,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } -/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call -/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that +/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that /// we've done more filtering, as it may be able to find more formulae to /// eliminate. void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ @@ -4139,9 +4137,9 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ } } -/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely -/// to be profitable, and then in any use which has any reference to that -/// register, delete all formulae which do not reference that register. +/// Pick a register which seems likely to be profitable, and then in any use +/// which has any reference to that register, delete all formulae which do not +/// reference that register. void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // With all other options exhausted, loop until the system is simple // enough to handle.
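EstimateSearchSpaceComplexity above is, in effect, a saturating product of the per-use formula counts, and each of the narrowing heuristics that follow only does work while that product is still at or above ComplexityLimit. A self-contained approximation of the estimate (illustrative names, not the pass's API):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>
static const std::size_t ComplexityLimitSketch = UINT16_MAX;
// Worst case the solver visits every combination of formulae, one per use,
// so the estimate is the product of the per-use candidate counts, cut short
// once it reaches the limit.
static std::size_t
estimateSearchSpace(const std::vector<std::size_t> &FormulaeNumPerUse) {
  std::size_t Power = 1;
  for (std::size_t N : FormulaeNumPerUse) {
    if (N <= 1)
      continue; // a single candidate formula adds no choice
    Power *= N;
    if (Power >= ComplexityLimitSketch)
      return Power; // already too complex; stop multiplying
  }
  return Power;
}
int main() {
  // Three uses with 4, 3, and 2 candidate formulae: 24 combinations.
  std::cout << estimateSearchSpace({4, 3, 2}) << "\n";
  return 0;
}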
@@ -4202,10 +4200,10 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } -/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of -/// formulae to choose from, use some rough heuristics to prune down the number -/// of formulae. This keeps the main solver from taking an extraordinary amount -/// of time in some worst-case scenarios. +/// If there are an extraordinary number of formulae to choose from, use some +/// rough heuristics to prune down the number of formulae. This keeps the main +/// solver from taking an extraordinary amount of time in some worst-case +/// scenarios. void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByDetectingSupersets(); NarrowSearchSpaceByCollapsingUnrolledCode(); @@ -4213,7 +4211,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByPickingWinnerRegs(); } -/// SolveRecurse - This is the recursive solver. +/// This is the recursive solver. void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, Cost &SolutionCost, SmallVectorImpl<const Formula *> &Workspace, @@ -4291,8 +4289,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, } } -/// Solve - Choose one formula from each use. Return the results in the given -/// Solution vector. +/// Choose one formula from each use. Return the results in the given Solution +/// vector. void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { SmallVector<const Formula *, 8> Workspace; Cost SolutionCost; @@ -4326,10 +4324,9 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { assert(Solution.size() == Uses.size() && "Malformed solution!"); } -/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up -/// the dominator tree far as we can go while still being dominated by the -/// input positions. This helps canonicalize the insert position, which -/// encourages sharing. +/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far +/// as we can go while still being dominated by the input positions. This helps +/// canonicalize the insert position, which encourages sharing. BasicBlock::iterator LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, const SmallVectorImpl<Instruction *> &Inputs) @@ -4365,21 +4362,21 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || !DT.dominates(Inst, BetterPos))) - BetterPos = std::next(BasicBlock::iterator(Inst)); + BetterPos = &*std::next(BasicBlock::iterator(Inst)); } if (!AllDominate) break; if (BetterPos) - IP = BetterPos; + IP = BetterPos->getIterator(); else - IP = Tentative; + IP = Tentative->getIterator(); } return IP; } -/// AdjustInsertPositionForExpand - Determine an input position which will be -/// dominated by the operands and which will dominate the result. +/// Determine an input position which will be dominated by the operands and +/// which will dominate the result.
BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, const LSRFixup &LF, @@ -4417,7 +4414,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, } } - assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) + assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() && !isa<DbgInfoIntrinsic>(LowestIP) && "Insertion point must be a normal instruction"); @@ -4429,7 +4426,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, while (isa<PHINode>(IP)) ++IP; // Ignore landingpad instructions. - while (isa<LandingPadInst>(IP)) ++IP; + while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP; // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -4437,13 +4434,14 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, // Set IP below instructions recently inserted by SCEVExpander. This keeps the // IP consistent across expansions and allows the previously inserted // instructions to be reused by subsequent expansion. - while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; + while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP) + ++IP; return IP; } -/// Expand - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"). +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"). Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F, BasicBlock::iterator IP, @@ -4487,7 +4485,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); } // Expand the ScaledReg portion. @@ -4505,14 +4503,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4522,11 +4520,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4538,7 +4536,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. 
if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4548,7 +4546,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4584,7 +4582,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); @@ -4626,15 +4624,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, return FullV; } -/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use -/// of their operands effectively happens in their predecessor blocks, so the -/// expression may need to be expanded in multiple places. +/// Helper for Rewrite. PHI nodes are special because the use of their operands +/// effectively happens in their predecessor blocks, so the expression may need +/// to be expanded in multiple places. void LSRInstance::RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { DenseMap<BasicBlock *, Value *> Inserted; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == LF.OperandValToReplace) { @@ -4658,8 +4655,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, .setDontDeleteUselessPHIs()); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, - /*AliasAnalysis*/ nullptr, &DT, &LI); + SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI); NewBB = NewBBs[0]; } // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4685,7 +4681,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN, if (!Pair.second) PN->setIncomingValue(i, Pair.first->second); else { - Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); + Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(), + Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4702,20 +4699,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, } } -/// Rewrite - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"), and update the UserInst to reference -/// the newly expanded value. +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"), and update the UserInst to reference the newly +/// expanded value. void LSRInstance::Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { // First, find an insertion point that dominates UserInst. For PHI nodes, // find the nearest block which dominates all the relevant uses. 
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { - RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); + RewriteForPHI(PN, LF, F, Rewriter, DeadInsts); } else { - Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); + Value *FullV = + Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4740,11 +4737,10 @@ void LSRInstance::Rewrite(const LSRFixup &LF, DeadInsts.emplace_back(LF.OperandValToReplace); } -/// ImplementSolution - Rewrite all the fixup locations with new values, -/// following the chosen solution. -void -LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P) { +/// Rewrite all the fixup locations with new values, following the chosen +/// solution. +void LSRInstance::ImplementSolution( + const SmallVectorImpl<const Formula *> &Solution) { // Keep track of instructions we may have made dead, so that // we can remove them after we are done working. SmallVector<WeakVH, 16> DeadInsts; @@ -4766,7 +4762,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // Expand the new value definitions and update the users. for (const LSRFixup &Fixup : Fixups) { - Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); + Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts); Changed = true; } @@ -4782,13 +4778,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(Loop *L, Pass *P) - : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), - LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), - TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *L->getHeader()->getParent())), - L(L), Changed(false), IVIncInsertPos(nullptr) { +LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, + DominatorTree &DT, LoopInfo &LI, + const TargetTransformInfo &TTI) + : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false), + IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4879,7 +4873,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) #endif // Now that we've decided what we want, make it so. 
- ImplementSolution(Solution, P); + ImplementSolution(Solution); } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -4931,11 +4925,10 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRInstance::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { @@ -4956,7 +4949,7 @@ INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -4982,8 +4975,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); // Requiring LoopSimplify a second time here prevents IVUsers from running // twice, since LoopSimplify was invalidated by running ScalarEvolution. AU.addRequiredID(LoopSimplifyID); @@ -4996,17 +4989,24 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { if (skipOptnoneFunction(L)) return false; + auto &IU = getAnalysis<IVUsers>(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()); bool Changed = false; // Run the main LSR transformation. - Changed |= LSRInstance(L, this).getChanged(); + Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged(); // Remove any extra phis created by processing inner loops. 
Changed |= DeleteDeadPHIs(L->getHeader()); if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakVH, 16> DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr"); + SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL, + "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index d78db6c..ecef6db 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -38,16 +39,16 @@ using namespace llvm; #define DEBUG_TYPE "loop-unroll" static cl::opt<unsigned> - UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, + UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The baseline cost threshold for loop unrolling")); static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold( - "unroll-percent-dynamic-cost-saved-threshold", cl::init(20), cl::Hidden, + "unroll-percent-dynamic-cost-saved-threshold", cl::Hidden, cl::desc("The percentage of estimated dynamic cost which must be saved by " "unrolling to allow unrolling up to the max threshold.")); static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount( - "unroll-dynamic-cost-savings-discount", cl::init(2000), cl::Hidden, + "unroll-dynamic-cost-savings-discount", cl::Hidden, cl::desc("This is the amount discounted from the total unroll cost when " "the unrolled form has a high dynamic cost savings (triggered by " "the '-unroll-percent-dynamic-cost-saved-threshold' flag).")); @@ -58,17 +59,17 @@ static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( "iterations when checking full unroll profitability")); static cl::opt<unsigned> -UnrollCount("unroll-count", cl::init(0), cl::Hidden, +UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes")); static cl::opt<bool> -UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, +UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached.")); static cl::opt<bool> -UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, +UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); static cl::opt<unsigned> @@ -76,178 +77,95 @@ PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma.")); -namespace { - class LoopUnroll : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { - CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); - CurrentPercentDynamicCostSavedThreshold = - UnrollPercentDynamicCostSavedThreshold; - CurrentDynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; - CurrentCount = (C == -1) ? UnrollCount : unsigned(C); - CurrentAllowPartial = (P == -1) ?
UnrollAllowPartial : (bool)P; - CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; - - UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); - UserPercentDynamicCostSavedThreshold = - (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0); - UserDynamicCostSavingsDiscount = - (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0); - UserAllowPartial = (P != -1) || - (UnrollAllowPartial.getNumOccurrences() > 0); - UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); - UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0); - - initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); - } - /// A magic value for use with the Threshold parameter to indicate - /// that the loop unroll should be performed regardless of how much - /// code expansion would result. - static const unsigned NoThreshold = UINT_MAX; - - // Threshold to use when optsize is specified (and there is no - // explicit -unroll-threshold). - static const unsigned OptSizeUnrollThreshold = 50; - - // Default unroll count for loops with run-time trip count if - // -unroll-count is not set - static const unsigned UnrollRuntimeCount = 8; - - unsigned CurrentCount; - unsigned CurrentThreshold; - unsigned CurrentPercentDynamicCostSavedThreshold; - unsigned CurrentDynamicCostSavingsDiscount; - bool CurrentAllowPartial; - bool CurrentRuntime; - - // Flags for whether the 'current' settings are user-specified. - bool UserCount; - bool UserThreshold; - bool UserPercentDynamicCostSavedThreshold; - bool UserDynamicCostSavingsDiscount; - bool UserAllowPartial; - bool UserRuntime; - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG... - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. - // If loop unroll does not preserve dom info then LCSSA pass on next - // loop will receive invalid dom info. - // For now, recreate dom info, if loop is unrolled. - AU.addPreserved<DominatorTreeWrapperPass>(); - } +/// A magic value for use with the Threshold parameter to indicate +/// that the loop unroll should be performed regardless of how much +/// code expansion would result. +static const unsigned NoThreshold = UINT_MAX; - // Fill in the UnrollingPreferences parameter with values from the - // TargetTransformationInfo. 
- void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI, - TargetTransformInfo::UnrollingPreferences &UP) { - UP.Threshold = CurrentThreshold; - UP.PercentDynamicCostSavedThreshold = - CurrentPercentDynamicCostSavedThreshold; - UP.DynamicCostSavingsDiscount = CurrentDynamicCostSavingsDiscount; - UP.OptSizeThreshold = OptSizeUnrollThreshold; - UP.PartialThreshold = CurrentThreshold; - UP.PartialOptSizeThreshold = OptSizeUnrollThreshold; - UP.Count = CurrentCount; - UP.MaxCount = UINT_MAX; - UP.Partial = CurrentAllowPartial; - UP.Runtime = CurrentRuntime; - UP.AllowExpensiveTripCount = false; - TTI.getUnrollingPreferences(L, UP); - } +/// Default unroll count for loops with run-time trip count if +/// -unroll-count is not set +static const unsigned DefaultUnrollRuntimeCount = 8; - // Select and return an unroll count based on parameters from - // user, unroll preferences, unroll pragmas, or a heuristic. - // SetExplicitly is set to true if the unroll count is is set by - // the user or a pragma rather than selected heuristically. - unsigned - selectUnrollCount(const Loop *L, unsigned TripCount, bool PragmaFullUnroll, - unsigned PragmaCount, - const TargetTransformInfo::UnrollingPreferences &UP, - bool &SetExplicitly); - - // Select threshold values used to limit unrolling based on a - // total unrolled size. Parameters Threshold and PartialThreshold - // are set to the maximum unrolled size for fully and partially - // unrolled loops respectively. - void selectThresholds(const Loop *L, bool HasPragma, - const TargetTransformInfo::UnrollingPreferences &UP, - unsigned &Threshold, unsigned &PartialThreshold, - unsigned &PercentDynamicCostSavedThreshold, - unsigned &DynamicCostSavingsDiscount) { - // Determine the current unrolling threshold. While this is - // normally set from UnrollThreshold, it is overridden to a - // smaller value if the current function is marked as - // optimize-for-size, and the unroll threshold was not user - // specified. - Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; - PartialThreshold = UserThreshold ? CurrentThreshold : UP.PartialThreshold; - PercentDynamicCostSavedThreshold = - UserPercentDynamicCostSavedThreshold - ? CurrentPercentDynamicCostSavedThreshold - : UP.PercentDynamicCostSavedThreshold; - DynamicCostSavingsDiscount = UserDynamicCostSavingsDiscount - ? CurrentDynamicCostSavingsDiscount - : UP.DynamicCostSavingsDiscount; - - if (!UserThreshold && - L->getHeader()->getParent()->hasFnAttribute( - Attribute::OptimizeForSize)) { - Threshold = UP.OptSizeThreshold; - PartialThreshold = UP.PartialOptSizeThreshold; - } - if (HasPragma) { - // If the loop has an unrolling pragma, we want to be more - // aggressive with unrolling limits. Set thresholds to at - // least the PragmaTheshold value which is larger than the - // default limits. - if (Threshold != NoThreshold) - Threshold = std::max<unsigned>(Threshold, PragmaUnrollThreshold); - if (PartialThreshold != NoThreshold) - PartialThreshold = - std::max<unsigned>(PartialThreshold, PragmaUnrollThreshold); - } - } - bool canUnrollCompletely(Loop *L, unsigned Threshold, - unsigned PercentDynamicCostSavedThreshold, - unsigned DynamicCostSavingsDiscount, - uint64_t UnrolledCost, uint64_t RolledDynamicCost); - }; -} +/// Gather the various unrolling parameters based on the defaults, compiler +/// flags, TTI overrides, pragmas, and user specified parameters. 
+static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( + Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold, + Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, + Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll, + bool PragmaEnableUnroll, unsigned TripCount) { + TargetTransformInfo::UnrollingPreferences UP; -char LoopUnroll::ID = 0; -INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) -INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) + // Set up the defaults + UP.Threshold = 150; + UP.PercentDynamicCostSavedThreshold = 20; + UP.DynamicCostSavingsDiscount = 2000; + UP.OptSizeThreshold = 50; + UP.PartialThreshold = UP.Threshold; + UP.PartialOptSizeThreshold = UP.OptSizeThreshold; + UP.Count = 0; + UP.MaxCount = UINT_MAX; + UP.Partial = false; + UP.Runtime = false; + UP.AllowExpensiveTripCount = false; + + // Override with any target specific settings + TTI.getUnrollingPreferences(L, UP); + + // Apply size attributes + if (L->getHeader()->getParent()->optForSize()) { + UP.Threshold = UP.OptSizeThreshold; + UP.PartialThreshold = UP.PartialOptSizeThreshold; + } -Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, - int Runtime) { - return new LoopUnroll(Threshold, Count, AllowPartial, Runtime); -} + // Apply unroll count pragmas + if (PragmaCount) + UP.Count = PragmaCount; + else if (PragmaFullUnroll) + UP.Count = TripCount; -Pass *llvm::createSimpleLoopUnrollPass() { - return llvm::createLoopUnrollPass(-1, -1, 0, 0); + // Apply any user values specified by cl::opt + if (UnrollThreshold.getNumOccurrences() > 0) { + UP.Threshold = UnrollThreshold; + UP.PartialThreshold = UnrollThreshold; + } + if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0) + UP.PercentDynamicCostSavedThreshold = + UnrollPercentDynamicCostSavedThreshold; + if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0) + UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; + if (UnrollCount.getNumOccurrences() > 0) + UP.Count = UnrollCount; + if (UnrollAllowPartial.getNumOccurrences() > 0) + UP.Partial = UnrollAllowPartial; + if (UnrollRuntime.getNumOccurrences() > 0) + UP.Runtime = UnrollRuntime; + + // Apply user values provided by argument + if (UserThreshold.hasValue()) { + UP.Threshold = *UserThreshold; + UP.PartialThreshold = *UserThreshold; + } + if (UserCount.hasValue()) + UP.Count = *UserCount; + if (UserAllowPartial.hasValue()) + UP.Partial = *UserAllowPartial; + if (UserRuntime.hasValue()) + UP.Runtime = *UserRuntime; + + if (PragmaCount > 0 || + ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0)) { + // If the loop has an unrolling pragma, we want to be more aggressive with + // unrolling limits. Set thresholds to at least the PragmaThreshold value + // which is larger than the default limits.
+ if (UP.Threshold != NoThreshold) + UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); + if (UP.PartialThreshold != NoThreshold) + UP.PartialThreshold = + std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); + } + + return UP; } namespace { @@ -278,8 +196,8 @@ class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> { public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap<Value *, Constant *> &SimplifiedValues, - const Loop *L, ScalarEvolution &SE) - : Iteration(Iteration), SimplifiedValues(SimplifiedValues), L(L), SE(SE) { + ScalarEvolution &SE) + : SimplifiedValues(SimplifiedValues), SE(SE) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -295,13 +213,6 @@ private: /// results saved. DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; - /// \brief Number of currently simulated iteration. - /// - /// If an expression is ConstAddress+Constant, then the Constant is - /// Start + Iteration*Step, where Start and Step could be obtained from - /// SCEVGEPCache. - unsigned Iteration; - /// \brief SCEV expression corresponding to number of currently simulated /// iteration. const SCEV *IterationNumber; @@ -316,7 +227,6 @@ private: /// post-unrolling. DenseMap<Value *, Constant *> &SimplifiedValues; - const Loop *L; ScalarEvolution &SE; /// \brief Try to simplify instruction \param I using its SCEV expression. @@ -368,11 +278,9 @@ private: return simplifyInstWithSCEV(&I); } - /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt. - /// Try to simplify binary operator I. /// - /// TODO: Probaly it's worth to hoist the code for estimating the + /// TODO: Probably it's worth to hoist the code for estimating the /// simplifications effects to a separate class, since we have a very similar /// code in InlineCost already. bool visitBinaryOperator(BinaryOperator &I) { @@ -412,7 +320,7 @@ private: auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base); // We're only interested in loads that can be completely folded to a // constant. - if (!GV || !GV->hasInitializer()) + if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant()) return false; ConstantDataSequential *CDS = @@ -420,6 +328,12 @@ private: if (!CDS) return false; + // We might have a vector load from an array. FIXME: for now we just bail + // out in this case, but we should be able to resolve and simplify such + // loads. + if(!CDS->isElementTypeCompatible(I.getType())) + return false; + int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && "Unexpectedly large index value."); @@ -436,6 +350,59 @@ private: return true; } + + bool visitCastInst(CastInst &I) { + // Propagate constants through casts. + Constant *COp = dyn_cast<Constant>(I.getOperand(0)); + if (!COp) + COp = SimplifiedValues.lookup(I.getOperand(0)); + if (COp) + if (Constant *C = + ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { + SimplifiedValues[&I] = C; + return true; + } + + return Base::visitCastInst(I); + } + + bool visitCmpInst(CmpInst &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + // First try to handle simplified comparisons. 
+ if (!isa<Constant>(LHS)) + if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) + LHS = SimpleLHS; + if (!isa<Constant>(RHS)) + if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) + RHS = SimpleRHS; + + if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) { + auto SimplifiedLHS = SimplifiedAddresses.find(LHS); + if (SimplifiedLHS != SimplifiedAddresses.end()) { + auto SimplifiedRHS = SimplifiedAddresses.find(RHS); + if (SimplifiedRHS != SimplifiedAddresses.end()) { + SimplifiedAddress &LHSAddr = SimplifiedLHS->second; + SimplifiedAddress &RHSAddr = SimplifiedRHS->second; + if (LHSAddr.Base == RHSAddr.Base) { + LHS = LHSAddr.Offset; + RHS = RHSAddr.Offset; + } + } + } + } + + if (Constant *CLHS = dyn_cast<Constant>(LHS)) { + if (Constant *CRHS = dyn_cast<Constant>(RHS)) { + if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { + SimplifiedValues[&I] = C; + return true; + } + } + } + + return Base::visitCmpInst(I); + } }; } // namespace @@ -443,11 +410,11 @@ private: namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. - unsigned UnrolledCost; + int UnrolledCost; /// \brief The estimated dynamic cost of executing the instructions in the /// rolled form. - unsigned RolledDynamicCost; + int RolledDynamicCost; }; } @@ -464,10 +431,10 @@ struct EstimatedUnrollCost { /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. -Optional<EstimatedUnrollCost> -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { +static Optional<EstimatedUnrollCost> +analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + int MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -481,24 +448,61 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, SmallSetVector<BasicBlock *, 16> BBWorklist; DenseMap<Value *, Constant *> SimplifiedValues; + SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. - unsigned UnrolledCost = 0; + int UnrolledCost = 0; // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic // instructions due to the simplifications which we expect to occur after // unrolling. - unsigned RolledDynamicCost = 0; + int RolledDynamicCost = 0; + + // Ensure that we don't violate the loop structure invariants relied on by + // this analysis. + assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); + assert(L->isLCSSAForm(DT) && + "Must have loops in LCSSA form to track live-out values."); + + DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n"); // Simulate execution of each iteration of the loop counting instructions, // which would be simplified. 
// Since the same load will take different values on different iterations, // we literally have to go through all of the loop's iterations. for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { + DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); + + // Prepare for the iteration by collecting any simplified entry or backedge + // inputs. + for (Instruction &I : *L->getHeader()) { + auto *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + + // The loop header PHI nodes must have exactly two inputs: one from the + // loop preheader and one from the loop latch. + assert( + PHI->getNumIncomingValues() == 2 && + "Must have an incoming value only for the preheader and the latch."); + + Value *V = PHI->getIncomingValueForBlock( + Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Constant *C = dyn_cast<Constant>(V); + if (Iteration != 0 && !C) + C = SimplifiedValues.lookup(V); + if (C) + SimplifiedInputValues.push_back({PHI, C}); + } + + // Now clear and re-populate the map for the next iteration. SimplifiedValues.clear(); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE); + while (!SimplifiedInputValues.empty()) + SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -510,21 +514,67 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - unsigned InstCost = TTI.getUserCost(&I); + int InstCost = TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, // and if the visitor returns false, include this instruction in the // unrolled cost. if (!Analyzer.visit(I)) UnrolledCost += InstCost; + else { + DEBUG(dbgs() << " " << I + << " would be simplified if loop is unrolled.\n"); + (void)0; + } // Also track this instruction's expected cost when executing the rolled // loop form. RolledDynamicCost += InstCost; // If unrolled body turns out to be too big, bail out. - if (UnrolledCost > MaxUnrolledLoopSize) + if (UnrolledCost > MaxUnrolledLoopSize) { + DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" + << " UnrolledCost: " << UnrolledCost + << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize + << "\n"); return None; + } + } + + TerminatorInst *TI = BB->getTerminator(); + + // Add in the live successors by first checking whether we have a terminator + // that may be simplified based on the values simplified by this call. + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isConditional()) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(BI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = BI->getSuccessor(0); + else + Succ = BI->getSuccessor( + cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(SI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = SI->getSuccessor(0); + else + Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond)) + .getCaseSuccessor(); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } } // Add BB's successors to the worklist.
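The simulation loop above only keeps a successor live when a branch or switch condition has already folded to a constant for the iteration being modeled; otherwise every in-loop successor goes back on the worklist. A toy model of that successor-pruning step, with blocks reduced to indices and conditions to an optional bool (the real pass walks BasicBlocks through a SmallSetVector and gets its constants from SCEV-driven simplification):

#include <cstdio>
#include <optional>
#include <set>
#include <vector>
struct BlockSketch {
  std::optional<bool> FoldedCond; // set when the branch condition simplified
  int TrueSucc = -1;              // -1 marks an edge leaving the loop
  int FalseSucc = -1;
};
int main() {
  // Toy CFG: block 0 branches to 1 or 2; on this simulated iteration its
  // condition folded to false, so only block 2 stays live.
  std::vector<BlockSketch> CFG = {
      {false, 1, 2}, {std::nullopt, -1, -1}, {std::nullopt, -1, -1}};
  std::set<int> Worklist = {0};
  while (!Worklist.empty()) {
    int BB = *Worklist.begin();
    Worklist.erase(Worklist.begin());
    std::printf("simulating block %d\n", BB); // prints 0, then 2
    const BlockSketch &B = CFG[BB];
    if (B.FoldedCond) {
      // Condition known: only the taken successor stays live.
      int Succ = *B.FoldedCond ? B.TrueSucc : B.FalseSucc;
      if (Succ >= 0)
        Worklist.insert(Succ);
    } else {
      // Condition unknown: conservatively keep both successors live.
      if (B.TrueSucc >= 0)
        Worklist.insert(B.TrueSucc);
      if (B.FalseSucc >= 0)
        Worklist.insert(B.FalseSucc);
    }
  }
  return 0;
}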
@@ -535,9 +585,15 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // If we found no optimization opportunities on the first iteration, we // won't find them on later ones too. - if (UnrolledCost == RolledDynamicCost) + if (UnrolledCost == RolledDynamicCost) { + DEBUG(dbgs() << " No opportunities found.. exiting.\n" + << " UnrolledCost: " << UnrolledCost << "\n"); return None; + } } + DEBUG(dbgs() << "Analysis finished:\n" + << "UnrolledCost: " << UnrolledCost << ", " + << "RolledDynamicCost: " << RolledDynamicCost << "\n"); return {{UnrolledCost, RolledDynamicCost}}; } @@ -583,6 +639,12 @@ static bool HasUnrollFullPragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full"); } +// Returns true if the loop has an unroll(enable) pragma. This metadata is used +// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives. +static bool HasUnrollEnablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable"); +} + // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); @@ -644,12 +706,11 @@ static void SetLoopAlreadyUnrolled(Loop *L) { L->setLoopID(NewLoopID); } -bool LoopUnroll::canUnrollCompletely(Loop *L, unsigned Threshold, - unsigned PercentDynamicCostSavedThreshold, - unsigned DynamicCostSavingsDiscount, - uint64_t UnrolledCost, - uint64_t RolledDynamicCost) { - +static bool canUnrollCompletely(Loop *L, unsigned Threshold, + unsigned PercentDynamicCostSavedThreshold, + unsigned DynamicCostSavingsDiscount, + uint64_t UnrolledCost, + uint64_t RolledDynamicCost) { if (Threshold == NoThreshold) { DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n"); return true; @@ -697,58 +758,13 @@ bool LoopUnroll::canUnrollCompletely(Loop *L, unsigned Threshold, return false; } -unsigned LoopUnroll::selectUnrollCount( - const Loop *L, unsigned TripCount, bool PragmaFullUnroll, - unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP, - bool &SetExplicitly) { - SetExplicitly = true; - - // User-specified count (either as a command-line option or - // constructor parameter) has highest precedence. - unsigned Count = UserCount ? CurrentCount : 0; - - // If there is no user-specified count, unroll pragmas have the next - // highest precendence. - if (Count == 0) { - if (PragmaCount) { - Count = PragmaCount; - } else if (PragmaFullUnroll) { - Count = TripCount; - } - } - - if (Count == 0) - Count = UP.Count; - - if (Count == 0) { - SetExplicitly = false; - if (TripCount == 0) - // Runtime trip count. - Count = UnrollRuntimeCount; - else - // Conservative heuristic: if we know the trip count, see if we can - // completely unroll (subject to the threshold, checked below); otherwise - // try to find greatest modulo of the trip count which is still under - // threshold value. 
- Count = TripCount; - } - if (TripCount && Count > TripCount) - return TripCount; - return Count; -} - -bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - Function &F = *L->getHeader()->getParent(); - - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - +static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo &TTI, + AssumptionCache &AC, bool PreserveLCSSA, + Optional<unsigned> ProvidedCount, + Optional<unsigned> ProvidedThreshold, + Optional<bool> ProvidedAllowPartial, + Optional<bool> ProvidedRuntime) { BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); @@ -757,11 +773,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } bool PragmaFullUnroll = HasUnrollFullPragma(L); + bool PragmaEnableUnroll = HasUnrollEnablePragma(L); unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = PragmaFullUnroll || PragmaCount > 0; - - TargetTransformInfo::UnrollingPreferences UP; - getUnrollingPreferences(L, TTI, UP); + bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0; // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -777,11 +791,18 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); } - // Select an initial unroll count. This may be reduced later based - // on size thresholds. - bool CountSetExplicitly; - unsigned Count = selectUnrollCount(L, TripCount, PragmaFullUnroll, - PragmaCount, UP, CountSetExplicitly); + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, + ProvidedRuntime, PragmaCount, PragmaFullUnroll, PragmaEnableUnroll, + TripCount); + + unsigned Count = UP.Count; + bool CountSetExplicitly = Count != 0; + // Use a heuristic count if we didn't set anything explicitly. + if (!CountSetExplicitly) + Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount; + if (TripCount && Count > TripCount) + Count = TripCount; unsigned NumInlineCandidates; bool notDuplicatable; @@ -803,13 +824,6 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } - unsigned Threshold, PartialThreshold; - unsigned PercentDynamicCostSavedThreshold; - unsigned DynamicCostSavingsDiscount; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, - PercentDynamicCostSavedThreshold, - DynamicCostSavingsDiscount); - // Given Count, TripCount and thresholds determine the type of // unrolling which is to be performed. enum { Full = 0, Partial = 1, Runtime = 2 }; @@ -817,7 +831,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (TripCount && Count == TripCount) { Unrolling = Partial; // If the loop is really small, we don't need to run an expensive analysis. 
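The explicit-count plumbing above replaces the old selectUnrollCount precedence chain. Condensed into one standalone helper, the policy is as follows (a sketch under the names the patch itself uses; selectCount is hypothetical, and DefaultUnrollRuntimeCount is the existing runtime-unroll default):

// Explicit counts (pragma, command line, or TTI preference, already folded
// into UP.Count) win; otherwise fall back to a heuristic, and never unroll
// past a known trip count.
static unsigned selectCount(unsigned ExplicitCount, unsigned TripCount) {
  unsigned Count = ExplicitCount;
  if (Count == 0)
    Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount;
  if (TripCount && Count > TripCount)
    Count = TripCount;
  return Count;
}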
- if (canUnrollCompletely(L, Threshold, 100, DynamicCostSavingsDiscount, + if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, UnrolledSize, UnrolledSize)) { Unrolling = Full; } else { @@ -825,10 +839,12 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount)) - if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold, - DynamicCostSavingsDiscount, Cost->UnrolledCost, - Cost->RolledDynamicCost)) { + L, TripCount, DT, *SE, TTI, + UP.Threshold + UP.DynamicCostSavingsDiscount)) + if (canUnrollCompletely(L, UP.Threshold, + UP.PercentDynamicCostSavedThreshold, + UP.DynamicCostSavingsDiscount, + Cost->UnrolledCost, Cost->RolledDynamicCost)) { Unrolling = Full; } } @@ -840,22 +856,22 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Reduce count based on the type of unrolling and the threshold values. unsigned OriginalCount = Count; - bool AllowRuntime = - (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime); + bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || UP.Runtime; // Don't unroll a runtime trip count loop with unroll full pragma. if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { AllowRuntime = false; } if (Unrolling == Partial) { - bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + bool AllowPartial = PragmaEnableUnroll || UP.Partial; if (!AllowPartial && !CountSetExplicitly) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; } - if (PartialThreshold != NoThreshold && UnrolledSize > PartialThreshold) { + if (UP.PartialThreshold != NoThreshold && + UnrolledSize > UP.PartialThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - Count = (std::max(PartialThreshold, 3u)-2) / (LoopSize-2); + Count = (std::max(UP.PartialThreshold, 3u) - 2) / (LoopSize - 2); while (Count != 0 && TripCount % Count != 0) Count--; } @@ -867,7 +883,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Reduce unroll count to be the largest power-of-two factor of // the original count which satisfies the threshold limit. 
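Throughout this block the size model is UnrolledSize = (LoopSize - 2) * Count + 2, i.e. two instructions of fixed loop overhead plus the replicated body, and both reductions above simply invert it. A worked sketch of the partial-unroll cap (maxPartialCount is a hypothetical helper; the arithmetic matches the code above):

// Largest count whose unrolled size stays under the partial threshold,
// lowered further until it evenly divides the trip count.
static unsigned maxPartialCount(unsigned PartialThreshold, unsigned LoopSize,
                                unsigned TripCount) {
  unsigned Count = (std::max(PartialThreshold, 3u) - 2) / (LoopSize - 2);
  while (Count != 0 && TripCount % Count != 0)
    --Count;
  // E.g. PartialThreshold=150, LoopSize=12, TripCount=100: 148/10 = 14,
  // then stepped down to 10, the largest divisor of 100 not above 14.
  return Count;
}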
- while (Count != 0 && UnrolledSize > PartialThreshold) { + while (Count != 0 && UnrolledSize > UP.PartialThreshold) { Count >>= 1; UnrolledSize = (LoopSize-2) * Count + 2; } @@ -887,23 +903,27 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); - if (PragmaFullUnroll && PragmaCount == 0) { - if (TripCount && Count != TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because unrolled size is too large."); - } else if (!TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); - } - } else if (PragmaCount > 0 && Count != OriginalCount) { + if ((PragmaCount > 0) && Count != OriginalCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, "Unable to unroll loop the number of times directed by " "unroll_count pragma because unrolled size is too large."); + } else if (PragmaFullUnroll && !TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop as directed by unroll(enable) pragma because " + "unrolled size is too large."); + } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll pragma because " + "unrolled size is too large."); } } @@ -915,8 +935,96 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Unroll the loop. 
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, - TripMultiple, LI, this, &LPM, &AC)) + TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA)) return false; return true; } + +namespace { +class LoopUnroll : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + LoopUnroll(Optional<unsigned> Threshold = None, + Optional<unsigned> Count = None, + Optional<bool> AllowPartial = None, Optional<bool> Runtime = None) + : LoopPass(ID), ProvidedCount(Count), ProvidedThreshold(Threshold), + ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime) { + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + + Optional<unsigned> ProvidedCount; + Optional<unsigned> ProvidedThreshold; + Optional<bool> ProvidedAllowPartial; + Optional<bool> ProvidedRuntime; + + bool runOnLoop(Loop *L, LPPassManager &) override { + if (skipOptnoneFunction(L)) + return false; + + Function &F = *L->getHeader()->getParent(); + + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + + return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, PreserveLCSSA, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime); + } + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. + // If loop unroll does not preserve dom info then LCSSA pass on next + // loop will receive invalid dom info. + // For now, recreate dom info, if loop is unrolled. + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} + +char LoopUnroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) + +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, + int Runtime) { + // TODO: It would make more sense for this function to take the optionals + // directly, but that's dangerous since it would silently break out of tree + // callers. + return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold), + Count == -1 ? None : Optional<unsigned>(Count), + AllowPartial == -1 ? None + : Optional<bool>(AllowPartial), + Runtime == -1 ? 
None : Optional<bool>(Runtime)); +} + +Pass *llvm::createSimpleLoopUnrollPass() { + return llvm::createLoopUnrollPass(-1, -1, 0, 0); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index cbc563b..95d7f8a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -37,6 +38,10 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -70,6 +75,19 @@ static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden); +static cl::opt<bool> +LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency", + cl::init(false), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to access PGO " + "heuristics to minimize code growth in cold regions.")); + +static cl::opt<unsigned> +ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden, + cl::desc("Coldness threshold in percentage. The loop header frequency " + "(relative to the entry frequency) is compared with this " + "threshold to determine if non-trivial unswitching should be " + "enabled.")); + namespace { class LUAnalysisCache { @@ -148,12 +166,19 @@ namespace { LPPassManager *LPM; AssumptionCache *AC; - // LoopProcessWorklist - Used to check if second loop needs processing - // after RewriteLoopBodyWithConditionConstant rewrites first loop. + // Used to check if second loop needs processing after + // RewriteLoopBodyWithConditionConstant rewrites first loop. std::vector<Loop*> LoopProcessWorklist; LUAnalysisCache BranchesInfo; + bool EnabledPGO; + + // BFI and ColdEntryFreq are only used when PGO and + // LoopUnswitchWithBlockFrequency are enabled. + BlockFrequencyInfo BFI; + BlockFrequency ColdEntryFreq; + bool OptimizeForSize; bool redoLoop; @@ -192,9 +217,11 @@ namespace { AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: @@ -210,7 +237,10 @@ namespace { /// Split all of the edges from inside the loop to their exit blocks. /// Update the appropriate Phi nodes as we do so. 
-    void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+    void SplitExitEdges(Loop *L,
+                        const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+    bool TryTrivialLoopUnswitch(bool &Changed);
 
     bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
                               TerminatorInst *TI = nullptr);
@@ -229,9 +259,6 @@
                                   TerminatorInst *TI);
 
     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
-
-    bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr,
-                                    BasicBlock **LoopExit = nullptr);
-
   };
 }
 
@@ -367,9 +394,8 @@ Pass *llvm::createLoopUnswitchPass(bool Os) {
   return new LoopUnswitch(Os);
 }
 
-/// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is
-/// invariant in the loop, or has an invariant piece, return the invariant.
-/// Otherwise, return null.
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant. Otherwise, return null.
 static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
 
   // We have started analyzing a new instruction; increment the counter of
   // scanned instructions.
@@ -411,11 +437,23 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
                                              *L->getHeader()->getParent());
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   LPM = &LPM_Ref;
-  DominatorTreeWrapperPass *DTWP =
-      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   currentLoop = L;
   Function *F = currentLoop->getHeader()->getParent();
+
+  EnabledPGO = F->getEntryCount().hasValue();
+
+  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
+    BranchProbabilityInfo BPI(*F, *LI);
+    BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
+
+    // Use BranchProbability to compute a minimum frequency based on
+    // function entry baseline frequency. Loops with headers below this
+    // frequency are considered as cold.
+    const BranchProbability ColdProb(ColdnessThreshold, 100);
+    ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
+  }
+
   bool Changed = false;
   do {
     assert(currentLoop->isLCSSAForm(*DT));
@@ -423,16 +461,13 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
     Changed |= processCurrentLoop();
   } while(redoLoop);
 
-  if (Changed) {
-    // FIXME: Reconstruct dom info, because it is not preserved properly.
-    if (DT)
-      DT->recalculate(*F);
-  }
+  // FIXME: Reconstruct dom info, because it is not preserved properly.
+  if (Changed)
+    DT->recalculate(*F);
   return Changed;
 }
 
-/// processCurrentLoop - Do actual work and unswitch loop if possible
-/// and profitable.
+/// Do the actual work and unswitch the loop if possible and profitable.
 bool LoopUnswitch::processCurrentLoop() {
   bool Changed = false;
@@ -452,14 +487,48 @@ bool LoopUnswitch::processCurrentLoop() {
 
   LLVMContext &Context = loopHeader->getContext();
 
-  // Probably we reach the quota of branches for this loop. If so
-  // stop unswitching.
+  // Analyze the loop cost, and stop unswitching if the loop content cannot
+  // be duplicated.
  if (!BranchesInfo.countLoop(
          currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
                           *currentLoop->getHeader()->getParent()),
          AC))
    return false;
 
+  // Try trivial unswitching first, before looping over the other basic blocks
+  // in the loop.
+  if (TryTrivialLoopUnswitch(Changed)) {
+    return true;
+  }
+
+  // Do not unswitch loops containing convergent operations, as we might be
+  // making them control dependent on the unswitch value when they were not
+  // before.
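The BFI/ColdEntryFreq setup in runOnLoop above feeds a single comparison made later in processCurrentLoop. Pulled out into a free function, it would look roughly like this (a sketch only; the pass keeps BFI and ColdEntryFreq as members rather than recomputing them):

// A loop is considered cold when its header runs less often than
// ColdnessThreshold percent of the function entry frequency; cold loops are
// skipped to avoid code growth that cannot pay off.
static bool isColdLoop(const BlockFrequencyInfo &BFI, const Loop *L,
                       unsigned ColdnessThresholdPercent) {
  BlockFrequency ColdEntryFreq =
      BlockFrequency(BFI.getEntryFreq()) *
      BranchProbability(ColdnessThresholdPercent, 100);
  return BFI.getBlockFreq(L->getHeader()) < ColdEntryFreq;
}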
+ // FIXME: This could be refined to only bail if the convergent operation is + // not already control-dependent on the unswitch value. + for (const auto BB : currentLoop->blocks()) { + for (auto &I : *BB) { + auto CS = CallSite(&I); + if (!CS) continue; + if (CS.hasFnAttr(Attribute::Convergent)) + return false; + } + } + + // Do not do non-trivial unswitch while optimizing for size. + // FIXME: Use Function::optForSize(). + if (OptimizeForSize || + loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) + return false; + + if (LoopUnswitchWithBlockFrequency && EnabledPGO) { + // Compute the weighted frequency of the hottest block in the + // loop (loopHeader in this case since inner loops should be + // processed before outer loop). If it is less than ColdFrequency, + // we should not unswitch. + BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader); + if (LoopEntryFreq < ColdEntryFreq) + return false; + } + // Loop over all of the basic blocks in the loop. If we find an interior // block that is branching on a loop-invariant condition, we can unswitch this // loop. @@ -528,8 +597,8 @@ bool LoopUnswitch::processCurrentLoop() { return Changed; } -/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the -/// loop with no side effects (including infinite loops). +/// Check to see if all paths from BB exit the loop with no side effects +/// (including infinite loops). /// /// If true, we return true and set ExitBB to the block we /// exit through. @@ -566,9 +635,9 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, return true; } -/// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the -/// process. If so, return the block that is exited to, otherwise return null. +/// Return true if the specified block unconditionally leads to an exit from +/// the specified loop, and has no side-effects in the process. If so, return +/// the block that is exited to, otherwise return null. static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; Visited.insert(L->getHeader()); // Branches to header make infinite loops. @@ -578,105 +647,11 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { return nullptr; } -/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is -/// trivial: that is, that the condition controls whether or not the loop does -/// anything at all. If this is a trivial condition, unswitching produces no -/// code duplications (equivalently, it produces a simpler loop and a new empty -/// loop, which gets deleted). -/// -/// If this is a trivial condition, return true, otherwise return false. When -/// returning true, this sets Cond and Val to the condition that controls the -/// trivial condition: when Cond dynamically equals Val, the loop is known to -/// exit. Finally, this sets LoopExit to the BB that the loop exits to when -/// Cond == Val. -/// -bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, - BasicBlock **LoopExit) { - BasicBlock *Header = currentLoop->getHeader(); - TerminatorInst *HeaderTerm = Header->getTerminator(); - LLVMContext &Context = Header->getContext(); - - BasicBlock *LoopExitBB = nullptr; - if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { - // If the header block doesn't end with a conditional branch on Cond, we - // can't handle it. 
- if (!BI->isConditional() || BI->getCondition() != Cond) - return false; - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(0)))) { - if (Val) *Val = ConstantInt::getTrue(Context); - } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(1)))) { - if (Val) *Val = ConstantInt::getFalse(Context); - } - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { - // If this isn't a switch on Cond, we can't handle it. - if (SI->getCondition() != Cond) return false; - - // Check to see if a successor of the switch is guaranteed to go to the - // latch block or exit through a one exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - // Note that we can't trivially unswitch on the default case or - // on already unswitched cases. - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - BasicBlock *LoopExitCandidate; - if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, - i.getCaseSuccessor()))) { - // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt *CaseVal = i.getCaseValue(); - - // Check that it was not unswitched before, since already unswitched - // trivial vals are looks trivial too. - if (BranchesInfo.isUnswitched(SI, CaseVal)) - continue; - LoopExitBB = LoopExitCandidate; - if (Val) *Val = CaseVal; - break; - } - } - } - - // If we didn't find a single unique LoopExit block, or if the loop exit block - // contains phi nodes, this isn't trivial. - if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) - return false; // Can't handle this. - - if (LoopExit) *LoopExit = LoopExitBB; - - // We already know that nothing uses any scalar values defined inside of this - // loop. As such, we just have to check to see if this loop will execute any - // side-effecting instructions (e.g. stores, calls, volatile loads) in the - // part of the loop that the code *would* execute. We already checked the - // tail, check the header now. - for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I) - if (I->mayHaveSideEffects()) - return false; - return true; -} - -/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when -/// LoopCond == Val to simplify the loop. If we decide that this is profitable, +/// We have found that we can unswitch currentLoop when LoopCond == Val to +/// simplify the loop. If we decide that this is profitable, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, TerminatorInst *TI) { - Function *F = loopHeader->getParent(); - Constant *CondVal = nullptr; - BasicBlock *ExitBlock = nullptr; - - if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { - // If the condition is trivial, always unswitch. There is no code growth - // for this case. - UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock, TI); - return true; - } - // Check to see if it would be profitable to unswitch current loop. if (!BranchesInfo.CostAllowsUnswitching()) { DEBUG(dbgs() << "NOT unswitching loop %" @@ -687,32 +662,27 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, return false; } - // Do not do non-trivial unswitch while optimizing for size. 
- if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize)) - return false; - UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI); return true; } -/// CloneLoop - Recursively clone the specified loop and all of its children, +/// Recursively clone the specified loop and all of its children, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop *New = new Loop(); - LPM->insertLoop(New, PL); + Loop &New = LPM->addLoop(PL); // Add all of the blocks in L to the new loop. for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) if (LI->getLoopFor(*I) == L) - New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); + New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); // Add all of the subloops to the new loop. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) - CloneLoop(*I, New, VM, LI, LPM); + CloneLoop(*I, &New, VM, LI, LPM); - return New; + return &New; } static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, @@ -744,15 +714,15 @@ static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, } } // fallthrough. + case LLVMContext::MD_make_implicit: case LLVMContext::MD_dbg: DstInst->setMetadata(MD.first, MD.second); } } } -/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values -/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest. Insert the -/// code immediately before InsertPt. +/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst, +/// otherwise branch to FalseDest. Insert the code immediately before InsertPt. void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, @@ -782,11 +752,11 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, SplitCriticalEdge(BI, 1, Options); } -/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable -/// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no -/// side-effects), unswitch it. This doesn't involve any code duplication, just -/// moving the conditional branch outside of the loop and updating loop info. +/// Given a loop that has a trivial unswitchable condition in it (a cond branch +/// from its header block to its latch block, where the path through the loop +/// that doesn't execute its body has no side-effects), unswitch it. This +/// doesn't involve any code duplication, just moving the conditional branch +/// outside of the loop and updating loop info. void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, BasicBlock *ExitBlock, TerminatorInst *TI) { @@ -810,7 +780,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); - BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); + BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI); // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. 
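TryTrivialLoopUnswitch, introduced below, handles the "free" case: the condition decides whether the loop does anything at all, so hoisting it duplicates no code. In source-level terms (illustrative only; Inv is loop-invariant and work() is a placeholder, not a function from this patch):

// Before: the invariant condition guards an immediate, side-effect-free exit.
for (int I = 0; I < N; ++I) {
  if (Inv)
    break;
  work(I);
}

// After trivial unswitching: the branch moves out of the loop and the loop
// body still exists exactly once.
if (!Inv)
  for (int I = 0; I < N; ++I)
    work(I);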
@@ -829,8 +799,155 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, ++NumTrivial; } -/// SplitExitEdges - Split all of the edges from inside the loop to their exit -/// blocks. Update the appropriate Phi nodes as we do so. +/// Check if the first non-constant condition starting from the loop header is +/// a trivial unswitch condition: that is, a condition controls whether or not +/// the loop does anything at all. If it is a trivial condition, unswitching +/// produces no code duplications (equivalently, it produces a simpler loop and +/// a new empty loop, which gets deleted). Therefore always unswitch trivial +/// condition. +bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { + BasicBlock *CurrentBB = currentLoop->getHeader(); + TerminatorInst *CurrentTerm = CurrentBB->getTerminator(); + LLVMContext &Context = CurrentBB->getContext(); + + // If loop header has only one reachable successor (currently via an + // unconditional branch or constant foldable conditional branch, but + // should also consider adding constant foldable switch instruction in + // future), we should keep looking for trivial condition candidates in + // the successor as well. An alternative is to constant fold conditions + // and merge successors into loop header (then we only need to check header's + // terminator). The reason for not doing this in LoopUnswitch pass is that + // it could potentially break LoopPassManager's invariants. Folding dead + // branches could either eliminate the current loop or make other loops + // unreachable. LCSSA form might also not be preserved after deleting + // branches. The following code keeps traversing loop header's successors + // until it finds the trivial condition candidate (condition that is not a + // constant). Since unswitching generates branches with constant conditions, + // this scenario could be very common in practice. + SmallSet<BasicBlock*, 8> Visited; + + while (true) { + // If we exit loop or reach a previous visited block, then + // we can not reach any trivial condition candidates (unfoldable + // branch instructions or switch instructions) and no unswitch + // can happen. Exit and return false. + if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second) + return false; + + // Check if this loop will execute any side-effecting instructions (e.g. + // stores, calls, volatile loads) in the part of the loop that the code + // *would* execute. Check the header first. + for (Instruction &I : *CurrentBB) + if (I.mayHaveSideEffects()) + return false; + + // FIXME: add check for constant foldable switch instructions. + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + if (BI->isUnconditional()) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getTrue(Context)) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getFalse(Context)) { + CurrentBB = BI->getSuccessor(1); + } else { + // Found a trivial condition candidate: non-foldable conditional branch. + break; + } + } else { + break; + } + + CurrentTerm = CurrentBB->getTerminator(); + } + + // CondVal is the condition that controls the trivial condition. + // LoopExitBB is the BasicBlock that loop exits when meets trivial condition. + Constant *CondVal = nullptr; + BasicBlock *LoopExitBB = nullptr; + + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + // If this isn't branching on an invariant condition, we can't unswitch it. 
+ if (!BI->isConditional()) + return false; + + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not + // partial LIV which could occur in and/or) + if (!LoopCond || LoopCond != BI->getCondition()) + return false; + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any + // side-effects. If so, determine the value of Cond that causes + // it to do this. + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(0)))) { + CondVal = ConstantInt::getTrue(Context); + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(1)))) { + CondVal = ConstantInt::getFalse(Context); + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + CurrentTerm); + ++NumBranches; + return true; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { + // If this isn't switching on an invariant condition, we can't unswitch it. + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not + // partial LIV which could occur in and/or) + if (!LoopCond || LoopCond != SI->getCondition()) + return false; + + // Check to see if a successor of the switch is guaranteed to go to the + // latch block or exit through a one exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. + // Note that we can't trivially unswitch on the default case or + // on already unswitched cases. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *LoopExitCandidate; + if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, + i.getCaseSuccessor()))) { + // Okay, we found a trivial case, remember the value that is trivial. + ConstantInt *CaseVal = i.getCaseValue(); + + // Check that it was not unswitched before, since already unswitched + // trivial vals are looks trivial too. + if (BranchesInfo.isUnswitched(SI, CaseVal)) + continue; + LoopExitBB = LoopExitCandidate; + CondVal = CaseVal; + break; + } + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + nullptr); + ++NumSwitches; + return true; + } + return false; +} + +/// Split all of the edges from inside the loop to their exit blocks. +/// Update the appropriate Phi nodes as we do so. void LoopUnswitch::SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks){ @@ -841,15 +958,14 @@ void LoopUnswitch::SplitExitEdges(Loop *L, // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", - /*AliasAnalysis*/ nullptr, DT, LI, + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, /*PreserveLCSSA*/ true); } } -/// UnswitchNontrivialCondition - We determined that the loop is profitable -/// to unswitch when LIC equal Val. 
Split it into loop versions and test the -/// condition outside of either loop. Return the loops created as Out1/Out2. +/// We determined that the loop is profitable to unswitch when LIC equal Val. +/// Split it into loop versions and test the condition outside of either loop. +/// Return the loops created as Out1/Out2. void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, Loop *L, TerminatorInst *TI) { Function *F = loopHeader->getParent(); @@ -858,8 +974,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, << " blocks] in Function " << F->getName() << " when '" << *Val << "' == " << *LIC << "\n"); - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) - SE->forgetLoop(L); + if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) + SEWP->getSE().forgetLoop(L); LoopBlocks.clear(); NewBlocks.clear(); @@ -901,8 +1017,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // Splice the newly inserted blocks into the function right before the // original preheader. - F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), - NewBlocks[0], F->end()); + F->getBasicBlockList().splice(NewPreheader->getIterator(), + F->getBasicBlockList(), + NewBlocks[0]->getIterator(), F->end()); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -944,7 +1061,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { PHINode *PN = PHINode::Create(LPad->getType(), 0, "", - ExitSucc->getFirstInsertionPt()); + &*ExitSucc->getFirstInsertionPt()); for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); I != E; ++I) { @@ -960,7 +1077,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + RemapInstruction(&*I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); @@ -994,8 +1112,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); } -/// RemoveFromWorklist - Remove all instances of I from the worklist vector -/// specified. +/// Remove all instances of I from the worklist vector specified. static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { @@ -1003,7 +1120,7 @@ static void RemoveFromWorklist(Instruction *I, Worklist.end()); } -/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the +/// When we find that I really equals V, remove I from the /// program, replacing all uses with V and update the worklist. static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, @@ -1025,9 +1142,9 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has -// the value specified by Val in the specified loop, or we know it does NOT have -// that value. Rewrite any uses of LIC or of properties correlated to it. 
+/// We know either that the value LIC has the value specified by Val in the +/// specified loop, or we know it does NOT have that value. +/// Rewrite any uses of LIC or of properties correlated to it. void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool IsEqual) { @@ -1138,18 +1255,16 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // domtree here -- instead we force it to do a full recomputation // after the pass is complete -- but we do need to inform it of // new blocks. - if (DT) - DT->addNewBlock(Abort, NewSISucc); + DT->addNewBlock(Abort, NewSISucc); } SimplifyCode(Worklist, L); } -/// SimplifyCode - Okay, now that we have simplified some instructions in the -/// loop, walk over it and constant prop, dce, and fold control flow where -/// possible. Note that this is effectively a very simple loop-structure-aware -/// optimizer. During processing of this loop, L could very well be deleted, so -/// it must not be used. +/// Now that we have simplified some instructions in the loop, walk over it and +/// constant prop, dce, and fold control flow where possible. Note that this is +/// effectively a very simple loop-structure-aware optimizer. During processing +/// of this loop, L could very well be deleted, so it must not be used. /// /// FIXME: When the loop optimizer is more mature, separate this out to a new /// pass. @@ -1207,8 +1322,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { Succ->replaceAllUsesWith(Pred); // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), - Succ->end()); + Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), + Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 3314e1e..41511bc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -22,7 +22,7 @@ using namespace llvm; #define DEBUG_TYPE "loweratomic" static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { - IRBuilder<> Builder(CXI->getParent(), CXI); + IRBuilder<> Builder(CXI); Value *Ptr = CXI->getPointerOperand(); Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); @@ -41,7 +41,7 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { } static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { - IRBuilder<> Builder(RMWI->getParent(), RMWI); + IRBuilder<> Builder(RMWI); Value *Ptr = RMWI->getPointerOperand(); Value *Val = RMWI->getValOperand(); @@ -120,7 +120,7 @@ namespace { return false; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) Changed |= LowerFenceInst(FI); else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0c47cbd..2ace902 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -139,7 +139,7 @@ static bool lowerExpectIntrinsic(Function &F) { ExpectIntrinsicsHandled++; } - // remove llvm.expect intrinsics. 
+ // Remove llvm.expect intrinsics. for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { CallInst *CI = dyn_cast<CallInst>(BI++); if (!CI) diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 85012af..6b43b0f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -30,7 +31,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" -#include <list> +#include <algorithm> using namespace llvm; #define DEBUG_TYPE "memcpyopt" @@ -71,9 +72,9 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, return Offset; } -/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a -/// constant offset, and return that constant offset. For example, Ptr1 might -/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. +/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and +/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 +/// might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, const DataLayout &DL) { Ptr1 = Ptr1->stripPointerCasts(); @@ -125,7 +126,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, } -/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// Represents a range of memset'd bytes with the ByteVal value. /// This allows us to analyze stores like: /// store 0 -> P+1 /// store 0 -> P+0 @@ -164,8 +165,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If any of the stores are a memset, then it is always good to extend the // memset. - for (unsigned i = 0, e = TheStores.size(); i != e; ++i) - if (!isa<StoreInst>(TheStores[i])) + for (Instruction *SI : TheStores) + if (!isa<StoreInst>(SI)) return true; // Assume that the code generator is capable of merging pairs of stores @@ -189,7 +190,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; + unsigned NumByteStores = Bytes % MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -200,15 +201,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { namespace { class MemsetRanges { - /// Ranges - A sorted list of the memset ranges. We use std::list here - /// because each element is relatively large and expensive to copy. - std::list<MemsetRange> Ranges; - typedef std::list<MemsetRange>::iterator range_iterator; + /// A sorted list of the memset ranges. 
+ SmallVector<MemsetRange, 8> Ranges; + typedef SmallVectorImpl<MemsetRange>::iterator range_iterator; const DataLayout &DL; public: MemsetRanges(const DataLayout &DL) : DL(DL) {} - typedef std::list<MemsetRange>::const_iterator const_iterator; + typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } @@ -240,26 +240,20 @@ public: } // end anon namespace -/// addRange - Add a new store to the MemsetRanges data structure. This adds a +/// Add a new store to the MemsetRanges data structure. This adds a /// new range for the specified store at the specified offset, merging into /// existing ranges as appropriate. -/// -/// Do a linear search of the ranges to see if this can be joined and/or to -/// find the insertion point in the list. We keep the ranges sorted for -/// simplicity here. This is a linear search of a linked list, which is ugly, -/// however the number of ranges is limited, so this won't get crazy slow. void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst) { int64_t End = Start+Size; - range_iterator I = Ranges.begin(), E = Ranges.end(); - while (I != E && Start > I->End) - ++I; + range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start, + [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; }); // We now know that I == E, in which case we didn't find anything to merge // with, or that Start <= I->End. If End < I->Start or I == E, then we need // to insert a new range. Handle this now. - if (I == E || End < I->Start) { + if (I == Ranges.end() || End < I->Start) { MemsetRange &R = *Ranges.insert(I, MemsetRange()); R.Start = Start; R.End = End; @@ -295,7 +289,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, if (End > I->End) { I->End = End; range_iterator NextI = I; - while (++NextI != E && End >= NextI->Start) { + while (++NextI != Ranges.end() && End >= NextI->Start) { // Merge the range in. I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); if (NextI->End > I->End) @@ -331,9 +325,9 @@ namespace { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -357,7 +351,7 @@ namespace { char MemCpyOpt::ID = 0; } -// createMemCpyOptPass - The public interface to this file... +/// The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", @@ -366,14 +360,15 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -/// tryMergingIntoMemset - When scanning forward over instructions, we look for -/// some other patterns to fold away. 
In particular, this looks for stores to -/// neighboring locations of memory. If it sees enough consecutive ones, it -/// attempts to merge them together into a memcpy/memset. +/// When scanning forward over instructions, we look for some other patterns to +/// fold away. In particular, this looks for stores to neighboring locations of +/// memory. If it sees enough consecutive ones, it attempts to merge them +/// together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { const DataLayout &DL = StartInst->getModule()->getDataLayout(); @@ -384,7 +379,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // are stored. MemsetRanges Ranges(DL); - BasicBlock::iterator BI = StartInst; + BasicBlock::iterator BI(StartInst); for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { // If the instruction is readnone, ignore it, otherwise bail out. We @@ -439,14 +434,12 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // If we create any memsets, we put it right before the first instruction that // isn't part of the memset block. This ensure that the memset is dominated // by any addressing instruction needed by the start of the block. - IRBuilder<> Builder(BI); + IRBuilder<> Builder(&*BI); // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. Instruction *AMemSet = nullptr; - for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); - I != E; ++I) { - const MemsetRange &Range = *I; + for (const MemsetRange &Range : Ranges) { if (Range.TheStores.size() == 1) continue; @@ -470,19 +463,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); DEBUG(dbgs() << "Replace stores:\n"; - for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) - dbgs() << *Range.TheStores[i] << '\n'; + for (Instruction *SI : Range.TheStores) + dbgs() << *SI << '\n'; dbgs() << "With: " << *AMemSet << '\n'); if (!Range.TheStores.empty()) AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); // Zap all the stores. - for (SmallVectorImpl<Instruction *>::const_iterator - SI = Range.TheStores.begin(), - SE = Range.TheStores.end(); SI != SE; ++SI) { - MD->removeInstruction(*SI); - (*SI)->eraseFromParent(); + for (Instruction *SI : Range.TheStores) { + MD->removeInstruction(SI); + SI->eraseFromParent(); } ++NumMemSetInfer; } @@ -490,17 +481,111 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, return AMemSet; } +static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, + const LoadInst *LI) { + unsigned StoreAlign = SI->getAlignment(); + if (!StoreAlign) + StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); + unsigned LoadAlign = LI->getAlignment(); + if (!LoadAlign) + LoadAlign = DL.getABITypeAlignment(LI->getType()); + + return std::min(StoreAlign, LoadAlign); +} bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; + + // Avoid merging nontemporal stores since the resulting + // memcpy/memset would not be able to preserve the nontemporal hint. + // In theory we could teach how to propagate the !nontemporal metadata to + // memset calls. 
However, that change would force the backend to + // conservatively expand !nontemporal memset calls back to sequences of + // store instructions (effectively undoing the merging). + if (SI->getMetadata(LLVMContext::MD_nontemporal)) + return false; + const DataLayout &DL = SI->getModule()->getDataLayout(); - // Detect cases where we're performing call slot forwarding, but - // happen to be using a load-store pair to implement it, rather than - // a memcpy. + // Load to store forwarding can be interpreted as memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { + + auto *T = LI->getType(); + if (T->isAggregateType()) { + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + MemoryLocation LoadLoc = MemoryLocation::get(LI); + + // We use alias analysis to check if an instruction may store to + // the memory we load from in between the load and the store. If + // such an instruction is found, we try to promote there instead + // of at the store position. + Instruction *P = SI; + for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator(); + I != E; ++I) { + if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod)) + continue; + + // We found an instruction that may write to the loaded memory. + // We can try to promote at this position instead of the store + // position if nothing alias the store memory after this and the store + // destination is not in the range. + P = &*I; + for (; I != E; ++I) { + MemoryLocation StoreLoc = MemoryLocation::get(SI); + if (&*I == SI->getOperand(1) || + AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { + P = nullptr; + break; + } + } + + break; + } + + // If a valid insertion position is found, then we can promote + // the load/store pair to a memcpy. + if (P) { + // If we load from memory that may alias the memory we store to, + // memmove must be used to preserve semantic. If not, memcpy can + // be used. + bool UseMemMove = false; + if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc)) + UseMemMove = true; + + unsigned Align = findCommonAlignment(DL, SI, LI); + uint64_t Size = DL.getTypeStoreSize(T); + + IRBuilder<> Builder(P); + Instruction *M; + if (UseMemMove) + M = Builder.CreateMemMove(SI->getPointerOperand(), + LI->getPointerOperand(), Size, + Align, SI->isVolatile()); + else + M = Builder.CreateMemCpy(SI->getPointerOperand(), + LI->getPointerOperand(), Size, + Align, SI->isVolatile()); + + DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI + << " => " << *M << "\n"); + + MD->removeInstruction(SI); + SI->eraseFromParent(); + MD->removeInstruction(LI); + LI->eraseFromParent(); + ++NumMemCpyInstr; + + // Make sure we do not invalidate the iterator. + BBI = M->getIterator(); + return true; + } + } + + // Detect cases where we're performing call slot forwarding, but + // happen to be using a load-store pair to implement it, rather than + // a memcpy. MemDepResult ldep = MD->getDependency(LI); CallInst *C = nullptr; if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) @@ -509,11 +594,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. 
- AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation StoreLoc = MemoryLocation::get(SI); - for (BasicBlock::iterator I = --BasicBlock::iterator(SI), - E = C; I != E; --I) { - if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { + for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); + I != E; --I) { + if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { C = nullptr; break; } @@ -521,18 +606,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } if (C) { - unsigned storeAlign = SI->getAlignment(); - if (!storeAlign) - storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); - unsigned loadAlign = LI->getAlignment(); - if (!loadAlign) - loadAlign = DL.getABITypeAlignment(LI->getType()); - bool changed = performCallSlotOptzn( LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), - std::min(storeAlign, loadAlign), C); + findCommonAlignment(DL, SI, LI), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -551,13 +629,39 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. - if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) + auto *V = SI->getOperand(0); + if (Value *ByteVal = isBytewiseValue(V)) { if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } + // If we have an aggregate, we try to promote it to memset regardless + // of opportunity for merging as it can expose optimization opportunities + // in subsequent passes. + auto *T = V->getType(); + if (T->isAggregateType()) { + uint64_t Size = DL.getTypeStoreSize(T); + unsigned Align = SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(T); + IRBuilder<> Builder(SI); + auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, + Size, Align, SI->isVolatile()); + + DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); + + MD->removeInstruction(SI); + SI->eraseFromParent(); + NumMemSetInfer++; + + // Make sure we do not invalidate the iterator. + BBI = M->getIterator(); + return true; + } + } + return false; } @@ -567,14 +671,14 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } return false; } -/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// Takes a memcpy and a call that it depends on, /// and checks for the possibility of a call slot optimization by having /// the call write its result directly into the destination of the memcpy. bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, @@ -710,12 +814,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // unexpected manner, for example via a global, which we deduce from // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. 
- AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) return false; // All the checks have passed, so do the transformation. @@ -749,11 +853,9 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Update AA metadata // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - }; + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_invariant_group}; combineMetadata(C, cpy, KnownIDs); // Remove the memcpy. @@ -763,10 +865,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return true; } -/// processMemCpyMemCpyDependence - We've found that the (upward scanning) -/// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to -/// copy from MDep's input if we can. -/// +/// We've found that the (upward scanning) memory dependence of memcpy 'M' is +/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can. bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // We can only transforms memcpy's where the dest of one is the source of the // other. @@ -788,7 +888,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: @@ -802,8 +902,9 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = MD->getPointerDependencyFrom( - MemoryLocation::getForSource(MDep), false, M, M->getParent()); + MemDepResult SourceDep = + MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, + M->getIterator(), M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -860,8 +961,9 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy, return false; // Check that there are no other dependencies on the memset destination. - MemDepResult DstDepInfo = MD->getPointerDependencyFrom( - MemoryLocation::getForDest(MemSet), false, MemCpy, MemCpy->getParent()); + MemDepResult DstDepInfo = + MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false, + MemCpy->getIterator(), MemCpy->getParent()); if (DstDepInfo.getInst() != MemSet) return false; @@ -936,7 +1038,7 @@ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, return true; } -/// processMemCpy - perform simplification of memcpy's. If we have memcpy A +/// Perform simplification of memcpy's. 
If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy @@ -998,8 +1100,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { } MemoryLocation SrcLoc = MemoryLocation::getForSource(M); - MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, - M, M->getParent()); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( + SrcLoc, true, M->getIterator(), M->getParent()); if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) @@ -1037,10 +1139,10 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return false; } -/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst -/// are guaranteed not to alias. +/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed +/// not to alias. bool MemCpyOpt::processMemMove(MemMoveInst *M) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); if (!TLI->has(LibFunc::memmove)) return false; @@ -1053,12 +1155,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); // If not, then we know we can transform this. - Module *Mod = M->getParent()->getParent()->getParent(); Type *ArgTys[3] = { M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType() }; - M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, - ArgTys)); + M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(), + Intrinsic::memcpy, ArgTys)); // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. @@ -1068,7 +1169,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { return true; } -/// processByValArgument - This is called on every byval argument in call sites. +/// This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); // Find out what feeds this byval argument. @@ -1076,8 +1177,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = DL.getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom( - MemoryLocation(ByValArg, ByValSize), true, CS.getInstruction(), - CS.getInstruction()->getParent()); + MemoryLocation(ByValArg, ByValSize), true, + CS.getInstruction()->getIterator(), CS.getInstruction()->getParent()); if (!DepInfo.isClobber()) return false; @@ -1119,9 +1220,9 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = - MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, - CS.getInstruction(), MDep->getParent()); + MemDepResult SourceDep = MD->getPointerDependencyFrom( + MemoryLocation::getForSource(MDep), false, + CS.getInstruction()->getIterator(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -1140,7 +1241,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { return true; } -/// iterateOnFunction - Executes one iteration of MemCpyOpt. 
+/// Executes one iteration of MemCpyOpt. bool MemCpyOpt::iterateOnFunction(Function &F) { bool MadeChange = false; @@ -1148,7 +1249,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. - Instruction *I = BI++; + Instruction *I = &*BI++; bool RepeatInstruction = false; @@ -1177,9 +1278,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { return MadeChange; } -// MemCpyOpt::runOnFunction - This is the main transformation entry point for a -// function. -// +/// This is the main transformation entry point for a function. bool MemCpyOpt::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 643f374..c812d61 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -78,6 +78,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -91,6 +92,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> + using namespace llvm; #define DEBUG_TYPE "mldst-motion" @@ -106,7 +108,7 @@ class MergedLoadStoreMotion : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - explicit MergedLoadStoreMotion(void) + MergedLoadStoreMotion() : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); } @@ -116,10 +118,11 @@ public: private: // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); } // Helper routines @@ -156,7 +159,7 @@ private: }; char MergedLoadStoreMotion::ID = 0; -} +} // anonymous namespace /// /// \brief createMergedLoadStoreMotionPass - The public interface to this file. @@ -169,7 +172,8 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -236,12 +240,11 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { /// being loaded or protect against the load from happening /// it is considered a hoist barrier. 
/// - bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI) { MemoryLocation Loc = MemoryLocation::get(LI); - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod); } /// @@ -256,7 +259,7 @@ LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; ++BBI) { - Instruction *Inst = BBI; + Instruction *Inst = &*BBI; // Only merge and hoist loads when their result in used only in BB if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1)) @@ -293,7 +296,7 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, // Intersect optional metadata. HoistCand->intersectOptionalDataWith(ElseInst); - HoistCand->dropUnknownMetadata(); + HoistCand->dropUnknownNonDebugMetadata(); // Prepend point for instruction insert Instruction *HoistPt = BB->getTerminator(); @@ -363,8 +366,7 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { int NLoads = 0; for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); BBI != BBE;) { - - Instruction *I = BBI; + Instruction *I = &*BBI; ++BBI; // Only move non-simple (atomic, volatile) loads. @@ -394,11 +396,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { /// value being stored or protect against the store from /// happening it is considered a sink barrier. /// - bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, const Instruction &End, MemoryLocation Loc) { - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef); } /// @@ -438,23 +439,16 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Create a phi if the values mismatch. - PHINode *NewPN = 0; + PHINode *NewPN = nullptr; Value *Opd1 = S0->getValueOperand(); Value *Opd2 = S1->getValueOperand(); if (Opd1 != Opd2) { NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", - BB->begin()); + &BB->front()); NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); - if (NewPN->getType()->getScalarType()->isPointerTy()) { - // AA needs to be informed when a PHI-use of the pointer value is added - for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { - unsigned J = PHINode::getOperandNumForIncomingValue(I); - AA->addEscapingUse(NewPN->getOperandUse(J)); - } - if (MD) - MD->invalidateCachedPointerInfo(NewPN); - } + if (MD && NewPN->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(NewPN); } return NewPN; } @@ -479,12 +473,12 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); // Intersect optional metadata. S0->intersectOptionalDataWith(S1); - S0->dropUnknownMetadata(); + S0->dropUnknownNonDebugMetadata(); // Create the new store to be inserted at the join point. 
StoreInst *SNew = (StoreInst *)(S0->clone()); Instruction *ANew = A0->clone(); - SNew->insertBefore(InsertPt); + SNew->insertBefore(&*InsertPt); ANew->insertBefore(SNew); assert(S0->getParent() == A0->getParent()); @@ -566,12 +560,13 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { } return MergedStores; } + /// /// \brief Run the transformation for each function /// bool MergedLoadStoreMotion::runOnFunction(Function &F) { MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool Changed = false; DEBUG(dbgs() << "Instruction Merger\n"); @@ -579,7 +574,7 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index f42f830..c8f885e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -71,8 +71,8 @@ // // Limitations and TODO items: // -// 1) We only considers n-ary adds for now. This should be extended and -// generalized. +// 1) We only considers n-ary adds and muls for now. This should be extended +// and generalized. // //===----------------------------------------------------------------------===// @@ -110,11 +110,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.setPreservesCFG(); @@ -145,12 +145,23 @@ private: unsigned I, Value *LHS, Value *RHS, Type *IndexedType); - // Reassociate Add for better CSE. - Instruction *tryReassociateAdd(BinaryOperator *I); - // A helper function for tryReassociateAdd. LHS and RHS are explicitly passed. - Instruction *tryReassociateAdd(Value *LHS, Value *RHS, Instruction *I); - // Rewrites I to LHS + RHS if LHS is computed already. - Instruction *tryReassociatedAdd(const SCEV *LHS, Value *RHS, Instruction *I); + // Reassociate binary operators for better CSE. + Instruction *tryReassociateBinaryOp(BinaryOperator *I); + + // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly + // passed. + Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I); + // Rewrites I to (LHS op RHS) if LHS is computed already. + Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS, + BinaryOperator *I); + + // Tries to match Op1 and Op2 by using V. + bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2); + + // Gets SCEV for (LHS op RHS). + const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS); // Returns the closest dominator of \c Dominatee that computes // \c CandidateExpr. Returns null if not found. 
@@ -161,11 +172,6 @@ private: // GEP's pointer size, i.e., whether Index needs to be sign-extended in order // to be an index of GEP. bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP); - // Returns whether V is known to be non-negative at context \c Ctxt. - bool isKnownNonNegative(Value *V, Instruction *Ctxt); - // Returns whether AO may sign overflow at context \c Ctxt. It computes a - // conservative result -- it answers true when not sure. - bool maySignOverflow(AddOperator *AO, Instruction *Ctxt); AssumptionCache *AC; const DataLayout *DL; @@ -182,7 +188,7 @@ private: // foo(a + b); // if (p2) // bar(a + b); - DenseMap<const SCEV *, SmallVector<Instruction *, 2>> SeenExprs; + DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs; }; } // anonymous namespace @@ -191,7 +197,7 @@ INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation", @@ -207,7 +213,7 @@ bool NaryReassociate::runOnFunction(Function &F) { AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -224,6 +230,7 @@ static bool isPotentiallyNaryReassociable(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::GetElementPtr: + case Instruction::Mul: return true; default: return false; @@ -239,19 +246,21 @@ bool NaryReassociate::doOneIteration(Function &F) { Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { BasicBlock *BB = Node->getBlock(); for (auto I = BB->begin(); I != BB->end(); ++I) { - if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { - const SCEV *OldSCEV = SE->getSCEV(I); - if (Instruction *NewI = tryReassociate(I)) { + if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) { + const SCEV *OldSCEV = SE->getSCEV(&*I); + if (Instruction *NewI = tryReassociate(&*I)) { Changed = true; - SE->forgetValue(I); + SE->forgetValue(&*I); I->replaceAllUsesWith(NewI); - RecursivelyDeleteTriviallyDeadInstructions(I, TLI); - I = NewI; + // If SeenExprs constains I's WeakVH, that entry will be replaced with + // nullptr. + RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); + I = NewI->getIterator(); } // Add the rewritten instruction to SeenExprs; the original instruction // is deleted. - const SCEV *NewSCEV = SE->getSCEV(I); - SeenExprs[NewSCEV].push_back(I); + const SCEV *NewSCEV = SE->getSCEV(&*I); + SeenExprs[NewSCEV].push_back(WeakVH(&*I)); // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I) // is equivalent to I. However, ScalarEvolution::getSCEV may // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose @@ -271,7 +280,7 @@ bool NaryReassociate::doOneIteration(Function &F) { // // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll. 
if (NewSCEV != OldSCEV) - SeenExprs[OldSCEV].push_back(I); + SeenExprs[OldSCEV].push_back(WeakVH(&*I)); } } } @@ -281,7 +290,8 @@ bool NaryReassociate::doOneIteration(Function &F) { Instruction *NaryReassociate::tryReassociate(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: - return tryReassociateAdd(cast<BinaryOperator>(I)); + case Instruction::Mul: + return tryReassociateBinaryOp(cast<BinaryOperator>(I)); case Instruction::GetElementPtr: return tryReassociateGEP(cast<GetElementPtrInst>(I)); default: @@ -352,27 +362,6 @@ bool NaryReassociate::requiresSignExtension(Value *Index, return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits; } -bool NaryReassociate::isKnownNonNegative(Value *V, Instruction *Ctxt) { - bool NonNegative, Negative; - // TODO: ComputeSignBits is expensive. Consider caching the results. - ComputeSignBit(V, NonNegative, Negative, *DL, 0, AC, Ctxt, DT); - return NonNegative; -} - -bool NaryReassociate::maySignOverflow(AddOperator *AO, Instruction *Ctxt) { - if (AO->hasNoSignedWrap()) - return false; - - Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); - // If LHS or RHS has the same sign as the sum, AO doesn't sign overflow. - // TODO: handle the negative case as well. - if (isKnownNonNegative(AO, Ctxt) && - (isKnownNonNegative(LHS, Ctxt) || isKnownNonNegative(RHS, Ctxt))) - return false; - - return true; -} - GetElementPtrInst * NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, Type *IndexedType) { @@ -381,7 +370,7 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, IndexToSplit = SExt->getOperand(0); } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) { // zext can be treated as sext if the source is non-negative. - if (isKnownNonNegative(ZExt->getOperand(0), GEP)) + if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT)) IndexToSplit = ZExt->getOperand(0); } @@ -389,8 +378,11 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, // If the I-th index needs sext and the underlying add is not equipped with // nsw, we cannot split the add because // sext(LHS + RHS) != sext(LHS) + sext(RHS). - if (requiresSignExtension(IndexToSplit, GEP) && maySignOverflow(AO, GEP)) + if (requiresSignExtension(IndexToSplit, GEP) && + computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) != + OverflowResult::NeverOverflows) return nullptr; + Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); // IndexToSplit = LHS + RHS. if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType)) @@ -415,7 +407,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( IndexExprs.push_back(SE->getSCEV(*Index)); // Replace the I-th index with LHS. IndexExprs[I] = SE->getSCEV(LHS); - if (isKnownNonNegative(LHS, GEP) && + if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) && DL->getTypeSizeInBits(LHS->getType()) < DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) { // Zero-extend LHS if it is non-negative. 
InstCombine canonicalizes sext to @@ -429,19 +421,20 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()), IndexExprs, GEP->isInBounds()); - auto *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); + Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); if (Candidate == nullptr) return nullptr; - PointerType *TypeOfCandidate = dyn_cast<PointerType>(Candidate->getType()); - // Pretty rare but theoretically possible when a numeric value happens to - // share CandidateExpr. - if (TypeOfCandidate == nullptr) - return nullptr; + IRBuilder<> Builder(GEP); + // Candidate does not necessarily have the same pointer type as GEP. Use + // bitcast or pointer cast to make sure they have the same type, so that the + // later RAUW doesn't complain. + Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType()); + assert(Candidate->getType() == GEP->getType()); // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType) uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType); - Type *ElementType = TypeOfCandidate->getElementType(); + Type *ElementType = GEP->getType()->getElementType(); uint64_t ElementSize = DL->getTypeAllocSize(ElementType); // Another less rare case: because I is not necessarily the last index of the // GEP, the size of the type at the I-th index (IndexedSize) is not @@ -461,8 +454,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return nullptr; // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))); - IRBuilder<> Builder(GEP); - Type *IntPtrTy = DL->getIntPtrType(TypeOfCandidate); + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); if (RHS->getType() != IntPtrTy) RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy); if (IndexedSize != ElementSize) { @@ -476,54 +468,89 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return NewGEP; } -Instruction *NaryReassociate::tryReassociateAdd(BinaryOperator *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) { Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); - if (auto *NewI = tryReassociateAdd(LHS, RHS, I)) + if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I)) return NewI; - if (auto *NewI = tryReassociateAdd(RHS, LHS, I)) + if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I)) return NewI; return nullptr; } -Instruction *NaryReassociate::tryReassociateAdd(Value *LHS, Value *RHS, - Instruction *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I) { Value *A = nullptr, *B = nullptr; - // To be conservative, we reassociate I only when it is the only user of A+B. - if (LHS->hasOneUse() && match(LHS, m_Add(m_Value(A), m_Value(B)))) { - // I = (A + B) + RHS - // = (A + RHS) + B or (B + RHS) + A + // To be conservative, we reassociate I only when it is the only user of (A op + // B). 
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) { + // I = (A op B) op RHS + // = (A op RHS) op B or (B op RHS) op A const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B); const SCEV *RHSExpr = SE->getSCEV(RHS); if (BExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(AExpr, RHSExpr), B, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I)) return NewI; } if (AExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(BExpr, RHSExpr), A, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I)) return NewI; } } return nullptr; } -Instruction *NaryReassociate::tryReassociatedAdd(const SCEV *LHSExpr, - Value *RHS, Instruction *I) { - auto Pos = SeenExprs.find(LHSExpr); - // Bail out if LHSExpr is not previously seen. - if (Pos == SeenExprs.end()) - return nullptr; - +Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr, + Value *RHS, + BinaryOperator *I) { // Look for the closest dominator LHS of I that computes LHSExpr, and replace - // I with LHS + RHS. + // I with LHS op RHS. auto *LHS = findClosestMatchingDominator(LHSExpr, I); if (LHS == nullptr) return nullptr; - Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + Instruction *NewI = nullptr; + switch (I->getOpcode()) { + case Instruction::Add: + NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + break; + case Instruction::Mul: + NewI = BinaryOperator::CreateMul(LHS, RHS, "", I); + break; + default: + llvm_unreachable("Unexpected instruction."); + } NewI->takeName(I); return NewI; } +bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, + Value *&Op2) { + switch (I->getOpcode()) { + case Instruction::Add: + return match(V, m_Add(m_Value(Op1), m_Value(Op2))); + case Instruction::Mul: + return match(V, m_Mul(m_Value(Op1), m_Value(Op2))); + default: + llvm_unreachable("Unexpected instruction."); + } + return false; +} + +const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS) { + switch (I->getOpcode()) { + case Instruction::Add: + return SE->getAddExpr(LHS, RHS); + case Instruction::Mul: + return SE->getMulExpr(LHS, RHS); + default: + llvm_unreachable("Unexpected instruction."); + } + return nullptr; +} + Instruction * NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, Instruction *Dominatee) { @@ -537,9 +564,13 @@ NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, // future instruction either. Therefore, we pop it out of the stack. This // optimization makes the algorithm O(n). while (!Candidates.empty()) { - Instruction *Candidate = Candidates.back(); - if (DT->dominates(Candidate, Dominatee)) - return Candidate; + // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed + // during rewriting. 
+ if (Value *Candidate = Candidates.back()) { + Instruction *CandidateInstruction = cast<Instruction>(Candidate); + if (DT->dominates(CandidateInstruction, Dominatee)) + return CandidateInstruction; + } Candidates.pop_back(); } return nullptr; diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 31d7df3..9f26f78 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -154,7 +154,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, Phi->addIncoming(Call, &CurrBB); Phi->addIncoming(LibCall, LibCallBB); - BB = JoinBB; + BB = JoinBB->getIterator(); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 366301a..b56b355 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -27,7 +27,7 @@ // well defined state for inspection by the collector. In the current // implementation, this is done via the insertion of poll sites at method entry // and the backedge of most loops. We try to avoid inserting more polls than -// are neccessary to ensure a finite period between poll sites. This is not +// are necessary to ensure a finite period between poll sites. This is not // because the poll itself is expensive in the generated code; it's not. Polls // do tend to impact the optimizer itself in negative ways; we'd like to avoid // perturbing the optimization of the method as much as we can. @@ -91,13 +91,15 @@ STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution"); using namespace llvm; -// Ignore oppurtunities to avoid placing safepoints on backedges, useful for +// Ignore opportunities to avoid placing safepoints on backedges, useful for // validation static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden, cl::init(false)); -/// If true, do not place backedge safepoints in counted loops. -static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true)); +/// How narrow does the trip count of a loop have to be to have to be considered +/// "counted"? Counted loops do not get safepoints at backedges. +static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width", + cl::Hidden, cl::init(32)); // If true, split the backedge of a loop when placing the safepoint, otherwise // split the latch block itself. Both are useful to support for @@ -121,7 +123,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { std::vector<TerminatorInst *> PollLocations; /// True unless we're running spp-no-calls in which case we need to disable - /// the call dependend placement opts. + /// the call-dependent placement opts. 
bool CallSafepointsEnabled; ScalarEvolution *SE = nullptr; @@ -142,7 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); for (auto I = LI->begin(), E = LI->end(); I != E; I++) { @@ -153,7 +155,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); // We no longer modify the IR at all in this pass. Thus all // analysis are preserved. @@ -190,10 +192,8 @@ static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/); -static bool isGCLeafFunction(const CallSite &CS); - static bool needsStatepoint(const CallSite &CS) { - if (isGCLeafFunction(CS)) + if (callsGCLeafFunction(CS)) return false; if (CS.isCall()) { CallInst *call = cast<CallInst>(CS.getInstruction()); @@ -206,7 +206,7 @@ static bool needsStatepoint(const CallSite &CS) { return true; } -static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P); +static Value *ReplaceWithStatepoint(const CallSite &CS); /// Returns true if this loop is known to contain a call safepoint which /// must unconditionally execute on any iteration of the loop which returns @@ -220,7 +220,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, // For the moment, we look only for the 'cuts' that consist of a single call // instruction in a block which is dominated by the Header and dominates the // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain - // of such dominating blocks gets substaintially more occurences than just + // of such dominating blocks gets substantially more occurrences than just // checking the Pred and Header blocks themselves. This may be due to the // density of loop exit conditions caused by range and null checks. // TODO: structure this as an analysis pass, cache the result for subloops, @@ -255,18 +255,12 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, /// conservatism in the analysis. static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, BasicBlock *Pred) { - // Only used when SkipCounted is off - const unsigned upperTripBound = 8192; - // A conservative bound on the loop as a whole. const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); - if (MaxTrips != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32)) - return true; - } + if (MaxTrips != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( + CountedLoopTripWidth)) + return true; // If this is a conditional branch to the header with the alternate path // being outside the loop, we can ask questions about the execution frequency @@ -275,13 +269,10 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, // This returns an exact expression only. TODO: We really only need an // upper bound here, but SE doesn't expose that. 
const SCEV *MaxExec = SE->getExitCount(L, Pred); - if (MaxExec != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32)) + if (MaxExec != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN( + CountedLoopTripWidth)) return true; - } } return /* not finite */ false; @@ -432,14 +423,14 @@ static Instruction *findLocationForEntrySafepoint(Function &F, assert(hasNextInstruction(I) && "first check if there is a next instruction!"); if (I->isTerminator()) { - return I->getParent()->getUniqueSuccessor()->begin(); + return &I->getParent()->getUniqueSuccessor()->front(); } else { - return std::next(BasicBlock::iterator(I)); + return &*++I->getIterator(); } }; Instruction *cursor = nullptr; - for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor); + for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor); cursor = nextInstruction(cursor)) { // We need to ensure a safepoint poll occurs before any 'real' call. The @@ -466,7 +457,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F, static void findCallSafepoints(Function &F, std::vector<CallSite> &Found /*rval*/) { assert(Found.empty() && "must be empty!"); - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { Instruction *inst = &I; if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) { CallSite CS(inst); @@ -508,7 +499,7 @@ static bool isGCSafepointPoll(Function &F) { static bool shouldRewriteFunction(Function &F) { // TODO: This should check the GCStrategy if (F.hasGC()) { - const char *FunctionGCName = F.getGC(); + const auto &FunctionGCName = F.getGC(); const StringRef StatepointExampleName("statepoint-example"); const StringRef CoreCLRName("coreclr"); return (StatepointExampleName == FunctionGCName) || @@ -713,7 +704,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { Invoke->getParent()); } - Value *GCResult = ReplaceWithStatepoint(CS, nullptr); + Value *GCResult = ReplaceWithStatepoint(CS); Results.push_back(GCResult); } assert(Results.size() == ParsePointNeeded.size()); @@ -747,7 +738,7 @@ FunctionPass *llvm::createPlaceSafepointsPass() { INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl, "place-backedge-safepoints-impl", "Place Backedge Safepoints", false, false) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl, @@ -759,31 +750,6 @@ INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints", INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", false, false) -static bool isGCLeafFunction(const CallSite &CS) { - Instruction *inst = CS.getInstruction(); - if (isa<IntrinsicInst>(inst)) { - // Most LLVM intrinsics are things which can never take a safepoint. - // As a result, we don't need to have the stack parsable at the - // callsite. This is a highly useful optimization since intrinsic - // calls are fairly prevelent, particularly in debug builds. - return true; - } - - // If this function is marked explicitly as a leaf call, we don't need to - // place a safepoint of it. In fact, for correctness we *can't* in many - // cases. 
Note: Indirect calls return Null for the called function, - // these obviously aren't runtime functions with attributes - // TODO: Support attributes on the call site as well. - const Function *F = CS.getCalledFunction(); - bool isLeaf = - F && - F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true"); - if (isLeaf) { - return true; - } - return false; -} - static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/) { @@ -796,6 +762,7 @@ InsertSafepointPoll(Instruction *InsertBefore, // path call - where we need to insert a safepoint (parsepoint). auto *F = M->getFunction(GCSafepointPollName); + assert(F && "gc.safepoint_poll function is missing"); assert(F->getType()->getElementType() == FunctionType::get(Type::getVoidTy(M->getContext()), false) && "gc.safepoint_poll declared with wrong type"); @@ -864,10 +831,8 @@ InsertSafepointPoll(Instruction *InsertBefore, /// Replaces the given call site (Call or Invoke) with a gc.statepoint /// intrinsic with an empty deoptimization arguments list. This does /// NOT do explicit relocation for GC support. -static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ - Pass *P) { - assert(CS.getInstruction()->getParent()->getParent()->getParent() && - "must be set"); +static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) { + assert(CS.getInstruction()->getModule() && "must be set"); // TODO: technically, a pass is not allowed to get functions from within a // function pass since it might trigger a new function addition. Refactor @@ -917,15 +882,10 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ CS.getInstruction()->getContext(), AttributeSet::FunctionIndex, AttrsToRemove); - Value *StatepointTarget = NumPatchBytes == 0 - ? CS.getCalledValue() - : ConstantPointerNull::get(cast<PointerType>( - CS.getCalledValue()->getType())); - if (CS.isCall()) { CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); CallInst *Call = Builder.CreateGCStatepointCall( - ID, NumPatchBytes, StatepointTarget, + ID, NumPatchBytes, CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); Call->setTailCall(ToReplace->isTailCall()); @@ -938,7 +898,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ Token = Call; - // Put the following gc_result and gc_relocate calls immediately after the + // Put the following gc_result and gc_relocate calls immediately after // the old call (which we're about to delete). assert(ToReplace->getNextNode() && "not a terminator, must have next"); Builder.SetInsertPoint(ToReplace->getNextNode()); @@ -951,7 +911,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // original block. Builder.SetInsertPoint(ToReplace->getParent()); InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( - ID, NumPatchBytes, StatepointTarget, ToReplace->getNormalDest(), + ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(), ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); @@ -967,7 +927,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // We'll insert the gc.result into the normal block BasicBlock *NormalDest = ToReplace->getNormalDest(); // Can not insert gc.result in case of phi nodes preset. 
- // Should have removed this cases prior to runnning this function + // Should have removed this cases prior to running this function assert(!isa<PHINode>(NormalDest->begin())); Instruction *IP = &*(NormalDest->getFirstInsertionPt()); Builder.SetInsertPoint(IP); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index d1acf78..bcadd4e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -26,6 +26,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -62,7 +64,7 @@ namespace { /// Print out the expression identified in the Ops list. /// static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { - Module *M = I->getParent()->getParent()->getParent(); + Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { @@ -82,20 +84,6 @@ namespace { Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} - /// \brief Sort factors by their Base. - struct BaseSorter { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base < RHS.Base; - } - }; - - /// \brief Compare factors for equal bases. - struct BaseEqual { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base == RHS.Base; - } - }; - /// \brief Sort factors in descending order by their power. struct PowerDescendingSorter { bool operator()(const Factor &LHS, const Factor &RHS) { @@ -172,6 +160,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: void BuildRankMap(Function &F); @@ -194,6 +183,8 @@ namespace { Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *RemoveFactorFromExpression(Value *V, Value *Factor); void EraseInst(Instruction *I); + void RecursivelyEraseDeadInsts(Instruction *I, + SetVector<AssertingVH<Instruction>> &Insts); void OptimizeInst(Instruction *I); Instruction *canonicalizeNegConstExpr(Instruction *I); }; @@ -255,27 +246,6 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, return nullptr; } -static bool isUnmovableInstruction(Instruction *I) { - switch (I->getOpcode()) { - case Instruction::PHI: - case Instruction::LandingPad: - case Instruction::Alloca: - case Instruction::Load: - case Instruction::Invoke: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - return true; - case Instruction::Call: - return !isa<DbgInfoIntrinsic>(I); - default: - return false; - } -} - void Reassociate::BuildRankMap(Function &F) { unsigned i = 2; @@ -295,7 +265,7 @@ void Reassociate::BuildRankMap(Function &F) { // we cannot move. This ensures that the ranks for these instructions are // all different in the block. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (isUnmovableInstruction(I)) + if (mayBeMemoryDependent(*I)) ValueRankMap[&*I] = ++BBRank; } } @@ -913,7 +883,11 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, /// that computes the negative version of the value specified. 
The negative /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. -static Value *NegateValue(Value *V, Instruction *BI) { +/// Also add intermediate instructions to the redo list that are modified while +/// pushing the negates through adds. These will be revisited to see if +/// additional opportunities have been exposed. +static Value *NegateValue(Value *V, Instruction *BI, + SetVector<AssertingVH<Instruction>> &ToRedo) { if (Constant *C = dyn_cast<Constant>(V)) { if (C->getType()->isFPOrFPVectorTy()) { return ConstantExpr::getFNeg(C); @@ -934,8 +908,8 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (BinaryOperator *I = isReassociableOp(V, Instruction::Add, Instruction::FAdd)) { // Push the negates through the add. - I->setOperand(0, NegateValue(I->getOperand(0), BI)); - I->setOperand(1, NegateValue(I->getOperand(1), BI)); + I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo)); + I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo)); if (I->getOpcode() == Instruction::Add) { I->setHasNoUnsignedWrap(false); I->setHasNoSignedWrap(false); @@ -948,6 +922,10 @@ static Value *NegateValue(Value *V, Instruction *BI) { // I->moveBefore(BI); I->setName(I->getName()+".neg"); + + // Add the intermediate negates to the redo list as processing them later + // could expose more reassociating opportunities. + ToRedo.insert(I); return I; } @@ -972,26 +950,28 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { InsertPt = II->getNormalDest()->begin(); } else { - InsertPt = InstInput; - ++InsertPt; + InsertPt = ++InstInput->getIterator(); } while (isa<PHINode>(InsertPt)) ++InsertPt; } else { InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin(); } - TheNeg->moveBefore(InsertPt); + TheNeg->moveBefore(&*InsertPt); if (TheNeg->getOpcode() == Instruction::Sub) { TheNeg->setHasNoUnsignedWrap(false); TheNeg->setHasNoSignedWrap(false); } else { TheNeg->andIRFlags(BI); } + ToRedo.insert(TheNeg); return TheNeg; } // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. - return CreateNeg(V, V->getName() + ".neg", BI, BI); + BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI); + ToRedo.insert(NewNeg); + return NewNeg; } /// Return true if we should break up this subtract of X-Y into (X + -Y). @@ -1025,14 +1005,15 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// If we have (X-Y), and if either X is an add, or if this is only used by an /// add, transform this into (X+(0-Y)) to promote better reassociation. -static BinaryOperator *BreakUpSubtract(Instruction *Sub) { +static BinaryOperator * +BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. // - Value *NegVal = NegateValue(Sub->getOperand(1), Sub); + Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. 
@@ -1166,7 +1147,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { return nullptr; } - BasicBlock::iterator InsertPt = BO; ++InsertPt; + BasicBlock::iterator InsertPt = ++BO->getIterator(); // If this was just a single multiply, remove the multiply and return the only // remaining operand. @@ -1179,7 +1160,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { } if (NeedsNegate) - V = CreateNeg(V, "neg", InsertPt, BO); + V = CreateNeg(V, "neg", &*InsertPt, BO); return V; } @@ -1250,7 +1231,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode, return nullptr; } -/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and +/// Helper function of CombineXorOpnd(). It creates a bitwise-and /// instruction with the given two operands, and return the resulting /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will @@ -1947,6 +1928,22 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, return nullptr; } +// Remove dead instructions and if any operands are trivially dead add them to +// Insts so they will be removed as well. +void Reassociate::RecursivelyEraseDeadInsts( + Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) { + assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); + SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end()); + ValueRankMap.erase(I); + Insts.remove(I); + RedoInsts.remove(I); + I->eraseFromParent(); + for (auto Op : Ops) + if (Instruction *OpInst = dyn_cast<Instruction>(Op)) + if (OpInst->use_empty()) + Insts.insert(OpInst); +} + /// Zap the given instruction, adding interesting operands to the work list. void Reassociate::EraseInst(Instruction *I) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); @@ -2083,7 +2080,7 @@ void Reassociate::OptimizeInst(Instruction *I) { return; // Don't optimize floating point instructions that don't have unsafe algebra. - if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra()) + if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; // Do not reassociate boolean (i1) expressions. We want to preserve the @@ -2099,7 +2096,7 @@ void Reassociate::OptimizeInst(Instruction *I) { // see if we can convert it to X+-Y. if (I->getOpcode() == Instruction::Sub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2110,6 +2107,12 @@ void Reassociate::OptimizeInst(Instruction *I) { (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::Mul))) { Instruction *NI = LowerNegateToMultiply(I); + // If the negate was simplified, revisit the users to see if we can + // reassociate further. 
+ for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2117,7 +2120,7 @@ void Reassociate::OptimizeInst(Instruction *I) { } } else if (I->getOpcode() == Instruction::FSub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2127,7 +2130,13 @@ void Reassociate::OptimizeInst(Instruction *I) { if (isReassociableOp(I->getOperand(1), Instruction::FMul) && (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::FMul))) { + // If the negate was simplified, revisit the users to see if we can + // reassociate further. Instruction *NI = LowerNegateToMultiply(I); + for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2142,8 +2151,15 @@ void Reassociate::OptimizeInst(Instruction *I) { // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. unsigned Opcode = BO->getOpcode(); - if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) + if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) { + // During the initial run we will get to the root of the tree. + // But if we get here while we are redoing instructions, there is no + // guarantee that the root will be visited. So Redo later + if (BO->user_back() != BO && + BO->getParent() == BO->user_back()->getParent()) + RedoInsts.insert(BO->user_back()); return; + } // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. @@ -2250,15 +2266,29 @@ bool Reassociate::runOnFunction(Function &F) { for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Optimize every instruction in the basic block. for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) - if (isInstructionTriviallyDead(II)) { - EraseInst(II++); + if (isInstructionTriviallyDead(&*II)) { + EraseInst(&*II++); } else { - OptimizeInst(II); + OptimizeInst(&*II); assert(II->getParent() == BI && "Moved to a different block!"); ++II; } - // If this produced extra instructions to optimize, handle them now. + // Make a copy of all the instructions to be redone so we can remove dead + // instructions. + SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts); + // Iterate over all instructions to be reevaluated and remove trivially dead + // instructions. If any operand of the trivially dead instruction becomes + // dead mark it for deletion as well. Continue this process until all + // trivially dead instructions have been removed. + while (!ToRedo.empty()) { + Instruction *I = ToRedo.pop_back_val(); + if (isInstructionTriviallyDead(I)) + RecursivelyEraseDeadInsts(I, ToRedo); + } + + // Now that we have removed dead instructions, we can reoptimize the + // remaining instructions. 
while (!RedoInsts.empty()) { Instruction *I = RedoInsts.pop_back_val(); if (isInstructionTriviallyDead(I)) diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 1b46727..915f897 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -82,10 +82,9 @@ bool RegToMem::runOnFunction(Function &F) { BasicBlock::iterator I = BBEntry->begin(); while (isa<AllocaInst>(I)) ++I; - CastInst *AllocaInsertionPoint = - new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), - Type::getInt32Ty(F.getContext()), - "reg2mem alloca point", I); + CastInst *AllocaInsertionPoint = new BitCastInst( + Constant::getNullValue(Type::getInt32Ty(F.getContext())), + Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I); // Find the escaped instructions. But don't create stack slots for // allocas in entry block. @@ -95,7 +94,7 @@ bool RegToMem::runOnFunction(Function &F) { for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); iib != iie; ++iib) { if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) && - valueEscapes(iib)) { + valueEscapes(&*iib)) { WorkList.push_front(&*iib); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index ae2ae3a..d77d574 100644 --- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -14,12 +14,14 @@ #include "llvm/Pass.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/MapVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" @@ -46,10 +48,6 @@ using namespace llvm; -// Print tracing output -static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden, - cl::init(false)); - // Print the liveset found at the insert location static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, cl::init(false)); @@ -74,6 +72,19 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); +static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden, + cl::init(false)); +static cl::opt<bool> + AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", + cl::Hidden, cl::init(true)); + +/// Should we split vectors of pointers into their individual elements? This +/// is known to be buggy, but the alternate implementation isn't yet ready. +/// This is purely to provide a debugging and dianostic hook until the vector +/// split is replaced with vector relocations. +static cl::opt<bool> UseVectorSplit("rs4gc-split-vector-values", cl::Hidden, + cl::init(true)); + namespace { struct RewriteStatepointsForGC : public ModulePass { static char ID; // Pass identification, replacement for typeid @@ -88,10 +99,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripDereferenceabilityInfo asserts that shouldRewriteStatepointsIn + // stripNonValidAttributes asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. 
Since at least // one function changed, we know that the precondition is satisfied. - stripDereferenceabilityInfo(M); + stripNonValidAttributes(M); } return Changed; @@ -108,15 +119,16 @@ struct RewriteStatepointsForGC : public ModulePass { /// dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripDereferenceabilityInfo (conservatively) restores correctness + /// heap. stripNonValidAttributes (conservatively) restores correctness /// by erasing all attributes in the module that externally imply /// dereferenceability. - /// - void stripDereferenceabilityInfo(Module &M); + /// Similar reasoning also applies to the noalias attributes. gc.statepoint + /// can touch the entire heap including noalias objects. + void stripNonValidAttributes(Module &M); - // Helpers for stripDereferenceabilityInfo - void stripDereferenceabilityInfoFromBody(Function &F); - void stripDereferenceabilityInfoFromPrototype(Function &F); + // Helpers for stripNonValidAttributes + void stripNonValidAttributesFromBody(Function &F); + void stripNonValidAttributesFromPrototype(Function &F); }; } // namespace @@ -160,15 +172,16 @@ struct GCPtrLivenessData { // base relation will remain. Internally, we add a mixture of the two // types, then update all the second type to the first type typedef DenseMap<Value *, Value *> DefiningValueMapTy; -typedef DenseSet<llvm::Value *> StatepointLiveSetTy; -typedef DenseMap<Instruction *, Value *> RematerializedValueMapTy; +typedef DenseSet<Value *> StatepointLiveSetTy; +typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>> + RematerializedValueMapTy; struct PartiallyConstructedSafepointRecord { - /// The set of values known to be live accross this safepoint - StatepointLiveSetTy liveset; + /// The set of values known to be live across this safepoint + StatepointLiveSetTy LiveSet; /// Mapping from live pointers to a base-defining-value - DenseMap<llvm::Value *, llvm::Value *> PointerToBase; + DenseMap<Value *, Value *> PointerToBase; /// The *new* gc.statepoint instruction itself. This produces the token /// that normal path gc.relocates and the gc.result are tied to. @@ -179,12 +192,26 @@ struct PartiallyConstructedSafepointRecord { Instruction *UnwindToken; /// Record live values we are rematerialized instead of relocating. - /// They are not included into 'liveset' field. + /// They are not included into 'LiveSet' field. /// Maps rematerialized copy to it's original value. 
RematerializedValueMapTy RematerializedValues; }; } +static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { + assert(UseDeoptBundles && "Should not be called otherwise!"); + + Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt"); + + if (!DeoptBundle.hasValue()) { + assert(AllowStatepointWithNoDeoptInfo && + "Found non-leaf call without deopt info!"); + return None; + } + + return DeoptBundle.getValue().Inputs; +} + /// Compute the live-in set for every basic block in the function static void computeLiveInValues(DominatorTree &DT, Function &F, GCPtrLivenessData &Data); @@ -195,10 +222,10 @@ static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data, StatepointLiveSetTy &out); // TODO: Once we can get to the GCStrategy, this becomes -// Optional<bool> isGCManagedPointer(const Value *V) const override { +// Optional<bool> isGCManagedPointer(const Type *Ty) const override { -static bool isGCPointerType(const Type *T) { - if (const PointerType *PT = dyn_cast<PointerType>(T)) +static bool isGCPointerType(Type *T) { + if (auto *PT = dyn_cast<PointerType>(T)) // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. @@ -233,9 +260,8 @@ static bool containsGCPtrType(Type *Ty) { if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast<StructType>(Ty)) - return std::any_of( - ST->subtypes().begin(), ST->subtypes().end(), - [](Type *SubType) { return containsGCPtrType(SubType); }); + return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), + containsGCPtrType); return false; } @@ -247,7 +273,7 @@ static bool isUnhandledGCPointerType(Type *Ty) { } #endif -static bool order_by_name(llvm::Value *a, llvm::Value *b) { +static bool order_by_name(Value *a, Value *b) { if (a->hasName() && b->hasName()) { return -1 == a->getName().compare(b->getName()); } else if (a->hasName() && !b->hasName()) { @@ -260,6 +286,13 @@ static bool order_by_name(llvm::Value *a, llvm::Value *b) { } } +// Return the name of the value suffixed with the provided value, or if the +// value didn't have a name, the default value specified. +static std::string suffixed_name_or(Value *V, StringRef Suffix, + StringRef DefaultName) { + return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str(); +} + // Conservatively identifies any definitions which might be live at the // given instruction. The analysis is performed immediately before the // given instruction. 
Values defined by that instruction are not considered @@ -269,30 +302,56 @@ static void analyzeParsePointLiveness( const CallSite &CS, PartiallyConstructedSafepointRecord &result) { Instruction *inst = CS.getInstruction(); - StatepointLiveSetTy liveset; - findLiveSetAtInst(inst, OriginalLivenessData, liveset); + StatepointLiveSetTy LiveSet; + findLiveSetAtInst(inst, OriginalLivenessData, LiveSet); if (PrintLiveSet) { // Note: This output is used by several of the test cases - // The order of elemtns in a set is not stable, put them in a vec and sort + // The order of elements in a set is not stable, put them in a vec and sort // by name - SmallVector<Value *, 64> temp; - temp.insert(temp.end(), liveset.begin(), liveset.end()); - std::sort(temp.begin(), temp.end(), order_by_name); + SmallVector<Value *, 64> Temp; + Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end()); + std::sort(Temp.begin(), Temp.end(), order_by_name); errs() << "Live Variables:\n"; - for (Value *V : temp) { - errs() << " " << V->getName(); // no newline - V->dump(); - } + for (Value *V : Temp) + dbgs() << " " << V->getName() << " " << *V << "\n"; } if (PrintLiveSetSize) { errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; - errs() << "Number live values: " << liveset.size() << "\n"; + errs() << "Number live values: " << LiveSet.size() << "\n"; + } + result.LiveSet = LiveSet; +} + +static bool isKnownBaseResult(Value *V); +namespace { +/// A single base defining value - An immediate base defining value for an +/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'. +/// For instructions which have multiple pointer [vector] inputs or that +/// transition between vector and scalar types, there is no immediate base +/// defining value. The 'base defining value' for 'Def' is the transitive +/// closure of this relation stopping at the first instruction which has no +/// immediate base defining value. The b.d.v. might itself be a base pointer, +/// but it can also be an arbitrary derived pointer. +struct BaseDefiningValueResult { + /// Contains the value which is the base defining value. + Value * const BDV; + /// True if the base defining value is also known to be an actual base + /// pointer. + const bool IsKnownBase; + BaseDefiningValueResult(Value *BDV, bool IsKnownBase) + : BDV(BDV), IsKnownBase(IsKnownBase) { +#ifndef NDEBUG + // Check consistency between new and old means of checking whether a BDV is + // a base. + bool MustBeBase = isKnownBaseResult(BDV); + assert(!MustBeBase || MustBeBase == IsKnownBase); +#endif } - result.liveset = liveset; +}; } -static Value *findBaseDefiningValue(Value *I); +static BaseDefiningValueResult findBaseDefiningValue(Value *I); /// Return a base defining value for the 'Index' element of the given vector /// instruction 'I'. If Index is null, returns a BDV for the entire vector @@ -303,61 +362,27 @@ static Value *findBaseDefiningValue(Value *I); /// vector returned is a BDV (and possibly a base) of the entire vector 'I'. /// If the later, the return pointer is a BDV (or possibly a base) for the /// particular element in 'I'. 
-static std::pair<Value *, bool> -findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { - assert(I->getType()->isVectorTy() && - cast<VectorType>(I->getType())->getElementType()->isPointerTy() && - "Illegal to ask for the base pointer of a non-pointer type"); - +static BaseDefiningValueResult +findBaseDefiningValueOfVector(Value *I) { // Each case parallels findBaseDefiningValue below, see that code for // detailed motivation. if (isa<Argument>(I)) // An incoming argument to the function is a base pointer - return std::make_pair(I, true); - - // We shouldn't see the address of a global as a vector value? - assert(!isa<GlobalVariable>(I) && - "unexpected global variable found in base of vector"); - - // inlining could possibly introduce phi node that contains - // undef if callee has multiple returns - if (isa<UndefValue>(I)) - // utterly meaningless, but useful for dealing with partially optimized - // code. - return std::make_pair(I, true); - - // Due to inheritance, this must be _after_ the global variable and undef - // checks - if (Constant *Con = dyn_cast<Constant>(I)) { - assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && - "order of checks wrong!"); - assert(Con->isNullValue() && "null is the only case which makes sense"); - return std::make_pair(Con, true); - } - + return BaseDefiningValueResult(I, true); + + if (isa<Constant>(I)) + // Constant vectors consist only of constant pointers. + return BaseDefiningValueResult(I, true); + if (isa<LoadInst>(I)) - return std::make_pair(I, true); - - // For an insert element, we might be able to look through it if we know - // something about the indexes. - if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(I)) { - if (Index) { - Value *InsertIndex = IEI->getOperand(2); - // This index is inserting the value, look for its BDV - if (InsertIndex == Index) - return std::make_pair(findBaseDefiningValue(IEI->getOperand(1)), false); - // Both constant, and can't be equal per above. This insert is definitely - // not relevant, look back at the rest of the vector and keep trying. - if (isa<ConstantInt>(Index) && isa<ConstantInt>(InsertIndex)) - return findBaseDefiningValueOfVector(IEI->getOperand(0), Index); - } - + return BaseDefiningValueResult(I, true); + + if (isa<InsertElementInst>(I)) // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. - return std::make_pair(IEI, false); - } + return BaseDefiningValueResult(I, false); if (isa<ShuffleVectorInst>(I)) // We don't know whether this vector contains entirely base pointers or @@ -365,105 +390,47 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { // duplicate code as needed to construct a parallel vector of bases. // TODO: There a number of local optimizations which could be applied here // for particular sufflevector patterns. - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. 
assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "unknown vector instruction - no base found for vector element"); - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); } -static bool isKnownBaseResult(Value *V); - /// Helper function for findBasePointer - Will return a value which either a) -/// defines the base pointer for the input or b) blocks the simple search -/// (i.e. a PHI or Select of two derived pointers) -static Value *findBaseDefiningValue(Value *I) { - if (I->getType()->isVectorTy()) - return findBaseDefiningValueOfVector(I).first; - - assert(I->getType()->isPointerTy() && +/// defines the base pointer for the input, b) blocks the simple search +/// (i.e. a PHI or Select of two derived pointers), or c) involves a change +/// from pointer to vector type or back. +static BaseDefiningValueResult findBaseDefiningValue(Value *I) { + assert(I->getType()->isPtrOrPtrVectorTy() && "Illegal to ask for the base pointer of a non-pointer type"); - // This case is a bit of a hack - it only handles extracts from vectors which - // trivially contain only base pointers or cases where we can directly match - // the index of the original extract element to an insertion into the vector. - // See note inside the function for how to improve this. - if (auto *EEI = dyn_cast<ExtractElementInst>(I)) { - Value *VectorOperand = EEI->getVectorOperand(); - Value *Index = EEI->getIndexOperand(); - std::pair<Value *, bool> pair = - findBaseDefiningValueOfVector(VectorOperand, Index); - Value *VectorBase = pair.first; - if (VectorBase->getType()->isPointerTy()) - // We found a BDV for this specific element with the vector. This is an - // optimization, but in practice it covers most of the useful cases - // created via scalarization. - return VectorBase; - else { - assert(VectorBase->getType()->isVectorTy()); - if (pair.second) - // If the entire vector returned is known to be entirely base pointers, - // then the extractelement is valid base for this value. - return EEI; - else { - // Otherwise, we have an instruction which potentially produces a - // derived pointer and we need findBasePointers to clone code for us - // such that we can create an instruction which produces the - // accompanying base pointer. - // Note: This code is currently rather incomplete. We don't currently - // support the general form of shufflevector of insertelement. - // Conceptually, these are just 'base defining values' of the same - // variety as phi or select instructions. We need to update the - // findBasePointers algorithm to insert new 'base-only' versions of the - // original instructions. This is relative straight forward to do, but - // the case which would motivate the work hasn't shown up in real - // workloads yet. - assert((isa<PHINode>(VectorBase) || isa<SelectInst>(VectorBase)) && - "need to extend findBasePointers for generic vector" - "instruction cases"); - return VectorBase; - } - } - } + if (I->getType()->isVectorTy()) + return findBaseDefiningValueOfVector(I); if (isa<Argument>(I)) // An incoming argument to the function is a base pointer // We should have never reached here if this argument isn't an gc value - return I; - - if (isa<GlobalVariable>(I)) - // base case - return I; - - // inlining could possibly introduce phi node that contains - // undef if callee has multiple returns - if (isa<UndefValue>(I)) - // utterly meaningless, but useful for dealing with - // partially optimized code. 
- return I; - - // Due to inheritance, this must be _after_ the global variable and undef - // checks - if (Constant *Con = dyn_cast<Constant>(I)) { - assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && - "order of checks wrong!"); - // Note: Finding a constant base for something marked for relocation - // doesn't really make sense. The most likely case is either a) some - // screwed up the address space usage or b) your validating against - // compiled C++ code w/o the proper separation. The only real exception - // is a null pointer. You could have generic code written to index of - // off a potentially null value and have proven it null. We also use - // null pointers in dead paths of relocation phis (which we might later - // want to find a base pointer for). - assert(isa<ConstantPointerNull>(Con) && - "null is the only case which makes sense"); - return Con; - } + return BaseDefiningValueResult(I, true); + + if (isa<Constant>(I)) + // We assume that objects with a constant base (e.g. a global) can't move + // and don't need to be reported to the collector because they are always + // live. All constants have constant bases. Besides global references, all + // kinds of constants (e.g. undef, constant expressions, null pointers) can + // be introduced by the inliner or the optimizer, especially on dynamically + // dead paths. See e.g. test4 in constants.ll. + return BaseDefiningValueResult(I, true); if (CastInst *CI = dyn_cast<CastInst>(I)) { Value *Def = CI->stripPointerCasts(); + // If stripping pointer casts changes the address space there is an + // addrspacecast in between. + assert(cast<PointerType>(Def->getType())->getAddressSpace() == + cast<PointerType>(CI->getType())->getAddressSpace() && + "unsupported addrspacecast"); // If we find a cast instruction here, it means we've found a cast which is // not simply a pointer cast (i.e. an inttoptr). We don't know how to // handle int->ptr conversion. @@ -472,7 +439,9 @@ static Value *findBaseDefiningValue(Value *I) { } if (isa<LoadInst>(I)) - return I; // The value loaded is an gc base itself + // The value loaded is an gc base itself + return BaseDefiningValueResult(I, true); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) // The base of this GEP is the base @@ -480,14 +449,11 @@ static Value *findBaseDefiningValue(Value *I) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { - case Intrinsic::experimental_gc_result_ptr: default: // fall through to general call handling break; case Intrinsic::experimental_gc_statepoint: - case Intrinsic::experimental_gc_result_float: - case Intrinsic::experimental_gc_result_int: - llvm_unreachable("these don't produce pointers"); + llvm_unreachable("statepoints don't produce pointers"); case Intrinsic::experimental_gc_relocate: { // Rerunning safepoint insertion after safepoints are already // inserted is not supported. It could probably be made to work, @@ -506,17 +472,17 @@ static Value *findBaseDefiningValue(Value *I) { // pointers. This should probably be generalized via attributes to support // both source language and internal functions. if (isa<CallInst>(I) || isa<InvokeInst>(I)) - return I; + return BaseDefiningValueResult(I, true); // I have absolutely no idea how to implement this part yet. It's not - // neccessarily hard, I just haven't really looked at it yet. + // necessarily hard, I just haven't really looked at it yet. 
assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented"); if (isa<AtomicCmpXchgInst>(I)) // A CAS is effectively a atomic store and load combined under a // predicate. From the perspective of base pointers, we just treat it // like a load. - return I; + return BaseDefiningValueResult(I, true); assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are " "binary ops which don't apply to pointers"); @@ -525,34 +491,41 @@ static Value *findBaseDefiningValue(Value *I) { // stack, but in either case, this is simply a field load. As a result, // this is a defining definition of the base just like a load is. if (isa<ExtractValueInst>(I)) - return I; + return BaseDefiningValueResult(I, true); // We should never see an insert vector since that would require we be // tracing back a struct value not a pointer value. assert(!isa<InsertValueInst>(I) && "Base pointer for a struct is meaningless"); + // An extractelement produces a base result exactly when it's input does. + // We may need to insert a parallel instruction to extract the appropriate + // element out of the base vector corresponding to the input. Given this, + // it's analogous to the phi and select case even though it's not a merge. + if (isa<ExtractElementInst>(I)) + // Note: There a lot of obvious peephole cases here. This are deliberately + // handled after the main base pointer inference algorithm to make writing + // test cases to exercise that code easier. + return BaseDefiningValueResult(I, false); + // The last two cases here don't return a base pointer. Instead, they - // return a value which dynamically selects from amoung several base + // return a value which dynamically selects from among several base // derived pointers (each with it's own base potentially). It's the job of // the caller to resolve these. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "missing instruction case in findBaseDefiningValing"); - return I; + return BaseDefiningValueResult(I, false); } /// Returns the base defining value for this value. static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) { Value *&Cached = Cache[I]; if (!Cached) { - Cached = findBaseDefiningValue(I); + Cached = findBaseDefiningValue(I).BDV; + DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " + << Cached->getName() << "\n"); } assert(Cache[I] != nullptr); - - if (TraceLSP) { - dbgs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName() - << "\n"; - } return Cached; } @@ -572,7 +545,9 @@ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { /// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV, /// is it known to be a base pointer? Or do we need to continue searching. static bool isKnownBaseResult(Value *V) { - if (!isa<PHINode>(V) && !isa<SelectInst>(V)) { + if (!isa<PHINode>(V) && !isa<SelectInst>(V) && + !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) && + !isa<ShuffleVectorInst>(V)) { // no recursion possible return true; } @@ -587,17 +562,19 @@ static bool isKnownBaseResult(Value *V) { return false; } -// TODO: find a better name for this namespace { -class PhiState { +/// Models the state of a single base defining value in the findBasePointer +/// algorithm for determining where a new instruction is needed to propagate +/// the base of this BDV. 
+class BDVState { public: enum Status { Unknown, Base, Conflict }; - PhiState(Status s, Value *b = nullptr) : status(s), base(b) { + BDVState(Status s, Value *b = nullptr) : status(s), base(b) { assert(status != Base || b); } - PhiState(Value *b) : status(Base), base(b) {} - PhiState() : status(Unknown), base(nullptr) {} + explicit BDVState(Value *b) : status(Base), base(b) {} + BDVState() : status(Unknown), base(nullptr) {} Status getStatus() const { return status; } Value *getBase() const { return base; } @@ -606,72 +583,80 @@ public: bool isUnknown() const { return getStatus() == Unknown; } bool isConflict() const { return getStatus() == Conflict; } - bool operator==(const PhiState &other) const { + bool operator==(const BDVState &other) const { return base == other.base && status == other.status; } - bool operator!=(const PhiState &other) const { return !(*this == other); } + bool operator!=(const BDVState &other) const { return !(*this == other); } - void dump() { - errs() << status << " (" << base << " - " - << (base ? base->getName() : "nullptr") << "): "; + LLVM_DUMP_METHOD + void dump() const { print(dbgs()); dbgs() << '\n'; } + + void print(raw_ostream &OS) const { + switch (status) { + case Unknown: + OS << "U"; + break; + case Base: + OS << "B"; + break; + case Conflict: + OS << "C"; + break; + }; + OS << " (" << base << " - " + << (base ? base->getName() : "nullptr") << "): "; } private: Status status; - Value *base; // non null only if status == base + AssertingVH<Value> base; // non null only if status == base }; +} -typedef DenseMap<Value *, PhiState> ConflictStateMapTy; -// Values of type PhiState form a lattice, and this is a helper +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { + State.print(OS); + return OS; +} +#endif + +namespace { +// Values of type BDVState form a lattice, and this is a helper // class that implementes the meet operation. The meat of the meet -// operation is implemented in MeetPhiStates::pureMeet -class MeetPhiStates { +// operation is implemented in MeetBDVStates::pureMeet +class MeetBDVStates { public: - // phiStates is a mapping from PHINodes and SelectInst's to PhiStates. - explicit MeetPhiStates(const ConflictStateMapTy &phiStates) - : phiStates(phiStates) {} - - // Destructively meet the current result with the base V. V can - // either be a merge instruction (SelectInst / PHINode), in which - // case its status is looked up in the phiStates map; or a regular - // SSA value, in which case it is assumed to be a base. - void meetWith(Value *V) { - PhiState otherState = getStateForBDV(V); - assert((MeetPhiStates::pureMeet(otherState, currentResult) == - MeetPhiStates::pureMeet(currentResult, otherState)) && - "math is wrong: meet does not commute!"); - currentResult = MeetPhiStates::pureMeet(otherState, currentResult); + /// Initializes the currentResult to the TOP state so that if can be met with + /// any other state to produce that state. + MeetBDVStates() {} + + // Destructively meet the current result with the given BDVState + void meetWith(BDVState otherState) { + currentResult = meet(otherState, currentResult); } - PhiState getResult() const { return currentResult; } + BDVState getResult() const { return currentResult; } private: - const ConflictStateMapTy &phiStates; - PhiState currentResult; - - /// Return a phi state for a base defining value. 
We'll generate a new - /// base state for known bases and expect to find a cached state otherwise - PhiState getStateForBDV(Value *baseValue) { - if (isKnownBaseResult(baseValue)) { - return PhiState(baseValue); - } else { - return lookupFromMap(baseValue); - } - } + BDVState currentResult; - PhiState lookupFromMap(Value *V) { - auto I = phiStates.find(V); - assert(I != phiStates.end() && "lookup failed!"); - return I->second; + /// Perform a meet operation on two elements of the BDVState lattice. + static BDVState meet(BDVState LHS, BDVState RHS) { + assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) && + "math is wrong: meet does not commute!"); + BDVState Result = pureMeet(LHS, RHS); + DEBUG(dbgs() << "meet of " << LHS << " with " << RHS + << " produced " << Result << "\n"); + return Result; } - static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) { + static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) { switch (stateA.getStatus()) { - case PhiState::Unknown: + case BDVState::Unknown: return stateB; - case PhiState::Base: + case BDVState::Base: assert(stateA.getBase() && "can't be null"); if (stateB.isUnknown()) return stateA; @@ -681,18 +666,20 @@ private: assert(stateA == stateB && "equality broken!"); return stateA; } - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); } assert(stateB.isConflict() && "only three states!"); - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); - case PhiState::Conflict: + case BDVState::Conflict: return stateA; } llvm_unreachable("only three states!"); } }; } + + /// For a given value or instruction, figure out what base ptr it's derived /// from. For gc objects, this is simply itself. On success, returns a value /// which is the base pointer. (This is reliable and can be used for @@ -723,171 +710,252 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // // Note: A simpler form of this would be to add the conflict form of all // PHIs without running the optimistic algorithm. This would be - // analougous to pessimistic data flow and would likely lead to an + // analogous to pessimistic data flow and would likely lead to an // overall worse solution. - ConflictStateMapTy states; - states[def] = PhiState(); - // Recursively fill in all phis & selects reachable from the initial one - // for which we don't already know a definite base value for - // TODO: This should be rewritten with a worklist - bool done = false; - while (!done) { - done = true; - // Since we're adding elements to 'states' as we run, we can't keep - // iterators into the set. 
- SmallVector<Value *, 16> Keys; - Keys.reserve(states.size()); - for (auto Pair : states) { - Value *V = Pair.first; - Keys.push_back(V); - } - for (Value *v : Keys) { - assert(!isKnownBaseResult(v) && "why did it get added?"); - if (PHINode *phi = dyn_cast<PHINode>(v)) { - assert(phi->getNumIncomingValues() > 0 && - "zero input phis are illegal"); - for (Value *InVal : phi->incoming_values()) { - Value *local = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } - } - } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { - Value *local = findBaseOrBDV(sel->getTrueValue(), cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } - local = findBaseOrBDV(sel->getFalseValue(), cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } +#ifndef NDEBUG + auto isExpectedBDVType = [](Value *BDV) { + return isa<PHINode>(BDV) || isa<SelectInst>(BDV) || + isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV); + }; +#endif + + // Once populated, will contain a mapping from each potentially non-base BDV + // to a lattice value (described above) which corresponds to that BDV. + // We use the order of insertion (DFS over the def/use graph) to provide a + // stable deterministic ordering for visiting DenseMaps (which are unordered) + // below. This is important for deterministic compilation. + MapVector<Value *, BDVState> States; + + // Recursively fill in all base defining values reachable from the initial + // one for which we don't already know a definite base value for + /* scope */ { + SmallVector<Value*, 16> Worklist; + Worklist.push_back(def); + States.insert(std::make_pair(def, BDVState())); + while (!Worklist.empty()) { + Value *Current = Worklist.pop_back_val(); + assert(!isKnownBaseResult(Current) && "why did it get added?"); + + auto visitIncomingValue = [&](Value *InVal) { + Value *Base = findBaseOrBDV(InVal, cache); + if (isKnownBaseResult(Base)) + // Known bases won't need new instructions introduced and can be + // ignored safely + return; + assert(isExpectedBDVType(Base) && "the only non-base values " + "we see should be base defining values"); + if (States.insert(std::make_pair(Base, BDVState())).second) + Worklist.push_back(Base); + }; + if (PHINode *Phi = dyn_cast<PHINode>(Current)) { + for (Value *InVal : Phi->incoming_values()) + visitIncomingValue(InVal); + } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) { + visitIncomingValue(Sel->getTrueValue()); + visitIncomingValue(Sel->getFalseValue()); + } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) { + visitIncomingValue(EE->getVectorOperand()); + } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) { + visitIncomingValue(IE->getOperand(0)); // vector operand + visitIncomingValue(IE->getOperand(1)); // scalar operand + } else { + // There is one known class of instructions we know we don't handle. 
+ assert(isa<ShuffleVectorInst>(Current)); + llvm_unreachable("unimplemented instruction case"); } } } - if (TraceLSP) { - errs() << "States after initialization:\n"; - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; - state.dump(); - v->dump(); - } +#ifndef NDEBUG + DEBUG(dbgs() << "States after initialization:\n"); + for (auto Pair : States) { + DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } +#endif - // TODO: come back and revisit the state transitions around inputs which - // have reached conflict state. The current version seems too conservative. + // Return a phi state for a base defining value. We'll generate a new + // base state for known bases and expect to find a cached state otherwise. + auto getStateForBDV = [&](Value *baseValue) { + if (isKnownBaseResult(baseValue)) + return BDVState(baseValue); + auto I = States.find(baseValue); + assert(I != States.end() && "lookup failed!"); + return I->second; + }; bool progress = true; while (progress) { #ifndef NDEBUG - size_t oldSize = states.size(); + const size_t oldSize = States.size(); #endif progress = false; - // We're only changing keys in this loop, thus safe to keep iterators - for (auto Pair : states) { - MeetPhiStates calculateMeet(states); - Value *v = Pair.first; - assert(!isKnownBaseResult(v) && "why did it get added?"); - if (SelectInst *select = dyn_cast<SelectInst>(v)) { - calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache)); - calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache)); - } else - for (Value *Val : cast<PHINode>(v)->incoming_values()) - calculateMeet.meetWith(findBaseOrBDV(Val, cache)); - - PhiState oldState = states[v]; - PhiState newState = calculateMeet.getResult(); + // We're only changing values in this loop, thus safe to keep iterators. + // Since this is computing a fixed point, the order of visit does not + // effect the result. TODO: We could use a worklist here and make this run + // much faster. + for (auto Pair : States) { + Value *BDV = Pair.first; + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + + // Given an input value for the current instruction, return a BDVState + // instance which represents the BDV of that value. + auto getStateForInput = [&](Value *V) mutable { + Value *BDV = findBaseOrBDV(V, cache); + return getStateForBDV(BDV); + }; + + MeetBDVStates calculateMeet; + if (SelectInst *select = dyn_cast<SelectInst>(BDV)) { + calculateMeet.meetWith(getStateForInput(select->getTrueValue())); + calculateMeet.meetWith(getStateForInput(select->getFalseValue())); + } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) { + for (Value *Val : Phi->incoming_values()) + calculateMeet.meetWith(getStateForInput(Val)); + } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) { + // The 'meet' for an extractelement is slightly trivial, but it's still + // useful in that it drives us to conflict if our input is. + calculateMeet.meetWith(getStateForInput(EE->getVectorOperand())); + } else { + // Given there's a inherent type mismatch between the operands, will + // *always* produce Conflict. 
+ auto *IE = cast<InsertElementInst>(BDV); + calculateMeet.meetWith(getStateForInput(IE->getOperand(0))); + calculateMeet.meetWith(getStateForInput(IE->getOperand(1))); + } + + BDVState oldState = States[BDV]; + BDVState newState = calculateMeet.getResult(); if (oldState != newState) { progress = true; - states[v] = newState; + States[BDV] = newState; } } - assert(oldSize <= states.size()); - assert(oldSize == states.size() || progress); + assert(oldSize == States.size() && + "fixed point shouldn't be adding any new nodes to state"); } - if (TraceLSP) { - errs() << "States after meet iteration:\n"; - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; - state.dump(); - v->dump(); - } +#ifndef NDEBUG + DEBUG(dbgs() << "States after meet iteration:\n"); + for (auto Pair : States) { + DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } - +#endif + // Insert Phis for all conflicts - // We want to keep naming deterministic in the loop that follows, so - // sort the keys before iteration. This is useful in allowing us to - // write stable tests. Note that there is no invalidation issue here. - SmallVector<Value *, 16> Keys; - Keys.reserve(states.size()); - for (auto Pair : states) { - Value *V = Pair.first; - Keys.push_back(V); - } - std::sort(Keys.begin(), Keys.end(), order_by_name); // TODO: adjust naming patterns to avoid this order of iteration dependency - for (Value *V : Keys) { - Instruction *v = cast<Instruction>(V); - PhiState state = states[V]; - assert(!isKnownBaseResult(v) && "why did it get added?"); - assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (!state.isConflict()) + for (auto Pair : States) { + Instruction *I = cast<Instruction>(Pair.first); + BDVState State = Pair.second; + assert(!isKnownBaseResult(I) && "why did it get added?"); + assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + + // extractelement instructions are a bit special in that we may need to + // insert an extract even when we know an exact base for the instruction. + // The problem is that we need to convert from a vector base to a scalar + // base for the particular indice we're interested in. + if (State.isBase() && isa<ExtractElementInst>(I) && + isa<VectorType>(State.getBase()->getType())) { + auto *EE = cast<ExtractElementInst>(I); + // TODO: In many cases, the new instruction is just EE itself. We should + // exploit this, but can't do it here since it would break the invariant + // about the BDV not being known to be a base. + auto *BaseInst = ExtractElementInst::Create(State.getBase(), + EE->getIndexOperand(), + "base_ee", EE); + BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); + States[I] = BDVState(BDVState::Base, BaseInst); + } + + // Since we're joining a vector and scalar base, they can never be the + // same. As a result, we should always see insert element having reached + // the conflict state. 
+ if (isa<InsertElementInst>(I)) { + assert(State.isConflict()); + } + + if (!State.isConflict()) continue; - if (isa<PHINode>(v)) { - int num_preds = - std::distance(pred_begin(v->getParent()), pred_end(v->getParent())); - assert(num_preds > 0 && "how did we reach here"); - PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v); - // Add metadata marking this as a base value - auto *const_1 = ConstantInt::get( - Type::getInt32Ty( - v->getParent()->getParent()->getParent()->getContext()), - 1); - auto MDConst = ConstantAsMetadata::get(const_1); - MDNode *md = MDNode::get( - v->getParent()->getParent()->getParent()->getContext(), MDConst); - phi->setMetadata("is_base_value", md); - states[v] = PhiState(PhiState::Conflict, phi); + /// Create and insert a new instruction which will represent the base of + /// the given instruction 'I'. + auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* { + if (isa<PHINode>(I)) { + BasicBlock *BB = I->getParent(); + int NumPreds = std::distance(pred_begin(BB), pred_end(BB)); + assert(NumPreds > 0 && "how did we reach here"); + std::string Name = suffixed_name_or(I, ".base", "base_phi"); + return PHINode::Create(I->getType(), NumPreds, Name, I); + } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) { + // The undef will be replaced later + UndefValue *Undef = UndefValue::get(Sel->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_select"); + return SelectInst::Create(Sel->getCondition(), Undef, + Undef, Name, Sel); + } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { + UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_ee"); + return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name, + EE); + } else { + auto *IE = cast<InsertElementInst>(I); + UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType()); + UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_ie"); + return InsertElementInst::Create(VecUndef, ScalarUndef, + IE->getOperand(2), Name, IE); + } + + }; + Instruction *BaseInst = MakeBaseInstPlaceholder(I); + // Add metadata marking this as a base value + BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); + States[I] = BDVState(BDVState::Conflict, BaseInst); + } + + // Returns a instruction which produces the base pointer for a given + // instruction. The instruction is assumed to be an input to one of the BDVs + // seen in the inference algorithm above. As such, we must either already + // know it's base defining value is a base, or have inserted a new + // instruction to propagate the base of it's BDV and have entered that newly + // introduced instruction into the state table. In either case, we are + // assured to be able to determine an instruction which produces it's base + // pointer. 
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { + Value *BDV = findBaseOrBDV(Input, cache); + Value *Base = nullptr; + if (isKnownBaseResult(BDV)) { + Base = BDV; } else { - SelectInst *sel = cast<SelectInst>(v); - // The undef will be replaced later - UndefValue *undef = UndefValue::get(sel->getType()); - SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, - undef, "base_select", sel); - // Add metadata marking this as a base value - auto *const_1 = ConstantInt::get( - Type::getInt32Ty( - v->getParent()->getParent()->getParent()->getContext()), - 1); - auto MDConst = ConstantAsMetadata::get(const_1); - MDNode *md = MDNode::get( - v->getParent()->getParent()->getParent()->getContext(), MDConst); - basesel->setMetadata("is_base_value", md); - states[v] = PhiState(PhiState::Conflict, basesel); + // Either conflict or base. + assert(States.count(BDV)); + Base = States[BDV].getBase(); } - } + assert(Base && "can't be null"); + // The cast is needed since base traversal may strip away bitcasts + if (Base->getType() != Input->getType() && + InsertPt) { + Base = new BitCastInst(Base, Input->getType(), "cast", + InsertPt); + } + return Base; + }; - // Fixup all the inputs of the new PHIs - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; + // Fixup all the inputs of the new PHIs. Visit order needs to be + // deterministic and predictable because we're naming newly created + // instructions. + for (auto Pair : States) { + Instruction *BDV = cast<Instruction>(Pair.first); + BDVState State = Pair.second; - assert(!isKnownBaseResult(v) && "why did it get added?"); - assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (!state.isConflict()) + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + if (!State.isConflict()) continue; - if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { - PHINode *phi = cast<PHINode>(v); + if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) { + PHINode *phi = cast<PHINode>(BDV); unsigned NumPHIValues = phi->getNumIncomingValues(); for (unsigned i = 0; i < NumPHIValues; i++) { Value *InVal = phi->getIncomingValue(i); @@ -906,104 +974,145 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { if (blockIndex != -1) { Value *oldBase = basephi->getIncomingValue(blockIndex); basephi->addIncoming(oldBase, InBB); + #ifndef NDEBUG - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - - // In essense this assert states: the only way two + Value *Base = getBaseForInput(InVal, nullptr); + // In essence this assert states: the only way two // values incoming from the same basic block may be // different is by being different bitcasts of the same // value. A cleanup that remains TODO is changing // findBaseOrBDV to return an llvm::Value of the correct // type (and still remain pure). This will remove the // need to add bitcasts. 
- assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && + assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() && "sanity -- findBaseOrBDV should be pure!"); #endif continue; } - // Find either the defining value for the PHI or the normal base for - // a non-phi node - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basephi->getType()) { - base = new BitCastInst(base, basephi->getType(), "cast", - InBB->getTerminator()); - } - basephi->addIncoming(base, InBB); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast in the incoming block. + // TODO: Need to split critical edges if insertion is needed + Value *Base = getBaseForInput(InVal, InBB->getTerminator()); + basephi->addIncoming(Base, InBB); } assert(basephi->getNumIncomingValues() == NumPHIValues); - } else { - SelectInst *basesel = cast<SelectInst>(state.getBase()); - SelectInst *sel = cast<SelectInst>(v); + } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) { + SelectInst *Sel = cast<SelectInst>(BDV); // Operand 1 & 2 are true, false path respectively. TODO: refactor to // something more safe and less hacky. for (int i = 1; i <= 2; i++) { - Value *InVal = sel->getOperand(i); - // Find either the defining value for the PHI or the normal base for - // a non-phi node - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basesel->getType()) { - base = new BitCastInst(base, basesel->getType(), "cast", basesel); - } - basesel->setOperand(i, base); + Value *InVal = Sel->getOperand(i); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast. + Value *Base = getBaseForInput(InVal, BaseSel); + BaseSel->setOperand(i, Base); } + } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) { + Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand(); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast. + Value *Base = getBaseForInput(InVal, BaseEE); + BaseEE->setOperand(0, Base); + } else { + auto *BaseIE = cast<InsertElementInst>(State.getBase()); + auto *BdvIE = cast<InsertElementInst>(BDV); + auto UpdateOperand = [&](int OperandIdx) { + Value *InVal = BdvIE->getOperand(OperandIdx); + Value *Base = getBaseForInput(InVal, BaseIE); + BaseIE->setOperand(OperandIdx, Base); + }; + UpdateOperand(0); // vector operand + UpdateOperand(1); // scalar operand + } + + } + + // Now that we're done with the algorithm, see if we can optimize the + // results slightly by reducing the number of new instructions needed. + // Arguably, this should be integrated into the algorithm above, but + // doing as a post process step is easier to reason about for the moment. 
+ DenseMap<Value *, Value *> ReverseMap; + SmallPtrSet<Instruction *, 16> NewInsts; + SmallSetVector<AssertingVH<Instruction>, 16> Worklist; + // Note: We need to visit the states in a deterministic order. We uses the + // Keys we sorted above for this purpose. Note that we are papering over a + // bigger problem with the algorithm above - it's visit order is not + // deterministic. A larger change is needed to fix this. + for (auto Pair : States) { + auto *BDV = Pair.first; + auto State = Pair.second; + Value *Base = State.getBase(); + assert(BDV && Base); + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + assert(isKnownBaseResult(Base) && + "must be something we 'know' is a base pointer"); + if (!State.isConflict()) + continue; + + ReverseMap[Base] = BDV; + if (auto *BaseI = dyn_cast<Instruction>(Base)) { + NewInsts.insert(BaseI); + Worklist.insert(BaseI); + } + } + auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI, + Value *Replacement) { + // Add users which are new instructions (excluding self references) + for (User *U : BaseI->users()) + if (auto *UI = dyn_cast<Instruction>(U)) + if (NewInsts.count(UI) && UI != BaseI) + Worklist.insert(UI); + // Then do the actual replacement + NewInsts.erase(BaseI); + ReverseMap.erase(BaseI); + BaseI->replaceAllUsesWith(Replacement); + assert(States.count(BDV)); + assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI); + States[BDV] = BDVState(BDVState::Conflict, Replacement); + BaseI->eraseFromParent(); + }; + const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout(); + while (!Worklist.empty()) { + Instruction *BaseI = Worklist.pop_back_val(); + assert(NewInsts.count(BaseI)); + Value *Bdv = ReverseMap[BaseI]; + if (auto *BdvI = dyn_cast<Instruction>(Bdv)) + if (BaseI->isIdenticalTo(BdvI)) { + DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n"); + ReplaceBaseInstWith(Bdv, BaseI, Bdv); + continue; + } + if (Value *V = SimplifyInstruction(BaseI, DL)) { + DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n"); + ReplaceBaseInstWith(Bdv, BaseI, V); + continue; } } // Cache all of our results so we can cheaply reuse them // NOTE: This is actually two caches: one of the base defining value // relation and one of the base pointer relation! FIXME - for (auto item : states) { - Value *v = item.first; - Value *base = item.second.getBase(); - assert(v && base); - assert(!isKnownBaseResult(v) && "why did it get added?"); - - if (TraceLSP) { - std::string fromstr = - cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "") - : "none"; - errs() << "Updating base value cache" - << " for: " << (v->hasName() ? v->getName() : "") - << " from: " << fromstr - << " to: " << (base->hasName() ? base->getName() : "") << "\n"; - } - - assert(isKnownBaseResult(base) && - "must be something we 'know' is a base pointer"); - if (cache.count(v)) { + for (auto Pair : States) { + auto *BDV = Pair.first; + Value *base = Pair.second.getBase(); + assert(BDV && base); + + std::string fromstr = cache.count(BDV) ? 
cache[BDV]->getName() : "none"; + DEBUG(dbgs() << "Updating base value cache" + << " for: " << BDV->getName() + << " from: " << fromstr + << " to: " << base->getName() << "\n"); + + if (cache.count(BDV)) { // Once we transition from the BDV relation being store in the cache to // the base relation being stored, it must be stable - assert((!isKnownBaseResult(cache[v]) || cache[v] == base) && + assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) && "base relation should be stable"); } - cache[v] = base; + cache[BDV] = base; } - assert(cache.find(def) != cache.end()); + assert(cache.count(def)); return cache[def]; } @@ -1024,7 +1133,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // pointer was a base pointer. static void findBasePointers(const StatepointLiveSetTy &live, - DenseMap<llvm::Value *, llvm::Value *> &PointerToBase, + DenseMap<Value *, Value *> &PointerToBase, DominatorTree *DT, DefiningValueMapTy &DVCache) { // For the naming of values inserted to be deterministic - which makes for // much cleaner and more stable tests - we need to assign an order to the @@ -1043,7 +1152,7 @@ findBasePointers(const StatepointLiveSetTy &live, // If you see this trip and like to live really dangerously, the code should // be correct, just with idioms the verifier can't handle. You can try - // disabling the verifier at your own substaintial risk. + // disabling the verifier at your own substantial risk. assert(!isa<ConstantPointerNull>(base) && "the relocation code needs adjustment to handle the relocation of " "a null pointer constant without causing false positives in the " @@ -1056,8 +1165,8 @@ findBasePointers(const StatepointLiveSetTy &live, static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, const CallSite &CS, PartiallyConstructedSafepointRecord &result) { - DenseMap<llvm::Value *, llvm::Value *> PointerToBase; - findBasePointers(result.liveset, PointerToBase, &DT, DVCache); + DenseMap<Value *, Value *> PointerToBase; + findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache); if (PrintBasePointers) { // Note: Need to print these in a stable order since this is checked in @@ -1071,8 +1180,11 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, std::sort(Temp.begin(), Temp.end(), order_by_name); for (Value *Ptr : Temp) { Value *Base = PointerToBase[Ptr]; - errs() << " derived %" << Ptr->getName() << " base %" << Base->getName() - << "\n"; + errs() << " derived "; + Ptr->printAsOperand(errs(), false); + errs() << " base "; + Base->printAsOperand(errs(), false); + errs() << "\n";; } } @@ -1086,10 +1198,10 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, PartiallyConstructedSafepointRecord &result); static void recomputeLiveInValues( - Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, + Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate, MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { // TODO-PERF: reuse the original liveness, then simply run the dataflow - // again. The old values are still live and will help it stablize quickly. + // again. The old values are still live and will help it stabilize quickly. 
GCPtrLivenessData RevisedLivenessData; computeLiveInValues(DT, F, RevisedLivenessData); for (size_t i = 0; i < records.size(); i++) { @@ -1099,69 +1211,66 @@ static void recomputeLiveInValues( } } -// When inserting gc.relocate calls, we need to ensure there are no uses -// of the original value between the gc.statepoint and the gc.relocate call. -// One case which can arise is a phi node starting one of the successor blocks. -// We also need to be able to insert the gc.relocates only on the path which -// goes through the statepoint. We might need to split an edge to make this -// possible. +// When inserting gc.relocate and gc.result calls, we need to ensure there are +// no uses of the original value / return value between the gc.statepoint and +// the gc.relocate / gc.result call. One case which can arise is a phi node +// starting one of the successor blocks. We also need to be able to insert the +// gc.relocates only on the path which goes through the statepoint. We might +// need to split an edge to make this possible. static BasicBlock * normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, DominatorTree &DT) { BasicBlock *Ret = BB; - if (!BB->getUniquePredecessor()) { - Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, &DT); - } + if (!BB->getUniquePredecessor()) + Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT); - // Now that 'ret' has unique predecessor we can safely remove all phi nodes + // Now that 'Ret' has unique predecessor we can safely remove all phi nodes // from it FoldSingleEntryPHINodes(Ret); - assert(!isa<PHINode>(Ret->begin())); + assert(!isa<PHINode>(Ret->begin()) && + "All PHI nodes should have been removed!"); - // At this point, we can safely insert a gc.relocate as the first instruction - // in Ret if needed. + // At this point, we can safely insert a gc.relocate or gc.result as the first + // instruction in Ret if needed. return Ret; } -static int find_index(ArrayRef<Value *> livevec, Value *val) { - auto itr = std::find(livevec.begin(), livevec.end(), val); - assert(livevec.end() != itr); - size_t index = std::distance(livevec.begin(), itr); - assert(index < livevec.size()); - return index; -} - -// Create new attribute set containing only attributes which can be transfered +// Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeSet legalizeCallAttributes(AttributeSet AS) { - AttributeSet ret; + AttributeSet Ret; for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) { - unsigned index = AS.getSlotIndex(Slot); + unsigned Index = AS.getSlotIndex(Slot); - if (index == AttributeSet::ReturnIndex || - index == AttributeSet::FunctionIndex) { + if (Index == AttributeSet::ReturnIndex || + Index == AttributeSet::FunctionIndex) { - for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end; - ++it) { - Attribute attr = *it; + for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) { // Do not allow certain attributes - just skip them // Safepoint can not be read only or read none. - if (attr.hasAttribute(Attribute::ReadNone) || - attr.hasAttribute(Attribute::ReadOnly)) + if (Attr.hasAttribute(Attribute::ReadNone) || + Attr.hasAttribute(Attribute::ReadOnly)) + continue; + + // These attributes control the generation of the gc.statepoint call / + // invoke itself; and once the gc.statepoint is in place, they're of no + // use. 
+ if (Attr.hasAttribute("statepoint-num-patch-bytes") || + Attr.hasAttribute("statepoint-id")) continue; - ret = ret.addAttributes( - AS.getContext(), index, - AttributeSet::get(AS.getContext(), index, AttrBuilder(attr))); + Ret = Ret.addAttributes( + AS.getContext(), Index, + AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr))); } } // Just skip parameter attributes for now } - return ret; + return Ret; } /// Helper function to place all gc relocates necessary for the given @@ -1173,225 +1282,306 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) { /// statepointToken - statepoint instruction to which relocates should be /// bound. /// Builder - Llvm IR builder to be used to construct new calls. -static void CreateGCRelocates(ArrayRef<llvm::Value *> LiveVariables, +static void CreateGCRelocates(ArrayRef<Value *> LiveVariables, const int LiveStart, - ArrayRef<llvm::Value *> BasePtrs, + ArrayRef<Value *> BasePtrs, Instruction *StatepointToken, IRBuilder<> Builder) { - SmallVector<Instruction *, 64> NewDefs; - NewDefs.reserve(LiveVariables.size()); + if (LiveVariables.empty()) + return; + + auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) { + auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val); + assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); + size_t Index = std::distance(LiveVec.begin(), ValIt); + assert(Index < LiveVec.size() && "Bug in std::find?"); + return Index; + }; + Module *M = StatepointToken->getModule(); + + // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose + // element type is i8 addrspace(1)*). We originally generated unique + // declarations for each pointer type, but this proved problematic because + // the intrinsic mangling code is incomplete and fragile. Since we're moving + // towards a single unified pointer type anyways, we can just cast everything + // to an i8* of the right address space. A bitcast is added later to convert + // gc_relocate to the actual value's type. + auto getGCRelocateDecl = [&] (Type *Ty) { + assert(isHandledGCPointerType(Ty)); + auto AS = Ty->getScalarType()->getPointerAddressSpace(); + Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS); + if (auto *VT = dyn_cast<VectorType>(Ty)) + NewTy = VectorType::get(NewTy, VT->getNumElements()); + return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, + {NewTy}); + }; - Module *M = StatepointToken->getParent()->getParent()->getParent(); + // Lazily populated map from input types to the canonicalized form mentioned + // in the comment above. This should probably be cached somewhere more + // broadly. + DenseMap<Type*, Value*> TypeToDeclMap; for (unsigned i = 0; i < LiveVariables.size(); i++) { - // We generate a (potentially) unique declaration for every pointer type - // combination. This results is some blow up the function declarations in - // the IR, but removes the need for argument bitcasts which shrinks the IR - // greatly and makes it much more readable. - SmallVector<Type *, 1> Types; // one per 'any' type - // All gc_relocate are set to i8 addrspace(1)* type. This could help avoid - // cases where the actual value's type mangling is not supported by llvm. A - // bitcast is added later to convert gc_relocate to the actual value's type. 
- Types.push_back(Type::getInt8PtrTy(M->getContext(), 1)); - Value *GCRelocateDecl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_gc_relocate, Types); - // Generate the gc.relocate call and save the result Value *BaseIdx = - ConstantInt::get(Type::getInt32Ty(M->getContext()), - LiveStart + find_index(LiveVariables, BasePtrs[i])); - Value *LiveIdx = ConstantInt::get( - Type::getInt32Ty(M->getContext()), - LiveStart + find_index(LiveVariables, LiveVariables[i])); + Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i])); + Value *LiveIdx = Builder.getInt32(LiveStart + i); + + Type *Ty = LiveVariables[i]->getType(); + if (!TypeToDeclMap.count(Ty)) + TypeToDeclMap[Ty] = getGCRelocateDecl(Ty); + Value *GCRelocateDecl = TypeToDeclMap[Ty]; // only specify a debug name if we can give a useful one - Value *Reloc = Builder.CreateCall( + CallInst *Reloc = Builder.CreateCall( GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx}, - LiveVariables[i]->hasName() ? LiveVariables[i]->getName() + ".relocated" - : ""); + suffixed_name_or(LiveVariables[i], ".relocated", "")); // Trick CodeGen into thinking there are lots of free registers at this // fake call. - cast<CallInst>(Reloc)->setCallingConv(CallingConv::Cold); + Reloc->setCallingConv(CallingConv::Cold); + } +} + +namespace { + +/// This struct is used to defer RAUWs and `eraseFromParent` s. Using this +/// avoids having to worry about keeping around dangling pointers to Values. +class DeferredReplacement { + AssertingVH<Instruction> Old; + AssertingVH<Instruction> New; + +public: + explicit DeferredReplacement(Instruction *Old, Instruction *New) : + Old(Old), New(New) { + assert(Old != New && "Not allowed!"); + } + + /// Does the task represented by this instance. + void doReplacement() { + Instruction *OldI = Old; + Instruction *NewI = New; + + assert(OldI != NewI && "Disallowed at construction?!"); + + Old = nullptr; + New = nullptr; - NewDefs.push_back(cast<Instruction>(Reloc)); + if (NewI) + OldI->replaceAllUsesWith(NewI); + OldI->eraseFromParent(); } - assert(NewDefs.size() == LiveVariables.size() && - "missing or extra redefinition at safepoint"); +}; } static void -makeStatepointExplicitImpl(const CallSite &CS, /* to replace */ - const SmallVectorImpl<llvm::Value *> &basePtrs, - const SmallVectorImpl<llvm::Value *> &liveVariables, - Pass *P, - PartiallyConstructedSafepointRecord &result) { - assert(basePtrs.size() == liveVariables.size()); - assert(isStatepoint(CS) && +makeStatepointExplicitImpl(const CallSite CS, /* to replace */ + const SmallVectorImpl<Value *> &BasePtrs, + const SmallVectorImpl<Value *> &LiveVariables, + PartiallyConstructedSafepointRecord &Result, + std::vector<DeferredReplacement> &Replacements) { + assert(BasePtrs.size() == LiveVariables.size()); + assert((UseDeoptBundles || isStatepoint(CS)) && "This method expects to be rewriting a statepoint"); - BasicBlock *BB = CS.getInstruction()->getParent(); - assert(BB); - Function *F = BB->getParent(); - assert(F && "must be set"); - Module *M = F->getParent(); - (void)M; - assert(M && "must be set"); - - // We're not changing the function signature of the statepoint since the gc - // arguments go into the var args section. - Function *gc_statepoint_decl = CS.getCalledFunction(); - // Then go ahead and use the builder do actually do the inserts. We insert // immediately before the previous instruction under the assumption that all // arguments will be available here. We can't insert afterwards since we may // be replacing a terminator. 
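A usage sketch for the DeferredReplacement helper above (OldCall, DeadCall, and NewToken are hypothetical names): replacements are queued while other safepoint records may still hold raw Instruction pointers, then applied in one batch.

    std::vector<DeferredReplacement> Replacements;
    // While rewriting each call site:
    //   Replacements.emplace_back(OldCall, NewToken); // RAUW, then erase
    //   Replacements.emplace_back(DeadCall, nullptr); // no uses; just erase
    // Once no PartiallyConstructedSafepointRecord refers to the old calls:
    for (DeferredReplacement &R : Replacements)
      R.doReplacement();
    Replacements.clear();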
- Instruction *insertBefore = CS.getInstruction(); - IRBuilder<> Builder(insertBefore); - // Copy all of the arguments from the original statepoint - this includes the - // target, call args, and deopt args - SmallVector<llvm::Value *, 64> args; - args.insert(args.end(), CS.arg_begin(), CS.arg_end()); - // TODO: Clear the 'needs rewrite' flag - - // add all the pointers to be relocated (gc arguments) - // Capture the start of the live variable list for use in the gc_relocates - const int live_start = args.size(); - args.insert(args.end(), liveVariables.begin(), liveVariables.end()); + Instruction *InsertBefore = CS.getInstruction(); + IRBuilder<> Builder(InsertBefore); + + ArrayRef<Value *> GCArgs(LiveVariables); + uint64_t StatepointID = 0xABCDEF00; + uint32_t NumPatchBytes = 0; + uint32_t Flags = uint32_t(StatepointFlags::None); + + ArrayRef<Use> CallArgs; + ArrayRef<Use> DeoptArgs; + ArrayRef<Use> TransitionArgs; + + Value *CallTarget = nullptr; + + if (UseDeoptBundles) { + CallArgs = {CS.arg_begin(), CS.arg_end()}; + DeoptArgs = GetDeoptBundleOperands(CS); + // TODO: we don't fill in TransitionArgs or Flags in this branch, but we + // could have an operand bundle for that too. + AttributeSet OriginalAttrs = CS.getAttributes(); + + Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, + "statepoint-id"); + if (AttrID.isStringAttribute()) + AttrID.getValueAsString().getAsInteger(10, StatepointID); + + Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute( + AttributeSet::FunctionIndex, "statepoint-num-patch-bytes"); + if (AttrNumPatchBytes.isStringAttribute()) + AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes); + + CallTarget = CS.getCalledValue(); + } else { + // This branch will be gone soon, and we will soon only support the + // UseDeoptBundles == true configuration. + Statepoint OldSP(CS); + StatepointID = OldSP.getID(); + NumPatchBytes = OldSP.getNumPatchBytes(); + Flags = OldSP.getFlags(); + + CallArgs = {OldSP.arg_begin(), OldSP.arg_end()}; + DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()}; + TransitionArgs = {OldSP.gc_transition_args_begin(), + OldSP.gc_transition_args_end()}; + CallTarget = OldSP.getCalledValue(); + } // Create the statepoint given all the arguments - Instruction *token = nullptr; - AttributeSet return_attributes; + Instruction *Token = nullptr; + AttributeSet ReturnAttrs; if (CS.isCall()) { - CallInst *toReplace = cast<CallInst>(CS.getInstruction()); - CallInst *call = - Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token"); - call->setTailCall(toReplace->isTailCall()); - call->setCallingConv(toReplace->getCallingConv()); + CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); + CallInst *Call = Builder.CreateGCStatepointCall( + StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs, + TransitionArgs, DeoptArgs, GCArgs, "safepoint_token"); + + Call->setTailCall(ToReplace->isTailCall()); + Call->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain // function attributes. - AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); - // In case if we can handle this set of sttributes - set up function attrs + AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); + // In case if we can handle this set of attributes - set up function attrs // directly on statepoint and return attrs later for gc_result intrinsic. 
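The two integer-valued statepoint attributes above travel as string attributes; a sketch of the parsing pattern, with a hypothetical call site CS (StringRef::getAsInteger returns true on a malformed integer, and that result is deliberately ignored here, as in the patch):

    uint64_t StatepointID = 0xABCDEF00; // default when the attribute is absent
    AttributeSet Attrs = CS.getAttributes();
    Attribute A =
        Attrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
    if (A.isStringAttribute())
      A.getValueAsString().getAsInteger(10, StatepointID); // "42" -> 42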
- call->setAttributes(new_attrs.getFnAttributes()); - return_attributes = new_attrs.getRetAttributes(); + Call->setAttributes(NewAttrs.getFnAttributes()); + ReturnAttrs = NewAttrs.getRetAttributes(); - token = call; + Token = Call; // Put the following gc_result and gc_relocate calls immediately after // the old call (which we're about to delete) - BasicBlock::iterator next(toReplace); - assert(BB->end() != next && "not a terminator, must have next"); - next++; - Instruction *IP = &*(next); - Builder.SetInsertPoint(IP); - Builder.SetCurrentDebugLocation(IP->getDebugLoc()); - + assert(ToReplace->getNextNode() && "Not a terminator, must have next!"); + Builder.SetInsertPoint(ToReplace->getNextNode()); + Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc()); } else { - InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction()); + InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction()); // Insert the new invoke into the old block. We'll remove the old one in a // moment at which point this will become the new terminator for the // original block. - InvokeInst *invoke = InvokeInst::Create( - gc_statepoint_decl, toReplace->getNormalDest(), - toReplace->getUnwindDest(), args, "", toReplace->getParent()); - invoke->setCallingConv(toReplace->getCallingConv()); + InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( + StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(), + ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, + GCArgs, "statepoint_token"); + + Invoke->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain // function attributes. - AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); - // In case if we can handle this set of sttributes - set up function attrs + AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); + // In case if we can handle this set of attributes - set up function attrs // directly on statepoint and return attrs later for gc_result intrinsic. - invoke->setAttributes(new_attrs.getFnAttributes()); - return_attributes = new_attrs.getRetAttributes(); + Invoke->setAttributes(NewAttrs.getFnAttributes()); + ReturnAttrs = NewAttrs.getRetAttributes(); - token = invoke; + Token = Invoke; // Generate gc relocates in exceptional path - BasicBlock *unwindBlock = toReplace->getUnwindDest(); - assert(!isa<PHINode>(unwindBlock->begin()) && - unwindBlock->getUniquePredecessor() && + BasicBlock *UnwindBlock = ToReplace->getUnwindDest(); + assert(!isa<PHINode>(UnwindBlock->begin()) && + UnwindBlock->getUniquePredecessor() && "can't safely insert in this block!"); - Instruction *IP = &*(unwindBlock->getFirstInsertionPt()); - Builder.SetInsertPoint(IP); - Builder.SetCurrentDebugLocation(toReplace->getDebugLoc()); + Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); - // Extract second element from landingpad return value. We will attach - // exceptional gc relocates to it. - const unsigned idx = 1; - Instruction *exceptional_token = - cast<Instruction>(Builder.CreateExtractValue( - unwindBlock->getLandingPadInst(), idx, "relocate_token")); - result.UnwindToken = exceptional_token; + // Attach exceptional gc relocates to the landingpad. + Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst(); + Result.UnwindToken = ExceptionalToken; - // Just throw away return value. We will use the one we got for normal - // block.
- (void)CreateGCRelocates(liveVariables, live_start, basePtrs, - exceptional_token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken, + Builder); // Generate gc relocates and returns for normal block - BasicBlock *normalDest = toReplace->getNormalDest(); - assert(!isa<PHINode>(normalDest->begin()) && - normalDest->getUniquePredecessor() && + BasicBlock *NormalDest = ToReplace->getNormalDest(); + assert(!isa<PHINode>(NormalDest->begin()) && + NormalDest->getUniquePredecessor() && "can't safely insert in this block!"); - IP = &*(normalDest->getFirstInsertionPt()); - Builder.SetInsertPoint(IP); + Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt()); // gc relocates will be generated later as if it were regular call // statepoint } - assert(token); - - // Take the name of the original value call if it had one. - token->takeName(CS.getInstruction()); + assert(Token && "Should be set in one of the above branches!"); + + if (UseDeoptBundles) { + Token->setName("statepoint_token"); + if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { + StringRef Name = + CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; + CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name); + GCResult->setAttributes(CS.getAttributes().getRetAttributes()); + + // We cannot RAUW or delete CS.getInstruction() because it could be in the + // live set of some other safepoint, in which case that safepoint's + // PartiallyConstructedSafepointRecord will hold a raw pointer to this + // llvm::Instruction. Instead, we defer the replacement and deletion to + // after the live sets have been made explicit in the IR, and we no longer + // have raw pointers to worry about. + Replacements.emplace_back(CS.getInstruction(), GCResult); + } else { + Replacements.emplace_back(CS.getInstruction(), nullptr); + } + } else { + assert(!CS.getInstruction()->hasNUsesOrMore(2) && + "only valid use before rewrite is gc.result"); + assert(!CS.getInstruction()->hasOneUse() || + isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin()))); -// The GCResult is already inserted, we just need to find it -#ifndef NDEBUG - Instruction *toReplace = CS.getInstruction(); - assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) && - "only valid use before rewrite is gc.result"); - assert(!toReplace->hasOneUse() || - isGCResult(cast<Instruction>(*toReplace->user_begin()))); -#endif + // Take the name of the original statepoint token if there was one. + Token->takeName(CS.getInstruction()); - // Update the gc.result of the original statepoint (if any) to use the newly - // inserted statepoint. This is safe to do here since the token can't be - // considered a live reference. - CS.getInstruction()->replaceAllUsesWith(token); + // Update the gc.result of the original statepoint (if any) to use the newly + // inserted statepoint. This is safe to do here since the token can't be + // considered a live reference. 
+ CS.getInstruction()->replaceAllUsesWith(Token); + CS.getInstruction()->eraseFromParent(); + } - result.StatepointToken = token; + Result.StatepointToken = Token; // Second, create a gc.relocate for every live variable - CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder); } namespace { -struct name_ordering { - Value *base; - Value *derived; - bool operator()(name_ordering const &a, name_ordering const &b) { - return -1 == a.derived->getName().compare(b.derived->getName()); +struct NameOrdering { + Value *Base; + Value *Derived; + + bool operator()(NameOrdering const &a, NameOrdering const &b) { + return -1 == a.Derived->getName().compare(b.Derived->getName()); } }; } -static void stablize_order(SmallVectorImpl<Value *> &basevec, - SmallVectorImpl<Value *> &livevec) { - assert(basevec.size() == livevec.size()); - - SmallVector<name_ordering, 64> temp; - for (size_t i = 0; i < basevec.size(); i++) { - name_ordering v; - v.base = basevec[i]; - v.derived = livevec[i]; - temp.push_back(v); - } - std::sort(temp.begin(), temp.end(), name_ordering()); - for (size_t i = 0; i < basevec.size(); i++) { - basevec[i] = temp[i].base; - livevec[i] = temp[i].derived; + +static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec, + SmallVectorImpl<Value *> &LiveVec) { + assert(BaseVec.size() == LiveVec.size()); + + SmallVector<NameOrdering, 64> Temp; + for (size_t i = 0; i < BaseVec.size(); i++) { + NameOrdering v; + v.Base = BaseVec[i]; + v.Derived = LiveVec[i]; + Temp.push_back(v); + } + + std::sort(Temp.begin(), Temp.end(), NameOrdering()); + for (size_t i = 0; i < BaseVec.size(); i++) { + BaseVec[i] = Temp[i].Base; + LiveVec[i] = Temp[i].Derived; } } @@ -1401,71 +1591,63 @@ static void stablize_order(SmallVectorImpl<Value *> &basevec, // WARNING: Does not do any fixup to adjust users of the original live // values. That's the callers responsibility. static void -makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P, - PartiallyConstructedSafepointRecord &result) { - auto liveset = result.liveset; - auto PointerToBase = result.PointerToBase; +makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, + PartiallyConstructedSafepointRecord &Result, + std::vector<DeferredReplacement> &Replacements) { + const auto &LiveSet = Result.LiveSet; + const auto &PointerToBase = Result.PointerToBase; // Convert to vector for efficient cross referencing. - SmallVector<Value *, 64> basevec, livevec; - livevec.reserve(liveset.size()); - basevec.reserve(liveset.size()); - for (Value *L : liveset) { - livevec.push_back(L); - - assert(PointerToBase.find(L) != PointerToBase.end()); - Value *base = PointerToBase[L]; - basevec.push_back(base); + SmallVector<Value *, 64> BaseVec, LiveVec; + LiveVec.reserve(LiveSet.size()); + BaseVec.reserve(LiveSet.size()); + for (Value *L : LiveSet) { + LiveVec.push_back(L); + assert(PointerToBase.count(L)); + Value *Base = PointerToBase.find(L)->second; + BaseVec.push_back(Base); } - assert(livevec.size() == basevec.size()); + assert(LiveVec.size() == BaseVec.size()); // To make the output IR slightly more stable (for use in diffs), ensure a // fixed order of the values in the safepoint (by sorting the value name). // The order is otherwise meaningless. 
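The same stabilization in self-contained form (plain C++, illustrative only, sorting name strings rather than Value*): order the (base, derived) pairs by the derived name so repeated runs of the pass emit operands in one fixed order.

    #include <algorithm>
    #include <string>
    #include <utility>
    #include <vector>

    using NamePair = std::pair<std::string, std::string>; // (base, derived)

    static void stabilize(std::vector<NamePair> &Pairs) {
      // Compare on the derived name only, matching NameOrdering::operator().
      std::sort(Pairs.begin(), Pairs.end(),
                [](const NamePair &A, const NamePair &B) {
                  return A.second < B.second;
                });
    }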
- stablize_order(basevec, livevec); + StabilizeOrder(BaseVec, LiveVec); // Do the actual rewriting and delete the old statepoint - makeStatepointExplicitImpl(CS, basevec, livevec, P, result); - CS.getInstruction()->eraseFromParent(); + makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements); } // Helper function for the relocationViaAlloca. -// It receives iterator to the statepoint gc relocates and emits store to the -// assigned -// location (via allocaMap) for the each one of them. -// Add visited values into the visitedLiveValues set we will later use them -// for sanity check. +// +// It receives an iterator to the statepoint gc relocates and emits a store to +// the assigned location (via allocaMap) for each one of them. It adds the +// visited values into the visitedLiveValues set, which we will later use +// for sanity checking. static void insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, DenseMap<Value *, Value *> &AllocaMap, DenseSet<Value *> &VisitedLiveValues) { for (User *U : GCRelocs) { - if (!isa<IntrinsicInst>(U)) + GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U); + if (!Relocate) continue; - IntrinsicInst *RelocatedValue = cast<IntrinsicInst>(U); - - // We only care about relocates - if (RelocatedValue->getIntrinsicID() != - Intrinsic::experimental_gc_relocate) { - continue; - } - - GCRelocateOperands RelocateOperands(RelocatedValue); - Value *OriginalValue = - const_cast<Value *>(RelocateOperands.getDerivedPtr()); + Value *OriginalValue = const_cast<Value *>(Relocate->getDerivedPtr()); assert(AllocaMap.count(OriginalValue)); Value *Alloca = AllocaMap[OriginalValue]; // Emit store into the related alloca - // All gc_relocate are i8 addrspace(1)* typed, and it must be bitcasted to + // All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to // the correct type according to alloca. - assert(RelocatedValue->getNextNode() && "Should always have one since it's not a terminator"); - IRBuilder<> Builder(RelocatedValue->getNextNode()); + assert(Relocate->getNextNode() && + "Should always have one since it's not a terminator"); + IRBuilder<> Builder(Relocate->getNextNode()); Value *CastedRelocatedValue = - Builder.CreateBitCast(RelocatedValue, cast<AllocaInst>(Alloca)->getAllocatedType(), - RelocatedValue->hasName() ? RelocatedValue->getName() + ".casted" : "") ; + Builder.CreateBitCast(Relocate, + cast<AllocaInst>(Alloca)->getAllocatedType(), + suffixed_name_or(Relocate, ".casted", "")); StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca); Store->insertAfter(cast<Instruction>(CastedRelocatedValue)); @@ -1501,10 +1683,10 @@ insertRematerializationStores( } } -/// do all the relocation update via allocas and mem2reg +/// Do all the relocation update via allocas and mem2reg static void relocationViaAlloca( Function &F, DominatorTree &DT, ArrayRef<Value *> Live, - ArrayRef<struct PartiallyConstructedSafepointRecord> Records) { + ArrayRef<PartiallyConstructedSafepointRecord> Records) { #ifndef NDEBUG // record initial number of (static) allocas; we'll check we have the same // number when we get done.
@@ -1531,15 +1713,12 @@ static void relocationViaAlloca( PromotableAllocas.push_back(Alloca); }; - // emit alloca for each live gc pointer - for (unsigned i = 0; i < Live.size(); i++) { - emitAllocaFor(Live[i]); - } - - // emit allocas for rematerialized values - for (size_t i = 0; i < Records.size(); i++) { - const struct PartiallyConstructedSafepointRecord &Info = Records[i]; + // Emit alloca for each live gc pointer + for (Value *V : Live) + emitAllocaFor(V); + // Emit allocas for rematerialized values + for (const auto &Info : Records) for (auto RematerializedValuePair : Info.RematerializedValues) { Value *OriginalValue = RematerializedValuePair.second; if (AllocaMap.count(OriginalValue) != 0) @@ -1548,20 +1727,17 @@ static void relocationViaAlloca( emitAllocaFor(OriginalValue); ++NumRematerializedValues; } - } // The next two loops are part of the same conceptual operation. We need to // insert a store to the alloca after the original def and at each // redefinition. We need to insert a load before each use. These are split // into distinct loops for performance reasons. - // update gc pointer after each statepoint - // either store a relocated value or null (if no relocated value found for - // this gc pointer and it is not a gc_result) - // this must happen before we update the statepoint with load of alloca - // otherwise we lose the link between statepoint and old def - for (size_t i = 0; i < Records.size(); i++) { - const struct PartiallyConstructedSafepointRecord &Info = Records[i]; + // Update gc pointer after each statepoint: either store a relocated value or + // null (if no relocated value was found for this gc pointer and it is not a + // gc_result). This must happen before we update the statepoint with load of + // alloca otherwise we lose the link between statepoint and old def. + for (const auto &Info : Records) { Value *Statepoint = Info.StatepointToken; // This will be used for consistency check @@ -1582,7 +1758,7 @@ static void relocationViaAlloca( VisitedLiveValues); if (ClobberNonLive) { - // As a debuging aid, pretend that an unrelocated pointer becomes null at + // As a debugging aid, pretend that an unrelocated pointer becomes null at // the gc.statepoint. This will turn some subtle GC problems into // slightly easier to debug SEGVs. Note that on large IR files with // lots of gc.statepoints this is extremely costly both memory and time @@ -1612,23 +1788,22 @@ static void relocationViaAlloca( // Insert the clobbering stores. These may get intermixed with the // gc.results and gc.relocates, but that's fine. if (auto II = dyn_cast<InvokeInst>(Statepoint)) { - InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt()); - InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt()); + InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt()); + InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt()); } else { - BasicBlock::iterator Next(cast<CallInst>(Statepoint)); - Next++; - InsertClobbersAt(Next); + InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode()); } } } - // update use with load allocas and add store for gc_relocated + + // Update use with load allocas and add store for gc_relocated. for (auto Pair : AllocaMap) { Value *Def = Pair.first; Value *Alloca = Pair.second; - // we pre-record the uses of allocas so that we dont have to worry about - // later update - // that change the user information. + // We pre-record the uses of allocas so that we don't have to worry about + // later updates that change the user information.
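The shape of relocationViaAlloca as a whole, condensed into a sketch (LLVM API; Def, AllSlots, and the insertion points are assumed names): demote each live pointer to a stack slot, store into the slot at every definition and relocation, load before every use, and let mem2reg rebuild SSA.

    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/PromoteMemToReg.h"
    using namespace llvm;

    // Step 1: one entry-block slot per live GC pointer.
    static AllocaInst *demoteToSlot(Value *Def, Function &F) {
      Instruction *IP = &*F.getEntryBlock().getFirstInsertionPt();
      return new AllocaInst(Def->getType(), Def->getName() + ".slot", IP);
    }
    // Step 2: after each statepoint, store the (bitcast) relocated value into
    // the slot. Step 3: rewrite each pre-existing use to load from the slot.
    // Step 4: PromoteMemToReg(AllSlots, DT) folds the slots back into SSA,
    // inserting exactly the PHI nodes the relocations made necessary.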
+ SmallVector<Instruction *, 20> Uses; // PERF: trade a linear scan for repeated reallocation Uses.reserve(std::distance(Def->user_begin(), Def->user_end())); @@ -1663,9 +1838,9 @@ static void relocationViaAlloca( } } - // emit store for the initial gc value - // store must be inserted after load, otherwise store will be in alloca's - // use list and an extra load will be inserted before it + // Emit store for the initial gc value. Store must be inserted after load, + // otherwise store will be in alloca's use list and an extra load will be + // inserted before it. StoreInst *Store = new StoreInst(Def, Alloca); if (Instruction *Inst = dyn_cast<Instruction>(Def)) { if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) { @@ -1688,14 +1863,13 @@ static void relocationViaAlloca( assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues && "we must have the same allocas with lives"); if (!PromotableAllocas.empty()) { - // apply mem2reg to promote alloca to SSA + // Apply mem2reg to promote alloca to SSA PromoteMemToReg(PromotableAllocas, DT); } #ifndef NDEBUG - for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E; - I++) - if (isa<AllocaInst>(*I)) + for (auto &I : F.getEntryBlock()) + if (isa<AllocaInst>(I)) InitialAllocaNum--; assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas"); #endif @@ -1719,28 +1893,27 @@ static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values, // No values to hold live, might as well not insert the empty holder return; - Module *M = CS.getInstruction()->getParent()->getParent()->getParent(); + Module *M = CS.getInstruction()->getModule(); // Use a dummy vararg function to actually hold the values live Function *Func = cast<Function>(M->getOrInsertFunction( "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true))); if (CS.isCall()) { // For call safepoints insert dummy calls right after safepoint - BasicBlock::iterator Next(CS.getInstruction()); - Next++; - Holders.push_back(CallInst::Create(Func, Values, "", Next)); + Holders.push_back(CallInst::Create(Func, Values, "", + &*++CS.getInstruction()->getIterator())); return; } // For invoke safepoints insert dummy calls both in normal and // exceptional destination blocks auto *II = cast<InvokeInst>(CS.getInstruction()); Holders.push_back(CallInst::Create( - Func, Values, "", II->getNormalDest()->getFirstInsertionPt())); + Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt())); Holders.push_back(CallInst::Create( - Func, Values, "", II->getUnwindDest()->getFirstInsertionPt())); + Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt())); } static void findLiveReferences( - Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, + Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate, MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { GCPtrLivenessData OriginalLivenessData; computeLiveInValues(DT, F, OriginalLivenessData); @@ -1751,12 +1924,12 @@ static void findLiveReferences( } } -/// Remove any vector of pointers from the liveset by scalarizing them over the -/// statepoint instruction. Adds the scalarized pieces to the liveset. It -/// would be preferrable to include the vector in the statepoint itself, but +/// Remove any vector of pointers from the live set by scalarizing them over the +/// statepoint instruction. Adds the scalarized pieces to the live set.
It +/// would be preferable to include the vector in the statepoint itself, but /// the lowering code currently does not handle that. Extending it would be /// slightly non-trivial since it requires a format change. Given how rare -/// such cases are (for the moment?) scalarizing is an acceptable comprimise. +/// such cases are (for the moment?) scalarizing is an acceptable compromise. static void splitVectorValues(Instruction *StatepointInst, StatepointLiveSetTy &LiveSet, DenseMap<Value *, Value *>& PointerToBase, DominatorTree &DT) { @@ -1887,7 +2060,7 @@ static void splitVectorValues(Instruction *StatepointInst, // Helper function for the "rematerializeLiveValues". It walks use chain // starting from the "CurrentValue" until it meets "BaseValue". Only "simple" // values are visited (currently it is GEP's and casts). Returns true if it -// sucessfully reached "BaseValue" and false otherwise. +// successfully reached "BaseValue" and false otherwise. // Fills "ChainToBase" array with all visited values. "BaseValue" is not // recorded. static bool findRematerializableChainToBasePointer( @@ -1907,16 +2080,12 @@ static bool findRematerializableChainToBasePointer( } if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) { - Value *Def = CI->stripPointerCasts(); - - // This two checks are basically similar. First one is here for the - // consistency with findBasePointers logic. - assert(!isa<CastInst>(Def) && "not a pointer cast found"); if (!CI->isNoopCast(CI->getModule()->getDataLayout())) return false; ChainToBase.push_back(CI); - return findRematerializableChainToBasePointer(ChainToBase, Def, BaseValue); + return findRematerializableChainToBasePointer(ChainToBase, + CI->getOperand(0), BaseValue); } // Not supported instruction in the chain @@ -1957,8 +2126,8 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, return Cost; } -// From the statepoint liveset pick values that are cheaper to recompute then to -// relocate. Remove this values from the liveset, rematerialize them after +// From the statepoint live set pick values that are cheaper to recompute than +// to relocate. Remove these values from the live set, rematerialize them after // statepoint and record them in "Info" structure. Note that similar to // relocated values we don't do any user adjustments here. static void rematerializeLiveValues(CallSite CS, @@ -1970,10 +2139,10 @@ static void rematerializeLiveValues(CallSite CS, // We cannot do this in the following loop due to iterator invalidation.
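An aside on findRematerializableChainToBasePointer above: the recursion flattened into a loop (a sketch, assuming the usual LLVM headers, with the same accepted instruction kinds) makes the walk easier to see. Follow GEPs and no-op casts from the derived pointer back toward the base, recording each link, and give up on anything else.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    static bool chainToBase(llvm::Value *Cur, llvm::Value *Base,
                            llvm::SmallVectorImpl<llvm::Instruction *> &Chain) {
      using namespace llvm;
      while (Cur != Base) {
        if (auto *GEP = dyn_cast<GetElementPtrInst>(Cur)) {
          Chain.push_back(GEP);
          Cur = GEP->getPointerOperand();
        } else if (auto *CI = dyn_cast<CastInst>(Cur)) {
          if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
            return false; // value-changing cast: not rematerializable
          Chain.push_back(CI);
          Cur = CI->getOperand(0);
        } else {
          return false; // unsupported link in the chain
        }
      }
      return true;
    }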
SmallVector<Value *, 32> LiveValuesToBeDeleted; - for (Value *LiveValue: Info.liveset) { + for (Value *LiveValue: Info.LiveSet) { // For each live pointer find its defining chain SmallVector<Instruction *, 3> ChainToBase; - assert(Info.PointerToBase.find(LiveValue) != Info.PointerToBase.end()); + assert(Info.PointerToBase.count(LiveValue)); bool FoundChain = findRematerializableChainToBasePointer(ChainToBase, LiveValue, @@ -2059,9 +2228,9 @@ static void rematerializeLiveValues(CallSite CS, InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction()); Instruction *NormalInsertBefore = - Invoke->getNormalDest()->getFirstInsertionPt(); + &*Invoke->getNormalDest()->getFirstInsertionPt(); Instruction *UnwindInsertBefore = - Invoke->getUnwindDest()->getFirstInsertionPt(); + &*Invoke->getUnwindDest()->getFirstInsertionPt(); Instruction *NormalRematerializedValue = rematerializeChain(NormalInsertBefore); @@ -2075,22 +2244,23 @@ static void rematerializeLiveValues(CallSite CS, // Remove rematerialized values from the live set for (auto LiveValue: LiveValuesToBeDeleted) { - Info.liveset.erase(LiveValue); + Info.LiveSet.erase(LiveValue); } } -static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, - SmallVectorImpl<CallSite> &toUpdate) { +static bool insertParsePoints(Function &F, DominatorTree &DT, + TargetTransformInfo &TTI, + SmallVectorImpl<CallSite> &ToUpdate) { #ifndef NDEBUG // sanity check the input - std::set<CallSite> uniqued; - uniqued.insert(toUpdate.begin(), toUpdate.end()); - assert(uniqued.size() == toUpdate.size() && "no duplicates please!"); + std::set<CallSite> Uniqued; + Uniqued.insert(ToUpdate.begin(), ToUpdate.end()); + assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!"); - for (size_t i = 0; i < toUpdate.size(); i++) { - CallSite &CS = toUpdate[i]; + for (CallSite CS : ToUpdate) { assert(CS.getInstruction()->getParent()->getParent() == &F); - assert(isStatepoint(CS) && "expected to already be a deopt statepoint"); + assert((UseDeoptBundles || isStatepoint(CS)) && + "expected to already be a deopt statepoint"); } #endif @@ -2098,50 +2268,45 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // the top of the successor blocks. See the comment on // normalizeForInvokeSafepoint on exactly what is needed. Note that this step // may restructure the CFG. - for (CallSite CS : toUpdate) { + for (CallSite CS : ToUpdate) { if (!CS.isInvoke()) continue; - InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction()); - normalizeForInvokeSafepoint(invoke->getNormalDest(), invoke->getParent(), - DT); - normalizeForInvokeSafepoint(invoke->getUnwindDest(), invoke->getParent(), - DT); + auto *II = cast<InvokeInst>(CS.getInstruction()); + normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT); + normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT); } // A list of dummy calls added to the IR to keep various values obviously // live in the IR. We'll remove all of these when done. - SmallVector<CallInst *, 64> holders; + SmallVector<CallInst *, 64> Holders; // Insert a dummy call with all of the arguments to the vm_state we'll need // for the actual safepoint insertion. This ensures reference arguments in // the deopt argument list are considered live through the safepoint (and // thus makes sure they get relocated.)
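The holder trick just described, isolated (sketch; M, Values, and InsertBefore are assumed to be in scope): a declared-but-never-defined vararg callee gives every value an artificial use that liveness must respect, and the dummy call is trivially erased afterwards.

    Function *TmpUse = cast<Function>(M->getOrInsertFunction(
        "__tmp_use",
        FunctionType::get(Type::getVoidTy(M->getContext()), /*isVarArg=*/true)));
    CallInst *Holder = CallInst::Create(TmpUse, Values, "", InsertBefore);
    // ... recompute liveness while the pin is in place ...
    Holder->eraseFromParent(); // drop the pin once the records are built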
- for (size_t i = 0; i < toUpdate.size(); i++) { - CallSite &CS = toUpdate[i]; - Statepoint StatepointCS(CS); - + for (CallSite CS : ToUpdate) { SmallVector<Value *, 64> DeoptValues; - for (Use &U : StatepointCS.vm_state_args()) { - Value *Arg = cast<Value>(&U); + + iterator_range<const Use *> DeoptStateRange = + UseDeoptBundles + ? iterator_range<const Use *>(GetDeoptBundleOperands(CS)) + : iterator_range<const Use *>(Statepoint(CS).vm_state_args()); + + for (Value *Arg : DeoptStateRange) { assert(!isUnhandledGCPointerType(Arg->getType()) && "support for FCA unimplemented"); if (isHandledGCPointerType(Arg->getType())) DeoptValues.push_back(Arg); } - insertUseHolderAfter(CS, DeoptValues, holders); - } - SmallVector<struct PartiallyConstructedSafepointRecord, 64> records; - records.reserve(toUpdate.size()); - for (size_t i = 0; i < toUpdate.size(); i++) { - struct PartiallyConstructedSafepointRecord info; - records.push_back(info); + insertUseHolderAfter(CS, DeoptValues, Holders); } - assert(records.size() == toUpdate.size()); - // A) Identify all gc pointers which are staticly live at the given call + SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size()); + + // A) Identify all gc pointers which are statically live at the given call // site. - findLiveReferences(F, DT, P, toUpdate, records); + findLiveReferences(F, DT, ToUpdate, Records); // B) Find the base pointers for each live pointer /* scope for caching */ { @@ -2150,10 +2315,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // large numbers of duplicate base_phis. DefiningValueMapTy DVCache; - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - findBasePointers(DT, DVCache, CS, info); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &info = Records[i]; + findBasePointers(DT, DVCache, ToUpdate[i], info); } } // end of cache scope @@ -2170,63 +2334,79 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // the base pointers which were identified for that safepoint. We'll then // ask liveness for _every_ base inserted to see what is now live. Then we // remove the dummy calls. - holders.reserve(holders.size() + records.size()); - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; + Holders.reserve(Holders.size() + Records.size()); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; SmallVector<Value *, 128> Bases; - for (auto Pair : info.PointerToBase) { + for (auto Pair : Info.PointerToBase) Bases.push_back(Pair.second); - } - insertUseHolderAfter(CS, Bases, holders); + + insertUseHolderAfter(ToUpdate[i], Bases, Holders); } // By selecting base pointers, we've effectively inserted new uses. Thus, we // need to rerun liveness. We may *also* have inserted new defs, but that's // not the key issue. 
- recomputeLiveInValues(F, DT, P, toUpdate, records); + recomputeLiveInValues(F, DT, ToUpdate, Records); if (PrintBasePointers) { - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; + for (auto &Info : Records) { errs() << "Base Pairs: (w/Relocation)\n"; - for (auto Pair : info.PointerToBase) { - errs() << " derived %" << Pair.first->getName() << " base %" - << Pair.second->getName() << "\n"; + for (auto Pair : Info.PointerToBase) { + errs() << " derived "; + Pair.first->printAsOperand(errs(), false); + errs() << " base "; + Pair.second->printAsOperand(errs(), false); + errs() << "\n"; } } } - for (size_t i = 0; i < holders.size(); i++) { - holders[i]->eraseFromParent(); - holders[i] = nullptr; - } - holders.clear(); + + // It is possible that non-constant live variables have a constant base. For + // example, a GEP with a variable offset from a global. In this case we can + // remove it from the liveset. We already don't add constants to the liveset + // because we assume they won't move at runtime and the GC doesn't need to be + // informed about them. The same reasoning applies if the base is constant. + // Note that the relocation placement code relies on this filtering for + // correctness as it expects the base to be in the liveset, which isn't true + // if the base is constant. + for (auto &Info : Records) + for (auto &BasePair : Info.PointerToBase) + if (isa<Constant>(BasePair.second)) + Info.LiveSet.erase(BasePair.first); + + for (CallInst *CI : Holders) + CI->eraseFromParent(); + + Holders.clear(); // Do a limited scalarization of any live at safepoint vector values which // contain pointers. This enables this pass to run after vectorization at - // the cost of some possible performance loss. TODO: it would be nice to - // natively support vectors all the way through the backend so we don't need - // to scalarize here. - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - Instruction *statepoint = toUpdate[i].getInstruction(); - splitVectorValues(cast<Instruction>(statepoint), info.liveset, - info.PointerToBase, DT); - } + // the cost of some possible performance loss. Note: This is known to not + // handle updating of the side tables correctly which can lead to relocation + // bugs when the same vector is live at multiple statepoints. We're in the + // process of implementing the alternate lowering - relocating the + // vector-of-pointers as first class item and updating the backend to + // understand that - but that's not yet complete. + if (UseVectorSplit) + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; + Instruction *Statepoint = ToUpdate[i].getInstruction(); + splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet, + Info.PointerToBase, DT); + } // In order to reduce live set of statepoint we might choose to rematerialize - // some values instead of relocating them. This is purelly an optimization and + // some values instead of relocating them. This is purely an optimization and // does not influence correctness. 
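Returning to the constant-base filtering a few hunks up, a concrete instance (hypothetical IR) may help:

    // Hypothetical IR, statepoint-example GC:
    //   %p = getelementptr i8, i8 addrspace(1)* @G, i64 %i
    //
    // %p varies at runtime, but its base @G is a Constant: it never moves, is
    // never relocated, and so never appears in the statepoint's gc section.
    // Hence the (derived, base) pair is erased from the live set before the
    // relocation placement code runs.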
- TargetTransformInfo &TTI = - P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; + for (size_t i = 0; i < Records.size(); i++) + rematerializeLiveValues(ToUpdate[i], Records[i], TTI); - rematerializeLiveValues(CS, info, TTI); - } + // We need this to safely RAUW and delete call or invoke return values that + // may themselves be live over a statepoint. For details, please see usage in + // makeStatepointExplicitImpl. + std::vector<DeferredReplacement> Replacements; // Now run through and replace the existing statepoints with new ones with // the live variables listed. We do not yet update uses of the values being @@ -2234,61 +2414,78 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // survive to the last iteration of this loop. (By construction, the // previous statepoint can not be a live variable, thus we can and do remove // the old statepoint calls as we go.) - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - makeStatepointExplicit(DT, CS, P, info); + for (size_t i = 0; i < Records.size(); i++) + makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements); + + ToUpdate.clear(); // prevent accidental use of invalid CallSites + + for (auto &PR : Replacements) + PR.doReplacement(); + + Replacements.clear(); + + for (auto &Info : Records) { + // These live sets may contain stale Value pointers, since we replaced calls + // with operand bundles with calls wrapped in gc.statepoint, and some of + // those calls may have been def'ing live gc pointers. Clear these out to + // avoid accidentally using them. + // + // TODO: We should create a separate data structure that does not contain + // these live sets, and migrate to using that data structure from this point + // onward. + Info.LiveSet.clear(); + Info.PointerToBase.clear(); } - toUpdate.clear(); // prevent accident use of invalid CallSites // Do all the fixups of the original live variables to their relocated selves - SmallVector<Value *, 128> live; - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; + SmallVector<Value *, 128> Live; + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; + // We can't simply save the live set from the original insertion. One of // the live values might be the result of a call which needs a safepoint. // That Value* no longer exists and we need to use the new gc_result. - // Thankfully, the liveset is embedded in the statepoint (and updated), so + // Thankfully, the live set is embedded in the statepoint (and updated), so // we just grab that. - Statepoint statepoint(info.StatepointToken); - live.insert(live.end(), statepoint.gc_args_begin(), - statepoint.gc_args_end()); + Statepoint Statepoint(Info.StatepointToken); + Live.insert(Live.end(), Statepoint.gc_args_begin(), + Statepoint.gc_args_end()); #ifndef NDEBUG // Do some basic sanity checks on our liveness results before performing // relocation. Relocation can and will turn mistakes in liveness results // into non-sensical code which is much harder to debug.
// TODO: It would be nice to test consistency as well - assert(DT.isReachableFromEntry(info.StatepointToken->getParent()) && + assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) && "statepoint must be reachable or liveness is meaningless"); - for (Value *V : statepoint.gc_args()) { + for (Value *V : Statepoint.gc_args()) { if (!isa<Instruction>(V)) // Non-instruction values trivially dominate all possible uses continue; - auto LiveInst = cast<Instruction>(V); + auto *LiveInst = cast<Instruction>(V); assert(DT.isReachableFromEntry(LiveInst->getParent()) && "unreachable values should never be live"); - assert(DT.dominates(LiveInst, info.StatepointToken) && + assert(DT.dominates(LiveInst, Info.StatepointToken) && "basic SSA liveness expectation violated by liveness analysis"); } #endif } - unique_unsorted(live); + unique_unsorted(Live); #ifndef NDEBUG // sanity check - for (auto ptr : live) { - assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type"); - } + for (auto *Ptr : Live) + assert(isHandledGCPointerType(Ptr->getType()) && + "must be a gc pointer type"); #endif - relocationViaAlloca(F, DT, live, records); - return !records.empty(); + relocationViaAlloca(F, DT, Live, Records); + return !Records.empty(); } // Handles both return values and arguments for Functions and CallSites. template <typename AttrHolder> -static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, - unsigned Index) { +static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, + unsigned Index) { AttrBuilder R; if (AH.getDereferenceableBytes(Index)) R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, @@ -2296,6 +2493,8 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, AH.getDereferenceableOrNullBytes(Index))); + if (AH.doesNotAlias(Index)) + R.addAttribute(Attribute::NoAlias); if (!R.empty()) AH.setAttributes(AH.getAttributes().removeAttributes( @@ -2303,25 +2502,25 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, } void -RewriteStatepointsForGC::stripDereferenceabilityInfoFromPrototype(Function &F) { +RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { LLVMContext &Ctx = F.getContext(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) - RemoveDerefAttrAtIndex(Ctx, F, A.getArgNo() + 1); + RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1); if (isa<PointerType>(F.getReturnType())) - RemoveDerefAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); } -void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2344,9 +2543,9 @@ void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { if (CallSite CS = CallSite(&I)) { for (int i = 0, e = CS.arg_size(); i != e; i++) if (isa<PointerType>(CS.getArgument(i)->getType())) - RemoveDerefAttrAtIndex(Ctx, CS, i + 1); + RemoveNonValidAttrAtIndex(Ctx, CS, i + 1); if (isa<PointerType>(CS.getType())) - RemoveDerefAttrAtIndex(Ctx, CS,
AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex); } } } @@ -2356,7 +2555,7 @@ void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) { static bool shouldRewriteStatepointsIn(Function &F) { // TODO: This should check the GCStrategy if (F.hasGC()) { - const char *FunctionGCName = F.getGC(); + const auto &FunctionGCName = F.getGC(); const StringRef StatepointExampleName("statepoint-example"); const StringRef CoreCLRName("coreclr"); return (StatepointExampleName == FunctionGCName) || @@ -2365,17 +2564,17 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripDereferenceabilityInfo(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { #ifndef NDEBUG assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) && "precondition!"); #endif for (Function &F : M) - stripDereferenceabilityInfoFromPrototype(F); + stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripDereferenceabilityInfoFromBody(F); + stripNonValidAttributesFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { @@ -2389,15 +2588,27 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + auto NeedsRewrite = [](Instruction &I) { + if (UseDeoptBundles) { + if (ImmutableCallSite CS = ImmutableCallSite(&I)) + return !callsGCLeafFunction(CS); + return false; + } + + return isStatepoint(I); + }; // Gather all the statepoints which need rewritten. Be careful to only // consider those in reachable code since we need to ask dominance queries // when rewriting. We'll delete the unreachable ones in a moment. SmallVector<CallSite, 64> ParsePointNeeded; bool HasUnreachableStatepoint = false; - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { // TODO: only the ones with the flag set! - if (isStatepoint(I)) { + if (NeedsRewrite(I)) { if (DT.isReachableFromEntry(I.getParent())) ParsePointNeeded.push_back(CallSite(&I)); else @@ -2428,7 +2639,38 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { FoldSingleEntryPHINodes(&BB); } - MadeChange |= insertParsePoints(F, DT, this, ParsePointNeeded); + // Before we start introducing relocations, we want to tweak the IR a bit to + // avoid unfortunate code generation effects. The main example is that we + // want to try to make sure the comparison feeding a branch is after any + // safepoints. Otherwise, we end up with a comparison of pre-relocation + // values feeding a branch after relocation. This is semantically correct, + // but results in extra register pressure since both the pre-relocation and + // post-relocation copies must be available in registers. For code without + // relocations this is handled elsewhere, but teaching the scheduler to + // reverse the transform we're about to do would be slightly complex. + // Note: This may extend the live range of the inputs to the icmp and thus + // increase the liveset of any statepoint we move over. This is profitable + // as long as all statepoints are in rare blocks. If we had in-register + // lowering for live values this would be a much safer transform. 
+ auto getConditionInst = [](TerminatorInst *TI) -> Instruction* { + if (auto *BI = dyn_cast<BranchInst>(TI)) + if (BI->isConditional()) + return dyn_cast<Instruction>(BI->getCondition()); + // TODO: Extend this to handle switches + return nullptr; + }; + for (BasicBlock &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (auto *Cond = getConditionInst(TI)) + // TODO: Handle more than just ICmps here. We should be able to move + // most instructions without side effects or memory access. + if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) { + MadeChange = true; + Cond->moveBefore(TI); + } + } + + MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded); return MadeChange; } @@ -2461,7 +2703,7 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin, "support for FCA unimplemented"); if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) { // The choice to exclude all things constant here is slightly subtle. - // There are two idependent reasons: + // There are two independent reasons: // - We assume that things which are constant (from LLVM's definition) // do not move at runtime. For example, the address of a global // variable is fixed, even though it's contents may not be. @@ -2599,7 +2841,7 @@ static void computeLiveInValues(DominatorTree &DT, Function &F, } // while( !worklist.empty() ) #ifndef NDEBUG - // Sanity check our ouput against SSA properties. This helps catch any + // Sanity check our output against SSA properties. This helps catch any // missing kills during the above iteration. for (BasicBlock &BB : F) { checkBasicSSA(DT, Data, BB); @@ -2620,7 +2862,7 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data, // call result is not live (normal), nor are it's arguments // (unless they're used again later). This adjustment is // specifically what we need to relocate - BasicBlock::reverse_iterator rend(Inst); + BasicBlock::reverse_iterator rend(Inst->getIterator()); computeLiveInValues(BB->rbegin(), rend, LiveOut); LiveOut.erase(Inst); Out.insert(LiveOut.begin(), LiveOut.end()); @@ -2669,5 +2911,5 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, assert(Updated.count(KVPair.first) && "record for non-live value"); #endif - Info.liveset = Updated; + Info.LiveSet = Updated; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 4d3a708..8569e08 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" @@ -479,6 +480,13 @@ private: void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } + void visitFuncletPadInst(FuncletPadInst &FPI) { + markAnythingOverdefined(&FPI); + } + void visitCatchSwitchInst(CatchSwitchInst &CPI) { + markAnythingOverdefined(&CPI); + visitTerminatorInst(CPI); + } // Instructions that cannot be folded away. void visitStoreInst (StoreInst &I); @@ -539,9 +547,9 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, return; } - if (isa<InvokeInst>(TI)) { - // Invoke instructions successors are always executable. 
- Succs[0] = Succs[1] = true; + // Unwinding instructions successors are always executable. + if (TI.isExceptional()) { + Succs.assign(TI.getNumSuccessors(), true); return; } @@ -605,8 +613,8 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { return BI->getSuccessor(CI->isZero()) == To; } - // Invoke instructions successors are always executable. - if (isa<InvokeInst>(TI)) + // Unwinding instructions successors are always executable. + if (TI->isExceptional()) return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { @@ -630,7 +638,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif - llvm_unreachable(nullptr); + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } // visit Implementations - Something changed in this instruction, either an @@ -749,9 +757,14 @@ void SCCPSolver::visitCastInst(CastInst &I) { LatticeVal OpSt = getValueState(I.getOperand(0)); if (OpSt.isOverdefined()) // Inherit overdefinedness of operand markOverdefined(&I); - else if (OpSt.isConstant()) // Propagate constant value - markConstant(&I, ConstantExpr::getCast(I.getOpcode(), - OpSt.getConstant(), I.getType())); + else if (OpSt.isConstant()) { + Constant *C = + ConstantExpr::getCast(I.getOpcode(), OpSt.getConstant(), I.getType()); + if (isa<UndefValue>(C)) + return; + // Propagate constant value + markConstant(&I, C); + } } @@ -851,10 +864,14 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; - if (V1State.isConstant() && V2State.isConstant()) - return markConstant(IV, &I, - ConstantExpr::get(I.getOpcode(), V1State.getConstant(), - V2State.getConstant())); + if (V1State.isConstant() && V2State.isConstant()) { + Constant *C = ConstantExpr::get(I.getOpcode(), V1State.getConstant(), + V2State.getConstant()); + // X op Y -> undef. + if (isa<UndefValue>(C)) + return; + return markConstant(IV, &I, C); + } // If something is undef, wait for it to resolve. if (!V1State.isOverdefined() && !V2State.isOverdefined()) @@ -909,10 +926,13 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; - if (V1State.isConstant() && V2State.isConstant()) - return markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(), - V1State.getConstant(), - V2State.getConstant())); + if (V1State.isConstant() && V2State.isConstant()) { + Constant *C = ConstantExpr::getCompare( + I.getPredicate(), V1State.getConstant(), V2State.getConstant()); + if (isa<UndefValue>(C)) + return; + return markConstant(IV, &I, C); + } // If operands are still undefined, wait for it to resolve. if (!V1State.isOverdefined() && !V2State.isOverdefined()) @@ -1012,8 +1032,11 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { Constant *Ptr = Operands[0]; auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end()); - markConstant(&I, ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, - Indices)); + Constant *C = + ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices); + if (isa<UndefValue>(C)) + return; + markConstant(&I, C); } void SCCPSolver::visitStoreInst(StoreInst &SI) { @@ -1053,9 +1076,9 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { Constant *Ptr = PtrVal.getConstant(); - // load null -> null + // load null is undefined. 
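Each hunk above adds the same guard. In isolated form (a sketch using the solver's helpers; LHS and RHS stand for the folded operand constants): when folding yields undef, leave the lattice cell untouched so ResolvedUndefsIn can force a concrete value later, rather than pinning the cell to an undef "constant".

    Constant *C = ConstantExpr::get(I.getOpcode(), LHS, RHS);
    if (isa<UndefValue>(C))
      return;            // stay 'undefined'; ResolvedUndefsIn decides later
    markConstant(&I, C); // otherwise propagate the folded constant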
if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0) - return markConstant(IV, &I, UndefValue::get(I.getType())); + return; // Transform load (constant global) into the value loaded. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { @@ -1071,8 +1094,11 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { } // Transform load from a constant into a constant if possible. - if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) { + if (isa<UndefValue>(C)) + return; return markConstant(IV, &I, C); + } // Otherwise we cannot say for certain what value this load will produce. // Bail out. @@ -1114,8 +1140,12 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(F, Operands, TLI)) + if (Constant *C = ConstantFoldCall(F, Operands, TLI)) { + // call -> undef. + if (isa<UndefValue>(C)) + return; return markConstant(I, C); + } } // Otherwise, we don't know anything about this call, mark it overdefined. @@ -1126,7 +1156,7 @@ CallOverdefined: // entry block executable and merge in the actual arguments to the call into // the formal arguments of the function. if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){ - MarkBlockExecutable(F->begin()); + MarkBlockExecutable(&F->front()); // Propagate information from this call site into the callee. CallSite::arg_iterator CAI = CS.arg_begin(); @@ -1135,17 +1165,17 @@ CallOverdefined: // If this argument is byval, and if the function is not readonly, there // will be an implicit copy formed of the input aggregate. if (AI->hasByValAttr() && !F->onlyReadsMemory()) { - markOverdefined(AI); + markOverdefined(&*AI); continue; } if (StructType *STy = dyn_cast<StructType>(AI->getType())) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal CallArg = getStructValueState(*CAI, i); - mergeInValue(getStructValueState(AI, i), AI, CallArg); + mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg); } } else { - mergeInValue(AI, getValueState(*CAI)); + mergeInValue(&*AI, getValueState(*CAI)); } } } @@ -1246,18 +1276,18 @@ void SCCPSolver::Solve() { /// even if X isn't defined. bool SCCPSolver::ResolvedUndefsIn(Function &F) { for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!BBExecutable.count(BB)) + if (!BBExecutable.count(&*BB)) continue; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Look for instructions which produce undef values. - if (I->getType()->isVoidTy()) continue; + if (I.getType()->isVoidTy()) continue; - if (StructType *STy = dyn_cast<StructType>(I->getType())) { + if (StructType *STy = dyn_cast<StructType>(I.getType())) { // Only a few things that can be structs matter for undef. // Tracked calls must never be marked overdefined in ResolvedUndefsIn. - if (CallSite CS = CallSite(I)) + if (CallSite CS = CallSite(&I)) if (Function *F = CS.getCalledFunction()) if (MRVFunctionsTracked.count(F)) continue; @@ -1270,14 +1300,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Send the results of everything else to overdefined. We could be // more precise than this but it isn't worth bothering. 
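// [Editorial sketch -- not part of this patch.] The mechanical `&*AI`,
// `&*BB`, `CallSite(&I)` and `&F->front()` changes throughout these hunks
// appear to share one cause: LLVM's intrusive-list iterators stopped
// converting implicitly to the element pointer around this import, so code
// that passed an iterator where a `T*` was expected must now dereference and
// take the address explicitly. The same idiom in plain C++:
#include <list>

static void take(int *P) { (void)P; }

int main() {
  std::list<int> L = {1, 2, 3};
  for (std::list<int>::iterator I = L.begin(); I != L.end(); ++I)
    take(&*I); // not `take(I)`: the iterator is no longer a pointer
}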
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); + LatticeVal &LV = getStructValueState(&I, i); if (LV.isUndefined()) - markOverdefined(LV, I); + markOverdefined(LV, &I); } continue; } - LatticeVal &LV = getValueState(I); + LatticeVal &LV = getValueState(&I); if (!LV.isUndefined()) continue; // extractvalue is safe; check here because the argument is a struct. @@ -1287,24 +1317,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Compute the operand LatticeVals, for convenience below. // Anything taking a struct is conservatively assumed to require // overdefined markings. - if (I->getOperand(0)->getType()->isStructTy()) { - markOverdefined(I); + if (I.getOperand(0)->getType()->isStructTy()) { + markOverdefined(&I); return true; } - LatticeVal Op0LV = getValueState(I->getOperand(0)); + LatticeVal Op0LV = getValueState(I.getOperand(0)); LatticeVal Op1LV; - if (I->getNumOperands() == 2) { - if (I->getOperand(1)->getType()->isStructTy()) { - markOverdefined(I); + if (I.getNumOperands() == 2) { + if (I.getOperand(1)->getType()->isStructTy()) { + markOverdefined(&I); return true; } - Op1LV = getValueState(I->getOperand(1)); + Op1LV = getValueState(I.getOperand(1)); } // If this is an instructions whose result is defined even if the input is // not fully defined, propagate the information. - Type *ITy = I->getType(); - switch (I->getOpcode()) { + Type *ITy = I.getType(); + switch (I.getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Trunc: @@ -1318,9 +1348,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::FRem: // Floating-point binary operation: be conservative. if (Op0LV.isUndefined() && Op1LV.isUndefined()) - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); else - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::ZExt: case Instruction::SExt: @@ -1332,7 +1362,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::SIToFP: case Instruction::UIToFP: // undef -> 0; some outputs are impossible - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: case Instruction::And: @@ -1341,7 +1371,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Or: @@ -1349,7 +1379,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (Op0LV.isUndefined() && Op1LV.isUndefined()) break; // undef | X -> -1. X could be -1. - markForcedConstant(I, Constant::getAllOnesValue(ITy)); + markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; case Instruction::Xor: @@ -1357,7 +1387,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // necessary, but we try to be nice to people who expect this // behavior in simple cases if (Op0LV.isUndefined() && Op1LV.isUndefined()) { - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; } // undef ^ X -> undef @@ -1371,9 +1401,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // X % undef -> undef. No change. if (Op1LV.isUndefined()) break; + // X / 0 -> undef. No change. + // X % 0 -> undef. No change. 
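// [Editorial sketch -- not part of this patch.] The new early `break` just
// below handles a divisor that is a *known zero*: the division result is
// undef, so the cell is left unresolved rather than forced. Only when the
// divisor is a real value (and not itself undefined) is an undefined
// dividend plugged with 0, since 0 / X == 0 and 0 % X == 0 for any nonzero
// X. A toy decision table, with invented names:
enum class Action { LeaveAlone, ForceZero };

Action resolveUndefDiv(bool DivisorUndefined, bool DivisorKnownZero) {
  if (DivisorUndefined || DivisorKnownZero)
    return Action::LeaveAlone; // X / undef and X / 0 are both undef.
  return Action::ForceZero;    // undef / X: picking 0 for undef is valid.
}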
+ if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue()) + break; + // undef / X -> 0. X could be maxint. // undef % X -> 0. X could be 1. - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::AShr: @@ -1381,7 +1416,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (Op1LV.isUndefined()) break; // undef >>a X -> all ones - markForcedConstant(I, Constant::getAllOnesValue(ITy)); + markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: @@ -1391,17 +1426,17 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef << X -> 0 // undef >> X -> 0 - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Select: - Op1LV = getValueState(I->getOperand(1)); + Op1LV = getValueState(I.getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); } else if (Op1LV.isUndefined()) { // c ? undef : undef -> undef. No change. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); if (Op1LV.isUndefined()) break; // Otherwise, c ? undef : x -> x. @@ -1410,9 +1445,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } if (Op1LV.isConstant()) - markForcedConstant(I, Op1LV.getConstant()); + markForcedConstant(&I, Op1LV.getConstant()); else - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Load: // A load here means one of two things: a load of undef from a global, @@ -1421,9 +1456,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { break; case Instruction::ICmp: // X == undef -> undef. Other comparisons get more complicated. - if (cast<ICmpInst>(I)->isEquality()) + if (cast<ICmpInst>(&I)->isEquality()) break; - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Call: case Instruction::Invoke: { @@ -1432,19 +1467,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // 2. It could be constant-foldable. // Because of the way we solve return values, tracked calls must // never be marked overdefined in ResolvedUndefsIn. - if (Function *F = CallSite(I).getCalledFunction()) + if (Function *F = CallSite(&I).getCalledFunction()) if (TrackedRetVals.count(F)) break; // If the call is constant-foldable, we mark it overdefined because // we do not know what return values are valid. - markOverdefined(I); + markOverdefined(&I); return true; } default: // If we don't know what should happen here, conservatively mark it // overdefined. - markOverdefined(I); + markOverdefined(&I); return true; } } @@ -1462,7 +1497,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // false. if (isa<UndefValue>(BI->getCondition())) { BI->setCondition(ConstantInt::getFalse(BI->getContext())); - markEdgeExecutable(BB, TI->getSuccessor(1)); + markEdgeExecutable(&*BB, TI->getSuccessor(1)); return true; } @@ -1484,7 +1519,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // the first constant. 
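// [Editorial sketch -- not part of this patch.] In the branch case above and
// the switch case below, a condition that is still undefined after the solve
// converges blocks all progress: no successor edge is feasible. The code
// breaks the deadlock by pinning the condition to an arbitrary but fixed
// concrete value (false for branches, the first case for switches), marking
// that one edge executable, and re-running the solver. Toy model, invented
// names; successor index 1 is the false edge, as in BranchInst:
#include <vector>

struct ToyBranch {
  bool ConditionKnown = false;
  bool Condition = false;
};

// Returns true if anything changed and the solver should re-run.
// EdgeExecutable must have one entry per successor (at least two here).
bool resolveUndefBranch(ToyBranch &B, std::vector<bool> &EdgeExecutable) {
  if (B.ConditionKnown)
    return false;
  B.ConditionKnown = true;
  B.Condition = false;      // Arbitrary but fixed choice, like the patch.
  EdgeExecutable[1] = true; // Only the false edge becomes executable.
  return true;
}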
if (isa<UndefValue>(SI->getCondition())) { SI->setCondition(SI->case_begin().getCaseValue()); - markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); + markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor()); return true; } @@ -1506,6 +1541,7 @@ namespace { struct SCCP : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } static char ID; // Pass identification, replacement for typeid SCCP() : FunctionPass(ID) { @@ -1541,11 +1577,10 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. while (EndInst != BB->begin()) { // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; + Instruction *Inst = &*--EndInst->getIterator(); if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { + if (Inst->isEHPad()) { EndInst = Inst; continue; } @@ -1568,11 +1603,11 @@ bool SCCP::runOnFunction(Function &F) { SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. - Solver.MarkBlockExecutable(F.begin()); + Solver.MarkBlockExecutable(&F.front()); // Mark all arguments to the function as being overdefined. - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F.args()) + Solver.markAnythingOverdefined(&AI); // Solve for constants. bool ResolvedUndefs = true; @@ -1589,8 +1624,8 @@ bool SCCP::runOnFunction(Function &F) { // as we cannot modify the CFG of the function. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; continue; } @@ -1599,7 +1634,7 @@ bool SCCP::runOnFunction(Function &F) { // constants if we have found them to be of constant values. // for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) continue; @@ -1713,36 +1748,34 @@ bool IPSCCP::runOnModule(Module &M) { // If this is a strong or ODR definition of this function, then we can // propagate information about its result into callsites of it. if (!F->mayBeOverridden()) - Solver.AddTrackedFunction(F); + Solver.AddTrackedFunction(&*F); // If this function only has direct calls that we can see, we can track its // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. if (F->hasLocalLinkage()) { - if (AddressIsTaken(F)) - AddressTakenFunctions.insert(F); + if (AddressIsTaken(&*F)) + AddressTakenFunctions.insert(&*F); else { - Solver.AddArgumentTrackedFunction(F); + Solver.AddArgumentTrackedFunction(&*F); continue; } } // Assume the function is called. - Solver.MarkBlockExecutable(F->begin()); + Solver.MarkBlockExecutable(&F->front()); // Assume nothing about the incoming arguments. - for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); - AI != E; ++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F->args()) + Solver.markAnythingOverdefined(&AI); } // Loop over global variables. We inform the solver about any internal global // variables that do not have their 'addresses taken'. 
If they don't have // their addresses taken, we can propagate constants through them. - for (Module::global_iterator G = M.global_begin(), E = M.global_end(); - G != E; ++G) - if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G)) - Solver.TrackValueOfGlobalVariable(G); + for (GlobalVariable &G : M.globals()) + if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G)) + Solver.TrackValueOfGlobalVariable(&G); // Solve for constants. bool ResolvedUndefs = true; @@ -1763,7 +1796,10 @@ bool IPSCCP::runOnModule(Module &M) { SmallVector<BasicBlock*, 512> BlocksToErase; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (Solver.isBlockExecutable(F->begin())) { + if (F->isDeclaration()) + continue; + + if (Solver.isBlockExecutable(&F->front())) { for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { if (AI->use_empty() || AI->getType()->isStructTy()) continue; @@ -1771,7 +1807,7 @@ bool IPSCCP::runOnModule(Module &M) { // TODO: Could use getStructLatticeValueFor to find out if the entire // result is a constant and replace it entirely if so. - LatticeVal IV = Solver.getLatticeValueFor(AI); + LatticeVal IV = Solver.getLatticeValueFor(&*AI); if (IV.isOverdefined()) continue; Constant *CST = IV.isConstant() ? @@ -1786,28 +1822,27 @@ bool IPSCCP::runOnModule(Module &M) { } for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = TI->getSuccessor(i); + for (BasicBlock *Succ : TI->successors()) { if (!Succ->empty() && isa<PHINode>(Succ->begin())) - TI->getSuccessor(i)->removePredecessor(BB); + Succ->removePredecessor(&*BB); } if (!TI->use_empty()) TI->replaceAllUsesWith(UndefValue::get(TI->getType())); TI->eraseFromParent(); - new UnreachableInst(M.getContext(), BB); + new UnreachableInst(M.getContext(), &*BB); if (&*BB != &F->front()) - BlocksToErase.push_back(BB); + BlocksToErase.push_back(&*BB); continue; } for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy()) continue; diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 947513a..a7361b5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -23,12 +23,12 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SROA.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -37,8 +37,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -53,9 +51,9 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/TimeValue.h" #include 
"llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #if __cplusplus >= 201103L && !defined(NDEBUG) // We only use this for a debug check in C++11 @@ -63,6 +61,7 @@ #endif using namespace llvm; +using namespace llvm::sroa; #define DEBUG_TYPE "sroa" @@ -77,11 +76,6 @@ STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); -/// Hidden option to force the pass to not use DomTree and mem2reg, instead -/// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), - cl::Hidden); - /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", @@ -205,7 +199,6 @@ template <typename T> struct isPodLike; template <> struct isPodLike<Slice> { static const bool value = true; }; } -namespace { /// \brief Representation of the alloca slices. /// /// This class represents the slices of an alloca which are formed by its @@ -213,7 +206,7 @@ namespace { /// for the slices used and we reflect that in this structure. The uses are /// stored, sorted by increasing beginning offset and with unsplittable slices /// starting at a particular offset before splittable slices. -class AllocaSlices { +class llvm::sroa::AllocaSlices { public: /// \brief Construct the slices of a particular alloca. AllocaSlices(const DataLayout &DL, AllocaInst &AI); @@ -253,281 +246,10 @@ public: std::inplace_merge(Slices.begin(), SliceI, Slices.end()); } - // Forward declare an iterator to befriend it. + // Forward declare the iterator and range accessor for walking the + // partitions. class partition_iterator; - - /// \brief A partition of the slices. - /// - /// An ephemeral representation for a range of slices which can be viewed as - /// a partition of the alloca. This range represents a span of the alloca's - /// memory which cannot be split, and provides access to all of the slices - /// overlapping some part of the partition. - /// - /// Objects of this type are produced by traversing the alloca's slices, but - /// are only ephemeral and not persistent. - class Partition { - private: - friend class AllocaSlices; - friend class AllocaSlices::partition_iterator; - - /// \brief The begining and ending offsets of the alloca for this partition. - uint64_t BeginOffset, EndOffset; - - /// \brief The start end end iterators of this partition. - iterator SI, SJ; - - /// \brief A collection of split slice tails overlapping the partition. - SmallVector<Slice *, 4> SplitTails; - - /// \brief Raw constructor builds an empty partition starting and ending at - /// the given iterator. - Partition(iterator SI) : SI(SI), SJ(SI) {} - - public: - /// \brief The start offset of this partition. - /// - /// All of the contained slices start at or after this offset. - uint64_t beginOffset() const { return BeginOffset; } - - /// \brief The end offset of this partition. - /// - /// All of the contained slices end at or before this offset. - uint64_t endOffset() const { return EndOffset; } - - /// \brief The size of the partition. - /// - /// Note that this can never be zero. 
- uint64_t size() const { - assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); - return EndOffset - BeginOffset; - } - - /// \brief Test whether this partition contains no slices, and merely spans - /// a region occupied by split slices. - bool empty() const { return SI == SJ; } - - /// \name Iterate slices that start within the partition. - /// These may be splittable or unsplittable. They have a begin offset >= the - /// partition begin offset. - /// @{ - // FIXME: We should probably define a "concat_iterator" helper and use that - // to stitch together pointee_iterators over the split tails and the - // contiguous iterators of the partition. That would give a much nicer - // interface here. We could then additionally expose filtered iterators for - // split, unsplit, and unsplittable splices based on the usage patterns. - iterator begin() const { return SI; } - iterator end() const { return SJ; } - /// @} - - /// \brief Get the sequence of split slice tails. - /// - /// These tails are of slices which start before this partition but are - /// split and overlap into the partition. We accumulate these while forming - /// partitions. - ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } - }; - - /// \brief An iterator over partitions of the alloca's slices. - /// - /// This iterator implements the core algorithm for partitioning the alloca's - /// slices. It is a forward iterator as we don't support backtracking for - /// efficiency reasons, and re-use a single storage area to maintain the - /// current set of split slices. - /// - /// It is templated on the slice iterator type to use so that it can operate - /// with either const or non-const slice iterators. - class partition_iterator - : public iterator_facade_base<partition_iterator, - std::forward_iterator_tag, Partition> { - friend class AllocaSlices; - - /// \brief Most of the state for walking the partitions is held in a class - /// with a nice interface for examining them. - Partition P; - - /// \brief We need to keep the end of the slices to know when to stop. - AllocaSlices::iterator SE; - - /// \brief We also need to keep track of the maximum split end offset seen. - /// FIXME: Do we really? - uint64_t MaxSplitSliceEndOffset; - - /// \brief Sets the partition to be empty at given iterator, and sets the - /// end iterator. - partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) - : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { - // If not already at the end, advance our state to form the initial - // partition. - if (SI != SE) - advance(); - } - - /// \brief Advance the iterator to the next partition. - /// - /// Requires that the iterator not be at the end of the slices. - void advance() { - assert((P.SI != SE || !P.SplitTails.empty()) && - "Cannot advance past the end of the slices!"); - - // Clear out any split uses which have ended. - if (!P.SplitTails.empty()) { - if (P.EndOffset >= MaxSplitSliceEndOffset) { - // If we've finished all splits, this is easy. - P.SplitTails.clear(); - MaxSplitSliceEndOffset = 0; - } else { - // Remove the uses which have ended in the prior partition. This - // cannot change the max split slice end because we just checked that - // the prior partition ended prior to that max. 
- P.SplitTails.erase( - std::remove_if( - P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), - P.SplitTails.end()); - assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() == MaxSplitSliceEndOffset; - }) && - "Could not find the current max split slice offset!"); - assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() <= MaxSplitSliceEndOffset; - }) && - "Max split slice end offset is not actually the max!"); - } - } - - // If P.SI is already at the end, then we've cleared the split tail and - // now have an end iterator. - if (P.SI == SE) { - assert(P.SplitTails.empty() && "Failed to clear the split slices!"); - return; - } - - // If we had a non-empty partition previously, set up the state for - // subsequent partitions. - if (P.SI != P.SJ) { - // Accumulate all the splittable slices which started in the old - // partition into the split list. - for (Slice &S : P) - if (S.isSplittable() && S.endOffset() > P.EndOffset) { - P.SplitTails.push_back(&S); - MaxSplitSliceEndOffset = - std::max(S.endOffset(), MaxSplitSliceEndOffset); - } - - // Start from the end of the previous partition. - P.SI = P.SJ; - - // If P.SI is now at the end, we at most have a tail of split slices. - if (P.SI == SE) { - P.BeginOffset = P.EndOffset; - P.EndOffset = MaxSplitSliceEndOffset; - return; - } - - // If the we have split slices and the next slice is after a gap and is - // not splittable immediately form an empty partition for the split - // slices up until the next slice begins. - if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && - !P.SI->isSplittable()) { - P.BeginOffset = P.EndOffset; - P.EndOffset = P.SI->beginOffset(); - return; - } - } - - // OK, we need to consume new slices. Set the end offset based on the - // current slice, and step SJ past it. The beginning offset of the - // parttion is the beginning offset of the next slice unless we have - // pre-existing split slices that are continuing, in which case we begin - // at the prior end offset. - P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; - P.EndOffset = P.SI->endOffset(); - ++P.SJ; - - // There are two strategies to form a partition based on whether the - // partition starts with an unsplittable slice or a splittable slice. - if (!P.SI->isSplittable()) { - // When we're forming an unsplittable region, it must always start at - // the first slice and will extend through its end. - assert(P.BeginOffset == P.SI->beginOffset()); - - // Form a partition including all of the overlapping slices with this - // unsplittable slice. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - if (!P.SJ->isSplittable()) - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // We have a partition across a set of overlapping unsplittable - // partitions. - return; - } - - // If we're starting with a splittable slice, then we need to form - // a synthetic partition spanning it and any other overlapping splittable - // splices. - assert(P.SI->isSplittable() && "Forming a splittable partition!"); - - // Collect all of the overlapping splittable slices. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && - P.SJ->isSplittable()) { - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // Back upiP.EndOffset if we ended the span early when encountering an - // unsplittable slice. 
This synthesizes the early end offset of - // a partition spanning only splittable slices. - if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - assert(!P.SJ->isSplittable()); - P.EndOffset = P.SJ->beginOffset(); - } - } - - public: - bool operator==(const partition_iterator &RHS) const { - assert(SE == RHS.SE && - "End iterators don't match between compared partition iterators!"); - - // The observed positions of partitions is marked by the P.SI iterator and - // the emptyness of the split slices. The latter is only relevant when - // P.SI == SE, as the end iterator will additionally have an empty split - // slices list, but the prior may have the same P.SI and a tail of split - // slices. - if (P.SI == RHS.P.SI && - P.SplitTails.empty() == RHS.P.SplitTails.empty()) { - assert(P.SJ == RHS.P.SJ && - "Same set of slices formed two different sized partitions!"); - assert(P.SplitTails.size() == RHS.P.SplitTails.size() && - "Same slice position with differently sized non-empty split " - "slice tails!"); - return true; - } - return false; - } - - partition_iterator &operator++() { - advance(); - return *this; - } - - Partition &operator*() { return P; } - }; - - /// \brief A forward range over the partitions of the alloca's slices. - /// - /// This accesses an iterator range over the partitions of the alloca's - /// slices. It computes these partitions on the fly based on the overlapping - /// offsets of the slices and the ability to split them. It will visit "empty" - /// partitions to cover regions of the alloca only accessed via split - /// slices. - iterator_range<partition_iterator> partitions() { - return make_range(partition_iterator(begin(), end()), - partition_iterator(end(), end())); - } + iterator_range<partition_iterator> partitions(); /// \brief Access the dead users for this alloca. ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } @@ -595,6 +317,280 @@ private: /// the alloca. SmallVector<Use *, 8> DeadOperands; }; + +/// \brief A partition of the slices. +/// +/// An ephemeral representation for a range of slices which can be viewed as +/// a partition of the alloca. This range represents a span of the alloca's +/// memory which cannot be split, and provides access to all of the slices +/// overlapping some part of the partition. +/// +/// Objects of this type are produced by traversing the alloca's slices, but +/// are only ephemeral and not persistent. +class llvm::sroa::Partition { +private: + friend class AllocaSlices; + friend class AllocaSlices::partition_iterator; + + typedef AllocaSlices::iterator iterator; + + /// \brief The beginning and ending offsets of the alloca for this + /// partition. + uint64_t BeginOffset, EndOffset; + + /// \brief The start end end iterators of this partition. + iterator SI, SJ; + + /// \brief A collection of split slice tails overlapping the partition. + SmallVector<Slice *, 4> SplitTails; + + /// \brief Raw constructor builds an empty partition starting and ending at + /// the given iterator. + Partition(iterator SI) : SI(SI), SJ(SI) {} + +public: + /// \brief The start offset of this partition. + /// + /// All of the contained slices start at or after this offset. + uint64_t beginOffset() const { return BeginOffset; } + + /// \brief The end offset of this partition. + /// + /// All of the contained slices end at or before this offset. + uint64_t endOffset() const { return EndOffset; } + + /// \brief The size of the partition. + /// + /// Note that this can never be zero. 
+ uint64_t size() const { + assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); + return EndOffset - BeginOffset; + } + + /// \brief Test whether this partition contains no slices, and merely spans + /// a region occupied by split slices. + bool empty() const { return SI == SJ; } + + /// \name Iterate slices that start within the partition. + /// These may be splittable or unsplittable. They have a begin offset >= the + /// partition begin offset. + /// @{ + // FIXME: We should probably define a "concat_iterator" helper and use that + // to stitch together pointee_iterators over the split tails and the + // contiguous iterators of the partition. That would give a much nicer + // interface here. We could then additionally expose filtered iterators for + // split, unsplit, and unsplittable splices based on the usage patterns. + iterator begin() const { return SI; } + iterator end() const { return SJ; } + /// @} + + /// \brief Get the sequence of split slice tails. + /// + /// These tails are of slices which start before this partition but are + /// split and overlap into the partition. We accumulate these while forming + /// partitions. + ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } +}; + +/// \brief An iterator over partitions of the alloca's slices. +/// +/// This iterator implements the core algorithm for partitioning the alloca's +/// slices. It is a forward iterator as we don't support backtracking for +/// efficiency reasons, and re-use a single storage area to maintain the +/// current set of split slices. +/// +/// It is templated on the slice iterator type to use so that it can operate +/// with either const or non-const slice iterators. +class AllocaSlices::partition_iterator + : public iterator_facade_base<partition_iterator, std::forward_iterator_tag, + Partition> { + friend class AllocaSlices; + + /// \brief Most of the state for walking the partitions is held in a class + /// with a nice interface for examining them. + Partition P; + + /// \brief We need to keep the end of the slices to know when to stop. + AllocaSlices::iterator SE; + + /// \brief We also need to keep track of the maximum split end offset seen. + /// FIXME: Do we really? + uint64_t MaxSplitSliceEndOffset; + + /// \brief Sets the partition to be empty at given iterator, and sets the + /// end iterator. + partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) + : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + // If not already at the end, advance our state to form the initial + // partition. + if (SI != SE) + advance(); + } + + /// \brief Advance the iterator to the next partition. + /// + /// Requires that the iterator not be at the end of the slices. + void advance() { + assert((P.SI != SE || !P.SplitTails.empty()) && + "Cannot advance past the end of the slices!"); + + // Clear out any split uses which have ended. + if (!P.SplitTails.empty()) { + if (P.EndOffset >= MaxSplitSliceEndOffset) { + // If we've finished all splits, this is easy. + P.SplitTails.clear(); + MaxSplitSliceEndOffset = 0; + } else { + // Remove the uses which have ended in the prior partition. This + // cannot change the max split slice end because we just checked that + // the prior partition ended prior to that max. 
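// [Editorial sketch -- not part of this patch.] The statement just below is
// the standard erase/remove idiom: std::remove_if compacts the elements to
// keep at the front and returns the new logical end, and erase() then drops
// the tail. Here it prunes split-slice tails whose end falls at or before
// the current partition's end offset. The same idiom on plain data:
#include <algorithm>
#include <cstdint>
#include <vector>

void pruneEnded(std::vector<uint64_t> &EndOffsets, uint64_t CurrentEnd) {
  EndOffsets.erase(std::remove_if(EndOffsets.begin(), EndOffsets.end(),
                                  [&](uint64_t E) { return E <= CurrentEnd; }),
                   EndOffsets.end());
}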
+ P.SplitTails.erase( + std::remove_if( + P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), + P.SplitTails.end()); + assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && + "Could not find the current max split slice offset!"); + assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && + "Max split slice end offset is not actually the max!"); + } + } + + // If P.SI is already at the end, then we've cleared the split tail and + // now have an end iterator. + if (P.SI == SE) { + assert(P.SplitTails.empty() && "Failed to clear the split slices!"); + return; + } + + // If we had a non-empty partition previously, set up the state for + // subsequent partitions. + if (P.SI != P.SJ) { + // Accumulate all the splittable slices which started in the old + // partition into the split list. + for (Slice &S : P) + if (S.isSplittable() && S.endOffset() > P.EndOffset) { + P.SplitTails.push_back(&S); + MaxSplitSliceEndOffset = + std::max(S.endOffset(), MaxSplitSliceEndOffset); + } + + // Start from the end of the previous partition. + P.SI = P.SJ; + + // If P.SI is now at the end, we at most have a tail of split slices. + if (P.SI == SE) { + P.BeginOffset = P.EndOffset; + P.EndOffset = MaxSplitSliceEndOffset; + return; + } + + // If the we have split slices and the next slice is after a gap and is + // not splittable immediately form an empty partition for the split + // slices up until the next slice begins. + if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && + !P.SI->isSplittable()) { + P.BeginOffset = P.EndOffset; + P.EndOffset = P.SI->beginOffset(); + return; + } + } + + // OK, we need to consume new slices. Set the end offset based on the + // current slice, and step SJ past it. The beginning offset of the + // partition is the beginning offset of the next slice unless we have + // pre-existing split slices that are continuing, in which case we begin + // at the prior end offset. + P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; + P.EndOffset = P.SI->endOffset(); + ++P.SJ; + + // There are two strategies to form a partition based on whether the + // partition starts with an unsplittable slice or a splittable slice. + if (!P.SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at + // the first slice and will extend through its end. + assert(P.BeginOffset == P.SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + if (!P.SJ->isSplittable()) + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // We have a partition across a set of overlapping unsplittable + // partitions. + return; + } + + // If we're starting with a splittable slice, then we need to form + // a synthetic partition spanning it and any other overlapping splittable + // splices. + assert(P.SI->isSplittable() && "Forming a splittable partition!"); + + // Collect all of the overlapping splittable slices. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && + P.SJ->isSplittable()) { + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // Back upiP.EndOffset if we ended the span early when encountering an + // unsplittable slice. 
This synthesizes the early end offset of + // a partition spanning only splittable slices. + if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + assert(!P.SJ->isSplittable()); + P.EndOffset = P.SJ->beginOffset(); + } + } + +public: + bool operator==(const partition_iterator &RHS) const { + assert(SE == RHS.SE && + "End iterators don't match between compared partition iterators!"); + + // The observed positions of partitions is marked by the P.SI iterator and + // the emptiness of the split slices. The latter is only relevant when + // P.SI == SE, as the end iterator will additionally have an empty split + // slices list, but the prior may have the same P.SI and a tail of split + // slices. + if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) { + assert(P.SJ == RHS.P.SJ && + "Same set of slices formed two different sized partitions!"); + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && + "Same slice position with differently sized non-empty split " + "slice tails!"); + return true; + } + return false; + } + + partition_iterator &operator++() { + advance(); + return *this; + } + + Partition &operator*() { return P; } +}; + +/// \brief A forward range over the partitions of the alloca's slices. +/// +/// This accesses an iterator range over the partitions of the alloca's +/// slices. It computes these partitions on the fly based on the overlapping +/// offsets of the slices and the ability to split them. It will visit "empty" +/// partitions to cover regions of the alloca only accessed via split +/// slices. +iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() { + return make_range(partition_iterator(begin(), end()), + partition_iterator(end(), end())); } static Value *foldSelectInst(SelectInst &SI) { @@ -1072,217 +1068,6 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -namespace { -/// \brief Implementation of LoadAndStorePromoter for promoting allocas. -/// -/// This subclass of LoadAndStorePromoter adds overrides to handle promoting -/// the loads and stores of an alloca instruction, as well as updating its -/// debug information. This is used when a domtree is unavailable and thus -/// mem2reg in its full form can't be used to handle promotion of allocas to -/// scalar values. -class AllocaPromoter : public LoadAndStorePromoter { - AllocaInst &AI; - DIBuilder &DIB; - - SmallVector<DbgDeclareInst *, 4> DDIs; - SmallVector<DbgValueInst *, 4> DVIs; - -public: - AllocaPromoter(ArrayRef<const Instruction *> Insts, - SSAUpdater &S, - AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - - void run(const SmallVectorImpl<Instruction *> &Insts) { - // Retain the debug information attached to the alloca for use when - // rewriting loads and stores. - if (auto *L = LocalAsMetadata::getIfExists(&AI)) { - if (auto *DINode = MetadataAsValue::getIfExists(AI.getContext(), L)) { - for (User *U : DINode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); - } - } - - LoadAndStorePromoter::run(Insts); - - // While we have the debug information, clear it off of the alloca. The - // caller takes care of deleting the alloca. 
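// [Editorial sketch -- not part of this patch.] A condensed model of the
// sweep that partition_iterator::advance, defined above, performs: slices
// are sorted by begin offset, and each partition greedily absorbs every
// later slice that starts before the partition's current end. The real code
// additionally distinguishes splittable from unsplittable slices and emits
// empty partitions for split tails; this model merges unconditionally:
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using Interval = std::pair<uint64_t, uint64_t>; // [begin, end), pre-sorted

std::vector<Interval> sweepPartitions(const std::vector<Interval> &Slices) {
  std::vector<Interval> Parts;
  size_t I = 0;
  while (I != Slices.size()) {
    uint64_t Begin = Slices[I].first, End = Slices[I].second;
    // Absorb every later slice that starts before the current end, growing
    // the partition to cover it.
    size_t J = I + 1;
    while (J != Slices.size() && Slices[J].first < End) {
      End = std::max(End, Slices[J].second);
      ++J;
    }
    Parts.push_back({Begin, End});
    I = J;
  }
  return Parts;
}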
- while (!DDIs.empty()) - DDIs.pop_back_val()->eraseFromParent(); - while (!DVIs.empty()) - DVIs.pop_back_val()->eraseFromParent(); - } - - bool - isInstInList(Instruction *I, - const SmallVectorImpl<Instruction *> &Insts) const override { - Value *Ptr; - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - Ptr = LI->getOperand(0); - else - Ptr = cast<StoreInst>(I)->getPointerOperand(); - - // Only used to detect cycles, which will be rare and quickly found as - // we're walking up a chain of defs rather than down through uses. - SmallPtrSet<Value *, 4> Visited; - - do { - if (Ptr == &AI) - return true; - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) - Ptr = BCI->getOperand(0); - else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) - Ptr = GEPI->getPointerOperand(); - else - return false; - - } while (Visited.insert(Ptr).second); - - return false; - } - - void updateDebugInfo(Instruction *Inst) const override { - for (DbgDeclareInst *DDI : DDIs) - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - for (DbgValueInst *DVI : DVIs) { - Value *Arg = nullptr; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // If an argument is zero extended then use argument directly. The ZExt - // may be zapped by an optimization pass in future. - if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(ZExt->getOperand(0)); - else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(SExt->getOperand(0)); - if (!Arg) - Arg = SI->getValueOperand(); - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Arg = LI->getPointerOperand(); - } else { - continue; - } - DIB.insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), - DVI->getExpression(), DVI->getDebugLoc(), - Inst); - } - } -}; -} // end anon namespace - -namespace { -/// \brief An optimization pass providing Scalar Replacement of Aggregates. -/// -/// This pass takes allocations which can be completely analyzed (that is, they -/// don't escape) and tries to turn them into scalar SSA values. There are -/// a few steps to this process. -/// -/// 1) It takes allocations of aggregates and analyzes the ways in which they -/// are used to try to split them into smaller allocations, ideally of -/// a single scalar data type. It will split up memcpy and memset accesses -/// as necessary and try to isolate individual scalar accesses. -/// 2) It will transform accesses into forms which are suitable for SSA value -/// promotion. This can be replacing a memset with a scalar store of an -/// integer value, or it can involve speculating operations on a PHI or -/// select to be a PHI or select of the results. -/// 3) Finally, this will try to detect a pattern of accesses which map cleanly -/// onto insert and extract operations on a vector value, and convert them to -/// this form. By doing so, it will enable promotion of vector aggregates to -/// SSA vector values. -class SROA : public FunctionPass { - const bool RequiresDomTree; - - LLVMContext *C; - DominatorTree *DT; - AssumptionCache *AC; - - /// \brief Worklist of alloca instructions to simplify. - /// - /// Each alloca in the function is added to this. Each new alloca formed gets - /// added to it as well to recursively simplify unless that alloca can be - /// directly promoted. 
Finally, each time we rewrite a use of an alloca other - /// the one being actively rewritten, we add it back onto the list if not - /// already present to ensure it is re-visited. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist; - - /// \brief A collection of instructions to delete. - /// We try to batch deletions to simplify code and make things a bit more - /// efficient. - SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts; - - /// \brief Post-promotion worklist. - /// - /// Sometimes we discover an alloca which has a high probability of becoming - /// viable for SROA after a round of promotion takes place. In those cases, - /// the alloca is enqueued here for re-processing. - /// - /// Note that we have to be very careful to clear allocas out of this list in - /// the event they are deleted. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist; - - /// \brief A collection of alloca instructions we can directly promote. - std::vector<AllocaInst *> PromotableAllocas; - - /// \brief A worklist of PHIs to speculate prior to promoting allocas. - /// - /// All of these PHIs have been checked for the safety of speculation and by - /// being speculated will allow promoting allocas currently in the promotable - /// queue. - SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs; - - /// \brief A worklist of select instructions to speculate prior to promoting - /// allocas. - /// - /// All of these select instructions have been checked for the safety of - /// speculation and by being speculated will allow promoting allocas - /// currently in the promotable queue. - SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; - -public: - SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), - DT(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - - const char *getPassName() const override { return "SROA"; } - static char ID; - -private: - friend class PHIOrSelectSpeculator; - friend class AllocaSliceRewriter; - - bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); - AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P); - bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); - bool runOnAlloca(AllocaInst &AI); - void clobberUse(Use &U); - void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); - bool promoteAllocas(Function &F); -}; -} - -char SROA::ID = 0; - -FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { - return new SROA(RequiresDomTree); -} - -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) - /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. static Type *findCommonType(AllocaSlices::const_iterator B, @@ -1373,7 +1158,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
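// [Editorial sketch -- not part of this patch.] The loop in the hunk just
// below walks the instructions from the PHI down to the load and rejects
// speculation if any of them may write to memory: hoisting the load up into
// the predecessors is only sound if nothing in between could change the
// loaded value. The shape of that scan, modeled on a plain sequence:
#include <vector>

struct ToyInst { bool MayWrite = false; };

// PhiIdx precedes LoadIdx within the same block.
bool nothingWritesBetween(const std::vector<ToyInst> &Block, size_t PhiIdx,
                          size_t LoadIdx) {
  for (size_t I = PhiIdx; I != LoadIdx; ++I)
    if (Block[I].MayWrite)
      return false;
  return true;
}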
- for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; @@ -1934,10 +1719,10 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// \brief Test whether the given slice use can be promoted to a vector. /// -/// This function is called to test each entry in a partioning which is slated +/// This function is called to test each entry in a partition which is slated /// for a single slice. -static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, - const Slice &S, VectorType *Ty, +static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, + VectorType *Ty, uint64_t ElementSize, const DataLayout &DL) { // First validate the slice offsets. @@ -2012,8 +1797,7 @@ static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, - const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector<VectorType *, 4> CandidateTys; @@ -2130,7 +1914,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t RelEnd = S.endOffset() - AllocBeginOffset; // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. + // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; @@ -2199,7 +1983,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, +static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. @@ -2368,14 +2152,14 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, return V; } -namespace { /// \brief Visitor to rewrite instructions using p particular slice of an alloca /// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. -class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { +class llvm::sroa::AllocaSliceRewriter + : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. 
friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; @@ -2583,9 +2367,19 @@ private: V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) - V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, - "extract"); + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { + IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); + V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); + } + // It is possible that the extracted type is not the load type. This + // happens if there is a load past the end of the alloca, and as + // a consequence the slice is narrower but still a candidate for integer + // lowering. To handle this case, we just zero extend the extracted + // integer. + assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 && + "Can only handle an extract for an overly wide load"); + if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8) + V = IRB.CreateZExt(V, LI.getType()); return V; } @@ -2648,7 +2442,7 @@ private: DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); + IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving @@ -3126,7 +2920,7 @@ private: // dominate the PHI. IRBuilderTy PtrBuilder(IRB); if (isa<PHINode>(OldPtr)) - PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt()); + PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt()); else PtrBuilder.SetInsertPoint(OldPtr); PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); @@ -3169,7 +2963,6 @@ private: return true; } }; -} namespace { /// \brief Visitor to rewrite aggregate loads and stores as scalar. @@ -3181,8 +2974,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; - const DataLayout &DL; - /// Queue of pointer uses to analyze and potentially rewrite. SmallVector<Use *, 8> Queue; @@ -3194,8 +2985,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { Use *U; public: - AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} - /// Rewrite loads and stores through a pointer and all pointers derived from /// it. bool rewrite(Instruction &I) { @@ -3711,7 +3500,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { return true; }), Stores.end()); - // Now we have to go *back* through all te stores, because a later store may + // Now we have to go *back* through all the stores, because a later store may // have caused an earlier store's load to become unsplittable and if it is // unsplittable for the later store, then we can't rely on it being split in // the earlier store either. 
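// [Editorial sketch -- not part of this patch.] The visitLoadInst hunk above
// handles a load that reads past the end of the alloca: the slice is
// narrower than the load's type, so only SliceSize * 8 bits are extracted
// from the rewritten alloca and the result is zero-extended back to the
// load's full width. The same bit manipulation on plain integers
// (little-endian layout, extraction at offset 0 assumed):
#include <cstdint>

uint64_t extractAndExtend(uint64_t WideValue, unsigned SliceBits) {
  // Keep only the low SliceBits (the "extract"); the value is then already
  // zero-extended within the wider uint64_t (the "zext").
  uint64_t Mask = SliceBits >= 64 ? ~0ULL : ((1ULL << SliceBits) - 1);
  return WideValue & Mask;
}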
@@ -3773,7 +3562,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { "Cannot represent alloca access size using 64-bit integers!"); Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); @@ -3825,7 +3614,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } Value *StoreBasePtr = SI->getPointerOperand(); - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); @@ -3914,7 +3703,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (SplitLoads) { PLoad = (*SplitLoads)[Idx]; } else { - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3924,7 +3713,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } // And store this partition. - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); StoreInst *PStore = IRB.CreateAlignedStore( PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3972,7 +3761,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Mark the original store as dead now that we've split it up and kill its // slice. Note that we leave the original load in place unless this store - // was its ownly use. It may in turn be split up if it is an alloca load + // was its only use. It may in turn be split up if it is an alloca load // for some other alloca, but it may be a normal load. This may introduce // redundant loads, but where those can be merged the rest of the optimizer // should handle the merging, and this uncovers SSA splits which is more @@ -4024,7 +3813,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { /// at enabling promotion and if it was successful queues the alloca to be /// promoted. AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P) { + Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. @@ -4230,12 +4019,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); // Migrate debug information from the old alloca to the new alloca(s) - // and the individial partitions. + // and the individual partitions. if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) { auto *Var = DbgDecl->getVariable(); auto *Expr = DbgDecl->getExpression(); - DIBuilder DIB(*AI.getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); bool IsSplit = Pieces.size() > 1; for (auto Piece : Pieces) { // Create a piece expression describing the new partition or reuse AI's @@ -4308,7 +4096,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(DL); + AggLoadStoreRewriter AggRewriter; Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. 
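// [Editorial sketch -- not part of this patch.] presplitLoadsAndStores,
// rewritten above, replaces one wide load/store pair with one pair per
// partition, each at an adjusted pointer offset. On plain integers, the
// per-partition value that each new store writes is just a shifted, masked
// chunk of the original wide value (little-endian, PartOffsetBytes < 8 and
// PartSizeBytes <= 8 assumed for this model):
#include <cstdint>

uint64_t partitionChunk(uint64_t WideValue, unsigned PartOffsetBytes,
                        unsigned PartSizeBytes) {
  uint64_t Shifted = WideValue >> (8 * PartOffsetBytes);
  unsigned Bits = 8 * PartSizeBytes;
  uint64_t Mask = Bits >= 64 ? ~0ULL : ((1ULL << Bits) - 1);
  return Shifted & Mask;
}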
@@ -4388,107 +4176,29 @@ void SROA::deleteDeadInstructions( } } -static void enqueueUsersInWorklist(Instruction &I, - SmallVectorImpl<Instruction *> &Worklist, - SmallPtrSetImpl<Instruction *> &Visited) { - for (User *U : I.users()) - if (Visited.insert(cast<Instruction>(U)).second) - Worklist.push_back(cast<Instruction>(U)); -} - /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in /// the PromotableAllocas list. If that list is empty, there is nothing to do. -/// If there is a domtree available, we attempt to promote using the full power -/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is -/// based on the SSAUpdater utilities. This function returns whether any -/// promotion occurred. +/// This function returns whether any promotion occurred. bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; NumPromoted += PromotableAllocas.size(); - if (DT && !ForceSSAUpdater) { - DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); - PromotableAllocas.clear(); - return true; - } - - DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); - SSAUpdater SSA; - DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); - SmallVector<Instruction *, 64> Insts; - - // We need a worklist to walk the uses of each alloca. - SmallVector<Instruction *, 8> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - SmallVector<Instruction *, 32> DeadInsts; - - for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { - AllocaInst *AI = PromotableAllocas[Idx]; - Insts.clear(); - Worklist.clear(); - Visited.clear(); - - enqueueUsersInWorklist(*AI, Worklist, Visited); - - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - // FIXME: Currently the SSAUpdater infrastructure doesn't reason about - // lifetime intrinsics and so we strip them (and the bitcasts+GEPs - // leading to them) here. Eventually it should use them to optimize the - // scalar values produced. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - assert(II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end); - II->eraseFromParent(); - continue; - } - - // Push the loads and stores we find onto the list. SROA will already - // have validated that all loads and stores are viable candidates for - // promotion. - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - assert(LI->getType() == AI->getAllocatedType()); - Insts.push_back(LI); - continue; - } - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); - Insts.push_back(SI); - continue; - } - - // For everything else, we know that only no-op bitcasts and GEPs will - // make it this far, just recurse through them and recall them for later - // removal. 
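The deleted worklist machinery continues below; once it is gone, the entire promotion strategy is the single mem2reg call shown in the next hunk. A minimal sketch of the surviving path (hypothetical free function; PromoteMemToReg is called with the same signature the hunk uses):

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
using namespace llvm;

// Hand every promotable alloca to mem2reg in one shot. DT and AC must be
// valid for the function being transformed; no SSAUpdater fallback remains.
static bool promoteAll(SmallVectorImpl<AllocaInst *> &Allocas,
                       DominatorTree &DT, AssumptionCache *AC) {
  if (Allocas.empty())
    return false;
  PromoteMemToReg(Allocas, DT, /*AST=*/nullptr, AC);
  Allocas.clear();
  return true;
}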
- DeadInsts.push_back(I); - enqueueUsersInWorklist(*I, Worklist, Visited); - } - AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - while (!DeadInsts.empty()) - DeadInsts.pop_back_val()->eraseFromParent(); - AI->eraseFromParent(); - } - + DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } -bool SROA::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - +PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, + AssumptionCache &RunAC) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DT = &RunDT; + AC = &RunAC; BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -4527,12 +4237,55 @@ bool SROA::runOnFunction(Function &F) { PostPromotionWorklist.clear(); } while (!Worklist.empty()); - return Changed; + // FIXME: Even when promoting allocas we should preserve some abstract set of + // CFG-specific analyses. + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -void SROA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionCacheTracker>(); - if (RequiresDomTree) - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); +PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) { + return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F)); } + +/// A legacy pass for the legacy pass manager that wraps the \c SROA pass. +/// +/// This is in the llvm namespace purely to allow it to be a friend of the \c +/// SROA pass. +class llvm::sroa::SROALegacyPass : public FunctionPass { + /// The SROA implementation. 
+ SROA Impl; + +public: + SROALegacyPass() : FunctionPass(ID) { + initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + auto PA = Impl.runImpl( + F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); + return !PA.areAllPreserved(); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.setPreservesCFG(); + } + + const char *getPassName() const override { return "SROA"; } + static char ID; +}; + +char SROALegacyPass::ID = 0; + +FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); } + +INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", + "Scalar Replacement Of Aggregates", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", + false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index d5d3605..52d477c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -16,7 +16,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" @@ -27,10 +30,9 @@ using namespace llvm; /// initializeScalarOptsPasses - Initialize all passes linked into the /// ScalarOpts library. 
void llvm::initializeScalarOpts(PassRegistry &Registry) { - initializeADCEPass(Registry); + initializeADCELegacyPassPass(Registry); initializeBDCEPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); - initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -66,7 +68,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRewriteStatepointsForGCPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); - initializeSROAPass(Registry); + initializeSROALegacyPassPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); @@ -81,6 +83,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializePlaceSafepointsPass(Registry); initializeFloat2IntPass(Registry); initializeLoopDistributePass(Registry); + initializeLoopLoadEliminationPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -225,15 +228,15 @@ void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) { } void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); + unwrap(PM)->add(createTypeBasedAAWrapperPass()); } void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createScopedNoAliasAAPass()); + unwrap(PM)->add(createScopedNoAliasAAWrapperPass()); } void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createBasicAliasAnalysisPass()); + unwrap(PM)->add(createBasicAAWrapperPass()); } void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index d955da7..114d22d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -60,6 +60,7 @@ STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); STATISTIC(NumConverted, "Number of aggregates converted to scalar"); namespace { +#define SROA SROA_ struct SROA : public FunctionPass { SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { @@ -382,8 +383,8 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } - AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", - AI->getParent()->begin()); + AllocaInst *NewAI = + new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front()); ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } @@ -1195,7 +1196,7 @@ static bool isSafePHIToSpeculate(PHINode *PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
- for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 0493003..054bacd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -253,10 +253,10 @@ bool Scalarizer::doInitialization(Module &M) { } bool Scalarizer::runOnFunction(Function &F) { - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { - Instruction *I = II; + assert(Gathered.empty() && Scattered.empty()); + for (BasicBlock &BB : F) { + for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { + Instruction *I = &*II; bool Done = visit(I); ++II; if (Done && I->getType()->isVoidTy()) @@ -285,7 +285,7 @@ Scatterer Scalarizer::scatter(Instruction *Point, Value *V) { } // In the fallback case, just put the scattered before Point and // keep the result local to Point. - return Scatterer(Point->getParent(), Point, V); + return Scatterer(Point->getParent(), Point->getIterator(), V); } // Replace Op with the gathered form of the components in CV. Defer the @@ -377,7 +377,7 @@ bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(I.getParent(), &I); + IRBuilder<> Builder(&I); Scatterer Op0 = scatter(&I, I.getOperand(0)); Scatterer Op1 = scatter(&I, I.getOperand(1)); assert(Op0.size() == NumElems && "Mismatched binary operation"); @@ -397,7 +397,7 @@ bool Scalarizer::visitSelectInst(SelectInst &SI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Op1 = scatter(&SI, SI.getOperand(1)); Scatterer Op2 = scatter(&SI, SI.getOperand(2)); assert(Op1.size() == NumElems && "Mismatched select"); @@ -438,7 +438,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { if (!VT) return false; - IRBuilder<> Builder(GEPI.getParent(), &GEPI); + IRBuilder<> Builder(&GEPI); unsigned NumElems = VT->getNumElements(); unsigned NumIndices = GEPI.getNumIndices(); @@ -472,7 +472,7 @@ bool Scalarizer::visitCastInst(CastInst &CI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(CI.getParent(), &CI); + IRBuilder<> Builder(&CI); Scatterer Op0 = scatter(&CI, CI.getOperand(0)); assert(Op0.size() == NumElems && "Mismatched cast"); ValueVector Res; @@ -492,7 +492,7 @@ bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { unsigned DstNumElems = DstVT->getNumElements(); unsigned SrcNumElems = SrcVT->getNumElements(); - IRBuilder<> Builder(BCI.getParent(), &BCI); + IRBuilder<> Builder(&BCI); Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); ValueVector Res; Res.resize(DstNumElems); @@ -569,7 +569,7 @@ bool Scalarizer::visitPHINode(PHINode &PHI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(PHI.getParent(), &PHI); + IRBuilder<> Builder(&PHI); ValueVector Res; Res.resize(NumElems); @@ -600,7 +600,7 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(LI.getParent(), &LI); + IRBuilder<> Builder(&LI); Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); ValueVector Res; Res.resize(NumElems); @@ -625,7 +625,7 @@ bool 
Scalarizer::visitStoreInst(StoreInst &SI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Ptr = scatter(&SI, SI.getPointerOperand()); Scatterer Val = scatter(&SI, FullValue); @@ -642,7 +642,9 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) { // Delete the instructions that we scalarized. If a full vector result // is still needed, recreate it using InsertElements. bool Scalarizer::finish() { - if (Gathered.empty()) + // The presence of data in Gathered or Scattered indicates changes + // made to the Function. + if (Gathered.empty() && Scattered.empty()) return false; for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); GMI != GME; ++GMI) { @@ -655,7 +657,7 @@ bool Scalarizer::finish() { Value *Res = UndefValue::get(Ty); BasicBlock *BB = Op->getParent(); unsigned Count = Ty->getVectorNumElements(); - IRBuilder<> Builder(BB, Op); + IRBuilder<> Builder(Op); if (isa<PHINode>(Op)) Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); for (unsigned I = 0; I < Count; ++I) diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 4a87531..86a10d2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,6 +156,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -164,6 +168,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Operator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -174,6 +179,7 @@ #include "llvm/IR/IRBuilder.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<bool> DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), @@ -319,8 +325,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } bool doInitialization(Module &M) override { @@ -373,15 +382,42 @@ private: /// /// Verified in @i32_add in split-gep.ll bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); + /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow. + /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting + /// the constant offset. After extraction, it becomes desirable to reunite the + /// distributed sexts. For example, + /// + /// &a[sext(i +nsw (j +nsw 5))] + /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)] + /// => constant extraction &a[sext(i) + sext(j)] + 5 + /// => reunion &a[sext(i +nsw j)] + 5 + bool reuniteExts(Function &F); + /// A helper that reunites sexts in an instruction. + bool reuniteExts(Instruction *I); + /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+ Instruction *findClosestMatchingDominator(const SCEV *Key, + Instruction *Dominatee); /// Verify F is free of dead code. void verifyNoDeadCode(Function &F); + bool hasMoreThanOneUseInLoop(Value *V, Loop *L); + // Swap the index operands of two GEPs. + void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second); + // Check if it is safe to swap the operands of two GEPs. + bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second, + Loop *CurLoop); + const DataLayout *DL; - const DominatorTree *DT; + DominatorTree *DT; + ScalarEvolution *SE; const TargetMachine *TM; + + LoopInfo *LI; + TargetLibraryInfo *TLI; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. bool LowerGEP; + DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs; }; } // anonymous namespace @@ -391,7 +427,10 @@ INITIALIZE_PASS_BEGIN( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, @@ -734,6 +773,13 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Type *I8PtrTy = Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); Value *ResultPtr = Variadic->getOperand(0); + Loop *L = LI->getLoopFor(Variadic->getParent()); + // The base is a swap candidate only if it is loop-invariant and has at most + // one use inside the loop. + bool isSwapCandidate = + L && L->isLoopInvariant(ResultPtr) && + !hasMoreThanOneUseInLoop(ResultPtr, L); + Value *FirstResult = nullptr; + if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); @@ -762,6 +808,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( // Create an ugly GEP with a single index for each index. ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); + if (FirstResult == nullptr) + FirstResult = ResultPtr; } } @@ -770,7 +818,17 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); - } + } else + isSwapCandidate = false; + + // If we created a GEP with a constant index, and the base is loop-invariant, + // swap the first GEP with it so LICM can later hoist the constant GEP out of + // the loop. + GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult); + GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr); + if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L)) + swapGEPOperand(FirstGEP, SecondGEP); + if (ResultPtr->getType() != Variadic->getType()) ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); @@ -891,13 +949,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Clear the inbounds attribute because the new index may be off-bound.
// e.g., // - // b = add i64 a, 5 - // addr = gep inbounds float* p, i64 b + // b = add i64 a, 5 + // addr = gep inbounds float, float* p, i64 b // // is transformed to: // - // addr2 = gep float* p, i64 a - // addr = gep float* addr2, i64 5 + // addr2 = gep float, float* p, i64 a ; inbounds removed + // addr = gep inbounds float, float* addr2, i64 5 // // If a is -4, although the old index b is in bounds, the new index a is // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the @@ -907,6 +965,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // // TODO(jingyue): do some range analysis to keep as many inbounds as // possible. GEPs with inbounds are more friendly to alias analysis. + bool GEPWasInBounds = GEP->isInBounds(); GEP->setIsInBounds(false); // Lowers a GEP to either GEPs with a single index or arithmetic operations. @@ -968,6 +1027,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); } else { // Unlikely but possible. For example, // #pragma pack(1) @@ -990,6 +1051,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { Type::getInt8Ty(GEP->getContext()), NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); if (GEP->getType() != I8PtrTy) NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); } @@ -1008,24 +1071,96 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { return false; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) Changed |= splitGEP(GEP); - } - // No need to split GEP ConstantExprs because all its indices are constant - // already. - } + // No need to split GEP ConstantExprs because all its indices are constant + // already. } + Changed |= reuniteExts(F); + if (VerifyNoDeadCode) verifyNoDeadCode(F); return Changed; } +Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator( + const SCEV *Key, Instruction *Dominatee) { + auto Pos = DominatingExprs.find(Key); + if (Pos == DominatingExprs.end()) + return nullptr; + + auto &Candidates = Pos->second; + // Because we process the basic blocks in pre-order of the dominator tree, a + // candidate that doesn't dominate the current instruction won't dominate any + // future instruction either. Therefore, we pop it out of the stack. This + // optimization makes the algorithm O(n). 
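The loop that follows implements the comment above. Taken together with it, the whole reunification step can be condensed into one hypothetical routine (the subtraction case and the dead-code cleanup of the real patch are omitted); it is correct only because blocks are visited in dominator-tree pre-order, so a popped candidate can never be needed again:

// In the context of this file (PatternMatch is already in scope): collapse
// sext(LHS) + sext(RHS) to sext(Dom) when a dominating "add nsw LHS, RHS"
// exists under the same SCEV key.
static bool tryReunite(Instruction *I, ScalarEvolution *SE, DominatorTree *DT,
                       DenseMap<const SCEV *, SmallVector<Instruction *, 2>>
                           &Exprs) {
  Value *LHS = nullptr, *RHS = nullptr;
  if (!match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
      LHS->getType() != RHS->getType())
    return false;
  const SCEV *Key = SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
  auto It = Exprs.find(Key);
  if (It == Exprs.end())
    return false;
  auto &Candidates = It->second;
  while (!Candidates.empty() && !DT->dominates(Candidates.back(), I))
    Candidates.pop_back(); // pre-order traversal makes every pop final
  if (Candidates.empty())
    return false;
  Instruction *NewSExt = new SExtInst(Candidates.back(), I->getType(), "", I);
  NewSExt->takeName(I);
  I->replaceAllUsesWith(NewSExt);
  return true;
}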
+ while (!Candidates.empty()) { + Instruction *Candidate = Candidates.back(); + if (DT->dominates(Candidate, Dominatee)) + return Candidate; + Candidates.pop_back(); + } + return nullptr; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { + if (!SE->isSCEVable(I->getType())) + return false; + + // Dom: LHS+RHS + // I: sext(LHS)+sext(RHS) + // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom). + // TODO: handle zext + Value *LHS = nullptr, *RHS = nullptr; + if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) || + match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) { + if (LHS->getType() == RHS->getType()) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + if (auto *Dom = findClosestMatchingDominator(Key, I)) { + Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); + NewSExt->takeName(I); + I->replaceAllUsesWith(NewSExt); + RecursivelyDeleteTriviallyDeadInstructions(I); + return true; + } + } + } + + // Add I to DominatingExprs if it's an add/sub that can't sign overflow. + if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) || + match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) { + if (isKnownNotFullPoison(I)) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + DominatingExprs[Key].push_back(I); + } + } + return false; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { + bool Changed = false; + DominatingExprs.clear(); + for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT); + Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { + BasicBlock *BB = Node->getBlock(); + for (auto I = BB->begin(); I != BB->end(); ) { + Instruction *Cur = &*I++; + Changed |= reuniteExts(Cur); + } + } + return Changed; +} + void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { for (auto &B : F) { for (auto &I : B) { @@ -1038,3 +1173,93 @@ void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { } } } + +bool SeparateConstOffsetFromGEP::isLegalToSwapOperand( + GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) { + if (!FirstGEP || !FirstGEP->hasOneUse()) + return false; + + if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent()) + return false; + + if (FirstGEP == SecondGEP) + return false; + + unsigned FirstNum = FirstGEP->getNumOperands(); + unsigned SecondNum = SecondGEP->getNumOperands(); + // Give up if the number of operands is not 2. + if (FirstNum != SecondNum || FirstNum != 2) + return false; + + Value *FirstBase = FirstGEP->getOperand(0); + Value *SecondBase = SecondGEP->getOperand(0); + Value *FirstOffset = FirstGEP->getOperand(1); + // Give up if the index of the first GEP is loop invariant. + if (CurLoop->isLoopInvariant(FirstOffset)) + return false; + + // Give up if the bases don't have the same type. + if (FirstBase->getType() != SecondBase->getType()) + return false; + + Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset); + + // Check if the second operand of the first GEP has a constant coefficient. + // For example, in the following code, we won't gain anything by + // hoisting the second GEP out because the second GEP can be folded away. + // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256 + // %67 = shl i64 %scevgep.sum.ur159, 2 + // %uglygep160 = getelementptr i8* %65, i64 %67 + // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024 + + // Skip a constant shift instruction which may be generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() && + isa<ConstantInt>(FirstOffsetDef->getOperand(1))) + FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0)); + + // Give up if FirstOffsetDef is an Add or Sub with a constant, because it + // may not be profitable at all due to constant folding. + if (FirstOffsetDef) + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) { + unsigned opc = BO->getOpcode(); + if ((opc == Instruction::Add || opc == Instruction::Sub) && + (isa<ConstantInt>(BO->getOperand(0)) || + isa<ConstantInt>(BO->getOperand(1)))) + return false; + } + return true; +} + +bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) { + int UsesInLoop = 0; + for (User *U : V->users()) { + if (Instruction *User = dyn_cast<Instruction>(U)) + if (L->contains(User)) + if (++UsesInLoop > 1) + return true; + } + return false; +} + +void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, + GetElementPtrInst *Second) { + Value *Offset1 = First->getOperand(1); + Value *Offset2 = Second->getOperand(1); + First->setOperand(1, Offset2); + Second->setOperand(1, Offset1); + + // We changed p+o+c to p+c+o, p+c may not be inbounds anymore. + const DataLayout &DAL = First->getModule()->getDataLayout(); + APInt Offset(DAL.getPointerSizeInBits( + cast<PointerType>(First->getType())->getAddressSpace()), + 0); + Value *NewBase = + First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset); + uint64_t ObjectSize; + if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) || + Offset.ugt(ObjectSize)) { + First->setIsInBounds(false); + Second->setIsInBounds(false); + } else + First->setIsInBounds(true); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 231411a..63c8836 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" @@ -67,15 +68,14 @@ static bool mergeEmptyReturnBlocks(Function &F) { // single PHI node that is the operand to the return. if (Ret != &BB.front()) { // Check for something else in the block. - BasicBlock::iterator I = Ret; + BasicBlock::iterator I(Ret); --I; // Skip over debug info. while (isa<DbgInfoIntrinsic>(I) && I != BB.begin()) --I; if (!isa<DbgInfoIntrinsic>(I) && - (!isa<PHINode>(I) || I != BB.begin() || - Ret->getNumOperands() == 0 || - Ret->getOperand(0) != I)) + (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 || + Ret->getOperand(0) != &*I)) continue; } @@ -136,7 +136,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) { + if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) { LocalChange = true; ++NumSimpl; } @@ -217,6 +217,7 @@ struct CFGSimplifyPass : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index f49f4ea..64109b2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -48,7 +48,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); @@ -66,7 +66,7 @@ char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) FunctionPass *llvm::createSinkingPass() { return new Sinking(); } @@ -99,7 +99,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, bool Sinking::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool MadeChange, EverMadeChange = false; @@ -119,7 +119,7 @@ bool Sinking::runOnFunction(Function &F) { bool Sinking::ProcessBlock(BasicBlock &BB) { // Can't sink anything out of a block that has less than two successors. - if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; + if (BB.getTerminator()->getNumSuccessors() <= 1) return false; // Don't bother sinking code out of unreachable blocks. In addition to being // unprofitable, it can also lead to infinite looping, because in an @@ -134,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { bool ProcessedBegin = false; SmallPtrSet<Instruction *, 8> Stores; do { - Instruction *Inst = I; // The instruction to sink. + Instruction *Inst = &*I; // The instruction to sink. // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. @@ -165,14 +165,16 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { MemoryLocation Loc = MemoryLocation::get(L); for (Instruction *S : Stores) - if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod) + if (AA->getModRefInfo(S, Loc) & MRI_Mod) return false; } - if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) + if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() || + Inst->mayThrow()) return false; - // Convergent operations can only be moved to control equivalent blocks. + // Convergent operations cannot be made control-dependent on additional + // values. 
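The checks in this hunk compose into a single predicate; restated as a hypothetical helper that condenses isSafeToMove (the caller gathers the block's stores, as ProcessBlock does above, and the convergent check appears in the code that follows):

// A load may sink only past stores that provably do not modify its location;
// writes, EH pads, potentially-throwing instructions, terminators, PHIs, and
// convergent calls never sink.
static bool safeToSink(Instruction *Inst, AliasAnalysis *AA,
                       SmallPtrSetImpl<Instruction *> &Stores) {
  if (Inst->mayWriteToMemory())
    return false;
  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
    MemoryLocation Loc = MemoryLocation::get(L);
    for (Instruction *S : Stores)
      if (AA->getModRefInfo(S, Loc) & MRI_Mod)
        return false;
  }
  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() ||
      Inst->mayThrow())
    return false;
  if (auto CS = CallSite(Inst))
    if (CS.hasFnAttr(Attribute::Convergent))
      return false;
  return true;
}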
if (auto CS = CallSite(Inst)) { if (CS.hasFnAttr(Attribute::Convergent)) return false; @@ -193,6 +195,11 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, if (Inst->getParent() == SuccToSinkTo) return false; + // It's never legal to sink an instruction into a block which terminates in an + // EH-pad. + if (SuccToSinkTo->getTerminator()->isExceptional()) + return false; + // If the block has multiple predecessors, this would introduce computation // on different code paths. We could split the critical edge, but for now we // just punt. @@ -278,6 +285,6 @@ bool Sinking::SinkInstruction(Instruction *Inst, dbgs() << ")\n"); // Move the instruction. - Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); + Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt()); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ff3f00a..147d615 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -227,7 +227,7 @@ bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock, // changes the list that I is iterating through. auto Current = I; ++I; - if (!NotHoisted.count(Current)) { + if (!NotHoisted.count(&*Current)) { Current->moveBefore(ToBlock.getTerminator()); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 6d9d417..1faa65e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -131,7 +131,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); // We do not modify the shape of the CFG. AU.setPreservesCFG(); @@ -212,7 +212,7 @@ char StraightLineStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) @@ -234,6 +234,7 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, Basis.CandidateKind == C.CandidateKind); } +// TODO: use TTI->getGEPCost. static bool isGEPFoldable(GetElementPtrInst *GEP, const TargetTransformInfo *TTI, const DataLayout *DL) { @@ -523,7 +524,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( continue; const SCEV *OrigIndexExpr = IndexExprs[I - 1]; - IndexExprs[I - 1] = SE->getConstant(OrigIndexExpr->getType(), 0); + IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType()); // The base of this candidate is GEP's base plus the offsets of all // indices except this current one. @@ -689,7 +690,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) { TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Traverse the dominator tree in the depth-first order. 
This order makes sure // all bases of a candidate are in Candidates when we process it. for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT); diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 4f23e20..662513c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -358,13 +358,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { BasicBlock *BB = N->getNodeAs<BasicBlock>(); BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = Term->getSuccessor(i); - - if (Visited.count(Succ)) { + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) Loops[Succ] = BB; - } - } } } @@ -903,14 +899,14 @@ void StructurizeCFG::rebuildSSA() { continue; } - if (DT->dominates(II, User)) + if (DT->dominates(&*II, User)) continue; if (!Initialized) { Value *Undef = UndefValue::get(II->getType()); Updater.Initialize(II->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); - Updater.AddAvailableValue(BB, II); + Updater.AddAvailableValue(BB, &*II); Initialized = true; } Updater.RewriteUseAfterInsertions(U); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index c7de2e2..4e84d72 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" @@ -136,6 +137,7 @@ FunctionPass *llvm::createTailCallEliminationPass() { void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } /// \brief Scan the specified function for alloca instructions. @@ -195,8 +197,8 @@ struct AllocaDerivedValueTracker { case Instruction::Call: case Instruction::Invoke: { CallSite CS(I); - bool IsNocapture = !CS.isCallee(U) && - CS.doesNotCapture(CS.getArgumentNo(U)); + bool IsNocapture = + CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U)); callUsesLocalStack(CS, IsNocapture); if (IsNocapture) { // If the alloca-derived argument is passed in as nocapture, then it @@ -302,7 +304,9 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { if (!CI || CI->isTailCall()) continue; - if (CI->doesNotAccessMemory()) { + bool IsNoTail = CI->isNoTailCall(); + + if (!IsNoTail && CI->doesNotAccessMemory()) { // A call to a readnone function whose arguments are all things computed // outside this function can be marked tail. Even if you stored the // alloca address into a global, a readnone function can't load the @@ -330,7 +334,7 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { } } - if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { DeferredTails.push_back(CI); } else { AllCallsAreTailCalls = false; @@ -404,7 +408,7 @@ bool TailCallElim::runTRE(Function &F) { // Until this is resolved, disable this transformation if that would ever // happen. This bug is PR962. 
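Stepping back to the capture analysis earlier in this hunk: the old code special-cased the callee operand by hand, while the new code asks the more precise data-operand question. The distinction, as a hypothetical helper:

// True when use U inside call/invoke I is an argument (or bundle operand)
// marked nocapture. The callee operand of a call is a use too, but it is not
// a data operand, so it can never count as a nocapture argument.
static bool useIsNocaptureOperand(Instruction *I, const Use *U) {
  CallSite CS(I);
  return CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));
}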
for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { - BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB. + BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB. if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, !CanTRETailMarkedCall); @@ -421,9 +425,7 @@ bool TailCallElim::runTRE(Function &F) { // with themselves. Check to see if we did and clean up our mess if so. This // occurs when a function passes an argument straight through to its tail // call. - for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { - PHINode *PN = ArgumentPHIs[i]; - + for (PHINode *PN : ArgumentPHIs) { // If the PHI Node is a dynamic constant, replace it with the value it is. if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) { PN->replaceAllUsesWith(PNV); @@ -464,10 +466,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // return value of the call, it must only use things that are defined before // the call, or movable instructions between the call and the instruction // itself. - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (I->getOperand(i) == CI) - return false; - return true; + return std::find(I->op_begin(), I->op_end(), CI) == I->op_end(); } /// Return true if the specified value is the same when the return would exit @@ -574,7 +573,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI = nullptr; - BasicBlock::iterator BBI = TI; + BasicBlock::iterator BBI(TI); while (true) { CI = dyn_cast<CallInst>(BBI); if (CI && CI->getCalledFunction() == F) @@ -595,9 +594,8 @@ TailCallElim::FindTRECandidate(Instruction *TI, // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && - FirstNonDbg(BB->front()) == CI && - FirstNonDbg(std::next(BB->begin())) == TI && - CI->getCalledFunction() && + FirstNonDbg(BB->front().getIterator()) == CI && + FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. @@ -636,19 +634,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // tail call if all of the instructions between the call and the return are // movable to above the call itself, leaving the call next to the return. // Check that this is the case now. - BasicBlock::iterator BBI = CI; + BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (CanMoveAboveCall(BBI, CI)) continue; + if (CanMoveAboveCall(&*BBI, CI)) continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the // case, and if so, remember the initial accumulator value for later. if ((AccumulatorRecursionEliminationInitVal = - CanTransformAccumulatorRecursion(BBI, CI))) { + CanTransformAccumulatorRecursion(&*BBI, CI))) { // Yes, this is accumulator recursion. Remember which instruction // accumulates. - AccumulatorRecursionInstr = BBI; + AccumulatorRecursionInstr = &*BBI; } else { return false; // Otherwise, we cannot eliminate the tail recursion! 
} @@ -698,19 +696,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, NEBI = NewEntry->begin(); OEBI != E; ) if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) if (isa<ConstantInt>(AI->getArraySize())) - AI->moveBefore(NEBI); + AI->moveBefore(&*NEBI); // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. // For now, we initialize each PHI to only have the real arguments // which are passed in. - Instruction *InsertPos = OldEntry->begin(); + Instruction *InsertPos = &OldEntry->front(); for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos); I->replaceAllUsesWith(PN); // Everyone use the PHI node now! - PN->addIncoming(I, NewEntry); + PN->addIncoming(&*I, NewEntry); ArgumentPHIs.push_back(PN); } } @@ -739,10 +737,9 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, Instruction *AccRecInstr = AccumulatorRecursionInstr; // Start by inserting a new PHI node for the accumulator. pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry); - PHINode *AccPN = - PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), - std::distance(PB, PE) + 1, - "accumulator.tr", OldEntry->begin()); + PHINode *AccPN = PHINode::Create( + AccumulatorRecursionEliminationInitVal->getType(), + std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front()); // Loop over all of the predecessors of the tail recursion block. For the // real entry into the function we seed the PHI with the initial value, diff --git a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp index 03c3a80..409326e 100644 --- a/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ASanStackFrameLayout.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include <algorithm> namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index e9f6239..0262358f 100644 --- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -52,32 +52,34 @@ // http://wiki.dwarfstd.org/index.php?title=Path_Discriminators //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "add-discriminators" namespace { - struct AddDiscriminators : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - AddDiscriminators() : FunctionPass(ID) { - initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); - } +struct 
AddDiscriminators : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + AddDiscriminators() : FunctionPass(ID) { + initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &F) override; - }; + bool runOnFunction(Function &F) override; +}; } char AddDiscriminators::ID = 0; @@ -89,17 +91,17 @@ INITIALIZE_PASS_END(AddDiscriminators, "add-discriminators", // Command line option to disable discriminator generation even in the // presence of debug information. This is only needed when debugging // debug info generation issues. -static cl::opt<bool> -NoDiscriminators("no-discriminators", cl::init(false), - cl::desc("Disable generation of discriminator information.")); +static cl::opt<bool> NoDiscriminators( + "no-discriminators", cl::init(false), + cl::desc("Disable generation of discriminator information.")); FunctionPass *llvm::createAddDiscriminatorsPass() { return new AddDiscriminators(); } static bool hasDebugInfo(const Function &F) { - NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu"); - return CUNodes != nullptr; + DISubprogram *S = getDISubprogram(&F); + return S != nullptr; } /// \brief Assign DWARF discriminators. @@ -159,8 +161,7 @@ bool AddDiscriminators::runOnFunction(Function &F) { // Similarly, if the function has no debug info, do nothing. // Finally, if this module is built with dwarf versions earlier than 4, // do nothing (discriminator support is a DWARF 4 feature). - if (NoDiscriminators || - !hasDebugInfo(F) || + if (NoDiscriminators || !hasDebugInfo(F) || F.getParent()->getDwarfVersion() < 4) return false; @@ -169,59 +170,77 @@ bool AddDiscriminators::runOnFunction(Function &F) { LLVMContext &Ctx = M->getContext(); DIBuilder Builder(*M, /*AllowUnresolved*/ false); - // Traverse all the blocks looking for instructions in different - // blocks that are at the same file:line location. - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *B = I; - TerminatorInst *Last = B->getTerminator(); - const DILocation *LastDIL = Last->getDebugLoc(); - if (!LastDIL) - continue; - - for (unsigned I = 0; I < Last->getNumSuccessors(); ++I) { - BasicBlock *Succ = Last->getSuccessor(I); - Instruction *First = Succ->getFirstNonPHIOrDbgOrLifetime(); - const DILocation *FirstDIL = First->getDebugLoc(); - if (!FirstDIL) + typedef std::pair<StringRef, unsigned> Location; + typedef DenseMap<const BasicBlock *, Metadata *> BBScopeMap; + typedef DenseMap<Location, BBScopeMap> LocationBBMap; + + LocationBBMap LBM; + + // Traverse all instructions in the function. If the source line location + // of the instruction appears in another basic block, assign a new + // discriminator to this instruction. + for (BasicBlock &B : F) { + for (auto &I : B.getInstList()) { + if (isa<DbgInfoIntrinsic>(&I)) + continue; + const DILocation *DIL = I.getDebugLoc(); + if (!DIL) + continue; + Location L = std::make_pair(DIL->getFilename(), DIL->getLine()); + auto &BBMap = LBM[L]; + auto R = BBMap.insert(std::make_pair(&B, (Metadata *)nullptr)); + if (BBMap.size() == 1) + continue; + bool InsertSuccess = R.second; + Metadata *&NewScope = R.first->second; + // If we could insert a different block in the same location, a + // discriminator is needed to distinguish both instructions.
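The test that follows is the heart of the new bookkeeping: instructions are keyed by (file, line), the first basic block seen at a location needs no discriminator, and only a newly inserted block at an already-seen location allocates one. As a hypothetical helper:

// Returns true exactly when BB is a *new* basic block at a source location
// some other block already used, i.e. when a fresh discriminator scope is due.
static bool needsDiscriminator(
    const DILocation *DIL, const BasicBlock *BB,
    DenseMap<std::pair<StringRef, unsigned>,
             DenseMap<const BasicBlock *, Metadata *>> &LBM) {
  auto &BBMap = LBM[std::make_pair(DIL->getFilename(), DIL->getLine())];
  bool Inserted = BBMap.insert(std::make_pair(BB, (Metadata *)nullptr)).second;
  return BBMap.size() > 1 && Inserted;
}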
+ if (InsertSuccess) { + auto *Scope = DIL->getScope(); + auto *File = + Builder.createFile(DIL->getFilename(), Scope->getDirectory()); + NewScope = Builder.createLexicalBlockFile( + Scope, File, DIL->computeNewDiscriminator()); + } + I.setDebugLoc(DILocation::get(Ctx, DIL->getLine(), DIL->getColumn(), + NewScope, DIL->getInlinedAt())); + DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" + << DIL->getColumn() << ":" + << dyn_cast<DILexicalBlockFile>(NewScope)->getDiscriminator() + << I << "\n"); + Changed = true; + } + } + + // Traverse all instructions and assign new discriminators to call + // instructions with the same line number that are in the same basic block. + // Sample-based profiling needs to distinguish different function calls within + // the same source line for correct profile annotation. + for (BasicBlock &B : F) { + const DILocation *FirstDIL = nullptr; + for (auto &I : B.getInstList()) { + CallInst *Current = dyn_cast<CallInst>(&I); + if (!Current || isa<DbgInfoIntrinsic>(&I)) continue; - // If the first instruction (First) of Succ is at the same file - // location as B's last instruction (Last), add a new - // discriminator for First's location and all the instructions - // in Succ that share the same location with First. - if (!FirstDIL->canDiscriminate(*LastDIL)) { - // Create a new lexical scope and compute a new discriminator - // number for it. - StringRef Filename = FirstDIL->getFilename(); - auto *Scope = FirstDIL->getScope(); - auto *File = Builder.createFile(Filename, Scope->getDirectory()); - - // FIXME: Calculate the discriminator here, based on local information, - // and delete DILocation::computeNewDiscriminator(). The current - // solution gives different results depending on other modules in the - // same context. All we really need is to discriminate between - // FirstDIL and LastDIL -- a local map would suffice. - unsigned Discriminator = FirstDIL->computeNewDiscriminator(); - auto *NewScope = - Builder.createLexicalBlockFile(Scope, File, Discriminator); - auto *NewDIL = - DILocation::get(Ctx, FirstDIL->getLine(), FirstDIL->getColumn(), - NewScope, FirstDIL->getInlinedAt()); - DebugLoc newDebugLoc = NewDIL; - - // Attach this new debug location to First and every - // instruction following First that shares the same location.
- for (BasicBlock::iterator I1(*First), E1 = Succ->end(); I1 != E1; - ++I1) { - if (I1->getDebugLoc().get() != FirstDIL) - break; - I1->setDebugLoc(newDebugLoc); - DEBUG(dbgs() << NewDIL->getFilename() << ":" << NewDIL->getLine() - << ":" << NewDIL->getColumn() << ":" - << NewDIL->getDiscriminator() << *I1 << "\n"); + DILocation *CurrentDIL = Current->getDebugLoc(); + if (FirstDIL) { + if (CurrentDIL && CurrentDIL->getLine() == FirstDIL->getLine() && + CurrentDIL->getFilename() == FirstDIL->getFilename()) { + auto *Scope = FirstDIL->getScope(); + auto *File = Builder.createFile(FirstDIL->getFilename(), + Scope->getDirectory()); + auto *NewScope = Builder.createLexicalBlockFile( + Scope, File, FirstDIL->computeNewDiscriminator()); + Current->setDebugLoc(DILocation::get( + Ctx, CurrentDIL->getLine(), CurrentDIL->getColumn(), NewScope, + CurrentDIL->getInlinedAt())); + Changed = true; + } else { + FirstDIL = CurrentDIL; } - DEBUG(dbgs() << "\n"); - Changed = true; + } else { + FirstDIL = CurrentDIL; } } } diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index ef7daca..72db980 100644 --- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -41,8 +41,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { // Loop through all of our successors and make sure they know that one // of their predecessors is going away. - for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) - BBTerm->getSuccessor(i)->removePredecessor(BB); + for (BasicBlock *Succ : BBTerm->successors()) + Succ->removePredecessor(BB); // Zap all the instructions in the block. while (!BB->empty()) { @@ -65,7 +65,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { /// any single-entry PHI nodes in it, fold them away. This handles the case /// when all entries to the PHI nodes in a block are guaranteed equal, such as /// when the block has exactly one predecessor. -void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, +void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, MemoryDependenceAnalysis *MemDep) { if (!isa<PHINode>(BB->begin())) return; @@ -77,8 +77,6 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, AliasAnalysis *AA, if (MemDep) MemDep->removeInstruction(PN); // Memdep updates AA itself. - else if (AA && isa<PointerType>(PN->getType())) - AA->deleteValue(PN); PN->eraseFromParent(); } @@ -108,7 +106,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor, /// if possible. The return value indicates success or failure. bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, - LoopInfo *LI, AliasAnalysis *AA, + LoopInfo *LI, MemoryDependenceAnalysis *MemDep) { // Don't merge away blocks who have their address taken. if (BB->hasAddressTaken()) return false; @@ -119,8 +117,9 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, // Don't break self-loops. if (PredBB == BB) return false; - // Don't break invokes. - if (isa<InvokeInst>(PredBB->getTerminator())) return false; + // Don't break unwinding instructions. + if (PredBB->getTerminator()->isExceptional()) + return false; succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB)); BasicBlock *OnlySucc = BB; @@ -145,7 +144,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT, // Begin by getting rid of unneeded PHIs. 
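With the AliasAnalysis parameter gone, the PHI folding invoked just below is conceptually tiny; a sketch that ignores the MemoryDependenceAnalysis update the real function performs:

// In a block with a single predecessor every PHI has exactly one incoming
// value, so each PHI is just a copy and can be replaced outright.
static void foldSingleEntryPHIs(BasicBlock *BB) {
  while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
    PN->replaceAllUsesWith(PN->getIncomingValue(0));
    PN->eraseFromParent();
  }
}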
if (isa<PHINode>(BB->front())) - FoldSingleEntryPHINodes(BB, AA, MemDep); + FoldSingleEntryPHINodes(BB, MemDep); // Delete the unconditional branch from the predecessor... PredBB->getInstList().pop_back(); @@ -253,7 +252,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, // block. assert(SP == BB && "CFG broken"); SP = nullptr; - return SplitBlock(Succ, Succ->begin(), DT, LI); + return SplitBlock(Succ, &Succ->front(), DT, LI); } // Otherwise, if BB has a single successor, split it at the bottom of the @@ -284,8 +283,8 @@ llvm::SplitAllCriticalEdges(Function &F, /// BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI) { - BasicBlock::iterator SplitIt = SplitPt; - while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt)) + BasicBlock::iterator SplitIt = SplitPt->getIterator(); + while (isa<PHINode>(SplitIt) || SplitIt->isEHPad()) ++SplitIt; BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); @@ -393,7 +392,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, /// from NewBB. This also updates AliasAnalysis, if available. static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, ArrayRef<BasicBlock *> Preds, BranchInst *BI, - AliasAnalysis *AA, bool HasLoopExit) { + bool HasLoopExit) { // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB. SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end()); for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) { @@ -474,17 +473,20 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, /// BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, - const char *Suffix, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, - bool PreserveLCSSA) { + const char *Suffix, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { + // Do not attempt to split that which cannot be split. + if (!BB->canSplitPredecessors()) + return nullptr; + // For the landingpads we need to act a bit differently. // Delegate this work to the SplitLandingPadPredecessors. if (BB->isLandingPad()) { SmallVector<BasicBlock*, 2> NewBBs; std::string NewName = std::string(Suffix) + ".split-lp"; - SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), - NewBBs, AA, DT, LI, PreserveLCSSA); + SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs, DT, + LI, PreserveLCSSA); return NewBBs[0]; } @@ -523,7 +525,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, HasLoopExit); // Update the PHI nodes in BB with the values coming from NewBB. - UpdatePHINodes(BB, NewBB, Preds, BI, AA, HasLoopExit); + UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit); return NewBB; } @@ -544,8 +546,8 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef<BasicBlock *> Preds, const char *Suffix1, const char *Suffix2, SmallVectorImpl<BasicBlock *> &NewBBs, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA) { + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!"); // Create a new basic block for OrigBB's predecessors listed in Preds. Insert @@ -574,7 +576,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB1. 
- UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, AA, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit); // Move the remaining edges from OrigBB to point to NewBB2. SmallVector<BasicBlock*, 8> NewBB2Preds; @@ -611,7 +613,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, PreserveLCSSA, HasLoopExit); // Update the PHI nodes in OrigBB with the values coming from NewBB2. - UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, AA, HasLoopExit); + UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit); } LandingPadInst *LPad = OrigBB->getLandingPadInst(); @@ -624,11 +626,17 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, Clone2->setName(Twine("lpad") + Suffix2); NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2); - // Create a PHI node for the two cloned landingpad instructions. - PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad); - PN->addIncoming(Clone1, NewBB1); - PN->addIncoming(Clone2, NewBB2); - LPad->replaceAllUsesWith(PN); + // Create a PHI node for the two cloned landingpad instructions only + // if the original landingpad instruction has some uses. + if (!LPad->use_empty()) { + assert(!LPad->getType()->isTokenTy() && + "Split cannot be applied if LPad is token type. Otherwise an " + "invalid PHINode of token type would be created."); + PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad); + PN->addIncoming(Clone1, NewBB1); + PN->addIncoming(Clone2, NewBB2); + LPad->replaceAllUsesWith(PN); + } LPad->eraseFromParent(); } else { // There is no second clone. Just replace the landing pad with the first @@ -661,7 +669,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, // return instruction. V = BCI->getOperand(0); NewBC = BCI->clone(); - Pred->getInstList().insert(NewRet, NewBC); + Pred->getInstList().insert(NewRet->getIterator(), NewBC); *i = NewBC; } if (PHINode *PN = dyn_cast<PHINode>(V)) { @@ -707,7 +715,7 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond, MDNode *BranchWeights, DominatorTree *DT) { BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); TerminatorInst *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); @@ -757,7 +765,7 @@ void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore, TerminatorInst **ElseTerm, MDNode *BranchWeights) { BasicBlock *Head = SplitBefore->getParent(); - BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); TerminatorInst *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 7e83c9e..9582599 100644 --- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -101,10 +101,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, continue; // Otherwise a new PHI is needed. Create one and populate it. - PHINode *NewPN = - PHINode::Create(PN->getType(), Preds.size(), "split", - SplitBB->isLandingPad() ? 
- SplitBB->begin() : SplitBB->getTerminator()); + PHINode *NewPN = PHINode::Create( + PN->getType(), Preds.size(), "split", + SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator()); for (unsigned i = 0, e = Preds.size(); i != e; ++i) NewPN->addIncoming(V, Preds[i]); @@ -141,9 +140,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); - // Splitting the critical edge to a landing pad block is non-trivial. Don't do + // Splitting the critical edge to a pad block is non-trivial. Don't do // it in this generic function. - if (DestBB->isLandingPad()) return nullptr; + if (DestBB->isEHPad()) return nullptr; // Create a new basic block, linking it into the CFG. BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), @@ -157,7 +156,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // Insert the block into the function... right after the block TI lives in. Function &F = *TIBB->getParent(); - Function::iterator FBBI = TIBB; + Function::iterator FBBI = TIBB->getIterator(); F.getBasicBlockList().insert(++FBBI, NewBB); // If there are any PHI nodes in DestBB, we need to update them so that they @@ -197,7 +196,6 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } // If we have nothing to update, just return. - auto *AA = Options.AA; auto *DT = Options.DT; auto *LI = Options.LI; if (!DT && !LI) @@ -319,10 +317,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, LoopPreds.push_back(P); } if (!LoopPreds.empty()) { - assert(!DestBB->isLandingPad() && - "We don't split edges to landing pads!"); + assert(!DestBB->isEHPad() && "We don't split edges to EH pads!"); BasicBlock *NewExitBB = SplitBlockPredecessors( - DestBB, LoopPreds, "split", AA, DT, LI, Options.PreserveLCSSA); + DestBB, LoopPreds, "split", DT, LI, Options.PreserveLCSSA); if (Options.PreserveLCSSA) createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB); } diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 8aa7b2a..64b44a6 100644 --- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -21,7 +22,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/Analysis/TargetLibraryInfo.h" using namespace llvm; @@ -55,32 +55,6 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, return CI; } -/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the -/// specified pointer. Ptr is required to be some pointer type, MaxLen must -/// be of size_t type, and the return value has 'intptr_t' type. 
-Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, - const DataLayout &DL, const TargetLibraryInfo *TLI) { - if (!TLI->has(LibFunc::strnlen)) - return nullptr; - - Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeSet AS[2]; - AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); - Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs); - - LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrNLen = - M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), AS), - DL.getIntPtrType(Context), B.getInt8PtrTy(), - DL.getIntPtrType(Context), nullptr); - CallInst *CI = B.CreateCall(StrNLen, {CastToCStr(Ptr, B), MaxLen}, "strnlen"); - if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; -} - /// EmitStrChr - Emit a call to the strchr function to the builder, for the /// specified pointer and character. Ptr is required to be some pointer type, /// and the return value has 'i8*' type. diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index f2d5e07..42287d3 100644 --- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -74,17 +74,13 @@ namespace llvm { // insertFastDiv - Substitutes the div/rem instruction with code that checks the // value of the operands and uses a shorter-faster div/rem instruction when // possible and the longer-slower div/rem instruction otherwise. -static bool insertFastDiv(Function &F, - Function::iterator &I, - BasicBlock::iterator &J, - IntegerType *BypassType, - bool UseDivOp, - bool UseSignedOp, +static bool insertFastDiv(Instruction *I, IntegerType *BypassType, + bool UseDivOp, bool UseSignedOp, DivCacheTy &PerBBDivCache) { + Function *F = I->getParent()->getParent(); // Get instruction operands - Instruction *Instr = J; - Value *Dividend = Instr->getOperand(0); - Value *Divisor = Instr->getOperand(1); + Value *Dividend = I->getOperand(0); + Value *Divisor = I->getOperand(1); if (isa<ConstantInt>(Divisor) || (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) { @@ -94,13 +90,12 @@ static bool insertFastDiv(Function &F, } // Basic Block is split before divide - BasicBlock *MainBB = I; - BasicBlock *SuccessorBB = I->splitBasicBlock(J); - ++I; //advance iterator I to successorBB + BasicBlock *MainBB = &*I->getParent(); + BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I); // Add new basic block for slow divide operation - BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "", - MainBB->getParent(), SuccessorBB); + BasicBlock *SlowBB = + BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB); SlowBB->moveBefore(SuccessorBB); IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin()); Value *SlowQuotientV; @@ -115,8 +110,8 @@ static bool insertFastDiv(Function &F, SlowBuilder.CreateBr(SuccessorBB); // Add new basic block for fast divide operation - BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "", - MainBB->getParent(), SuccessorBB); + BasicBlock *FastBB = + BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB); FastBB->moveBefore(SlowBB); IRBuilder<> FastBuilder(FastBB, FastBB->begin()); Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor, @@ -139,19 +134,19 @@ static bool insertFastDiv(Function 
&F, // Phi nodes for result of div and rem IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin()); - PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2); + PHINode *QuoPhi = SuccessorBuilder.CreatePHI(I->getType(), 2); QuoPhi->addIncoming(SlowQuotientV, SlowBB); QuoPhi->addIncoming(FastQuotientV, FastBB); - PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2); + PHINode *RemPhi = SuccessorBuilder.CreatePHI(I->getType(), 2); RemPhi->addIncoming(SlowRemainderV, SlowBB); RemPhi->addIncoming(FastRemainderV, FastBB); - // Replace Instr with appropriate phi node + // Replace I with appropriate phi node if (UseDivOp) - Instr->replaceAllUsesWith(QuoPhi); + I->replaceAllUsesWith(QuoPhi); else - Instr->replaceAllUsesWith(RemPhi); - Instr->eraseFromParent(); + I->replaceAllUsesWith(RemPhi); + I->eraseFromParent(); // Combine operands into a single value with OR for value testing below MainBB->getInstList().back().eraseFromParent(); @@ -168,9 +163,6 @@ static bool insertFastDiv(Function &F, Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV); MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB); - // point iterator J at first instruction of successorBB - J = I->begin(); - // Cache phi nodes to be used later in place of other instances // of div or rem with the same sign, dividend, and divisor DivOpInfo Key(UseSignedOp, Dividend, Divisor); @@ -179,57 +171,54 @@ static bool insertFastDiv(Function &F, return true; } -// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder if -// operands and operation are identical. Otherwise call insertFastDiv to perform -// the optimization and cache the resulting dividend and remainder. -static bool reuseOrInsertFastDiv(Function &F, - Function::iterator &I, - BasicBlock::iterator &J, - IntegerType *BypassType, - bool UseDivOp, - bool UseSignedOp, +// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder from +// the current BB if operands and operation are identical. Otherwise calls +// insertFastDiv to perform the optimization and caches the resulting dividend +// and remainder. +static bool reuseOrInsertFastDiv(Instruction *I, IntegerType *BypassType, + bool UseDivOp, bool UseSignedOp, DivCacheTy &PerBBDivCache) { // Get instruction operands - Instruction *Instr = J; - DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1)); + DivOpInfo Key(UseSignedOp, I->getOperand(0), I->getOperand(1)); DivCacheTy::iterator CacheI = PerBBDivCache.find(Key); if (CacheI == PerBBDivCache.end()) { // If previous instance does not exist, insert fast div - return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp, - PerBBDivCache); + return insertFastDiv(I, BypassType, UseDivOp, UseSignedOp, PerBBDivCache); } // Replace operation value with previously generated phi node DivPhiNodes &Value = CacheI->second; if (UseDivOp) { // Replace all uses of div instruction with quotient phi node - J->replaceAllUsesWith(Value.Quotient); + I->replaceAllUsesWith(Value.Quotient); } else { // Replace all uses of rem instruction with remainder phi node - J->replaceAllUsesWith(Value.Remainder); + I->replaceAllUsesWith(Value.Remainder); } - // Advance to next operation - ++J; - // Remove redundant operation - Instr->eraseFromParent(); + I->eraseFromParent(); return true; } -// bypassSlowDivision - This optimization identifies DIV instructions that can -// be profitably bypassed and carried out with a shorter, faster divide. 
-bool llvm::bypassSlowDivision(Function &F, - Function::iterator &I, - const DenseMap<unsigned int, unsigned int> &BypassWidths) { +// bypassSlowDivision - This optimization identifies DIV instructions in a BB +// that can be profitably bypassed and carried out with a shorter, faster +// divide. +bool llvm::bypassSlowDivision( + BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidths) { DivCacheTy DivCache; bool MadeChange = false; - for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) { + Instruction* Next = &*BB->begin(); + while (Next != nullptr) { + // We may add instructions immediately after I, but we want to skip over + // them. + Instruction* I = Next; + Next = Next->getNextNode(); // Get instruction details - unsigned Opcode = J->getOpcode(); + unsigned Opcode = I->getOpcode(); bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv; bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem; bool UseSignedOp = Opcode == Instruction::SDiv || @@ -240,11 +229,11 @@ bool llvm::bypassSlowDivision(Function &F, continue; // Skip division on vector types, only optimize integer instructions - if (!J->getType()->isIntegerTy()) + if (!I->getType()->isIntegerTy()) continue; // Get bitwidth of div/rem instruction - IntegerType *T = cast<IntegerType>(J->getType()); + IntegerType *T = cast<IntegerType>(I->getType()); unsigned int bitwidth = T->getBitWidth(); // Continue if bitwidth is not bypassed @@ -253,10 +242,9 @@ bool llvm::bypassSlowDivision(Function &F, continue; // Get type for div/rem instruction with bypass bitwidth - IntegerType *BT = IntegerType::get(J->getContext(), BI->second); + IntegerType *BT = IntegerType::get(I->getContext(), BI->second); - MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp, - UseSignedOp, DivCache); + MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache); } return MadeChange; diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp index cc4d6c6..6454afb 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -52,8 +52,8 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); - VMap[II] = NewInst; // Add instruction map to value. - + VMap[&*II] = NewInst; // Add instruction map to value. + hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) @@ -85,9 +85,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG - for (Function::const_arg_iterator I = OldFunc->arg_begin(), - E = OldFunc->arg_end(); I != E; ++I) - assert(VMap.count(I) && "No mapping from source argument specified!"); + for (const Argument &I : OldFunc->args()) + assert(VMap.count(&I) && "No mapping from source argument specified!"); #endif // Copy all attributes other than those stored in the AttributeSet. We need @@ -96,6 +95,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); + // Fix up the personality function that got copied over. + if (OldFunc->hasPersonalityFn()) + NewFunc->setPersonalityFn( + MapValue(OldFunc->getPersonalityFn(), VMap, + ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + AttributeSet OldAttrs = OldFunc->getAttributes(); // Clone any argument attributes that are present in the VMap. for (const Argument &OldArg : OldFunc->args()) @@ -136,7 +142,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, if (BB.hasAddressTaken()) { Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc), const_cast<BasicBlock*>(&BB)); - VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); } // Note return instructions for the caller. @@ -146,11 +152,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // Loop over all of the instructions in the function, fixing up operand // references as we go. This uses VMap to do all the hard work. - for (Function::iterator BB = cast<BasicBlock>(VMap[OldFunc->begin()]), - BE = NewFunc->end(); BB != BE; ++BB) + for (Function::iterator BB = + cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(), + BE = NewFunc->end(); + BB != BE; ++BB) // Loop over all instructions, fixing each one as we find it... - for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) - RemapInstruction(II, VMap, + for (Instruction &II : *BB) + RemapInstruction(&II, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); } @@ -187,11 +195,9 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, const DISubprogram *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder); if (!OldSubprogramMDNode) return; - // Ensure that OldFunc appears in the map. - // (if it's already there it must point to NewFunc anyway) - VMap[OldFunc] = NewFunc; auto *NewSubprogram = cast<DISubprogram>(MapMetadata(OldSubprogramMDNode, VMap)); + NewFunc->setSubprogram(NewSubprogram); for (auto *CU : Finder.compile_units()) { auto Subprograms = CU->getSubprograms(); @@ -222,10 +228,9 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, // The user might be deleting arguments to the function by specifying them in // the VMap. If so, we need to not add the arguments to the arg ty vector // - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - if (VMap.count(I) == 0) // Haven't mapped the argument to anything yet? - ArgTypes.push_back(I->getType()); + for (const Argument &I : F->args()) + if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet? + ArgTypes.push_back(I.getType()); // Create a new function type... FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(), @@ -236,11 +241,10 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, // Loop over the arguments, copying the names of the mapped arguments over... Function::arg_iterator DestI = NewF->arg_begin(); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) - if (VMap.count(I) == 0) { // Is this argument preserved? - DestI->setName(I->getName()); // Copy the name over... - VMap[I] = DestI++; // Add mapping to VMap + for (const Argument & I : F->args()) + if (VMap.count(&I) == 0) { // Is this argument preserved? + DestI->setName(I.getName()); // Copy the name over... 
+ VMap[&I] = &*DestI++; // Add mapping to VMap } if (ModuleLevelChanges) @@ -262,27 +266,14 @@ namespace { bool ModuleLevelChanges; const char *NameSuffix; ClonedCodeInfo *CodeInfo; - CloningDirector *Director; - ValueMapTypeRemapper *TypeMapper; - ValueMaterializer *Materializer; public: PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, - const char *nameSuffix, ClonedCodeInfo *codeInfo, - CloningDirector *Director) + const char *nameSuffix, ClonedCodeInfo *codeInfo) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix), - CodeInfo(codeInfo), Director(Director) { - // These are optional components. The Director may return null. - if (Director) { - TypeMapper = Director->getTypeRemapper(); - Materializer = Director->getValueMaterializer(); - } else { - TypeMapper = nullptr; - Materializer = nullptr; - } - } + CodeInfo(codeInfo) {} /// The specified block is found to be reachable, clone it and /// anything that it can reach. @@ -328,23 +319,6 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // loop doesn't include the terminator. for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end(); II != IE; ++II) { - // If the "Director" remaps the instruction, don't clone it. - if (Director) { - CloningDirector::CloningAction Action - = Director->handleInstruction(VMap, II, NewBB); - // If the cloning director says stop, we want to stop everything, not - // just break out of the loop (which would cause the terminator to be - // cloned). The cloning director is responsible for inserting a proper - // terminator into the new basic block in this case. - if (Action == CloningDirector::StopCloningBB) - return; - // If the cloning director says skip, continue to the next instruction. - // In this case, the cloning director is responsible for mapping the - // skipped instruction to some value that is defined in the new - // basic block. - if (Action == CloningDirector::SkipInstruction) - continue; - } Instruction *NewInst = II->clone(); @@ -352,8 +326,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // nodes for which we defer processing until we update the CFG. if (!isa<PHINode>(NewInst)) { RemapInstruction(NewInst, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); // If we can simplify this instruction to some other value, simply add // a mapping to that value rather than inserting a new instruction into @@ -365,7 +338,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (Value *MappedV = VMap.lookup(V)) V = MappedV; - VMap[II] = V; + VMap[&*II] = V; delete NewInst; continue; } @@ -373,9 +346,15 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); - VMap[II] = NewInst; // Add instruction map to value. + VMap[&*II] = NewInst; // Add instruction map to value. NewBB->getInstList().push_back(NewInst); hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); + + if (CodeInfo) + if (auto CS = ImmutableCallSite(&*II)) + if (CS.hasOperandBundles()) + CodeInfo->OperandBundleCallSites.push_back(NewInst); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) hasStaticAllocas = true; @@ -387,26 +366,6 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // Finally, clone over the terminator. 
const TerminatorInst *OldTI = BB->getTerminator(); bool TerminatorDone = false; - if (Director) { - CloningDirector::CloningAction Action - = Director->handleInstruction(VMap, OldTI, NewBB); - // If the cloning director says stop, we want to stop everything, not - // just break out of the loop (which would cause the terminator to be - // cloned). The cloning director is responsible for inserting a proper - // terminator into the new basic block in this case. - if (Action == CloningDirector::StopCloningBB) - return; - if (Action == CloningDirector::CloneSuccessors) { - // If the director says to skip with a terminate instruction, we still - // need to clone this block's successors. - const TerminatorInst *TI = NewBB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - ToClone.push_back(TI->getSuccessor(i)); - return; - } - assert(Action != CloningDirector::SkipInstruction && - "SkipInstruction is not valid for terminators."); - } if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) { if (BI->isConditional()) { // If the condition was a known constant in the callee... @@ -447,11 +406,16 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, NewInst->setName(OldTI->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); VMap[OldTI] = NewInst; // Add instruction map to value. - + + if (CodeInfo) + if (auto CS = ImmutableCallSite(OldTI)) + if (CS.hasOperandBundles()) + CodeInfo->OperandBundleCallSites.push_back(NewInst); + // Recursively clone any reachable successor blocks. const TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - ToClone.push_back(TI->getSuccessor(i)); + for (const BasicBlock *Succ : TI->successors()) + ToClone.push_back(Succ); } if (CodeInfo) { @@ -470,41 +434,34 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl<ReturnInst *> &Returns, - const char *NameSuffix, - ClonedCodeInfo *CodeInfo, - CloningDirector *Director) { + const char *NameSuffix, + ClonedCodeInfo *CodeInfo) { assert(NameSuffix && "NameSuffix cannot be null!"); ValueMapTypeRemapper *TypeMapper = nullptr; ValueMaterializer *Materializer = nullptr; - if (Director) { - TypeMapper = Director->getTypeRemapper(); - Materializer = Director->getValueMaterializer(); - } - #ifndef NDEBUG - // If the cloning starts at the begining of the function, verify that + // If the cloning starts at the beginning of the function, verify that // the function arguments are mapped. if (!StartingInst) - for (Function::const_arg_iterator II = OldFunc->arg_begin(), - E = OldFunc->arg_end(); II != E; ++II) - assert(VMap.count(II) && "No mapping from source argument specified!"); + for (const Argument &II : OldFunc->args()) + assert(VMap.count(&II) && "No mapping from source argument specified!"); #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, - NameSuffix, CodeInfo, Director); + NameSuffix, CodeInfo); const BasicBlock *StartingBB; if (StartingInst) StartingBB = StartingInst->getParent(); else { StartingBB = &OldFunc->getEntryBlock(); - StartingInst = StartingBB->begin(); + StartingInst = &StartingBB->front(); } // Clone the entry block, and anything recursively reachable from it. 
std::vector<const BasicBlock*> CloneWorklist; - PFC.CloneBlock(StartingBB, StartingInst, CloneWorklist); + PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist); while (!CloneWorklist.empty()) { const BasicBlock *BB = CloneWorklist.back(); CloneWorklist.pop_back(); @@ -517,9 +474,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // // Defer PHI resolution until rest of function is resolved. SmallVector<const PHINode*, 16> PHIToResolve; - for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end(); - BI != BE; ++BI) { - Value *V = VMap[BI]; + for (const BasicBlock &BI : *OldFunc) { + Value *V = VMap[&BI]; BasicBlock *NewBB = cast_or_null<BasicBlock>(V); if (!NewBB) continue; // Dead block. @@ -528,7 +484,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Handle PHI nodes specially, as we have to remove references to dead // blocks. - for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) { + for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) { // PHI nodes may have been remapped to non-PHI nodes by the caller or // during the cloning process. if (const PHINode *PN = dyn_cast<PHINode>(I)) { @@ -621,8 +577,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, while ((PN = dyn_cast<PHINode>(I++))) { Value *NV = UndefValue::get(PN->getType()); PN->replaceAllUsesWith(NV); - assert(VMap[OldI] == PN && "VMap mismatch"); - VMap[OldI] = NV; + assert(VMap[&*OldI] == PN && "VMap mismatch"); + VMap[&*OldI] = NV; PN->eraseFromParent(); ++OldI; } @@ -644,15 +600,15 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // and zap unconditional fall-through branches. This happens all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]); + Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator(); Function::iterator I = Begin; while (I != NewFunc->end()) { // Check if this block has become dead during inlining or other // simplifications. Note that the first block will appear dead, as it has // not yet been wired up properly. - if (I != Begin && (pred_begin(I) == pred_end(I) || - I->getSinglePredecessor() == I)) { - BasicBlock *DeadBB = I++; + if (I != Begin && (pred_begin(&*I) == pred_end(&*I) || + I->getSinglePredecessor() == &*I)) { + BasicBlock *DeadBB = &*I++; DeleteDeadBlock(DeadBB); continue; } @@ -662,7 +618,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // simplification required looking through PHI nodes, those are only // available after forming the full basic block. That may leave some here, // and we still want to prune the dead code as early as possible. - ConstantFoldTerminator(I); + ConstantFoldTerminator(&*I); BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator()); if (!BI || BI->isConditional()) { ++I; continue; } @@ -681,7 +637,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, BI->eraseFromParent(); // Make all PHI nodes that referred to Dest now refer to I as their source. - Dest->replaceAllUsesWith(I); + Dest->replaceAllUsesWith(&*I); // Move all the instructions in the succ to the pred. 
I->getInstList().splice(I->end(), Dest->getInstList()); @@ -695,7 +651,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Make a final pass over the basic blocks from the old function to gather // any return instructions which survived folding. We have to do this here // because we can iteratively remove and merge returns above. - for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]), + for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(), E = NewFunc->end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) @@ -717,9 +673,8 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall) { - CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(), VMap, - ModuleLevelChanges, Returns, NameSuffix, CodeInfo, - nullptr); + CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap, + ModuleLevelChanges, Returns, NameSuffix, CodeInfo); } /// \brief Remaps instructions in \p Blocks using the mapping in \p VMap. @@ -780,9 +735,10 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, } // Move them physically from the end of the block list. - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH); - F->getBasicBlockList().splice(Before, F->getBasicBlockList(), - NewLoop->getHeader(), F->end()); + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewPH); + F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), + NewLoop->getHeader()->getIterator(), F->end()); return NewLoop; } diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp index 61f1811..ab08335 100644 --- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -20,21 +20,28 @@ #include "llvm-c/Core.h" using namespace llvm; -/// CloneModule - Return an exact copy of the specified module. This is not as -/// easy as it might seem because we have to worry about making copies of global -/// variables and functions, and making their (initializers and references, -/// respectively) refer to the right globals. +/// This is not as easy as it might seem because we have to worry about making +/// copies of global variables and functions, and making their (initializers and +/// references, respectively) refer to the right globals. /// -Module *llvm::CloneModule(const Module *M) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M) { // Create the value map that maps things from the old module over to the new // module. ValueToValueMapTy VMap; return CloneModule(M, VMap); } -Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { +std::unique_ptr<Module> llvm::CloneModule(const Module *M, + ValueToValueMapTy &VMap) { + return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; }); +} + +std::unique_ptr<Module> llvm::CloneModule( + const Module *M, ValueToValueMapTy &VMap, + std::function<bool(const GlobalValue *)> ShouldCloneDefinition) { // First off, we need to create the new module. 
- Module *New = new Module(M->getModuleIdentifier(), M->getContext()); + std::unique_ptr<Module> New = + llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext()); New->setDataLayout(M->getDataLayout()); New->setTargetTriple(M->getTargetTriple()); New->setModuleInlineAsm(M->getModuleInlineAsm()); @@ -52,26 +59,48 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { (GlobalVariable*) nullptr, I->getThreadLocalMode(), I->getType()->getAddressSpace()); - GV->copyAttributesFrom(I); - VMap[I] = GV; + GV->copyAttributesFrom(&*I); + VMap[&*I] = GV; } // Loop over the functions in the module, making external functions as before for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { Function *NF = - Function::Create(cast<FunctionType>(I->getType()->getElementType()), - I->getLinkage(), I->getName(), New); - NF->copyAttributesFrom(I); - VMap[I] = NF; + Function::Create(cast<FunctionType>(I->getType()->getElementType()), + I->getLinkage(), I->getName(), New.get()); + NF->copyAttributesFrom(&*I); + VMap[&*I] = NF; } // Loop over the aliases in the module for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - auto *PTy = cast<PointerType>(I->getType()); - auto *GA = GlobalAlias::create(PTy, I->getLinkage(), I->getName(), New); - GA->copyAttributesFrom(I); - VMap[I] = GA; + if (!ShouldCloneDefinition(&*I)) { + // An alias cannot act as an external reference, so we need to create + // either a function or a global variable depending on the value type. + // FIXME: Once pointee types are gone we can probably pick one or the + // other. + GlobalValue *GV; + if (I->getValueType()->isFunctionTy()) + GV = Function::Create(cast<FunctionType>(I->getValueType()), + GlobalValue::ExternalLinkage, I->getName(), + New.get()); + else + GV = new GlobalVariable( + *New, I->getValueType(), false, GlobalValue::ExternalLinkage, + (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr, + I->getThreadLocalMode(), I->getType()->getAddressSpace()); + VMap[&*I] = GV; + // We do not copy attributes (mainly because copying between different + // kinds of globals is forbidden), but this is generally not required for + // correctness. + continue; + } + auto *GA = GlobalAlias::create(I->getValueType(), + I->getType()->getPointerAddressSpace(), + I->getLinkage(), I->getName(), New.get()); + GA->copyAttributesFrom(&*I); + VMap[&*I] = GA; } // Now that all of the things that global variable initializer can refer to @@ -80,7 +109,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); I != E; ++I) { - GlobalVariable *GV = cast<GlobalVariable>(VMap[I]); + GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]); + if (!ShouldCloneDefinition(&*I)) { + // Skip after setting the correct linkage for an external reference. + GV->setLinkage(GlobalValue::ExternalLinkage); + continue; + } if (I->hasInitializer()) GV->setInitializer(MapValue(I->getInitializer(), VMap)); } @@ -88,18 +122,22 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // Similarly, copy over function bodies now... // for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { - Function *F = cast<Function>(VMap[I]); + Function *F = cast<Function>(VMap[&*I]); + if (!ShouldCloneDefinition(&*I)) { + // Skip after setting the correct linkage for an external reference. 
+ F->setLinkage(GlobalValue::ExternalLinkage); + continue; + } if (!I->isDeclaration()) { Function::arg_iterator DestI = F->arg_begin(); for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end(); ++J) { DestI->setName(J->getName()); - VMap[J] = DestI++; + VMap[&*J] = &*DestI++; } SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns); - + CloneFunctionInto(F, &*I, VMap, /*ModuleLevelChanges=*/true, Returns); } if (I->hasPersonalityFn()) @@ -109,7 +147,10 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { // And aliases for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { - GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); + // We already dealt with undefined aliases above. + if (!ShouldCloneDefinition(&*I)) + continue; + GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]); if (const Constant *C = I->getAliasee()) GA->setAliasee(MapValue(C, VMap)); } @@ -129,7 +170,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { extern "C" { LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) { - return wrap(CloneModule(unwrap(M))); + return wrap(CloneModule(unwrap(M)).release()); } } diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp index ab89b41..823696d 100644 --- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -51,7 +51,7 @@ AggregateArgsOpt("aggregate-extracted-args", cl::Hidden, /// \brief Test whether a block is valid for extraction. static bool isBlockValidForExtraction(const BasicBlock &BB) { // Landing pads must be in the function where they were inserted for cleanup. - if (BB.isLandingPad()) + if (BB.isEHPad()) return false; // Don't hoist code containing allocas, invokes, or vastarts. @@ -175,7 +175,7 @@ void CodeExtractor::findInputsOutputs(ValueSet &Inputs, for (User *U : II->users()) if (!definedInRegion(Blocks, U)) { - Outputs.insert(II); + Outputs.insert(&*II); break; } } @@ -211,7 +211,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // containing PHI nodes merging values from outside of the region, and a // second that contains all of the code for the block and merges back any // incoming values from inside of the region. - BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI(); + BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI()->getIterator(); BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs, Header->getName()+".ce"); @@ -246,7 +246,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Create a new PHI node in the new region, which has an incoming value // from OldPred of PN. PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion, - PN->getName()+".ce", NewBB->begin()); + PN->getName() + ".ce", &NewBB->front()); NewPN->addIncoming(PN, OldPred); // Loop over all of the incoming value in PN, moving them to NewPN if they @@ -266,7 +266,8 @@ void CodeExtractor::splitReturnBlocks() { for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { - BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret"); + BasicBlock *New = + (*I)->splitBasicBlock(RI->getIterator(), (*I)->getName() + ".ret"); if (DT) { // Old dominates New. New node dominates all other nodes dominated // by Old. 
@@ -365,10 +366,10 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); TerminatorInst *TI = newFunction->begin()->getTerminator(); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructTy, AI, Idx, "gep_" + inputs[i]->getName(), TI); + StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI); RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI); } else - RewriteVal = AI++; + RewriteVal = &*AI++; std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end()); for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); @@ -440,8 +441,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, StructValues.push_back(*i); } else { AllocaInst *alloca = - new AllocaInst((*i)->getType(), nullptr, (*i)->getName()+".loc", - codeReplacer->getParent()->begin()->begin()); + new AllocaInst((*i)->getType(), nullptr, (*i)->getName() + ".loc", + &codeReplacer->getParent()->front().front()); ReloadOutputs.push_back(alloca); params.push_back(alloca); } @@ -457,9 +458,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // Allocate a struct at the beginning of this function StructArgTy = StructType::get(newFunction->getContext(), ArgTypes); - Struct = - new AllocaInst(StructArgTy, nullptr, "structArg", - codeReplacer->getParent()->begin()->begin()); + Struct = new AllocaInst(StructArgTy, nullptr, "structArg", + &codeReplacer->getParent()->front().front()); params.push_back(Struct); for (unsigned i = 0, e = inputs.size(); i != e; ++i) { @@ -566,8 +566,12 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, bool DominatesDef = true; - if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) { - DefBlock = Invoke->getNormalDest(); + BasicBlock *NormalDest = nullptr; + if (auto *Invoke = dyn_cast<InvokeInst>(outputs[out])) + NormalDest = Invoke->getNormalDest(); + + if (NormalDest) { + DefBlock = NormalDest; // Make sure we are looking at the original successor block, not // at a newly inserted exit block, which won't be in the dominator @@ -606,11 +610,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut+out); GetElementPtrInst *GEP = GetElementPtrInst::Create( - StructArgTy, OAI, Idx, "gep_" + outputs[out]->getName(), + StructArgTy, &*OAI, Idx, "gep_" + outputs[out]->getName(), NTRet); new StoreInst(outputs[out], GEP, NTRet); } else { - new StoreInst(outputs[out], OAI, NTRet); + new StoreInst(outputs[out], &*OAI, NTRet); } } // Advance output iterator even if we don't emit a store diff --git a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp index dc95089..b56ff68 100644 --- a/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -50,7 +50,7 @@ void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) { GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", GCL->getThreadLocalMode()); - GCL->getParent()->getGlobalList().insert(GCL, NGV); + GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV); NGV->takeName(GCL); // Nuke the old list, replacing any uses with the new one. 
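// [Illustrative sketch added by the editor; not part of the upstream diff.]
// Much of the mechanical churn in the hunks above and below (DemoteRegToStack,
// CodeExtractor, CloneFunction, BasicBlockUtils) comes from the ilist change
// that made the conversion between Instruction* and BasicBlock::iterator
// explicit. The hypothetical helper below, using only calls that appear in
// these hunks (getIterator(), &front(), splitBasicBlock), shows the new idiom.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Split BB immediately before SplitPt and return the tail block. Older code
// could pass the Instruction* straight to splitBasicBlock; it now converts.
static BasicBlock *splitBefore(BasicBlock *BB, Instruction *SplitPt) {
  // Pointer -> iterator is now an explicit getIterator() call...
  BasicBlock::iterator It = SplitPt->getIterator();
  // ...and the reverse direction is an explicit &*It (e.g. &BB->front()).
  return BB->splitBasicBlock(It, BB->getName() + ".split");
}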
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp index 003da58..75a1dde 100644 --- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -35,8 +35,8 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, I.getName()+".reg2mem", AllocaPoint); } else { Function *F = I.getParent()->getParent(); - Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", - F->getEntryBlock().begin()); + Slot = new AllocaInst(I.getType(), nullptr, I.getName() + ".reg2mem", + &F->getEntryBlock().front()); } // We cannot demote invoke instructions to the stack if their normal edge @@ -89,16 +89,15 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, // AFTER the terminator instruction. BasicBlock::iterator InsertPt; if (!isa<TerminatorInst>(I)) { - InsertPt = &I; - ++InsertPt; - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + InsertPt = ++I.getIterator(); + for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt) /* empty */; // Don't insert before PHI nodes or landingpad instrs. } else { InvokeInst &II = cast<InvokeInst>(I); InsertPt = II.getNormalDest()->getFirstInsertionPt(); } - new StoreInst(&I, Slot, InsertPt); + new StoreInst(&I, Slot, &*InsertPt); return Slot; } @@ -118,8 +117,8 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { P->getName()+".reg2mem", AllocaPoint); } else { Function *F = P->getParent()->getParent(); - Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", - F->getEntryBlock().begin()); + Slot = new AllocaInst(P->getType(), nullptr, P->getName() + ".reg2mem", + &F->getEntryBlock().front()); } // Iterate over each operand inserting a store in each predecessor. @@ -133,12 +132,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { } // Insert a load in place of the PHI and replace all uses. - BasicBlock::iterator InsertPt = P; + BasicBlock::iterator InsertPt = P->getIterator(); - for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt) /* empty */; // Don't insert before PHI nodes or landingpad instrs. - Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt); + Value *V = new LoadInst(Slot, P->getName() + ".reload", &*InsertPt); P->replaceAllUsesWith(V); // Delete PHI. diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 4eb3e3d..492ae9f 100644 --- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -28,12 +28,11 @@ class FlattenCFGOpt { AliasAnalysis *AA; /// \brief Use parallel-and or parallel-or to generate conditions for /// conditional branches. - bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P = nullptr); + bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder); /// \brief If \param BB is the merge block of an if-region, attempt to merge /// the if-region with an adjacent if-region upstream if two if-regions /// contain identical instructions. - bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = nullptr); + bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder); /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which /// are from two if-regions whose entry blocks are \p Head1 and \p /// Head2. 
\returns true if \p Block1 and \p Block2 contain identical @@ -122,8 +121,7 @@ public: /// its predecessor. In Case 2, \param BB (BB3) only has conditional branches /// as its predecessors. /// -bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P) { +bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { PHINode *PHI = dyn_cast<PHINode>(BB->begin()); if (PHI) return false; // For simplicity, avoid cases containing PHI nodes. @@ -177,8 +175,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, // Instructions in the internal condition blocks should be safe // to hoist up. - for (BasicBlock::iterator BI = Pred->begin(), BE = PBI; BI != BE;) { - Instruction *CI = BI++; + for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator(); + BI != BE;) { + Instruction *CI = &*BI++; if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI)) return false; } @@ -315,7 +314,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, BasicBlock *Block1, BasicBlock *Block2) { TerminatorInst *PTI2 = Head2->getTerminator(); - Instruction *PBI2 = Head2->begin(); + Instruction *PBI2 = &Head2->front(); bool eq1 = (Block1 == Head1); bool eq2 = (Block2 == Head2); @@ -327,9 +326,9 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, // Check whether instructions in Block1 and Block2 are identical // and do not alias with instructions in Head2. BasicBlock::iterator iter1 = Block1->begin(); - BasicBlock::iterator end1 = Block1->getTerminator(); + BasicBlock::iterator end1 = Block1->getTerminator()->getIterator(); BasicBlock::iterator iter2 = Block2->begin(); - BasicBlock::iterator end2 = Block2->getTerminator(); + BasicBlock::iterator end2 = Block2->getTerminator()->getIterator(); while (1) { if (iter1 == end1) { @@ -338,7 +337,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, break; } - if (!iter1->isIdenticalTo(iter2)) + if (!iter1->isIdenticalTo(&*iter2)) return false; // Illegal to remove instructions with side effects except @@ -356,10 +355,10 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, return false; if (iter1->mayWriteToMemory()) { - for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { + for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) { // Check alias with Head2. - if (!AA || AA->alias(iter1, BI)) + if (!AA || AA->alias(&*iter1, &*BI)) return false; } } @@ -386,8 +385,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, /// if (a || b) /// statement; /// -bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, - Pass *P) { +bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { BasicBlock *IfTrue2, *IfFalse2; Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2); Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2); @@ -413,7 +411,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, return false; TerminatorInst *PTI2 = SecondEntryBlock->getTerminator(); - Instruction *PBI2 = SecondEntryBlock->begin(); + Instruction *PBI2 = &SecondEntryBlock->front(); if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1, IfTrue2)) @@ -425,8 +423,8 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, // Check whether \param SecondEntryBlock has side-effect and is safe to // speculate. 
- for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { - Instruction *CI = BI; + for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) { + Instruction *CI = &*BI; if (isa<PHINode>(CI) || CI->mayHaveSideEffects() || !isSafeToSpeculativelyExecute(CI)) return false; diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp index 44b7d25..3893a75 100644 --- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -49,6 +49,10 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) { static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, SmallPtrSetImpl<const PHINode *> &PhiUsers) { + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (GV->isExternallyInitialized()) + GS.StoredType = GlobalStatus::StoredOnce; + for (const Use &U : V->uses()) { const User *UR = U.getUser(); if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) { diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp index d2d60d7..1457411 100644 --- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -13,14 +13,15 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Attributes.h" @@ -41,6 +42,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CommandLine.h" #include <algorithm> + using namespace llvm; static cl::opt<bool> @@ -54,17 +56,17 @@ PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining", cl::desc("Convert align attributes to assumptions during inlining.")); bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, - bool InsertLifetime) { - return InlineFunction(CallSite(CI), IFI, InsertLifetime); + AAResults *CalleeAAR, bool InsertLifetime) { + return InlineFunction(CallSite(CI), IFI, CalleeAAR, InsertLifetime); } bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, - bool InsertLifetime) { - return InlineFunction(CallSite(II), IFI, InsertLifetime); + AAResults *CalleeAAR, bool InsertLifetime) { + return InlineFunction(CallSite(II), IFI, CalleeAAR, InsertLifetime); } namespace { - /// A class for recording information about inlining through an invoke. - class InvokeInliningInfo { + /// A class for recording information about inlining a landing pad. + class LandingPadInliningInfo { BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind. BasicBlock *InnerResumeDest; ///< Destination for the callee's resume. LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke. 
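// [Illustrative sketch added by the editor; not part of the upstream diff.]
// The InlineFunction.cpp hunks that follow refactor, but keep, the core
// rewrite of HandleCallsInBlockInlinedThroughInvoke: a possibly-throwing call
// inlined through an invoke must itself become an invoke whose unwind edge
// reaches the caller's unwind destination. A minimal sketch of that rewrite,
// condensed from the code in the hunks below (the hypothetical helper name
// and the omission of operand bundles are the editor's simplifications):
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void convertCallToInvoke(CallInst *CI, BasicBlock *UnwindEdge) {
  BasicBlock *BB = CI->getParent();
  // Split so that everything after the call becomes the "normal" successor.
  BasicBlock *Split =
      BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
  // Drop the unconditional branch that splitBasicBlock inserted.
  BB->getInstList().pop_back();
  SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
  // Re-issue the call as an invoke with an explicit unwind edge.
  InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge,
                                      Args, CI->getName(), BB);
  II->setDebugLoc(CI->getDebugLoc());
  II->setCallingConv(CI->getCallingConv());
  II->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(II);
  // The original call is now the first instruction of Split; delete it.
  Split->getInstList().pop_front();
}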
@@ -72,7 +74,7 @@ namespace { SmallVector<Value*, 8> UnwindDestPHIValues; public: - InvokeInliningInfo(InvokeInst *II) + LandingPadInliningInfo(InvokeInst *II) : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr), CallerLPad(nullptr), InnerEHValuesPHI(nullptr) { // If there are PHI nodes in the unwind destination block, we need to keep @@ -121,14 +123,14 @@ namespace { } } }; -} +} // anonymous namespace /// Get or create a target for the branch from ResumeInsts. -BasicBlock *InvokeInliningInfo::getInnerResumeDest() { +BasicBlock *LandingPadInliningInfo::getInnerResumeDest() { if (InnerResumeDest) return InnerResumeDest; // Split the landing pad. - BasicBlock::iterator SplitPoint = CallerLPad; ++SplitPoint; + BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator(); InnerResumeDest = OuterResumeDest->splitBasicBlock(SplitPoint, OuterResumeDest->getName() + ".body"); @@ -137,7 +139,7 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { const unsigned PHICapacity = 2; // Create corresponding new PHIs for all the PHIs in the outer landing pad. - BasicBlock::iterator InsertPoint = InnerResumeDest->begin(); + Instruction *InsertPoint = &InnerResumeDest->front(); BasicBlock::iterator I = OuterResumeDest->begin(); for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { PHINode *OuterPHI = cast<PHINode>(I); @@ -162,8 +164,8 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { /// When the landing pad block has only one predecessor, this is a simple /// branch. When there is more than one predecessor, we need to split the /// landing pad block after the landingpad instruction and jump to there. -void InvokeInliningInfo::forwardResume(ResumeInst *RI, - SmallPtrSetImpl<LandingPadInst*> &InlinedLPads) { +void LandingPadInliningInfo::forwardResume( + ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) { BasicBlock *Dest = getInnerResumeDest(); BasicBlock *Src = RI->getParent(); @@ -182,33 +184,39 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI, /// This function analyze BB to see if there are any calls, and if so, /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI /// nodes in that block with the values specified in InvokeDestPHIValues. -static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, - InvokeInliningInfo &Invoke) { +static BasicBlock * +HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) { for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { - Instruction *I = BBI++; + Instruction *I = &*BBI++; // We only need to check for function calls: inlined invoke // instructions require no special handling. CallInst *CI = dyn_cast<CallInst>(I); - // If this call cannot unwind, don't convert it to an invoke. - // Inline asm calls cannot throw. if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue())) continue; // Convert this function call into an invoke instruction. First, split the // basic block. - BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc"); + BasicBlock *Split = + BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc"); // Delete the unconditional branch inserted by splitBasicBlock BB->getInstList().pop_back(); // Create the new invoke instruction. 
- ImmutableCallSite CS(CI); - SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); - InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, - Invoke.getOuterResumeDest(), - InvokeArgs, CI->getName(), BB); + SmallVector<Value*, 8> InvokeArgs(CI->arg_begin(), CI->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + + CI->getOperandBundlesAsDefs(OpBundles); + + // Note: we're round tripping operand bundles through memory here, and that + // can potentially be avoided with a cleverer API design that we do not have + // as of this time. + + InvokeInst *II = + InvokeInst::Create(CI->getCalledValue(), Split, UnwindEdge, InvokeArgs, + OpBundles, CI->getName(), BB); II->setDebugLoc(CI->getDebugLoc()); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); @@ -219,12 +227,9 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // Delete the original call Split->getInstList().pop_front(); - - // Update any PHI nodes in the exceptional block to indicate that there is - // now a new entry in them. - Invoke.addIncomingPHIValuesFor(BB); - return; + return BB; } + return nullptr; } /// If we inlined an invoke site, we need to convert calls @@ -233,8 +238,8 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, /// II is the invoke instruction being inlined. FirstNewBlock is the first /// block of the inlined code (the last block is the end of the function), /// and InlineCodeInfo is information about the code that got inlined. -static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, - ClonedCodeInfo &InlinedCodeInfo) { +static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock, + ClonedCodeInfo &InlinedCodeInfo) { BasicBlock *InvokeDest = II->getUnwindDest(); Function *Caller = FirstNewBlock->getParent(); @@ -242,11 +247,12 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, // The inlined code is currently at the end of the function, scan from the // start of the inlined code to its end, checking for stuff we need to // rewrite. - InvokeInliningInfo Invoke(II); + LandingPadInliningInfo Invoke(II); // Get all of the inlined landing pad instructions. SmallPtrSet<LandingPadInst*, 16> InlinedLPads; - for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I) + for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end(); + I != E; ++I) if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) InlinedLPads.insert(II->getLandingPadInst()); @@ -262,9 +268,14 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InlinedLPad->setCleanup(true); } - for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ + for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); + BB != E; ++BB) { if (InlinedCodeInfo.ContainsCalls) - HandleCallsInBlockInlinedThroughInvoke(BB, Invoke); + if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke( + &*BB, Invoke.getOuterResumeDest())) + // Update any PHI nodes in the exceptional block to indicate that there + // is now a new entry in them. + Invoke.addIncomingPHIValuesFor(NewBB); // Forward any resumes that are remaining here. 
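The rewrite above is the recurring call-to-invoke recipe. Condensed into a hypothetical helper, assuming the caller has already split the call's block (NormalDest being the split-off half, InsertAtEnd the now-branchless front half) and deletes CI afterwards, exactly as the hunk does:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static InvokeInst *convertToInvoke(CallInst *CI, BasicBlock *InsertAtEnd,
                                   BasicBlock *NormalDest,
                                   BasicBlock *UnwindEdge) {
  SmallVector<Value *, 8> Args(CI->arg_begin(), CI->arg_end());
  SmallVector<OperandBundleDef, 1> Bundles;
  CI->getOperandBundlesAsDefs(Bundles); // bundles must survive the rewrite
  InvokeInst *II =
      InvokeInst::Create(CI->getCalledValue(), NormalDest, UnwindEdge, Args,
                         Bundles, CI->getName(), InsertAtEnd);
  // Carry over everything observable about the original call.
  II->setDebugLoc(CI->getDebugLoc());
  II->setCallingConv(CI->getCallingConv());
  II->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(II);
  return II;
}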
if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) @@ -278,6 +289,99 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, InvokeDest->removePredecessor(II->getParent()); } +/// If we inlined an invoke site, we need to convert calls +/// in the body of the inlined function into invokes. +/// +/// II is the invoke instruction being inlined. FirstNewBlock is the first +/// block of the inlined code (the last block is the end of the function), +/// and InlineCodeInfo is information about the code that got inlined. +static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, + ClonedCodeInfo &InlinedCodeInfo) { + BasicBlock *UnwindDest = II->getUnwindDest(); + Function *Caller = FirstNewBlock->getParent(); + + assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!"); + + // If there are PHI nodes in the unwind destination block, we need to keep + // track of which values came into them from the invoke before removing the + // edge from this block. + SmallVector<Value *, 8> UnwindDestPHIValues; + llvm::BasicBlock *InvokeBB = II->getParent(); + for (Instruction &I : *UnwindDest) { + // Save the value to use for this edge. + PHINode *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); + } + + // Add incoming-PHI values to the unwind destination block for the given basic + // block, using the values for the original invoke's source block. + auto UpdatePHINodes = [&](BasicBlock *Src) { + BasicBlock::iterator I = UnwindDest->begin(); + for (Value *V : UnwindDestPHIValues) { + PHINode *PHI = cast<PHINode>(I); + PHI->addIncoming(V, Src); + ++I; + } + }; + + // This connects all the instructions which 'unwind to caller' to the invoke + // destination. + for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end(); + BB != E; ++BB) { + if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) { + if (CRI->unwindsToCaller()) { + CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI); + CRI->eraseFromParent(); + UpdatePHINodes(&*BB); + } + } + + Instruction *I = BB->getFirstNonPHI(); + if (!I->isEHPad()) + continue; + + Instruction *Replacement = nullptr; + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { + if (CatchSwitch->unwindsToCaller()) { + auto *NewCatchSwitch = CatchSwitchInst::Create( + CatchSwitch->getParentPad(), UnwindDest, + CatchSwitch->getNumHandlers(), CatchSwitch->getName(), + CatchSwitch); + for (BasicBlock *PadBB : CatchSwitch->handlers()) + NewCatchSwitch->addHandler(PadBB); + Replacement = NewCatchSwitch; + } + } else if (!isa<FuncletPadInst>(I)) { + llvm_unreachable("unexpected EHPad!"); + } + + if (Replacement) { + Replacement->takeName(I); + I->replaceAllUsesWith(Replacement); + I->eraseFromParent(); + UpdatePHINodes(&*BB); + } + } + + if (InlinedCodeInfo.ContainsCalls) + for (Function::iterator BB = FirstNewBlock->getIterator(), + E = Caller->end(); + BB != E; ++BB) + if (BasicBlock *NewBB = + HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest)) + // Update any PHI nodes in the exceptional block to indicate that there + // is now a new entry in them. + UpdatePHINodes(NewBB); + + // Now that everything is happy, we have one final detail. The PHI nodes in + // the exception destination block still have entries due to the original + // invoke instruction. Eliminate these entries (which might even delete the + // PHI node) now. 
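The UpdatePHINodes lambda above captures a pattern worth spelling out: whatever the original invoke fed into the unwind destination's PHIs must be replayed for every predecessor the rewrite introduces. A sketch as a hypothetical helper, under the same API assumptions:

#include "llvm/IR/Instructions.h"
using namespace llvm;

static void replayPHIValues(BasicBlock *UnwindDest, BasicBlock *InvokeBB,
                            BasicBlock *NewPred) {
  for (Instruction &I : *UnwindDest) {
    auto *PHI = dyn_cast<PHINode>(&I);
    if (!PHI)
      break; // PHIs sit contiguously at the front of the block
    // The patched code caches these incoming values up front, because the
    // original invoke edge is removed before the final replay happens.
    PHI->addIncoming(PHI->getIncomingValueForBlock(InvokeBB), NewPred);
  }
}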
+ UnwindDest->removePredecessor(InvokeBB); +} + /// When inlining a function that contains noalias scope metadata, /// this metadata needs to be cloned so that the inlined blocks /// have different "unique scopes" at every call site. Were this not done, then @@ -395,17 +499,16 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { /// parameters with noalias metadata specifying the new scope, and tag all /// non-derived loads, stores and memory intrinsics with the new alias scopes. static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, - const DataLayout &DL, AliasAnalysis *AA) { + const DataLayout &DL, AAResults *CalleeAAR) { if (!EnableNoAliasConversion) return; const Function *CalledFunc = CS.getCalledFunction(); SmallVector<const Argument *, 4> NoAliasArgs; - for (Function::const_arg_iterator I = CalledFunc->arg_begin(), - E = CalledFunc->arg_end(); I != E; ++I) { - if (I->hasNoAliasAttr() && !I->hasNUses(0)) - NoAliasArgs.push_back(I); + for (const Argument &I : CalledFunc->args()) { + if (I.hasNoAliasAttr() && !I.hasNUses(0)) + NoAliasArgs.push_back(&I); } if (NoAliasArgs.empty()) @@ -480,10 +583,10 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, continue; IsFuncCall = true; - if (AA) { - AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(ICS); - if (MRB == AliasAnalysis::OnlyAccessesArgumentPointees || - MRB == AliasAnalysis::OnlyReadsArgumentPointees) + if (CalleeAAR) { + FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(ICS); + if (MRB == FMRB_OnlyAccessesArgumentPointees || + MRB == FMRB_OnlyReadsArgumentPointees) IsArgMemOnlyCall = true; } @@ -518,7 +621,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) { SmallVector<Value *, 4> Objects; GetUnderlyingObjects(const_cast<Value*>(PtrArgs[i]), - Objects, DL, /* MaxLookup = */ 0); + Objects, DL, /* LI = */ nullptr); for (Value *O : Objects) ObjSet.insert(O); @@ -646,7 +749,7 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) { // caller, then don't bother inserting the assumption. Value *Arg = CS.getArgument(I->getArgNo()); if (getKnownAlignment(Arg, DL, CS.getInstruction(), - &IFI.ACT->getAssumptionCache(*CalledFunc), + &IFI.ACT->getAssumptionCache(*CS.getCaller()), &DT) >= Align) continue; @@ -731,7 +834,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, BasicBlock *InsertBlock, InlineFunctionInfo &IFI) { Type *AggTy = cast<PointerType>(Src->getType())->getElementType(); - IRBuilder<> Builder(InsertBlock->begin()); + IRBuilder<> Builder(InsertBlock, InsertBlock->begin()); Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy)); @@ -851,9 +954,8 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, // Starting from the top, rebuild the nodes to point to the new inlined-at // location (then rebuilding the rest of the chain behind it) and update the // map of already-constructed inlined-at nodes. - for (auto I = InlinedAtLocations.rbegin(), E = InlinedAtLocations.rend(); - I != E; ++I) { - const DILocation *MD = *I; + for (const DILocation *MD : make_range(InlinedAtLocations.rbegin(), + InlinedAtLocations.rend())) { Last = IANodes[MD] = DILocation::getDistinct( Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); } @@ -917,7 +1019,7 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, /// exists in the instruction stream.
Similarly this will inline a recursive /// function by one level. bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, - bool InsertLifetime) { + AAResults *CalleeAAR, bool InsertLifetime) { Instruction *TheCall = CS.getInstruction(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); @@ -930,6 +1032,22 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CalledFunc->isDeclaration() || // call, or call to a vararg function! CalledFunc->getFunctionType()->isVarArg()) return false; + // The inliner does not know how to inline through calls with operand bundles + // in general ... + if (CS.hasOperandBundles()) { + for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) { + uint32_t Tag = CS.getOperandBundleAt(i).getTagID(); + // ... but it knows how to inline through "deopt" operand bundles ... + if (Tag == LLVMContext::OB_deopt) + continue; + // ... and "funclet" operand bundles. + if (Tag == LLVMContext::OB_funclet) + continue; + + return false; + } + } + // If the call to the callee cannot throw, set the 'nounwind' flag on any // calls that we inline. bool MarkNoUnwind = CS.doesNotThrow(); @@ -950,13 +1068,17 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Get the personality function from the callee if it contains a landing pad. Constant *CalledPersonality = - CalledFunc->hasPersonalityFn() ? CalledFunc->getPersonalityFn() : nullptr; + CalledFunc->hasPersonalityFn() + ? CalledFunc->getPersonalityFn()->stripPointerCasts() + : nullptr; // Find the personality function used by the landing pads of the caller. If it // exists, then check to see that it matches the personality function used in // the callee. Constant *CallerPersonality = - Caller->hasPersonalityFn() ? Caller->getPersonalityFn() : nullptr; + Caller->hasPersonalityFn() + ? Caller->getPersonalityFn()->stripPointerCasts() + : nullptr; if (CalledPersonality) { if (!CallerPersonality) Caller->setPersonalityFn(CalledPersonality); @@ -968,9 +1090,46 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, return false; } + // We need to figure out which funclet the callsite was in so that we may + // properly nest the callee. + Instruction *CallSiteEHPad = nullptr; + if (CallerPersonality) { + EHPersonality Personality = classifyEHPersonality(CallerPersonality); + if (isFuncletEHPersonality(Personality)) { + Optional<OperandBundleUse> ParentFunclet = + CS.getOperandBundle(LLVMContext::OB_funclet); + if (ParentFunclet) + CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front()); + + // OK, the inlining site is legal. What about the target function? + + if (CallSiteEHPad) { + if (Personality == EHPersonality::MSVC_CXX) { + // The MSVC personality cannot tolerate catches getting inlined into + // cleanup funclets. + if (isa<CleanupPadInst>(CallSiteEHPad)) { + // Ok, the call site is within a cleanuppad. Let's check the callee + // for catchpads. + for (const BasicBlock &CalledBB : *CalledFunc) { + if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI())) + return false; + } + } + } else if (isAsynchronousEHPersonality(Personality)) { + // SEH is even less tolerant, there may not be any sort of exceptional + // funclet in the callee. + for (const BasicBlock &CalledBB : *CalledFunc) { + if (CalledBB.isEHPad()) + return false; + } + } + } + } + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. 
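The operand-bundle gate near the top of InlineFunction reduces to a small predicate; a sketch, assuming (as the hunk states) that only "deopt" and "funclet" bundles are inlinable:

#include "llvm/IR/CallSite.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdint>
using namespace llvm;

static bool hasOnlyInlinableBundles(CallSite CS) {
  for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
    uint32_t Tag = CS.getOperandBundleAt(i).getTagID();
    if (Tag != LLVMContext::OB_deopt && Tag != LLVMContext::OB_funclet)
      return false; // unknown bundle kind: refuse to inline
  }
  return true;
}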
- Function::iterator LastBlock = &Caller->back(); + Function::iterator LastBlock = --Caller->end(); // Make sure to capture all of the return instructions from the cloned // function. @@ -1007,7 +1166,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI)); } - VMap[I] = ActualArg; + VMap[&*I] = ActualArg; } // Add alignment assumptions if necessary. We do this before the inlined @@ -1029,7 +1188,61 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Inject byval arguments initialization. for (std::pair<Value*, Value*> &Init : ByValInit) HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), - FirstNewBlock, IFI); + &*FirstNewBlock, IFI); + + Optional<OperandBundleUse> ParentDeopt = + CS.getOperandBundle(LLVMContext::OB_deopt); + if (ParentDeopt) { + SmallVector<OperandBundleDef, 2> OpDefs; + + for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) { + Instruction *I = dyn_cast_or_null<Instruction>(VH); + if (!I) continue; // instruction was DCE'd or RAUW'ed to undef + + OpDefs.clear(); + + CallSite ICS(I); + OpDefs.reserve(ICS.getNumOperandBundles()); + + for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) { + auto ChildOB = ICS.getOperandBundleAt(i); + if (ChildOB.getTagID() != LLVMContext::OB_deopt) { + // If the inlined call has other operand bundles, let them be + OpDefs.emplace_back(ChildOB); + continue; + } + + // It may be useful to separate this logic (of handling operand + // bundles) out to a separate "policy" component if this gets crowded. + // Prepend the parent's deoptimization continuation to the newly + // inlined call's deoptimization continuation. + std::vector<Value *> MergedDeoptArgs; + MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() + + ChildOB.Inputs.size()); + + MergedDeoptArgs.insert(MergedDeoptArgs.end(), + ParentDeopt->Inputs.begin(), + ParentDeopt->Inputs.end()); + MergedDeoptArgs.insert(MergedDeoptArgs.end(), ChildOB.Inputs.begin(), + ChildOB.Inputs.end()); + + OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs)); + } + + Instruction *NewI = nullptr; + if (isa<CallInst>(I)) + NewI = CallInst::Create(cast<CallInst>(I), OpDefs, I); + else + NewI = InvokeInst::Create(cast<InvokeInst>(I), OpDefs, I); + + // Note: the RAUW does the appropriate fixup in VMap, so we need to do + // this even if the call returns void. + I->replaceAllUsesWith(NewI); + + VH = nullptr; + I->eraseFromParent(); + } + } // Update the callgraph if requested. if (IFI.CG) @@ -1042,7 +1255,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CloneAliasScopeMetadata(CS, VMap); // Add noalias metadata if necessary. - AddAliasScopeMetadata(CS, VMap, DL, IFI.AA); + AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -1085,9 +1298,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Transfer all of the allocas over in a block. Using splice means // that the instructions aren't removed from the symbol table, then // reinserted. - Caller->getEntryBlock().getInstList().splice(InsertPoint, - FirstNewBlock->getInstList(), - AI, I); + Caller->getEntryBlock().getInstList().splice( + InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); } // Move any dbg.declares describing the allocas into the entry basic block. 
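The deopt-merging loop above boils down to prepending the parent call site's deoptimization continuation to the child's, leaving all other bundles untouched. Just that step, as a hypothetical helper:

#include "llvm/IR/InstrTypes.h"
#include <vector>
using namespace llvm;

static OperandBundleDef mergeDeoptStates(const OperandBundleUse &Parent,
                                         const OperandBundleUse &Child) {
  std::vector<Value *> Merged;
  Merged.reserve(Parent.Inputs.size() + Child.Inputs.size());
  // Parent state first: the caller's continuation is outermost.
  Merged.insert(Merged.end(), Parent.Inputs.begin(), Parent.Inputs.end());
  Merged.insert(Merged.end(), Child.Inputs.begin(), Child.Inputs.end());
  return OperandBundleDef("deopt", std::move(Merged));
}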
DIBuilder DIB(*Caller->getParent()); @@ -1137,7 +1349,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Leave lifetime markers for the static alloca's, scoping them to the // function we just inlined. if (InsertLifetime && !IFI.StaticAllocas.empty()) { - IRBuilder<> builder(FirstNewBlock->begin()); + IRBuilder<> builder(&FirstNewBlock->front()); for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) { AllocaInst *AI = IFI.StaticAllocas[ai]; @@ -1189,7 +1401,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore); // Insert the llvm.stacksave. - CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin()) + CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin()) .CreateCall(StackSave, {}, "savedstack"); // Insert a call to llvm.stackrestore before any return instructions in the @@ -1203,10 +1415,74 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } + // Update the lexical scopes of the new funclets and callsites. + // Anything that had 'none' as its parent is now nested inside the callsite's + // EHPad. + + if (CallSiteEHPad) { + for (Function::iterator BB = FirstNewBlock->getIterator(), + E = Caller->end(); + BB != E; ++BB) { + // Add bundle operands to any top-level call sites. + SmallVector<OperandBundleDef, 1> OpBundles; + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) { + Instruction *I = &*BBI++; + CallSite CS(I); + if (!CS) + continue; + + // Skip call sites which are nounwind intrinsics. + auto *CalledFn = + dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow()) + continue; + + // Skip call sites which already have a "funclet" bundle. + if (CS.getOperandBundle(LLVMContext::OB_funclet)) + continue; + + CS.getOperandBundlesAsDefs(OpBundles); + OpBundles.emplace_back("funclet", CallSiteEHPad); + + Instruction *NewInst; + if (CS.isCall()) + NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I); + else + NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I); + NewInst->setDebugLoc(I->getDebugLoc()); + NewInst->takeName(I); + I->replaceAllUsesWith(NewInst); + I->eraseFromParent(); + + OpBundles.clear(); + } + + Instruction *I = BB->getFirstNonPHI(); + if (!I->isEHPad()) + continue; + + if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) { + if (isa<ConstantTokenNone>(CatchSwitch->getParentPad())) + CatchSwitch->setParentPad(CallSiteEHPad); + } else { + auto *FPI = cast<FuncletPadInst>(I); + if (isa<ConstantTokenNone>(FPI->getParentPad())) + FPI->setParentPad(CallSiteEHPad); + } + } + } + // If we are inlining for an invoke instruction, we must make sure to rewrite // any call instructions into invoke instructions. - if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) - HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo); + if (auto *II = dyn_cast<InvokeInst>(TheCall)) { + BasicBlock *UnwindDest = II->getUnwindDest(); + Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); + if (isa<LandingPadInst>(FirstNonPHI)) { + HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } else { + HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo); + } + } // Handle any inlined musttail call sites. 
In order for a new call site to be // musttail, the source of the clone and the inlined call site must have been @@ -1250,7 +1526,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // the calling basic block. if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) { // Move all of the instructions right before the call. - OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(), + OrigBB->getInstList().splice(TheCall->getIterator(), + FirstNewBlock->getInstList(), FirstNewBlock->begin(), FirstNewBlock->end()); // Remove the cloned basic block. Caller->getBasicBlockList().pop_back(); @@ -1297,15 +1574,16 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more // symmetric to the call case. - AfterCallBB = OrigBB->splitBasicBlock(CreatedBranchToNormalDest, - CalledFunc->getName()+".exit"); + AfterCallBB = + OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(), + CalledFunc->getName() + ".exit"); } else { // It's a call // If this is a call instruction, we need to split the basic block that // the call lives in. // - AfterCallBB = OrigBB->splitBasicBlock(TheCall, - CalledFunc->getName()+".exit"); + AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(), + CalledFunc->getName() + ".exit"); } // Change the branch that used to go to AfterCallBB to branch to the first @@ -1314,14 +1592,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, TerminatorInst *Br = OrigBB->getTerminator(); assert(Br && Br->getOpcode() == Instruction::Br && "splitBasicBlock broken!"); - Br->setOperand(0, FirstNewBlock); - + Br->setOperand(0, &*FirstNewBlock); // Now that the function is correct, make it a little bit nicer. In // particular, move the basic blocks inserted from the end of the function // into the space made by splitting the source basic block. - Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(), - FirstNewBlock, Caller->end()); + Caller->getBasicBlockList().splice(AfterCallBB->getIterator(), + Caller->getBasicBlockList(), FirstNewBlock, + Caller->end()); // Handle all of the return instructions that we just cloned in, and eliminate // any users of the original call/invoke instruction. @@ -1333,7 +1611,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // possible incoming values. if (!TheCall->use_empty()) { PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(), - AfterCallBB->begin()); + &AfterCallBB->front()); // Anything that used the result of the function call should now use the // PHI node as their operand. TheCall->replaceAllUsesWith(PHI); @@ -1350,7 +1628,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } - // Add a branch to the merge points and remove return instructions. DebugLoc Loc; for (unsigned i = 0, e = Returns.size(); i != e; ++i) { @@ -1413,7 +1690,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Splice the code entry block into calling block, right before the // unconditional branch. CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes - OrigBB->getInstList().splice(Br, CalleeEntry->getInstList()); + OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList()); // Remove the unconditional branch. 
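The single-return fast path above is just an instruction splice; a reduced sketch of the same getInstList()/splice dance (hypothetical helper, assuming exactly one cloned block):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void mergeSingleInlinedBlock(Instruction *TheCall,
                                    BasicBlock *Inlined) {
  BasicBlock *OrigBB = TheCall->getParent();
  // Move every inlined instruction to just before the (soon-dead) call;
  // splicing keeps the instructions registered in the symbol table.
  OrigBB->getInstList().splice(TheCall->getIterator(),
                               Inlined->getInstList(), Inlined->begin(),
                               Inlined->end());
  // The now-empty clone can be dropped from the caller.
  Inlined->eraseFromParent();
}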
OrigBB->getInstList().erase(Br); diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 30edf3b..5687afa 100644 --- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -380,14 +380,10 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { IRBuilder<> Builder(Rem); - Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); - - unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - - if (RemTyBitWidth != 32 && RemTyBitWidth != 64) - llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported"); + assert((Rem->getType()->getIntegerBitWidth() == 32 || + Rem->getType()->getIntegerBitWidth() == 64) && + "Div of bitwidth other than 32 or 64 not supported"); // First prepare the sign if it's a signed remainder if (Rem->getOpcode() == Instruction::SRem) { @@ -401,7 +397,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { // If we didn't actually generate an urem instruction, we're done // This happens for example if the input were constant. In this case the // Builder insertion point was unchanged - if (Rem == Builder.GetInsertPoint()) + if (Rem == Builder.GetInsertPoint().getNodePtrUnchecked()) return true; BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -440,14 +436,10 @@ bool llvm::expandDivision(BinaryOperator *Div) { IRBuilder<> Builder(Div); - Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); - - unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - - if (DivTyBitWidth != 32 && DivTyBitWidth != 64) - llvm_unreachable("Div of bitwidth other than 32 or 64 not supported"); + assert(!Div->getType()->isVectorTy() && "Div over vectors not supported"); + assert((Div->getType()->getIntegerBitWidth() == 32 || + Div->getType()->getIntegerBitWidth() == 64) && + "Div of bitwidth other than 32 or 64 not supported"); // First prepare the sign if it's a signed division if (Div->getOpcode() == Instruction::SDiv) { @@ -461,7 +453,7 @@ bool llvm::expandDivision(BinaryOperator *Div) { // If we didn't actually generate an udiv instruction, we're done // This happens for example if the input were constant. 
In this case the // Builder insertion point was unchanged - if (Div == Builder.GetInsertPoint()) + if (Div == Builder.GetInsertPoint().getNodePtrUnchecked()) return true; BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint()); @@ -492,15 +484,14 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) { "Trying to expand remainder from a non-remainder function"); Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!RemTy->isVectorTy() && "Div over vectors not supported"); unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - if (RemTyBitWidth > 32) - llvm_unreachable("Div of bitwidth greater than 32 not supported"); + assert(RemTyBitWidth <= 32 && + "Div of bitwidth greater than 32 not supported"); - if (RemTyBitWidth == 32) + if (RemTyBitWidth == 32) return expandRemainder(Rem); // If bitwidth smaller than 32 extend inputs, extend output and proceed @@ -542,15 +533,13 @@ bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) { "Trying to expand remainder from a non-remainder function"); Type *RemTy = Rem->getType(); - if (RemTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!RemTy->isVectorTy() && "Div over vectors not supported"); unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); - if (RemTyBitWidth > 64) - llvm_unreachable("Div of bitwidth greater than 64 not supported"); + assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported"); - if (RemTyBitWidth == 64) + if (RemTyBitWidth == 64) return expandRemainder(Rem); // If bitwidth smaller than 64 extend inputs, extend output and proceed @@ -593,13 +582,11 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) { "Trying to expand division from a non-division function"); Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!DivTy->isVectorTy() && "Div over vectors not supported"); unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - if (DivTyBitWidth > 32) - llvm_unreachable("Div of bitwidth greater than 32 not supported"); + assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported"); if (DivTyBitWidth == 32) return expandDivision(Div); @@ -643,13 +630,12 @@ bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) { "Trying to expand division from a non-division function"); Type *DivTy = Div->getType(); - if (DivTy->isVectorTy()) - llvm_unreachable("Div over vectors not supported"); + assert(!DivTy->isVectorTy() && "Div over vectors not supported"); unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); - if (DivTyBitWidth > 64) - llvm_unreachable("Div of bitwidth greater than 64 not supported"); + assert(DivTyBitWidth <= 64 && + "Div of bitwidth greater than 64 not supported"); if (DivTyBitWidth == 64) return expandDivision(Div); diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp index 9d40b69..b4b2e14 100644 --- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -31,8 +31,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -64,6 +66,13 @@ static bool processInstruction(Loop 
&L, Instruction &Inst, DominatorTree &DT, PredIteratorCache &PredCache, LoopInfo *LI) { SmallVector<Use *, 16> UsesToRewrite; + // Tokens cannot be used in PHI nodes, so we skip over them. + // We can run into tokens which are live out of a loop with catchswitch + // instructions in Windows EH if the catchswitch has one catchpad which + // is inside the loop and another which is not. + if (Inst.getType()->isTokenTy()) + return false; + BasicBlock *InstBB = Inst.getParent(); for (Use &U : Inst.uses()) { @@ -84,9 +93,8 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Invoke instructions are special in that their result value is not available // along their unwind edge. The code below tests to see whether DomBB - // dominates - // the value, so adjust DomBB to the normal destination block, which is - // effectively where the value is first usable. + // dominates the value, so adjust DomBB to the normal destination block, + // which is effectively where the value is first usable. BasicBlock *DomBB = Inst.getParent(); if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst)) DomBB = Inv->getNormalDest(); @@ -101,10 +109,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. - for (SmallVectorImpl<BasicBlock *>::const_iterator BBI = ExitBlocks.begin(), - BBE = ExitBlocks.end(); - BBI != BBE; ++BBI) { - BasicBlock *ExitBB = *BBI; + for (BasicBlock *ExitBB : ExitBlocks) { if (!DT.dominates(DomNode, DT.getNode(ExitBB))) continue; @@ -113,7 +118,7 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, continue; PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB), - Inst.getName() + ".lcssa", ExitBB->begin()); + Inst.getName() + ".lcssa", &ExitBB->front()); // Add inputs from inside the loop for this PHI. for (BasicBlock *Pred : PredCache.get(ExitBB)) { @@ -148,26 +153,26 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, // Rewrite all uses outside the loop in terms of the new PHIs we just // inserted. - for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) { + for (Use *UseToRewrite : UsesToRewrite) { // If this use is in an exit block, rewrite to use the newly inserted PHI. // This is required for correctness because SSAUpdate doesn't handle uses in // the same block. It assumes the PHI we inserted is at the end of the // block. - Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser()); + Instruction *User = cast<Instruction>(UseToRewrite->getUser()); BasicBlock *UserBB = User->getParent(); if (PHINode *PN = dyn_cast<PHINode>(User)) - UserBB = PN->getIncomingBlock(*UsesToRewrite[i]); + UserBB = PN->getIncomingBlock(*UseToRewrite); if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { // Tell the VHs that the uses changed. This updates SCEV's caches. - if (UsesToRewrite[i]->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin()); - UsesToRewrite[i]->set(UserBB->begin()); + if (UseToRewrite->get()->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); + UseToRewrite->set(&UserBB->front()); continue; } // Otherwise, do full PHI insertion. 
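Each exit block dominated by the def receives one PHI, which is what keeps the form loop-closed. That insertion in isolation, as a hypothetical helper using plain predecessor iteration in place of PredIteratorCache:

#include "llvm/IR/CFG.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *insertLCSSAPhi(Instruction &Inst, BasicBlock *ExitBB) {
  PHINode *PN = PHINode::Create(Inst.getType(), /*NumReservedValues=*/4,
                                Inst.getName() + ".lcssa", &ExitBB->front());
  // Every in-loop predecessor contributes the same value; the PHI exists
  // only so that uses outside the loop see a loop-closed def.
  for (BasicBlock *Pred : predecessors(ExitBB))
    PN->addIncoming(&Inst, Pred);
  return PN;
}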
- SSAUpdate.RewriteUse(*UsesToRewrite[i]); + SSAUpdate.RewriteUse(*UseToRewrite); } // Post process PHI instructions that were inserted into another disjoint loop @@ -190,10 +195,9 @@ static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT, } // Remove PHI nodes that did not have any uses rewritten. - for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) { - if (AddedPHIs[i]->use_empty()) - AddedPHIs[i]->eraseFromParent(); - } + for (PHINode *PN : AddedPHIs) + if (PN->use_empty()) + PN->eraseFromParent(); return true; } @@ -205,8 +209,8 @@ blockDominatesAnExit(BasicBlock *BB, DominatorTree &DT, const SmallVectorImpl<BasicBlock *> &ExitBlocks) { DomTreeNode *DomNode = DT.getNode(BB); - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (DT.dominates(DomNode, DT.getNode(ExitBlocks[i]))) + for (BasicBlock *ExitBB : ExitBlocks) + if (DT.dominates(DomNode, DT.getNode(ExitBB))) return true; return false; @@ -227,25 +231,22 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, // Look at all the instructions in the loop, checking to see if they have uses // outside the loop. If so, rewrite those uses. - for (Loop::block_iterator BBI = L.block_begin(), BBE = L.block_end(); - BBI != BBE; ++BBI) { - BasicBlock *BB = *BBI; - + for (BasicBlock *BB : L.blocks()) { // For large loops, avoid use-scanning by using dominance information: In // particular, if a block does not dominate any of the loop exits, then none // of the values defined in the block could be used outside the loop. if (!blockDominatesAnExit(BB, DT, ExitBlocks)) continue; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Reject two common cases fast: instructions with no uses (like stores) // and instructions with one use that is in the same block as this. - if (I->use_empty() || - (I->hasOneUse() && I->user_back()->getParent() == BB && - !isa<PHINode>(I->user_back()))) + if (I.use_empty() || + (I.hasOneUse() && I.user_back()->getParent() == BB && + !isa<PHINode>(I.user_back()))) continue; - Changed |= processInstruction(L, *I, DT, ExitBlocks, PredCache, LI); + Changed |= processInstruction(L, I, DT, ExitBlocks, PredCache, LI); } } @@ -266,8 +267,8 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, bool Changed = false; // Recurse depth-first through inner loops. 
- for (Loop::iterator I = L.begin(), E = L.end(); I != E; ++I) - Changed |= formLCSSARecursively(**I, DT, LI, SE); + for (Loop *SubLoop : L.getSubLoops()) + Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE); Changed |= formLCSSA(L, DT, LI, SE); return Changed; @@ -296,8 +297,10 @@ struct LCSSA : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); } }; } @@ -306,6 +309,8 @@ char LCSSA::ID = 0; INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) Pass *llvm::createLCSSAPass() { return new LCSSA(); } @@ -317,7 +322,8 @@ bool LCSSA::runOnFunction(Function &F) { bool Changed = false; LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + SE = SEWP ? &SEWP->getSE() : nullptr; // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp index ba8af47..d2793e5 100644 --- a/contrib/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp @@ -17,11 +17,13 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -188,9 +190,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, BasicBlock *BB = SI->getParent(); // Remove entries from PHI nodes which we no longer branch to... - for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) { + for (BasicBlock *Succ : SI->successors()) { // Found case matching a constant operand? - BasicBlock *Succ = SI->getSuccessor(i); if (Succ == TheOnlyDest) TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest else @@ -230,6 +231,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, SIDef->getValue().getZExtValue())); } + // Update make.implicit metadata to the newly-created conditional branch. + MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit); + if (MakeImplicitMD) + NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD); + // Delete the old switch. 
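One subtle point in the switch-to-branch fold above: metadata such as !make.implicit has to be carried over by hand, or the implicit-null-check information vanishes along with the switch. In isolation:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

static void transferMakeImplicit(SwitchInst *SI, BranchInst *NewBr) {
  // Only the tag the hunk preserves; other kinds need their own policy.
  if (MDNode *MD = SI->getMetadata(LLVMContext::MD_make_implicit))
    NewBr->setMetadata(LLVMContext::MD_make_implicit, MD);
}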
SI->eraseFromParent(); return true; @@ -283,8 +289,9 @@ bool llvm::isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI) { if (!I->use_empty() || isa<TerminatorInst>(I)) return false; - // We don't want the landingpad instruction removed by anything this general. - if (isa<LandingPadInst>(I)) + // We don't want the landingpad-like instructions removed by anything this + // general. + if (I->isEHPad()) return false; // We don't want debug info removed by anything this general, unless @@ -414,6 +421,49 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, return false; } +static bool +simplifyAndDCEInstruction(Instruction *I, + SmallSetVector<Instruction *, 16> &WorkList, + const DataLayout &DL, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI, TLI)) + WorkList.insert(OpI); + } + + I->eraseFromParent(); + + return true; + } + + if (Value *SimpleV = SimplifyInstruction(I, DL)) { + // Add the users to the worklist. CAREFUL: an instruction can use itself, + // in the case of a phi node. + for (User *U : I->users()) + if (U != I) + WorkList.insert(cast<Instruction>(U)); + + // Replace the instruction with its simplified value. + I->replaceAllUsesWith(SimpleV); + I->eraseFromParent(); + return true; + } + return false; +} + /// SimplifyInstructionsInBlock - Scan the specified basic block and try to /// simplify any instructions in it and recursively delete dead instructions. /// @@ -422,30 +472,34 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool MadeChange = false; + const DataLayout &DL = BB->getModule()->getDataLayout(); #ifndef NDEBUG // In debug builds, ensure that the terminator of the block is never replaced // or deleted by these simplifications. The idea of simplification is that it // cannot introduce new instructions, and there is no way to replace the // terminator of a block without introducing a new instruction. - AssertingVH<Instruction> TerminatorVH(--BB->end()); + AssertingVH<Instruction> TerminatorVH(&BB->back()); #endif - for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) { + SmallSetVector<Instruction *, 16> WorkList; + // Iterate over the original function, only adding insts to the worklist + // if they actually need to be revisited. This avoids having to pre-init + // the worklist with the entire function's worth of instructions. + for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end()); BI != E;) { assert(!BI->isTerminator()); - Instruction *Inst = BI++; + Instruction *I = &*BI; + ++BI; - WeakVH BIHandle(BI); - if (recursivelySimplifyInstruction(Inst, TLI)) { - MadeChange = true; - if (BIHandle != BI) - BI = BB->begin(); - continue; - } + // We're visiting this instruction now, so make sure it's not in the + // worklist from an earlier visit. 
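The reworked SimplifyInstructionsInBlock below drives everything off a SmallSetVector, giving LIFO visitation while deduplicating re-queued instructions. A stripped-down driver; the Process callback is hypothetical, standing in for simplifyAndDCEInstruction:

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

template <typename FnTy>
static bool drainWorkList(SmallSetVector<Instruction *, 16> &WorkList,
                          FnTy Process) {
  bool Changed = false;
  while (!WorkList.empty()) {
    // pop_back_val visits the most recently added instruction first; the
    // set half of the structure keeps anything from being queued twice.
    Instruction *I = WorkList.pop_back_val();
    Changed |= Process(I, WorkList);
  }
  return Changed;
}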
+ if (!WorkList.count(I)) + MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); + } - MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); - if (BIHandle != BI) - BI = BB->begin(); + while (!WorkList.empty()) { + Instruction *I = WorkList.pop_back_val(); + MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI); } return MadeChange; } @@ -808,7 +862,8 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { // Copy over any phi, debug or lifetime instruction. BB->getTerminator()->eraseFromParent(); - Succ->getInstList().splice(Succ->getFirstNonPHI(), BB->getInstList()); + Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(), + BB->getInstList()); } else { while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) { // We explicitly check for such uses in CanPropagatePredecessorsForPHIs. @@ -997,9 +1052,31 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0)); - if (ExtendedArg) - Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, DIExpr, + if (ExtendedArg) { + // We're now only describing a subset of the variable. The piece we're + // describing will always be smaller than the variable size, because + // VariableSize == Size of Alloca described by DDI. Since SI stores + // to the alloca described by DDI, if its first operand is an extend, + // we're guaranteed that before extension, the value was narrower than + // the size of the alloca, hence the size of the described variable. + SmallVector<uint64_t, 3> NewDIExpr; + unsigned PieceOffset = 0; + // If this already is a bit piece, we drop the bit piece from the expression + // and record the offset. + if (DIExpr->isBitPiece()) { + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); + PieceOffset = DIExpr->getBitPieceOffset(); + } else { + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); + } + NewDIExpr.push_back(dwarf::DW_OP_bit_piece); + NewDIExpr.push_back(PieceOffset); // Offset + const DataLayout &DL = DDI->getModule()->getDataLayout(); + NewDIExpr.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size + Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, + Builder.createExpression(NewDIExpr), DDI->getDebugLoc(), SI); + } else Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, DDI->getDebugLoc(), SI); @@ -1017,8 +1094,13 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (LdStHasDebugValue(DIVar, LI)) return true; - Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, DIExpr, - DDI->getDebugLoc(), LI); + // We are now tracking the loaded value instead of the address. In the + // future if multi-location support is added to the IR, it might be + // preferable to keep tracking both the loaded value and the original + // address in case the alloca cannot be elided.
+ Instruction *DbgValue = Builder.insertDbgValueIntrinsic( + LI, 0, DIVar, DIExpr, DDI->getDebugLoc(), (Instruction *)nullptr); + DbgValue->insertAfter(LI); return true; } @@ -1034,8 +1116,8 @@ bool llvm::LowerDbgDeclare(Function &F) { DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); SmallVector<DbgDeclareInst *, 4> Dbgs; for (auto &FI : F) - for (BasicBlock::iterator BI : FI) - if (auto DDI = dyn_cast<DbgDeclareInst>(BI)) + for (Instruction &BI : FI) + if (auto DDI = dyn_cast<DbgDeclareInst>(&BI)) Dbgs.push_back(DDI); if (Dbgs.empty()) @@ -1060,9 +1142,13 @@ bool llvm::LowerDbgDeclare(Function &F) { // This is a call by-value or some other instruction that // takes a pointer to the variable. Insert a *value* // intrinsic that describes the alloca. + SmallVector<uint64_t, 1> NewDIExpr; + auto *DIExpr = DDI->getExpression(); + NewDIExpr.push_back(dwarf::DW_OP_deref); + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(), - DDI->getExpression(), DDI->getDebugLoc(), - CI); + DIB.createExpression(NewDIExpr), + DDI->getDebugLoc(), CI); } DDI->eraseFromParent(); } @@ -1082,9 +1168,10 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return nullptr; } -bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, - DIBuilder &Builder, bool Deref) { - DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); +bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, + Instruction *InsertBefore, DIBuilder &Builder, + bool Deref, int Offset) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(Address); if (!DDI) return false; DebugLoc Loc = DDI->getDebugLoc(); @@ -1092,29 +1179,40 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); - if (Deref) { + if (Deref || Offset) { // Create a copy of the original DIDescriptor for user variable, prepending // "deref" operation to a list of address elements, as new llvm.dbg.declare // will take a value storing address of the memory for variable, not // alloca itself. SmallVector<uint64_t, 4> NewDIExpr; - NewDIExpr.push_back(dwarf::DW_OP_deref); + if (Deref) + NewDIExpr.push_back(dwarf::DW_OP_deref); + if (Offset > 0) { + NewDIExpr.push_back(dwarf::DW_OP_plus); + NewDIExpr.push_back(Offset); + } else if (Offset < 0) { + NewDIExpr.push_back(dwarf::DW_OP_minus); + NewDIExpr.push_back(-Offset); + } if (DIExpr) NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); DIExpr = Builder.createExpression(NewDIExpr); } - // Insert llvm.dbg.declare in the same basic block as the original alloca, - // and remove old llvm.dbg.declare. - BasicBlock *BB = AI->getParent(); - Builder.insertDeclare(NewAllocaAddress, DIVar, DIExpr, Loc, BB); + // Insert llvm.dbg.declare immediately after the original alloca, and remove + // old llvm.dbg.declare. + Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore); DDI->eraseFromParent(); return true; } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. 
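replaceDbgDeclare's expression surgery above prepends DW_OP_deref and an optional signed offset to whatever expression the old declare carried. The same construction as a standalone sketch (hypothetical helper):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/Dwarf.h"
using namespace llvm;

static DIExpression *prependDerefAndOffset(DIBuilder &Builder,
                                           DIExpression *Expr, int Offset) {
  SmallVector<uint64_t, 4> Ops;
  Ops.push_back(dwarf::DW_OP_deref); // the new declare holds an address
  if (Offset > 0) {
    Ops.push_back(dwarf::DW_OP_plus);
    Ops.push_back(Offset);
  } else if (Offset < 0) {
    Ops.push_back(dwarf::DW_OP_minus);
    Ops.push_back(-Offset);
  }
  if (Expr) // keep whatever expression was already attached
    Ops.append(Expr->elements_begin(), Expr->elements_end());
  return Builder.createExpression(Ops);
}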
-static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder, bool Deref, int Offset) { + return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder, + Deref, Offset); +} + +void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) { BasicBlock *BB = I->getParent(); // Loop over all of the successors, removing BB's entry from any PHI // nodes. @@ -1132,7 +1230,7 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { new UnreachableInst(I->getContext(), I); // All instructions after this are dead. - BasicBlock::iterator BBI = I, BBE = BB->end(); + BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end(); while (BBI != BBE) { if (!BBI->use_empty()) BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); @@ -1142,8 +1240,11 @@ static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { /// changeToCall - Convert the specified invoke into a normal call. static void changeToCall(InvokeInst *II) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); + SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); + SmallVector<OperandBundleDef, 1> OpBundles; + II->getOperandBundlesAsDefs(OpBundles); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, OpBundles, + "", II); NewCall->takeName(II); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); @@ -1162,7 +1263,7 @@ static bool markAliveBlocks(Function &F, SmallPtrSetImpl<BasicBlock*> &Reachable) { SmallVector<BasicBlock*, 128> Worklist; - BasicBlock *BB = F.begin(); + BasicBlock *BB = &F.front(); Worklist.push_back(BB); Reachable.insert(BB); bool Changed = false; @@ -1187,7 +1288,7 @@ static bool markAliveBlocks(Function &F, if (MakeUnreachable) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); + changeToUnreachable(&*BBI, false); Changed = true; break; } @@ -1201,7 +1302,7 @@ static bool markAliveBlocks(Function &F, ++BBI; if (!isa<UnreachableInst>(BBI)) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); + changeToUnreachable(&*BBI, false); Changed = true; } break; @@ -1227,8 +1328,9 @@ static bool markAliveBlocks(Function &F, } } - // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + TerminatorInst *Terminator = BB->getTerminator(); + if (auto *II = dyn_cast<InvokeInst>(Terminator)) { + // Turn invokes that call 'nounwind' functions into ordinary calls. Value *Callee = II->getCalledValue(); if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { changeToUnreachable(II, true); @@ -1243,6 +1345,44 @@ static bool markAliveBlocks(Function &F, changeToCall(II); Changed = true; } + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) { + // Remove catchpads which cannot be reached. 
+ struct CatchPadDenseMapInfo { + static CatchPadInst *getEmptyKey() { + return DenseMapInfo<CatchPadInst *>::getEmptyKey(); + } + static CatchPadInst *getTombstoneKey() { + return DenseMapInfo<CatchPadInst *>::getTombstoneKey(); + } + static unsigned getHashValue(CatchPadInst *CatchPad) { + return static_cast<unsigned>(hash_combine_range( + CatchPad->value_op_begin(), CatchPad->value_op_end())); + } + static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) { + if (LHS == getEmptyKey() || LHS == getTombstoneKey() || + RHS == getEmptyKey() || RHS == getTombstoneKey()) + return LHS == RHS; + return LHS->isIdenticalTo(RHS); + } + }; + + // Set of unique CatchPads. + SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4, + CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>> + HandlerSet; + detail::DenseSetEmpty Empty; + for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(), + E = CatchSwitch->handler_end(); + I != E; ++I) { + BasicBlock *HandlerBB = *I; + auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI()); + if (!HandlerSet.insert({CatchPad, Empty}).second) { + CatchSwitch->removeHandler(I); + --I; + --E; + Changed = true; + } + } } Changed |= ConstantFoldTerminator(BB, true); @@ -1253,10 +1393,44 @@ static bool markAliveBlocks(Function &F, return Changed; } +void llvm::removeUnwindEdge(BasicBlock *BB) { + TerminatorInst *TI = BB->getTerminator(); + + if (auto *II = dyn_cast<InvokeInst>(TI)) { + changeToCall(II); + return; + } + + TerminatorInst *NewTI; + BasicBlock *UnwindDest; + + if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) { + NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI); + UnwindDest = CRI->getUnwindDest(); + } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) { + auto *NewCatchSwitch = CatchSwitchInst::Create( + CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(), + CatchSwitch->getName(), CatchSwitch); + for (BasicBlock *PadBB : CatchSwitch->handlers()) + NewCatchSwitch->addHandler(PadBB); + + NewTI = NewCatchSwitch; + UnwindDest = CatchSwitch->getUnwindDest(); + } else { + llvm_unreachable("Could not find unwind successor"); + } + + NewTI->takeName(TI); + NewTI->setDebugLoc(TI->getDebugLoc()); + UnwindDest->removePredecessor(BB); + TI->replaceAllUsesWith(NewTI); + TI->eraseFromParent(); +} + /// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false /// otherwise. -bool llvm::removeUnreachableBlocks(Function &F) { +bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) { SmallPtrSet<BasicBlock*, 128> Reachable; bool Changed = markAliveBlocks(F, Reachable); @@ -1270,17 +1444,20 @@ bool llvm::removeUnreachableBlocks(Function &F) { // Loop over all of the basic blocks that are not reachable, dropping all of // their internal references... 
for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { - if (Reachable.count(BB)) + if (Reachable.count(&*BB)) continue; - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + for (succ_iterator SI = succ_begin(&*BB), SE = succ_end(&*BB); SI != SE; + ++SI) if (Reachable.count(*SI)) - (*SI)->removePredecessor(BB); + (*SI)->removePredecessor(&*BB); + if (LVI) + LVI->eraseBlock(&*BB); BB->dropAllReferences(); } for (Function::iterator I = ++F.begin(); I != F.end();) - if (!Reachable.count(I)) + if (!Reachable.count(&*I)) I = F.getBasicBlockList().erase(I); else ++I; @@ -1288,9 +1465,10 @@ bool llvm::removeUnreachableBlocks(Function &F) { return true; } -void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs) { +void llvm::combineMetadata(Instruction *K, const Instruction *J, + ArrayRef<unsigned> KnownIDs) { SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; - K->dropUnknownMetadata(KnownIDs); + K->dropUnknownNonDebugMetadata(KnownIDs); K->getAllMetadataOtherThanDebugLoc(Metadata); for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { unsigned Kind = Metadata[i].first; @@ -1326,8 +1504,29 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsign // Only set the !nonnull if it is present in both instructions. K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_invariant_group: + // Preserve !invariant.group in K. + break; + case LLVMContext::MD_align: + K->setMetadata(Kind, + MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); + break; + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + K->setMetadata(Kind, + MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); + break; } } + // Set !invariant.group from J if J has it. If both instructions have it + // then we will just pick it from J - even when they are different. + // Also make sure that K is load or store - f.e. combining bitcast with load + // could produce bitcast with invariant.group metadata, which is invalid. + // FIXME: we should try to preserve both invariant.group md if they are + // different, but right now instruction can only have one invariant.group. + if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group)) + if (isa<LoadInst>(K) || isa<StoreInst>(K)) + K->setMetadata(LLVMContext::MD_invariant_group, JMD); } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, @@ -1349,3 +1548,40 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, } return Count; } + +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const BasicBlock *BB) { + assert(From->getType() == To->getType()); + + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE;) { + Use &U = *UI++; + auto *I = cast<Instruction>(U.getUser()); + if (DT.dominates(BB, I->getParent())) { + U.set(To); + DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " + << *To << " in " << *U << "\n"); + ++Count; + } + } + return Count; +} + +bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { + if (isa<IntrinsicInst>(CS.getInstruction())) + // Most LLVM intrinsics are things which can never take a safepoint. + // As a result, we don't need to have the stack parsable at the + // callsite. This is a highly useful optimization since intrinsic + // calls are fairly prevalent, particularly in debug builds. 
+ return true; + + // Check if the function is specifically marked as a gc leaf function. + if (CS.hasFnAttr("gc-leaf-function")) + return true; + if (const Function *F = CS.getCalledFunction()) + return F->hasFnAttribute("gc-leaf-function"); + + return false; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 5c98043..1fa4695 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -44,11 +44,14 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -78,7 +81,7 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, SmallVectorImpl<BasicBlock *> &SplitPreds, Loop *L) { // Check to see if NewBB is already well placed. - Function::iterator BBI = NewBB; --BBI; + Function::iterator BBI = --NewBB->getIterator(); for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { if (&*BBI == SplitPreds[i]) return; @@ -92,9 +95,8 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, // block that neighbors a BB actually in the loop. BasicBlock *FoundBB = nullptr; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { - Function::iterator BBI = SplitPreds[i]; - if (++BBI != NewBB->getParent()->end() && - L->contains(BBI)) { + Function::iterator BBI = SplitPreds[i]->getIterator(); + if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) { FoundBB = SplitPreds[i]; break; } @@ -112,17 +114,10 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, /// preheader, this method is called to insert one. This method has two phases: /// preheader insertion and analysis updating. /// -BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { +BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Header = L->getHeader(); - // Get analyses that we try to update. - auto *AA = PP->getAnalysisIfAvailable<AliasAnalysis>(); - auto *DTWP = PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *LIWP = PP->getAnalysisIfAvailable<LoopInfoWrapperPass>(); - auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - // Compute the set of predecessors of the loop that are not in the loop. SmallVector<BasicBlock*, 8> OutsideBlocks; for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); @@ -141,8 +136,10 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // Split out the loop pre-header. 
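// Callers of InsertPreheaderForLoop migrate from handing in a Pass* to
// passing the analyses directly; a sketch of a call under the new
// signature above (header assumed: llvm/Transforms/Utils/LoopUtils.h).
// Note the function can now fail and return null, as the check just
// below shows.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

static llvm::BasicBlock *ensurePreheader(llvm::Loop *L,
                                         llvm::DominatorTree *DT,
                                         llvm::LoopInfo *LI) {
  // PreserveLCSSA replaces the old mustPreserveAnalysisID(LCSSAID) query.
  return llvm::InsertPreheaderForLoop(L, DT, LI, /*PreserveLCSSA=*/true);
}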
BasicBlock *PreheaderBB; - PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - AA, DT, LI, PreserveLCSSA); + PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT, + LI, PreserveLCSSA); + if (!PreheaderBB) + return nullptr; DEBUG(dbgs() << "LoopSimplify: Creating pre-header " << PreheaderBB->getName() << "\n"); @@ -159,8 +156,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { /// This method is used to split exit blocks that have predecessors outside of /// the loop. static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, Pass *PP) { + DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA) { SmallVector<BasicBlock*, 8> LoopBlocks; for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { BasicBlock *P = *I; @@ -175,10 +172,10 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); BasicBlock *NewExitBB = nullptr; - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - - NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", AA, DT, - LI, PreserveLCSSA); + NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI, + PreserveLCSSA); + if (!NewExitBB) + return nullptr; DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " << NewExitBB->getName() << "\n"); @@ -206,8 +203,7 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, /// \brief The first part of loop-nestification is to find a PHI node that tells /// us how to partition the loops. -static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, - DominatorTree *DT, +static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT, AssumptionCache *AC) { const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { @@ -216,7 +212,6 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); - if (AA) AA->deleteValue(PN); PN->eraseFromParent(); continue; } @@ -251,18 +246,18 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, /// created. /// static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, - AliasAnalysis *AA, DominatorTree *DT, - LoopInfo *LI, ScalarEvolution *SE, Pass *PP, + DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, bool PreserveLCSSA, AssumptionCache *AC) { // Don't try to separate loops without a preheader. if (!Preheader) return nullptr; // The header is not a landing pad; preheader insertion should ensure this. - assert(!L->getHeader()->isLandingPad() && - "Can't insert backedge to landing pad"); + BasicBlock *Header = L->getHeader(); + assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); - PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AC); + PHINode *PN = findPHIToPartitionLoops(L, DT, AC); if (!PN) return nullptr; // No known way to partition. // Pull out all predecessors that have varying values in the loop. 
This @@ -286,11 +281,8 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, if (SE) SE->forgetLoop(L); - bool PreserveLCSSA = PP->mustPreserveAnalysisID(LCSSAID); - - BasicBlock *Header = L->getHeader(); BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", - AA, DT, LI, PreserveLCSSA); + DT, LI, PreserveLCSSA); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. @@ -357,7 +349,6 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, /// and have that block branch to the loop header. This ensures that loops /// have exactly one backedge. static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI) { assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!"); @@ -369,8 +360,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, if (!Preheader) return nullptr; - // The header is not a landing pad; preheader insertion should ensure this. - assert(!Header->isLandingPad() && "Can't insert backedge to landing pad"); + // The header is not an EH pad; preheader insertion should ensure this. + assert(!Header->isEHPad() && "Can't insert backedge to EH pad"); // Figure out which basic blocks contain back-edges to the loop header. std::vector<BasicBlock*> BackedgeBlocks; @@ -394,7 +385,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, << BEBlock->getName() << "\n"); // Move the new backedge block to right after the last backedge block. - Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos; + Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator(); F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock); // Now that the block has been inserted into the function, create PHI nodes in @@ -443,7 +434,6 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // eliminate the PHI Node. if (HasUniqueIncomingValue) { NewPN->replaceAllUsesWith(UniqueValue); - if (AA) AA->deleteValue(NewPN); BEBlock->getInstList().erase(NewPN); } } @@ -470,15 +460,10 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, } /// \brief Simplify one loop and queue further loops for simplification. -/// -/// FIXME: Currently this accepts both lots of analyses that it uses and a raw -/// Pass pointer. The Pass pointer is used by numerous utilities to update -/// specific analyses. Rather than a pass it would be much cleaner and more -/// explicit if they accepted the analysis directly and then updated it. static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, Pass *PP, - AssumptionCache *AC) { + DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, AssumptionCache *AC, + bool PreserveLCSSA) { bool Changed = false; ReprocessLoop: @@ -544,7 +529,7 @@ ReprocessLoop: // Does the loop already have a preheader? If so, don't insert one. BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { - Preheader = InsertPreheaderForLoop(L, PP); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (Preheader) { ++NumInserted; Changed = true; @@ -568,7 +553,7 @@ ReprocessLoop: // Must be exactly this loop: no subloops, parent loops, or non-loop preds // allowed. 
if (!L->contains(*PI)) { - if (rewriteLoopExitBlock(L, ExitBlock, AA, DT, LI, PP)) { + if (rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA)) { ++NumInserted; Changed = true; } @@ -585,7 +570,7 @@ ReprocessLoop: // common backedge instead. if (L->getNumBackEdges() < 8) { if (Loop *OuterL = - separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP, AC)) { + separateNestedLoop(L, Preheader, DT, LI, SE, PreserveLCSSA, AC)) { ++NumNested; // Enqueue the outer loop as it should be processed next in our // depth-first nest walk. @@ -602,7 +587,7 @@ ReprocessLoop: // If we either couldn't, or didn't want to, identify nesting of the loops, // insert a new block that all backedges target, then make it jump to the // loop header. - LoopLatch = insertUniqueBackedgeBlock(L, Preheader, AA, DT, LI); + LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI); if (LoopLatch) { ++NumInserted; Changed = true; @@ -618,7 +603,6 @@ ReprocessLoop: for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast<PHINode>(I++)); ) if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) { - if (AA) AA->deleteValue(PN); if (SE) SE->forgetValue(PN); PN->replaceAllUsesWith(V); PN->eraseFromParent(); @@ -654,7 +638,7 @@ ReprocessLoop: bool AllInvariant = true; bool AnyInvariant = false; for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // Skip debug info intrinsics. if (isa<DbgInfoIntrinsic>(Inst)) continue; @@ -716,9 +700,9 @@ ReprocessLoop: return Changed; } -bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, - AliasAnalysis *AA, ScalarEvolution *SE, - AssumptionCache *AC) { +bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, AssumptionCache *AC, + bool PreserveLCSSA) { bool Changed = false; // Worklist maintains our depth-first queue of loops in this nest to process. @@ -734,8 +718,8 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, } while (!Worklist.empty()) - Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI, - SE, PP, AC); + Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE, + AC, PreserveLCSSA); return Changed; } @@ -747,9 +731,6 @@ namespace { initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); } - // AA - If we have an alias analysis object to update, this is it, otherwise - // this is null. - AliasAnalysis *AA; DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; @@ -767,8 +748,11 @@ namespace { AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addPreserved<DependenceAnalysis>(); AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. 
} @@ -784,6 +768,9 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) @@ -796,15 +783,16 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; - AA = getAnalysisIfAvailable<AliasAnalysis>(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = getAnalysisIfAvailable<ScalarEvolution>(); + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + SE = SEWP ? &SEWP->getSE() : nullptr; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, AC); + Changed |= simplifyLoop(*I, DT, LI, SE, AC, PreserveLCSSA); return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 1dbce47..eea9237 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -73,7 +73,7 @@ static inline void RemapInstruction(Instruction *I, /// of loops that have already been forgotten to prevent redundant, expensive /// calls to ScalarEvolution::forgetLoop. Returns the new combined block. static BasicBlock * -FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, +FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE, SmallPtrSetImpl<Loop *> &ForgottenLoops) { // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and @@ -109,12 +109,10 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, // Erase basic block from the function... // ScalarEvolution holds references to loop exit blocks. - if (LPM) { - if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { - if (Loop *L = LI->getLoopFor(BB)) { - if (ForgottenLoops.insert(L).second) - SE->forgetLoop(L); - } + if (SE) { + if (Loop *L = LI->getLoopFor(BB)) { + if (ForgottenLoops.insert(L).second) + SE->forgetLoop(L); } } LI->removeBlock(BB); @@ -155,15 +153,13 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM, /// /// The LoopInfo Analysis that is passed will be kept consistent. /// -/// If a LoopPassManager is passed in, and the loop is fully removed, it will be -/// removed from the LoopPassManager as well. LPM can also be NULL. -/// -/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are -/// available from the Pass it must also preserve those analyses. +/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and +/// DominatorTree if they are non-null. 
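// The runOnFunction above reduces to a small driver once the analyses are
// explicit; a sketch of calling the new simplifyLoop entry point (header
// assumed: llvm/Transforms/Utils/LoopUtils.h). Passing a null
// ScalarEvolution simply means "not available, do not update it".
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

static bool simplifyAllLoops(llvm::LoopInfo &LI, llvm::DominatorTree &DT,
                             llvm::AssumptionCache &AC) {
  bool Changed = false;
  for (llvm::Loop *L : LI) // top-level loops; simplifyLoop walks each nest
    Changed |= llvm::simplifyLoop(L, &DT, &LI, /*SE=*/nullptr, &AC,
                                  /*PreserveLCSSA=*/false);
  return Changed;
}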
bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, bool AllowExpensiveTripCount, - unsigned TripMultiple, LoopInfo *LI, Pass *PP, - LPPassManager *LPM, AssumptionCache *AC) { + unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, AssumptionCache *AC, + bool PreserveLCSSA) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -220,6 +216,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; + SmallVector<BasicBlock *, 4> ExitBlocks; + L->getExitBlocks(ExitBlocks); + Loop *ParentL = L->getParentLoop(); + bool AllExitsAreInsideParentLoop = !ParentL || + std::all_of(ExitBlocks.begin(), ExitBlocks.end(), + [&](BasicBlock *BB) { return ParentL->contains(BB); }); // We assume a run-time trip count if the compiler cannot // figure out the loop trip count and the unroll-runtime @@ -227,13 +229,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); if (RuntimeTripCount && - !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, LPM)) + !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT, + PreserveLCSSA)) return false; // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - ScalarEvolution *SE = - PP ? PP->getAnalysisIfAvailable<ScalarEvolution>() : nullptr; if (SE) SE->forgetLoop(L); @@ -392,7 +393,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, for (unsigned i = 0; i < NewBlocks.size(); ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - ::RemapInstruction(I, LastValueMap); + ::RemapInstruction(&*I, LastValueMap); } // Loop over the PHI nodes in the original block, setting incoming values. @@ -432,8 +433,9 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // For a complete unroll, make the last iteration end with a branch // to the exit block. - if (CompletelyUnroll && j == 0) { - Dest = LoopExit; + if (CompletelyUnroll) { + if (j == 0) + Dest = LoopExit; NeedConditional = false; } @@ -473,7 +475,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); if (Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); - if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM, + if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops)) std::replace(Latches.begin(), Latches.end(), Dest, Fold); } @@ -483,29 +485,24 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // whole function's cache. AC->clear(); - DominatorTree *DT = nullptr; - if (PP) { - // FIXME: Reconstruct dom info, because it is not preserved properly. - // Incrementally updating domtree after loop unrolling would be easy. - if (DominatorTreeWrapperPass *DTWP = - PP->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) { - DT = &DTWP->getDomTree(); - DT->recalculate(*L->getHeader()->getParent()); - } - - // Simplify any new induction variables in the partially unrolled loop. - if (SE && !CompletelyUnroll) { - SmallVector<WeakVH, 16> DeadInsts; - simplifyLoopIVs(L, SE, LPM, DeadInsts); - - // Aggressively clean up dead instructions that simplifyLoopIVs already - // identified. 
Any remaining should be cleaned up below. - while (!DeadInsts.empty()) - if (Instruction *Inst = - dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); - } + // FIXME: Reconstruct dom info, because it is not preserved properly. + // Incrementally updating domtree after loop unrolling would be easy. + if (DT) + DT->recalculate(*L->getHeader()->getParent()); + + // Simplify any new induction variables in the partially unrolled loop. + if (SE && !CompletelyUnroll) { + SmallVector<WeakVH, 16> DeadInsts; + simplifyLoopIVs(L, SE, DT, LI, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); } + // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. @@ -514,7 +511,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), BBE = NewLoopBlocks.end(); BB != BBE; ++BB) for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; if (isInstructionTriviallyDead(Inst)) (*BB)->getInstList().erase(Inst); @@ -529,29 +526,33 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, ++NumUnrolled; Loop *OuterL = L->getParentLoop(); - // Remove the loop from the LoopPassManager if it's completely removed. - if (CompletelyUnroll && LPM != nullptr) - LPM->deleteLoopFromQueue(L); + // Update LoopInfo if the loop is completely removed. + if (CompletelyUnroll) + LI->markAsRemoved(L); // If we have a pass and a DominatorTree we should re-simplify impacted loops // to ensure subsequent analyses can rely on this form. We want to simplify // at least one layer outside of the loop that was unrolled so that any // changes to the parent loop exposed by the unrolling are considered. - if (PP && DT) { + if (DT) { if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { - simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, AC); + bool Simplified = simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA); // LCSSA must be performed on the outermost affected loop. The unrolled // loop's last loop latch is guaranteed to be in the outermost loop after - // deleteLoopFromQueue updates LoopInfo. + // LoopInfo's been updated by markAsRemoved. 
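// A sketch of a call under the new UnrollLoop signature above; the unroll
// factor and trip count are made-up values, and SE/DT may be null when
// unavailable (header assumed: llvm/Transforms/Utils/UnrollLoop.h).
#include "llvm/Transforms/Utils/UnrollLoop.h"

static bool unrollByFour(llvm::Loop *L, llvm::LoopInfo *LI,
                         llvm::ScalarEvolution *SE, llvm::DominatorTree *DT,
                         llvm::AssumptionCache *AC) {
  return llvm::UnrollLoop(L, /*Count=*/4, /*TripCount=*/128,
                          /*AllowRuntime=*/false,
                          /*AllowExpensiveTripCount=*/false,
                          /*TripMultiple=*/128, LI, SE, DT, AC,
                          /*PreserveLCSSA=*/true);
}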
Loop *LatchLoop = LI->getLoopFor(Latches.back()); if (!OuterL->contains(LatchLoop)) while (OuterL->getParentLoop() != LatchLoop) OuterL = OuterL->getParentLoop(); - formLCSSARecursively(*OuterL, *DT, LI, SE); + if (CompletelyUnroll && (!AllExitsAreInsideParentLoop || Simplified)) + formLCSSARecursively(*OuterL, *DT, LI, SE); + else + assert(OuterL->isLCSSAForm(*DT) && + "Loops should be in LCSSA form after loop-unroll."); } } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index add5432..0d68f18 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -62,8 +62,8 @@ STATISTIC(NumRuntimeUnrolled, static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &VMap, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, Pass *P) { + ValueToValueMapTy &VMap, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); @@ -127,8 +127,8 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); - SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", AA, DT, LI, - P->mustPreserveAnalysisID(LCSSAID)); + SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI, + PreserveLCSSA); // Add the branch to the exit block (around the unrolled loop) B.CreateCondBr(BrLoopExit, Exit, NewPH); InsertPt->eraseFromParent(); @@ -150,7 +150,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, Function *F = Header->getParent(); LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); - Loop *NewLoop = 0; + Loop *NewLoop = nullptr; Loop *ParentLoop = L->getParentLoop(); if (!UnrollProlog) { NewLoop = new Loop(); @@ -206,9 +206,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, // Change the incoming values to the ones defined in the preheader or // cloned loop. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *NewPHI = cast<PHINode>(VMap[I]); + PHINode *NewPHI = cast<PHINode>(VMap[&*I]); if (UnrollProlog) { - VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); + VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); } else { unsigned idx = NewPHI->getBasicBlockIndex(Preheader); @@ -279,7 +279,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, bool AllowExpensiveTripCount, LoopInfo *LI, - LPPassManager *LPM) { + ScalarEvolution *SE, DominatorTree *DT, + bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; @@ -291,9 +292,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Use Scalar Evolution to compute the trip count. 
This allows more // loops to be unrolled than relying on induction var simplification - if (!LPM) - return false; - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (!SE) return false; @@ -308,7 +306,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = - SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; @@ -333,10 +331,6 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); - // Grab analyses that we preserve. - auto *DTWP = LPM->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); // It helps to splits the original preheader twice, one for the end of the @@ -397,8 +391,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, VMap, LI); // Insert the cloned blocks into function just before the original loop - F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], - F->end()); + F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(), + NewBlocks[0]->getIterator(), F->end()); // Rewrite the cloned instruction operands to use the values // created when the clone is created. @@ -406,7 +400,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) { - RemapInstruction(I, VMap, + RemapInstruction(&*I, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } @@ -414,8 +408,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, // Connect the prolog code to the original loop and update the // PHI functions. 
BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); - ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, - /*AliasAnalysis*/ nullptr, DT, LI, LPM->getAsPass()); + ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI, + PreserveLCSSA); NumRuntimeUnrolled++; return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp index 5cbde94..fa958e9 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -12,13 +12,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -34,6 +34,124 @@ bool RecurrenceDescriptor::areAllUsesIn(Instruction *I, return true; } +bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurrenceKind Kind) { + switch (Kind) { + default: + break; + case RK_IntegerAdd: + case RK_IntegerMult: + case RK_IntegerOr: + case RK_IntegerAnd: + case RK_IntegerXor: + case RK_IntegerMinMax: + return true; + } + return false; +} + +bool RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurrenceKind Kind) { + return (Kind != RK_NoRecurrence) && !isIntegerRecurrenceKind(Kind); +} + +bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) { + switch (Kind) { + default: + break; + case RK_IntegerAdd: + case RK_IntegerMult: + case RK_FloatAdd: + case RK_FloatMult: + return true; + } + return false; +} + +Instruction * +RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, + SmallPtrSetImpl<Instruction *> &Visited, + SmallPtrSetImpl<Instruction *> &CI) { + if (!Phi->hasOneUse()) + return Phi; + + const APInt *M = nullptr; + Instruction *I, *J = cast<Instruction>(Phi->use_begin()->getUser()); + + // Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT + // with a new integer type of the corresponding bit width. + if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)), + m_And(m_APInt(M), m_Instruction(I))))) { + int32_t Bits = (*M + 1).exactLogBase2(); + if (Bits > 0) { + RT = IntegerType::get(Phi->getContext(), Bits); + Visited.insert(Phi); + CI.insert(J); + return J; + } + } + return Phi; +} + +bool RecurrenceDescriptor::getSourceExtensionKind( + Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned, + SmallPtrSetImpl<Instruction *> &Visited, + SmallPtrSetImpl<Instruction *> &CI) { + + SmallVector<Instruction *, 8> Worklist; + bool FoundOneOperand = false; + unsigned DstSize = RT->getPrimitiveSizeInBits(); + Worklist.push_back(Exit); + + // Traverse the instructions in the reduction expression, beginning with the + // exit value. + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (Use &U : I->operands()) { + + // Terminate the traversal if the operand is not an instruction, or we + // reach the starting value. + Instruction *J = dyn_cast<Instruction>(U.get()); + if (!J || J == Start) + continue; + + // Otherwise, investigate the operation if it is also in the expression. 
+ if (Visited.count(J)) { + Worklist.push_back(J); + continue; + } + + // If the operand is not in Visited, it is not a reduction operation, but + // it does feed into one. Make sure it is either a single-use sign- or + // zero-extend instruction. + CastInst *Cast = dyn_cast<CastInst>(J); + bool IsSExtInst = isa<SExtInst>(J); + if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst)) + return false; + + // Ensure the source type of the extend is no larger than the reduction + // type. It is not necessary for the types to be identical. + unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); + if (SrcSize > DstSize) + return false; + + // Furthermore, ensure that all such extends are of the same kind. + if (FoundOneOperand) { + if (IsSigned != IsSExtInst) + return false; + } else { + FoundOneOperand = true; + IsSigned = IsSExtInst; + } + + // Lastly, if the source type of the extend matches the reduction type, + // add the extend to CI so that we can avoid accounting for it in the + // cost model. + if (SrcSize == DstSize) + CI.insert(Cast); + } + } + return true; +} + bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, RecurrenceDescriptor &RedDes) { @@ -68,10 +186,32 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, unsigned NumCmpSelectPatternInst = 0; InstDesc ReduxDesc(false, nullptr); + // Data used for determining if the recurrence has been type-promoted. + Type *RecurrenceType = Phi->getType(); + SmallPtrSet<Instruction *, 4> CastInsts; + Instruction *Start = Phi; + bool IsSigned = false; + SmallPtrSet<Instruction *, 8> VisitedInsts; SmallVector<Instruction *, 8> Worklist; - Worklist.push_back(Phi); - VisitedInsts.insert(Phi); + + // Return early if the recurrence kind does not match the type of Phi. If the + // recurrence kind is arithmetic, we attempt to look through AND operations + // resulting from the type promotion performed by InstCombine. Vector + // operations are not limited to the legal integer widths, so we may be able + // to evaluate the reduction in the narrower width. + if (RecurrenceType->isFloatingPointTy()) { + if (!isFloatingPointRecurrenceKind(Kind)) + return false; + } else { + if (!isIntegerRecurrenceKind(Kind)) + return false; + if (isArithmeticRecurrenceKind(Kind)) + Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts); + } + + Worklist.push_back(Start); + VisitedInsts.insert(Start); // A value in the reduction can be used: // - By the reduction: @@ -110,10 +250,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) return false; - // Any reduction instruction must be of one of the allowed kinds. - ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); - if (!ReduxDesc.isRecurrence()) - return false; + // Any reduction instruction must be of one of the allowed kinds. We ignore + // the starting value (the Phi or an AND instruction if the Phi has been + // type-promoted). + if (Cur != Start) { + ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr); + if (!ReduxDesc.isRecurrence()) + return false; + } // A reduction operation must only have one use of the reduction value. if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && @@ -131,7 +275,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, ++NumCmpSelectPatternInst; // Check whether we found a reduction operator. 
- FoundReduxOp |= !IsAPhi; + FoundReduxOp |= !IsAPhi && Cur != Start; // Process users of current instruction. Push non-PHI nodes after PHI nodes // onto the stack. This way we are going to have seen all inputs to PHI @@ -193,6 +337,14 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; + // If we think Phi may have been type-promoted, we also need to ensure that + // all source operands of the reduction are either SExtInsts or ZEstInsts. If + // so, we will be able to evaluate the reduction in the narrower bit width. + if (Start != Phi) + if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, + IsSigned, VisitedInsts, CastInsts)) + return false; + // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. @@ -200,9 +352,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, // is saved as part of the RecurrenceDescriptor. // Save the description of this reduction variable. - RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.getMinMaxKind()); - + RecurrenceDescriptor RD( + RdxStart, ExitInstruction, Kind, ReduxDesc.getMinMaxKind(), + ReduxDesc.getUnsafeAlgebraInst(), RecurrenceType, IsSigned, CastInsts); RedDes = RD; return true; @@ -263,14 +415,14 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, InstDesc &Prev, bool HasFunNoNaNAttr) { bool FP = I->getType()->isFloatingPointTy(); - bool FastMath = FP && I->hasUnsafeAlgebra(); + Instruction *UAI = Prev.getUnsafeAlgebraInst(); + if (!UAI && FP && !I->hasUnsafeAlgebra()) + UAI = I; // Found an unsafe (unvectorizable) algebra instruction. + switch (I->getOpcode()) { default: return InstDesc(false, I); case Instruction::PHI: - if (FP && - (Kind != RK_FloatMult && Kind != RK_FloatAdd && Kind != RK_FloatMinMax)) - return InstDesc(false, I); return InstDesc(I, Prev.getMinMaxKind()); case Instruction::Sub: case Instruction::Add: @@ -284,10 +436,10 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, case Instruction::Xor: return InstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: - return InstDesc(Kind == RK_FloatMult && FastMath, I); + return InstDesc(Kind == RK_FloatMult, I, UAI); case Instruction::FSub: case Instruction::FAdd: - return InstDesc(Kind == RK_FloatAdd && FastMath, I); + return InstDesc(Kind == RK_FloatAdd, I, UAI); case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: @@ -442,6 +594,13 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder, break; } + // We only match FP sequences with unsafe algebra, so we can unconditionally + // set it on any generated instructions. 
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder); + FastMathFlags FMF; + FMF.setUnsafeAlgebra(); + Builder.setFastMathFlags(FMF); + Value *Cmp; if (RK == MRK_FloatMin || RK == MRK_FloatMax) Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); @@ -452,8 +611,54 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder, return Select; } -bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, - ConstantInt *&StepValue) { +InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, + ConstantInt *Step) + : StartValue(Start), IK(K), StepValue(Step) { + assert(IK != IK_NoInduction && "Not an induction"); + assert(StartValue && "StartValue is null"); + assert(StepValue && !StepValue->isZero() && "StepValue is zero"); + assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && + "StartValue is not a pointer for pointer induction"); + assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && + "StartValue is not an integer for integer induction"); + assert(StepValue->getType()->isIntegerTy() && + "StepValue is not an integer"); +} + +int InductionDescriptor::getConsecutiveDirection() const { + if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) + return StepValue->getSExtValue(); + return 0; +} + +Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index) const { + switch (IK) { + case IK_IntInduction: + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (StepValue->isMinusOne()) + return B.CreateSub(StartValue, Index); + if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateAdd(StartValue, Index); + + case IK_PtrInduction: + assert(Index->getType() == StepValue->getType() && + "Index type does not match StepValue type"); + if (StepValue->isMinusOne()) + Index = B.CreateNeg(Index); + else if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateGEP(nullptr, StartValue, Index); + + case IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); +} + +bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, + InductionDescriptor &D) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) @@ -467,6 +672,10 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, return false; } + assert(AR->getLoop()->getHeader() == Phi->getParent() && + "PHI is an AddRec for a different loop?!"); + Value *StartValue = + Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader()); const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); @@ -475,7 +684,7 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, ConstantInt *CV = C->getValue(); if (PhiTy->isIntegerTy()) { - StepValue = CV; + D = InductionDescriptor(StartValue, IK_IntInduction, CV); return true; } @@ -494,6 +703,27 @@ bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, int64_t CVSize = CV->getSExtValue(); if (CVSize % Size) return false; - StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + auto *StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + + D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue); return true; } + +/// \brief Returns the instructions that use values defined in the loop. 
+SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) { + SmallVector<Instruction *, 8> UsedOutside; + + for (auto *Block : L->getBlocks()) + // FIXME: I believe that this could use copy_if if the Inst reference could + // be adapted into a pointer. + for (auto &Inst : *Block) { + auto Users = Inst.users(); + if (std::any_of(Users.begin(), Users.end(), [&](User *U) { + auto *Use = cast<Instruction>(U); + return !L->contains(Use->getParent()); + })) + UsedOutside.push_back(&Inst); + } + + return UsedOutside; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 832079d..9a2a06c 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -13,43 +13,81 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/Dominators.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" using namespace llvm; LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI, - DominatorTree *DT, - const SmallVector<int, 8> *PtrToPartition) - : VersionedLoop(L), NonVersionedLoop(nullptr), - PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) { + DominatorTree *DT, ScalarEvolution *SE, + bool UseLAIChecks) + : VersionedLoop(L), NonVersionedLoop(nullptr), LAI(LAI), LI(LI), DT(DT), + SE(SE) { assert(L->getExitBlock() && "No single exit block"); assert(L->getLoopPreheader() && "No preheader"); + if (UseLAIChecks) { + setAliasChecks(LAI.getRuntimePointerChecking()->getChecks()); + setSCEVChecks(LAI.PSE.getUnionPredicate()); + } } -bool LoopVersioning::needsRuntimeChecks() const { - return LAI.getRuntimePointerChecking()->needsAnyChecking(PtrToPartition); +void LoopVersioning::setAliasChecks( + const SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) { + AliasChecks = std::move(Checks); } -void LoopVersioning::versionLoop(Pass *P) { +void LoopVersioning::setSCEVChecks(SCEVUnionPredicate Check) { + Preds = std::move(Check); +} + +void LoopVersioning::versionLoop( + const SmallVectorImpl<Instruction *> &DefsUsedOutside) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; + Value *SCEVRuntimeCheck; + Value *RuntimeCheck = nullptr; + // Add the memcheck in the original preheader (this is empty initially). - BasicBlock *MemCheckBB = VersionedLoop->getLoopPreheader(); + BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader(); std::tie(FirstCheckInst, MemRuntimeCheck) = - LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition); + LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks); assert(MemRuntimeCheck && "called even though needsAnyChecking = false"); + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); + SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), + "scev.check"); + SCEVRuntimeCheck = + Exp.expandCodeForPredicate(&Pred, RuntimeCheckBB->getTerminator()); + auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck); + + // Discard the SCEV runtime check if it is always true. 
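// The shape of findDefsUsedOutsideOfLoop above, as a standalone sketch: a
// definition is a live-out when any of its users sits outside the region
// (toy types; the real code walks LLVM's use lists).
#include <algorithm>
#include <vector>

struct Inst {
  std::vector<Inst *> Users;
  bool InLoop;
};

static std::vector<Inst *>
defsUsedOutside(const std::vector<Inst *> &LoopInsts) {
  std::vector<Inst *> UsedOutside;
  for (Inst *I : LoopInsts)
    if (std::any_of(I->Users.begin(), I->Users.end(),
                    [](const Inst *U) { return !U->InLoop; }))
      UsedOutside.push_back(I);
  return UsedOutside;
}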
+ if (CI && CI->isZero()) + SCEVRuntimeCheck = nullptr; + + if (MemRuntimeCheck && SCEVRuntimeCheck) { + RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck, + SCEVRuntimeCheck, "ldist.safe"); + if (auto *I = dyn_cast<Instruction>(RuntimeCheck)) + I->insertBefore(RuntimeCheckBB->getTerminator()); + } else + RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck; + + assert(RuntimeCheck && "called even though we don't need " + "any runtime checks"); + // Rename the block to make the IR more readable. - MemCheckBB->setName(VersionedLoop->getHeader()->getName() + ".lver.memcheck"); + RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() + + ".lver.check"); // Create empty preheader for the loop (and after cloning for the // non-versioned loop). - BasicBlock *PH = SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI); + BasicBlock *PH = + SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI); PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); // Clone the loop including the preheader. @@ -58,20 +96,23 @@ void LoopVersioning::versionLoop(Pass *P) { // block is a join between the two loops. SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks; NonVersionedLoop = - cloneLoopWithPreheader(PH, MemCheckBB, VersionedLoop, VMap, ".lver.orig", - LI, DT, NonVersionedLoopBlocks); + cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap, + ".lver.orig", LI, DT, NonVersionedLoopBlocks); remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap); // Insert the conditional branch based on the result of the memchecks. - Instruction *OrigTerm = MemCheckBB->getTerminator(); + Instruction *OrigTerm = RuntimeCheckBB->getTerminator(); BranchInst::Create(NonVersionedLoop->getLoopPreheader(), - VersionedLoop->getLoopPreheader(), MemRuntimeCheck, - OrigTerm); + VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm); OrigTerm->eraseFromParent(); // The loops merge in the original exit block. This is now dominated by the // memchecking block. - DT->changeImmediateDominator(VersionedLoop->getExitBlock(), MemCheckBB); + DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB); + + // Adds the necessary PHI nodes for the versioned loops based on the + // loop-defined values used outside of the loop. + addPHINodes(DefsUsedOutside); } void LoopVersioning::addPHINodes( @@ -94,7 +135,7 @@ void LoopVersioning::addPHINodes( // If not create it. if (!PN) { PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver", - PHIBlock->begin()); + &PHIBlock->front()); for (auto *User : Inst->users()) if (!VersionedLoop->contains(cast<Instruction>(User)->getParent())) User->replaceUsesOfWith(Inst, PN); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp index 66d57b0..b0ad4d5 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -69,7 +69,7 @@ bool LowerInvoke::runOnFunction(Function &F) { BranchInst::Create(II->getNormalDest(), II); // Remove any PHI node entries from the exception destination. - II->getUnwindDest()->removePredecessor(BB); + II->getUnwindDest()->removePredecessor(&*BB); // Remove the invoke instruction now. 
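// The control flow versionLoop produces, sketched as straight-line C++:
// one combined guard ("ldist.safe" above, the OR of the memcheck and the
// SCEV check) selects between the two loop copies, and the shared exit
// merges live-outs through the PHIs addPHINodes inserts.
static void versionedShape(bool MemCheckFired, bool SCEVCheckFired) {
  if (MemCheckFired || SCEVCheckFired) {
    // .lver.orig: conservatively run the unmodified loop copy.
  } else {
    // Versioned copy: the memory/SCEV assumptions are known to hold.
  }
  // Shared exit block: one PHI per loop-defined value used outside.
}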
BB->getInstList().erase(II); diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 4acd988..52beb15 100644 --- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -49,8 +49,7 @@ namespace { return I != Ranges.end() && I->Low <= R.Low; } - /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch - /// instructions. + /// Replace all SwitchInst instructions with chained branch instructions. class LowerSwitch : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid @@ -78,7 +77,7 @@ namespace { typedef std::vector<CaseRange> CaseVector; typedef std::vector<CaseRange>::iterator CaseItr; private: - void processSwitchInst(SwitchInst *SI); + void processSwitchInst(SwitchInst *SI, SmallPtrSetImpl<BasicBlock*> &DeleteList); BasicBlock *switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, @@ -116,21 +115,30 @@ FunctionPass *llvm::createLowerSwitchPass() { bool LowerSwitch::runOnFunction(Function &F) { bool Changed = false; + SmallPtrSet<BasicBlock*, 8> DeleteList; for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { - BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks + BasicBlock *Cur = &*I++; // Advance over block so we don't traverse new blocks + + // If the block is a dead Default block that will be deleted later, don't + // waste time processing it. + if (DeleteList.count(Cur)) + continue; if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) { Changed = true; - processSwitchInst(SI); + processSwitchInst(SI, DeleteList); } } + for (BasicBlock* BB: DeleteList) { + DeleteDeadBlock(BB); + } + return Changed; } -// operator<< - Used for debugging purposes. -// +/// Used for debugging purposes. static raw_ostream& operator<<(raw_ostream &O, const LowerSwitch::CaseVector &C) LLVM_ATTRIBUTE_USED; @@ -147,23 +155,24 @@ static raw_ostream& operator<<(raw_ostream &O, return O << "]"; } -// \brief Update the first occurrence of the "switch statement" BB in the PHI -// node with the "new" BB. The other occurrences will: -// -// 1) Be updated by subsequent calls to this function. Switch statements may -// have more than one outcoming edge into the same BB if they all have the same -// value. When the switch statement is converted these incoming edges are now -// coming from multiple BBs. -// 2) Removed if subsequent incoming values now share the same case, i.e., -// multiple outcome edges are condensed into one. This is necessary to keep the -// number of phi values equal to the number of branches to SuccBB. +/// \brief Update the first occurrence of the "switch statement" BB in the PHI +/// node with the "new" BB. The other occurrences will: +/// +/// 1) Be updated by subsequent calls to this function. Switch statements may +/// have more than one outcoming edge into the same BB if they all have the same +/// value. When the switch statement is converted these incoming edges are now +/// coming from multiple BBs. +/// 2) Removed if subsequent incoming values now share the same case, i.e., +/// multiple outcome edges are condensed into one. This is necessary to keep the +/// number of phi values equal to the number of branches to SuccBB. 
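// Why runOnFunction above queues blocks in DeleteList instead of erasing
// them on the spot, as a standalone sketch: deleting while iterating would
// invalidate the iterator, so dead default blocks are condemned during the
// walk and swept afterwards (toy ownership model).
#include <algorithm>
#include <unordered_set>
#include <vector>

struct Block { bool Dead = false; };

static void processAll(std::vector<Block *> &Blocks) {
  std::unordered_set<Block *> DeleteList;
  for (Block *B : Blocks) // safe: no erasures happen during the walk
    if (!DeleteList.count(B) && B->Dead)
      DeleteList.insert(B);
  Blocks.erase(std::remove_if(Blocks.begin(), Blocks.end(),
                              [&](Block *B) { return DeleteList.count(B); }),
               Blocks.end());
  for (Block *B : DeleteList) // deferred sweep after iteration ends
    delete B;
}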
static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, unsigned NumMergedCases) { - for (BasicBlock::iterator I = SuccBB->begin(), IE = SuccBB->getFirstNonPHI(); + for (BasicBlock::iterator I = SuccBB->begin(), + IE = SuccBB->getFirstNonPHI()->getIterator(); I != IE; ++I) { PHINode *PN = cast<PHINode>(I); - // Only update the first occurence. + // Only update the first occurrence. unsigned Idx = 0, E = PN->getNumIncomingValues(); unsigned LocalNumMergedCases = NumMergedCases; for (; Idx != E; ++Idx) { @@ -173,7 +182,7 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } } - // Remove additional occurences coming from condensed cases and keep the + // Remove additional occurrences coming from condensed cases and keep the // number of incoming values equal to the number of branches to SuccBB. SmallVector<unsigned, 8> Indices; for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx) @@ -188,11 +197,11 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } } -// switchConvert - Convert the switch statement into a binary lookup of -// the case values. The function recursively builds this tree. -// LowerBound and UpperBound are used to keep track of the bounds for Val -// that have already been checked by a block emitted by one of the previous -// calls to switchConvert in the call stack. +/// Convert the switch statement into a binary lookup of the case values. +/// The function recursively builds this tree. LowerBound and UpperBound are +/// used to keep track of the bounds for Val that have already been checked by +/// a block emitted by one of the previous calls to switchConvert in the call +/// stack. BasicBlock * LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, ConstantInt *UpperBound, Value *Val, @@ -278,28 +287,24 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound, UpperBound, Val, NewNode, OrigBlock, Default, UnreachableRanges); - Function::iterator FI = OrigBlock; - F->getBasicBlockList().insert(++FI, NewNode); + F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode); NewNode->getInstList().push_back(Comp); BranchInst::Create(LBranch, RBranch, Comp, NewNode); return NewNode; } -// newLeafBlock - Create a new leaf block for the binary lookup tree. It -// checks if the switch's value == the case's value. If not, then it -// jumps to the default branch. At this point in the tree, the value -// can't be another valid case value, so the jump to the "default" branch -// is warranted. -// +/// Create a new leaf block for the binary lookup tree. It checks if the +/// switch's value == the case's value. If not, then it jumps to the default +/// branch. At this point in the tree, the value can't be another valid case +/// value, so the jump to the "default" branch is warranted. BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, BasicBlock* OrigBlock, BasicBlock* Default) { Function* F = OrigBlock->getParent(); BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock"); - Function::iterator FI = OrigBlock; - F->getBasicBlockList().insert(++FI, NewLeaf); + F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewLeaf); // Emit comparison ICmpInst* Comp = nullptr; @@ -352,7 +357,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, return NewLeaf; } -// Clusterify - Transform simple list of Cases into list of CaseRange's +/// Transform simple list of Cases into list of CaseRange's. 
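// What switchConvert builds, reduced to a standalone sketch: a balanced
// binary search over the sorted case ranges, with the LowerBound and
// UpperBound pruning mirrored here by the Lo/Hi recursion bounds.
#include <cstddef>
#include <vector>

struct Range { int Low, High, Dest; };

static int dispatch(const std::vector<Range> &Cases, std::size_t Lo,
                    std::size_t Hi, int Val, int DefaultDest) {
  if (Lo >= Hi)
    return DefaultDest;                 // fell off the tree: default
  std::size_t Mid = Lo + (Hi - Lo) / 2; // pivot case range
  if (Val < Cases[Mid].Low)
    return dispatch(Cases, Lo, Mid, Val, DefaultDest);     // left subtree
  if (Val > Cases[Mid].High)
    return dispatch(Cases, Mid + 1, Hi, Val, DefaultDest); // right subtree
  return Cases[Mid].Dest;               // leaf: Low <= Val <= High
}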
unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { unsigned numCmps = 0; @@ -394,10 +399,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { return numCmps; } -// processSwitchInst - Replace the specified switch instruction with a sequence -// of chained if-then insts in a balanced binary search. -// -void LowerSwitch::processSwitchInst(SwitchInst *SI) { +/// Replace the specified switch instruction with a sequence of chained if-then +/// insts in a balanced binary search. +void LowerSwitch::processSwitchInst(SwitchInst *SI, + SmallPtrSetImpl<BasicBlock*> &DeleteList) { BasicBlock *CurBlock = SI->getParent(); BasicBlock *OrigBlock = CurBlock; Function *F = CurBlock->getParent(); @@ -424,7 +429,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { std::vector<IntRange> UnreachableRanges; if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) { - // Make the bounds tightly fitted around the case value range, becase we + // Make the bounds tightly fitted around the case value range, because we // know that the value passed to the switch must be exactly one of the case // values. assert(!Cases.empty()); @@ -495,7 +500,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { // Create a new, empty default block so that the new hierarchy of // if-then statements go to this and the PHI nodes are happy. BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); - F->getBasicBlockList().insert(Default, NewDefault); + F->getBasicBlockList().insert(Default->getIterator(), NewDefault); BranchInst::Create(Default, NewDefault); // If there is an entry in any PHI nodes for the default edge, make sure @@ -518,7 +523,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { BasicBlock *OldDefault = SI->getDefaultDest(); CurBlock->getInstList().erase(SI); - // If the Default block has no more predecessors just remove it. + // If the Default block has no more predecessors just add it to DeleteList. 
if (pred_begin(OldDefault) == pred_end(OldDefault)) - DeleteDeadBlock(OldDefault); + DeleteList.insert(OldDefault); } diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 00cf4e6..aa1e35d 100644 --- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -63,6 +63,9 @@ bool PromotePass::runOnFunction(Function &F) { BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + bool Changed = false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 395a46b..c999bd0 100644 --- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -42,6 +42,24 @@ namespace { } }; + static const char *const metaNames[] = { + // See http://en.wikipedia.org/wiki/Metasyntactic_variable + "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", + "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" + }; + + struct Renamer { + Renamer(unsigned int seed) { + prng.srand(seed); + } + + const char *newName() { + return metaNames[prng.rand() % array_lengthof(metaNames)]; + } + + PRNG prng; + }; + struct MetaRenamer : public ModulePass { static char ID; // Pass identification, replacement for typeid MetaRenamer() : ModulePass(ID) { @@ -53,36 +71,26 @@ namespace { } bool runOnModule(Module &M) override { - static const char *const metaNames[] = { - // See http://en.wikipedia.org/wiki/Metasyntactic_variable - "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", - "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam" - }; - // Seed our PRNG with simple additive sum of ModuleID. We're looking to // simply avoid always having the same function names, and we need to // remain deterministic. 
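// The seeding scheme spelled out (standalone sketch of the loop just
// below): a plain additive hash of the module identifier, deterministic
// across runs of the same module but varying between modules.
#include <string>

static unsigned seedFromModuleId(const std::string &Id) {
  unsigned Seed = 0;
  for (char C : Id)
    Seed += static_cast<unsigned char>(C);
  return Seed;
}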
unsigned int randSeed = 0; - for (std::string::const_iterator I = M.getModuleIdentifier().begin(), - E = M.getModuleIdentifier().end(); I != E; ++I) - randSeed += *I; + for (auto C : M.getModuleIdentifier()) + randSeed += C; - PRNG prng; - prng.srand(randSeed); + Renamer renamer(randSeed); // Rename all aliases - for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end(); - AI != AE; ++AI) { + for (auto AI = M.alias_begin(), AE = M.alias_end(); AI != AE; ++AI) { StringRef Name = AI->getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; AI->setName("alias"); } - + // Rename all global variables - for (Module::global_iterator GI = M.global_begin(), GE = M.global_end(); - GI != GE; ++GI) { + for (auto GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) { StringRef Name = GI->getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; @@ -93,40 +101,37 @@ namespace { // Rename all struct types TypeFinder StructTypes; StructTypes.run(M, true); - for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { - StructType *STy = StructTypes[i]; + for (StructType *STy : StructTypes) { if (STy->isLiteral() || STy->getName().empty()) continue; SmallString<128> NameStorage; - STy->setName((Twine("struct.") + metaNames[prng.rand() % - array_lengthof(metaNames)]).toStringRef(NameStorage)); + STy->setName((Twine("struct.") + + renamer.newName()).toStringRef(NameStorage)); } // Rename all functions - for (Module::iterator FI = M.begin(), FE = M.end(); - FI != FE; ++FI) { - StringRef Name = FI->getName(); + for (auto &F : M) { + StringRef Name = F.getName(); if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) continue; - FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]); - runOnFunction(*FI); + F.setName(renamer.newName()); + runOnFunction(F); } return true; } bool runOnFunction(Function &F) { - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); - AI != AE; ++AI) + for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) if (!AI->getType()->isVoidTy()) AI->setName("arg"); - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - BB->setName("bb"); + for (auto &BB : F) { + BB.setName("bb"); - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (!I->getType()->isVoidTy()) - I->setName("tmp"); + for (auto &I : BB) + if (!I.getType()->isVoidTy()) + I.setName("tmp"); } return true; } diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp index d69a81e..9ec28a3 100644 --- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -43,9 +43,9 @@ static void appendToGlobalArray(const char *Array, } GVCtor->eraseFromParent(); } else { - // Use a simple two-field struct if there isn't one already. + // Use the new three-field struct if there isn't one already. EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), - nullptr); + IRB.getInt8PtrTy(), nullptr); } // Build a 2 or 3 field global_ctor entry. We don't take a comdat key. 
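The ModuleUtils hunk above switches freshly created llvm.global_ctors entries from the two-field {priority, function} form to a three-field form with a trailing i8* data slot. A sketch of building that element type against the 3.8-era varargs StructType::get the patch itself uses; this mirrors the patched appendToGlobalArray but is not a drop-in copy of it:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// The element type of the new-style llvm.global_ctors entry:
// { i32 priority, void ()* ctor, i8* associated data }.
static StructType *ctorEltTy(LLVMContext &C) {
  FunctionType *FnTy = FunctionType::get(Type::getVoidTy(C), false);
  return StructType::get(Type::getInt32Ty(C),          // priority
                         PointerType::getUnqual(FnTy), // ctor function
                         Type::getInt8PtrTy(C),        // associated data
                         nullptr);                     // vararg terminator
}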
@@ -107,7 +107,8 @@ Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) { std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions( Module &M, StringRef CtorName, StringRef InitName, - ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs) { + ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs, + StringRef VersionCheckName) { assert(!InitName.empty() && "Expected init function name"); assert(InitArgTypes.size() == InitArgTypes.size() && "Sanitizer's init function expects different number of arguments"); @@ -122,6 +123,13 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions( AttributeSet())); InitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(InitFunction, InitArgs); + if (!VersionCheckName.empty()) { + Function *VersionCheckFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false), + AttributeSet())); + IRB.CreateCall(VersionCheckFunction, {}); + } return std::make_pair(Ctor, InitFunction); } diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index a87f850..c4f9b9f 100644 --- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -205,10 +205,9 @@ public: // avoid gratuitus rescans. const BasicBlock *BB = I->getParent(); unsigned InstNo = 0; - for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end(); BBI != E; - ++BBI) - if (isInterestingInstruction(BBI)) - InstNumbers[BBI] = InstNo++; + for (const Instruction &BBI : *BB) + if (isInterestingInstruction(&BBI)) + InstNumbers[&BBI] = InstNo++; It = InstNumbers.find(I); assert(It != InstNumbers.end() && "Didn't insert instruction?"); @@ -402,8 +401,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, // Record debuginfo for the store and remove the declaration's // debuginfo. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB); DDI->eraseFromParent(); LBI.deleteValue(DDI); @@ -425,14 +423,17 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, /// using the Alloca. /// /// If we cannot promote this alloca (because it is read before it is written), -/// return true. This is necessary in cases where, due to control flow, the -/// alloca is potentially undefined on some control flow paths. e.g. code like -/// this is potentially correct: -/// -/// for (...) { if (c) { A = undef; undef = B; } } -/// -/// ... so long as A is not used before undef is set. -static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, +/// return false. This is necessary in cases where, due to control flow, the +/// alloca is undefined only on some control flow paths. e.g. code like +/// this is correct in LLVM IR: +/// // A is an alloca with no stores so far +/// for (...) { +/// int t = *A; +/// if (!first_iteration) +/// use(t); +/// *A = 42; +/// } +static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, AliasSetTracker *AST) { // The trickiest case to handle is when we have large blocks. 
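promoteSingleBlockAlloca's new bool result exists because the single-block fast path can now bail out: when no store precedes a load, the load may only be folded to undef if the block contains no stores at all, since a later store could reach the load around a loop. The hunk continuing below implements this with a store list sorted by instruction index; a standalone model of that lookup, using plain std types and illustrative names:

#include <algorithm>
#include <cstdio>
#include <iterator>
#include <utility>
#include <vector>

// (instruction index, stored value), kept sorted by index, mirroring
// the patch's StoresByIndex.
using StoreList = std::vector<std::pair<unsigned, int>>;

// Find what a load at LoadIdx would read. Mirrors the lower_bound plus
// "I == StoresByIndex.begin()" logic: no earlier store means we must
// bail out unless the block has no stores at all.
static bool valueAtLoad(const StoreList &Stores, unsigned LoadIdx, int &Out) {
  auto I = std::lower_bound(Stores.begin(), Stores.end(), LoadIdx,
                            [](const std::pair<unsigned, int> &S, unsigned Idx) {
                              return S.first < Idx;
                            });
  if (I == Stores.begin())
    return false; // load precedes every store: not safe to fold
  Out = std::prev(I)->second; // the closest store above the load wins
  return true;
}

int main() {
  StoreList S = {{5, 42}};
  int V = -1;
  std::printf("%d\n", valueAtLoad(S, 7, V)); // 1: load at index 7 sees 42
  std::printf("%d\n", valueAtLoad(S, 3, V)); // 0: store follows, bail out
}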
Because of this, @@ -467,10 +468,15 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)), less_first()); - - if (I == StoresByIndex.begin()) - // If there is no store before this load, the load takes the undef value. - LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + if (I == StoresByIndex.begin()) { + if (StoresByIndex.empty()) + // If there are no stores, the load takes the undef value. + LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + else + // There is no store before this load, bail out (load may be affected + // by the following stores - see main comment). + return false; + } else // Otherwise, there was a store before this load, the load takes its value. LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0)); @@ -486,8 +492,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, StoreInst *SI = cast<StoreInst>(AI->user_back()); // Record debuginfo for the store before removing it. if (DbgDeclareInst *DDI = Info.DbgDeclare) { - DIBuilder DIB(*AI->getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); ConvertDebugDeclareToDebugValue(DDI, SI, DIB); } SI->eraseFromParent(); @@ -506,6 +511,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, } ++NumLocalPromoted; + return true; } void PromoteMem2Reg::run() { @@ -557,9 +563,8 @@ void PromoteMem2Reg::run() { // If the alloca is only read and written in one basic block, just perform a // linear sweep over the block to eliminate it. - if (Info.OnlyUsedInOneBlock) { - promoteSingleBlockAlloca(AI, Info, LBI, AST); - + if (Info.OnlyUsedInOneBlock && + promoteSingleBlockAlloca(AI, Info, LBI, AST)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); continue; @@ -636,7 +641,7 @@ void PromoteMem2Reg::run() { // and inserting the phi nodes we marked as necessary // std::vector<RenamePassData> RenamePassWorkList; - RenamePassWorkList.emplace_back(F.begin(), nullptr, std::move(Values)); + RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values)); do { RenamePassData RPD; RPD.swap(RenamePassWorkList.back()); @@ -854,7 +859,7 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo, // BasicBlock. PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB), Allocas[AllocaNo]->getName() + "." 
+ Twine(Version++), - BB->begin()); + &BB->front()); ++NumPHIInsert; PhiToAllocaMap[PN] = AllocaNo; @@ -919,7 +924,7 @@ NextIteration: return; for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) { - Instruction *I = II++; // get the instruction, increment iterator + Instruction *I = &*II++; // get the instruction, increment iterator if (LoadInst *LI = dyn_cast<LoadInst>(I)) { AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand()); diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 36781c1..3125a2c 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -14,11 +14,13 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -43,7 +45,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <map> @@ -73,6 +74,22 @@ static cl::opt<bool> HoistCondStores( "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), cl::desc("Hoist conditional stores if an unconditional store precedes")); +static cl::opt<bool> MergeCondStores( + "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true), + cl::desc("Hoist conditional stores even if an unconditional store does not " + "precede - hoist multiple conditional stores into a single " + "predicated store")); + +static cl::opt<bool> MergeCondStoresAggressively( + "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false), + cl::desc("When merging conditional stores, do so even if the resultant " + "basic blocks are unlikely to be if-converted as a result")); + +static cl::opt<bool> SpeculateOneExpensiveInst( + "speculate-one-expensive-inst", cl::Hidden, cl::init(true), + cl::desc("Allow exactly one expensive instruction to be speculatively " + "executed")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -83,13 +100,13 @@ STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { // The first field contains the value that the switch produces when a certain - // case group is selected, and the second field is a vector containing the cases - // composing the case group. + // case group is selected, and the second field is a vector containing the + // cases composing the case group. typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2> SwitchCaseResultVectorTy; // The first field contains the phi node that generates a result of the switch - // and the second field contains the value generated for a certain case in the switch - // for that PHI. + // and the second field contains the value generated for a certain case in the + // switch for that PHI. 
typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy; /// ValueEqualityComparisonCase - Represents a case of a switch. @@ -124,6 +141,9 @@ class SimplifyCFGOpt { bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); + bool SimplifySingleResume(ResumeInst *RI); + bool SimplifyCommonResume(ResumeInst *RI); + bool SimplifyCleanupReturn(CleanupReturnInst *RI); bool SimplifyUnreachable(UnreachableInst *UI); bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); bool SimplifyIndirectBr(IndirectBrInst *IBI); @@ -226,6 +246,7 @@ static unsigned ComputeSpeculationCost(const User *I, "Instruction is not safe to speculatively execute!"); return TTI.getUserCost(I); } + /// If we have a merge point of an "if condition" as accepted above, /// return true if the specified value dominates the block. We /// don't handle the true generality of domination here, just a special case @@ -246,7 +267,8 @@ static unsigned ComputeSpeculationCost(const User *I, static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl<Instruction*> *AggressiveInsts, unsigned &CostRemaining, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + unsigned Depth = 0) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs @@ -284,15 +306,24 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, unsigned Cost = ComputeSpeculationCost(I, TTI); - if (Cost > CostRemaining) + // Allow exactly one instruction to be speculated regardless of its cost + // (as long as it is safe to do so). + // This is intended to flatten the CFG even if the instruction is a division + // or other expensive operation. The speculation of an expensive instruction + // is expected to be undone in CodeGenPrepare if the speculation has not + // enabled further IR optimizations. + if (Cost > CostRemaining && + (!SpeculateOneExpensiveInst || !AggressiveInsts->empty() || Depth > 0)) return false; - CostRemaining -= Cost; + // Avoid unsigned wrap. + CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost; // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI)) + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, + Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. AggressiveInsts->insert(I); @@ -970,8 +1001,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, at this point, we know which new successor Pred will get. Make // sure we update the number of entries in the PHI nodes for these // successors. - for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i) - AddPredecessorToBlock(NewSuccessors[i], Pred, BB); + for (BasicBlock *NewSuccessor : NewSuccessors) + AddPredecessorToBlock(NewSuccessor, Pred, BB); Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. 
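The DominatesMergePoint changes above add a recursion depth and a one-time exemption: with -speculate-one-expensive-inst, a single over-budget instruction is still admitted, but only at depth zero and only while nothing has been admitted yet, and the budget subtraction now saturates instead of wrapping. Distilled into a tiny model with illustrative names (Budget plays the role of CostRemaining):

#include <cstdio>

static bool admit(unsigned Cost, unsigned &Budget, unsigned Depth,
                  unsigned AdmittedSoFar, bool AllowOneExpensive) {
  if (Cost > Budget &&
      (!AllowOneExpensive || AdmittedSoFar != 0 || Depth > 0))
    return false; // over budget, and the one-expensive-inst credit is spent
  Budget = (Cost > Budget) ? 0 : Budget - Cost; // saturate, never wrap
  return true;
}

int main() {
  unsigned Budget = 4;
  std::printf("%d\n", admit(10, Budget, 0, 0, true)); // 1: the one freebie
  std::printf("%d\n", admit(10, Budget, 0, 1, true)); // 0: no second one
}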
@@ -984,8 +1015,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault, PredCases.size()); NewSI->setDebugLoc(PTI->getDebugLoc()); - for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); + for (ValueEqualityComparisonCase &V : PredCases) + NewSI->addCase(V.Value, V.Dest); if (PredHasWeights || SuccHasWeights) { // Halve the weights if any of them cannot fit in an uint32_t @@ -1059,15 +1090,15 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, BasicBlock::iterator BB1_Itr = BB1->begin(); BasicBlock::iterator BB2_Itr = BB2->begin(); - Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++; + Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++; // Skip debug info if it is not identical. DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1); DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2); if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { while (isa<DbgInfoIntrinsic>(I1)) - I1 = BB1_Itr++; + I1 = &*BB1_Itr++; while (isa<DbgInfoIntrinsic>(I2)) - I2 = BB2_Itr++; + I2 = &*BB2_Itr++; } if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) || (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))) @@ -1088,31 +1119,30 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, // For a normal instruction, we just move one to right before the branch, // then replace all uses of the other with the first. Finally, we remove // the now redundant second instruction. - BIParent->getInstList().splice(BI, BB1->getInstList(), I1); + BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1); if (!I2->use_empty()) I2->replaceAllUsesWith(I1); I1->intersectOptionalDataWith(I2); unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull - }; + LLVMContext::MD_tbaa, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_nonnull, LLVMContext::MD_invariant_group, + LLVMContext::MD_align, LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null}; combineMetadata(I1, I2, KnownIDs); I2->eraseFromParent(); Changed = true; - I1 = BB1_Itr++; - I2 = BB2_Itr++; + I1 = &*BB1_Itr++; + I2 = &*BB2_Itr++; // Skip debug info if it is not identical. DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1); DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2); if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) { while (isa<DbgInfoIntrinsic>(I1)) - I1 = BB1_Itr++; + I1 = &*BB1_Itr++; while (isa<DbgInfoIntrinsic>(I2)) - I2 = BB2_Itr++; + I2 = &*BB2_Itr++; } } while (I1->isIdenticalToWhenDefined(I2)); @@ -1147,7 +1177,7 @@ HoistTerminator: // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); - BIParent->getInstList().insert(BI, NT); + BIParent->getInstList().insert(BI->getIterator(), NT); if (!NT->getType()->isVoidTy()) { I1->replaceAllUsesWith(NT); I2->replaceAllUsesWith(NT); @@ -1265,7 +1295,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // Cannot move control-flow-involving, volatile loads, vaarg, etc. 
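For context on the HoistThenElseCodeToIf hunks above: the loop walks both successors in lockstep, splices one copy of each identical leading instruction up into the parent block, and redirects all uses of the duplicate to it. A deliberately simplified source-level model of the effect (its counterpart, SinkThenElseCodeToEnd, continues with its safety checks just below):

#include <cstdio>

static int g(int X) { return X + 1; } // stands in for the identical leading inst

// Before: both arms begin with the same computation on the same operands.
static int beforeHoist(bool C, int X) { return C ? g(X) * 2 : g(X) * 3; }

// After: one copy is hoisted above the branch, the duplicate deleted.
static int afterHoist(bool C, int X) {
  int T = g(X); // hoisted once
  return C ? T * 2 : T * 3;
}

int main() { std::printf("%d %d\n", beforeHoist(true, 1), afterHoist(true, 1)); }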
if (isa<PHINode>(I1) || isa<PHINode>(I2) || isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) || - isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) || + I1->isEHPad() || I2->isEHPad() || isa<AllocaInst>(I1) || isa<AllocaInst>(I2) || I1->mayHaveSideEffects() || I2->mayHaveSideEffects() || I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() || @@ -1324,7 +1354,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { if (!NewPN) { NewPN = PHINode::Create(DifferentOp1->getType(), 2, - DifferentOp1->getName() + ".sink", BBEnd->begin()); + DifferentOp1->getName() + ".sink", &BBEnd->front()); NewPN->addIncoming(DifferentOp1, BB1); NewPN->addIncoming(DifferentOp2, BB2); DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";); @@ -1339,7 +1369,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // instruction in the basic block down. bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin()); // Sink the instruction. - BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1); + BBEnd->getInstList().splice(FirstNonPhiInBBEnd->getIterator(), + BB1->getInstList(), I1); if (!OldPN->use_empty()) OldPN->replaceAllUsesWith(I1); OldPN->eraseFromParent(); @@ -1355,7 +1386,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { RE1 = BB1->getInstList().rend(); if (UpdateRE2) RE2 = BB2->getInstList().rend(); - FirstNonPhiInBBEnd = I1; + FirstNonPhiInBBEnd = &*I1; NumSinkCommons++; Changed = true; } @@ -1491,7 +1522,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, for (BasicBlock::iterator BBI = ThenBB->begin(), BBE = std::prev(ThenBB->end()); BBI != BBE; ++BBI) { - Instruction *I = BBI; + Instruction *I = &*BBI; // Skip debug info. if (isa<DbgInfoIntrinsic>(I)) continue; @@ -1604,9 +1635,14 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, SpeculatedStore->setOperand(0, S); } + // Metadata can be dependent on the condition we are hoisting above. + // Conservatively strip all metadata on the instruction. + for (auto &I: *ThenBB) + I.dropUnknownNonDebugMetadata(); + // Hoist the instructions. - BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), - std::prev(ThenBB->end())); + BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(), + ThenBB->begin(), std::prev(ThenBB->end())); // Insert selects and rewrite the PHI operands. IRBuilder<true, NoFolder> Builder(BI); @@ -1747,13 +1783,13 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) { // Check for trivial simplification. if (Value *V = SimplifyInstruction(N, DL)) { - TranslateMap[BBI] = V; + TranslateMap[&*BBI] = V; delete N; // Instruction folded away, don't need actual inst } else { // Insert the new instruction into its new home. EdgeBB->getInstList().insert(InsertPt, N); if (!BBI->use_empty()) - TranslateMap[BBI] = N; + TranslateMap[&*BBI] = N; } } @@ -1850,7 +1886,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } else { DomBlock = *pred_begin(IfBlock1); for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I) - if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { + if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. // Because of this, we won't be able to get rid of the control // flow, so the xform is not worth it. 
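SinkThenElseCodeToEnd, touched above, is the mirror image of hoisting: identical trailing instructions from both arms are sunk into the join block, and any operand they disagree on is merged through a new PHI (named "*.sink" in the patch). A simplified model of the result, with the ternary playing the role of that PHI:

#include <cstdio>

static int f(int X) { return X * 2; } // the identical trailing instruction

static int joinBlock(bool CameFromBB1, int Op1, int Op2) {
  int Sunk = CameFromBB1 ? Op1 : Op2; // PHI for the differing operand
  return f(Sunk);                     // single sunk instruction
}

int main() { std::printf("%d\n", joinBlock(true, 3, 4)); } // 6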
@@ -1863,7 +1899,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } else { DomBlock = *pred_begin(IfBlock2); for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I) - if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) { + if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. // Because of this, we won't be able to get rid of the control // flow, so the xform is not worth it. @@ -1882,13 +1918,13 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // Move all 'aggressive' instructions, which are defined in the // conditional parts of the if's up to the dominating block. if (IfBlock1) - DomBlock->getInstList().splice(InsertPt, + DomBlock->getInstList().splice(InsertPt->getIterator(), IfBlock1->getInstList(), IfBlock1->begin(), - IfBlock1->getTerminator()); + IfBlock1->getTerminator()->getIterator()); if (IfBlock2) - DomBlock->getInstList().splice(InsertPt, + DomBlock->getInstList().splice(InsertPt->getIterator(), IfBlock2->getInstList(), IfBlock2->begin(), - IfBlock2->getTerminator()); + IfBlock2->getTerminator()->getIterator()); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { // Change the PHI node into a select instruction. @@ -2057,7 +2093,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { BI->getSuccessor(0) == PBI->getSuccessor(1))) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Curr = I++; + Instruction *Curr = &*I++; if (isa<CmpInst>(Curr)) { Cond = Curr; break; @@ -2077,7 +2113,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { return false; // Make sure the instruction after the condition is the cond branch. - BasicBlock::iterator CondIt = Cond; ++CondIt; + BasicBlock::iterator CondIt = ++Cond->getIterator(); // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt; @@ -2095,7 +2131,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(I)) continue; - if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I)) + if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I)) return false; // I has only one use and can be executed unconditionally. Instruction *User = dyn_cast<Instruction>(I->user_back()); @@ -2192,17 +2228,17 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { Instruction *NewBonusInst = BonusInst->clone(); RemapInstruction(NewBonusInst, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); - VMap[BonusInst] = NewBonusInst; + VMap[&*BonusInst] = NewBonusInst; // If we moved a load, we cannot any longer claim any knowledge about // its potential value. The previous information might have been valid // only given the branch precondition. // For an analogous reason, we must also drop all the metadata whose // semantics we don't understand. 
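The replacement just below swaps dropUnknownMetadata(LLVMContext::MD_dbg) for dropUnknownNonDebugMetadata(): a cloned bonus instruction executes under a different condition than the original, so metadata the pass cannot re-justify (tbaa, range, nonnull, ...) must be stripped while debug info is kept. A sketch of that clone-then-strip step, assuming a 3.8-era LLVM; the helper name is illustrative and the real code additionally remaps operands through a ValueToValueMapTy:

#include "llvm/IR/Instruction.h"
using namespace llvm;

static Instruction *cloneForNewPredicate(Instruction *I,
                                         Instruction *InsertBefore) {
  Instruction *NewI = I->clone();
  NewI->insertBefore(InsertBefore);
  NewI->dropUnknownNonDebugMetadata(); // as in the patched code below
  return NewI;
}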
- NewBonusInst->dropUnknownMetadata(LLVMContext::MD_dbg); + NewBonusInst->dropUnknownNonDebugMetadata(); - PredBlock->getInstList().insert(PBI, NewBonusInst); - NewBonusInst->takeName(BonusInst); + PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst); + NewBonusInst->takeName(&*BonusInst); BonusInst->setName(BonusInst->getName() + ".old"); } @@ -2211,7 +2247,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { Instruction *New = Cond->clone(); RemapInstruction(New, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); - PredBlock->getInstList().insert(PBI, New); + PredBlock->getInstList().insert(PBI->getIterator(), New); New->takeName(Cond); Cond->setName(New->getName() + ".old"); @@ -2332,11 +2368,297 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { return false; } +// If there is only one store in BB1 and BB2, return it, otherwise return +// nullptr. +static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) { + StoreInst *S = nullptr; + for (auto *BB : {BB1, BB2}) { + if (!BB) + continue; + for (auto &I : *BB) + if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (S) + // Multiple stores seen. + return nullptr; + else + S = SI; + } + } + return S; +} + +static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, + Value *AlternativeV = nullptr) { + // PHI is going to be a PHI node that allows the value V that is defined in + // BB to be referenced in BB's only successor. + // + // If AlternativeV is nullptr, the only value we care about in PHI is V. It + // doesn't matter to us what the other operand is (it'll never get used). We + // could just create a new PHI with an undef incoming value, but that could + // increase register pressure if EarlyCSE/InstCombine can't fold it with some + // other PHI. So here we directly look for some PHI in BB's successor with V + // as an incoming operand. If we find one, we use it, else we create a new + // one. + // + // If AlternativeV is not nullptr, we care about both incoming values in PHI. + // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV] + // where OtherBB is the single other predecessor of BB's only successor. + PHINode *PHI = nullptr; + BasicBlock *Succ = BB->getSingleSuccessor(); + + for (auto I = Succ->begin(); isa<PHINode>(I); ++I) + if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) { + PHI = cast<PHINode>(I); + if (!AlternativeV) + break; + + assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2); + auto PredI = pred_begin(Succ); + BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI; + if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) + break; + PHI = nullptr; + } + if (PHI) + return PHI; + + // If V is not an instruction defined in BB, just return it. + if (!AlternativeV && + (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB)) + return V; + + PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front()); + PHI->addIncoming(V, BB); + for (BasicBlock *PredBB : predecessors(Succ)) + if (PredBB != BB) + PHI->addIncoming(AlternativeV ? 
AlternativeV : UndefValue::get(V->getType()), + PredBB); + return PHI; +} + +static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, + BasicBlock *QTB, BasicBlock *QFB, + BasicBlock *PostBB, Value *Address, + bool InvertPCond, bool InvertQCond) { + auto IsaBitcastOfPointerType = [](const Instruction &I) { + return Operator::getOpcode(&I) == Instruction::BitCast && + I.getType()->isPointerTy(); + }; + + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [&](BasicBlock *BB) { + if (!BB) + return true; + // Heuristic: if the block can be if-converted/phi-folded and the + // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to + // thread this store. + unsigned N = 0; + for (auto &I : *BB) { + // Cheap instructions viable for folding. + if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || + isa<StoreInst>(I)) + ++N; + // Free instructions. + else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || + IsaBitcastOfPointerType(I)) + continue; + else + return false; + } + return N <= PHINodeFoldingThreshold; + }; + + if (!MergeCondStoresAggressively && (!IsWorthwhile(PTB) || + !IsWorthwhile(PFB) || + !IsWorthwhile(QTB) || + !IsWorthwhile(QFB))) + return false; + + // For every pointer, there must be exactly two stores, one coming from + // PTB or PFB, and the other from QTB or QFB. We don't support more than one + // store (to any address) in PTB,PFB or QTB,QFB. + // FIXME: We could relax this restriction with a bit more work and performance + // testing. + StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB); + StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB); + if (!PStore || !QStore) + return false; + + // Now check the stores are compatible. + if (!QStore->isUnordered() || !PStore->isUnordered()) + return false; + + // Check that sinking the store won't cause program behavior changes. Sinking + // the store out of the Q blocks won't change any behavior as we're sinking + // from a block to its unconditional successor. But we're moving a store from + // the P blocks down through the middle block (QBI) and past both QFB and QTB. + // So we need to check that there are no aliasing loads or stores in + // QBI, QTB and QFB. We also need to check there are no conflicting memory + // operations between PStore and the end of its parent block. + // + // The ideal way to do this is to query AliasAnalysis, but we don't + // preserve AA currently so that is dangerous. Be super safe and just + // check there are no other memory operations at all. + for (auto &I : *QFB->getSinglePredecessor()) + if (I.mayReadOrWriteMemory()) + return false; + for (auto &I : *QFB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + if (QTB) + for (auto &I : *QTB) + if (&I != QStore && I.mayReadOrWriteMemory()) + return false; + for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end(); + I != E; ++I) + if (&*I != PStore && I->mayReadOrWriteMemory()) + return false; + + // OK, we're going to sink the stores to PostBB. The store has to be + // conditional though, so first create the predicate. 
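Once the checks above pass, the code below predicates one merged store: it ORs the two (possibly inverted) branch conditions, splits PostBB with SplitBlockAndInsertIfThen, and stores the PHI-selected value. The observable semantics, reduced to scalars (an illustrative model, not the pass itself):

#include <cstdio>

// Both conditional stores to Addr collapse into one store guarded by
// (P || Q). Q's store executes later in the original program, so its
// value wins when both conditions fire.
static void mergedStore(bool P, bool Q, int PVal, int QVal, int *Addr) {
  if (P || Q)
    *Addr = Q ? QVal : PVal;
}

int main() {
  int X = 0;
  mergedStore(/*P=*/true, /*Q=*/true, 1, 2, &X);
  std::printf("%d\n", X); // 2: the later (Q) store wins
}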
+ Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator()) + ->getCondition(); + + Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(), + PStore->getParent()); + Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(), + QStore->getParent(), PPHI); + + IRBuilder<> QB(&*PostBB->getFirstInsertionPt()); + + Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond); + Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond); + + if (InvertPCond) + PPred = QB.CreateNot(PPred); + if (InvertQCond) + QPred = QB.CreateNot(QPred); + Value *CombinedPred = QB.CreateOr(PPred, QPred); + + auto *T = + SplitBlockAndInsertIfThen(CombinedPred, &*QB.GetInsertPoint(), false); + QB.SetInsertPoint(T); + StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address)); + AAMDNodes AAMD; + PStore->getAAMetadata(AAMD, /*Merge=*/false); + PStore->getAAMetadata(AAMD, /*Merge=*/true); + SI->setAAMetadata(AAMD); + + QStore->eraseFromParent(); + PStore->eraseFromParent(); + + return true; +} + +static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { + // The intention here is to find diamonds or triangles (see below) where each + // conditional block contains a store to the same address. Both of these + // stores are conditional, so they can't be unconditionally sunk. But it may + // be profitable to speculatively sink the stores into one merged store at the + // end, and predicate the merged store on the union of the two conditions of + // PBI and QBI. + // + // This can reduce the number of stores executed if both of the conditions are + // true, and can allow the blocks to become small enough to be if-converted. + // This optimization will also chain, so that ladders of test-and-set + // sequences can be if-converted away. + // + // We only deal with simple diamonds or triangles: + // + // PBI or PBI or a combination of the two + // / \ | \ + // PTB PFB | PFB + // \ / | / + // QBI QBI + // / \ | \ + // QTB QFB | QFB + // \ / | / + // PostBB PostBB + // + // We model triangles as a type of diamond with a nullptr "true" block. + // Triangles are canonicalized so that the fallthrough edge is represented by + // a true condition, as in the diagram above. + // + BasicBlock *PTB = PBI->getSuccessor(0); + BasicBlock *PFB = PBI->getSuccessor(1); + BasicBlock *QTB = QBI->getSuccessor(0); + BasicBlock *QFB = QBI->getSuccessor(1); + BasicBlock *PostBB = QFB->getSingleSuccessor(); + + bool InvertPCond = false, InvertQCond = false; + // Canonicalize fallthroughs to the true branches. + if (PFB == QBI->getParent()) { + std::swap(PFB, PTB); + InvertPCond = true; + } + if (QFB == PostBB) { + std::swap(QFB, QTB); + InvertQCond = true; + } + + // From this point on we can assume PTB or QTB may be fallthroughs but PFB + // and QFB may not. Model fallthroughs as a nullptr block. + if (PTB == QBI->getParent()) + PTB = nullptr; + if (QTB == PostBB) + QTB = nullptr; + + // Legality bailouts. We must have at least the non-fallthrough blocks and + // the post-dominating block, and the non-fallthroughs must only have one + // predecessor. 
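The canonicalization above keeps the rest of mergeConditionalStores simple: triangles are modeled as diamonds with a null true block, and whenever a fallthrough sits on the false edge the successors are swapped and the matching Invert flag is set so the store predicate can be negated later. In miniature (standalone model, illustrative names):

#include <cstdio>
#include <utility>

struct Diamond {
  int TB, FB;  // successor block ids; 0 models the fallthrough edge
  bool Invert; // must the branch condition be negated later?
};

static void canonicalize(Diamond &D) {
  if (D.FB == 0) { // fallthrough on the false edge: swap and invert
    std::swap(D.TB, D.FB);
    D.Invert = true;
  }
}

int main() {
  Diamond D{7, 0, false};
  canonicalize(D);
  std::printf("%d %d %d\n", D.TB, D.FB, D.Invert); // 0 7 1
}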
+ auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) { + return BB->getSinglePredecessor() == P && + BB->getSingleSuccessor() == S; + }; + if (!PostBB || + !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || + !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB)) + return false; + if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) || + (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB))) + return false; + if (PostBB->getNumUses() != 2 || QBI->getParent()->getNumUses() != 2) + return false; + + // OK, this is a sequence of two diamonds or triangles. + // Check if there are stores in PTB or PFB that are repeated in QTB or QFB. + SmallPtrSet<Value *,4> PStoreAddresses, QStoreAddresses; + for (auto *BB : {PTB, PFB}) { + if (!BB) + continue; + for (auto &I : *BB) + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) + PStoreAddresses.insert(SI->getPointerOperand()); + } + for (auto *BB : {QTB, QFB}) { + if (!BB) + continue; + for (auto &I : *BB) + if (StoreInst *SI = dyn_cast<StoreInst>(&I)) + QStoreAddresses.insert(SI->getPointerOperand()); + } + + set_intersect(PStoreAddresses, QStoreAddresses); + // set_intersect mutates PStoreAddresses in place. Rename it here to make it + // clear what it contains. + auto &CommonAddresses = PStoreAddresses; + + bool Changed = false; + for (auto *Address : CommonAddresses) + Changed |= mergeConditionalStoreToAddress( + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond); + return Changed; +} + /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. -static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { +static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, + const DataLayout &DL) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -2360,10 +2682,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // simplifycfg will thread the block. if (BlockIsSimpleEnoughToThreadThrough(BB)) { pred_iterator PB = pred_begin(BB), PE = pred_end(BB); - PHINode *NewPN = PHINode::Create(Type::getInt1Ty(BB->getContext()), - std::distance(PB, PE), - BI->getCondition()->getName() + ".pr", - BB->begin()); + PHINode *NewPN = PHINode::Create( + Type::getInt1Ty(BB->getContext()), std::distance(PB, PE), + BI->getCondition()->getName() + ".pr", &BB->front()); // Okay, we're going to insert the PHI node. Since PBI is not the only // predecessor, compute the PHI'd conditional value for all of the preds. // Any predecessor where the condition is not computable we keep symbolic. @@ -2386,6 +2707,29 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { } } + if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition())) + if (CE->canTrap()) + return false; + + // If BI is reached from the true path of PBI and PBI's condition implies + // BI's condition, we know the direction of the BI branch. + if (PBI->getSuccessor(0) == BI->getParent() && + isImpliedCondition(PBI->getCondition(), BI->getCondition(), DL) && + PBI->getSuccessor(0) != PBI->getSuccessor(1) && + BB->getSinglePredecessor()) { + // Turn this into a branch on constant. 
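The branch-on-constant rewrite continues just below: when BI is only reachable over PBI's true edge and isImpliedCondition proves PBI's condition implies BI's, BI's condition is replaced with true and the old condition deleted as dead. The effect in source terms, using (x > 10) implies (x > 5) as the example:

#include <cstdio>

static int nested(int X) {
  if (X > 10) {
    if (X > 5) // implied: becomes br true, the else path is statically dead
      return 1;
    return 0;
  }
  return -1;
}

int main() { std::printf("%d\n", nested(42)); } // 1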
+ auto *OldCond = BI->getCondition(); + BI->setCondition(ConstantInt::getTrue(BB->getContext())); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + return true; // Nuke the branch on constant. + } + + // If both branches are conditional and both contain stores to the same + // address, remove the stores from the conditionals and create a conditional + // merged store at the end. + if (MergeCondStores && mergeConditionalStores(PBI, BI)) + return true; + // If this is a conditional branch in an empty block, and if any // predecessors are a conditional branch to one of our destinations, // fold the conditions into logical ops and one cond br. @@ -2396,11 +2740,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { if (&*BBI != BI) return false; - - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition())) - if (CE->canTrap()) - return false; - int PBIOp, BIOp; if (PBI->getSuccessor(0) == BI->getSuccessor(0)) PBIOp = BIOp = 0; @@ -2565,15 +2904,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; // Then remove the rest. - for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) { - BasicBlock *Succ = OldTerm->getSuccessor(I); + for (BasicBlock *Succ : OldTerm->successors()) { // Make sure only to keep exactly one copy of each edge. if (Succ == KeepEdge1) KeepEdge1 = nullptr; else if (Succ == KeepEdge2) KeepEdge2 = nullptr; else - Succ->removePredecessor(OldTerm->getParent()); + Succ->removePredecessor(OldTerm->getParent(), + /*DontDeleteUselessPHIs=*/true); } IRBuilder<> Builder(OldTerm); @@ -2827,7 +3166,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, Values.erase(std::unique(Values.begin(), Values.end()), Values.end()); // If Extra was used, we require at least two switch values to do the - // transformation. A switch with one value is just an cond branch. + // transformation. A switch with one value is just a conditional branch. if (ExtraCase && Values.size() < 2) return false; // TODO: Preserve branch weight metadata, similarly to how @@ -2847,7 +3186,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, // then we evaluate them with an explicit branch first. Split the block // right before the condbr to handle it. if (ExtraCase) { - BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test"); + BasicBlock *NewBB = + BB->splitBasicBlock(BI->getIterator(), "switch.early.test"); // Remove the uncond branch added to the old block. TerminatorInst *OldTI = BB->getTerminator(); Builder.SetInsertPoint(OldTI); @@ -2901,47 +3241,233 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, } bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { - // If this is a trivial landing pad that just continues unwinding the caught - // exception then zap the landing pad, turning its invokes into calls. + if (isa<PHINode>(RI->getValue())) + return SimplifyCommonResume(RI); + else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) && + RI->getValue() == RI->getParent()->getFirstNonPHI()) + // The resume must unwind the exception that caused control to branch here. + return SimplifySingleResume(RI); + + return false; +} + +// Simplify resume that is shared by several landing pads (phi of landing pad). 
+bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { + BasicBlock *BB = RI->getParent(); + + // Check that there are no other instructions except for debug intrinsics + // between the phi of landing pads (RI->getValue()) and resume instruction. + BasicBlock::iterator I = cast<Instruction>(RI->getValue())->getIterator(), + E = RI->getIterator(); + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) + return false; + + SmallSet<BasicBlock *, 4> TrivialUnwindBlocks; + auto *PhiLPInst = cast<PHINode>(RI->getValue()); + + // Check incoming blocks to see if any of them are trivial. + for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); + Idx != End; Idx++) { + auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx); + auto *IncomingValue = PhiLPInst->getIncomingValue(Idx); + + // If the block has other successors, we can not delete it because + // it has other dependents. + if (IncomingBB->getUniqueSuccessor() != BB) + continue; + + auto *LandingPad = + dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI()); + // Not the landing pad that caused the control to branch here. + if (IncomingValue != LandingPad) + continue; + + bool isTrivial = true; + + I = IncomingBB->getFirstNonPHI()->getIterator(); + E = IncomingBB->getTerminator()->getIterator(); + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) { + isTrivial = false; + break; + } + + if (isTrivial) + TrivialUnwindBlocks.insert(IncomingBB); + } + + // If no trivial unwind blocks, don't do any simplifications. + if (TrivialUnwindBlocks.empty()) return false; + + // Turn all invokes that unwind here into calls. + for (auto *TrivialBB : TrivialUnwindBlocks) { + // Blocks that will be simplified should be removed from the phi node. + // Note there could be multiple edges to the resume block, and we need + // to remove them all. + while (PhiLPInst->getBasicBlockIndex(TrivialBB) != -1) + BB->removePredecessor(TrivialBB, true); + + for (pred_iterator PI = pred_begin(TrivialBB), PE = pred_end(TrivialBB); + PI != PE;) { + BasicBlock *Pred = *PI++; + removeUnwindEdge(Pred); + } + + // In each SimplifyCFG run, only the current processed block can be erased. + // Otherwise, it will break the iteration of SimplifyCFG pass. So instead + // of erasing TrivialBB, we only remove the branch to the common resume + // block so that we can later erase the resume block since it has no + // predecessors. + TrivialBB->getTerminator()->eraseFromParent(); + new UnreachableInst(RI->getContext(), TrivialBB); + } + + // Delete the resume block if all its predecessors have been removed. + if (pred_empty(BB)) + BB->eraseFromParent(); + + return !TrivialUnwindBlocks.empty(); +} + +// Simplify resume that is only used by a single (non-phi) landing pad. +bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()); - if (RI->getValue() != LPInst) - // Not a landing pad, or the resume is not unwinding the exception that - // caused control to branch here. - return false; + assert (RI->getValue() == LPInst && + "Resume must unwind the exception that caused control to here"); // Check that there are no other instructions except for debug intrinsics. - BasicBlock::iterator I = LPInst, E = RI; + BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator(); while (++I != E) if (!isa<DbgInfoIntrinsic>(I)) return false; // Turn all invokes that unwind here into calls and delete the basic block. 
for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { - InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - // Insert a call instruction before the invoke. - CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); - Call->takeName(II); - Call->setCallingConv(II->getCallingConv()); - Call->setAttributes(II->getAttributes()); - Call->setDebugLoc(II->getDebugLoc()); + BasicBlock *Pred = *PI++; + removeUnwindEdge(Pred); + } - // Anything that used the value produced by the invoke instruction now uses - // the value produced by the call instruction. Note that we do this even - // for void functions and calls with no uses so that the callgraph edge is - // updated. - II->replaceAllUsesWith(Call); - BB->removePredecessor(II->getParent()); + // The landingpad is now unreachable. Zap it. + BB->eraseFromParent(); + return true; +} - // Insert a branch to the normal destination right before the invoke. - BranchInst::Create(II->getNormalDest(), II); +bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { + // If this is a trivial cleanup pad that executes no instructions, it can be + // eliminated. If the cleanup pad continues to the caller, any predecessor + // that is an EH pad will be updated to continue to the caller and any + // predecessor that terminates with an invoke instruction will have its invoke + // instruction converted to a call instruction. If the cleanup pad being + // simplified does not continue to the caller, each predecessor will be + // updated to continue to the unwind destination of the cleanup pad being + // simplified. + BasicBlock *BB = RI->getParent(); + CleanupPadInst *CPInst = RI->getCleanupPad(); + if (CPInst->getParent() != BB) + // This isn't an empty cleanup. + return false; - // Finally, delete the invoke instruction! - II->eraseFromParent(); + // Check that there are no other instructions except for debug intrinsics. + BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator(); + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) + return false; + + // If the cleanup return we are simplifying unwinds to the caller, this will + // set UnwindDest to nullptr. + BasicBlock *UnwindDest = RI->getUnwindDest(); + Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr; + + // We're about to remove BB from the control flow. Before we do, sink any + // PHINodes into the unwind destination. Doing this before changing the + // control flow avoids some potentially slow checks, since we can currently + // be certain that UnwindDest and BB have no common predecessors (since they + // are both EH pads). + if (UnwindDest) { + // First, go through the PHI nodes in UnwindDest and update any nodes that + // reference the block we are removing + for (BasicBlock::iterator I = UnwindDest->begin(), + IE = DestEHPad->getIterator(); + I != IE; ++I) { + PHINode *DestPN = cast<PHINode>(I); + + int Idx = DestPN->getBasicBlockIndex(BB); + // Since BB unwinds to UnwindDest, it has to be in the PHI node. + assert(Idx != -1); + // This PHI node has an incoming value that corresponds to a control + // path through the cleanup pad we are removing. If the incoming + // value is in the cleanup pad, it must be a PHINode (because we + // verified above that the block is otherwise empty). Otherwise, the + // value is either a constant or a value that dominates the cleanup + // pad being removed. 
+ // + // Because BB and UnwindDest are both EH pads, all of their + // predecessors must unwind to these blocks, and since no instruction + // can have multiple unwind destinations, there will be no overlap in + // incoming blocks between SrcPN and DestPN. + Value *SrcVal = DestPN->getIncomingValue(Idx); + PHINode *SrcPN = dyn_cast<PHINode>(SrcVal); + + // Remove the entry for the block we are deleting. + DestPN->removeIncomingValue(Idx, false); + + if (SrcPN && SrcPN->getParent() == BB) { + // If the incoming value was a PHI node in the cleanup pad we are + // removing, we need to merge that PHI node's incoming values into + // DestPN. + for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues(); + SrcIdx != SrcE; ++SrcIdx) { + DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx), + SrcPN->getIncomingBlock(SrcIdx)); + } + } else { + // Otherwise, the incoming value came from above BB and + // so we can just reuse it. We must associate all of BB's + // predecessors with this value. + for (auto *pred : predecessors(BB)) { + DestPN->addIncoming(SrcVal, pred); + } + } + } + + // Sink any remaining PHI nodes directly into UnwindDest. + Instruction *InsertPt = DestEHPad; + for (BasicBlock::iterator I = BB->begin(), + IE = BB->getFirstNonPHI()->getIterator(); + I != IE;) { + // The iterator must be incremented here because the instructions are + // being moved to another block. + PHINode *PN = cast<PHINode>(I++); + if (PN->use_empty()) + // If the PHI node has no uses, just leave it. It will be erased + // when we erase BB below. + continue; + + // Otherwise, sink this PHI node into UnwindDest. + // Any predecessors to UnwindDest which are not already represented + // must be back edges which inherit the value from the path through + // BB. In this case, the PHI value must reference itself. + for (auto *pred : predecessors(UnwindDest)) + if (pred != BB) + PN->addIncoming(PN, pred); + PN->moveBefore(InsertPt); + } } - // The landingpad is now unreachable. Zap it. + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { + // The iterator must be updated here because we are removing this pred. + BasicBlock *PredBB = *PI++; + if (UnwindDest == nullptr) { + removeUnwindEdge(PredBB); + } else { + TerminatorInst *TI = PredBB->getTerminator(); + TI->replaceUsesOfWith(BB, UnwindDest); + } + } + + // The cleanup pad is now unreachable. Zap it. BB->eraseFromParent(); return true; } @@ -3003,8 +3529,8 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // If there are any instructions immediately before the unreachable that can // be removed, do so. 
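The sweep that follows walks backwards from the unreachable, deleting instructions whose effects can no longer be observed; calls and volatile accesses stop the scan, while fences and non-volatile atomics may still be dropped (and, per the new code, a catchpad also stops it unless the personality is CoreCLR). A simplified model of the loop:

#include <cstdio>
#include <vector>

enum Kind { Arithmetic, Fence, NonVolatileAtomic, VolatileAccess, Call };

static void pruneBeforeUnreachable(std::vector<Kind> &Block) {
  while (!Block.empty()) {
    Kind K = Block.back();
    if (K == Call || K == VolatileAccess)
      break; // observable effects keep everything above alive
    Block.pop_back(); // result unused and the end is unreachable: delete
  }
}

int main() {
  std::vector<Kind> BB{Call, Arithmetic, Fence, NonVolatileAtomic};
  pruneBeforeUnreachable(BB);
  std::printf("%zu\n", BB.size()); // 1: only the call survives
}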
- while (UI != BB->begin()) { - BasicBlock::iterator BBI = UI; + while (UI->getIterator() != BB->begin()) { + BasicBlock::iterator BBI = UI->getIterator(); --BBI; // Do not delete instructions that can have side effects which might cause // the unreachable to not be reachable; specifically, calls and volatile @@ -3012,18 +3538,26 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break; if (BBI->mayHaveSideEffects()) { - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + if (auto *SI = dyn_cast<StoreInst>(BBI)) { if (SI->isVolatile()) break; - } else if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { + } else if (auto *LI = dyn_cast<LoadInst>(BBI)) { if (LI->isVolatile()) break; - } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(BBI)) { + } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) { if (RMWI->isVolatile()) break; - } else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) { + } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) { if (CXI->isVolatile()) break; + } else if (isa<CatchPadInst>(BBI)) { + // A catchpad may invoke exception object constructors and such, which + // in some languages can be arbitrary code, so be conservative by + // default. + // For CoreCLR, it just involves a type test, so can be removed. + if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) != + EHPersonality::CoreCLR) + break; } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) && !isa<LandingPadInst>(BBI)) { break; @@ -3049,7 +3583,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { for (unsigned i = 0, e = Preds.size(); i != e; ++i) { TerminatorInst *TI = Preds[i]->getTerminator(); IRBuilder<> Builder(TI); - if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (auto *BI = dyn_cast<BranchInst>(TI)) { if (BI->isUnconditional()) { if (BI->getSuccessor(0) == BB) { new UnreachableInst(TI->getContext(), TI); @@ -3066,7 +3600,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { Changed = true; } } - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + } else if (auto *SI = dyn_cast<SwitchInst>(TI)) { for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) if (i.getCaseSuccessor() == BB) { @@ -3075,25 +3609,48 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { --i; --e; Changed = true; } - } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) { + } else if (auto *II = dyn_cast<InvokeInst>(TI)) { if (II->getUnwindDest() == BB) { - // Convert the invoke to a call instruction. This would be a good - // place to note that the call does not throw though. - BranchInst *BI = Builder.CreateBr(II->getNormalDest()); - II->removeFromParent(); // Take out of symbol table - - // Insert the call now... - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3); - Builder.SetInsertPoint(BI); - CallInst *CI = Builder.CreateCall(II->getCalledValue(), - Args, II->getName()); - CI->setCallingConv(II->getCallingConv()); - CI->setAttributes(II->getAttributes()); - // If the invoke produced a value, the call does now instead. 
- II->replaceAllUsesWith(CI); - delete II; + removeUnwindEdge(TI->getParent()); Changed = true; } + } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) { + if (CSI->getUnwindDest() == BB) { + removeUnwindEdge(TI->getParent()); + Changed = true; + continue; + } + + for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(), + E = CSI->handler_end(); + I != E; ++I) { + if (*I == BB) { + CSI->removeHandler(I); + --I; + --E; + Changed = true; + } + } + if (CSI->getNumHandlers() == 0) { + BasicBlock *CatchSwitchBB = CSI->getParent(); + if (CSI->hasUnwindDest()) { + // Redirect preds to the unwind dest + CatchSwitchBB->replaceAllUsesWith(CSI->getUnwindDest()); + } else { + // Rewrite all preds to unwind to caller (or from invoke to call). + SmallVector<BasicBlock *, 8> EHPreds(predecessors(CatchSwitchBB)); + for (BasicBlock *EHPred : EHPreds) + removeUnwindEdge(EHPred); + } + // The catchswitch is no longer reachable. + new UnreachableInst(CSI->getContext(), CSI); + CSI->eraseFromParent(); + Changed = true; + } + } else if (isa<CleanupReturnInst>(TI)) { + new UnreachableInst(TI->getContext(), TI); + TI->eraseFromParent(); + Changed = true; } } @@ -3249,6 +3806,29 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, } } + // If we can prove that the cases must cover all possible values, the + // default destination becomes dead and we can remove it. If we know some + // of the bits in the value, we can use that to more precisely compute the + // number of possible unique case values. + bool HasDefault = + !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + const unsigned NumUnknownBits = Bits - + (KnownZero.Or(KnownOne)).countPopulation(); + assert(NumUnknownBits <= Bits); + if (HasDefault && DeadCases.empty() && + NumUnknownBits < 64 /* avoid overflow */ && + SI->getNumCases() == (1ULL << NumUnknownBits)) { + DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + BasicBlock *NewDefault = SplitBlockPredecessors(SI->getDefaultDest(), + SI->getParent(), ""); + SI->setDefaultDest(&*NewDefault); + SplitBlock(&*NewDefault, &NewDefault->front()); + auto *OldTI = NewDefault->getTerminator(); + new UnreachableInst(SI->getContext(), OldTI); + EraseTerminatorInstAndDCECond(OldTI); + return true; + } + SmallVector<uint64_t, 8> Weights; bool HasWeight = HasBranchWeights(SI); if (HasWeight) { @@ -3439,7 +4019,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, } else if (isa<DbgInfoIntrinsic>(I)) { // Skip debug intrinsic. continue; - } else if (Constant *C = ConstantFold(I, DL, ConstantPool)) { + } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) { // Instruction is side-effect free and constant. // If the instruction has uses outside this block or a phi node slot for @@ -3456,7 +4036,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, return false; } - ConstantPool.insert(std::make_pair(I, C)); + ConstantPool.insert(std::make_pair(&*I, C)); } else { break; } @@ -3664,7 +4244,7 @@ namespace { /// Return true if a table with TableSize elements of /// type ElementType would fit in a target-legal register. 
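The EliminateDeadSwitchCases hunk above uses known bits to kill dead defaults: if KnownZero/KnownOne pin all but N bits of the switch operand, only 2^N distinct values are feasible, and a switch listing exactly 2^N cases can never take its default edge. The arithmetic as a standalone check (illustrative names; assumes KnownZero/KnownOne are confined to the low Bits):

#include <bitset>
#include <cstdint>
#include <cstdio>

static bool defaultIsDead(uint64_t KnownZero, uint64_t KnownOne,
                          unsigned Bits, uint64_t NumCases) {
  unsigned Known = std::bitset<64>(KnownZero | KnownOne).count();
  unsigned NumUnknownBits = Bits - Known;
  return NumUnknownBits < 64 /* avoid shift overflow */ &&
         NumCases == (1ULL << NumUnknownBits);
}

int main() {
  // An i8 whose top six bits are known zero has four possible values.
  std::printf("%d\n", defaultIsDead(0xFC, 0x00, /*Bits=*/8, /*NumCases=*/4));
}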
static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, - const Type *ElementType); + Type *ElementType); private: // Depending on the contents of the table, it can be represented in @@ -3880,8 +4460,8 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, - const Type *ElementType) { - const IntegerType *IT = dyn_cast<IntegerType>(ElementType); + Type *ElementType) { + auto *IT = dyn_cast<IntegerType>(ElementType); if (!IT) return false; // FIXME: If the type is wider than it needs to be, e.g. i8 but all values @@ -3992,7 +4572,7 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, assert((CaseConst == TrueConst || CaseConst == FalseConst) && "Expect true or false as compare result."); } - + // Check if the branch instruction dominates the phi node. It's a simple // dominance check, but sufficient for our needs. // Although this check is invariant in the calling loops, it's better to do it @@ -4422,7 +5002,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return true; // If the Terminator is the only non-phi instruction, simplify the block. - BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(); + BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; @@ -4457,6 +5037,16 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return false; } +static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) { + BasicBlock *PredPred = nullptr; + for (auto *P : predecessors(BB)) { + BasicBlock *PPred = P->getSinglePredecessor(); + if (!PPred || (PredPred && PredPred != PPred)) + return nullptr; + PredPred = PPred; + } + return PredPred; +} bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); @@ -4537,9 +5127,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + // Look for diamond patterns. 
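The "diamond patterns" mentioned above are what allPredecessorsComeFromSameSource recognizes: every predecessor of BB must funnel out of one common block. When MergeCondStores is enabled (it is off by default), two arms that each conditionally store to the same address can then be merged into a single guarded store. Shown schematically -- this is the shape of the transform, not the pass's literal output:

    //        PrevBB              <- returned by allPredecessorsComeFromSameSource
    //        /    \
    //    LeftBB  RightBB         <- each has PrevBB as its single predecessor
    //        \    /
    //          BB                <- the block handed in
    //
    // Conditional-store merge, conceptually:
    //   if (a) *p = v1;                  if (a | b)
    //   if (b) *p = v2;          ==>       *p = b ? v2 : v1;
    //
    // Valid only when nothing between the two stores can read or write *p;
    // any extra edge into LeftBB or RightBB defeats the pattern (nullptr).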
+ if (MergeCondStores) + if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) + if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) + if (PBI != BI && PBI->isConditional()) + if (mergeConditionalStores(PBI, BI)) + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + return false; } @@ -4663,6 +5261,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { if (SimplifyReturn(RI, Builder)) return true; } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { if (SimplifyResume(RI, Builder)) return true; + } else if (CleanupReturnInst *RI = + dyn_cast<CleanupReturnInst>(BB->getTerminator())) { + if (SimplifyCleanupReturn(RI)) return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { if (SimplifySwitch(SI, Builder)) return true; } else if (UnreachableInst *UI = diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index ab30aa1..ddd8775 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -47,15 +47,16 @@ namespace { Loop *L; LoopInfo *LI; ScalarEvolution *SE; + DominatorTree *DT; SmallVectorImpl<WeakVH> &DeadInsts; bool Changed; public: - SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LoopInfo *LI, - SmallVectorImpl<WeakVH> &Dead) - : L(Loop), LI(LI), SE(SE), DeadInsts(Dead), Changed(false) { + SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI,SmallVectorImpl<WeakVH> &Dead) + : L(Loop), LI(LI), SE(SE), DT(DT), DeadInsts(Dead), Changed(false) { assert(LI && "IV simplification requires LoopInfo"); } @@ -63,11 +64,13 @@ namespace { /// Iteratively perform simplification on a worklist of users of the /// specified induction variable. This is the top-level driver that applies - /// all simplicitions to users of an IV. + /// all simplifications to users of an IV. void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr); Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand); + bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand); + bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand); void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, @@ -166,19 +169,65 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { S = SE->getSCEVAtScope(S, ICmpLoop); X = SE->getSCEVAtScope(X, ICmpLoop); + ICmpInst::Predicate InvariantPredicate; + const SCEV *InvariantLHS, *InvariantRHS; + // If the condition is always true or always false, replace it with // a constant value. 
- if (SE->isKnownPredicate(Pred, S, X)) + if (SE->isKnownPredicate(Pred, S, X)) { ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + DeadInsts.emplace_back(ICmp); + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) { ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else + DeadInsts.emplace_back(ICmp); + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + } else if (isa<PHINode>(IVOperand) && + SE->isLoopInvariantPredicate(Pred, S, X, ICmpLoop, + InvariantPredicate, InvariantLHS, + InvariantRHS)) { + + // Rewrite the comparison to a loop invariant comparison if it can be done + // cheaply, where cheaply means "we don't need to emit any new + // instructions". + + Value *NewLHS = nullptr, *NewRHS = nullptr; + + if (S == InvariantLHS || X == InvariantLHS) + NewLHS = + ICmp->getOperand(S == InvariantLHS ? IVOperIdx : (1 - IVOperIdx)); + + if (S == InvariantRHS || X == InvariantRHS) + NewRHS = + ICmp->getOperand(S == InvariantRHS ? IVOperIdx : (1 - IVOperIdx)); + + for (Value *Incoming : cast<PHINode>(IVOperand)->incoming_values()) { + if (NewLHS && NewRHS) + break; + + const SCEV *IncomingS = SE->getSCEV(Incoming); + + if (!NewLHS && IncomingS == InvariantLHS) + NewLHS = Incoming; + if (!NewRHS && IncomingS == InvariantRHS) + NewRHS = Incoming; + } + + if (!NewLHS || !NewRHS) + // We could not find an existing value to replace either LHS or RHS. + // Generating new instructions has subtler tradeoffs, so avoid doing that + // for now. + return; + + DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n'); + ICmp->setPredicate(InvariantPredicate); + ICmp->setOperand(0, NewLHS); + ICmp->setOperand(1, NewRHS); + } else return; - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); ++NumElimCmp; Changed = true; - DeadInsts.emplace_back(ICmp); } /// SimplifyIVUsers helper for eliminating useless @@ -207,8 +256,7 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, Rem->replaceAllUsesWith(Rem->getOperand(0)); else { // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). - const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + const SCEV *LessOne = SE->getMinusSCEV(S, SE->getOne(S->getType())); if (IsSigned && !SE->isKnownNonNegative(LessOne)) return; @@ -232,9 +280,9 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, DeadInsts.emplace_back(Rem); } -/// Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. -/// IVOperand is guaranteed SCEVable, but UseInst may not be. +/// Eliminate an operation that consumes a simple IV and has no observable +/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable, +/// but UseInst may not be. bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, Instruction *IVOperand) { if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { @@ -249,12 +297,45 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, } } - // Eliminate any operation that SCEV can prove is an identity function. + if (eliminateIdentitySCEV(UseInst, IVOperand)) + return true; + + return false; +} + +/// Eliminate any operation that SCEV can prove is an identity function. 
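The surrounding remainder rewrite, (i+1) % n --> (i+1)==n ? 0 : (i+1), rests on a small range argument worth spelling out. A standalone illustration (hypothetical function, not pass code):

    #include <cassert>

    // For 0 <= i < n, the sum i+1 lies in [1, n], so the only input the
    // remainder actually reduces is i+1 == n; everything else passes through.
    static unsigned remOfIncrement(unsigned i, unsigned n) {
      assert(i < n && "rewrite is only valid when i is in [0, n)");
      return i + 1 == n ? 0 : i + 1; // same as (i + 1) % n, with no udiv/urem
    }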
+bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst, + Instruction *IVOperand) { if (!SE->isSCEVable(UseInst->getType()) || (UseInst->getType() != IVOperand->getType()) || (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) return false; + // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the + // dominator tree, even if X is an operand to Y. For instance, in + // + // %iv = phi i32 {0,+,1} + // br %cond, label %left, label %merge + // + // left: + // %X = add i32 %iv, 0 + // br label %merge + // + // merge: + // %M = phi (%X, %iv) + // + // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and + // %M.replaceAllUsesWith(%X) would be incorrect. + + if (isa<PHINode>(UseInst)) + // If UseInst is not a PHI node then we know that IVOperand dominates + // UseInst directly from the legality of SSA. + if (!DT || !DT->dominates(IVOperand, UseInst)) + return false; + + if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand)) + return false; + DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); UseInst->replaceAllUsesWith(IVOperand); @@ -436,8 +517,8 @@ static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { /// This algorithm does not require IVUsers analysis. Instead, it simplifies /// instructions in-place during analysis. Rather than rewriting induction /// variables bottom-up from their users, it transforms a chain of IVUsers -/// top-down, updating the IR only when it encouters a clear optimization -/// opportunitiy. +/// top-down, updating the IR only when it encounters a clear optimization +/// opportunity. /// /// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. /// @@ -513,22 +594,21 @@ void IVVisitor::anchor() { } /// Simplify instructions that use this induction variable /// by using ScalarEvolution to analyze the IV's recurrence. -bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) -{ - LoopInfo *LI = &LPM->getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LI, Dead); +bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead, + IVVisitor *V) { + SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Dead); SIV.simplifyUsers(CurrIV, V); return SIV.hasChanged(); } /// Simplify users of induction variables within this /// loop. This does not actually change or add IVs. 
-bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead) { +bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT, + LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead) { bool Changed = false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, LPM, Dead); + Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead); } return Changed; } diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp index c499c87..d5377f9 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -20,12 +20,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -64,7 +64,7 @@ namespace { // Here be subtlety: the iterator must be incremented before the loop // body (not sure why), so a range-for loop won't work here. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = BI++; + Instruction *I = &*BI++; // The first time through the loop ToSimplify is empty and we try to // simplify all instructions. On later iterations ToSimplify is not // empty and we only bother simplifying instructions that are in it. diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 6bbf828..dc07440 100644 --- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" @@ -30,8 +31,8 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -52,20 +53,11 @@ static cl::opt<bool> //===----------------------------------------------------------------------===// static bool ignoreCallingConv(LibFunc::Func Func) { - switch (Func) { - case LibFunc::abs: - case LibFunc::labs: - case LibFunc::llabs: - case LibFunc::strlen: - return true; - default: - return false; - } - llvm_unreachable("All cases should be covered in the switch."); + return Func == LibFunc::abs || Func == LibFunc::labs || + Func == LibFunc::llabs || Func == LibFunc::strlen; } -/// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. +/// Return true if it only matters that the value is equal or not-equal to zero. 
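The loop above ("the iterator must be incremented before the loop body") is the standard idiom for mutating a basic block while walking it: once BI has stepped past the current instruction, the body may erase that instruction without invalidating the iterator. A minimal self-contained version (illustrative; isInstructionTriviallyDead is the real helper from Transforms/Utils/Local.h):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    static void sweepBlock(BasicBlock *BB) {
      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
        Instruction *I = &*BI++; // advance first: BI no longer refers to I,
        if (isInstructionTriviallyDead(I))
          I->eraseFromParent();  // so erasing I leaves the iterator valid
      }
    }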
static bool isOnlyUsedInZeroEqualityComparison(Value *V) { for (User *U : V->users()) { if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -79,8 +71,7 @@ static bool isOnlyUsedInZeroEqualityComparison(Value *V) { return true; } -/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality -/// comparisons with With. +/// Return true if it is only used in equality comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { if (ICmpInst *IC = dyn_cast<ICmpInst>(U)) @@ -93,16 +84,13 @@ static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { } static bool callHasFloatingPointArgument(const CallInst *CI) { - for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); - it != e; ++it) { - if ((*it)->getType()->isFloatingPointTy()) - return true; - } - return false; + return std::any_of(CI->op_begin(), CI->op_end(), [](const Use &OI) { + return OI->getType()->isFloatingPointTy(); + }); } /// \brief Check whether the overloaded unary floating point function -/// corresponing to \a Ty is available. +/// corresponding to \a Ty is available. static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, LibFunc::Func DoubleFn, LibFunc::Func FloatFn, LibFunc::Func LongDoubleFn) { @@ -116,6 +104,23 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, } } +/// \brief Check whether we can use unsafe floating point math for +/// the function passed as input. +static bool canUseUnsafeFPMath(Function *F) { + + // FIXME: For finer-grain optimization, we need intrinsics to have the same + // fast-math flag decorations that are applied to FP instructions. For now, + // we have to rely on the function-level unsafe-fp-math attribute to do this + // optimization because there's no other way to express that the call can be + // relaxed. + if (F->hasFnAttribute("unsafe-fp-math")) { + Attribute Attr = F->getFnAttribute("unsafe-fp-math"); + if (Attr.getValueAsString() == "true") + return true; + } + return false; +} + /// \brief Returns whether \p F matches the signature expected for the /// string/memory copying library function \p Func. /// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset. @@ -242,12 +247,12 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { !FT->getParamType(2)->isIntegerTy()) return nullptr; - // Extract some information from the instruction + // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); uint64_t Len; - // We don't do anything if length is not constant + // We don't do anything if length is not constant. if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Len = LengthArg->getZExtValue(); else @@ -265,12 +270,12 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { if (SrcLen == 0 || Len == 0) return Dst; - // We don't optimize this case + // We don't optimize this case. if (Len < SrcLen) return nullptr; // strncat(x, s, c) -> strcat(x, s) - // s is constant so the strcat can be optimized further + // s is constant so the strcat can be optimized further. 
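The emitStrLenMemCpy call that follows is the whole trick behind the strcat/strncat folds: once the source length is a known constant, appending reduces to locating the destination's terminator and copying the source plus its nul in one memcpy. A C-level equivalent (illustrative, not the LLVM helper):

    #include <cstring>

    static char *strcatWithKnownLen(char *Dst, const char *Src, size_t SrcLen) {
      char *End = Dst + strlen(Dst); // find Dst's terminating nul
      memcpy(End, Src, SrcLen + 1);  // copy Src including its own nul byte
      return Dst;                    // strcat and strncat both return Dst
    }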
return emitStrLenMemCpy(Src, Dst, SrcLen, B); } @@ -303,7 +308,8 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) - return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr"); + return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), + "strchr"); return nullptr; } @@ -467,9 +473,6 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // Verify the "stpcpy" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy)) return nullptr; @@ -484,10 +487,10 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { if (Len == 0) return nullptr; - Type *PT = FT->getParamType(0); + Type *PT = Callee->getFunctionType()->getParamType(0); Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len); - Value *DstEnd = - B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); + Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst, + ConstantInt::get(DL.getIntPtrType(PT), Len - 1)); // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. @@ -497,8 +500,6 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy)) return nullptr; @@ -531,7 +532,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { if (Len > SrcLen + 1) return nullptr; - Type *PT = FT->getParamType(0); + Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1); @@ -597,7 +598,8 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { if (I == StringRef::npos) // No match. 
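The stpcpy simplification above follows the same recipe with one twist in the return value: stpcpy yields a pointer to the copied terminator rather than to the start of the destination. In C terms (a sketch of the emitted code, assuming Len counts the nul byte):

    #include <cstring>

    static char *stpcpyWithKnownLen(char *Dst, const char *Src, size_t Len) {
      memcpy(Dst, Src, Len); // a single memcpy covers the nul byte
      return Dst + Len - 1;  // stpcpy returns the address of that nul
    }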
return Constant::getNullValue(CI->getType()); - return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), "strpbrk"); + return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), + "strpbrk"); } // strpbrk(s, "a") -> strchr(s, 'a') @@ -862,6 +864,29 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { return B.CreateSub(LHSV, RHSV, "chardiff"); } + // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0 + if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) { + + IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8); + unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType); + + if (getKnownAlignment(LHS, DL, CI) >= PrefAlignment && + getKnownAlignment(RHS, DL, CI) >= PrefAlignment) { + + Type *LHSPtrTy = + IntType->getPointerTo(LHS->getType()->getPointerAddressSpace()); + Type *RHSPtrTy = + IntType->getPointerTo(RHS->getType()->getPointerAddressSpace()); + + Value *LHSV = + B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv"); + Value *RHSV = + B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv"); + + return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp"); + } + } + // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) StringRef LHSStr, RHSStr; if (getConstantStringInfo(LHS, LHSStr) && @@ -969,10 +994,14 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, Value *V = valueHasFloatPrecision(CI->getArgOperand(0)); if (V == nullptr) return nullptr; + + // Propagate fast-math flags from the existing call to the new call. + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); // floor((double)floatval) -> (double)floorf(floatval) if (Callee->isIntrinsic()) { - Module *M = CI->getParent()->getParent()->getParent(); + Module *M = CI->getModule(); Intrinsic::ID IID = Callee->getIntrinsicID(); Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); V = B.CreateCall(F, V); @@ -1004,6 +1033,10 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { if (V2 == nullptr) return nullptr; + // Propagate fast-math flags from the existing call to the new call. + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + // fmin((double)floatval1, (double)floatval2) // -> (double)fminf(floatval1, floatval2) // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP(). @@ -1015,9 +1048,9 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "cos" && TLI->has(LibFunc::cosf)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "cos" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the @@ -1035,13 +1068,37 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { return Ret; } +static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { + // Multiplications calculated using Addition Chains. + // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html + + assert(Exp != 0 && "Incorrect exponent 0 not handled"); + + if (InnerChain[Exp]) + return InnerChain[Exp]; + + static const unsigned AddChain[33][2] = { + {0, 0}, // Unused. 
+ {0, 0}, // Unused (base case = pow1). + {1, 1}, // Unused (pre-computed). + {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4}, + {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7}, + {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10}, + {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13}, + {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16}, + }; + + InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B), + getPow(InnerChain, AddChain[Exp][1], B)); + return InnerChain[Exp]; +} + Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "pow" && TLI->has(LibFunc::powf)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the @@ -1060,7 +1117,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { if (Op1C->isExactlyValue(2.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, LibFunc::exp2l)) - return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); + return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B, + Callee->getAttributes()); // pow(10.0, x) -> exp10(x) if (Op1C->isExactlyValue(10.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f, @@ -1069,6 +1127,29 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Callee->getAttributes()); } + // FIXME: Use instruction-level FMF. + bool UnsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent()); + + // pow(exp(x), y) -> exp(x * y) + // pow(exp2(x), y) -> exp2(x * y) + // We enable these only with fast-math. Besides rounding differences, the + // transformation changes overflow and underflow behavior quite dramatically. + // Example: x = 1000, y = 0.001. + // pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1). + auto *OpC = dyn_cast<CallInst>(Op1); + if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) { + LibFunc::Func Func; + Function *OpCCallee = OpC->getCalledFunction(); + if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) && + TLI->has(Func) && (Func == LibFunc::exp || Func == LibFunc::exp2)) { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul"); + return EmitUnaryFloatFnCall(FMul, OpCCallee->getName(), B, + OpCCallee->getAttributes()); + } + } + ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); if (!Op2C) return Ret; @@ -1081,10 +1162,18 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { LibFunc::sqrtl) && hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf, LibFunc::fabsl)) { + + // In -ffast-math, pow(x, 0.5) -> sqrt(x). + if (CI->hasUnsafeAlgebra()) { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, + Callee->getAttributes()); + } + // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). // This is faster than calling pow, and still handles negative zero // and negative infinity correctly. - // TODO: In fast-math mode, this could be just sqrt(x). // TODO: In finite-only mode, this could be just fabs(sqrt(x)). 
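The general pow(x, 0.5) expansion referenced above needs both the fabs and the select to stay bit-accurate at the two awkward inputs; the special values below follow C99 Annex F. Illustrative C++:

    #include <cmath>

    // pow(-0.0, 0.5) == +0.0  while sqrt(-0.0) == -0.0  -> fabs repairs it
    // pow(-INF, 0.5) == +INF  while sqrt(-INF) is  NaN  -> select repairs it
    static double powHalf(double x) {
      return x == -INFINITY ? INFINITY : std::fabs(std::sqrt(x));
    }

    // With unsafe algebra the new early exit above emits a bare sqrt call
    // instead, accepting the -0.0 and -INF differences.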
Value *Inf = ConstantFP::getInfinity(CI->getType()); Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); @@ -1102,18 +1191,42 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { return B.CreateFMul(Op1, Op1, "pow2"); if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); + + // In -ffast-math, generate repeated fmul instead of generating pow(x, n). + if (UnsafeFPMath) { + APFloat V = abs(Op2C->getValueAPF()); + // We limit to a max of 7 fmul(s). Thus max exponent is 32. + // This transformation applies to integer exponents only. + if (V.compare(APFloat(V.getSemantics(), 32.0)) == APFloat::cmpGreaterThan || + !V.isInteger()) + return nullptr; + + // We will memoize intermediate products of the Addition Chain. + Value *InnerChain[33] = {nullptr}; + InnerChain[1] = Op1; + InnerChain[2] = B.CreateFMul(Op1, Op1); + + // We cannot readily convert a non-double type (like float) to a double. + // So we first convert V to something which could be converted to double. + bool ignored; + V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + Value *FMul = getPow(InnerChain, V.convertToDouble(), B); + // For negative exponents simply compute the reciprocal. + if (Op2C->isNegative()) + FMul = B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), FMul); + return FMul; + } + return nullptr; } Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Function *Caller = CI->getParent()->getParent(); - Value *Ret = nullptr; - if (UnsafeFPShrink && Callee->getName() == "exp2" && - TLI->has(LibFunc::exp2f)) { + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - } FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the @@ -1162,11 +1275,10 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; - if (Callee->getName() == "fabs" && TLI->has(LibFunc::fabsf)) { + StringRef Name = Callee->getName(); + if (Name == "fabs" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, false); - } FunctionType *FT = Callee->getFunctionType(); // Make sure this has 1 argument of FP type which matches the result type. @@ -1184,6 +1296,102 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { return Ret; } +Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { + // If we can shrink the call to a float function rather than a double + // function, do that first. + Function *Callee = CI->getCalledFunction(); + StringRef Name = Callee->getName(); + if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name)) + if (Value *Ret = optimizeBinaryDoubleFP(CI, B)) + return Ret; + + // Make sure this has 2 arguments of FP type which match the result type. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + !FT->getParamType(0)->isFloatingPointTy()) + return nullptr; + + IRBuilder<>::FastMathFlagGuard Guard(B); + FastMathFlags FMF; + if (CI->hasUnsafeAlgebra()) { + // Unsafe algebra sets all fast-math-flags to true. 
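The repeated-fmul lowering above is easiest to see on a concrete exponent. Hand-expanding pow(x, 7.0) through the AddChain table (the pass memoizes each power in InnerChain; this function is just the unrolled result):

    static double pow7(double x) {
      double x2 = x * x;   // InnerChain[2], seeded before the recursion
      double x3 = x * x2;  // AddChain[3] = {1, 2}
      double x5 = x2 * x3; // AddChain[5] = {2, 3}
      return x2 * x5;      // AddChain[7] = {2, 5}
    }

    // Four fmuls replace the libm call; pow(x, -7.0) adds one final fdiv,
    // 1.0 / pow7(x), per the reciprocal case above.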
+ FMF.setUnsafeAlgebra(); + } else { + // At a minimum, no-nans-fp-math must be true. + if (!CI->hasNoNaNs()) + return nullptr; + // No-signed-zeros is implied by the definitions of fmax/fmin themselves: + // "Ideally, fmax would be sensitive to the sign of zero, for example + // fmax(-0. 0, +0. 0) would return +0; however, implementation in software + // might be impractical." + FMF.setNoSignedZeros(); + FMF.setNoNaNs(); + } + B.setFastMathFlags(FMF); + + // We have a relaxed floating-point environment. We can ignore NaN-handling + // and transform to a compare and select. We do not have to consider errno or + // exceptions, because fmin/fmax do not have those. + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + Value *Cmp = Callee->getName().startswith("fmin") ? + B.CreateFCmpOLT(Op0, Op1) : B.CreateFCmpOGT(Op0, Op1); + return B.CreateSelect(Cmp, Op0, Op1); +} + +Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + Value *Ret = nullptr; + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && hasFloatVersion(Name)) + Ret = optimizeUnaryDoubleFP(CI, B, true); + FunctionType *FT = Callee->getFunctionType(); + + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + if (!CI->hasUnsafeAlgebra()) + return Ret; + Value *Op1 = CI->getArgOperand(0); + auto *OpC = dyn_cast<CallInst>(Op1); + + // The earlier call must also be unsafe in order to do these transforms. + if (!OpC || !OpC->hasUnsafeAlgebra()) + return Ret; + + // log(pow(x,y)) -> y*log(x) + // This is only applicable to log, log2, log10. + if (Name != "log" && Name != "log2" && Name != "log10") + return Ret; + + IRBuilder<>::FastMathFlagGuard Guard(B); + FastMathFlags FMF; + FMF.setUnsafeAlgebra(); + B.setFastMathFlags(FMF); + + LibFunc::Func Func; + Function *F = OpC->getCalledFunction(); + if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow)) + return B.CreateFMul(OpC->getArgOperand(1), + EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, + Callee->getAttributes()), "mul"); + + // log(exp2(y)) -> y*log(2) + if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && + TLI->has(Func) && Func == LibFunc::exp2) + return B.CreateFMul( + OpC->getArgOperand(0), + EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), + Callee->getName(), B, Callee->getAttributes()), + "logmul"); + return Ret; +} + Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); @@ -1192,73 +1400,99 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Callee->getIntrinsicID() == Intrinsic::sqrt)) Ret = optimizeUnaryDoubleFP(CI, B, true); - // FIXME: For finer-grain optimization, we need intrinsics to have the same - // fast-math flag decorations that are applied to FP instructions. For now, - // we have to rely on the function-level unsafe-fp-math attribute to do this - // optimization because there's no other way to express that the sqrt can be - // reassociated. - Function *F = CI->getParent()->getParent(); - if (F->hasFnAttribute("unsafe-fp-math")) { - // Check for unsafe-fp-math = true. 
- Attribute Attr = F->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() != "true") - return Ret; - } - Value *Op = CI->getArgOperand(0); - if (Instruction *I = dyn_cast<Instruction>(Op)) { - if (I->getOpcode() == Instruction::FMul && I->hasUnsafeAlgebra()) { - // We're looking for a repeated factor in a multiplication tree, - // so we can do this fold: sqrt(x * x) -> fabs(x); - // or this fold: sqrt(x * x * y) -> fabs(x) * sqrt(y). - Value *Op0 = I->getOperand(0); - Value *Op1 = I->getOperand(1); - Value *RepeatOp = nullptr; - Value *OtherOp = nullptr; - if (Op0 == Op1) { - // Simple match: the operands of the multiply are identical. - RepeatOp = Op0; - } else { - // Look for a more complicated pattern: one of the operands is itself - // a multiply, so search for a common factor in that multiply. - // Note: We don't bother looking any deeper than this first level or for - // variations of this pattern because instcombine's visitFMUL and/or the - // reassociation pass should give us this form. - Value *OtherMul0, *OtherMul1; - if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) { - // Pattern: sqrt((x * y) * z) - if (OtherMul0 == OtherMul1) { - // Matched: sqrt((x * x) * z) - RepeatOp = OtherMul0; - OtherOp = Op1; - } - } - } - if (RepeatOp) { - // Fast math flags for any created instructions should match the sqrt - // and multiply. - // FIXME: We're not checking the sqrt because it doesn't have - // fast-math-flags (see earlier comment). - IRBuilder<true, ConstantFolder, - IRBuilderDefaultInserter<true> >::FastMathFlagGuard Guard(B); - B.SetFastMathFlags(I->getFastMathFlags()); - // If we found a repeated factor, hoist it out of the square root and - // replace it with the fabs of that factor. - Module *M = Callee->getParent(); - Type *ArgType = Op->getType(); - Value *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType); - Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs"); - if (OtherOp) { - // If we found a non-repeated factor, we still need to get its square - // root. We then multiply that by the value that was simplified out - // of the square root calculation. - Value *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); - Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); - return B.CreateFMul(FabsCall, SqrtCall); - } - return FabsCall; + if (!CI->hasUnsafeAlgebra()) + return Ret; + + Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0)); + if (!I || I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra()) + return Ret; + + // We're looking for a repeated factor in a multiplication tree, + // so we can do this fold: sqrt(x * x) -> fabs(x); + // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y). + Value *Op0 = I->getOperand(0); + Value *Op1 = I->getOperand(1); + Value *RepeatOp = nullptr; + Value *OtherOp = nullptr; + if (Op0 == Op1) { + // Simple match: the operands of the multiply are identical. + RepeatOp = Op0; + } else { + // Look for a more complicated pattern: one of the operands is itself + // a multiply, so search for a common factor in that multiply. + // Note: We don't bother looking any deeper than this first level or for + // variations of this pattern because instcombine's visitFMUL and/or the + // reassociation pass should give us this form. 
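Written out, the two shapes this rewritten matcher accepts are below; fabs rather than the raw factor is required because the repeated operand may be negative, and the fold is now gated on fast-math flags carried by both the call and the multiply. Illustrative C++:

    #include <cmath>

    // sqrt(x * x) -> fabs(x): for negative x, sqrt(x*x) is |x|, not x.
    static double sqrtXX(double x) { return std::fabs(x); }

    // sqrt((x * x) * y) -> fabs(x) * sqrt(y): hoist the repeated factor.
    static double sqrtXXY(double x, double y) {
      return std::fabs(x) * std::sqrt(y);
    }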
+ Value *OtherMul0, *OtherMul1; + if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) { + // Pattern: sqrt((x * y) * z) + if (OtherMul0 == OtherMul1 && + cast<Instruction>(Op0)->hasUnsafeAlgebra()) { + // Matched: sqrt((x * x) * z) + RepeatOp = OtherMul0; + OtherOp = Op1; } } } + if (!RepeatOp) + return Ret; + + // Fast math flags for any created instructions should match the sqrt + // and multiply. + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(I->getFastMathFlags()); + + // If we found a repeated factor, hoist it out of the square root and + // replace it with the fabs of that factor. + Module *M = Callee->getParent(); + Type *ArgType = I->getType(); + Value *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType); + Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs"); + if (OtherOp) { + // If we found a non-repeated factor, we still need to get its square + // root. We then multiply that by the value that was simplified out + // of the square root calculation. + Value *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); + Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); + return B.CreateFMul(FabsCall, SqrtCall); + } + return FabsCall; +} + +// TODO: Generalize to handle any trig function and its inverse. +Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); + Value *Ret = nullptr; + StringRef Name = Callee->getName(); + if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) + Ret = optimizeUnaryDoubleFP(CI, B, true); + FunctionType *FT = Callee->getFunctionType(); + + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return Ret; + + Value *Op1 = CI->getArgOperand(0); + auto *OpC = dyn_cast<CallInst>(Op1); + if (!OpC) + return Ret; + + // Both calls must allow unsafe optimizations in order to remove them. 
+ if (!CI->hasUnsafeAlgebra() || !OpC->hasUnsafeAlgebra()) + return Ret; + + // tan(atan(x)) -> x + // tanf(atanf(x)) -> x + // tanl(atanl(x)) -> x + LibFunc::Func Func; + Function *F = OpC->getCalledFunction(); + if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && + ((Func == LibFunc::atan && Callee->getName() == "tan") || + (Func == LibFunc::atanf && Callee->getName() == "tanf") || + (Func == LibFunc::atanl && Callee->getName() == "tanl"))) + Ret = OpC->getArgOperand(0); return Ret; } @@ -1329,9 +1563,9 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, return; Function *Callee = CI->getCalledFunction(); - StringRef FuncName = Callee->getName(); LibFunc::Func Func; - if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) || !isTrigLibCall(CI)) + if (!Callee || !TLI->getLibFunc(Callee->getName(), Func) || !TLI->has(Func) || + !isTrigLibCall(CI)) return; if (IsFloat) { @@ -1353,10 +1587,8 @@ LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls, Value *Res) { - for (SmallVectorImpl<CallInst *>::iterator I = Calls.begin(), E = Calls.end(); - I != E; ++I) { - replaceAllUsesWith(*I, Res); - } + for (CallInst *C : Calls) + replaceAllUsesWith(C, Res); } void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, @@ -1387,8 +1619,7 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { // If the argument is an instruction, it must dominate all uses so put our // sincos call there. - BasicBlock::iterator Loc = ArgInst; - B.SetInsertPoint(ArgInst->getParent(), ++Loc); + B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); } else { // Otherwise (e.g. for a constant) the beginning of the function is as // good a place as any. @@ -1413,15 +1644,16 @@ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, // Integer Library Call Optimizations //===----------------------------------------------------------------------===// +static bool checkIntUnaryReturnAndParam(Function *Callee) { + FunctionType *FT = Callee->getFunctionType(); + return FT->getNumParams() == 1 && FT->getReturnType()->isIntegerTy(32) && + FT->getParamType(0)->isIntegerTy(); +} + Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy(32) || - !FT->getParamType(0)->isIntegerTy()) + if (!checkIntUnaryReturnAndParam(Callee)) return nullptr; - Value *Op = CI->getArgOperand(0); // Constant fold. 
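The tan/atan fold above is exact over the reals: atan maps every finite x into (-pi/2, pi/2), where tan is its inverse, so tan(atan(x)) == x mathematically. The two libcalls still round twice, which is why both must carry unsafe algebra before the pair collapses to its argument:

    // What the composition becomes once both calls are marked fast; only
    // double rounding distinguishes tan(atan(x)) from x for finite x.
    static double tanOfAtan(double x) { return x; }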
@@ -1436,7 +1668,7 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Type *ArgType = Op->getType(); Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); - Value *V = B.CreateCall(F, {Op, B.getFalse()}, "cttz"); + Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz"); V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); V = B.CreateIntCast(V, B.getInt32Ty(), false); @@ -1461,11 +1693,7 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isdigit(c) -> (c-'0') <u 10 @@ -1476,11 +1704,7 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require integer(i32) - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // isascii(c) -> c <u 128 @@ -1490,11 +1714,7 @@ Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require i32(i32) - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isIntegerTy(32)) + if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) return nullptr; // toascii(c) -> c & 0x7f @@ -1529,10 +1749,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, } static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) { - if (!ColdErrorCalls) - return false; - - if (!Callee || !Callee->isDeclaration()) + if (!ColdErrorCalls || !Callee || !Callee->isDeclaration()) return false; if (StreamArg < 0) @@ -1962,22 +2179,17 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); - IRBuilder<> Builder(CI); + + SmallVector<OperandBundleDef, 2> OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; // Command-line parameter overrides function attribute. if (EnableUnsafeFPShrink.getNumOccurrences() > 0) UnsafeFPShrink = EnableUnsafeFPShrink; - else if (Callee->hasFnAttribute("unsafe-fp-math")) { - // FIXME: This is the same problem as described in optimizeSqrt(). - // If calls gain access to IR-level FMF, then use that instead of a - // function attribute. - - // Check for unsafe-fp-math = true. - Attribute Attr = Callee->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() == "true") - UnsafeFPShrink = true; - } + else if (canUseUnsafeFPMath(Callee)) + UnsafeFPShrink = true; // First, check for intrinsics. 
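The integer libcall folds in this stretch all become tiny branch-free expressions, collected here as illustrative C++ equivalents. The ffs zero case is handled by a separate select in the emitted IR, which is what lets the cttz intrinsic's is-zero-undef flag above be true:

    #include <cstdint>

    static int ffsFold(uint32_t X) {  // ffs(x) -> x ? cttz(x)+1 : 0
      return X ? __builtin_ctz(X) + 1 : 0;
    }
    static bool isdigitFold(int c) {  // isdigit(c) -> (c-'0') <u 10
      return (unsigned)(c - '0') < 10u;
    }
    static bool isasciiFold(int c) {  // isascii(c) -> c <u 128
      return (unsigned)c < 128u;
    }
    static int toasciiFold(int c) {   // toascii(c) -> c & 0x7f
      return c & 0x7f;
    }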
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { @@ -1990,6 +2202,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeExp2(CI, Builder); case Intrinsic::fabs: return optimizeFabs(CI, Builder); + case Intrinsic::log: + return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); default: @@ -2001,13 +2215,17 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) { // Try to further simplify the result. CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI); - if (SimplifiedCI && SimplifiedCI->getCalledFunction()) - if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) { + if (SimplifiedCI && SimplifiedCI->getCalledFunction()) { + // Use an IR Builder from SimplifiedCI if available instead of CI + // to guarantee we reach all uses we might replace later on. + IRBuilder<> TmpBuilder(SimplifiedCI); + if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. SimplifiedCI->replaceAllUsesWith(V); SimplifiedCI->eraseFromParent(); return V; } + } return SimplifiedFortifiedCI; } @@ -2068,8 +2286,18 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeFWrite(CI, Builder); case LibFunc::fputs: return optimizeFPuts(CI, Builder); + case LibFunc::log: + case LibFunc::log10: + case LibFunc::log1p: + case LibFunc::log2: + case LibFunc::logb: + return optimizeLog(CI, Builder); case LibFunc::puts: return optimizePuts(CI, Builder); + case LibFunc::tan: + case LibFunc::tanf: + case LibFunc::tanl: + return optimizeTan(CI, Builder); case LibFunc::perror: return optimizeErrorReporting(CI, Builder); case LibFunc::vfprintf: @@ -2097,24 +2325,23 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { case LibFunc::exp: case LibFunc::exp10: case LibFunc::expm1: - case LibFunc::log: - case LibFunc::log10: - case LibFunc::log1p: - case LibFunc::log2: - case LibFunc::logb: case LibFunc::sin: case LibFunc::sinh: - case LibFunc::tan: case LibFunc::tanh: if (UnsafeFPShrink && hasFloatVersion(FuncName)) return optimizeUnaryDoubleFP(CI, Builder, true); return nullptr; case LibFunc::copysign: - case LibFunc::fmin: - case LibFunc::fmax: if (hasFloatVersion(FuncName)) return optimizeBinaryDoubleFP(CI, Builder); return nullptr; + case LibFunc::fminf: + case LibFunc::fmin: + case LibFunc::fminl: + case LibFunc::fmaxf: + case LibFunc::fmax: + case LibFunc::fmaxl: + return optimizeFMinFMax(CI, Builder); default: return nullptr; } @@ -2133,37 +2360,27 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { Replacer(I, With); } -/*static*/ void LibCallSimplifier::replaceAllUsesWithDefault(Instruction *I, - Value *With) { - I->replaceAllUsesWith(With); - I->eraseFromParent(); -} - // TODO: // Additional cases that we need to add to this file: // // cbrt: // * cbrt(expN(X)) -> expN(x/3) // * cbrt(sqrt(x)) -> pow(x,1/6) -// * cbrt(sqrt(x)) -> pow(x,1/9) +// * cbrt(cbrt(x)) -> pow(x,1/9) // // exp, expf, expl: // * exp(log(x)) -> x // // log, logf, logl: // * log(exp(x)) -> x -// * log(x**y) -> y*log(x) // * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) // * log(exp10(y)) -> y*log(10) // * log(sqrt(x)) -> 0.5*log(x) -// * log(pow(x,y)) -> y*log(x) // // lround, lroundf, lroundl: // * lround(cnst) -> cnst' // // pow, powf, powl: -// * pow(exp(x),y) -> exp(x*y) // * pow(sqrt(x),y) -> pow(x,y*0.5) // * pow(pow(x,y),z)-> pow(x,y*z) // @@ -2179,9 
+2396,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // -// tan, tanf, tanl: -// * tan(atan(x)) -> x -// // trunc, truncf, truncl: // * trunc(cnst) -> cnst' // @@ -2218,7 +2432,8 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, return false; } -Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { +Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, + IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk)) @@ -2232,7 +2447,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> & return nullptr; } -Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { +Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, + IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk)) @@ -2246,7 +2462,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> return nullptr; } -Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) { +Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, + IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk)) @@ -2338,7 +2555,10 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) { LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); StringRef FuncName = Callee->getName(); - IRBuilder<> Builder(CI); + + SmallVector<OperandBundleDef, 2> OpBundles; + CI->getOperandBundlesAsDefs(OpBundles); + IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; // First, check that this is a known library functions. diff --git a/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp new file mode 100644 index 0000000..ad6b782 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/SplitModule.cpp @@ -0,0 +1,85 @@ +//===- SplitModule.cpp - Split a module into partitions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the function llvm::SplitModule, which splits a module +// into multiple linkable partitions. It can be used to implement parallel code +// generation for link-time optimization. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SplitModule.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +static void externalize(GlobalValue *GV) { + if (GV->hasLocalLinkage()) { + GV->setLinkage(GlobalValue::ExternalLinkage); + GV->setVisibility(GlobalValue::HiddenVisibility); + } + + // Unnamed entities must be named consistently between modules. setName will + // give a distinct name to each such entity. 
+ if (!GV->hasName()) + GV->setName("__llvmsplit_unnamed"); +} + +// Returns whether GV should be in partition (0-based) I of N. +static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) { + if (auto GA = dyn_cast<GlobalAlias>(GV)) + if (const GlobalObject *Base = GA->getBaseObject()) + GV = Base; + + StringRef Name; + if (const Comdat *C = GV->getComdat()) + Name = C->getName(); + else + Name = GV->getName(); + + // Partition by MD5 hash. We only need a few bits for evenness as the number + // of partitions will generally be in the 1-2 figure range; the low 16 bits + // are enough. + MD5 H; + MD5::MD5Result R; + H.update(Name); + H.final(R); + return (R[0] | (R[1] << 8)) % N == I; +} + +void llvm::SplitModule( + std::unique_ptr<Module> M, unsigned N, + std::function<void(std::unique_ptr<Module> MPart)> ModuleCallback) { + for (Function &F : *M) + externalize(&F); + for (GlobalVariable &GV : M->globals()) + externalize(&GV); + for (GlobalAlias &GA : M->aliases()) + externalize(&GA); + + // FIXME: We should be able to reuse M as the last partition instead of + // cloning it. + for (unsigned I = 0; I != N; ++I) { + ValueToValueMapTy VMap; + std::unique_ptr<Module> MPart( + CloneModule(M.get(), VMap, [=](const GlobalValue *GV) { + return isInPartition(GV, I, N); + })); + if (I != 0) + MPart->setModuleInlineAsm(""); + ModuleCallback(std::move(MPart)); + } +} diff --git a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index a2a54da..1d1f602 100644 --- a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -69,7 +69,6 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 7e00a80..6b1d1da 100644 --- a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -50,11 +50,11 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // std::vector<BasicBlock*> ReturningBlocks; std::vector<BasicBlock*> UnreachableBlocks; - for(Function::iterator I = F.begin(), E = F.end(); I != E; ++I) - if (isa<ReturnInst>(I->getTerminator())) - ReturningBlocks.push_back(I); - else if (isa<UnreachableInst>(I->getTerminator())) - UnreachableBlocks.push_back(I); + for (BasicBlock &I : F) + if (isa<ReturnInst>(I.getTerminator())) + ReturningBlocks.push_back(&I); + else if (isa<UnreachableInst>(I.getTerminator())) + UnreachableBlocks.push_back(&I); // Then unreachable blocks. if (UnreachableBlocks.empty()) { diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp index 8c72641..f47ddb9 100644 --- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -19,11 +19,14 @@ #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h" using namespace llvm; // Out of line method to get vtable etc for class. 
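isInPartition above buckets a global by hashing its comdat name when it has one, so all members of a comdat land in the same partition, and by its symbol name otherwise; with partition counts in the single or double digits, the low 16 bits of the MD5 are plenty. A standalone mirror of the computation (hypothetical helper name):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/MD5.h"

    static unsigned partitionOf(llvm::StringRef Name, unsigned N) {
      llvm::MD5 H;
      llvm::MD5::MD5Result R;
      H.update(Name);
      H.final(R);
      return (R[0] | (R[1] << 8)) % N; // same low-16-bit bucket as above
    }

    // Two globals in one comdat hash the same Name, hence the same bucket,
    // so CloneModule never splits a comdat across output partitions.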
void ValueMapTypeRemapper::anchor() {} void ValueMaterializer::anchor() {} +void ValueMaterializer::materializeInitFor(GlobalValue *New, GlobalValue *Old) { +} Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, @@ -35,15 +38,28 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // If we have a materializer and it can materialize a value, use that. if (Materializer) { - if (Value *NewV = Materializer->materializeValueFor(const_cast<Value*>(V))) - return VM[V] = NewV; + if (Value *NewV = + Materializer->materializeDeclFor(const_cast<Value *>(V))) { + VM[V] = NewV; + if (auto *NewGV = dyn_cast<GlobalValue>(NewV)) + Materializer->materializeInitFor( + NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V))); + return NewV; + } } // Global values do not need to be seeded into the VM if they // are using the identity mapping. - if (isa<GlobalValue>(V)) + if (isa<GlobalValue>(V)) { + if (Flags & RF_NullMapMissingGlobalValues) { + assert(!(Flags & RF_IgnoreMissingEntries) && + "Illegal to specify both RF_NullMapMissingGlobalValues and " + "RF_IgnoreMissingEntries"); + return nullptr; + } return VM[V] = const_cast<Value*>(V); - + } + if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) { // Inline asm may need *type* remapping. FunctionType *NewTy = IA->getFunctionType(); @@ -73,7 +89,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // correct. For now, just match behaviour from before the metadata/value // split. // - // assert(MappedMD && "Referenced metadata value not in value map"); + // assert((MappedMD || (Flags & RF_NullMapMissingGlobalValues)) && + // "Referenced metadata value not in value map"); return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD); } @@ -127,9 +144,13 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM, Flags, TypeMapper, Materializer)); } - + Type *NewSrcTy = nullptr; + if (TypeMapper) + if (auto *GEPO = dyn_cast<GEPOperator>(C)) + NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType()); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - return VM[V] = CE->getWithOperands(Ops, NewTy); + return VM[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy); if (isa<ConstantArray>(C)) return VM[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops); if (isa<ConstantStruct>(C)) @@ -146,29 +167,42 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, } static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key, - Metadata *Val) { + Metadata *Val, ValueMaterializer *Materializer, + RemapFlags Flags) { VM.MD()[Key].reset(Val); + if (Materializer && !(Flags & RF_HaveUnmaterializedMetadata)) { + auto *N = dyn_cast_or_null<MDNode>(Val); + // Need to invoke this once we have non-temporary MD. 
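The materializer hook is now split in two around the value-map insertion: materializeDeclFor produces just a declaration, the mapping is recorded, and only then does materializeInitFor fill in the body or initializer. The payoff is that a self-referential global (or a mutually referential pair) already finds itself in the map while its initializer is being copied. A hypothetical client, purely to show the shape:

    #include "llvm/Transforms/Utils/ValueMapper.h"
    using namespace llvm;

    class LazyGlobalMaterializer : public ValueMaterializer {
      Value *materializeDeclFor(Value *V) override {
        // Create and return a bare declaration for V in the destination
        // module, or nullptr to decline; MapValue installs the mapping.
        return nullptr; // stub: decline everything
      }
      void materializeInitFor(GlobalValue *New, GlobalValue *Old) override {
        // Runs after New is mapped, so copying Old's initializer here can
        // resolve references back through the value map, even cycles.
      }
    };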
+ if (!N || !N->isTemporary()) + Materializer->replaceTemporaryMetadata(Key, Val); + } return Val; } -static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD) { - return mapToMetadata(VM, MD, const_cast<Metadata *>(MD)); +static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD, + ValueMaterializer *Materializer, RemapFlags Flags) { + return mapToMetadata(VM, MD, const_cast<Metadata *>(MD), Materializer, Flags); } static Metadata *MapMetadataImpl(const Metadata *MD, - SmallVectorImpl<MDNode *> &Cycles, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer); -static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, +static Metadata *mapMetadataOp(Metadata *Op, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { if (!Op) return nullptr; - if (Metadata *MappedOp = - MapMetadataImpl(Op, Cycles, VM, Flags, TypeMapper, Materializer)) + + if (Materializer && !Materializer->isMetadataNeeded(Op)) + return nullptr; + + if (Metadata *MappedOp = MapMetadataImpl(Op, DistinctWorklist, VM, Flags, + TypeMapper, Materializer)) return MappedOp; // Use identity map if MappedOp is null and we can ignore missing entries. if (Flags & RF_IgnoreMissingEntries) @@ -178,89 +212,122 @@ static Metadata *mapMetadataOp(Metadata *Op, SmallVectorImpl<MDNode *> &Cycles, // correct. For now, just match behaviour from before the metadata/value // split. // - // llvm_unreachable("Referenced metadata not in value map!"); + // assert((Flags & RF_NullMapMissingGlobalValues) && + // "Referenced metadata not in value map!"); return nullptr; } -/// \brief Remap nodes. +/// Resolve uniquing cycles involving the given metadata. +static void resolveCycles(Metadata *MD, bool AllowTemps) { + if (auto *N = dyn_cast_or_null<MDNode>(MD)) { + if (AllowTemps && N->isTemporary()) + return; + if (!N->isResolved()) { + if (AllowTemps) + // Note that this will drop RAUW support on any temporaries, which + // blocks uniquing. If this ends up being an issue, in the future + // we can experiment with delaying resolving these nodes until + // after metadata is fully materialized (i.e. when linking metadata + // as a postpass after function importing). + N->resolveNonTemporaries(); + else + N->resolveCycles(); + } + } +} + +/// Remap the operands of an MDNode. /// -/// Insert \c NewNode in the value map, and then remap \c OldNode's operands. -/// Assumes that \c NewNode is already a clone of \c OldNode. +/// If \c Node is temporary, uniquing cycles are ignored. If \c Node is +/// distinct, uniquing cycles are resolved as they're found. /// -/// \pre \c NewNode is a clone of \c OldNode. -static bool remap(const MDNode *OldNode, MDNode *NewNode, - SmallVectorImpl<MDNode *> &Cycles, ValueToValueMapTy &VM, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - assert(OldNode->getNumOperands() == NewNode->getNumOperands() && - "Expected nodes to match"); - assert(OldNode->isResolved() && "Expected resolved node"); - assert(!NewNode->isUniqued() && "Expected non-uniqued node"); - - // Map the node upfront so it's available for cyclic references. 
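To make the two-phase materialization protocol above concrete, here is a hedged sketch of a ValueMaterializer subclass: materializeDeclFor() creates a bare declaration, and materializeInitFor() runs after the mapping is recorded. The class, its destination module, and the verbatim initializer copy are hypothetical, not taken from this patch.

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// Hypothetical materializer: declarations first, initializers second, so
// initializers that refer back to the global resolve through the value map.
class LazyGlobalMaterializer final : public llvm::ValueMaterializer {
  llvm::Module &Dst;

public:
  explicit LazyGlobalMaterializer(llvm::Module &Dst) : Dst(Dst) {}

  llvm::Value *materializeDeclFor(llvm::Value *V) override {
    if (auto *GV = llvm::dyn_cast<llvm::GlobalVariable>(V))
      return new llvm::GlobalVariable(Dst, GV->getValueType(),
                                      GV->isConstant(), GV->getLinkage(),
                                      /*Initializer=*/nullptr, GV->getName());
    return nullptr; // Anything else falls through to the default mapping.
  }

  void materializeInitFor(llvm::GlobalValue *New,
                          llvm::GlobalValue *Old) override {
    // A real linker would remap the initializer through the value map; a
    // verbatim copy is enough here to show where that work belongs.
    auto *NewGV = llvm::cast<llvm::GlobalVariable>(New);
    auto *OldGV = llvm::cast<llvm::GlobalVariable>(Old);
    if (OldGV->hasInitializer())
      NewGV->setInitializer(OldGV->getInitializer());
  }
};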
- mapToMetadata(VM, OldNode, NewNode); - bool AnyChanged = false; - for (unsigned I = 0, E = OldNode->getNumOperands(); I != E; ++I) { - Metadata *Old = OldNode->getOperand(I); - assert(NewNode->getOperand(I) == Old && - "Expected old operands to already be in place"); +/// \pre \c Node.isDistinct() or \c Node.isTemporary(). +static bool remapOperands(MDNode &Node, + SmallVectorImpl<MDNode *> &DistinctWorklist, + ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + assert(!Node.isUniqued() && "Expected temporary or distinct node"); + const bool IsDistinct = Node.isDistinct(); - Metadata *New = mapMetadataOp(OldNode->getOperand(I), Cycles, VM, Flags, - TypeMapper, Materializer); + bool AnyChanged = false; + for (unsigned I = 0, E = Node.getNumOperands(); I != E; ++I) { + Metadata *Old = Node.getOperand(I); + Metadata *New = mapMetadataOp(Old, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); if (Old != New) { AnyChanged = true; - NewNode->replaceOperandWith(I, New); + Node.replaceOperandWith(I, New); + + // Resolve uniquing cycles underneath distinct nodes on the fly so they + // don't infect later operands. + if (IsDistinct) + resolveCycles(New, Flags & RF_HaveUnmaterializedMetadata); } } return AnyChanged; } -/// \brief Map a distinct MDNode. +/// Map a distinct MDNode. /// -/// Distinct nodes are not uniqued, so they must always recreated. +/// Whether distinct nodes change is independent of their operands. If \a +/// RF_MoveDistinctMDs, then they are reused, and their operands remapped in +/// place; effectively, they're moved from one graph to another. Otherwise, +/// they're cloned/duplicated, and the new copy's operands are remapped. static Metadata *mapDistinctNode(const MDNode *Node, - SmallVectorImpl<MDNode *> &Cycles, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { assert(Node->isDistinct() && "Expected distinct node"); - MDNode *NewMD = MDNode::replaceWithDistinct(Node->clone()); - remap(Node, NewMD, Cycles, VM, Flags, TypeMapper, Materializer); + MDNode *NewMD; + if (Flags & RF_MoveDistinctMDs) + NewMD = const_cast<MDNode *>(Node); + else + NewMD = MDNode::replaceWithDistinct(Node->clone()); - // Track any cycles beneath this node. - for (Metadata *Op : NewMD->operands()) - if (auto *Node = dyn_cast_or_null<MDNode>(Op)) - if (!Node->isResolved()) - Cycles.push_back(Node); - - return NewMD; + // Remap operands later. + DistinctWorklist.push_back(NewMD); + return mapToMetadata(VM, Node, NewMD, Materializer, Flags); } /// \brief Map a uniqued MDNode. /// /// Uniqued nodes may not need to be recreated (they may map to themselves). static Metadata *mapUniquedNode(const MDNode *Node, - SmallVectorImpl<MDNode *> &Cycles, + SmallVectorImpl<MDNode *> &DistinctWorklist, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { - assert(Node->isUniqued() && "Expected uniqued node"); + assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isUniqued()) && + "Expected uniqued node"); - // Create a temporary node upfront in case we have a metadata cycle. + // Create a temporary node and map it upfront in case we have a uniquing + // cycle. If necessary, this mapping will get updated by RAUW logic before + // returning. 
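From the caller's side, the RF_MoveDistinctMDs choice documented above looks like this. A minimal hedged sketch; the helper name is hypothetical.

#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// With RF_MoveDistinctMDs the mapped result is the original distinct node,
// reused in place with remapped operands; without it, a fresh distinct
// clone is created and the original graph is left untouched.
static llvm::Metadata *remapDistinct(const llvm::MDNode *N,
                                     llvm::ValueToValueMapTy &VM,
                                     bool MoveDistinct) {
  llvm::RemapFlags Flags =
      MoveDistinct ? llvm::RF_MoveDistinctMDs : llvm::RF_None;
  return llvm::MapMetadata(N, VM, Flags);
}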
 auto ClonedMD = Node->clone();
-  if (!remap(Node, ClonedMD.get(), Cycles, VM, Flags, TypeMapper, Materializer))
-    // No operands changed, so use the identity mapping.
-    return mapToSelf(VM, Node);
+  mapToMetadata(VM, Node, ClonedMD.get(), Materializer, Flags);
+  if (!remapOperands(*ClonedMD, DistinctWorklist, VM, Flags, TypeMapper,
+                     Materializer)) {
+    // No operands changed, so use the original.
+    ClonedMD->replaceAllUsesWith(const_cast<MDNode *>(Node));
+    // Even though replaceAllUsesWith would have replaced the value map
+    // entry, we need to explicitly map with the final non-temporary node
+    // to replace any temporary metadata via the callback.
+    return mapToSelf(VM, Node, Materializer, Flags);
+  }
 
-  // At least one operand has changed, so uniquify the cloned node.
+  // Uniquify the cloned node. Explicitly map it with the final non-temporary
+  // node so that replacement of temporary metadata via the callback occurs.
   return mapToMetadata(VM, Node,
-                       MDNode::replaceWithUniqued(std::move(ClonedMD)));
+                       MDNode::replaceWithUniqued(std::move(ClonedMD)),
+                       Materializer, Flags);
 }
 
 static Metadata *MapMetadataImpl(const Metadata *MD,
-                                 SmallVectorImpl<MDNode *> &Cycles,
+                                 SmallVectorImpl<MDNode *> &DistinctWorklist,
                                  ValueToValueMapTy &VM, RemapFlags Flags,
                                  ValueMapTypeRemapper *TypeMapper,
                                  ValueMaterializer *Materializer) {
@@ -269,26 +336,28 @@ static Metadata *MapMetadataImpl(const Metadata *MD,
     return NewMD;
 
   if (isa<MDString>(MD))
-    return mapToSelf(VM, MD);
+    return mapToSelf(VM, MD, Materializer, Flags);
 
   if (isa<ConstantAsMetadata>(MD))
     if ((Flags & RF_NoModuleLevelChanges))
-      return mapToSelf(VM, MD);
+      return mapToSelf(VM, MD, Materializer, Flags);
 
   if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) {
     Value *MappedV =
         MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer);
     if (VMD->getValue() == MappedV ||
         (!MappedV && (Flags & RF_IgnoreMissingEntries)))
-      return mapToSelf(VM, MD);
+      return mapToSelf(VM, MD, Materializer, Flags);
 
     // FIXME: This assert crashes during bootstrap, but I think it should be
     // correct. For now, just match behaviour from before the metadata/value
     // split.
     //
-    // assert(MappedV && "Referenced metadata not in value map!");
+    // assert((MappedV || (Flags & RF_NullMapMissingGlobalValues)) &&
+    //        "Referenced metadata not in value map!");
     if (MappedV)
-      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV));
+      return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV), Materializer,
+                           Flags);
     return nullptr;
   }
 
@@ -299,37 +368,54 @@ static Metadata *MapMetadataImpl(const Metadata *MD,
   // If this is a module-level metadata and we know that nothing at the
   // module level is changing, then use an identity mapping.
   if (Flags & RF_NoModuleLevelChanges)
-    return mapToSelf(VM, MD);
+    return mapToSelf(VM, MD, Materializer, Flags);
 
   // Require resolved nodes whenever metadata might be remapped.
-  assert(Node->isResolved() && "Unexpected unresolved node");
+  assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isResolved()) &&
+         "Unexpected unresolved node");
+
+  if (Materializer && Node->isTemporary()) {
+    assert(Flags & RF_HaveUnmaterializedMetadata);
+    Metadata *TempMD =
+        Materializer->mapTemporaryMetadata(const_cast<Metadata *>(MD));
+    // If the above callback returned an existing temporary node, use it
+    // instead of the current temporary node. This happens when earlier
+    // function importing passes already created and saved a temporary
+    // metadata node for the same value id.
+ if (TempMD) { + mapToMetadata(VM, MD, TempMD, Materializer, Flags); + return TempMD; + } + } if (Node->isDistinct()) - return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); + return mapDistinctNode(Node, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); - return mapUniquedNode(Node, Cycles, VM, Flags, TypeMapper, Materializer); + return mapUniquedNode(Node, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); } Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) { - SmallVector<MDNode *, 8> Cycles; - Metadata *NewMD = - MapMetadataImpl(MD, Cycles, VM, Flags, TypeMapper, Materializer); - - // Resolve cycles underneath MD. - if (NewMD && NewMD != MD) { - if (auto *N = dyn_cast<MDNode>(NewMD)) - if (!N->isResolved()) - N->resolveCycles(); + SmallVector<MDNode *, 8> DistinctWorklist; + Metadata *NewMD = MapMetadataImpl(MD, DistinctWorklist, VM, Flags, TypeMapper, + Materializer); - for (MDNode *N : Cycles) - if (!N->isResolved()) - N->resolveCycles(); - } else { - // Shouldn't get unresolved cycles if nothing was remapped. - assert(Cycles.empty() && "Expected no unresolved cycles"); - } + // When there are no module-level changes, it's possible that the metadata + // graph has temporaries. Skip the logic to resolve cycles, since it's + // unnecessary (and invalid) in that case. + if (Flags & RF_NoModuleLevelChanges) + return NewMD; + + // Resolve cycles involving the entry metadata. + resolveCycles(NewMD, Flags & RF_HaveUnmaterializedMetadata); + + // Remap the operands of distinct MDNodes. + while (!DistinctWorklist.empty()) + remapOperands(*DistinctWorklist.pop_back_val(), DistinctWorklist, VM, Flags, + TypeMapper, Materializer); return NewMD; } @@ -374,14 +460,11 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, // Remap attached metadata. 
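A small worked example of the cycle handling described above; the self-referential node mimics a loop ID. Sketch only, assuming the MapMetadata entry point shown in this same file.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// Build a distinct node whose first operand is itself, then remap it.
// Distinct nodes land on DistinctWorklist, so the self-reference is walked
// iteratively rather than by unbounded recursion.
static void remapSelfReferentialNode(llvm::LLVMContext &Ctx) {
  llvm::Metadata *Ops[] = {nullptr};
  llvm::MDNode *Loop = llvm::MDNode::getDistinct(Ctx, Ops);
  Loop->replaceOperandWith(0, Loop); // Self-reference, as loop IDs use.

  llvm::ValueToValueMapTy VM;
  // Without RF_MoveDistinctMDs this yields a distinct clone whose
  // operand 0 points at the clone, not at the original node.
  llvm::Metadata *Mapped = llvm::MapMetadata(Loop, VM);
  (void)Mapped;
}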
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; I->getAllMetadata(MDs); - for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator - MI = MDs.begin(), - ME = MDs.end(); - MI != ME; ++MI) { - MDNode *Old = MI->second; + for (const auto &MI : MDs) { + MDNode *Old = MI.second; MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer); if (New != Old) - I->setMetadata(MI->first, New); + I->setMetadata(MI.first, New); } if (!TypeMapper) diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 215d6f9..8844d57 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -25,8 +25,11 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -204,9 +207,10 @@ namespace { BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) : BasicBlockPass(ID), Config(C) { - AA = &P->getAnalysis<AliasAnalysis>(); + AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &P->getAnalysis<ScalarEvolution>(); + SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -221,6 +225,7 @@ namespace { AliasAnalysis *AA; DominatorTree *DT; ScalarEvolution *SE; + const TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; // FIXME: const correct? @@ -437,9 +442,10 @@ namespace { bool runOnBasicBlock(BasicBlock &BB) override { // OptimizeNone check deferred to vectorizeBB(). - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( @@ -450,13 +456,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { BasicBlockPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.setPreservesCFG(); } @@ -842,7 +850,7 @@ namespace { // It is important to cleanup here so that future iterations of this // function have less work to do. 
- (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo()); + (void)SimplifyInstructionsInBlock(&BB, TLI); return true; } @@ -1239,20 +1247,23 @@ namespace { if (I == Start) IAfterStart = true; bool IsSimpleLoadStore; - if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; + if (!isInstVectorizable(&*I, IsSimpleLoadStore)) + continue; // Look for an instruction with which to pair instruction *I... DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); + if (I->mayWriteToMemory()) + WriteSet.add(&*I); bool JAfterStart = IAfterStart; BasicBlock::iterator J = std::next(I); for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { - if (J == Start) JAfterStart = true; + if (&*J == Start) + JAfterStart = true; // Determine if J uses I, if so, exit the loop. - bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); + bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep); if (Config.FastDep) { // Note: For this heuristic to be effective, independent operations // must tend to be intermixed. This is likely to be true from some @@ -1269,25 +1280,26 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. int CostSavings, FixedOrder; - if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, - CostSavings, FixedOrder)) continue; + if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, + CostSavings, FixedOrder)) + continue; // J is a candidate for merging with I. if (PairableInsts.empty() || - PairableInsts[PairableInsts.size()-1] != I) { - PairableInsts.push_back(I); + PairableInsts[PairableInsts.size() - 1] != &*I) { + PairableInsts.push_back(&*I); } - CandidatePairs[I].push_back(J); + CandidatePairs[&*I].push_back(&*J); ++TotalPairs; if (TTI) - CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), - CostSavings)); + CandidatePairCostSavings.insert( + ValuePairWithCost(ValuePair(&*I, &*J), CostSavings)); if (FixedOrder == 1) - FixedOrderPairs.insert(ValuePair(I, J)); + FixedOrderPairs.insert(ValuePair(&*I, &*J)); else if (FixedOrder == -1) - FixedOrderPairs.insert(ValuePair(J, I)); + FixedOrderPairs.insert(ValuePair(&*J, &*I)); // The next call to this function must start after the last instruction // selected during this invocation. 
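The analysis plumbing changes above all follow one idiom: the old AliasAnalysis analysis group is gone, and results are now fetched out of wrapper passes. A hedged sketch of that pattern in a stand-alone legacy pass (the pass itself is hypothetical):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"

namespace {
// Requires the wrapper passes, then pulls the actual result objects out of
// them, exactly as BBVectorize now does in runOnBasicBlock().
struct WrapperIdiomPass : public llvm::FunctionPass {
  static char ID;
  WrapperIdiomPass() : llvm::FunctionPass(ID) {}

  bool runOnFunction(llvm::Function &F) override {
    auto &AA = getAnalysis<llvm::AAResultsWrapperPass>().getAAResults();
    auto &SE = getAnalysis<llvm::ScalarEvolutionWrapperPass>().getSE();
    auto &TLI = getAnalysis<llvm::TargetLibraryInfoWrapperPass>().getTLI();
    (void)AA; (void)SE; (void)TLI;
    return false;
  }

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    AU.addRequired<llvm::AAResultsWrapperPass>();
    AU.addRequired<llvm::ScalarEvolutionWrapperPass>();
    AU.addRequired<llvm::TargetLibraryInfoWrapperPass>();
    AU.setPreservesAll();
  }
};
}
char WrapperIdiomPass::ID = 0;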
@@ -1468,14 +1480,16 @@ namespace { BasicBlock::iterator E = BB.end(), EL = BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { - if (IsInPair.find(I) == IsInPair.end()) continue; + if (IsInPair.find(&*I) == IsInPair.end()) + continue; DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); - if (I->mayWriteToMemory()) WriteSet.add(I); + if (I->mayWriteToMemory()) + WriteSet.add(&*I); for (BasicBlock::iterator J = std::next(I); J != E; ++J) { - (void) trackUsesOfI(Users, WriteSet, I, J); + (void)trackUsesOfI(Users, WriteSet, &*I, &*J); if (J == EL) break; @@ -1484,7 +1498,7 @@ namespace { for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); U != E; ++U) { if (IsInPair.find(*U) == IsInPair.end()) continue; - PairableInstUsers.insert(ValuePair(I, *U)); + PairableInstUsers.insert(ValuePair(&*I, *U)); } if (I == EL) @@ -2806,55 +2820,51 @@ namespace { Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, Instruction *&K2) { - if (isa<StoreInst>(I)) { - AA->replaceWithNewValue(I, K); - AA->replaceWithNewValue(J, K); - } else { - Type *IType = I->getType(); - Type *JType = J->getType(); + if (isa<StoreInst>(I)) + return; - VectorType *VType = getVecTypeForPair(IType, JType); - unsigned numElem = VType->getNumElements(); + Type *IType = I->getType(); + Type *JType = J->getType(); - unsigned numElemI = getNumScalarElements(IType); - unsigned numElemJ = getNumScalarElements(JType); + VectorType *VType = getVecTypeForPair(IType, JType); + unsigned numElem = VType->getNumElements(); - if (IType->isVectorTy()) { - std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); - for (unsigned v = 0; v < numElemI; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); - } + unsigned numElemI = getNumScalarElements(IType); + unsigned numElemJ = getNumScalarElements(JType); - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( Mask1), - getReplacementName(K, false, 1)); - } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - K1 = ExtractElementInst::Create(K, CV0, - getReplacementName(K, false, 1)); + if (IType->isVectorTy()) { + std::vector<Constant *> Mask1(numElemI), Mask2(numElemI); + for (unsigned v = 0; v < numElemI; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v); } - if (JType->isVectorTy()) { - std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); - for (unsigned v = 0; v < numElemJ; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); - } + K1 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get(Mask1), + getReplacementName(K, false, 1)); + } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); + } - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( Mask2), - getReplacementName(K, false, 2)); - } else { - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K2 = ExtractElementInst::Create(K, CV1, - getReplacementName(K, false, 2)); + if (JType->isVectorTy()) { + std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ); + for (unsigned v = 0; v < numElemJ; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + 
Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v); } - K1->insertAfter(K); - K2->insertAfter(K1); - InsertionPt = K2; + K2 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get(Mask2), + getReplacementName(K, false, 2)); + } else { + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); + K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); } + + K1->insertAfter(K); + K2->insertAfter(K1); + InsertionPt = K2; } // Move all uses of the function I (including pairing-induced uses) after J. @@ -2869,7 +2879,7 @@ namespace { if (I->mayWriteToMemory()) WriteSet.add(I); for (; cast<Instruction>(L) != J; ++L) - (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); + (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs); assert(cast<Instruction>(L) == J && "Tracking has not proceeded far enough to check for dependencies"); @@ -2891,9 +2901,9 @@ namespace { if (I->mayWriteToMemory()) WriteSet.add(I); for (; cast<Instruction>(L) != J;) { - if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { + if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) { // Move this instruction - Instruction *InstToMove = L; ++L; + Instruction *InstToMove = &*L++; DEBUG(dbgs() << "BBV: moving: " << *InstToMove << " to after " << *InsertionPt << "\n"); @@ -2924,11 +2934,11 @@ namespace { // Note: We cannot end the loop when we reach J because J could be moved // farther down the use chain by another instruction pairing. Also, J // could be before I if this is an inverted input. - for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { - if (trackUsesOfI(Users, WriteSet, I, L)) { + for (BasicBlock::iterator E = BB.end(); L != E; ++L) { + if (trackUsesOfI(Users, WriteSet, I, &*L)) { if (L->mayReadFromMemory()) { - LoadMoveSet[L].push_back(I); - LoadMoveSetPairs.insert(ValuePair(L, I)); + LoadMoveSet[&*L].push_back(I); + LoadMoveSetPairs.insert(ValuePair(&*L, I)); } } } @@ -2991,7 +3001,7 @@ namespace { DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI); if (P == ChosenPairs.end()) { ++PI; continue; @@ -3116,12 +3126,9 @@ namespace { } else if (!isa<StoreInst>(K)) K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_fpmath - }; + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_fpmath, + LLVMContext::MD_invariant_group}; combineMetadata(K, H, KnownIDs); K->intersectOptionalDataWith(H); @@ -3145,8 +3152,6 @@ namespace { if (!isa<StoreInst>(I)) { L->replaceAllUsesWith(K1); H->replaceAllUsesWith(K2); - AA->replaceWithNewValue(L, K1); - AA->replaceWithNewValue(H, K2); } // Instructions that may read from memory may be in the load move set. 
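The mask bookkeeping in the restructured block above splits a fused vector K = <I lanes, J lanes> back into its halves. A hedged sketch of the first-half extraction; the helper name is hypothetical, and Mask2 above does the same for the J-part with indices offset by numElemI.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include <vector>

// An identity mask over the first NumElemI lanes, shuffled against undef,
// recovers the I-part of the fused vector K.
static llvm::Value *extractFirstHalf(llvm::IRBuilder<> &B, llvm::Value *K,
                                     unsigned NumElemI) {
  llvm::LLVMContext &Ctx = B.getContext();
  std::vector<llvm::Constant *> Mask(NumElemI);
  for (unsigned v = 0; v < NumElemI; ++v)
    Mask[v] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), v);
  return B.CreateShuffleVector(K, llvm::UndefValue::get(K->getType()),
                               llvm::ConstantVector::get(Mask));
}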
@@ -3197,10 +3202,14 @@ namespace { char BBVectorize::ID = 0; static const char bb_vectorize_name[] = "Basic-Block Vectorization"; INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 69ca268..2c0d317 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -48,7 +48,6 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" @@ -58,10 +57,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -99,6 +101,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> +#include <functional> #include <map> #include <tuple> @@ -123,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), "trip count that is smaller than this " "value.")); +static cl::opt<bool> MaximizeBandwidth( + "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, + cl::desc("Maximize bandwidth when selecting vectorization factor which " + "will be determined by the smallest type in loop.")); + /// This enables versioning on the strides of symbolically striding memory /// accesses in code like the following. /// for (i = 0; i < N; ++i) @@ -136,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), /// ... 
static cl::opt<bool> EnableMemAccessVersioning( "enable-mem-access-versioning", cl::init(true), cl::Hidden, - cl::desc("Enable symblic stride memory access versioning")); + cl::desc("Enable symbolic stride memory access versioning")); static cl::opt<bool> EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, @@ -214,12 +222,27 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); +static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + +static cl::opt<unsigned> VectorizeSCEVCheckThreshold( + "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed.")); + +static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( + "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed with a " + "vectorize(enable) pragma")); + namespace { // Forward declarations. +class LoopVectorizeHints; class LoopVectorizationLegality; class LoopVectorizationCostModel; -class LoopVectorizeHints; +class LoopVectorizationRequirements; /// \brief This modifies LoopAccessReport to initialize message with /// loop-vectorizer-specific part. @@ -245,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } +/// A helper function that returns GEP instruction and knows to skip a +/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination +/// pointee types of the 'bitcast' have the same size. +/// For example: +/// bitcast double** %var to i64* - can be skipped +/// bitcast double** %var to i8* - can not +static GetElementPtrInst *getGEPInstruction(Value *Ptr) { + + if (isa<GetElementPtrInst>(Ptr)) + return cast<GetElementPtrInst>(Ptr); + + if (isa<BitCastInst>(Ptr) && + isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { + Type *BitcastTy = Ptr->getType(); + Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); + if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) + return nullptr; + Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); + Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); + const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); + if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) + return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); + } + return nullptr; +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -261,25 +310,30 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) { /// and reduction variables that were found to a given vectorization factor. 
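The size test at the heart of getGEPInstruction() above is simple; a minimal sketch, assuming a DataLayout is at hand:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// A bitcast is transparent for GEP recovery only when both pointee types
// occupy the same number of bits: double vs. i64 is 64 == 64 and may be
// skipped, while double vs. i8 is 64 != 8 and may not.
static bool pointeeSizesMatch(const llvm::DataLayout &DL, llvm::Type *A,
                              llvm::Type *B) {
  return DL.getTypeSizeInBits(A) == DL.getTypeSizeInBits(B);
}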
class InnerLoopVectorizer { public: - InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, + InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, unsigned VecWidth, unsigned UnrollFactor) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), + : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), + VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), - Legal(nullptr), AddedSafetyChecks(false) {} + TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). - void vectorize(LoopVectorizationLegality *L) { + // MinimumBitWidths maps scalar integer values to the smallest bitwidth they + // can be validly truncated to. The cost model has assumed this truncation + // will happen when vectorizing. + void vectorize(LoopVectorizationLegality *L, + MapVector<Instruction*,uint64_t> MinimumBitWidths) { + MinBWs = MinimumBitWidths; Legal = L; // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. // Use the Legality module to find the induction and reduction variables. vectorizeLoop(); - // Register the new loop and update the analysis passes. - updateAnalysis(); } // Return true if any runtime check is added. @@ -302,14 +356,11 @@ protected: typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts> EdgeMaskCache; - /// \brief Add checks for strides that were assumed to be 1. - /// - /// Returns the last check instruction and the first check instruction in the - /// pair as (first, last). - std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc); - /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); + /// Create a new induction variable inside L. + PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, + Value *Step, Instruction *DL); /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); @@ -319,6 +370,9 @@ protected: /// See PR14725. void fixLCSSAPHIs(); + /// Shrinks vector element sizes based on information in "MinBWs". + void truncateToMinimalBitwidths(); + /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* /// mask for the block BB. @@ -329,7 +383,7 @@ protected: /// A helper function to vectorize a single BB within the innermost loop. void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); - + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -374,6 +428,23 @@ protected: /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); + /// Returns (and creates if needed) the original loop trip count. + Value *getOrCreateTripCount(Loop *NewLoop); + + /// Returns (and creates if needed) the trip count of the widened loop. 
+  Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+  /// Emit a bypass check to see if the trip count would overflow, or we
+  /// wouldn't have enough iterations to execute one vector loop.
+  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+  /// Emit a bypass check to see if the vector trip count is nonzero.
+  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
+  /// Emit a bypass check to see if all of the SCEV assumptions we've
+  /// had to make are correct.
+  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+  /// Emit bypass checks to check any memory assumptions we may have made.
+  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
   /// This is a helper class that holds the vectorizer state. It maps scalar
   /// instructions to vector instructions. When the code is 'unrolled' then
   /// a single scalar value is mapped to multiple vector parts. The parts
@@ -416,8 +487,10 @@ protected:
   /// The original loop.
   Loop *OrigLoop;
-  /// Scev analysis to use.
-  ScalarEvolution *SE;
+  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+  /// dynamic knowledge to simplify SCEV expressions and converts them to a
+  /// more usable form.
+  PredicatedScalarEvolution &PSE;
   /// Loop Info.
   LoopInfo *LI;
   /// Dominator Tree.
@@ -462,12 +535,21 @@ protected:
   PHINode *Induction;
   /// The induction variable of the old basic block.
   PHINode *OldInduction;
-  /// Holds the extended (to the widest induction type) start index.
-  Value *ExtendedIdx;
   /// Maps scalars to widened vectors.
   ValueMap WidenMap;
+  /// Store instructions that should be predicated, as a pair
+  /// <StoreInst, Predicate>
+  SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
   EdgeMaskCache MaskCache;
-
+  /// Trip count of the original loop.
+  Value *TripCount;
+  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+  Value *VectorTripCount;
+
+  /// Map of scalar integer values to the smallest bitwidth they can be legally
+  /// represented as. The vector equivalents of these values should be truncated
+  /// to this type.
+  MapVector<Instruction*,uint64_t> MinBWs;
   LoopVectorizationLegality *Legal;
 
   // Record whether runtime check is added.
@@ -476,10 +558,11 @@
 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
-  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, const TargetLibraryInfo *TLI,
+  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+                    LoopInfo *LI, DominatorTree *DT,
+                    const TargetLibraryInfo *TLI,
                     const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
 
 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -551,7 +634,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
     if (Kind != LLVMContext::MD_tbaa &&
         Kind != LLVMContext::MD_alias_scope &&
         Kind != LLVMContext::MD_noalias &&
-        Kind != LLVMContext::MD_fpmath)
+        Kind != LLVMContext::MD_fpmath &&
+        Kind != LLVMContext::MD_nontemporal)
       continue;
 
     To->setMetadata(Kind, M.second);
@@ -559,7 +643,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
 }
 
 /// \brief Propagate known metadata from one instruction to a vector of others.
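The VectorTripCount comment above encodes a round-down: the widened loop runs N - (N urem VF*UF) iterations, and the scalar epilogue covers the remainder. A hedged sketch of that computation; the helper name is hypothetical.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

static llvm::Value *roundDownToVFUF(llvm::IRBuilder<> &B, llvm::Value *N,
                                    unsigned VF, unsigned UF) {
  llvm::Value *Step = llvm::ConstantInt::get(N->getType(), VF * UF);
  llvm::Value *Rem = B.CreateURem(N, Step, "n.mod.vf");
  return B.CreateSub(N, Rem, "n.vec"); // Trip count of the vector loop.
}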
-static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) { +static void propagateMetadata(SmallVectorImpl<Value *> &To, + const Instruction *From) { for (Value *V : To) if (Instruction *I = dyn_cast<Instruction>(V)) propagateMetadata(I, From); @@ -699,8 +784,9 @@ private: /// between the member and the group in a map. class InterleavedAccessInfo { public: - InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) - : SE(SE), TheLoop(L), DT(DT) {} + InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, + DominatorTree *DT) + : PSE(PSE), TheLoop(L), DT(DT) {} ~InterleavedAccessInfo() { SmallSet<InterleaveGroup *, 4> DelSet; @@ -730,7 +816,11 @@ public: } private: - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. + /// Simplifies SCEV expressions in the context of existing SCEV assumptions. + /// The interleaved access analysis can also add new predicates (for example + /// by versioning strides of pointers). + PredicatedScalarEvolution &PSE; Loop *TheLoop; DominatorTree *DT; @@ -778,6 +868,304 @@ private: const ValueToValueMap &Strides); }; +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +/// We cannot write all values to metadata, as the mere presence of some info, +/// for example 'force', means a decision has been made. So, we need to be +/// careful NOT to add them if the user hasn't specifically asked so. +class LoopVectorizeHints { + enum HintKind { + HK_WIDTH, + HK_UNROLL, + HK_FORCE + }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char * Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char * Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) { } + + bool validate(unsigned Val) { + switch (Kind) { + case HK_WIDTH: + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; + case HK_UNROLL: + return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; + case HK_FORCE: + return (Val <= 1); + } + return false; + } + }; + + /// Vectorization width. + Hint Width; + /// Vectorization interleave factor. + Hint Interleave; + /// Vectorization forced + Hint Force; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "llvm.loop."; } + +public: + enum ForceKind { + FK_Undefined = -1, ///< Not selected. + FK_Disabled = 0, ///< Forcing disabled. + FK_Enabled = 1, ///< Forcing enabled. + }; + + LoopVectorizeHints(const Loop *L, bool DisableInterleaving) + : Width("vectorize.width", VectorizerParams::VectorizationFactor, + HK_WIDTH), + Interleave("interleave.count", DisableInterleaving, HK_UNROLL), + Force("vectorize.enable", FK_Undefined, HK_FORCE), + TheLoop(L) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + + // force-vector-interleave overrides DisableInterleaving. + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; + + DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() + << "LV: Interleaving disabled by the pass manager\n"); + } + + /// Mark the loop L as already vectorized by setting the width to 1. 
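For orientation, the hints managed by this class live in loop metadata of a fixed shape. A hedged sketch of building a width hint by hand, mirroring the writeHintsToMetadata() logic further below; the function name is hypothetical.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"

// Encodes the equivalent of "#pragma clang loop vectorize_width(W)":
// a self-referential loop ID whose payload is
// !{!"llvm.loop.vectorize.width", i32 W}.
static llvm::MDNode *makeWidthHint(llvm::LLVMContext &Ctx, unsigned W) {
  llvm::Metadata *HintOps[] = {
      llvm::MDString::get(Ctx, "llvm.loop.vectorize.width"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), W))};
  llvm::SmallVector<llvm::Metadata *, 2> Ops(1); // Slot 0: loop id itself.
  Ops.push_back(llvm::MDNode::get(Ctx, HintOps));
  llvm::MDNode *LoopID = llvm::MDNode::get(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID);
  return LoopID;
}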
+  void setAlreadyVectorized() {
+    Width.Value = Interleave.Value = 1;
+    Hint Hints[] = {Width, Interleave};
+    writeHintsToMetadata(Hints);
+  }
+
+  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
+    if (getForce() == LoopVectorizeHints::FK_Disabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(),
+                                     vectorizeAnalysisPassName(), *F,
+                                     L->getStartLoc(), emitRemark());
+      return false;
+    }
+
+    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
+      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(),
+                                     vectorizeAnalysisPassName(), *F,
+                                     L->getStartLoc(), emitRemark());
+      return false;
+    }
+
+    if (getWidth() == 1 && getInterleave() == 1) {
+      // FIXME: Add a separate metadata to indicate when the loop has already
+      // been vectorized instead of setting width and count to 1.
+      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+      // FIXME: Add interleave.disable metadata. This will allow
+      // vectorize.disable to be used without disabling the pass and errors
+      // to differentiate between disabled vectorization and a width of 1.
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
+          "loop not vectorized: vectorization and interleaving are explicitly "
+          "disabled, or vectorize width and interleave count are both set to "
+          "1");
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Dumps all the hint information.
+  std::string emitRemark() const {
+    VectorizationReport R;
+    if (Force.Value == LoopVectorizeHints::FK_Disabled)
+      R << "vectorization is explicitly disabled";
+    else {
+      R << "use -Rpass-analysis=loop-vectorize for more info";
+      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+        R << " (Force=true";
+        if (Width.Value != 0)
+          R << ", Vector Width=" << Width.Value;
+        if (Interleave.Value != 0)
+          R << ", Interleave Count=" << Interleave.Value;
+        R << ")";
+      }
+    }
+
+    return R.str();
+  }
+
+  unsigned getWidth() const { return Width.Value; }
+  unsigned getInterleave() const { return Interleave.Value; }
+  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
+  const char *vectorizeAnalysisPassName() const {
+    // If hints are provided that don't disable vectorization, use the
+    // AlwaysPrint pass name to force the frontend to print the diagnostic.
+    if (getWidth() == 1)
+      return LV_NAME;
+    if (getForce() == LoopVectorizeHints::FK_Disabled)
+      return LV_NAME;
+    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+      return LV_NAME;
+    return DiagnosticInfo::AlwaysPrint;
+  }
+
+  bool allowReordering() const {
+    // When hints that enable vectorization are provided, we allow the
+    // vectorizer to change the order of operations given by the scalar loop.
+    // This is not enabled by default because it can be unsafe or inefficient.
+    // For example, reordering floating-point operations will change the way
+    // round-off error accumulates in the loop.
+    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+  }
+
+private:
+  /// Find hints specified in the loop metadata and update local values.
+  void getHintsFromMetadata() {
+    MDNode *LoopID = TheLoop->getLoopID();
+    if (!LoopID)
+      return;
+
+    // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector<Metadata *, 4> Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast<MDString>(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast<MDString>(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } + } + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); + if (!C) return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Width, &Interleave, &Force}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); + break; + } + } + } + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); + } + + /// Matches metadata with hint name. + bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { + MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; + } + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { + if (HintTypes.size() == 0) + return; + + // Reserve the first element to LoopID (see below). + SmallVector<Metadata *, 4> MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); + } + + /// The loop these hints belong to. 
+ const Loop *TheLoop; +}; + +static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, + const LoopVectorizeHints &Hints, + const LoopAccessReport &Message) { + const char *Name = Hints.vectorizeAnalysisPassName(); + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); +} + +static void emitMissedWarning(Function *F, Loop *L, + const LoopVectorizeHints &LH) { + emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(), + LH.emitRemark()); + + if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { + if (LH.getWidth() != 1) + emitLoopVectorizeWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop vectorization"); + else if (LH.getInterleave() != 1) + emitLoopInterleaveWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop interleaving"); + } +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -793,87 +1181,17 @@ private: /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, - Function *F, const TargetTransformInfo *TTI, - LoopAccessAnalysis *LAA) - : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), - Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} - - /// This enum represents the kinds of inductions that we support. - enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = C. - IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). - }; - - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) - : StartValue(Start), IK(K), StepValue(Step) { - assert(IK != IK_NoInduction && "Not an induction"); - assert(StartValue && "StartValue is null"); - assert(StepValue && !StepValue->isZero() && "StepValue is zero"); - assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && - "StartValue is not a pointer for pointer induction"); - assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && - "StartValue is not an integer for integer induction"); - assert(StepValue->getType()->isIntegerTy() && - "StepValue is not an integer"); - } - InductionInfo() - : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} - - /// Get the consecutive direction. Returns: - /// 0 - unknown or non-consecutive. - /// 1 - consecutive and increasing. - /// -1 - consecutive and decreasing. - int getConsecutiveDirection() const { - if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) - return StepValue->getSExtValue(); - return 0; - } - - /// Compute the transformed value of Index at offset StartValue using step - /// StepValue. - /// For integer induction, returns StartValue + Index * StepValue. - /// For pointer induction, returns StartValue[Index * StepValue]. - /// FIXME: The newly created binary instructions should contain nsw/nuw - /// flags, which can be found from the original scalar operations. 
- Value *transform(IRBuilder<> &B, Value *Index) const { - switch (IK) { - case IK_IntInduction: - assert(Index->getType() == StartValue->getType() && - "Index type does not match StartValue type"); - if (StepValue->isMinusOne()) - return B.CreateSub(StartValue, Index); - if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateAdd(StartValue, Index); - - case IK_PtrInduction: - assert(Index->getType() == StepValue->getType() && - "Index type does not match StepValue type"); - if (StepValue->isMinusOne()) - Index = B.CreateNeg(Index); - else if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateGEP(nullptr, StartValue, Index); - - case IK_NoInduction: - return nullptr; - } - llvm_unreachable("invalid enum"); - } - - /// Start value. - TrackingVH<Value> StartValue; - /// Induction kind. - InductionKind IK; - /// Step value. - ConstantInt *StepValue; - }; + LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, + DominatorTree *DT, TargetLibraryInfo *TLI, + AliasAnalysis *AA, Function *F, + const TargetTransformInfo *TTI, + LoopAccessAnalysis *LAA, + LoopVectorizationRequirements *R, + const LoopVectorizeHints *H) + : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -881,7 +1199,7 @@ public: /// InductionList saves induction variables and maps them to the /// induction descriptor. - typedef MapVector<PHINode*, InductionInfo> InductionList; + typedef MapVector<PHINode*, InductionDescriptor> InductionList; /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this @@ -903,6 +1221,9 @@ public: /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); + /// Returns True if PN is a reduction variable in this loop. + bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -954,12 +1275,12 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -999,10 +1320,6 @@ private: /// and we know that we can read from them without segfault. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); - /// Returns the induction kind of Phi and record the step. This function may - /// return NoInduction if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); - /// \brief Collect memory access with loop invariant strides. 
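The arithmetic in the removed transform() above is worth keeping in view, since InductionDescriptor now carries it: the induction's value at iteration Index is Start + Index * Step, with Step of +/-1 folded to a plain add or sub. A hedged re-statement of the integer case; the helper name is hypothetical, and this is not the InductionDescriptor replacement itself.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

static llvm::Value *intInductionAt(llvm::IRBuilder<> &B, llvm::Value *Start,
                                   llvm::Value *Index,
                                   llvm::ConstantInt *Step) {
  if (Step->isMinusOne())
    return B.CreateSub(Start, Index); // Start - Index, for Step == -1.
  if (!Step->isOne())
    Index = B.CreateMul(Index, Step); // General case: Index * Step.
  return B.CreateAdd(Start, Index);   // Start + Index * Step.
}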
/// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop @@ -1013,16 +1330,20 @@ private: /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } unsigned NumPredStores; /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. + /// Applies dynamic knowledge to simplify SCEV expressions in the context + /// of existing SCEV assumptions. The analysis will also add a minimal set + /// of new predicates if this is required to enable vectorization and + /// unrolling. + PredicatedScalarEvolution &PSE; /// Target Library Info. TargetLibraryInfo *TLI; /// Parent function @@ -1065,12 +1386,18 @@ private: /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; + /// Vectorization requirements that will go through late-evaluation. + LoopVectorizationRequirements *Requirements; + + /// Used to emit an analysis of any legality issues. + const LoopVectorizeHints *Hints; + ValueToValueMap Strides; SmallPtrSet<Value *, 8> StrideSet; /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic - SmallPtrSet<const Instruction*, 8> MaskedOp; + SmallPtrSet<const Instruction *, 8> MaskedOp; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1082,15 +1409,14 @@ private: /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, - LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, + LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC, - const Function *F, const LoopVectorizeHints *Hints) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), - TheFunction(F), Hints(Hints) { - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - } + const TargetLibraryInfo *TLI, DemandedBits *DB, + AssumptionCache *AC, const Function *F, + const LoopVectorizeHints *Hints) + : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + AC(AC), TheFunction(F), Hints(Hints) {} /// Information about vectorization costs struct VectorizationFactor { @@ -1103,10 +1429,10 @@ public: /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the smallest and widest types in the code + /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - unsigned getWidestType(); + std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1133,8 +1459,13 @@ public: unsigned NumInstructions; }; - /// \return information about the register usage of the loop. 
- RegisterUsage calculateRegisterUsage(); + /// \return Returns information about the register usages of the loop for the + /// given vectorization factors. + SmallVector<RegisterUsage, 8> + calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs); + + /// Collect values we want to ignore in the cost model. + void collectValuesToIgnore(); private: /// Returns the expected execution cost. The unit of the cost does @@ -1155,17 +1486,20 @@ private: /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } - /// Values used only by @llvm.assume calls. - SmallPtrSet<const Value *, 32> EphValues; +public: + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + MapVector<Instruction*,uint64_t> MinBWs; /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// Predicated scalar evolution analysis. + PredicatedScalarEvolution &PSE; /// Loop Info analysis. LoopInfo *LI; /// Vectorization legality. @@ -1174,247 +1508,78 @@ private: const TargetTransformInfo &TTI; /// Target Library Info. const TargetLibraryInfo *TLI; + /// Demanded bits analysis. + DemandedBits *DB; + /// Assumption cache. + AssumptionCache *AC; const Function *TheFunction; - // Loop Vectorize Hint. + /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; + /// Values to ignore in the cost model. + SmallPtrSet<const Value *, 16> ValuesToIgnore; + /// Values to ignore in the cost model when VF > 1. + SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { - enum HintKind { - HK_WIDTH, - HK_UNROLL, - HK_FORCE - }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char * Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char * Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) { } - - bool validate(unsigned Val) { - switch (Kind) { - case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; - case HK_FORCE: - return (Val <= 1); - } - return false; - } - }; - - /// Vectorization width. - Hint Width; - /// Vectorization interleave factor. - Hint Interleave; - /// Vectorization forced - Hint Force; - - /// Return the loop metadata prefix. 
- static StringRef Prefix() { return "llvm.loop."; } - +/// \brief This holds vectorization requirements that must be verified late in +/// the process. The requirements are set by legalize and costmodel. Once +/// vectorization has been determined to be possible and profitable the +/// requirements can be verified by looking for metadata or compiler options. +/// For example, some loops require FP commutativity which is only allowed if +/// vectorization is explicitly specified or if the fast-math compiler option +/// has been provided. +/// Late evaluation of these requirements allows helpful diagnostics to be +/// composed that tells the user what need to be done to vectorize the loop. For +/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late +/// evaluation should be used only when diagnostics can generated that can be +/// followed by a non-expert user. +class LoopVectorizationRequirements { public: - enum ForceKind { - FK_Undefined = -1, ///< Not selected. - FK_Disabled = 0, ///< Forcing disabled. - FK_Enabled = 1, ///< Forcing enabled. - }; - - LoopVectorizeHints(const Loop *L, bool DisableInterleaving) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, - HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), - Force("vectorize.enable", FK_Undefined, HK_FORCE), - TheLoop(L) { - // Populate values with existing loop metadata. - getHintsFromMetadata(); - - // force-vector-interleave overrides DisableInterleaving. - if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; - - DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() - << "LV: Interleaving disabled by the pass manager\n"); - } - - /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized() { - Width.Value = Interleave.Value = 1; - Hint Hints[] = {Width, Interleave}; - writeHintsToMetadata(Hints); - } - - /// Dumps all the hint information. - std::string emitRemark() const { - VectorizationReport R; - if (Force.Value == LoopVectorizeHints::FK_Disabled) - R << "vectorization is explicitly disabled"; - else { - R << "use -Rpass-analysis=loop-vectorize for more info"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=true"; - if (Width.Value != 0) - R << ", Vector Width=" << Width.Value; - if (Interleave.Value != 0) - R << ", Interleave Count=" << Interleave.Value; - R << ")"; - } - } - - return R.str(); - } - - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector<Metadata *, 4> Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. 
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast<MDString>(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast<MDString>(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); + LoopVectorizationRequirements() + : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} + + void addUnsafeAlgebraInst(Instruction *I) { + // First unsafe algebra instruction. + if (!UnsafeAlgebraInst) + UnsafeAlgebraInst = I; + } + + void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } + + bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { + const char *Name = Hints.vectorizeAnalysisPassName(); + bool Failed = false; + if (UnsafeAlgebraInst && !Hints.allowReordering()) { + emitOptimizationRemarkAnalysisFPCommute( + F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), + VectorizationReport() << "cannot prove it is safe to reorder " + "floating-point operations"); + Failed = true; } - } - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); - if (!C) return; - unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); - break; - } + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + emitOptimizationRemarkAnalysisAliasing( + F->getContext(), Name, *F, L->getStartLoc(), + VectorizationReport() + << "cannot prove it is safe to reorder memory operations"); + DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Failed = true; } - } - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); + return Failed; } - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { - MDString* Name = dyn_cast<MDString>(Node->getOperand(0)); - if (!Name) - return false; - - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; - } - - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { - if (HintTypes.size() == 0) - return; - - // Reserve the first element to LoopID (see below). - SmallVector<Metadata *, 4> MDs(1); - // If the loop already has metadata, then ignore the existing operands. 
- MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } - - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - TheLoop->setLoopID(NewLoopID); - } - - /// The loop these hints belong to. - const Loop *TheLoop; +private: + unsigned NumRuntimePointerChecks; + Instruction *UnsafeAlgebraInst; }; -static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH) { - emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), LH.emitRemark()); - - if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { - if (LH.getWidth() != 1) - emitLoopVectorizeWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop vectorization"); - else if (LH.getInterleave() != 1) - emitLoopInterleaveWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop interleaving"); - } -} - static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { if (L.empty()) return V.push_back(&L); @@ -1441,6 +1606,7 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + DemandedBits *DB; AliasAnalysis *AA; AssumptionCache *AC; LoopAccessAnalysis *LAA; @@ -1450,16 +1616,17 @@ struct LoopVectorize : public FunctionPass { BlockFrequency ColdEntryFreq; bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - BFI = &getAnalysis<BlockFrequencyInfo>(); + BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); LAA = &getAnalysis<LoopAccessAnalysis>(); + DB = &getAnalysis<DemandedBits>(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1562,26 +1729,8 @@ struct LoopVectorize : public FunctionPass { // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. 
- if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { - DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { - DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { - DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized: vector width and interleave count are " - "explicitly set to 1"); + if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { + DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -1595,15 +1744,19 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "vectorization is not beneficial and is not explicitly forced"); + emitAnalysisDiag(F, L, Hints, VectorizationReport() + << "vectorization is not beneficial " + "and is not explicitly forced"); return false; } } + PredicatedScalarEvolution PSE(*SE); + // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA); + LoopVectorizationRequirements Requirements; + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, + &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); @@ -1611,16 +1764,18 @@ struct LoopVectorize : public FunctionPass { } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F, + &Hints); + CM.collectValuesToIgnore(); // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && - F->hasFnAttribute(Attribute::OptimizeForSize); + F->optForSize(); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we - // always have a canonical loop here because we think we *can* vectoriez. + // always have a canonical loop here because we think we *can* vectorize. // FIXME: This is hidden behind a flag due to pervasive problems with // exactly what block frequency models. if (LoopVectorizeWithBlockFrequency) { @@ -1630,16 +1785,17 @@ struct LoopVectorize : public FunctionPass { OptForSize = true; } - // Check the function attributes to see if implicit floats are allowed.a + // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer // vector instructions? 
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized due to NoImplicitFloat attribute"); + emitAnalysisDiag( + F, L, Hints, + VectorizationReport() + << "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints); return false; } @@ -1651,32 +1807,86 @@ struct LoopVectorize : public FunctionPass { // Select the interleave count. unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); - DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + // Get user interleave count. + unsigned UserIC = Hints.getInterleave(); + + // Identify the diagnostic messages that should be produced. + std::string VecDiagMsg, IntDiagMsg; + bool VectorizeLoop = true, InterleaveLoop = true; + + if (Requirements.doesNotMeet(F, L, Hints)) { + DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); + emitMissedWarning(F, L, Hints); + return false; + } if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + VecDiagMsg = + "the cost-model indicates that vectorization is not beneficial"; + VectorizeLoop = false; + } - if (IC == 1) { - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "not beneficial to vectorize and user disabled interleaving"); - return false; - } - DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); + if (IC == 1 && UserIC <= 1) { + // Tell the user interleaving is not beneficial. + DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); + IntDiagMsg = + "the cost-model indicates that interleaving is not beneficial"; + InterleaveLoop = false; + if (UserIC == 1) + IntDiagMsg += + " and is explicitly disabled or interleave count is set to 1"; + } else if (IC > 1 && UserIC == 1) { + // Tell the user interleaving is beneficial, but it explicitly disabled. + DEBUG(dbgs() + << "LV: Interleaving is beneficial but is explicitly disabled."); + IntDiagMsg = "the cost-model indicates that interleaving is beneficial " + "but is explicitly disabled or interleave count is set to 1"; + InterleaveLoop = false; + } - // Report the unrolling decision. - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - Twine("interleaved by " + Twine(IC) + - " (vectorization not beneficial)")); + // Override IC if user provided an interleave count. + IC = UserIC > 0 ? UserIC : IC; + + // Emit diagnostic messages, if any. + const char *VAPassName = Hints.vectorizeAnalysisPassName(); + if (!VectorizeLoop && !InterleaveLoop) { + // Do not vectorize or interleaving the loop. 
+ emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + return false; + } else if (!VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + } else if (VectorizeLoop && !InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + } else if (VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + } + + if (!VectorizeLoop) { + assert(IC > 1 && "interleave count should not be 1 or 0"); + // If we decided that it is not legal to vectorize the loop then + // interleave it. + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC); + Unroller.vectorize(&LVL, CM.MinBWs); - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); - Unroller.vectorize(&LVL); + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), + Twine("interleaved loop (interleaved count: ") + + Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); - LB.vectorize(&LVL); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC); + LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; // Add metadata to disable runtime unrolling scalar loop when there's no @@ -1686,7 +1896,7 @@ struct LoopVectorize : public FunctionPass { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), Twine("vectorized loop (vectorization width: ") + Twine(VF.Width) + ", interleaved count: " + Twine(IC) + ")"); @@ -1703,16 +1913,19 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<AssumptionCacheTracker>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<BlockFrequencyInfo>(); + AU.addRequired<BlockFrequencyInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<DemandedBits>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; @@ -1773,6 +1986,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); + auto *SE = PSE.getSE(); // Make sure that the pointer does not point to structs. 
if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; @@ -1780,11 +1994,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); if (Phi && Inductions.count(Phi)) { - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } - GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (!Gep) return 0; @@ -1802,10 +2016,10 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } @@ -1815,14 +2029,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // operand. for (unsigned i = 0; i != NumOperands; ++i) if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; // We can emit wide load/stores only if the last non-zero index is the // induction variable. const SCEV *Last = nullptr; if (!Strides.count(Gep)) - Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); else { // Because of the multiplication by a stride we can have a s/zext cast. // We are going to replace this stride by 1 so the cast is safe to ignore. @@ -1833,7 +2047,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // - Last = replaceSymbolicStrideSCEV(SE, Strides, + Last = replaceSymbolicStrideSCEV(PSE, Strides, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) Last = @@ -2177,7 +2391,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { VectorParts &Entry = WidenMap.get(Instr); // Handle consecutive loads/stores. - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); @@ -2191,8 +2405,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.Insert(Gep2); } else if (Gep) { setDebugLocFromInst(Builder, Gep); - assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), - OrigLoop) && "Base ptr must be invariant"); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. 
For example A[I+1]; @@ -2209,7 +2424,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { assert((i == InductionOperand || - SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); @@ -2237,14 +2453,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { - // If we store to reverse consecutive memory locations then we need + // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal[Part] = reverseVector(StoredVal[Part]); // If the address is consecutive but reversed, then the @@ -2298,7 +2514,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<VectorParts, 4> Params; @@ -2318,7 +2535,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Try using previously calculated values. Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); - // If the src is an instruction that appeared earlier in the basic block + // If the src is an instruction that appeared earlier in the basic block, // then it should already be vectorized. if (SrcInst && OrigLoop->contains(SrcInst)) { assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); @@ -2343,19 +2560,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -2367,12 +2577,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. 
- Builder.SetInsertPoint(InsertPt); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, + ConstantInt::get(Cmp->getType(), 1)); } Instruction *Cloned = Instr->clone(); @@ -2396,85 +2602,223 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, Builder.getInt32(Width)); // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), + Cmp)); } } } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, - Instruction *Loc) { - if (FirstInst) - return FirstInst; - if (Instruction *I = dyn_cast<Instruction>(V)) - return I->getParent() == Loc->getParent() ? I : nullptr; - return nullptr; +PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, + Value *End, Value *Step, + Instruction *DL) { + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + // As we're just creating this loop, it's possible no latch exists + // yet. If so, use the header as this will be a single block loop. + if (!Latch) + Latch = Header; + + IRBuilder<> Builder(&*Header->getFirstInsertionPt()); + setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); + auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); + + Builder.SetInsertPoint(Latch->getTerminator()); + + // Create i+1 and fill the PHINode. + Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Start, L->getLoopPreheader()); + Induction->addIncoming(Next, Latch); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(Next, End); + Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); + + // Now we have two terminators. Remove the old one from the block. + Latch->getTerminator()->eraseFromParent(); + + return Induction; } -std::pair<Instruction *, Instruction *> -InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = nullptr; - if (!Legal->mustCheckStrides()) - return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr); - - IRBuilder<> ChkBuilder(Loc); - - // Emit checks. - Value *Check = nullptr; - Instruction *FirstInst = nullptr; - for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(), - SE = Legal->strides_end(); - SI != SE; ++SI) { - Value *Ptr = stripIntegerCast(*SI); - Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), - "stride.chk"); - // Store the first instruction we create. - FirstInst = getFirstInst(FirstInst, C, Loc); - if (Check) - Check = ChkBuilder.CreateOr(Check, C); - else - Check = C; - } +Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { + if (TripCount) + return TripCount; - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. 
- LLVMContext &Ctx = Loc->getContext(); - Instruction *TheCheck = - BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(TheCheck, "stride.not.one"); - FirstInst = getFirstInst(FirstInst, TheCheck, Loc); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + // Find the loop boundaries. + ScalarEvolution *SE = PSE.getSE(); + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); + assert(BackedgeTakenCount != SE->getCouldNotCompute() && + "Invalid loop count"); - return std::make_pair(FirstInst, TheCheck); + Type *IdxTy = Legal->getWidestInductionType(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); + BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); + + // Get the total trip count from the count by adding 1. + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, DL, "induction"); + + // Count holds the overall loop count (N). + TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + L->getLoopPreheader()->getTerminator()); + + if (TripCount->getType()->isPointerTy()) + TripCount = + CastInst::CreatePointerCast(TripCount, IdxTy, + "exitcount.ptrcnt.to.int", + L->getLoopPreheader()->getTerminator()); + + return TripCount; } +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { + if (VectorTripCount) + return VectorTripCount; + + Value *TC = getOrCreateTripCount(L); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(TC->getType(), VF * UF); + Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); + + return VectorTripCount; +} + +void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, + BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. 
+ Value *CheckMinIters = + Builder.CreateICmpULT(Count, + ConstantInt::get(Count->getType(), VF * UF), + "min.iters.check"); + + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "min.iters.checked"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, CheckMinIters)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, + BasicBlock *Bypass) { + Value *TC = getOrCreateVectorTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. + Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), + "cmp.zero"); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, Cmp)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code to check that the SCEV assumptions that we made. + // We want the new basic block to start at the first instruction in a + // sequence of instructions that form a check. + SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), + "scev.check"); + Value *SCEVCheck = + Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + + if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) + if (C->isZero()) + return; + + // Create a new block containing the stride check. + BB->setName("vector.scevcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, SCEVCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + +void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + if (!MemRuntimeCheck) + return; + + // Create a new block containing the memory check. + BB->setName("vector.memcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + + void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- Back-edge taken count overflow check. + [ ] <-- loop iteration number check. / | / v | [ ] <-- vector loop bypass (may consist of multiple blocks). 
| / | | / v || [ ] <-- vector pre header. - || | - || v - || [ ] \ - || [ ]_| <-- vector loop. - || | - | \ v - | >[ ] <--- middle-block. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. | / | | / v -|- >[ ] <--- new preheader. @@ -2498,65 +2842,16 @@ void InnerLoopVectorizer::createEmptyLoop() { // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. OldInduction = Legal->getInduction(); Type *IdxTy = Legal->getWidestInductionType(); - // Find the loop boundaries. - const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (ExitCount->getType()->getPrimitiveSizeInBits() > - IdxTy->getPrimitiveSizeInBits()) - ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - - const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(BackedgeTakeCount, - SE->getConstant(BackedgeTakeCount->getType(), 1)); - - const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); - - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - Value *BackedgeCount = - Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), - VectorPH->getTerminator()); - if (BackedgeCount->getType()->isPointerTy()) - BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, - "backedge.ptrcnt.to.int", - VectorPH->getTerminator()); - Instruction *CheckBCOverflow = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, - Constant::getAllOnesValue(BackedgeCount->getType()), - "backedge.overflow", VectorPH->getTerminator()); - - // The loop index does not have to start at Zero. Find the original start - // value from the induction PHI node. If we don't have an induction variable - // then we know that it starts at zero. - Builder.SetInsertPoint(VectorPH->getTerminator()); - Value *StartIdx = ExtendedIdx = - OldInduction - ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), - IdxTy) - : ConstantInt::get(IdxTy, 0); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - - LoopBypassBlocks.push_back(VectorPH); - // Split the single block loop into the two loop structure described above. 
BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); @@ -2580,118 +2875,36 @@ void InnerLoopVectorizer::createEmptyLoop() { } Lp->addBasicBlockToLoop(VecBody, *LI); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - - // Generate the induction variable. - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); - Induction = Builder.CreatePHI(IdxTy, 2, "index"); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - - // Generate code to check that the loop's trip count that we computed by - // adding one to the backedge-taken count will not overflow. - BasicBlock *NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow)); - VectorPH = NewVectorPH; - - // This is the IR builder that we use to add all of the logic for bypassing - // the new vector loop. - IRBuilder<> BypassBuilder(VectorPH->getTerminator()); - setDebugLocFromInst(BypassBuilder, - getDebugLocFromInstOrOperands(OldInduction)); - - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - if (Count->getType() != IdxTy) { - // The exit count can be of pointer type. Convert it to the correct - // integer type. - if (ExitCount->getType()->isPointerTy()) - Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); - else - Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); - } - - // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); + // Find the loop boundaries. + Value *Count = getOrCreateTripCount(Lp); - // Now we need to generate the expression for N - (N % VF), which is - // the part that the vectorized body will execute. - Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); - Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); - Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down"); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = - BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); - VectorPH = NewVectorPH; - - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. 
- Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(VectorPH->getTerminator()); - if (StrideCheck) { - AddedSafetyChecks = true; - // Create a new block containing the stride check. - VectorPH->setName("vector.stridecheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - - VectorPH = NewVectorPH; - } + emitVectorLoopEnteredCheck(Lp, ScalarPH); + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, ScalarPH); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeCheck(VectorPH->getTerminator()); - if (MemRuntimeCheck) { - AddedSafetyChecks = true; - // Create a new block containing the memory check. - VectorPH->setName("vector.memcheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - - VectorPH = NewVectorPH; - } + emitMemRuntimeChecks(Lp, ScalarPH); + + // Generate the induction variable. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF * UF); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -2701,152 +2914,60 @@ void InnerLoopVectorizer::createEmptyLoop() { // If we come from a bypass edge then we need to start from the original // start value. - // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = nullptr; + // This variable saves the new starting index for the scalar loop. It is used + // to test if there are any tail iterations left once the vector loop has + // completed. LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - // Set builder to point to last bypass block. - BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; - LoopVectorizationLegality::InductionInfo II = I->second; - - Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); - PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", - MiddleBlock->getTerminator()); - // We might have extended the type of the induction variable but we need a - // truncated version for the scalar loop. 
- PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? - PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : nullptr; + InductionDescriptor II = I->second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", + PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, + "bc.resume.val", ScalarPH->getTerminator()); - BCResumeVal->addIncoming(ResumeVal, MiddleBlock); - - PHINode *BCTruncResumeVal = nullptr; + Value *EndValue; if (OrigPhi == OldInduction) { - BCTruncResumeVal = - PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", - ScalarPH->getTerminator()); - BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); - } - - Value *EndValue = nullptr; - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter. - assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - - // We have the canonical induction variable. - if (OrigPhi == OldInduction) { - // Create a truncated version of the resume value for the scalar loop, - // we might have promoted the type to a larger width. - EndValue = - BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - TruncResumeVal->addIncoming(EndValue, VecBody); - - BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; - break; - } - - // Not the canonical induction variable - add the vector loop count to the - // start value. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); + // We know what the end value is. + EndValue = CountRoundDown; + } else { + IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); + Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, + II.getStepValue()->getType(), + "cast.crd"); + EndValue = II.transform(B, CRD); EndValue->setName("ind.end"); - break; } - case LoopVectorizationLegality::IK_PtrInduction: { - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StepValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); - EndValue->setName("ptr.ind.end"); - break; - } - }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { - if (OrigPhi == OldInduction) - ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); - else - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - } - ResumeVal->addIncoming(EndValue, VecBody); + BCResumeVal->addIncoming(EndValue, MiddleBlock); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); // The old induction's phi node in the scalar body needs the truncated // value. 
- if (OrigPhi == OldInduction) { - BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); - } else { - BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); - } + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); } - // If we are generating a new induction variable then we also need to - // generate the code that calculates the exit value. This value is not - // simply the end of the counter because we may skip the vectorized body - // in case of a runtime check. - if (!OldInduction){ - assert(!ResumeIndex && "Unexpected resume value found"); - ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", - MiddleBlock->getTerminator()); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); - } - - // Make sure that we found the index where scalar loop needs to continue. - assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && - "Invalid resume Index"); - // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - ResumeIndex, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); ReplaceInstWithInst(MiddleBlock->getTerminator(), BranchInst::Create(ExitBlock, ScalarPH, CmpN)); - // Create i+1 and fill the PHINode. - Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(StartIdx, VectorPH); - Induction->addIncoming(NextIdx, VecBody); - // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); - - // Now we have two terminators. Remove the old one from the block. - VecBody->getTerminator()->eraseFromParent(); - // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. - LoopVectorPreHeader = VectorPH; + LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; @@ -2899,7 +3020,7 @@ static void cse(SmallVector<BasicBlock *, 4> &BBs) { for (unsigned i = 0, e = BBs.size(); i != e; ++i) { BasicBlock *BB = BBs[i]; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = I++; + Instruction *In = &*I++; if (!CSEDenseMapInfo::canHandle(In)) continue; @@ -3021,6 +3142,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } +static Type *smallestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); + IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); + return I1->getBitWidth() < I2->getBitWidth() ? 
T1 : T2; +} +static Type *largestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); + IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); + return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; +} + +void InnerLoopVectorizer::truncateToMinimalBitwidths() { + // For every instruction `I` in MinBWs, truncate the operands, create a + // truncated version of `I` and reextend its result. InstCombine runs + // later and will remove any ext/trunc pairs. + // + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + if (I->use_empty()) + continue; + Type *OriginalTy = I->getType(); + Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), + KV.second); + Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, + OriginalTy->getVectorNumElements()); + if (TruncatedTy == OriginalTy) + continue; + + IRBuilder<> B(cast<Instruction>(I)); + auto ShrinkOperand = [&](Value *V) -> Value* { + if (auto *ZI = dyn_cast<ZExtInst>(V)) + if (ZI->getSrcTy() == TruncatedTy) + return ZI->getOperand(0); + return B.CreateZExtOrTrunc(V, TruncatedTy); + }; + + // The actual instruction modification depends on the instruction type, + // unfortunately. + Value *NewI = nullptr; + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + NewI = B.CreateBinOp(BO->getOpcode(), + ShrinkOperand(BO->getOperand(0)), + ShrinkOperand(BO->getOperand(1))); + cast<BinaryOperator>(NewI)->copyIRFlags(I); + } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) { + NewI = B.CreateICmp(CI->getPredicate(), + ShrinkOperand(CI->getOperand(0)), + ShrinkOperand(CI->getOperand(1))); + } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + NewI = B.CreateSelect(SI->getCondition(), + ShrinkOperand(SI->getTrueValue()), + ShrinkOperand(SI->getFalseValue())); + } else if (CastInst *CI = dyn_cast<CastInst>(I)) { + switch (CI->getOpcode()) { + default: llvm_unreachable("Unhandled cast!"); + case Instruction::Trunc: + NewI = ShrinkOperand(CI->getOperand(0)); + break; + case Instruction::SExt: + NewI = B.CreateSExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + case Instruction::ZExt: + NewI = B.CreateZExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + } + } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { + auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto *O0 = + B.CreateZExtOrTrunc(SI->getOperand(0), + VectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + auto *O1 = + B.CreateZExtOrTrunc(SI->getOperand(1), + VectorType::get(ScalarTruncatedTy, Elements1)); + + NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + } else if (isa<LoadInst>(I)) { + // Don't do anything with the operands, just extend the result. + continue; + } else { + llvm_unreachable("Unhandled instruction type!"); + } + + // Lastly, extend the result. + NewI->takeName(cast<Instruction>(I)); + Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); + I->replaceAllUsesWith(Res); + cast<Instruction>(I)->eraseFromParent(); + I = Res; + } + } + + // We'll have created a bunch of ZExts that are now parentless. Clean up. 
+ for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + ZExtInst *Inst = dyn_cast<ZExtInst>(I); + if (Inst && Inst->use_empty()) { + Value *NewI = Inst->getOperand(0); + Inst->eraseFromParent(); + I = NewI; + } + } + } +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -3051,6 +3283,11 @@ void InnerLoopVectorizer::vectorizeLoop() { be = DFS.endRPO(); bb != be; ++bb) vectorizeBlockInLoop(*bb, &RdxPHIsToFix); + // Insert truncates and extends for any truncated instructions as hints to + // InstCombine. + if (VF > 1) + truncateToMinimalBitwidths(); + // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. The PHI nodes are currently empty because we did @@ -3066,7 +3303,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && + assert(Legal->isReductionVariable(RdxPhi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; @@ -3141,21 +3378,33 @@ void InnerLoopVectorizer::vectorizeLoop() { // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts; + VectorParts RdxParts = getVectorValue(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); - for (unsigned part = 0; part < UF; ++part) { - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(LoopExitInst); - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); - NewPhi->addIncoming(RdxExitVal[part], - LoopVectorBody.back()); - RdxParts.push_back(NewPhi); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); + for (unsigned part = 0; part < UF; ++part) { + Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[part]->user_begin(); + UI != RdxParts[part]->user_end();) + if (*UI != Trunc) { + (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); + RdxParts[part] = Extnd; + } else { + ++UI; + } + } + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) + RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); } // Reduce all of the unrolled parts into a single vector. @@ -3208,13 +3457,22 @@ void InnerLoopVectorizer::vectorizeLoop() { // The result is in the first element of the vector. 
ReducedPartRdx = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (RdxPhi->getType() != RdxDesc.getRecurrenceType()) + ReducedPartRdx = + RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType()) + : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType()); } // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable @@ -3252,6 +3510,20 @@ void InnerLoopVectorizer::vectorizeLoop() { fixLCSSAPHIs(); + // Make sure DomTree is updated. + updateAnalysis(); + + // Predicate any stores. + for (auto KV : PredicatedStores) { + BasicBlock::iterator I(KV.first); + auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT); + I->moveBefore(T); + I->getParent()->setName("pred.store.if"); + BB->setName("pred.store.continue"); + } + DEBUG(DT->verifyDomTree()); // Remove redundant induction instructions. cse(LoopVectorBody); } @@ -3326,18 +3598,18 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - InnerLoopVectorizer::VectorParts &Entry, - unsigned UF, unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction( + Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode* P = cast<PHINode>(PN); // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { + if (Legal->isReductionVariable(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody.back()-> getFirstInsertionPt()); + Entry[part] = PHINode::Create( + VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -3385,53 +3657,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); + InductionDescriptor II = Legal->getInductionVars()->lookup(P); // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: + switch (II.getKind()) { + case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P->getType() == II.StartValue->getType() && "Types must match"); - Type *PhiTy = P->getType(); - Value *Broadcasted; - if (P == OldInduction) { - // Handle the canonical induction variable. We might have had to - // extend the type. 
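Worth pausing on the "Predicate any stores" hunk above: instead of splitting blocks while vectorizing (the code deleted later in InnerLoopUnroller::scalarizeInstruction), scalarized conditional stores are queued in PredicatedStores and wrapped in control flow in one pass at the end, via SplitBlock and SplitBlockAndInsertIfThen. The emitted shape corresponds to the following scalar C++; the function is illustrative, not from the patch, and the comments mirror the block names it assigns:

    // One predicated lane: the store executes only when its mask bit is set.
    void predicatedStoreLane(int *A, const bool *Mask, const int *Val, int i) {
      if (Mask[i]) {      // "pred.store.if"
        A[i] = Val[i];
      }
                          // "pred.store.continue": control rejoins here
    }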
- Broadcasted = Builder.CreateTrunc(Induction, PhiTy); - } else { - // Handle other induction variables that are now based on the - // canonical one. - Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, - "normalized.idx"); - NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = II.transform(Builder, NormalizedIdx); - Broadcasted->setName("offset.idx"); + case InductionDescriptor::IK_IntInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + Value *V = Induction; + if (P != OldInduction) { + V = Builder.CreateSExtOrTrunc(Induction, P->getType()); + V = II.transform(Builder, V); + V->setName("offset.idx"); } - Broadcasted = getBroadcastInstrs(Broadcasted); + Value *Broadcasted = getBroadcastInstrs(V); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); + Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue()); return; } - case LoopVectorizationLegality::IK_PtrInduction: + case InductionDescriptor::IK_PtrInduction: // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = - Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); - NormalizedIdx = - Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType()); + Value *PtrInd = Induction; + PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType()); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { int EltIndex = part; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); Entry[part] = SclrGep; @@ -3441,8 +3704,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { int EltIndex = i + part * VF; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -3458,7 +3721,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); + VectorParts &Entry = WidenMap.get(&*it); + switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3466,7 +3730,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. 
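The IK_IntInduction path above now derives every integer induction from the single canonical IV rather than a separately maintained normalized index: the scalar value is rebuilt as StartValue + Induction * Step ("offset.idx"), broadcast, and made consecutive with per-lane step offsets. A standalone sketch of that arithmetic for VF = 4; the getStepVector semantics (lane L of part P gets offset (VF * P + L) * Step) are assumed from the surrounding code, not quoted from it:

    #include <array>
    #include <cassert>

    std::array<long, 4> widenInduction(long Induction, long Start, long Step,
                                       unsigned Part) {
      long Offset = Start + Induction * Step;   // "offset.idx"
      std::array<long, 4> Lanes;
      for (unsigned L = 0; L < 4; ++L)
        Lanes[L] = Offset + long(4 * Part + L) * Step;  // VF = 4
      return Lanes;
    }

    int main() {
      auto V = widenInduction(/*Induction=*/8, /*Start=*/100, /*Step=*/2,
                              /*Part=*/0);
      assert(V[0] == 116 && V[3] == 122);  // lanes {116, 118, 120, 122}
      return 0;
    }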
- widenPHIInstruction(it, Entry, UF, VF, PV); + widenPHIInstruction(&*it, Entry, UF, VF, PV); continue; }// End of PHI. @@ -3504,16 +3768,17 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Select: { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), - OrigLoop); - setDebugLocFromInst(Builder, it); + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); + setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -3522,7 +3787,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - + Value *ScalarCond = (VF == 1) ? Cond[0] : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); @@ -3533,7 +3798,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op1[Part]); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3542,25 +3807,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast<CmpInst>(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; - if (FCmp) + if (FCmp) { C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - else + cast<FCmpInst>(C)->copyFastMathFlags(&*it); + } else { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + } Entry[Part] = C; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Store: case Instruction::Load: - vectorizeMemoryInstruction(it); + vectorizeMemoryInstruction(&*it); break; case Instruction::ZExt: case Instruction::SExt: @@ -3575,7 +3842,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast<CastInst>(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. 
sext/zext may wrap, @@ -3585,13 +3852,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - LoopVectorizationLegality::InductionInfo II = + InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction); - Constant *Step = - ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); + Constant *Step = ConstantInt::getSigned( + CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } /// Vectorize casts. @@ -3601,7 +3868,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3609,7 +3876,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(it)) break; - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); @@ -3625,7 +3892,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -3636,7 +3903,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } @@ -3677,13 +3944,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = Builder.CreateCall(VectorF, Args); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } default: // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; }// end of switch. }// end of for_each instr. @@ -3691,7 +3958,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. - SE->forgetLoop(OrigLoop); + PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && @@ -3701,19 +3968,12 @@ void InnerLoopVectorizer::updateAnalysis() { DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); - // Due to if predication of stores we might create a sequence of "if(pred) - // a[i] = ...; " blocks. - for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { - if (i == 0) - DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - else if (isPredicatedBlock(i)) { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); - } else { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); - } - } + // We don't predicate stores by this point, so the vector body should be a + // single loop. 
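One detail in the Instruction::Select case above deserves a gloss: a loop-invariant condition lets the vectorizer emit a single select over whole vector operands, while a condition defined inside the loop needs a per-lane vector select. In scalar C++ terms (illustrative types, not the patch's):

    #include <array>
    using Vec4 = std::array<int, 4>;

    // Invariant condition: one scalar decision selects whole vectors.
    Vec4 widenSelect(bool ScalarCond, const Vec4 &A, const Vec4 &B) {
      return ScalarCond ? A : B;
    }

    // Varying condition: each lane decides independently (vector select).
    Vec4 widenSelect(const std::array<bool, 4> &Cond, const Vec4 &A,
                     const Vec4 &B) {
      Vec4 R;
      for (int i = 0; i < 4; ++i)
        R[i] = Cond[i] ? A[i] : B[i];
      return R;
    }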
+ assert(LoopVectorBody.size() == 1 && "Expected single block loop!"); + DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -3850,10 +4110,10 @@ bool LoopVectorizationLegality::canVectorize() { } // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(VectorizationReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3879,10 +4139,28 @@ bool LoopVectorizationLegality::canVectorize() { : "") << "!\n"); + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) + UseInterleaved = EnableInterleavedMemAccesses; + // Analyze interleaved memory accesses. - if (EnableInterleavedMemAccesses) + if (UseInterleaved) InterleaveInfo.analyzeInterleaving(Strides); + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; + if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) + SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; + + if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { + emitAnalysis(VectorizationReport() + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); + DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); + return false; + } + // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. @@ -3929,7 +4207,6 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); // Look for the attribute signaling the absence of NaNs. @@ -3953,7 +4230,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -3965,9 +4242,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) continue; - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value could not be identified as " "an induction or reduction variable"); return false; @@ -3975,19 +4252,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // We only allow if-converted PHIs with exactly two incoming values. 
if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - // This is the value coming from the preheader. - Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); - ConstantInt *StepValue = nullptr; - // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi, StepValue); - - if (IK_NoInduction != IK) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { + Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) WidestIndTy = convertPointerToIntegerType(DL, PhiTy); @@ -3995,21 +4268,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction && StepValue->isOne()) { + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getStepValue()->isOne() && + isa<Constant>(ID.getStartValue()) && + cast<Constant>(ID.getStartValue())->isNullValue()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other - // than it is expedient). + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. if (!Induction || PhiTy == WidestIndTy) Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "use of induction value outside of the " "loop is not handled by vectorizer"); return false; @@ -4018,13 +4294,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, - Reductions[Phi])) { - AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) { + if (RedDes.hasUnsafeAlgebra()) + Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; continue; } - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); @@ -4039,8 +4318,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(it) << - "call instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(&*it) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4049,8 +4328,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // second argument is the same (i.e. 
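The rewritten induction handling above replaces the pass-local isInductionVariable/InductionInfo machinery with the shared InductionDescriptor, and tightens the canonical-IV choice: only an integer induction proven to start at zero and step by one may become Induction. A standalone sketch of what the descriptor encodes (the field and method names here are illustrative, not LLVM's):

    #include <cassert>

    // An integer induction is value(i) = Start + i * Step; the canonical
    // induction variable is the special case Start == 0, Step == 1.
    struct InductionSketch {
      long Start, Step;
      long transform(long i) const { return Start + i * Step; }
      bool isCanonical() const { return Start == 0 && Step == 1; }
    };

    int main() {
      InductionSketch Canonical{0, 1}, Strided{8, 4};
      assert(Canonical.isCanonical() && Canonical.transform(5) == 5);
      assert(!Strided.isCanonical() && Strided.transform(3) == 20);
      return 0;
    }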
loop invariant) if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { - if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(it) + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { + emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -4061,7 +4341,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -4085,8 +4365,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "value cannot be used outside the loop"); return false; } @@ -4104,6 +4384,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (Induction && WidestIndTy != Induction->getType()) + Induction = nullptr; + return true; } @@ -4116,7 +4402,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); if (!Stride) return; @@ -4142,7 +4428,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { BE = TheLoop->block_end(); B != BE; ++B) for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) - if (I->getType()->isPointerTy() && isConsecutivePtr(I)) + if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); while (!Worklist.empty()) { @@ -4179,30 +4465,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } - if (LAI->getNumRuntimePointerChecks() > - VectorizerParams::RuntimeMemoryCheckThreshold) { - emitAnalysis(VectorizationReport() - << LAI->getNumRuntimePointerChecks() << " exceeds limit of " - << VectorizerParams::RuntimeMemoryCheckThreshold - << " dependent memory operations checked at runtime"); - DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - return false; - } - return true; -} + Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); + PSE.addPredicate(LAI->PSE.getUnionPredicate()); -LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi, - ConstantInt *&StepValue) { - if (!isInductionPHI(Phi, SE, StepValue)) - return IK_NoInduction; - - Type *PhiTy = Phi->getType(); - // Found an Integer induction variable. - if (PhiTy->isIntegerTy()) - return IK_IntInduction; - // Found an Pointer induction variable. 
- return IK_PtrInduction; + return true; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -4256,8 +4522,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || !isSinglePredecessor) { - // Build a masked store if it is legal for the target, otherwise scalarize - // the block. + // Build a masked store if it is legal for the target, otherwise + // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), SI->getPointerOperand()); @@ -4315,7 +4581,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( StoreInst *SI = dyn_cast<StoreInst>(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); @@ -4324,7 +4590,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4411,12 +4677,12 @@ void InterleavedAccessInfo::analyzeInterleaving( continue; // Calculate the distance and prepare for the rule 3. - const SCEVConstant *DistToA = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + const SCEVConstant *DistToA = dyn_cast<SCEVConstant>( + PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev)); if (!DistToA) continue; - int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + int DistanceToA = DistToA->getAPInt().getSExtValue(); // Skip if the distance is not multiple of size as they are not in the // same group. @@ -4454,8 +4720,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { emitAnalysis(VectorizationReport() << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); + "compiling with -Os/-Oz"); + DEBUG(dbgs() << + "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return Factor; } @@ -4467,10 +4734,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { } // Find the trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4478,7 +4747,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { WidestRegister = ((WidestRegister < MaxSafeDepDist) ? 
WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + + DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " + << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4491,6 +4762,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { " into one vector!"); unsigned VF = MaxVectorSize; + if (MaximizeBandwidth && !OptForSize) { + // Collect all viable vectorization factors. + SmallVector<unsigned, 8> VFs; + unsigned NewMaxVectorSize = WidestRegister / SmallestType; + for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + VFs.push_back(VS); + + // For each VF calculate its register usage. + auto RUs = calculateRegisterUsage(VFs); + + // Select the largest VF which doesn't require more registers than existing + // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); + for (int i = RUs.size() - 1; i >= 0; --i) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + VF = VFs[i]; + break; + } + } + } // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { @@ -4499,7 +4790,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { emitAnalysis (VectorizationReport() << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } @@ -4515,8 +4806,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { "cannot optimize for size and vectorize at the " "same time. Enable vectorization of this loop " "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + "when compiling with -Os/-Oz"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } } @@ -4566,7 +4857,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { return Factor; } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair<unsigned, unsigned> +LoopVectorizationCostModel::getSmallestAndWidestTypes() { + unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4579,18 +4872,22 @@ unsigned LoopVectorizationCostModel::getWidestType() { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Type *T = it->getType(); - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; // Only examine Loads, Stores and PHINodes. if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) continue; - // Examine PHI nodes that are reduction variables. - if (PHINode *PN = dyn_cast<PHINode>(it)) - if (!Legal->getReductionVars()->count(PN)) + // Examine PHI nodes that are reduction variables. Update the type to + // account for the recurrence type. + if (PHINode *PN = dyn_cast<PHINode>(it)) { + if (!Legal->isReductionVariable(PN)) continue; + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; + T = RdxDesc.getRecurrenceType(); + } // Examine the stored values. 
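The MaximizeBandwidth block above is why the cost model now tracks the smallest type as well as the widest: when enabled, it considers VFs beyond WidestRegister / WidestType, doubling up to WidestRegister / SmallestType, and keeps the largest candidate whose estimated register pressure still fits the target's register file. A sketch of that selection, with a callback standing in for calculateRegisterUsage (an assumption for illustration):

    #include <vector>

    unsigned pickVF(unsigned WidestRegister, unsigned SmallestType,
                    unsigned WidestType, unsigned TargetNumRegisters,
                    unsigned (*maxLocalUsers)(unsigned VF)) {
      unsigned VF = WidestRegister / WidestType;
      // Collect all viable factors, as the hunk above does.
      std::vector<unsigned> VFs;
      for (unsigned VS = VF; VS <= WidestRegister / SmallestType; VS *= 2)
        VFs.push_back(VS);
      // Walk from the widest candidate down, keeping the first that fits.
      for (int i = int(VFs.size()) - 1; i >= 0; --i)
        if (maxLocalUsers(VFs[i]) <= TargetNumRegisters)
          return VFs[i];
      return VF;
    }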
if (StoreInst *ST = dyn_cast<StoreInst>(it)) @@ -4599,15 +4896,17 @@ unsigned LoopVectorizationCostModel::getWidestType() { // Ignore loaded pointer types and stored pointer types that are not // consecutive. However, we do want to take consecutive stores/loads of // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; + MinWidth = std::min(MinWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return MaxWidth; + return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4628,11 +4927,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // Use the user preference, unless 'auto' is selected. - int UserUF = Hints->getInterleave(); - if (UserUF != 0) - return UserUF; - // When we optimize for size, we don't interleave. if (OptForSize) return 1; @@ -4642,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; // Do not interleave loops with a relatively small trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; @@ -4658,7 +4952,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, TargetNumRegisters = ForceTargetNumVectorRegs; } - LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -4756,8 +5050,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, } // Interleave if this is a large loop (small loops are already dealt with by - // this - // point) that could benefit from interleaving. + // this point) that could benefit from interleaving. bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); @@ -4768,8 +5061,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> +LoopVectorizationCostModel::calculateRegisterUsage( + const SmallVector<unsigned, 8> &VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -4790,8 +5084,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() { LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage R; - R.NumInstructions = 0; + RegisterUsage RU; + RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. 
The values // of the map are the index of the 'last seen' usage of the @@ -4810,15 +5104,13 @@ LoopVectorizationCostModel::calculateRegisterUsage() { unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - R.NumInstructions += (*bb)->size(); - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - Instruction *I = it; - IdxToInstr[Index++] = I; + RU.NumInstructions += (*bb)->size(); + for (Instruction &I : **bb) { + IdxToInstr[Index++] = &I; // Save the end location of each USE. - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *U = I->getOperand(i); + for (unsigned i = 0; i < I.getNumOperands(); ++i) { + Value *U = I.getOperand(i); Instruction *Instr = dyn_cast<Instruction>(U); // Ignore non-instruction values such as arguments, constants, etc. @@ -4847,42 +5139,85 @@ LoopVectorizationCostModel::calculateRegisterUsage() { TransposeEnds[it->second].push_back(it->first); SmallSet<Instruction*, 8> OpenIntervals; - unsigned MaxUsage = 0; + // Get the size of the widest register. + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + unsigned WidestRegister = + std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + + SmallVector<RegisterUsage, 8> RUs(VFs.size()); + SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + + // A lambda that gets the register usage for the given type and VF. + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); + return std::max<unsigned>(1, VF * TypeSize / WidestRegister); + }; + for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; - // Ignore ephemeral values. - if (EphValues.count(I)) - continue; - // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int j=0, e = List.size(); j < e; ++j) + for (unsigned int j = 0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); - // Count the number of live interals. - MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + // Skip ignored values. + if (ValuesToIgnore.count(I)) + continue; - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() << '\n'); + // For each VF find the maximum usage of registers. + for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + if (VFs[j] == 1) { + MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); + continue; + } + + // Count the number of live intervals. + unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.count(Inst)) + continue; + RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + } + MaxUsages[j] = std::max(MaxUsages[j], RegUsage); + } + + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. 
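The GetRegUsage lambda above is the core of the new per-VF estimate: a live value of TypeSize bits at factor VF occupies VF * TypeSize bits, i.e. that many WidestRegister-bit registers, never fewer than one. The same computation as a free function, with a few spot checks:

    #include <algorithm>
    #include <cassert>

    unsigned getRegUsage(unsigned TypeSize, unsigned VF,
                         unsigned WidestRegister) {
      return std::max(1u, VF * TypeSize / WidestRegister);
    }

    int main() {
      assert(getRegUsage(32, 4, 128) == 1); // <4 x i32> fits one 128-bit reg
      assert(getRegUsage(32, 8, 128) == 2); // <8 x i32> needs two
      assert(getRegUsage(8, 4, 128) == 1);  // rounds up to one register
      return 0;
    }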
OpenIntervals.insert(I); } - unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); + for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + unsigned Invariant = 0; + if (VFs[i] == 1) + Invariant = LoopInvariants.size(); + else { + for (auto Inst : LoopInvariants) + Invariant += GetRegUsage(Inst->getType(), VFs[i]); + } + + DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); - R.LoopInvariantRegs = Invariant; - R.MaxLocalUsers = MaxUsage; - return R; + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[i]; + RUs[i] = RU; + } + + return RUs; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { @@ -4900,11 +5235,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { if (isa<DbgInfoIntrinsic>(it)) continue; - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; - unsigned C = getInstructionCost(it, VF); + unsigned C = getInstructionCost(&*it, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) @@ -4969,7 +5304,7 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, if (!C) return true; - const APInt &APStepVal = C->getValue()->getValue(); + const APInt &APStepVal = C->getAPInt(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) @@ -4981,9 +5316,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) - return true; - return false; + return Legal->hasStride(I->getOperand(0)) || + Legal->hasStride(I->getOperand(1)); } unsigned @@ -4994,7 +5328,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VF = 1; Type *RetTy = I->getType(); + if (VF > 1 && MinBWs.count(I)) + RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); Type *VectorTy = ToVectorTy(RetTy, VF); + auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { @@ -5076,6 +5413,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); + Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); + auto It = MinBWs.find(Op0AsInstruction); + if (VF > 1 && It != MinBWs.end()) + ValTy = IntegerType::get(ValTy->getContext(), It->second); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -5199,8 +5540,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Legal->isInductionVariable(I->getOperand(0))) return TTI.getCastInstrCost(I->getOpcode(), I->getType(), I->getOperand(0)->getType()); - - Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + Type *SrcScalarTy = I->getOperand(0)->getType(); + Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); + if (VF > 1 && MinBWs.count(I)) { + // This cast is going to be shrunk. 
This may remove the cast or it might + // turn it into slightly different cast. For example, if MinBW == 16, + // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". + // + // Calculate the modified src and dest types. + Type *MinVecTy = VectorTy; + if (I->getOpcode() == Instruction::Trunc) { + SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } else if (I->getOpcode() == Instruction::ZExt || + I->getOpcode() == Instruction::SExt) { + SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } + } + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { @@ -5240,15 +5601,18 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -5269,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { return false; } +void LoopVectorizationCostModel::collectValuesToIgnore() { + // Ignore ephemeral values. + CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + + // Ignore type-promoting instructions we identified during reduction + // detection. + for (auto &Reduction : *Legal->getReductionVars()) { + RecurrenceDescriptor &RedDes = Reduction.second; + SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); + VecValuesToIgnore.insert(Casts.begin(), Casts.end()); + } + + // Ignore induction phis that are only used in either GetElementPtr or ICmp + // instruction to exit loop. Induction variables usually have large types and + // can have big impact when estimating register usage. + // This is for when VF > 1. + for (auto &Induction : *Legal->getInductionVars()) { + auto *PN = Induction.first; + auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + + // Check that the PHI is only used by the induction increment (UpdateV) or + // by GEPs. Then check that UpdateV is only used by a compare instruction or + // the loop header PHI. + // FIXME: Need precise def-use analysis to determine if this instruction + // variable will be vectorized. + if (std::all_of(PN->user_begin(), PN->user_end(), + [&](const User *U) -> bool { + return U == UpdateV || isa<GetElementPtrInst>(U); + }) && + std::all_of(UpdateV->user_begin(), UpdateV->user_end(), + [&](const User *U) -> bool { + return U == PN || isa<ICmpInst>(U); + })) { + VecValuesToIgnore.insert(PN); + VecValuesToIgnore.insert(UpdateV); + } + } + + // Ignore instructions that will not be vectorized. + // This is for when VF > 1. 
+ for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; + ++bb) { + for (auto &Inst : **bb) { + switch (Inst.getOpcode()) { + case Instruction::GetElementPtr: { + // Ignore GEP if its last operand is an induction variable so that it is + // a consecutive load/store and won't be vectorized as scatter/gather + // pattern. + + GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst); + unsigned NumOperands = Gep->getNumOperands(); + unsigned InductionOperand = getGEPInductionOperand(Gep); + bool GepToIgnore = true; + + // Check that all of the gep indices are uniform except for the + // induction operand. + for (unsigned i = 0; i != NumOperands; ++i) { + if (i != InductionOperand && + !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), + TheLoop)) { + GepToIgnore = false; + break; + } + } + + if (GepToIgnore) + VecValuesToIgnore.insert(&Inst); + break; + } + } + } + } +} void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { @@ -5316,19 +5753,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -5343,11 +5773,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], ConstantInt::get(Cond[Part]->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. - Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); @@ -5367,16 +5792,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (!IsVoidRetTy) VecResults[Part] = Cloned; - // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + // End if-block. 
+ if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), + Cmp)); } } diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b180c97..27d3337 100644 --- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" @@ -61,7 +62,7 @@ static cl::opt<int> "number ")); static cl::opt<bool> -ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, +ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions")); static cl::opt<bool> ShouldStartVectorizeHorAtStore( @@ -73,6 +74,14 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +/// Limits the size of scheduling regions in a block. +/// It avoid long compile times for _very_ large blocks where vector +/// instructions are spread over a wide range. +/// This limit is way higher than needed by real-world functions. +static cl::opt<int> +ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, + cl::desc("Limit the size of the SLP scheduling region per block")); + namespace { // FIXME: Set this via cl::opt to allow overriding. @@ -89,6 +98,10 @@ static const unsigned AliasedCheckLimit = 10; // This limit is useful for very large basic blocks. static const unsigned MaxMemDepDistance = 160; +/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling +/// regions to be handled. +static const int MinScheduleRegionSize = 16; + /// \brief Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM @@ -156,13 +169,11 @@ static unsigned getAltOpcode(unsigned Op) { /// of an alternate sequence which can later be merged as /// a ShuffleVector instruction. static bool canCombineAsAltInst(unsigned Op) { - if (Op == Instruction::FAdd || Op == Instruction::FSub || - Op == Instruction::Sub || Op == Instruction::Add) - return true; - return false; + return Op == Instruction::FAdd || Op == Instruction::FSub || + Op == Instruction::Sub || Op == Instruction::Add; } -/// \returns ShuffleVector instruction if intructions in \p VL have +/// \returns ShuffleVector instruction if instructions in \p VL have /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) static unsigned isAltInst(ArrayRef<Value *> VL) { @@ -211,7 +222,7 @@ static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { } } } - + /// \returns \p I after propagating metadata from \p VL. static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { Instruction *I0 = cast<Instruction>(VL[0]); @@ -242,6 +253,9 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { case LLVMContext::MD_fpmath: MD = MDNode::getMostGenericFPMath(MD, IMD); break; + case LLVMContext::MD_nontemporal: + MD = MDNode::intersect(MD, IMD); + break; } } I->setMetadata(Kind, MD); @@ -393,7 +407,7 @@ public: /// \brief Perform LICM and CSE on the newly generated gather sequences. 
void optimizeGatherSequence(); - /// \returns true if it is benefitial to reverse the vector order. + /// \returns true if it is beneficial to reverse the vector order. bool shouldReorder() const { return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; } @@ -441,7 +455,7 @@ private: /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); - /// \returns whether the VectorizableTree is fully vectoriable and will + /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); @@ -492,7 +506,7 @@ private: } return Last; } - + /// -- Vectorization State -- /// Holds all of the tree entries. std::vector<TreeEntry> VectorizableTree; @@ -506,7 +520,7 @@ private: /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser (Value *S, llvm::User *U, int L) : - Scalar(S), User(U), Lane(L){}; + Scalar(S), User(U), Lane(L){} // Which scalar in our function. Value *Scalar; // Which user that uses the scalar. @@ -717,6 +731,8 @@ private: : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), ScheduleStart(nullptr), ScheduleEnd(nullptr), FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), + ScheduleRegionSize(0), + ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), // Make sure that the initial SchedulingRegionID is greater than the // initial SchedulingRegionID in ScheduleData (which is 0). SchedulingRegionID(1) {} @@ -728,6 +744,13 @@ private: FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + // Reduce the maximum schedule region size by the size of the + // previous scheduling run. + ScheduleRegionSizeLimit -= ScheduleRegionSize; + if (ScheduleRegionSizeLimit < MinScheduleRegionSize) + ScheduleRegionSizeLimit = MinScheduleRegionSize; + ScheduleRegionSize = 0; + // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. ++SchedulingRegionID; @@ -804,7 +827,8 @@ private: void cancelScheduling(ArrayRef<Value *> VL); /// Extends the scheduling region so that V is inside the region. - void extendSchedulingRegion(Value *V); + /// \returns true if the region size is within the limit. + bool extendSchedulingRegion(Value *V); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -858,6 +882,12 @@ private: /// (can be null). ScheduleData *LastLoadStoreInRegion; + /// The current size of the scheduling region. + int ScheduleRegionSize; + + /// The maximum size allowed for the scheduling region. + int ScheduleRegionSizeLimit; + /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. int SchedulingRegionID; @@ -1059,7 +1089,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { newTreeEntry(VL, false); return; } - + // Check that every instructions appears once in this bundle. 
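The new scheduling-region budget above bounds SLP compile time on very large blocks: each scheduling run consumes part of a per-block budget, and once it is spent, later regions are capped at a small minimum size rather than disabled outright. The update arithmetic, extracted from BlockScheduling::resetSchedule() with the constants the patch defines:

    // Shrink the remaining limit by what the previous region used, but
    // never below MinScheduleRegionSize (16 in the patch).
    int nextRegionLimit(int RemainingLimit, int PreviousRegionSize,
                        int MinRegionSize = 16) {
      RemainingLimit -= PreviousRegionSize;
      return RemainingLimit < MinRegionSize ? MinRegionSize : RemainingLimit;
    }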
for (unsigned i = 0, e = VL.size(); i < e; ++i) for (unsigned j = i+1; j < e; ++j) @@ -1077,7 +1107,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - BS.cancelScheduling(VL); + assert((!BS.getScheduleData(VL[0]) || + !BS.getScheduleData(VL[0])->isPartOfBundle()) && + "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, false); return; } @@ -1125,6 +1157,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { return; } case Instruction::Load: { + // Check that a vectorized load would load the same memory as a scalar + // load. + // For example we don't want vectorize loads that are smaller than 8 bit. + // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats + // loading/storing it as an i8 struct. If we vectorize loads/stores from + // such a struct we read/write packed bits disagreeing with the + // unvectorized version. + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *ScalarTy = VL[0]->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != + DL.getTypeAllocSizeInBits(ScalarTy)) { + BS.cancelScheduling(VL); + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + return; + } // Check if the loads are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { LoadInst *L = cast<LoadInst>(VL[i]); @@ -1134,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } - const DataLayout &DL = F->getParent()->getDataLayout(); + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { ++NumLoadsWantToChangeOrder; @@ -1662,7 +1711,7 @@ int BoUpSLP::getSpillCost() { int Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; - Instruction *PrevInst = nullptr; + Instruction *PrevInst = nullptr; for (unsigned N = 0; N < VectorizableTree.size(); ++N) { Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); @@ -1687,10 +1736,11 @@ int BoUpSLP::getSpillCost() { for (auto &J : PrevInst->operands()) { if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) LiveValues.insert(cast<Instruction>(&*J)); - } + } // Now find the sequence of instructions between PrevInst and Inst. - BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); + BasicBlock::reverse_iterator InstIt(Inst->getIterator()), + PrevInstIt(PrevInst->getIterator()); --PrevInstIt; while (InstIt != PrevInstIt) { if (PrevInstIt == PrevInst->getParent()->rend()) { @@ -1730,30 +1780,29 @@ int BoUpSLP::getTreeCost() { unsigned BundleWidth = VectorizableTree[0].Scalars.size(); - for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { - int C = getEntryCost(&VectorizableTree[i]); + for (TreeEntry &TE : VectorizableTree) { + int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " - << *VectorizableTree[i].Scalars[0] << " .\n"); + << TE.Scalars[0] << " .\n"); Cost += C; } SmallSet<Value *, 16> ExtractCostCalculated; int ExtractCost = 0; - for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); - I != E; ++I) { + for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. 
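The new guard in the Instruction::Load case above compares a type's in-register size with its in-memory allocation size; vectorizing loads of a type where the two differ (the i2 example in its comment) would read or write padding bits the scalar code never touches. The test itself reduces to one comparison, with both sizes supplied by DataLayout in the real code:

    // A scalar type is safe to re-slice into vector loads/stores only when
    // its bit size equals its alloc size; LLVM's i2 has size 2 but alloc
    // size 8, so it fails this test and the bundle is gathered instead.
    bool isPackedLoadSafe(unsigned TypeSizeInBits,
                          unsigned TypeAllocSizeInBits) {
      return TypeSizeInBits == TypeAllocSizeInBits;
    }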
- if (!ExtractCostCalculated.insert(I->Scalar).second) + if (!ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be // removed as well). - if (EphValues.count(I->User)) + if (EphValues.count(EU.User)) continue; - VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); + VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - I->Lane); + EU.Lane); } Cost += getSpillCost(); @@ -1890,106 +1939,126 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, } } +// Return true if I should be commuted before adding it's left and right +// operands to the arrays Left and Right. +// +// The vectorizer is trying to either have all elements one side being +// instruction with the same opcode to enable further vectorization, or having +// a splat to lower the vectorizing cost. +static bool shouldReorderOperands(int i, Instruction &I, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + bool AllSameOpcodeLeft, + bool AllSameOpcodeRight, bool SplatLeft, + bool SplatRight) { + Value *VLeft = I.getOperand(0); + Value *VRight = I.getOperand(1); + // If we have "SplatRight", try to see if commuting is needed to preserve it. + if (SplatRight) { + if (VRight == Right[i - 1]) + // Preserve SplatRight + return false; + if (VLeft == Right[i - 1]) { + // Commuting would preserve SplatRight, but we don't want to break + // SplatLeft either, i.e. preserve the original order if possible. + // (FIXME: why do we care?) + if (SplatLeft && VLeft == Left[i - 1]) + return false; + return true; + } + } + // Symmetrically handle Right side. + if (SplatLeft) { + if (VLeft == Left[i - 1]) + // Preserve SplatLeft + return false; + if (VRight == Left[i - 1]) + return true; + } + + Instruction *ILeft = dyn_cast<Instruction>(VLeft); + Instruction *IRight = dyn_cast<Instruction>(VRight); + + // If we have "AllSameOpcodeRight", try to see if the left operands preserves + // it and not the right, in this case we want to commute. + if (AllSameOpcodeRight) { + unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); + if (IRight && RightPrevOpcode == IRight->getOpcode()) + // Do not commute, a match on the right preserves AllSameOpcodeRight + return false; + if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { + // We have a match and may want to commute, but first check if there is + // not also a match on the existing operands on the Left to preserve + // AllSameOpcodeLeft, i.e. preserve the original order if possible. + // (FIXME: why do we care?) + if (AllSameOpcodeLeft && ILeft && + cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) + return false; + return true; + } + } + // Symmetrically handle Left side. 
+ if (AllSameOpcodeLeft) { + unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); + if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) + return false; + if (IRight && LeftPrevOpcode == IRight->getOpcode()) + return true; + } + return false; +} + void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right) { - SmallVector<Value *, 16> OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); - - OrigLeft.push_back(VLeft); - OrigRight.push_back(VRight); - - Instruction *ILeft = dyn_cast<Instruction>(VLeft); - Instruction *IRight = dyn_cast<Instruction>(VRight); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. - if (i && AllSameOpcodeLeft && ILeft) { - if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) { - if (PLeft->getOpcode() != ILeft->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight && IRight) { - if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) { - if (PRight->getOpcode() != IRight->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (ILeft && IRight) { - if (!i && ILeft->getOpcode() > IRight->getOpcode()) { - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() > IRight->getOpcode() && - Right[i - 1] != IRight) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() == IRight->getOpcode() && - Right[i - 1] == ILeft) { - // Try preserve broadcasts. - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() == IRight->getOpcode() && - Left[i - 1] == IRight) { - // Try preserve broadcasts. - Left.push_back(IRight); - Right.push_back(ILeft); - } else { - Left.push_back(ILeft); - Right.push_back(IRight); - } - continue; - } - // One opcode, put the instruction on the right. - if (ILeft) { - Left.push_back(VRight); - Right.push_back(ILeft); - continue; - } + if (VL.size()) { + // Peel the first iteration out of the loop since there's nothing + // interesting to do anyway and it simplifies the checks in the loop. + auto VLeft = cast<Instruction>(VL[0])->getOperand(0); + auto VRight = cast<Instruction>(VL[0])->getOperand(1); + if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) + // Favor having instruction to the right. FIXME: why? 
+ std::swap(VLeft, VRight); Left.push_back(VLeft); Right.push_back(VRight); } - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // If operands end up being broadcast return this operand order. - if (LeftBroadcast || RightBroadcast) - return; + // Keep track if we have instructions with all the same opcode on one side. + bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); + bool AllSameOpcodeRight = isa<Instruction>(Right[0]); + // Keep track if we have one side with all the same value (broadcast). + bool SplatLeft = true; + bool SplatRight = true; - // Don't reorder if the operands where good to begin. - if (AllSameOpcodeRight || AllSameOpcodeLeft) { - Left = OrigLeft; - Right = OrigRight; + for (unsigned i = 1, e = VL.size(); i != e; ++i) { + Instruction *I = cast<Instruction>(VL[i]); + assert(I->isCommutative() && "Can only process commutative instruction"); + // Commute to favor either a splat or maximizing having the same opcodes on + // one side. + if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, + AllSameOpcodeRight, SplatLeft, SplatRight)) { + Left.push_back(I->getOperand(1)); + Right.push_back(I->getOperand(0)); + } else { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } + // Update Splat* and AllSameOpcode* after the insertion. + SplatRight = SplatRight && (Right[i - 1] == Right[i]); + SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); + AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && + (cast<Instruction>(Left[i - 1])->getOpcode() == + cast<Instruction>(Left[i])->getOpcode()); + AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && + (cast<Instruction>(Right[i - 1])->getOpcode() == + cast<Instruction>(Right[i])->getOpcode()); } + // If one operand end up being broadcast, return this operand order. + if (SplatRight || SplatLeft) + return; + const DataLayout &DL = F->getParent()->getDataLayout(); // Finally check if we can get longer vectorizable chain by reordering @@ -2030,7 +2099,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { Instruction *VL0 = cast<Instruction>(VL[0]); - BasicBlock::iterator NextInst = VL0; + BasicBlock::iterator NextInst(VL0); ++NextInst; Builder.SetInsertPoint(VL0->getParent(), NextInst); Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); @@ -2481,13 +2550,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *BoUpSLP::vectorizeTree() { - + // All blocks must be scheduled before any instructions are inserted. 
for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); vectorizeTree(&VectorizableTree[0]); DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -2532,7 +2601,7 @@ Value *BoUpSLP::vectorizeTree() { User->replaceUsesOfWith(Scalar, Ex); } } else { - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); @@ -2641,7 +2710,7 @@ void BoUpSLP::optimizeGatherSequence() { BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { - Instruction *In = it++; + Instruction *In = &*it++; if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; @@ -2681,8 +2750,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, ScheduleData *Bundle = nullptr; bool ReSchedule = false; DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + + // Make sure that the scheduling region contains all + // instructions of the bundle. + for (Value *V : VL) { + if (!extendSchedulingRegion(V)) + return false; + } + for (Value *V : VL) { - extendSchedulingRegion(V); ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -2743,7 +2819,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, schedule(pickedSD, ReadyInsts); } } - return Bundle->isReady(); + if (!Bundle->isReady()) { + cancelScheduling(VL); + return false; + } + return true; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { @@ -2772,9 +2852,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { } } -void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { if (getScheduleData(V)) - return; + return true; Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); @@ -2785,21 +2865,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); - return; + return true; } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. 
- BasicBlock::reverse_iterator UpIter(ScheduleStart); + BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator()); BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter(ScheduleEnd); BasicBlock::iterator LowerEnd = BB->end(); for (;;) { + if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { + DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); + return false; + } + if (UpIter != UpperEnd) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); - return; + return true; } UpIter++; } @@ -2810,13 +2895,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); - return; + return true; } DownIter++; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); } + return true; } void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, @@ -2896,8 +2982,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } else { // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and eventally - // disable vectorization. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. BundleMember->Dependencies++; BundleMember->incrementUnscheduledDeps(1); } @@ -2985,10 +3071,10 @@ void BoUpSLP::BlockScheduling::resetSchedule() { } void BoUpSLP::scheduleBlock(BlockScheduling *BS) { - + if (!BS->ScheduleStart) return; - + DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); BS->resetSchedule(); @@ -3003,7 +3089,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { }; std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; - // Ensure that all depencency data is updated and fill the ready-list with + // Ensure that all dependency data is updated and fill the ready-list with // initial instructions. int Idx = 0; int NumToSchedule = 0; @@ -3035,7 +3121,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { Instruction *pickedInst = BundleMember->Inst; if (LastScheduledInst->getNextNode() != pickedInst) { BS->BB->getInstList().remove(pickedInst); - BS->BB->getInstList().insert(LastScheduledInst, pickedInst); + BS->BB->getInstList().insert(LastScheduledInst->getIterator(), + pickedInst); } LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; @@ -3074,11 +3161,11 @@ struct SLPVectorizer : public FunctionPass { if (skipOptnoneFunction(F)) return false; - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? 
&TLIP->getTLI() : nullptr; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -3139,13 +3226,15 @@ struct SLPVectorizer : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } @@ -3260,15 +3349,26 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, // Do a quadratic search on all of the given stores and find // all of the pairs of stores that follow each other. + SmallVector<unsigned, 16> IndexQueue; for (unsigned i = 0, e = Stores.size(); i < e; ++i) { - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); - if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { - Tails.insert(Stores[j]); + const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find slp vectorization opportunity. + unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) { + Tails.insert(Stores[k]); Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; + ConsecutiveChain[Stores[i]] = Stores[k]; + break; } } } @@ -3428,7 +3528,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, unsigned VecIdx = 0; for (auto &V : BuildVectorSlice) { IRBuilder<true, NoFolder> Builder( - ++BasicBlock::iterator(InsertAfter)); + InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); InsertElementInst *IE = cast<InsertElementInst>(V); Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( VectorizedRoot, Builder.getInt32(VecIdx++))); @@ -3489,7 +3589,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { /// \param NumEltsToRdx The number of elements that should be reduced in the /// vector. /// \param IsPairwise Whether the reduction is a pairwise or splitting -/// reduction. A pairwise reduction will generate a mask of +/// reduction. A pairwise reduction will generate a mask of /// <0,2,...> or <1,3,..> while a splitting reduction will generate /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. /// \param IsLeft True will generate a mask of even elements, odd otherwise. @@ -3552,16 +3652,17 @@ class HorizontalReduction { unsigned ReductionOpcode; /// The opcode of the values we perform a reduction on. unsigned ReducedValueOpcode; - /// The width of one full horizontal reduction operation. 
- unsigned ReduxWidth; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. bool IsPairwiseReduction; public: + /// The width of one full horizontal reduction operation. + unsigned ReduxWidth; + HorizontalReduction() : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), - ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} + ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {} /// \brief Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { @@ -3607,11 +3708,11 @@ public: return false; // Post order traverse the reduction tree starting at B. We only handle true - // trees containing only binary operators. - SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack; + // trees containing only binary operators or selects. + SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; Stack.push_back(std::make_pair(B, 0)); while (!Stack.empty()) { - BinaryOperator *TreeN = Stack.back().first; + Instruction *TreeN = Stack.back().first; unsigned EdgeToVist = Stack.back().second++; bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; @@ -3647,9 +3748,10 @@ public: // Visit left or right. Value *NextV = TreeN->getOperand(EdgeToVist); - BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV); - if (Next) - Stack.push_back(std::make_pair(Next, 0)); + // We currently only allow BinaryOperator's and SelectInst's as reduction + // values in our tree. + if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV)) + Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0)); else if (NextV != Phi) return false; } @@ -3670,7 +3772,7 @@ public: IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setUnsafeAlgebra(); - Builder.SetFastMathFlags(Unsafe); + Builder.setFastMathFlags(Unsafe); unsigned i = 0; for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { @@ -3717,9 +3819,12 @@ public: return VectorizedTree != nullptr; } -private: + unsigned numReductionValues() const { + return ReducedVals.size(); + } - /// \brief Calcuate the cost of a reduction. +private: + /// \brief Calculate the cost of a reduction. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); @@ -3825,6 +3930,82 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) { return V->getType() < V2->getType(); } +/// \brief Try and get a reduction value from a phi node. +/// +/// Given a phi node \p P in a block \p ParentBB, consider possible reductions +/// if they come from either \p ParentBB or a containing loop latch. +/// +/// \returns A candidate reduction value if possible, or \code nullptr \endcode +/// if not possible. +static Value *getReductionValue(const DominatorTree *DT, PHINode *P, + BasicBlock *ParentBB, LoopInfo *LI) { + // There are situations where the reduction value is not dominated by the + // reduction phi. Vectorizing such cases has been reported to cause + // miscompiles. See PR25787. + auto DominatedReduxValue = [&](Value *R) { + return ( + dyn_cast<Instruction>(R) && + DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent())); + }; + + Value *Rdx = nullptr; + + // Return the incoming value if it comes from the same BB as the phi node. 
+ if (P->getIncomingBlock(0) == ParentBB) { + Rdx = P->getIncomingValue(0); + } else if (P->getIncomingBlock(1) == ParentBB) { + Rdx = P->getIncomingValue(1); + } + + if (Rdx && DominatedReduxValue(Rdx)) + return Rdx; + + // Otherwise, check whether we have a loop latch to look at. + Loop *BBL = LI->getLoopFor(ParentBB); + if (!BBL) + return nullptr; + BasicBlock *BBLatch = BBL->getLoopLatch(); + if (!BBLatch) + return nullptr; + + // There is a loop latch, return the incoming value if it comes from + // that. This reduction pattern occassionaly turns up. + if (P->getIncomingBlock(0) == BBLatch) { + Rdx = P->getIncomingValue(0); + } else if (P->getIncomingBlock(1) == BBLatch) { + Rdx = P->getIncomingValue(1); + } + + if (Rdx && DominatedReduxValue(Rdx)) + return Rdx; + + return nullptr; +} + +/// \brief Attempt to reduce a horizontal reduction. +/// If it is legal to match a horizontal reduction feeding +/// the phi node P with reduction operators BI, then check if it +/// can be done. +/// \returns true if a horizontal reduction was matched and reduced. +/// \returns false if a horizontal reduction was not matched. +static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, + BoUpSLP &R, TargetTransformInfo *TTI) { + if (!ShouldVectorizeHor) + return false; + + HorizontalReduction HorRdx; + if (!HorRdx.matchAssociativeReduction(P, BI)) + return false; + + // If there is a sufficient number of reduction values, reduce + // to a nearby power-of-2. Can safely generate oversized + // vectors and rely on the backend to split them to legal sizes. + HorRdx.ReduxWidth = + std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues())); + + return HorRdx.tryToReduce(R, TTI); +} + bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector<Value *, 4> Incoming; @@ -3836,9 +4017,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Collect the incoming values from the PHIs. Incoming.clear(); - for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie; - ++instr) { - PHINode *P = dyn_cast<PHINode>(instr); + for (Instruction &I : *BB) { + PHINode *P = dyn_cast<PHINode>(&I); if (!P) break; @@ -3881,7 +4061,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(it).second) + if (!VisitedInstrs.insert(&*it).second) continue; if (isa<DbgInfoIntrinsic>(it)) @@ -3892,20 +4072,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() != 2) return Changed; - Value *Rdx = - (P->getIncomingBlock(0) == BB - ? (P->getIncomingValue(0)) - : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) - : nullptr)); + + Value *Rdx = getReductionValue(DT, P, BB, LI); + // Check if this is a Binary Operator. BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); if (!BI) continue; // Try to match and vectorize a horizontal reduction. 
- HorizontalReduction HorRdx; - if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) && - HorRdx.tryToReduce(R, TTI)) { + if (canMatchHorizontalReduction(P, BI, R, TTI)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -3928,15 +4104,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - // Try to vectorize horizontal reductions feeding into a store. if (ShouldStartVectorizeHorAtStore) if (StoreInst *SI = dyn_cast<StoreInst>(it)) if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(SI->getValueOperand())) { - HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) && - HorRdx.tryToReduce(R, TTI)) || - tryToVectorize(BinOp, R))) { + if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) || + tryToVectorize(BinOp, R)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4037,10 +4210,10 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) |